chrispie committed
Commit 6c3e1ec · verified · 1 Parent(s): b212660

Update app.py

Files changed (1)
  1. app.py +13 -24
app.py CHANGED
@@ -5,7 +5,7 @@ from threading import Thread
 
 #Load the model
 model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf_1bitgs8_hqq'
-model = HQQModelForCausalLM.from_quantized(model_id, adapter='adapter_v0.1.lora', device='cpu')
+model = HQQModelForCausalLM.from_quantized(model_id, adapter='adapter_v0.1.lora', device='cuda')
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 #Setup Inference Mode
@@ -16,9 +16,9 @@ model.config.use_cache = True
 model.eval();
 
 # Optional: torch compile for faster inference
-# model = torch.compile(model)
+model = torch.compile(model)
 
-def chat_processor(chat, max_new_tokens=100, do_sample=True, device='cpu'):
+def chat_processor(chat, max_new_tokens=100, do_sample=True, device='cuda'):
     tokenizer.use_default_system_prompt = False
     streamer = transformers.TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
 
@@ -47,27 +47,16 @@ def chat_processor(chat, max_new_tokens=100, do_sample=True, device='cpu'):
 
     #torch.cuda.empty_cache()
 
-    return streamer
+    return t, streamer
 
-with gr.Blocks() as demo:
-    chatbot = gr.Chatbot()
-    msg = gr.Textbox()
-    clear = gr.Button("Clear")
+def chat(message, history):
+    t, stream = chat_processor(chat=message)
+    response = ""
+    for character in stream:
+        response += character
+        yield response
 
-    def user(user_message, history):
-        return "", history + [[user_message, None]]
+    t.join()
+    torch.cuda.empty_cache()
 
-    def bot(history):
-        print("Question: ", history[-1][0])
-        stream = chat_processor(chat=history[-1][0])
-        history[-1][1] = ""
-        for character in stream:
-            print(character)
-            history[-1][1] += character
-            yield history
-
-    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(bot, chatbot, chatbot)
-    clear.click(lambda: None, None, chatbot, queue=False)
-
-    demo.queue()
-    demo.launch()
+gr.ChatInterface(chat).launch()
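
The middle of chat_processor is elided by this diff. As context only, here is a minimal sketch, assuming the standard transformers Thread + TextIteratorStreamer pattern, of how the updated function could produce the (t, streamer) pair it now returns; apart from the names visible in the hunks above (the signature, streamer, t), every call and variable here is an assumption, not part of the commit.

# Hedged sketch, assuming the usual background-thread generation pattern.
# Relies on the model/tokenizer loaded earlier in app.py; names other than
# t and streamer are illustrative, not taken from the commit.
from threading import Thread

def chat_processor_sketch(chat, max_new_tokens=100, do_sample=True, device='cuda'):
    tokenizer.use_default_system_prompt = False
    streamer = transformers.TextIteratorStreamer(
        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    # Encode the incoming message and move it to the target device.
    inputs = tokenizer(chat, return_tensors='pt').to(device)

    # Run generate() on a background thread so the caller can iterate the
    # streamer while tokens are still being produced.
    t = Thread(target=model.generate,
               kwargs=dict(**inputs,
                           streamer=streamer,
                           max_new_tokens=max_new_tokens,
                           do_sample=do_sample))
    t.start()

    # Returning the thread lets chat() join() it and then call
    # torch.cuda.empty_cache() once streaming has finished.
    return t, streamer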