chrispie committed
Commit 6c3e1ec · verified · 1 Parent(s): b212660

Update app.py

Files changed (1)
  1. app.py +13 -24
app.py CHANGED
@@ -5,7 +5,7 @@ from threading import Thread
 
 #Load the model
 model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf_1bitgs8_hqq'
-model = HQQModelForCausalLM.from_quantized(model_id, adapter='adapter_v0.1.lora', device='cpu')
+model = HQQModelForCausalLM.from_quantized(model_id, adapter='adapter_v0.1.lora', device='cuda')
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 #Setup Inference Mode
@@ -16,9 +16,9 @@ model.config.use_cache = True
 model.eval();
 
 # Optional: torch compile for faster inference
-# model = torch.compile(model)
+model = torch.compile(model)
 
-def chat_processor(chat, max_new_tokens=100, do_sample=True, device='cpu'):
+def chat_processor(chat, max_new_tokens=100, do_sample=True, device='cuda'):
     tokenizer.use_default_system_prompt = False
     streamer = transformers.TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
 
@@ -47,27 +47,16 @@ def chat_processor(chat, max_new_tokens=100, do_sample=True, device='cpu'):
 
     #torch.cuda.empty_cache()
 
-    return streamer
+    return t, streamer
 
-with gr.Blocks() as demo:
-    chatbot = gr.Chatbot()
-    msg = gr.Textbox()
-    clear = gr.Button("Clear")
+def chat(message, history):
+    t, stream = chat_processor(chat=message)
+    response = ""
+    for character in stream:
+        response += character
+        yield response
 
-    def user(user_message, history):
-        return "", history + [[user_message, None]]
+    t.join()
+    torch.cuda.empty_cache()
 
-    def bot(history):
-        print("Question: ", history[-1][0])
-        stream = chat_processor(chat=history[-1][0])
-        history[-1][1] = ""
-        for character in stream:
-            print(character)
-            history[-1][1] += character
-            yield history
-
-    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(bot, chatbot, chatbot)
-    clear.click(lambda: None, None, chatbot, queue=False)
-
-    demo.queue()
-    demo.launch()
+gr.ChatInterface(chat).launch()
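
The middle of chat_processor is elided by this diff. As context only, here is a minimal sketch, assuming the standard transformers Thread + TextIteratorStreamer pattern, of how the updated function could produce the (t, streamer) pair it now returns; apart from the names visible in the hunks above (the signature, streamer, t), every call and variable here is an assumption, not part of the commit.

# Hedged sketch, assuming the usual background-thread generation pattern.
# Relies on the model/tokenizer loaded earlier in app.py; names other than
# t and streamer are illustrative, not taken from the commit.
from threading import Thread

def chat_processor_sketch(chat, max_new_tokens=100, do_sample=True, device='cuda'):
    tokenizer.use_default_system_prompt = False
    streamer = transformers.TextIteratorStreamer(
        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    # Encode the incoming message and move it to the target device.
    inputs = tokenizer(chat, return_tensors='pt').to(device)

    # Run generate() on a background thread so the caller can iterate the
    # streamer while tokens are still being produced.
    t = Thread(target=model.generate,
               kwargs=dict(**inputs,
                           streamer=streamer,
                           max_new_tokens=max_new_tokens,
                           do_sample=do_sample))
    t.start()

    # Returning the thread lets chat() join() it and then call
    # torch.cuda.empty_cache() once streaming has finished.
    return t, streamer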