chrispie committed
Commit eacbb97 · verified · 1 parent: 2922088

Create app.py

Files changed (1)
  1. app.py +73 -0
app.py ADDED
@@ -0,0 +1,73 @@
import gradio as gr
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
import torch, transformers
from threading import Thread

# Load the 1-bit HQQ-quantized Llama-2-7B chat model with its LoRA adapter
model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf_1bitgs8_hqq'
model = HQQModelForCausalLM.from_quantized(model_id, adapter='adapter_v0.1.lora')
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Set up inference mode: BOS/EOS are added manually in the prompt template below
tokenizer.add_bos_token = False
tokenizer.add_eos_token = False
if not tokenizer.pad_token:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.config.use_cache = True
model.eval()

# Optional: torch.compile for faster inference
model = torch.compile(model)

def chat_processor(chat, max_new_tokens=100, do_sample=True, device='cpu'):
    tokenizer.use_default_system_prompt = False
    streamer = transformers.TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    # Wrap the user message in the Llama-2 chat template and build the generation kwargs
    generate_params = dict(
        tokenizer("<s> [INST] " + chat + " [/INST] ", return_tensors="pt").to(device),
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        pad_token_id=tokenizer.pad_token_id,
        top_p=0.90 if do_sample else None,
        top_k=50 if do_sample else None,
        temperature=0.6 if do_sample else None,
        num_beams=1,
        repetition_penalty=1.2,
    )

    # Run generation in a background thread so the streamer can be consumed as tokens arrive
    t = Thread(target=model.generate, kwargs=generate_params)
    t.start()

    # Debug: echo the stream to stdout
    # print("User: ", chat)
    # print("Assistant: ")
    # outputs = ""
    # for text in streamer:
    #     outputs += text
    #     print(text, end="", flush=True)

    # torch.cuda.empty_cache()

    return streamer

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    def user(user_message, history):
        # Append the new user turn with an empty assistant slot
        return "", history + [[user_message, None]]

    def bot(history):
        print("Question: ", history[-1][0])
        stream = chat_processor(chat=history[-1][0])
        history[-1][1] = ""
        # Yield the growing history so the UI streams the reply token by token
        for character in stream:
            print(character)
            history[-1][1] += character
            yield history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(bot, chatbot, chatbot)
    clear.click(lambda: None, None, chatbot, queue=False)

demo.queue()
demo.launch()
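
To try this app locally, a minimal sketch of the setup, assuming the gradio, torch, transformers, and hqq packages from PyPI and an hqq release that still ships the hqq.engine.hf wrapper (the commit pins no versions):

    pip install gradio torch transformers hqq
    python app.py

Gradio then serves the chat UI on http://localhost:7860 by default.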