No need for starting prompt in API
app.py
CHANGED
@@ -17,6 +17,14 @@ import gradio as gr
 from transformers import AutoTokenizer, GenerationConfig, AutoModel
 
 
+chatglm = 'THUDM/chatglm-6b'
+chatglm_rev = '4de8efe'
+int8_model = 'KumaTea/twitter-int8'
+int8_model_rev = '1136001'
+
+max_length = 224
+default_start = ["你是Kuma,请和我聊天,每句话以两个竖杠分隔。", "好的,你想聊什么?"]
+
 gr_title = """<h1 align="center">KumaGLM</h1>
 <h3 align='center'>这是一个 AI Kuma,你可以与他聊天,或者直接在文本框按下Enter</h3>
 <p align='center'>采样范围 2020/06/13 - 2023/04/15</p>
@@ -33,7 +41,7 @@ gr_footer = """<p align='center'>
 <p align='center'>
 <em>每天起床第一句!</em>
 </p>"""
-
+
 
 
 # device = torch.device('cpu')
@@ -45,11 +53,11 @@ logging.basicConfig(
     datefmt='%m/%d %H:%M:%S')
 
 model = AutoModel.from_pretrained(
-
+    int8_model,
     trust_remote_code=True,
-    revision=
+    revision=int8_model_rev
 ).float()  # .to(device)
-tokenizer = AutoTokenizer.from_pretrained(
+tokenizer = AutoTokenizer.from_pretrained(chatglm, trust_remote_code=True, revision=chatglm_rev)
 
 # dump a log to ensure everything works well
 # print(model.peft_config)
@@ -60,7 +68,7 @@ model.eval()
 torch.set_default_tensor_type(torch.FloatTensor)
 
 
-def evaluate(context, temperature, top_p, top_k=None):
+def evaluate(context, temperature, top_p):
     generation_config = GenerationConfig(
         temperature=temperature,
         top_p=top_p,
@@ -71,21 +79,23 @@ def evaluate(context, temperature, top_p, top_k=None):
     )
     with torch.no_grad():
         # input_text = f"Context: {context}Answer: "
-        input_text = '||'.join(default_start) + '||'
-
-
-
+        # input_text = '||'.join(default_start) + '||'
+        # No need for starting prompt in API
+        if not context.endswith('||'):
+            context += '||'
+        logging.info('[API] Request: ' + context)
+        ids = tokenizer([context], return_tensors="pt")
         inputs = ids.to("cpu")
         out = model.generate(
             **inputs,
-            max_length=
+            max_length=max_length,
            generation_config=generation_config
         )
         out = out.tolist()[0]
         decoder_output = tokenizer.decode(out)
         # out_text = decoder_output.split("Answer: ")[1]
         out_text = decoder_output
-        logging.info('[API]
+        logging.info('[API] Results: ' + out_text)
         return out_text
 
 
@@ -117,17 +127,17 @@ def evaluate_stream(msg, history, temperature, top_p):
     context = context.replace(r'<br>', '')
 
     # TODO: Avoid the tokens are too long.
-    CUTOFF = 224
-    while len(tokenizer.encode(context)) >
+    # CUTOFF = 224
+    while len(tokenizer.encode(context)) > max_length:
         # save 15 token size for the answer
         context = context[15:]
 
     h = []
-    logging.info('[UI]
-    for response, h in model.stream_chat(tokenizer, context, h, max_length=
+    logging.info('[UI] Request: ' + context)
+    for response, h in model.stream_chat(tokenizer, context, h, max_length=max_length, top_p=top_p, temperature=temperature):
         history[-1][1] = response
         yield history, ""
-    logging.info('[UI]
+    logging.info('[UI] Results: ' + response)
 
 
 with gr.Blocks() as demo:
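
For reference, the core change is that the API-side evaluate() no longer prepends the default_start dialogue; it only ensures the caller's context ends with the '||' turn separator. A minimal standalone sketch of that behavior (the helper name prepare_api_context is hypothetical, not part of app.py):

# Hypothetical helper mirroring the new API prompt handling in evaluate().
def prepare_api_context(context: str) -> str:
    # No starting prompt is added; only the trailing '||' separator is enforced.
    if not context.endswith('||'):
        context += '||'
    return context

print(prepare_api_context('今天天气怎么样'))  # -> '今天天气怎么样||'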
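The hardcoded CUTOFF is also replaced by the shared max_length constant, so the UI truncation loop and stream_chat use the same token budget. A rough sketch of that truncation logic in isolation (assuming the ChatGLM tokenizer can be loaded; the revision pin is omitted here):

# Sketch only: trim the oldest characters until the prompt fits the budget,
# as evaluate_stream() does before calling model.stream_chat().
from transformers import AutoTokenizer

max_length = 224
tokenizer = AutoTokenizer.from_pretrained('THUDM/chatglm-6b', trust_remote_code=True)

def truncate_context(context: str) -> str:
    while len(tokenizer.encode(context)) > max_length:
        context = context[15:]  # drop 15 characters from the front each pass
    return context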