Spaces:

zidsi
/

SLlamica_test

Sleeping

App Files Files Community

zidsi committed on Dec 19, 2024

Commit

5ab747e

1 Parent(s): 226c90f

add params

Browse files

Files changed (1) hide show

app.py +16 -17

app.py CHANGED Viewed

@@ -7,20 +7,20 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
 import os
 HF_TOKEN = os.getenv('HF_TOKEN')
-checkpoint = "zidsi/SLlamica_PT4SFT_v2"
 device = "cuda"  # "cuda" or "cpu"
 tokenizer = AutoTokenizer.from_pretrained(checkpoint,token=HF_TOKEN)
 model = AutoModelForCausalLM.from_pretrained(checkpoint,token=HF_TOKEN)
 model.to(device)
 @spaces.GPU
-def predict(message, history):
     history.append({"role": "user", "content": message})
     input_text = tokenizer.apply_chat_template(history, tokenize=False)
     inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
     # Use TextStreamer for streaming response
-    streamer = TextStreamer(tokenizer)
-    outputs = model.generate(inputs, max_new_tokens=512, temperature=0.2, top_p=0.9, do_sample=True, streamer=streamer)
     # Despite returning the usual output, the streamer will also print the generated text to stdout.
     decoded = tokenizer.decode(outputs[0])
@@ -32,20 +32,19 @@ For information on how to customize the ChatInterface, peruse the gradio docs: h
 """
 demo = gr.ChatInterface(
     predict, type="messages",
 )
-# additional_inputs=[
-#        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-#        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-#        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-#        gr.Slider(
-#            minimum=0.1,
-#            maximum=1.0,
-#            value=0.95,
-#            step=0.05,
-#            label="Top-p (nucleus sampling)",
-#        ),
-#    ],
 if __name__ == "__main__":
     demo.launch()

 import os
 HF_TOKEN = os.getenv('HF_TOKEN')
+checkpoint = "zidsi/SLlamica_PT4SFT_v1"
 device = "cuda"  # "cuda" or "cpu"
 tokenizer = AutoTokenizer.from_pretrained(checkpoint,token=HF_TOKEN)
 model = AutoModelForCausalLM.from_pretrained(checkpoint,token=HF_TOKEN)
 model.to(device)
 @spaces.GPU
+def predict(message, history,max_new_tokens,temperature,top_p):
     history.append({"role": "user", "content": message})
     input_text = tokenizer.apply_chat_template(history, tokenize=False)
     inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
     # Use TextStreamer for streaming response
+    # streamer = TextStreamer(tokenizer)
+    outputs = model.generate(inputs, max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p, do_sample=True)
     # Despite returning the usual output, the streamer will also print the generated text to stdout.
     decoded = tokenizer.decode(outputs[0])
 """
 demo = gr.ChatInterface(
     predict, type="messages",
+additional_inputs=[
+        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(
+            minimum=0.1,
+            maximum=1.0,
+            value=0.95,
+            step=0.05,
+            label="Top-p (nucleus sampling)",
+        ),
+    ],
 )
 if __name__ == "__main__":
     demo.launch()