Asilbek14 committed · Commit 17da298 · verified · 1 Parent(s): 275a6bc

Update app.py

Files changed (1):
  app.py  +47 -42
app.py CHANGED
@@ -1,13 +1,15 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-from transformers import pipeline
+from transformers import AutoTokenizer, pipeline
+from auto_gptq import AutoGPTQForCausalLM
 
 # ---------------- CONFIG ----------------
-MODEL_REPO = "TheBloke/vicuna-13b-1.1-HF"
+MODEL_NAME = "TheBloke/vicuna-13b-v1.3.0-GPTQ"
+MODEL_BASENAME = "vicuna-13b-v1.3.0-GPTQ-4bit-128g.no-act.order"
+
 TRANSLATOR_MODEL = "facebook/m2m100_418M"
 
 SYSTEM_PROMPT_DEFAULT = (
-    "You are Vicuna, a formal and polite AI assistant. "
+    "You are Vicuna-13B, a formal and polite AI assistant. "
     "Always respond formally and answer appropriately depending on the selected explanation style."
 )
 
@@ -16,8 +18,27 @@ MAX_NEW_TOKENS_DEFAULT = 300
 TEMP_DEFAULT = 0.3
 TOP_P_DEFAULT = 0.9
 
-# Clients
-client = InferenceClient(MODEL_REPO)
+# ---------------- LOAD MODELS ----------------
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
+model = AutoGPTQForCausalLM.from_quantized(
+    MODEL_NAME,
+    model_basename=MODEL_BASENAME,
+    use_safetensors=True,
+    trust_remote_code=True,
+    device="cuda:0",  # GPU
+    use_triton=False,
+    quantize_config=None
+)
+generator = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    max_new_tokens=MAX_NEW_TOKENS_DEFAULT,
+    temperature=TEMP_DEFAULT,
+    top_p=TOP_P_DEFAULT,
+    repetition_penalty=1.15
+)
+
 translator = pipeline("translation", model=TRANSLATOR_MODEL)
 
 # ---------------- HELPERS ----------------
@@ -29,19 +50,18 @@ def is_translation_request(message: str) -> bool:
     return non_ascii_ratio > 0.4
 
 # ---------------- CHAT FUNCTION ----------------
-def stream_response(message, chat_history, system_message, max_tokens, temperature, top_p, response_style):
+def chat_response(message, chat_history, system_message, max_tokens, temperature, top_p, response_style):
+    # Translation
     if is_translation_request(message):
        try:
            translated = translator(message, src_lang="auto", tgt_lang="en")[0]["translation_text"]
            chat_history.append({"role": "user", "content": message})
            chat_history.append({"role": "assistant", "content": translated})
-           yield "", chat_history
-           return
+           return "", chat_history
        except Exception as e:
            chat_history.append({"role": "user", "content": message})
            chat_history.append({"role": "assistant", "content": f"⚠️ Translation failed: {str(e)}"})
-           yield "", chat_history
-           return
+           return "", chat_history
 
     # Apply response style
     if response_style == "No explanation":
@@ -51,48 +71,33 @@ def stream_response(message, chat_history, system_message, max_tokens, temperatu
     elif response_style == "Detailed explanation":
         system_message += " Provide a thorough and detailed answer with reasoning and examples."
 
-    # Format chat for Vicuna (text-generation)
-    formatted_prompt = system_message + "\n\n"
+    # Format prompt
+    prompt = system_message + "\n\n"
     for turn in chat_history:
-        role = turn["role"]
-        content = turn["content"]
-        formatted_prompt += f"{role.capitalize()}: {content}\n"
-    formatted_prompt += f"User: {message}\nAssistant:"
+        prompt += f"{turn['role'].capitalize()}: {turn['content']}\n"
+    prompt += f"User: {message}\nAssistant:"
 
     # Append user turn first
     chat_history.append({"role": "user", "content": message})
     response = ""
-    chat_history.append({"role": "assistant", "content": ""})  # placeholder
 
     try:
-        for token in client.text_generation(
-            prompt=formatted_prompt,
+        output = generator(
+            prompt,
             max_new_tokens=max_tokens,
-            stream=True,
             temperature=temperature,
-            top_p=top_p,
-        ):
-            piece = ""
-            if isinstance(token, str):
-                piece = token
-            elif isinstance(token, dict):
-                if "token" in token and "text" in token["token"]:
-                    piece = token["token"]["text"]
-                elif "generated_text" in token:
-                    piece = token["generated_text"]
-
-            response += piece
-            chat_history[-1]["content"] = response
-            yield "", chat_history
+            top_p=top_p
+        )
+        response_text = output[0]["generated_text"].split("Assistant:")[-1].strip()
+        chat_history.append({"role": "assistant", "content": response_text})
    except Exception as e:
-        chat_history[-1]["content"] = f"⚠️ Error generating response: {str(e)}"
-        yield "", chat_history
+        chat_history.append({"role": "assistant", "content": f"⚠️ Error generating response: {str(e)}"})
 
-    yield "", chat_history
+    return "", chat_history
 
 # ---------------- UI ----------------
 with gr.Blocks(theme=gr.themes.Soft(primary_hue="violet", secondary_hue="pink")) as demo:
-    gr.Markdown("# 🦙 Vicuna-13B Chat + 🌍 Translator")
+    gr.Markdown("# Vicuna-13B Chat + 🌍 Translator")
 
     chatbot = gr.Chatbot(type="messages", height=500, show_copy_button=True, label="Chat Assistant")
 
@@ -113,12 +118,12 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="violet", secondary_hue="pink"))
     max_tokens = gr.Slider(32, 2048, value=MAX_NEW_TOKENS_DEFAULT, step=16, label="Max new tokens")
 
     # Events
-    send_btn.click(stream_response, [msg, chatbot, system_prompt, max_tokens, temperature, top_p, response_style], [msg, chatbot])
-    msg.submit(stream_response, [msg, chatbot, system_prompt, max_tokens, temperature, top_p, response_style], [msg, chatbot])
+    send_btn.click(chat_response, [msg, chatbot, system_prompt, max_tokens, temperature, top_p, response_style], [msg, chatbot])
+    msg.submit(chat_response, [msg, chatbot, system_prompt, max_tokens, temperature, top_p, response_style], [msg, chatbot])
     clear_btn.click(lambda: [], None, chatbot, queue=False)
 
     gr.Markdown("---")
-    gr.Markdown("🔗 Built with ❤️ using [Vicuna-13B](https://huggingface.co/TheBloke/vicuna-13b-1.1-HF) & [M2M100](https://huggingface.co/facebook/m2m100_418M).")
+    gr.Markdown("🔗 Built with ❤️ using [Vicuna-13B](https://huggingface.co/TheBloke/vicuna-13b-v1.3.0-GPTQ) & [M2M100](https://huggingface.co/facebook/m2m100_418M).")
 
 if __name__ == "__main__":
     demo.launch()
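A note on the prompt format: the builder above tags turns as `User:`/`Assistant:` (the result of `.capitalize()`), while Vicuna v1.x checkpoints were fine-tuned on FastChat's upper-case `USER:`/`ASSISTANT:` template; a mismatched template tends to degrade answers and makes the `split("Assistant:")` extraction brittle. A minimal sketch of a template-conformant builder, assuming FastChat's v1.1 conversation format applies to this GPTQ checkpoint (`build_vicuna_prompt` is a hypothetical helper, not part of this commit):

```python
# Sketch only: Vicuna v1.x expects upper-case role tags on single-line turns.
# build_vicuna_prompt is a hypothetical helper; the template is FastChat's
# published v1.1 format, assumed here to match this GPTQ checkpoint.
def build_vicuna_prompt(system_message: str, chat_history: list, message: str) -> str:
    prompt = system_message + "\n\n"
    for turn in chat_history:
        role = "USER" if turn["role"] == "user" else "ASSISTANT"
        prompt += f"{role}: {turn['content']}\n"
    prompt += f"USER: {message}\nASSISTANT:"
    return prompt
```

If this template is adopted, the answer extraction in `chat_response` would need to split on `"ASSISTANT:"` rather than `"Assistant:"`.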
 
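The translation branch is carried over unchanged, including `src_lang="auto"`. M2M100's tokenizer expects a concrete source-language code; `"auto"` is not among its language codes, so this call is likely to raise at runtime rather than auto-detect. A minimal sketch of one way to supply a real code, assuming the third-party `langdetect` package (an assumption; nothing in this Space installs or uses it):

```python
# Sketch only: detect the source language before calling the M2M100 pipeline.
# langdetect is an assumed extra dependency (pip install langdetect); M2M100
# itself has no "auto" source-language mode.
from langdetect import detect

def translate_to_english(translator, message: str) -> str:
    src = detect(message)  # ISO-639-1 code, e.g. "fr"; may fail on very short input
    return translator(message, src_lang=src, tgt_lang="en")[0]["translation_text"]
```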
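The switch from `InferenceClient.text_generation(stream=True)` to a blocking `pipeline(...)` call also drops token streaming, which is why `chat_response` returns once instead of yielding partial histories. If streaming is wanted back with the locally loaded model, transformers' `TextIteratorStreamer` is the usual route; a minimal sketch, assuming `model` and `tokenizer` are the GPTQ objects loaded above (`stream_generate` is a hypothetical helper):

```python
# Sketch only: stream decoded text chunks from a local model.
# `model` and `tokenizer` are assumed to be the objects loaded in app.py.
from threading import Thread
from transformers import TextIteratorStreamer

def stream_generate(prompt: str, max_new_tokens: int = 300):
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    Thread(target=model.generate,
           kwargs=dict(**inputs, streamer=streamer, max_new_tokens=max_new_tokens)).start()
    for piece in streamer:  # each iteration yields newly generated text
        yield piece
```

A Gradio handler could then accumulate these pieces and yield an updated chat history per chunk, restoring the previous streaming UI behavior.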