Update app.py

app.py CHANGED
```diff
@@ -4,23 +4,22 @@ from transformers import pipeline
 
 # ---------------- CONFIG ----------------
 MODEL_REPO = "HuggingFaceH4/zephyr-7b-beta"
-TRANSLATOR_MODEL = "facebook/m2m100_418M"
+TRANSLATOR_MODEL = "facebook/m2m100_418M"
 
 SYSTEM_PROMPT_DEFAULT = (
     "You are Zephyr, a concise and polite AI assistant. "
-    "Always respond
+    "Always respond formally and answer appropriately depending on the selected explanation style."
 )
 
+# ✅ Optimized defaults
+MAX_NEW_TOKENS_DEFAULT = 300
+TEMP_DEFAULT = 0.3
+TOP_P_DEFAULT = 0.9
 
 # Clients
 client = InferenceClient(MODEL_REPO)
 translator = pipeline("translation", model=TRANSLATOR_MODEL)
 
 # ---------------- HELPERS ----------------
 def is_translation_request(message: str) -> bool:
     triggers = ["translate", "traduce", "ترجم", "traduire", "übersetze"]
@@ -29,10 +28,8 @@ def is_translation_request(message: str) -> bool:
     non_ascii_ratio = sum(1 for c in message if ord(c) > 127) / max(len(message), 1)
     return non_ascii_ratio > 0.4
 
 # ---------------- CHAT FUNCTION ----------------
 def stream_response(message, chat_history, system_message, max_tokens, temperature, top_p, response_style):
-    # --- Translation handling ---
     if is_translation_request(message):
         try:
             translated = translator(message, src_lang="auto", tgt_lang="en")[0]["translation_text"]
@@ -46,43 +43,40 @@ def stream_response(message, chat_history, system_message, max_tokens, temperature, top_p, response_style):
             yield "", chat_history
             return
 
-    #
+    # Apply response style
     if response_style == "No explanation":
+        system_message += " Only provide the direct answer with no explanation."
     elif response_style == "Short explanation":
-    # Only keep the last N messages to prevent repetition
-    truncated_history = chat_history[-MAX_HISTORY_MESSAGES:]
-    messages = [{"role": "system", "content": system_message + style_prompt}] + truncated_history
+        system_message += " Provide a concise answer with a one-sentence explanation."
+    elif response_style == "Detailed explanation":
+        system_message += " Provide a thorough and detailed answer with reasoning and examples."
+
+    messages = [{"role": "system", "content": system_message}] + chat_history
     messages.append({"role": "user", "content": message})
 
-    # Append user
+    # Append user first
     chat_history.append({"role": "user", "content": message})
-    chat_history.append({"role": "assistant", "content": ""})
     response = ""
+    chat_history.append({"role": "assistant", "content": ""})  # placeholder
+
+    try:
+        for msg in client.chat_completion(
+            messages,
+            max_tokens=max_tokens,
+            stream=True,
+            temperature=temperature,
+            top_p=top_p,
+        ):
+            token = msg.choices[0].delta.content or ""
+            response += token
+            chat_history[-1]["content"] = response
+            yield "", chat_history
+    except Exception as e:
+        chat_history[-1]["content"] = f"⚠️ Error generating response: {str(e)}"
     yield "", chat_history
 
-    # Clear input box after streaming
     yield "", chat_history
 
 # ---------------- UI ----------------
 with gr.Blocks(theme=gr.themes.Soft(primary_hue="violet", secondary_hue="pink")) as demo:
     gr.Markdown("# 🤖 Zephyr-7B Chat + 🌍 Translator")
@@ -98,24 +92,16 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="violet", secondary_hue="pink")) as demo:
         system_prompt = gr.Textbox(label="System Prompt", value=SYSTEM_PROMPT_DEFAULT, lines=3)
         response_style = gr.Dropdown(
             ["No explanation", "Short explanation", "Detailed explanation"],
-            value="
+            value="Detailed explanation",
            label="Response Style"
         )
         temperature = gr.Slider(0.1, 1.5, value=TEMP_DEFAULT, step=0.1, label="Temperature")
         top_p = gr.Slider(0.1, 1.0, value=TOP_P_DEFAULT, step=0.05, label="Top-p")
-        max_tokens = gr.Slider(
-    #
-    send_btn.click(
-        stream_response,
-        [msg, chatbot, system_prompt, max_tokens, temperature, top_p, response_style],
-        [msg, chatbot]
-    )
-    msg.submit(
-        stream_response,
-        [msg, chatbot, system_prompt, max_tokens, temperature, top_p, response_style],
-        [msg, chatbot]
-    )
+        max_tokens = gr.Slider(32, 2048, value=MAX_NEW_TOKENS_DEFAULT, step=16, label="Max new tokens")
+
+    # Events
+    send_btn.click(stream_response, [msg, chatbot, system_prompt, max_tokens, temperature, top_p, response_style], [msg, chatbot])
+    msg.submit(stream_response, [msg, chatbot, system_prompt, max_tokens, temperature, top_p, response_style], [msg, chatbot])
     clear_btn.click(lambda: [], None, chatbot, queue=False)
 
     gr.Markdown("---")
```
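The hunks above elide the middle of `is_translation_request`; only the `triggers` list and the non-ASCII-ratio fallback are visible. A plausible self-contained reading follows, with the elided trigger check marked as an assumption:

```python
def is_translation_request(message: str) -> bool:
    # Visible in the diff: multilingual trigger words for "translate".
    triggers = ["translate", "traduce", "ترجم", "traduire", "übersetze"]
    # Assumed (elided in the diff): any message containing a trigger word
    # is treated as a translation request.
    if any(t in message.lower() for t in triggers):
        return True
    # Visible in the diff: otherwise fall back to the share of non-ASCII
    # characters, guarding against division by zero on an empty message.
    non_ascii_ratio = sum(1 for c in message if ord(c) > 127) / max(len(message), 1)
    return non_ascii_ratio > 0.4
```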
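One caveat on the translation path: `src_lang="auto"` in the `translator(...)` call assumes automatic source-language detection, which M2M100's tokenizer does not document; it normally expects explicit ISO 639-1 codes. A sketch of the explicit form:

```python
# M2M100 takes explicit source/target language codes rather than "auto";
# French -> English shown here as an illustration.
from transformers import pipeline

translator = pipeline("translation", model="facebook/m2m100_418M")
out = translator("Bonjour tout le monde", src_lang="fr", tgt_lang="en")
print(out[0]["translation_text"])  # e.g. "Hello everyone"
```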
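For reference, the updated `stream_response` is a generator that yields `(textbox_value, chat_history)` pairs: the first element stays `""` so Gradio clears the input box, while the last history entry grows as tokens stream in. A minimal sketch of driving it outside the UI, assuming the file above is saved as `app.py`:

```python
# Hypothetical smoke test: consume the generator directly, no Gradio needed.
from app import stream_response, SYSTEM_PROMPT_DEFAULT

history = []
final_history = history
for _, final_history in stream_response(
    "What is the capital of France?",
    history,
    SYSTEM_PROMPT_DEFAULT,
    max_tokens=300,       # MAX_NEW_TOKENS_DEFAULT
    temperature=0.3,      # TEMP_DEFAULT
    top_p=0.9,            # TOP_P_DEFAULT
    response_style="No explanation",
):
    pass  # in the Space, each yield repaints the Chatbot component

print(final_history[-1]["content"])  # the fully streamed assistant reply
```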