Update app.py
app.py
CHANGED
@@ -1,109 +1,69 @@
  import gradio as gr
- from transformers import AutoTokenizer
- from auto_gptq import AutoGPTQForCausalLM
- from transformers import pipeline
- import torch
- import os

  # ---------------- CONFIG ----------------
- MODEL_NAME = "
- MODEL_TYPE = "GPTQ"  # AutoGPTQ model type
- TRANSLATOR_MODEL = "facebook/m2m100_418M"
-
  SYSTEM_PROMPT_DEFAULT = (
-     "You are
-     "Always respond
  )

-
- MAX_NEW_TOKENS_DEFAULT = 300
  TEMP_DEFAULT = 0.3
  TOP_P_DEFAULT = 0.9

- [several removed lines not captured in this view]
- model = AutoGPTQForCausalLM.from_quantized(
-     MODEL_NAME,
-     device=DEVICE,
-     use_triton=False,
-     use_safetensors=True,
-     offload_folder=offload_dir
  )
- print("Model loaded successfully!")
-
- # ---------------- TRANSLATOR ----------------
- translator = pipeline("translation", model=TRANSLATOR_MODEL)

  # ---------------- HELPERS ----------------
- def
- [several removed lines not captured in this view]
- def format_prompt(message, system_message):
-     return f"{system_message}\nUser: {message}\nAssistant:"
-
- # ---------------- CHAT FUNCTION ----------------
- def stream_response(message, chat_history, system_message, max_tokens, temperature, top_p, response_style):
-     # Translation handling
-     if is_translation_request(message):
-         try:
-             translated = translator(message, src_lang="auto", tgt_lang="en")[0]["translation_text"]
-             chat_history.append({"role": "user", "content": message})
-             chat_history.append({"role": "assistant", "content": translated})
-             yield "", chat_history
-             return
-         except Exception as e:
-             chat_history.append({"role": "user", "content": message})
-             chat_history.append({"role": "assistant", "content": f"⚠️ Translation failed: {str(e)}"})
-             yield "", chat_history
-             return
-
-     # Apply response style
      if response_style == "No explanation":
-
      elif response_style == "Short explanation":
-
      elif response_style == "Detailed explanation":
-
-
-     prompt

-
-
-     chat_history
- [several removed lines not captured in this view]
-     yield "", chat_history

  # ---------------- UI ----------------
  with gr.Blocks(theme=gr.themes.Soft(primary_hue="violet", secondary_hue="pink")) as demo:
-     gr.Markdown("#

-     chatbot = gr.Chatbot(type="messages", height=500, show_copy_button=True

      with gr.Row():
          msg = gr.Textbox(label="💬 Your Message", placeholder="Type here…", scale=6)
@@ -121,13 +81,17 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="violet", secondary_hue="pink"))
          top_p = gr.Slider(0.1, 1.0, value=TOP_P_DEFAULT, step=0.05, label="Top-p")
          max_tokens = gr.Slider(32, 2048, value=MAX_NEW_TOKENS_DEFAULT, step=16, label="Max new tokens")

- [several removed lines not captured in this view]
      clear_btn.click(lambda: [], None, chatbot, queue=False)

-     gr.Markdown("---")
-     gr.Markdown("🔗 Built with ❤️ using [Vicuna-7B](https://huggingface.co/TheBloke/vicuna-7b-1.1-HF) & [M2M100](https://huggingface.co/facebook/m2m100_418M).")
-
  if __name__ == "__main__":
      demo.launch()
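Note on the removed translation path: the old handler called the M2M100 translation pipeline with src_lang="auto", but as far as I can tell M2M100 does not detect the source language itself, so the call normally needs explicit source and target language codes. A minimal sketch of the usual invocation, with illustrative language codes that are not taken from the original code:

from transformers import pipeline

# facebook/m2m100_418M is the checkpoint the removed code referenced.
translator = pipeline("translation", model="facebook/m2m100_418M")

# M2M100 expects explicit ISO 639-1 codes; French -> English here is just an example.
result = translator("Bonjour tout le monde", src_lang="fr", tgt_lang="en")
print(result[0]["translation_text"])

The new version of app.py follows.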
  import gradio as gr
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

  # ---------------- CONFIG ----------------
+ MODEL_NAME = "google/gemma-2b-it-pytorch"  # instruction-tuned Gemma 2B
  SYSTEM_PROMPT_DEFAULT = (
+     "You are a formal and polite AI assistant. "
+     "Always respond appropriately depending on the selected explanation style."
  )

+ MAX_NEW_TOKENS_DEFAULT = 256
  TEMP_DEFAULT = 0.3
  TOP_P_DEFAULT = 0.9

+ # ---------------- LOAD MODEL ----------------
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+ model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+ generator = pipeline(
+     "text-generation",
+     model=model,
+     tokenizer=tokenizer,
+     device=0 if model.device.type == "cuda" else -1,
  )

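Two hedged notes on the load step above. First, the transformers-format, instruction-tuned Gemma 2B checkpoint is published as google/gemma-2b-it; if the "-pytorch" variant used here is the raw checkpoint for the standalone Gemma runner, AutoModelForCausalLM.from_pretrained may not load it, so the repo id is worth double-checking. Second, a full-precision 2B model is heavy for a basic CPU Space; one alternative (an assumption about the deployment, not what this commit does) is to load the weights in bfloat16 and let accelerate place them:

import torch
from transformers import AutoModelForCausalLM

# Alternative load path; assumes `accelerate` is installed for device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b-it",        # transformers-format instruction-tuned checkpoint
    torch_dtype=torch.bfloat16,  # roughly halves memory versus float32
    device_map="auto",           # GPU if available, otherwise CPU
)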
  # ---------------- HELPERS ----------------
+ def format_prompt(chat_history, user_message, system_message, response_style):
+     prompt = system_message + "\n\n"
+     for turn in chat_history:
+         prompt += f"{turn['role'].capitalize()}: {turn['content']}\n"
+     prompt += f"User: {user_message}\nAssistant:"
+
      if response_style == "No explanation":
+         prompt += " Only provide the direct answer with no explanation."
      elif response_style == "Short explanation":
+         prompt += " Provide a concise answer with a one-sentence explanation."
      elif response_style == "Detailed explanation":
+         prompt += " Provide a thorough and detailed answer with reasoning and examples."
+
+     return prompt

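For reference, this is roughly what the prompt built by format_prompt looks like for a one-turn history. The history and question are made up; note that the style hint ends up after the trailing "Assistant:" marker, so it reads as part of the assistant turn:

history = [
    {"role": "user", "content": "What is 2 + 2?"},
    {"role": "assistant", "content": "4."},
]

prompt = format_prompt(history, "And 3 + 3?", SYSTEM_PROMPT_DEFAULT, "No explanation")
print(prompt)
# You are a formal and polite AI assistant. Always respond appropriately depending on the selected explanation style.
#
# User: What is 2 + 2?
# Assistant: 4.
# User: And 3 + 3?
# Assistant: Only provide the direct answer with no explanation.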
+ # ---------------- CHAT FUNCTION ----------------
+ def chat(user_message, chat_history, system_message, max_tokens, temperature, top_p, response_style):
+     chat_history = chat_history or []
+     prompt = format_prompt(chat_history, user_message, system_message, response_style)
+
+     output = generator(
+         prompt,
+         max_new_tokens=max_tokens,
+         do_sample=True,
+         temperature=temperature,
+         top_p=top_p,
+     )[0]['generated_text']
+
+     # Remove prompt part from output
+     response = output[len(prompt):].strip()
+
+     chat_history.append({"role": "user", "content": user_message})
+     chat_history.append({"role": "assistant", "content": response})
+
+     return "", chat_history

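Stripping the prompt with output[len(prompt):] works because the text-generation pipeline echoes the prompt by default. If you would rather let the pipeline do it, return_full_text=False makes it return only the newly generated continuation; a sketch using the same generator and sampling settings as above:

out = generator(
    prompt,
    max_new_tokens=max_tokens,
    do_sample=True,
    temperature=temperature,
    top_p=top_p,
    return_full_text=False,  # return only the continuation, not prompt + continuation
)
response = out[0]["generated_text"].strip()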
  # ---------------- UI ----------------
  with gr.Blocks(theme=gr.themes.Soft(primary_hue="violet", secondary_hue="pink")) as demo:
+     gr.Markdown("# 🧠 Gemma 2B Chat Assistant")

+     chatbot = gr.Chatbot(type="messages", height=500, show_copy_button=True)

      with gr.Row():
          msg = gr.Textbox(label="💬 Your Message", placeholder="Type here…", scale=6)

  [unchanged lines not shown in the diff view]

          top_p = gr.Slider(0.1, 1.0, value=TOP_P_DEFAULT, step=0.05, label="Top-p")
          max_tokens = gr.Slider(32, 2048, value=MAX_NEW_TOKENS_DEFAULT, step=16, label="Max new tokens")

+     send_btn.click(
+         chat,
+         [msg, chatbot, system_prompt, max_tokens, temperature, top_p, response_style],
+         [msg, chatbot]
+     )
+     msg.submit(
+         chat,
+         [msg, chatbot, system_prompt, max_tokens, temperature, top_p, response_style],
+         [msg, chatbot]
+     )
      clear_btn.click(lambda: [], None, chatbot, queue=False)

  if __name__ == "__main__":
      demo.launch()
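One optional addition for a public Space (not part of this commit): enabling Gradio's request queue so simultaneous users are processed in turn against the single loaded model instead of racing each other. A minimal sketch:

if __name__ == "__main__":
    demo.queue()   # queue incoming events; generation requests run one after another
    demo.launch()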