Update app.py
app.py CHANGED
@@ -1,34 +1,41 @@
 import gradio as gr
-import torch
 from transformers import AutoTokenizer
 from auto_gptq import AutoGPTQForCausalLM
 from transformers import pipeline
+import torch
+import os
 
 # ---------------- CONFIG ----------------
-
+MODEL_NAME = "TheBloke/vicuna-7b-1.1-HF"
+MODEL_TYPE = "GPTQ"  # AutoGPTQ model type
 TRANSLATOR_MODEL = "facebook/m2m100_418M"
 
 SYSTEM_PROMPT_DEFAULT = (
-    "You are Vicuna
+    "You are Vicuna 7B, a formal and polite AI assistant. "
     "Always respond formally and answer appropriately depending on the selected explanation style."
 )
 
-#
+# Defaults
 MAX_NEW_TOKENS_DEFAULT = 300
 TEMP_DEFAULT = 0.3
 TOP_P_DEFAULT = 0.9
 
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
-# ---------------- MODEL
-
+# ---------------- MODEL ----------------
+offload_dir = "./offload"  # Folder for offloading if VRAM is insufficient
+os.makedirs(offload_dir, exist_ok=True)
+
+print("Loading tokenizer and GPTQ model...")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
 model = AutoGPTQForCausalLM.from_quantized(
-
-    device=
+    MODEL_NAME,
+    device=DEVICE,
+    use_triton=False,
     use_safetensors=True,
-
+    offload_folder=offload_dir
 )
+print("Model loaded successfully!")
 
 # ---------------- TRANSLATOR ----------------
 translator = pipeline("translation", model=TRANSLATOR_MODEL)
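Note on the translator created in this hunk: later, unchanged code calls it with src_lang="auto", but M2M100 is documented to take explicit ISO language codes (e.g. "fr", "de") rather than automatic detection, so "auto" may be rejected by the tokenizer. A minimal sketch of the explicit-code call pattern, not part of this commit; the sample sentence and language codes are illustrative only:

from transformers import pipeline

# Illustrative only: same checkpoint, explicit source/target language codes.
translator = pipeline("translation", model="facebook/m2m100_418M")
result = translator("Bonjour tout le monde", src_lang="fr", tgt_lang="en")
print(result[0]["translation_text"])  # expected to print an English translation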
@@ -41,8 +48,12 @@ def is_translation_request(message: str) -> bool:
     non_ascii_ratio = sum(1 for c in message if ord(c) > 127) / max(len(message), 1)
     return non_ascii_ratio > 0.4
 
+def format_prompt(message, system_message):
+    return f"{system_message}\nUser: {message}\nAssistant:"
+
 # ---------------- CHAT FUNCTION ----------------
 def stream_response(message, chat_history, system_message, max_tokens, temperature, top_p, response_style):
+    # Translation handling
     if is_translation_request(message):
         try:
             translated = translator(message, src_lang="auto", tgt_lang="en")[0]["translation_text"]
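The new format_prompt helper replaces the removed loop (next hunk) that folded the whole chat_history into the prompt, so each request now carries only the system message and the current user turn. A small illustration of what it returns, with a made-up user message:

# Illustration only; the question below is hypothetical.
prompt = format_prompt("What is GPTQ quantization?", SYSTEM_PROMPT_DEFAULT)
print(prompt)
# You are Vicuna 7B, a formal and polite AI assistant. Always respond formally and answer appropriately depending on the selected explanation style.
# User: What is GPTQ quantization?
# Assistant: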
@@ -64,39 +75,33 @@ def stream_response(message, chat_history, system_message, max_tokens, temperatu
     elif response_style == "Detailed explanation":
         system_message += " Provide a thorough and detailed answer with reasoning and examples."
 
-
-    formatted_prompt = system_message + "\n\n"
-    for turn in chat_history:
-        formatted_prompt += f"{turn['role'].capitalize()}: {turn['content']}\n"
-    formatted_prompt += f"User: {message}\nAssistant:"
+    prompt = format_prompt(message, system_message)
 
-    # Append user turn
+    # Append user turn
     chat_history.append({"role": "user", "content": message})
-    response = ""
     chat_history.append({"role": "assistant", "content": ""})  # placeholder
 
     try:
-
-
-
-
-
-
-
-
-
-
+        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(DEVICE)
+        # Generate with streaming
+        with torch.no_grad():
+            output_ids = model.generate(
+                input_ids,
+                max_new_tokens=max_tokens,
+                do_sample=True,
+                temperature=temperature,
+                top_p=top_p
+            )
+        response = tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True)
         chat_history[-1]["content"] = response
         yield "", chat_history
     except Exception as e:
         chat_history[-1]["content"] = f"⚠️ Error generating response: {str(e)}"
         yield "", chat_history
 
-    yield "", chat_history
-
 # ---------------- UI ----------------
 with gr.Blocks(theme=gr.themes.Soft(primary_hue="violet", secondary_hue="pink")) as demo:
-    gr.Markdown("# 🦙 Vicuna-
+    gr.Markdown("# 🦙 Vicuna-7B Chat + 🌐 Translator")
 
     chatbot = gr.Chatbot(type="messages", height=500, show_copy_button=True, label="Chat Assistant")
 
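The new generation block above is commented "# Generate with streaming", but it decodes the full output and yields only once. If token-by-token streaming is wanted, transformers' TextIteratorStreamer could be dropped into the same try block. This is only a sketch, not part of the commit, and it assumes the locals defined in the hunk above (tokenizer, model, input_ids, max_tokens, temperature, top_p, chat_history):

from threading import Thread
from transformers import TextIteratorStreamer

# Sketch: run generate() in a background thread and stream partial text
# into the placeholder assistant message as it arrives.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = dict(
    input_ids=input_ids,
    max_new_tokens=max_tokens,
    do_sample=True,
    temperature=temperature,
    top_p=top_p,
    streamer=streamer,
)
Thread(target=model.generate, kwargs=generation_kwargs).start()

response = ""
for new_text in streamer:
    response += new_text
    chat_history[-1]["content"] = response
    yield "", chat_history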
@@ -122,7 +127,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="violet", secondary_hue="pink"))
     clear_btn.click(lambda: [], None, chatbot, queue=False)
 
     gr.Markdown("---")
-    gr.Markdown("🌐 Built with ❤️ using [Vicuna-
+    gr.Markdown("🌐 Built with ❤️ using [Vicuna-7B](https://huggingface.co/TheBloke/vicuna-7b-1.1-HF) & [M2M100](https://huggingface.co/facebook/m2m100_418M).")
 
 if __name__ == "__main__":
     demo.launch()
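Since stream_response is a generator, older Gradio versions need the queue enabled for the yielded chatbot updates to stream to the UI; recent releases enable it by default. An optional tweak (not part of this commit) if streamed updates do not appear:

if __name__ == "__main__":
    demo.queue()  # needed on older Gradio releases for generator (streaming) outputs
    demo.launch()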