use proper models
- __pycache__/app.cpython-312.pyc +0 -0
- app.py +100 -26
__pycache__/app.cpython-312.pyc
ADDED
Binary file (28.3 kB).
app.py
CHANGED
@@ -40,6 +40,12 @@ cancel_event = threading.Event()
 # Torch-Compatible Model Definitions with Adjusted Descriptions
 # ------------------------------
 MODELS = {
+    # Accessible models (no gating required)
+    "Qwen2.5-3B-Instruct": {"repo_id": "Qwen/Qwen2.5-3B-Instruct", "description": "Qwen2.5-3B-Instruct - accessible and reliable"},
+    "Qwen2.5-7B-Instruct": {"repo_id": "Qwen/Qwen2.5-7B-Instruct", "description": "Qwen2.5-7B-Instruct - accessible and reliable"},
+    "microsoft-DialoGPT-medium": {"repo_id": "microsoft/DialoGPT-medium", "description": "Microsoft DialoGPT Medium - accessible conversational model"},
+    "microsoft-DialoGPT-large": {"repo_id": "microsoft/DialoGPT-large", "description": "Microsoft DialoGPT Large - accessible conversational model"},
+
     # … your existing entries …
     "gpt-oss-20b": {"repo_id": "openai/gpt-oss-20b", "description": "openai/gpt-oss-20b"},
     "Qwen2.5-Taiwan-1.5B-Instruct": {"repo_id": "benchang1110/Qwen2.5-Taiwan-1.5B-Instruct", "description": "Qwen2.5-Taiwan-1.5B-Instruct"},
@@ -51,9 +57,9 @@ MODELS = {
         "repo_id":"lianghsun/Gemma-3-Taiwan-270M-it",
         "description": "google/gemma-3-270m-it fintuned on Taiwan Chinese dataset"
     },
-    "gemma-
-        "repo_id":"google/gemma-
-        "description":"Gemma
+    "gemma-2-2b-it":{
+        "repo_id":"google/gemma-2-2b-it",
+        "description":"Gemma 2 2B Instruction-Tuned model - accessible alternative to Gemma 3",
     },
     "SmolLM-135M-Taiwan-Instruct-v1.0": {
         "repo_id": "benchang1110/SmolLM-135M-Taiwan-Instruct-v1.0",
@@ -143,6 +149,33 @@ MODELS = {
 # Global cache for pipelines to avoid re-loading.
 PIPELINES = {}
 
+def check_model_accessibility(repo_id, token=None):
+    """
+    Check if a model is accessible without actually loading it.
+    Returns True if accessible, False if gated, raises exception for other errors.
+    """
+    try:
+        from huggingface_hub import HfApi
+        api = HfApi(token=token)
+        model_info = api.model_info(repo_id)
+
+        # Check if model is gated
+        if hasattr(model_info, 'gated') and model_info.gated:
+            logger.warning(f"⚠️ Model {repo_id} is gated and requires special access")
+            return False
+
+        logger.info(f"✅ Model {repo_id} is accessible")
+        return True
+
+    except Exception as e:
+        error_msg = str(e)
+        if "gated" in error_msg.lower() or "401" in error_msg or "access" in error_msg.lower():
+            logger.warning(f"⚠️ Model {repo_id} appears to be gated or requires access")
+            return False
+        else:
+            logger.error(f"❌ Error checking model accessibility: {e}")
+            raise
+
 def load_pipeline(model_name):
     """
     Load and cache a transformers pipeline for text generation.
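The helper added here relies on `huggingface_hub.HfApi.model_info`, whose result carries a `gated` field (typically `False`/`None` for public repos and a string such as `"auto"` or `"manual"` for gated ones), which is why a plain truthiness test is enough. A minimal standalone sketch of the same probe; the repo ids and the `HF_TOKEN` variable are examples, not part of the commit:

```python
# Standalone sketch of the gated-model probe used by check_model_accessibility().
import os
from huggingface_hub import HfApi

api = HfApi(token=os.environ.get("HF_TOKEN"))  # token only needed for private/authorized access

for repo_id in ["Qwen/Qwen2.5-3B-Instruct", "google/gemma-3-270m-it"]:  # example ids
    info = api.model_info(repo_id)
    # `gated` is False/None for public models, "auto" or "manual" for gated ones,
    # so any truthy value means the user has to request access first.
    print(repo_id, "needs access request:", bool(getattr(info, "gated", False)))
```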
@@ -159,6 +192,16 @@ def load_pipeline(model_name):
     repo = MODELS[model_name]["repo_id"]
     logger.info(f"📦 Repository: {repo}")
 
+    # Check model accessibility first
+    try:
+        if not check_model_accessibility(repo, access_token):
+            raise Exception(f"Model {repo} is gated and requires special access. Please request access at https://huggingface.co/{repo} or choose a different model.")
+    except Exception as e:
+        if "gated" in str(e).lower() or "access" in str(e).lower():
+            raise
+        else:
+            logger.warning(f"⚠️ Could not check model accessibility, proceeding with load attempt: {e}")
+
     # Load tokenizer
     logger.info(f"π€ Loading tokenizer for {repo}...")
     try:
@@ -166,8 +209,15 @@ def load_pipeline(model_name):
                                           token=access_token if access_token else None)
         logger.info(f"✅ Tokenizer loaded successfully")
     except Exception as e:
-        logger.error(f"❌ Failed to load tokenizer: {e}")
-        raise
+        error_msg = str(e)
+        if "gated repo" in error_msg or "401" in error_msg or "Access to model" in error_msg:
+            logger.error(f"❌ Model {repo} is gated and requires special access permissions")
+            logger.error(f"💡 Please visit https://huggingface.co/{repo} to request access")
+            logger.error(f"💡 Or try a different model from the list")
+            raise Exception(f"Model {repo} requires special access. Please request access at https://huggingface.co/{repo} or choose a different model.")
+        else:
+            logger.error(f"❌ Failed to load tokenizer: {e}")
+            raise
 
     # Try different data types for optimal performance
     dtypes_to_try = [
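A side note on the substring checks ("gated repo", "401", "Access to model"): huggingface_hub also exposes a dedicated exception type for this case, which callers can catch instead of parsing messages. A hedged sketch of that alternative follows; it is not what the commit does, and the import location has moved between huggingface_hub releases:

```python
# Alternative sketch: detect gated repos via huggingface_hub's exception type
# rather than matching substrings of the error message.
try:
    from huggingface_hub.errors import GatedRepoError   # newer releases
except ImportError:
    from huggingface_hub.utils import GatedRepoError    # older releases

from huggingface_hub import hf_hub_download

def can_fetch_config(repo_id, token=None):
    """Return False when the repo is gated for this token, True when a download succeeds."""
    try:
        hf_hub_download(repo_id, "config.json", token=token)
        return True
    except GatedRepoError:
        return False
```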
@@ -195,8 +245,14 @@ def load_pipeline(model_name):
             return pipe
 
         except Exception as e:
-            logger.warning(f"⚠️ Failed to load with {dtype_desc}: {e}")
-            continue
+            error_msg = str(e)
+            if "gated repo" in error_msg or "401" in error_msg or "Access to model" in error_msg:
+                logger.error(f"❌ Model {repo} is gated and requires special access permissions")
+                logger.error(f"💡 Please visit https://huggingface.co/{repo} to request access")
+                raise Exception(f"Model {repo} requires special access. Please request access at https://huggingface.co/{repo} or choose a different model.")
+            else:
+                logger.warning(f"⚠️ Failed to load with {dtype_desc}: {e}")
+                continue
 
     # Final fallback without specific dtype
     logger.warning(f"π Attempting final fallback load without specific dtype...")
@@ -243,13 +299,21 @@ def retrieve_context(query, max_results=6, max_chars=600):
         return []
 
 def format_conversation(history, system_prompt, tokenizer):
+    # Convert Gradio tuple format to message format for tokenizer
+    messages = [{"role": "system", "content": system_prompt.strip()}]
+
+    for user_msg, bot_msg in history:
+        if user_msg:  # Add user message
+            messages.append({"role": "user", "content": user_msg})
+        if bot_msg:  # Add bot message
+            messages.append({"role": "assistant", "content": bot_msg})
+
     if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
-        messages = [{"role": "system", "content": system_prompt.strip()}] + history
         return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=True)
     else:
         # Fallback for base LMs without chat template
         prompt = system_prompt.strip() + "\n"
-        for msg in history:
+        for msg in messages[1:]:  # Skip system message
             if msg['role'] == 'user':
                 prompt += "User: " + msg['content'].strip() + "\n"
             elif msg['role'] == 'assistant':
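The rewritten `format_conversation` converts Gradio's `(user, bot)` tuple history into the role/content list that `apply_chat_template` expects. A small illustration of the conversion; the tokenizer repo is only an example:

```python
# Illustration of the tuple -> messages conversion performed in format_conversation().
from transformers import AutoTokenizer

history = [("Hello!", "Hi, how can I help?"), ("What is Gradio?", None)]

messages = [{"role": "system", "content": "You are a helpful assistant."}]
for user_msg, bot_msg in history:
    if user_msg:
        messages.append({"role": "user", "content": user_msg})
    if bot_msg:
        messages.append({"role": "assistant", "content": bot_msg})

# A chat-template tokenizer then renders the messages into the model's prompt format.
tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")  # example repo
print(tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
```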
@@ -273,9 +337,18 @@ def chat_response(user_msg, chat_history, system_prompt,
     logger.info(f"π Web search enabled: {enable_search}")
     logger.info(f"⚙️ Generation params: max_tokens={max_tokens}, temp={temperature}, top_k={top_k}, top_p={top_p}")
 
+    # Validate inputs
+    if not user_msg or not user_msg.strip():
+        logger.error("❌ Empty user message received")
+        return [], "Error: Empty message received"
+
+    if model_name not in MODELS:
+        logger.error(f"❌ Invalid model name: {model_name}")
+        return [], f"Error: Invalid model '{model_name}'"
+
     cancel_event.clear()
     history = list(chat_history or [])
-    history.append(
+    history.append((user_msg, None))  # Add user message, bot response will be added later
     logger.info(f"π Chat history length: {len(history)} messages")
 
     # Launch web search if enabled
@@ -404,25 +477,21 @@ def chat_response(user_msg, chat_history, system_prompt,
                 logger.info("π Detected thinking block start")
                 in_thought = True
                 # Insert thought placeholder
-                history.append({
-                    'role': 'assistant',
-                    'content': '',
-                    'metadata': {'title': 'π Thought'}
-                })
+                history.append((None, "π Thinking..."))
                 # Capture after opening tag
                 after = text.split('<think>', 1)[1]
                 thought_buf += after
                 # If closing tag in same chunk
                 if '</think>' in thought_buf:
                     before, after2 = thought_buf.split('</think>', 1)
-                    history[-1]
+                    history[-1] = (None, f"π {before.strip()}")
                     in_thought = False
                     logger.info("π Thinking block completed, starting answer")
                     # Start answer buffer
                     answer_buf = after2
-                    history.append(
+                    history.append((None, answer_buf))
                 else:
-                    history[-1]
+                    history[-1] = (None, f"π {thought_buf}")
                 yield history, debug
                 continue
 
@@ -431,23 +500,23 @@ def chat_response(user_msg, chat_history, system_prompt,
                 thought_buf += text
                 if '</think>' in thought_buf:
                     before, after2 = thought_buf.split('</think>', 1)
-                    history[-1]
+                    history[-1] = (None, f"π {before.strip()}")
                     in_thought = False
                     logger.info("π Thinking block completed, starting answer")
                     # Start answer buffer
                     answer_buf = after2
-                    history.append(
+                    history.append((None, answer_buf))
                 else:
-                    history[-1]
+                    history[-1] = (None, f"π {thought_buf}")
                 yield history, debug
                 continue
 
             # Stream answer
             if not answer_buf:
                 logger.info("π Starting answer generation")
-                history.append(
+                history.append((None, ''))
             answer_buf += text
-            history[-1]
+            history[-1] = (None, answer_buf)
             yield history, debug
 
     gen_thread.join()
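The two hunks above implement a small state machine: text between `<think>` and `</think>` goes into a "thought" entry, and everything after the closing tag becomes the streamed answer. A condensed, framework-free sketch of the same splitting over streamed chunks (the function name is illustrative, not from app.py):

```python
# Condensed sketch of the <think>...</think> splitting done in the streaming loop above.
def split_stream(chunks):
    in_thought, thought, answer = False, "", ""
    for text in chunks:
        if not in_thought and "<think>" in text:
            in_thought = True
            text = text.split("<think>", 1)[1]   # keep only what follows the opening tag
        if in_thought:
            if "</think>" in text:
                before, after = text.split("</think>", 1)
                thought += before
                answer += after
                in_thought = False
            else:
                thought += text
        else:
            answer += text
    return thought.strip(), answer.strip()

# Example: chunks as a token streamer might yield them.
print(split_stream(["<think>plan the ", "reply</think>", "Hello there!"]))
# -> ('plan the reply', 'Hello there!')
```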
@@ -455,7 +524,7 @@ def chat_response(user_msg, chat_history, system_prompt,
         yield history, debug + prompt_debug
     except Exception as e:
         logger.error(f"❌ Error during generation: {e}")
-        history.append(
+        history.append((None, f"Error: {e}"))
         yield history, debug
     finally:
         logger.info("🧹 Cleaning up memory...")
@@ -478,6 +547,7 @@ def update_default_prompt(enable_search):
 with gr.Blocks(title="LLM Inference") as demo:
     gr.Markdown("## 🧠 LLM Inference with Web Search")
     gr.Markdown("Interact with the model. Select parameters and chat below.")
+    gr.Markdown("💡 **Tip**: If you get access errors, try models like 'Qwen2.5-3B-Instruct' or 'microsoft-DialoGPT-medium' which are publicly accessible.")
     with gr.Row():
         with gr.Column(scale=3):
             model_dd = gr.Dropdown(label="Select Model", choices=list(MODELS.keys()), value=list(MODELS.keys())[0])
@@ -496,7 +566,7 @@ with gr.Blocks(title="LLM Inference") as demo:
             clr = gr.Button("Clear Chat")
             cnl = gr.Button("Cancel Generation")
         with gr.Column(scale=7):
-            chat = gr.Chatbot(
+            chat = gr.Chatbot()
             txt = gr.Textbox(placeholder="Type your message and press Enter...")
             dbg = gr.Markdown()
 
@@ -508,4 +578,8 @@ with gr.Blocks(title="LLM Inference") as demo:
                           model_dd, max_tok, temp, k, p, rp, st],
                outputs=[chat, dbg])
 logger.info("π Starting Gradio application...")
-
+try:
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
+except Exception as e:
+    logger.error(f"❌ Failed to launch Gradio app: {e}")
+    raise
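The UI tip about gated models suggests an obvious caller-side pattern: fall back to one of the public entries when `load_pipeline` raises the "requires special access" error. A sketch using names defined in app.py (`load_pipeline`, `logger`); the fallback choice itself is an assumption, not something this commit implements:

```python
# Caller-side fallback sketch: retry with a public model when a gated repo is selected.
# Relies on load_pipeline() and logger from app.py; the fallback model is an assumption.
FALLBACK_MODEL = "Qwen2.5-3B-Instruct"

def load_pipeline_with_fallback(model_name):
    try:
        return load_pipeline(model_name)
    except Exception as e:
        if "requires special access" in str(e) and model_name != FALLBACK_MODEL:
            logger.warning(f"Falling back from {model_name} to {FALLBACK_MODEL}: {e}")
            return load_pipeline(FALLBACK_MODEL)
        raise
```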