luminoussg committed
Commit c1e5d4c · verified · 1 Parent(s): edb32fe

Update app.py

Files changed (1)
  1. app.py +56 -59
app.py CHANGED
@@ -1,85 +1,82 @@
-import os
 import gradio as gr
+import os
 import requests
-import json
+import threading
+from typing import List, Dict, Any
 
-# Get the Hugging Face API key from Spaces secrets.
+# Get the Hugging Face API key from Spaces secrets
 HF_API_KEY = os.getenv("HF_API_KEY")
 
-# Model endpoints on Hugging Face
+# Model endpoints configuration
 MODEL_ENDPOINTS = {
     "Qwen2.5-72B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-72B-Instruct",
     "Llama3.3-70B-Instruct": "https://api-inference.huggingface.co/models/meta-llama/Llama-3.3-70B-Instruct",
     "Qwen2.5-Coder-32B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-Coder-32B-Instruct",
 }
 
-# System prompts for each model
-SYSTEM_PROMPTS = {
-    "Qwen2.5-72B-Instruct": "System: You are a knowledgeable assistant for general inquiries.",
-    "Llama3.3-70B-Instruct": "System: You are a research expert assistant specialized in in-depth analysis.",
-    "Qwen2.5-Coder-32B-Instruct": "System: You are a coding expert who helps with code-related tasks.",
-}
-
-def query_model(prompt, model_endpoint, system_prompt):
+def query_model(model_name: str, messages: List[Dict[str, str]]) -> str:
+    """Query a single model with the chat history"""
+    endpoint = MODEL_ENDPOINTS[model_name]
     headers = {
         "Authorization": f"Bearer {HF_API_KEY}",
-        "Content-Type": "application/json",
-        "Accept": "application/json"
+        "Content-Type": "application/json"
     }
-    # Format the prompt to include the system instruction and structure the conversation.
-    formatted_prompt = f"{system_prompt}\nUser: {prompt}\nAssistant:"
 
-    # Include the stop sequence so generation halts when the next user turn starts.
-    data = {
-        "inputs": formatted_prompt,
+    # Format the prompt according to each model's requirements
+    prompt = "\n".join([f"{msg['role']}: {msg['content']}" for msg in messages])
+
+    payload = {
+        "inputs": prompt,
         "parameters": {
-            "max_new_tokens": 512,
-            "temperature": 0.6,
-            "stop_sequences": ["\nUser:"]
+            "max_tokens": 1024,
+            "temperature": 0.7,
+            "stop_sequences": ["\nUser:", "\nAssistant:", "###"]
         }
     }
 
-    response = requests.post(model_endpoint, headers=headers, json=data)
-
-    # Uncomment the next line to print raw API responses for debugging.
-    # print("Raw response:", response.text)
-
-    try:
-        result = response.json()
-    except Exception:
-        return f"Error: Unable to parse JSON. Response: {response.text}"
-
-    if isinstance(result, dict) and "error" in result:
-        return f"Error: {result['error']}"
-
-    try:
-        generated_text = result[0].get("generated_text", "No generated_text found in response")
-        # Optionally, strip off the prompt if needed:
-        # generated_text = generated_text[len(formatted_prompt):].strip()
-        return generated_text
-    except Exception:
-        return f"Error: Unexpected response format: {json.dumps(result)}"
-
-def chat_with_models(user_input, history):
+    try:
+        response = requests.post(endpoint, json=payload, headers=headers)
+        response.raise_for_status()
+        return response.json()[0]['generated_text']
+    except Exception as e:
+        return f"Error from {model_name}: {str(e)}"
+
+def respond(message: str, history: List[List[str]]) -> str:
+    """Handle chat responses from all models"""
+    # Prepare messages in OpenAI format
+    messages = [{"role": "user", "content": message}]
+
+    # Create threads for concurrent model queries
+    threads = []
+    results = {}
+
+    def get_model_response(model_name):
+        results[model_name] = query_model(model_name, messages)
+
+    for model_name in MODEL_ENDPOINTS:
+        thread = threading.Thread(target=get_model_response, args=(model_name,))
+        thread.start()
+        threads.append(thread)
+
+    # Wait for all threads to complete
+    for thread in threads:
+        thread.join()
+
+    # Format responses from all models
     responses = []
-    for model_name, endpoint in MODEL_ENDPOINTS.items():
-        system_prompt = SYSTEM_PROMPTS.get(model_name, "")
-        model_response = query_model(user_input, endpoint, system_prompt)
-        responses.append(f"**{model_name}**: {model_response}")
-    combined_answer = "\n\n".join(responses)
-    history.append((user_input, combined_answer))
-    return history, history
-
-with gr.Blocks() as demo:
-    gr.Markdown("# Multi-LLM Chatbot using Hugging Face Inference API with Stop Sequences")
-    chatbot = gr.Chatbot()
-    msg = gr.Textbox(label="Your Message")
-    clear = gr.Button("Clear")
-
-    def clear_chat():
-        return [], []
-
-    msg.submit(fn=chat_with_models, inputs=[msg, chatbot], outputs=[chatbot, chatbot])
-    clear.click(fn=clear_chat, outputs=[chatbot, chatbot])
-
-demo.launch()
+    for model_name, response in results.items():
+        responses.append(f"**{model_name}**:\n{response}")
+
+    return "\n\n".join(responses)
+
+# Create the Gradio interface
+chat_interface = gr.ChatInterface(
+    respond,
+    title="Multi-LLM Collaboration Chat",
+    description="A group chat with Qwen2.5-72B, Llama3.3-70B, and Qwen2.5-Coder-32B",
+    examples=["How can I optimize Python code?", "Explain quantum computing basics"],
+    theme="soft"
+)
+
+if __name__ == "__main__":
+    chat_interface.launch(share=True)
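
For reference, the fan-out in the new respond() can also be written with concurrent.futures.ThreadPoolExecutor, which starts and joins the worker threads and collects results through futures instead of a shared dict. The sketch below is not part of the commit: it reuses the commit's MODEL_ENDPOINTS and response parsing, swaps the manual threading.Thread loop for an executor, sends max_new_tokens (the parameter name the Hugging Face text-generation Inference API documents) where the commit sends max_tokens, and adds an illustrative request timeout of its own.

import os
import requests
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List

HF_API_KEY = os.getenv("HF_API_KEY")

MODEL_ENDPOINTS = {
    "Qwen2.5-72B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-72B-Instruct",
    "Llama3.3-70B-Instruct": "https://api-inference.huggingface.co/models/meta-llama/Llama-3.3-70B-Instruct",
    "Qwen2.5-Coder-32B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-Coder-32B-Instruct",
}

def query_model(model_name: str, messages: List[Dict[str, str]]) -> str:
    """Query one endpoint with a role-prefixed prompt, as in the commit."""
    prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 1024,  # assumption: the API expects max_new_tokens, not max_tokens
            "temperature": 0.7,
            "stop_sequences": ["\nUser:", "\nAssistant:", "###"],
        },
    }
    try:
        response = requests.post(
            MODEL_ENDPOINTS[model_name],
            json=payload,
            headers={"Authorization": f"Bearer {HF_API_KEY}"},
            timeout=120,  # illustrative choice; the commit sends no timeout
        )
        response.raise_for_status()
        return response.json()[0]["generated_text"]
    except Exception as e:
        # Per-model failures become inline messages instead of killing the fan-out
        return f"Error from {model_name}: {e}"

def respond(message: str) -> str:
    """Query every model concurrently and merge the answers into one reply."""
    messages = [{"role": "user", "content": message}]
    with ThreadPoolExecutor(max_workers=len(MODEL_ENDPOINTS)) as pool:
        futures = {name: pool.submit(query_model, name, messages)
                   for name in MODEL_ENDPOINTS}
    # The with-block exits only after every future has finished (implicit join)
    return "\n\n".join(f"**{name}**:\n{fut.result()}" for name, fut in futures.items())

With max_workers equal to the number of endpoints, all three models are queried at once, and the with-block only returns after every future has resolved, mirroring the thread.join() loop in the committed version.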