luminoussg committed
Commit c1e5d4c · verified · 1 Parent(s): edb32fe

Update app.py

Files changed (1)
  1. app.py +56 -59
app.py CHANGED
@@ -1,85 +1,82 @@
-import os
 import gradio as gr
+import os
 import requests
-import json
+import threading
+from typing import List, Dict, Any
 
-# Get the Hugging Face API key from Spaces secrets.
+# Get the Hugging Face API key from Spaces secrets
 HF_API_KEY = os.getenv("HF_API_KEY")
 
-# Model endpoints on Hugging Face
+# Model endpoints configuration
 MODEL_ENDPOINTS = {
     "Qwen2.5-72B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-72B-Instruct",
     "Llama3.3-70B-Instruct": "https://api-inference.huggingface.co/models/meta-llama/Llama-3.3-70B-Instruct",
     "Qwen2.5-Coder-32B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-Coder-32B-Instruct",
 }
 
-# System prompts for each model
-SYSTEM_PROMPTS = {
-    "Qwen2.5-72B-Instruct": "System: You are a knowledgeable assistant for general inquiries.",
-    "Llama3.3-70B-Instruct": "System: You are a research expert assistant specialized in in-depth analysis.",
-    "Qwen2.5-Coder-32B-Instruct": "System: You are a coding expert who helps with code-related tasks.",
-}
-
-def query_model(prompt, model_endpoint, system_prompt):
+def query_model(model_name: str, messages: List[Dict[str, str]]) -> str:
+    """Query a single model with the chat history"""
+    endpoint = MODEL_ENDPOINTS[model_name]
     headers = {
         "Authorization": f"Bearer {HF_API_KEY}",
-        "Content-Type": "application/json",
-        "Accept": "application/json"
+        "Content-Type": "application/json"
     }
-    # Format the prompt to include the system instruction and structure the conversation.
-    formatted_prompt = f"{system_prompt}\nUser: {prompt}\nAssistant:"
 
-    # Include the stop sequence so generation halts when the next user turn starts.
-    data = {
-        "inputs": formatted_prompt,
+    # Format the prompt according to each model's requirements
+    prompt = "\n".join([f"{msg['role']}: {msg['content']}" for msg in messages])
+
+    payload = {
+        "inputs": prompt,
         "parameters": {
-            "max_new_tokens": 512,
-            "temperature": 0.6,
-            "stop_sequences": ["\nUser:"]
+            "max_tokens": 1024,
+            "temperature": 0.7,
+            "stop_sequences": ["\nUser:", "\nAssistant:", "###"]
         }
     }
 
-    response = requests.post(model_endpoint, headers=headers, json=data)
-
-    # Uncomment the next line to print raw API responses for debugging.
-    # print("Raw response:", response.text)
-
-    try:
-        result = response.json()
-    except Exception:
-        return f"Error: Unable to parse JSON. Response: {response.text}"
-
-    if isinstance(result, dict) and "error" in result:
-        return f"Error: {result['error']}"
-
-    try:
-        generated_text = result[0].get("generated_text", "No generated_text found in response")
-        # Optionally, strip off the prompt if needed:
-        # generated_text = generated_text[len(formatted_prompt):].strip()
-        return generated_text
-    except Exception:
-        return f"Error: Unexpected response format: {json.dumps(result)}"
-
-def chat_with_models(user_input, history):
+    try:
+        response = requests.post(endpoint, json=payload, headers=headers)
+        response.raise_for_status()
+        return response.json()[0]['generated_text']
+    except Exception as e:
+        return f"Error from {model_name}: {str(e)}"
+
+def respond(message: str, history: List[List[str]]) -> str:
+    """Handle chat responses from all models"""
+    # Prepare messages in OpenAI format
+    messages = [{"role": "user", "content": message}]
+
+    # Create threads for concurrent model queries
+    threads = []
+    results = {}
+
+    def get_model_response(model_name):
+        results[model_name] = query_model(model_name, messages)
+
+    for model_name in MODEL_ENDPOINTS:
+        thread = threading.Thread(target=get_model_response, args=(model_name,))
+        thread.start()
+        threads.append(thread)
+
+    # Wait for all threads to complete
+    for thread in threads:
+        thread.join()
+
+    # Format responses from all models
     responses = []
-    for model_name, endpoint in MODEL_ENDPOINTS.items():
-        system_prompt = SYSTEM_PROMPTS.get(model_name, "")
-        model_response = query_model(user_input, endpoint, system_prompt)
-        responses.append(f"**{model_name}**: {model_response}")
-    combined_answer = "\n\n".join(responses)
-    history.append((user_input, combined_answer))
-    return history, history
-
-with gr.Blocks() as demo:
-    gr.Markdown("# Multi-LLM Chatbot using Hugging Face Inference API with Stop Sequences")
-    chatbot = gr.Chatbot()
-    msg = gr.Textbox(label="Your Message")
-    clear = gr.Button("Clear")
-
-    def clear_chat():
-        return [], []
-
-    msg.submit(fn=chat_with_models, inputs=[msg, chatbot], outputs=[chatbot, chatbot])
-    clear.click(fn=clear_chat, outputs=[chatbot, chatbot])
-
-demo.launch()
+    for model_name, response in results.items():
+        responses.append(f"**{model_name}**:\n{response}")
+
+    return "\n\n".join(responses)
+
+# Create the Gradio interface
+chat_interface = gr.ChatInterface(
+    respond,
+    title="Multi-LLM Collaboration Chat",
+    description="A group chat with Qwen2.5-72B, Llama3.3-70B, and Qwen2.5-Coder-32B",
+    examples=["How can I optimize Python code?", "Explain quantum computing basics"],
+    theme="soft"
+)
+
+if __name__ == "__main__":
+    chat_interface.launch(share=True)
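
For reference, the fan-out in the new respond() can also be written with concurrent.futures.ThreadPoolExecutor, which starts and joins the worker threads and collects results through futures instead of a shared dict. The sketch below is not part of the commit: it reuses the commit's MODEL_ENDPOINTS and response parsing, swaps the manual threading.Thread loop for an executor, sends max_new_tokens (the parameter name the Hugging Face text-generation Inference API documents) where the commit sends max_tokens, and adds an illustrative request timeout of its own.

import os
import requests
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List

HF_API_KEY = os.getenv("HF_API_KEY")

MODEL_ENDPOINTS = {
    "Qwen2.5-72B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-72B-Instruct",
    "Llama3.3-70B-Instruct": "https://api-inference.huggingface.co/models/meta-llama/Llama-3.3-70B-Instruct",
    "Qwen2.5-Coder-32B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-Coder-32B-Instruct",
}

def query_model(model_name: str, messages: List[Dict[str, str]]) -> str:
    """Query one endpoint with a role-prefixed prompt, as in the commit."""
    prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 1024,  # assumption: the API expects max_new_tokens, not max_tokens
            "temperature": 0.7,
            "stop_sequences": ["\nUser:", "\nAssistant:", "###"],
        },
    }
    try:
        response = requests.post(
            MODEL_ENDPOINTS[model_name],
            json=payload,
            headers={"Authorization": f"Bearer {HF_API_KEY}"},
            timeout=120,  # illustrative choice; the commit sends no timeout
        )
        response.raise_for_status()
        return response.json()[0]["generated_text"]
    except Exception as e:
        # Per-model failures become inline messages instead of killing the fan-out
        return f"Error from {model_name}: {e}"

def respond(message: str) -> str:
    """Query every model concurrently and merge the answers into one reply."""
    messages = [{"role": "user", "content": message}]
    with ThreadPoolExecutor(max_workers=len(MODEL_ENDPOINTS)) as pool:
        futures = {name: pool.submit(query_model, name, messages)
                   for name in MODEL_ENDPOINTS}
    # The with-block exits only after every future has finished (implicit join)
    return "\n\n".join(f"**{name}**:\n{fut.result()}" for name, fut in futures.items())

With max_workers equal to the number of endpoints, all three models are queried at once, and the with-block only returns after every future has resolved, mirroring the thread.join() loop in the committed version.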