Update app.py
app.py CHANGED
@@ -1,85 +1,82 @@
-import os
 import gradio as gr
 import requests
-import json

-# Get the Hugging Face API key from Spaces secrets
 HF_API_KEY = os.getenv("HF_API_KEY")

-# Model endpoints
 MODEL_ENDPOINTS = {
     "Qwen2.5-72B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-72B-Instruct",
     "Llama3.3-70B-Instruct": "https://api-inference.huggingface.co/models/meta-llama/Llama-3.3-70B-Instruct",
     "Qwen2.5-Coder-32B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-Coder-32B-Instruct",
 }

-    "Llama3.3-70B-Instruct": "System: You are a research expert assistant specialized in in-depth analysis.",
-    "Qwen2.5-Coder-32B-Instruct": "System: You are a coding expert who helps with code-related tasks.",
-}
-
-def query_model(prompt, model_endpoint, system_prompt):
     headers = {
         "Authorization": f"Bearer {HF_API_KEY}",
-        "Content-Type": "application/json",
-        "Accept": "application/json"
     }
-    # Format the prompt to include the system instruction and structure the conversation.
-    formatted_prompt = f"{system_prompt}\nUser: {prompt}\nAssistant:"

         "parameters": {
-            "temperature": 0.
-            "stop_sequences": ["\nUser:"]
         }
     }

-        generated_text = result[0].get("generated_text", "No generated_text found in response")
-        # Optionally, strip off the prompt if needed:
-        # generated_text = generated_text[len(formatted_prompt):].strip()
-        return generated_text
-    except Exception:
-        return f"Error: Unexpected response format: {json.dumps(result)}"
-
-def chat_with_models(user_input, history):
     responses = []
-    for model_name,
-    combined_answer = "\n\n".join(responses)
-    history.append((user_input, combined_answer))
-    return history, history
-
-with gr.Blocks() as demo:
-    gr.Markdown("# Multi-LLM Chatbot using Hugging Face Inference API with Stop Sequences")
-    chatbot = gr.Chatbot()
-    msg = gr.Textbox(label="Your Message")
-    clear = gr.Button("Clear")
-
-    def clear_chat():
-        return [], []
 import gradio as gr
+import os
 import requests
+import threading
+from typing import List, Dict, Any

+# Get the Hugging Face API key from Spaces secrets
 HF_API_KEY = os.getenv("HF_API_KEY")

+# Model endpoints configuration
 MODEL_ENDPOINTS = {
     "Qwen2.5-72B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-72B-Instruct",
     "Llama3.3-70B-Instruct": "https://api-inference.huggingface.co/models/meta-llama/Llama-3.3-70B-Instruct",
     "Qwen2.5-Coder-32B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-Coder-32B-Instruct",
 }

+def query_model(model_name: str, messages: List[Dict[str, str]]) -> str:
+    """Query a single model with the chat history"""
+    endpoint = MODEL_ENDPOINTS[model_name]
     headers = {
         "Authorization": f"Bearer {HF_API_KEY}",
+        "Content-Type": "application/json"
     }

+    # Format the prompt according to each model's requirements
+    prompt = "\n".join([f"{msg['role']}: {msg['content']}" for msg in messages])
+
+    payload = {
+        "inputs": prompt,
         "parameters": {
+            "max_tokens": 1024,
+            "temperature": 0.7,
+            "stop_sequences": ["\nUser:", "\nAssistant:", "###"]
         }
     }

+    try:
+        response = requests.post(endpoint, json=payload, headers=headers)
+        response.raise_for_status()
+        return response.json()[0]['generated_text']
+    except Exception as e:
+        return f"Error from {model_name}: {str(e)}"
+
+def respond(message: str, history: List[List[str]]) -> str:
+    """Handle chat responses from all models"""
+    # Prepare messages in OpenAI format
+    messages = [{"role": "user", "content": message}]
+
+    # Create threads for concurrent model queries
+    threads = []
+    results = {}

+    def get_model_response(model_name):
+        results[model_name] = query_model(model_name, messages)

+    for model_name in MODEL_ENDPOINTS:
+        thread = threading.Thread(target=get_model_response, args=(model_name,))
+        thread.start()
+        threads.append(thread)

+    # Wait for all threads to complete
+    for thread in threads:
+        thread.join()

+    # Format responses from all models
     responses = []
+    for model_name, response in results.items():
+        responses.append(f"**{model_name}**:\n{response}")
+
+    return "\n\n".join(responses)

+# Create the Gradio interface
+chat_interface = gr.ChatInterface(
+    respond,
+    title="Multi-LLM Collaboration Chat",
+    description="A group chat with Qwen2.5-72B, Llama3.3-70B, and Qwen2.5-Coder-32B",
+    examples=["How can I optimize Python code?", "Explain quantum computing basics"],
+    theme="soft"
+)

+if __name__ == "__main__":
+    chat_interface.launch(share=True)