# 3Luik / app.py
import os
import re
import tempfile
import requests
from PIL import Image
from io import BytesIO
from huggingface_hub import InferenceClient
import gradio as gr
# Tokens for Hugging Face API
read_token = os.getenv('HF_READ')
write_token = os.getenv('HF_WRITE')
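# Note: HF_WRITE is loaded for completeness but never used below; only HF_READ
# (a read-scoped token, e.g. set as a Space secret) is required.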
# Model configurations (model IDs are appended to BASE_URL to form request URLs)
HEADERS = {"Authorization": f"Bearer {read_token}"}
BASE_URL = "https://api-inference.huggingface.co/models/"
CHAT_MODEL = "mistralai/Mistral-Nemo-Instruct-2407"
WHISPER_MODEL = "distil-whisper/distil-large-v2"
BARK_MODEL = "suno/bark"
FLUX_MODEL = "enhanceaiteam/Flux-uncensored"
# Initialize Hugging Face Inference Client
client = InferenceClient(api_key=read_token)
# Chatbot system prompt
system_prompt = """
You are an empathetic and knowledgeable AI assistant designed to engage in meaningful conversations,
assist with tasks, and provide accurate information.
You can also generate vivid visuals!
To request an image, include a description between the IMG tags, like this:
##IMG: A serene forest at dawn with a golden glow :IMG##
To request a web search, include a search query between the URL tags, like this:
##URL: information about rainbow farting unicorns :URL##
"""
chat_history = []

def tagger(bot_response):
    """
    Extract tags from the bot response and return the filtered response text and tags.

    Args:
        bot_response (str): The full response text from the chatbot.

    Returns:
        tuple: A tuple containing:
            - filtered_response (str): The response text with tags removed.
            - tags (dict): A dictionary of extracted tags and their values.
    """
    tags = {}
    filtered_response = bot_response

    # Match ##URL: ... :URL## tags
    url_pattern = r"##URL:(.+?):URL##"
    url_matches = re.findall(url_pattern, bot_response)
    if url_matches:
        tags['url'] = [m.strip() for m in url_matches]
        filtered_response = re.sub(url_pattern, "", filtered_response).strip()

    # Match ##IMG: ... :IMG## tags
    img_pattern = r"##IMG:(.+?):IMG##"
    img_matches = re.findall(img_pattern, bot_response)
    if img_matches:
        tags['images'] = [m.strip() for m in img_matches]
        # Remove image tags from the response text
        filtered_response = re.sub(img_pattern, "", filtered_response).strip()

    # Additional tags can be added here as needed.
    # For example, to support ##AUDIO: ... :AUDIO## tags:
    # audio_pattern = r"##AUDIO:(.+?):AUDIO##"
    # audio_matches = re.findall(audio_pattern, bot_response)
    # if audio_matches:
    #     tags['audio'] = [m.strip() for m in audio_matches]
    #     filtered_response = re.sub(audio_pattern, "", filtered_response).strip()

    return filtered_response, tags
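# Example (illustrative):
#   tagger("Here it is! ##IMG: a serene forest at dawn :IMG##")
#   -> ("Here it is!", {'images': ['a serene forest at dawn']})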

def speech_to_text(filename):
    """Convert speech to text using the Whisper API."""
    try:
        with open(filename, "rb") as f:
            data = f.read()
        response = requests.post(BASE_URL + WHISPER_MODEL, headers=HEADERS, data=data)
        if response.status_code == 200:
            return response.json().get("text", "Could not recognize speech")
        print(f"Whisper Error: {response.status_code} - {response.text}")
    except Exception as e:
        print(f"Exception in speech_to_text: {e}")
    return None

def chatbot_logic(input_text):
    """Generate a response from the chatbot and handle tags."""
    global chat_history
    chat_history.append({"role": "user", "content": input_text})
    messages = [{"role": "system", "content": system_prompt}] + chat_history
    try:
        completion = client.chat.completions.create(
            model=CHAT_MODEL,
            messages=messages,
            max_tokens=500
        )
        response_text = completion.choices[0].message.content
        # Use tagger to process tags and clean the response text
        response_text, tags = tagger(response_text)
        chat_history.append({"role": "assistant", "content": response_text})
        # Extract the image prompt from tags if present
        image_prompt = tags["images"][0] if "images" in tags else None
        return response_text, image_prompt
    except Exception as e:
        print(f"Chatbot Error: {e}")
        return None, None
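# Illustrative round trip (actual model output will vary):
#   chatbot_logic("draw me a sunrise over the sea")
#   -> ("Here you go!", "a sunrise over a calm sea")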

def text_to_speech(text):
    """Convert text to speech using the Bark API; returns raw audio bytes."""
    try:
        response = requests.post(BASE_URL + BARK_MODEL, headers=HEADERS, json={"inputs": text})
        if response.status_code == 200:
            return response.content
        print(f"Bark Error: {response.status_code} - {response.text}")
    except Exception as e:
        print(f"Exception in text_to_speech: {e}")
    return None

def generate_image(prompt):
    """Generate an image using the Flux API."""
    try:
        response = requests.post(BASE_URL + FLUX_MODEL, headers=HEADERS, json={"inputs": prompt})
        if response.status_code == 200:
            return Image.open(BytesIO(response.content))
        print(f"Flux Error: {response.status_code} - {response.text}")
    except Exception as e:
        print(f"Exception in generate_image: {e}")
    return None
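# Quick standalone check (assumes HF_READ is set and the model is live on the
# Inference API; a cold start may return a 503 while the model loads):
#   img = generate_image("A serene forest at dawn with a golden glow")
#   if img:
#       img.save("preview.png")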

def process_chat(audio_file):
    """Process user input, generate a response, and optionally create media."""
    # Step 1: Speech-to-text
    recognized_text = speech_to_text(audio_file)
    if not recognized_text:
        return "Speech recognition failed.", None, None
    # Step 2: Chatbot response
    response_text, image_prompt = chatbot_logic(recognized_text)
    if not response_text:
        return "Failed to generate chatbot response.", None, None
    # Step 3: Text-to-speech; write the raw bytes to a temp file so the
    # filepath-based gr.Audio output component can play them
    audio_response = text_to_speech(response_text)
    audio_path = None
    if audio_response:
        with tempfile.NamedTemporaryFile(suffix=".flac", delete=False) as tmp:
            tmp.write(audio_response)
            audio_path = tmp.name
    # Step 4: Optional image generation
    generated_image = generate_image(image_prompt) if image_prompt else None
    return response_text, audio_path, generated_image

def create_ui():
    """Build the Gradio interface."""
    with gr.Blocks(title="Enhanced Voice-to-Voice Chatbot with Images") as ui:
        gr.Markdown("## Voice-to-Voice AI Chatbot\nTalk to the AI and see its responses, including images it generates!")
        audio_input = gr.Audio(type="filepath", label="Input Audio File")
        submit_button = gr.Button("Submit")
        with gr.Row():
            chatbot_response = gr.Textbox(label="Chatbot Response", lines=4)
        with gr.Row():
            audio_output = gr.Audio(label="Audio Response", autoplay=True)
            image_output = gr.Image(label="Generated Image")
        submit_button.click(
            fn=process_chat,
            inputs=audio_input,
            outputs=[chatbot_response, audio_output, image_output],
            show_progress=True
        )
    return ui

if __name__ == "__main__":
    create_ui().launch(debug=True)