import os
import re
import tempfile
from io import BytesIO

import requests
from PIL import Image
from huggingface_hub import InferenceClient
import gradio as gr

# Tokens for the Hugging Face API
read_token = os.getenv('HF_READ')
write_token = os.getenv('HF_WRITE')

# Model configurations (the *_API_URL values are model IDs appended to BASE_URL)
HEADERS = {"Authorization": f"Bearer {read_token}"}
BASE_URL = 'https://api-inference.huggingface.co/models/'
CHAT_MODEL = "mistralai/Mistral-Nemo-Instruct-2407"
WHISPER_API_URL = "distil-whisper/distil-large-v2"
BARK_API_URL = "suno/bark"
FLUX_API_URL = "enhanceaiteam/Flux-uncensored"

# Initialize the Hugging Face Inference Client
client = InferenceClient(api_key=read_token)

# Chatbot system prompt
system_prompt = """
You are an empathetic and knowledgeable AI assistant designed to engage in meaningful conversations,
assist with tasks, and provide accurate information.
You can also generate vivid visuals!
To request an image, include a description between the IMG tags, like this:
##IMG: A serene forest at dawn with a golden glow :IMG##
To request a web search, include a search query between the URL tags, like this:
##URL: information about rainbow farting unicorns :URL##
"""
chat_history = []

def tagger(bot_response):
    """
    Extract tags from the bot response and return the filtered response text and tags.

    Args:
        bot_response (str): The full response text from the chatbot.

    Returns:
        tuple: A tuple containing:
            - filtered_response (str): The response text with tags removed.
            - tags (dict): A dictionary of extracted tags and their values.
    """
    tags = {}
    filtered_response = bot_response

    # Match ##URL: ... :URL## tags
    url_pattern = r"##URL:(.+?):URL##"
    url_matches = re.findall(url_pattern, bot_response)
    if url_matches:
        tags['url'] = url_matches
        filtered_response = re.sub(url_pattern, "", filtered_response).strip()

    # Match ##IMG: ... :IMG## tags
    img_pattern = r"##IMG:(.+?):IMG##"
    img_matches = re.findall(img_pattern, bot_response)
    if img_matches:
        tags['images'] = img_matches
        # Remove image tags from the response text
        filtered_response = re.sub(img_pattern, "", filtered_response).strip()

    # Additional tags can be added here as needed.
    # For example, to support ##AUDIO: ... :AUDIO## tags:
    # audio_pattern = r"##AUDIO:(.+?):AUDIO##"
    # audio_matches = re.findall(audio_pattern, bot_response)
    # if audio_matches:
    #     tags['audio'] = audio_matches
    #     filtered_response = re.sub(audio_pattern, "", filtered_response).strip()

    return filtered_response, tags
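
# Illustrative use of tagger (hypothetical values, not from a real run):
#   text, tags = tagger("Sure! ##IMG:a serene forest at dawn:IMG##")
#   text -> "Sure!"
#   tags -> {"images": ["a serene forest at dawn"]}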

def speech_to_text(filename):
    """Convert speech to text using the Whisper API."""
    try:
        with open(filename, "rb") as f:
            data = f.read()
        response = requests.post(BASE_URL + WHISPER_API_URL, headers=HEADERS, data=data)
        if response.status_code == 200:
            return response.json().get("text", "Could not recognize speech")
        print(f"Whisper Error: {response.status_code} - {response.text}")
    except Exception as e:
        print(f"Exception in speech_to_text: {e}")
    return None
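
# On success the ASR endpoint returns JSON shaped like {"text": "..."}, which is
# why the function above reads response.json().get("text", ...).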

def chatbot_logic(input_text):
    """Generate a response from the chatbot and handle tags."""
    global chat_history
    chat_history.append({"role": "user", "content": input_text})
    messages = [{"role": "system", "content": system_prompt}] + chat_history
    try:
        completion = client.chat.completions.create(
            model=CHAT_MODEL,
            messages=messages,
            max_tokens=500
        )
        response_text = completion.choices[0].message.content

        # Use tagger to process tags and clean the response text
        response_text, tags = tagger(response_text)
        chat_history.append({"role": "assistant", "content": response_text})

        # Extract the image prompt from tags if present (strip the whitespace
        # the regex capture may leave around it)
        image_prompt = tags["images"][0].strip() if "images" in tags else None
        return response_text, image_prompt
    except Exception as e:
        print(f"Chatbot Error: {e}")
        return None, None

def text_to_speech(text):
    """Convert text to speech using the Bark API."""
    try:
        response = requests.post(BASE_URL + BARK_API_URL, headers=HEADERS, json={"inputs": text})
        if response.status_code == 200:
            return response.content
        print(f"Bark Error: {response.status_code} - {response.text}")
    except Exception as e:
        print(f"Exception in text_to_speech: {e}")
    return None

def generate_image(prompt):
    """Generate an image using the Flux API."""
    try:
        response = requests.post(BASE_URL + FLUX_API_URL, headers=HEADERS, json={"inputs": prompt})
        if response.status_code == 200:
            return Image.open(BytesIO(response.content))
        print(f"Flux Error: {response.status_code} - {response.text}")
    except Exception as e:
        print(f"Exception in generate_image: {e}")
    return None
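
# The serverless Inference API returns HTTP 503 while a model is cold-loading.
# A minimal retry sketch the helpers above could call instead of requests.post
# directly (the retries/wait values are illustrative, not tuned):
import time

def post_with_retry(url, headers, payload, retries=3, wait=10):
    """POST to the Inference API, retrying while the model is still loading (503)."""
    response = None
    for _ in range(retries):
        response = requests.post(url, headers=headers, json=payload)
        if response.status_code != 503:
            break
        time.sleep(wait)
    return response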

def process_chat(audio_file):
    """Process user input, generate a response, and optionally create media."""
    # Step 1: Speech-to-text
    recognized_text = speech_to_text(audio_file)
    if not recognized_text:
        return "Speech recognition failed.", None, None

    # Step 2: Chatbot response
    response_text, image_prompt = chatbot_logic(recognized_text)
    if not response_text:
        return "Failed to generate chatbot response.", None, None

    # Step 3: Text-to-speech. gr.Audio plays a file path, so write the raw
    # bytes to a temporary file (the API typically returns FLAC audio).
    audio_path = None
    audio_response = text_to_speech(response_text)
    if audio_response:
        with tempfile.NamedTemporaryFile(suffix=".flac", delete=False) as tmp:
            tmp.write(audio_response)
            audio_path = tmp.name

    # Step 4: Optional image generation
    generated_image = generate_image(image_prompt) if image_prompt else None

    return response_text, audio_path, generated_image

def create_ui():
    """Build and launch the Gradio interface."""
    with gr.Blocks(title="Enhanced Voice-to-Voice Chatbot with Images") as ui:
        gr.Markdown("## Voice-to-Voice AI Chatbot\nTalk to the AI and see its responses, including images it generates!")
        audio_input = gr.Audio(type="filepath", label="Input Audio File")
        submit_button = gr.Button("Submit")
        with gr.Row():
            chatbot_response = gr.Textbox(label="Chatbot Response", lines=4)
        with gr.Row():
            audio_output = gr.Audio(label="Audio Response")
            image_output = gr.Image(label="Generated Image")
        submit_button.click(
            fn=process_chat,
            inputs=audio_input,
            outputs=[chatbot_response, audio_output, image_output],
            show_progress=True
        )
    return ui

if __name__ == "__main__":
    create_ui().launch(debug=True)
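
# To run locally, export a Hugging Face token first (variable name as read above):
#   export HF_READ=hf_...
#   python app.py   # filename illustrative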