import os
import re
import tempfile
from io import BytesIO

import requests
from PIL import Image
from huggingface_hub import InferenceClient

import gradio as gr

# Tokens for the Hugging Face API (the write token is read but currently unused)
read_token = os.getenv('HF_READ')
write_token = os.getenv('HF_WRITE')

# Model configuration: the *_API_URL constants are model IDs that get appended
# to BASE_URL for raw Inference API requests.
HEADERS = {"Authorization": f"Bearer {read_token}"}
BASE_URL = "https://api-inference.huggingface.co/models/"

CHAT_MODEL = "mistralai/Mistral-Nemo-Instruct-2407"
WHISPER_API_URL = "distil-whisper/distil-large-v2"
BARK_API_URL = "suno/bark"
FLUX_API_URL = "enhanceaiteam/Flux-uncensored"

# Initialize Hugging Face Inference Client
client = InferenceClient(api_key=read_token)

# Chatbot system prompt
system_prompt = """
You are an empathetic and knowledgeable AI assistant designed to engage in meaningful conversations,
assist with tasks, and provide accurate information. 
You can also generate vivid visuals!

To request an image, include a description between the IMG tags, like this:
    ##IMG: A serene forest at dawn with a golden glow :IMG##

To request a web search, include a search query between the URL tags, like this:
    ##URL: information about rainbow farting unicorns :URL##
"""

chat_history = []

def tagger(bot_response):
    """
    Extract tags from the bot response and return the filtered response text and tags.
    
    Args:
        bot_response (str): The full response text from the chatbot.
        
    Returns:
        tuple: A tuple containing:
            - filtered_response (str): The response text with tags removed.
            - tags (dict): A dictionary of extracted tags and their values.
    """

    tags = {}
    filtered_response = bot_response

    # Match ##URL: ... :URL## tags (extracted here, though nothing in
    # process_chat consumes them yet)
    url_pattern = r"##URL:(.+?):URL##"
    url_matches = re.findall(url_pattern, bot_response)
    if url_matches:
        tags['url'] = url_matches
        filtered_response = re.sub(url_pattern, "", filtered_response).strip()

    # Match patterns like ##IMG: ... :IMG##
    img_pattern = r"##IMG:(.+?):IMG##"
    img_matches = re.findall(img_pattern, bot_response)
    if img_matches:
        tags['images'] = img_matches
        # Remove image tags from the response text
        filtered_response = re.sub(img_pattern, "", filtered_response).strip()

    # Additional tags can be added here as needed.
    # For example, if you want to support ##AUDIO: ... :AUDIO## tags:
    # audio_pattern = r"##AUDIO:(.+?):AUDIO##"
    # audio_matches = re.findall(audio_pattern, bot_response)
    # if audio_matches:
    #     tags['audio'] = audio_matches
    #     filtered_response = re.sub(audio_pattern, "", filtered_response).strip()

    return filtered_response, tags
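
# Illustrative example of what tagger() returns:
#   tagger("Here you go! ##IMG: a red fox in snow :IMG##")
#   -> ("Here you go!", {"images": [" a red fox in snow "]})
# The captured prompt keeps its surrounding whitespace; strip() it if a clean
# prompt string is needed downstream.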
    
def speech_to_text(filename):
    """Convert speech to text using Whisper API."""
    try:
        with open(filename, "rb") as f:
            data = f.read()
        response = requests.post(BASE_URL+WHISPER_API_URL, headers=HEADERS, data=data)
        if response.status_code == 200:
            return response.json().get("text", "Could not recognize speech")
        print(f"Whisper Error: {response.status_code} - {response.text}")
    except Exception as e:
        print(f"Exception in speech_to_text: {e}")
    return None
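
# Note: on success the Inference API returns JSON like {"text": "..."}. A cold
# model may instead return HTTP 503 with an "estimated_time" field; this sketch
# simply logs that case rather than retrying.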

def chatbot_logic(input_text):
    """Generate a response from the chatbot and handle tags."""
    global chat_history
    chat_history.append({"role": "user", "content": input_text})
    messages = [{"role": "system", "content": system_prompt}] + chat_history

    try:
        completion = client.chat.completions.create(
            model=CHAT_MODEL,
            messages=messages,
            max_tokens=500
        )
        response_text = completion.choices[0].message.content
        
        # Use tagger to process tags and clean response text
        response_text, tags = tagger(response_text)
        chat_history.append({"role": "assistant", "content": response_text})
        
        # Extract image prompt from tags if present
        image_prompt = tags.get("images")[0] if "images" in tags else None

        return response_text, image_prompt
    except Exception as e:
        print(f"Chatbot Error: {e}")
    return None, None
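
# Note: chat_history grows without bound across turns; long sessions may need
# truncation to stay within the model's context window.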

def text_to_speech(text):
    """Convert text to speech using Bark API."""
    try:
        response = requests.post(BASE_URL+BARK_API_URL, headers=HEADERS, json={"inputs": text})
        if response.status_code == 200:
            return response.content
        print(f"Bark Error: {response.status_code} - {response.text}")
    except Exception as e:
        print(f"Exception in text_to_speech: {e}")
    return None
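
# Note: for text-to-speech models the Inference API responds with raw audio
# bytes (commonly FLAC); process_chat below writes them to a temp file so the
# Gradio Audio component can serve them.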

def generate_image(prompt):
    """Generate an image using the Flux API."""
    try:
        response = requests.post(BASE_URL+FLUX_API_URL, headers=HEADERS, json={"inputs": prompt})
        if response.status_code == 200:
            return Image.open(BytesIO(response.content))
        print(f"Flux Error: {response.status_code} - {response.text}")
    except Exception as e:
        print(f"Exception in generate_image: {e}")
    return None
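
# Note: a successful text-to-image response body is the encoded image itself
# (no JSON wrapper); PIL detects the format from the raw bytes.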

def process_chat(audio_file):
    """Process user input, generate response, and optionally create media."""
    # Step 1: Speech-to-text
    recognized_text = speech_to_text(audio_file)
    if not recognized_text:
        return "Speech recognition failed.", None, None

    # Step 2: Chatbot response
    response_text, image_prompt = chatbot_logic(recognized_text)
    if not response_text:
        return "Failed to generate chatbot response.", None, None

    # Step 3: Text-to-speech. gr.Audio expects a filepath (or a
    # (sample_rate, array) tuple), not raw bytes, so write the bytes to a
    # temporary file; the ".flac" suffix assumes the API's usual audio format.
    audio_path = None
    audio_bytes = text_to_speech(response_text)
    if audio_bytes:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".flac") as tmp:
            tmp.write(audio_bytes)
            audio_path = tmp.name

    # Step 4: Optional image generation
    generated_image = generate_image(image_prompt) if image_prompt else None

    return response_text, audio_path, generated_image

def create_ui():
    """Build and launch the Gradio interface."""
    with gr.Blocks(title="Enhanced Voice-to-Voice Chatbot with Images") as ui:
        gr.Markdown("## Voice-to-Voice AI Chatbot\nTalk to the AI and see its responses, including images it generates!")
        audio_input = gr.Audio(type="filepath", label="Input Audio File")

        submit_button = gr.Button("Submit")
        
        with gr.Row():
            chatbot_response = gr.Textbox(label="Chatbot Response", lines=4)
        with gr.Row():
            audio_output = gr.Audio(label="Audio Response", autoplay=True)
            image_output = gr.Image(label="Generated Image")
        
        submit_button.click(
            fn=process_chat,
            inputs=audio_input,
            outputs=[chatbot_response, audio_output, image_output],
            show_progress=True
        )
    
    return ui

if __name__ == "__main__":
    create_ui().launch(debug=True)