Spaces:
Running
Running
github-actions[bot]
commited on
Commit
Β·
69427f5
1
Parent(s):
675c4cb
π€ Auto-deploy from GitHub (push) - fe69584 - 2025-07-28 05:16:36 UTC
Browse files- apps/gradio-app/README.md +13 -5
- apps/gradio-app/src/fitness_gradio/ui/app.py +16 -8
- apps/gradio-app/src/fitness_gradio/ui/components.py +22 -4
- apps/gradio-app/src/fitness_gradio/ui/handlers.py +155 -41
- apps/gradio-app/src/fitness_gradio/ui/styles.py +33 -1
- apps/gradio-app/src/fitness_gradio/ui/tts_utils.py +277 -0
- apps/gradio-app/test_audio_debug.py +83 -0
- apps/gradio-app/test_tts.py +46 -0
- shared/src/fitness_core/agents/providers.py +6 -1
apps/gradio-app/README.md
CHANGED
|
@@ -1,11 +1,12 @@
|
|
| 1 |
# Fitness Gradio App
|
| 2 |
|
| 3 |
-
Web interface for the Fitness AI Assistant using Gradio with voice input support.
|
| 4 |
|
| 5 |
## Features
|
| 6 |
|
| 7 |
- Interactive chat interface with multimodal input
|
| 8 |
- **Voice input** via microphone button (powered by Groq Whisper)
|
|
|
|
| 9 |
- Multi-provider model support (Groq, Anthropic, OpenAI)
|
| 10 |
- Real-time streaming responses
|
| 11 |
- Fitness plan generation
|
|
@@ -14,7 +15,7 @@ Web interface for the Fitness AI Assistant using Gradio with voice input support
|
|
| 14 |
## Quick Start
|
| 15 |
|
| 16 |
```bash
|
| 17 |
-
# Set your API key for voice functionality
|
| 18 |
$env:GROQ_API_KEY = "your-groq-api-key"
|
| 19 |
|
| 20 |
# Install and run
|
|
@@ -22,12 +23,19 @@ poetry install
|
|
| 22 |
poetry run python -m fitness_gradio.main
|
| 23 |
```
|
| 24 |
|
| 25 |
-
## Voice Setup
|
| 26 |
|
| 27 |
1. Get a [Groq API key](https://console.groq.com/keys)
|
| 28 |
2. Set `GROQ_API_KEY` environment variable
|
| 29 |
-
3. Click the microphone button in the chat interface
|
| 30 |
-
4.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
See [VOICE_SETUP.md](VOICE_SETUP.md) for detailed setup instructions.
|
| 33 |
|
|
|
|
| 1 |
# Fitness Gradio App
|
| 2 |
|
| 3 |
+
Web interface for the Fitness AI Assistant using Gradio with voice input and text-to-speech support.
|
| 4 |
|
| 5 |
## Features
|
| 6 |
|
| 7 |
- Interactive chat interface with multimodal input
|
| 8 |
- **Voice input** via microphone button (powered by Groq Whisper)
|
| 9 |
+
- **Text-to-Speech output** with 19 English and 4 Arabic voices (powered by Groq PlayAI TTS)
|
| 10 |
- Multi-provider model support (Groq, Anthropic, OpenAI)
|
| 11 |
- Real-time streaming responses
|
| 12 |
- Fitness plan generation
|
|
|
|
| 15 |
## Quick Start
|
| 16 |
|
| 17 |
```bash
|
| 18 |
+
# Set your API key for voice functionality and TTS
|
| 19 |
$env:GROQ_API_KEY = "your-groq-api-key"
|
| 20 |
|
| 21 |
# Install and run
|
|
|
|
| 23 |
poetry run python -m fitness_gradio.main
|
| 24 |
```
|
| 25 |
|
| 26 |
+
## Voice & TTS Setup
|
| 27 |
|
| 28 |
1. Get a [Groq API key](https://console.groq.com/keys)
|
| 29 |
2. Set `GROQ_API_KEY` environment variable
|
| 30 |
+
3. **Voice Input**: Click the microphone button in the chat interface
|
| 31 |
+
4. **Text-to-Speech**: Enable the "π Enable Text-to-Speech" checkbox
|
| 32 |
+
5. Allow browser microphone access when prompted (for voice input)
|
| 33 |
+
|
| 34 |
+
### Available TTS Voices
|
| 35 |
+
|
| 36 |
+
**English (playai-tts)**: 19 voices including Celeste-PlayAI (default), Fritz-PlayAI, Arista-PlayAI, Atlas-PlayAI, Basil-PlayAI, Briggs-PlayAI, Calum-PlayAI, Cheyenne-PlayAI, and more.
|
| 37 |
+
|
| 38 |
+
**Arabic (playai-tts-arabic)**: 4 voices including Amira-PlayAI (default), Ahmad-PlayAI, Khalid-PlayAI, Nasser-PlayAI.
|
| 39 |
|
| 40 |
See [VOICE_SETUP.md](VOICE_SETUP.md) for detailed setup instructions.
|
| 41 |
|
apps/gradio-app/src/fitness_gradio/ui/app.py
CHANGED
|
@@ -36,11 +36,16 @@ class FitnessAppUI:
|
|
| 36 |
(model_dropdown, selected_model) = UIComponents.create_model_selection_section()
|
| 37 |
|
| 38 |
# Main chat interface
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
chat_input = UIComponents.create_chat_input()
|
| 41 |
|
| 42 |
# Control buttons
|
| 43 |
-
clear_btn, streaming_toggle = UIComponents.create_control_buttons()
|
| 44 |
|
| 45 |
# Examples section
|
| 46 |
UIComponents.create_examples_section(chat_input)
|
|
@@ -51,8 +56,8 @@ class FitnessAppUI:
|
|
| 51 |
|
| 52 |
# Event handlers
|
| 53 |
self._setup_event_handlers(
|
| 54 |
-
chatbot, chat_input, clear_btn, streaming_toggle,
|
| 55 |
-
model_dropdown, selected_model
|
| 56 |
)
|
| 57 |
|
| 58 |
def _setup_event_handlers(
|
|
@@ -61,8 +66,10 @@ class FitnessAppUI:
|
|
| 61 |
chat_input: gr.MultimodalTextbox,
|
| 62 |
clear_btn: gr.Button,
|
| 63 |
streaming_toggle: gr.Checkbox,
|
|
|
|
| 64 |
model_dropdown: gr.Dropdown,
|
| 65 |
-
selected_model: gr.Textbox
|
|
|
|
| 66 |
) -> None:
|
| 67 |
"""Set up all event handlers."""
|
| 68 |
|
|
@@ -70,12 +77,13 @@ class FitnessAppUI:
|
|
| 70 |
chat_msg = chat_input.submit(
|
| 71 |
UIHandlers.add_message_with_audio,
|
| 72 |
[chatbot, chat_input],
|
| 73 |
-
[chatbot, chat_input]
|
|
|
|
| 74 |
)
|
| 75 |
bot_msg = chat_msg.then(
|
| 76 |
UIHandlers.dynamic_bot,
|
| 77 |
-
[chatbot, streaming_toggle, selected_model],
|
| 78 |
-
chatbot,
|
| 79 |
api_name="bot_response"
|
| 80 |
)
|
| 81 |
bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])
|
|
|
|
| 36 |
(model_dropdown, selected_model) = UIComponents.create_model_selection_section()
|
| 37 |
|
| 38 |
# Main chat interface
|
| 39 |
+
with gr.Row():
|
| 40 |
+
with gr.Column():
|
| 41 |
+
chatbot = UIComponents.create_chatbot()
|
| 42 |
+
with gr.Column(scale=0.3):
|
| 43 |
+
output_audio = UIComponents.create_output_audio()
|
| 44 |
+
|
| 45 |
chat_input = UIComponents.create_chat_input()
|
| 46 |
|
| 47 |
# Control buttons
|
| 48 |
+
clear_btn, streaming_toggle, tts_toggle = UIComponents.create_control_buttons()
|
| 49 |
|
| 50 |
# Examples section
|
| 51 |
UIComponents.create_examples_section(chat_input)
|
|
|
|
| 56 |
|
| 57 |
# Event handlers
|
| 58 |
self._setup_event_handlers(
|
| 59 |
+
chatbot, chat_input, clear_btn, streaming_toggle, tts_toggle,
|
| 60 |
+
model_dropdown, selected_model, output_audio
|
| 61 |
)
|
| 62 |
|
| 63 |
def _setup_event_handlers(
|
|
|
|
| 66 |
chat_input: gr.MultimodalTextbox,
|
| 67 |
clear_btn: gr.Button,
|
| 68 |
streaming_toggle: gr.Checkbox,
|
| 69 |
+
tts_toggle: gr.Checkbox,
|
| 70 |
model_dropdown: gr.Dropdown,
|
| 71 |
+
selected_model: gr.Textbox,
|
| 72 |
+
output_audio: gr.Audio
|
| 73 |
) -> None:
|
| 74 |
"""Set up all event handlers."""
|
| 75 |
|
|
|
|
| 77 |
chat_msg = chat_input.submit(
|
| 78 |
UIHandlers.add_message_with_audio,
|
| 79 |
[chatbot, chat_input],
|
| 80 |
+
[chatbot, chat_input],
|
| 81 |
+
queue=False
|
| 82 |
)
|
| 83 |
bot_msg = chat_msg.then(
|
| 84 |
UIHandlers.dynamic_bot,
|
| 85 |
+
[chatbot, streaming_toggle, tts_toggle, selected_model],
|
| 86 |
+
[chatbot, output_audio],
|
| 87 |
api_name="bot_response"
|
| 88 |
)
|
| 89 |
bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])
|
apps/gradio-app/src/fitness_gradio/ui/components.py
CHANGED
|
@@ -158,6 +158,18 @@ class UIComponents:
|
|
| 158 |
render_markdown=True
|
| 159 |
)
|
| 160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
@staticmethod
|
| 162 |
def create_chat_input() -> gr.MultimodalTextbox:
|
| 163 |
"""Create the chat input component."""
|
|
@@ -166,16 +178,17 @@ class UIComponents:
|
|
| 166 |
file_count="multiple",
|
| 167 |
placeholder="Ask me about fitness, request a workout plan, or get meal planning advice...",
|
| 168 |
show_label=False,
|
| 169 |
-
sources=["microphone", "upload"], #
|
|
|
|
| 170 |
)
|
| 171 |
|
| 172 |
@staticmethod
|
| 173 |
def create_control_buttons() -> tuple:
|
| 174 |
"""
|
| 175 |
-
Create the control buttons (clear, streaming toggle).
|
| 176 |
|
| 177 |
Returns:
|
| 178 |
-
Tuple of (clear_btn, streaming_toggle)
|
| 179 |
"""
|
| 180 |
with gr.Row():
|
| 181 |
clear_btn = gr.Button("ποΈ Clear Conversation", variant="secondary", size="sm")
|
|
@@ -184,8 +197,13 @@ class UIComponents:
|
|
| 184 |
value=True,
|
| 185 |
info="Stream responses in real-time as the agent generates them"
|
| 186 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
|
| 188 |
-
return clear_btn, streaming_toggle
|
| 189 |
|
| 190 |
@staticmethod
|
| 191 |
def create_examples_section(chat_input: gr.MultimodalTextbox) -> gr.Examples:
|
|
|
|
| 158 |
render_markdown=True
|
| 159 |
)
|
| 160 |
|
| 161 |
+
@staticmethod
|
| 162 |
+
def create_output_audio() -> gr.Audio:
|
| 163 |
+
"""Create the output audio component for TTS responses."""
|
| 164 |
+
return gr.Audio(
|
| 165 |
+
label="π Audio Response",
|
| 166 |
+
streaming=False, # Disable streaming to avoid ffmpeg issues
|
| 167 |
+
autoplay=True,
|
| 168 |
+
show_download_button=True,
|
| 169 |
+
show_share_button=False,
|
| 170 |
+
format="wav" # Explicitly set format to WAV
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
@staticmethod
|
| 174 |
def create_chat_input() -> gr.MultimodalTextbox:
|
| 175 |
"""Create the chat input component."""
|
|
|
|
| 178 |
file_count="multiple",
|
| 179 |
placeholder="Ask me about fitness, request a workout plan, or get meal planning advice...",
|
| 180 |
show_label=False,
|
| 181 |
+
sources=["microphone", "upload"], # Enable microphone and file uploads
|
| 182 |
+
submit_btn=True, # Ensure submit button is available
|
| 183 |
)
|
| 184 |
|
| 185 |
@staticmethod
|
| 186 |
def create_control_buttons() -> tuple:
|
| 187 |
"""
|
| 188 |
+
Create the control buttons (clear, streaming toggle, TTS toggle).
|
| 189 |
|
| 190 |
Returns:
|
| 191 |
+
Tuple of (clear_btn, streaming_toggle, tts_toggle)
|
| 192 |
"""
|
| 193 |
with gr.Row():
|
| 194 |
clear_btn = gr.Button("ποΈ Clear Conversation", variant="secondary", size="sm")
|
|
|
|
| 197 |
value=True,
|
| 198 |
info="Stream responses in real-time as the agent generates them"
|
| 199 |
)
|
| 200 |
+
tts_toggle = gr.Checkbox(
|
| 201 |
+
label="π Enable Text-to-Speech",
|
| 202 |
+
value=False,
|
| 203 |
+
info="Convert AI responses to speech using Groq's TTS models"
|
| 204 |
+
)
|
| 205 |
|
| 206 |
+
return clear_btn, streaming_toggle, tts_toggle
|
| 207 |
|
| 208 |
@staticmethod
|
| 209 |
def create_examples_section(chat_input: gr.MultimodalTextbox) -> gr.Examples:
|
apps/gradio-app/src/fitness_gradio/ui/handlers.py
CHANGED
|
@@ -9,6 +9,7 @@ from typing import List, Dict, Union, Generator, Any, Tuple, Optional
|
|
| 9 |
from fitness_core.agents import FitnessAgent
|
| 10 |
from fitness_core.services import ConversationManager, AgentRunner, ResponseFormatter
|
| 11 |
from fitness_core.utils import get_logger
|
|
|
|
| 12 |
|
| 13 |
logger = get_logger(__name__)
|
| 14 |
|
|
@@ -137,60 +138,100 @@ Please check your API keys and try a different model."""
|
|
| 137 |
Tuple of (updated_history, cleared_input)
|
| 138 |
"""
|
| 139 |
try:
|
|
|
|
| 140 |
user_content_parts = []
|
|
|
|
|
|
|
| 141 |
|
| 142 |
# Handle file uploads (including audio from microphone)
|
| 143 |
if message.get("files"):
|
|
|
|
| 144 |
for file_path in message["files"]:
|
| 145 |
if file_path: # Validate file path exists
|
|
|
|
| 146 |
# Check if this is an audio file (from microphone recording)
|
| 147 |
if UIHandlers.is_audio_file(file_path):
|
| 148 |
-
logger.info(f"
|
| 149 |
# Process audio file for transcription
|
| 150 |
transcribed_text = UIHandlers.process_audio_file(file_path)
|
| 151 |
|
| 152 |
if transcribed_text and not transcribed_text.startswith("["):
|
| 153 |
-
|
| 154 |
-
display_text = f"π€ {transcribed_text}"
|
| 155 |
user_content_parts.append(transcribed_text) # Add clean text to conversation
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
history.append({
|
| 159 |
-
"role": "user",
|
| 160 |
-
"content": display_text
|
| 161 |
-
})
|
| 162 |
else:
|
| 163 |
-
#
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
})
|
| 168 |
else:
|
| 169 |
# Handle non-audio file uploads
|
| 170 |
file_content = f"[File uploaded: {file_path}]"
|
| 171 |
user_content_parts.append(file_content)
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
"content": {"path": file_path}
|
| 176 |
-
})
|
| 177 |
|
| 178 |
# Handle text input
|
|
|
|
| 179 |
if message.get("text") and message["text"].strip():
|
| 180 |
text_content = message["text"].strip()
|
| 181 |
user_content_parts.append(text_content)
|
| 182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
history.append({
|
| 184 |
-
"role": "user",
|
| 185 |
"content": text_content
|
| 186 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
|
| 188 |
# Add to conversation manager (combine all content)
|
| 189 |
if user_content_parts:
|
| 190 |
combined_content = "\n".join(user_content_parts)
|
| 191 |
conversation_manager.add_user_message(combined_content)
|
| 192 |
-
logger.info(f"Added user message to conversation. {conversation_manager.get_history_summary()}")
|
|
|
|
|
|
|
| 193 |
|
|
|
|
| 194 |
return history, gr.MultimodalTextbox(value=None, interactive=False)
|
| 195 |
|
| 196 |
except Exception as e:
|
|
@@ -264,17 +305,19 @@ Please check your API keys and try a different model."""
|
|
| 264 |
@staticmethod
|
| 265 |
def bot_with_real_streaming(
|
| 266 |
history: List[Dict],
|
| 267 |
-
model_name: str = None
|
| 268 |
-
|
|
|
|
| 269 |
"""
|
| 270 |
Bot function with real-time streaming from the agent
|
| 271 |
|
| 272 |
Args:
|
| 273 |
history: Current Gradio chat history (for display only)
|
| 274 |
model_name: Model to use for the agent
|
|
|
|
| 275 |
|
| 276 |
Yields:
|
| 277 |
-
Updated history
|
| 278 |
"""
|
| 279 |
try:
|
| 280 |
# Get agent instance with specified model
|
|
@@ -294,15 +337,18 @@ Please check your API keys and try a different model."""
|
|
| 294 |
try:
|
| 295 |
content_chunks = []
|
| 296 |
final_result = None
|
|
|
|
| 297 |
|
| 298 |
for chunk in AgentRunner.run_agent_with_streaming_sync(agent, agent_input):
|
| 299 |
if chunk['type'] == 'final_result':
|
| 300 |
final_result = chunk['result']
|
| 301 |
if chunk['content']:
|
| 302 |
content_chunks.append(chunk['content'])
|
|
|
|
| 303 |
elif chunk['type'] == 'error':
|
| 304 |
final_result = chunk['result']
|
| 305 |
content_chunks.append(chunk['content'])
|
|
|
|
| 306 |
|
| 307 |
# Update conversation manager
|
| 308 |
if final_result:
|
|
@@ -313,34 +359,50 @@ Please check your API keys and try a different model."""
|
|
| 313 |
if content_chunks:
|
| 314 |
for content in content_chunks:
|
| 315 |
history[-1]["content"] = content
|
| 316 |
-
yield history
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
else:
|
| 318 |
-
|
| 319 |
-
|
|
|
|
| 320 |
|
| 321 |
except Exception as e:
|
| 322 |
logger.error(f"Error in streaming execution: {str(e)}")
|
| 323 |
-
|
| 324 |
-
|
|
|
|
| 325 |
|
| 326 |
except Exception as e:
|
| 327 |
logger.error(f"Bot streaming function error: {str(e)}")
|
| 328 |
if len(history) == 0 or history[-1].get("role") != "assistant":
|
| 329 |
history.append({"role": "assistant", "content": ""})
|
| 330 |
history[-1]["content"] = "I apologize, but I'm experiencing technical difficulties. Please try again in a moment."
|
| 331 |
-
yield history
|
| 332 |
|
| 333 |
@staticmethod
|
| 334 |
-
def bot(history: List[Dict], model_name: str = None) -> Generator[List[Dict], None, None]:
|
| 335 |
"""
|
| 336 |
Main bot function with simulated streaming
|
| 337 |
|
| 338 |
Args:
|
| 339 |
history: Current Gradio chat history (for display only)
|
| 340 |
model_name: Model to use for the agent
|
|
|
|
| 341 |
|
| 342 |
Yields:
|
| 343 |
-
Updated history
|
| 344 |
"""
|
| 345 |
try:
|
| 346 |
# Get agent instance with specified model
|
|
@@ -361,36 +423,50 @@ Please check your API keys and try a different model."""
|
|
| 361 |
response = ResponseFormatter.extract_response_content(result)
|
| 362 |
|
| 363 |
# Stream the response with simulated typing
|
| 364 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
|
| 366 |
except Exception as e:
|
| 367 |
logger.error(f"Bot function error: {str(e)}")
|
| 368 |
error_response = "I apologize, but I'm experiencing technical difficulties. Please try again in a moment."
|
| 369 |
-
|
|
|
|
| 370 |
|
| 371 |
@staticmethod
|
| 372 |
def dynamic_bot(
|
| 373 |
history: List[Dict],
|
| 374 |
-
use_real_streaming: bool = True,
|
|
|
|
| 375 |
model_name: str = None
|
| 376 |
-
) -> Generator[List[Dict], None, None]:
|
| 377 |
"""
|
| 378 |
-
Dynamic bot function that can switch between streaming modes
|
| 379 |
|
| 380 |
Args:
|
| 381 |
history: Current Gradio chat history (for display only)
|
| 382 |
use_real_streaming: Whether to use real-time streaming from agent
|
|
|
|
| 383 |
model_name: Model to use for the agent
|
| 384 |
|
| 385 |
Yields:
|
| 386 |
-
Updated history
|
| 387 |
"""
|
| 388 |
if use_real_streaming:
|
| 389 |
logger.info("Using real-time streaming mode")
|
| 390 |
-
yield from UIHandlers.bot_with_real_streaming(history, model_name)
|
| 391 |
else:
|
| 392 |
logger.info("Using simulated streaming mode")
|
| 393 |
-
yield from UIHandlers.bot(history, model_name)
|
| 394 |
|
| 395 |
@staticmethod
|
| 396 |
def clear_conversation() -> List[Dict]:
|
|
@@ -404,3 +480,41 @@ Please check your API keys and try a different model."""
|
|
| 404 |
conversation_manager.clear_history()
|
| 405 |
logger.info("Conversation history cleared")
|
| 406 |
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
from fitness_core.agents import FitnessAgent
|
| 10 |
from fitness_core.services import ConversationManager, AgentRunner, ResponseFormatter
|
| 11 |
from fitness_core.utils import get_logger
|
| 12 |
+
from .tts_utils import generate_speech_for_text, generate_speech_for_session, clean_tts_markup
|
| 13 |
|
| 14 |
logger = get_logger(__name__)
|
| 15 |
|
|
|
|
| 138 |
Tuple of (updated_history, cleared_input)
|
| 139 |
"""
|
| 140 |
try:
|
| 141 |
+
logger.info(f"Processing message: {message}")
|
| 142 |
user_content_parts = []
|
| 143 |
+
has_audio_content = False
|
| 144 |
+
audio_transcription = None
|
| 145 |
|
| 146 |
# Handle file uploads (including audio from microphone)
|
| 147 |
if message.get("files"):
|
| 148 |
+
logger.info(f"Found {len(message['files'])} files in message")
|
| 149 |
for file_path in message["files"]:
|
| 150 |
if file_path: # Validate file path exists
|
| 151 |
+
logger.info(f"Processing file: {file_path}")
|
| 152 |
# Check if this is an audio file (from microphone recording)
|
| 153 |
if UIHandlers.is_audio_file(file_path):
|
| 154 |
+
logger.info(f"Detected audio file: {file_path}")
|
| 155 |
# Process audio file for transcription
|
| 156 |
transcribed_text = UIHandlers.process_audio_file(file_path)
|
| 157 |
|
| 158 |
if transcribed_text and not transcribed_text.startswith("["):
|
| 159 |
+
audio_transcription = transcribed_text
|
|
|
|
| 160 |
user_content_parts.append(transcribed_text) # Add clean text to conversation
|
| 161 |
+
has_audio_content = True
|
| 162 |
+
logger.info(f"Successfully transcribed audio: '{transcribed_text[:50]}...'")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
else:
|
| 164 |
+
# Handle transcription error
|
| 165 |
+
audio_transcription = transcribed_text
|
| 166 |
+
has_audio_content = True # Still mark as audio content even if failed
|
| 167 |
+
logger.warning(f"Audio transcription failed: {transcribed_text}")
|
|
|
|
| 168 |
else:
|
| 169 |
# Handle non-audio file uploads
|
| 170 |
file_content = f"[File uploaded: {file_path}]"
|
| 171 |
user_content_parts.append(file_content)
|
| 172 |
+
logger.info(f"Added file upload to content: {file_path}")
|
| 173 |
+
else:
|
| 174 |
+
logger.info("No files found in message")
|
|
|
|
|
|
|
| 175 |
|
| 176 |
# Handle text input
|
| 177 |
+
text_content = None
|
| 178 |
if message.get("text") and message["text"].strip():
|
| 179 |
text_content = message["text"].strip()
|
| 180 |
user_content_parts.append(text_content)
|
| 181 |
+
logger.info(f"Found text content: '{text_content[:50]}...'")
|
| 182 |
+
else:
|
| 183 |
+
logger.info("No text content found in message")
|
| 184 |
+
|
| 185 |
+
# Add appropriate message to chat history
|
| 186 |
+
if has_audio_content and audio_transcription:
|
| 187 |
+
if audio_transcription.startswith("["):
|
| 188 |
+
# Transcription error - show error message
|
| 189 |
+
display_text = f"π€ {audio_transcription}"
|
| 190 |
+
else:
|
| 191 |
+
# Successful transcription - show with microphone icon
|
| 192 |
+
display_text = f"π€ {audio_transcription}"
|
| 193 |
+
|
| 194 |
+
history.append({
|
| 195 |
+
"role": "user",
|
| 196 |
+
"content": display_text
|
| 197 |
+
})
|
| 198 |
+
logger.info(f"Added audio message to chat history: '{display_text}'")
|
| 199 |
+
|
| 200 |
+
# If there's also text content, add it separately
|
| 201 |
+
if text_content:
|
| 202 |
+
history.append({
|
| 203 |
+
"role": "user",
|
| 204 |
+
"content": text_content
|
| 205 |
+
})
|
| 206 |
+
logger.info(f"Added additional text content to history: '{text_content[:50]}...'")
|
| 207 |
+
|
| 208 |
+
elif text_content:
|
| 209 |
+
# Only text content, no audio
|
| 210 |
history.append({
|
| 211 |
+
"role": "user",
|
| 212 |
"content": text_content
|
| 213 |
})
|
| 214 |
+
logger.info(f"Added text-only message to chat history: '{text_content[:50]}...'")
|
| 215 |
+
|
| 216 |
+
elif message.get("files") and not has_audio_content:
|
| 217 |
+
# File uploads that aren't audio
|
| 218 |
+
for file_path in message["files"]:
|
| 219 |
+
if file_path and not UIHandlers.is_audio_file(file_path):
|
| 220 |
+
history.append({
|
| 221 |
+
"role": "user",
|
| 222 |
+
"content": {"path": file_path}
|
| 223 |
+
})
|
| 224 |
+
logger.info(f"Added file upload to history: {file_path}")
|
| 225 |
|
| 226 |
# Add to conversation manager (combine all content)
|
| 227 |
if user_content_parts:
|
| 228 |
combined_content = "\n".join(user_content_parts)
|
| 229 |
conversation_manager.add_user_message(combined_content)
|
| 230 |
+
logger.info(f"Added user message to conversation manager. Content parts: {len(user_content_parts)}, Combined: '{combined_content[:100]}...', {conversation_manager.get_history_summary()}")
|
| 231 |
+
else:
|
| 232 |
+
logger.warning("No user content parts found in message - this may indicate an issue")
|
| 233 |
|
| 234 |
+
logger.info(f"Final history length: {len(history)}")
|
| 235 |
return history, gr.MultimodalTextbox(value=None, interactive=False)
|
| 236 |
|
| 237 |
except Exception as e:
|
|
|
|
| 305 |
@staticmethod
|
| 306 |
def bot_with_real_streaming(
|
| 307 |
history: List[Dict],
|
| 308 |
+
model_name: str = None,
|
| 309 |
+
use_tts: bool = False
|
| 310 |
+
) -> Generator[Tuple[List[Dict], Optional[str]], None, None]:
|
| 311 |
"""
|
| 312 |
Bot function with real-time streaming from the agent
|
| 313 |
|
| 314 |
Args:
|
| 315 |
history: Current Gradio chat history (for display only)
|
| 316 |
model_name: Model to use for the agent
|
| 317 |
+
use_tts: Whether to generate text-to-speech for the response
|
| 318 |
|
| 319 |
Yields:
|
| 320 |
+
Tuple of (Updated history, audio_file_path or None)
|
| 321 |
"""
|
| 322 |
try:
|
| 323 |
# Get agent instance with specified model
|
|
|
|
| 337 |
try:
|
| 338 |
content_chunks = []
|
| 339 |
final_result = None
|
| 340 |
+
final_content = ""
|
| 341 |
|
| 342 |
for chunk in AgentRunner.run_agent_with_streaming_sync(agent, agent_input):
|
| 343 |
if chunk['type'] == 'final_result':
|
| 344 |
final_result = chunk['result']
|
| 345 |
if chunk['content']:
|
| 346 |
content_chunks.append(chunk['content'])
|
| 347 |
+
final_content = chunk['content']
|
| 348 |
elif chunk['type'] == 'error':
|
| 349 |
final_result = chunk['result']
|
| 350 |
content_chunks.append(chunk['content'])
|
| 351 |
+
final_content = chunk['content']
|
| 352 |
|
| 353 |
# Update conversation manager
|
| 354 |
if final_result:
|
|
|
|
| 359 |
if content_chunks:
|
| 360 |
for content in content_chunks:
|
| 361 |
history[-1]["content"] = content
|
| 362 |
+
yield history, None # No audio during streaming
|
| 363 |
+
final_content = content
|
| 364 |
+
|
| 365 |
+
# Generate TTS for the final response if enabled
|
| 366 |
+
if use_tts and final_content:
|
| 367 |
+
audio_file = UIHandlers._generate_tts_for_response_sync(final_content)
|
| 368 |
+
if audio_file:
|
| 369 |
+
# Return the final history with the audio file
|
| 370 |
+
yield history, audio_file
|
| 371 |
+
else:
|
| 372 |
+
yield history, None
|
| 373 |
+
else:
|
| 374 |
+
yield history, None
|
| 375 |
+
|
| 376 |
else:
|
| 377 |
+
error_msg = "I apologize, but I didn't receive a response. Please try again."
|
| 378 |
+
history[-1]["content"] = error_msg
|
| 379 |
+
yield history, None
|
| 380 |
|
| 381 |
except Exception as e:
|
| 382 |
logger.error(f"Error in streaming execution: {str(e)}")
|
| 383 |
+
error_msg = f"Sorry, I encountered an error while processing your request: {str(e)}"
|
| 384 |
+
history[-1]["content"] = error_msg
|
| 385 |
+
yield history, None
|
| 386 |
|
| 387 |
except Exception as e:
|
| 388 |
logger.error(f"Bot streaming function error: {str(e)}")
|
| 389 |
if len(history) == 0 or history[-1].get("role") != "assistant":
|
| 390 |
history.append({"role": "assistant", "content": ""})
|
| 391 |
history[-1]["content"] = "I apologize, but I'm experiencing technical difficulties. Please try again in a moment."
|
| 392 |
+
yield history, None
|
| 393 |
|
| 394 |
@staticmethod
|
| 395 |
+
def bot(history: List[Dict], model_name: str = None, use_tts: bool = False) -> Generator[Tuple[List[Dict], Optional[str]], None, None]:
|
| 396 |
"""
|
| 397 |
Main bot function with simulated streaming
|
| 398 |
|
| 399 |
Args:
|
| 400 |
history: Current Gradio chat history (for display only)
|
| 401 |
model_name: Model to use for the agent
|
| 402 |
+
use_tts: Whether to generate text-to-speech for the response
|
| 403 |
|
| 404 |
Yields:
|
| 405 |
+
Tuple of (Updated history, audio_file_path or None)
|
| 406 |
"""
|
| 407 |
try:
|
| 408 |
# Get agent instance with specified model
|
|
|
|
| 423 |
response = ResponseFormatter.extract_response_content(result)
|
| 424 |
|
| 425 |
# Stream the response with simulated typing
|
| 426 |
+
for updated_history in ResponseFormatter.stream_response(response, history):
|
| 427 |
+
yield updated_history, None # No audio during streaming
|
| 428 |
+
|
| 429 |
+
# Generate TTS for the final response if enabled
|
| 430 |
+
if use_tts and response:
|
| 431 |
+
audio_file = UIHandlers._generate_tts_for_response_sync(response)
|
| 432 |
+
if audio_file:
|
| 433 |
+
yield history, audio_file
|
| 434 |
+
else:
|
| 435 |
+
yield history, None
|
| 436 |
+
else:
|
| 437 |
+
yield history, None
|
| 438 |
|
| 439 |
except Exception as e:
|
| 440 |
logger.error(f"Bot function error: {str(e)}")
|
| 441 |
error_response = "I apologize, but I'm experiencing technical difficulties. Please try again in a moment."
|
| 442 |
+
for updated_history in ResponseFormatter.stream_response(error_response, history):
|
| 443 |
+
yield updated_history, None
|
| 444 |
|
| 445 |
@staticmethod
|
| 446 |
def dynamic_bot(
|
| 447 |
history: List[Dict],
|
| 448 |
+
use_real_streaming: bool = True,
|
| 449 |
+
use_tts: bool = False,
|
| 450 |
model_name: str = None
|
| 451 |
+
) -> Generator[Tuple[List[Dict], Optional[str]], None, None]:
|
| 452 |
"""
|
| 453 |
+
Dynamic bot function that can switch between streaming modes and TTS
|
| 454 |
|
| 455 |
Args:
|
| 456 |
history: Current Gradio chat history (for display only)
|
| 457 |
use_real_streaming: Whether to use real-time streaming from agent
|
| 458 |
+
use_tts: Whether to generate text-to-speech for the response
|
| 459 |
model_name: Model to use for the agent
|
| 460 |
|
| 461 |
Yields:
|
| 462 |
+
Tuple of (Updated history, audio_file_path or None)
|
| 463 |
"""
|
| 464 |
if use_real_streaming:
|
| 465 |
logger.info("Using real-time streaming mode")
|
| 466 |
+
yield from UIHandlers.bot_with_real_streaming(history, model_name, use_tts)
|
| 467 |
else:
|
| 468 |
logger.info("Using simulated streaming mode")
|
| 469 |
+
yield from UIHandlers.bot(history, model_name, use_tts)
|
| 470 |
|
| 471 |
@staticmethod
|
| 472 |
def clear_conversation() -> List[Dict]:
|
|
|
|
| 480 |
conversation_manager.clear_history()
|
| 481 |
logger.info("Conversation history cleared")
|
| 482 |
return []
|
| 483 |
+
|
| 484 |
+
@staticmethod
|
| 485 |
+
def _generate_tts_for_response_sync(text: str) -> Optional[str]:
|
| 486 |
+
"""
|
| 487 |
+
Generate TTS audio for a response text synchronously.
|
| 488 |
+
|
| 489 |
+
Args:
|
| 490 |
+
text: The text to convert to speech
|
| 491 |
+
|
| 492 |
+
Returns:
|
| 493 |
+
Path to generated audio file or None if error
|
| 494 |
+
"""
|
| 495 |
+
try:
|
| 496 |
+
if not text or not text.strip():
|
| 497 |
+
return None
|
| 498 |
+
|
| 499 |
+
# Clean the text for TTS
|
| 500 |
+
clean_text = clean_tts_markup(text)
|
| 501 |
+
|
| 502 |
+
# Limit text length for TTS (Groq has 10K char limit)
|
| 503 |
+
if len(clean_text) > 8000: # Leave some buffer
|
| 504 |
+
clean_text = clean_text[:8000] + "..."
|
| 505 |
+
logger.info(f"Truncated TTS text to 8000 characters")
|
| 506 |
+
|
| 507 |
+
logger.info(f"Generating TTS for response ({len(clean_text)} chars)")
|
| 508 |
+
|
| 509 |
+
# Generate TTS using session persistence
|
| 510 |
+
audio_file = generate_speech_for_session(clean_text)
|
| 511 |
+
if audio_file:
|
| 512 |
+
logger.info(f"TTS audio generated: {audio_file}")
|
| 513 |
+
return audio_file
|
| 514 |
+
else:
|
| 515 |
+
logger.warning("Failed to generate TTS audio")
|
| 516 |
+
return None
|
| 517 |
+
|
| 518 |
+
except Exception as e:
|
| 519 |
+
logger.error(f"TTS generation error: {str(e)}")
|
| 520 |
+
return None
|
apps/gradio-app/src/fitness_gradio/ui/styles.py
CHANGED
|
@@ -53,6 +53,30 @@ MAIN_CSS = """
|
|
| 53 |
font-weight: bold;
|
| 54 |
}
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
/* Ensure all text in model-info respects dark theme */
|
| 57 |
.model-info * {
|
| 58 |
color: inherit !important;
|
|
@@ -88,7 +112,9 @@ HELP_CONTENT = """
|
|
| 88 |
**How to interact:**
|
| 89 |
- **π¬ Type messages** in the text box
|
| 90 |
- **π€ Record voice messages** using the circular microphone button (requires Groq API key)
|
| 91 |
-
-
|
|
|
|
|
|
|
| 92 |
|
| 93 |
**To get the best results:**
|
| 94 |
- Tell me your fitness level (beginner, intermediate, advanced)
|
|
@@ -98,6 +124,12 @@ HELP_CONTENT = """
|
|
| 98 |
|
| 99 |
**Voice Input Setup:**
|
| 100 |
- Set your `GROQ_API_KEY` environment variable to enable voice transcription
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
- Click the circular microphone icon in the input box and speak your message
|
| 102 |
- The system will convert your speech to text automatically using Groq's Whisper
|
| 103 |
|
|
|
|
| 53 |
font-weight: bold;
|
| 54 |
}
|
| 55 |
|
| 56 |
+
/* TTS control styling */
|
| 57 |
+
.tts-checkbox {
|
| 58 |
+
background: linear-gradient(135deg, rgba(99, 102, 241, 0.1), rgba(139, 92, 246, 0.1)) !important;
|
| 59 |
+
border: 1px solid rgba(99, 102, 241, 0.3) !important;
|
| 60 |
+
border-radius: 8px !important;
|
| 61 |
+
padding: 8px !important;
|
| 62 |
+
transition: all 0.3s ease !important;
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
.tts-checkbox:hover {
|
| 66 |
+
background: linear-gradient(135deg, rgba(99, 102, 241, 0.2), rgba(139, 92, 246, 0.2)) !important;
|
| 67 |
+
border-color: rgba(99, 102, 241, 0.5) !important;
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
.tts-active {
|
| 71 |
+
animation: pulse-tts 2s infinite;
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
@keyframes pulse-tts {
|
| 75 |
+
0% { box-shadow: 0 0 0 0 rgba(99, 102, 241, 0.7); }
|
| 76 |
+
70% { box-shadow: 0 0 0 10px rgba(99, 102, 241, 0); }
|
| 77 |
+
100% { box-shadow: 0 0 0 0 rgba(99, 102, 241, 0); }
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
/* Ensure all text in model-info respects dark theme */
|
| 81 |
.model-info * {
|
| 82 |
color: inherit !important;
|
|
|
|
| 112 |
**How to interact:**
|
| 113 |
- **π¬ Type messages** in the text box
|
| 114 |
- **π€ Record voice messages** using the circular microphone button (requires Groq API key)
|
| 115 |
+
- **🔊 Enable Text-to-Speech** to hear AI responses spoken aloud (requires Groq API key)
|
| 116 |
+
- **π Enable Real-time Streaming** for faster response display
|
| 117 |
+
- **📎 Upload files** if needed for context
|
| 118 |
|
| 119 |
**To get the best results:**
|
| 120 |
- Tell me your fitness level (beginner, intermediate, advanced)
|
|
|
|
| 124 |
|
| 125 |
**Voice Input Setup:**
|
| 126 |
- Set your `GROQ_API_KEY` environment variable to enable voice transcription
|
| 127 |
+
|
| 128 |
+
**Text-to-Speech Setup:**
|
| 129 |
+
- Set your `GROQ_API_KEY` environment variable to enable audio generation
|
| 130 |
+
- Choose from 19 English voices or 4 Arabic voices
|
| 131 |
+
- Audio is automatically generated when TTS is enabled
|
| 132 |
+
- Responses are cleaned of markdown formatting for better speech quality
|
| 133 |
- Click the circular microphone icon in the input box and speak your message
|
| 134 |
- The system will convert your speech to text automatically using Groq's Whisper
|
| 135 |
|
apps/gradio-app/src/fitness_gradio/ui/tts_utils.py
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Text-to-Speech utilities using Groq's TTS models.
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
import tempfile
|
| 6 |
+
import logging
|
| 7 |
+
import requests
|
| 8 |
+
from typing import Optional, Union
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
from fitness_core.utils import get_logger
|
| 12 |
+
|
| 13 |
+
logger = get_logger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class GroqTTS:
    """Groq PlayAI text-to-speech service wrapper.

    Wraps Groq's ``/openai/v1/audio/speech`` endpoint to synthesize WAV
    audio from text and manages the temporary files the audio is written to.
    """

    # Available English voices for the "playai-tts" model.
    ENGLISH_VOICES = [
        "Arista-PlayAI", "Atlas-PlayAI", "Basil-PlayAI", "Briggs-PlayAI",
        "Calum-PlayAI", "Celeste-PlayAI", "Cheyenne-PlayAI", "Chip-PlayAI",
        "Cillian-PlayAI", "Deedee-PlayAI", "Fritz-PlayAI", "Gail-PlayAI",
        "Indigo-PlayAI", "Mamaw-PlayAI", "Mason-PlayAI", "Mikail-PlayAI",
        "Mitch-PlayAI", "Quinn-PlayAI", "Thunder-PlayAI"
    ]

    # Available Arabic voices for the "playai-tts-arabic" model.
    ARABIC_VOICES = [
        "Ahmad-PlayAI", "Amira-PlayAI", "Khalid-PlayAI", "Nasser-PlayAI"
    ]

    # Default voice selections (pleasant female voices).
    DEFAULT_ENGLISH_VOICE = "Celeste-PlayAI"
    DEFAULT_ARABIC_VOICE = "Amira-PlayAI"

    # Groq rejects TTS inputs longer than 10K characters.
    MAX_TEXT_LENGTH = 10000

    # Seconds to wait on the HTTP API before giving up; without a timeout a
    # stalled request would hang the caller indefinitely.
    REQUEST_TIMEOUT = 60

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the GroqTTS client.

        Args:
            api_key: Groq API key. If None, will try to get from GROQ_API_KEY env var.

        Raises:
            ValueError: If no API key is provided or found in the environment.
        """
        self.api_key = api_key or os.getenv("GROQ_API_KEY")
        if not self.api_key:
            raise ValueError(
                "Groq API key is required. Set GROQ_API_KEY environment variable "
                "or pass api_key parameter."
            )

        # Base scratch directory for generated audio.
        self.temp_dir = Path(tempfile.gettempdir()) / "fitness_app_tts"
        self.temp_dir.mkdir(parents=True, exist_ok=True)

        # Session-specific subdirectory so chat audio persists while the app runs.
        self.session_dir = self.temp_dir / "session_audio"
        self.session_dir.mkdir(exist_ok=True)

    def text_to_speech(
        self,
        text: str,
        voice: Optional[str] = None,
        model: str = "playai-tts",
        response_format: str = "wav",
        output_file: Optional[Union[str, Path]] = None
    ) -> Optional[str]:
        """
        Convert text to speech using Groq's TTS API.

        Args:
            text: Text to convert to speech (max 10K characters; longer input is truncated).
            voice: Voice to use. If None, uses default voice based on model.
            model: TTS model to use ("playai-tts" or "playai-tts-arabic").
            response_format: Audio format ("wav").
            output_file: Path to save audio file. If None, a unique file is
                created in the session directory.

        Returns:
            Path to the generated audio file, or None on any error
            (errors are logged, never raised).
        """
        try:
            # Enforce the API's input-length limit rather than letting the call fail.
            if len(text) > self.MAX_TEXT_LENGTH:
                logger.warning(f"Text too long ({len(text)} chars), truncating to 10K characters")
                text = text[:self.MAX_TEXT_LENGTH]

            # Pick a sensible default voice for the chosen model.
            if voice is None:
                voice = self.DEFAULT_ARABIC_VOICE if model == "playai-tts-arabic" else self.DEFAULT_ENGLISH_VOICE

            # Fall back to the default when the requested voice doesn't belong to the model.
            if model == "playai-tts" and voice not in self.ENGLISH_VOICES:
                logger.warning(f"Voice {voice} not valid for English model, using default")
                voice = self.DEFAULT_ENGLISH_VOICE
            elif model == "playai-tts-arabic" and voice not in self.ARABIC_VOICES:
                logger.warning(f"Voice {voice} not valid for Arabic model, using default")
                voice = self.DEFAULT_ARABIC_VOICE

            if output_file is None:
                # Deterministic content hash + millisecond timestamp keeps file
                # names unique without relying on Python's per-process salted
                # hash() (which varies between runs under PYTHONHASHSEED).
                import time
                import hashlib
                timestamp = int(time.time() * 1000)
                text_hash = hashlib.md5(text.encode("utf-8")).hexdigest()[:8]
                output_file = self.session_dir / f"tts_output_{text_hash}_{timestamp}.wav"
            else:
                output_file = Path(output_file)

            logger.info(f"Generating TTS for {len(text)} chars using {model} with {voice}")

            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            }
            payload = {
                "model": model,
                "input": text,
                "voice": voice,
                "response_format": response_format,
            }

            # Time-box the request so a stalled API call cannot hang the UI.
            response = requests.post(
                "https://api.groq.com/openai/v1/audio/speech",
                headers=headers,
                json=payload,
                timeout=self.REQUEST_TIMEOUT,
            )

            if response.status_code == 200:
                # Write the raw audio bytes to disk.
                with open(output_file, "wb") as f:
                    f.write(response.content)
                logger.info(f"TTS audio saved to: {output_file}")
                return str(output_file)

            logger.error(f"TTS API error: {response.status_code} - {response.text}")
            return None

        except Exception as e:
            logger.error(f"Error generating TTS: {str(e)}")
            return None

    def cleanup_temp_files(self, max_age_hours: int = 24) -> None:
        """
        Clean up old temporary audio files.

        Scans the scratch directory recursively so session audio files
        (written under the session_audio subdirectory) are removed too.

        Args:
            max_age_hours: Delete files older than this many hours.
        """
        try:
            import time
            cutoff = time.time() - max_age_hours * 3600

            # rglob (not glob) so files under session_audio/ are included;
            # the previous non-recursive glob never matched them, leaking
            # every file text_to_speech wrote to the session directory.
            for file_path in self.temp_dir.rglob("*.wav"):
                if file_path.stat().st_mtime < cutoff:
                    file_path.unlink()
                    logger.debug(f"Deleted old TTS file: {file_path}")

        except Exception as e:
            logger.error(f"Error cleaning up temp files: {str(e)}")

    @classmethod
    def get_available_voices(cls, model: str = "playai-tts") -> list[str]:
        """
        Get list of available voices for a model.

        Args:
            model: Model name ("playai-tts" or "playai-tts-arabic").

        Returns:
            List of available voice names (a copy; safe for callers to mutate).
        """
        if model == "playai-tts-arabic":
            return cls.ARABIC_VOICES.copy()
        else:
            return cls.ENGLISH_VOICES.copy()
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
# Global TTS instance (lazy initialized)
_tts_instance: Optional[GroqTTS] = None


def get_tts_instance() -> Optional[GroqTTS]:
    """Return the lazily-created shared GroqTTS client.

    Returns:
        The module-wide GroqTTS instance, or None when it cannot be
        created (no API key configured).
    """
    global _tts_instance

    # Fast path: already created on a previous call.
    if _tts_instance is not None:
        return _tts_instance

    try:
        _tts_instance = GroqTTS()
    except ValueError as exc:
        # GroqTTS raises ValueError when no API key is available.
        logger.warning(f"TTS not available: {str(exc)}")
        return None

    return _tts_instance
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
def generate_speech_for_text(text: str, voice: Optional[str] = None) -> Optional[str]:
    """Generate speech audio for *text* via the shared TTS client.

    Args:
        text: Text to convert to speech.
        voice: Optional voice name; the model default is used when omitted.

    Returns:
        Path to the generated audio file, or None when TTS is unavailable
        or generation fails.
    """
    tts = get_tts_instance()
    return tts.text_to_speech(text, voice=voice) if tts is not None else None
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
def generate_speech_for_session(text: str, voice: Optional[str] = None) -> Optional[str]:
    """Generate speech for text with session persistence for the chat interface.

    Args:
        text: Text to convert to speech.
        voice: Optional voice name; the model default is used when omitted.

    Returns:
        Path to an audio file that persists for the session, or None on error.
    """
    tts = get_tts_instance()
    if tts is None:
        return None

    # text_to_speech already writes into the session directory by default.
    generated = tts.text_to_speech(text, voice=voice)
    if not generated:
        return None

    # Only hand back paths that actually exist on disk.
    if Path(generated).exists():
        logger.info(f"Session TTS audio available at: {generated}")
        return str(generated)

    return None
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def clean_tts_markup(text: str) -> str:
    """Strip markdown markup from text so it reads naturally when spoken.

    Args:
        text: Text that may contain markdown formatting.

    Returns:
        Clean text suitable for TTS.
    """
    import re

    # (pattern, replacement, flags) applied in order; ordering matters
    # (e.g. bold must be stripped before italics).
    substitutions = (
        (r'\*\*(.*?)\*\*', r'\1', 0),            # bold
        (r'\*(.*?)\*', r'\1', 0),                # italics
        (r'`(.*?)`', r'\1', 0),                  # inline code
        (r'```.*?```', '', re.DOTALL),           # fenced code blocks
        (r'#{1,6}\s*(.*)', r'\1', 0),            # headers
        (r'\[([^\]]+)\]\([^\)]+\)', r'\1', 0),   # links -> link text
        (r'!\[([^\]]*)\]\([^\)]+\)', '', 0),     # images
        (r'^[-*+]\s+', '', re.MULTILINE),        # bullet lists
        (r'^\d+\.\s+', '', re.MULTILINE),        # numbered lists
        (r'^>\s+', '', re.MULTILINE),            # block quotes
        (r'\n\s*\n', '\n\n', 0),                 # collapse blank-line runs
        (r'[ \t]+', ' ', 0),                     # collapse spaces/tabs
    )

    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)

    return text.strip()
|
apps/gradio-app/test_audio_debug.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Debug script to test audio transcription functionality
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
import logging
|
| 8 |
+
|
| 9 |
+
# Add the src directory to the Python path
|
| 10 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
|
| 11 |
+
|
| 12 |
+
from fitness_gradio.ui.handlers import UIHandlers
|
| 13 |
+
|
| 14 |
+
# Set up logging
|
| 15 |
+
logging.basicConfig(level=logging.INFO)
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
def test_audio_message_processing():
    """Exercise UIHandlers.add_message_with_audio with text and audio inputs,
    then sanity-check audio-file extension detection."""

    # Replace with a real recording path to exercise the audio branch.
    sample_audio_path = "test_audio.wav"

    text_only = {"files": [], "text": "Hello, this is a test message"}
    audio_only = {"files": [sample_audio_path], "text": ""}

    print("Testing audio message processing...")

    # Text-only path should always be runnable.
    print("\n=== Testing text-only message ===")
    try:
        history, input_state = UIHandlers.add_message_with_audio([], text_only)
        print(f"History after text message: {history}")
        print(f"Input state: {input_state}")
    except Exception as e:
        print(f"Error with text message: {e}")

    # Audio path only runs when a sample file is actually present.
    if os.path.exists(sample_audio_path):
        print("\n=== Testing audio-only message ===")
        try:
            history, input_state = UIHandlers.add_message_with_audio([], audio_only)
            print(f"History after audio message: {history}")
            print(f"Input state: {input_state}")
        except Exception as e:
            print(f"Error with audio message: {e}")
    else:
        print(f"\n=== Skipping audio test (file {sample_audio_path} not found) ===")

    # Extension-based audio detection across common formats.
    print("\n=== Testing audio file detection ===")
    candidates = ("test.wav", "test.mp3", "test.m4a", "test.txt", "test.jpg", "test.webm")
    for candidate in candidates:
        verdict = "Audio" if UIHandlers.is_audio_file(candidate) else "Not audio"
        print(f"{candidate}: {verdict}")


if __name__ == "__main__":
    test_audio_message_processing()
|
apps/gradio-app/test_tts.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Test TTS functionality
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
from fitness_gradio.ui.tts_utils import GroqTTS, generate_speech_for_text, clean_tts_markup
|
| 6 |
+
|
| 7 |
+
def test_tts_setup():
    """Smoke-test TTS configuration: API key, client creation, text cleaning, voice lists.

    Returns:
        True when every check passes, False otherwise.
    """
    print("Testing TTS setup...")

    # TTS requires a Groq API key; bail out early with guidance when missing.
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        print("❌ GROQ_API_KEY not found in environment variables")
        print("Please set GROQ_API_KEY to test TTS functionality")
        return False

    print("✅ GROQ_API_KEY found")

    # Instantiating the client validates the key is wired through correctly.
    try:
        GroqTTS()
        print("✅ GroqTTS instance created successfully")
    except Exception as e:
        print(f"❌ Failed to create GroqTTS instance: {e}")
        return False

    # Verify markdown is stripped before text is sent to the TTS engine.
    test_text = "**Hello** this is a *test* with `code` and [links](http://example.com)"
    cleaned = clean_tts_markup(test_text)
    print("Text cleaning test:")
    print(f"  Original: {test_text}")
    print(f"  Cleaned: {cleaned}")

    # Both models should expose their documented voice rosters.
    english_voices = GroqTTS.get_available_voices("playai-tts")
    arabic_voices = GroqTTS.get_available_voices("playai-tts-arabic")
    print(f"✅ Available English voices: {len(english_voices)}")
    print(f"✅ Available Arabic voices: {len(arabic_voices)}")

    print("🎉 All TTS setup tests passed!")
    print("Note: Actual TTS generation will be tested when the UI is used with a valid API key.")
    return True


if __name__ == "__main__":
    test_tts_setup()
|
shared/src/fitness_core/agents/providers.py
CHANGED
|
@@ -65,7 +65,10 @@ class ModelProvider:
|
|
| 65 |
# Whisper models (Speech-to-Text)
|
| 66 |
"whisper-large-v3": "litellm/groq/whisper-large-v3", # Whisper Large v3
|
| 67 |
"whisper-large-v3-turbo": "litellm/groq/whisper-large-v3-turbo", # Whisper Large v3 Turbo
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
| 69 |
}
|
| 70 |
|
| 71 |
@classmethod
|
|
@@ -102,6 +105,8 @@ class ModelProvider:
|
|
| 102 |
"kimi-k2-instruct": "Moonshot Kimi K2 MoE - 1T parameters with tool use (Moonshot via Groq)",
|
| 103 |
"whisper-large-v3": "OpenAI Whisper Large v3 - best speech-to-text (OpenAI via Groq)",
|
| 104 |
"whisper-large-v3-turbo": "OpenAI Whisper Large v3 Turbo - faster speech-to-text (OpenAI via Groq)",
|
|
|
|
|
|
|
| 105 |
}
|
| 106 |
return model_info.get(model_name, "Model information not available")
|
| 107 |
|
|
|
|
| 65 |
# Whisper models (Speech-to-Text)
|
| 66 |
"whisper-large-v3": "litellm/groq/whisper-large-v3", # Whisper Large v3
|
| 67 |
"whisper-large-v3-turbo": "litellm/groq/whisper-large-v3-turbo", # Whisper Large v3 Turbo
|
| 68 |
+
|
| 69 |
+
# PlayAI TTS models (Text-to-Speech)
|
| 70 |
+
"playai-tts": "litellm/groq/playai-tts", # English TTS model
|
| 71 |
+
"playai-tts-arabic": "litellm/groq/playai-tts-arabic", # Arabic TTS model
|
| 72 |
}
|
| 73 |
|
| 74 |
@classmethod
|
|
|
|
| 105 |
"kimi-k2-instruct": "Moonshot Kimi K2 MoE - 1T parameters with tool use (Moonshot via Groq)",
|
| 106 |
"whisper-large-v3": "OpenAI Whisper Large v3 - best speech-to-text (OpenAI via Groq)",
|
| 107 |
"whisper-large-v3-turbo": "OpenAI Whisper Large v3 Turbo - faster speech-to-text (OpenAI via Groq)",
|
| 108 |
+
"playai-tts": "PlayAI English TTS - high-quality text-to-speech with 19 voices (PlayAI via Groq)",
|
| 109 |
+
"playai-tts-arabic": "PlayAI Arabic TTS - high-quality Arabic text-to-speech with 4 voices (PlayAI via Groq)",
|
| 110 |
}
|
| 111 |
return model_info.get(model_name, "Model information not available")
|
| 112 |
|