Spaces:
Running
Running
github-actions[bot]
commited on
Commit
Β·
69427f5
1
Parent(s):
675c4cb
π€ Auto-deploy from GitHub (push) - fe69584 - 2025-07-28 05:16:36 UTC
Browse files- apps/gradio-app/README.md +13 -5
- apps/gradio-app/src/fitness_gradio/ui/app.py +16 -8
- apps/gradio-app/src/fitness_gradio/ui/components.py +22 -4
- apps/gradio-app/src/fitness_gradio/ui/handlers.py +155 -41
- apps/gradio-app/src/fitness_gradio/ui/styles.py +33 -1
- apps/gradio-app/src/fitness_gradio/ui/tts_utils.py +277 -0
- apps/gradio-app/test_audio_debug.py +83 -0
- apps/gradio-app/test_tts.py +46 -0
- shared/src/fitness_core/agents/providers.py +6 -1
apps/gradio-app/README.md
CHANGED
|
@@ -1,11 +1,12 @@
|
|
| 1 |
# Fitness Gradio App
|
| 2 |
|
| 3 |
-
Web interface for the Fitness AI Assistant using Gradio with voice input support.
|
| 4 |
|
| 5 |
## Features
|
| 6 |
|
| 7 |
- Interactive chat interface with multimodal input
|
| 8 |
- **Voice input** via microphone button (powered by Groq Whisper)
|
|
|
|
| 9 |
- Multi-provider model support (Groq, Anthropic, OpenAI)
|
| 10 |
- Real-time streaming responses
|
| 11 |
- Fitness plan generation
|
|
@@ -14,7 +15,7 @@ Web interface for the Fitness AI Assistant using Gradio with voice input support
|
|
| 14 |
## Quick Start
|
| 15 |
|
| 16 |
```bash
|
| 17 |
-
# Set your API key for voice functionality
|
| 18 |
$env:GROQ_API_KEY = "your-groq-api-key"
|
| 19 |
|
| 20 |
# Install and run
|
|
@@ -22,12 +23,19 @@ poetry install
|
|
| 22 |
poetry run python -m fitness_gradio.main
|
| 23 |
```
|
| 24 |
|
| 25 |
-
## Voice Setup
|
| 26 |
|
| 27 |
1. Get a [Groq API key](https://console.groq.com/keys)
|
| 28 |
2. Set `GROQ_API_KEY` environment variable
|
| 29 |
-
3. Click the microphone button in the chat interface
|
| 30 |
-
4.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
See [VOICE_SETUP.md](VOICE_SETUP.md) for detailed setup instructions.
|
| 33 |
|
|
|
|
| 1 |
# Fitness Gradio App
|
| 2 |
|
| 3 |
+
Web interface for the Fitness AI Assistant using Gradio with voice input and text-to-speech support.
|
| 4 |
|
| 5 |
## Features
|
| 6 |
|
| 7 |
- Interactive chat interface with multimodal input
|
| 8 |
- **Voice input** via microphone button (powered by Groq Whisper)
|
| 9 |
+
- **Text-to-Speech output** with 19 English and 4 Arabic voices (powered by Groq PlayAI TTS)
|
| 10 |
- Multi-provider model support (Groq, Anthropic, OpenAI)
|
| 11 |
- Real-time streaming responses
|
| 12 |
- Fitness plan generation
|
|
|
|
| 15 |
## Quick Start
|
| 16 |
|
| 17 |
```bash
|
| 18 |
+
# Set your API key for voice functionality and TTS
|
| 19 |
$env:GROQ_API_KEY = "your-groq-api-key"
|
| 20 |
|
| 21 |
# Install and run
|
|
|
|
| 23 |
poetry run python -m fitness_gradio.main
|
| 24 |
```
|
| 25 |
|
| 26 |
+
## Voice & TTS Setup
|
| 27 |
|
| 28 |
1. Get a [Groq API key](https://console.groq.com/keys)
|
| 29 |
2. Set `GROQ_API_KEY` environment variable
|
| 30 |
+
3. **Voice Input**: Click the microphone button in the chat interface
|
| 31 |
+
4. **Text-to-Speech**: Enable the "π Enable Text-to-Speech" checkbox
|
| 32 |
+
5. Allow browser microphone access when prompted (for voice input)
|
| 33 |
+
|
| 34 |
+
### Available TTS Voices
|
| 35 |
+
|
| 36 |
+
**English (playai-tts)**: 19 voices including Celeste-PlayAI (default), Fritz-PlayAI, Arista-PlayAI, Atlas-PlayAI, Basil-PlayAI, Briggs-PlayAI, Calum-PlayAI, Cheyenne-PlayAI, and more.
|
| 37 |
+
|
| 38 |
+
**Arabic (playai-tts-arabic)**: 4 voices including Amira-PlayAI (default), Ahmad-PlayAI, Khalid-PlayAI, Nasser-PlayAI.
|
| 39 |
|
| 40 |
See [VOICE_SETUP.md](VOICE_SETUP.md) for detailed setup instructions.
|
| 41 |
|
apps/gradio-app/src/fitness_gradio/ui/app.py
CHANGED
|
@@ -36,11 +36,16 @@ class FitnessAppUI:
|
|
| 36 |
(model_dropdown, selected_model) = UIComponents.create_model_selection_section()
|
| 37 |
|
| 38 |
# Main chat interface
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
chat_input = UIComponents.create_chat_input()
|
| 41 |
|
| 42 |
# Control buttons
|
| 43 |
-
clear_btn, streaming_toggle = UIComponents.create_control_buttons()
|
| 44 |
|
| 45 |
# Examples section
|
| 46 |
UIComponents.create_examples_section(chat_input)
|
|
@@ -51,8 +56,8 @@ class FitnessAppUI:
|
|
| 51 |
|
| 52 |
# Event handlers
|
| 53 |
self._setup_event_handlers(
|
| 54 |
-
chatbot, chat_input, clear_btn, streaming_toggle,
|
| 55 |
-
model_dropdown, selected_model
|
| 56 |
)
|
| 57 |
|
| 58 |
def _setup_event_handlers(
|
|
@@ -61,8 +66,10 @@ class FitnessAppUI:
|
|
| 61 |
chat_input: gr.MultimodalTextbox,
|
| 62 |
clear_btn: gr.Button,
|
| 63 |
streaming_toggle: gr.Checkbox,
|
|
|
|
| 64 |
model_dropdown: gr.Dropdown,
|
| 65 |
-
selected_model: gr.Textbox
|
|
|
|
| 66 |
) -> None:
|
| 67 |
"""Set up all event handlers."""
|
| 68 |
|
|
@@ -70,12 +77,13 @@ class FitnessAppUI:
|
|
| 70 |
chat_msg = chat_input.submit(
|
| 71 |
UIHandlers.add_message_with_audio,
|
| 72 |
[chatbot, chat_input],
|
| 73 |
-
[chatbot, chat_input]
|
|
|
|
| 74 |
)
|
| 75 |
bot_msg = chat_msg.then(
|
| 76 |
UIHandlers.dynamic_bot,
|
| 77 |
-
[chatbot, streaming_toggle, selected_model],
|
| 78 |
-
chatbot,
|
| 79 |
api_name="bot_response"
|
| 80 |
)
|
| 81 |
bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])
|
|
|
|
| 36 |
(model_dropdown, selected_model) = UIComponents.create_model_selection_section()
|
| 37 |
|
| 38 |
# Main chat interface
|
| 39 |
+
with gr.Row():
|
| 40 |
+
with gr.Column():
|
| 41 |
+
chatbot = UIComponents.create_chatbot()
|
| 42 |
+
with gr.Column(scale=0.3):
|
| 43 |
+
output_audio = UIComponents.create_output_audio()
|
| 44 |
+
|
| 45 |
chat_input = UIComponents.create_chat_input()
|
| 46 |
|
| 47 |
# Control buttons
|
| 48 |
+
clear_btn, streaming_toggle, tts_toggle = UIComponents.create_control_buttons()
|
| 49 |
|
| 50 |
# Examples section
|
| 51 |
UIComponents.create_examples_section(chat_input)
|
|
|
|
| 56 |
|
| 57 |
# Event handlers
|
| 58 |
self._setup_event_handlers(
|
| 59 |
+
chatbot, chat_input, clear_btn, streaming_toggle, tts_toggle,
|
| 60 |
+
model_dropdown, selected_model, output_audio
|
| 61 |
)
|
| 62 |
|
| 63 |
def _setup_event_handlers(
|
|
|
|
| 66 |
chat_input: gr.MultimodalTextbox,
|
| 67 |
clear_btn: gr.Button,
|
| 68 |
streaming_toggle: gr.Checkbox,
|
| 69 |
+
tts_toggle: gr.Checkbox,
|
| 70 |
model_dropdown: gr.Dropdown,
|
| 71 |
+
selected_model: gr.Textbox,
|
| 72 |
+
output_audio: gr.Audio
|
| 73 |
) -> None:
|
| 74 |
"""Set up all event handlers."""
|
| 75 |
|
|
|
|
| 77 |
chat_msg = chat_input.submit(
|
| 78 |
UIHandlers.add_message_with_audio,
|
| 79 |
[chatbot, chat_input],
|
| 80 |
+
[chatbot, chat_input],
|
| 81 |
+
queue=False
|
| 82 |
)
|
| 83 |
bot_msg = chat_msg.then(
|
| 84 |
UIHandlers.dynamic_bot,
|
| 85 |
+
[chatbot, streaming_toggle, tts_toggle, selected_model],
|
| 86 |
+
[chatbot, output_audio],
|
| 87 |
api_name="bot_response"
|
| 88 |
)
|
| 89 |
bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])
|
apps/gradio-app/src/fitness_gradio/ui/components.py
CHANGED
|
@@ -158,6 +158,18 @@ class UIComponents:
|
|
| 158 |
render_markdown=True
|
| 159 |
)
|
| 160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
@staticmethod
|
| 162 |
def create_chat_input() -> gr.MultimodalTextbox:
|
| 163 |
"""Create the chat input component."""
|
|
@@ -166,16 +178,17 @@ class UIComponents:
|
|
| 166 |
file_count="multiple",
|
| 167 |
placeholder="Ask me about fitness, request a workout plan, or get meal planning advice...",
|
| 168 |
show_label=False,
|
| 169 |
-
sources=["microphone", "upload"], #
|
|
|
|
| 170 |
)
|
| 171 |
|
| 172 |
@staticmethod
|
| 173 |
def create_control_buttons() -> tuple:
|
| 174 |
"""
|
| 175 |
-
Create the control buttons (clear, streaming toggle).
|
| 176 |
|
| 177 |
Returns:
|
| 178 |
-
Tuple of (clear_btn, streaming_toggle)
|
| 179 |
"""
|
| 180 |
with gr.Row():
|
| 181 |
clear_btn = gr.Button("ποΈ Clear Conversation", variant="secondary", size="sm")
|
|
@@ -184,8 +197,13 @@ class UIComponents:
|
|
| 184 |
value=True,
|
| 185 |
info="Stream responses in real-time as the agent generates them"
|
| 186 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
|
| 188 |
-
return clear_btn, streaming_toggle
|
| 189 |
|
| 190 |
@staticmethod
|
| 191 |
def create_examples_section(chat_input: gr.MultimodalTextbox) -> gr.Examples:
|
|
|
|
| 158 |
render_markdown=True
|
| 159 |
)
|
| 160 |
|
| 161 |
+
@staticmethod
|
| 162 |
+
def create_output_audio() -> gr.Audio:
|
| 163 |
+
"""Create the output audio component for TTS responses."""
|
| 164 |
+
return gr.Audio(
|
| 165 |
+
label="π Audio Response",
|
| 166 |
+
streaming=False, # Disable streaming to avoid ffmpeg issues
|
| 167 |
+
autoplay=True,
|
| 168 |
+
show_download_button=True,
|
| 169 |
+
show_share_button=False,
|
| 170 |
+
format="wav" # Explicitly set format to WAV
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
@staticmethod
|
| 174 |
def create_chat_input() -> gr.MultimodalTextbox:
|
| 175 |
"""Create the chat input component."""
|
|
|
|
| 178 |
file_count="multiple",
|
| 179 |
placeholder="Ask me about fitness, request a workout plan, or get meal planning advice...",
|
| 180 |
show_label=False,
|
| 181 |
+
sources=["microphone", "upload"], # Enable microphone and file uploads
|
| 182 |
+
submit_btn=True, # Ensure submit button is available
|
| 183 |
)
|
| 184 |
|
| 185 |
@staticmethod
|
| 186 |
def create_control_buttons() -> tuple:
|
| 187 |
"""
|
| 188 |
+
Create the control buttons (clear, streaming toggle, TTS toggle).
|
| 189 |
|
| 190 |
Returns:
|
| 191 |
+
Tuple of (clear_btn, streaming_toggle, tts_toggle)
|
| 192 |
"""
|
| 193 |
with gr.Row():
|
| 194 |
clear_btn = gr.Button("ποΈ Clear Conversation", variant="secondary", size="sm")
|
|
|
|
| 197 |
value=True,
|
| 198 |
info="Stream responses in real-time as the agent generates them"
|
| 199 |
)
|
| 200 |
+
tts_toggle = gr.Checkbox(
|
| 201 |
+
label="π Enable Text-to-Speech",
|
| 202 |
+
value=False,
|
| 203 |
+
info="Convert AI responses to speech using Groq's TTS models"
|
| 204 |
+
)
|
| 205 |
|
| 206 |
+
return clear_btn, streaming_toggle, tts_toggle
|
| 207 |
|
| 208 |
@staticmethod
|
| 209 |
def create_examples_section(chat_input: gr.MultimodalTextbox) -> gr.Examples:
|
apps/gradio-app/src/fitness_gradio/ui/handlers.py
CHANGED
|
@@ -9,6 +9,7 @@ from typing import List, Dict, Union, Generator, Any, Tuple, Optional
|
|
| 9 |
from fitness_core.agents import FitnessAgent
|
| 10 |
from fitness_core.services import ConversationManager, AgentRunner, ResponseFormatter
|
| 11 |
from fitness_core.utils import get_logger
|
|
|
|
| 12 |
|
| 13 |
logger = get_logger(__name__)
|
| 14 |
|
|
@@ -137,60 +138,100 @@ Please check your API keys and try a different model."""
|
|
| 137 |
Tuple of (updated_history, cleared_input)
|
| 138 |
"""
|
| 139 |
try:
|
|
|
|
| 140 |
user_content_parts = []
|
|
|
|
|
|
|
| 141 |
|
| 142 |
# Handle file uploads (including audio from microphone)
|
| 143 |
if message.get("files"):
|
|
|
|
| 144 |
for file_path in message["files"]:
|
| 145 |
if file_path: # Validate file path exists
|
|
|
|
| 146 |
# Check if this is an audio file (from microphone recording)
|
| 147 |
if UIHandlers.is_audio_file(file_path):
|
| 148 |
-
logger.info(f"
|
| 149 |
# Process audio file for transcription
|
| 150 |
transcribed_text = UIHandlers.process_audio_file(file_path)
|
| 151 |
|
| 152 |
if transcribed_text and not transcribed_text.startswith("["):
|
| 153 |
-
|
| 154 |
-
display_text = f"π€ {transcribed_text}"
|
| 155 |
user_content_parts.append(transcribed_text) # Add clean text to conversation
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
history.append({
|
| 159 |
-
"role": "user",
|
| 160 |
-
"content": display_text
|
| 161 |
-
})
|
| 162 |
else:
|
| 163 |
-
#
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
})
|
| 168 |
else:
|
| 169 |
# Handle non-audio file uploads
|
| 170 |
file_content = f"[File uploaded: {file_path}]"
|
| 171 |
user_content_parts.append(file_content)
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
"content": {"path": file_path}
|
| 176 |
-
})
|
| 177 |
|
| 178 |
# Handle text input
|
|
|
|
| 179 |
if message.get("text") and message["text"].strip():
|
| 180 |
text_content = message["text"].strip()
|
| 181 |
user_content_parts.append(text_content)
|
| 182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
history.append({
|
| 184 |
-
"role": "user",
|
| 185 |
"content": text_content
|
| 186 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
|
| 188 |
# Add to conversation manager (combine all content)
|
| 189 |
if user_content_parts:
|
| 190 |
combined_content = "\n".join(user_content_parts)
|
| 191 |
conversation_manager.add_user_message(combined_content)
|
| 192 |
-
logger.info(f"Added user message to conversation. {conversation_manager.get_history_summary()}")
|
|
|
|
|
|
|
| 193 |
|
|
|
|
| 194 |
return history, gr.MultimodalTextbox(value=None, interactive=False)
|
| 195 |
|
| 196 |
except Exception as e:
|
|
@@ -264,17 +305,19 @@ Please check your API keys and try a different model."""
|
|
| 264 |
@staticmethod
|
| 265 |
def bot_with_real_streaming(
|
| 266 |
history: List[Dict],
|
| 267 |
-
model_name: str = None
|
| 268 |
-
|
|
|
|
| 269 |
"""
|
| 270 |
Bot function with real-time streaming from the agent
|
| 271 |
|
| 272 |
Args:
|
| 273 |
history: Current Gradio chat history (for display only)
|
| 274 |
model_name: Model to use for the agent
|
|
|
|
| 275 |
|
| 276 |
Yields:
|
| 277 |
-
Updated history
|
| 278 |
"""
|
| 279 |
try:
|
| 280 |
# Get agent instance with specified model
|
|
@@ -294,15 +337,18 @@ Please check your API keys and try a different model."""
|
|
| 294 |
try:
|
| 295 |
content_chunks = []
|
| 296 |
final_result = None
|
|
|
|
| 297 |
|
| 298 |
for chunk in AgentRunner.run_agent_with_streaming_sync(agent, agent_input):
|
| 299 |
if chunk['type'] == 'final_result':
|
| 300 |
final_result = chunk['result']
|
| 301 |
if chunk['content']:
|
| 302 |
content_chunks.append(chunk['content'])
|
|
|
|
| 303 |
elif chunk['type'] == 'error':
|
| 304 |
final_result = chunk['result']
|
| 305 |
content_chunks.append(chunk['content'])
|
|
|
|
| 306 |
|
| 307 |
# Update conversation manager
|
| 308 |
if final_result:
|
|
@@ -313,34 +359,50 @@ Please check your API keys and try a different model."""
|
|
| 313 |
if content_chunks:
|
| 314 |
for content in content_chunks:
|
| 315 |
history[-1]["content"] = content
|
| 316 |
-
yield history
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
else:
|
| 318 |
-
|
| 319 |
-
|
|
|
|
| 320 |
|
| 321 |
except Exception as e:
|
| 322 |
logger.error(f"Error in streaming execution: {str(e)}")
|
| 323 |
-
|
| 324 |
-
|
|
|
|
| 325 |
|
| 326 |
except Exception as e:
|
| 327 |
logger.error(f"Bot streaming function error: {str(e)}")
|
| 328 |
if len(history) == 0 or history[-1].get("role") != "assistant":
|
| 329 |
history.append({"role": "assistant", "content": ""})
|
| 330 |
history[-1]["content"] = "I apologize, but I'm experiencing technical difficulties. Please try again in a moment."
|
| 331 |
-
yield history
|
| 332 |
|
| 333 |
@staticmethod
|
| 334 |
-
def bot(history: List[Dict], model_name: str = None) -> Generator[List[Dict], None, None]:
|
| 335 |
"""
|
| 336 |
Main bot function with simulated streaming
|
| 337 |
|
| 338 |
Args:
|
| 339 |
history: Current Gradio chat history (for display only)
|
| 340 |
model_name: Model to use for the agent
|
|
|
|
| 341 |
|
| 342 |
Yields:
|
| 343 |
-
Updated history
|
| 344 |
"""
|
| 345 |
try:
|
| 346 |
# Get agent instance with specified model
|
|
@@ -361,36 +423,50 @@ Please check your API keys and try a different model."""
|
|
| 361 |
response = ResponseFormatter.extract_response_content(result)
|
| 362 |
|
| 363 |
# Stream the response with simulated typing
|
| 364 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
|
| 366 |
except Exception as e:
|
| 367 |
logger.error(f"Bot function error: {str(e)}")
|
| 368 |
error_response = "I apologize, but I'm experiencing technical difficulties. Please try again in a moment."
|
| 369 |
-
|
|
|
|
| 370 |
|
| 371 |
@staticmethod
|
| 372 |
def dynamic_bot(
|
| 373 |
history: List[Dict],
|
| 374 |
-
use_real_streaming: bool = True,
|
|
|
|
| 375 |
model_name: str = None
|
| 376 |
-
) -> Generator[List[Dict], None, None]:
|
| 377 |
"""
|
| 378 |
-
Dynamic bot function that can switch between streaming modes
|
| 379 |
|
| 380 |
Args:
|
| 381 |
history: Current Gradio chat history (for display only)
|
| 382 |
use_real_streaming: Whether to use real-time streaming from agent
|
|
|
|
| 383 |
model_name: Model to use for the agent
|
| 384 |
|
| 385 |
Yields:
|
| 386 |
-
Updated history
|
| 387 |
"""
|
| 388 |
if use_real_streaming:
|
| 389 |
logger.info("Using real-time streaming mode")
|
| 390 |
-
yield from UIHandlers.bot_with_real_streaming(history, model_name)
|
| 391 |
else:
|
| 392 |
logger.info("Using simulated streaming mode")
|
| 393 |
-
yield from UIHandlers.bot(history, model_name)
|
| 394 |
|
| 395 |
@staticmethod
|
| 396 |
def clear_conversation() -> List[Dict]:
|
|
@@ -404,3 +480,41 @@ Please check your API keys and try a different model."""
|
|
| 404 |
conversation_manager.clear_history()
|
| 405 |
logger.info("Conversation history cleared")
|
| 406 |
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
from fitness_core.agents import FitnessAgent
|
| 10 |
from fitness_core.services import ConversationManager, AgentRunner, ResponseFormatter
|
| 11 |
from fitness_core.utils import get_logger
|
| 12 |
+
from .tts_utils import generate_speech_for_text, generate_speech_for_session, clean_tts_markup
|
| 13 |
|
| 14 |
logger = get_logger(__name__)
|
| 15 |
|
|
|
|
| 138 |
Tuple of (updated_history, cleared_input)
|
| 139 |
"""
|
| 140 |
try:
|
| 141 |
+
logger.info(f"Processing message: {message}")
|
| 142 |
user_content_parts = []
|
| 143 |
+
has_audio_content = False
|
| 144 |
+
audio_transcription = None
|
| 145 |
|
| 146 |
# Handle file uploads (including audio from microphone)
|
| 147 |
if message.get("files"):
|
| 148 |
+
logger.info(f"Found {len(message['files'])} files in message")
|
| 149 |
for file_path in message["files"]:
|
| 150 |
if file_path: # Validate file path exists
|
| 151 |
+
logger.info(f"Processing file: {file_path}")
|
| 152 |
# Check if this is an audio file (from microphone recording)
|
| 153 |
if UIHandlers.is_audio_file(file_path):
|
| 154 |
+
logger.info(f"Detected audio file: {file_path}")
|
| 155 |
# Process audio file for transcription
|
| 156 |
transcribed_text = UIHandlers.process_audio_file(file_path)
|
| 157 |
|
| 158 |
if transcribed_text and not transcribed_text.startswith("["):
|
| 159 |
+
audio_transcription = transcribed_text
|
|
|
|
| 160 |
user_content_parts.append(transcribed_text) # Add clean text to conversation
|
| 161 |
+
has_audio_content = True
|
| 162 |
+
logger.info(f"Successfully transcribed audio: '{transcribed_text[:50]}...'")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
else:
|
| 164 |
+
# Handle transcription error
|
| 165 |
+
audio_transcription = transcribed_text
|
| 166 |
+
has_audio_content = True # Still mark as audio content even if failed
|
| 167 |
+
logger.warning(f"Audio transcription failed: {transcribed_text}")
|
|
|
|
| 168 |
else:
|
| 169 |
# Handle non-audio file uploads
|
| 170 |
file_content = f"[File uploaded: {file_path}]"
|
| 171 |
user_content_parts.append(file_content)
|
| 172 |
+
logger.info(f"Added file upload to content: {file_path}")
|
| 173 |
+
else:
|
| 174 |
+
logger.info("No files found in message")
|
|
|
|
|
|
|
| 175 |
|
| 176 |
# Handle text input
|
| 177 |
+
text_content = None
|
| 178 |
if message.get("text") and message["text"].strip():
|
| 179 |
text_content = message["text"].strip()
|
| 180 |
user_content_parts.append(text_content)
|
| 181 |
+
logger.info(f"Found text content: '{text_content[:50]}...'")
|
| 182 |
+
else:
|
| 183 |
+
logger.info("No text content found in message")
|
| 184 |
+
|
| 185 |
+
# Add appropriate message to chat history
|
| 186 |
+
if has_audio_content and audio_transcription:
|
| 187 |
+
if audio_transcription.startswith("["):
|
| 188 |
+
# Transcription error - show error message
|
| 189 |
+
display_text = f"π€ {audio_transcription}"
|
| 190 |
+
else:
|
| 191 |
+
# Successful transcription - show with microphone icon
|
| 192 |
+
display_text = f"π€ {audio_transcription}"
|
| 193 |
+
|
| 194 |
+
history.append({
|
| 195 |
+
"role": "user",
|
| 196 |
+
"content": display_text
|
| 197 |
+
})
|
| 198 |
+
logger.info(f"Added audio message to chat history: '{display_text}'")
|
| 199 |
+
|
| 200 |
+
# If there's also text content, add it separately
|
| 201 |
+
if text_content:
|
| 202 |
+
history.append({
|
| 203 |
+
"role": "user",
|
| 204 |
+
"content": text_content
|
| 205 |
+
})
|
| 206 |
+
logger.info(f"Added additional text content to history: '{text_content[:50]}...'")
|
| 207 |
+
|
| 208 |
+
elif text_content:
|
| 209 |
+
# Only text content, no audio
|
| 210 |
history.append({
|
| 211 |
+
"role": "user",
|
| 212 |
"content": text_content
|
| 213 |
})
|
| 214 |
+
logger.info(f"Added text-only message to chat history: '{text_content[:50]}...'")
|
| 215 |
+
|
| 216 |
+
elif message.get("files") and not has_audio_content:
|
| 217 |
+
# File uploads that aren't audio
|
| 218 |
+
for file_path in message["files"]:
|
| 219 |
+
if file_path and not UIHandlers.is_audio_file(file_path):
|
| 220 |
+
history.append({
|
| 221 |
+
"role": "user",
|
| 222 |
+
"content": {"path": file_path}
|
| 223 |
+
})
|
| 224 |
+
logger.info(f"Added file upload to history: {file_path}")
|
| 225 |
|
| 226 |
# Add to conversation manager (combine all content)
|
| 227 |
if user_content_parts:
|
| 228 |
combined_content = "\n".join(user_content_parts)
|
| 229 |
conversation_manager.add_user_message(combined_content)
|
| 230 |
+
logger.info(f"Added user message to conversation manager. Content parts: {len(user_content_parts)}, Combined: '{combined_content[:100]}...', {conversation_manager.get_history_summary()}")
|
| 231 |
+
else:
|
| 232 |
+
logger.warning("No user content parts found in message - this may indicate an issue")
|
| 233 |
|
| 234 |
+
logger.info(f"Final history length: {len(history)}")
|
| 235 |
return history, gr.MultimodalTextbox(value=None, interactive=False)
|
| 236 |
|
| 237 |
except Exception as e:
|
|
|
|
| 305 |
@staticmethod
|
| 306 |
def bot_with_real_streaming(
|
| 307 |
history: List[Dict],
|
| 308 |
+
model_name: str = None,
|
| 309 |
+
use_tts: bool = False
|
| 310 |
+
) -> Generator[Tuple[List[Dict], Optional[str]], None, None]:
|
| 311 |
"""
|
| 312 |
Bot function with real-time streaming from the agent
|
| 313 |
|
| 314 |
Args:
|
| 315 |
history: Current Gradio chat history (for display only)
|
| 316 |
model_name: Model to use for the agent
|
| 317 |
+
use_tts: Whether to generate text-to-speech for the response
|
| 318 |
|
| 319 |
Yields:
|
| 320 |
+
Tuple of (Updated history, audio_file_path or None)
|
| 321 |
"""
|
| 322 |
try:
|
| 323 |
# Get agent instance with specified model
|
|
|
|
| 337 |
try:
|
| 338 |
content_chunks = []
|
| 339 |
final_result = None
|
| 340 |
+
final_content = ""
|
| 341 |
|
| 342 |
for chunk in AgentRunner.run_agent_with_streaming_sync(agent, agent_input):
|
| 343 |
if chunk['type'] == 'final_result':
|
| 344 |
final_result = chunk['result']
|
| 345 |
if chunk['content']:
|
| 346 |
content_chunks.append(chunk['content'])
|
| 347 |
+
final_content = chunk['content']
|
| 348 |
elif chunk['type'] == 'error':
|
| 349 |
final_result = chunk['result']
|
| 350 |
content_chunks.append(chunk['content'])
|
| 351 |
+
final_content = chunk['content']
|
| 352 |
|
| 353 |
# Update conversation manager
|
| 354 |
if final_result:
|
|
|
|
| 359 |
if content_chunks:
|
| 360 |
for content in content_chunks:
|
| 361 |
history[-1]["content"] = content
|
| 362 |
+
yield history, None # No audio during streaming
|
| 363 |
+
final_content = content
|
| 364 |
+
|
| 365 |
+
# Generate TTS for the final response if enabled
|
| 366 |
+
if use_tts and final_content:
|
| 367 |
+
audio_file = UIHandlers._generate_tts_for_response_sync(final_content)
|
| 368 |
+
if audio_file:
|
| 369 |
+
# Return the final history with the audio file
|
| 370 |
+
yield history, audio_file
|
| 371 |
+
else:
|
| 372 |
+
yield history, None
|
| 373 |
+
else:
|
| 374 |
+
yield history, None
|
| 375 |
+
|
| 376 |
else:
|
| 377 |
+
error_msg = "I apologize, but I didn't receive a response. Please try again."
|
| 378 |
+
history[-1]["content"] = error_msg
|
| 379 |
+
yield history, None
|
| 380 |
|
| 381 |
except Exception as e:
|
| 382 |
logger.error(f"Error in streaming execution: {str(e)}")
|
| 383 |
+
error_msg = f"Sorry, I encountered an error while processing your request: {str(e)}"
|
| 384 |
+
history[-1]["content"] = error_msg
|
| 385 |
+
yield history, None
|
| 386 |
|
| 387 |
except Exception as e:
|
| 388 |
logger.error(f"Bot streaming function error: {str(e)}")
|
| 389 |
if len(history) == 0 or history[-1].get("role") != "assistant":
|
| 390 |
history.append({"role": "assistant", "content": ""})
|
| 391 |
history[-1]["content"] = "I apologize, but I'm experiencing technical difficulties. Please try again in a moment."
|
| 392 |
+
yield history, None
|
| 393 |
|
| 394 |
@staticmethod
|
| 395 |
+
def bot(history: List[Dict], model_name: str = None, use_tts: bool = False) -> Generator[Tuple[List[Dict], Optional[str]], None, None]:
|
| 396 |
"""
|
| 397 |
Main bot function with simulated streaming
|
| 398 |
|
| 399 |
Args:
|
| 400 |
history: Current Gradio chat history (for display only)
|
| 401 |
model_name: Model to use for the agent
|
| 402 |
+
use_tts: Whether to generate text-to-speech for the response
|
| 403 |
|
| 404 |
Yields:
|
| 405 |
+
Tuple of (Updated history, audio_file_path or None)
|
| 406 |
"""
|
| 407 |
try:
|
| 408 |
# Get agent instance with specified model
|
|
|
|
| 423 |
response = ResponseFormatter.extract_response_content(result)
|
| 424 |
|
| 425 |
# Stream the response with simulated typing
|
| 426 |
+
for updated_history in ResponseFormatter.stream_response(response, history):
|
| 427 |
+
yield updated_history, None # No audio during streaming
|
| 428 |
+
|
| 429 |
+
# Generate TTS for the final response if enabled
|
| 430 |
+
if use_tts and response:
|
| 431 |
+
audio_file = UIHandlers._generate_tts_for_response_sync(response)
|
| 432 |
+
if audio_file:
|
| 433 |
+
yield history, audio_file
|
| 434 |
+
else:
|
| 435 |
+
yield history, None
|
| 436 |
+
else:
|
| 437 |
+
yield history, None
|
| 438 |
|
| 439 |
except Exception as e:
|
| 440 |
logger.error(f"Bot function error: {str(e)}")
|
| 441 |
error_response = "I apologize, but I'm experiencing technical difficulties. Please try again in a moment."
|
| 442 |
+
for updated_history in ResponseFormatter.stream_response(error_response, history):
|
| 443 |
+
yield updated_history, None
|
| 444 |
|
| 445 |
@staticmethod
|
| 446 |
def dynamic_bot(
|
| 447 |
history: List[Dict],
|
| 448 |
+
use_real_streaming: bool = True,
|
| 449 |
+
use_tts: bool = False,
|
| 450 |
model_name: str = None
|
| 451 |
+
) -> Generator[Tuple[List[Dict], Optional[str]], None, None]:
|
| 452 |
"""
|
| 453 |
+
Dynamic bot function that can switch between streaming modes and TTS
|
| 454 |
|
| 455 |
Args:
|
| 456 |
history: Current Gradio chat history (for display only)
|
| 457 |
use_real_streaming: Whether to use real-time streaming from agent
|
| 458 |
+
use_tts: Whether to generate text-to-speech for the response
|
| 459 |
model_name: Model to use for the agent
|
| 460 |
|
| 461 |
Yields:
|
| 462 |
+
Tuple of (Updated history, audio_file_path or None)
|
| 463 |
"""
|
| 464 |
if use_real_streaming:
|
| 465 |
logger.info("Using real-time streaming mode")
|
| 466 |
+
yield from UIHandlers.bot_with_real_streaming(history, model_name, use_tts)
|
| 467 |
else:
|
| 468 |
logger.info("Using simulated streaming mode")
|
| 469 |
+
yield from UIHandlers.bot(history, model_name, use_tts)
|
| 470 |
|
| 471 |
@staticmethod
|
| 472 |
def clear_conversation() -> List[Dict]:
|
|
|
|
| 480 |
conversation_manager.clear_history()
|
| 481 |
logger.info("Conversation history cleared")
|
| 482 |
return []
|
| 483 |
+
|
| 484 |
+
@staticmethod
|
| 485 |
+
def _generate_tts_for_response_sync(text: str) -> Optional[str]:
|
| 486 |
+
"""
|
| 487 |
+
Generate TTS audio for a response text synchronously.
|
| 488 |
+
|
| 489 |
+
Args:
|
| 490 |
+
text: The text to convert to speech
|
| 491 |
+
|
| 492 |
+
Returns:
|
| 493 |
+
Path to generated audio file or None if error
|
| 494 |
+
"""
|
| 495 |
+
try:
|
| 496 |
+
if not text or not text.strip():
|
| 497 |
+
return None
|
| 498 |
+
|
| 499 |
+
# Clean the text for TTS
|
| 500 |
+
clean_text = clean_tts_markup(text)
|
| 501 |
+
|
| 502 |
+
# Limit text length for TTS (Groq has 10K char limit)
|
| 503 |
+
if len(clean_text) > 8000: # Leave some buffer
|
| 504 |
+
clean_text = clean_text[:8000] + "..."
|
| 505 |
+
logger.info(f"Truncated TTS text to 8000 characters")
|
| 506 |
+
|
| 507 |
+
logger.info(f"Generating TTS for response ({len(clean_text)} chars)")
|
| 508 |
+
|
| 509 |
+
# Generate TTS using session persistence
|
| 510 |
+
audio_file = generate_speech_for_session(clean_text)
|
| 511 |
+
if audio_file:
|
| 512 |
+
logger.info(f"TTS audio generated: {audio_file}")
|
| 513 |
+
return audio_file
|
| 514 |
+
else:
|
| 515 |
+
logger.warning("Failed to generate TTS audio")
|
| 516 |
+
return None
|
| 517 |
+
|
| 518 |
+
except Exception as e:
|
| 519 |
+
logger.error(f"TTS generation error: {str(e)}")
|
| 520 |
+
return None
|
apps/gradio-app/src/fitness_gradio/ui/styles.py
CHANGED
|
@@ -53,6 +53,30 @@ MAIN_CSS = """
|
|
| 53 |
font-weight: bold;
|
| 54 |
}
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
/* Ensure all text in model-info respects dark theme */
|
| 57 |
.model-info * {
|
| 58 |
color: inherit !important;
|
|
@@ -88,7 +112,9 @@ HELP_CONTENT = """
|
|
| 88 |
**How to interact:**
|
| 89 |
- **π¬ Type messages** in the text box
|
| 90 |
- **π€ Record voice messages** using the circular microphone button (requires Groq API key)
|
| 91 |
-
-
|
|
|
|
|
|
|
| 92 |
|
| 93 |
**To get the best results:**
|
| 94 |
- Tell me your fitness level (beginner, intermediate, advanced)
|
|
@@ -98,6 +124,12 @@ HELP_CONTENT = """
|
|
| 98 |
|
| 99 |
**Voice Input Setup:**
|
| 100 |
- Set your `GROQ_API_KEY` environment variable to enable voice transcription
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
- Click the circular microphone icon in the input box and speak your message
|
| 102 |
- The system will convert your speech to text automatically using Groq's Whisper
|
| 103 |
|
|
|
|
| 53 |
font-weight: bold;
|
| 54 |
}
|
| 55 |
|
| 56 |
+
/* TTS control styling */
|
| 57 |
+
.tts-checkbox {
|
| 58 |
+
background: linear-gradient(135deg, rgba(99, 102, 241, 0.1), rgba(139, 92, 246, 0.1)) !important;
|
| 59 |
+
border: 1px solid rgba(99, 102, 241, 0.3) !important;
|
| 60 |
+
border-radius: 8px !important;
|
| 61 |
+
padding: 8px !important;
|
| 62 |
+
transition: all 0.3s ease !important;
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
.tts-checkbox:hover {
|
| 66 |
+
background: linear-gradient(135deg, rgba(99, 102, 241, 0.2), rgba(139, 92, 246, 0.2)) !important;
|
| 67 |
+
border-color: rgba(99, 102, 241, 0.5) !important;
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
.tts-active {
|
| 71 |
+
animation: pulse-tts 2s infinite;
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
@keyframes pulse-tts {
|
| 75 |
+
0% { box-shadow: 0 0 0 0 rgba(99, 102, 241, 0.7); }
|
| 76 |
+
70% { box-shadow: 0 0 0 10px rgba(99, 102, 241, 0); }
|
| 77 |
+
100% { box-shadow: 0 0 0 0 rgba(99, 102, 241, 0); }
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
/* Ensure all text in model-info respects dark theme */
|
| 81 |
.model-info * {
|
| 82 |
color: inherit !important;
|
|
|
|
| 112 |
**How to interact:**
|
| 113 |
- **π¬ Type messages** in the text box
|
| 114 |
- **π€ Record voice messages** using the circular microphone button (requires Groq API key)
|
| 115 |
+
- **🔊 Enable Text-to-Speech** to hear AI responses spoken aloud (requires Groq API key)
|
| 116 |
+
- **π Enable Real-time Streaming** for faster response display
|
| 117 |
+
- **📎 Upload files** if needed for context
|
| 118 |
|
| 119 |
**To get the best results:**
|
| 120 |
- Tell me your fitness level (beginner, intermediate, advanced)
|
|
|
|
| 124 |
|
| 125 |
**Voice Input Setup:**
|
| 126 |
- Set your `GROQ_API_KEY` environment variable to enable voice transcription
|
| 127 |
+
|
| 128 |
+
**Text-to-Speech Setup:**
|
| 129 |
+
- Set your `GROQ_API_KEY` environment variable to enable audio generation
|
| 130 |
+
- Choose from 19 English voices or 4 Arabic voices
|
| 131 |
+
- Audio is automatically generated when TTS is enabled
|
| 132 |
+
- Responses are cleaned of markdown formatting for better speech quality
|
| 133 |
- Click the circular microphone icon in the input box and speak your message
|
| 134 |
- The system will convert your speech to text automatically using Groq's Whisper
|
| 135 |
|
apps/gradio-app/src/fitness_gradio/ui/tts_utils.py
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Text-to-Speech utilities using Groq's TTS models.
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
import tempfile
|
| 6 |
+
import logging
|
| 7 |
+
import requests
|
| 8 |
+
from typing import Optional, Union
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
from fitness_core.utils import get_logger
|
| 12 |
+
|
| 13 |
+
logger = get_logger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class GroqTTS:
    """Groq PlayAI text-to-speech service wrapper.

    Wraps Groq's ``/openai/v1/audio/speech`` endpoint to synthesize WAV
    audio from text and manages the temporary files the audio is written to.
    """

    # Available English voices for the "playai-tts" model.
    ENGLISH_VOICES = [
        "Arista-PlayAI", "Atlas-PlayAI", "Basil-PlayAI", "Briggs-PlayAI",
        "Calum-PlayAI", "Celeste-PlayAI", "Cheyenne-PlayAI", "Chip-PlayAI",
        "Cillian-PlayAI", "Deedee-PlayAI", "Fritz-PlayAI", "Gail-PlayAI",
        "Indigo-PlayAI", "Mamaw-PlayAI", "Mason-PlayAI", "Mikail-PlayAI",
        "Mitch-PlayAI", "Quinn-PlayAI", "Thunder-PlayAI"
    ]

    # Available Arabic voices for the "playai-tts-arabic" model.
    ARABIC_VOICES = [
        "Ahmad-PlayAI", "Amira-PlayAI", "Khalid-PlayAI", "Nasser-PlayAI"
    ]

    # Default voice selections (pleasant female voices).
    DEFAULT_ENGLISH_VOICE = "Celeste-PlayAI"
    DEFAULT_ARABIC_VOICE = "Amira-PlayAI"

    # Groq rejects TTS inputs longer than 10K characters.
    MAX_TEXT_LENGTH = 10000

    # Seconds to wait on the HTTP API before giving up; without a timeout a
    # stalled request would hang the caller indefinitely.
    REQUEST_TIMEOUT = 60

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the GroqTTS client.

        Args:
            api_key: Groq API key. If None, will try to get from GROQ_API_KEY env var.

        Raises:
            ValueError: If no API key is provided or found in the environment.
        """
        self.api_key = api_key or os.getenv("GROQ_API_KEY")
        if not self.api_key:
            raise ValueError(
                "Groq API key is required. Set GROQ_API_KEY environment variable "
                "or pass api_key parameter."
            )

        # Base scratch directory for generated audio.
        self.temp_dir = Path(tempfile.gettempdir()) / "fitness_app_tts"
        self.temp_dir.mkdir(parents=True, exist_ok=True)

        # Session-specific subdirectory so chat audio persists while the app runs.
        self.session_dir = self.temp_dir / "session_audio"
        self.session_dir.mkdir(exist_ok=True)

    def text_to_speech(
        self,
        text: str,
        voice: Optional[str] = None,
        model: str = "playai-tts",
        response_format: str = "wav",
        output_file: Optional[Union[str, Path]] = None
    ) -> Optional[str]:
        """
        Convert text to speech using Groq's TTS API.

        Args:
            text: Text to convert to speech (max 10K characters; longer input is truncated).
            voice: Voice to use. If None, uses default voice based on model.
            model: TTS model to use ("playai-tts" or "playai-tts-arabic").
            response_format: Audio format ("wav").
            output_file: Path to save audio file. If None, a unique file is
                created in the session directory.

        Returns:
            Path to the generated audio file, or None on any error
            (errors are logged, never raised).
        """
        try:
            # Enforce the API's input-length limit rather than letting the call fail.
            if len(text) > self.MAX_TEXT_LENGTH:
                logger.warning(f"Text too long ({len(text)} chars), truncating to 10K characters")
                text = text[:self.MAX_TEXT_LENGTH]

            # Pick a sensible default voice for the chosen model.
            if voice is None:
                voice = self.DEFAULT_ARABIC_VOICE if model == "playai-tts-arabic" else self.DEFAULT_ENGLISH_VOICE

            # Fall back to the default when the requested voice doesn't belong to the model.
            if model == "playai-tts" and voice not in self.ENGLISH_VOICES:
                logger.warning(f"Voice {voice} not valid for English model, using default")
                voice = self.DEFAULT_ENGLISH_VOICE
            elif model == "playai-tts-arabic" and voice not in self.ARABIC_VOICES:
                logger.warning(f"Voice {voice} not valid for Arabic model, using default")
                voice = self.DEFAULT_ARABIC_VOICE

            if output_file is None:
                # Deterministic content hash + millisecond timestamp keeps file
                # names unique without relying on Python's per-process salted
                # hash() (which varies between runs under PYTHONHASHSEED).
                import time
                import hashlib
                timestamp = int(time.time() * 1000)
                text_hash = hashlib.md5(text.encode("utf-8")).hexdigest()[:8]
                output_file = self.session_dir / f"tts_output_{text_hash}_{timestamp}.wav"
            else:
                output_file = Path(output_file)

            logger.info(f"Generating TTS for {len(text)} chars using {model} with {voice}")

            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            }
            payload = {
                "model": model,
                "input": text,
                "voice": voice,
                "response_format": response_format,
            }

            # Time-box the request so a stalled API call cannot hang the UI.
            response = requests.post(
                "https://api.groq.com/openai/v1/audio/speech",
                headers=headers,
                json=payload,
                timeout=self.REQUEST_TIMEOUT,
            )

            if response.status_code == 200:
                # Write the raw audio bytes to disk.
                with open(output_file, "wb") as f:
                    f.write(response.content)
                logger.info(f"TTS audio saved to: {output_file}")
                return str(output_file)

            logger.error(f"TTS API error: {response.status_code} - {response.text}")
            return None

        except Exception as e:
            logger.error(f"Error generating TTS: {str(e)}")
            return None

    def cleanup_temp_files(self, max_age_hours: int = 24) -> None:
        """
        Clean up old temporary audio files.

        Scans the scratch directory recursively so session audio files
        (written under the session_audio subdirectory) are removed too.

        Args:
            max_age_hours: Delete files older than this many hours.
        """
        try:
            import time
            cutoff = time.time() - max_age_hours * 3600

            # rglob (not glob) so files under session_audio/ are included;
            # the previous non-recursive glob never matched them, leaking
            # every file text_to_speech wrote to the session directory.
            for file_path in self.temp_dir.rglob("*.wav"):
                if file_path.stat().st_mtime < cutoff:
                    file_path.unlink()
                    logger.debug(f"Deleted old TTS file: {file_path}")

        except Exception as e:
            logger.error(f"Error cleaning up temp files: {str(e)}")

    @classmethod
    def get_available_voices(cls, model: str = "playai-tts") -> list[str]:
        """
        Get list of available voices for a model.

        Args:
            model: Model name ("playai-tts" or "playai-tts-arabic").

        Returns:
            List of available voice names (a copy; safe for callers to mutate).
        """
        if model == "playai-tts-arabic":
            return cls.ARABIC_VOICES.copy()
        else:
            return cls.ENGLISH_VOICES.copy()
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
# Global TTS instance (lazy initialized)
_tts_instance: Optional[GroqTTS] = None


def get_tts_instance() -> Optional[GroqTTS]:
    """Return the lazily-created shared GroqTTS client.

    Returns:
        The module-wide GroqTTS instance, or None when it cannot be
        created (no API key configured).
    """
    global _tts_instance

    # Fast path: already created on a previous call.
    if _tts_instance is not None:
        return _tts_instance

    try:
        _tts_instance = GroqTTS()
    except ValueError as exc:
        # GroqTTS raises ValueError when no API key is available.
        logger.warning(f"TTS not available: {str(exc)}")
        return None

    return _tts_instance
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
def generate_speech_for_text(text: str, voice: Optional[str] = None) -> Optional[str]:
    """Generate speech audio for *text* via the shared TTS client.

    Args:
        text: Text to convert to speech.
        voice: Optional voice name; the model default is used when omitted.

    Returns:
        Path to the generated audio file, or None when TTS is unavailable
        or generation fails.
    """
    tts = get_tts_instance()
    return tts.text_to_speech(text, voice=voice) if tts is not None else None
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
def generate_speech_for_session(text: str, voice: Optional[str] = None) -> Optional[str]:
    """Generate speech for text with session persistence for the chat interface.

    Args:
        text: Text to convert to speech.
        voice: Optional voice name; the model default is used when omitted.

    Returns:
        Path to an audio file that persists for the session, or None on error.
    """
    tts = get_tts_instance()
    if tts is None:
        return None

    # text_to_speech already writes into the session directory by default.
    generated = tts.text_to_speech(text, voice=voice)
    if not generated:
        return None

    # Only hand back paths that actually exist on disk.
    if Path(generated).exists():
        logger.info(f"Session TTS audio available at: {generated}")
        return str(generated)

    return None
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def clean_tts_markup(text: str) -> str:
    """Strip markdown markup from text so it reads naturally when spoken.

    Args:
        text: Text that may contain markdown formatting.

    Returns:
        Clean text suitable for TTS.
    """
    import re

    # (pattern, replacement, flags) applied in order; ordering matters
    # (e.g. bold must be stripped before italics).
    substitutions = (
        (r'\*\*(.*?)\*\*', r'\1', 0),            # bold
        (r'\*(.*?)\*', r'\1', 0),                # italics
        (r'`(.*?)`', r'\1', 0),                  # inline code
        (r'```.*?```', '', re.DOTALL),           # fenced code blocks
        (r'#{1,6}\s*(.*)', r'\1', 0),            # headers
        (r'\[([^\]]+)\]\([^\)]+\)', r'\1', 0),   # links -> link text
        (r'!\[([^\]]*)\]\([^\)]+\)', '', 0),     # images
        (r'^[-*+]\s+', '', re.MULTILINE),        # bullet lists
        (r'^\d+\.\s+', '', re.MULTILINE),        # numbered lists
        (r'^>\s+', '', re.MULTILINE),            # block quotes
        (r'\n\s*\n', '\n\n', 0),                 # collapse blank-line runs
        (r'[ \t]+', ' ', 0),                     # collapse spaces/tabs
    )

    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)

    return text.strip()
|
apps/gradio-app/test_audio_debug.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Debug script to test audio transcription functionality
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
import logging
|
| 8 |
+
|
| 9 |
+
# Add the src directory to the Python path
|
| 10 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
|
| 11 |
+
|
| 12 |
+
from fitness_gradio.ui.handlers import UIHandlers
|
| 13 |
+
|
| 14 |
+
# Set up logging
|
| 15 |
+
logging.basicConfig(level=logging.INFO)
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
def test_audio_message_processing():
    """Exercise UIHandlers.add_message_with_audio with text and audio inputs,
    then sanity-check audio-file extension detection."""

    # Replace with a real recording path to exercise the audio branch.
    sample_audio_path = "test_audio.wav"

    text_only = {"files": [], "text": "Hello, this is a test message"}
    audio_only = {"files": [sample_audio_path], "text": ""}

    print("Testing audio message processing...")

    # Text-only path should always be runnable.
    print("\n=== Testing text-only message ===")
    try:
        history, input_state = UIHandlers.add_message_with_audio([], text_only)
        print(f"History after text message: {history}")
        print(f"Input state: {input_state}")
    except Exception as e:
        print(f"Error with text message: {e}")

    # Audio path only runs when a sample file is actually present.
    if os.path.exists(sample_audio_path):
        print("\n=== Testing audio-only message ===")
        try:
            history, input_state = UIHandlers.add_message_with_audio([], audio_only)
            print(f"History after audio message: {history}")
            print(f"Input state: {input_state}")
        except Exception as e:
            print(f"Error with audio message: {e}")
    else:
        print(f"\n=== Skipping audio test (file {sample_audio_path} not found) ===")

    # Extension-based audio detection across common formats.
    print("\n=== Testing audio file detection ===")
    candidates = ("test.wav", "test.mp3", "test.m4a", "test.txt", "test.jpg", "test.webm")
    for candidate in candidates:
        verdict = "Audio" if UIHandlers.is_audio_file(candidate) else "Not audio"
        print(f"{candidate}: {verdict}")


if __name__ == "__main__":
    test_audio_message_processing()
|
apps/gradio-app/test_tts.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Test TTS functionality
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
from fitness_gradio.ui.tts_utils import GroqTTS, generate_speech_for_text, clean_tts_markup
|
| 6 |
+
|
| 7 |
+
def test_tts_setup():
    """Smoke-test TTS configuration: API key, client creation, text cleaning, voice lists.

    Returns:
        True when every check passes, False otherwise.
    """
    print("Testing TTS setup...")

    # TTS requires a Groq API key; bail out early with guidance when missing.
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        print("❌ GROQ_API_KEY not found in environment variables")
        print("Please set GROQ_API_KEY to test TTS functionality")
        return False

    print("✅ GROQ_API_KEY found")

    # Instantiating the client validates the key is wired through correctly.
    try:
        GroqTTS()
        print("✅ GroqTTS instance created successfully")
    except Exception as e:
        print(f"❌ Failed to create GroqTTS instance: {e}")
        return False

    # Verify markdown is stripped before text is sent to the TTS engine.
    test_text = "**Hello** this is a *test* with `code` and [links](http://example.com)"
    cleaned = clean_tts_markup(test_text)
    print("Text cleaning test:")
    print(f"  Original: {test_text}")
    print(f"  Cleaned: {cleaned}")

    # Both models should expose their documented voice rosters.
    english_voices = GroqTTS.get_available_voices("playai-tts")
    arabic_voices = GroqTTS.get_available_voices("playai-tts-arabic")
    print(f"✅ Available English voices: {len(english_voices)}")
    print(f"✅ Available Arabic voices: {len(arabic_voices)}")

    print("🎉 All TTS setup tests passed!")
    print("Note: Actual TTS generation will be tested when the UI is used with a valid API key.")
    return True


if __name__ == "__main__":
    test_tts_setup()
|
shared/src/fitness_core/agents/providers.py
CHANGED
|
@@ -65,7 +65,10 @@ class ModelProvider:
|
|
| 65 |
# Whisper models (Speech-to-Text)
|
| 66 |
"whisper-large-v3": "litellm/groq/whisper-large-v3", # Whisper Large v3
|
| 67 |
"whisper-large-v3-turbo": "litellm/groq/whisper-large-v3-turbo", # Whisper Large v3 Turbo
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
| 69 |
}
|
| 70 |
|
| 71 |
@classmethod
|
|
@@ -102,6 +105,8 @@ class ModelProvider:
|
|
| 102 |
"kimi-k2-instruct": "Moonshot Kimi K2 MoE - 1T parameters with tool use (Moonshot via Groq)",
|
| 103 |
"whisper-large-v3": "OpenAI Whisper Large v3 - best speech-to-text (OpenAI via Groq)",
|
| 104 |
"whisper-large-v3-turbo": "OpenAI Whisper Large v3 Turbo - faster speech-to-text (OpenAI via Groq)",
|
|
|
|
|
|
|
| 105 |
}
|
| 106 |
return model_info.get(model_name, "Model information not available")
|
| 107 |
|
|
|
|
| 65 |
# Whisper models (Speech-to-Text)
|
| 66 |
"whisper-large-v3": "litellm/groq/whisper-large-v3", # Whisper Large v3
|
| 67 |
"whisper-large-v3-turbo": "litellm/groq/whisper-large-v3-turbo", # Whisper Large v3 Turbo
|
| 68 |
+
|
| 69 |
+
# PlayAI TTS models (Text-to-Speech)
|
| 70 |
+
"playai-tts": "litellm/groq/playai-tts", # English TTS model
|
| 71 |
+
"playai-tts-arabic": "litellm/groq/playai-tts-arabic", # Arabic TTS model
|
| 72 |
}
|
| 73 |
|
| 74 |
@classmethod
|
|
|
|
| 105 |
"kimi-k2-instruct": "Moonshot Kimi K2 MoE - 1T parameters with tool use (Moonshot via Groq)",
|
| 106 |
"whisper-large-v3": "OpenAI Whisper Large v3 - best speech-to-text (OpenAI via Groq)",
|
| 107 |
"whisper-large-v3-turbo": "OpenAI Whisper Large v3 Turbo - faster speech-to-text (OpenAI via Groq)",
|
| 108 |
+
"playai-tts": "PlayAI English TTS - high-quality text-to-speech with 19 voices (PlayAI via Groq)",
|
| 109 |
+
"playai-tts-arabic": "PlayAI Arabic TTS - high-quality Arabic text-to-speech with 4 voices (PlayAI via Groq)",
|
| 110 |
}
|
| 111 |
return model_info.get(model_name, "Model information not available")
|
| 112 |
|