Spaces:
Runtime error
Runtime error
"""
Optimized SpeechT5 Armenian TTS Application
==========================================
High-performance Gradio application with advanced optimization features.
"""
import gradio as gr
import numpy as np
import logging
import time
from typing import Tuple, Optional
import os
import sys

# Add src to path for imports so `from src.pipeline import ...` works when
# the app is launched as a plain script from the repository root.
sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))

from src.pipeline import TTSPipeline

# Configure logging: timestamped records shared by every module logger.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Global pipeline instance; populated by initialize_pipeline() at startup
# and read by predict() / get_performance_info() / health_check().
tts_pipeline: Optional[TTSPipeline] = None
def initialize_pipeline() -> bool:
    """Initialize the global TTS pipeline with error handling.

    Returns:
        True when the pipeline was constructed and optimized successfully,
        False when any step raised (the error is logged with traceback).
    """
    global tts_pipeline
    try:
        logger.info("Initializing TTS Pipeline...")
        tts_pipeline = TTSPipeline(
            model_checkpoint="Edmon02/TTS_NB_2",
            max_chunk_length=200,  # Optimal for 5-20s clips
            crossfade_duration=0.1,
            use_mixed_precision=True
        )
        # Apply production optimizations
        tts_pipeline.optimize_for_production()
        logger.info("TTS Pipeline initialized successfully")
        return True
    except Exception:
        # logger.exception records the full traceback, not just the message,
        # which makes startup failures diagnosable from the Space logs.
        logger.exception("Failed to initialize TTS pipeline")
        return False
def predict(text: str, speaker: str,
            enable_chunking: bool = True,
            apply_processing: bool = True) -> Tuple[int, np.ndarray]:
    """
    Main prediction function with optimization and error handling.

    Args:
        text: Input text to synthesize
        speaker: Speaker selection, e.g. "BDL (male)" — the token before
            the first "(" is used as the speaker code
        enable_chunking: Whether to enable intelligent chunking
        apply_processing: Whether to apply audio post-processing

    Returns:
        Tuple of (sample_rate, audio_array). An empty int16 array at 16 kHz
        is returned for empty input, an uninitialized pipeline, or any
        synthesis failure, so the Gradio Audio component never crashes.
    """
    # perf_counter is monotonic, so the measured duration is immune to
    # wall-clock adjustments (time.time() is not).
    start_time = time.perf_counter()

    def _silence() -> Tuple[int, np.ndarray]:
        # Canonical "no audio" result understood by gr.Audio(type="numpy").
        return 16000, np.zeros(0, dtype=np.int16)

    try:
        # Validate inputs
        if not text or not text.strip():
            logger.warning("Empty text provided")
            return _silence()
        if tts_pipeline is None:
            logger.error("TTS pipeline not initialized")
            return _silence()

        # Extract speaker code from a "CODE (description)" style selection
        speaker_code = speaker.split("(")[0].strip()

        # Lazy %-style args: formatting is skipped if the level is disabled
        logger.info("Processing request: %d chars, speaker: %s",
                    len(text), speaker_code)

        # Synthesize speech
        sample_rate, audio = tts_pipeline.synthesize(
            text=text,
            speaker=speaker_code,
            enable_chunking=enable_chunking,
            apply_audio_processing=apply_processing
        )

        # Log performance: RTF (real-time factor) = processing time divided
        # by audio duration; lower is better, < 1 means faster than realtime.
        total_time = time.perf_counter() - start_time
        audio_duration = len(audio) / sample_rate if len(audio) > 0 else 0
        rtf = total_time / audio_duration if audio_duration > 0 else float('inf')
        logger.info("Request completed in %.3fs (RTF: %.2f)", total_time, rtf)

        return sample_rate, audio
    except Exception:
        # Log with traceback and degrade gracefully instead of surfacing
        # the exception to the UI.
        logger.exception("Prediction failed")
        return _silence()
def get_performance_info() -> str:
    """Return pipeline performance statistics as a Markdown-formatted string.

    Falls back to a short diagnostic message when the pipeline is not yet
    initialized or the stats lookup fails for any reason.
    """
    if tts_pipeline is None:
        return "Pipeline not initialized"
    try:
        # Keep every dict access inside the try: missing keys degrade to
        # the error message below instead of propagating.
        stats = tts_pipeline.get_performance_stats()
        pipeline_stats = stats['pipeline_stats']
        model_stats = stats['model_stats']
        report = f"""
**Performance Statistics:**
- Total Inferences: {pipeline_stats['total_inferences']}
- Average Processing Time: {pipeline_stats['avg_processing_time']:.3f}s
- Translation Cache Size: {stats['text_processor_stats']['translation_cache_size']}
- Model Inferences: {model_stats['total_inferences']}
- Average Model Time: {model_stats.get('avg_inference_time', 0):.3f}s
"""
        return report.strip()
    except Exception as e:
        return f"Error getting performance info: {e}"
def health_check() -> str:
    """Run the pipeline's self-check and return a one-line status string."""
    if tts_pipeline is None:
        return "โ Pipeline not initialized"
    try:
        # Key lookups stay inside the try so a malformed health dict is
        # reported as a failed check rather than raising.
        health = tts_pipeline.health_check()
        status = health["status"]
        if status == "healthy":
            return "โ All systems operational"
        if status == "degraded":
            return "โ ๏ธ Some components have issues"
        return f"โ System error: {health.get('error', 'Unknown error')}"
    except Exception as e:
        return f"โ Health check failed: {e}"
# ---------------------------------------------------------------------------
# Application metadata rendered by the Gradio UI.
# ---------------------------------------------------------------------------
TITLE = "๐ค SpeechT5 Armenian TTS - Optimized"

# Markdown body shown at the top of the interface.
DESCRIPTION = """
# High-Performance Armenian Text-to-Speech
This is an **optimized version** of SpeechT5 for Armenian language synthesis, featuring:
### ๐ **Performance Optimizations**
- **Intelligent Text Chunking**: Handles long texts by splitting them intelligently at sentence boundaries
- **Caching**: Translation and embedding caching for faster repeated requests
- **Mixed Precision**: GPU optimization with FP16 inference when available
- **Crossfading**: Smooth audio transitions between chunks for natural-sounding longer texts
### ๐ฏ **Advanced Features**
- **Smart Text Processing**: Automatic number-to-word conversion with Armenian translation
- **Audio Post-Processing**: Noise gating, normalization, and dynamic range optimization
- **Robust Error Handling**: Graceful fallbacks and comprehensive logging
- **Real-time Performance Monitoring**: Track processing times and system health
### ๐ **Usage Tips**
- **Short texts** (< 200 chars): Processed directly for maximum speed
- **Long texts**: Automatically chunked with overlap for seamless audio
- **Numbers**: Automatically converted to Armenian words
- **Performance**: Enable chunking for texts longer than a few sentences
### ๐ต **Audio Quality**
- Sample Rate: 16 kHz
- Optimized for natural prosody and clear pronunciation
- Cross-fade transitions for multi-chunk synthesis
The model was trained on short clips (5-20s) but uses advanced algorithms to handle longer texts effectively.
"""

# Rows for gr.Examples; each row matches the predict() inputs:
# [text, speaker, enable_chunking, apply_processing].
EXAMPLES = [
    # Short examples for quick testing
    ["ิฒีกึึ ีฑีฅีฆ, ีซีถีนีบีฅีีฝ ีฅึ:", "BDL (male)", True, True],
    ["ิฑีตีฝึ ึ ีฃีฅีฒีฅึีซีฏ ึ ึ ีง:", "BDL (male)", False, True],
    # Medium examples demonstrating chunking
    ["ีีกีตีกีฝีฟีกีถีถ ีธึีถีซ ีฐีกึีธึีฝีฟ ีบีกีฟีดีธึีฉีตีธึีถ ึ ีดีทีกีฏีธึีตีฉ: ิตึึีกีถีจ ีดีกีตึีกึีกีฒีกึีถ ีง, ีธึีถ ีธึีถีซ 2800 ีฟีกึีพีก ีบีกีฟีดีธึีฉีตีธึีถ:", "BDL (male)", True, True],
    # Long example with numbers
    ["ิฑึีกึีกีฟ ีฌีฅีผีจ ีขีกึีฑึีธึีฉีตีธึีถีจ 5165 ีดีฅีฟึ ีง: ิฑีตีถ ีีกีตีกีฝีฟีกีถีซ ีญีธึีฐึีคีกีถีซีทีถ ีง ึ ีฃีฟีถีพีธึีด ีง ินีธึึึีซีกีตีซ ีฟีกึีกีฎึีธึีด: ิผีฅีผีกีถ ีพึีก ีจีฝีฟ ิฑีฝีฟีพีกีฎีกีทีถีนีซี ีฏีกีถีฃีถีฅีฌ ีง ีีธีตีซ ีฟีกีบีกีถีจ 40 ึ ึีพีก ีปึีฐีฅีฒีฅีฒีซึ ีฐีฅีฟีธ:", "BDL (male)", True, True],
    # Technical example
    ["ีีฅึีฅีถีกีตีซ ีทีกึีชีซีนีจ 150 ีฑีซีธึีช ีง ึ 2.0 ีฌีซีฟึ ีฎีกีพีกีฌ ีธึีถีซ: ิฑีตีถ ีฏีกึีธีฒ ีง ีกึีกีฃีกึีถีฅีฌ 0-ีซึ 100 ีฏีด/ีช 8.5 ีพีกีตึีฏีตีกีถีธึีด:", "BDL (male)", True, True],
]

# Custom CSS injected into gr.Blocks for layout width and status styling.
CUSTOM_CSS = """
.gradio-container {
    max-width: 1200px !important;
    margin: auto !important;
}
.performance-info {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    padding: 15px;
    border-radius: 10px;
    color: white;
    margin: 10px 0;
}
.health-status {
    padding: 10px;
    border-radius: 8px;
    margin: 10px 0;
    font-weight: bold;
}
.status-healthy { background-color: #d4edda; color: #155724; }
.status-warning { background-color: #fff3cd; color: #856404; }
.status-error { background-color: #f8d7da; color: #721c24; }
"""
def create_interface():
    """Create and configure the Gradio interface.

    Builds a two-column Blocks layout: synthesis controls on the left,
    system-status panel on the right, with the audio output and clickable
    examples underneath. Returns the (not yet launched) Blocks app.
    """
    with gr.Blocks(
        theme=gr.themes.Soft(),
        css=CUSTOM_CSS,
        title="SpeechT5 Armenian TTS"
    ) as interface:

        # Header
        gr.Markdown(f"# {TITLE}")
        gr.Markdown(DESCRIPTION)

        with gr.Row():
            with gr.Column(scale=2):
                # Main input controls
                text_input = gr.Textbox(
                    label="๐ Input Text (Armenian)",
                    placeholder="ีีธึีฟึีกีฃึีฅึ ีฑีฅึ ีฟีฅึีฝีฟีจ ีกีตีฝีฟีฅีฒ...",
                    lines=3,
                    max_lines=10
                )

                with gr.Row():
                    # Single choice today; Radio keeps the UI ready for more voices.
                    speaker_input = gr.Radio(
                        label="๐ญ Speaker",
                        choices=["BDL (male)"],
                        value="BDL (male)"
                    )

                with gr.Row():
                    chunking_checkbox = gr.Checkbox(
                        label="๐งฉ Enable Intelligent Chunking",
                        value=True,
                        info="Automatically split long texts for better quality"
                    )
                    processing_checkbox = gr.Checkbox(
                        label="๐๏ธ Apply Audio Processing",
                        value=True,
                        info="Apply noise gating, normalization, and crossfading"
                    )

                # Generate button
                generate_btn = gr.Button(
                    "๐ค Generate Speech",
                    variant="primary",
                    size="lg"
                )

            with gr.Column(scale=1):
                # System information panel
                gr.Markdown("### ๐ System Status")
                health_display = gr.Textbox(
                    label="Health Status",
                    value="Initializing...",
                    interactive=False,
                    max_lines=1
                )
                performance_display = gr.Textbox(
                    label="Performance Stats",
                    value="No data yet",
                    interactive=False,
                    max_lines=8
                )
                refresh_btn = gr.Button("๐ Refresh Stats", size="sm")

        # Output
        audio_output = gr.Audio(
            label="๐ Generated Speech",
            type="numpy",
            interactive=False
        )

        # Examples section (not cached: synthesis runs on each click)
        gr.Markdown("### ๐ก Example Texts")
        gr.Examples(
            examples=EXAMPLES,
            inputs=[text_input, speaker_input, chunking_checkbox, processing_checkbox],
            outputs=[audio_output],
            fn=predict,
            cache_examples=False,
            label="Click any example to try it:"
        )

        # Event handlers
        generate_btn.click(
            fn=predict,
            inputs=[text_input, speaker_input, chunking_checkbox, processing_checkbox],
            outputs=[audio_output],
            show_progress=True
        )
        refresh_btn.click(
            fn=lambda: (health_check(), get_performance_info()),
            outputs=[health_display, performance_display],
            show_progress=False
        )

        # Auto-refresh health status on load
        interface.load(
            fn=lambda: (health_check(), get_performance_info()),
            outputs=[health_display, performance_display]
        )

    return interface
def main():
    """Main application entry point.

    Initializes the TTS pipeline, builds the Gradio UI, and launches the
    server. Exits with status 1 if the pipeline cannot be initialized.
    """
    logger.info("Starting SpeechT5 Armenian TTS Application")

    # Initialize pipeline
    if not initialize_pipeline():
        logger.error("Failed to initialize TTS pipeline - exiting")
        sys.exit(1)

    # Create and launch interface
    interface = create_interface()

    # Enable request queuing via the queue() API. The `enable_queue`
    # keyword of launch() was removed in Gradio 4.x and raises TypeError
    # there, crashing the app at startup; .queue() works on 3.x and 4.x.
    interface.queue()

    # Launch with optimized settings
    interface.launch(
        share=True,
        inbrowser=False,
        show_error=True,
        quiet=False,
        server_name="0.0.0.0",  # Allow external connections
        server_port=7860,       # Standard Gradio port
        max_threads=4,          # Limit concurrent requests
    )


if __name__ == "__main__":
    main()