Spaces:
Running
Running
# ---------------------------------------------------------------
# app.py - "TTS Showcase" (Gradio Implementation)
# ---------------------------------------------------------------
import os
from typing import Optional

import gradio as gr
# ---------- 1. Demo metadata ----------
# Hugging Face repository id -> short display name used in the UI.
# Insertion order is the order in which the gallery lists the models.
MODELS = {
    "nari-labs/Dia-1.6B": "Dia-1.6B",
    "hexgrad/Kokoro-82M": "Kokoro-82M",
    "sesame/csm-1b": "csm-1b",
    "SparkAudio/Spark-TTS-0.5B": "Spark-TTS-0.5B",
    "canopylabs/orpheus-3b-0.1-ft": "Orpheus-3b-0.1-ft",
    "SWivid/F5-TTS": "F5-TTS",
    "Zyphra/Zonos-v0.1-transformer": "Zonos-v0.1-transformer",
    "coqui/XTTS-v2": "XTTS-v2",
    "HKUSTAudio/Llasa-3B": "Llasa-3B",
    "amphion/MaskGCT": "MaskGCT",
    "OuteAI/Llama-OuteTTS-1.0-1B": "Llama-OuteTTS-1.0-1B",
    "ByteDance/MegaTTS3": "MegaTTS3",
}
# Performance ratings for each model.
# Every row lists its scores in the order given by _RATING_AXES; the
# comprehension expands each row into a {axis: score} dict.
_RATING_AXES = ("naturalness", "intelligibility", "controllability")
MODEL_RATINGS = {
    repo: dict(zip(_RATING_AXES, scores))
    for repo, scores in (
        ("nari-labs/Dia-1.6B", ("Good", "Moderate", "Good")),
        ("hexgrad/Kokoro-82M", ("Good", "Excellent", "Moderate")),
        ("sesame/csm-1b", ("Excellent", "Excellent", "Good")),
        ("SparkAudio/Spark-TTS-0.5B", ("Excellent", "Excellent", "Moderate")),
        ("canopylabs/orpheus-3b-0.1-ft", ("Excellent", "Excellent", "Moderate")),
        ("SWivid/F5-TTS", ("Excellent", "Excellent", "Good")),
        ("Zyphra/Zonos-v0.1-transformer", ("Good", "Moderate", "Excellent")),
        ("coqui/XTTS-v2", ("Good", "Excellent", "Moderate")),
        ("HKUSTAudio/Llasa-3B", ("Excellent", "Good", "Moderate")),
        ("amphion/MaskGCT", ("Good", "Excellent", "Moderate")),
        ("OuteAI/Llama-OuteTTS-1.0-1B", ("Moderate", "Moderate", "Moderate")),
        ("ByteDance/MegaTTS3", ("Good", "Good", "Moderate")),
    )
}
# One-line blurb per model, keyed by repository id (same keys as MODELS).
MODEL_DESCRIPTIONS = {
    "nari-labs/Dia-1.6B": "Expressive conversational voice with moderate quality",
    "hexgrad/Kokoro-82M": "Lightweight powerhouse with excellent clarity",
    "sesame/csm-1b": "High-quality synthesis with excellent naturalness",
    "SparkAudio/Spark-TTS-0.5B": "Efficient model with excellent performance",
    "canopylabs/orpheus-3b-0.1-ft": "Fine-tuned large model with superior quality",
    "SWivid/F5-TTS": "Advanced flow-based synthesis with top ratings",
    "Zyphra/Zonos-v0.1-transformer": "Highly controllable transformer-based model",
    "coqui/XTTS-v2": "Multi-lingual excellence with proven performance",
    "HKUSTAudio/Llasa-3B": "Large-scale audio synthesis model",
    "amphion/MaskGCT": "Masked generative modeling approach",
    "OuteAI/Llama-OuteTTS-1.0-1B": "LLM-based TTS with moderate performance",
    "ByteDance/MegaTTS3": "Industrial-grade TTS solution",
}
# Root folder holding one subfolder of audio clips per model.
SAMPLES_DIR = "samples"
# File name of the clip expected inside each model's subfolder.
CLIP_NAME = "generated-audio.wav"

# Prompt text used for the evaluation.
TEST_PROMPT = (
    "Hello, this is a universal test sentence. "
    "Can the advanced Zylophonic system clearly articulate this "
    "and express a hint of excitement? "
    "The quick brown fox certainly hopes so!"
)
def repo_to_slug(repo: str) -> str:
    """Return ``repo`` with every "/" swapped for "_" (folder-name form).

    Example: ``"huggingface/xxx"`` becomes ``"huggingface_xxx"``.
    """
    segments = repo.split("/")
    return "_".join(segments)
def get_rating_emoji(rating: str) -> str:
    """Convert a rating label to a coloured-circle emoji.

    Args:
        rating: Rating label. "Excellent" and "Good" are matched exactly
            (case-sensitive).

    Returns:
        A green circle for "Excellent", a yellow circle for "Good", and an
        orange circle for every other value (including "Moderate").
    """
    # The previous literals were garbled (they look like UTF-8 emoji decoded
    # through a single-byte code page). Escapes are used instead of raw
    # glyphs so the values survive any future re-encoding of this file.
    emoji_by_rating = {
        "Excellent": "\U0001F7E2",  # large green circle
        "Good": "\U0001F7E1",  # large yellow circle
    }
    # Anything unrecognised falls through to orange, as before.
    return emoji_by_rating.get(rating, "\U0001F7E0")  # large orange circle
def get_audio_path(repo: str) -> Optional[str]:
    """Locate the sample clip for a repository.

    The clip is looked up at ``SAMPLES_DIR/<repo slug>/CLIP_NAME``, where the
    slug comes from ``repo_to_slug``.

    Args:
        repo: Repository id, e.g. ``"hexgrad/Kokoro-82M"``.

    Returns:
        The clip path if a regular file exists there, otherwise ``None``.
        (The annotation used to claim ``str`` even though ``None`` is a
        normal result that callers branch on.)
    """
    audio_path = os.path.join(SAMPLES_DIR, repo_to_slug(repo), CLIP_NAME)
    if os.path.isfile(audio_path):
        return audio_path
    return None
def filter_models(search_term: Optional[str]) -> list:
    """Return the repository ids whose id or display name matches a search.

    Matching is a case-insensitive substring test against both the
    repository id and the display name in ``MODELS``.

    Args:
        search_term: Text typed by the user. ``None``, an empty string or a
            whitespace-only string all mean "no filter".

    Returns:
        A list of repository ids in ``MODELS`` order; every id when no
        filter is active.
    """
    # Guard against None (previously an AttributeError on .strip()) and
    # normalise the term once instead of stripping it twice.
    needle = (search_term or "").strip().lower()
    if not needle:
        return list(MODELS)
    return [
        repo
        for repo, name in MODELS.items()
        if needle in repo.lower() or needle in name.lower()
    ]
def create_model_card(repo: str) -> str:
    """Create the HTML card for one model: name, description, ratings, repo id.

    Args:
        repo: Repository id; must be a key of ``MODELS``.

    Returns:
        An HTML snippet (a single ``<div class="model-card">``).

    Raises:
        KeyError: If ``repo`` is not present in ``MODELS``.
    """
    display_name = MODELS[repo]
    description = MODEL_DESCRIPTIONS.get(repo, "High-quality TTS model")
    ratings = MODEL_RATINGS.get(repo, {})

    # One <span> per rating axis; a missing axis is shown as "Moderate".
    rating_spans = []
    for label, key in (
        ("Naturalness", "naturalness"),
        ("Intelligibility", "intelligibility"),
        ("Controllability", "controllability"),
    ):
        value = ratings.get(key, "Moderate")
        rating_spans.append(
            f'<span style="color: #888;"><strong style="color: #888;">{label}:</strong> '
            f"{get_rating_emoji(value)} {value}</span>"
        )
    ratings_html = "\n        ".join(rating_spans)

    # The description was looked up but never rendered before, although the
    # docstring promised it; it now appears under the heading.
    # NOTE(review): the glyph before {display_name} looks mis-encoded in this
    # copy of the file and cannot be recovered unambiguously - restore the
    # intended emoji from the original UTF-8 source.
    return f"""
    <div class="model-card" style="border: 1px solid #ddd; border-radius: 12px; padding: 20px; margin: 10px 0; background: white;">
        <h3 style="color: #2c3e50; margin-top: 0;">π€ {display_name}</h3>
        <p style="color: #888; margin: 5px 0;">{description}</p>
        <div style="display: flex; gap: 15px; margin: 15px 0;">
        {ratings_html}
        </div>
        <p style="font-size: 0.9em; color: #888; margin: 5px 0;">Repository: <code style="color: #888;">{repo}</code></p>
    </div>
    """
# ---------- 2. Custom CSS ----------
# Stylesheet handed to gr.Blocks(css=...). The id selectors match the ids on
# the raw HTML snippets in create_interface; .model-card matches the div
# produced by create_model_card.
# NOTE(review): no visible code assigns the "model-grid" class, so the two
# .model-grid rules may currently match nothing - confirm before relying on
# them.
custom_css = """
#title {
    text-align: center;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 2rem;
    border-radius: 15px;
    margin-bottom: 2rem;
}
#intro-section {
    background: #f8f9fa;
    color: #2c3e50;
    padding: 1.5rem;
    border-radius: 10px;
    margin: 1rem 0;
    border-left: 4px solid #667eea;
}
#intro-section h2,
#intro-section h3 {
    color: #2c3e50;
}
#intro-section p {
    color: #34495e;
}
#intro-section ul li {
    color: #34495e;
}
#intro-section .mission-text {
    color: #667eea !important;
    font-weight: bold;
    text-align: center;
}
#intro-section strong {
    color: #2c3e50 !important;
}
#intro-section em {
    color: #2c3e50 !important;
}
#intro-section .mission-text strong {
    color: #667eea !important;
}
#test-prompt {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 1.5rem;
    border-radius: 10px;
    text-align: center;
    margin: 1rem 0;
}
.model-grid {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
    gap: 1rem;
    margin: 1rem 0;
}
#footer {
    text-align: center;
    padding: 2rem;
    color: #666;
    border-top: 1px solid #eee;
    margin-top: 2rem;
}
/* make all the text in our white-background cards dark */
.model-grid .gr-html * {
    color: #2c3e50 !important;
}
.model-card {
    background: white;
    color: #2c3e50 !important;
    border: 1px solid #ddd;
    border-radius: 12px;
    padding: 20px;
    margin: 10px 0;
}
"""
# ---------- 3. Main Gradio Interface ----------
def create_interface() -> gr.Blocks:
    """Build the gallery UI and return it without launching.

    The layout is created top to bottom inside one ``gr.Blocks`` context:
    title banner, introduction, three evaluation-criteria tiles, a search
    row, one card plus audio player per entry of ``MODELS``, and a collapsed
    methodology accordion. The search box and the Clear button are wired up
    before returning.

    Returns:
        The assembled ``gr.Blocks`` object (named ``demo`` locally).
    """
    # NOTE(review): several emoji/symbol literals in the user-facing strings
    # below look mis-encoded in this copy of the file (e.g. the glyphs in the
    # page title and section headings). They are left untouched here; restore
    # the intended characters from the original UTF-8 source.
    with gr.Blocks(
        css=custom_css,
        title="ποΈ TTS Model Gallery",
        theme=gr.themes.Soft(),
    ) as demo:
        # Header Section (styled by the #title rule in custom_css)
        gr.HTML("""
        <div id="title">
        <h1>ποΈ Open-Source Text-to-Speech Model Gallery</h1>
        </div>
        """)
        # Introduction Section (styled by the #intro-section rules)
        gr.HTML("""
        <div id="intro-section">
        <h3>π¬ Our Exciting Quest</h3>
        <p>We're on a thrilling journey to help developers discover the perfect TTS models for their innovative audio projects!
        We've put these 12 cutting-edge models through their paces using a scientifically designed universal test prompt.</p>
        <p><strong>Featured TTS Engines:</strong></p>
        <ul>
        <li>π <strong>Dia-1.6B</strong> - Expressive conversational voice</li>
        <li>πͺ <strong>Kokoro-82M</strong> - Lightweight powerhouse</li>
        <li>π¨ <strong>F5-TTS</strong> - Advanced flow-based synthesis</li>
        <li>π΅ <strong>XTTS-v2</strong> - Multi-lingual excellence</li>
        <li>πΌ <strong>MaskGCT</strong> - Masked generative modeling</li>
        <li>π€ <strong>Llasa-3B</strong> - Large-scale audio synthesis</li>
        <li><em>...and 6 more incredible models!</em></li>
        </ul>
        </div>
        """)
        # Test Prompt Section (currently disabled)
        # gr.HTML(f"""
        # <div id="test-prompt">
        # <h3>π― Universal Test Prompt</h3>
        # <p style="font-style: italic; font-size: 1.1em;">"{TEST_PROMPT}"</p>
        # <p style="font-size: 0.9em; opacity: 0.9;">
        # Carefully crafted to test naturalness, intelligibility, and technical pronunciation across all models
        # </p>
        # </div>
        # """)
        # Evaluation Criteria: three equally structured tiles in one row
        with gr.Row():
            with gr.Column():
                gr.HTML("""
                <div style="text-align: center; padding: 1rem; background: rgba(102, 126, 234, 0.1); border-radius: 8px;">
                <div style="font-size: 2rem;">π</div>
                <strong>Naturalness</strong><br>
                <small>Human-like quality & emotional expression</small>
                </div>
                """)
            with gr.Column():
                gr.HTML("""
                <div style="text-align: center; padding: 1rem; background: rgba(102, 126, 234, 0.1); border-radius: 8px;">
                <div style="font-size: 2rem;">π£οΈ</div>
                <strong>Intelligibility</strong><br>
                <small>Clarity & pronunciation accuracy</small>
                </div>
                """)
            with gr.Column():
                gr.HTML("""
                <div style="text-align: center; padding: 1rem; background: rgba(102, 126, 234, 0.1); border-radius: 8px;">
                <div style="font-size: 2rem;">ποΈ</div>
                <strong>Controllability</strong><br>
                <small>Tone, pace & parameter flexibility</small>
                </div>
                """)
        gr.Markdown("---")
        # Search and Filter Section
        with gr.Row():
            search_box = gr.Textbox(
                label="π Search Models",
                placeholder="Filter by name or family (e.g., 'F5', 'TTS', '3B')",
                value="",
                scale=3
            )
            clear_btn = gr.Button("Clear", scale=1)
        # Model Gallery Section
        gr.Markdown("## π§ Model Gallery")
        # Create model cards and audio players.
        # Each entry is (repo, card component, player-or-placeholder component);
        # the list order defines the order of the search callback's outputs.
        model_components = []
        for repo, display_name in MODELS.items():
            with gr.Group():
                # Model information card
                model_info = gr.HTML(create_model_card(repo))
                # Audio player, or a red notice when get_audio_path finds no clip
                audio_path = get_audio_path(repo)
                if audio_path:
                    audio_player = gr.Audio(
                        value=audio_path,
                        label=f"π΅ {display_name} Audio Sample",
                        interactive=False
                    )
                else:
                    audio_player = gr.HTML(f"<p style='color: red;'>π€·ββοΈ Audio sample not found for {display_name}</p>")
                model_components.append((repo, model_info, audio_player))

        # Search functionality
        def update_visibility(search_term):
            """Return visibility updates for every card/player pair.

            Two updates are produced per model, in ``model_components``
            order, which is the order of the ``outputs`` list given to
            ``search_box.change`` below.
            """
            filtered_repos = filter_models(search_term)
            updates = []
            for repo, model_info, audio_player in model_components:
                visible = repo in filtered_repos
                updates.extend([
                    gr.update(visible=visible),  # model_info
                    gr.update(visible=visible)  # audio_player
                ])
            return updates

        # Connect search functionality.
        # NOTE(review): only the card and player are outputs; the enclosing
        # gr.Group is not toggled by this callback.
        search_box.change(
            fn=update_visibility,
            inputs=[search_box],
            outputs=[comp for repo, model_info, audio_player in model_components for comp in [model_info, audio_player]]
        )
        # Clear sets the textbox back to the empty string.
        clear_btn.click(
            fn=lambda: "",
            outputs=[search_box]
        )
        # Methodology Section (collapsed by default).
        # The prompt sentence below is a literal copy of TEST_PROMPT.
        with gr.Accordion("π Detailed Evaluation Methodology", open=False):
            gr.Markdown("""
            ### Test Prompt
            `Hello, this is a universal test sentence. Can the advanced Zylophonic system clearly articulate this and express a hint of excitement? The quick brown fox certainly hopes so!`
            ### Model Evaluation Criteria:
            π **Naturalness (Human-like Quality)**
            - Prosody and rhythm patterns
            - Emotional expression capability
            - Voice texture and warmth
            - Natural breathing and pauses
            π£οΈ **Intelligibility (Clarity & Accuracy)**
            - Word pronunciation precision
            - Consonant and vowel clarity
            - Sentence comprehensibility
            - Technical term handling
            ποΈ **Controllability (Flexibility)**
            - Parameter responsiveness
            - Tone modification capability
            - Speed and pitch control
            - Customization potential
            ### Key Insights:
            - Smaller models (82M-500M) can excel in specific scenarios
            - Larger models (1B-3B+) offer more versatility but require more resources
            - Architecture matters as much as parameter count
            - Training data quality significantly impacts output quality
            """)
        # Footer (currently disabled)
        # gr.HTML("""
        # <div id="footer">
        # <p><strong>π Ready to deploy your own TTS model?</strong></p>
        # <p>This demo showcases the power of open-source TTS technology. Each model offers unique strengths for different applications.</p>
        # <p><em>Built with β€οΈ using Gradio β’ All models are open-source and available on Hugging Face</em></p>
        # <p>β‘ Powered by Inferless</p>
        # </div>
        # """)
    return demo
# ---------- 4. Launch the application ----------
if __name__ == "__main__":
    # Keyword options passed unchanged to Blocks.launch().
    launch_options = {
        "share": True,
        "inbrowser": True,
        "show_error": True,
    }
    demo = create_interface()
    demo.launch(**launch_options)