Spaces:
Runtime error
Runtime error
"""
Optimized SpeechT5 Armenian TTS Application
==========================================
High-performance Gradio application with advanced optimization features.
"""
import gradio as gr
import numpy as np
import logging
import time
from typing import Tuple, Optional
import os
import sys

# Add src to path for imports so `from src.pipeline import ...` works when
# the app is launched as a plain script from the repository root.
sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))

from src.pipeline import TTSPipeline

# Configure logging: timestamped records shared by every module logger.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Global pipeline instance; populated by initialize_pipeline() at startup
# and read by predict() / get_performance_info() / health_check().
tts_pipeline: Optional[TTSPipeline] = None
def initialize_pipeline() -> bool:
    """Initialize the global TTS pipeline with error handling.

    Returns:
        True when the pipeline was constructed and optimized successfully,
        False when any step raised (the error is logged with traceback).
    """
    global tts_pipeline
    try:
        logger.info("Initializing TTS Pipeline...")
        tts_pipeline = TTSPipeline(
            model_checkpoint="Edmon02/TTS_NB_2",
            max_chunk_length=200,  # Optimal for 5-20s clips
            crossfade_duration=0.1,
            use_mixed_precision=True
        )
        # Apply production optimizations
        tts_pipeline.optimize_for_production()
        logger.info("TTS Pipeline initialized successfully")
        return True
    except Exception:
        # logger.exception records the full traceback, not just the message,
        # which makes startup failures diagnosable from the Space logs.
        logger.exception("Failed to initialize TTS pipeline")
        return False
def predict(text: str, speaker: str,
            enable_chunking: bool = True,
            apply_processing: bool = True) -> Tuple[int, np.ndarray]:
    """
    Main prediction function with optimization and error handling.

    Args:
        text: Input text to synthesize
        speaker: Speaker selection, e.g. "BDL (male)" — the token before
            the first "(" is used as the speaker code
        enable_chunking: Whether to enable intelligent chunking
        apply_processing: Whether to apply audio post-processing

    Returns:
        Tuple of (sample_rate, audio_array). An empty int16 array at 16 kHz
        is returned for empty input, an uninitialized pipeline, or any
        synthesis failure, so the Gradio Audio component never crashes.
    """
    # perf_counter is monotonic, so the measured duration is immune to
    # wall-clock adjustments (time.time() is not).
    start_time = time.perf_counter()

    def _silence() -> Tuple[int, np.ndarray]:
        # Canonical "no audio" result understood by gr.Audio(type="numpy").
        return 16000, np.zeros(0, dtype=np.int16)

    try:
        # Validate inputs
        if not text or not text.strip():
            logger.warning("Empty text provided")
            return _silence()
        if tts_pipeline is None:
            logger.error("TTS pipeline not initialized")
            return _silence()

        # Extract speaker code from a "CODE (description)" style selection
        speaker_code = speaker.split("(")[0].strip()

        # Lazy %-style args: formatting is skipped if the level is disabled
        logger.info("Processing request: %d chars, speaker: %s",
                    len(text), speaker_code)

        # Synthesize speech
        sample_rate, audio = tts_pipeline.synthesize(
            text=text,
            speaker=speaker_code,
            enable_chunking=enable_chunking,
            apply_audio_processing=apply_processing
        )

        # Log performance: RTF (real-time factor) = processing time divided
        # by audio duration; lower is better, < 1 means faster than realtime.
        total_time = time.perf_counter() - start_time
        audio_duration = len(audio) / sample_rate if len(audio) > 0 else 0
        rtf = total_time / audio_duration if audio_duration > 0 else float('inf')
        logger.info("Request completed in %.3fs (RTF: %.2f)", total_time, rtf)

        return sample_rate, audio
    except Exception:
        # Log with traceback and degrade gracefully instead of surfacing
        # the exception to the UI.
        logger.exception("Prediction failed")
        return _silence()
def get_performance_info() -> str:
    """Return pipeline performance statistics as a Markdown-formatted string.

    Falls back to a short diagnostic message when the pipeline is not yet
    initialized or the stats lookup fails for any reason.
    """
    if tts_pipeline is None:
        return "Pipeline not initialized"
    try:
        # Keep every dict access inside the try: missing keys degrade to
        # the error message below instead of propagating.
        stats = tts_pipeline.get_performance_stats()
        pipeline_stats = stats['pipeline_stats']
        model_stats = stats['model_stats']
        report = f"""
**Performance Statistics:**
- Total Inferences: {pipeline_stats['total_inferences']}
- Average Processing Time: {pipeline_stats['avg_processing_time']:.3f}s
- Translation Cache Size: {stats['text_processor_stats']['translation_cache_size']}
- Model Inferences: {model_stats['total_inferences']}
- Average Model Time: {model_stats.get('avg_inference_time', 0):.3f}s
"""
        return report.strip()
    except Exception as e:
        return f"Error getting performance info: {e}"
def health_check() -> str:
    """Run the pipeline's self-check and return a one-line status string."""
    if tts_pipeline is None:
        return "โ Pipeline not initialized"
    try:
        # Key lookups stay inside the try so a malformed health dict is
        # reported as a failed check rather than raising.
        health = tts_pipeline.health_check()
        status = health["status"]
        if status == "healthy":
            return "โ All systems operational"
        if status == "degraded":
            return "โ ๏ธ Some components have issues"
        return f"โ System error: {health.get('error', 'Unknown error')}"
    except Exception as e:
        return f"โ Health check failed: {e}"
# ---------------------------------------------------------------------------
# Application metadata rendered by the Gradio UI.
# ---------------------------------------------------------------------------
TITLE = "๐ค SpeechT5 Armenian TTS - Optimized"

# Markdown body shown at the top of the interface.
DESCRIPTION = """
# High-Performance Armenian Text-to-Speech
This is an **optimized version** of SpeechT5 for Armenian language synthesis, featuring:
### ๐ **Performance Optimizations**
- **Intelligent Text Chunking**: Handles long texts by splitting them intelligently at sentence boundaries
- **Caching**: Translation and embedding caching for faster repeated requests
- **Mixed Precision**: GPU optimization with FP16 inference when available
- **Crossfading**: Smooth audio transitions between chunks for natural-sounding longer texts
### ๐ฏ **Advanced Features**
- **Smart Text Processing**: Automatic number-to-word conversion with Armenian translation
- **Audio Post-Processing**: Noise gating, normalization, and dynamic range optimization
- **Robust Error Handling**: Graceful fallbacks and comprehensive logging
- **Real-time Performance Monitoring**: Track processing times and system health
### ๐ **Usage Tips**
- **Short texts** (< 200 chars): Processed directly for maximum speed
- **Long texts**: Automatically chunked with overlap for seamless audio
- **Numbers**: Automatically converted to Armenian words
- **Performance**: Enable chunking for texts longer than a few sentences
### ๐ต **Audio Quality**
- Sample Rate: 16 kHz
- Optimized for natural prosody and clear pronunciation
- Cross-fade transitions for multi-chunk synthesis
The model was trained on short clips (5-20s) but uses advanced algorithms to handle longer texts effectively.
"""

# Rows for gr.Examples; each row matches the predict() inputs:
# [text, speaker, enable_chunking, apply_processing].
EXAMPLES = [
    # Short examples for quick testing
    ["ิฒีกึึ ีฑีฅีฆ, ีซีถีนีบีฅีีฝ ีฅึ:", "BDL (male)", True, True],
    ["ิฑีตีฝึ ึ ีฃีฅีฒีฅึีซีฏ ึ ึ ีง:", "BDL (male)", False, True],
    # Medium examples demonstrating chunking
    ["ีีกีตีกีฝีฟีกีถีถ ีธึีถีซ ีฐีกึีธึีฝีฟ ีบีกีฟีดีธึีฉีตีธึีถ ึ ีดีทีกีฏีธึีตีฉ: ิตึึีกีถีจ ีดีกีตึีกึีกีฒีกึีถ ีง, ีธึีถ ีธึีถีซ 2800 ีฟีกึีพีก ีบีกีฟีดีธึีฉีตีธึีถ:", "BDL (male)", True, True],
    # Long example with numbers
    ["ิฑึีกึีกีฟ ีฌีฅีผีจ ีขีกึีฑึีธึีฉีตีธึีถีจ 5165 ีดีฅีฟึ ีง: ิฑีตีถ ีีกีตีกีฝีฟีกีถีซ ีญีธึีฐึีคีกีถีซีทีถ ีง ึ ีฃีฟีถีพีธึีด ีง ินีธึึึีซีกีตีซ ีฟีกึีกีฎึีธึีด: ิผีฅีผีกีถ ีพึีก ีจีฝีฟ ิฑีฝีฟีพีกีฎีกีทีถีนีซี ีฏีกีถีฃีถีฅีฌ ีง ีีธีตีซ ีฟีกีบีกีถีจ 40 ึ ึีพีก ีปึีฐีฅีฒีฅีฒีซึ ีฐีฅีฟีธ:", "BDL (male)", True, True],
    # Technical example
    ["ีีฅึีฅีถีกีตีซ ีทีกึีชีซีนีจ 150 ีฑีซีธึีช ีง ึ 2.0 ีฌีซีฟึ ีฎีกีพีกีฌ ีธึีถีซ: ิฑีตีถ ีฏีกึีธีฒ ีง ีกึีกีฃีกึีถีฅีฌ 0-ีซึ 100 ีฏีด/ีช 8.5 ีพีกีตึีฏีตีกีถีธึีด:", "BDL (male)", True, True],
]

# Custom CSS injected into gr.Blocks for layout width and status styling.
CUSTOM_CSS = """
.gradio-container {
    max-width: 1200px !important;
    margin: auto !important;
}
.performance-info {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    padding: 15px;
    border-radius: 10px;
    color: white;
    margin: 10px 0;
}
.health-status {
    padding: 10px;
    border-radius: 8px;
    margin: 10px 0;
    font-weight: bold;
}
.status-healthy { background-color: #d4edda; color: #155724; }
.status-warning { background-color: #fff3cd; color: #856404; }
.status-error { background-color: #f8d7da; color: #721c24; }
"""
def create_interface():
    """Create and configure the Gradio interface.

    Builds a two-column Blocks layout: synthesis controls on the left,
    system-status panel on the right, with the audio output and clickable
    examples underneath. Returns the (not yet launched) Blocks app.
    """
    with gr.Blocks(
        theme=gr.themes.Soft(),
        css=CUSTOM_CSS,
        title="SpeechT5 Armenian TTS"
    ) as interface:

        # Header
        gr.Markdown(f"# {TITLE}")
        gr.Markdown(DESCRIPTION)

        with gr.Row():
            with gr.Column(scale=2):
                # Main input controls
                text_input = gr.Textbox(
                    label="๐ Input Text (Armenian)",
                    placeholder="ีีธึีฟึีกีฃึีฅึ ีฑีฅึ ีฟีฅึีฝีฟีจ ีกีตีฝีฟีฅีฒ...",
                    lines=3,
                    max_lines=10
                )

                with gr.Row():
                    # Single choice today; Radio keeps the UI ready for more voices.
                    speaker_input = gr.Radio(
                        label="๐ญ Speaker",
                        choices=["BDL (male)"],
                        value="BDL (male)"
                    )

                with gr.Row():
                    chunking_checkbox = gr.Checkbox(
                        label="๐งฉ Enable Intelligent Chunking",
                        value=True,
                        info="Automatically split long texts for better quality"
                    )
                    processing_checkbox = gr.Checkbox(
                        label="๐๏ธ Apply Audio Processing",
                        value=True,
                        info="Apply noise gating, normalization, and crossfading"
                    )

                # Generate button
                generate_btn = gr.Button(
                    "๐ค Generate Speech",
                    variant="primary",
                    size="lg"
                )

            with gr.Column(scale=1):
                # System information panel
                gr.Markdown("### ๐ System Status")
                health_display = gr.Textbox(
                    label="Health Status",
                    value="Initializing...",
                    interactive=False,
                    max_lines=1
                )
                performance_display = gr.Textbox(
                    label="Performance Stats",
                    value="No data yet",
                    interactive=False,
                    max_lines=8
                )
                refresh_btn = gr.Button("๐ Refresh Stats", size="sm")

        # Output
        audio_output = gr.Audio(
            label="๐ Generated Speech",
            type="numpy",
            interactive=False
        )

        # Examples section (not cached: synthesis runs on each click)
        gr.Markdown("### ๐ก Example Texts")
        gr.Examples(
            examples=EXAMPLES,
            inputs=[text_input, speaker_input, chunking_checkbox, processing_checkbox],
            outputs=[audio_output],
            fn=predict,
            cache_examples=False,
            label="Click any example to try it:"
        )

        # Event handlers
        generate_btn.click(
            fn=predict,
            inputs=[text_input, speaker_input, chunking_checkbox, processing_checkbox],
            outputs=[audio_output],
            show_progress=True
        )
        refresh_btn.click(
            fn=lambda: (health_check(), get_performance_info()),
            outputs=[health_display, performance_display],
            show_progress=False
        )

        # Auto-refresh health status on load
        interface.load(
            fn=lambda: (health_check(), get_performance_info()),
            outputs=[health_display, performance_display]
        )

    return interface
def main():
    """Main application entry point.

    Initializes the TTS pipeline, builds the Gradio UI, and launches the
    server. Exits with status 1 if the pipeline cannot be initialized.
    """
    logger.info("Starting SpeechT5 Armenian TTS Application")

    # Initialize pipeline
    if not initialize_pipeline():
        logger.error("Failed to initialize TTS pipeline - exiting")
        sys.exit(1)

    # Create and launch interface
    interface = create_interface()

    # Enable request queuing via the queue() API. The `enable_queue`
    # keyword of launch() was removed in Gradio 4.x and raises TypeError
    # there, crashing the app at startup; .queue() works on 3.x and 4.x.
    interface.queue()

    # Launch with optimized settings
    interface.launch(
        share=True,
        inbrowser=False,
        show_error=True,
        quiet=False,
        server_name="0.0.0.0",  # Allow external connections
        server_port=7860,       # Standard Gradio port
        max_threads=4,          # Limit concurrent requests
    )


if __name__ == "__main__":
    main()