"""Gradio front-end for voice cloning with the Coqui XTTS v2 model.

The module loads the XTTS model once at import time, exposes ``clone`` as the
Gradio callback, and launches the web UI when run as a script.
"""

import gc
import os
import tempfile
import time

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from pydub import AudioSegment
from TTS.api import TTS

os.environ["COQUI_TOS_AGREED"] = "1"

# Where the synthesized audio is written; overwritten on every request.
OUTPUT_PATH = "./output.wav"

# Performance settings: let cuDNN auto-tune kernels (non-deterministic).
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False

# Device detection.
use_gpu = torch.cuda.is_available()
device = "cuda" if use_gpu else "cpu"
print(f"[INFO] Using device: {device}")
if use_gpu:
    print(f"[INFO] GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

# XTTS model initialization. On any failure ``tts`` is left as ``None`` and
# ``clone`` reports the problem to the user instead of crashing the app.
try:
    model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
    tts = TTS(model_name, gpu=use_gpu, progress_bar=False)

    if not hasattr(tts, "synthesizer") or not hasattr(tts.synthesizer, "tts_model"):
        raise RuntimeError("XTTS model failed to load correctly.")

    if hasattr(tts.synthesizer.tts_model, 'inference'):
        # NOTE(review): these attribute names look like VITS-style inference
        # settings; confirm that the XTTS model actually reads them.
        tts.synthesizer.tts_model.inference_noise_scale = 0.667
        tts.synthesizer.tts_model.inference_noise_scale_w = 0.8
        tts.synthesizer.tts_model.length_scale = 1.0

    print(f"[INFO] XTTS model loaded successfully. GPU enabled: {use_gpu}")
except Exception as e:
    print(f"[ERROR] Failed to initialize XTTS model: {str(e)}")
    tts = None


def _remove_quietly(path):
    """Delete *path*, ignoring a missing file or any other OS-level failure."""
    try:
        os.remove(path)
    except OSError:
        pass


def _release_memory():
    """Free cached CUDA memory (when on GPU) and run the garbage collector."""
    if use_gpu:
        torch.cuda.empty_cache()
    gc.collect()


def preprocess_audio(audio_path, target_sr=22050, max_duration=30):
    """Write a trimmed, mono, resampled copy of the reference audio.

    Args:
        audio_path: Path of the audio file to read.
        target_sr: Sample rate of the written file.
        max_duration: Maximum length, in seconds, kept after silence trimming.

    Returns:
        Path of a new temporary ``.wav`` file that the caller must delete, or
        ``audio_path`` itself when preprocessing fails for any reason.
    """
    try:
        audio_data, sr = sf.read(audio_path)

        # Down-mix multi-channel audio to mono.
        if len(audio_data.shape) > 1:
            audio_data = np.mean(audio_data, axis=1)

        # Trim leading/trailing silence: anything below 1% of the peak level,
        # keeping 0.1 s of padding on each side.
        threshold = np.max(np.abs(audio_data)) * 0.01
        non_silent = np.where(np.abs(audio_data) > threshold)[0]
        if len(non_silent) > 0:
            pad = int(0.1 * sr)
            start_idx = max(0, non_silent[0] - pad)
            end_idx = min(len(audio_data), non_silent[-1] + pad)
            audio_data = audio_data[start_idx:end_idx]

        # Cap the duration.
        max_samples = int(max_duration * sr)
        if len(audio_data) > max_samples:
            audio_data = audio_data[:max_samples]

        # Resample only when needed; SciPy is imported lazily so that files
        # already at the target rate do not require it.
        if sr != target_sr:
            from scipy.signal import resample
            audio_data = resample(audio_data, int(len(audio_data) * target_sr / sr))

        # mkstemp creates the file atomically (mktemp is race-prone and
        # deprecated); only the name is needed, so close the descriptor.
        fd, temp_path = tempfile.mkstemp(suffix='.wav')
        os.close(fd)
        try:
            sf.write(temp_path, audio_data, target_sr)
        except Exception:
            _remove_quietly(temp_path)
            raise
        return temp_path

    except Exception as e:
        print(f"[WARNING] Audio preprocessing failed: {e}")
        return audio_path


def optimize_text(text, max_length=500):
    """Trim *text* to at most ``max_length`` characters and punctuate it.

    Text longer than ``max_length`` is cut at the last ``.`` boundary that
    fits. If not even the first sentence fits, it is cut at the last space
    before the limit (or hard-cut when there is none). A trailing period is
    appended unless the text already ends in ``.``, ``!`` or ``?``.

    Args:
        text: Raw input text.
        max_length: Maximum number of characters to keep.

    Returns:
        The cleaned text, or an empty string when *text* is blank.
    """
    text = text.strip()

    if len(text) > max_length:
        kept = ""
        for sentence in text.split('.'):
            candidate = kept + sentence + "."
            if len(candidate) > max_length:
                break
            kept = candidate

        if not kept:
            # No complete sentence fits: fall back to a word-boundary cut,
            # leaving room for the terminal period added below.
            head = text[:max(max_length - 1, 0)]
            last_space = head.rfind(" ")
            kept = head[:last_space] if last_space > 0 else head

        text = kept.strip()

    if text and not text.endswith(('.', '!', '?')):
        text += '.'

    return text


def clone(text, audio, language="en"):
    """Synthesize *text* in the voice of the reference recording *audio*.

    Args:
        text: Text to speak; truncated by ``optimize_text`` when too long.
        audio: Filesystem path of the reference voice recording.
        language: Language code passed to XTTS.

    Returns:
        A ``(output_path, status_message)`` tuple. ``output_path`` is ``None``
        when generation fails, in which case the message describes the error.
    """
    if tts is None:
        return None, "⚠ XTTS model failed to load."

    if not text or not text.strip() or not audio:
        return None, "⚠ Error: Missing text or audio input."

    processed_audio = None
    try:
        start_time = time.perf_counter()

        if not isinstance(audio, str) or not os.path.exists(audio):
            return None, "⚠ Error: Invalid audio input format."

        print("[INFO] Preprocessing audio...")
        processed_audio = preprocess_audio(audio)

        print("[INFO] Optimizing text...")
        optimized_text = optimize_text(text)
        print(f"[INFO] Text length: {len(optimized_text)} characters")
        was_truncated = len(optimized_text) < len(text.strip())

        output_path = OUTPUT_PATH
        # Drop any file left by a previous request so the existence check
        # below cannot be satisfied by stale audio.
        _remove_quietly(output_path)

        print("[INFO] Generating speech...")
        if use_gpu:
            torch.cuda.empty_cache()

        tts.tts_to_file(
            text=optimized_text,
            speaker_wav=processed_audio,
            language=language,
            file_path=output_path,
            split_sentences=True,  # Better for long texts
        )

        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            return None, "⚠ Error: XTTS failed to generate audio."

        processing_time = time.perf_counter() - start_time

        # Read only the header to get the duration for the real-time factor.
        info = sf.info(output_path)
        audio_duration = info.frames / info.samplerate if info.samplerate else 0
        rtf = processing_time / audio_duration if audio_duration > 0 else 0

        print(f"[PERFORMANCE] Processing time: {processing_time:.2f}s")
        print(f"[PERFORMANCE] Audio duration: {audio_duration:.2f}s")
        print(f"[PERFORMANCE] Real-time factor: {rtf:.2f}x")

        status = f"✅ Generated in {processing_time:.1f}s (RTF: {rtf:.1f}x)"
        if was_truncated:
            status += f" (text truncated to {len(optimized_text)} characters)"
        return output_path, status

    except Exception as e:
        # Top-level UI boundary: report the failure instead of crashing.
        print(f"[ERROR] XTTS Processing Error: {str(e)}")
        return None, f"⚠ Error: {str(e)}"

    finally:
        # Runs on success and failure alike, so the temporary reference file
        # never leaks and GPU memory is always released.
        if processed_audio is not None and processed_audio != audio:
            _remove_quietly(processed_audio)
        _release_memory()


def create_interface():
    """Build and return the Gradio Blocks UI wired to ``clone``."""
    with gr.Blocks(
        theme=gr.themes.Soft(primary_hue="teal"),
        title="⚡ Fast Voice Clone"
    ) as iface:

        gr.Markdown("# ⚡ Optimized Voice Cloning with XTTS")
        gr.Markdown("*Faster processing with quality optimizations*")

        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="📝 Text to speak",
                    placeholder="Enter text here (max 500 chars for optimal speed)...",
                    lines=3,
                    max_lines=5
                )
                audio_input = gr.Audio(
                    type='filepath',
                    label='🎤 Voice reference (10-30 seconds recommended)',
                    sources=['upload', 'microphone']
                )

                with gr.Row():
                    generate_btn = gr.Button("🚀 Generate Voice", variant="primary")
                    clear_btn = gr.Button("🗑️ Clear", variant="secondary")

            with gr.Column():
                status_output = gr.Textbox(
                    label="📊 Status",
                    interactive=False,
                    lines=2
                )
                audio_output = gr.Audio(
                    type='filepath',
                    label='🔊 Generated Audio'
                )

        gr.Markdown("""
        ### 🚀 Performance Tips:
        - Keep text under 500 characters for fastest processing
        - Use 10-30 second reference audio clips
        - GPU processing is ~5-10x faster than CPU
        - Clear audio with minimal background noise works best
        """)

        # Event handlers
        generate_btn.click(
            fn=clone,
            inputs=[text_input, audio_input],
            outputs=[audio_output, status_output],
            show_progress=True
        )

        clear_btn.click(
            fn=lambda: (None, None, None, ""),
            outputs=[text_input, audio_input, audio_output, status_output]
        )

    return iface


if __name__ == "__main__":
    iface = create_interface()
    # NOTE: 0.0.0.0 listens on all network interfaces.
    iface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
        quiet=False
    )