Spaces:

Hematej
/

conqui-tts2

Build error

App Files Community

Hematej committed on Jun 7

Commit

da3316c

verified ·

1 Parent(s): a9c8950

Update app.py

Browse files

Files changed (1) hide show

app.py +235 -33

app.py CHANGED Viewed

@@ -3,64 +3,266 @@ import torch
 from TTS.api import TTS
 import os
 import soundfile as sf
 os.environ["COQUI_TOS_AGREED"] = "1"
-# Smart device detection
 use_gpu = torch.cuda.is_available()
-# ✅ XTTS Model Initialization with Proper Error Handling
 try:
-    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=use_gpu, progress_bar=True)
     if not hasattr(tts, "synthesizer") or not hasattr(tts.synthesizer, "tts_model"):
         raise RuntimeError("XTTS model failed to load correctly.")
     print(f"[INFO] XTTS model loaded successfully. GPU enabled: {use_gpu}")
 except Exception as e:
     print(f"[ERROR] Failed to initialize XTTS model: {str(e)}")
-    tts = None  # Prevents further crashes
-# ✅ Fixed clone() Function
 def clone(text, audio):
     if tts is None:
         return None, "⚠ XTTS model failed to load."
     if not text or not audio:
         return None, "⚠ Error: Missing text or audio input."
     try:
         # ✅ Validate audio input
         if isinstance(audio, bool) or not isinstance(audio, str) or not os.path.exists(audio):
             return None, "⚠ Error: Invalid audio input format."
         output_path = "./output.wav"
-        # ✅ XTTS Processing with Error Handling
-        tts.tts_to_file(text=text, speaker_wav=audio, language="en", file_path=output_path)
-        # ✅ Ensure output file is valid before passing to Gradio
         if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
             return None, "⚠ Error: XTTS failed to generate audio."
-        # ✅ Convert output file format
-        audio_data, samplerate = sf.read(output_path)
-        sf.write(output_path, audio_data, samplerate)
-        return output_path
     except Exception as e:
         print(f"[ERROR] XTTS Processing Error: {str(e)}")
-        return None
-# ✅ Fixed Gradio Setup
-iface = gr.Interface(
-    fn=clone,
-    inputs=[
-        gr.Textbox(label='Text'),
-        gr.Audio(type='filepath', label='Voice reference audio file')
-    ],
-    outputs=gr.Audio(type='filepath'),
-    title="Voice Clone",
-    flagging_mode="never",
-    cache_examples=False,
-    theme=gr.themes.Base(primary_hue="teal", secondary_hue="teal", neutral_hue="slate"),
-)
-iface.launch()

 from TTS.api import TTS
 import os
 import soundfile as sf
+import numpy as np
+from pydub import AudioSegment
+import tempfile
+import gc
 os.environ["COQUI_TOS_AGREED"] = "1"
+# 🚀 PERFORMANCE OPTIMIZATIONS
+torch.backends.cudnn.benchmark = True  # Optimize CUDA operations
+torch.backends.cudnn.deterministic = False
+# Smart device detection with memory optimization
 use_gpu = torch.cuda.is_available()
+device = "cuda" if use_gpu else "cpu"
+print(f"[INFO] Using device: {device}")
+if use_gpu:
+    print(f"[INFO] GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
+# ✅ OPTIMIZED XTTS Model Initialization
 try:
+    # Load the full multilingual XTTS v2 checkpoint (no smaller variant is selected here)
+    model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
+    tts = TTS(model_name, gpu=use_gpu, progress_bar=False)  # Disable progress bar for speed
     if not hasattr(tts, "synthesizer") or not hasattr(tts.synthesizer, "tts_model"):
         raise RuntimeError("XTTS model failed to load correctly.")
+    # 🚀 PERFORMANCE TWEAKS
+    if hasattr(tts.synthesizer.tts_model, 'inference'):
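+        # Set inference attributes on the loaded model object
+        # NOTE(review): confirm the XTTS model actually reads these attributes during synthesis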
+        tts.synthesizer.tts_model.inference_noise_scale = 0.667
+        tts.synthesizer.tts_model.inference_noise_scale_w = 0.8
+        tts.synthesizer.tts_model.length_scale = 1.0
     print(f"[INFO] XTTS model loaded successfully. GPU enabled: {use_gpu}")
 except Exception as e:
     print(f"[ERROR] Failed to initialize XTTS model: {str(e)}")
+    tts = None
+# 🚀 AUDIO PREPROCESSING FOR SPEED
+def preprocess_audio(audio_path, target_sr=22050, max_duration=30):
+    """Optimize audio for faster processing"""
+    try:
+        # Load and preprocess audio
+        audio_data, sr = sf.read(audio_path)
+        # Convert to mono if stereo
+        if len(audio_data.shape) > 1:
+            audio_data = np.mean(audio_data, axis=1)
+        # Trim silence and limit duration for speed
+        from scipy.signal import find_peaks
+        # Simple silence trimming
+        threshold = np.max(np.abs(audio_data)) * 0.01
+        non_silent = np.where(np.abs(audio_data) > threshold)[0]
+        if len(non_silent) > 0:
+            start_idx = max(0, non_silent[0] - int(0.1 * sr))  # Keep 0.1s before
+            end_idx = min(len(audio_data), non_silent[-1] + int(0.1 * sr))  # Keep 0.1s after
+            audio_data = audio_data[start_idx:end_idx]
+        # Limit duration for faster processing
+        max_samples = int(max_duration * sr)
+        if len(audio_data) > max_samples:
+            audio_data = audio_data[:max_samples]
+        # Resample if needed
+        if sr != target_sr:
+            from scipy.signal import resample
+            audio_data = resample(audio_data, int(len(audio_data) * target_sr / sr))
+        # Save preprocessed audio
+        temp_path = tempfile.mktemp(suffix='.wav')
+        sf.write(temp_path, audio_data, target_sr)
+        return temp_path
+    except Exception as e:
+        print(f"[WARNING] Audio preprocessing failed: {e}")
+        return audio_path
+# 🚀 OPTIMIZED TEXT PROCESSING
+def optimize_text(text, max_length=500):
+    """Optimize text for faster processing"""
+    # Limit text length for speed
+    if len(text) > max_length:
+        # Split at sentence boundaries
+        sentences = text.split('.')
+        result = ""
+        for sentence in sentences:
+            if len(result + sentence) > max_length:
+                break
+            result += sentence + "."
+        text = result.rstrip('.')
+    # Clean text
+    text = text.strip()
+    if not text.endswith(('.', '!', '?')):
+        text += '.'
+    return text
+# ✅ OPTIMIZED clone() Function
 def clone(text, audio):
     if tts is None:
         return None, "⚠ XTTS model failed to load."
     if not text or not audio:
         return None, "⚠ Error: Missing text or audio input."
     try:
+        import time
+        start_time = time.time()
         # ✅ Validate audio input
         if isinstance(audio, bool) or not isinstance(audio, str) or not os.path.exists(audio):
             return None, "⚠ Error: Invalid audio input format."
+        # 🚀 PREPROCESSING FOR SPEED
+        print("[INFO] Preprocessing audio...")
+        processed_audio = preprocess_audio(audio)
+        print("[INFO] Optimizing text...")
+        optimized_text = optimize_text(text)
+        print(f"[INFO] Text length: {len(optimized_text)} characters")
         output_path = "./output.wav"
+        # 🚀 OPTIMIZED XTTS Processing
+        print("[INFO] Generating speech...")
+        # Clear GPU cache before processing
+        if use_gpu:
+            torch.cuda.empty_cache()
+        # Generate with optimized settings
+        tts.tts_to_file(
+            text=optimized_text,
+            speaker_wav=processed_audio,
+            language="en",
+            file_path=output_path,
+            split_sentences=True,  # Better for long texts
+            # (no additional optimization parameters are passed)
+        )
+        # Clean up temporary files
+        if processed_audio != audio:
+            try:
+                os.remove(processed_audio)
+            except:
+                pass
+        # Clear memory
+        if use_gpu:
+            torch.cuda.empty_cache()
+        gc.collect()
+        # ✅ Validate output
         if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
             return None, "⚠ Error: XTTS failed to generate audio."
+        # 🚀 PERFORMANCE METRICS
+        end_time = time.time()
+        processing_time = end_time - start_time
+        # Calculate audio duration for real-time factor
+        audio_data, sr = sf.read(output_path)
+        audio_duration = len(audio_data) / sr
+        rtf = processing_time / audio_duration if audio_duration > 0 else 0
+        print(f"[PERFORMANCE] Processing time: {processing_time:.2f}s")
+        print(f"[PERFORMANCE] Audio duration: {audio_duration:.2f}s")
+        print(f"[PERFORMANCE] Real-time factor: {rtf:.2f}x")
+        return output_path, f"✅ Generated in {processing_time:.1f}s (RTF: {rtf:.1f}x)"
     except Exception as e:
         print(f"[ERROR] XTTS Processing Error: {str(e)}")
+        # Clean up on error
+        if use_gpu:
+            torch.cuda.empty_cache()
+        gc.collect()
+        return None, f"⚠ Error: {str(e)}"
+# 🚀 OPTIMIZED Gradio Interface
+def create_interface():
+    with gr.Blocks(
+        theme=gr.themes.Soft(primary_hue="teal"),
+        title="⚡ Fast Voice Clone"
+    ) as iface:
+        gr.Markdown("# ⚡ Optimized Voice Cloning with XTTS")
+        gr.Markdown("*Faster processing with quality optimizations*")
+        with gr.Row():
+            with gr.Column():
+                text_input = gr.Textbox(
+                    label="📝 Text to speak",
+                    placeholder="Enter text here (max 500 chars for optimal speed)...",
+                    lines=3,
+                    max_lines=5
+                )
+                audio_input = gr.Audio(
+                    type='filepath',
+                    label='🎤 Voice reference (10-30 seconds recommended)',
+                    sources=['upload', 'microphone']
+                )
+                with gr.Row():
+                    generate_btn = gr.Button("🚀 Generate Voice", variant="primary")
+                    clear_btn = gr.Button("🗑️ Clear", variant="secondary")
+            with gr.Column():
+                status_output = gr.Textbox(
+                    label="📊 Status",
+                    interactive=False,
+                    lines=2
+                )
+                audio_output = gr.Audio(
+                    type='filepath',
+                    label='🔊 Generated Audio'
+                )
+        # Performance tips
+        gr.Markdown("""
+        ### 🚀 Performance Tips:
+        - Keep text under 500 characters for fastest processing
+        - Use 10-30 second reference audio clips
+        - GPU processing is ~5-10x faster than CPU
+        - Clear audio with minimal background noise works best
+        """)
+        # Event handlers
+        generate_btn.click(
+            fn=clone,
+            inputs=[text_input, audio_input],
+            outputs=[audio_output, status_output],
+            show_progress=True
+        )
+        clear_btn.click(
+            fn=lambda: (None, None, None, ""),
+            outputs=[text_input, audio_input, audio_output, status_output]
+        )
+    return iface
+# ✅ Launch optimized interface
+if __name__ == "__main__":
+    iface = create_interface()
+    iface.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        show_error=True,
+        quiet=False
+    )