Spaces:
Build error
Build error
import gc
import os
import tempfile
import time

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from pydub import AudioSegment
from TTS.api import TTS
# Set before the model is constructed below.
# NOTE(review): presumably read by Coqui TTS to skip its interactive licence
# prompt - confirm against the TTS package.
os.environ["COQUI_TOS_AGREED"] = "1"

# Performance settings: allow non-deterministic cuDNN kernels and enable
# cuDNN benchmark mode.
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

# Device detection: use CUDA whenever torch reports it as available.
use_gpu = torch.cuda.is_available()
if use_gpu:
    device = "cuda"
else:
    device = "cpu"
print(f"[INFO] Using device: {device}")
if use_gpu:
    print(
        "[INFO] GPU Memory: "
        f"{torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB"
    )

# XTTS model initialisation. Any failure is logged and leaves ``tts`` set to
# None so the rest of the module can still be imported; ``clone`` checks for
# that and reports the problem to the user.
try:
    model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
    # Progress bar disabled.
    tts = TTS(model_name, gpu=use_gpu, progress_bar=False)

    # Sanity-check that the wrapper exposes the underlying model object.
    if not (hasattr(tts, "synthesizer") and hasattr(tts.synthesizer, "tts_model")):
        raise RuntimeError("XTTS model failed to load correctly.")

    # Inference tweaks, applied only when the model has an ``inference`` member.
    # NOTE(review): it is not visible here whether XTTS reads these
    # attributes - confirm they have an effect.
    if hasattr(tts.synthesizer.tts_model, "inference"):
        setattr(tts.synthesizer.tts_model, "inference_noise_scale", 0.667)
        setattr(tts.synthesizer.tts_model, "inference_noise_scale_w", 0.8)
        setattr(tts.synthesizer.tts_model, "length_scale", 1.0)

    print(f"[INFO] XTTS model loaded successfully. GPU enabled: {use_gpu}")
except Exception as load_error:
    print(f"[ERROR] Failed to initialize XTTS model: {load_error!s}")
    tts = None
# Audio preprocessing for speed
def preprocess_audio(audio_path, target_sr=22050, max_duration=30):
    """Trim, shorten and resample a reference clip before synthesis.

    The clip is down-mixed to mono, leading/trailing near-silence is removed
    (keeping 0.1 s of padding), the result is cut to ``max_duration`` seconds
    and resampled to ``target_sr`` when the source rate differs.

    Args:
        audio_path: Path of the audio file to read.
        target_sr: Sample rate, in Hz, of the file that is written.
        max_duration: Maximum length, in seconds, kept from the clip.

    Returns:
        Path of a newly created temporary ``.wav`` file, which the caller is
        responsible for deleting. If anything goes wrong the function is
        best-effort: it logs a warning and returns ``audio_path`` unchanged.
    """
    temp_path = None
    try:
        audio_data, sr = sf.read(audio_path)

        # Down-mix multi-channel audio by averaging across the channel axis.
        if audio_data.ndim > 1:
            audio_data = np.mean(audio_data, axis=1)

        if len(audio_data) == 0:
            raise ValueError("audio file contains no samples")

        # Amplitude-based silence trimming: anything below 1% of the peak
        # level counts as silence.
        threshold = np.max(np.abs(audio_data)) * 0.01
        non_silent = np.where(np.abs(audio_data) > threshold)[0]
        if len(non_silent) > 0:
            padding = int(0.1 * sr)  # keep 0.1 s on either side
            start_idx = max(0, non_silent[0] - padding)
            end_idx = min(len(audio_data), non_silent[-1] + padding)
            audio_data = audio_data[start_idx:end_idx]

        # Cap the clip length.
        max_samples = int(max_duration * sr)
        if len(audio_data) > max_samples:
            audio_data = audio_data[:max_samples]

        if sr != target_sr:
            # Imported lazily so scipy is only needed when resampling is
            # actually required.
            from scipy.signal import resample

            audio_data = resample(audio_data, int(len(audio_data) * target_sr / sr))

        # mkstemp creates the file atomically; the deprecated mktemp only
        # returns a name, which another process could claim first.
        fd, temp_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        sf.write(temp_path, audio_data, target_sr)
        return temp_path
    except Exception as e:
        # Deliberate best-effort boundary: fall back to the untouched input.
        print(f"[WARNING] Audio preprocessing failed: {e}")
        if temp_path is not None:
            # Do not leave a half-written temporary file behind.
            try:
                os.remove(temp_path)
            except OSError:
                pass
        return audio_path
# Text preprocessing
def optimize_text(text, max_length=500):
    """Shorten and normalise text before it is sent to the synthesizer.

    Text longer than ``max_length`` characters is cut back to the longest run
    of leading '.'-terminated sentences that fits. If not even the first
    sentence fits (or the text contains no '.'), the text is instead cut at
    the last word boundary inside the first ``max_length`` characters, so the
    input is never silently reduced to nothing.

    The result is stripped of surrounding whitespace and always ends in one
    of ``.``, ``!`` or ``?``; a ``.`` is appended when needed, so the result
    may be one character longer than ``max_length``.

    Args:
        text: The text to process.
        max_length: Maximum number of characters kept before the final
            punctuation is added.

    Returns:
        The processed text. An empty or whitespace-only input yields ``"."``.
    """
    if len(text) > max_length:
        kept = []
        used = 0  # characters consumed so far, counting one '.' per sentence
        for sentence in text.split("."):
            if used + len(sentence) > max_length:
                break
            kept.append(sentence)
            used += len(sentence) + 1

        shortened = ".".join(kept).rstrip(".")
        if not shortened.strip():
            # No complete sentence fits: hard-cut instead of dropping the text.
            shortened = text[:max_length]
            cut_is_mid_word = not text[max_length].isspace()
            if cut_is_mid_word:
                head, sep, _ = shortened.rpartition(" ")
                if sep and head.strip():
                    # Drop the partial trailing word.
                    shortened = head
        text = shortened

    text = text.strip()
    if not text.endswith((".", "!", "?")):
        text += "."
    return text
# Voice cloning entry point
def clone(text, audio):
    """Synthesize ``text`` in the voice of the reference recording ``audio``.

    The reference clip is run through ``preprocess_audio`` and the text
    through ``optimize_text`` before the module-level ``tts`` model writes
    the result to ``./output.wav``.

    Args:
        text: Text to speak.
        audio: Filesystem path of the reference recording.

    Returns:
        ``(output_path, status_message)`` on success, or
        ``(None, status_message)`` when inputs are invalid, the model is not
        loaded, or synthesis fails. Synthesis errors are reported through the
        status message rather than raised.
    """
    # NOTE(review): the leading "β" in the status strings looks like a
    # mis-encoded emoji - confirm the intended glyph.
    if tts is None:
        return None, "β XTTS model failed to load."
    if not text or not audio:
        return None, "β Error: Missing text or audio input."
    if not isinstance(audio, str) or not os.path.exists(audio):
        return None, "β Error: Invalid audio input format."

    # NOTE(review): every request writes to this same path; that is only safe
    # while requests are processed one at a time - confirm queue settings.
    output_path = "./output.wav"
    processed_audio = audio
    start_time = time.perf_counter()
    try:
        print("[INFO] Preprocessing audio...")
        processed_audio = preprocess_audio(audio)
        print("[INFO] Optimizing text...")
        optimized_text = optimize_text(text)
        print(f"[INFO] Text length: {len(optimized_text)} characters")

        # Remove any result left over from an earlier request so the output
        # check below cannot be satisfied by a stale file.
        if os.path.exists(output_path):
            os.remove(output_path)

        print("[INFO] Generating speech...")
        if use_gpu:
            torch.cuda.empty_cache()
        tts.tts_to_file(
            text=optimized_text,
            speaker_wav=processed_audio,
            language="en",
            file_path=output_path,
            split_sentences=True,
        )

        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            return None, "β Error: XTTS failed to generate audio."

        # Performance metrics: real-time factor = processing time / audio length.
        processing_time = time.perf_counter() - start_time
        audio_data, sr = sf.read(output_path)
        audio_duration = len(audio_data) / sr
        rtf = processing_time / audio_duration if audio_duration > 0 else 0
        print(f"[PERFORMANCE] Processing time: {processing_time:.2f}s")
        print(f"[PERFORMANCE] Audio duration: {audio_duration:.2f}s")
        print(f"[PERFORMANCE] Real-time factor: {rtf:.2f}x")
        return output_path, f"β Generated in {processing_time:.1f}s (RTF: {rtf:.1f}x)"
    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        print(f"[ERROR] XTTS Processing Error: {str(e)}")
        return None, f"β Error: {str(e)}"
    finally:
        # Runs on success and failure alike, so the temporary reference clip
        # is never leaked and memory is always released.
        if processed_audio != audio:
            try:
                os.remove(processed_audio)
            except OSError as cleanup_error:
                print(f"[WARNING] Could not remove {processed_audio}: {cleanup_error}")
        if use_gpu:
            torch.cuda.empty_cache()
        gc.collect()
# Gradio interface
def create_interface():
    """Build the Gradio Blocks UI for the voice-cloning app.

    The layout has two columns: the left one holds the text box, the
    reference-audio input and the Generate / Clear buttons; the right one
    holds the status box and the generated audio. The Generate button calls
    ``clone``; the Clear button resets all four widgets.

    Returns:
        The constructed ``gr.Blocks`` object (not yet launched).
    """
    tips_markdown = """
### π Performance Tips:
- Keep text under 500 characters for fastest processing
- Use 10-30 second reference audio clips
- GPU processing is ~5-10x faster than CPU
- Clear audio with minimal background noise works best
"""

    def reset_widgets():
        # One value per widget in the Clear button's ``outputs`` list.
        return None, None, None, ""

    soft_teal = gr.themes.Soft(primary_hue="teal")
    with gr.Blocks(theme=soft_teal, title="β‘ Fast Voice Clone") as iface:
        gr.Markdown("# β‘ Optimized Voice Cloning with XTTS")
        gr.Markdown("*Faster processing with quality optimizations*")

        with gr.Row():
            # Left column: inputs and action buttons.
            with gr.Column():
                prompt_box = gr.Textbox(
                    label="π Text to speak",
                    placeholder="Enter text here (max 500 chars for optimal speed)...",
                    lines=3,
                    max_lines=5,
                )
                reference_clip = gr.Audio(
                    type="filepath",
                    label="π€ Voice reference (10-30 seconds recommended)",
                    sources=["upload", "microphone"],
                )
                with gr.Row():
                    run_button = gr.Button("π Generate Voice", variant="primary")
                    reset_button = gr.Button("ποΈ Clear", variant="secondary")

            # Right column: results.
            with gr.Column():
                status_box = gr.Textbox(
                    label="π Status",
                    interactive=False,
                    lines=2,
                )
                result_clip = gr.Audio(
                    type="filepath",
                    label="π Generated Audio",
                )

        gr.Markdown(tips_markdown)

        # Event wiring.
        run_button.click(
            fn=clone,
            inputs=[prompt_box, reference_clip],
            outputs=[result_clip, status_box],
            show_progress=True,
        )
        reset_button.click(
            fn=reset_widgets,
            outputs=[prompt_box, reference_clip, result_clip, status_box],
        )

    return iface
# Launch the interface
if __name__ == "__main__":
    # Script entry point: build the UI and serve it on all interfaces,
    # port 7860, without a public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
        "quiet": False,
    }
    iface = create_interface()
    iface.launch(**launch_options)