# conqui-tts2 / app.py
import gradio as gr
import torch
from TTS.api import TTS
import os
import soundfile as sf
import numpy as np
from pydub import AudioSegment
import tempfile
import gc

os.environ["COQUI_TOS_AGREED"] = "1"

# 🚀 PERFORMANCE OPTIMIZATIONS
torch.backends.cudnn.benchmark = True # Optimize CUDA operations
torch.backends.cudnn.deterministic = False
# Smart device detection with memory optimization
use_gpu = torch.cuda.is_available()
device = "cuda" if use_gpu else "cpu"
print(f"[INFO] Using device: {device}")
if use_gpu:
    print(f"[INFO] GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

# ✅ OPTIMIZED XTTS Model Initialization
try:
    # Use smaller model for faster inference if needed
    model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
    tts = TTS(model_name, gpu=use_gpu, progress_bar=False)  # Disable progress bar for speed

    if not hasattr(tts, "synthesizer") or not hasattr(tts.synthesizer, "tts_model"):
        raise RuntimeError("XTTS model failed to load correctly.")

    # 🚀 PERFORMANCE TWEAKS
    if hasattr(tts.synthesizer.tts_model, 'inference'):
        # Set inference parameters for speed
        tts.synthesizer.tts_model.inference_noise_scale = 0.667
        tts.synthesizer.tts_model.inference_noise_scale_w = 0.8
        tts.synthesizer.tts_model.length_scale = 1.0

    print(f"[INFO] XTTS model loaded successfully. GPU enabled: {use_gpu}")
except Exception as e:
    print(f"[ERROR] Failed to initialize XTTS model: {str(e)}")
    tts = None

# 🚀 AUDIO PREPROCESSING FOR SPEED
def preprocess_audio(audio_path, target_sr=22050, max_duration=30):
    """Optimize audio for faster processing"""
    try:
        # Load and preprocess audio
        audio_data, sr = sf.read(audio_path)

        # Convert to mono if stereo
        if len(audio_data.shape) > 1:
            audio_data = np.mean(audio_data, axis=1)

        # Simple silence trimming: keep samples above 1% of peak amplitude,
        # plus 0.1 s of padding before and after
        threshold = np.max(np.abs(audio_data)) * 0.01
        non_silent = np.where(np.abs(audio_data) > threshold)[0]
        if len(non_silent) > 0:
            start_idx = max(0, non_silent[0] - int(0.1 * sr))
            end_idx = min(len(audio_data), non_silent[-1] + int(0.1 * sr))
            audio_data = audio_data[start_idx:end_idx]

        # Limit duration for faster processing
        max_samples = int(max_duration * sr)
        if len(audio_data) > max_samples:
            audio_data = audio_data[:max_samples]

        # Resample if needed
        if sr != target_sr:
            from scipy.signal import resample
            audio_data = resample(audio_data, int(len(audio_data) * target_sr / sr))

        # Save preprocessed audio to a temporary file (caller removes it when done)
        temp_fd, temp_path = tempfile.mkstemp(suffix='.wav')
        os.close(temp_fd)
        sf.write(temp_path, audio_data, target_sr)
        return temp_path
    except Exception as e:
        print(f"[WARNING] Audio preprocessing failed: {e}")
        return audio_path
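
# For example (hypothetical path), preprocess_audio("samples/ref_48k_stereo.wav")
# returns a path to a mono 22.05 kHz copy, trimmed of leading/trailing silence
# and capped at 30 seconds, or the original path unchanged if preprocessing fails.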

# 🚀 OPTIMIZED TEXT PROCESSING
def optimize_text(text, max_length=500):
    """Optimize text for faster processing"""
    # Limit text length for speed
    if len(text) > max_length:
        # Split at sentence boundaries
        sentences = text.split('.')
        result = ""
        for sentence in sentences:
            if len(result + sentence) > max_length:
                break
            result += sentence + "."
        text = result.rstrip('.')

    # Clean text
    text = text.strip()
    if not text.endswith(('.', '!', '?')):
        text += '.'
    return text
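
# For example, optimize_text("First point. Second point. Third point.", max_length=25)
# keeps only the whole sentences that fit the budget and re-appends the final
# period, returning "First point. Second point." rather than a mid-sentence cut.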

# ✅ OPTIMIZED clone() Function
def clone(text, audio):
    if tts is None:
        return None, "⚠ XTTS model failed to load."
    if not text or not audio:
        return None, "⚠ Error: Missing text or audio input."
    try:
        import time
        start_time = time.time()

        # ✅ Validate audio input
        if isinstance(audio, bool) or not isinstance(audio, str) or not os.path.exists(audio):
            return None, "⚠ Error: Invalid audio input format."

        # 🚀 PREPROCESSING FOR SPEED
        print("[INFO] Preprocessing audio...")
        processed_audio = preprocess_audio(audio)

        print("[INFO] Optimizing text...")
        optimized_text = optimize_text(text)
        print(f"[INFO] Text length: {len(optimized_text)} characters")

        output_path = "./output.wav"

        # 🚀 OPTIMIZED XTTS Processing
        print("[INFO] Generating speech...")

        # Clear GPU cache before processing
        if use_gpu:
            torch.cuda.empty_cache()

        # Generate with optimized settings
        tts.tts_to_file(
            text=optimized_text,
            speaker_wav=processed_audio,
            language="en",
            file_path=output_path,
            split_sentences=True,  # Better for long texts
            # Additional optimization parameters could be passed here
        )

        # Clean up the temporary preprocessed file
        if processed_audio != audio:
            try:
                os.remove(processed_audio)
            except OSError:
                pass

        # Clear memory
        if use_gpu:
            torch.cuda.empty_cache()
        gc.collect()

        # ✅ Validate output
        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            return None, "⚠ Error: XTTS failed to generate audio."

        # 🚀 PERFORMANCE METRICS
        end_time = time.time()
        processing_time = end_time - start_time

        # Calculate audio duration for the real-time factor (RTF = processing time
        # divided by audio duration, so RTF < 1.0 means faster than real time)
        audio_data, sr = sf.read(output_path)
        audio_duration = len(audio_data) / sr
        rtf = processing_time / audio_duration if audio_duration > 0 else 0

        print(f"[PERFORMANCE] Processing time: {processing_time:.2f}s")
        print(f"[PERFORMANCE] Audio duration: {audio_duration:.2f}s")
        print(f"[PERFORMANCE] Real-time factor: {rtf:.2f}x")

        return output_path, f"✅ Generated in {processing_time:.1f}s (RTF: {rtf:.1f}x)"
    except Exception as e:
        print(f"[ERROR] XTTS Processing Error: {str(e)}")
        # Clean up on error
        if use_gpu:
            torch.cuda.empty_cache()
        gc.collect()
        return None, f"⚠ Error: {str(e)}"
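
# Example (hypothetical reference path): clone("Hello there.", "samples/ref.wav")
# returns (path_to_generated_wav, status_message) on success and (None, error_message)
# on failure, which is why the Gradio click handler below binds it to two outputs.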

# 🚀 OPTIMIZED Gradio Interface
def create_interface():
    with gr.Blocks(
        theme=gr.themes.Soft(primary_hue="teal"),
        title="⚡ Fast Voice Clone"
    ) as iface:
        gr.Markdown("# ⚡ Optimized Voice Cloning with XTTS")
        gr.Markdown("*Faster processing with quality optimizations*")

        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="📝 Text to speak",
                    placeholder="Enter text here (max 500 chars for optimal speed)...",
                    lines=3,
                    max_lines=5
                )
                audio_input = gr.Audio(
                    type='filepath',
                    label='🎤 Voice reference (10-30 seconds recommended)',
                    sources=['upload', 'microphone']
                )
                with gr.Row():
                    generate_btn = gr.Button("🚀 Generate Voice", variant="primary")
                    clear_btn = gr.Button("🗑️ Clear", variant="secondary")
            with gr.Column():
                status_output = gr.Textbox(
                    label="📊 Status",
                    interactive=False,
                    lines=2
                )
                audio_output = gr.Audio(
                    type='filepath',
                    label='🔊 Generated Audio'
                )

        # Performance tips
        gr.Markdown("""
        ### 🚀 Performance Tips:
        - Keep text under 500 characters for fastest processing
        - Use 10-30 second reference audio clips
        - GPU processing is ~5-10x faster than CPU
        - Clear audio with minimal background noise works best
        """)

        # Event handlers
        generate_btn.click(
            fn=clone,
            inputs=[text_input, audio_input],
            outputs=[audio_output, status_output],
            show_progress=True
        )
        clear_btn.click(
            fn=lambda: (None, None, None, ""),
            outputs=[text_input, audio_input, audio_output, status_output]
        )
    return iface

# ✅ Launch optimized interface
if __name__ == "__main__":
    iface = create_interface()
    iface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
        quiet=False
    )
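
# Note: 7860 is Gradio's default port and the port Hugging Face Spaces expects by
# default, so the same launch block works both on Spaces and for a local
# `python app.py` run.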