# conqui-tts2 / app.py
import gradio as gr
import torch
from TTS.api import TTS
import os
import soundfile as sf
import numpy as np
from pydub import AudioSegment
import tempfile
import gc

os.environ["COQUI_TOS_AGREED"] = "1"

# 🚀 PERFORMANCE OPTIMIZATIONS
torch.backends.cudnn.benchmark = True # Optimize CUDA operations
torch.backends.cudnn.deterministic = False
# Smart device detection with memory optimization
use_gpu = torch.cuda.is_available()
device = "cuda" if use_gpu else "cpu"
print(f"[INFO] Using device: {device}")
if use_gpu:
    print(f"[INFO] GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

# ✅ OPTIMIZED XTTS Model Initialization
try:
    # Use smaller model for faster inference if needed
    model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
    tts = TTS(model_name, gpu=use_gpu, progress_bar=False)  # Disable progress bar for speed

    if not hasattr(tts, "synthesizer") or not hasattr(tts.synthesizer, "tts_model"):
        raise RuntimeError("XTTS model failed to load correctly.")

    # 🚀 PERFORMANCE TWEAKS
    if hasattr(tts.synthesizer.tts_model, 'inference'):
        # Set inference parameters for speed
        tts.synthesizer.tts_model.inference_noise_scale = 0.667
        tts.synthesizer.tts_model.inference_noise_scale_w = 0.8
        tts.synthesizer.tts_model.length_scale = 1.0

    print(f"[INFO] XTTS model loaded successfully. GPU enabled: {use_gpu}")
except Exception as e:
    print(f"[ERROR] Failed to initialize XTTS model: {str(e)}")
    tts = None

# 🚀 AUDIO PREPROCESSING FOR SPEED
def preprocess_audio(audio_path, target_sr=22050, max_duration=30):
    """Optimize audio for faster processing"""
    try:
        # Load and preprocess audio
        audio_data, sr = sf.read(audio_path)

        # Convert to mono if stereo
        if len(audio_data.shape) > 1:
            audio_data = np.mean(audio_data, axis=1)

        # Simple silence trimming: keep samples above 1% of peak amplitude,
        # plus 0.1 s of padding before and after
        threshold = np.max(np.abs(audio_data)) * 0.01
        non_silent = np.where(np.abs(audio_data) > threshold)[0]
        if len(non_silent) > 0:
            start_idx = max(0, non_silent[0] - int(0.1 * sr))
            end_idx = min(len(audio_data), non_silent[-1] + int(0.1 * sr))
            audio_data = audio_data[start_idx:end_idx]

        # Limit duration for faster processing
        max_samples = int(max_duration * sr)
        if len(audio_data) > max_samples:
            audio_data = audio_data[:max_samples]

        # Resample if needed
        if sr != target_sr:
            from scipy.signal import resample
            audio_data = resample(audio_data, int(len(audio_data) * target_sr / sr))

        # Save preprocessed audio to a temporary file (caller removes it when done)
        temp_fd, temp_path = tempfile.mkstemp(suffix='.wav')
        os.close(temp_fd)
        sf.write(temp_path, audio_data, target_sr)
        return temp_path
    except Exception as e:
        print(f"[WARNING] Audio preprocessing failed: {e}")
        return audio_path
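
# For example (hypothetical path), preprocess_audio("samples/ref_48k_stereo.wav")
# returns a path to a mono 22.05 kHz copy, trimmed of leading/trailing silence
# and capped at 30 seconds, or the original path unchanged if preprocessing fails.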

# 🚀 OPTIMIZED TEXT PROCESSING
def optimize_text(text, max_length=500):
    """Optimize text for faster processing"""
    # Limit text length for speed
    if len(text) > max_length:
        # Split at sentence boundaries
        sentences = text.split('.')
        result = ""
        for sentence in sentences:
            if len(result + sentence) > max_length:
                break
            result += sentence + "."
        text = result.rstrip('.')

    # Clean text
    text = text.strip()
    if not text.endswith(('.', '!', '?')):
        text += '.'
    return text
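
# For example, optimize_text("First point. Second point. Third point.", max_length=25)
# keeps only the whole sentences that fit the budget and re-appends the final
# period, returning "First point. Second point." rather than a mid-sentence cut.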

# ✅ OPTIMIZED clone() Function
def clone(text, audio):
    if tts is None:
        return None, "⚠ XTTS model failed to load."
    if not text or not audio:
        return None, "⚠ Error: Missing text or audio input."
    try:
        import time
        start_time = time.time()

        # ✅ Validate audio input
        if isinstance(audio, bool) or not isinstance(audio, str) or not os.path.exists(audio):
            return None, "⚠ Error: Invalid audio input format."

        # 🚀 PREPROCESSING FOR SPEED
        print("[INFO] Preprocessing audio...")
        processed_audio = preprocess_audio(audio)

        print("[INFO] Optimizing text...")
        optimized_text = optimize_text(text)
        print(f"[INFO] Text length: {len(optimized_text)} characters")

        output_path = "./output.wav"

        # 🚀 OPTIMIZED XTTS Processing
        print("[INFO] Generating speech...")

        # Clear GPU cache before processing
        if use_gpu:
            torch.cuda.empty_cache()

        # Generate with optimized settings
        tts.tts_to_file(
            text=optimized_text,
            speaker_wav=processed_audio,
            language="en",
            file_path=output_path,
            split_sentences=True,  # Better for long texts
            # Additional optimization parameters could be passed here
        )

        # Clean up the temporary preprocessed file
        if processed_audio != audio:
            try:
                os.remove(processed_audio)
            except OSError:
                pass

        # Clear memory
        if use_gpu:
            torch.cuda.empty_cache()
        gc.collect()

        # ✅ Validate output
        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            return None, "⚠ Error: XTTS failed to generate audio."

        # 🚀 PERFORMANCE METRICS
        end_time = time.time()
        processing_time = end_time - start_time

        # Calculate audio duration for the real-time factor (RTF = processing time
        # divided by audio duration, so RTF < 1.0 means faster than real time)
        audio_data, sr = sf.read(output_path)
        audio_duration = len(audio_data) / sr
        rtf = processing_time / audio_duration if audio_duration > 0 else 0

        print(f"[PERFORMANCE] Processing time: {processing_time:.2f}s")
        print(f"[PERFORMANCE] Audio duration: {audio_duration:.2f}s")
        print(f"[PERFORMANCE] Real-time factor: {rtf:.2f}x")

        return output_path, f"✅ Generated in {processing_time:.1f}s (RTF: {rtf:.1f}x)"
    except Exception as e:
        print(f"[ERROR] XTTS Processing Error: {str(e)}")
        # Clean up on error
        if use_gpu:
            torch.cuda.empty_cache()
        gc.collect()
        return None, f"⚠ Error: {str(e)}"
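
# Example (hypothetical reference path): clone("Hello there.", "samples/ref.wav")
# returns (path_to_generated_wav, status_message) on success and (None, error_message)
# on failure, which is why the Gradio click handler below binds it to two outputs.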

# 🚀 OPTIMIZED Gradio Interface
def create_interface():
    with gr.Blocks(
        theme=gr.themes.Soft(primary_hue="teal"),
        title="⚡ Fast Voice Clone"
    ) as iface:
        gr.Markdown("# ⚡ Optimized Voice Cloning with XTTS")
        gr.Markdown("*Faster processing with quality optimizations*")

        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="📝 Text to speak",
                    placeholder="Enter text here (max 500 chars for optimal speed)...",
                    lines=3,
                    max_lines=5
                )
                audio_input = gr.Audio(
                    type='filepath',
                    label='🎤 Voice reference (10-30 seconds recommended)',
                    sources=['upload', 'microphone']
                )
                with gr.Row():
                    generate_btn = gr.Button("🚀 Generate Voice", variant="primary")
                    clear_btn = gr.Button("🗑️ Clear", variant="secondary")
            with gr.Column():
                status_output = gr.Textbox(
                    label="📊 Status",
                    interactive=False,
                    lines=2
                )
                audio_output = gr.Audio(
                    type='filepath',
                    label='🔊 Generated Audio'
                )

        # Performance tips
        gr.Markdown("""
        ### 🚀 Performance Tips:
        - Keep text under 500 characters for fastest processing
        - Use 10-30 second reference audio clips
        - GPU processing is ~5-10x faster than CPU
        - Clear audio with minimal background noise works best
        """)

        # Event handlers
        generate_btn.click(
            fn=clone,
            inputs=[text_input, audio_input],
            outputs=[audio_output, status_output],
            show_progress=True
        )
        clear_btn.click(
            fn=lambda: (None, None, None, ""),
            outputs=[text_input, audio_input, audio_output, status_output]
        )
    return iface

# ✅ Launch optimized interface
if __name__ == "__main__":
    iface = create_interface()
    iface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
        quiet=False
    )
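
# Note: 7860 is Gradio's default port and the port Hugging Face Spaces expects by
# default, so the same launch block works both on Spaces and for a local
# `python app.py` run.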