Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -3,64 +3,266 @@ import torch
|
|
3 |
from TTS.api import TTS
|
4 |
import os
|
5 |
import soundfile as sf
|
|
|
|
|
|
|
|
|
6 |
|
7 |
os.environ["COQUI_TOS_AGREED"] = "1"
|
8 |
|
9 |
-
#
|
|
|
|
|
|
|
|
|
10 |
use_gpu = torch.cuda.is_available()
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
-
# β
XTTS Model Initialization
|
13 |
try:
|
14 |
-
|
|
|
|
|
|
|
|
|
15 |
if not hasattr(tts, "synthesizer") or not hasattr(tts.synthesizer, "tts_model"):
|
16 |
raise RuntimeError("XTTS model failed to load correctly.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
print(f"[INFO] XTTS model loaded successfully. GPU enabled: {use_gpu}")
|
|
|
18 |
except Exception as e:
|
19 |
print(f"[ERROR] Failed to initialize XTTS model: {str(e)}")
|
20 |
-
tts = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
-
# β
|
23 |
def clone(text, audio):
|
24 |
if tts is None:
|
25 |
return None, "β XTTS model failed to load."
|
26 |
-
|
27 |
if not text or not audio:
|
28 |
return None, "β Error: Missing text or audio input."
|
29 |
-
|
30 |
try:
|
|
|
|
|
|
|
31 |
# β
Validate audio input
|
32 |
if isinstance(audio, bool) or not isinstance(audio, str) or not os.path.exists(audio):
|
33 |
return None, "β Error: Invalid audio input format."
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
output_path = "./output.wav"
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
|
41 |
return None, "β Error: XTTS failed to generate audio."
|
42 |
-
|
43 |
-
#
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
except Exception as e:
|
49 |
print(f"[ERROR] XTTS Processing Error: {str(e)}")
|
50 |
-
|
|
|
|
|
|
|
|
|
51 |
|
52 |
-
#
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
|
66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
from TTS.api import TTS
|
4 |
import os
|
5 |
import soundfile as sf
|
6 |
+
import numpy as np
|
7 |
+
from pydub import AudioSegment
|
8 |
+
import tempfile
|
9 |
+
import gc
|
10 |
|
11 |
os.environ["COQUI_TOS_AGREED"] = "1"
|
12 |
|
13 |
+
# π PERFORMANCE OPTIMIZATIONS
|
14 |
+
# cuDNN settings: allow non-deterministic kernels and let cuDNN benchmark
# candidate algorithms.
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

# Pick the compute device from CUDA availability.
use_gpu = torch.cuda.is_available()
if use_gpu:
    device = "cuda"
else:
    device = "cpu"

print(f"[INFO] Using device: {device}")
if use_gpu:
    total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"[INFO] GPU Memory: {total_gib:.1f} GB")

# ✅ OPTIMIZED XTTS Model Initialization
# On any failure `tts` is set to None so that callers can detect the missing
# model instead of the import crashing.
try:
    model_name = "tts_models/multilingual/multi-dataset/xtts_v2"

    # The progress bar is disabled while the model is loaded.
    tts = TTS(model_name, gpu=use_gpu, progress_bar=False)

    # Sanity-check that the loaded object exposes the attributes used below.
    if not (hasattr(tts, "synthesizer") and hasattr(tts.synthesizer, "tts_model")):
        raise RuntimeError("XTTS model failed to load correctly.")

    # Inference parameter overrides.
    # NOTE(review): these attribute names look like VITS-style settings;
    # confirm that the XTTS model actually reads them.
    loaded_model = tts.synthesizer.tts_model
    if hasattr(loaded_model, 'inference'):
        for attr_name, attr_value in (
            ("inference_noise_scale", 0.667),
            ("inference_noise_scale_w", 0.8),
            ("length_scale", 1.0),
        ):
            setattr(loaded_model, attr_name, attr_value)

    print(f"[INFO] XTTS model loaded successfully. GPU enabled: {use_gpu}")

except Exception as e:
    print(f"[ERROR] Failed to initialize XTTS model: {e!s}")
    tts = None
|
47 |
+
|
48 |
+
# π AUDIO PREPROCESSING FOR SPEED
|
49 |
+
def preprocess_audio(audio_path, target_sr=22050, max_duration=30):
    """Write a trimmed, mono, resampled copy of a reference clip.

    The clip is down-mixed to mono, leading/trailing near-silence is trimmed
    (keeping 0.1 s of padding on each side), the result is cut to at most
    ``max_duration`` seconds and resampled to ``target_sr``.

    Args:
        audio_path: Path of the audio file to read.
        target_sr: Sample rate of the written file, in Hz.
        max_duration: Maximum length kept, in seconds.

    Returns:
        Path of a newly created temporary ``.wav`` file. The caller is
        responsible for deleting it. If anything fails, a warning is printed
        and ``audio_path`` is returned unchanged (best-effort fallback).
    """
    temp_path = None
    try:
        audio_data, sr = sf.read(audio_path)

        # Down-mix multi-channel audio by averaging the channels.
        if audio_data.ndim > 1:
            audio_data = np.mean(audio_data, axis=1)

        # np.max() raises on an empty array; fail with a clear message instead.
        if audio_data.size == 0:
            raise ValueError("audio file contains no samples")

        # Simple silence trimming: anything below 1% of the peak amplitude
        # counts as silence.
        magnitude = np.abs(audio_data)
        threshold = np.max(magnitude) * 0.01
        non_silent = np.where(magnitude > threshold)[0]

        if len(non_silent) > 0:
            padding = int(0.1 * sr)  # keep 0.1 s before and after the signal
            start_idx = max(0, non_silent[0] - padding)
            end_idx = min(len(audio_data), non_silent[-1] + padding)
            audio_data = audio_data[start_idx:end_idx]

        # Limit the duration of the reference clip.
        max_samples = int(max_duration * sr)
        if len(audio_data) > max_samples:
            audio_data = audio_data[:max_samples]

        # Resample only when the source rate differs from the target rate.
        if sr != target_sr:
            from scipy.signal import resample
            audio_data = resample(audio_data, int(len(audio_data) * target_sr / sr))

        # mkstemp() creates the file atomically, unlike the deprecated
        # mktemp(), which only returns a name that another process could claim
        # first.
        fd, temp_path = tempfile.mkstemp(suffix='.wav')
        os.close(fd)
        sf.write(temp_path, audio_data, target_sr)

        return temp_path

    except Exception as e:
        print(f"[WARNING] Audio preprocessing failed: {e}")
        # Do not leave a half-written temporary file behind.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass
        return audio_path
|
90 |
+
|
91 |
+
# π OPTIMIZED TEXT PROCESSING
|
92 |
+
def optimize_text(text, max_length=500):
    """Shorten and normalize text before synthesis.

    Text longer than ``max_length`` is cut at the last ``.`` boundary that
    still fits. If not even the first sentence fits, the text is truncated at
    the last space inside the limit (or hard-truncated when there is no
    space) rather than being discarded. The result is stripped of surrounding
    whitespace and always ends with ``.``, ``!`` or ``?``.

    Args:
        text: Input text.
        max_length: Maximum number of characters in the returned text.

    Returns:
        The normalized text.
    """
    text = text.strip()

    if len(text) > max_length:
        kept = []
        used = 0
        for sentence in text.split('.'):
            # +1 accounts for the period that follows each kept sentence.
            needed = len(sentence) + 1
            if used + needed > max_length:
                break
            kept.append(sentence)
            used += needed

        shortened = ".".join(kept).strip()
        if not shortened:
            # No complete sentence fits. Leave room for the final period and
            # prefer cutting at a word boundary.
            truncated = text[:max(max_length - 1, 0)]
            head, sep, _ = truncated.rpartition(" ")
            shortened = head.strip() if sep and head.strip() else truncated.strip()
        text = shortened

    if not text.endswith(('.', '!', '?')):
        text += '.'

    return text
|
111 |
|
112 |
+
# ✅ OPTIMIZED clone() Function
|
113 |
def clone(text, audio):
    """Synthesize ``text`` in the voice of the reference clip ``audio``.

    Args:
        text: Text to speak.
        audio: Filesystem path of the reference voice clip.

    Returns:
        A ``(output_path, status_message)`` tuple. ``output_path`` is the
        path of the generated WAV file, or ``None`` when validation or
        synthesis failed; ``status_message`` describes the outcome.
    """
    import time

    if tts is None:
        return None, "❌ XTTS model failed to load."

    if not text or not audio:
        return None, "❌ Error: Missing text or audio input."

    # Tracks the file to clean up; equals `audio` until preprocessing creates
    # a temporary copy.
    processed_audio = audio
    try:
        start_time = time.time()

        # Validate audio input: it must be a path to an existing file.
        if not isinstance(audio, str) or not os.path.exists(audio):
            return None, "❌ Error: Invalid audio input format."

        print("[INFO] Preprocessing audio...")
        processed_audio = preprocess_audio(audio)

        print("[INFO] Optimizing text...")
        optimized_text = optimize_text(text)
        print(f"[INFO] Text length: {len(optimized_text)} characters")

        # NOTE(review): this fixed path is shared by every call, so concurrent
        # requests would overwrite each other's output — confirm whether that
        # matters for this deployment.
        output_path = "./output.wav"

        print("[INFO] Generating speech...")

        # Clear the GPU cache before processing.
        if use_gpu:
            torch.cuda.empty_cache()

        tts.tts_to_file(
            text=optimized_text,
            speaker_wav=processed_audio,
            language="en",
            file_path=output_path,
            split_sentences=True,
        )

        # Validate output.
        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            return None, "❌ Error: XTTS failed to generate audio."

        # Performance metrics: the real-time factor is processing time
        # divided by the duration of the generated audio.
        processing_time = time.time() - start_time
        audio_data, sr = sf.read(output_path)
        audio_duration = len(audio_data) / sr
        rtf = processing_time / audio_duration if audio_duration > 0 else 0

        print(f"[PERFORMANCE] Processing time: {processing_time:.2f}s")
        print(f"[PERFORMANCE] Audio duration: {audio_duration:.2f}s")
        print(f"[PERFORMANCE] Real-time factor: {rtf:.2f}x")

        return output_path, f"✅ Generated in {processing_time:.1f}s (RTF: {rtf:.1f}x)"

    except Exception as e:
        print(f"[ERROR] XTTS Processing Error: {str(e)}")
        return None, f"❌ Error: {str(e)}"

    finally:
        # Runs on success, on early return and on error, so the temporary
        # reference clip is removed even when synthesis raises.
        if processed_audio != audio:
            try:
                os.remove(processed_audio)
            except OSError as cleanup_error:
                print(f"[WARNING] Could not remove temporary file: {cleanup_error}")

        # Release memory.
        if use_gpu:
            torch.cuda.empty_cache()
        gc.collect()
|
193 |
|
194 |
+
# π OPTIMIZED Gradio Interface
|
195 |
+
def create_interface():
    """Build the Gradio Blocks UI and return it without launching.

    The layout is one row with two columns: the inputs (text, reference
    audio, action buttons) on the left, and the outputs (status, generated
    audio) on the right. The generate button is wired to ``clone`` and the
    clear button resets all four fields.
    """
    soft_theme = gr.themes.Soft(primary_hue="teal")

    with gr.Blocks(theme=soft_theme, title="β‘ Fast Voice Clone") as iface:
        gr.Markdown("# β‘ Optimized Voice Cloning with XTTS")
        gr.Markdown("*Faster processing with quality optimizations*")

        with gr.Row():
            # Left column: user inputs and action buttons.
            with gr.Column():
                prompt_box = gr.Textbox(
                    label="π Text to speak",
                    placeholder="Enter text here (max 500 chars for optimal speed)...",
                    lines=3,
                    max_lines=5,
                )

                reference_clip = gr.Audio(
                    type='filepath',
                    label='π€ Voice reference (10-30 seconds recommended)',
                    sources=['upload', 'microphone'],
                )

                with gr.Row():
                    run_button = gr.Button("π Generate Voice", variant="primary")
                    reset_button = gr.Button("ποΈ Clear", variant="secondary")

            # Right column: status text and generated audio.
            with gr.Column():
                status_box = gr.Textbox(
                    label="π Status",
                    interactive=False,
                    lines=2,
                )

                result_clip = gr.Audio(
                    type='filepath',
                    label='π Generated Audio',
                )

        # Performance tips
        gr.Markdown("""
        ### π Performance Tips:
        - Keep text under 500 characters for fastest processing
        - Use 10-30 second reference audio clips
        - GPU processing is ~5-10x faster than CPU
        - Clear audio with minimal background noise works best
        """)

        # Event handlers
        run_button.click(
            fn=clone,
            inputs=[prompt_box, reference_clip],
            outputs=[result_clip, status_box],
            show_progress=True,
        )

        # One value per output component, in the order listed in `outputs`.
        reset_button.click(
            fn=lambda: (None, None, None, ""),
            outputs=[prompt_box, reference_clip, result_clip, status_box],
        )

    return iface
|
258 |
|
259 |
+
# ✅ Launch optimized interface
|
260 |
+
if __name__ == "__main__":
    # Server options passed to launch(): bind to all interfaces on port 7860,
    # with no share link, error display enabled and quiet mode off.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
        "quiet": False,
    }

    iface = create_interface()
    iface.launch(**launch_options)
|