Hematej committed on
Commit da3316c · verified · 1 Parent(s): a9c8950

Update app.py

Files changed (1)
  1. app.py +235 -33
app.py CHANGED
@@ -3,64 +3,266 @@ import torch
 from TTS.api import TTS
 import os
 import soundfile as sf
+import numpy as np
+from pydub import AudioSegment
+import tempfile
+import gc
 
 os.environ["COQUI_TOS_AGREED"] = "1"
 
-# Smart device detection
+# 🚀 PERFORMANCE OPTIMIZATIONS
+torch.backends.cudnn.benchmark = True  # Optimize CUDA operations
+torch.backends.cudnn.deterministic = False
+
+# Smart device detection with memory optimization
 use_gpu = torch.cuda.is_available()
+device = "cuda" if use_gpu else "cpu"
+
+print(f"[INFO] Using device: {device}")
+if use_gpu:
+    print(f"[INFO] GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
 
-# ✅ XTTS Model Initialization with Proper Error Handling
+# ✅ OPTIMIZED XTTS Model Initialization
 try:
-    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=use_gpu, progress_bar=True)
+    # Use smaller model for faster inference if needed
+    model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
+
+    tts = TTS(model_name, gpu=use_gpu, progress_bar=False)  # Disable progress bar for speed
+
     if not hasattr(tts, "synthesizer") or not hasattr(tts.synthesizer, "tts_model"):
         raise RuntimeError("XTTS model failed to load correctly.")
+
+    # 🚀 PERFORMANCE TWEAKS
+    if hasattr(tts.synthesizer.tts_model, 'inference'):
+        # Set inference parameters for speed
+        tts.synthesizer.tts_model.inference_noise_scale = 0.667
+        tts.synthesizer.tts_model.inference_noise_scale_w = 0.8
+        tts.synthesizer.tts_model.length_scale = 1.0
+
     print(f"[INFO] XTTS model loaded successfully. GPU enabled: {use_gpu}")
+
 except Exception as e:
     print(f"[ERROR] Failed to initialize XTTS model: {str(e)}")
-    tts = None  # Prevents further crashes
+    tts = None
+
+# 🚀 AUDIO PREPROCESSING FOR SPEED
+def preprocess_audio(audio_path, target_sr=22050, max_duration=30):
+    """Optimize audio for faster processing"""
+    try:
+        # Load and preprocess audio
+        audio_data, sr = sf.read(audio_path)
+
+        # Convert to mono if stereo
+        if len(audio_data.shape) > 1:
+            audio_data = np.mean(audio_data, axis=1)
+
+        # Trim silence and limit duration for speed
+        from scipy.signal import find_peaks
+
+        # Simple silence trimming
+        threshold = np.max(np.abs(audio_data)) * 0.01
+        non_silent = np.where(np.abs(audio_data) > threshold)[0]
+
+        if len(non_silent) > 0:
+            start_idx = max(0, non_silent[0] - int(0.1 * sr))  # Keep 0.1s before
+            end_idx = min(len(audio_data), non_silent[-1] + int(0.1 * sr))  # Keep 0.1s after
+            audio_data = audio_data[start_idx:end_idx]
+
+        # Limit duration for faster processing
+        max_samples = int(max_duration * sr)
+        if len(audio_data) > max_samples:
+            audio_data = audio_data[:max_samples]
+
+        # Resample if needed
+        if sr != target_sr:
+            from scipy.signal import resample
+            audio_data = resample(audio_data, int(len(audio_data) * target_sr / sr))
+
+        # Save preprocessed audio
+        temp_path = tempfile.mktemp(suffix='.wav')
+        sf.write(temp_path, audio_data, target_sr)
+
+        return temp_path
+
+    except Exception as e:
+        print(f"[WARNING] Audio preprocessing failed: {e}")
+        return audio_path
+
+# 🚀 OPTIMIZED TEXT PROCESSING
+def optimize_text(text, max_length=500):
+    """Optimize text for faster processing"""
+    # Limit text length for speed
+    if len(text) > max_length:
+        # Split at sentence boundaries
+        sentences = text.split('.')
+        result = ""
+        for sentence in sentences:
+            if len(result + sentence) > max_length:
+                break
+            result += sentence + "."
+        text = result.rstrip('.')
+
+    # Clean text
+    text = text.strip()
+    if not text.endswith(('.', '!', '?')):
+        text += '.'
+
+    return text
 
-# ✅ Fixed clone() Function
+# ✅ OPTIMIZED clone() Function
 def clone(text, audio):
     if tts is None:
         return None, "⚠ XTTS model failed to load."
-
+
     if not text or not audio:
         return None, "⚠ Error: Missing text or audio input."
-
+
     try:
+        import time
+        start_time = time.time()
+
         # ✅ Validate audio input
         if isinstance(audio, bool) or not isinstance(audio, str) or not os.path.exists(audio):
             return None, "⚠ Error: Invalid audio input format."
-
+
+        # 🚀 PREPROCESSING FOR SPEED
+        print("[INFO] Preprocessing audio...")
+        processed_audio = preprocess_audio(audio)
+
+        print("[INFO] Optimizing text...")
+        optimized_text = optimize_text(text)
+        print(f"[INFO] Text length: {len(optimized_text)} characters")
+
         output_path = "./output.wav"
-        # ✅ XTTS Processing with Error Handling
-        tts.tts_to_file(text=text, speaker_wav=audio, language="en", file_path=output_path)
-
-        # ✅ Ensure output file is valid before passing to Gradio
+
+        # 🚀 OPTIMIZED XTTS Processing
+        print("[INFO] Generating speech...")
+
+        # Clear GPU cache before processing
+        if use_gpu:
+            torch.cuda.empty_cache()
+
+        # Generate with optimized settings
+        tts.tts_to_file(
+            text=optimized_text,
+            speaker_wav=processed_audio,
+            language="en",
+            file_path=output_path,
+            split_sentences=True,  # Better for long texts
+            # Additional optimization parameters
+        )
+
+        # Clean up temporary files
+        if processed_audio != audio:
+            try:
+                os.remove(processed_audio)
+            except:
+                pass
+
+        # Clear memory
+        if use_gpu:
+            torch.cuda.empty_cache()
+        gc.collect()
+
+        # ✅ Validate output
         if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
             return None, "⚠ Error: XTTS failed to generate audio."
-
-        # ✅ Convert output file format
-        audio_data, samplerate = sf.read(output_path)
-        sf.write(output_path, audio_data, samplerate)
-
-        return output_path
+
+        # 🚀 PERFORMANCE METRICS
+        end_time = time.time()
+        processing_time = end_time - start_time
+
+        # Calculate audio duration for real-time factor
+        audio_data, sr = sf.read(output_path)
+        audio_duration = len(audio_data) / sr
+        rtf = processing_time / audio_duration if audio_duration > 0 else 0
+
+        print(f"[PERFORMANCE] Processing time: {processing_time:.2f}s")
+        print(f"[PERFORMANCE] Audio duration: {audio_duration:.2f}s")
+        print(f"[PERFORMANCE] Real-time factor: {rtf:.2f}x")
+
+        return output_path, f"✅ Generated in {processing_time:.1f}s (RTF: {rtf:.1f}x)"
+
     except Exception as e:
         print(f"[ERROR] XTTS Processing Error: {str(e)}")
-        return None
+        # Clean up on error
+        if use_gpu:
+            torch.cuda.empty_cache()
+        gc.collect()
+        return None, f"⚠ Error: {str(e)}"
 
-# ✅ Fixed Gradio Setup
-iface = gr.Interface(
-    fn=clone,
-    inputs=[
-        gr.Textbox(label='Text'),
-        gr.Audio(type='filepath', label='Voice reference audio file')
-    ],
-    outputs=gr.Audio(type='filepath'),
-    title="Voice Clone",
-    flagging_mode="never",
-    cache_examples=False,
-    theme=gr.themes.Base(primary_hue="teal", secondary_hue="teal", neutral_hue="slate"),
-)
+# 🚀 OPTIMIZED Gradio Interface
+def create_interface():
+    with gr.Blocks(
+        theme=gr.themes.Soft(primary_hue="teal"),
+        title="⚡ Fast Voice Clone"
+    ) as iface:
+
+        gr.Markdown("# ⚡ Optimized Voice Cloning with XTTS")
+        gr.Markdown("*Faster processing with quality optimizations*")
+
+        with gr.Row():
+            with gr.Column():
+                text_input = gr.Textbox(
+                    label="📝 Text to speak",
+                    placeholder="Enter text here (max 500 chars for optimal speed)...",
+                    lines=3,
+                    max_lines=5
+                )
+
+                audio_input = gr.Audio(
+                    type='filepath',
+                    label='🎤 Voice reference (10-30 seconds recommended)',
+                    sources=['upload', 'microphone']
+                )
+
+                with gr.Row():
+                    generate_btn = gr.Button("🚀 Generate Voice", variant="primary")
+                    clear_btn = gr.Button("🗑️ Clear", variant="secondary")
+
+            with gr.Column():
+                status_output = gr.Textbox(
+                    label="📊 Status",
+                    interactive=False,
+                    lines=2
+                )
+
+                audio_output = gr.Audio(
+                    type='filepath',
+                    label='🔊 Generated Audio'
+                )
+
+        # Performance tips
+        gr.Markdown("""
+        ### 🚀 Performance Tips:
+        - Keep text under 500 characters for fastest processing
+        - Use 10-30 second reference audio clips
+        - GPU processing is ~5-10x faster than CPU
+        - Clear audio with minimal background noise works best
+        """)
+
+        # Event handlers
+        generate_btn.click(
+            fn=clone,
+            inputs=[text_input, audio_input],
+            outputs=[audio_output, status_output],
+            show_progress=True
+        )
+
+        clear_btn.click(
+            fn=lambda: (None, None, None, ""),
+            outputs=[text_input, audio_input, audio_output, status_output]
+        )
+
+    return iface
 
-iface.launch()
+# ✅ Launch optimized interface
+if __name__ == "__main__":
+    iface = create_interface()
+    iface.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        show_error=True,
+        quiet=False
+    )
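
For reference, a minimal sketch of how the updated clone() function could be exercised outside the Gradio UI. The reference.wav path and the sample sentence are placeholders; the snippet assumes it runs next to app.py and that the XTTS model loads successfully (importing app triggers the model load, while the Gradio app itself only launches under __main__):

# Hypothetical smoke test for the updated clone(); reference.wav is a placeholder 10-30 s speaker clip.
from app import clone

audio_path, status = clone("Hello, this is a quick voice-cloning test.", "reference.wav")
print(status)  # e.g. "✅ Generated in 3.2s (RTF: 0.8x)" or a "⚠ Error: ..." message
if audio_path:
    print(f"Audio written to {audio_path}")  # ./output.wav, per the diff above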