Rogerjs committed on
Commit
27e6d88
·
verified ·
1 Parent(s): 909dbdf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +166 -196
app.py CHANGED
@@ -45,212 +45,182 @@ class VoiceSynthesizer:
45
  except Exception as e:
46
  print(f"Bark model loading error: {e}")
47
 
48
- def process_reference_audio(self, reference_audio):
49
- """Process and store reference audio for voice cloning"""
50
- try:
51
- # Gradio can pass audio in different formats
52
- if reference_audio is None:
53
- return "No audio provided"
54
-
55
- # Handle different input types
56
- if isinstance(reference_audio, tuple):
57
- # Gradio typically returns (sample_rate, audio_array)
58
- if len(reference_audio) == 2:
59
- sample_rate, audio_data = reference_audio
60
- else:
61
- audio_data = reference_audio[0]
62
- sample_rate = SAMPLE_RATE # Default to Bark sample rate
63
- elif isinstance(reference_audio, np.ndarray):
64
- audio_data = reference_audio
65
- sample_rate = SAMPLE_RATE
66
- else:
67
- return "Invalid audio format"
68
-
69
- # Ensure audio is numpy array
70
- audio_data = np.asarray(audio_data)
71
-
72
- # Handle multi-channel audio
73
- if audio_data.ndim > 1:
74
- audio_data = audio_data.mean(axis=1)
75
-
76
- # Trim or pad to standard length
77
- max_duration = 10 # 10 seconds
78
- max_samples = max_duration * sample_rate
79
-
80
- if len(audio_data) > max_samples:
81
- audio_data = audio_data[:max_samples]
82
-
83
- # Resample if necessary
84
- if sample_rate != SAMPLE_RATE:
85
- from scipy.signal import resample
86
- audio_data = resample(audio_data, int(len(audio_data) * SAMPLE_RATE / sample_rate))
87
-
88
- # Save reference audio
89
- ref_filename = os.path.join(self.working_dir, "reference_voice.wav")
90
- sf.write(ref_filename, audio_data, SAMPLE_RATE)
91
-
92
- # Store reference voice
93
- self.reference_voice = ref_filename
94
-
95
- return "Reference voice processed successfully"
96
 
97
- except Exception as e:
98
- print(f"Reference audio processing error: {e}")
99
- import traceback
100
- traceback.print_exc()
101
- return f"Error processing reference audio: {str(e)}"
102
-
103
- def _generate_bark_speech(self, text, voice_preset=None):
104
- """Generate speech using Bark"""
105
- # Default Bark voice presets
106
- voice_presets = [
107
- "v2/en_speaker_6", # Female
108
- "v2/en_speaker_3", # Male
109
- "v2/en_speaker_9", # Neutral
110
- ]
111
-
112
- # Prepare history prompt
113
- history_prompt = None
114
-
115
- # Check if a reference voice is available
116
- if self.reference_voice is not None:
117
- # Use saved reference voice file
118
- history_prompt = self.reference_voice
119
- elif voice_preset:
120
- # Use predefined voice preset
121
- history_prompt = voice_presets[0] if "v2/en_speaker" not in voice_preset else voice_preset
122
 
123
- # Generate audio with or without history prompt
124
- try:
125
- if history_prompt:
126
- audio_array = generate_audio(
127
- text,
128
- history_prompt=history_prompt
129
- )
130
- else:
131
- # Fallback to default generation
132
- audio_array = generate_audio(text)
133
-
134
- # Save generated audio
135
- filename = f"bark_speech_{int(time.time())}.wav"
136
- filepath = os.path.join(self.working_dir, filename)
137
- wavfile.write(filepath, SAMPLE_RATE, audio_array)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
- return filepath, None
140
-
141
- except Exception as e:
142
- print(f"Bark speech generation error: {e}")
143
- import traceback
144
- traceback.print_exc()
145
- return None, f"Error in Bark speech generation: {str(e)}"
146
- def create_interface():
147
- synthesizer = VoiceSynthesizer()
148
 
149
- with gr.Blocks() as interface:
150
- gr.Markdown("# 🎙️ Advanced Voice Synthesis")
151
-
152
- with gr.Row():
153
- with gr.Column():
154
- gr.Markdown("## 1. Capture Reference Voice")
155
- reference_audio = gr.Audio(sources=["microphone", "upload"], type="numpy")
156
- process_ref_btn = gr.Button("Process Reference Voice")
157
- process_ref_output = gr.Textbox(label="Reference Voice Processing")
158
-
159
- with gr.Column():
160
- gr.Markdown("## 2. Generate Speech")
161
- text_input = gr.Textbox(label="Enter Text to Speak")
162
-
163
- # Model Selection
164
- model_dropdown = gr.Dropdown(
165
- choices=[
166
- "bark (Suno AI)",
167
- "speecht5 (Microsoft)"
168
- ],
169
- label="Select TTS Model",
170
- value="bark (Suno AI)"
 
 
 
 
171
  )
172
-
173
- # Voice Preset Dropdowns
174
- with gr.Row():
175
- bark_preset = gr.Dropdown(
176
- choices=[
177
- "v2/en_speaker_6 (Female)",
178
- "v2/en_speaker_3 (Male)",
179
- "v2/en_speaker_9 (Neutral)"
180
- ],
181
- label="Bark Voice Preset",
182
- visible=True
183
- )
184
-
185
- speecht5_preset = gr.Dropdown(
186
- choices=[
187
- "Default Speaker"
188
- ],
189
- label="SpeechT5 Speaker",
190
- visible=False
191
- )
192
-
193
- generate_btn = gr.Button("Generate Speech")
194
- audio_output = gr.Audio(label="Generated Speech")
195
- error_output = gr.Textbox(label="Errors", visible=True)
196
 
197
- # Process reference audio
198
- process_ref_btn.click(
199
- fn=synthesizer.process_reference_audio,
200
- inputs=reference_audio,
201
- outputs=process_ref_output
202
- )
 
 
 
 
 
 
 
203
 
204
- # Dynamic model and preset visibility
205
- def update_model_visibility(model):
206
- if "bark" in model.lower():
207
- return {
208
- bark_preset: gr.update(visible=True),
209
- speecht5_preset: gr.update(visible=False)
210
- }
211
  else:
212
- return {
213
- bark_preset: gr.update(visible=False),
214
- speecht5_preset: gr.update(visible=True)
215
- }
216
 
217
- model_dropdown.change(
218
- fn=update_model_visibility,
219
- inputs=model_dropdown,
220
- outputs=[bark_preset, speecht5_preset]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  )
222
 
223
- # Speech generation logic
224
- def generate_speech_wrapper(text, model, bark_preset, speecht5_preset):
225
- # Map model name
226
- model_map = {
227
- "bark (Suno AI)": "bark",
228
- "speecht5 (Microsoft)": "speecht5"
229
- }
230
-
231
- # Select appropriate preset
232
- preset = bark_preset if "bark" in model else speecht5_preset
233
-
234
- return synthesizer.generate_speech(
235
- text,
236
- model_name=model_map[model],
237
- voice_preset=preset
238
- )
239
 
240
- generate_btn.click(
241
- fn=generate_speech_wrapper,
242
- inputs=[text_input, model_dropdown, bark_preset, speecht5_preset],
243
- outputs=[audio_output, error_output]
244
- )
245
-
246
- return interface
247
 
248
- if __name__ == "__main__":
249
- interface = create_interface()
250
- interface.launch(
251
- share=False,
252
- debug=True,
253
- show_error=True,
254
- server_name='0.0.0.0',
255
- server_port=7860
256
- )
 
45
  except Exception as e:
46
  print(f"Bark model loading error: {e}")
47
 
48
+ def _initialize_bark(self):
49
+ """Bark model initialization (already done in __init__)"""
50
+ return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
+ def _initialize_speecht5(self):
53
+ """Initialize SpeechT5 model from Hugging Face"""
54
+ try:
55
+ # Load SpeechT5 model and processor
56
+ model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
57
+ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
58
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
59
+
60
+ # Load speaker embeddings
61
+ embeddings_dataset = datasets.load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
62
+ speaker_embeddings = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)
63
+
64
+ return {
65
+ "model": model,
66
+ "processor": processor,
67
+ "vocoder": vocoder,
68
+ "speaker_embeddings": speaker_embeddings
69
+ }
70
+ except Exception as e:
71
+ print(f"SpeechT5 model loading error: {e}")
72
+ return None
 
 
 
 
73
 
74
def process_reference_audio(self, reference_audio):
    """Normalise a reference recording and store it for voice cloning.

    The audio is converted to mono float samples, trimmed to at most ten
    seconds, resampled to ``SAMPLE_RATE`` when needed, and written to
    ``reference_voice.wav`` inside ``self.working_dir``. The resulting
    path is stored in ``self.reference_voice``.

    Args:
        reference_audio: ``None``, a NumPy array (treated as already being
            at ``SAMPLE_RATE``), or a tuple. A two-element tuple is read
            as ``(sample_rate, samples)``; any other tuple contributes its
            first element as samples at ``SAMPLE_RATE``.

    Returns:
        A status string describing success or the reason for failure.
        Exceptions are caught, printed with a traceback, and reported in
        the returned string.
    """
    if reference_audio is None:
        return "No audio provided"

    try:
        # Handle the different shapes the UI may hand over
        if isinstance(reference_audio, tuple):
            if len(reference_audio) == 2:
                sample_rate, audio_data = reference_audio
            else:
                audio_data = reference_audio[0]
                sample_rate = SAMPLE_RATE  # Default to Bark sample rate
        elif isinstance(reference_audio, np.ndarray):
            audio_data = reference_audio
            sample_rate = SAMPLE_RATE
        else:
            return "Invalid audio format"

        audio_data = np.asarray(audio_data)
        if audio_data.size == 0:
            return "No audio provided"
        sample_rate = int(sample_rate)

        # Signed-integer PCM must be scaled to [-1, 1] before any float
        # math: averaging channels or resampling turns the samples into
        # floats, and float data outside that range is clipped on write.
        if np.issubdtype(audio_data.dtype, np.signedinteger):
            full_scale = float(np.iinfo(audio_data.dtype).max) + 1.0
            audio_data = audio_data.astype(np.float32) / full_scale
        else:
            audio_data = audio_data.astype(np.float32)

        # Down-mix multi-channel audio (assumes samples x channels layout)
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)

        # Trim to a maximum length (shorter clips are kept as they are)
        max_duration = 10  # seconds
        max_samples = max_duration * sample_rate
        if len(audio_data) > max_samples:
            audio_data = audio_data[:max_samples]

        # Resample if necessary
        if sample_rate != SAMPLE_RATE:
            from scipy.signal import resample

            target_len = int(len(audio_data) * SAMPLE_RATE / sample_rate)
            audio_data = resample(audio_data, target_len).astype(np.float32)

        # Save reference audio and remember where it lives
        ref_filename = os.path.join(self.working_dir, "reference_voice.wav")
        sf.write(ref_filename, audio_data, SAMPLE_RATE)
        self.reference_voice = ref_filename

        return "Reference voice processed successfully"

    except Exception as e:
        print(f"Reference audio processing error: {e}")
        import traceback

        traceback.print_exc()
        return f"Error processing reference audio: {str(e)}"
 
 
 
 
128
 
129
def _generate_bark_speech(self, text, voice_preset=None):
    """Generate speech with Bark and save it as a WAV file.

    Args:
        text: Text to synthesise.
        voice_preset: Optional Bark speaker preset. It may carry a trailing
            UI label, e.g. ``"v2/en_speaker_6 (Female)"``; only the leading
            preset id is forwarded to Bark. Values that do not contain
            ``"v2/en_speaker"`` fall back to the first default preset.

    Returns:
        ``(filepath, None)`` on success, or ``(None, error_message)`` if
        generation or saving fails (the error is printed with a traceback).
    """
    # Default Bark voice presets
    voice_presets = [
        "v2/en_speaker_6",  # Female
        "v2/en_speaker_3",  # Male
        "v2/en_speaker_9",  # Neutral
    ]

    history_prompt = None
    reference_voice = getattr(self, "reference_voice", None)

    # Bark history prompts are preset names, .npz files or dicts. A raw
    # .wav path (which is what the reference-audio step stores) is not a
    # valid prompt and would make every generation fail, so skip it.
    is_wav_path = (
        isinstance(reference_voice, str)
        and reference_voice.lower().endswith(".wav")
    )
    if is_wav_path:
        print(
            "Reference voice is a WAV file, which Bark cannot use as a "
            "history prompt; using a preset voice instead"
        )

    if reference_voice is not None and not is_wav_path:
        history_prompt = reference_voice
    elif voice_preset:
        # Drop any human-readable suffix such as " (Female)".
        tokens = str(voice_preset).split()
        preset_id = tokens[0] if tokens else ""
        if "v2/en_speaker" in preset_id:
            history_prompt = preset_id
        else:
            history_prompt = voice_presets[0]

    try:
        # history_prompt=None is Bark's own default (no voice conditioning)
        audio_array = generate_audio(text, history_prompt=history_prompt)

        # Save generated audio
        filename = f"bark_speech_{int(time.time())}.wav"
        filepath = os.path.join(self.working_dir, filename)
        wavfile.write(filepath, SAMPLE_RATE, audio_array)

        return filepath, None

    except Exception as e:
        print(f"Bark speech generation error: {e}")
        import traceback

        traceback.print_exc()
        return None, f"Error in Bark speech generation: {str(e)}"
172
+
173
def generate_speech(self, text, model_name=None, voice_preset=None):
    """Route a synthesis request to the backend for the chosen model.

    Args:
        text: Text to speak; empty or whitespace-only input is rejected.
        model_name: ``"bark"`` or ``"speecht5"``. When falsy,
            ``self.current_model`` is used instead.
        voice_preset: Passed through unchanged to the backend method.

    Returns:
        Whatever the backend method returns, or ``(None, error_message)``
        when the text is blank, the model is unsupported, or the backend
        raises (the error is printed with a traceback).
    """
    has_text = bool(text) and bool(text.strip())
    if not has_text:
        return None, "Please enter some text to speak"

    # Explicit model wins; otherwise fall back to the instance's current one
    selected = model_name if model_name else self.current_model

    try:
        if selected == "bark":
            backend = self._generate_bark_speech
        elif selected == "speecht5":
            backend = self._generate_speecht5_speech
        else:
            raise ValueError(f"Unsupported model: {selected}")
        return backend(text, voice_preset)
    except Exception as err:
        print(f"Speech generation error: {err}")
        import traceback

        traceback.print_exc()
        return None, f"Error generating speech: {str(err)}"
194
+
195
+ def _generate_speecht5_speech(self, text, speaker_id=None):
196
+ """Generate speech using SpeechT5"""
197
+ # Ensure model is initialized
198
+ speecht5_models = self.models["speecht5"]()
199
+ if not speecht5_models:
200
+ return None, "SpeechT5 model not loaded"
201
+
202
+ model = speecht5_models["model"]
203
+ processor = speecht5_models["processor"]
204
+ vocoder = speecht5_models["vocoder"]
205
+ speaker_embeddings = speecht5_models["speaker_embeddings"]
206
+
207
+ # Prepare inputs
208
+ inputs = processor(text=text, return_tensors="pt")
209
+
210
+ # Generate speech
211
+ speech = model.generate_speech(
212
+ inputs["input_ids"],
213
+ speaker_embeddings
214
  )
215
 
216
+ # Convert to numpy array
217
+ audio_array = speech.numpy()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
 
219
+ # Save generated audio
220
+ filename = f"speecht5_speech_{int(time.time())}.wav"
221
+ filepath = os.path.join(self.working_dir, filename)
222
+ wavfile.write(filepath, 16000, audio_array)
223
+
224
+ return filepath, None
 
225
 
226
+ # Rest of the code remains the same...