Rogerjs committed on
Commit
b4f1e5a
·
verified ·
1 Parent(s): 3e1f471

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +184 -63
app.py CHANGED
@@ -2,114 +2,235 @@ import gradio as gr
2
  import numpy as np
3
  import os
4
  import time
 
5
  from scipy.io import wavfile
6
 
7
- # Explicitly import Bark components
8
  from bark import generate_audio, SAMPLE_RATE
9
  from bark.generation import preload_models
10
 
11
class VoiceCloningApp:
    """Capture a reference recording and synthesize speech with Bark.

    On construction the app creates a ``working_files`` directory next to
    this module and preloads the Bark models. Reference recordings and
    generated speech are both written into that directory as WAV files.
    """

    def __init__(self):
        """Create the working directory and preload the Bark models.

        Raises:
            RuntimeError: if the Bark models cannot be loaded. The original
                exception is chained as the cause.
        """
        # Create working directory
        self.base_dir = os.path.dirname(os.path.abspath(__file__))
        self.working_dir = os.path.join(self.base_dir, "working_files")
        os.makedirs(self.working_dir, exist_ok=True)

        # Explicit model loading with error handling
        try:
            print("Attempting to load Bark models...")
            preload_models()
            print("Bark models loaded successfully.")
        except Exception as e:
            print(f"Error loading Bark models: {e}")
            import traceback
            traceback.print_exc()
            # Chain the cause so the real failure stays visible to callers.
            raise RuntimeError(f"Could not load Bark models. Error: {e}") from e

    def process_reference_audio(self, audio_data):
        """Peak-normalize a reference recording and save it as a WAV file.

        Args:
            audio_data: ``None`` or a ``(sample_rate, samples)`` pair, where
                ``samples`` is array-like.

        Returns:
            A human-readable status string; errors are reported in the
            string rather than raised.
        """
        if audio_data is None:
            return "Please provide an audio input"

        try:
            # Unpack audio data
            sample_rate, audio_array = audio_data

            # Convert to float before taking abs(): on int16 input,
            # abs(-32768) overflows and would corrupt the peak estimate.
            audio_array = np.asarray(audio_array, dtype=np.float32)
            if audio_array.size == 0:
                return "Please provide an audio input"

            # Normalize to a peak of 1.0; an all-zero (silent) clip is left
            # untouched instead of being divided by zero into NaNs.
            peak = np.max(np.abs(audio_array))
            if peak > 0:
                audio_array = audio_array / peak

            # Save reference audio
            filename = f"reference_{int(time.time())}.wav"
            filepath = os.path.join(self.working_dir, filename)
            wavfile.write(filepath, sample_rate, audio_array)

            return "✅ Audio captured successfully!"

        except Exception as e:
            return f"Error processing audio: {str(e)}"

    def generate_speech(self, text):
        """Generate speech for ``text`` using Bark.

        Args:
            text: the text to speak.

        Returns:
            ``(filepath, None)`` on success, or ``(None, message)`` when the
            text is blank or generation fails.
        """
        if not text or not text.strip():
            return None, "Please enter some text to speak"

        try:
            print(f"Generating speech for text: {text}")

            # No history prompt: Bark picks the voice itself.
            audio_array = generate_audio(
                text,
                history_prompt=None,
            )

            # Save generated audio
            filename = f"generated_speech_{int(time.time())}.wav"
            filepath = os.path.join(self.working_dir, filename)
            wavfile.write(filepath, SAMPLE_RATE, audio_array)

            return filepath, None

        except Exception as e:
            print(f"Speech generation error: {e}")
            import traceback
            traceback.print_exc()
            return None, f"Error generating speech: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  def create_interface():
80
- # Ensure working directory exists
81
- working_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "working_files")
82
- os.makedirs(working_dir, exist_ok=True)
83
-
84
- app = VoiceCloningApp()
85
 
86
  with gr.Blocks() as interface:
87
- gr.Markdown("# 🎙️ Voice Cloning App")
88
 
89
  with gr.Row():
90
  with gr.Column():
91
- gr.Markdown("## 1. Capture Reference Voice")
92
- reference_audio = gr.Audio(sources=["microphone", "upload"], type="numpy")
93
- process_btn = gr.Button("Process Reference Voice")
94
- process_output = gr.Textbox(label="Processing Result")
95
-
96
- with gr.Column():
97
- gr.Markdown("## 2. Generate Speech")
98
  text_input = gr.Textbox(label="Enter Text to Speak")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  generate_btn = gr.Button("Generate Speech")
100
  audio_output = gr.Audio(label="Generated Speech")
101
  error_output = gr.Textbox(label="Errors", visible=True)
102
 
103
- # Bind functions
104
- process_btn.click(
105
- fn=app.process_reference_audio,
106
- inputs=reference_audio,
107
- outputs=process_output
 
 
 
 
 
 
 
 
 
 
 
 
108
  )
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  generate_btn.click(
111
- fn=app.generate_speech,
112
- inputs=text_input,
113
  outputs=[audio_output, error_output]
114
  )
115
 
 
2
  import numpy as np
3
  import os
4
  import time
5
+ import torch
6
  from scipy.io import wavfile
7
 
8
+ # Bark imports
9
  from bark import generate_audio, SAMPLE_RATE
10
  from bark.generation import preload_models
11
 
12
+ # Hugging Face Transformers
13
+ from transformers import AutoModelForTextToSpeech, AutoProcessor, AutoTokenizer
14
+ from transformers import SpeechT5HifiGan, SpeechT5ForTextToSpeech, SpeechT5Processor
15
+
16
class VoiceSynthesizer:
    """Text-to-speech front end that synthesizes with Bark or SpeechT5.

    ``self.models`` maps a model name to a zero-argument initializer.
    Bark is preloaded in ``__init__``; SpeechT5 components are loaded on
    first use and then cached on the instance. Generated audio is written
    as WAV files into a ``working_files`` directory next to this module.
    """

    # Preset names offered for Bark; the first entry is the default.
    _BARK_PRESETS = (
        "v2/en_speaker_6",  # Female
        "v2/en_speaker_3",  # Male
        "v2/en_speaker_9",  # Neutral
    )

    # Sample rate used when writing SpeechT5 output to disk.
    _SPEECHT5_SAMPLE_RATE = 16000

    def __init__(self):
        """Create the working directory and try to preload the Bark models.

        A Bark loading failure is printed, not raised, so the SpeechT5 path
        stays usable.
        """
        # Create working directory
        self.base_dir = os.path.dirname(os.path.abspath(__file__))
        self.working_dir = os.path.join(self.base_dir, "working_files")
        os.makedirs(self.working_dir, exist_ok=True)

        # Model name -> initializer callable
        self.models = {
            "bark": self._initialize_bark,
            "speecht5": self._initialize_speecht5,
        }

        # Default model
        self.current_model = "bark"

        # Filled by _initialize_speecht5 on first successful load.
        self._speecht5_components = None

        # Initialize Bark models
        try:
            print("Attempting to load Bark models...")
            preload_models()
            print("Bark models loaded successfully.")
        except Exception as e:
            print(f"Bark model loading error: {e}")

    def _initialize_bark(self):
        """Bark model initialization (already done in ``__init__``)."""
        return None

    def _initialize_speecht5(self):
        """Load the SpeechT5 model, processor, vocoder and a speaker embedding.

        The components are loaded once and cached on the instance, so
        repeated calls do not reload the weights.

        Returns:
            A dict with keys ``"model"``, ``"processor"``, ``"vocoder"`` and
            ``"speaker_embeddings"``, or ``None`` if loading failed (the
            error is printed).
        """
        # getattr: tolerate instances created without running __init__.
        cached = getattr(self, "_speecht5_components", None)
        if cached is not None:
            return cached

        try:
            # Imported here because the module never imports load_dataset;
            # a missing `datasets` package is then reported like any other
            # loading failure instead of surfacing as a NameError.
            from datasets import load_dataset

            # Load SpeechT5 model and processor
            model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
            processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
            vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

            # Load speaker embeddings (first entry of the x-vector dataset)
            embeddings_dataset = load_dataset(
                "Matthijs/cmu-arctic-xvectors", split="validation"
            )
            speaker_embeddings = torch.tensor(
                embeddings_dataset[0]["xvector"]
            ).unsqueeze(0)

            components = {
                "model": model,
                "processor": processor,
                "vocoder": vocoder,
                "speaker_embeddings": speaker_embeddings,
            }
        except Exception as e:
            print(f"SpeechT5 model loading error: {e}")
            return None

        self._speecht5_components = components
        return components

    def set_model(self, model_name):
        """Set the current model for speech synthesis.

        Raises:
            ValueError: if ``model_name`` is not a key of ``self.models``.
        """
        if model_name not in self.models:
            raise ValueError(f"Model {model_name} not supported")
        self.current_model = model_name

    def generate_speech(self, text, model_name=None, voice_preset=None):
        """Generate speech using the selected model.

        Args:
            text: the text to speak.
            model_name: ``"bark"`` or ``"speecht5"``; falls back to
                ``self.current_model`` when falsy.
            voice_preset: forwarded to the model-specific generator.

        Returns:
            ``(filepath, None)`` on success, or ``(None, message)`` when the
            text is blank, the model is unsupported, or generation fails.
        """
        if not text or not text.strip():
            return None, "Please enter some text to speak"

        # Use specified model or current model
        current_model = model_name or self.current_model

        try:
            if current_model == "bark":
                return self._generate_bark_speech(text, voice_preset)
            elif current_model == "speecht5":
                return self._generate_speecht5_speech(text, voice_preset)
            else:
                raise ValueError(f"Unsupported model: {current_model}")

        except Exception as e:
            print(f"Speech generation error: {e}")
            import traceback
            traceback.print_exc()
            return None, f"Error generating speech: {str(e)}"

    @staticmethod
    def _strip_preset_label(voice_preset):
        """Drop a trailing ``" (Label)"`` suffix from a preset name.

        The UI dropdown offers entries such as ``"v2/en_speaker_6 (Female)"``;
        only the part before the parenthesized label is a preset name.
        """
        name, sep, rest = voice_preset.rpartition(" (")
        if sep and rest.endswith(")"):
            return name.strip()
        return voice_preset.strip()

    def _generate_bark_speech(self, text, voice_preset=None):
        """Generate speech using Bark and save it as a WAV file.

        Args:
            text: the text to speak.
            voice_preset: a preset name, optionally followed by a
                parenthesized display label; defaults to the first entry of
                ``_BARK_PRESETS`` when falsy.

        Returns:
            ``(filepath, None)``.
        """
        # Select voice preset, removing any display label added by the UI
        if voice_preset:
            history_prompt = self._strip_preset_label(voice_preset)
        else:
            history_prompt = self._BARK_PRESETS[0]

        # Generate audio
        audio_array = generate_audio(
            text,
            history_prompt=history_prompt,
        )

        # Save generated audio
        filename = f"bark_speech_{int(time.time())}.wav"
        filepath = os.path.join(self.working_dir, filename)
        wavfile.write(filepath, SAMPLE_RATE, audio_array)

        return filepath, None

    def _generate_speecht5_speech(self, text, speaker_id=None):
        """Generate speech using SpeechT5 and save it as a WAV file.

        Args:
            text: the text to speak.
            speaker_id: currently unused; the single cached speaker
                embedding is always applied.

        Returns:
            ``(filepath, None)`` on success, or
            ``(None, "SpeechT5 model not loaded")`` if the components could
            not be loaded.
        """
        # Ensure model is initialized (cached after the first load)
        speecht5_models = self.models["speecht5"]()
        if not speecht5_models:
            return None, "SpeechT5 model not loaded"

        model = speecht5_models["model"]
        processor = speecht5_models["processor"]
        vocoder = speecht5_models["vocoder"]
        speaker_embeddings = speecht5_models["speaker_embeddings"]

        # Prepare inputs
        inputs = processor(text=text, return_tensors="pt")

        # Pass the vocoder so a waveform is returned; without it
        # generate_speech yields a mel spectrogram, which is not audio.
        speech = model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings,
            vocoder=vocoder,
        )

        # Convert to numpy array
        audio_array = speech.cpu().numpy()

        # Save generated audio
        filename = f"speecht5_speech_{int(time.time())}.wav"
        filepath = os.path.join(self.working_dir, filename)
        wavfile.write(filepath, self._SPEECHT5_SAMPLE_RATE, audio_array)

        return filepath, None
149
 
150
  def create_interface():
151
+ synthesizer = VoiceSynthesizer()
 
 
 
 
152
 
153
  with gr.Blocks() as interface:
154
+ gr.Markdown("# 🎙️ Advanced Voice Synthesis")
155
 
156
  with gr.Row():
157
  with gr.Column():
158
+ gr.Markdown("## Speech Generation")
 
 
 
 
 
 
159
  text_input = gr.Textbox(label="Enter Text to Speak")
160
+
161
+ # Model Selection
162
+ model_dropdown = gr.Dropdown(
163
+ choices=[
164
+ "bark (Suno AI)",
165
+ "speecht5 (Microsoft)"
166
+ ],
167
+ label="Select TTS Model",
168
+ value="bark (Suno AI)"
169
+ )
170
+
171
+ # Voice Preset Dropdowns
172
+ with gr.Row():
173
+ bark_preset = gr.Dropdown(
174
+ choices=[
175
+ "v2/en_speaker_6 (Female)",
176
+ "v2/en_speaker_3 (Male)",
177
+ "v2/en_speaker_9 (Neutral)"
178
+ ],
179
+ label="Bark Voice Preset",
180
+ visible=True
181
+ )
182
+
183
+ speecht5_preset = gr.Dropdown(
184
+ choices=[
185
+ "Default Speaker"
186
+ ],
187
+ label="SpeechT5 Speaker",
188
+ visible=False
189
+ )
190
+
191
  generate_btn = gr.Button("Generate Speech")
192
  audio_output = gr.Audio(label="Generated Speech")
193
  error_output = gr.Textbox(label="Errors", visible=True)
194
 
195
+ # Dynamic model and preset visibility
196
+ def update_model_visibility(model):
197
+ if "bark" in model.lower():
198
+ return {
199
+ bark_preset: gr.update(visible=True),
200
+ speecht5_preset: gr.update(visible=False)
201
+ }
202
+ else:
203
+ return {
204
+ bark_preset: gr.update(visible=False),
205
+ speecht5_preset: gr.update(visible=True)
206
+ }
207
+
208
+ model_dropdown.change(
209
+ fn=update_model_visibility,
210
+ inputs=model_dropdown,
211
+ outputs=[bark_preset, speecht5_preset]
212
  )
213
 
214
+ # Speech generation logic
215
+ def generate_speech_wrapper(text, model, bark_preset, speecht5_preset):
216
+ # Map model name
217
+ model_map = {
218
+ "bark (Suno AI)": "bark",
219
+ "speecht5 (Microsoft)": "speecht5"
220
+ }
221
+
222
+ # Select appropriate preset
223
+ preset = bark_preset if "bark" in model else speecht5_preset
224
+
225
+ return synthesizer.generate_speech(
226
+ text,
227
+ model_name=model_map[model],
228
+ voice_preset=preset
229
+ )
230
+
231
  generate_btn.click(
232
+ fn=generate_speech_wrapper,
233
+ inputs=[text_input, model_dropdown, bark_preset, speecht5_preset],
234
  outputs=[audio_output, error_output]
235
  )
236