yasserrmd committed on
Commit
2565173
·
verified ·
1 Parent(s): 03e7073

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +167 -55
app.py CHANGED
@@ -73,22 +73,23 @@ class VibeVoiceDemo:
73
  cfg_scale: float = 1.3):
74
  """Final audio generation only (no streaming)."""
75
  self.is_generating = True
76
-
77
  if not script.strip():
78
  raise gr.Error("Please provide a script.")
79
-
80
  if num_speakers < 1 or num_speakers > 4:
81
  raise gr.Error("Number of speakers must be 1–4.")
82
-
 
83
  selected = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
84
  for i, sp in enumerate(selected):
85
  if not sp or sp not in self.available_voices:
86
  raise gr.Error(f"Invalid speaker {i+1} selection.")
87
-
88
  voice_samples = [self.read_audio(self.available_voices[sp]) for sp in selected]
89
  if any(len(v) == 0 for v in voice_samples):
90
  raise gr.Error("Failed to load one or more voice samples.")
91
-
92
  # format script
93
  lines = script.strip().split("\n")
94
  formatted = []
@@ -102,7 +103,7 @@ class VibeVoiceDemo:
102
  sp_id = i % num_speakers
103
  formatted.append(f"Speaker {sp_id}: {line}")
104
  formatted_script = "\n".join(formatted)
105
-
106
  # processor input
107
  inputs = self.processor(
108
  text=[formatted_script],
@@ -110,7 +111,7 @@ class VibeVoiceDemo:
110
  padding=True,
111
  return_tensors="pt"
112
  )
113
-
114
  start = time.time()
115
  outputs = self.model.generate(
116
  **inputs,
@@ -118,35 +119,47 @@ class VibeVoiceDemo:
118
  tokenizer=self.processor.tokenizer,
119
  verbose=False
120
  )
121
-
122
- # --- handle model output ---
123
  if hasattr(outputs, "audio"):
124
  audio = outputs.audio
125
- elif hasattr(outputs, "audios"):
126
  audio = outputs.audios[0]
 
 
 
 
 
 
127
  else:
128
- raise gr.Error("Model did not return audio in expected format.")
129
-
 
130
  if torch.is_tensor(audio):
131
  audio = audio.float().cpu().numpy()
132
  if audio.ndim > 1:
133
  audio = audio.squeeze()
134
-
135
  sample_rate = 24000
136
- audio16 = convert_to_16_bit_wav(audio)
137
-
138
- # --- save automatically to disk ---
 
139
  os.makedirs("outputs", exist_ok=True)
 
 
140
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
141
  file_path = os.path.join("outputs", f"podcast_{timestamp}.wav")
142
- sf.write(file_path, audio16, sample_rate)
 
143
  print(f"💾 Saved podcast to {file_path}")
144
-
145
- total_dur = len(audio16) / sample_rate
146
  log = f"✅ Generation complete in {time.time()-start:.1f}s, {total_dur:.1f}s audio\nSaved to {file_path}"
147
-
148
  self.is_generating = False
149
- return (sample_rate, audio16), log
 
150
 
151
  def load_example_scripts(self):
152
  examples_dir = os.path.join(os.path.dirname(__file__), "text_examples")
@@ -175,42 +188,112 @@ def convert_to_16_bit_wav(data):
175
 
176
 
177
  def create_demo_interface(demo_instance: VibeVoiceDemo):
178
- with gr.Blocks(
179
- title="VibeVoice - AI Podcast Generator",
180
- theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple")
181
- ) as interface:
182
-
183
- gr.Markdown("## 🎙️ VibeVoice Podcast Generator (Final Audio Only)")
184
-
185
- num_speakers = gr.Slider(1, 4, value=2, step=1, label="Number of Speakers")
186
- available_speaker_names = list(demo_instance.available_voices.keys())
187
- default_speakers = available_speaker_names[:4]
188
-
189
- speaker_selections = []
190
- for i in range(4):
191
- speaker = gr.Dropdown(
192
- choices=available_speaker_names,
193
- value=default_speakers[i] if i < len(default_speakers) else None,
194
- label=f"Speaker {i+1}",
195
- visible=(i < 2)
196
- )
197
- speaker_selections.append(speaker)
198
 
199
- cfg_scale = gr.Slider(1.0, 2.0, value=1.3, step=0.05, label="CFG Scale")
 
200
 
201
- script_input = gr.Textbox(
202
- label="Podcast Script",
203
- placeholder="Enter your script here...",
204
- lines=10
 
 
 
205
  )
206
-
207
- generate_btn = gr.Button("🚀 Generate Podcast")
208
- audio_output = gr.Audio(
209
- label="Generated Podcast (Download)",
210
- type="numpy",
211
- show_download_button=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  )
213
- log_output = gr.Textbox(label="Log", interactive=False, lines=5)
214
 
215
  def generate_podcast_wrapper(num_speakers, script, *speakers_and_params):
216
  try:
@@ -233,10 +316,39 @@ def create_demo_interface(demo_instance: VibeVoiceDemo):
233
  generate_btn.click(
234
  fn=generate_podcast_wrapper,
235
  inputs=[num_speakers, script_input] + speaker_selections + [cfg_scale],
236
- outputs=[audio_output, log_output]
 
237
  )
238
 
239
- return interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
 
241
 
242
  def run_demo(
 
73
  cfg_scale: float = 1.3):
74
  """Final audio generation only (no streaming)."""
75
  self.is_generating = True
76
+
77
  if not script.strip():
78
  raise gr.Error("Please provide a script.")
79
+
80
  if num_speakers < 1 or num_speakers > 4:
81
  raise gr.Error("Number of speakers must be 1–4.")
82
+
83
+ # collect speakers
84
  selected = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
85
  for i, sp in enumerate(selected):
86
  if not sp or sp not in self.available_voices:
87
  raise gr.Error(f"Invalid speaker {i+1} selection.")
88
+
89
  voice_samples = [self.read_audio(self.available_voices[sp]) for sp in selected]
90
  if any(len(v) == 0 for v in voice_samples):
91
  raise gr.Error("Failed to load one or more voice samples.")
92
+
93
  # format script
94
  lines = script.strip().split("\n")
95
  formatted = []
 
103
  sp_id = i % num_speakers
104
  formatted.append(f"Speaker {sp_id}: {line}")
105
  formatted_script = "\n".join(formatted)
106
+
107
  # processor input
108
  inputs = self.processor(
109
  text=[formatted_script],
 
111
  padding=True,
112
  return_tensors="pt"
113
  )
114
+
115
  start = time.time()
116
  outputs = self.model.generate(
117
  **inputs,
 
119
  tokenizer=self.processor.tokenizer,
120
  verbose=False
121
  )
122
+
123
+ # --- handle model output robustly ---
124
  if hasattr(outputs, "audio"):
125
  audio = outputs.audio
126
+ elif hasattr(outputs, "audios") and outputs.audios:
127
  audio = outputs.audios[0]
128
+ elif hasattr(outputs, "waveform"):
129
+ audio = outputs.waveform
130
+ elif hasattr(outputs, "waveforms") and outputs.waveforms:
131
+ audio = outputs.waveforms[0]
132
+ elif hasattr(outputs, "speech_outputs") and outputs.speech_outputs:
133
+ audio = outputs.speech_outputs[0]
134
  else:
135
+ raise gr.Error(f"Model did not return audio in expected format. Got attributes: {dir(outputs)}")
136
+
137
+ # convert to numpy
138
  if torch.is_tensor(audio):
139
  audio = audio.float().cpu().numpy()
140
  if audio.ndim > 1:
141
  audio = audio.squeeze()
142
+
143
  sample_rate = 24000
144
+ # ensure float32 for saving and returning
145
+ audio = audio.astype("float32")
146
+
147
+ # save automatically to disk
148
  os.makedirs("outputs", exist_ok=True)
149
+ from datetime import datetime
150
+ import soundfile as sf
151
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
152
  file_path = os.path.join("outputs", f"podcast_{timestamp}.wav")
153
+ sf.write(file_path, audio, sample_rate) # soundfile handles float32
154
+
155
  print(f"💾 Saved podcast to {file_path}")
156
+
157
+ total_dur = len(audio) / sample_rate
158
  log = f"✅ Generation complete in {time.time()-start:.1f}s, {total_dur:.1f}s audio\nSaved to {file_path}"
159
+
160
  self.is_generating = False
161
+ return (sample_rate, audio), log
162
+
163
 
164
  def load_example_scripts(self):
165
  examples_dir = os.path.join(os.path.dirname(__file__), "text_examples")
 
188
 
189
 
190
  def create_demo_interface(demo_instance: VibeVoiceDemo):
191
+ """Create the Gradio interface (final audio only, no streaming)."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
 
193
+ # Custom CSS for high-end aesthetics
194
+ custom_css = """ ... """ # (keep your CSS unchanged)
195
 
196
+ with gr.Blocks(
197
+ title="VibeVoice - AI Podcast Generator",
198
+ css=custom_css,
199
+ theme=gr.themes.Soft(
200
+ primary_hue="blue",
201
+ secondary_hue="purple",
202
+ neutral_hue="slate",
203
  )
204
+ ) as interface:
205
+
206
+ # Header
207
+ gr.HTML("""
208
+ <div class="main-header">
209
+ <h1>🎙️ Vibe Podcasting</h1>
210
+ <p>Generating Long-form Multi-speaker AI Podcast with VibeVoice</p>
211
+ </div>
212
+ """)
213
+
214
+ with gr.Row():
215
+ # Left column - Settings
216
+ with gr.Column(scale=1, elem_classes="settings-card"):
217
+ gr.Markdown("### 🎛️ **Podcast Settings**")
218
+
219
+ num_speakers = gr.Slider(
220
+ minimum=1, maximum=4, value=2, step=1,
221
+ label="Number of Speakers",
222
+ elem_classes="slider-container"
223
+ )
224
+
225
+ gr.Markdown("### 🎭 **Speaker Selection**")
226
+ available_speaker_names = list(demo_instance.available_voices.keys())
227
+ default_speakers = ['en-Alice_woman', 'en-Carter_man', 'en-Frank_man', 'en-Maya_woman']
228
+
229
+ speaker_selections = []
230
+ for i in range(4):
231
+ default_value = default_speakers[i] if i < len(default_speakers) else None
232
+ speaker = gr.Dropdown(
233
+ choices=available_speaker_names,
234
+ value=default_value,
235
+ label=f"Speaker {i+1}",
236
+ visible=(i < 2),
237
+ elem_classes="speaker-item"
238
+ )
239
+ speaker_selections.append(speaker)
240
+
241
+ gr.Markdown("### ⚙️ **Advanced Settings**")
242
+ with gr.Accordion("Generation Parameters", open=False):
243
+ cfg_scale = gr.Slider(
244
+ minimum=1.0, maximum=2.0, value=1.3, step=0.05,
245
+ label="CFG Scale (Guidance Strength)",
246
+ elem_classes="slider-container"
247
+ )
248
+
249
+ # Right column - Generation
250
+ with gr.Column(scale=2, elem_classes="generation-card"):
251
+ gr.Markdown("### 📝 **Script Input**")
252
+ script_input = gr.Textbox(
253
+ label="Conversation Script",
254
+ placeholder="Enter your podcast script here...",
255
+ lines=12,
256
+ max_lines=20,
257
+ elem_classes="script-input"
258
+ )
259
+
260
+ with gr.Row():
261
+ random_example_btn = gr.Button(
262
+ "🎲 Random Example", size="lg",
263
+ variant="secondary", elem_classes="random-btn", scale=1
264
+ )
265
+ generate_btn = gr.Button(
266
+ "🚀 Generate Podcast", size="lg",
267
+ variant="primary", elem_classes="generate-btn", scale=2
268
+ )
269
+
270
+ # Output section
271
+ gr.Markdown("### 🎡 **Generated Podcast**")
272
+ complete_audio_output = gr.Audio(
273
+ label="Complete Podcast (Download)",
274
+ type="numpy",
275
+ elem_classes="audio-output complete-audio-section",
276
+ autoplay=False,
277
+ show_download_button=True,
278
+ visible=True
279
+ )
280
+
281
+ log_output = gr.Textbox(
282
+ label="Generation Log",
283
+ lines=8, max_lines=15,
284
+ interactive=False,
285
+ elem_classes="log-output"
286
+ )
287
+
288
+ # === logic ===
289
+ def update_speaker_visibility(num_speakers):
290
+ return [gr.update(visible=(i < num_speakers)) for i in range(4)]
291
+
292
+ num_speakers.change(
293
+ fn=update_speaker_visibility,
294
+ inputs=[num_speakers],
295
+ outputs=speaker_selections
296
  )
 
297
 
298
  def generate_podcast_wrapper(num_speakers, script, *speakers_and_params):
299
  try:
 
316
  generate_btn.click(
317
  fn=generate_podcast_wrapper,
318
  inputs=[num_speakers, script_input] + speaker_selections + [cfg_scale],
319
+ outputs=[complete_audio_output, log_output],
320
+ queue=True
321
  )
322
 
323
+ def load_random_example():
324
+ import random
325
+ examples = getattr(demo_instance, "example_scripts", [])
326
+ if not examples:
327
+ examples = [
328
+ [2, "Speaker 0: Welcome to our AI podcast demo!\nSpeaker 1: Thanks, excited to be here!"]
329
+ ]
330
+ num_speakers_value, script_value = random.choice(examples)
331
+ return num_speakers_value, script_value
332
+
333
+ random_example_btn.click(
334
+ fn=load_random_example,
335
+ inputs=[],
336
+ outputs=[num_speakers, script_input],
337
+ queue=False
338
+ )
339
+
340
+ gr.Markdown("### 📚 **Example Scripts**")
341
+ examples = getattr(demo_instance, "example_scripts", []) or [
342
+ [1, "Speaker 1: Welcome to our AI podcast demo. This is a sample script."]
343
+ ]
344
+ gr.Examples(
345
+ examples=examples,
346
+ inputs=[num_speakers, script_input],
347
+ label="Try these example scripts:"
348
+ )
349
+
350
+ return interface
351
+
352
 
353
 
354
  def run_demo(