yasserrmd committed
Commit 41155d1 · verified · 1 Parent(s): c731fc4

Update app.py

Files changed (1): app.py (+87 -329)

app.py CHANGED
@@ -31,8 +31,8 @@ from transformers import set_seed
 logging.set_verbosity_info()
 logger = logging.get_logger(__name__)
 
-import os
-os.environ["FLASH_ATTENTION_2"] = "0"
+# import os
+# os.environ["FLASH_ATTENTION_2"] = "0"
 
 
 class VibeVoiceDemo:
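As far as I can tell, `FLASH_ATTENTION_2` is not an environment variable that transformers itself reads, which would explain why this commit comments it out rather than keeping it. The documented way to pin the attention backend is the `attn_implementation` argument at load time; a minimal sketch, with a placeholder checkpoint id and model class (the real loading code sits outside this hunk):

```python
# Sketch only: pin the attention backend when loading the model instead of
# relying on an environment variable. The checkpoint id and
# AutoModelForCausalLM are placeholders; app.py's actual loading code is
# not shown in this diff.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/VibeVoice-1.5B",       # placeholder checkpoint id
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",       # or "eager"; sidesteps flash-attention-2
)
```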
@@ -117,7 +117,6 @@ class VibeVoiceDemo:
         print(f"Found {len(self.available_voices)} voice files in {voices_dir}")
         print(f"Available voices: {', '.join(self.available_voices.keys())}")
 
-    @spaces.GPU
     def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
         """Read and preprocess audio file."""
         try:
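Dropping `@spaces.GPU` from `read_audio` looks right: decoding and resampling a short reference clip is CPU work, and every decorated entry point costs a ZeroGPU allocation. The method body is elided by the diff; a typical implementation consistent with the visible signature, the `try:`, and the `return np.array([])` fallback might look like the following (hypothetical reconstruction, assuming `soundfile` and `librosa` are available):

```python
import numpy as np
import soundfile as sf
import librosa

def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
    """Read and preprocess audio file (sketch of the elided body)."""
    try:
        audio, sr = sf.read(audio_path)               # decode to float array
        if audio.ndim > 1:
            audio = audio.mean(axis=1)                # downmix to mono
        if sr != target_sr:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
        return audio.astype(np.float32)
    except Exception as e:
        print(f"Error reading audio {audio_path}: {e}")
        return np.array([])
```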
@@ -132,336 +131,95 @@
             return np.array([])
 
     @spaces.GPU
-    def generate_podcast_streaming(self,
-                                   num_speakers: int,
-                                   script: str,
-                                   speaker_1: str = None,
-                                   speaker_2: str = None,
-                                   speaker_3: str = None,
-                                   speaker_4: str = None,
-                                   cfg_scale: float = 1.3) -> Iterator[tuple]:
-        try:
-
-            # Reset stop flag and set generating state
-            self.stop_generation = False
-            self.is_generating = True
-
-            # Validate inputs
-            if not script.strip():
-                self.is_generating = False
-                raise gr.Error("Error: Please provide a script.")
-
-            # Defend against common mistake
-            script = script.replace("’", "'")
-
-            if num_speakers < 1 or num_speakers > 4:
-                self.is_generating = False
-                raise gr.Error("Error: Number of speakers must be between 1 and 4.")
-
-            # Collect selected speakers
-            selected_speakers = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
-
-            # Validate speaker selections
-            for i, speaker in enumerate(selected_speakers):
-                if not speaker or speaker not in self.available_voices:
-                    self.is_generating = False
-                    raise gr.Error(f"Error: Please select a valid speaker for Speaker {i+1}.")
-
-            # Build initial log
-            log = f"🎙️ Generating podcast with {num_speakers} speakers\n"
-            log += f"📊 Parameters: CFG Scale={cfg_scale}, Inference Steps={self.inference_steps}\n"
-            log += f"🎭 Speakers: {', '.join(selected_speakers)}\n"
-
-            # Check for stop signal
-            if self.stop_generation:
-                self.is_generating = False
-                yield None, "🛑 Generation stopped by user", gr.update(visible=False)
-                return
-
-            # Load voice samples
-            voice_samples = []
-            for speaker_name in selected_speakers:
-                audio_path = self.available_voices[speaker_name]
-                audio_data = self.read_audio(audio_path)
-                if len(audio_data) == 0:
-                    self.is_generating = False
-                    raise gr.Error(f"Error: Failed to load audio for {speaker_name}")
-                voice_samples.append(audio_data)
-
-            # log += f"✅ Loaded {len(voice_samples)} voice samples\n"
-
-            # Check for stop signal
-            if self.stop_generation:
-                self.is_generating = False
-                yield None, "🛑 Generation stopped by user", gr.update(visible=False)
-                return
-
-            # Parse script to assign speaker ID's
-            lines = script.strip().split('\n')
-            formatted_script_lines = []
-
-            for line in lines:
-                line = line.strip()
-                if not line:
-                    continue
-
-                # Check if line already has speaker format
-                if line.startswith('Speaker ') and ':' in line:
-                    formatted_script_lines.append(line)
-                else:
-                    # Auto-assign to speakers in rotation
-                    speaker_id = len(formatted_script_lines) % num_speakers
-                    formatted_script_lines.append(f"Speaker {speaker_id}: {line}")
-
-            formatted_script = '\n'.join(formatted_script_lines)
-            log += f"📝 Formatted script with {len(formatted_script_lines)} turns\n\n"
-            log += "🔄 Processing with VibeVoice (streaming mode)...\n"
-
-            # Check for stop signal before processing
-            if self.stop_generation:
-                self.is_generating = False
-                yield None, "🛑 Generation stopped by user", gr.update(visible=False)
-                return
-
-            start_time = time.time()
-
-            inputs = self.processor(
-                text=[formatted_script],
-                voice_samples=[voice_samples],
-                padding=True,
-                return_tensors="pt",
-                return_attention_mask=True,
-            )
-
-            # Create audio streamer
-            audio_streamer = AudioStreamer(
-                batch_size=1,
-                stop_signal=None,
-                timeout=None
-            )
-
-            # Store current streamer for potential stopping
-            self.current_streamer = audio_streamer
-
-            # Start generation in a separate thread
-            generation_thread = threading.Thread(
-                target=self._generate_with_streamer,
-                args=(inputs, cfg_scale, audio_streamer)
-            )
-            generation_thread.start()
-
-            # Wait for generation to actually start producing audio
-            time.sleep(1)  # Reduced from 3 to 1 second
-
-            # Check for stop signal after thread start
-            if self.stop_generation:
-                audio_streamer.end()
-                generation_thread.join(timeout=5.0)  # Wait up to 5 seconds for thread to finish
-                self.is_generating = False
-                yield None, "🛑 Generation stopped by user", gr.update(visible=False)
-                return
-
-            # Collect audio chunks as they arrive
-            sample_rate = 24000
-            all_audio_chunks = []  # For final statistics
-            pending_chunks = []  # Buffer for accumulating small chunks
-            chunk_count = 0
-            last_yield_time = time.time()
-            min_yield_interval = 15  # Yield every 15 seconds
-            min_chunk_size = sample_rate * 30  # At least 2 seconds of audio
-
-            # Get the stream for the first (and only) sample
-            audio_stream = audio_streamer.get_stream(0)
-
-            has_yielded_audio = False
-            has_received_chunks = False  # Track if we received any chunks at all
-
-            for audio_chunk in audio_stream:
-                # Check for stop signal in the streaming loop
-                if self.stop_generation:
-                    audio_streamer.end()
-                    break
-
-                chunk_count += 1
-                has_received_chunks = True  # Mark that we received at least one chunk
-
-                # Convert tensor to numpy
-                if torch.is_tensor(audio_chunk):
-                    # Convert bfloat16 to float32 first, then to numpy
-                    if audio_chunk.dtype == torch.bfloat16:
-                        audio_chunk = audio_chunk.float()
-                    audio_np = audio_chunk.cpu().numpy().astype(np.float32)
-                else:
-                    audio_np = np.array(audio_chunk, dtype=np.float32)
-
-                # Ensure audio is 1D and properly normalized
-                if len(audio_np.shape) > 1:
-                    audio_np = audio_np.squeeze()
-
-                # Convert to 16-bit for Gradio
-                audio_16bit = convert_to_16_bit_wav(audio_np)
-
-                # Store for final statistics
-                all_audio_chunks.append(audio_16bit)
-
-                # Add to pending chunks buffer
-                pending_chunks.append(audio_16bit)
-
-                # Calculate pending audio size
-                pending_audio_size = sum(len(chunk) for chunk in pending_chunks)
-                current_time = time.time()
-                time_since_last_yield = current_time - last_yield_time
-
-                # Decide whether to yield
-                should_yield = False
-                if not has_yielded_audio and pending_audio_size >= min_chunk_size:
-                    # First yield: wait for minimum chunk size
-                    should_yield = True
-                    has_yielded_audio = True
-                elif has_yielded_audio and (pending_audio_size >= min_chunk_size or time_since_last_yield >= min_yield_interval):
-                    # Subsequent yields: either enough audio or enough time has passed
-                    should_yield = True
-
-                if should_yield and pending_chunks:
-                    # Concatenate and yield only the new audio chunks
-                    new_audio = np.concatenate(pending_chunks)
-                    new_duration = len(new_audio) / sample_rate
-                    total_duration = sum(len(chunk) for chunk in all_audio_chunks) / sample_rate
-
-                    log_update = log + f"🎵 Streaming: {total_duration:.1f}s generated (chunk {chunk_count})\n"
-
-                    # Yield streaming audio chunk and keep complete_audio as None during streaming
-                    yield (sample_rate, new_audio), None, log_update, gr.update(visible=True)
-
-                    # Clear pending chunks after yielding
-                    pending_chunks = []
-                    last_yield_time = current_time
-
-            # Yield any remaining chunks
-            if pending_chunks:
-                final_new_audio = np.concatenate(pending_chunks)
-                total_duration = sum(len(chunk) for chunk in all_audio_chunks) / sample_rate
-                log_update = log + f"🎵 Streaming final chunk: {total_duration:.1f}s total\n"
-                yield (sample_rate, final_new_audio), None, log_update, gr.update(visible=True)
-                has_yielded_audio = True  # Mark that we yielded audio
-
-            # Wait for generation to complete (with timeout to prevent hanging)
-            generation_thread.join(timeout=5.0)  # Increased timeout to 5 seconds
-
-            # If thread is still alive after timeout, force end
-            if generation_thread.is_alive():
-                print("Warning: Generation thread did not complete within timeout")
-                audio_streamer.end()
-                generation_thread.join(timeout=5.0)
-
-            # Clean up
-            self.current_streamer = None
-            self.is_generating = False
-
-            generation_time = time.time() - start_time
-
-            # Check if stopped by user
-            if self.stop_generation:
-                yield None, None, "🛑 Generation stopped by user", gr.update(visible=False)
-                return
-
-            # Debug logging
-            # print(f"Debug: has_received_chunks={has_received_chunks}, chunk_count={chunk_count}, all_audio_chunks length={len(all_audio_chunks)}")
-
-            # Check if we received any chunks but didn't yield audio
-            if has_received_chunks and not has_yielded_audio and all_audio_chunks:
-                # We have chunks but didn't meet the yield criteria, yield them now
-                complete_audio = np.concatenate(all_audio_chunks)
-                final_duration = len(complete_audio) / sample_rate
-
-                final_log = log + f"⏱️ Generation completed in {generation_time:.2f} seconds\n"
-                final_log += f"🎵 Final audio duration: {final_duration:.2f} seconds\n"
-                final_log += f"📊 Total chunks: {chunk_count}\n"
-                final_log += "✨ Generation successful! Complete audio is ready.\n"
-                final_log += "💡 Not satisfied? You can regenerate or adjust the CFG scale for different results."
-
-                # Yield the complete audio
-                yield None, (sample_rate, complete_audio), final_log, gr.update(visible=False)
-                return
-
-            if not has_received_chunks:
-                error_log = log + f"\n❌ Error: No audio chunks were received from the model. Generation time: {generation_time:.2f}s"
-                yield None, None, error_log, gr.update(visible=False)
-                return
-
-            if not has_yielded_audio:
-                error_log = log + f"\n❌ Error: Audio was generated but not streamed. Chunk count: {chunk_count}"
-                yield None, None, error_log, gr.update(visible=False)
-                return
-
-            # Prepare the complete audio
-            if all_audio_chunks:
-                complete_audio = np.concatenate(all_audio_chunks)
-                final_duration = len(complete_audio) / sample_rate
-
-                final_log = log + f"⏱️ Generation completed in {generation_time:.2f} seconds\n"
-                final_log += f"🎵 Final audio duration: {final_duration:.2f} seconds\n"
-                final_log += f"📊 Total chunks: {chunk_count}\n"
-                final_log += "✨ Generation successful! Complete audio is ready in the 'Complete Audio' tab.\n"
-                final_log += "💡 Not satisfied? You can regenerate or adjust the CFG scale for different results."
-
-                # Final yield: Clear streaming audio and provide complete audio
-                yield None, (sample_rate, complete_audio), final_log, gr.update(visible=False)
-            else:
-                final_log = log + "❌ No audio was generated."
-                yield None, None, final_log, gr.update(visible=False)
-
-        except gr.Error as e:
-            # Handle Gradio-specific errors (like input validation)
-            self.is_generating = False
-            self.current_streamer = None
-            error_msg = f"❌ Input Error: {str(e)}"
-            print(error_msg)
-            yield None, None, error_msg, gr.update(visible=False)
-
-        except Exception as e:
-            self.is_generating = False
-            self.current_streamer = None
-            error_msg = f"❌ An unexpected error occurred: {str(e)}"
-            print(error_msg)
-            import traceback
-            traceback.print_exc()
-            yield None, None, error_msg, gr.update(visible=False)
-
-    @spaces.GPU
-    def _generate_with_streamer(self, inputs, cfg_scale, audio_streamer):
-        """Helper method to run generation with streamer in a separate thread."""
-        try:
-            # Check for stop signal before starting generation
-            if self.stop_generation:
-                audio_streamer.end()
-                return
-
-            # Define a stop check function that can be called from generate
-            def check_stop_generation():
-                return self.stop_generation
-
-            outputs = self.model.generate(
-                **inputs,
-                max_new_tokens=None,
-                cfg_scale=cfg_scale,
-                tokenizer=self.processor.tokenizer,
-                generation_config={
-                    'do_sample': False,
-                },
-                audio_streamer=audio_streamer,
-                stop_check_fn=check_stop_generation,  # Pass the stop check function
-                verbose=False,  # Disable verbose in streaming mode
-                refresh_negative=True,
-            )
-
-        except Exception as e:
-            print(f"Error in generation thread: {e}")
-            traceback.print_exc()
-            # Make sure to end the stream on error
-            audio_streamer.end()
-
-    @spaces.GPU
+    def generate_podcast(self, num_speakers: int, script: str,
+                         speaker_1: str = None, speaker_2: str = None,
+                         speaker_3: str = None, speaker_4: str = None,
+                         cfg_scale: float = 1.3):
+        """Single GPU function for full generation (streaming + final)."""
+        self.stop_generation = False
+        self.is_generating = True
+
+        if not script.strip():
+            raise gr.Error("Please provide a script.")
+
+        if num_speakers < 1 or num_speakers > 4:
+            raise gr.Error("Number of speakers must be 1–4.")
+
+        selected = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
+        for i, sp in enumerate(selected):
+            if not sp or sp not in self.available_voices:
+                raise gr.Error(f"Invalid speaker {i+1} selection.")
+
+        # load voices
+        voice_samples = [self.read_audio(self.available_voices[sp]) for sp in selected]
+        if any(len(v) == 0 for v in voice_samples):
+            raise gr.Error("Failed to load one or more voice samples.")
+
+        # format script
+        lines = script.strip().split("\n")
+        formatted = []
+        for i, line in enumerate(lines):
+            line = line.strip()
+            if not line:
+                continue
+            if line.startswith("Speaker "):
+                formatted.append(line)
+            else:
+                sp_id = i % num_speakers
+                formatted.append(f"Speaker {sp_id}: {line}")
+        formatted_script = "\n".join(formatted)
+
+        # processor input
+        inputs = self.processor(
+            text=[formatted_script],
+            voice_samples=[voice_samples],
+            padding=True,
+            return_tensors="pt"
+        )
+
+        # === direct generation with streamer ===
+        from vibevoice import AudioStreamer, convert_to_16_bit_wav
+        audio_streamer = AudioStreamer(batch_size=1)
+        start = time.time()
+        outputs = self.model.generate(
+            **inputs,
+            cfg_scale=cfg_scale,
+            tokenizer=self.processor.tokenizer,
+            audio_streamer=audio_streamer,
+            verbose=False
+        )
+
+        sample_rate = 24000
+        audio_stream = audio_streamer.get_stream(0)
+        all_chunks, pending = [], []
+        min_chunk_size = sample_rate * 2
+        last_yield = time.time()
+
+        for chunk in audio_stream:
+            if torch.is_tensor(chunk):
+                chunk = chunk.float().cpu().numpy()
+            if chunk.ndim > 1:
+                chunk = chunk.squeeze()
+            chunk16 = convert_to_16_bit_wav(chunk)
+            all_chunks.append(chunk16)
+            pending.append(chunk16)
+            if sum(len(c) for c in pending) >= min_chunk_size or (time.time() - last_yield) > 5:
+                new_audio = np.concatenate(pending)
+                yield (sample_rate, new_audio), None, f"Streaming {len(all_chunks)} chunks..."
+                pending = []
+                last_yield = time.time()
+
+        if all_chunks:
+            complete = np.concatenate(all_chunks)
+            total_dur = len(complete) / sample_rate
+            log = f"✅ Generation complete in {time.time()-start:.1f}s, {total_dur:.1f}s audio"
+            yield None, (sample_rate, complete), log
+        else:
+            yield None, None, "❌ No audio generated."
+
+        self.is_generating = False
+
 
     def stop_audio_generation(self):
         """Stop the current audio generation process."""
         self.stop_generation = True
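One behavioral note on the rewrite: the deleted version ran `model.generate(...)` on a background thread and consumed `audio_streamer.get_stream(0)` concurrently, while the new version calls `generate` synchronously before the consuming loop. If `generate` only returns once synthesis has finished, which the removed threading code suggests, the "streaming" yields here begin only after all audio already exists. Keeping the old threaded pattern around the new, leaner loop would restore live streaming; a minimal sketch, reusing only names that appear in this diff (`AudioStreamer`, `get_stream`, `end`, the `generate` kwargs):

```python
import threading

def stream_generate(model, processor, inputs, cfg_scale, audio_streamer_cls):
    """Run model.generate on a worker thread and yield chunks as they arrive.

    audio_streamer_cls is assumed to be the AudioStreamer class imported in
    the diff above; the generate() kwargs mirror the ones used there.
    """
    streamer = audio_streamer_cls(batch_size=1)

    def _run():
        try:
            model.generate(
                **inputs,
                cfg_scale=cfg_scale,
                tokenizer=processor.tokenizer,
                audio_streamer=streamer,
                verbose=False,
            )
        finally:
            streamer.end()  # make sure the consumer loop terminates

    thread = threading.Thread(target=_run, daemon=True)
    thread.start()
    for chunk in streamer.get_stream(0):  # blocks only until the next chunk
        yield chunk
    thread.join(timeout=5.0)
```

Separately, the new generator yields three-tuples `(streaming_chunk, complete_audio, log)` where the old one yielded a fourth `gr.update(visible=...)` element, so the Gradio `.click()` wiring presumably needs its `outputs=` list trimmed to three components to match.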