Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -31,8 +31,8 @@ from transformers import set_seed
|
|
31 |
logging.set_verbosity_info()
|
32 |
logger = logging.get_logger(__name__)
|
33 |
|
34 |
-
import os
|
35 |
-
os.environ["FLASH_ATTENTION_2"] = "0"
|
36 |
|
37 |
|
38 |
class VibeVoiceDemo:
|
@@ -117,7 +117,6 @@ class VibeVoiceDemo:
|
|
117 |
print(f"Found {len(self.available_voices)} voice files in {voices_dir}")
|
118 |
print(f"Available voices: {', '.join(self.available_voices.keys())}")
|
119 |
|
120 |
-
@spaces.GPU
|
121 |
def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
|
122 |
"""Read and preprocess audio file."""
|
123 |
try:
|
@@ -132,336 +131,95 @@ class VibeVoiceDemo:
|
|
132 |
return np.array([])
|
133 |
|
134 |
@spaces.GPU
|
135 |
-
def
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
raise gr.Error("
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
# Load voice samples
|
182 |
-
voice_samples = []
|
183 |
-
for speaker_name in selected_speakers:
|
184 |
-
audio_path = self.available_voices[speaker_name]
|
185 |
-
audio_data = self.read_audio(audio_path)
|
186 |
-
if len(audio_data) == 0:
|
187 |
-
self.is_generating = False
|
188 |
-
raise gr.Error(f"Error: Failed to load audio for {speaker_name}")
|
189 |
-
voice_samples.append(audio_data)
|
190 |
-
|
191 |
-
# log += f"✅ Loaded {len(voice_samples)} voice samples\n"
|
192 |
-
|
193 |
-
# Check for stop signal
|
194 |
-
if self.stop_generation:
|
195 |
-
self.is_generating = False
|
196 |
-
yield None, "🛑 Generation stopped by user", gr.update(visible=False)
|
197 |
-
return
|
198 |
-
|
199 |
-
# Parse script to assign speaker ID's
|
200 |
-
lines = script.strip().split('\n')
|
201 |
-
formatted_script_lines = []
|
202 |
-
|
203 |
-
for line in lines:
|
204 |
-
line = line.strip()
|
205 |
-
if not line:
|
206 |
-
continue
|
207 |
-
|
208 |
-
# Check if line already has speaker format
|
209 |
-
if line.startswith('Speaker ') and ':' in line:
|
210 |
-
formatted_script_lines.append(line)
|
211 |
-
else:
|
212 |
-
# Auto-assign to speakers in rotation
|
213 |
-
speaker_id = len(formatted_script_lines) % num_speakers
|
214 |
-
formatted_script_lines.append(f"Speaker {speaker_id}: {line}")
|
215 |
-
|
216 |
-
formatted_script = '\n'.join(formatted_script_lines)
|
217 |
-
log += f"📝 Formatted script with {len(formatted_script_lines)} turns\n\n"
|
218 |
-
log += "🔄 Processing with VibeVoice (streaming mode)...\n"
|
219 |
-
|
220 |
-
# Check for stop signal before processing
|
221 |
-
if self.stop_generation:
|
222 |
-
self.is_generating = False
|
223 |
-
yield None, "🛑 Generation stopped by user", gr.update(visible=False)
|
224 |
-
return
|
225 |
-
|
226 |
-
start_time = time.time()
|
227 |
-
|
228 |
-
inputs = self.processor(
|
229 |
-
text=[formatted_script],
|
230 |
-
voice_samples=[voice_samples],
|
231 |
-
padding=True,
|
232 |
-
return_tensors="pt",
|
233 |
-
return_attention_mask=True,
|
234 |
-
)
|
235 |
-
|
236 |
-
# Create audio streamer
|
237 |
-
audio_streamer = AudioStreamer(
|
238 |
-
batch_size=1,
|
239 |
-
stop_signal=None,
|
240 |
-
timeout=None
|
241 |
-
)
|
242 |
-
|
243 |
-
# Store current streamer for potential stopping
|
244 |
-
self.current_streamer = audio_streamer
|
245 |
-
|
246 |
-
# Start generation in a separate thread
|
247 |
-
generation_thread = threading.Thread(
|
248 |
-
target=self._generate_with_streamer,
|
249 |
-
args=(inputs, cfg_scale, audio_streamer)
|
250 |
-
)
|
251 |
-
generation_thread.start()
|
252 |
-
|
253 |
-
# Wait for generation to actually start producing audio
|
254 |
-
time.sleep(1) # Reduced from 3 to 1 second
|
255 |
-
|
256 |
-
# Check for stop signal after thread start
|
257 |
-
if self.stop_generation:
|
258 |
-
audio_streamer.end()
|
259 |
-
generation_thread.join(timeout=5.0) # Wait up to 5 seconds for thread to finish
|
260 |
-
self.is_generating = False
|
261 |
-
yield None, "🛑 Generation stopped by user", gr.update(visible=False)
|
262 |
-
return
|
263 |
-
|
264 |
-
# Collect audio chunks as they arrive
|
265 |
-
sample_rate = 24000
|
266 |
-
all_audio_chunks = [] # For final statistics
|
267 |
-
pending_chunks = [] # Buffer for accumulating small chunks
|
268 |
-
chunk_count = 0
|
269 |
-
last_yield_time = time.time()
|
270 |
-
min_yield_interval = 15 # Yield every 15 seconds
|
271 |
-
min_chunk_size = sample_rate * 30 # At least 2 seconds of audio
|
272 |
-
|
273 |
-
# Get the stream for the first (and only) sample
|
274 |
-
audio_stream = audio_streamer.get_stream(0)
|
275 |
-
|
276 |
-
has_yielded_audio = False
|
277 |
-
has_received_chunks = False # Track if we received any chunks at all
|
278 |
-
|
279 |
-
for audio_chunk in audio_stream:
|
280 |
-
# Check for stop signal in the streaming loop
|
281 |
-
if self.stop_generation:
|
282 |
-
audio_streamer.end()
|
283 |
-
break
|
284 |
-
|
285 |
-
chunk_count += 1
|
286 |
-
has_received_chunks = True # Mark that we received at least one chunk
|
287 |
-
|
288 |
-
# Convert tensor to numpy
|
289 |
-
if torch.is_tensor(audio_chunk):
|
290 |
-
# Convert bfloat16 to float32 first, then to numpy
|
291 |
-
if audio_chunk.dtype == torch.bfloat16:
|
292 |
-
audio_chunk = audio_chunk.float()
|
293 |
-
audio_np = audio_chunk.cpu().numpy().astype(np.float32)
|
294 |
-
else:
|
295 |
-
audio_np = np.array(audio_chunk, dtype=np.float32)
|
296 |
-
|
297 |
-
# Ensure audio is 1D and properly normalized
|
298 |
-
if len(audio_np.shape) > 1:
|
299 |
-
audio_np = audio_np.squeeze()
|
300 |
-
|
301 |
-
# Convert to 16-bit for Gradio
|
302 |
-
audio_16bit = convert_to_16_bit_wav(audio_np)
|
303 |
-
|
304 |
-
# Store for final statistics
|
305 |
-
all_audio_chunks.append(audio_16bit)
|
306 |
-
|
307 |
-
# Add to pending chunks buffer
|
308 |
-
pending_chunks.append(audio_16bit)
|
309 |
-
|
310 |
-
# Calculate pending audio size
|
311 |
-
pending_audio_size = sum(len(chunk) for chunk in pending_chunks)
|
312 |
-
current_time = time.time()
|
313 |
-
time_since_last_yield = current_time - last_yield_time
|
314 |
-
|
315 |
-
# Decide whether to yield
|
316 |
-
should_yield = False
|
317 |
-
if not has_yielded_audio and pending_audio_size >= min_chunk_size:
|
318 |
-
# First yield: wait for minimum chunk size
|
319 |
-
should_yield = True
|
320 |
-
has_yielded_audio = True
|
321 |
-
elif has_yielded_audio and (pending_audio_size >= min_chunk_size or time_since_last_yield >= min_yield_interval):
|
322 |
-
# Subsequent yields: either enough audio or enough time has passed
|
323 |
-
should_yield = True
|
324 |
-
|
325 |
-
if should_yield and pending_chunks:
|
326 |
-
# Concatenate and yield only the new audio chunks
|
327 |
-
new_audio = np.concatenate(pending_chunks)
|
328 |
-
new_duration = len(new_audio) / sample_rate
|
329 |
-
total_duration = sum(len(chunk) for chunk in all_audio_chunks) / sample_rate
|
330 |
-
|
331 |
-
log_update = log + f"🎵 Streaming: {total_duration:.1f}s generated (chunk {chunk_count})\n"
|
332 |
-
|
333 |
-
# Yield streaming audio chunk and keep complete_audio as None during streaming
|
334 |
-
yield (sample_rate, new_audio), None, log_update, gr.update(visible=True)
|
335 |
-
|
336 |
-
# Clear pending chunks after yielding
|
337 |
-
pending_chunks = []
|
338 |
-
last_yield_time = current_time
|
339 |
-
|
340 |
-
# Yield any remaining chunks
|
341 |
-
if pending_chunks:
|
342 |
-
final_new_audio = np.concatenate(pending_chunks)
|
343 |
-
total_duration = sum(len(chunk) for chunk in all_audio_chunks) / sample_rate
|
344 |
-
log_update = log + f"🎵 Streaming final chunk: {total_duration:.1f}s total\n"
|
345 |
-
yield (sample_rate, final_new_audio), None, log_update, gr.update(visible=True)
|
346 |
-
has_yielded_audio = True # Mark that we yielded audio
|
347 |
-
|
348 |
-
# Wait for generation to complete (with timeout to prevent hanging)
|
349 |
-
generation_thread.join(timeout=5.0) # Increased timeout to 5 seconds
|
350 |
-
|
351 |
-
# If thread is still alive after timeout, force end
|
352 |
-
if generation_thread.is_alive():
|
353 |
-
print("Warning: Generation thread did not complete within timeout")
|
354 |
-
audio_streamer.end()
|
355 |
-
generation_thread.join(timeout=5.0)
|
356 |
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
# Debug logging
|
369 |
-
# print(f"Debug: has_received_chunks={has_received_chunks}, chunk_count={chunk_count}, all_audio_chunks length={len(all_audio_chunks)}")
|
370 |
-
|
371 |
-
# Check if we received any chunks but didn't yield audio
|
372 |
-
if has_received_chunks and not has_yielded_audio and all_audio_chunks:
|
373 |
-
# We have chunks but didn't meet the yield criteria, yield them now
|
374 |
-
complete_audio = np.concatenate(all_audio_chunks)
|
375 |
-
final_duration = len(complete_audio) / sample_rate
|
376 |
-
|
377 |
-
final_log = log + f"⏱️ Generation completed in {generation_time:.2f} seconds\n"
|
378 |
-
final_log += f"🎵 Final audio duration: {final_duration:.2f} seconds\n"
|
379 |
-
final_log += f"📊 Total chunks: {chunk_count}\n"
|
380 |
-
final_log += "✨ Generation successful! Complete audio is ready.\n"
|
381 |
-
final_log += "💡 Not satisfied? You can regenerate or adjust the CFG scale for different results."
|
382 |
-
|
383 |
-
# Yield the complete audio
|
384 |
-
yield None, (sample_rate, complete_audio), final_log, gr.update(visible=False)
|
385 |
-
return
|
386 |
-
|
387 |
-
if not has_received_chunks:
|
388 |
-
error_log = log + f"\n❌ Error: No audio chunks were received from the model. Generation time: {generation_time:.2f}s"
|
389 |
-
yield None, None, error_log, gr.update(visible=False)
|
390 |
-
return
|
391 |
-
|
392 |
-
if not has_yielded_audio:
|
393 |
-
error_log = log + f"\n❌ Error: Audio was generated but not streamed. Chunk count: {chunk_count}"
|
394 |
-
yield None, None, error_log, gr.update(visible=False)
|
395 |
-
return
|
396 |
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
-
|
402 |
-
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
-
|
412 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
413 |
|
414 |
-
|
415 |
-
# Handle Gradio-specific errors (like input validation)
|
416 |
-
self.is_generating = False
|
417 |
-
self.current_streamer = None
|
418 |
-
error_msg = f"❌ Input Error: {str(e)}"
|
419 |
-
print(error_msg)
|
420 |
-
yield None, None, error_msg, gr.update(visible=False)
|
421 |
-
|
422 |
-
except Exception as e:
|
423 |
-
self.is_generating = False
|
424 |
-
self.current_streamer = None
|
425 |
-
error_msg = f"❌ An unexpected error occurred: {str(e)}"
|
426 |
-
print(error_msg)
|
427 |
-
import traceback
|
428 |
-
traceback.print_exc()
|
429 |
-
yield None, None, error_msg, gr.update(visible=False)
|
430 |
|
431 |
-
|
432 |
-
def _generate_with_streamer(self, inputs, cfg_scale, audio_streamer):
|
433 |
-
"""Helper method to run generation with streamer in a separate thread."""
|
434 |
-
try:
|
435 |
-
# Check for stop signal before starting generation
|
436 |
-
if self.stop_generation:
|
437 |
-
audio_streamer.end()
|
438 |
-
return
|
439 |
-
|
440 |
-
# Define a stop check function that can be called from generate
|
441 |
-
def check_stop_generation():
|
442 |
-
return self.stop_generation
|
443 |
-
|
444 |
-
outputs = self.model.generate(
|
445 |
-
**inputs,
|
446 |
-
max_new_tokens=None,
|
447 |
-
cfg_scale=cfg_scale,
|
448 |
-
tokenizer=self.processor.tokenizer,
|
449 |
-
generation_config={
|
450 |
-
'do_sample': False,
|
451 |
-
},
|
452 |
-
audio_streamer=audio_streamer,
|
453 |
-
stop_check_fn=check_stop_generation, # Pass the stop check function
|
454 |
-
verbose=False, # Disable verbose in streaming mode
|
455 |
-
refresh_negative=True,
|
456 |
-
)
|
457 |
-
|
458 |
-
except Exception as e:
|
459 |
-
print(f"Error in generation thread: {e}")
|
460 |
-
traceback.print_exc()
|
461 |
-
# Make sure to end the stream on error
|
462 |
-
audio_streamer.end()
|
463 |
-
|
464 |
-
@spaces.GPU
|
465 |
def stop_audio_generation(self):
|
466 |
"""Stop the current audio generation process."""
|
467 |
self.stop_generation = True
|
|
|
31 |
logging.set_verbosity_info()
|
32 |
logger = logging.get_logger(__name__)
|
33 |
|
34 |
+
# import os
|
35 |
+
# os.environ["FLASH_ATTENTION_2"] = "0"
|
36 |
|
37 |
|
38 |
class VibeVoiceDemo:
|
|
|
117 |
print(f"Found {len(self.available_voices)} voice files in {voices_dir}")
|
118 |
print(f"Available voices: {', '.join(self.available_voices.keys())}")
|
119 |
|
|
|
120 |
def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
|
121 |
"""Read and preprocess audio file."""
|
122 |
try:
|
|
|
131 |
return np.array([])
|
132 |
|
133 |
@spaces.GPU
|
134 |
+
def generate_podcast(self, num_speakers: int, script: str,
|
135 |
+
speaker_1: str = None, speaker_2: str = None,
|
136 |
+
speaker_3: str = None, speaker_4: str = None,
|
137 |
+
cfg_scale: float = 1.3):
|
138 |
+
"""Single GPU function for full generation (streaming + final)."""
|
139 |
+
self.stop_generation = False
|
140 |
+
self.is_generating = True
|
141 |
+
|
142 |
+
if not script.strip():
|
143 |
+
raise gr.Error("Please provide a script.")
|
144 |
+
|
145 |
+
if num_speakers < 1 or num_speakers > 4:
|
146 |
+
raise gr.Error("Number of speakers must be 1–4.")
|
147 |
+
|
148 |
+
selected = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
|
149 |
+
for i, sp in enumerate(selected):
|
150 |
+
if not sp or sp not in self.available_voices:
|
151 |
+
raise gr.Error(f"Invalid speaker {i+1} selection.")
|
152 |
+
|
153 |
+
# load voices
|
154 |
+
voice_samples = [self.read_audio(self.available_voices[sp]) for sp in selected]
|
155 |
+
if any(len(v) == 0 for v in voice_samples):
|
156 |
+
raise gr.Error("Failed to load one or more voice samples.")
|
157 |
+
|
158 |
+
# format script
|
159 |
+
lines = script.strip().split("\n")
|
160 |
+
formatted = []
|
161 |
+
for i, line in enumerate(lines):
|
162 |
+
line = line.strip()
|
163 |
+
if not line:
|
164 |
+
continue
|
165 |
+
if line.startswith("Speaker "):
|
166 |
+
formatted.append(line)
|
167 |
+
else:
|
168 |
+
sp_id = i % num_speakers
|
169 |
+
formatted.append(f"Speaker {sp_id}: {line}")
|
170 |
+
formatted_script = "\n".join(formatted)
|
171 |
+
|
172 |
+
# processor input
|
173 |
+
inputs = self.processor(
|
174 |
+
text=[formatted_script],
|
175 |
+
voice_samples=[voice_samples],
|
176 |
+
padding=True,
|
177 |
+
return_tensors="pt"
|
178 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
|
180 |
+
# === direct generation with streamer ===
|
181 |
+
from vibevoice import AudioStreamer, convert_to_16_bit_wav
|
182 |
+
audio_streamer = AudioStreamer(batch_size=1)
|
183 |
+
start = time.time()
|
184 |
+
outputs = self.model.generate(
|
185 |
+
**inputs,
|
186 |
+
cfg_scale=cfg_scale,
|
187 |
+
tokenizer=self.processor.tokenizer,
|
188 |
+
audio_streamer=audio_streamer,
|
189 |
+
verbose=False
|
190 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
|
192 |
+
sample_rate = 24000
|
193 |
+
audio_stream = audio_streamer.get_stream(0)
|
194 |
+
all_chunks, pending = [], []
|
195 |
+
min_chunk_size = sample_rate * 2
|
196 |
+
last_yield = time.time()
|
197 |
+
|
198 |
+
for chunk in audio_stream:
|
199 |
+
if torch.is_tensor(chunk):
|
200 |
+
chunk = chunk.float().cpu().numpy()
|
201 |
+
if chunk.ndim > 1:
|
202 |
+
chunk = chunk.squeeze()
|
203 |
+
chunk16 = convert_to_16_bit_wav(chunk)
|
204 |
+
all_chunks.append(chunk16)
|
205 |
+
pending.append(chunk16)
|
206 |
+
if sum(len(c) for c in pending) >= min_chunk_size or (time.time() - last_yield) > 5:
|
207 |
+
new_audio = np.concatenate(pending)
|
208 |
+
yield (sample_rate, new_audio), None, f"Streaming {len(all_chunks)} chunks..."
|
209 |
+
pending = []
|
210 |
+
last_yield = time.time()
|
211 |
+
|
212 |
+
if all_chunks:
|
213 |
+
complete = np.concatenate(all_chunks)
|
214 |
+
total_dur = len(complete) / sample_rate
|
215 |
+
log = f"✅ Generation complete in {time.time()-start:.1f}s, {total_dur:.1f}s audio"
|
216 |
+
yield None, (sample_rate, complete), log
|
217 |
+
else:
|
218 |
+
yield None, None, "❌ No audio generated."
|
219 |
|
220 |
+
self.is_generating = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
221 |
|
222 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
223 |
def stop_audio_generation(self):
|
224 |
"""Stop the current audio generation process."""
|
225 |
self.stop_generation = True
|