Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -31,8 +31,8 @@ from transformers import set_seed
 logging.set_verbosity_info()
 logger = logging.get_logger(__name__)
 
-import os
-os.environ["FLASH_ATTENTION_2"] = "0"
+# import os
+# os.environ["FLASH_ATTENTION_2"] = "0"
 
 
 class VibeVoiceDemo:
@@ -117,7 +117,6 @@ class VibeVoiceDemo:
         print(f"Found {len(self.available_voices)} voice files in {voices_dir}")
         print(f"Available voices: {', '.join(self.available_voices.keys())}")
 
-    @spaces.GPU
     def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
         """Read and preprocess audio file."""
         try:
@@ -132,336 +131,95 @@ class VibeVoiceDemo:
             return np.array([])
 
     @spaces.GPU
-    def …
-        …
-        raise gr.Error("…
-        …
-        # Load voice samples
-        voice_samples = []
-        for speaker_name in selected_speakers:
-            audio_path = self.available_voices[speaker_name]
-            audio_data = self.read_audio(audio_path)
-            if len(audio_data) == 0:
-                self.is_generating = False
-                raise gr.Error(f"Error: Failed to load audio for {speaker_name}")
-            voice_samples.append(audio_data)
-
-        # log += f"✅ Loaded {len(voice_samples)} voice samples\n"
-
-        # Check for stop signal
-        if self.stop_generation:
-            self.is_generating = False
-            yield None, "🛑 Generation stopped by user", gr.update(visible=False)
-            return
-
-        # Parse script to assign speaker IDs
-        lines = script.strip().split('\n')
-        formatted_script_lines = []
-
-        for line in lines:
-            line = line.strip()
-            if not line:
-                continue
-
-            # Check if line already has speaker format
-            if line.startswith('Speaker ') and ':' in line:
-                formatted_script_lines.append(line)
-            else:
-                # Auto-assign to speakers in rotation
-                speaker_id = len(formatted_script_lines) % num_speakers
-                formatted_script_lines.append(f"Speaker {speaker_id}: {line}")
-
-        formatted_script = '\n'.join(formatted_script_lines)
-        log += f"📝 Formatted script with {len(formatted_script_lines)} turns\n\n"
-        log += "🔄 Processing with VibeVoice (streaming mode)...\n"
-
-        # Check for stop signal before processing
-        if self.stop_generation:
-            self.is_generating = False
-            yield None, "🛑 Generation stopped by user", gr.update(visible=False)
-            return
-
-        start_time = time.time()
-
-        inputs = self.processor(
-            text=[formatted_script],
-            voice_samples=[voice_samples],
-            padding=True,
-            return_tensors="pt",
-            return_attention_mask=True,
-        )
-
-        # Create audio streamer
-        audio_streamer = AudioStreamer(
-            batch_size=1,
-            stop_signal=None,
-            timeout=None
-        )
-
-        # Store current streamer for potential stopping
-        self.current_streamer = audio_streamer
-
-        # Start generation in a separate thread
-        generation_thread = threading.Thread(
-            target=self._generate_with_streamer,
-            args=(inputs, cfg_scale, audio_streamer)
-        )
-        generation_thread.start()
-
-        # Wait for generation to actually start producing audio
-        time.sleep(1)  # Reduced from 3 to 1 second
-
-        # Check for stop signal after thread start
-        if self.stop_generation:
-            audio_streamer.end()
-            generation_thread.join(timeout=5.0)  # Wait up to 5 seconds for thread to finish
-            self.is_generating = False
-            yield None, "🛑 Generation stopped by user", gr.update(visible=False)
-            return
-
-        # Collect audio chunks as they arrive
-        sample_rate = 24000
-        all_audio_chunks = []  # For final statistics
-        pending_chunks = []  # Buffer for accumulating small chunks
-        chunk_count = 0
-        last_yield_time = time.time()
-        min_yield_interval = 15  # Yield every 15 seconds
-        min_chunk_size = sample_rate * 30  # At least 30 seconds of audio
-
-        # Get the stream for the first (and only) sample
-        audio_stream = audio_streamer.get_stream(0)
-
-        has_yielded_audio = False
-        has_received_chunks = False  # Track if we received any chunks at all
-
-        for audio_chunk in audio_stream:
-            # Check for stop signal in the streaming loop
-            if self.stop_generation:
-                audio_streamer.end()
-                break
-
-            chunk_count += 1
-            has_received_chunks = True  # Mark that we received at least one chunk
-
-            # Convert tensor to numpy
-            if torch.is_tensor(audio_chunk):
-                # Convert bfloat16 to float32 first, then to numpy
-                if audio_chunk.dtype == torch.bfloat16:
-                    audio_chunk = audio_chunk.float()
-                audio_np = audio_chunk.cpu().numpy().astype(np.float32)
-            else:
-                audio_np = np.array(audio_chunk, dtype=np.float32)
-
-            # Ensure audio is 1D and properly normalized
-            if len(audio_np.shape) > 1:
-                audio_np = audio_np.squeeze()
-
-            # Convert to 16-bit for Gradio
-            audio_16bit = convert_to_16_bit_wav(audio_np)
-
-            # Store for final statistics
-            all_audio_chunks.append(audio_16bit)
-
-            # Add to pending chunks buffer
-            pending_chunks.append(audio_16bit)
-
-            # Calculate pending audio size
-            pending_audio_size = sum(len(chunk) for chunk in pending_chunks)
-            current_time = time.time()
-            time_since_last_yield = current_time - last_yield_time
-
-            # Decide whether to yield
-            should_yield = False
-            if not has_yielded_audio and pending_audio_size >= min_chunk_size:
-                # First yield: wait for minimum chunk size
-                should_yield = True
-                has_yielded_audio = True
-            elif has_yielded_audio and (pending_audio_size >= min_chunk_size or time_since_last_yield >= min_yield_interval):
-                # Subsequent yields: either enough audio or enough time has passed
-                should_yield = True
-
-            if should_yield and pending_chunks:
-                # Concatenate and yield only the new audio chunks
-                new_audio = np.concatenate(pending_chunks)
-                new_duration = len(new_audio) / sample_rate
-                total_duration = sum(len(chunk) for chunk in all_audio_chunks) / sample_rate
-
-                log_update = log + f"🎵 Streaming: {total_duration:.1f}s generated (chunk {chunk_count})\n"
-
-                # Yield streaming audio chunk and keep complete_audio as None during streaming
-                yield (sample_rate, new_audio), None, log_update, gr.update(visible=True)
-
-                # Clear pending chunks after yielding
-                pending_chunks = []
-                last_yield_time = current_time
-
-        # Yield any remaining chunks
-        if pending_chunks:
-            final_new_audio = np.concatenate(pending_chunks)
-            total_duration = sum(len(chunk) for chunk in all_audio_chunks) / sample_rate
-            log_update = log + f"🎵 Streaming final chunk: {total_duration:.1f}s total\n"
-            yield (sample_rate, final_new_audio), None, log_update, gr.update(visible=True)
-            has_yielded_audio = True  # Mark that we yielded audio
-
-        # Wait for generation to complete (with timeout to prevent hanging)
-        generation_thread.join(timeout=5.0)  # Increased timeout to 5 seconds
-
-        # If thread is still alive after timeout, force end
-        if generation_thread.is_alive():
-            print("Warning: Generation thread did not complete within timeout")
-            audio_streamer.end()
-            generation_thread.join(timeout=5.0)
 
-        …
-        # Debug logging
-        # print(f"Debug: has_received_chunks={has_received_chunks}, chunk_count={chunk_count}, all_audio_chunks length={len(all_audio_chunks)}")
-
-        # Check if we received any chunks but didn't yield audio
-        if has_received_chunks and not has_yielded_audio and all_audio_chunks:
-            # We have chunks but didn't meet the yield criteria, yield them now
-            complete_audio = np.concatenate(all_audio_chunks)
-            final_duration = len(complete_audio) / sample_rate
-
-            final_log = log + f"⏱️ Generation completed in {generation_time:.2f} seconds\n"
-            final_log += f"🎵 Final audio duration: {final_duration:.2f} seconds\n"
-            final_log += f"📊 Total chunks: {chunk_count}\n"
-            final_log += "✨ Generation successful! Complete audio is ready.\n"
-            final_log += "💡 Not satisfied? You can regenerate or adjust the CFG scale for different results."
-
-            # Yield the complete audio
-            yield None, (sample_rate, complete_audio), final_log, gr.update(visible=False)
-            return
-
-        if not has_received_chunks:
-            error_log = log + f"\n❌ Error: No audio chunks were received from the model. Generation time: {generation_time:.2f}s"
-            yield None, None, error_log, gr.update(visible=False)
-            return
-
-        if not has_yielded_audio:
-            error_log = log + f"\n❌ Error: Audio was generated but not streamed. Chunk count: {chunk_count}"
-            yield None, None, error_log, gr.update(visible=False)
-            return
 
-        …
-            # Handle Gradio-specific errors (like input validation)
-            self.is_generating = False
-            self.current_streamer = None
-            error_msg = f"❌ Input Error: {str(e)}"
-            print(error_msg)
-            yield None, None, error_msg, gr.update(visible=False)
-
-        except Exception as e:
-            self.is_generating = False
-            self.current_streamer = None
-            error_msg = f"❌ An unexpected error occurred: {str(e)}"
-            print(error_msg)
-            import traceback
-            traceback.print_exc()
-            yield None, None, error_msg, gr.update(visible=False)
 
-
-    def _generate_with_streamer(self, inputs, cfg_scale, audio_streamer):
-        """Helper method to run generation with streamer in a separate thread."""
-        try:
-            # Check for stop signal before starting generation
-            if self.stop_generation:
-                audio_streamer.end()
-                return
-
-            # Define a stop check function that can be called from generate
-            def check_stop_generation():
-                return self.stop_generation
-
-            outputs = self.model.generate(
-                **inputs,
-                max_new_tokens=None,
-                cfg_scale=cfg_scale,
-                tokenizer=self.processor.tokenizer,
-                generation_config={
-                    'do_sample': False,
-                },
-                audio_streamer=audio_streamer,
-                stop_check_fn=check_stop_generation,  # Pass the stop check function
-                verbose=False,  # Disable verbose in streaming mode
-                refresh_negative=True,
-            )
-
-        except Exception as e:
-            print(f"Error in generation thread: {e}")
-            traceback.print_exc()
-            # Make sure to end the stream on error
-            audio_streamer.end()
-
-    @spaces.GPU
+    def generate_podcast(self, num_speakers: int, script: str,
+                         speaker_1: str = None, speaker_2: str = None,
+                         speaker_3: str = None, speaker_4: str = None,
+                         cfg_scale: float = 1.3):
+        """Single GPU function for full generation (streaming + final)."""
+        self.stop_generation = False
+        self.is_generating = True
+
+        if not script.strip():
+            raise gr.Error("Please provide a script.")
+
+        if num_speakers < 1 or num_speakers > 4:
+            raise gr.Error("Number of speakers must be 1–4.")
+
+        selected = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
+        for i, sp in enumerate(selected):
+            if not sp or sp not in self.available_voices:
+                raise gr.Error(f"Invalid speaker {i+1} selection.")
+
+        # load voices
+        voice_samples = [self.read_audio(self.available_voices[sp]) for sp in selected]
+        if any(len(v) == 0 for v in voice_samples):
+            raise gr.Error("Failed to load one or more voice samples.")
+
+        # format script
+        lines = script.strip().split("\n")
+        formatted = []
+        for i, line in enumerate(lines):
+            line = line.strip()
+            if not line:
+                continue
+            if line.startswith("Speaker "):
+                formatted.append(line)
+            else:
+                sp_id = i % num_speakers
+                formatted.append(f"Speaker {sp_id}: {line}")
+        formatted_script = "\n".join(formatted)
+
+        # processor input
+        inputs = self.processor(
+            text=[formatted_script],
+            voice_samples=[voice_samples],
+            padding=True,
+            return_tensors="pt"
+        )
 
+        # === direct generation with streamer ===
+        from vibevoice import AudioStreamer, convert_to_16_bit_wav
+        audio_streamer = AudioStreamer(batch_size=1)
+        start = time.time()
+        outputs = self.model.generate(
+            **inputs,
+            cfg_scale=cfg_scale,
+            tokenizer=self.processor.tokenizer,
+            audio_streamer=audio_streamer,
+            verbose=False
+        )
 
+        sample_rate = 24000
+        audio_stream = audio_streamer.get_stream(0)
+        all_chunks, pending = [], []
+        min_chunk_size = sample_rate * 2
+        last_yield = time.time()
+
+        for chunk in audio_stream:
+            if torch.is_tensor(chunk):
+                chunk = chunk.float().cpu().numpy()
+            if chunk.ndim > 1:
+                chunk = chunk.squeeze()
+            chunk16 = convert_to_16_bit_wav(chunk)
+            all_chunks.append(chunk16)
+            pending.append(chunk16)
+            if sum(len(c) for c in pending) >= min_chunk_size or (time.time() - last_yield) > 5:
+                new_audio = np.concatenate(pending)
+                yield (sample_rate, new_audio), None, f"Streaming {len(all_chunks)} chunks..."
+                pending = []
+                last_yield = time.time()
+
+        if all_chunks:
+            complete = np.concatenate(all_chunks)
+            total_dur = len(complete) / sample_rate
+            log = f"✅ Generation complete in {time.time()-start:.1f}s, {total_dur:.1f}s audio"
+            yield None, (sample_rate, complete), log
+        else:
+            yield None, None, "❌ No audio generated."
 
+        self.is_generating = False
 
+
     def stop_audio_generation(self):
         """Stop the current audio generation process."""
         self.stop_generation = True
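A few notes on the updated code. First, the new generate_podcast is a generator: every yield is a (streaming_audio, complete_audio, log) triple, with the complete audio delivered in the second slot only at the end. The UI wiring is not part of this diff, so the sketch below is hypothetical: the component names, the fake_generate stub, and the choice of a streaming gr.Audio output are all illustrative assumptions for a recent Gradio version, not the app's actual layout.

import gradio as gr
import numpy as np

def fake_generate(num_speakers, script, speaker_1=None, speaker_2=None):
    """Stub with the same yield shape as the new generate_podcast:
    (streaming_audio, complete_audio, log)."""
    sr = 24000
    tone = (np.sin(2 * np.pi * 440 * np.arange(sr) / sr) * 32767).astype(np.int16)
    yield (sr, tone), None, "Streaming 1 chunks..."
    yield None, (sr, tone), "✅ Generation complete"

with gr.Blocks() as ui:
    num_speakers = gr.Slider(1, 4, value=2, step=1, label="Number of speakers")
    script_box = gr.Textbox(lines=8, label="Script")
    spk1 = gr.Dropdown(choices=["Alice", "Bob"], label="Speaker 1")
    spk2 = gr.Dropdown(choices=["Alice", "Bob"], label="Speaker 2")
    stream_out = gr.Audio(streaming=True, autoplay=True, label="Live preview")
    final_out = gr.Audio(label="Complete audio")
    log_out = gr.Textbox(label="Log")
    gr.Button("Generate").click(
        fn=fake_generate,
        inputs=[num_speakers, script_box, spk1, spk2],
        outputs=[stream_out, final_out, log_out],
    )

if __name__ == "__main__":
    ui.launch()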
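Second, the yield-throttling idea survives the rewrite: both loops buffer raw chunks and flush them once enough samples have accumulated or enough time has passed (30 s of audio / 15 s interval before, 2 s of audio / 5 s interval now). Here is the pattern in isolation, a self-contained sketch with made-up chunk sizes; the trailing flush mirrors the removed version's "yield any remaining chunks" step.

import time
import numpy as np

def throttled_stream(chunks, sample_rate=24000, min_samples=24000 * 2, max_wait=5.0):
    """Re-yield small chunks as larger batches: flush when enough samples
    have accumulated or when max_wait seconds have passed since the last flush."""
    pending = []
    last_flush = time.time()
    for chunk in chunks:
        pending.append(chunk)
        buffered = sum(len(c) for c in pending)
        if buffered >= min_samples or (time.time() - last_flush) > max_wait:
            yield np.concatenate(pending)
            pending = []
            last_flush = time.time()
    if pending:  # flush whatever is left when the source ends
        yield np.concatenate(pending)

# Example: 100 chunks of 0.1 s each come out as roughly 2 s batches.
source = (np.zeros(2400, dtype=np.int16) for _ in range(100))
for batch in throttled_stream(source):
    print(len(batch) / 24000, "seconds")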
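Third, the tensor conversion both versions perform is load-bearing: NumPy has no bfloat16 dtype, so calling .numpy() directly on a bfloat16 tensor raises a TypeError, and the chunk must be upcast to float32 on the CPU first.

import numpy as np
import torch

chunk = torch.randn(24000, dtype=torch.bfloat16)  # stand-in for one streamed audio chunk

# chunk.numpy() would raise "TypeError: Got unsupported ScalarType BFloat16",
# so upcast to float32 on the CPU first, as both versions of the loop do:
audio_np = chunk.float().cpu().numpy().astype(np.float32)
assert audio_np.dtype == np.float32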
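Finally, the largest behavioral difference: the removed code ran model.generate on a background thread (_generate_with_streamer) so the loop could drain the streamer while generation was still running, whereas the new code calls generate synchronously before reading the stream. The threaded producer/consumer shape is sketched below with a plain queue.Queue standing in for AudioStreamer; the names are illustrative, not from the app.

import queue
import threading

def producer(q: queue.Queue) -> None:
    # Stand-in for model.generate(..., audio_streamer=...) on its own thread:
    # push chunks as they become ready, then a sentinel to mark end-of-stream.
    for i in range(5):
        q.put(f"chunk-{i}")
    q.put(None)

q: queue.Queue = queue.Queue()
t = threading.Thread(target=producer, args=(q,))
t.start()

# The consumer (the old streaming loop) runs concurrently with generation,
# so the first audio can reach the UI before the last chunk is produced.
while (item := q.get()) is not None:
    print("received", item)

t.join(timeout=5.0)  # same bounded join the removed code used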