Spaces:

yasserrmd
/

VibeVoice

Running on Zero

App Files Community

yasserrmd committed on Aug 26

Commit

41155d1

verified ·

1 Parent(s): c731fc4

Update app.py

Browse files

Files changed (1) hide show

app.py +87 -329

app.py CHANGED Viewed

@@ -31,8 +31,8 @@ from transformers import set_seed
 logging.set_verbosity_info()
 logger = logging.get_logger(__name__)
-import os
-os.environ["FLASH_ATTENTION_2"] = "0"
 class VibeVoiceDemo:
@@ -117,7 +117,6 @@ class VibeVoiceDemo:
         print(f"Found {len(self.available_voices)} voice files in {voices_dir}")
         print(f"Available voices: {', '.join(self.available_voices.keys())}")
-    @spaces.GPU
     def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
         """Read and preprocess audio file."""
         try:
@@ -132,336 +131,95 @@ class VibeVoiceDemo:
             return np.array([])
     @spaces.GPU
-    def generate_podcast_streaming(self,
-                                 num_speakers: int,
-                                 script: str,
-                                 speaker_1: str = None,
-                                 speaker_2: str = None,
-                                 speaker_3: str = None,
-                                 speaker_4: str = None,
-                                 cfg_scale: float = 1.3) -> Iterator[tuple]:
-        try:
-            # Reset stop flag and set generating state
-            self.stop_generation = False
-            self.is_generating = True
-            # Validate inputs
-            if not script.strip():
-                self.is_generating = False
-                raise gr.Error("Error: Please provide a script.")
-            # Defend against common mistake
-            script = script.replace("’", "'")
-            if num_speakers < 1 or num_speakers > 4:
-                self.is_generating = False
-                raise gr.Error("Error: Number of speakers must be between 1 and 4.")
-            # Collect selected speakers
-            selected_speakers = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
-            # Validate speaker selections
-            for i, speaker in enumerate(selected_speakers):
-                if not speaker or speaker not in self.available_voices:
-                    self.is_generating = False
-                    raise gr.Error(f"Error: Please select a valid speaker for Speaker {i+1}.")
-            # Build initial log
-            log = f"🎙️ Generating podcast with {num_speakers} speakers\n"
-            log += f"📊 Parameters: CFG Scale={cfg_scale}, Inference Steps={self.inference_steps}\n"
-            log += f"🎭 Speakers: {', '.join(selected_speakers)}\n"
-            # Check for stop signal
-            if self.stop_generation:
-                self.is_generating = False
-                yield None, "🛑 Generation stopped by user", gr.update(visible=False)
-                return
-            # Load voice samples
-            voice_samples = []
-            for speaker_name in selected_speakers:
-                audio_path = self.available_voices[speaker_name]
-                audio_data = self.read_audio(audio_path)
-                if len(audio_data) == 0:
-                    self.is_generating = False
-                    raise gr.Error(f"Error: Failed to load audio for {speaker_name}")
-                voice_samples.append(audio_data)
-            # log += f"✅ Loaded {len(voice_samples)} voice samples\n"
-            # Check for stop signal
-            if self.stop_generation:
-                self.is_generating = False
-                yield None, "🛑 Generation stopped by user", gr.update(visible=False)
-                return
-            # Parse script to assign speaker ID's
-            lines = script.strip().split('\n')
-            formatted_script_lines = []
-            for line in lines:
-                line = line.strip()
-                if not line:
-                    continue
-                # Check if line already has speaker format
-                if line.startswith('Speaker ') and ':' in line:
-                    formatted_script_lines.append(line)
-                else:
-                    # Auto-assign to speakers in rotation
-                    speaker_id = len(formatted_script_lines) % num_speakers
-                    formatted_script_lines.append(f"Speaker {speaker_id}: {line}")
-            formatted_script = '\n'.join(formatted_script_lines)
-            log += f"📝 Formatted script with {len(formatted_script_lines)} turns\n\n"
-            log += "🔄 Processing with VibeVoice (streaming mode)...\n"
-            # Check for stop signal before processing
-            if self.stop_generation:
-                self.is_generating = False
-                yield None, "🛑 Generation stopped by user", gr.update(visible=False)
-                return
-            start_time = time.time()
-            inputs = self.processor(
-                text=[formatted_script],
-                voice_samples=[voice_samples],
-                padding=True,
-                return_tensors="pt",
-                return_attention_mask=True,
-            )
-            # Create audio streamer
-            audio_streamer = AudioStreamer(
-                batch_size=1,
-                stop_signal=None,
-                timeout=None
-            )
-            # Store current streamer for potential stopping
-            self.current_streamer = audio_streamer
-            # Start generation in a separate thread
-            generation_thread = threading.Thread(
-                target=self._generate_with_streamer,
-                args=(inputs, cfg_scale, audio_streamer)
-            )
-            generation_thread.start()
-            # Wait for generation to actually start producing audio
-            time.sleep(1)  # Reduced from 3 to 1 second
-            # Check for stop signal after thread start
-            if self.stop_generation:
-                audio_streamer.end()
-                generation_thread.join(timeout=5.0)  # Wait up to 5 seconds for thread to finish
-                self.is_generating = False
-                yield None, "🛑 Generation stopped by user", gr.update(visible=False)
-                return
-            # Collect audio chunks as they arrive
-            sample_rate = 24000
-            all_audio_chunks = []  # For final statistics
-            pending_chunks = []  # Buffer for accumulating small chunks
-            chunk_count = 0
-            last_yield_time = time.time()
-            min_yield_interval = 15 # Yield every 15 seconds
-            min_chunk_size = sample_rate * 30 # At least 2 seconds of audio
-            # Get the stream for the first (and only) sample
-            audio_stream = audio_streamer.get_stream(0)
-            has_yielded_audio = False
-            has_received_chunks = False  # Track if we received any chunks at all
-            for audio_chunk in audio_stream:
-                # Check for stop signal in the streaming loop
-                if self.stop_generation:
-                    audio_streamer.end()
-                    break
-                chunk_count += 1
-                has_received_chunks = True  # Mark that we received at least one chunk
-                # Convert tensor to numpy
-                if torch.is_tensor(audio_chunk):
-                    # Convert bfloat16 to float32 first, then to numpy
-                    if audio_chunk.dtype == torch.bfloat16:
-                        audio_chunk = audio_chunk.float()
-                    audio_np = audio_chunk.cpu().numpy().astype(np.float32)
-                else:
-                    audio_np = np.array(audio_chunk, dtype=np.float32)
-                # Ensure audio is 1D and properly normalized
-                if len(audio_np.shape) > 1:
-                    audio_np = audio_np.squeeze()
-                # Convert to 16-bit for Gradio
-                audio_16bit = convert_to_16_bit_wav(audio_np)
-                # Store for final statistics
-                all_audio_chunks.append(audio_16bit)
-                # Add to pending chunks buffer
-                pending_chunks.append(audio_16bit)
-                # Calculate pending audio size
-                pending_audio_size = sum(len(chunk) for chunk in pending_chunks)
-                current_time = time.time()
-                time_since_last_yield = current_time - last_yield_time
-                # Decide whether to yield
-                should_yield = False
-                if not has_yielded_audio and pending_audio_size >= min_chunk_size:
-                    # First yield: wait for minimum chunk size
-                    should_yield = True
-                    has_yielded_audio = True
-                elif has_yielded_audio and (pending_audio_size >= min_chunk_size or time_since_last_yield >= min_yield_interval):
-                    # Subsequent yields: either enough audio or enough time has passed
-                    should_yield = True
-                if should_yield and pending_chunks:
-                    # Concatenate and yield only the new audio chunks
-                    new_audio = np.concatenate(pending_chunks)
-                    new_duration = len(new_audio) / sample_rate
-                    total_duration = sum(len(chunk) for chunk in all_audio_chunks) / sample_rate
-                    log_update = log + f"🎵 Streaming: {total_duration:.1f}s generated (chunk {chunk_count})\n"
-                    # Yield streaming audio chunk and keep complete_audio as None during streaming
-                    yield (sample_rate, new_audio), None, log_update, gr.update(visible=True)
-                    # Clear pending chunks after yielding
-                    pending_chunks = []
-                    last_yield_time = current_time
-            # Yield any remaining chunks
-            if pending_chunks:
-                final_new_audio = np.concatenate(pending_chunks)
-                total_duration = sum(len(chunk) for chunk in all_audio_chunks) / sample_rate
-                log_update = log + f"🎵 Streaming final chunk: {total_duration:.1f}s total\n"
-                yield (sample_rate, final_new_audio), None, log_update, gr.update(visible=True)
-                has_yielded_audio = True  # Mark that we yielded audio
-            # Wait for generation to complete (with timeout to prevent hanging)
-            generation_thread.join(timeout=5.0)  # Increased timeout to 5 seconds
-            # If thread is still alive after timeout, force end
-            if generation_thread.is_alive():
-                print("Warning: Generation thread did not complete within timeout")
-                audio_streamer.end()
-                generation_thread.join(timeout=5.0)
-            # Clean up
-            self.current_streamer = None
-            self.is_generating = False
-            generation_time = time.time() - start_time
-            # Check if stopped by user
-            if self.stop_generation:
-                yield None, None, "🛑 Generation stopped by user", gr.update(visible=False)
-                return
-            # Debug logging
-            # print(f"Debug: has_received_chunks={has_received_chunks}, chunk_count={chunk_count}, all_audio_chunks length={len(all_audio_chunks)}")
-            # Check if we received any chunks but didn't yield audio
-            if has_received_chunks and not has_yielded_audio and all_audio_chunks:
-                # We have chunks but didn't meet the yield criteria, yield them now
-                complete_audio = np.concatenate(all_audio_chunks)
-                final_duration = len(complete_audio) / sample_rate
-                final_log = log + f"⏱️ Generation completed in {generation_time:.2f} seconds\n"
-                final_log += f"🎵 Final audio duration: {final_duration:.2f} seconds\n"
-                final_log += f"📊 Total chunks: {chunk_count}\n"
-                final_log += "✨ Generation successful! Complete audio is ready.\n"
-                final_log += "💡 Not satisfied? You can regenerate or adjust the CFG scale for different results."
-                # Yield the complete audio
-                yield None, (sample_rate, complete_audio), final_log, gr.update(visible=False)
-                return
-            if not has_received_chunks:
-                error_log = log + f"\n❌ Error: No audio chunks were received from the model. Generation time: {generation_time:.2f}s"
-                yield None, None, error_log, gr.update(visible=False)
-                return
-            if not has_yielded_audio:
-                error_log = log + f"\n❌ Error: Audio was generated but not streamed. Chunk count: {chunk_count}"
-                yield None, None, error_log, gr.update(visible=False)
-                return
-            # Prepare the complete audio
-            if all_audio_chunks:
-                complete_audio = np.concatenate(all_audio_chunks)
-                final_duration = len(complete_audio) / sample_rate
-                final_log = log + f"⏱️ Generation completed in {generation_time:.2f} seconds\n"
-                final_log += f"🎵 Final audio duration: {final_duration:.2f} seconds\n"
-                final_log += f"📊 Total chunks: {chunk_count}\n"
-                final_log += "✨ Generation successful! Complete audio is ready in the 'Complete Audio' tab.\n"
-                final_log += "💡 Not satisfied? You can regenerate or adjust the CFG scale for different results."
-                # Final yield: Clear streaming audio and provide complete audio
-                yield None, (sample_rate, complete_audio), final_log, gr.update(visible=False)
-            else:
-                final_log = log + "❌ No audio was generated."
-                yield None, None, final_log, gr.update(visible=False)
-        except gr.Error as e:
-            # Handle Gradio-specific errors (like input validation)
-            self.is_generating = False
-            self.current_streamer = None
-            error_msg = f"❌ Input Error: {str(e)}"
-            print(error_msg)
-            yield None, None, error_msg, gr.update(visible=False)
-        except Exception as e:
-            self.is_generating = False
-            self.current_streamer = None
-            error_msg = f"❌ An unexpected error occurred: {str(e)}"
-            print(error_msg)
-            import traceback
-            traceback.print_exc()
-            yield None, None, error_msg, gr.update(visible=False)
-    @spaces.GPU
-    def _generate_with_streamer(self, inputs, cfg_scale, audio_streamer):
-        """Helper method to run generation with streamer in a separate thread."""
-        try:
-            # Check for stop signal before starting generation
-            if self.stop_generation:
-                audio_streamer.end()
-                return
-            # Define a stop check function that can be called from generate
-            def check_stop_generation():
-                return self.stop_generation
-            outputs = self.model.generate(
-                **inputs,
-                max_new_tokens=None,
-                cfg_scale=cfg_scale,
-                tokenizer=self.processor.tokenizer,
-                generation_config={
-                    'do_sample': False,
-                },
-                audio_streamer=audio_streamer,
-                stop_check_fn=check_stop_generation,  # Pass the stop check function
-                verbose=False,  # Disable verbose in streaming mode
-                refresh_negative=True,
-            )
-        except Exception as e:
-            print(f"Error in generation thread: {e}")
-            traceback.print_exc()
-            # Make sure to end the stream on error
-            audio_streamer.end()
-    @spaces.GPU
     def stop_audio_generation(self):
         """Stop the current audio generation process."""
         self.stop_generation = True

 logging.set_verbosity_info()
 logger = logging.get_logger(__name__)
+# import os
+# os.environ["FLASH_ATTENTION_2"] = "0"
 class VibeVoiceDemo:
         print(f"Found {len(self.available_voices)} voice files in {voices_dir}")
         print(f"Available voices: {', '.join(self.available_voices.keys())}")
     def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
         """Read and preprocess audio file."""
         try:
             return np.array([])
     @spaces.GPU
+    def generate_podcast(self, num_speakers: int, script: str,
+                         speaker_1: str = None, speaker_2: str = None,
+                         speaker_3: str = None, speaker_4: str = None,
+                         cfg_scale: float = 1.3):
+        """Single GPU function for full generation (streaming + final)."""
+        self.stop_generation = False
+        self.is_generating = True
+        if not script.strip():
+            raise gr.Error("Please provide a script.")
+        if num_speakers < 1 or num_speakers > 4:
+            raise gr.Error("Number of speakers must be 1–4.")
+        selected = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
+        for i, sp in enumerate(selected):
+            if not sp or sp not in self.available_voices:
+                raise gr.Error(f"Invalid speaker {i+1} selection.")
+        # load voices
+        voice_samples = [self.read_audio(self.available_voices[sp]) for sp in selected]
+        if any(len(v) == 0 for v in voice_samples):
+            raise gr.Error("Failed to load one or more voice samples.")
+        # format script
+        lines = script.strip().split("\n")
+        formatted = []
+        for i, line in enumerate(lines):
+            line = line.strip()
+            if not line:
+                continue
+            if line.startswith("Speaker "):
+                formatted.append(line)
+            else:
+                sp_id = i % num_speakers
+                formatted.append(f"Speaker {sp_id}: {line}")
+        formatted_script = "\n".join(formatted)
+        # processor input
+        inputs = self.processor(
+            text=[formatted_script],
+            voice_samples=[voice_samples],
+            padding=True,
+            return_tensors="pt"
+        )
+        # === direct generation with streamer ===
+        from vibevoice import AudioStreamer, convert_to_16_bit_wav
+        audio_streamer = AudioStreamer(batch_size=1)
+        start = time.time()
+        outputs = self.model.generate(
+            **inputs,
+            cfg_scale=cfg_scale,
+            tokenizer=self.processor.tokenizer,
+            audio_streamer=audio_streamer,
+            verbose=False
+        )
+        sample_rate = 24000
+        audio_stream = audio_streamer.get_stream(0)
+        all_chunks, pending = [], []
+        min_chunk_size = sample_rate * 2
+        last_yield = time.time()
+        for chunk in audio_stream:
+            if torch.is_tensor(chunk):
+                chunk = chunk.float().cpu().numpy()
+            if chunk.ndim > 1:
+                chunk = chunk.squeeze()
+            chunk16 = convert_to_16_bit_wav(chunk)
+            all_chunks.append(chunk16)
+            pending.append(chunk16)
+            if sum(len(c) for c in pending) >= min_chunk_size or (time.time() - last_yield) > 5:
+                new_audio = np.concatenate(pending)
+                yield (sample_rate, new_audio), None, f"Streaming {len(all_chunks)} chunks..."
+                pending = []
+                last_yield = time.time()
+        if all_chunks:
+            complete = np.concatenate(all_chunks)
+            total_dur = len(complete) / sample_rate
+            log = f"✅ Generation complete in {time.time()-start:.1f}s, {total_dur:.1f}s audio"
+            yield None, (sample_rate, complete), log
+        else:
+            yield None, None, "❌ No audio generated."
+        self.is_generating = False
     def stop_audio_generation(self):
         """Stop the current audio generation process."""
         self.stop_generation = True