"""Gradio demo that turns a multi-speaker script into podcast audio with VibeVoice."""

# NOTE: the import order below deliberately mirrors the original file so that
# import-time side effects of the third-party packages run in the same sequence.
import argparse
import os
import time
import numpy as np
import gradio as gr
import librosa
import soundfile as sf
import torch
import traceback
from spaces import GPU
from vibevoice.modular.configuration_vibevoice import VibeVoiceConfig
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
from transformers.utils import logging
from transformers import set_seed

logging.set_verbosity_info()
logger = logging.get_logger(__name__)


class VibeVoiceDemo:
    """Holds the loaded VibeVoice model/processor plus the voice and script presets.

    Constructing an instance loads the model immediately, scans the ``voices``
    directory next to this file for reference audio, and reads the example
    scripts from the sibling ``text_examples`` directory.
    """

    def __init__(self, model_path: str, device: str = "cuda", inference_steps: int = 5):
        """Load the model and all presets.

        Args:
            model_path: Path or hub id passed to ``from_pretrained``.
            device: Value forwarded as ``device_map`` when loading the model.
            inference_steps: Number of DDPM inference steps configured on the model.
        """
        self.model_path = model_path
        self.device = device
        self.inference_steps = inference_steps
        # True only while generate_podcast() is executing.
        self.is_generating = False
        self.processor = None
        self.model = None
        # Maps voice name (file stem) -> absolute path of the reference audio.
        self.available_voices = {}
        self.load_model()
        self.setup_voice_presets()
        self.load_example_scripts()

    def load_model(self):
        """Load the processor and the model (bfloat16, eval mode) from ``self.model_path``."""
        print(f"Loading processor & model from {self.model_path}")
        self.processor = VibeVoiceProcessor.from_pretrained(self.model_path)
        self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
            self.model_path,
            torch_dtype=torch.bfloat16,
            device_map=self.device,
        )
        self.model.eval()
        self.model.set_ddpm_inference_steps(num_steps=self.inference_steps)

    def setup_voice_presets(self):
        """Populate ``self.available_voices`` from the ``voices`` directory.

        Every file with a recognised audio extension is registered under its
        file stem. If the directory is missing a warning is printed and the
        mapping is left empty.
        """
        voices_dir = os.path.join(os.path.dirname(__file__), "voices")
        if not os.path.exists(voices_dir):
            print(f"Warning: Voices directory not found at {voices_dir}")
            return

        audio_suffixes = ('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac')
        # os.listdir() returns entries in arbitrary order; sort so the dropdown
        # order and the default speaker selection are the same on every host.
        for file_name in sorted(os.listdir(voices_dir)):
            if not file_name.lower().endswith(audio_suffixes):
                continue
            name = os.path.splitext(file_name)[0]
            self.available_voices[name] = os.path.join(voices_dir, file_name)
        print(f"Voices loaded: {list(self.available_voices.keys())}")

    def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
        """Read an audio file as a mono waveform at ``target_sr``.

        Multi-channel audio is averaged across channels and the signal is
        resampled when the file's sample rate differs from ``target_sr``.

        Returns:
            The waveform, or an empty array if the file could not be read
            (the error is printed rather than raised so callers can report a
            friendly message).
        """
        try:
            wav, sr = sf.read(audio_path)
            if len(wav.shape) > 1:
                # Down-mix (frames, channels) to mono.
                wav = np.mean(wav, axis=1)
            if sr != target_sr:
                wav = librosa.resample(wav, orig_sr=sr, target_sr=target_sr)
            return wav
        except Exception as e:
            # Deliberately best-effort: an unreadable preset must not crash the app.
            print(f"Error reading audio {audio_path}: {e}")
            return np.array([])

    @staticmethod
    def _format_script(script: str, num_speakers: int) -> str:
        """Return ``script`` with every non-empty line carrying a ``Speaker N:`` prefix.

        Lines that already start with ``"Speaker "`` are kept verbatim. Other
        lines are assigned to speakers in rotation, starting at id 0. The
        rotation counts only lines that were actually emitted, so blank
        separator lines in the input do not skew the assignment.
        """
        formatted = []
        for raw_line in script.strip().split("\n"):
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith("Speaker "):
                formatted.append(line)
            else:
                sp_id = len(formatted) % num_speakers
                formatted.append(f"Speaker {sp_id}: {line}")
        return "\n".join(formatted)

    @GPU
    def generate_podcast(self,
                         num_speakers: int,
                         script: str,
                         speaker_1: str = None,
                         speaker_2: str = None,
                         speaker_3: str = None,
                         speaker_4: str = None,
                         cfg_scale: float = 1.3):
        """Final audio generation only (no streaming).

        Args:
            num_speakers: How many of the speaker slots are used (1-4).
            script: The podcast script, one utterance per line.
            speaker_1..speaker_4: Voice names; each used slot must be a key of
                ``self.available_voices``.
            cfg_scale: Value forwarded to ``model.generate`` as ``cfg_scale``.

        Returns:
            ``((sample_rate, int16_audio), log_message)``.

        Raises:
            gr.Error: On an empty script, an out-of-range speaker count, an
                unknown voice, or a voice sample that could not be loaded.
        """
        self.is_generating = True
        try:
            if not script or not script.strip():
                raise gr.Error("Please provide a script.")
            if num_speakers < 1 or num_speakers > 4:
                raise gr.Error("Number of speakers must be 1–4.")

            selected = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
            for i, sp in enumerate(selected):
                if not sp or sp not in self.available_voices:
                    raise gr.Error(f"Invalid speaker {i+1} selection.")

            voice_samples = [self.read_audio(self.available_voices[sp]) for sp in selected]
            # read_audio() signals failure with an empty array.
            if any(len(v) == 0 for v in voice_samples):
                raise gr.Error("Failed to load one or more voice samples.")

            formatted_script = self._format_script(script, num_speakers)

            # processor input
            inputs = self.processor(
                text=[formatted_script],
                voice_samples=[voice_samples],
                padding=True,
                return_tensors="pt",
            )

            start = time.time()
            outputs = self.model.generate(
                **inputs,
                cfg_scale=cfg_scale,
                tokenizer=self.processor.tokenizer,
                verbose=False,
            )

            # Extract audio
            # NOTE(review): assumes generate() yields either a mapping with an
            # "audio" entry or the audio tensor/array itself - confirm against
            # the installed vibevoice version.
            if isinstance(outputs, dict) and "audio" in outputs:
                audio = outputs["audio"]
            else:
                audio = outputs

            if torch.is_tensor(audio):
                # Cast to float32 first; the model is loaded in bfloat16.
                audio = audio.float().cpu().numpy()
            if audio.ndim > 1:
                audio = audio.squeeze()

            sample_rate = 24000
            audio16 = convert_to_16_bit_wav(audio)

            total_dur = len(audio16) / sample_rate
            log = f"✅ Generation complete in {time.time()-start:.1f}s, {total_dur:.1f}s audio"
            return (sample_rate, audio16), log
        finally:
            # Reset on every exit path so a failed request cannot leave the
            # instance permanently marked as busy.
            self.is_generating = False

    def load_example_scripts(self):
        """Load ``text_examples/*.txt`` into ``self.example_scripts``.

        Each non-empty file becomes a ``[1, script_content]`` entry, in sorted
        file-name order. A missing directory leaves the list empty; a file that
        fails to load is reported and skipped.
        """
        examples_dir = os.path.join(os.path.dirname(__file__), "text_examples")
        self.example_scripts = []
        if not os.path.exists(examples_dir):
            return

        txt_files = sorted(f for f in os.listdir(examples_dir) if f.lower().endswith('.txt'))
        for txt_file in txt_files:
            try:
                with open(os.path.join(examples_dir, txt_file), 'r', encoding='utf-8') as f:
                    script_content = f.read().strip()
                if script_content:
                    self.example_scripts.append([1, script_content])
            except Exception as e:
                # Best-effort: one bad example must not prevent start-up.
                print(f"Error loading {txt_file}: {e}")


def convert_to_16_bit_wav(data):
    """Convert a waveform to int16 PCM samples.

    Tensors are moved to CPU and converted to NumPy first. If the peak
    magnitude exceeds 1.0 the signal is scaled down so the peak equals 1.0;
    otherwise it is used as is. The result is ``data * 32767`` cast to int16.
    Empty input yields an empty int16 array instead of raising.
    """
    if torch.is_tensor(data):
        data = data.detach().cpu().numpy()
    data = np.array(data)
    if data.size == 0:
        # np.max() raises ValueError on a zero-size array.
        return data.astype(np.int16)

    peak = np.max(np.abs(data))
    if peak > 1.0:
        data = data / peak
    return (data * 32767).astype(np.int16)


def create_demo_interface(demo_instance: VibeVoiceDemo):
    """Build the Gradio Blocks UI bound to ``demo_instance``.

    Returns:
        The ``gr.Blocks`` interface (not yet launched).
    """
    with gr.Blocks(
        title="VibeVoice - AI Podcast Generator",
        theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple"),
    ) as interface:
        gr.Markdown("## 🎙️ VibeVoice Podcast Generator (Final Audio Only)")

        num_speakers = gr.Slider(1, 4, value=2, step=1, label="Number of Speakers")

        available_speaker_names = list(demo_instance.available_voices.keys())
        default_speakers = available_speaker_names[:4]

        speaker_selections = []
        for i in range(4):
            speaker = gr.Dropdown(
                choices=available_speaker_names,
                value=default_speakers[i] if i < len(default_speakers) else None,
                label=f"Speaker {i+1}",
                # Matches the slider's initial value of 2.
                visible=(i < 2),
            )
            speaker_selections.append(speaker)

        def update_speaker_visibility(count):
            """Show exactly the first ``count`` speaker dropdowns."""
            return [gr.update(visible=(i < int(count))) for i in range(4)]

        # Without this hook the dropdowns for speakers 3 and 4 stay hidden even
        # when the slider asks for them, so those voices could not be chosen.
        num_speakers.change(
            fn=update_speaker_visibility,
            inputs=[num_speakers],
            outputs=speaker_selections,
        )

        cfg_scale = gr.Slider(1.0, 2.0, value=1.3, step=0.05, label="CFG Scale")

        script_input = gr.Textbox(
            label="Podcast Script",
            placeholder="Enter your script here...",
            lines=10,
        )

        generate_btn = gr.Button("🚀 Generate Podcast")

        audio_output = gr.Audio(
            label="Generated Podcast (Download)",
            type="numpy",
            show_download_button=True,
        )
        log_output = gr.Textbox(label="Log", interactive=False, lines=5)

        def generate_podcast_wrapper(speaker_count, script, *speakers_and_params):
            """Adapt the flat Gradio inputs to ``generate_podcast`` and report errors in the log box.

            ``speakers_and_params`` is the four speaker selections followed by
            the CFG scale, in the order wired up in ``generate_btn.click``.
            """
            try:
                speakers = speakers_and_params[:4]
                cfg_value = speakers_and_params[4]
                audio, log = demo_instance.generate_podcast(
                    num_speakers=int(speaker_count),
                    script=script,
                    speaker_1=speakers[0],
                    speaker_2=speakers[1],
                    speaker_3=speakers[2],
                    speaker_4=speakers[3],
                    cfg_scale=cfg_value,
                )
                return audio, log
            except Exception as e:
                # UI boundary: show the failure in the log box, keep the trace on the console.
                traceback.print_exc()
                return None, f"❌ Error: {str(e)}"

        generate_btn.click(
            fn=generate_podcast_wrapper,
            inputs=[num_speakers, script_input] + speaker_selections + [cfg_scale],
            outputs=[audio_output, log_output],
        )

    return interface


def run_demo(
    model_path: str = "microsoft/VibeVoice-1.5B",
    device: str = "cuda",
    inference_steps: int = 5,
    share: bool = True,
):
    """Seed the RNGs, load the model, build the UI and launch it.

    When ``share`` is true the server binds to ``0.0.0.0``; otherwise it binds
    to ``127.0.0.1`` only.
    """
    set_seed(42)
    demo_instance = VibeVoiceDemo(model_path, device, inference_steps)
    interface = create_demo_interface(demo_instance)
    interface.queue().launch(
        share=share,
        server_name="0.0.0.0" if share else "127.0.0.1",
        show_error=True,
        show_api=False,
    )


if __name__ == "__main__":
    run_demo()