Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -73,22 +73,23 @@ class VibeVoiceDemo:
|
|
73 |
cfg_scale: float = 1.3):
|
74 |
"""Final audio generation only (no streaming)."""
|
75 |
self.is_generating = True
|
76 |
-
|
77 |
if not script.strip():
|
78 |
raise gr.Error("Please provide a script.")
|
79 |
-
|
80 |
if num_speakers < 1 or num_speakers > 4:
|
81 |
raise gr.Error("Number of speakers must be 1–4.")
|
82 |
-
|
|
|
83 |
selected = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
|
84 |
for i, sp in enumerate(selected):
|
85 |
if not sp or sp not in self.available_voices:
|
86 |
raise gr.Error(f"Invalid speaker {i+1} selection.")
|
87 |
-
|
88 |
voice_samples = [self.read_audio(self.available_voices[sp]) for sp in selected]
|
89 |
if any(len(v) == 0 for v in voice_samples):
|
90 |
raise gr.Error("Failed to load one or more voice samples.")
|
91 |
-
|
92 |
# format script
|
93 |
lines = script.strip().split("\n")
|
94 |
formatted = []
|
@@ -102,7 +103,7 @@ class VibeVoiceDemo:
|
|
102 |
sp_id = i % num_speakers
|
103 |
formatted.append(f"Speaker {sp_id}: {line}")
|
104 |
formatted_script = "\n".join(formatted)
|
105 |
-
|
106 |
# processor input
|
107 |
inputs = self.processor(
|
108 |
text=[formatted_script],
|
@@ -110,7 +111,7 @@ class VibeVoiceDemo:
|
|
110 |
padding=True,
|
111 |
return_tensors="pt"
|
112 |
)
|
113 |
-
|
114 |
start = time.time()
|
115 |
outputs = self.model.generate(
|
116 |
**inputs,
|
@@ -118,35 +119,47 @@ class VibeVoiceDemo:
|
|
118 |
tokenizer=self.processor.tokenizer,
|
119 |
verbose=False
|
120 |
)
|
121 |
-
|
122 |
-
# --- handle model output ---
|
123 |
if hasattr(outputs, "audio"):
|
124 |
audio = outputs.audio
|
125 |
-
elif hasattr(outputs, "audios"):
|
126 |
audio = outputs.audios[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
else:
|
128 |
-
raise gr.Error("Model did not return audio in expected format.")
|
129 |
-
|
|
|
130 |
if torch.is_tensor(audio):
|
131 |
audio = audio.float().cpu().numpy()
|
132 |
if audio.ndim > 1:
|
133 |
audio = audio.squeeze()
|
134 |
-
|
135 |
sample_rate = 24000
|
136 |
-
|
137 |
-
|
138 |
-
|
|
|
139 |
os.makedirs("outputs", exist_ok=True)
|
|
|
|
|
140 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
141 |
file_path = os.path.join("outputs", f"podcast_{timestamp}.wav")
|
142 |
-
sf.write(file_path,
|
|
|
143 |
print(f"💾 Saved podcast to {file_path}")
|
144 |
-
|
145 |
-
total_dur = len(
|
146 |
log = f"✅ Generation complete in {time.time()-start:.1f}s, {total_dur:.1f}s audio\nSaved to {file_path}"
|
147 |
-
|
148 |
self.is_generating = False
|
149 |
-
return (sample_rate,
|
|
|
150 |
|
151 |
def load_example_scripts(self):
|
152 |
examples_dir = os.path.join(os.path.dirname(__file__), "text_examples")
|
@@ -175,42 +188,112 @@ def convert_to_16_bit_wav(data):
|
|
175 |
|
176 |
|
177 |
def create_demo_interface(demo_instance: VibeVoiceDemo):
|
178 |
-
|
179 |
-
title="VibeVoice - AI Podcast Generator",
|
180 |
-
theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple")
|
181 |
-
) as interface:
|
182 |
-
|
183 |
-
gr.Markdown("## 🎙️ VibeVoice Podcast Generator (Final Audio Only)")
|
184 |
-
|
185 |
-
num_speakers = gr.Slider(1, 4, value=2, step=1, label="Number of Speakers")
|
186 |
-
available_speaker_names = list(demo_instance.available_voices.keys())
|
187 |
-
default_speakers = available_speaker_names[:4]
|
188 |
-
|
189 |
-
speaker_selections = []
|
190 |
-
for i in range(4):
|
191 |
-
speaker = gr.Dropdown(
|
192 |
-
choices=available_speaker_names,
|
193 |
-
value=default_speakers[i] if i < len(default_speakers) else None,
|
194 |
-
label=f"Speaker {i+1}",
|
195 |
-
visible=(i < 2)
|
196 |
-
)
|
197 |
-
speaker_selections.append(speaker)
|
198 |
|
199 |
-
|
|
|
200 |
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
|
|
|
|
|
|
205 |
)
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
)
|
213 |
-
log_output = gr.Textbox(label="Log", interactive=False, lines=5)
|
214 |
|
215 |
def generate_podcast_wrapper(num_speakers, script, *speakers_and_params):
|
216 |
try:
|
@@ -233,10 +316,39 @@ def create_demo_interface(demo_instance: VibeVoiceDemo):
|
|
233 |
generate_btn.click(
|
234 |
fn=generate_podcast_wrapper,
|
235 |
inputs=[num_speakers, script_input] + speaker_selections + [cfg_scale],
|
236 |
-
outputs=[
|
|
|
237 |
)
|
238 |
|
239 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
240 |
|
241 |
|
242 |
def run_demo(
|
|
|
73 |
cfg_scale: float = 1.3):
|
74 |
"""Final audio generation only (no streaming)."""
|
75 |
self.is_generating = True
|
76 |
+
|
77 |
if not script.strip():
|
78 |
raise gr.Error("Please provide a script.")
|
79 |
+
|
80 |
if num_speakers < 1 or num_speakers > 4:
|
81 |
raise gr.Error("Number of speakers must be 1–4.")
|
82 |
+
|
83 |
+
# collect speakers
|
84 |
selected = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
|
85 |
for i, sp in enumerate(selected):
|
86 |
if not sp or sp not in self.available_voices:
|
87 |
raise gr.Error(f"Invalid speaker {i+1} selection.")
|
88 |
+
|
89 |
voice_samples = [self.read_audio(self.available_voices[sp]) for sp in selected]
|
90 |
if any(len(v) == 0 for v in voice_samples):
|
91 |
raise gr.Error("Failed to load one or more voice samples.")
|
92 |
+
|
93 |
# format script
|
94 |
lines = script.strip().split("\n")
|
95 |
formatted = []
|
|
|
103 |
sp_id = i % num_speakers
|
104 |
formatted.append(f"Speaker {sp_id}: {line}")
|
105 |
formatted_script = "\n".join(formatted)
|
106 |
+
|
107 |
# processor input
|
108 |
inputs = self.processor(
|
109 |
text=[formatted_script],
|
|
|
111 |
padding=True,
|
112 |
return_tensors="pt"
|
113 |
)
|
114 |
+
|
115 |
start = time.time()
|
116 |
outputs = self.model.generate(
|
117 |
**inputs,
|
|
|
119 |
tokenizer=self.processor.tokenizer,
|
120 |
verbose=False
|
121 |
)
|
122 |
+
|
123 |
+
# --- handle model output robustly ---
|
124 |
if hasattr(outputs, "audio"):
|
125 |
audio = outputs.audio
|
126 |
+
elif hasattr(outputs, "audios") and outputs.audios:
|
127 |
audio = outputs.audios[0]
|
128 |
+
elif hasattr(outputs, "waveform"):
|
129 |
+
audio = outputs.waveform
|
130 |
+
elif hasattr(outputs, "waveforms") and outputs.waveforms:
|
131 |
+
audio = outputs.waveforms[0]
|
132 |
+
elif hasattr(outputs, "speech_outputs") and outputs.speech_outputs:
|
133 |
+
audio = outputs.speech_outputs[0]
|
134 |
else:
|
135 |
+
raise gr.Error(f"Model did not return audio in expected format. Got attributes: {dir(outputs)}")
|
136 |
+
|
137 |
+
# convert to numpy
|
138 |
if torch.is_tensor(audio):
|
139 |
audio = audio.float().cpu().numpy()
|
140 |
if audio.ndim > 1:
|
141 |
audio = audio.squeeze()
|
142 |
+
|
143 |
sample_rate = 24000
|
144 |
+
# ensure float32 for saving and returning
|
145 |
+
audio = audio.astype("float32")
|
146 |
+
|
147 |
+
# save automatically to disk
|
148 |
os.makedirs("outputs", exist_ok=True)
|
149 |
+
from datetime import datetime
|
150 |
+
import soundfile as sf
|
151 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
152 |
file_path = os.path.join("outputs", f"podcast_{timestamp}.wav")
|
153 |
+
sf.write(file_path, audio, sample_rate) # soundfile handles float32
|
154 |
+
|
155 |
print(f"💾 Saved podcast to {file_path}")
|
156 |
+
|
157 |
+
total_dur = len(audio) / sample_rate
|
158 |
log = f"✅ Generation complete in {time.time()-start:.1f}s, {total_dur:.1f}s audio\nSaved to {file_path}"
|
159 |
+
|
160 |
self.is_generating = False
|
161 |
+
return (sample_rate, audio), log
|
162 |
+
|
163 |
|
164 |
def load_example_scripts(self):
|
165 |
examples_dir = os.path.join(os.path.dirname(__file__), "text_examples")
|
|
|
188 |
|
189 |
|
190 |
def create_demo_interface(demo_instance: VibeVoiceDemo):
|
191 |
+
"""Create the Gradio interface (final audio only, no streaming)."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
192 |
|
193 |
+
# Custom CSS for high-end aesthetics
|
194 |
+
custom_css = """ ... """ # (keep your CSS unchanged)
|
195 |
|
196 |
+
with gr.Blocks(
|
197 |
+
title="VibeVoice - AI Podcast Generator",
|
198 |
+
css=custom_css,
|
199 |
+
theme=gr.themes.Soft(
|
200 |
+
primary_hue="blue",
|
201 |
+
secondary_hue="purple",
|
202 |
+
neutral_hue="slate",
|
203 |
)
|
204 |
+
) as interface:
|
205 |
+
|
206 |
+
# Header
|
207 |
+
gr.HTML("""
|
208 |
+
<div class="main-header">
|
209 |
+
<h1>🎙️ Vibe Podcasting</h1>
|
210 |
+
<p>Generating Long-form Multi-speaker AI Podcast with VibeVoice</p>
|
211 |
+
</div>
|
212 |
+
""")
|
213 |
+
|
214 |
+
with gr.Row():
|
215 |
+
# Left column - Settings
|
216 |
+
with gr.Column(scale=1, elem_classes="settings-card"):
|
217 |
+
gr.Markdown("### ποΈ **Podcast Settings**")
|
218 |
+
|
219 |
+
num_speakers = gr.Slider(
|
220 |
+
minimum=1, maximum=4, value=2, step=1,
|
221 |
+
label="Number of Speakers",
|
222 |
+
elem_classes="slider-container"
|
223 |
+
)
|
224 |
+
|
225 |
+
gr.Markdown("### π **Speaker Selection**")
|
226 |
+
available_speaker_names = list(demo_instance.available_voices.keys())
|
227 |
+
default_speakers = ['en-Alice_woman', 'en-Carter_man', 'en-Frank_man', 'en-Maya_woman']
|
228 |
+
|
229 |
+
speaker_selections = []
|
230 |
+
for i in range(4):
|
231 |
+
default_value = default_speakers[i] if i < len(default_speakers) else None
|
232 |
+
speaker = gr.Dropdown(
|
233 |
+
choices=available_speaker_names,
|
234 |
+
value=default_value,
|
235 |
+
label=f"Speaker {i+1}",
|
236 |
+
visible=(i < 2),
|
237 |
+
elem_classes="speaker-item"
|
238 |
+
)
|
239 |
+
speaker_selections.append(speaker)
|
240 |
+
|
241 |
+
gr.Markdown("### ⚙️ **Advanced Settings**")
|
242 |
+
with gr.Accordion("Generation Parameters", open=False):
|
243 |
+
cfg_scale = gr.Slider(
|
244 |
+
minimum=1.0, maximum=2.0, value=1.3, step=0.05,
|
245 |
+
label="CFG Scale (Guidance Strength)",
|
246 |
+
elem_classes="slider-container"
|
247 |
+
)
|
248 |
+
|
249 |
+
# Right column - Generation
|
250 |
+
with gr.Column(scale=2, elem_classes="generation-card"):
|
251 |
+
gr.Markdown("### π **Script Input**")
|
252 |
+
script_input = gr.Textbox(
|
253 |
+
label="Conversation Script",
|
254 |
+
placeholder="Enter your podcast script here...",
|
255 |
+
lines=12,
|
256 |
+
max_lines=20,
|
257 |
+
elem_classes="script-input"
|
258 |
+
)
|
259 |
+
|
260 |
+
with gr.Row():
|
261 |
+
random_example_btn = gr.Button(
|
262 |
+
"🎲 Random Example", size="lg",
|
263 |
+
variant="secondary", elem_classes="random-btn", scale=1
|
264 |
+
)
|
265 |
+
generate_btn = gr.Button(
|
266 |
+
"π Generate Podcast", size="lg",
|
267 |
+
variant="primary", elem_classes="generate-btn", scale=2
|
268 |
+
)
|
269 |
+
|
270 |
+
# Output section
|
271 |
+
gr.Markdown("### 🎵 **Generated Podcast**")
|
272 |
+
complete_audio_output = gr.Audio(
|
273 |
+
label="Complete Podcast (Download)",
|
274 |
+
type="numpy",
|
275 |
+
elem_classes="audio-output complete-audio-section",
|
276 |
+
autoplay=False,
|
277 |
+
show_download_button=True,
|
278 |
+
visible=True
|
279 |
+
)
|
280 |
+
|
281 |
+
log_output = gr.Textbox(
|
282 |
+
label="Generation Log",
|
283 |
+
lines=8, max_lines=15,
|
284 |
+
interactive=False,
|
285 |
+
elem_classes="log-output"
|
286 |
+
)
|
287 |
+
|
288 |
+
# === logic ===
|
289 |
+
def update_speaker_visibility(num_speakers):
|
290 |
+
return [gr.update(visible=(i < num_speakers)) for i in range(4)]
|
291 |
+
|
292 |
+
num_speakers.change(
|
293 |
+
fn=update_speaker_visibility,
|
294 |
+
inputs=[num_speakers],
|
295 |
+
outputs=speaker_selections
|
296 |
)
|
|
|
297 |
|
298 |
def generate_podcast_wrapper(num_speakers, script, *speakers_and_params):
|
299 |
try:
|
|
|
316 |
generate_btn.click(
|
317 |
fn=generate_podcast_wrapper,
|
318 |
inputs=[num_speakers, script_input] + speaker_selections + [cfg_scale],
|
319 |
+
outputs=[complete_audio_output, log_output],
|
320 |
+
queue=True
|
321 |
)
|
322 |
|
323 |
+
def load_random_example():
|
324 |
+
import random
|
325 |
+
examples = getattr(demo_instance, "example_scripts", [])
|
326 |
+
if not examples:
|
327 |
+
examples = [
|
328 |
+
[2, "Speaker 0: Welcome to our AI podcast demo!\nSpeaker 1: Thanks, excited to be here!"]
|
329 |
+
]
|
330 |
+
num_speakers_value, script_value = random.choice(examples)
|
331 |
+
return num_speakers_value, script_value
|
332 |
+
|
333 |
+
random_example_btn.click(
|
334 |
+
fn=load_random_example,
|
335 |
+
inputs=[],
|
336 |
+
outputs=[num_speakers, script_input],
|
337 |
+
queue=False
|
338 |
+
)
|
339 |
+
|
340 |
+
gr.Markdown("### π **Example Scripts**")
|
341 |
+
examples = getattr(demo_instance, "example_scripts", []) or [
|
342 |
+
[1, "Speaker 1: Welcome to our AI podcast demo. This is a sample script."]
|
343 |
+
]
|
344 |
+
gr.Examples(
|
345 |
+
examples=examples,
|
346 |
+
inputs=[num_speakers, script_input],
|
347 |
+
label="Try these example scripts:"
|
348 |
+
)
|
349 |
+
|
350 |
+
return interface
|
351 |
+
|
352 |
|
353 |
|
354 |
def run_demo(
|