Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,29 @@
|
|
| 1 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import spaces
|
| 3 |
import torch
|
| 4 |
import torchaudio
|
|
@@ -8,12 +33,20 @@ from os import getenv
|
|
| 8 |
from zonos.model import Zonos
|
| 9 |
from zonos.conditioning import make_cond_dict, supported_language_codes
|
| 10 |
|
| 11 |
-
device = "cuda"
|
| 12 |
MODEL_NAMES = ["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"]
|
| 13 |
-
MODELS = {
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
def update_ui(model_choice):
|
| 19 |
"""
|
|
@@ -359,8 +392,8 @@ def build_interface():
|
|
| 359 |
with gr.Row():
|
| 360 |
with gr.Column(scale=2):
|
| 361 |
model_choice = gr.Dropdown(
|
| 362 |
-
choices=
|
| 363 |
-
value=
|
| 364 |
label="Zonos Model Type",
|
| 365 |
info="Select the model variant to use.",
|
| 366 |
)
|
|
@@ -378,7 +411,7 @@ def build_interface():
|
|
| 378 |
)
|
| 379 |
with gr.Column(scale=1):
|
| 380 |
prefix_audio = gr.Audio(
|
| 381 |
-
value="assets/silence_100ms.wav",
|
| 382 |
label="Optional Prefix Audio (continue from this audio)",
|
| 383 |
type="filepath",
|
| 384 |
)
|
|
@@ -460,90 +493,91 @@ def build_interface():
|
|
| 460 |
generate_button = gr.Button("Generate Audio", elem_classes="generate-button")
|
| 461 |
output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True, elem_classes="audio-output")
|
| 462 |
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
|
|
|
| 547 |
|
| 548 |
return demo
|
| 549 |
|
|
|
|
| 1 |
import os
|
| 2 |
+
import sys
|
| 3 |
+
import subprocess
|
| 4 |
+
|
| 5 |
+
# Emergency flash-attn installation if not found.
#
# If `flash_attn` cannot be imported, install it with pip and re-exec this
# script so the fresh interpreter picks up the new package. An environment
# variable marks that an install was already attempted, so a package that
# installs but still fails to import cannot cause an endless restart loop.
try:
    import flash_attn
except ImportError:
    if os.environ.get("FLASH_ATTN_INSTALL_ATTEMPTED") == "1":
        # We already installed and restarted once; do not loop. Later imports
        # decide whether the missing package is fatal.
        print("flash_attn still not importable after installation, continuing without it.")
    else:
        print("flash_attn not found, attempting to install...")
        try:
            # Try installing pre-built wheel first (fastest)
            subprocess.run([
                sys.executable, "-m", "pip", "install",
                "https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.2.post1/flash_attn-2.7.2.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"
            ], check=True)
        except (subprocess.CalledProcessError, OSError) as wheel_error:
            # Only pip/process failures trigger the fallback; Ctrl-C and
            # SystemExit propagate instead of being swallowed.
            print(f"Pre-built flash_attn wheel failed to install: {wheel_error}")
            # Fallback: install without CUDA build (slower but more compatible)
            # NOTE(review): confirm the package built this way is importable
            # on the target runtime.
            env = os.environ.copy()
            env["FLASH_ATTENTION_SKIP_CUDA_BUILD"] = "TRUE"
            subprocess.run([
                sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"
            ], env=env, check=True)

        # Restart the script after installation. os.execv reuses the current
        # process environment, so the marker below is visible after restart.
        os.environ["FLASH_ATTN_INSTALL_ATTEMPTED"] = "1"
        os.execv(sys.executable, [sys.executable] + sys.argv)
|
| 26 |
+
|
| 27 |
import spaces
|
| 28 |
import torch
|
| 29 |
import torchaudio
|
|
|
|
| 33 |
from zonos.model import Zonos
|
| 34 |
from zonos.conditioning import make_cond_dict, supported_language_codes
|
| 35 |
|
| 36 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 37 |
MODEL_NAMES = ["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"]
|
| 38 |
+
# Registry of successfully loaded models, keyed by their name in MODEL_NAMES.
MODELS = {}

# Load models with error handling: try every entry in MODEL_NAMES, keep the
# ones that load, and fail only when none of them could be loaded.
_last_load_error = None
for name in MODEL_NAMES:
    try:
        _model = Zonos.from_pretrained(name, device=device)
        # Disable gradients and switch to eval mode before registering.
        _model.requires_grad_(False).eval()
    except Exception as e:
        print(f"Failed to load model {name}: {e}")
        _last_load_error = e
    else:
        # Register only fully configured models so MODELS never holds a
        # half-initialised entry.
        MODELS[name] = _model
        print(f"Successfully loaded model: {name}")

# Checked after the loop, so one failing model does not stop the others
# from being tried.
if not MODELS and _last_load_error is not None:  # If no models loaded at all
    raise RuntimeError("Failed to load any Zonos model") from _last_load_error
|
| 50 |
|
| 51 |
def update_ui(model_choice):
|
| 52 |
"""
|
|
|
|
| 392 |
with gr.Row():
|
| 393 |
with gr.Column(scale=2):
|
| 394 |
model_choice = gr.Dropdown(
|
| 395 |
+
choices=list(MODELS.keys()),
|
| 396 |
+
value=list(MODELS.keys())[0] if MODELS else None,
|
| 397 |
label="Zonos Model Type",
|
| 398 |
info="Select the model variant to use.",
|
| 399 |
)
|
|
|
|
| 411 |
)
|
| 412 |
with gr.Column(scale=1):
|
| 413 |
prefix_audio = gr.Audio(
|
| 414 |
+
value="assets/silence_100ms.wav" if os.path.exists("assets/silence_100ms.wav") else None,
|
| 415 |
label="Optional Prefix Audio (continue from this audio)",
|
| 416 |
type="filepath",
|
| 417 |
)
|
|
|
|
| 493 |
generate_button = gr.Button("Generate Audio", elem_classes="generate-button")
|
| 494 |
output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True, elem_classes="audio-output")
|
| 495 |
|
| 496 |
+
if MODELS: # Only set up callbacks if models loaded successfully
|
| 497 |
+
model_choice.change(
|
| 498 |
+
fn=update_ui,
|
| 499 |
+
inputs=[model_choice],
|
| 500 |
+
outputs=[
|
| 501 |
+
text,
|
| 502 |
+
language,
|
| 503 |
+
speaker_audio,
|
| 504 |
+
prefix_audio,
|
| 505 |
+
emotion1,
|
| 506 |
+
emotion2,
|
| 507 |
+
emotion3,
|
| 508 |
+
emotion4,
|
| 509 |
+
emotion5,
|
| 510 |
+
emotion6,
|
| 511 |
+
emotion7,
|
| 512 |
+
emotion8,
|
| 513 |
+
vq_single_slider,
|
| 514 |
+
fmax_slider,
|
| 515 |
+
pitch_std_slider,
|
| 516 |
+
speaking_rate_slider,
|
| 517 |
+
dnsmos_slider,
|
| 518 |
+
speaker_noised_checkbox,
|
| 519 |
+
unconditional_keys,
|
| 520 |
+
],
|
| 521 |
+
)
|
| 522 |
+
|
| 523 |
+
# On page load, trigger the same UI refresh
|
| 524 |
+
demo.load(
|
| 525 |
+
fn=update_ui,
|
| 526 |
+
inputs=[model_choice],
|
| 527 |
+
outputs=[
|
| 528 |
+
text,
|
| 529 |
+
language,
|
| 530 |
+
speaker_audio,
|
| 531 |
+
prefix_audio,
|
| 532 |
+
emotion1,
|
| 533 |
+
emotion2,
|
| 534 |
+
emotion3,
|
| 535 |
+
emotion4,
|
| 536 |
+
emotion5,
|
| 537 |
+
emotion6,
|
| 538 |
+
emotion7,
|
| 539 |
+
emotion8,
|
| 540 |
+
vq_single_slider,
|
| 541 |
+
fmax_slider,
|
| 542 |
+
pitch_std_slider,
|
| 543 |
+
speaking_rate_slider,
|
| 544 |
+
dnsmos_slider,
|
| 545 |
+
speaker_noised_checkbox,
|
| 546 |
+
unconditional_keys,
|
| 547 |
+
],
|
| 548 |
+
)
|
| 549 |
+
|
| 550 |
+
# Generate audio on button click
|
| 551 |
+
generate_button.click(
|
| 552 |
+
fn=generate_audio,
|
| 553 |
+
inputs=[
|
| 554 |
+
model_choice,
|
| 555 |
+
text,
|
| 556 |
+
language,
|
| 557 |
+
speaker_audio,
|
| 558 |
+
prefix_audio,
|
| 559 |
+
emotion1,
|
| 560 |
+
emotion2,
|
| 561 |
+
emotion3,
|
| 562 |
+
emotion4,
|
| 563 |
+
emotion5,
|
| 564 |
+
emotion6,
|
| 565 |
+
emotion7,
|
| 566 |
+
emotion8,
|
| 567 |
+
vq_single_slider,
|
| 568 |
+
fmax_slider,
|
| 569 |
+
pitch_std_slider,
|
| 570 |
+
speaking_rate_slider,
|
| 571 |
+
dnsmos_slider,
|
| 572 |
+
speaker_noised_checkbox,
|
| 573 |
+
cfg_scale_slider,
|
| 574 |
+
min_p_slider,
|
| 575 |
+
seed_number,
|
| 576 |
+
randomize_seed_toggle,
|
| 577 |
+
unconditional_keys,
|
| 578 |
+
],
|
| 579 |
+
outputs=[output_audio, seed_number],
|
| 580 |
+
)
|
| 581 |
|
| 582 |
return demo
|
| 583 |
|