Steveeeeeeen committed (verified)
Commit 1be704d · 1 Parent(s): 1611a5c

Update app.py

Files changed (1)
  1. app.py +328 -95
app.py CHANGED
@@ -1,139 +1,372 @@
 import torch
 import torchaudio
 import gradio as gr
-import spaces

 from zonos.model import Zonos
 from zonos.conditioning import make_cond_dict, supported_language_codes

-# We'll keep a global dictionary of loaded models to avoid reloading
-MODELS_CACHE = {}
 device = "cuda"

-banner_url = "https://huggingface.co/datasets/Steveeeeeeen/random_images/resolve/main/ZonosHeader.png"
-BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 150px; max-width: 300px;"> </div>'

-def load_model(model_name: str):
-    """
-    Loads or retrieves a cached Zonos model, sets it to eval and bfloat16.
     """
-    global MODELS_CACHE
-    if model_name not in MODELS_CACHE:
-        print(f"Loading model: {model_name}")
-        model = Zonos.from_pretrained(model_name, device=device)
-        model = model.requires_grad_(False).eval()
-        model.bfloat16()  # optional if GPU supports bfloat16
-        MODELS_CACHE[model_name] = model
-        print(f"Model loaded successfully: {model_name}")
-    return MODELS_CACHE[model_name]
-
-@spaces.GPU(duration=90)
-def tts(text, speaker_audio, selected_language, model_choice):
     """
-    text: str (Text prompt to synthesize)
-    speaker_audio: (sample_rate, numpy_array) from Gradio if type="numpy"
-    selected_language: str (language code)
-    model_choice: str (which Zonos model to use, e.g., "Zyphra/Zonos-v0.1-hybrid")

-    Returns (sr_out, wav_out_numpy).
-    """
-    model = load_model(model_choice)

-    if not text:
-        return None

-    # If the user did not provide a reference audio, skip
-    if speaker_audio is None:
-        return None

-    # Gradio gives audio in (sample_rate, numpy_array) format
-    sr, wav_np = speaker_audio

-    # Convert to Torch tensor
-    wav_tensor = torch.from_numpy(wav_np).float()

-    # If stereo (shape [channels, samples]) or multi-channel, downmix to mono
-    # e.g. shape (2, samples) -> shape (samples,) by averaging
-    if wav_tensor.ndim == 2 and wav_tensor.shape[0] > 1:
-        wav_tensor = wav_tensor.mean(dim=0)  # shape => (samples,)

-    # Now add a batch dimension => shape (1, samples)
-    wav_tensor = wav_tensor.unsqueeze(0)

-    # Get speaker embedding
-    with torch.no_grad():
-        spk_embedding = model.make_speaker_embedding(wav_tensor, sr)
-        spk_embedding = spk_embedding.to(device, dtype=torch.bfloat16)

-    # Prepare conditioning dictionary
     cond_dict = make_cond_dict(
         text=text,
-        speaker=spk_embedding,
-        language=selected_language,
         device=device,
     )
-    conditioning = model.prepare_conditioning(cond_dict)

-    # Generate codes
-    with torch.no_grad():
-        codes = model.generate(conditioning)

-    # Decode the codes into raw audio
-    wav_out = model.autoencoder.decode(codes).cpu().detach().squeeze()
-    sr_out = model.autoencoder.sampling_rate

-    return (sr_out, wav_out.numpy())

-def build_demo():
-    with gr.Blocks(theme='davehornik/Tealy') as demo:
-        gr.HTML(BANNER, elem_id="banner")
-        gr.Markdown("## Zonos-v0.1 TTS Demo")
-        gr.Markdown(
-            """
-            > **Zero-shot TTS with Voice Cloning**: Input text and a 10–30 second speaker sample to generate high-quality text-to-speech output.

-            > **Audio Prefix Inputs**: Enhance speaker matching by adding an audio prefix to the text, enabling behaviors like whispering that are hard to achieve with voice cloning alone.

-            > **Multilingual Support**: Supports English, Japanese, Chinese, French, and German.
-            """
-        )
         with gr.Row():
-            text_input = gr.Textbox(
-                label="Text Prompt",
-                value="Hello from Zonos!",
-                lines=3
             )
-            ref_audio_input = gr.Audio(
-                label="Reference Audio (Speaker Cloning)",
-                type="numpy"
-                # Optionally add mono=True if you want Gradio to always downmix automatically:
-                # mono=True
             )

-        model_dropdown = gr.Dropdown(
-            label="Model Choice",
-            choices=["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"],
-            value="Zyphra/Zonos-v0.1-hybrid",
-            interactive=True,
-        )
-        language_dropdown = gr.Dropdown(
-            label="Language Code",
-            choices=["en-us", "ja", "cmn", "fr-fr", "de"],
-            value="en-us",
-            interactive=True,
         )

-        generate_button = gr.Button("Generate")
-        audio_output = gr.Audio(label="Synthesized Output", type="numpy")

         generate_button.click(
-            fn=tts,
-            inputs=[text_input, ref_audio_input, language_dropdown, model_dropdown],
-            outputs=audio_output,
         )

     return demo

 if __name__ == "__main__":
-    demo_app = build_demo()
-    demo_app.launch(server_name="0.0.0.0", server_port=7860, share=True)
 import torch
 import torchaudio
 import gradio as gr
+from os import getenv

 from zonos.model import Zonos
 from zonos.conditioning import make_cond_dict, supported_language_codes

 device = "cuda"
+CURRENT_MODEL_TYPE = None
+CURRENT_MODEL = None


+def load_model_if_needed(model_choice: str):
+    global CURRENT_MODEL_TYPE, CURRENT_MODEL
+    if CURRENT_MODEL_TYPE != model_choice:
+        if CURRENT_MODEL is not None:
+            del CURRENT_MODEL
+            torch.cuda.empty_cache()
+        print(f"Loading {model_choice} model...")
+        CURRENT_MODEL = Zonos.from_pretrained(model_choice, device=device)
+        CURRENT_MODEL.requires_grad_(False).eval()
+        CURRENT_MODEL_TYPE = model_choice
+        print(f"{model_choice} model loaded successfully!")
+    return CURRENT_MODEL
+
+
+def update_ui(model_choice):
     """
+    Dynamically show/hide UI elements based on the model's conditioners.
+    We do NOT display 'language_id' or 'ctc_loss' even if they exist in the model.
     """
+    model = load_model_if_needed(model_choice)
+    cond_names = [c.name for c in model.prefix_conditioner.conditioners]
+    print("Conditioners in this model:", cond_names)

+    text_update = gr.update(visible=("espeak" in cond_names))
+    language_update = gr.update(visible=("espeak" in cond_names))
+    speaker_audio_update = gr.update(visible=("speaker" in cond_names))
+    prefix_audio_update = gr.update(visible=True)
+    emotion1_update = gr.update(visible=("emotion" in cond_names))
+    emotion2_update = gr.update(visible=("emotion" in cond_names))
+    emotion3_update = gr.update(visible=("emotion" in cond_names))
+    emotion4_update = gr.update(visible=("emotion" in cond_names))
+    emotion5_update = gr.update(visible=("emotion" in cond_names))
+    emotion6_update = gr.update(visible=("emotion" in cond_names))
+    emotion7_update = gr.update(visible=("emotion" in cond_names))
+    emotion8_update = gr.update(visible=("emotion" in cond_names))
+    vq_single_slider_update = gr.update(visible=("vqscore_8" in cond_names))
+    fmax_slider_update = gr.update(visible=("fmax" in cond_names))
+    pitch_std_slider_update = gr.update(visible=("pitch_std" in cond_names))
+    speaking_rate_slider_update = gr.update(visible=("speaking_rate" in cond_names))
+    dnsmos_slider_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
+    speaker_noised_checkbox_update = gr.update(visible=("speaker_noised" in cond_names))
+    unconditional_keys_update = gr.update(
+        choices=[name for name in cond_names if name not in ("espeak", "language_id")]
+    )

+    return (
+        text_update,
+        language_update,
+        speaker_audio_update,
+        prefix_audio_update,
+        emotion1_update,
+        emotion2_update,
+        emotion3_update,
+        emotion4_update,
+        emotion5_update,
+        emotion6_update,
+        emotion7_update,
+        emotion8_update,
+        vq_single_slider_update,
+        fmax_slider_update,
+        pitch_std_slider_update,
+        speaking_rate_slider_update,
+        dnsmos_slider_update,
+        speaker_noised_checkbox_update,
+        unconditional_keys_update,
+    )


+def generate_audio(
+    model_choice,
+    text,
+    language,
+    speaker_audio,
+    prefix_audio,
+    e1,
+    e2,
+    e3,
+    e4,
+    e5,
+    e6,
+    e7,
+    e8,
+    vq_single,
+    fmax,
+    pitch_std,
+    speaking_rate,
+    dnsmos_ovrl,
+    speaker_noised,
+    cfg_scale,
+    min_p,
+    seed,
+    randomize_seed,
+    unconditional_keys,
+    progress=gr.Progress(),
+):
+    """
+    Generates audio based on the provided UI parameters.
+    We do NOT use language_id or ctc_loss even if the model has them.
+    """
+    selected_model = load_model_if_needed(model_choice)
+
+    speaker_noised_bool = bool(speaker_noised)
+    fmax = float(fmax)
+    pitch_std = float(pitch_std)
+    speaking_rate = float(speaking_rate)
+    dnsmos_ovrl = float(dnsmos_ovrl)
+    cfg_scale = float(cfg_scale)
+    min_p = float(min_p)
+    seed = int(seed)
+    max_new_tokens = 86 * 30

+    if randomize_seed:
+        seed = torch.randint(0, 2**32 - 1, (1,)).item()
+    torch.manual_seed(seed)

+    speaker_embedding = None
+    if speaker_audio is not None and "speaker" not in unconditional_keys:
+        wav, sr = torchaudio.load(speaker_audio)
+        speaker_embedding = selected_model.make_speaker_embedding(wav, sr)
+        speaker_embedding = speaker_embedding.to(device, dtype=torch.bfloat16)

+    audio_prefix_codes = None
+    if prefix_audio is not None:
+        wav_prefix, sr_prefix = torchaudio.load(prefix_audio)
+        wav_prefix = wav_prefix.mean(0, keepdim=True)
+        wav_prefix = torchaudio.functional.resample(wav_prefix, sr_prefix, selected_model.autoencoder.sampling_rate)
+        wav_prefix = wav_prefix.to(device, dtype=torch.float32)
+        with torch.autocast(device, dtype=torch.float32):
+            audio_prefix_codes = selected_model.autoencoder.encode(wav_prefix.unsqueeze(0))

+    emotion_tensor = torch.tensor(list(map(float, [e1, e2, e3, e4, e5, e6, e7, e8])), device=device)
+
+    vq_val = float(vq_single)
+    vq_tensor = torch.tensor([vq_val] * 8, device=device).unsqueeze(0)

     cond_dict = make_cond_dict(
         text=text,
+        language=language,
+        speaker=speaker_embedding,
+        emotion=emotion_tensor,
+        vqscore_8=vq_tensor,
+        fmax=fmax,
+        pitch_std=pitch_std,
+        speaking_rate=speaking_rate,
+        dnsmos_ovrl=dnsmos_ovrl,
+        speaker_noised=speaker_noised_bool,
         device=device,
+        unconditional_keys=unconditional_keys,
     )
+    conditioning = selected_model.prepare_conditioning(cond_dict)
+
+    estimated_generation_duration = 30 * len(text) / 400
+    estimated_total_steps = int(estimated_generation_duration * 86)

+    def update_progress(_frame: torch.Tensor, step: int, _total_steps: int) -> bool:
+        progress((step, estimated_total_steps))
+        return True

+    codes = selected_model.generate(
+        prefix_conditioning=conditioning,
+        audio_prefix_codes=audio_prefix_codes,
+        max_new_tokens=max_new_tokens,
+        cfg_scale=cfg_scale,
+        batch_size=1,
+        sampling_params=dict(min_p=min_p),
+        callback=update_progress,
+    )

+    wav_out = selected_model.autoencoder.decode(codes).cpu().detach()
+    sr_out = selected_model.autoencoder.sampling_rate
+    if wav_out.dim() == 2 and wav_out.size(0) > 1:
+        wav_out = wav_out[0:1, :]
+    return (sr_out, wav_out.squeeze().numpy()), seed

+def build_interface():
+    with gr.Blocks() as demo:
+        with gr.Row():
+            with gr.Column():
+                model_choice = gr.Dropdown(
+                    choices=["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"],
+                    value="Zyphra/Zonos-v0.1-transformer",
+                    label="Zonos Model Type",
+                    info="Select the model variant to use.",
+                )
+                text = gr.Textbox(
+                    label="Text to Synthesize",
+                    value="Zonos uses eSpeak for text to phoneme conversion!",
+                    lines=4,
+                    max_length=500,  # approximately
+                )
+                language = gr.Dropdown(
+                    choices=supported_language_codes,
+                    value="en-us",
+                    label="Language Code",
+                    info="Select a language code.",
+                )
+                prefix_audio = gr.Audio(
+                    value="assets/silence_100ms.wav",
+                    label="Optional Prefix Audio (continue from this audio)",
+                    type="filepath",
+                )
+            with gr.Column():
+                speaker_audio = gr.Audio(
+                    label="Optional Speaker Audio (for cloning)",
+                    type="filepath",
+                )
+                speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker?", value=False)

         with gr.Row():
+            with gr.Column():
+                gr.Markdown("## Conditioning Parameters")
+                dnsmos_slider = gr.Slider(1.0, 5.0, value=4.0, step=0.1, label="DNSMOS Overall")
+                fmax_slider = gr.Slider(0, 24000, value=24000, step=1, label="Fmax (Hz)")
+                vq_single_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="VQ Score")
+                pitch_std_slider = gr.Slider(0.0, 300.0, value=45.0, step=1, label="Pitch Std")
+                speaking_rate_slider = gr.Slider(5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate")
+
+            with gr.Column():
+                gr.Markdown("## Generation Parameters")
+                cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale")
+                min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P")
+                seed_number = gr.Number(label="Seed", value=420, precision=0)
+                randomize_seed_toggle = gr.Checkbox(label="Randomize Seed (before generation)", value=True)
+
+        with gr.Accordion("Advanced Parameters", open=False):
+            gr.Markdown(
+                "### Unconditional Toggles\n"
+                "Checking a box will make the model ignore the corresponding conditioning value and make it unconditional.\n"
+                'Practically this means the given conditioning feature will be unconstrained and "filled in automatically".'
             )
+            with gr.Row():
+                unconditional_keys = gr.CheckboxGroup(
+                    [
+                        "speaker",
+                        "emotion",
+                        "vqscore_8",
+                        "fmax",
+                        "pitch_std",
+                        "speaking_rate",
+                        "dnsmos_ovrl",
+                        "speaker_noised",
+                    ],
+                    value=["emotion"],
+                    label="Unconditional Keys",
+                )
+
+            gr.Markdown(
+                "### Emotion Sliders\n"
+                "Warning: The way these sliders work is not intuitive and may require some trial and error to get the desired effect.\n"
+                "Certain configurations can cause the model to become unstable. Setting emotion to unconditional may help."
             )
+            with gr.Row():
+                emotion1 = gr.Slider(0.0, 1.0, 1.0, 0.05, label="Happiness")
+                emotion2 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Sadness")
+                emotion3 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Disgust")
+                emotion4 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Fear")
+            with gr.Row():
+                emotion5 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Surprise")
+                emotion6 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Anger")
+                emotion7 = gr.Slider(0.0, 1.0, 0.1, 0.05, label="Other")
+                emotion8 = gr.Slider(0.0, 1.0, 0.2, 0.05, label="Neutral")

+        with gr.Column():
+            generate_button = gr.Button("Generate Audio")
+            output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True)
+
+        model_choice.change(
+            fn=update_ui,
+            inputs=[model_choice],
+            outputs=[
+                text,
+                language,
+                speaker_audio,
+                prefix_audio,
+                emotion1,
+                emotion2,
+                emotion3,
+                emotion4,
+                emotion5,
+                emotion6,
+                emotion7,
+                emotion8,
+                vq_single_slider,
+                fmax_slider,
+                pitch_std_slider,
+                speaking_rate_slider,
+                dnsmos_slider,
+                speaker_noised_checkbox,
+                unconditional_keys,
+            ],
         )

+        # On page load, trigger the same UI refresh
+        demo.load(
+            fn=update_ui,
+            inputs=[model_choice],
+            outputs=[
+                text,
+                language,
+                speaker_audio,
+                prefix_audio,
+                emotion1,
+                emotion2,
+                emotion3,
+                emotion4,
+                emotion5,
+                emotion6,
+                emotion7,
+                emotion8,
+                vq_single_slider,
+                fmax_slider,
+                pitch_std_slider,
+                speaking_rate_slider,
+                dnsmos_slider,
+                speaker_noised_checkbox,
+                unconditional_keys,
+            ],
+        )

+        # Generate audio on button click
         generate_button.click(
+            fn=generate_audio,
+            inputs=[
+                model_choice,
+                text,
+                language,
+                speaker_audio,
+                prefix_audio,
+                emotion1,
+                emotion2,
+                emotion3,
+                emotion4,
+                emotion5,
+                emotion6,
+                emotion7,
+                emotion8,
+                vq_single_slider,
+                fmax_slider,
+                pitch_std_slider,
+                speaking_rate_slider,
+                dnsmos_slider,
+                speaker_noised_checkbox,
+                cfg_scale_slider,
+                min_p_slider,
+                seed_number,
+                randomize_seed_toggle,
+                unconditional_keys,
+            ],
+            outputs=[output_audio, seed_number],
         )

     return demo

+
 if __name__ == "__main__":
+    demo = build_interface()
+    share = getenv("GRADIO_SHARE", "False").lower() in ("true", "1", "t")
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=share)
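
The sketch below is not part of the commit; it is a minimal, illustrative script showing how the same generation path that the new generate_audio function wires into Gradio could be driven directly, assuming only the Zonos calls that appear in this diff (Zonos.from_pretrained, make_speaker_embedding, make_cond_dict, prepare_conditioning, generate, autoencoder.decode). The file paths, sampling values, and reliance on default arguments are assumptions, not confirmed API guarantees.

import torch
import torchaudio

from zonos.model import Zonos
from zonos.conditioning import make_cond_dict

device = "cuda"

# Load one model variant (same call the app uses) and freeze it for inference.
model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-transformer", device=device)
model.requires_grad_(False).eval()

# Optional speaker cloning from a reference clip (path is a placeholder).
wav, sr = torchaudio.load("reference_speaker.wav")
speaker_embedding = model.make_speaker_embedding(wav, sr).to(device, dtype=torch.bfloat16)

# Minimal conditioning; the app passes many more keys (emotion, vqscore_8, fmax, ...).
cond_dict = make_cond_dict(
    text="Hello from Zonos!",
    language="en-us",
    speaker=speaker_embedding,
    device=device,
)
conditioning = model.prepare_conditioning(cond_dict)

# Generate up to ~30 s of codes (86 tokens per second, as in the app) and decode them.
codes = model.generate(
    prefix_conditioning=conditioning,
    audio_prefix_codes=None,
    max_new_tokens=86 * 30,
    cfg_scale=2.0,
    batch_size=1,
    sampling_params=dict(min_p=0.15),
)
wav_out = model.autoencoder.decode(codes).cpu().detach().squeeze()
torchaudio.save("output.wav", wav_out.unsqueeze(0), model.autoencoder.sampling_rate)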