Spaces:

Steveeeeeeen
/

Zonos

Running on Zero

App Files Files Community

Steveeeeeeen HF staff commited on 13 days ago

Commit

46f1390

verified ·

1 Parent(s): b1f1246

Update app.py

Browse files

Files changed (1) hide show

app.py +91 -41

app.py CHANGED Viewed

@@ -5,53 +5,103 @@ import gradio as gr
 from zonos.model import Zonos
 from zonos.conditioning import make_cond_dict
-model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-hybrid", device="cuda")
-model.bfloat16()
-def tts(text, reference_audio):
-    if reference_audio is None:
         return None
-    # Gradio returns (sample_rate, audio_data) for type="numpy"
-    sr, wav_np = reference_audio
-    # Convert NumPy audio data to Torch tensor
-    wav_torch = torch.from_numpy(wav_np).float().unsqueeze(0)
-    if wav_torch.dim() == 2 and wav_torch.shape[0] > wav_torch.shape[1]:
-        wav_torch = wav_torch.T
-    # Create speaker embedding
-    spk_embedding = model.embed_spk_audio(wav_torch, sr)
-    # Prepare conditioning
     cond_dict = make_cond_dict(
-        text=text,
-        speaker=spk_embedding.to(torch.bfloat16),
-        language="en-us",
     )
     conditioning = model.prepare_conditioning(cond_dict)
-    # Generate codes & decode
     with torch.no_grad():
-        torch.manual_seed(421)
         codes = model.generate(conditioning)
-    wavs = model.autoencoder.decode(codes).cpu()
-    out_audio = wavs[0].numpy()
-    # Return a tuple of (sample_rate, audio_data) for playback
-    return (model.autoencoder.sampling_rate, out_audio)
-demo = gr.Interface(
-    fn=tts,
-    inputs=[
-        gr.Textbox(label="Text to Synthesize"),
-        gr.Audio(type="numpy", label="Reference Audio (Speaker)"),
-    ],
-    outputs=gr.Audio(label="Generated Audio"),
-    title="Zonos TTS Demo (Hybrid)",
-    description="Upload a reference audio for speaker embedding, enter text, and generate speech!"
-)
 if __name__ == "__main__":
-    demo.launch(debug=True)

 from zonos.model import Zonos
 from zonos.conditioning import make_cond_dict
+# Global cache to hold the loaded model
+MODEL = None
+def load_model():
+    """
+    Loads the Zonos model once and caches it globally.
+    Adjust the model name to the one you want to use.
+    """
+    global MODEL
+    if MODEL is None:
+        model_name = "Zyphra/Zonos-v0.1-hybrid"
+        print(f"Loading model: {model_name}")
+        MODEL = Zonos.from_pretrained(model_name, device="cuda")
+        MODEL = MODEL.requires_grad_(False).eval()
+        MODEL.bfloat16()  # optional, if your GPU supports bfloat16
+        print("Model loaded successfully!")
+    return MODEL
+def tts(text, speaker_audio):
+    """
+    text: str
+    speaker_audio: (sample_rate, numpy_array) from Gradio if type="numpy"
+    Returns (sample_rate, waveform) for Gradio audio output.
+    """
+    model = load_model()
+    if not text:
         return None
+    # If the user hasn't provided any audio, just return None or a placeholder
+    if speaker_audio is None:
+        return None
+    # Gradio provides audio in the format (sample_rate, numpy_array)
+    sr, wav_np = speaker_audio
+    # Convert to Torch tensor: shape (1, num_samples)
+    wav_tensor = torch.from_numpy(wav_np).unsqueeze(0).float()
+    if wav_tensor.dim() == 2 and wav_tensor.shape[0] > wav_tensor.shape[1]:
+        # If shape is transposed, fix it
+        wav_tensor = wav_tensor.T
+    # Get speaker embedding
+    with torch.no_grad():
+        spk_embedding = model.make_speaker_embedding(wav_tensor, sr)
+        spk_embedding = spk_embedding.to(model.device, dtype=torch.bfloat16)
+    # Prepare conditioning dictionary
     cond_dict = make_cond_dict(
+        text=text,                # The text prompt
+        speaker=spk_embedding,    # Speaker embedding from reference audio
+        language="en-us",         # Hard-coded language or switch to another if needed
+        device=model.device,
     )
     conditioning = model.prepare_conditioning(cond_dict)
+    # Generate codes
     with torch.no_grad():
+        # Optionally set a manual seed for reproducibility
+        # torch.manual_seed(1234)
         codes = model.generate(conditioning)
+    # Decode the codes into raw audio
+    wav_out = model.autoencoder.decode(codes).cpu().detach().squeeze()
+    sr_out = model.autoencoder.sampling_rate
+    return (sr_out, wav_out.numpy())
+def build_demo():
+    with gr.Blocks() as demo:
+        gr.Markdown("# Simple Zonos TTS Demo (Text + Reference Audio)")
+        with gr.Row():
+            text_input = gr.Textbox(
+                label="Text Prompt",
+                value="Hello from Zonos!",
+                lines=3
+            )
+            ref_audio_input = gr.Audio(
+                label="Reference Audio (Speaker Cloning)",
+                type="numpy"
+            )
+        generate_button = gr.Button("Generate")
+        # The output will be an audio widget that Gradio will play
+        audio_output = gr.Audio(label="Synthesized Output", type="numpy")
+        # Bind the generate button
+        generate_button.click(
+            fn=tts,
+            inputs=[text_input, ref_audio_input],
+            outputs=audio_output,
+        )
+    return demo
 if __name__ == "__main__":
+    demo_app = build_demo()
+    demo_app.launch(server_name="0.0.0.0", server_port=7860, share=True)