Spaces:

Steveeeeeeen
/

Zonos

Running on Zero

App Files Community

Steveeeeeeen HF Staff committed on Feb 11

Commit

d5d8bf3

verified ·

1 Parent(s): ab5fd90

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -15

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ import torchaudio
 import gradio as gr
 from zonos.model import Zonos
-from zonos.conditioning import make_cond_dict
 # Global cache to hold the loaded model
 MODEL = None
@@ -12,7 +12,7 @@ device = "cuda"
 def load_model():
     """
     Loads the Zonos model once and caches it globally.
-    Adjust the model name to the one you want to use.
     """
     global MODEL
     if MODEL is None:
@@ -20,26 +20,29 @@ def load_model():
         print(f"Loading model: {model_name}")
         MODEL = Zonos.from_pretrained(model_name, device="cuda")
         MODEL = MODEL.requires_grad_(False).eval()
-        MODEL.bfloat16()  # optional, if your GPU supports bfloat16
         print("Model loaded successfully!")
     return MODEL
-def tts(text, speaker_audio):
     """
     text: str
     speaker_audio: (sample_rate, numpy_array) from Gradio if type="numpy"
     Returns (sample_rate, waveform) for Gradio audio output.
     """
     model = load_model()
     if not text:
         return None
-    # If the user hasn't provided any audio, just return None or a placeholder
     if speaker_audio is None:
         return None
-    # Gradio provides audio in the format (sample_rate, numpy_array)
     sr, wav_np = speaker_audio
     # Convert to Torch tensor: shape (1, num_samples)
@@ -55,17 +58,15 @@ def tts(text, speaker_audio):
     # Prepare conditioning dictionary
     cond_dict = make_cond_dict(
-        text=text,                # The text prompt
-        speaker=spk_embedding,    # Speaker embedding from reference audio
-        language="en-us",         # Hard-coded language or switch to another if needed
         device=device,
     )
     conditioning = model.prepare_conditioning(cond_dict)
     # Generate codes
     with torch.no_grad():
-        # Optionally set a manual seed for reproducibility
-        # torch.manual_seed(1234)
         codes = model.generate(conditioning)
     # Decode the codes into raw audio
@@ -76,7 +77,7 @@ def tts(text, speaker_audio):
 def build_demo():
     with gr.Blocks() as demo:
-        gr.Markdown("# Simple Zonos TTS Demo (Text + Reference Audio)")
         with gr.Row():
             text_input = gr.Textbox(
@@ -88,16 +89,26 @@ def build_demo():
                 label="Reference Audio (Speaker Cloning)",
                 type="numpy"
             )
         generate_button = gr.Button("Generate")
-        # The output will be an audio widget that Gradio will play
         audio_output = gr.Audio(label="Synthesized Output", type="numpy")
-        # Bind the generate button
         generate_button.click(
             fn=tts,
-            inputs=[text_input, ref_audio_input],
             outputs=audio_output,
         )

 import gradio as gr
 from zonos.model import Zonos
+from zonos.conditioning import make_cond_dict, supported_language_codes
 # Global cache to hold the loaded model
 MODEL = None
 def load_model():
     """
     Loads the Zonos model once and caches it globally.
+    Adjust the model name if you want to switch from hybrid to transformer, etc.
     """
     global MODEL
     if MODEL is None:
         print(f"Loading model: {model_name}")
         MODEL = Zonos.from_pretrained(model_name, device="cuda")
         MODEL = MODEL.requires_grad_(False).eval()
+        MODEL.bfloat16()  # optional if your GPU supports bfloat16
         print("Model loaded successfully!")
     return MODEL
+def tts(text, speaker_audio, selected_language):
     """
     text: str
     speaker_audio: (sample_rate, numpy_array) from Gradio if type="numpy"
+    selected_language: str (e.g., "en-us", "es-es", etc.)
     Returns (sample_rate, waveform) for Gradio audio output.
     """
     model = load_model()
+    # If no text, return None
     if not text:
         return None
+    # If no reference audio, return None
     if speaker_audio is None:
         return None
+    # Gradio provides audio in (sample_rate, numpy_array)
     sr, wav_np = speaker_audio
     # Convert to Torch tensor: shape (1, num_samples)
     # Prepare conditioning dictionary
     cond_dict = make_cond_dict(
+        text=text,                   # The text prompt
+        speaker=spk_embedding,       # Speaker embedding
+        language=selected_language,  # Language from the Dropdown
         device=device,
     )
     conditioning = model.prepare_conditioning(cond_dict)
     # Generate codes
     with torch.no_grad():
         codes = model.generate(conditioning)
     # Decode the codes into raw audio
 def build_demo():
     with gr.Blocks() as demo:
+        gr.Markdown("# Simple Zonos TTS Demo (Text + Reference Audio + Language)")
         with gr.Row():
             text_input = gr.Textbox(
                 label="Reference Audio (Speaker Cloning)",
                 type="numpy"
             )
+        # Add a dropdown for language selection
+        language_dropdown = gr.Dropdown(
+            label="Language",
+            # You can provide your own subset or use all:
+            # For demonstration, let's pick 5 common ones
+            # or you can do: choices=supported_language_codes
+            choices=["en-us", "es-es", "fr-fr", "de-de", "it"],
+            value="en-us",
+            interactive=True
+        )
         generate_button = gr.Button("Generate")
+        # The output is an audio widget that Gradio will play
         audio_output = gr.Audio(label="Synthesized Output", type="numpy")
+        # Bind the generate button: pass text, reference audio, and selected language
         generate_button.click(
             fn=tts,
+            inputs=[text_input, ref_audio_input, language_dropdown],
             outputs=audio_output,
         )