Spaces:
Running
on
Zero
Running
on
Zero
File size: 1,693 Bytes
748ecaa d743fc1 748ecaa d743fc1 b1f1246 748ecaa d743fc1 b1f1246 d743fc1 b1f1246 d743fc1 b1f1246 d743fc1 b1f1246 d743fc1 748ecaa d743fc1 748ecaa d743fc1 b1f1246 d743fc1 b1f1246 d743fc1 b1f1246 d743fc1 b1f1246 d743fc1 b1f1246 d743fc1 748ecaa d743fc1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
import torch
import torchaudio
import gradio as gr
from zonos.model import Zonos
from zonos.conditioning import make_cond_dict
# Load the pretrained hybrid Zonos TTS model onto the GPU (requires CUDA).
model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-hybrid", device="cuda")
# Cast the model weights to bfloat16 to cut GPU memory use; the speaker
# embedding is also cast to bfloat16 before conditioning for consistency.
model.bfloat16()
def tts(text, reference_audio):
    """Synthesize speech for `text` in the voice of `reference_audio`.

    Args:
        text: The text to synthesize.
        reference_audio: Tuple ``(sample_rate, audio_data)`` as produced by
            a Gradio ``Audio`` component with ``type="numpy"``, or ``None``.

    Returns:
        Tuple ``(sample_rate, audio_data)`` for Gradio playback, or ``None``
        when no reference audio was supplied.
    """
    if reference_audio is None:
        return None
    # Gradio returns (sample_rate, audio_data) for type="numpy"
    sr, wav_np = reference_audio
    # Convert NumPy audio data to a float tensor. Gradio commonly delivers
    # integer PCM (e.g. int16); normalize it to [-1.0, 1.0] so the speaker
    # embedder does not see samples in the +/-32767 range.
    wav_torch = torch.from_numpy(wav_np).float()
    if wav_np.dtype.kind in ("i", "u"):
        wav_torch = wav_torch / float(2 ** (8 * wav_np.dtype.itemsize - 1))
    # Normalize layout to (channels, samples). Mono arrives as (samples,);
    # stereo arrives as (samples, channels) and must be transposed. Doing the
    # shape check BEFORE adding the channel dim fixes the original ordering,
    # where stereo became a 3-D tensor that skipped the transpose.
    if wav_torch.dim() == 1:
        wav_torch = wav_torch.unsqueeze(0)
    elif wav_torch.shape[0] > wav_torch.shape[1]:
        wav_torch = wav_torch.T
    # Create speaker embedding
    spk_embedding = model.embed_spk_audio(wav_torch, sr)
    # Prepare conditioning
    cond_dict = make_cond_dict(
        text=text,
        speaker=spk_embedding.to(torch.bfloat16),
        language="en-us",
    )
    conditioning = model.prepare_conditioning(cond_dict)
    # Generate codes & decode (fixed seed keeps generation reproducible)
    with torch.no_grad():
        torch.manual_seed(421)
        codes = model.generate(conditioning)
        wavs = model.autoencoder.decode(codes).cpu()
    out_audio = wavs[0].numpy()
    # Gradio's numpy audio output expects (samples,) or (samples, channels);
    # the decoder presumably emits (channels, samples) — TODO confirm shape.
    if out_audio.ndim == 2:
        out_audio = out_audio.T
        if out_audio.shape[1] == 1:
            out_audio = out_audio[:, 0]
    # Return a tuple of (sample_rate, audio_data) for playback
    return (model.autoencoder.sampling_rate, out_audio)
# Build the Gradio UI: one text box plus one reference-audio input,
# wired to tts(), with a single audio player as the output.
demo = gr.Interface(
    fn=tts,
    inputs=[
        gr.Textbox(label="Text to Synthesize"),
        # type="numpy" makes Gradio pass (sample_rate, ndarray) to tts().
        gr.Audio(type="numpy", label="Reference Audio (Speaker)"),
    ],
    outputs=gr.Audio(label="Generated Audio"),
    title="Zonos TTS Demo (Hybrid)",
    description="Upload a reference audio for speaker embedding, enter text, and generate speech!"
)
# Launch the Gradio server only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch(debug=True)
|