File size: 1,693 Bytes
748ecaa
 
 
 
 
d743fc1
748ecaa
d743fc1
b1f1246
748ecaa
d743fc1
 
b1f1246
d743fc1
b1f1246
 
d743fc1
b1f1246
 
d743fc1
 
b1f1246
d743fc1
 
 
 
748ecaa
 
d743fc1
 
748ecaa
d743fc1
 
b1f1246
d743fc1
b1f1246
d743fc1
 
 
b1f1246
 
 
d743fc1
 
 
 
 
 
b1f1246
d743fc1
 
 
b1f1246
d743fc1
748ecaa
 
d743fc1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import torch
import torchaudio
import gradio as gr

from zonos.model import Zonos
from zonos.conditioning import make_cond_dict

# Load the pretrained hybrid checkpoint once at import time so that every
# request handled by the UI reuses the same model instance.
model = Zonos.from_pretrained(
    "Zyphra/Zonos-v0.1-hybrid",
    device="cuda",
)
# Switch the model to bfloat16; tts() casts the speaker embedding to the same
# dtype before conditioning.
model.bfloat16()

def tts(text, reference_audio):
    """Synthesize ``text`` in the voice of the speaker in ``reference_audio``.

    Args:
        text: Text to synthesize.
        reference_audio: ``(sample_rate, samples)`` tuple as delivered by
            ``gr.Audio(type="numpy")``, or ``None`` when nothing was uploaded.
            ``samples`` may be 1-D (mono) or 2-D (multi-channel).

    Returns:
        A ``(sample_rate, samples)`` tuple for playback, where the sample rate
        is ``model.autoencoder.sampling_rate``; ``None`` when the reference
        audio is missing or empty, or when ``text`` is empty/whitespace.
    """
    # Nothing to synthesize without both a voice reference and some text.
    if reference_audio is None or not text or not text.strip():
        return None

    # Gradio returns (sample_rate, audio_data) for type="numpy"
    sr, wav_np = reference_audio
    wav_torch = torch.from_numpy(wav_np)
    if wav_torch.numel() == 0:
        return None

    # Integer PCM (Gradio typically delivers int16) has to be scaled into
    # [-1, 1]; a bare .float() would keep the raw +/-32768 sample range.
    if wav_torch.is_floating_point():
        wav_torch = wav_torch.float()
    else:
        full_scale = torch.iinfo(wav_torch.dtype).max + 1
        wav_torch = wav_torch.float() / full_scale

    # Bring the audio to a (1, samples) mono layout. The layout check must
    # happen BEFORE adding a leading axis, otherwise multi-channel input ends
    # up 3-D and is never transposed.
    if wav_torch.dim() == 1:
        wav_torch = wav_torch.unsqueeze(0)
    elif wav_torch.dim() == 2:
        # (samples, channels) -> (channels, samples)
        if wav_torch.shape[0] > wav_torch.shape[1]:
            wav_torch = wav_torch.T
        # Down-mix so mono and stereo uploads reach the embedder identically.
        wav_torch = wav_torch.mean(dim=0, keepdim=True)

    # Create speaker embedding
    spk_embedding = model.embed_spk_audio(wav_torch, sr)

    # Prepare conditioning; the embedding is cast to bfloat16 to match the
    # dtype the model was switched to at load time.
    cond_dict = make_cond_dict(
        text=text,
        speaker=spk_embedding.to(torch.bfloat16),
        language="en-us",
    )
    conditioning = model.prepare_conditioning(cond_dict)

    # Generate codes & decode. Both steps are pure inference, so keep the
    # decoder inside no_grad as well. The fixed seed keeps the output
    # reproducible for identical inputs.
    with torch.no_grad():
        torch.manual_seed(421)
        codes = model.generate(conditioning)
        wavs = model.autoencoder.decode(codes).cpu()

    # NOTE(review): decode() is presumed to return (batch, channels, samples)
    # -- confirm against Zonos. Gradio interprets 2-D output as
    # (samples, channels), so drop the singleton channel axis. float() guards
    # against dtypes NumPy cannot represent (e.g. bfloat16).
    out_audio = wavs[0].detach().float().squeeze().numpy()

    # Return a tuple of (sample_rate, audio_data) for playback
    return (model.autoencoder.sampling_rate, out_audio)

# Gradio UI: a text box plus a reference recording go in, synthesized audio
# comes out. The input order must match the positional parameters of tts().
demo = gr.Interface(
    title="Zonos TTS Demo (Hybrid)",
    description="Upload a reference audio for speaker embedding, enter text, and generate speech!",
    fn=tts,
    inputs=[
        gr.Textbox(label="Text to Synthesize"),
        # type="numpy" makes Gradio pass a (sample_rate, samples) tuple.
        gr.Audio(type="numpy", label="Reference Audio (Speaker)"),
    ],
    outputs=gr.Audio(label="Generated Audio"),
)

# Start the web server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch(debug=True)