import tempfile

import gradio as gr
import soundfile as sf
import torch
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio
|
|
|
|
|
# Single shared synthesizer instance, created once at import time and reused
# by every request handled by generate_speech().
tts = TextToSpeech()
|
|
|
|
|
def generate_speech(reference_audio_path, text):
    """Synthesize ``text`` in the voice of the supplied reference clip.

    Args:
        reference_audio_path: Filesystem path to the reference recording.
            It is loaded through ``load_audio`` with a 22 050 Hz target rate.
        text: The string to synthesize.

    Returns:
        Path to a temporary 24 kHz WAV file holding the generated speech.
        The file is created with ``delete=False``, so it is not removed
        automatically.

    Raises:
        gr.Error: If no reference clip was provided or ``text`` is blank.
    """
    # Fail early with a message Gradio can surface to the user instead of an
    # opaque error from deep inside the audio loader or the model.
    if not reference_audio_path:
        raise gr.Error("Please upload a reference voice clip first.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    reference_rate = 22050  # rate requested from load_audio for the sample
    output_rate = 24000  # rate written into the resulting WAV header

    ref_waveform = load_audio(reference_audio_path, reference_rate)

    output_tensor = tts.tts_with_preset(
        text,
        voice_samples=[ref_waveform],
        preset="fast",
    )

    # Drop singleton dimensions and move to host memory for soundfile.
    wav_np = output_tensor.squeeze().cpu().numpy()

    # Reserve a unique file name, then close the handle before writing.
    # Keeping it open would leak one descriptor per request, and on Windows a
    # still-open named temporary file cannot be opened a second time.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name
    sf.write(output_path, wav_np, samplerate=output_rate)
    return output_path
|
|
|
|
|
# Build the Gradio UI: two inputs side by side, a trigger button, and an
# audio player for the result.
# NOTE(review): the emoji prefixes in the labels below look mis-encoded
# (mojibake) -- restore the intended characters from the original source.
with gr.Blocks(title="Tortoise Voice Cloning TTS") as app:
    gr.Markdown("## π£οΈ Voice Cloning with Tortoise TTS")
    gr.Markdown(
        "Upload a ~10 sec WAV clip (22 050 Hz), enter English text, "
        "and hear it spoken back in **your** voice!"
    )

    with gr.Row():
        # type="filepath" makes Gradio hand generate_speech a path string.
        voice_sample = gr.Audio(type="filepath", label="ποΈ Upload Reference Voice (22 050 Hz WAV)")
        text_input = gr.Textbox(label="π¬ Text to Synthesize", placeholder="e.g., Hello, world!")

    generate_btn = gr.Button("π Generate Speech")
    output_audio = gr.Audio(label="π’ Cloned Speech Output (24 kHz)", interactive=False)

    # Event wiring has to happen inside the Blocks context.
    generate_btn.click(
        fn=generate_speech,
        inputs=[voice_sample, text_input],
        outputs=output_audio,
    )
|
|
|
# Start the web server only when executed as a script, not when imported.
if __name__ == "__main__":
    app.launch()