Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,938 Bytes
153a74a 5f2ef10 153a74a adf2556 5f2ef10 153a74a 5f2ef10 153a74a 701defd 2c4f229 153a74a 5f2ef10 153a74a 5f2ef10 153a74a 5e8f0ac 153a74a 493e475 153a74a 5f2ef10 153a74a 5f2ef10 36df638 5f2ef10 153a74a 5f2ef10 153a74a 5f2ef10 153a74a 5f2ef10 153a74a cda3ab4 5f2ef10 5bd6cdf 0779337 5f2ef10 0779337 153a74a 5f2ef10 3b05550 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
import spaces
import os
from huggingface_hub import login
import gradio as gr
from cached_path import cached_path
import tempfile
from vinorm import TTSnorm
from f5_tts.model import DiT
from f5_tts.infer.utils_infer import (
preprocess_ref_audio_text,
load_vocoder,
load_model,
infer_process,
save_spectrogram,
)
# Authenticate against the Hugging Face Hub, but only when a token has been
# supplied through the HUGGINGFACEHUB_API_TOKEN environment variable.
hf_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
if hf_token:
    login(token=hf_token)
def post_process(text):
    """Tidy up normalized text before it is sent to the synthesizer.

    The cleanup does three things:

    * removes every double-quote character;
    * collapses consecutive stand-alone period tokens (``.`` or ``..``)
      into a single ``.``, and consecutive stand-alone comma tokens
      (``,`` or ``,,``) into a single ``,``;
    * normalizes all whitespace to single spaces with no leading or
      trailing space.

    Runs of any length are collapsed. A single pass of ``str.replace`` is
    non-overlapping, so it would leave ``"a . . . b"`` as ``"a . . b"``.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text as a single-space-separated string.
    """
    # Map every spelling of a stand-alone mark to its canonical form.
    canonical = {".": ".", "..": ".", ",": ",", ",,": ","}

    cleaned = []
    # Quotes are stripped first so marks separated only by a quote are
    # still recognized as adjacent duplicates.
    for token in text.replace('"', "").split():
        mark = canonical.get(token)
        if mark is None:
            cleaned.append(token)
        elif not cleaned or cleaned[-1] != mark:
            # Keep a mark only when it does not repeat the previous token.
            cleaned.append(mark)
    return " ".join(cleaned)
# Instantiate the vocoder and the DiT-based TTS model once at import time so
# every request reuses the same objects.
vocoder = load_vocoder()
model = load_model(
    DiT,
    {
        "dim": 1024,
        "depth": 22,
        "heads": 16,
        "ff_mult": 2,
        "text_dim": 512,
        "conv_layers": 4,
    },
    ckpt_path=str(cached_path("hf://hynt/F5-TTS-Vietnamese-ViVoice/model_last.pt")),
    # NOTE(review): the vocabulary is read from config.json in this repo --
    # presumably the vocab file is published under that name; confirm.
    vocab_file=str(cached_path("hf://hynt/F5-TTS-Vietnamese-ViVoice/config.json")),
)
@spaces.GPU
def infer_tts(ref_audio_orig: str, gen_text: str, speed: float = 1.0, request: gr.Request = None):
    """Synthesize ``gen_text`` using the voice of a reference recording.

    Args:
        ref_audio_orig: Reference audio; the UI supplies it as a file path.
        gen_text: Text to speak. It is normalized with ``TTSnorm``, cleaned
            with ``post_process`` and lower-cased before inference.
        speed: Speed value forwarded to ``infer_process``.
        request: Not used in the body; kept so the signature is unchanged.

    Returns:
        A pair ``((sample_rate, waveform), spectrogram_path)`` where
        ``spectrogram_path`` is a temporary ``.png`` file that is left on
        disk (``delete=False``) for the caller to display.

    Raises:
        gr.Error: If the reference audio or the text is missing, if the text
            has more than 1000 whitespace-separated words, or if any step of
            preprocessing, inference or spectrogram saving fails.
    """
    if not ref_audio_orig:
        raise gr.Error("Please upload a sample audio file.")
    # Treat a missing value like empty text instead of crashing on None.strip().
    if not gen_text or not gen_text.strip():
        raise gr.Error("Please enter the text content to generate voice.")
    if len(gen_text.split()) > 1000:
        raise gr.Error("Please enter text content with less than 1000 words.")

    try:
        # An empty reference text is passed on purpose.
        # NOTE(review): presumably the helper then transcribes the reference
        # audio itself -- confirm in preprocess_ref_audio_text.
        ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, "")
        final_wave, final_sample_rate, spectrogram = infer_process(
            ref_audio,
            ref_text.lower(),
            post_process(TTSnorm(gen_text)).lower(),
            model,
            vocoder,
            speed=speed,
        )

        # Reserve a unique path, then write after the handle is closed:
        # reopening a still-open NamedTemporaryFile by name is not portable.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
            spectrogram_path = tmp_spectrogram.name
        save_spectrogram(spectrogram, spectrogram_path)

        return (final_sample_rate, final_wave), spectrogram_path
    except Exception as e:
        # Surface the failure in the UI while keeping the original cause
        # attached for server-side tracebacks.
        raise gr.Error(f"Error generating voice: {e}") from e
# Gradio UI
# Builds the page: an intro Markdown block, the input widgets, the generate
# button, the output widgets and a read-only notes box, then wires the button
# to infer_tts.
# NOTE(review): the nesting below was reconstructed from flattened source --
# confirm which widgets belong inside each gr.Row().
# NOTE(review): the glyphs at the start of several labels look mis-encoded
# (probably emoji); they are runtime strings and are left exactly as found.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# π€ F5-TTS: Vietnamese Text-to-Speech Synthesis.
# The model was trained with approximately 1000 hours of data on a RTX 3090 GPU.
Enter text and upload a sample voice to generate natural speech.
""")
    # Inputs: reference recording (passed to the handler as a file path) and
    # the text to synthesize.
    with gr.Row():
        ref_audio = gr.Audio(label="π Sample Voice", type="filepath")
        gen_text = gr.Textbox(label="π Text", placeholder="Enter the text to generate voice...", lines=3)
    # Speed selector: 0.3 to 2.0 in steps of 0.1, starting at 1.0.
    speed = gr.Slider(0.3, 2.0, value=1.0, step=0.1, label="β‘ Speed")
    btn_synthesize = gr.Button("π₯ Generate Voice")
    # Outputs: the generated audio and the spectrogram image.
    with gr.Row():
        output_audio = gr.Audio(label="π§ Generated Audio", type="numpy")
        output_spectrogram = gr.Image(label="π Spectrogram")
    # Read-only notes shown to the user; not connected to any event.
    model_limitations = gr.Textbox(
        value="""1. This model may not perform well with numerical characters, dates, special characters, etc. => A text normalization module is needed.
2. The rhythm of some generated audios may be inconsistent or choppy => It is recommended to select clearly pronounced sample audios with minimal pauses for better synthesis quality.
3. Default, reference audio text uses the pho-whisper-medium model, which may not always accurately recognize Vietnamese, resulting in poor voice synthesis quality.
4. Inference with overly long paragraphs may produce poor results.""",
        label="β Model Limitations",
        lines=4,
        interactive=False
    )
    # Clicking the button runs infer_tts with the three inputs and fills both
    # outputs with its return value.
    btn_synthesize.click(infer_tts, inputs=[ref_audio, gen_text, speed], outputs=[output_audio, output_spectrogram])
# Enable Gradio's request queue and start the app. share=True is not passed
# here, so launch() runs with its default sharing behaviour.
demo.queue().launch()