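"""Gradio front end for F5-TTS inference with optional Whisper ASR transcription."""
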
from pathlib import Path

import gradio as gr

from gradio_app.components import (
    get_files_in_ckpts,
    handle_file_upload,
    run_tts_inference,
    run_setup_script,
)
from gradio_app.asr_utils import transcribe_audio

def create_gradio_app():
    """Create the Gradio interface for F5-TTS inference with Whisper ASR."""
    # Run the setup script to ensure dependencies are installed.
    run_setup_script()

    # Update the reference text from the uploaded audio when Whisper ASR is enabled.
    def update_ref_text(audio_file_path, use_whisper):
        if use_whisper and audio_file_path:
            return transcribe_audio(audio_file_path)
        return gr.update()

    # Swap visibility between the checkpoint dropdowns and the upload widgets.
    def toggle_model_inputs(use_upload):
        return (
            gr.update(visible=not use_upload),  # model_cfg dropdown
            gr.update(visible=not use_upload),  # ckpt_file dropdown
            gr.update(visible=not use_upload),  # vocab_file dropdown
            gr.update(visible=use_upload),      # model_cfg_upload file widget
            gr.update(visible=use_upload),      # ckpt_file_upload file widget
            gr.update(visible=use_upload),      # vocab_file_upload file widget
        )

    # The numbered example folders under assets/examples/f5_tts.
    example_dirs = [
        Path(f"apps/gradio_app/assets/examples/f5_tts/{i}") for i in range(1, 5)
    ]

    def find_audio(dir_path, stem):
        """Return the first <stem>.mp3 or <stem>.wav file in dir_path, or None."""
        return next(
            (f for f in dir_path.glob(f"{stem}.*") if f.suffix in (".mp3", ".wav")),
            None,
        )

    def load_example(ref_audio_path, ref_text, inf_text):
        """Load example inputs and retrieve the corresponding infer_audio for the output."""
        inf_audio_path = None
        for dir_path in example_dirs:
            if not dir_path.exists():
                continue
            ref_audio = find_audio(dir_path, "refer_audio")
            if ref_audio and str(ref_audio) == ref_audio_path:
                inf_audio = find_audio(dir_path, "infer_audio")
                inf_audio_path = str(inf_audio) if inf_audio else None
                break
        return ref_audio_path, ref_text, inf_text, inf_audio_path

    # Prepare rows for gr.Examples; infer_audio is excluded from the table and
    # resolved by load_example when an example is selected.
    examples = []
    for dir_path in example_dirs:
        if not dir_path.exists():
            continue
        # Read the reference and generation text files, if present.
        ref_text_path = dir_path / "refer_text.txt"
        inf_text_path = dir_path / "infer_text.txt"
        ref_text = ref_text_path.read_text(encoding="utf-8") if ref_text_path.exists() else ""
        inf_text = inf_text_path.read_text(encoding="utf-8") if inf_text_path.exists() else ""
        # Find the reference audio file (mp3 or wav).
        ref_audio = find_audio(dir_path, "refer_audio")
        examples.append([str(ref_audio) if ref_audio else None, ref_text, inf_text])
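
    # Expected layout of each example folder (inferred from the lookups above):
    #   apps/gradio_app/assets/examples/f5_tts/<n>/
    #       refer_audio.{wav,mp3}   reference voice clip
    #       refer_text.txt          transcript of the reference audio
    #       infer_text.txt          text to synthesize
    #       infer_audio.{wav,mp3}   optional pre-generated output for display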

    CSS = Path("apps/gradio_app/static/styles.css").read_text(encoding="utf-8")

    with gr.Blocks(css=CSS) as demo:
gr.Markdown("# F5-TTS Audio Generation") | |
gr.Markdown("Generate high-quality audio with a fine-tuned F5-TTS model. Upload reference audio, use Whisper ASR for transcription, enter text, adjust speed, and select or upload model files.") | |
with gr.Row(): | |
with gr.Column(): | |
ref_audio = gr.Audio(label="Reference Audio", type="filepath") | |
with gr.Group(): | |
use_whisper = gr.Checkbox(label="Use Whisper ASR for Transcription", value=False) | |
ref_text = gr.Textbox( | |
label="Reference Text", | |
placeholder="e.g., Sau nhà Ngô, lần lượt các triều Đinh...", | |
lines=1 | |
) | |
gen_text = gr.Textbox( | |
label="Generated Text", | |
placeholder="e.g., Nhà Tiền Lê, Lý và Trần đã chống trả...", | |
lines=1 | |
) | |
generate_btn = gr.Button("Generate Audio") | |
with gr.Column(): | |
output_audio = gr.Audio(label="Generated Audio") | |
output_text = gr.Textbox(label="Status", interactive=False) | |
                with gr.Group():
                    speed = gr.Slider(0.5, 2.0, 1.0, step=0.1, label="Speed")
                    # List checkpoint assets once per type; fall back to None
                    # so an empty ckpts directory does not raise IndexError.
                    yaml_files = get_files_in_ckpts([".yaml"])
                    ckpt_files = get_files_in_ckpts([".pt", ".safetensors"], include_subdirs=True)
                    vocab_files = get_files_in_ckpts([".txt", ".safetensors"])
                    model_cfg = gr.Dropdown(
                        choices=yaml_files,
                        label="Model Config (*.yaml)",
                        value=yaml_files[0] if yaml_files else None,
                        visible=True,
                    )
                    ckpt_file = gr.Dropdown(
                        choices=ckpt_files,
                        label="Checkpoint File (*.pt or *.safetensors)",
                        value=ckpt_files[0] if ckpt_files else None,
                        visible=True,
                    )
                    vocab_file = gr.Dropdown(
                        choices=vocab_files,
                        label="Vocab File (*.txt or *.safetensors)",
                        value=vocab_files[0] if vocab_files else None,
                        visible=True,
                    )
                    use_upload = gr.Checkbox(label="Upload Custom Model Files", value=False)
                    model_cfg_upload = gr.File(label="Model Config (*.yaml)", file_types=[".yaml"], visible=False)
                    ckpt_file_upload = gr.File(label="Checkpoint File (*.pt or *.safetensors)", file_types=[".pt", ".safetensors"], visible=False)
                    vocab_file_upload = gr.File(label="Vocab File (*.txt or *.safetensors)", file_types=[".txt", ".safetensors"], visible=False)

        # Examples appear below both columns; clicking one runs load_example so
        # the matching pre-generated infer_audio shows up in output_audio.
        gr.Examples(
            examples=examples,
            inputs=[ref_audio, ref_text, gen_text],
            outputs=[ref_audio, ref_text, gen_text, output_audio],
            fn=load_example,
            run_on_click=True,  # with cache_examples=False, fn only runs when this is set
            label="Example Inputs",
            examples_per_page=4,
            cache_examples=False,
        )

        ref_audio.change(fn=update_ref_text, inputs=[ref_audio, use_whisper], outputs=ref_text)
        use_whisper.change(fn=update_ref_text, inputs=[ref_audio, use_whisper], outputs=ref_text)
        use_upload.change(
            fn=toggle_model_inputs,
            inputs=[use_upload],
            outputs=[model_cfg, ckpt_file, vocab_file, model_cfg_upload, ckpt_file_upload, vocab_file_upload],
        )
        generate_btn.click(
            fn=run_tts_inference,
            inputs=[ref_audio, ref_text, gen_text, speed, use_upload, model_cfg, ckpt_file, vocab_file],
            outputs=[output_audio, output_text],
        )

    return demo


if __name__ == "__main__":
    demo = create_gradio_app()
    demo.launch(share=True)  # share=True also serves a temporary public link
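

# A minimal sketch of what gradio_app.components.get_files_in_ckpts could look
# like, for reference only; the real implementation lives in components.py, and
# the "ckpts" root directory is an assumption based on the function's name.
#
# def get_files_in_ckpts(suffixes, include_subdirs=False):
#     root = Path("ckpts")
#     pattern = "**/*" if include_subdirs else "*"
#     return sorted(str(f) for f in root.glob(pattern) if f.suffix in suffixes)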