import spaces  # required on ZeroGPU hardware; import before CUDA-initializing libraries such as torch
import gradio as gr
import logging
from pathlib import Path
import base64

from model import (
    MODEL_ID as WHISPER_MODEL_ID,
    PHI_MODEL_ID,
    transcribe_audio_local,
    transcribe_audio_phi,
    preload_models,
)

# Set up logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Constants
EXAMPLES_DIR = Path("./examples")
MODEL_CHOICES = {
    PHI_MODEL_ID: "Phi-4 Model",
    WHISPER_MODEL_ID: "Whisper Model",
}
EXAMPLE_FILES = [
    [str(EXAMPLES_DIR / "audio1.mp3"), PHI_MODEL_ID],
    [str(EXAMPLES_DIR / "audio2.mp3"), PHI_MODEL_ID],
]


def read_file_as_base64(file_path: str) -> str:
    """
    Read a file and encode it as base64.

    Args:
        file_path: Path to the file to read

    Returns:
        Base64-encoded string of the file contents
    """
    try:
        with open(file_path, "rb") as f:
            return base64.b64encode(f.read()).decode()
    except Exception as e:
        logger.error(f"Failed to read file {file_path}: {str(e)}")
        raise
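
# Usage sketch for read_file_as_base64 (hypothetical; nothing in this file
# calls it yet). A base64 payload can embed audio inline as a data URI:
#
#     b64 = read_file_as_base64(str(EXAMPLES_DIR / "audio1.mp3"))
#     player = f'<audio controls src="data:audio/mpeg;base64,{b64}"></audio>'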


def combined_transcription(audio: str, model_choice: str) -> str:
    """
    Transcribe audio using the selected model.

    Args:
        audio: Path to audio file
        model_choice: Full model ID to use for transcription

    Returns:
        Transcription text
    """
    if not audio:
        return "Please provide an audio file to transcribe."
    try:
        if model_choice == PHI_MODEL_ID:
            return transcribe_audio_phi(audio)
        elif model_choice == WHISPER_MODEL_ID:
            return transcribe_audio_local(audio)
        else:
            logger.error(f"Unknown model choice: {model_choice}")
            return f"Error: Unknown model {model_choice}"
    except Exception as e:
        logger.error(f"Transcription failed: {str(e)}")
        return f"Error during transcription: {str(e)}"


def create_demo() -> gr.Blocks:
    """Create and configure the Gradio demo interface."""
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# TWASR: Chinese (Taiwan) Automatic Speech Recognition")
        gr.Markdown(
            "Upload an audio file or record your voice to transcribe it to text."
        )
        gr.Markdown(
            "⚠️ The first request may take a while to initialize the model; subsequent requests will be faster."
        )
        with gr.Row():
            audio_input = gr.Audio(
                label="Audio Input", type="filepath", show_download_button=True
            )
            with gr.Column():
                model_choice = gr.Dropdown(
                    label="Select Model",
                    choices=list(MODEL_CHOICES.keys()),
                    value=PHI_MODEL_ID,
                    info="Select the model for transcription",
                )
                text_output = gr.Textbox(label="Transcription Output", lines=5)

        with gr.Row():
            transcribe_button = gr.Button("🎯 Transcribe", variant="primary")
            clear_button = gr.Button("🧹 Clear")

        transcribe_button.click(
            fn=combined_transcription,
            inputs=[audio_input, model_choice],
            outputs=[text_output],
            show_progress=True,
        )
        clear_button.click(
            fn=lambda: (None, ""),
            inputs=[],
            outputs=[audio_input, text_output],
        )

        gr.Examples(
            examples=EXAMPLE_FILES,
            inputs=[audio_input, model_choice],
            outputs=[text_output],
            fn=combined_transcription,
            cache_examples=True,
            cache_mode="lazy",
            run_on_click=True,
        )
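        # With cache_mode="lazy", an example's output is computed and cached
        # the first time it is clicked rather than at startup, which keeps
        # the Space's startup time low.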
gr.Markdown("### Model Information") | |
with gr.Accordion("Model Details", open=False): | |
for model_id, model_name in MODEL_CHOICES.items(): | |
gr.Markdown( | |
f"**{model_name}:** [{model_id}](https://huggingface.co/{model_id})" | |
) | |
return demo | |


if __name__ == "__main__":
    # Preload models before starting the app to reduce cold start time
    logger.info("Preloading models to reduce cold start time")
    preload_models()

    demo = create_demo()
    demo.launch(share=False)
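    # Note: recent Gradio versions enable the request queue by default; on
    # older versions where queuing is opt-in, demo.queue().launch(share=False)
    # is needed so that concurrent GPU requests are handled one at a time.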