"""Gradio app for Chinese (Taiwan) automatic speech recognition (TWASR).

Provides a web UI that transcribes uploaded/recorded audio with either the
Phi-4 multimodal model or a Whisper model, both served from ``model.py``.
"""

import base64
import logging
from pathlib import Path

# NOTE: ``spaces`` must be imported even though it is not referenced directly —
# importing it enables Hugging Face ZeroGPU support for this Space.
import spaces  # noqa: F401
import gradio as gr

from model import (
    MODEL_ID as WHISPER_MODEL_ID,
    PHI_MODEL_ID,
    preload_models,
    transcribe_audio_local,
    transcribe_audio_phi,
)

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Constants
EXAMPLES_DIR = Path("./examples")

# Maps model ID -> human-readable display name (shown in the Dropdown).
MODEL_CHOICES = {
    PHI_MODEL_ID: "Phi-4 Model",
    WHISPER_MODEL_ID: "Whisper Model",
}

# [audio_path, model_id] rows fed to gr.Examples.
EXAMPLE_FILES = [
    [str(EXAMPLES_DIR / "audio1.mp3"), PHI_MODEL_ID],
    [str(EXAMPLES_DIR / "audio2.mp3"), PHI_MODEL_ID],
]


def read_file_as_base64(file_path: str) -> str:
    """
    Read a file and encode it as base64.

    Args:
        file_path: Path to the file to read

    Returns:
        Base64 encoded string of file contents

    Raises:
        OSError: If the file cannot be opened or read (re-raised after logging).
    """
    try:
        with open(file_path, "rb") as f:
            return base64.b64encode(f.read()).decode()
    except Exception:
        # logger.exception preserves the traceback; lazy %s args avoid
        # formatting work when the log level is disabled.
        logger.exception("Failed to read file %s", file_path)
        raise


def combined_transcription(audio: str, model_choice: str) -> str:
    """
    Transcribe audio using the selected model.

    Args:
        audio: Path to audio file (falsy when the user submitted nothing)
        model_choice: Full model ID to use for transcription

    Returns:
        Transcription text, or a user-facing error message on failure.
    """
    if not audio:
        return "Please provide an audio file to transcribe."

    try:
        if model_choice == PHI_MODEL_ID:
            return transcribe_audio_phi(audio)
        if model_choice == WHISPER_MODEL_ID:
            return transcribe_audio_local(audio)
        logger.error("Unknown model choice: %s", model_choice)
        return f"Error: Unknown model {model_choice}"
    except Exception as e:
        # Errors are surfaced in the UI rather than raised, so the app
        # keeps running; full traceback goes to the log.
        logger.exception("Transcription failed")
        return f"Error during transcription: {str(e)}"


def create_demo() -> gr.Blocks:
    """Create and configure the Gradio demo interface"""
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# TWASR: Chinese (Taiwan) Automatic Speech Recognition")
        gr.Markdown(
            "Upload an audio file or record your voice to transcribe it to text."
        )
        gr.Markdown(
            "⚠️ First load may take a while to initialize the model, following requests will be faster."
        )

        with gr.Row():
            audio_input = gr.Audio(
                label="Audio Input", type="filepath", show_download_button=True
            )
            with gr.Column():
                model_choice = gr.Dropdown(
                    label="Select Model",
                    # (label, value) tuples: show the friendly name from
                    # MODEL_CHOICES while still passing the model ID as the
                    # component value (previously the names were unused and
                    # raw model IDs were displayed).
                    choices=[(name, mid) for mid, name in MODEL_CHOICES.items()],
                    value=PHI_MODEL_ID,
                    info="Select the model for transcription",
                )

        text_output = gr.Textbox(label="Transcription Output", lines=5)

        with gr.Row():
            transcribe_button = gr.Button("🎯 Transcribe", variant="primary")
            clear_button = gr.Button("🧹 Clear")

        transcribe_button.click(
            fn=combined_transcription,
            inputs=[audio_input, model_choice],
            outputs=[text_output],
            show_progress=True,
        )
        clear_button.click(
            fn=lambda: (None, ""),
            inputs=[],
            outputs=[audio_input, text_output],
        )

        gr.Examples(
            examples=EXAMPLE_FILES,
            inputs=[audio_input, model_choice],
            outputs=[text_output],
            fn=combined_transcription,
            cache_examples=True,
            cache_mode="lazy",
            run_on_click=True,
        )

        gr.Markdown("### Model Information")
        with gr.Accordion("Model Details", open=False):
            for model_id, model_name in MODEL_CHOICES.items():
                gr.Markdown(
                    f"**{model_name}:** [{model_id}](https://huggingface.co/{model_id})"
                )

    return demo


if __name__ == "__main__":
    # Preload models before starting the app to reduce cold start time
    logger.info("Preloading models to reduce cold start time")
    preload_models()

    demo = create_demo()
    demo.launch(share=False)