"""Streamlit app: transcribe an uploaded audio file with a fine-tuned Whisper model."""

from io import BytesIO

import streamlit as st
import torchaudio
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Hugging Face Hub id of the fine-tuned checkpoint used for both the
# processor and the model.
MODEL_ID = "233-Yorozuya/dl_twi_asr"

# Sample rate the audio is resampled to and that is declared to the processor.
TARGET_SAMPLE_RATE = 16000


@st.cache_resource
def load_model():
    """Load the Whisper processor and model once per Streamlit server process.

    ``st.cache_resource`` keeps the returned objects alive across reruns, so
    the (slow) download/initialisation only happens on the first call.

    Returns:
        A ``(processor, model)`` tuple.
    """
    processor = WhisperProcessor.from_pretrained(MODEL_ID)
    model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
    # Clear forced decoder ids stored in the model config so generation is not
    # pinned to a preset language/task prompt. Done here, once, rather than on
    # every upload because the model object is shared through the cache.
    # NOTE(review): whether this setting is honoured depends on the installed
    # transformers version (newer releases read the generation config) - confirm.
    model.config.forced_decoder_ids = None
    return processor, model


def _load_mono_waveform(raw_bytes):
    """Decode *raw_bytes* into a mono NumPy waveform at ``TARGET_SAMPLE_RATE``.

    Args:
        raw_bytes: Encoded audio file contents (e.g. wav/mp3/ogg).

    Returns:
        A 1-D NumPy array of samples at ``TARGET_SAMPLE_RATE``.
    """
    # torchaudio returns the waveform as (channels, frames) by default.
    waveform, source_rate = torchaudio.load(BytesIO(raw_bytes))
    # Average all channels instead of keeping only the first one, so stereo
    # recordings do not lose the content of the other channel(s). For mono
    # input this is a no-op.
    mono = waveform.mean(dim=0, keepdim=True)
    resampler = torchaudio.transforms.Resample(
        orig_freq=source_rate,
        new_freq=TARGET_SAMPLE_RATE,
    )
    return resampler(mono)[0].numpy()


processor, model = load_model()

st.title("ASR with Fine-Tuned Whisper")
st.write("Upload an audio file for transcription:")

# File upload
audio_file = st.file_uploader("Choose an audio file", type=["wav", "mp3", "ogg"])

if audio_file:
    # Top-level UI boundary: any decoding/inference failure is reported to the
    # user in the page instead of crashing the script run.
    try:
        # getvalue() returns the whole upload regardless of the current
        # stream position, unlike read().
        samples = _load_mono_waveform(audio_file.getvalue())

        # Preprocess the audio into model input features.
        inputs = processor(
            samples,
            sampling_rate=TARGET_SAMPLE_RATE,
            return_tensors="pt",
        )

        # Perform inference
        with st.spinner("Transcribing..."):
            predicted_ids = model.generate(inputs.input_features)

        transcription = processor.batch_decode(
            predicted_ids,
            skip_special_tokens=True,
        )[0]

        # Display result
        st.subheader("Transcription")
        st.write(transcription)
    except Exception as e:
        st.error(f"An error occurred: {e}")