# app.py
import os

import gradio as gr
import numpy as np
import torch
from scipy.signal import resample
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Load the fine-tuned Akan Whisper model (pass token= for private models).
hf_token = os.getenv("HF_TOKEN")
model = WhisperForConditionalGeneration.from_pretrained("GiftMark/akan-whisper-model", token=hf_token)
processor = WhisperProcessor.from_pretrained("GiftMark/akan-whisper-model", token=hf_token)
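
# Optional GPU placement — a minimal sketch, not enabled here. It assumes a
# CUDA device is available and that input_features are moved to the same
# device before calling model.generate():
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model.to(device)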
def transcribe(audio):
    """Transcribe a (sampling_rate, samples) tuple from the Gradio Audio input."""
    try:
        if audio is None:
            return "No audio provided."
        sampling_rate, data = audio
        data = np.asarray(data)

        # Gradio's numpy audio is typically integer PCM (e.g. int16); scale it
        # to floats in [-1, 1], which the Whisper feature extractor expects.
        if np.issubdtype(data.dtype, np.integer):
            data = data.astype(np.float32) / np.iinfo(data.dtype).max
        else:
            data = data.astype(np.float32)

        # Ensure mono by keeping the first channel.
        if data.ndim > 1:
            data = data[:, 0]

        # Resample to the 16 kHz rate Whisper expects, if needed.
        target_sr = 16000
        if sampling_rate != target_sr:
            duration = data.shape[0] / sampling_rate
            new_length = int(duration * target_sr)
            data = resample(data, new_length)
            sampling_rate = target_sr

        # Convert the waveform into log-mel input features and decode.
        inputs = processor(
            data, sampling_rate=sampling_rate, return_tensors="pt"
        ).input_features
        with torch.no_grad():
            predicted_ids = model.generate(inputs)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        return transcription
    except Exception as e:
        print("Error during transcription:", e)
        return f"Error: {e}"
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone", "upload"], type="numpy", label="Record or upload Akan audio"),
    outputs=gr.Textbox(label="Transcription"),
    title="Akan Speech-to-Text Demo",
    description="Record or upload Akan audio to test the Whisper ASR model.",
)
demo.launch()
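# To expose a temporary public URL (a standard Gradio option), launch with:
#   demo.launch(share=True)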