# app.py
import os

import gradio as gr
import numpy as np
import torch
from scipy.signal import resample
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Load the fine-tuned Akan Whisper model and processor
# (HF_TOKEN is only needed if the model repository is private)
hf_token = os.getenv("HF_TOKEN")
model = WhisperForConditionalGeneration.from_pretrained("GiftMark/akan-whisper-model", token=hf_token)
processor = WhisperProcessor.from_pretrained("GiftMark/akan-whisper-model", token=hf_token)
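# Gradio's Audio component with type="numpy" passes the callback a
# (sample_rate, data) tuple, where data is a NumPy array of samples
# (typically int16 for recordings and uploads).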
def transcribe(audio):
    try:
        if audio is None:
            return "No audio provided."
        sampling_rate, data = audio
        data = np.array(data).astype(np.float32)
        # Ensure mono: keep only the first channel of multi-channel audio
        if data.ndim > 1:
            data = data[:, 0]
        # Scale integer PCM (e.g. int16) into the [-1, 1] float range Whisper expects
        peak = np.abs(data).max()
        if peak > 1.0:
            data = data / peak
        # Resample to 16 kHz if needed
        target_sr = 16000
        if sampling_rate != target_sr:
            # Calculate number of samples after resampling
            duration = data.shape[0] / sampling_rate
            new_length = int(duration * target_sr)
            data = resample(data, new_length)
            sampling_rate = target_sr
        inputs = processor(
            data, sampling_rate=sampling_rate, return_tensors="pt"
        ).input_features
        with torch.no_grad():
            predicted_ids = model.generate(inputs)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        return transcription
    except Exception as e:
        print("Error during transcription:", e)
        return f"Error: {e}"
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone", "upload"], type="numpy", label="Record or upload Akan audio"),
    outputs=gr.Textbox(label="Transcription"),
    title="Akan Speech-to-Text Demo",
    description="Record or upload Akan audio to test the Whisper ASR model.",
)

demo.launch()
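# When run locally with `python app.py`, Gradio serves the demo on its default
# port (7860); on Hugging Face Spaces, the launched demo is served automatically.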