from transformers import WhisperProcessor, WhisperForConditionalGeneration
import gradio as gr
from pydub import AudioSegment, silence
import tempfile
import torch
import torchaudio

# ---------------- Config ---------------- #
MODEL_NAME = "mrmuminov/whisper-small-uz"
SAMPLE_RATE = 16000
MIN_LEN_MS = 15000
MAX_LEN_MS = 25000
SILENCE_THRESH = -40 # in dBFS
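# Note: Whisper's encoder consumes audio in 30-second windows, so the
# 15-25 s chunk window above stays safely below that limit while avoiding
# very short fragments.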

# ---------------- Load Model ---------------- #
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device).eval() # set to eval mode

# ---------------- Chunking Logic ---------------- #
def split_on_silence_with_duration_control(audio, min_len, max_len, silence_thresh):
    # Candidate split points are the midpoints of detected silences
    # (stretches of at least 500 ms below silence_thresh).
    silences = silence.detect_silence(audio, min_silence_len=500, silence_thresh=silence_thresh)
    silence_midpoints = [(start + end) // 2 for start, end in silences]
    chunks = []
    start = 0
    duration = len(audio)
    while start < duration:
        end = min(start + max_len, duration)
        # Prefer the latest silence midpoint that keeps the chunk at least
        # min_len long; otherwise cut hard at max_len (or at end of audio).
        valid_splits = [s for s in silence_midpoints if start + min_len <= s <= end]
        split_point = valid_splits[-1] if valid_splits else end
        chunk = audio[start:split_point]
        # Avoid zero-length chunks
        if len(chunk) > 0:
            chunks.append(chunk)
        start = split_point
    return chunks
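
# Usage sketch (hypothetical file name), just to illustrate the contract:
# every chunk is at most MAX_LEN_MS long, and only the final chunk may be
# shorter than MIN_LEN_MS.
#
#   seg = AudioSegment.from_file("sample.wav")
#   parts = split_on_silence_with_duration_control(
#       seg, MIN_LEN_MS, MAX_LEN_MS, SILENCE_THRESH
#   )
#   assert all(len(p) <= MAX_LEN_MS for p in parts)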

# ---------------- Transcription ---------------- #
def transcribe(audio_file_path):
    audio = AudioSegment.from_file(audio_file_path)
    # Ensure mono and target sample rate
    audio = audio.set_channels(1).set_frame_rate(SAMPLE_RATE)
    chunks = split_on_silence_with_duration_control(
        audio, min_len=MIN_LEN_MS, max_len=MAX_LEN_MS, silence_thresh=SILENCE_THRESH
    )
    results = []
    for chunk in chunks:
        # Round-trip each chunk through a temporary WAV file so torchaudio
        # can load it as a tensor.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmpfile:
            chunk.export(tmpfile.name, format="wav")
            waveform, _ = torchaudio.load(tmpfile.name)
        input_features = processor(
            waveform.squeeze().numpy(),
            sampling_rate=SAMPLE_RATE,
            return_tensors="pt",
        ).input_features.to(device)
        with torch.no_grad():
            # The target language is a decoding option, so it belongs to
            # generate() rather than to the feature extractor.
            predicted_ids = model.generate(input_features, language="uz", task="transcribe")
        text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        results.append(text)
    return " ".join(results)
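
# Direct invocation sketch (hypothetical path), handy for testing the
# pipeline without the UI:
#
#   print(transcribe("example.mp3"))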

# ---------------- Gradio UI ---------------- #
# Build the Interface outside the Blocks context so its components are
# rendered only once, when TabbedInterface mounts it below.
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
    outputs=gr.Textbox(label="Transcription"),
)

with gr.Blocks() as demo:
    gr.Markdown("### " + MODEL_NAME + ": Transcribe Uzbek Audio")
    gr.TabbedInterface([file_transcribe], ["Audio File"])

demo.launch()