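"""Gradio Space: Uzbek speech-to-text with mrmuminov/whisper-small-uz.

Long recordings are split at detected silences into bounded-length chunks,
each chunk is transcribed separately, and the results are joined.
"""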
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import gradio as gr
from pydub import AudioSegment, silence
import tempfile
import torch
import torchaudio
import os
# ---------------- Config ---------------- #
MODEL_NAME = "mrmuminov/whisper-small-uz"
SAMPLE_RATE = 16000
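# Chunk length bounds: 15-25 s keeps each chunk inside Whisper's 30 s input window.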
MIN_LEN_MS = 15000
MAX_LEN_MS = 25000
SILENCE_THRESH = -40 # in dBFS
# ---------------- Load Model ---------------- #
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device).eval() # set to eval mode
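# Optionally, on CUDA the model could be run in fp16 (model.half()) to cut
# memory use; it is kept in fp32 here for simplicity.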
# ---------------- Chunking Logic ---------------- #
def split_on_silence_with_duration_control(audio, min_len, max_len, silence_thresh):
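    """Split audio into chunks of at most max_len ms, preferring to cut at the
    midpoint of a detected silence that lies at least min_len ms into the chunk."""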
    silences = silence.detect_silence(audio, min_silence_len=500, silence_thresh=silence_thresh)
    silence_midpoints = [(start + end) // 2 for start, end in silences]
    chunks = []
    start = 0
    duration = len(audio)
    while start < duration:
        end = min(start + max_len, duration)
        valid_splits = [s for s in silence_midpoints if start + min_len <= s <= end]
        split_point = valid_splits[-1] if valid_splits else end
        chunk = audio[start:split_point]
        # Avoid zero-length chunks
        if len(chunk) > 0:
            chunks.append(chunk)
        start = split_point
    return chunks
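# Example: a 60 s clip with silences centred near 20 s and 45 s is cut at those
# midpoints, yielding chunks of roughly 20 s, 25 s, and 15 s.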
# ---------------- Transcription ---------------- #
def transcribe(audio_file_path):
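    """Transcribe an audio file: convert to 16 kHz mono, chunk on silences,
    run Whisper on each chunk, and join the chunk transcripts."""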
    audio = AudioSegment.from_file(audio_file_path)
    # Ensure mono and target sample rate
    audio = audio.set_channels(1).set_frame_rate(SAMPLE_RATE)
    chunks = split_on_silence_with_duration_control(
        audio, min_len=MIN_LEN_MS, max_len=MAX_LEN_MS, silence_thresh=SILENCE_THRESH
    )
    results = []
    for chunk in chunks:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmpfile:
            chunk.export(tmpfile.name, format="wav")
            waveform, _ = torchaudio.load(tmpfile.name)
            input_features = processor(
                waveform.squeeze().numpy(),
                sampling_rate=SAMPLE_RATE,
                return_tensors="pt",
            ).input_features.to(device)
            with torch.no_grad():
                # `language` is a generate() argument for Whisper, not a
                # processor argument, so the forced language/task are set here.
                predicted_ids = model.generate(input_features, language="uz", task="transcribe")
            text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            results.append(text)
    return " ".join(results)
# ---------------- Gradio UI ---------------- #
# Defined before the Blocks context; gr.TabbedInterface renders it inside the tab.
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
    outputs=gr.Textbox(label="Transcription"),
)

with gr.Blocks() as demo:
    gr.Markdown(f"### {MODEL_NAME}: Transcribe Uzbek Audio")
    gr.TabbedInterface([file_transcribe], ["Audio File"])

demo.launch()