from transformers import WhisperProcessor, WhisperForConditionalGeneration
import gradio as gr
from pydub import AudioSegment, silence
import tempfile
import torch
import torchaudio
import os

# ---------------- Config ---------------- #
MODEL_NAME = "mrmuminov/whisper-small-uz"
SAMPLE_RATE = 16000
MIN_LEN_MS = 15000  # minimum chunk length in ms
MAX_LEN_MS = 25000  # maximum chunk length in ms, kept under Whisper's 30 s input window
SILENCE_THRESH = -40  # in dBFS

# ---------------- Load Model ---------------- #
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device).eval()  # set to eval mode

# ---------------- Chunking Logic ---------------- #
def split_on_silence_with_duration_control(audio, min_len, max_len, silence_thresh):
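    """Split audio into chunks between min_len and max_len ms, preferring silence midpoints as cut points."""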
    silences = silence.detect_silence(audio, min_silence_len=500, silence_thresh=silence_thresh)
    silence_midpoints = [((start + end) // 2) for start, end in silences]

    chunks = []
    start = 0
    duration = len(audio)

    while start < duration:
        end = min(start + max_len, duration)
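        # Cut at the latest silence midpoint within [start + min_len, end]; otherwise hard-cut at end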
        valid_splits = [s for s in silence_midpoints if start + min_len <= s <= end]
        split_point = valid_splits[-1] if valid_splits else end
        chunk = audio[start:split_point]

        # Avoid zero-length chunks
        if len(chunk) > 0:
            chunks.append(chunk)

        start = split_point

    return chunks

# ---------------- Transcription ---------------- #
def transcribe(audio_file_path):
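    """Transcribe an audio file: convert to 16 kHz mono, chunk on silence, and run Whisper on each chunk."""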
    audio = AudioSegment.from_file(audio_file_path)

    # Ensure mono and target sample rate
    audio = audio.set_channels(1).set_frame_rate(SAMPLE_RATE)

    chunks = split_on_silence_with_duration_control(
        audio, min_len=MIN_LEN_MS, max_len=MAX_LEN_MS, silence_thresh=SILENCE_THRESH
    )

    results = []
    for chunk in chunks:
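        # Export each chunk to a temporary WAV so torchaudio can load it as a tensor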
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmpfile:
            chunk.export(tmpfile.name, format="wav")
            waveform, _ = torchaudio.load(tmpfile.name)
            input_features = processor(
                waveform.squeeze().numpy(),
                sampling_rate=SAMPLE_RATE,
                return_tensors="pt",
            ).input_features.to(device)

            with torch.no_grad():
                # Set the language/task at generation time (the feature extractor ignores them)
                predicted_ids = model.generate(input_features, language="uz", task="transcribe")
                text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
                results.append(text)

    return " ".join(results)

# ---------------- Gradio UI ---------------- #
with gr.Blocks() as demo:
    gr.Markdown(f"### Uzbek Audio Transcription with {MODEL_NAME}")

    file_transcribe = gr.Interface(
        fn=transcribe,
        inputs=gr.Audio(type="filepath", label="Upload Audio"),
        outputs=gr.Textbox(label="Transcription"),
    )

    gr.TabbedInterface([file_transcribe], ["Audio File"])

demo.launch()