mrmuminov committed
Commit 095c040 · verified · 1 Parent(s): 9c507ef

Update app.py

Files changed (1)
  1. app.py +44 -35
app.py CHANGED
@@ -4,45 +4,55 @@ from pydub import AudioSegment, silence
 import tempfile
 import torch
 import torchaudio
+import os
 
+# ---------------- Config ---------------- #
 MODEL_NAME = "mrmuminov/whisper-small-uz"
+SAMPLE_RATE = 16000
+MIN_LEN_MS = 15000
+MAX_LEN_MS = 25000
+SILENCE_THRESH = -40  # in dBFS
 
+# ---------------- Load Model ---------------- #
 processor = WhisperProcessor.from_pretrained(MODEL_NAME)
 model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
-
 device = "cuda" if torch.cuda.is_available() else "cpu"
-model = model.to(device)
+model = model.to(device).eval()  # set to eval mode
 
-def split_on_silence_with_duration_control(audio, min_len, max_len, silence_thresh=-40):
+# ---------------- Chunking Logic ---------------- #
+def split_on_silence_with_duration_control(audio, min_len, max_len, silence_thresh):
     silences = silence.detect_silence(audio, min_silence_len=500, silence_thresh=silence_thresh)
-    silences = [((start + end) // 2) for start, end in silences]
+    silence_midpoints = [((start + end) // 2) for start, end in silences]
 
     chunks = []
     start = 0
-    while start < len(audio):
-        end = min(start + max_len, len(audio))
-        candidates = [s for s in silences if start + min_len <= s <= end]
-        split_point = candidates[-1] if candidates else end
-        chunks.append(audio[start:split_point])
+    duration = len(audio)
+
+    while start < duration:
+        end = min(start + max_len, duration)
+        valid_splits = [s for s in silence_midpoints if start + min_len <= s <= end]
+        split_point = valid_splits[-1] if valid_splits else end
+        chunk = audio[start:split_point]
+
+        # Avoid zero-length chunks
+        if len(chunk) > 0:
+            chunks.append(chunk)
+
         start = split_point
+
     return chunks
 
-def transcribe(audio_file):
-    # Load audio using pydub
-    audio = AudioSegment.from_file(audio_file)
-
-    # Convert to mono and 16kHz if needed
-    if audio.channels > 1:
-        audio = audio.set_channels(1)
-    if audio.frame_rate != 16000:
-        audio = audio.set_frame_rate(16000)
-
-    # Detect silent chunks
+# ---------------- Transcription ---------------- #
+def transcribe(audio_file_path):
+    audio = AudioSegment.from_file(audio_file_path)
+
+    # Ensure mono and target sample rate
+    audio = audio.set_channels(1).set_frame_rate(SAMPLE_RATE)
+
     chunks = split_on_silence_with_duration_control(
-        audio, min_len=15000, max_len=25000, silence_thresh=-40
+        audio, min_len=MIN_LEN_MS, max_len=MAX_LEN_MS, silence_thresh=SILENCE_THRESH
     )
 
-    # Transcribe each chunk
     results = []
     for chunk in chunks:
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmpfile:
@@ -50,29 +60,28 @@ def transcribe(audio_file):
             waveform, _ = torchaudio.load(tmpfile.name)
             input_features = processor(
                 waveform.squeeze().numpy(),
-                sampling_rate=16000,
+                sampling_rate=SAMPLE_RATE,
                 return_tensors="pt",
                 language="uz"
             ).input_features.to(device)
 
             with torch.no_grad():
                 predicted_ids = model.generate(input_features)
-            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-            results.append(transcription)
+            text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+            results.append(text)
 
     return " ".join(results)
 
-demo = gr.Blocks()
+# ---------------- Gradio UI ---------------- #
+with gr.Blocks() as demo:
+    gr.Markdown("### " + MODEL_NAME + " Transcribe Uzbek Audio")
 
-file_transcribe = gr.Interface(
-    fn=transcribe,
-    inputs=gr.Audio(type="filepath", label="Audio file"),
-    outputs="text",
-    title="mrmuminov/whisper-small-uz: Transcribe Audio",
-    description="mrmuminov/whisper-small-uz fine-tuned for Uzbek language",
-)
+    file_transcribe = gr.Interface(
+        fn=transcribe,
+        inputs=gr.Audio(type="filepath", label="Upload Audio"),
+        outputs=gr.Textbox(label="Transcription"),
+    )
 
-with demo:
-    gr.TabbedInterface([file_transcribe], ["Audio file"])
+    gr.TabbedInterface([file_transcribe], ["Audio File"])
 
 demo.launch()
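
For reviewers who want to sanity-check the new duration-controlled splitter in isolation, here is a minimal sketch (not part of the commit) that feeds it a synthetic clip. It assumes `split_on_silence_with_duration_control` is defined as in app.py above; `Sine` and `AudioSegment.silent` are standard pydub APIs, but the tone/gap layout is invented for illustration.

```python
from pydub import AudioSegment
from pydub.generators import Sine

# Three 20 s tones separated by 1 s of silence (62 s total).
tone = Sine(440).to_audio_segment(duration=20_000)
gap = AudioSegment.silent(duration=1_000)
clip = tone + gap + tone + gap + tone

# With min_len=15 s and max_len=25 s, each cut should land on a
# silence midpoint, yielding chunks of roughly 20.5 s, 21 s, 20.5 s.
chunks = split_on_silence_with_duration_control(
    clip, min_len=15_000, max_len=25_000, silence_thresh=-40
)
print([len(c) for c in chunks])  # chunk durations in milliseconds
```

Splitting at silence midpoints keeps cut points away from speech, and the 15-25 s bounds keep every chunk inside Whisper's 30 s input window.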