Spaces:

GiorgiSekhniashvili
/

geo-whisper

Sleeping

App Files Community

GiorgiSekhniashvili committed on Feb 14

Commit

bb024f6

1 Parent(s): eacd0a8

using gradio

Browse files

Files changed (2) hide show

app.py +63 -20
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -1,31 +1,74 @@
 import gradio as gr
-from transformers.pipelines.audio_utils import ffmpeg_read
-from transformers import WhisperForConditionalGeneration, AutoProcessor
-model_name = "GiorgiSekhniashvili/whisper-tiny-ka-01"
-processor = AutoProcessor.from_pretrained(model_name)
-model = WhisperForConditionalGeneration.from_pretrained(model_name)
-forced_decoder_ids = processor.get_decoder_prompt_ids(
-    language="Georgian", task="transcribe"
 )
-def predict(audio_path):
-    if audio_path:
-        with open(audio_path, "rb") as f:
-            waveform = ffmpeg_read(f.read(), sampling_rate=16_000)
-    input_values = processor(waveform, sampling_rate=16_000, return_tensors="pt")
-    res = model.generate(
-        input_values["input_features"],
-        forced_decoder_ids=forced_decoder_ids,
-        max_new_tokens=448,
     )
-    return processor.batch_decode(res, skip_special_tokens=True)[0]
-mic = gr.Audio(source="microphone", type="filepath", label="Speak here...")
-demo = gr.Interface(predict, mic, "text")
-demo.launch()

+from pathlib import Path
 import gradio as gr
+import torch
+import torchaudio
+from transformers import (
+    WhisperFeatureExtractor,
+    WhisperForConditionalGeneration,
+    WhisperTokenizerFast,
+)
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
+SAMPLING_RATE = 16_000
+model = WhisperForConditionalGeneration.from_pretrained(
+    "../data/jobs/whisper-tiny-ka-09", torch_dtype=DTYPE
+)
+feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
+tokenizer = WhisperTokenizerFast.from_pretrained("openai/whisper-tiny")
+forced_decoder_ids = tokenizer.get_decoder_prompt_ids(
+    language="georgian", task="transcribe"
 )
+def load_audio(audio_path: Path, target_sr: int):
+    waveform, sr = torchaudio.load(audio_path, backend="soundfile")
+    if waveform.shape[0] > 1:
+        waveform = waveform.mean(dim=0, keepdim=True)
+    if sr != target_sr:
+        waveform = torchaudio.functional.resample(
+            waveform, orig_freq=sr, new_freq=target_sr
+        )
+    return waveform
+model.generation_config.forced_decoder_ids = forced_decoder_ids
+model.to(DEVICE)
+def transcribe(audio):
+    try:
+        waveform = load_audio(audio, target_sr=SAMPLING_RATE)
+    except Exception as e:
+        return str(e)
+    input_values = feature_extractor(
+        waveform[0], sampling_rate=SAMPLING_RATE, return_tensors="pt"
     )
+    input_features = input_values.input_features.to(DEVICE, dtype=DTYPE)
+    with torch.no_grad():
+        outputs = model.generate(
+            input_features,
+            forced_decoder_ids=forced_decoder_ids,
+            max_new_tokens=444,
+        )
+    transcriptions = tokenizer.batch_decode(outputs, skip_special_tokens=False)
+    return transcriptions[0]
+iface = gr.Interface(
+    fn=transcribe,
+    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
+    outputs="text",
+    title="Whisper Geo",
+    description="Realtime demo for Georgian speech recognition using a fine-tuned Whisper model.",
+)
+iface.launch()

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
 transformers
 torch
 torchvision
-torchaudio

 transformers
 torch
 torchvision
+torchaudio
+gradio