# Dorjzodovsuren's Space — app.py (commit e0d5db2, verified)
import os
import torch
import spaces
import librosa
import numpy as np
import gradio as gr
from transformers import pipeline
from transformers import WhisperTokenizer
# Tokenizer pinned to Mongolian transcription so the pipeline always decodes
# Mongolian text instead of auto-detecting the spoken language.
tokenizer = WhisperTokenizer.from_pretrained("Dorjzodovsuren/whisper-large-v2-mn", language="Mongolian", task="transcribe")
# device_map="auto" places the model on GPU when one is available; 9 s chunks —
# the author notes this matches the "open whisper" default (TODO confirm).
pipe = pipeline("automatic-speech-recognition", model = "Dorjzodovsuren/whisper-large-v2-mn", tokenizer=tokenizer, device_map="auto", chunk_length_s=9) #same as setting as "open whisper" default
# Seconds a single request may hold the ZeroGPU (overridable via env var).
gpu_timeout = int(os.getenv("GPU_TIMEOUT", 60))
SAMPLE_RATE = 16_000  # Whisper models expect 16 kHz mono input
MAX_DURATION = 120    # cap on seconds of audio loaded per request
MAX_NEW_TOKEN = 255   # decoder token budget per transcription
@spaces.GPU(duration=gpu_timeout)
def transcribe(audio):
    """Transcribe a recorded audio file to Mongolian text.

    Args:
        audio: Filesystem path to the recording supplied by the Gradio
            microphone component, or ``None`` when nothing was recorded.

    Returns:
        The transcription produced by the Whisper pipeline, or an empty
        string when no audio was provided.
    """
    # Gradio passes None when the user submits without recording;
    # librosa.load(None) would raise, so bail out gracefully.
    if audio is None:
        return ""
    # Resample to 16 kHz mono as Whisper expects; cap the duration so one
    # request cannot exceed the GPU time budget.
    waveform, _ = librosa.load(audio, sr=SAMPLE_RATE, mono=True, duration=MAX_DURATION)
    # torch.autocast is the current API; torch.cuda.amp.autocast is deprecated.
    with torch.autocast("cuda"):
        text = pipe(waveform, max_new_tokens=MAX_NEW_TOKEN)["text"]
    return text
with gr.Blocks() as demo:
    # Mongolian blurb describing the model, shown at the top of the page.
    gr.Markdown(
        """OpenAI-ийн Whisper Large-v3 Turbo ашиглан энгийн Монгол хэлний ярианы таних моделиг сургаж та бүхэндээ хүргэж байна.
Энэхүү модель нь ердөө 8000+ жишээнээс бүрдсэн жижиг датасетаар сурсан тул, сургалтын жишээ нэмэгдэхийн хэрээр илүү сайн гүйцэтгэл үзүүлэх боломжтой. Enjoy ;D"""
    )

    # Microphone-only input; the recorded file's path is what transcribe()
    # receives.
    mic = gr.Audio(sources=["microphone"], type="filepath", label="Аудио оролт")
    submit = gr.Button("Илгээх")
    transcript = gr.Textbox(label="Текст хөрвүүлгэ")

    # Wire the button to the GPU-backed transcription function.
    submit.click(transcribe, inputs=mic, outputs=transcript)

demo.launch()