import os
import torch
import spaces
import librosa
import numpy as np
import gradio as gr
from transformers import pipeline
from transformers import WhisperTokenizer
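# Load the tokenizer with Mongolian transcription defaults so generation is forced to
# Mongolian speech-to-text (rather than language auto-detection or translation),
# then build an ASR pipeline around the fine-tuned checkpoint.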
tokenizer = WhisperTokenizer.from_pretrained("Dorjzodovsuren/whisper-large-v2-mn", language="Mongolian", task="transcribe")
pipe = pipeline("automatic-speech-recognition", model="Dorjzodovsuren/whisper-large-v2-mn", tokenizer=tokenizer, device_map="auto", chunk_length_s=9)  # same chunking setting as the "open whisper" default
gpu_timeout = int(os.getenv("GPU_TIMEOUT", 60))
SAMPLE_RATE = 16_000  # Whisper models expect 16 kHz input
MAX_DURATION = 120    # truncate input audio to 2 minutes
MAX_NEW_TOKEN = 255   # cap on generated tokens per request
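# On Hugging Face ZeroGPU Spaces, @spaces.GPU requests a GPU for the duration of each
# transcribe() call, up to gpu_timeout seconds.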
@spaces.GPU(duration=gpu_timeout)
def transcribe(audio):
    # librosa.load resamples to 16 kHz, downmixes to mono, and returns float32,
    # so no further channel or dtype conversion is needed.
    waveform, sampling_rate = librosa.load(audio, sr=SAMPLE_RATE, mono=True, duration=MAX_DURATION)
    # Optional peak normalization, currently disabled:
    # waveform = waveform / np.max(np.abs(waveform))
    with torch.autocast(device_type="cuda"):  # run inference in mixed precision on the GPU
        text = pipe(waveform, max_new_tokens=MAX_NEW_TOKEN)["text"]
    return text
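# Minimal Gradio UI: record audio from the microphone, press the button, and show the transcription.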
with gr.Blocks() as demo:
    # Intro text (Mongolian): "We trained a simple Mongolian speech recognition model using
    # OpenAI's Whisper Large-v3 Turbo and are sharing it with you. Since the model was trained
    # on a small dataset of only 8,000+ examples, it should perform better as more training
    # examples are added. Enjoy ;D"
    gr.Markdown("""OpenAI-ийн Whisper Large-v3 Turbo ашиглан энгийн Монгол хэлний ярианы таних моделиг сургаж та бүхэндээ хүргэж байна.
Энэхүү модель нь ердөө 8000+ жишээнээс бүрдсэн жижиг датасетаар сурсан тул, сургалтын жишээ нэмэгдэхийн хэрээр илүү сайн гүйцэтгэл үзүүлэх боломжтой. Enjoy ;D""")
    audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Аудио оролт")  # "Audio input"
    send_button = gr.Button("Илгээх")  # "Send"
    text_output = gr.Textbox(label="Текст хөрвүүлгэ")  # "Transcription"
    # The send button triggers the transcribe function
    send_button.click(transcribe, inputs=audio_input, outputs=text_output)

demo.launch()