pashto-asr-v3

Running

File size: 3,176 Bytes

f0d0de7
 
 
 
 
 
796f6f8
f0d0de7
a44887a
f944087
f0d0de7
 
796f6f8
 
 
 
f0d0de7
 
 
 
796f6f8
f0d0de7
 
 
796f6f8
f0d0de7
 
 
fef2eb2
 
 
 
 
f0d0de7
fef2eb2
 
f0d0de7
fef2eb2
 
 
f0d0de7
 
 
796f6f8
f0d0de7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27d10ed
65358db
94f0f78
f0d0de7
 
 
 
 
 
 
 
 
83378e3
f0d0de7
f651cd7
f0d0de7
4d52726
 
cd5e6cd
f0d0de7
21a8dd4
f0d0de7
a4d3cd0
 
f0d0de7
a4d3cd0

import torch

import gradio as gr
import pytube as pt
from transformers import pipeline
from huggingface_hub import model_info
#from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Hub checkpoint loaded by the pipeline below. Other checkpoint names kept for
# reference: "ihanif/pashto-asr-v3", "ihanif/pashto-asr-v5".
MODEL_NAME = "ihanif/whisper-small-tunning-v2"

# Language code; only referenced by the disabled decoder-prompt line below.
lang = "ps"

# Disabled alternative: load a Wav2Vec2 processor/model pair directly.
# processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
# model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)

# Use CUDA device 0 when a GPU is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Speech-recognition pipeline shared by the transcription callbacks.
# The `chunk_length_s=30` option is left disabled.
pipe = pipeline(task="automatic-speech-recognition", model=MODEL_NAME, device=device)

# Disabled: force the decoder prompt to `lang` with the "transcribe" task.
# pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")

def transcribe(microphone, file_upload):
    """Transcribe a single audio clip with the module-level ``pipe``.

    Args:
        microphone: Path of the microphone recording, or ``None`` when the
            microphone was not used.
        file_upload: Path of the uploaded audio file, or ``None`` when
            nothing was uploaded.

    Returns:
        The recognised text, or an error message when neither input was
        supplied.
    """
    # The microphone recording wins when both inputs are present; the upload
    # is only used as a fallback. No warning is emitted for that case.
    audio_path = file_upload if microphone is None else microphone
    if audio_path is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    # Placeholder for a user-facing warning; currently always empty.
    prefix = ""
    result = pipe(audio_path)
    return prefix + result["text"]


def _return_yt_html_embed(yt_url):
    video_id = yt_url.split("?v=")[-1]
    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str


def yt_transcribe(yt_url):
    """Download the audio track of a YouTube video and transcribe it.

    Args:
        yt_url: URL of the YouTube video.

    Returns:
        A ``(html_embed_str, text)`` tuple: the iframe embed snippet for the
        video and the transcription produced by the module-level ``pipe``.
    """
    import tempfile

    yt = pt.YouTube(yt_url)
    html_embed_str = _return_yt_html_embed(yt_url)
    stream = yt.streams.filter(only_audio=True)[0]

    # Download into a per-call temporary directory instead of a fixed
    # "audio.mp3" in the working directory: concurrent requests no longer
    # overwrite each other's audio, and the file is removed afterwards.
    with tempfile.TemporaryDirectory() as tmp_dir:
        audio_path = stream.download(output_path=tmp_dir, filename="audio.mp3")
        text = pipe(audio_path)["text"]

    return html_embed_str, text


# Blocks container; only referenced by the disabled tabbed-interface code at
# the bottom of the file.
demo = gr.Blocks()

# One row per example; each row supplies a value for both audio inputs
# (microphone, upload), in that order.
examples = [
    ["example-1.wav", "example-1.wav"],
    ["example-2.wav", "example-2.wav"],
]

# Microphone/upload transcription UI. Both inputs hand `transcribe` a file
# path (or None when unused).
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        # `gr.Audio` replaces the deprecated `gr.inputs.Audio(..., optional=True)`
        # wrapper; the source/type arguments are unchanged.
        gr.Audio(source="microphone", type="filepath"),
        gr.Audio(source="upload", type="filepath"),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="(Pashto ASR) د پښتو اتوماتیک وینا پیژندنه",
    description=(
        # The paragraph tags were reversed ("</p> ... <p>"); open before close.
        "<p> تاسو کولی شئ یو آډیو فایل اپلوډ کړئ یا په خپل وسیله مایکروفون وکاروئ. مهرباني وکړئ ډاډ ترلاسه کړئ چې تاسو اجازه ورکړې ده</p>"
    ),
    # allow_flagging="never",
    flagging_options=["Transcription is not in Pashto", "Transcription is wrong"],
    examples=examples,
)
mf_transcribe.launch()

#with demo:
#    gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])

#demo.launch(enable_queue=False)