# Hugging Face Spaces status: Running
import gradio as gr | |
import torch | |
from wenet.cli.model import load_model | |
def process_cat_embs(cat_embs, device="cpu"):
    """Convert style-conditioning weights into a 1-D float tensor.

    Args:
        cat_embs: Either a comma-separated string of numbers (for example
            ``"1,0"``) or an iterable of numbers / numeric strings.
        device: Torch device the resulting tensor is moved to. Defaults to
            ``"cpu"``, the value that used to be hard-coded here.

    Returns:
        A 1-D ``torch.Tensor`` with one float per supplied value.

    Raises:
        ValueError: If any element cannot be parsed by ``float()`` (this
            includes an empty string, which splits into one empty token).
    """
    # Strings are split on commas; any other iterable is consumed as-is.
    values = cat_embs.split(",") if isinstance(cat_embs, str) else cat_embs
    return torch.tensor([float(value) for value in values]).to(device)
def download_rev_models():
    """Fetch the Reverb ASR artifacts from the Hugging Face Hub and load them.

    Requests ``reverb_asr_v1.jit.zip`` and ``tk.units.txt`` from the
    ``Revai/reverb-asr`` repository and hands the two values returned by
    ``hf_hub_download`` to ``load_model`` in that order.

    Returns:
        Whatever ``wenet.cli.model.load_model`` returns for those two files.
    """
    # Kept as a function-local import, as in the original. The previously
    # unused ``import joblib`` was dropped: nothing here referenced it, and it
    # made this function fail whenever joblib was not installed.
    from huggingface_hub import hf_hub_download

    repo_id = "Revai/reverb-asr"
    model_path, units_path = (
        hf_hub_download(repo_id=repo_id, filename=filename)
        for filename in ("reverb_asr_v1.jit.zip", "tk.units.txt")
    )
    return load_model(model_path, units_path)
# Load the ASR model once at import time; recognition() reads this
# module-level global on every request.
model = download_rev_models()
def recognition(audio, style=0):
    """Transcribe one audio input with the module-level ``model``.

    Args:
        audio: The audio to transcribe, forwarded unchanged to
            ``model.transcribe``; ``None`` when no audio was supplied.
        style: Transcription-style weight. The pair ``(style, 1 - style)``
            is sent to the model as ``cat_embs``.

    Returns:
        The transcript text, or a human-readable error message when no audio
        was given or the model returned ``None``.
    """
    if audio is None:
        return "Input Error! Please enter one audio!"

    # Serialise the two style weights as "style,1-style" for process_cat_embs.
    cat_embs = process_cat_embs(f"{style},{1 - style}")
    ans = model.transcribe(audio, cat_embs=cat_embs)
    if ans is None:
        return "ERROR! No text output! Please try again!"

    # Turn the word-boundary marker U+2581 into a plain space. The previous
    # literal here was the Greek letter beta, which looks like a mis-encoded
    # copy of that marker. Written as an escape so it survives re-encoding.
    # NOTE(review): confirm against the model's token units.
    return ans["text"].replace("\u2581", " ")
# UI inputs: a microphone recording (handed to `recognition` as a file path)
# and a slider that selects the transcription style passed as `style`.
# `gr.Audio` replaces the deprecated `gr.inputs.Audio` namespace so the
# components are created the same way as `gr.Slider` below.
# NOTE(review): `source=` is the Gradio 3.x keyword; confirm the pinned version.
inputs = [
    gr.Audio(source="microphone", type="filepath", label="Input audio"),
    gr.Slider(
        0,
        1,
        value=0,
        label="Verbatimicity - from non-verbatim (0) to verbatim (1)",
        info="Choose a transcription style between non-verbatim and verbatim",
    ),
]

# UI output: the transcript (or error message) returned by `recognition`.
output = gr.Textbox(label="Output Text")

# Page title.
text = "ASR Transcription Opensource Demo"

# Page description. Built from adjacent literals with explicit newlines: the
# previous single-quoted string spanned several source lines, which is not
# valid Python. The duplicated "word to word-to-word" wording was corrected.
description = (
    " Opensource Automatic Speech Recognition in English\n"
    "Verbatim Transcript style(1) refers to word-to-word transcription of an audio\n"
    "Non Verbatim Transcript style(0) refers to just conserving the message of the original audio\n"
)

interface = gr.Interface(
    fn=recognition,
    inputs=inputs,
    outputs=output,
    title=text,
    description=description,
    theme="huggingface",
)

# `.queue()` is the replacement for the deprecated `launch(enable_queue=True)`.
interface.queue().launch()