Spaces: Runtime error
added app
- app.py +44 -0
- pipeline.py +66 -0
- requirements.txt +6 -0
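
Together the three files make up a minimal Gradio speech-recognition Space: app.py wires a microphone input to the wav2vec2 CTC pipeline defined in pipeline.py, which decodes the acoustic model's logits with a KenLM language model pulled from the ales/wav2vec2-cv-be model repo; requirements.txt lists the Python dependencies.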
app.py
ADDED
@@ -0,0 +1,44 @@
from typing import Tuple

import numpy as np

import torch
from torchaudio.transforms import Resample

from huggingface_hub import hf_hub_download

import gradio as gr

from pipeline import PreTrainedPipeline


HF_HUB_URL = 'ales/wav2vec2-cv-be'
LM_HUB_FP = 'language_model/cv8be_5gram.bin'


def main(rate_audio_tuple: Tuple[int, np.ndarray]):
    sampling_rate, audio = rate_audio_tuple

    # resample audio to 16 kHz (cast to float32 first: Gradio records int16 samples)
    resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
    audio_resampled = resampler(torch.tensor(audio, dtype=torch.float32)).numpy().flatten()

    # download Language Model from HF Hub (cached after the first call)
    lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)

    # init pipeline
    pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)

    # recognize speech
    text_recognized = pipeline(inputs=audio_resampled)['text'][0]

    return text_recognized


iface = gr.Interface(
    fn=main,
    inputs='microphone',
    outputs="text"
)

iface.launch()
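
One thing worth flagging for anyone reusing app.py: `main` re-downloads the language model (a cached no-op after the first call) and rebuilds the entire wav2vec2 pipeline on every microphone submission. Since nothing in the pipeline depends on the request, the usual Gradio pattern is to initialize once at import time and keep only the resampling per call. A minimal sketch of that variant, reusing the names above (an illustration, not part of the commit):

from typing import Tuple

import numpy as np
import torch
from torchaudio.transforms import Resample
from huggingface_hub import hf_hub_download

from pipeline import PreTrainedPipeline

HF_HUB_URL = 'ales/wav2vec2-cv-be'
LM_HUB_FP = 'language_model/cv8be_5gram.bin'

# download the LM and build the pipeline once, at startup
lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
asr_pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)

def main(rate_audio_tuple: Tuple[int, np.ndarray]) -> str:
    sampling_rate, audio = rate_audio_tuple
    # per-request work is now just resampling plus inference
    resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
    audio_16k = resampler(torch.tensor(audio, dtype=torch.float32)).numpy().flatten()
    return asr_pipeline(inputs=audio_16k)['text'][0]

This should cut per-request latency to inference alone instead of model-load time plus inference.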
pipeline.py
ADDED
@@ -0,0 +1,66 @@
import numpy as np

from typing import Dict, List

import torch
import pyctcdecode

from transformers import (
    Wav2Vec2Processor,
    Wav2Vec2ProcessorWithLM,
    Wav2Vec2ForCTC,
)


class PreTrainedPipeline:

    def __init__(self, model_path: str, language_model_fp: str):
        self.language_model_fp = language_model_fp

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = Wav2Vec2ForCTC.from_pretrained(model_path)
        self.model.to(self.device)

        processor = Wav2Vec2Processor.from_pretrained(model_path)
        self.sampling_rate = processor.feature_extractor.sampling_rate

        vocab = processor.tokenizer.get_vocab()
        sorted_vocab = sorted(vocab.items(), key=lambda item: item[1])

        self.decoder = pyctcdecode.build_ctcdecoder(
            labels=[token for token, _ in sorted_vocab],
            kenlm_model_path=self.language_model_fp,
        )

        self.processor_with_lm = Wav2Vec2ProcessorWithLM(
            feature_extractor=processor.feature_extractor,
            tokenizer=processor.tokenizer,
            decoder=self.decoder
        )

    def __call__(self, inputs: np.ndarray) -> Dict[str, List[str]]:
        """
        Args:
            inputs (:obj:`np.ndarray`):
                The raw waveform of the received audio, sampled at 16 kHz.
        Return:
            A :obj:`dict` like ``{"text": ["XXX"]}`` containing the text
            detected in the input audio.
        """

        input_values = self.processor_with_lm(
            inputs, return_tensors="pt",
            sampling_rate=self.sampling_rate
        )['input_values']

        with torch.no_grad():
            # input_values is a (1, num_samples) float tensor; move it to the model's device
            input_values = input_values.to(self.device)
            model_outs = self.model(input_values)
            logits = model_outs.logits.cpu().numpy()

        text_predicted = self.processor_with_lm.batch_decode(logits)['text']

        return {
            "text": text_predicted
        }
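
Because pipeline.py has no Gradio dependency, the recognizer can be smoke-tested on its own. A minimal sketch, assuming the same model repo and LM path that app.py uses, fed with one second of silence at the model's 16 kHz rate:

import numpy as np
from huggingface_hub import hf_hub_download

from pipeline import PreTrainedPipeline

lm_fp = hf_hub_download(repo_id='ales/wav2vec2-cv-be',
                        filename='language_model/cv8be_5gram.bin')
pipe = PreTrainedPipeline(model_path='ales/wav2vec2-cv-be', language_model_fp=lm_fp)

waveform = np.zeros(16_000, dtype=np.float32)  # one second of silence at 16 kHz
print(pipe(inputs=waveform))  # a dict holding a one-element list, e.g. {"text": [""]}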
requirements.txt
ADDED
@@ -0,0 +1,6 @@
torch
torchaudio
transformers==4.17.0
pyctcdecode==0.3.0
numpy
https://github.com/kpu/kenlm/archive/master.zip