Spaces:

indonesian-nlp
/

luganda-asr

Runtime error

App Files Community

cahya committed on Jan 19, 2022

Commit

37c396e

1 Parent(s): dbebbd9

add KenLM

Browse files

Files changed (3) hide show

5gram.bin +3 -0
app.py +28 -3
requirements.txt +3 -1

5gram.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:46e982596dbb0c7c225dd9b88ef89c733ba6d718befc3c3b833b1daddc60816a
+size 11939611

app.py CHANGED Viewed

@@ -1,11 +1,35 @@
 import soundfile as sf
 import torch
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import gradio as gr
 import sox
 import os
 def convert(inputfile, outfile):
     sox_tfm = sox.Transformer()
     sox_tfm.set_output_format(
@@ -18,6 +42,7 @@ api_token = os.getenv("API_TOKEN")
 model_name = "indonesian-nlp/wav2vec2-luganda"
 processor = Wav2Vec2Processor.from_pretrained(model_name, use_auth_token=api_token)
 model = Wav2Vec2ForCTC.from_pretrained(model_name, use_auth_token=api_token)
 def parse_transcription(wav_file):
@@ -25,9 +50,9 @@ def parse_transcription(wav_file):
     convert(wav_file.name, filename + "16k.wav")
     speech, _ = sf.read(filename + "16k.wav")
     input_values = processor(speech, sampling_rate=16_000, return_tensors="pt").input_values
-    logits = model(input_values).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
     return transcription

 import soundfile as sf
 import torch
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+from pyctcdecode import build_ctcdecoder
 import gradio as gr
 import sox
 import os
+from multiprocessing import Pool
+class KenLM:
+    """CTC decoder that rescores hypotheses with a KenLM language model.
+
+    Wraps the decoder returned by ``pyctcdecode.build_ctcdecoder`` and
+    cleans up its output with ``lm_postprocess``.
+    """
+
+    def __init__(self, tokenizer, model_name, num_workers=8, beam_width=128):
+        """Build the decoder from the tokenizer vocabulary and an LM file.
+
+        Args:
+            tokenizer: Object whose ``get_vocab()`` returns a mapping of
+                token to integer id.
+            model_name: Second argument given to ``build_ctcdecoder``
+                (the caller in this file passes the KenLM file "5gram.bin").
+            num_workers: Number of processes in the pool used by ``decode``.
+            beam_width: Beam width forwarded to the decoder in ``decode``.
+        """
+        self.num_workers = num_workers
+        self.beam_width = beam_width
+        vocab_dict = tokenizer.get_vocab()
+        # Order the tokens by their id so list position matches the id.
+        self.vocabulary = [
+            token for token, _ in sorted(vocab_dict.items(), key=lambda item: item[1])
+        ]
+        # Workaround for a vocabulary-size mismatch: drop the last two entries.
+        # NOTE(review): presumably the tokenizer lists two more tokens than the
+        # model emits logits for - confirm against the model config.
+        self.vocabulary = self.vocabulary[:-2]
+        self.decoder = build_ctcdecoder(self.vocabulary, model_name)
+
+    @staticmethod
+    def lm_postprocess(text):
+        """Return ``text`` without one-character words, joined by single spaces.
+
+        Filtering (instead of substituting an empty string) avoids the double
+        space that was left behind when an interior word was dropped.
+        """
+        return ' '.join(word for word in text.split() if len(word) > 1)
+
+    def decode(self, logits):
+        """Decode a batch of logits into post-processed transcriptions.
+
+        Args:
+            logits: Tensor supporting ``.cpu().numpy()``; assumed to be
+                shaped (batch, time, vocab) - TODO confirm.
+
+        Returns:
+            A list with one cleaned-up string per batch item.
+        """
+        probs = logits.cpu().numpy()
+        with Pool(self.num_workers) as pool:
+            # Forward the configured beam width; it was previously stored
+            # on the instance but never used.
+            text = self.decoder.decode_batch(pool, probs, beam_width=self.beam_width)
+        return [KenLM.lm_postprocess(x) for x in text]
 def convert(inputfile, outfile):
     sox_tfm = sox.Transformer()
     sox_tfm.set_output_format(
 model_name = "indonesian-nlp/wav2vec2-luganda"
 processor = Wav2Vec2Processor.from_pretrained(model_name, use_auth_token=api_token)
 model = Wav2Vec2ForCTC.from_pretrained(model_name, use_auth_token=api_token)
+kenlm = KenLM(processor.tokenizer, "5gram.bin")
 def parse_transcription(wav_file):
     convert(wav_file.name, filename + "16k.wav")
     speech, _ = sf.read(filename + "16k.wav")
     input_values = processor(speech, sampling_rate=16_000, return_tensors="pt").input_values
+    with torch.no_grad():
+        logits = model(input_values).logits
+    transcription = kenlm.decode(logits)[0]
     return transcription

requirements.txt CHANGED Viewed

@@ -3,4 +3,6 @@ soundfile
 torch
 transformers
 sox
-sentencepiece

 torch
 transformers
 sox
+sentencepiece
+pyctcdecode==0.3.0
+kenlm @ https://github.com/kpu/kenlm/archive/master.zip