Vaibhav Srivastav committed on
Commit 3b8d409 · 1 Parent(s): d32240b

removing unused code

Files changed (1)
  1. app.py +9 -12
app.py CHANGED
@@ -12,10 +12,10 @@ model_name = "facebook/wav2vec2-base-960h"
 processor = Wav2Vec2Processor.from_pretrained(model_name)
 model = Wav2Vec2ForCTC.from_pretrained(model_name)
 
-def load_data(input_file):
+def load_and_fix_data(input_file):
     #read the file
     speech, sample_rate = librosa.load(input_file)
-    #make it 1-D
+    #make it 1D
     if len(speech.shape) > 1:
         speech = speech[:,0] + speech[:,1]
     #resampling to 16KHz
@@ -29,26 +29,23 @@ def fix_transcription_casing(input_sentence):
     return (' '.join([s.replace(s[0],s[0].capitalize(),1) for s in sentences]))
 
 def predict_and_decode(input_file):
-    speech = load_data(input_file)
-    #tokenize
+    speech = load_and_fix_data(input_file)
+
     input_values = processor(speech, return_tensors="pt", sampling_rate=16000).input_values
     logits = model(input_values).logits.cpu().detach().numpy()[0]
-    vocab_list = list(processor.tokenizer.get_vocab().keys())
-    # #Take argmax
-    # predicted_ids = torch.argmax(logits, dim=-1)
-    # #Get the words from predicted word ids
-    # transcription = tokenizer.decode(predicted_ids[0])
+
+    vocab_list = list(processor.tokenizer.get_vocab().keys())
     decoder = build_ctcdecoder(vocab_list)
     pred = decoder.decode(logits)
 
-    #Output is all upper case
     transcribed_text = fix_transcription_casing(pred.lower())
+
     return transcribed_text
 
 gr.Interface(predict_and_decode,
-             inputs = gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker"),
+             inputs = gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Record/ Drop audio"),
              outputs = gr.outputs.Textbox(label="Output Text"),
              title="ASR using Wav2Vec 2.0 & pyctcdecode",
-             description = "Wav2Vec2 in-action",
+             description = "Extending HF ASR models with pyctcdecode decoder",
              layout = "horizontal",
              examples = [["test.wav"]], theme="huggingface").launch()
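For context, a minimal standalone sketch of the decoding path this commit keeps: load the audio, run Wav2Vec2, then beam-search decode the CTC logits with pyctcdecode instead of the per-frame argmax deleted above. This is a sketch, not the app verbatim: it folds load_and_fix_data's resampling into librosa.load via sr=16000/mono=True, assumes the imports app.py pulls in at the top of the file, and uses test.wav in place of the microphone input.

import librosa
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from pyctcdecode import build_ctcdecoder

model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

# load the clip mono and resampled to the 16 kHz the model expects
speech, _ = librosa.load("test.wav", sr=16000, mono=True)

input_values = processor(speech, return_tensors="pt", sampling_rate=16000).input_values
with torch.no_grad():
    logits = model(input_values).logits[0].numpy()  # (time, vocab) matrix

# beam-search decode over the logits; the tokenizer's vocabulary
# supplies the label set for the decoder
vocab_list = list(processor.tokenizer.get_vocab().keys())
decoder = build_ctcdecoder(vocab_list)
print(decoder.decode(logits))

One practical payoff of keeping the pyctcdecode path: build_ctcdecoder also accepts a KenLM language model path, so the same decode call can later be upgraded to LM-fused beam search, which a plain argmax over the logits cannot do.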