jiuuee committed
Commit c5a564e · verified · 1 Parent(s): 2482c73

Update app.py

Files changed (1):
  app.py (+26 -18)
app.py CHANGED

```diff
@@ -5,6 +5,7 @@ import os
 import soundfile as sf
 import tempfile
 import uuid
+
 import torch
 
 from nemo.collections.asr.models import ASRModel
@@ -12,7 +13,7 @@ from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
 from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
 
 SAMPLE_RATE = 16000 # Hz
-MAX_AUDIO_MINUTES = 1 # won't try to transcribe if longer than this
+MAX_AUDIO_MINUTES = 10 # won't try to transcribe if longer than this
 
 model = ASRModel.from_pretrained("nvidia/canary-1b")
 model.eval()
@@ -40,11 +41,6 @@ frame_asr = FrameBatchMultiTaskAED(
 amp_dtype = torch.float16
 
 def convert_audio(audio_filepath, tmpdir, utt_id):
-    """
-    Convert all files to monochannel 16 kHz wav files.
-    Do not convert and raise error if audio too long.
-    Returns output filename and duration.
-    """
 
     data, sr = librosa.load(audio_filepath, sr=None, mono=True)
 
@@ -68,7 +64,7 @@ def convert_audio(audio_filepath, tmpdir, utt_id):
     return out_filename, duration
 
 
-def transcribe(audio_filepath, src_lang, tgt_lang, pnc):
+def transcribe(audio_filepath):
 
     if audio_filepath is None:
         raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
@@ -76,8 +72,8 @@ def transcribe(audio_filepath, src_lang, tgt_lang, pnc):
     utt_id = uuid.uuid4()
     with tempfile.TemporaryDirectory() as tmpdir:
         converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))
-
-        # make manifest file and save
+
+        # make manifest file and save
         manifest_data = {
             "audio_filepath": converted_audio_filepath,
             "source_lang": "en",
@@ -112,9 +108,9 @@ def transcribe(audio_filepath, src_lang, tgt_lang, pnc):
     output_text = hyps[0].text
 
     return output_text
-
+
 with gr.Blocks(
-    title="myAlexa",
+    title="NeMo Canary Model",
     css="""
     textarea { font-size: 18px;}
     #model_output_text_box span {
@@ -125,20 +121,28 @@ with gr.Blocks(
     theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg) # make text slightly bigger (default is text_md )
 ) as demo:
 
-    gr.HTML("<h1 style='text-align: center'>Your amazing AI assistant</h1>")
+    gr.HTML("<h1 style='text-align: center'>NeMo Canary model: Transcribe & Translate audio</h1>")
 
     with gr.Row():
         with gr.Column():
             gr.HTML(
-                "<p><b>Step 1:</b> Record with your microphone.</p>"
-            )
-            audio_file = gr.Audio(sources=["microphone"], type="filepath")
+                "<p><b>Step 1:</b> Upload an audio file or record with your microphone.</p>"
+
+                "<p style='color: #A0A0A0;'>This demo supports audio files up to 10 mins long. "
+                "You can transcribe longer files locally with this NeMo "
+                "<a href='https://github.com/NVIDIA/NeMo/blob/main/examples/asr/speech_multitask/speech_to_text_aed_chunked_infer.py'>script</a>.</p>"
+            )
+
+            audio_file = gr.Audio(sources=["microphone", "upload"], type="filepath")
 
+
         with gr.Column():
 
+            gr.HTML("<p><b>Step 2:</b> Run the model.</p>")
+
             go_button = gr.Button(
-                value="Transcribe",
+                value="Run model",
                 variant="primary", # make "primary" so it stands out (default is "secondary")
             )
 
@@ -146,12 +150,16 @@ with gr.Blocks(
         label="Model Output",
         elem_id="model_output_text_box",
     )
-
+
+
    go_button.click(
        fn=transcribe,
        inputs = [audio_file],
        outputs = [model_output_text_box]
    )
 
+
 demo.queue()
-demo.launch()
+demo.launch()
+
+
```
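The `@@ -40,11 +41,6 @@` hunk deletes `convert_audio`'s docstring, which documented the function's contract: output mono 16 kHz wav, raise an error if the audio is too long, return the output filename and duration. The diff only shows the function's first and last lines, so the following is a minimal sketch of a body satisfying that contract; the duration check, resampling step, and units are assumptions derived from the removed docstring and the `SAMPLE_RATE` / `MAX_AUDIO_MINUTES` constants, not code from the commit.

```python
import os

import librosa
import soundfile as sf

SAMPLE_RATE = 16000     # Hz, matches the constant in app.py
MAX_AUDIO_MINUTES = 10  # raised from 1 in this commit

def convert_audio(audio_filepath, tmpdir, utt_id):
    """Convert input audio to a mono 16 kHz wav file.

    Raise an error (rather than convert) if the audio is too long.
    Return the output filename and duration.
    """
    data, sr = librosa.load(audio_filepath, sr=None, mono=True)

    duration = librosa.get_duration(y=data, sr=sr)  # seconds (assumption)
    if duration / 60.0 > MAX_AUDIO_MINUTES:
        raise ValueError(f"Audio is longer than {MAX_AUDIO_MINUTES} minutes")

    # resample to the model's expected rate if needed (assumed step)
    if sr != SAMPLE_RATE:
        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)

    out_filename = os.path.join(tmpdir, f"{utt_id}.wav")
    sf.write(out_filename, data, SAMPLE_RATE)
    return out_filename, duration
```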
 
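Dropping `src_lang`, `tgt_lang`, and `pnc` from `transcribe`'s signature is what lets `go_button.click` pass `inputs=[audio_file]` alone: Gradio supplies one positional argument per component listed in `inputs`. A stripped-down sketch of that wiring, with the model call stubbed out:

```python
import gradio as gr

def transcribe(audio_filepath):
    # language and punctuation settings are now fixed inside the function
    # (the manifest hardcodes "source_lang": "en"), so audio is the only input
    if audio_filepath is None:
        raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
    return f"(stub) would transcribe {audio_filepath}"  # model call omitted

with gr.Blocks(title="NeMo Canary Model") as demo:
    audio_file = gr.Audio(sources=["microphone", "upload"], type="filepath")
    go_button = gr.Button(value="Run model", variant="primary")
    model_output_text_box = gr.Textbox(label="Model Output", elem_id="model_output_text_box")

    # one component in `inputs` maps to the single parameter of transcribe()
    go_button.click(fn=transcribe, inputs=[audio_file], outputs=[model_output_text_box])

demo.queue()
demo.launch()
```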