jiuuee committed on
Commit
0deb309
·
verified ·
1 Parent(s): 261e165

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -37
app.py CHANGED
@@ -1,45 +1,30 @@
1
  import gradio as gr
2
- import torch
3
- import sounddevice as sd
4
- import numpy as np
5
  from nemo.collections.asr.models import ASRModel
 
6
 
7
  # Load the NeMo ASR model
8
  model = ASRModel.from_pretrained("nvidia/canary-1b")
9
  model.eval()
10
 
11
- # Load the keyword spotting model
12
- kws_model = torch.hub.load('snakers4/silero-vad', 'silero_vad')
13
-
14
- # Constants
15
- TRIGGER_WORD = "hey alexa"
16
- TRIGGER_DURATION = 2 # Duration to record after trigger word is detected, in seconds
17
- SAMPLE_RATE = 16000 # Sample rate for recording
18
-
19
- def start_recording():
20
- print("Recording started...")
21
- audio = sd.rec(int(TRIGGER_DURATION * SAMPLE_RATE), samplerate=SAMPLE_RATE, channels=1, dtype='float32')
22
- sd.wait()
23
- return audio.flatten()
24
-
25
- def detect_trigger(audio):
26
- # Perform keyword spotting
27
- is_triggered = kws_model(audio, sample_rate=SAMPLE_RATE) >= 0.5
28
- return is_triggered
29
-
30
- def transcribe_triggered():
31
- while True:
32
- print("Listening for trigger word...")
33
- # Start recording
34
- recorded_audio = start_recording()
35
-
36
- # Check if trigger word is detected
37
- is_triggered = detect_trigger(recorded_audio)
38
- if is_triggered:
39
- print("Trigger word detected. Transcribing...")
40
- # Perform speech recognition
41
- transcription = model.transcribe([recorded_audio])
42
- return transcription[0]
43
-
44
- iface = gr.Interface(transcribe_triggered, gr.components.Audio(), "text", title="ASR with NeMo Canary Model (Triggered by 'Hey Alexa')")
45
  iface.launch()
 
import gradio as gr
from nemo.collections.asr.models import ASRModel
import librosa

# Load the NeMo ASR model.
# from_pretrained downloads the "nvidia/canary-1b" checkpoint on first run,
# then caches it locally; eval() switches the model to inference mode
# (disables dropout and other training-only behavior).
model = ASRModel.from_pretrained("nvidia/canary-1b")
model.eval()
9
def preprocess_audio(audio):
    """Convert a Gradio audio payload to 16 kHz mono float samples.

    Parameters
    ----------
    audio : tuple[int, numpy.ndarray]
        The ``(sample_rate, samples)`` pair produced by a numpy-type
        ``gr.Audio`` component; ``samples`` is shaped ``(n,)`` for mono or
        ``(n, channels)`` for multi-channel input.

    Returns
    -------
    numpy.ndarray
        1-D float32 array of mono samples at 16 kHz, ready for the Canary
        ASR model.
    """
    target_sr = 16000
    # BUG FIX: the previous version read a non-existent
    # ``gradio.inputs.Audio.DEFAULT_SAMPLE_RATE`` (NameError: the module is
    # imported as ``gr``, and gradio defines no such attribute). The real
    # source rate arrives in the payload itself, so unpack it instead.
    orig_sr, samples = audio

    # Gradio delivers integer PCM (typically int16) by default; librosa
    # expects floats in [-1, 1], so scale integer input accordingly.
    if samples.dtype.kind in "iu":
        samples = samples.astype("float32") / 32768.0
    else:
        samples = samples.astype("float32")

    # librosa.to_mono expects (channels, n), hence the transpose.
    audio_mono = librosa.to_mono(samples.T)

    # Only resample when the source rate actually differs from the target.
    if orig_sr != target_sr:
        audio_mono = librosa.resample(audio_mono, orig_sr=orig_sr, target_sr=target_sr)
    return audio_mono
14
+
15
def transcribe(audio):
    """Transcribe a Gradio audio input with the NeMo Canary model.

    Parameters
    ----------
    audio : tuple[int, numpy.ndarray] | None
        The ``(sample_rate, samples)`` pair from the Audio component, or
        ``None`` when the user submitted without recording/uploading.

    Returns
    -------
    str
        The transcription of the audio clip.

    Raises
    ------
    gr.Error
        If no audio was provided.
    """
    if audio is None:
        # BUG FIX: gradio has no ``gr.InterfaceError``; ``gr.Error`` is the
        # documented exception for surfacing a message to the user.
        raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")

    # Downmix/resample to the 16 kHz mono format the model expects.
    audio_input = preprocess_audio(audio)

    # Perform speech recognition; transcribe() takes a batch, so wrap the
    # single clip in a list and unwrap the single result.
    transcription = model.transcribe([audio_input])
    return transcription[0]
26
+
27
# Build and launch the web UI.
# BUG FIX: ``gr.inputs.Audio`` is the pre-3.x namespace that modern gradio
# removed; the current component is ``gr.Audio``. ``type="numpy"`` makes the
# component pass ``(sample_rate, samples)`` to the callback, which is the
# payload ``transcribe`` expects.
audio_input = gr.Audio(type="numpy")

iface = gr.Interface(
    fn=transcribe,
    inputs=audio_input,
    outputs="text",
    title="ASR with NeMo Canary Model",
)
iface.launch()