jiuuee committed on
Commit
0deb309
·
verified ·
1 Parent(s): 261e165

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -37
app.py CHANGED
@@ -1,45 +1,30 @@
1
  import gradio as gr
2
- import torch
3
- import sounddevice as sd
4
- import numpy as np
5
  from nemo.collections.asr.models import ASRModel
 
6
 
7
  # Load the NeMo ASR model
8
  model = ASRModel.from_pretrained("nvidia/canary-1b")
9
  model.eval()
10
 
11
- # Load the keyword spotting model
12
- kws_model = torch.hub.load('snakers4/silero-vad', 'silero_vad')
13
-
14
- # Constants
15
- TRIGGER_WORD = "hey alexa"
16
- TRIGGER_DURATION = 2 # Duration to record after trigger word is detected, in seconds
17
- SAMPLE_RATE = 16000 # Sample rate for recording
18
-
19
- def start_recording():
20
- print("Recording started...")
21
- audio = sd.rec(int(TRIGGER_DURATION * SAMPLE_RATE), samplerate=SAMPLE_RATE, channels=1, dtype='float32')
22
- sd.wait()
23
- return audio.flatten()
24
-
25
- def detect_trigger(audio):
26
- # Perform keyword spotting
27
- is_triggered = kws_model(audio, sample_rate=SAMPLE_RATE) >= 0.5
28
- return is_triggered
29
-
30
- def transcribe_triggered():
31
- while True:
32
- print("Listening for trigger word...")
33
- # Start recording
34
- recorded_audio = start_recording()
35
-
36
- # Check if trigger word is detected
37
- is_triggered = detect_trigger(recorded_audio)
38
- if is_triggered:
39
- print("Trigger word detected. Transcribing...")
40
- # Perform speech recognition
41
- transcription = model.transcribe([recorded_audio])
42
- return transcription[0]
43
-
44
- iface = gr.Interface(transcribe_triggered, gr.components.Audio(), "text", title="ASR with NeMo Canary Model (Triggered by 'Hey Alexa')")
45
  iface.launch()
 
import gradio as gr
from nemo.collections.asr.models import ASRModel
import librosa

# Load the NeMo ASR model.
# from_pretrained downloads the "nvidia/canary-1b" checkpoint on first run,
# then caches it locally; eval() switches the model to inference mode
# (disables dropout and other training-only behavior).
model = ASRModel.from_pretrained("nvidia/canary-1b")
model.eval()
9
def preprocess_audio(audio):
    """Convert a Gradio audio payload to 16 kHz mono float samples.

    Parameters
    ----------
    audio : tuple[int, numpy.ndarray]
        The ``(sample_rate, samples)`` pair produced by a numpy-type
        ``gr.Audio`` component; ``samples`` is shaped ``(n,)`` for mono or
        ``(n, channels)`` for multi-channel input.

    Returns
    -------
    numpy.ndarray
        1-D float32 array of mono samples at 16 kHz, ready for the Canary
        ASR model.
    """
    target_sr = 16000
    # BUG FIX: the previous version read a non-existent
    # ``gradio.inputs.Audio.DEFAULT_SAMPLE_RATE`` (NameError: the module is
    # imported as ``gr``, and gradio defines no such attribute). The real
    # source rate arrives in the payload itself, so unpack it instead.
    orig_sr, samples = audio

    # Gradio delivers integer PCM (typically int16) by default; librosa
    # expects floats in [-1, 1], so scale integer input accordingly.
    if samples.dtype.kind in "iu":
        samples = samples.astype("float32") / 32768.0
    else:
        samples = samples.astype("float32")

    # librosa.to_mono expects (channels, n), hence the transpose.
    audio_mono = librosa.to_mono(samples.T)

    # Only resample when the source rate actually differs from the target.
    if orig_sr != target_sr:
        audio_mono = librosa.resample(audio_mono, orig_sr=orig_sr, target_sr=target_sr)
    return audio_mono
14
+
15
def transcribe(audio):
    """Transcribe a Gradio audio input with the NeMo Canary model.

    Parameters
    ----------
    audio : tuple[int, numpy.ndarray] | None
        The ``(sample_rate, samples)`` pair from the Audio component, or
        ``None`` when the user submitted without recording/uploading.

    Returns
    -------
    str
        The transcription of the audio clip.

    Raises
    ------
    gr.Error
        If no audio was provided.
    """
    if audio is None:
        # BUG FIX: gradio has no ``gr.InterfaceError``; ``gr.Error`` is the
        # documented exception for surfacing a message to the user.
        raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")

    # Downmix/resample to the 16 kHz mono format the model expects.
    audio_input = preprocess_audio(audio)

    # Perform speech recognition; transcribe() takes a batch, so wrap the
    # single clip in a list and unwrap the single result.
    transcription = model.transcribe([audio_input])
    return transcription[0]
26
+
27
# Build and launch the web UI.
# BUG FIX: ``gr.inputs.Audio`` is the pre-3.x namespace that modern gradio
# removed; the current component is ``gr.Audio``. ``type="numpy"`` makes the
# component pass ``(sample_rate, samples)`` to the callback, which is the
# payload ``transcribe`` expects.
audio_input = gr.Audio(type="numpy")

iface = gr.Interface(
    fn=transcribe,
    inputs=audio_input,
    outputs="text",
    title="ASR with NeMo Canary Model",
)
iface.launch()