Spaces:

NightPrince
/

ASR

Sleeping

NightPrince committed on Jan 8

Commit

2726627

verified ·

1 Parent(s): c137890

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -2,23 +2,28 @@ import gradio as gr
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import torch
 import numpy as np
 # Load the pre-trained model and processor
 model_name = "facebook/s2t-wav2vec2-large-en-ar"
 model = Wav2Vec2ForCTC.from_pretrained(model_name)
 processor = Wav2Vec2Processor.from_pretrained(model_name)
-# Define a function for the ASR model
 def transcribe(audio):
-    # Convert the audio into a format compatible with the processor
-    if isinstance(audio, np.ndarray):
-        audio = audio.flatten()  # Ensure it's a 1D array
-    # Process the audio
     inputs = processor(audio, return_tensors="pt", sampling_rate=16000)
     # Get the model's predictions
-    logits = model(input_values=inputs.input_values).logits
     # Decode the predicted text
     predicted_ids = logits.argmax(dim=-1)
@@ -26,8 +31,13 @@ def transcribe(audio):
     return transcription
-# Define the Gradio interface
-interface = gr.Interface(fn=transcribe, inputs=gr.Audio(type="numpy"), outputs="text")
-# Launch the Gradio interface
 interface.launch()

 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import torch
 import numpy as np
+import librosa
 # Load the pre-trained model and processor
 model_name = "facebook/s2t-wav2vec2-large-en-ar"
 model = Wav2Vec2ForCTC.from_pretrained(model_name)
 processor = Wav2Vec2Processor.from_pretrained(model_name)
+# Function to transcribe audio using the model
 def transcribe(audio):
+    # Resample the audio to 16kHz if necessary
+    if audio.ndim > 1:  # If audio is stereo
+        audio = audio.mean(axis=1)  # Convert to mono
+    # Resample audio to 16kHz if it's not already
+    audio = librosa.resample(audio, orig_sr=audio.shape[0] / len(audio), target_sr=16000)
+    # Process the audio to match the model's input format
     inputs = processor(audio, return_tensors="pt", sampling_rate=16000)
     # Get the model's predictions
+    with torch.no_grad():
+        logits = model(input_values=inputs.input_values).logits
     # Decode the predicted text
     predicted_ids = logits.argmax(dim=-1)
     return transcription
+# Create the Gradio interface
+interface = gr.Interface(
+    fn=transcribe,
+    inputs=gr.Audio(type="numpy"),  # Take the audio input as numpy array
+    outputs="text",  # Output transcribed text
+    live=True  # Optional: live transcribing as you speak
+)
+# Launch the interface
 interface.launch()