Mohssinibra committed on
Commit 980dcf2 · verified · 1 Parent(s): 0a85948
Files changed (1)
  1. app.py +28 -39
app.py CHANGED
@@ -1,44 +1,33 @@
 import gradio as gr
 import librosa
 import torch
-from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor
-
-# Load tokenizer, processor, and model
-tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
-processor = Wav2Vec2Processor.from_pretrained('boumehdi/wav2vec2-large-xlsr-moroccan-darija', tokenizer=tokenizer)
-model = Wav2Vec2ForCTC.from_pretrained('boumehdi/wav2vec2-large-xlsr-moroccan-darija')
-
-# Define the function for transcribing audio
-def transcribe(audio):
-    # Load the audio data from the Gradio input (audio is a NumPy array)
-    input_audio = audio
-    sr = 16000  # Ensure the sample rate is 16000 Hz, which the model expects
-
-    # Tokenize the audio
-    input_values = processor(input_audio, return_tensors="pt", padding=True).input_values
-
-    # Get the model's logits
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+
+# Load pre-trained model and processor directly from the Hugging Face Hub
+model = Wav2Vec2ForCTC.from_pretrained("boumehdi/wav2vec2-large-xlsr-moroccan-darija")
+processor = Wav2Vec2Processor.from_pretrained("boumehdi/wav2vec2-large-xlsr-moroccan-darija")
+
+def transcribe_audio(audio):
+    # Load the audio file from the Gradio interface
+    audio_array, sr = librosa.load(audio, sr=16000)
+
+    # Tokenize the audio file
+    input_values = processor(audio_array, return_tensors="pt", padding=True).input_values
+
+    # Get the model's logits (predicted token scores)
     logits = model(input_values).logits
-
-    # Find the predicted tokens
+
+    # Get the predicted tokens
    tokens = torch.argmax(logits, axis=-1)
-
-    # Decode the tokens to text
-    transcription = tokenizer.batch_decode(tokens)
-
-    return transcription[0]
-
-# Create the Gradio interface
-interface = gr.Interface(
-    fn=transcribe,  # Function to be called when an audio file is uploaded or recorded
-    inputs=[
-        gr.Audio(source="upload", type="numpy"),  # Allow user to upload an audio file
-        gr.Audio(source="microphone", type="numpy")  # Allow user to record audio from the browser
-    ],
-    outputs="text",  # Output will be the transcription text
-    title="Moroccan Darija Speech-to-Text",  # Interface title
-    description="Upload an audio file or record audio directly from your microphone to transcribe it into Moroccan Darija."
-)
-
-# Launch the interface
-interface.launch()
+
+    # Decode the tokens into text
+    transcription = processor.decode(tokens[0])
+
+    return transcription
+
+# Create a Gradio interface for uploading audio
+demo = gr.Interface(fn=transcribe_audio,
+                    inputs=gr.Audio(source="upload", type="filepath"),
+                    outputs="text")
+
+demo.launch()
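
For quick local verification of the new pipeline, a minimal offline sketch; the file name sample.wav, the torch.no_grad() context, and the explicit sampling_rate argument are illustrative additions, not part of the commit:

import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

model = Wav2Vec2ForCTC.from_pretrained("boumehdi/wav2vec2-large-xlsr-moroccan-darija")
processor = Wav2Vec2Processor.from_pretrained("boumehdi/wav2vec2-large-xlsr-moroccan-darija")

# librosa.load resamples the input to the 16 kHz rate the model expects
audio_array, sr = librosa.load("sample.wav", sr=16000)  # hypothetical test file
input_values = processor(audio_array, sampling_rate=16000, return_tensors="pt", padding=True).input_values

with torch.no_grad():  # gradients are unnecessary at inference time
    logits = model(input_values).logits

tokens = torch.argmax(logits, dim=-1)  # greedy decoding: most likely token per frame
print(processor.decode(tokens[0]))

One compatibility note: gr.Audio(source="upload", type="filepath") matches the Gradio 3.x API; if the Space is later pinned to Gradio 4.x, the parameter becomes sources=["upload"].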