Mohssinibra committed on
Commit 980dcf2 · verified · 1 Parent(s): 0a85948
Files changed (1)
  1. app.py +28 -39
app.py CHANGED
@@ -1,44 +1,33 @@
 import gradio as gr
 import librosa
 import torch
-from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor
-
-# Load tokenizer, processor, and model
-tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
-processor = Wav2Vec2Processor.from_pretrained('boumehdi/wav2vec2-large-xlsr-moroccan-darija', tokenizer=tokenizer)
-model = Wav2Vec2ForCTC.from_pretrained('boumehdi/wav2vec2-large-xlsr-moroccan-darija')
-
-# Define the function for transcribing audio
-def transcribe(audio):
-    # Load the audio data from the Gradio input (audio is a NumPy array)
-    input_audio = audio
-    sr = 16000  # Ensure the sample rate is 16000 Hz, which the model expects
-
-    # Tokenize the audio
-    input_values = processor(input_audio, return_tensors="pt", padding=True).input_values
-
-    # Get the model's logits
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+
+# Load pre-trained model and processor directly from the Hugging Face Hub
+model = Wav2Vec2ForCTC.from_pretrained("boumehdi/wav2vec2-large-xlsr-moroccan-darija")
+processor = Wav2Vec2Processor.from_pretrained("boumehdi/wav2vec2-large-xlsr-moroccan-darija")
+
+def transcribe_audio(audio):
+    # Load the audio file from the Gradio interface
+    audio_array, sr = librosa.load(audio, sr=16000)
+
+    # Tokenize the audio file
+    input_values = processor(audio_array, return_tensors="pt", padding=True).input_values
+
+    # Get the model's logits (predicted token scores)
     logits = model(input_values).logits
-
-    # Find the predicted tokens
+
+    # Get the predicted tokens
    tokens = torch.argmax(logits, axis=-1)
-
-    # Decode the tokens to text
-    transcription = tokenizer.batch_decode(tokens)
-
-    return transcription[0]
-
-# Create the Gradio interface
-interface = gr.Interface(
-    fn=transcribe,  # Function to be called when an audio file is uploaded or recorded
-    inputs=[
-        gr.Audio(source="upload", type="numpy"),  # Allow user to upload an audio file
-        gr.Audio(source="microphone", type="numpy")  # Allow user to record audio from the browser
-    ],
-    outputs="text",  # Output will be the transcription text
-    title="Moroccan Darija Speech-to-Text",  # Interface title
-    description="Upload an audio file or record audio directly from your microphone to transcribe it into Moroccan Darija."
-)
-
-# Launch the interface
-interface.launch()
+
+    # Decode the tokens into text
+    transcription = processor.decode(tokens[0])
+
+    return transcription
+
+# Create a Gradio interface for uploading audio
+demo = gr.Interface(fn=transcribe_audio,
+                    inputs=gr.Audio(source="upload", type="filepath"),
+                    outputs="text")
+
+demo.launch()
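
For quick local verification of the new pipeline, a minimal offline sketch; the file name sample.wav, the torch.no_grad() context, and the explicit sampling_rate argument are illustrative additions, not part of the commit:

import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

model = Wav2Vec2ForCTC.from_pretrained("boumehdi/wav2vec2-large-xlsr-moroccan-darija")
processor = Wav2Vec2Processor.from_pretrained("boumehdi/wav2vec2-large-xlsr-moroccan-darija")

# librosa.load resamples the input to the 16 kHz rate the model expects
audio_array, sr = librosa.load("sample.wav", sr=16000)  # hypothetical test file
input_values = processor(audio_array, sampling_rate=16000, return_tensors="pt", padding=True).input_values

with torch.no_grad():  # gradients are unnecessary at inference time
    logits = model(input_values).logits

tokens = torch.argmax(logits, dim=-1)  # greedy decoding: most likely token per frame
print(processor.decode(tokens[0]))

One compatibility note: gr.Audio(source="upload", type="filepath") matches the Gradio 3.x API; if the Space is later pinned to Gradio 4.x, the parameter becomes sources=["upload"].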