Mohssinibra committed
Commit e7d0ead · verified · 1 Parent(s): 6cc8631
Files changed (1)
  1. app.py +39 -20
app.py CHANGED
@@ -1,25 +1,44 @@
+import gradio as gr
 import librosa
 import torch
-from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor, TrainingArguments, Wav2Vec2FeatureExtractor, Trainer
+from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor
 
+# Load tokenizer, processor, and model
 tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
 processor = Wav2Vec2Processor.from_pretrained('boumehdi/wav2vec2-large-xlsr-moroccan-darija', tokenizer=tokenizer)
-model=Wav2Vec2ForCTC.from_pretrained('boumehdi/wav2vec2-large-xlsr-moroccan-darija')
-
-
-# load the audio data (use your own wav file here!)
-input_audio, sr = librosa.load('file.wav', sr=16000)
-
-# tokenize
-input_values = processor(input_audio, return_tensors="pt", padding=True).input_values
-
-# retrieve logits
-logits = model(input_values).logits
-
-tokens = torch.argmax(logits, axis=-1)
-
-# decode using n-gram
-transcription = tokenizer.batch_decode(tokens)
-
-# print the output
-print(transcription)
+model = Wav2Vec2ForCTC.from_pretrained('boumehdi/wav2vec2-large-xlsr-moroccan-darija')
+
+# Define the function for transcribing audio
+def transcribe(audio):
+    # Load the audio data from the Gradio input (audio is in the format of a NumPy array)
+    input_audio = audio
+    sr = 16000  # Ensure the sample rate is 16000 Hz, which is expected by the model
+
+    # Tokenize the audio
+    input_values = processor(input_audio, return_tensors="pt", padding=True).input_values
+
+    # Get the model's logits
+    logits = model(input_values).logits
+
+    # Find the predicted tokens
+    tokens = torch.argmax(logits, axis=-1)
+
+    # Decode the tokens to text
+    transcription = tokenizer.batch_decode(tokens)
+
+    return transcription[0]
+
+# Create the Gradio interface
+interface = gr.Interface(
+    fn=transcribe,  # Function to be called when an audio file is uploaded or recorded
+    inputs=[
+        gr.Audio(source="upload", type="numpy"),  # Allow user to upload an audio file
+        gr.Audio(source="microphone", type="numpy")  # Allow user to record audio from the browser
+    ],
+    outputs="text",  # Output will be a transcription text
+    title="Moroccan Darija Speech-to-Text",  # Interface title
+    description="Upload an audio file or record audio directly from your microphone to transcribe it into Moroccan Darija."
+)
+
+# Launch the interface
+interface.launch()
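
Two bugs in the committed app.py are worth flagging. First, `inputs` is given two separate `gr.Audio` components, so Gradio will call `transcribe` with two arguments while the function accepts only one. Second, with `type="numpy"` Gradio passes a `(sample_rate, data)` tuple rather than a bare array, and the line `sr = 16000` only relabels the rate; nothing is actually resampled, so microphone audio captured at, say, 48 kHz reaches the model at the wrong rate. The sketch below is one possible fix, not the committed code: it assumes Gradio 4.x, where a single `gr.Audio` accepts `sources=["upload", "microphone"]` (Gradio 3.x used a singular `source`, one value per component).

```python
import gradio as gr
import librosa
import numpy as np
import torch
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor

tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
processor = Wav2Vec2Processor.from_pretrained('boumehdi/wav2vec2-large-xlsr-moroccan-darija', tokenizer=tokenizer)
model = Wav2Vec2ForCTC.from_pretrained('boumehdi/wav2vec2-large-xlsr-moroccan-darija')
model.eval()  # inference only

TARGET_SR = 16000  # sample rate the XLSR model expects


def transcribe(audio):
    if audio is None:  # nothing uploaded or recorded
        return ""
    # With type="numpy", Gradio delivers a (sample_rate, data) tuple
    sr, data = audio
    data = data.astype(np.float32)
    if data.ndim > 1:             # stereo -> mono
        data = data.mean(axis=1)
    if np.abs(data).max() > 1.0:  # int16 recordings arrive unscaled
        data /= 32768.0
    if sr != TARGET_SR:           # actually resample instead of relabeling
        data = librosa.resample(data, orig_sr=sr, target_sr=TARGET_SR)
    input_values = processor(data, sampling_rate=TARGET_SR,
                             return_tensors="pt", padding=True).input_values
    with torch.no_grad():
        logits = model(input_values).logits
    tokens = torch.argmax(logits, dim=-1)
    return tokenizer.batch_decode(tokens)[0]


interface = gr.Interface(
    fn=transcribe,
    # One component offering both upload and microphone, so transcribe()
    # receives exactly one argument (Gradio 4.x `sources` API)
    inputs=gr.Audio(sources=["upload", "microphone"], type="numpy"),
    outputs="text",
    title="Moroccan Darija Speech-to-Text",
    description="Upload an audio file or record audio directly from your microphone to transcribe it into Moroccan Darija.",
)

interface.launch()
```

Keeping a single input component preserves the one-argument signature of `transcribe`, and doing the mono/float/resample normalization inside the function means uploads and recordings at any rate are handled alike; for a quick check without the browser, `transcribe((sr, data))` can be called directly on audio loaded with `librosa.load('file.wav', sr=None)`.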