import gradio as gr
from transformers import SpeechEncoderDecoderModel, Speech2Text2Processor
import torch
import numpy as np
import librosa

# Load the pre-trained model and processor. This checkpoint translates spoken
# English into Arabic text; it pairs a wav2vec 2.0 encoder with an
# autoregressive text decoder, so it must be loaded as a SpeechEncoderDecoderModel
# (not Wav2Vec2ForCTC) together with a Speech2Text2Processor.
model_name = "facebook/s2t-wav2vec2-large-en-ar"
model = SpeechEncoderDecoderModel.from_pretrained(model_name)
processor = Speech2Text2Processor.from_pretrained(model_name)
model.eval()

# Transcribe a recording from the Gradio audio widget: the model translates
# spoken English into Arabic text.
def transcribe(audio):
    # Nothing recorded yet
    if audio is None:
        return ""

    # Gradio's numpy audio format is (sample_rate, data), in that order
    sample_rate, audio_data = audio

    # Convert stereo to mono by averaging the channels
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)

    # Gradio delivers 16-bit integer PCM; librosa and the processor expect
    # float32 samples in [-1, 1]
    if np.issubdtype(audio_data.dtype, np.integer):
        audio_data = audio_data.astype(np.float32) / 32768.0
    else:
        audio_data = audio_data.astype(np.float32)

    # The model was trained on 16 kHz audio, so resample if necessary
    if sample_rate != 16000:
        audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)

    # Extract input features in the model's expected format
    inputs = processor(audio_data, return_tensors="pt", sampling_rate=16000)

    # Autoregressively generate the output token ids (an encoder-decoder model
    # is decoded with generate(), not with a CTC argmax over logits)
    with torch.no_grad():
        generated_ids = model.generate(
            inputs=inputs.input_values,
            attention_mask=inputs.get("attention_mask"),
        )

    # Decode the generated ids into text
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return transcription
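
# A minimal offline sanity check (illustrative; the 440 Hz tone below is an
# assumption, not part of the original app). Gradio hands transcribe() a
# (sample_rate, data) tuple, so a one-second sine at 16 kHz exercises the
# whole pipeline without launching the UI:
#
#   tone = np.sin(2 * np.pi * 440 * np.arange(16000) / 16000).astype(np.float32)
#   print(transcribe((16000, tone)))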

# Create the Gradio interface: audio in (microphone or file upload), text out
interface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="numpy"),  # Audio arrives as a (sample_rate, data) numpy tuple
    outputs="text",  # Translated text; pass live=True to gr.Interface to transcribe as you speak
)

# Launch the interface
interface.launch()
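
# launch() serves the app locally by default; if the demo needs to be reachable
# from outside this machine, Gradio can also create a temporary public link via
# interface.launch(share=True) (optional, not used here).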