# ASR / app.py: Gradio speech-to-text demo
import gradio as gr
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import numpy as np
import librosa
# Load a pre-trained CTC model and its processor.
# Note: the checkpoint originally referenced here, "facebook/s2t-wav2vec2-large-en-ar",
# is a speech-translation encoder-decoder model and cannot be loaded with
# Wav2Vec2ForCTC; "facebook/wav2vec2-base-960h" is a compatible English ASR checkpoint.
model_name = "facebook/wav2vec2-base-960h"
model = Wav2Vec2ForCTC.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)
model.eval()  # inference only; disables dropout
# Transcribe a recording using the model
def transcribe(audio):
    # Gradio's numpy audio input arrives as a (sample_rate, data) tuple
    sample_rate, audio_data = audio
    # Downmix stereo to mono
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)
    # Gradio delivers int16 PCM; convert to float32 in [-1.0, 1.0]
    if audio_data.dtype != np.float32:
        audio_data = audio_data.astype(np.float32) / 32768.0
    # Resample to the 16 kHz rate the model expects
    if sample_rate != 16000:
        audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
    # Convert the waveform to the model's input format
    inputs = processor(audio_data, return_tensors="pt", sampling_rate=16000)
    # Run the model without tracking gradients
    with torch.no_grad():
        logits = model(input_values=inputs.input_values).logits
    # Greedy CTC decoding: take the most likely token at each frame,
    # then collapse repeats and blanks into text
    predicted_ids = logits.argmax(dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription
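
# Quick local check without the UI (a minimal sketch; the soundfile dependency
# and the "sample.wav" file are illustrative assumptions, not part of this app):
#
#   import soundfile as sf
#   data, sr = sf.read("sample.wav", dtype="float32")
#   print(transcribe((sr, data)))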
# Create the Gradio interface
interface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="numpy"),  # deliver the recording as a (sample_rate, data) tuple
    outputs="text"  # display the transcription as plain text
)
# Launch the interface
interface.launch()
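# launch() also accepts options such as share=True (temporary public link)
# when running locally; the defaults are fine for this demo.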