import gradio as gr
import os
from moviepy.editor import VideoFileClip
from pydub import AudioSegment
import torch
from nemo.collections.asr.models import ASRModel
import wget

MODEL_URL = "https://huggingface.co/Mohammadp/Persian-ASR/resolve/main/conformer_transducer_persian.nemo"
MODEL_PATH = "conformer_transducer_persian.nemo"

# Download the checkpoint on the first run; later runs reuse the local copy.
if not os.path.exists(MODEL_PATH):
    print("Downloading model...")
    wget.download(MODEL_URL, MODEL_PATH)
    print("\nModel downloaded successfully.")

# The checkpoint name indicates a Conformer-Transducer model, so it is restored
# through the generic ASRModel class, which re-instantiates the concrete model
# class recorded inside the .nemo file (the CTC-specific EncDecCTCModelBPE
# would not match a transducer checkpoint).
model = ASRModel.restore_from(MODEL_PATH)
print("Model loaded successfully!")

SAMPLE_RATE = 16000  # NeMo Conformer models expect 16 kHz mono input
MAX_CHUNK_LENGTH_MS = 10 * 1000  # transcribe in 10-second chunks to bound memory use


def extract_audio_from_video(video_path):
    """Extracts the audio track from a video file and saves it as a WAV file."""
    video = VideoFileClip(video_path)
    audio_path = "extracted_audio.wav"
    video.audio.write_audiofile(audio_path)
    return audio_path
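
# Note: write_audiofile() also accepts an fps argument, so fps=SAMPLE_RATE
# could resample during extraction; the separate resample_audio() step below
# keeps the flow uniform for every input type.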


def resample_audio(audio_path, target_sample_rate=SAMPLE_RATE):
    """Resamples an audio file to 16 kHz mono WAV."""
    audio = AudioSegment.from_file(audio_path)
    # Downmix to mono as well: video and microphone recordings are often
    # stereo, while the ASR model expects a single channel.
    audio = audio.set_frame_rate(target_sample_rate).set_channels(1)
    resampled_path = "resampled_audio.wav"
    audio.export(resampled_path, format="wav")
    return resampled_path
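
# Caveat: the fixed temp filenames ("resampled_audio.wav", "chunk_*.wav") are
# overwritten on every request, so concurrent Gradio users would clobber each
# other's files; the tempfile module would be the safer choice in a multi-user
# deployment.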


def split_audio(audio_path, max_length_ms=MAX_CHUNK_LENGTH_MS):
    """Splits audio into consecutive chunks of at most max_length_ms each."""
    audio = AudioSegment.from_file(audio_path)
    chunks = []
    for i in range(0, len(audio), max_length_ms):
        chunk = audio[i:i + max_length_ms]  # pydub slices by milliseconds
        chunk_path = f"chunk_{i // max_length_ms}.wav"
        chunk.export(chunk_path, format="wav")
        chunks.append(chunk_path)
    return chunks
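
# Fixed-length splitting can cut a word in half at chunk boundaries;
# pydub.silence.split_on_silence() is an alternative that breaks on pauses
# instead, at the cost of less predictable chunk lengths.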


def transcribe_audio(audio_path):
    """Transcribes a single audio file and returns the text."""
    # transcribe() returns one result per input file; depending on the NeMo
    # version, the entry may be a Hypothesis object rather than a plain string.
    result = model.transcribe([audio_path])[0]
    return result if isinstance(result, str) else result.text
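
# Passing all chunk paths in a single model.transcribe(chunk_paths) call would
# let NeMo batch them and typically runs faster than one call per chunk.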


def process_audio(audio_path):
    """Processes an audio file: resamples, splits into chunks, and transcribes."""
    resampled_path = resample_audio(audio_path)
    chunks = split_audio(resampled_path)
    transcriptions = [transcribe_audio(chunk) for chunk in chunks]
    return " ".join(transcriptions)


def process_video(video_path):
    """Extracts and processes the audio track of a video file."""
    audio_path = extract_audio_from_video(video_path)
    return process_audio(audio_path)


def process_microphone(audio_path):
    """Processes live-recorded microphone audio."""
    return process_audio(audio_path)


def process_input(video=None, audio=None, microphone=None):
    """Transcribes whichever of the three inputs was provided."""
    if video is not None:
        return f"Transcription: {process_video(video)}"
    elif audio is not None:
        return f"Transcription: {process_audio(audio)}"
    elif microphone is not None:
        return f"Transcription: {process_microphone(microphone)}"
    else:
        return "No input provided."


# Example file shown in the Gradio UI; the path must exist in the app's
# working directory.
example_wav_files = [
    "FEmEC4QBSwA_285%20(4).wav",
]

iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Video(label="Upload Video"),
        gr.Audio(label="Upload Audio File", type="filepath"),
        gr.Microphone(label="Record from Microphone", type="filepath"),
    ],
    outputs="text",
    title="NeMo ASR Transcription Interface",
    description=(
        "Upload a video, an audio file, or record from the microphone to "
        "transcribe Persian speech with a trained NeMo model."
    ),
    examples=[[None, wav, None] for wav in example_wav_files],
)

iface.launch()
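
# launch() blocks and serves the UI (by default at http://127.0.0.1:7860);
# share=True creates a temporary public link when running locally rather than
# on a hosted platform such as Hugging Face Spaces.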