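"""Gradio app for Persian speech-to-text with a NeMo ASR model.

Accepts an uploaded video, an uploaded audio file, or microphone input;
extracts the audio, resamples it to 16 kHz mono, splits it into short
chunks, and transcribes each chunk with the downloaded .nemo checkpoint.
"""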
import gradio as gr
import os
from moviepy.editor import VideoFileClip
from pydub import AudioSegment
import torch
from nemo.collections.asr.models import ASRModel  # base class; restore_from picks the model type saved in the checkpoint
import wget
MODEL_URL = "https://huggingface.co/Mohammadp/Persian-ASR/resolve/main/conformer_transducer_persian.nemo"
MODEL_PATH = "conformer_transducer_persian.nemo"
# Download the model checkpoint if it isn't present locally
if not os.path.exists(MODEL_PATH):
    print("Downloading model...")
    wget.download(MODEL_URL, MODEL_PATH)
    print("\nModel downloaded successfully.")

# Load the model; ASRModel.restore_from instantiates whichever class was
# saved in the checkpoint (the filename suggests a Conformer-Transducer).
model = ASRModel.restore_from(MODEL_PATH)
model.eval()  # inference mode
if torch.cuda.is_available():
    model = model.cuda()
print("Model loaded successfully!")
# Constants
SAMPLE_RATE = 16000
MAX_CHUNK_LENGTH_MS = 10 * 1000 # 10 seconds per chunk
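# Chunking keeps each inference call's memory bounded; very long inputs can
# exhaust GPU memory with attention-based encoders. The 10 s value is a
# tunable assumption, not a model requirement.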
# Helper functions
def extract_audio_from_video(video_path):
    """Extracts audio from a video file and saves it as a WAV file."""
    video = VideoFileClip(video_path)
    audio_path = "extracted_audio.wav"
    video.audio.write_audiofile(audio_path)
    video.close()  # release the file handle
    return audio_path
def resample_audio(audio_path, target_sample_rate=SAMPLE_RATE):
    """Resamples an audio file to 16 kHz mono WAV."""
    audio = AudioSegment.from_file(audio_path)
    # NeMo ASR models expect single-channel 16 kHz input
    audio = audio.set_frame_rate(target_sample_rate).set_channels(1)
    resampled_path = "resampled_audio.wav"
    audio.export(resampled_path, format="wav")
    return resampled_path
def split_audio(audio_path, max_length_ms=MAX_CHUNK_LENGTH_MS):
    """Splits audio into chunks of at most max_length_ms each."""
    audio = AudioSegment.from_file(audio_path)
    chunks = []
    for i in range(0, len(audio), max_length_ms):
        chunk = audio[i:i + max_length_ms]
        chunk_path = f"chunk_{i // max_length_ms}.wav"
        chunk.export(chunk_path, format="wav")
        chunks.append(chunk_path)
    return chunks
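# Note: fixed-length chunking can split words at chunk boundaries; a
# silence-aware splitter (e.g. pydub.silence.split_on_silence) is a gentler
# alternative if accuracy at the seams matters.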
def transcribe_audio(audio_path):
    """Transcribes a single audio file using the NeMo model."""
    result = model.transcribe([audio_path])
    hyp = (result[0] if isinstance(result, tuple) else result)[0]
    # Newer NeMo versions return Hypothesis objects rather than plain strings
    return hyp.text if hasattr(hyp, "text") else hyp
def process_audio(audio_path):
    """Processes an audio file: resamples, splits, and transcribes."""
    resampled_path = resample_audio(audio_path)
    chunks = split_audio(resampled_path)
    transcriptions = [transcribe_audio(chunk) for chunk in chunks]
    for chunk in chunks:  # remove temporary chunk files
        os.remove(chunk)
    return " ".join(transcriptions)
def process_video(video_path):
    """Extracts and processes audio from a video file."""
    audio_path = extract_audio_from_video(video_path)
    return process_audio(audio_path)

def process_microphone(audio_path):
    """Processes live-recorded microphone audio."""
    return process_audio(audio_path)
# Gradio interface
def process_input(video=None, audio=None, microphone=None):
    if video is not None:
        return f"Transcription: {process_video(video)}"
    elif audio is not None:
        return f"Transcription: {process_audio(audio)}"
    elif microphone is not None:
        return f"Transcription: {process_microphone(microphone)}"
    else:
        return "No input provided."
# ** WAV FILE EXAMPLES ONLY **
example_wav_files = [
    "FEmEC4QBSwA_285%20(4).wav",
]
iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Video(label="Upload Video"),
        gr.Audio(label="Upload Audio File", type="filepath"),
        gr.Microphone(label="Record from Microphone", type="filepath"),
    ],
    outputs="text",
    title="NeMo ASR Transcription Interface",
    description="Upload a video, an audio file, or record from the microphone to transcribe the audio using a trained NeMo model.",
    examples=[[None, wav, None] for wav in example_wav_files],  # WAV examples only
)
iface.launch()
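# launch() serves the app directly on Hugging Face Spaces; when running
# locally, launch(share=True) also creates a temporary public link.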