import os

import gradio as gr
import wget
from moviepy.editor import VideoFileClip
from pydub import AudioSegment
from nemo.collections.asr.models import ASRModel

MODEL_URL = "https://huggingface.co/Mohammadp/Persian-ASR/resolve/main/conformer_transducer_persian.nemo"
MODEL_PATH = "conformer_transducer_persian.nemo"

# Download the model if it doesn't exist
if not os.path.exists(MODEL_PATH):
    print("Downloading model...")
    wget.download(MODEL_URL, MODEL_PATH)
    print("\nModel downloaded successfully.")

# Load the model. ASRModel.restore_from resolves the concrete model class
# (CTC or transducer) from the .nemo checkpoint itself, so no manual
# adjustment is needed for this transducer checkpoint.
model = ASRModel.restore_from(MODEL_PATH)
print("Model loaded successfully!")

# Constants
SAMPLE_RATE = 16000
MAX_CHUNK_LENGTH_MS = 10 * 1000  # 10 seconds per chunk


# Helper functions
def extract_audio_from_video(video_path):
    """Extracts audio from a video file and saves it as a WAV file."""
    video = VideoFileClip(video_path)
    if video.audio is None:
        raise gr.Error("The uploaded video has no audio track.")
    audio_path = "extracted_audio.wav"
    video.audio.write_audiofile(audio_path)
    return audio_path


def resample_audio(audio_path, target_sample_rate=SAMPLE_RATE):
    """Resamples an audio file to 16 kHz mono, as expected by the model."""
    audio = AudioSegment.from_file(audio_path)
    audio = audio.set_frame_rate(target_sample_rate).set_channels(1)
    resampled_path = "resampled_audio.wav"
    audio.export(resampled_path, format="wav")
    return resampled_path


def split_audio(audio_path, max_length_ms=MAX_CHUNK_LENGTH_MS):
    """Splits audio into chunks of at most max_length_ms each."""
    audio = AudioSegment.from_file(audio_path)
    chunks = []
    for i in range(0, len(audio), max_length_ms):
        chunk = audio[i:i + max_length_ms]
        chunk_path = f"chunk_{i // max_length_ms}.wav"
        chunk.export(chunk_path, format="wav")
        chunks.append(chunk_path)
    return chunks


def transcribe_audio(audio_path):
    """Transcribes a single audio file using the NeMo model."""
    result = model.transcribe([audio_path])
    # Some transducer decoders return a (best, all) tuple of hypothesis lists.
    if isinstance(result, tuple):
        result = result[0]
    text = result[0]
    # Newer NeMo versions may return Hypothesis objects instead of plain strings.
    return text.text if hasattr(text, "text") else str(text)


def process_audio(audio_path):
    """Processes an audio file: resamples, splits, and transcribes."""
    resampled_path = resample_audio(audio_path)
    chunks = split_audio(resampled_path)
    transcriptions = [transcribe_audio(chunk) for chunk in chunks]
    return " ".join(transcriptions)


def process_video(video_path):
    """Extracts and processes audio from a video file."""
    audio_path = extract_audio_from_video(video_path)
    return process_audio(audio_path)


def process_microphone(audio_path):
    """Processes live-recorded microphone audio."""
    return process_audio(audio_path)


# Gradio interface
def process_input(video=None, audio=None, microphone=None):
    if video is not None:
        return f"Transcription: {process_video(video)}"
    elif audio is not None:
        return f"Transcription: {process_audio(audio)}"
    elif microphone is not None:
        return f"Transcription: {process_microphone(microphone)}"
    else:
        return "No input provided."


# ** WAV FILE EXAMPLES ONLY **
example_wav_files = [
    "FEmEC4QBSwA_285%20(4).wav",
]

iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Video(label="Upload Video"),
        gr.Audio(label="Upload Audio File", type="filepath"),
        gr.Microphone(label="Record from Microphone", type="filepath"),
    ],
    outputs="text",
    title="NeMo ASR Transcription Interface",
    description=(
        "Upload a video, an audio file, or record from the microphone to "
        "transcribe the audio using a trained NeMo model."
    ),
    examples=[[None, wav, None] for wav in example_wav_files],  # WAV examples only
)

iface.launch()
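
# Usage notes (assumed environment; package names inferred from the imports above):
#   pip install gradio moviepy pydub wget "nemo_toolkit[asr]"
#   ffmpeg must be available on PATH for moviepy and pydub to read/write audio.
#   Run this script directly; the Gradio UI will be served on a local URL.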