import warnings warnings.filterwarnings("ignore") import librosa # Library for loading and processing audio files. import numpy as np # Library for numerical computations, used for signal processing. import gradio as gr # Library for creating a web-based user interface for inference. from transformers import pipeline # Import pipeline for automatic speech recognition (ASR). # Importing custom utility functions for text processing. from text2int import text_to_int # Converts text numbers (e.g., "one") into integers (e.g., 1). from Text2List import text_to_list # Converts a text string into a list of words. from convert2list import convert_to_list # Converts processed text into a structured list. from processDoubles import process_doubles # Handles repeated words or numbers in speech recognition output. from replaceWords import replace_words # Replaces specific words in the recognized text with alternatives. from highPassFilter import high_pass_filter # filter noise by bypassing high frequency signals. from waveletDenoise import wavelet_denoise # used for signal Denoising. from applyWienerFilter import apply_wiener_filter # for Signal Denoising. 
# Initialize the ASR model pipeline once at import time so every request reuses it.
asr_model = pipeline("automatic-speech-recognition", model="cdactvm/w2v-bert-punjabi")

# Sampling rate that every recording is loaded at before denoising and inference.
_TARGET_SAMPLE_RATE = 16000


# Function to handle speech recognition
def recognize_speech(audio_file):
    """Transcribe an audio file and post-process the recognized text.

    The audio is loaded at ``_TARGET_SAMPLE_RATE``, passed through the
    high-pass filter, the Wiener filter and the wavelet denoiser (in that
    order), and then fed to ``asr_model``.  The raw transcript has every
    literal ``"[PAD]"`` removed and is then run through the project's text
    helpers: ``convert_to_list`` -> ``process_doubles`` -> ``replace_words``
    -> ``text_to_int``.

    Args:
        audio_file: Path (or other source accepted by ``librosa.load``) of
            the recording to transcribe.

    Returns:
        Whatever ``text_to_int`` returns for the processed transcript
        (presumably a string with spoken numbers converted to digits --
        confirm against ``text2int``).
    """
    # Only the samples are used by the ASR pipeline; the rate is needed by
    # the high-pass filter.
    samples, sample_rate = librosa.load(audio_file, sr=_TARGET_SAMPLE_RATE)

    # Denoising chain; the order matches the original implementation.
    samples = high_pass_filter(samples, sample_rate)
    samples = apply_wiener_filter(samples)
    denoised_audio = wavelet_denoise(samples)

    transcript = asr_model(denoised_audio)["text"]
    # Drop padding tokens that leak into the decoded output.
    cleaned_text = transcript.replace("[PAD]", "")

    # Text normalization chain built from the project helpers.
    as_list = convert_to_list(cleaned_text, text_to_list())
    without_doubles = process_doubles(as_list)
    with_replacements = replace_words(without_doubles)
    return text_to_int(with_replacements)


# Maps each selectable model name to the function that transcribes with it.
# The dropdown choices below are derived from this table so that the UI and
# the dispatch in sel_lng cannot drift apart.
_RECOGNIZERS = {
    "model_1": recognize_speech,
}


def sel_lng(lng, mic=None, file=None):
    """Gradio callback: transcribe the supplied audio with the chosen model.

    Args:
        lng: Model name selected in the dropdown.
        mic: Path of the recorded/uploaded audio.  The interface defined in
            this file wires a single audio component, so its value always
            arrives through this parameter.
        file: Alternative audio path, used only when ``mic`` is ``None``.
            The interface in this file never supplies it.

    Returns:
        The processed transcript, or a human-readable message when no audio
        was provided or the selected model is not supported.
    """
    if mic is not None:
        audio = mic
    elif file is not None:
        audio = file
    else:
        return "You must either provide a mic recording or a file"

    recognizer = _RECOGNIZERS.get(lng)
    if recognizer is None:
        # Previously this case fell through and returned None, leaving the
        # output textbox blank with no hint about what went wrong (e.g. when
        # nothing is selected in the dropdown).
        supported = ", ".join(_RECOGNIZERS)
        return f"Unsupported model selection: {lng!r}. Please choose one of: {supported}"
    return recognizer(audio)


# Create a Gradio interface
demo = gr.Interface(
    fn=sel_lng,
    inputs=[
        gr.Dropdown(list(_RECOGNIZERS), label="Select Model"),
        gr.Audio(sources=["microphone", "upload"], type="filepath"),
    ],
    outputs=["textbox"],
    title="Automatic Speech Recognition",
)

demo.launch()