File size: 2,536 Bytes
aeac508
 
 
4e0c03c
fb9d859
 
4e0c03c
fb9d859
 
4e0c03c
fb9d859
4e0c03c
fb9d859
4e0c03c
 
 
 
aeac508
bb285c0
b9a73f5
aeac508
bb285c0
aeac508
 
 
 
 
 
 
eaab174
d6f3644
 
 
 
 
aeac508
bb285c0
aeac508
 
 
 
 
 
 
 
 
 
 
bb285c0
 
aeac508
 
bb285c0
 
aeac508
bb285c0
4e0c03c
bb285c0
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import warnings
warnings.filterwarnings("ignore")

import librosa          # Library for loading and processing audio files.
import numpy as np      # Library for numerical computations, used for signal processing.
import gradio as gr     # Library for creating a web-based user interface for inference.
from transformers import pipeline    # Import pipeline for automatic speech recognition (ASR).

# Importing custom utility functions for text processing.
from text2int import text_to_int     # Converts text numbers (e.g., "one") into integers (e.g., 1).
from Text2List import text_to_list   # Converts a text string into a list of words.
from convert2list import convert_to_list     # Converts processed text into a structured list.
from processDoubles import process_doubles   # Handles repeated words or numbers in speech recognition output.
from replaceWords import replace_words       # Replaces specific words in the recognized text with alternatives.
from highPassFilter import high_pass_filter  # Reduces noise by passing high-frequency content and attenuating low frequencies.
from waveletDenoise import wavelet_denoise   # Used for signal denoising.
from applyWienerFilter import apply_wiener_filter  # Used for signal denoising.

# Build the speech-recognition pipeline once at import time so every request
# reuses the same loaded model instead of reloading it per call.
asr_model = pipeline(
    task="automatic-speech-recognition",
    model="cdactvm/w2v-bert-punjabi",
)

# Function to handle speech recognition
def recognize_speech(audio_file):
    """Transcribe an audio file and post-process the recognized text.

    The audio is loaded with ``librosa`` at a 16 kHz sampling rate, passed
    through ``high_pass_filter``, ``apply_wiener_filter`` and
    ``wavelet_denoise`` (in that order), and then fed to the module-level
    ``asr_model`` pipeline. ``"[PAD]"`` markers are stripped from the
    recognized text before it goes through ``convert_to_list``,
    ``process_doubles``, ``replace_words`` and ``text_to_int``.

    Args:
        audio_file: Audio source accepted by ``librosa.load`` (the Gradio UI
            in this file supplies a file path).

    Returns:
        Whatever ``text_to_int`` returns for the post-processed transcript.
    """
    # Load at the fixed 16 kHz rate used throughout this pipeline.
    waveform, sample_rate = librosa.load(audio_file, sr=16000)

    # Noise-reduction chain: high-pass -> Wiener -> wavelet.
    cleaned_waveform = wavelet_denoise(
        apply_wiener_filter(high_pass_filter(waveform, sample_rate))
    )

    # Run the model and drop padding tokens that leak into the decoded text.
    transcript = asr_model(cleaned_waveform)['text'].replace("[PAD]", "")

    # Text normalisation chain; each step consumes the previous step's output.
    tokens = convert_to_list(transcript, text_to_list())
    return text_to_int(replace_words(process_doubles(tokens)))


def sel_lng(lng, mic=None, file=None):
    """Route an audio input to the recognizer selected by ``lng``.

    Args:
        lng: Model identifier chosen in the UI. Only ``"model_1"`` is handled.
        mic: Microphone recording, or ``None``. Takes precedence over
            ``file`` when both are given.
        file: Uploaded audio file, or ``None``.

    Returns:
        The result of ``recognize_speech`` for a supported model; otherwise a
        human-readable message explaining that no audio was supplied or that
        the selected model is not supported.
    """
    # Prefer the mic recording and fall back to the uploaded file.
    audio = mic if mic is not None else file
    if audio is None:
        return "You must either provide a mic recording or a file"

    if lng == "model_1":
        return recognize_speech(audio)

    # Any other selection (including None when nothing is picked) used to fall
    # through and return None, leaving the output box blank with no hint why.
    return f"Unsupported model selection: {lng!r}"
        
# Assemble the Gradio UI: a model selector and a single audio input (microphone
# or upload, delivered as a file path) wired to sel_lng. Only two inputs are
# declared, so sel_lng's third parameter keeps its default of None.
model_selector = gr.Dropdown(["model_1"], label="Select Model")
audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")

demo = gr.Interface(
    title="Automatic Speech Recognition",
    fn=sel_lng,
    inputs=[model_selector, audio_input],
    outputs=["textbox"],
)

# Start serving the interface.
demo.launch()