File size: 2,621 Bytes
cd1b576
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import gradio as gr
import librosa
import numpy as np
import pywt
import nbimporter
from scipy.signal import butter, lfilter, wiener
from scipy.io.wavfile import write
from transformers import pipeline
from text2int import text_to_int
from isNumber import is_number
from Text2List import text_to_list
from convert2list import convert_to_list
from processDoubles import process_doubles
from replaceWords import replace_words

# Build the speech-recognition pipeline once at module import so that every
# call to recognize_speech reuses the same model object.
# NOTE(review): the model id names a Tamil checkpoint; presumably it is fetched
# from the Hugging Face Hub on first run — confirm for offline deployments.
asr_model = pipeline("automatic-speech-recognition", model="cdactvm/w2v-bert-tamil_new")

# Function to apply a high-pass filter
def high_pass_filter(audio, sr, cutoff=300):
    """Attenuate content below ``cutoff`` Hz with a first-order Butterworth filter.

    Args:
        audio: Samples to filter (passed straight to ``scipy.signal.lfilter``).
        sr: Sampling rate of ``audio`` in Hz.
        cutoff: High-pass corner frequency in Hz.

    Returns:
        The filtered samples as returned by ``lfilter``.
    """
    # butter() expects the corner frequency as a fraction of Nyquist (sr / 2).
    wn = cutoff / (0.5 * sr)
    numerator, denominator = butter(1, wn, btype='high', analog=False)
    return lfilter(numerator, denominator, audio)

# Function to apply wavelet denoising
def wavelet_denoise(audio, wavelet='db1', level=1):
    """Denoise ``audio`` by soft-thresholding its wavelet detail coefficients.

    The signal is decomposed with ``pywt.wavedec`` (periodization mode), a
    single threshold is derived from one coefficient band and the signal
    length, every band except the first is soft-thresholded, and the signal is
    rebuilt with ``pywt.waverec``.

    Args:
        audio: 1-D sequence of samples.
        wavelet: Wavelet name handed to pywt.
        level: Index (counted from the end of the coefficient list) of the
            band used for the noise estimate.

    Returns:
        The reconstructed signal from ``pywt.waverec``.

    NOTE(review): ``level`` is not passed to ``wavedec``, so the decomposition
    depth is whatever pywt chooses by default — confirm this is intended.
    NOTE(review): the noise estimate divides the median absolute coefficient
    by 0.5; the commonly used MAD constant is 0.6745 — confirm.
    """
    bands = pywt.wavedec(audio, wavelet, mode='per')

    # Noise scale from the median magnitude of the selected band.
    noise_scale = np.median(np.abs(bands[-level])) / 0.5
    cutoff = noise_scale * np.sqrt(2 * np.log(len(audio)))

    # Keep the first band untouched; shrink every other band.
    shrunk = [bands[0]]
    for detail in bands[1:]:
        shrunk.append(pywt.threshold(detail, value=cutoff, mode='soft'))

    return pywt.waverec(shrunk, wavelet, mode='per')

# Function to apply a Wiener filter for noise reduction
def apply_wiener_filter(audio):
    """Run ``scipy.signal.wiener`` on ``audio`` with its default settings.

    Args:
        audio: Samples to filter.

    Returns:
        The filtered array produced by ``wiener``.
    """
    smoothed = wiener(audio)
    return smoothed

# Function to handle speech recognition
def recognize_speech(audio_file):
    """Transcribe an audio file and post-process the transcript.

    The audio is loaded at 16 kHz, run through the high-pass, Wiener and
    wavelet denoising stages, and transcribed with ``asr_model``. The
    transcript then goes through ``convert_to_list``, ``process_doubles``,
    ``replace_words`` and ``text_to_int``; each intermediate result is printed
    for debugging.

    Args:
        audio_file: Path to the audio file (the Gradio input is configured
            with ``type="filepath"``). ``None`` or an empty string means no
            audio was provided.

    Returns:
        The value returned by ``text_to_int`` for the processed transcript,
        or an empty string when no audio was provided.
    """
    # Gradio passes None when the user submits without recording or uploading
    # anything; bail out early instead of letting librosa.load raise.
    if not audio_file:
        return ""

    # --- Audio clean-up -----------------------------------------------------
    audio, sr = librosa.load(audio_file, sr=16000)
    audio = high_pass_filter(audio, sr)
    audio = apply_wiener_filter(audio)
    denoised_audio = wavelet_denoise(audio)

    # --- Recognition --------------------------------------------------------
    result = asr_model(denoised_audio)
    text_value = result['text']
    # Strip the "<s>" marker that can appear in the decoded text.
    cleaned_text = text_value.replace("<s>", "")
    print(cleaned_text)

    # --- Text post-processing -----------------------------------------------
    converted_to_list = convert_to_list(cleaned_text, text_to_list())
    print(converted_to_list)
    processed_doubles = process_doubles(converted_to_list)
    print(processed_doubles)
    replaced_words = replace_words(processed_doubles)
    print(replaced_words)
    converted_text = text_to_int(replaced_words)
    print(converted_text)
    return converted_text

# Gradio Interface
# Build and launch the web UI. The title and description name Tamil so that
# they match the checkpoint loaded into asr_model ("cdactvm/w2v-bert-tamil_new").
gr.Interface(
    fn=recognize_speech,
    inputs=gr.Audio(sources=["microphone","upload"], type="filepath"),
    outputs="text",
    title="Speech Recognition with Advanced Noise Reduction & Tamil ASR",
    description="Upload an audio file, and the system will use high-pass filtering, Wiener filtering, and wavelet-based denoising, then a Tamil ASR model will transcribe the clean audio."
).launch()


# In[ ]: