cdactvm committed on
Commit
aeac508
·
verified ·
1 Parent(s): 4088cbb

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +201 -0
app.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ warnings.filterwarnings("ignore")
3
+
4
+ import os
5
+ import re
6
+ import pywt
7
+ import librosa
8
+ import webrtcvad
9
+ import nbimporter
10
+ import torchaudio
11
+ import numpy as np
12
+ import gradio as gr
13
+ import scipy.signal
14
+ import soundfile as sf
15
+ from scipy.io.wavfile import write
16
+ from transformers import pipeline
17
+ from transformers import AutoProcessor
18
+ from pyctcdecode import build_ctcdecoder
19
+ from transformers import Wav2Vec2ProcessorWithLM
20
+ from scipy.signal import butter, lfilter, wiener
21
+ from text2int import text_to_int
22
+ from isNumber import is_number
23
+ from Text2List import text_to_list
24
+ from convert2list import convert_to_list
25
+ from processDoubles import process_doubles
26
+ from replaceWords import replace_words
27
+ from applyVad import apply_vad
28
+ from wienerFilter import wiener_filter
29
+ from highPassFilter import high_pass_filter
30
+ from waveletDenoise import wavelet_denoise
31
+ from scipy.signal import butter, lfilter, wiener
32
+
33
# Speech-recognition pipeline, created once when the module is loaded and
# reused by every request.
asr_model = pipeline(
    task="automatic-speech-recognition",
    model="cdactvm/punjabi-wav2vec-bert",
)
34
+
35
+
36
+ # Function to apply a high-pass filter
37
def high_pass_filter(audio, sr, cutoff=300):
    """Apply a first-order Butterworth high-pass filter to ``audio``.

    Args:
        audio: Samples to filter; handed unchanged to ``scipy.signal.lfilter``.
        sr: Sampling rate of ``audio`` in Hz.
        cutoff: Cutoff frequency in Hz (defaults to 300).

    Returns:
        The filtered samples, exactly as produced by ``lfilter``.
    """
    # ``butter`` takes the cutoff as a fraction of the Nyquist frequency.
    half_rate = 0.5 * sr
    numerator, denominator = butter(
        1, cutoff / half_rate, btype='high', analog=False
    )
    return lfilter(numerator, denominator, audio)
43
+
44
+ # Function to apply wavelet denoising
45
def wavelet_denoise(audio, wavelet='db1', level=1):
    """Denoise ``audio`` by soft-thresholding its wavelet detail coefficients.

    The signal is decomposed with ``pywt.wavedec`` in periodization mode, a
    single threshold is derived from the detail band selected by ``level``,
    every detail band is soft-thresholded, and the signal is reconstructed.

    Args:
        audio: 1-D sequence of samples.
        wavelet: Wavelet name passed to PyWavelets (default ``'db1'``).
        level: Which detail band, counted from the finest (``coeffs[-level]``),
            is used to estimate the noise scale. It does not limit the
            decomposition depth; ``wavedec`` picks that itself.

    Returns:
        The denoised signal, with the same number of samples as ``audio``.
    """
    coeffs = pywt.wavedec(audio, wavelet, mode='per')

    # Noise scale from the median absolute value of the chosen detail band.
    # NOTE(review): the usual MAD-based estimator divides by 0.6745; the 0.5
    # divisor is kept unchanged here - confirm it is intentional.
    sigma = np.median(np.abs(coeffs[-level])) / 0.5
    uthresh = sigma * np.sqrt(2 * np.log(len(audio)))

    # Leave the approximation band (index 0) untouched; shrink only details.
    coeffs[1:] = [
        pywt.threshold(band, value=uthresh, mode='soft') for band in coeffs[1:]
    ]
    denoised = pywt.waverec(coeffs, wavelet, mode='per')

    # Periodization reconstructs an even number of samples, so an odd-length
    # input would come back one sample longer. Trim to the original length.
    return denoised[:len(audio)]
51
+
52
+ # Function to apply a Wiener filter for noise reduction
53
def apply_wiener_filter(audio):
    """Run ``audio`` through ``scipy.signal.wiener`` with its default settings.

    Args:
        audio: Array of samples to filter.

    Returns:
        The filtered array returned by ``wiener``.
    """
    smoothed = wiener(audio)
    return smoothed
55
+
56
+
57
+
58
def createlex(filename):
    """Load a tab-separated lexicon file into a dictionary.

    Each non-blank line must hold exactly two tab-separated fields,
    ``key<TAB>value``. Leading and trailing whitespace is stripped from the
    line before it is split. Blank lines are skipped. If a key occurs more
    than once, the last occurrence wins.

    Args:
        filename: Path of the UTF-8 encoded lexicon file.

    Returns:
        A ``dict`` mapping each key string to its value string.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            tab-separated fields.
    """
    data_dict = {}

    with open(filename, "r", encoding="utf-8") as f:
        for lineno, raw_line in enumerate(f, start=1):
            entry = raw_line.strip()
            if not entry:
                # A blank line (e.g. a trailing newline at end of file)
                # carries no entry and must not abort loading.
                continue

            fields = entry.split("\t")
            if len(fields) != 2:
                raise ValueError(
                    f"{filename}:{lineno}: expected 'key<TAB>value', "
                    f"got {entry!r}"
                )

            key, value = fields
            data_dict[key] = value

    return data_dict
70
+
71
# Number-word lexicon, read once when the module is loaded and passed to
# convert2num for every transcript.
# NOTE(review): the "ta" file suffix and the word endings checked in
# convert2num look Tamil, while the ASR checkpoint name says Punjabi -
# confirm the lexicon matches the model.
lex = createlex("num_words_ta.txt")
72
+
73
def addnum(inlist):
    """Return the integer sum of the items in ``inlist``.

    Args:
        inlist: Iterable of values accepted by ``int()`` (for example numeric
            strings or integers).

    Returns:
        The sum as an ``int``; ``0`` for an empty iterable.

    Raises:
        ValueError: If an item cannot be converted by ``int()``.
    """
    # Use the builtin ``sum`` rather than a local accumulator that shadows it.
    return sum(int(num) for num in inlist)
79
+
80
+ from rapidfuzz import process
81
def get_val(word, lexicon, threshold=80, length_difference=4):
    """Look up ``word`` in ``lexicon`` using fuzzy matching on the keys.

    The best-scoring key is taken from ``rapidfuzz.process.extractOne``. It is
    accepted only if its score reaches ``threshold`` and its length differs
    from ``len(word)`` by at most ``length_difference`` characters.

    Args:
        word: The token to look up.
        lexicon: Mapping from known words to their values.
        threshold: Minimum similarity score, passed as ``score_cutoff``
            (default 80).
        length_difference: Largest allowed difference in character count
            between ``word`` and the matched key (default 4).

    Returns:
        ``lexicon[match]`` for an accepted match, otherwise ``None``.
    """
    result = process.extractOne(word, lexicon.keys(), score_cutoff=threshold)
    if result is None:
        # Nothing scored at or above the cutoff.
        return None

    match = result[0]
    if abs(len(match) - len(word)) > length_difference:
        # Similar by score but too different in length to be trusted.
        return None

    return lexicon[match]
100
+
101
def convert2num(input, lex):
    """Replace number words in ``input`` with the values found in ``lex``.

    The text is split on whitespace and each token is looked up with
    ``get_val``. A matched token whose last four characters are 'த்து' or
    'ற்று' is held back as a pending addend (presumably a combining form
    that is followed by a smaller number - verify against the lexicon).
    Pending addends are summed with ``addnum`` and emitted when the next
    matched token without such an ending arrives (its value is included in
    the sum) or when an unmatched token arrives. Other matched tokens are
    emitted as their lexicon value; unmatched tokens are emitted unchanged.

    Args:
        input: The text to convert.
        lex: Lexicon mapping number words to values, as used by ``get_val``.

    Returns:
        The converted text. Every '#' character is removed from the result,
        and the original spacing pattern (including extra and trailing
        spaces) is kept.
    """
    combining_endings = ('த்து', 'ற்று')
    fragments = []
    pending = []  # values waiting to be summed; non-empty means "adding"

    # A "#" sentinel token is appended so that any pending sum is flushed
    # after the last real word.
    for token in (input + " #").split():
        value = get_val(token, lex)

        if value is None:
            if pending:
                fragments.append(f"{addnum(pending)} {token} ")
                pending = []
            else:
                fragments.append(f"{token} ")
            continue

        if token[-4:] in combining_endings:
            pending.append(value)
        elif pending:
            pending.append(value)
            fragments.append(f"{addnum(pending)} ")
            pending = []
        else:
            fragments.append(f" {value} ")

    # Drop the sentinel (this also removes any other '#' in the text).
    return "".join(fragments).replace('#', '')
153
+
154
+ # Function to handle speech recognition
155
def recognize_speech(audio_file):
    """Transcribe an audio file and convert its number words to digits.

    Args:
        audio_file: Path (or other source accepted by ``librosa.load``) of
            the recording.

    Returns:
        The transcript text with "<s>" removed and number words converted
        by ``convert2num``.
    """
    # Load the recording resampled to 16 kHz.
    samples, rate = librosa.load(audio_file, sr=16000)

    # Noise reduction chain: high-pass, then Wiener, then wavelet thresholding.
    filtered = high_pass_filter(samples, rate)
    filtered = apply_wiener_filter(filtered)
    cleaned_audio = wavelet_denoise(filtered)

    transcript = asr_model(cleaned_audio)['text'].replace("<s>", "")
    return convert2num(transcript, lex)
169
+
170
def sel_lng(lng, mic=None, file=None):
    """Dispatch a recording to the recognizer chosen in the UI.

    Args:
        lng: Identifier of the selected model; only "model_1" is supported.
        mic: Audio recorded from the microphone, or ``None``. Takes
            precedence over ``file`` when both are given.
        file: Uploaded audio, or ``None``.

    Returns:
        The recognized text, or a message string when no audio was supplied
        or the model selection is not supported.
    """
    if mic is not None:
        audio = mic
    elif file is not None:
        audio = file
    else:
        return "You must either provide a mic recording or a file"

    if lng == "model_1":
        return recognize_speech(audio)

    # Previously any other selection (including no selection at all) fell
    # through and returned None, leaving the output box silently empty.
    return (
        f"Unsupported model selection: {lng!r}. "
        "Please choose a model from the dropdown."
    )
186
+
187
+
188
# Input widgets: a model selector and one audio component that accepts either
# a microphone recording or an uploaded file, delivered as a file path.
_model_choice = gr.Dropdown(["model_1"], label="Select Model")
_audio_source = gr.Audio(sources=["microphone", "upload"], type="filepath")

# Build the interface around sel_lng and start serving it immediately.
demo = gr.Interface(
    fn=sel_lng,
    inputs=[_model_choice, _audio_source],
    outputs=["textbox"],
    title="Automatic Speech Recognition",
    description="Demo for Automatic Speech Recognition. Use microphone to record speech. Please press Record button. Initially it will take some time to load the model. The recognized text will appear in the output textbox",
).launch()