File size: 5,104 Bytes
58da011
94a72a8
 
 
0745f2e
76b1d4d
0745f2e
1ea63df
94a72a8
76b1d4d
94a72a8
 
 
 
 
 
 
 
 
 
95d2589
94a72a8
76b1d4d
94a72a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76b1d4d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94a72a8
 
58da011
 
 
94a72a8
 
 
 
 
 
 
c8ba291
 
94a72a8
 
 
76b1d4d
0745f2e
76b1d4d
 
 
0745f2e
58da011
db26e2e
58da011
 
76b1d4d
58da011
 
 
 
 
94a72a8
0745f2e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94a72a8
0745f2e
94a72a8
 
 
0745f2e
94a72a8
 
 
 
 
 
 
 
 
 
 
 
c8ba291
94a72a8
 
 
c8ba291
94a72a8
 
 
 
 
 
 
 
76b1d4d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import pandas as pd
import difflib
import gradio as gr
from transformers import pipeline
import librosa
import re

# import numpy as np

# Shared speech-recognition pipeline (Whisper large-v3 checkpoint), created once
# at import time and used by Model_Voice_Text.transcribe for every audio chunk.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")


class Model_Voice_Text():
    """Transcribe speech to text and screen the transcript for key details.

    The transcript is scanned for priority keywords, a phone number and
    four-digit identification numbers ("SINs"); the latter are checked
    against a fixed list of known numbers to decide eligibility.
    """

    # Compiled once for the class; both patterns match unbroken digit runs only.
    _PHONE_PATTERN = re.compile(r'\b\d{7,}\b')
    _SIN_PATTERN = re.compile(r'\b\d{4}\b')

    def __init__(self) -> None:
        # Words whose (fuzzy) presence in a transcript is reported.
        self.KEYWORDS = ['suicide', 'urgent', 'poor', 'in-need', 'old', 'pregnant', 'refugee', 'new immigrant', 'patient', 'ill', 'sick', 'anxiety', 'anxious']
        # Known four-digit numbers; a transcript mentioning one is "Eligible".
        self.sins = [5678, 1967, 4530, 3986, 9750, 1065, 7134, 6410, 2906, 8056, 1307, 3503, 7708, 4980, 1248, 3491, 6157, 9242, 3198, 5632]

    def find_similar_word_count(self, text, target_var):
        """Count the words in ``text`` that closely resemble ``target_var``.

        Args:
            text: The text to search (split on whitespace).
            target_var: The word to look for.

        Returns:
            The number of words whose difflib similarity ratio to
            ``target_var`` is at least 0.75.
        """
        words = text.split()
        if not words:
            # get_close_matches rejects n <= 0, and there is nothing to count.
            return 0

        # n defaults to 3, which would silently cap the count; allow every
        # word in the text to be reported instead.
        similar_words = difflib.get_close_matches(
            target_var, words, n=len(words), cutoff=0.75
        )
        return len(similar_words)

    def extract_phone_number(self, text):
        """Return the first run of seven or more digits in ``text``.

        Returns:
            The matched digit string, or ``"000"`` when none is found.
        """
        match = self._PHONE_PATTERN.search(text)
        if match:
            return match.group()
        return "000"

    def extract_sin(self, text):
        """Return every standalone four-digit number found in ``text``.

        Returns:
            A list of four-digit strings, or the string ``"Not detected"``
            when the text contains none.
        """
        matches = self._SIN_PATTERN.findall(text)
        if matches:
            return matches
        return "Not detected"

    def check_eligibility(self, sins_ex):
        """Report whether any extracted number is one of the known SINs.

        Args:
            sins_ex: An iterable of candidate numbers (strings or ints), or a
                single string such as the ``"Not detected"`` placeholder.

        Returns:
            ``"Eligible"`` if at least one candidate is in ``self.sins``,
            otherwise ``"Not Eligible"``.
        """
        if isinstance(sins_ex, str):
            # A bare string is one candidate, not a sequence of characters.
            sins_ex = [sins_ex]

        # Extracted values are strings while self.sins holds ints; compare
        # both sides as text so the two representations can actually match.
        known = {str(sin) for sin in self.sins}
        for number in sins_ex:
            if str(number).strip() in known:
                return "Eligible"
        return "Not Eligible"

    def matching_text(self, text):
        """Build a one-row DataFrame summarising what was found in ``text``.

        Columns: ``Keywords`` (flat list alternating keyword and count, or
        ``["nothing found"]``), ``Phone Number``, ``SIN`` (comma-separated
        matches or ``"Not detected"``), ``Eligible`` and ``text``.
        """
        ret = []
        for target_var in self.KEYWORDS:
            count = self.find_similar_word_count(text, target_var)
            if count > 0:
                ret.append(target_var)
                ret.append(count)
        if not ret:
            ret.append("nothing found")

        ph_num = self.extract_phone_number(text=text)
        sin = self.extract_sin(text=text)
        eligib = self.check_eligibility(sins_ex=sin)

        # Collapse multiple matches into one cell: a raw list of length > 1
        # would clash with the single-row 'Keywords' column and make the
        # DataFrame constructor raise.
        sin_cell = ", ".join(sin) if isinstance(sin, list) else sin

        data = {'Keywords': [ret],
                'Phone Number': ph_num,
                'SIN': sin_cell,
                'Eligible': eligib,
                'text': text}
        return pd.DataFrame(data)

    def transcribe(self, audio_f, chunk_seconds=20):
        """Transcribe an audio file in fixed-length chunks.

        Args:
            audio_f: Path of the audio file to load.
            chunk_seconds: Length of each chunk sent to the recogniser.

        Returns:
            The concatenated text of all chunks.
        """
        # Load at the rate the model's feature extractor expects; librosa's
        # default (22050 Hz) would be misread by the pipeline as that rate.
        target_sr = transcriber.feature_extractor.sampling_rate
        audio, sr = librosa.load(audio_f, sr=target_sr)

        step = int(chunk_seconds * sr)
        pieces = []
        for start in range(0, len(audio), step):
            block = audio[start:start + step]
            # Pass the rate explicitly so the pipeline never has to guess it.
            result = transcriber({"raw": block, "sampling_rate": sr})
            pieces.append(result["text"])

        return "".join(pieces)

    def voice_to_text_s(self, audio):
        """Transcribe ``audio`` and return the screening DataFrame for it."""
        tran_text = self.transcribe(audio)
        # Matching is done on lower-cased text because KEYWORDS are lower-case.
        match_results = self.matching_text(tran_text.lower())
        return match_results


# Single shared model instance backing both input tabs.
model = Model_Voice_Text()

demo = gr.Blocks()

# Build one interface per audio source (microphone first, then file upload);
# both feed a file path to the same handler and show the resulting table.
micro_ph, file_ph = (
    gr.Interface(
        fn=model.voice_to_text_s,
        inputs=gr.Audio(source=audio_source, type="filepath"),
        outputs=gr.Dataframe(label="Output Box", interactive=True),
    )
    for audio_source in ("microphone", "upload")
)

# Present the two interfaces as tabs inside the Blocks container.
with demo:
    tab_titles = ["Transcribe Microphone", "Transcribe Audio File"]
    gr.TabbedInterface([micro_ph, file_ph], tab_titles)

demo.launch(debug=True)