Sj8287 committed on
Commit
e2d93c0
Β·
1 Parent(s): dd47eea

initial commit

Browse files
Files changed (3) hide show
  1. README.md +1 -0
  2. app.py +180 -0
  3. requirements.txt +21 -0
README.md CHANGED
@@ -8,6 +8,7 @@ sdk_version: 5.20.1
8
  app_file: app.py
9
  pinned: false
10
  license: cc-by-4.0
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
8
  app_file: app.py
9
  pinned: false
10
  license: cc-by-4.0
11
+ short_description: A speech recognition tool for Indic languages.
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from __future__ import annotations

import os
import tempfile

import gradio as gr
import torch
import torchaudio

import spaces

import nemo.collections.asr as nemo_asr

# Mapping from the human-readable language names shown in the UI dropdown to
# the language-ID codes passed to the IndicConformer model's transcribe().
# NOTE(review): "Bodo" maps to "br", which is the ISO 639-1 code for Breton;
# AI4Bharat models commonly use "brx" for Bodo — confirm against the model's
# supported language IDs before trusting Bodo transcriptions.
LANGUAGE_NAME_TO_CODE = {
    "Assamese": "as",
    "Bengali": "bn",
    "Bodo": "br",
    "Dogri": "doi",
    "Gujarati": "gu",
    "Hindi": "hi",
    "Kannada": "kn",
    "Kashmiri": "ks",
    "Konkani": "kok",
    "Maithili": "mai",
    "Malayalam": "ml",
    "Manipuri": "mni",
    "Marathi": "mr",
    "Nepali": "ne",
    "Odia": "or",
    "Punjabi": "pa",
    "Sanskrit": "sa",
    "Santali": "sat",
    "Sindhi": "sd",
    "Tamil": "ta",
    "Telugu": "te",
    "Urdu": "ur"
}
# Markdown rendered at the top of the app (via gr.Markdown in the main Blocks).
# This is a runtime string shown to users — edit content, not just comments,
# with care.
DESCRIPTION = """\
### **IndicConformer: Speech Recognition for Indian Languages** πŸŽ™οΈβž‘οΈπŸ“œ

This Gradio demo showcases **IndicConformer**, a speech recognition model for **22 Indian languages**. The model operates in two modes: **CTC (Connectionist Temporal Classification)** and **RNNT (Recurrent Neural Network Transducer)**, providing robust and accurate transcriptions across diverse linguistic and acoustic conditions.

#### **How to Use:**
1. **Upload or record** an audio clip in any supported Indian language.
2. Select the **mode** (CTC or RNNT) for transcription.
3. Click **"Transcribe"** to generate the corresponding text in the target language.
4. View or copy the output for further use.

πŸš€ Try it out and experience seamless speech recognition for Indian languages!
"""
# --- Model and runtime configuration (runs once at import time) ---

# NOTE(review): hf_token is read but never used below — presumably intended
# for gated-model downloads; confirm whether it should be passed to
# from_pretrained or removed.
hf_token = os.getenv("HF_TOKEN")
# Prefer CUDA, then Apple Silicon MPS, then CPU.
device = "cuda:0" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
# NOTE(review): torch_dtype is computed but never used in this file.
torch_dtype = torch.bfloat16 if device != "cpu" else torch.float32
model_name_or_path = "ai4bharat/indicconformer_stt_bn_hybrid_ctc_rnnt_large"
# Hybrid CTC/RNNT conformer; the active decoder is selected per request by
# setting model.cur_decoder ("ctc" or "rnnt") in the handlers below.
model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name_or_path).to(device)
model.eval()

# Cache Gradio example outputs only when explicitly enabled AND a GPU exists.
CACHE_EXAMPLES = os.getenv("CACHE_EXAMPLES") == "1" and torch.cuda.is_available()

AUDIO_SAMPLE_RATE = 16000  # sample rate the model expects
MAX_INPUT_AUDIO_LENGTH = 60  # in seconds; NOTE(review): not enforced anywhere yet
DEFAULT_TARGET_LANGUAGE = "Bengali"

@spaces.GPU
def run_asr_ctc(input_audio: str, target_language: str) -> str:
    """Transcribe an audio file using the model's CTC decoder.

    Args:
        input_audio: Path to the uploaded/recorded audio file (any sample
            rate; it is downmixed to mono and resampled to 16 kHz).
        target_language: Human-readable language name; must be a key of
            LANGUAGE_NAME_TO_CODE.

    Returns:
        The transcribed text.

    Raises:
        KeyError: If target_language is not a supported language name.
    """
    lang_id = LANGUAGE_NAME_TO_CODE[target_language]

    # Load, downmix to mono, and resample to the model's expected rate.
    waveform, orig_freq = torchaudio.load(input_audio)
    waveform = torch.mean(waveform, dim=0, keepdim=True)
    waveform = torchaudio.functional.resample(
        waveform, orig_freq=orig_freq, new_freq=AUDIO_SAMPLE_RATE
    )

    # BUG FIX: the original transcribed a hard-coded stale file
    # ('sample_audio_infer_ready.wav') while the freshly resampled audio was
    # silently discarded (the preprocess_audio() call that wrote that file was
    # commented out). Persist the prepared audio to a temp file and use it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp_path = tmp.name
    try:
        torchaudio.save(tmp_path, waveform, sample_rate=AUDIO_SAMPLE_RATE)
        model.cur_decoder = "ctc"
        ctc_text = model.transcribe(
            [tmp_path], batch_size=1, logprobs=False, language_id=lang_id
        )[0]
    finally:
        os.remove(tmp_path)  # always clean up the temp file

    return ctc_text

@spaces.GPU
def run_asr_rnnt(input_audio: str, target_language: str) -> str:
    """Transcribe an audio file using the model's RNNT decoder.

    Args:
        input_audio: Path to the uploaded/recorded audio file (any sample
            rate; it is downmixed to mono and resampled to 16 kHz).
        target_language: Human-readable language name; must be a key of
            LANGUAGE_NAME_TO_CODE.

    Returns:
        The transcribed text.

    Raises:
        KeyError: If target_language is not a supported language name.
    """
    lang_id = LANGUAGE_NAME_TO_CODE[target_language]

    # Load, downmix to mono, and resample to the model's expected rate.
    waveform, orig_freq = torchaudio.load(input_audio)
    waveform = torch.mean(waveform, dim=0, keepdim=True)
    waveform = torchaudio.functional.resample(
        waveform, orig_freq=orig_freq, new_freq=AUDIO_SAMPLE_RATE
    )

    # BUG FIX: the original transcribed a hard-coded stale file
    # ('sample_audio_infer_ready.wav') instead of the audio just prepared.
    # Persist the prepared audio to a temp file and transcribe that.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp_path = tmp.name
    try:
        torchaudio.save(tmp_path, waveform, sample_rate=AUDIO_SAMPLE_RATE)
        model.cur_decoder = "rnnt"
        # Renamed from the copy-pasted 'ctc_text' — this is RNNT output.
        rnnt_text = model.transcribe(
            [tmp_path], batch_size=1, logprobs=False, language_id=lang_id
        )[0]
    finally:
        os.remove(tmp_path)  # always clean up the temp file

    return rnnt_text


+ with gr.Blocks() as demo_asr_ctc:
94
+ with gr.Row():
95
+ with gr.Column():
96
+ with gr.Group():
97
+ input_audio = gr.Audio(label="Input speech", type="filepath")
98
+ target_language = gr.Dropdown(
99
+ label="Target language",
100
+ choices=LANGUAGE_NAME_TO_CODE.keys(),
101
+ value=DEFAULT_TARGET_LANGUAGE,
102
+ )
103
+ btn = gr.Button("Transcribe")
104
+ with gr.Column():
105
+ output_text = gr.Textbox(label="Transcribed text")
106
+
107
+ gr.Examples(
108
+ examples=[
109
+ ["assets/Bengali.wav", "Bengali", "English"],
110
+ ["assets/Gujarati.wav", "Gujarati", "Hindi"],
111
+ ["assets/Punjabi.wav", "Punjabi", "Hindi"],
112
+
113
+ ],
114
+ inputs=[input_audio, target_language],
115
+ outputs=output_text,
116
+ fn=run_asr_ctc,
117
+ cache_examples=CACHE_EXAMPLES,
118
+ api_name=False,
119
+ )
120
+
121
+ btn.click(
122
+ fn=run_asr_ctc,
123
+ inputs=[input_audio, target_language],
124
+ outputs=output_text,
125
+ api_name="asr",
126
+ )
127
+
# UI for RNNT-decoder transcription (rendered as a tab in the main app).
with gr.Blocks() as demo_asr_rnnt:
    with gr.Row():
        with gr.Column():
            with gr.Group():
                input_audio = gr.Audio(label="Input speech", type="filepath")
                target_language = gr.Dropdown(
                    label="Target language",
                    # Materialize as a list — Gradio expects a list of choices,
                    # not a dict_keys view.
                    choices=list(LANGUAGE_NAME_TO_CODE),
                    value=DEFAULT_TARGET_LANGUAGE,
                )
            btn = gr.Button("Transcribe")
        with gr.Column():
            output_text = gr.Textbox(label="Transcribed text")

    # BUG FIX: example rows previously carried three columns for two inputs;
    # Gradio rejects mismatched example arity. Dropped the stray third column.
    gr.Examples(
        examples=[
            ["assets/Bengali.wav", "Bengali"],
            ["assets/Gujarati.wav", "Gujarati"],
            ["assets/Punjabi.wav", "Punjabi"],
        ],
        inputs=[input_audio, target_language],
        outputs=output_text,
        fn=run_asr_rnnt,
        cache_examples=CACHE_EXAMPLES,
        api_name=False,
    )

    btn.click(
        fn=run_asr_rnnt,
        inputs=[input_audio, target_language],
        outputs=output_text,
        # BUG FIX: was "asr", colliding with the identical api_name registered
        # by the CTC tab; a unique name keeps both API endpoints addressable.
        api_name="asr_rnnt",
    )


# Top-level app shell: description, optional "Duplicate Space" button, and one
# tab per decoding mode (CTC / RNNT). The sub-apps are rendered inside tabs.
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_id="duplicate-button",
        # Hidden unless SHOW_DUPLICATE_BUTTON=1 is set in the environment.
        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
    )

    with gr.Tabs():
        with gr.Tab(label="CTC"):
            demo_asr_ctc.render()
        with gr.Tab(label="RNNT"):
            demo_asr_rnnt.render()


if __name__ == "__main__":
    # Queue incoming requests (up to 50 waiting) so GPU inference is not
    # hammered by concurrent calls.
    demo.queue(max_size=50).launch()
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cython==0.29.37
2
+ pyyaml==6.0.2
3
+ argparse==1.4.0
4
+ onnxruntime==1.19.0
5
+ tqdm==4.66.5
6
+ nemo_toolkit @ git+https://github.com/AI4Bharat/NeMo@nemo-v2
7
+ transformers==4.40.0
8
+ huggingface_hub==0.23
9
+ pytorch-lightning==2.4.0
10
+ hydra-core==1.3.2
11
+ librosa==0.10.2.post1
12
+ sentencepiece==0.2.0
13
+ pandas==2.2.2
14
+ lhotse==1.27.0
15
+ editdistance==0.8.1
16
+ jiwer==3.0.4
17
+ pyannote.audio==3.3.1
18
+ webdataset==0.2.100
19
+ datasets==2.21.0
20
+ IPython==8.27.0
21
+ joblib==1.4.2