ccibeekeoc42 committed
Commit 99be103 · verified · 1 parent: c0df644

Create app.py

Files changed (1): app.py (+253, -0)
app.py ADDED
@@ -0,0 +1,253 @@
+ # Loading the STT model (Whisper)
+ import torch
+ from transformers import pipeline
+
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ pipe = pipeline(
+     "automatic-speech-recognition",
+     model="okezieowen/whisper-small-multilingual-naija-11-03-2024",
+     device=device,
+ )
+
+ # Take audio and return transcribed text
+ def transcribe(audio):
+     outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe"})
+     return outputs["text"]
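+
+ # Quick smoke-test sketch (assumption: a local audio file named "sample.wav"
+ # sits next to this script; anything ffmpeg can decode also works):
+ # print(transcribe("sample.wav"))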
+
+
+ # The LLM model
+ from huggingface_hub import HfFolder
+ from openai import OpenAI
+
+ # Initialize an OpenAI-compatible client for the Hugging Face Inference Endpoint,
+ # authenticating with the locally cached Hugging Face token
+ client = OpenAI(
+     base_url="https://f2iozzwigntrzkve.us-east-1.aws.endpoints.huggingface.cloud/v1/",
+     api_key=HfFolder().get_token(),
+ )
+
+
+ def generate_llm_response(text, model_id="ccibeekeoc42/Llama3.1-8b-base-SFT-2024-11-09"):
+     """Generates an LLM response for the given text, with streaming support."""
+     full_response = []
+
+     # Create a streaming chat completion ("tgi" is the placeholder model name the
+     # TGI endpoint expects; model_id above is informational only)
+     chat_completion = client.chat.completions.create(
+         model="tgi",
+         messages=[
+             {"role": "system", "content": "You are a BRIEF AND DIRECT assistant, part of a speech pipeline, so keep your responses short, fluent, and straight to the point. Avoid markdown in responses."},
+             {"role": "user", "content": text},
+         ],
+         top_p=None,
+         temperature=None,
+         max_tokens=75,
+         stream=True,
+         seed=None,
+         stop=None,
+         frequency_penalty=None,
+         presence_penalty=None,
+     )
+     # Collect the streamed response chunks
+     for chunk in chat_completion:
+         if chunk.choices[0].delta.content:
+             full_response.append(chunk.choices[0].delta.content)
+
+     return "".join(full_response)
+
+ # Quick smoke test of the endpoint
+ generate_llm_response("Explain Deep Learning in Igbo")
+
+ # Helper functions to clean up LLM text
+ import re
+
+ # Language-specific replacements
+ ig_replacements = [('a', 'ah'), ('e', 'eh'), ('i', 'ee'), ('ị', 'ih'), ('ṅ', 'nn'), ('o', 'oh'), ('ọ', 'aw'), ('u', 'oo'), ('ụ', 'uh')]
+ yo_replacements = [('á', 'ah'), ('é', 'eh'), ('ẹ', 'e'), ('ó', 'oh'), ('ọ', 'aw'), ('ṣ', 'sh')]
+
+ # General replacement rules (note: visually identical entries may be distinct
+ # Unicode compositions, e.g. precomposed characters vs. combining marks)
+ replacements = [
+     ('²', 'squared'), ('½', 'one half'), ('¾', 'three quarters'), ('¼', 'one quarter'),
+     ('ā', 'a'), ('â', 'a'), ('å', 'a'), ('á', 'a'), ('à', 'a'), ('ả', 'a'), ('ã', 'a'),
+     ('č', 'c'), ('ç', 'c'),
+     ('ë', 'e'), ('ẹ̀', 'e'), ('ẹ́', 'e'), ('é', 'e'), ('è', 'e'), ('ẻ', 'e'), ('ẽ', 'e'), ('ẹ', 'e'), ('ė', 'e'), ('ē', 'e'), ('ę', 'e'),
+     ('ï', 'i'), ('ì', 'i'), ('ị', 'i'), ('ỉ', 'i'), ('ĩ', 'i'), ('í', 'i'), ('ī', 'i'),
+     ('ń', 'n'), ('ň', 'n'), ('ń', 'n'), ('ṅ', 'n'), ('ñ', 'n'), ('ǹ', 'n'),
+     ('ö', 'o'), ('ọ̀', 'o'), ('ò', 'o'), ('ó', 'o'), ('ô', 'o'), ('ọ', 'o'), ('ò', 'o'), ('ó', 'o'), ('ò', 'o'), ('õ', 'o'), ('ō', 'o'),
+     ('ṣ', 's'), ('š', 's'),
+     ('ụ', 'u'), ('ü', 'u'), ('ú', 'u'), ('ǔ', 'u'), ('ù', 'u'), ('ū', 'u'), ('ũ', 'u'),
+     ('ω', 'omega'), ('θ', 'theta'), ('ł', 'w'),
+     ('α', 'alpha'), ('β', 'beta'), ('γ', 'gamma'), ('δ', 'delta'), ('ε', 'epsilon'), ('ζ', 'zeta'), ('η', 'eta'), ('θ', 'theta'),
+     ('ι', 'iota'), ('κ', 'kappa'), ('λ', 'lambda'), ('μ', 'mu'), ('ν', 'nu'), ('ξ', 'xi'), ('ο', 'omicron'), ('π', 'pi'),
+     ('ρ', 'rho'),
+     ('_', ' '),
+ ]
+
+ # Function to clean up text
+ def cleanup_text(example, lng="en"):
+     example = example.lower()
+     if lng == "ig":
+         for src, dst in ig_replacements:
+             example = example.replace(src, dst)
+     elif lng == "yo":
+         for src, dst in yo_replacements:
+             example = example.replace(src, dst)
+     for src, dst in replacements:
+         example = example.replace(src, dst)  # Update the text directly
+     return example
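+
+ # For example, given the rules above, cleanup_text("Ọ dị mma", "ig")
+ # yields "aw dih mmah" (a -> ah, ị -> ih, ọ -> aw).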
+
+ # Normalizing the text
+ def normalize_text(text):
+     text = text.lower()                    # Convert to lowercase
+     text = re.sub(r'[^\w\s\']', '', text)  # Remove punctuation (except apostrophes)
+     text = ' '.join(text.split())          # Collapse extra whitespace
+     return text
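+
+ # For example: normalize_text("Hello, World!  It's 5 p.m.") -> "hello world it's 5 pm"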
+
+
+ # Language-specific number words
+ number_words = {
+     "en": {  # English
+         0: "zero", 1: "one", 2: "two", 3: "three", 4: "four", 5: "five", 6: "six", 7: "seven", 8: "eight", 9: "nine",
+         10: "ten", 11: "eleven", 12: "twelve", 13: "thirteen", 14: "fourteen", 15: "fifteen", 16: "sixteen",
+         17: "seventeen", 18: "eighteen", 19: "nineteen", 20: "twenty", 30: "thirty", 40: "forty", 50: "fifty",
+         60: "sixty", 70: "seventy", 80: "eighty", 90: "ninety", 100: "hundred", 1000: "thousand"
+     },
+     "yo": {  # Yoruba (16-19 follow the subtractive pattern from 20)
+         0: "ódo", 1: "ọ̀kan", 2: "méjì", 3: "mẹ́ta", 4: "mẹ́rin", 5: "márùn", 6: "mẹ́fà", 7: "mẹ̀je", 8: "mẹ̀jọ", 9: "mẹ́sàn",
+         10: "ẹ́wa", 11: "ọọkànlá", 12: "méjìlá", 13: "mẹ́tàlá", 14: "mẹ́rìnlá", 15: "árundínlógún", 16: "ẹ́rindínlógún",
+         17: "mẹ́tàdínlógún", 18: "méjìdínlógún", 19: "mọ́kàndínlógún", 20: "ogún", 30: "ọgbọ̀n", 40: "ogójì", 50: "àádọ́ta",
+         60: "ọgọ́ta", 70: "àádọ́rin", 80: "ọgọ́rin", 90: "àádọ́run", 100: "ọgọ́run", 1000: "ẹgbẹ̀rún"
+     },
+     "ig": {  # Igbo
+         0: "nọọ", 1: "otu", 2: "abụọ", 3: "atọ", 4: "anọ", 5: "ise", 6: "isii", 7: "asaa", 8: "asatọ", 9: "itoolu",
+         10: "iri", 11: "iri na otu", 12: "iri na abụọ", 13: "iri na atọ", 14: "iri na anọ", 15: "iri na ise",
+         16: "iri na isii", 17: "iri na asaa", 18: "iri na asatọ", 19: "iri na itoolu", 20: "iri abụọ",
+         30: "iri atọ", 40: "iri anọ", 50: "iri ise", 60: "iri isii", 70: "iri asaa", 80: "iri asatọ", 90: "iri itoolu",
+         100: "nari", 1000: "puku"
+     }
+ }
+
+ # Number to words function
+ def number_to_words(number, lang="en"):
+     words = number_words[lang]
+
+     if number < 20:
+         return words[number]
+     elif number < 100:
+         tens, unit = divmod(number, 10)
+         return words[tens * 10] + (" " + words[unit] if unit else "")
+     elif number < 1000:
+         hundreds, remainder = divmod(number, 100)
+         hundred = "hundred" if lang == "en" else "ọgọ́rùn" if lang == "yo" else "nari"
+         prefix = words[hundreds] + " " + hundred if hundreds > 1 else hundred
+         return prefix + (" " + number_to_words(remainder, lang) if remainder else "")
+     elif number < 1000000:
+         thousands, remainder = divmod(number, 1000)
+         thousand = "thousand" if lang == "en" else "ẹgbẹ̀rún" if lang == "yo" else "puku"
+         return number_to_words(thousands, lang) + " " + thousand + \
+             (" " + number_to_words(remainder, lang) if remainder else "")
+     elif number < 1000000000:
+         millions, remainder = divmod(number, 1000000)
+         million = "million" if lang == "en" else "mílíọ̀nù" if lang == "yo" else "nde"
+         return number_to_words(millions, lang) + " " + million + \
+             (" " + number_to_words(remainder, lang) if remainder else "")
+     elif number < 1000000000000:
+         billions, remainder = divmod(number, 1000000000)
+         billion = "billion" if lang == "en" else "bílíọ̀nù" if lang == "yo" else "ijeri"
+         return number_to_words(billions, lang) + " " + billion + \
+             (" " + number_to_words(remainder, lang) if remainder else "")
+     else:
+         return str(number)
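+
+ # Worked examples:
+ # number_to_words(245)  -> "two hundred forty five"
+ # number_to_words(1234) -> "one thousand two hundred thirty four"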
+
+ # Replace numbers in text
+ def replace_numbers_with_words(text, lang="en"):
+     def replace(match):
+         number = int(match.group())
+         return number_to_words(number, lang)
+
+     # Replace all standalone digit runs in the text
+     return re.sub(r'\b\d+\b', replace, text)
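+
+ # For example: replace_numbers_with_words("I counted 21 goats") -> "I counted twenty one goats"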
+
+ llm_response = generate_llm_response("Explain Deep Learning in Igbo")
+ llm_response_cleaned = normalize_text(cleanup_text(replace_numbers_with_words(llm_response, "yo"), "yo"))
+
+ print(f"LLM Response: {llm_response}")
+ print(f"LLM Response Cleaned: {llm_response_cleaned}")
+
+ # Loading the TTS model and vocoder
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+ from datasets import load_dataset
+
+ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+
+ model_default = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+ model = SpeechT5ForTextToSpeech.from_pretrained("ccibeekeoc42/speecht5_finetuned_naija_ig_yo_2025-01-20_O2")
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+
+ # Sending the models to the device
+ model_default.to(device)
+ model.to(device)
+ vocoder.to(device)
+
+ # Loading the speaker embeddings
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
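+
+ # Each x-vector is a 512-dim speaker embedding, so speaker_embeddings should
+ # have shape torch.Size([1, 512]); index 7306 is a US English speaker commonly
+ # used in SpeechT5 examples.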
+
+
+ # Returning speech from text (and bringing it back to the CPU)
+ # NB: synthesis uses model_default; swap in `model` for the fine-tuned voice
+ def synthesise(text):
+     inputs = processor(text=text, return_tensors="pt")
+     speech = model_default.generate_speech(
+         inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
+     )
+     return speech.cpu()
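+
+ # Usage sketch: SpeechT5 generates 16 kHz audio, so the returned 1-D float
+ # tensor pairs with a sample rate of 16000.
+ # waveform = synthesise("hello world")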
+
+
+ # Putting the STT and TTS systems together
+ import numpy as np
+
+ target_dtype = np.int16
+ max_range = np.iinfo(target_dtype).max  # Maximum value for 16-bit PCM audio conversion
+
+ # Speech-to-speech pipeline with a textbox output
+ def speech_to_speech_translation(audio):
+     # Speech to text
+     transcribed_text = transcribe(audio)
+     print(f"Transcribed: {transcribed_text}")
+
+     # Generate the LLM response
+     llm_response = generate_llm_response(transcribed_text)
+     llm_response_cleaned = normalize_text(cleanup_text(replace_numbers_with_words(llm_response, "yo"), "yo"))
+     print(f"LLM Response: {llm_response}")
+     print(f"LLM Response Cleaned: {llm_response_cleaned}")
+
+     # Text to speech: scale float audio in [-1, 1] to 16-bit PCM
+     synthesised_speech = synthesise(llm_response_cleaned)
+     synthesised_speech = (synthesised_speech.numpy() * max_range).astype(np.int16)
+
+     return transcribed_text, (16000, synthesised_speech), llm_response
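+
+ # End-to-end sketch (assumption: a local "sample.wav" exists):
+ # text, (sr, pcm), reply = speech_to_speech_translation("sample.wav")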
+
+
+ # Gradio demo
+ import gradio as gr
+
+ demo = gr.Blocks()
+
+ mic_translate = gr.Interface(
+     fn=speech_to_speech_translation,
+     inputs=gr.Audio(sources=["microphone"], type="filepath"),
+     outputs=[
+         gr.Textbox(label="Transcribed Text", interactive=False),
+         gr.Audio(label="Generated Speech", type="numpy"),
+         gr.Markdown(label="LLM Enhanced Response"),  # Markdown output for the raw LLM reply
+     ],
+ )
+
+ file_translate = gr.Interface(
+     fn=speech_to_speech_translation,
+     inputs=gr.Audio(sources=["upload"], type="filepath"),
+     outputs=[
+         gr.Textbox(label="Transcribed Text", interactive=False),
+         gr.Audio(label="Generated Speech", type="numpy"),
+         gr.Markdown(label="LLM Enhanced Response"),  # Markdown output for the raw LLM reply
+     ],
+ )
+
+ with demo:
+     gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
+
+ demo.queue().launch()  # queue requests so long-running inference calls don't block