Create app.py
app.py (ADDED)

# Loading the STT Model (Whisper)
import torch
from transformers import pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    "automatic-speech-recognition",
    model="okezieowen/whisper-small-multilingual-naija-11-03-2024",
    device=device,
)

# Take audio and return the transcribed text
def transcribe(audio):
    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe"})
    return outputs["text"]
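
# Usage sketch (illustrative; "sample_clip.wav" is a hypothetical path --
# the pipeline accepts a filepath string, which is what Gradio passes below):
# transcribe("sample_clip.wav")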

# The LLM Model
from huggingface_hub import HfFolder
from openai import OpenAI

# Initialize OpenAI client for a Hugging Face Inference Endpoint
# (HfFolder is deprecated in recent huggingface_hub releases;
# huggingface_hub.get_token() is the newer accessor.)
client = OpenAI(
    base_url="https://f2iozzwigntrzkve.us-east-1.aws.endpoints.huggingface.cloud/v1/",
    api_key=HfFolder().get_token(),
)

def generate_llm_response(text, model_id="ccibeekeoc42/Llama3.1-8b-base-SFT-2024-11-09"):
    """Generates an LLM response for the given text, with streaming support.

    Note: `model_id` is currently unused; the dedicated endpoint serves a
    single model, addressed as "tgi".
    """
    full_response = []

    # Create streaming response
    chat_completion = client.chat.completions.create(
        model="tgi",
        messages=[
            {"role": "system", "content": "You are a BRIEF AND DIRECT assistant, part of a speech pipeline, so keep your responses short, fluent, and straight to the point. Avoid markdown in responses."},
            {"role": "user", "content": text},
        ],
        top_p=None,
        temperature=None,
        max_tokens=75,
        stream=True,
        seed=None,
        stop=None,
        frequency_penalty=None,
        presence_penalty=None,
    )
    # Collect streamed response chunks
    for chunk in chat_completion:
        if chunk.choices[0].delta.content:
            full_response.append(chunk.choices[0].delta.content)

    return "".join(full_response)

# Startup smoke test for the LLM endpoint
generate_llm_response("Explain Deep Learning in Igbo")
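
# The dedicated endpoint above sleeps when idle, so the first request can fail
# while it scales back up. A minimal retry wrapper, as a sketch (not part of
# the original pipeline; the name and retry policy here are illustrative):
import time

def generate_llm_response_with_retry(text, retries=3, delay=5):
    """Retry generate_llm_response to smooth over endpoint cold starts."""
    for attempt in range(retries):
        try:
            return generate_llm_response(text)
        except Exception:
            if attempt == retries - 1:
                raise
            time.sleep(delay)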

# Helper Functions to Clean Up LLM Text
import re

# Language-specific replacements
ig_replacements = [('a', 'ah'), ('e', 'eh'), ('i', 'ee'), ('ị', 'ih'), ('ṅ', 'nn'), ('o', 'oh'), ('ọ', 'aw'), ('u', 'oo'), ('ụ', 'uh')]
yo_replacements = [('á', 'ah'), ('é', 'eh'), ('ẹ', 'e'), ('ó', 'oh'), ('ọ', 'aw'), ('ṣ', 'sh')]

# Overall replacement rules
replacements = [
    ('²', 'squared'), ('½', 'half'), ('¾', 'three quarters'), ('¼', 'one quarter'),
    ('ā', 'a'), ('â', 'a'), ('å', 'a'), ('á', 'a'), ('à', 'a'), ('ả', 'a'), ('ã', 'a'),
    ('č', 'c'), ('ç', 'c'),
    ('ë', 'e'), ('ẹ̀', 'e'), ('ẹ́', 'e'), ('é', 'e'), ('è', 'e'), ('ẻ', 'e'), ('ẽ', 'e'), ('ẹ', 'e'), ('ė', 'e'), ('ē', 'e'), ('ę', 'e'),
    ('ï', 'i'), ('ì', 'i'), ('ị', 'i'), ('ỉ', 'i'), ('ĩ', 'i'), ('í', 'i'), ('ī', 'i'),
    ('ń', 'n'), ('ň', 'n'), ('ṅ', 'n'), ('ñ', 'n'), ('ǹ', 'n'),
    ('ö', 'o'), ('ọ̀', 'o'), ('ò', 'o'), ('ó', 'o'), ('ô', 'o'), ('ọ', 'o'), ('õ', 'o'), ('ō', 'o'),
    ('ṣ', 's'), ('š', 's'),
    ('ụ', 'u'), ('ü', 'u'), ('ú', 'u'), ('ǔ', 'u'), ('ù', 'u'), ('ū', 'u'), ('ũ', 'u'),
    ('ω', 'omega'), ('ł', 'w'),
    ('α', 'alpha'), ('β', 'beta'), ('γ', 'gamma'), ('δ', 'delta'), ('ε', 'epsilon'), ('ζ', 'zeta'), ('η', 'eta'), ('θ', 'theta'),
    ('ι', 'iota'), ('κ', 'kappa'), ('λ', 'lambda'), ('μ', 'mu'), ('ν', 'nu'), ('ξ', 'xi'), ('ο', 'omicron'), ('π', 'pi'),
    ('ρ', 'rho'),
    ('_', ' '),
]

# Function to clean up text
def cleanup_text(example, lng="en"):
    example = example.lower()
    if lng == "ig":
        for src, dst in ig_replacements:
            example = example.replace(src, dst)
    elif lng == "yo":
        for src, dst in yo_replacements:
            example = example.replace(src, dst)
    for src, dst in replacements:
        example = example.replace(src, dst)  # Update text directly
    return example

# Normalizing the text
def normalize_text(text):
    text = text.lower()                    # Convert to lowercase
    text = re.sub(r"[^\w\s\']", '', text)  # Remove punctuation (except apostrophes)
    text = ' '.join(text.split())          # Remove extra whitespace
    return text
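
# Quick sanity check (illustrative input; the exact output depends on the
# replacement tables above):
# print(normalize_text(cleanup_text("Ọ dị mma", "ig")))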

# Language-specific number words
number_words = {
    "en": {  # English
        0: "zero", 1: "one", 2: "two", 3: "three", 4: "four", 5: "five", 6: "six", 7: "seven", 8: "eight", 9: "nine",
        10: "ten", 11: "eleven", 12: "twelve", 13: "thirteen", 14: "fourteen", 15: "fifteen", 16: "sixteen",
        17: "seventeen", 18: "eighteen", 19: "nineteen", 20: "twenty", 30: "thirty", 40: "forty", 50: "fifty",
        60: "sixty", 70: "seventy", 80: "eighty", 90: "ninety", 100: "hundred", 1000: "thousand"
    },
    "yo": {  # Yoruba
        0: "ódo", 1: "ọ̀kan", 2: "méjì", 3: "mẹ́ta", 4: "mẹ́rin", 5: "márùn", 6: "mẹ́fà", 7: "mẹ̀je", 8: "mẹ̀jọ", 9: "mẹ́sàn",
        10: "ẹ́wa", 11: "ọọkànlá", 12: "méjìlá", 13: "mẹ́tàlá", 14: "mẹ́rìnlá", 15: "árundínlógún", 16: "ẹ́rindínlógún", 17: "ẹ́rindínlógún",
        18: "ẹ́rindínlógún", 19: "ẹ́rindínlógún", 20: "ogún", 30: "ọgbọ̀n", 40: "ogójì", 50: "àádọ́ta", 60: "ọgọ́ta", 70: "àádọ́rin",
        80: "ọgọ́rin", 90: "àádọ́run", 100: "ọgọ́run", 1000: "ẹgbẹ̀rún"
    },
    "ig": {  # Igbo
        0: "nọọ", 1: "otu", 2: "abụọ", 3: "atọ", 4: "anọ", 5: "ise", 6: "isii", 7: "asaa", 8: "asatọ", 9: "itoolu",
        10: "iri", 11: "iri na otu", 12: "iri na abụọ", 13: "iri na atọ", 14: "iri na anọ", 15: "iri na ise",
        16: "iri na isii", 17: "iri na asaa", 18: "iri na asatọ", 19: "iri na itoolu", 20: "iri abụọ",
        30: "iri atọ", 40: "iri anọ", 50: "iri ise", 60: "iri isii", 70: "iri asaa", 80: "iri asatọ", 90: "iri itoolu",
        100: "nari", 1000: "puku"
    }
}

# Number to words function
def number_to_words(number, lang="en"):
    words = number_words[lang]

    if number < 20:
        return words[number]
    elif number < 100:
        tens, unit = divmod(number, 10)
        return words[tens * 10] + (" " + words[unit] if unit else "")
    elif number < 1000:
        hundreds, remainder = divmod(number, 100)
        hundred_word = "hundred" if lang == "en" else "ọgọ́rùn" if lang == "yo" else "nari"
        prefix = words[hundreds] + " " + hundred_word if hundreds > 1 else hundred_word
        return prefix + (" " + number_to_words(remainder, lang) if remainder else "")
    elif number < 1_000_000:
        thousands, remainder = divmod(number, 1000)
        thousand_word = "thousand" if lang == "en" else "ẹgbẹ̀rún" if lang == "yo" else "puku"
        return (number_to_words(thousands, lang) + " " + thousand_word
                + (" " + number_to_words(remainder, lang) if remainder else ""))
    elif number < 1_000_000_000:
        millions, remainder = divmod(number, 1_000_000)
        million_word = "million" if lang == "en" else "mílíọ̀nù" if lang == "yo" else "nde"
        return (number_to_words(millions, lang) + " " + million_word
                + (" " + number_to_words(remainder, lang) if remainder else ""))
    elif number < 1_000_000_000_000:
        billions, remainder = divmod(number, 1_000_000_000)
        billion_word = "billion" if lang == "en" else "bílíọ̀nù" if lang == "yo" else "ijeri"
        return (number_to_words(billions, lang) + " " + billion_word
                + (" " + number_to_words(remainder, lang) if remainder else ""))
    else:
        return str(number)

# Replace numbers in text
def replace_numbers_with_words(text, lang="en"):
    def replace(match):
        number = int(match.group())
        return number_to_words(number, lang)

    # Replace all whole numbers in the text
    return re.sub(r'\b\d+\b', replace, text)
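
# For example, replace_numbers_with_words("I counted 25 goats", "en")
# returns "I counted twenty five goats".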

# Startup smoke test for the text-cleanup pipeline
llm_response = generate_llm_response("Explain Deep Learning in Igbo")
llm_response_cleaned = normalize_text(cleanup_text(replace_numbers_with_words(llm_response, "yo"), "yo"))

print(f"LLM Response: {llm_response}")
print(f"LLM Response Cleaned: {llm_response_cleaned}")

# Loading the TTS model and vocoder
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

model_default = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("ccibeekeoc42/speecht5_finetuned_naija_ig_yo_2025-01-20_O2")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Sending the models to the device
model_default.to(device)
model.to(device)
vocoder.to(device)

# Loading speaker embeddings
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# Returning speech from text (and bringing it back to the CPU)
# Note: this calls the default SpeechT5 checkpoint; the fine-tuned `model`
# loaded above is not used here.
def synthesise(text):
    inputs = processor(text=text, return_tensors="pt")
    speech = model_default.generate_speech(
        inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
    )
    return speech.cpu()
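
# Optional debugging aid (a sketch; soundfile is an extra dependency not
# required by the app itself, and SpeechT5 generates 16 kHz audio):
# import soundfile as sf
# sf.write("tts_sample.wav", synthesise("hello world").numpy(), 16000)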

# Putting the STT, LLM, and TTS systems together
import numpy as np

target_dtype = np.int16
max_range = np.iinfo(target_dtype).max  # Maximum value for 16-bit PCM audio conversion

# Speech-to-speech pipeline (with text outputs for the UI)
def speech_to_speech_translation(audio):
    # Speech to Text
    transcribed_text = transcribe(audio)
    print(f"Transcribed: {transcribed_text}")

    # Generate LLM Response (cleanup is currently hard-coded to Yoruba rules)
    llm_response = generate_llm_response(transcribed_text)
    llm_response_cleaned = normalize_text(cleanup_text(replace_numbers_with_words(llm_response, "yo"), "yo"))
    print(f"LLM Response: {llm_response}")
    print(f"LLM Response Cleaned: {llm_response_cleaned}")

    # Text to Speech, scaled from float [-1, 1] to 16-bit PCM
    synthesised_speech = synthesise(llm_response_cleaned)
    synthesised_speech = (synthesised_speech.numpy() * max_range).astype(np.int16)

    return transcribed_text, (16000, synthesised_speech), llm_response
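
# Local smoke-test sketch ("sample_clip.wav" is a hypothetical path, useful
# when running outside the Gradio UI):
# text, (sr, wav), reply = speech_to_speech_translation("sample_clip.wav")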

# Gradio Demo
import gradio as gr

demo = gr.Blocks()

mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=[
        gr.Textbox(label="Transcribed Text", interactive=False),
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Markdown(label="LLM Enhanced Response"),
    ],
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=[
        gr.Textbox(label="Transcribed Text", interactive=False),
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Markdown(label="LLM Enhanced Response"),
    ],
)

with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

# enable_queue was removed from launch() in recent Gradio releases;
# queue() is the current way to enable request queuing.
demo.queue().launch()