ccibeekeoc42 committed on
Commit 7ec764e Β· verified Β· 1 Parent(s): 8fa7e6b

Updated File with new TTS (YarnGPT)

Files changed (1)
  1. app.py +143 -170
app.py CHANGED
@@ -3,21 +3,36 @@ import torch
  from transformers import pipeline
 
  # Loading the TTS and Vocoder ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
- from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
- from datasets import load_dataset
-
- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
- model_default = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-
- # sending the model to device
- model_default.to(device)
- vocoder.to(device)
-
- # Loading speaker embeddings
- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
- speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+ !git clone https://github.com/saheedniyi02/yarngpt.git
+ !pip install -qU outetts uroman
+
+ import os
+ import re
+ import json
+ import torch
+ import inflect
+ import random
+ import uroman as ur
+ import numpy as np
+ import torchaudio
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from outetts.wav_tokenizer.decoder import WavTokenizer
+
+ !wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
+ !wget https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt
+
+ from yarngpt.audiotokenizer import AudioTokenizerV2
+
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+ tokenizer_path = "saheedniyi/YarnGPT2"
+ wav_tokenizer_config_path = "/content/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
+ wav_tokenizer_model_path = "/content/wavtokenizer_large_speech_320_24k.ckpt"
+
+ audio_tokenizer = AudioTokenizerV2(tokenizer_path, wav_tokenizer_model_path, wav_tokenizer_config_path)
+ tts_model = AutoModelForCausalLM.from_pretrained(tokenizer_path, torch_dtype="auto").to(audio_tokenizer.device)
 
  # The LLM Model ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  from huggingface_hub import HfFolder
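
Note that `!git clone`, `!pip install`, and `!wget` are notebook shell escapes, and the `/content/...` paths are Colab conventions; in a plain `app.py` these lines will not execute. A minimal pure-Python equivalent, sketched here under the assumption that the same two WavTokenizer artifacts are wanted (the `hf_hub_download` calls mirror the `!wget` URLs above):

```python
# Sketch: fetch the WavTokenizer config and checkpoint without notebook magics.
# Repo IDs and filenames are taken from the !wget URLs; everything else is illustrative.
from huggingface_hub import hf_hub_download

wav_tokenizer_config_path = hf_hub_download(
    repo_id="novateur/WavTokenizer-medium-speech-75token",
    filename="wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml",
)
wav_tokenizer_model_path = hf_hub_download(
    repo_id="novateur/WavTokenizer-large-speech-75token",
    filename="wavtokenizer_large_speech_320_24k.ckpt",
)
```

The `yarngpt` package itself would likewise need a regular install (for example `pip install git+https://github.com/saheedniyi02/yarngpt.git`, assuming the repo is pip-installable) rather than a shell-magic clone.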
@@ -75,140 +90,28 @@ def transcribe(audio):
      return outputs["text"]
 
 
- # Helper Functions to Cleanup LLM Texts ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- # Replacement rules
- import re
- # Language-specific replacements
- ig_replacements = [('a', 'ah'), ('e', 'eh'), ('i', 'ee'), ('α»‹', 'ih'), ('αΉ…', 'nn'), ('o','oh'), ('ọ','aw'), ('u','oo'), ('α»₯','uh')]
- yo_replacements = [('Ñ', 'ah'), ('é', 'eh'), ('ẹ', 'e'), ('ó', 'oh'), ('ọ', 'aw'), ('ṣ', 'sh')]
-
- # Overall Replacements Rules
- replacements = [
-     ('Β²','squared'), ('Β½','square-root'), ('ΒΎ','one quarter'), ('ΒΌ','cubeed-root'),
-     ('ā','a'), ('Γ’', 'a'), ('Γ₯','a'), ('Γ‘', 'a'), ('Γ ', 'a'), ('αΊ£', 'a'), ('Γ£', 'a'),
-     ('č', 'c'), ('ç', 'c'),
-     ('Γ«','e'), ('αΊΉΜ€','e'), ('ẹ́','e'), ('Γ©', 'e'), ('Γ¨', 'e'), ('αΊ»', 'e'), ('αΊ½', 'e'), ('αΊΉ', 'e'), ('Δ—', 'e'), ('Δ“', 'e'), ('Δ™', 'e'),
-     ('Γ―', 'i'), ('Γ¬', 'i'), ('α»‹', 'i'), ('ỉ', 'i'), ('Δ©', 'i'), ('Γ­', 'i'), ('Δ«', 'i'),
-     ('Ε„', 'n'), ('ň', 'n'), ('Ε„', 'n'), ('αΉ…', 'n'), ('Γ±', 'n'), ('ΗΉ', 'n'),
-     ('ΓΆ','o'), ('ọ̀','o'), ('Γ²', 'o'), ('Γ³', 'o'), ('Γ΄', 'o'), ('ọ', 'o'), ('Γ²','o'), ('ó','o'), ('oΜ€','o'), ('Γ΅','o'), ('ō','o'),
-     ('αΉ£', 's'), ('Ε‘', 's'),
-     ('α»₯', 'u'), ('ΓΌ', 'u'), ('ΓΊ', 'u'), ('Η”', 'u'), ('ΓΉ', 'u'), ('Ε«', 'u'), ('Ε©', 'u'),
-     ('Ο‰','omega'), ('ΞΈ','theta'), ('Ε‚','w'),
-     ('Ξ±','alpha'), ('Ξ²','beta'), ('Ξ³','gamma'), ('Ξ΄','delta'), ('Ξ΅','epsilon'), ('ΞΆ','zeta'), ('Ξ·','eta'), ('ΞΈ','theta'),
-     ('ΞΉ','iota'), ('ΞΊ','kappa'), ('Ξ»','lambda'), ('ΞΌ','mu'), ('Ξ½','nu'), ('ΞΎ','xi'), ('ΞΏ','omicron'), ('Ο€','pi'),
-     ('ρ','rho'),
-     ('_',' '),
- ]
-
- # Function to clean up text
- def cleanup_text(example, lng="en"):
-     example = example.lower()
-     if lng == "ig":
-         for src, dst in ig_replacements:
-             example = example.replace(src, dst)
-     elif lng == "yo":
-         for src, dst in yo_replacements:
-             example = example.replace(src, dst)
-     for src, dst in replacements:
-         example = example.replace(src, dst)  # Update text directly
-     return example
-
- # Normalizing the text
- def normalize_text(text):
-     text = text.lower()  # Convert to lowercase
-     text = re.sub(r'[^\w\s\']', '', text)  # Remove punctuation (except apostrophes)
-     text = ' '.join(text.split())  # Remove extra whitespace
-     return text
-
- # Language-specific number words
- number_words = {
-     "en": {  # English
-         0: "zero", 1: "one", 2: "two", 3: "three", 4: "four", 5: "five", 6: "six", 7: "seven", 8: "eight", 9: "nine",
-         10: "ten", 11: "eleven", 12: "twelve", 13: "thirteen", 14: "fourteen", 15: "fifteen", 16: "sixteen",
-         17: "seventeen", 18: "eighteen", 19: "nineteen", 20: "twenty", 30: "thirty", 40: "forty", 50: "fifty",
-         60: "sixty", 70: "seventy", 80: "eighty", 90: "ninety", 100: "hundred", 1000: "thousand"
-     },
-     "yo": {  # Yoruba
-         0: "Γ³do", 1: "ọ̀kan", 2: "mΓ©jΓ¬", 3: "mẹ́ta", 4: "mẹ́rin", 5: "mΓ‘rΓΉn", 6: "mẹ́fΓ ", 7: "mαΊΉΜ€je", 8: "mαΊΉΜ€jọ", 9: "mẹ́sΓ n",
-         10: "ẹ́wa", 11: "ọọkànlÑ", 12: "méjìlÑ", 13: "mẹ́tàlÑ", 14: "mẹ́rìnlÑ", 15: "Ñrundínlógún", 16: "ẹ́rindínlógún", 17: "ẹ́rindínlógún",
-         18: "ẹ́rindΓ­nlΓ³gΓΊn", 19: "ẹ́rindΓ­nlΓ³gΓΊn", 20: "ogΓΊn", 30: "ọgbọ̀n", 40: "ogΓ³jΓ¬", 50: "Γ Γ‘dọ́ta", 60: "ọgọ́ta", 70: "Γ Γ‘dọ́rin",
-         80: "ọgọ́rin", 90: "Γ Γ‘dọ́run", 100: "ọgọ́run", 1000: "αΊΉgbαΊΉΜ€rΓΊn"
-     },
-     "ig": {  # Igbo
-         0: "nọọ", 1: "otu", 2: "abα»₯ọ", 3: "atọ", 4: "anọ", 5: "ise", 6: "isii", 7: "asaa", 8: "asatọ", 9: "itoolu",
-         10: "iri", 11: "iri na otu", 12: "iri na abα»₯ọ", 13: "iri na atọ", 14: "iri na anọ", 15: "iri na ise",
-         16: "iri na isii", 17: "iri na asaa", 18: "iri na asatọ", 19: "iri na itoolu", 20: "iri abα»₯ọ",
-         30: "iri atọ", 40: "iri anọ", 50: "iri ise", 60: "iri isii", 70: "iri asaa", 80: "iri asatọ", 90: "iri itoolu",
-         100: "nari", 1000: "puku"
-     }
- }
-
- # Number to words function
- def number_to_words(number, lang="en"):
-     words = number_words[lang]
-
-     if number < 20:
-         return words[number]
-     elif number < 100:
-         tens, unit = divmod(number, 10)
-         return words[tens * 10] + (" " + words[unit] if unit else "")
-     elif number < 1000:
-         hundreds, remainder = divmod(number, 100)
-         return (words[hundreds] + " " + ("hundred" if lang == "en" else
-                 "ọgọ́rùn" if lang == "yo" else "nari") if hundreds > 1 else
-                 "hundred" if lang == "en" else "ọgọ́rùn" if lang == "yo" else "nari") + \
-                (" " + number_to_words(remainder, lang) if remainder else "")
-     elif number < 1000000:
-         thousands, remainder = divmod(number, 1000)
-         return (number_to_words(thousands, lang) + " " + ("thousand" if lang == "en" else
-                 "αΊΉgbαΊΉΜ€rΓΊn" if lang == "yo" else "puku")) + \
-                (" " + number_to_words(remainder, lang) if remainder else "")
-     elif number < 1000000000:
-         millions, remainder = divmod(number, 1000000)
-         return number_to_words(millions, lang) + " " + ("million" if lang == "en" else
-                "mΓ­líọ̀nΓΉ" if lang == "yo" else "nde") + \
-                (" " + number_to_words(remainder, lang) if remainder else "")
-     elif number < 1000000000000:
-         billions, remainder = divmod(number, 1000000000)
-         return number_to_words(billions, lang) + " " + ("billion" if lang == "en" else
-                "bΓ­líọ̀nΓΉ" if lang == "yo" else "ijeri") + \
-                (" " + number_to_words(remainder, lang) if remainder else "")
-     else:
-         return str(number)
-
- # Replace numbers in text
- def replace_numbers_with_words(text, lang="en"):
-     def replace(match):
-         number = int(match.group())
-         return number_to_words(number, lang)
-
-     # Replace all numbers in the text
-     return re.sub(r'\b\d+\b', replace, text)
-
- # llm_response = generate_llm_response("Explain Deep Learning in Igbo")
- # llm_response_cleaned = normalize_text(cleanup_text(replace_numbers_with_words(llm_response, "yo"), "yo"))
-
- # print(f"LLM Response: {llm_response}")
- # print(f"LLM Response Cleaned: {llm_response_cleaned}")
-
- # returning speech from text (and bringing to CPU)
- def synthesise(text):
-     inputs = processor(text=text, return_tensors="pt")
-     speech = model_default.generate_speech(
-         inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
-     )
-     return speech.cpu()
-
-
- # putting the ST and TTS system together
+ # putting the ST and TTS system together ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  import numpy as np
+
+ def synthesise_yarn2(text):
+     # change the language and voice
+     prompt = audio_tokenizer.create_prompt(text, lang="english", speaker_name="idera")
+     input_ids = audio_tokenizer.tokenize_prompt(prompt)
+     output = tts_model.generate(
+         input_ids=input_ids,
+         temperature=0.1,
+         repetition_penalty=1.1,
+         max_length=4000,
+         num_beams=5,  # using a beam size helps for the local languages but not English
+     )
+
+     codes = audio_tokenizer.get_codes(output)
+     audio = audio_tokenizer.get_audio(codes)
+     return audio.cpu()
 
  target_dtype = np.int16
  max_range = np.iinfo(target_dtype).max  # Maximum value for 16-bit PCM audio conversion
 
- # Modified speech-to-speech translation with textbox
- def speech_to_speech_translation(audio):
+ def speech_to_speech_translation(audio, language="english"):
      # Speech to Text
      transcribed_text = transcribe(audio)
      print(f"Transcribed: {transcribed_text}")
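
As a quick sanity check of the new `synthesise_yarn2` path, the returned tensor can be written straight to disk. A sketch, assuming the decoder's 24 kHz rate (inferred from the `wavtokenizer_large_speech_320_24k.ckpt` name) and that `get_audio` may return either a 1-D or `[1, samples]` tensor:

```python
# Sketch: smoke-test YarnGPT synthesis and save the result as a wav file.
import torchaudio

audio = synthesise_yarn2("Hello, this is a test of YarnGPT.")  # CPU tensor from the function above
if audio.dim() == 1:
    audio = audio.unsqueeze(0)  # torchaudio.save expects [channels, samples]
torchaudio.save("yarngpt_smoke_test.wav", audio, sample_rate=24000)  # 24 kHz is an assumption
```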
@@ -216,17 +119,63 @@ def speech_to_speech_translation(audio):
      # Generate LLM Response
      print("Now making LLM Call ~~~~~~~~~~~~~~~~~~~~~~~~")
      llm_response = generate_llm_response(transcribed_text)
-     llm_response_cleaned = normalize_text(cleanup_text(replace_numbers_with_words(llm_response, "yo"), "yo"))
      print(f"LLM Response: {llm_response}")
-     print(f"LLM Response Cleaned: {llm_response_cleaned}")
+
+     # Select a random voice based on the chosen language
+     voice_mapping = {
+         "english": ["idera", "chinenye", "jude", "emma", "umar", "joke", "zainab", "osagie", "remi", "tayo"],
+         "yoruba": ["yoruba_male2", "yoruba_female2", "yoruba_feamle1"],
+         "igbo": ["igbo_female2", "igbo_male2", "igbo_female1"],
+         "hausa": ["hausa_feamle1", "hausa_female2", "hausa_male2", "hausa_male1"]
+     }
+
+     selected_voice = random.choice(voice_mapping.get(language.lower(), voice_mapping["english"]))
+     print(f"Selected {language} voice: {selected_voice}")
 
      # Text to Speech
-     # print("Synthesizing Speech ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
-     # synthesised_speech = synthesise(llm_response_cleaned)
-     # synthesised_speech = (synthesised_speech.numpy() * max_range).astype(np.int16)
-     # print("Speech Synthesis Completed~~~~~~~~~~~~~~~~~~~")
+     print("Synthesizing Speech ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
+     # Use the selected language and voice
+     prompt = audio_tokenizer.create_prompt(llm_response, lang=language.lower(), speaker_name=selected_voice)
+     input_ids = audio_tokenizer.tokenize_prompt(prompt)
+     output = tts_model.generate(
+         input_ids=input_ids,
+         temperature=0.1,
+         repetition_penalty=1.1,
+         max_length=4000,
+     )
+
+     codes = audio_tokenizer.get_codes(output)
+     synthesised_speech = audio_tokenizer.get_audio(codes)
+
+     # Make sure we have a NumPy array, not a tensor
+     if hasattr(synthesised_speech, 'numpy'):
+         audio_np = synthesised_speech.numpy()
+     else:
+         audio_np = synthesised_speech
+
+     # Handle NaN and Inf values
+     audio_np = np.nan_to_num(audio_np)
+
+     # Ensure audio is in [-1, 1] range
+     if np.max(np.abs(audio_np)) > 0:
+         audio_np = audio_np / np.max(np.abs(audio_np))
+
+     # Convert to signed int16 (-32768 to 32767)
+     int16_max = 32767  # Max value for signed 16-bit
+     audio_int16 = np.clip(audio_np * int16_max, -int16_max, int16_max).astype(np.int16)
+
+     # Ensure the audio is mono channel if needed
+     if len(audio_int16.shape) > 1 and audio_int16.shape[0] == 1:
+         audio_int16 = audio_int16[0]  # Convert from [1, samples] to [samples]
+
+     # Debug info
+     print(f"Audio stats - Min: {np.min(audio_int16)}, Max: {np.max(audio_int16)}, Shape: {audio_int16.shape}")
+
+     # The WavTokenizer decodes at 24 kHz, well within Gradio's valid range (1-192000)
+     sample_rate = 24000
+
+     print("Speech Synthesis Completed~~~~~~~~~~~~~~~~~~~")
 
-     return transcribed_text, llm_response, #(16000, synthesised_speech)
+     return transcribed_text, llm_response, (sample_rate, audio_int16)
 
 
  # Gradio Demo
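
The NaN handling, peak normalization, and int16 quantization added above are self-contained enough to factor into a helper. A sketch of the same logic (the function name is illustrative, not part of the commit):

```python
import numpy as np

def float_to_pcm16(audio_np: np.ndarray) -> np.ndarray:
    """Peak-normalize float audio to [-1, 1], then quantize to signed 16-bit PCM."""
    audio_np = np.nan_to_num(audio_np)   # replace NaN/Inf with finite values
    peak = np.max(np.abs(audio_np))
    if peak > 0:
        audio_np = audio_np / peak       # scale so the loudest sample hits +/-1
    return np.clip(audio_np * 32767, -32767, 32767).astype(np.int16)
```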
@@ -234,29 +183,53 @@ import gradio as gr
 
  demo = gr.Blocks()
 
- mic_translate = gr.Interface(
-     fn=speech_to_speech_translation,
-     inputs=gr.Audio(sources="microphone", type="filepath"),
-     outputs=[
-         gr.Textbox(label="Transcribed Text", interactive=False),
-         gr.Textbox(label="HypaAI's Response", interactive=False),
-         # gr.Audio(label="Generated Speech", type="numpy")
-         # gr.Markdown(label="LLM Enhanced Response")  # New Markdown output
-     ]
- )
-
- file_translate = gr.Interface(
-     fn=speech_to_speech_translation,
-     inputs=gr.Audio(sources="upload", type="filepath"),
-     outputs=[
-         gr.Textbox(label="Transcribed Text", interactive=False),
-         gr.Textbox(label="HypaAI's Response", interactive=False),
-         # gr.Audio(label="Generated Speech", type="numpy")
-         # gr.Markdown(label="LLM Enhanced Response")  # New Markdown output
-     ]
- )
-
- with demo:
-     gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
+ with demo:
+     gr.Markdown("# Aware Speech-to-Speech Demo")
+
+     with gr.Tab("Microphone"):
+         with gr.Row():
+             mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak")
+             lang_dropdown_mic = gr.Dropdown(
+                 choices=["English", "Yoruba", "Igbo", "Hausa"],
+                 value="English",
+                 label="Select Language"
+             )
+
+         mic_submit = gr.Button("Submit")
+
+         with gr.Row():
+             mic_transcribed = gr.Textbox(label="Transcribed Text", interactive=False)
+             mic_response = gr.Textbox(label="HypaAI's Response", interactive=False)
+
+         mic_audio_output = gr.Audio(label="Generated Speech", type="numpy")
+
+         mic_submit.click(
+             fn=speech_to_speech_translation,
+             inputs=[mic_input, lang_dropdown_mic],
+             outputs=[mic_transcribed, mic_response, mic_audio_output]
+         )
+
+     with gr.Tab("Audio File"):
+         with gr.Row():
+             file_input = gr.Audio(sources="upload", type="filepath", label="Upload Audio")
+             lang_dropdown_file = gr.Dropdown(
+                 choices=["English", "Yoruba", "Igbo", "Hausa"],
+                 value="English",
+                 label="Select Language"
+             )
+
+         file_submit = gr.Button("Submit")
+
+         with gr.Row():
+             file_transcribed = gr.Textbox(label="Transcribed Text", interactive=False)
+             file_response = gr.Textbox(label="HypaAI's Response", interactive=False)
+
+         file_audio_output = gr.Audio(label="Generated Speech", type="numpy")
+
+         file_submit.click(
+             fn=speech_to_speech_translation,
+             inputs=[file_input, lang_dropdown_file],
+             outputs=[file_transcribed, file_response, file_audio_output]
+         )
 
  demo.launch(share=True)
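
Each `gr.Audio(type="numpy")` output consumes exactly the `(sample_rate, int16_array)` tuple that `speech_to_speech_translation` now returns, so the tab wiring can be exercised without loading any models. A stripped-down sketch with a stub in place of the real pipeline (all names here are illustrative):

```python
import numpy as np
import gradio as gr

def stub_pipeline(audio_path, language="English"):
    # Stand-in for transcribe -> LLM -> YarnGPT: returns the same three output shapes.
    t = np.arange(24000) / 24000.0
    tone = (np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)  # 1 s of 440 Hz at 24 kHz
    return "transcript goes here", f"{language} response goes here", (24000, tone)

with gr.Blocks() as sketch:
    audio_in = gr.Audio(sources="microphone", type="filepath", label="Speak")
    lang = gr.Dropdown(choices=["English", "Yoruba", "Igbo", "Hausa"], value="English")
    submit = gr.Button("Submit")
    transcribed = gr.Textbox(label="Transcribed Text")
    response = gr.Textbox(label="Response")
    audio_out = gr.Audio(label="Generated Speech", type="numpy")
    submit.click(fn=stub_pipeline, inputs=[audio_in, lang], outputs=[transcribed, response, audio_out])

# sketch.launch()
```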
 