Updated File with new TTS (YarnGPT)
app.py CHANGED
@@ -3,21 +3,36 @@ import torch
 from transformers import pipeline

 # Loading the TTS and Vocoder ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-
-
+!git clone https://github.com/saheedniyi02/yarngpt.git
+!pip install -qU outetts uroman
+
+import os
+import re
+import json
+import torch
+import inflect
+import random
+import uroman as ur
+import numpy as np
+import torchaudio
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from outetts.wav_tokenizer.decoder import WavTokenizer
+

-
-
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+!wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
+!wget https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt

-# sending the model to device
-model_default.to(device)
-vocoder.to(device)

-
-
-
+from yarngpt.audiotokenizer import AudioTokenizerV2
+
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+tokenizer_path = "saheedniyi/YarnGPT2"
+wav_tokenizer_config_path = "/content/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
+wav_tokenizer_model_path = "/content/wavtokenizer_large_speech_320_24k.ckpt"
+
+audio_tokenizer = AudioTokenizerV2(tokenizer_path, wav_tokenizer_model_path, wav_tokenizer_config_path)
+tts_model = AutoModelForCausalLM.from_pretrained(tokenizer_path, torch_dtype="auto").to(audio_tokenizer.device)

 # The LLM Model ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 from huggingface_hub import HfFolder
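Review note: the `!git clone`, `!pip install`, and `!wget` lines above are IPython/Colab shell escapes — in a plain app.py they are a SyntaxError — and the `/content/...` paths only exist on Colab, which is a likely cause of this Space's runtime error. A minimal script-safe sketch of the same setup (an assumption, not the committed code; it uses `hf_hub_download` to fetch the same two WavTokenizer files and assumes network access at startup):

import subprocess, sys
from huggingface_hub import hf_hub_download

# Install runtime deps from a script; if yarngpt is not pip-installable
# from git, clone it instead and append the checkout to sys.path.
subprocess.run([sys.executable, "-m", "pip", "install", "-qU", "outetts", "uroman",
                "git+https://github.com/saheedniyi02/yarngpt.git"], check=True)

# Download the WavTokenizer config and checkpoint into the local HF cache
wav_tokenizer_config_path = hf_hub_download(
    "novateur/WavTokenizer-medium-speech-75token",
    "wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml",
)
wav_tokenizer_model_path = hf_hub_download(
    "novateur/WavTokenizer-large-speech-75token",
    "wavtokenizer_large_speech_320_24k.ckpt",
)

On Spaces, the pip installs are better declared in requirements.txt than run at import time.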
@@ -75,140 +90,28 @@ def transcribe(audio):
     return outputs["text"]


-#
-# Replacement rules
-import re
-# Language-specific replacements
-ig_replacements = [('a', 'ah'), ('e', 'eh'), ('i', 'ee'), ('ị', 'ih'), ('ṅ', 'nn'), ('o', 'oh'), ('ọ', 'aw'), ('u', 'oo'), ('ụ', 'uh')]
-yo_replacements = [('á', 'ah'), ('é', 'eh'), ('ẹ', 'e'), ('ó', 'oh'), ('ọ', 'aw'), ('ṣ', 'sh')]
-
-# Overall Replacements Rules
-replacements = [
-    ('²', 'squared'), ('½', 'square-root'), ('¾', 'one quarter'), ('¼', 'cubeed-root'),
-    ('ă', 'a'), ('â', 'a'), ('å', 'a'), ('á', 'a'), ('à', 'a'), ('ả', 'a'), ('ã', 'a'),
-    ('č', 'c'), ('ç', 'c'),
-    ('ë', 'e'), ('ẹ́', 'e'), ('ẹ̀', 'e'), ('é', 'e'), ('è', 'e'), ('ẻ', 'e'), ('ẽ', 'e'), ('ẹ', 'e'), ('ē', 'e'), ('ĕ', 'e'), ('ė', 'e'),
-    ('ï', 'i'), ('ì', 'i'), ('ị', 'i'), ('ỉ', 'i'), ('ĩ', 'i'), ('í', 'i'), ('ī', 'i'),
-    ('ń', 'n'), ('ņ', 'n'), ('ň', 'n'), ('ṅ', 'n'), ('ñ', 'n'), ('ǹ', 'n'),
-    ('ö', 'o'), ('ọ́', 'o'), ('ò', 'o'), ('ó', 'o'), ('ô', 'o'), ('ọ', 'o'), ('ò', 'o'), ('ò', 'o'), ('ó', 'o'), ('õ', 'o'), ('ō', 'o'),
-    ('ṣ', 's'), ('š', 's'),
-    ('ụ', 'u'), ('ü', 'u'), ('ú', 'u'), ('ǔ', 'u'), ('ù', 'u'), ('ū', 'u'), ('ũ', 'u'),
-    ('ω', 'omega'), ('θ', 'theta'), ('ŵ', 'w'),
-    ('α', 'alpha'), ('β', 'beta'), ('γ', 'gamma'), ('δ', 'delta'), ('ε', 'epsilon'), ('ζ', 'zeta'), ('η', 'eta'), ('θ', 'theta'),
-    ('ι', 'iota'), ('κ', 'kappa'), ('λ', 'lambda'), ('μ', 'mu'), ('ν', 'nu'), ('ξ', 'xi'), ('ο', 'omicron'), ('π', 'pi'),
-    ('ρ', 'rho'),
-    ('_', ' '),
-]
-
-# Function to clean up text
-def cleanup_text(example, lng="en"):
-    example = example.lower()
-    if lng == "ig":
-        for src, dst in ig_replacements:
-            example = example.replace(src, dst)
-    elif lng == "yo":
-        for src, dst in yo_replacements:
-            example = example.replace(src, dst)
-    for src, dst in replacements:
-        example = example.replace(src, dst)  # Update text directly
-    return example
-
-# Normalizing the text
-def normalize_text(text):
-    text = text.lower()  # Convert to lowercase
-    text = re.sub(r'[^\w\s\']', '', text)  # Remove punctuation (except apostrophes)
-    text = ' '.join(text.split())  # Remove extra whitespace
-    return text
-
-
-# Language-specific number words
-number_words = {
-    "en": {  # English
-        0: "zero", 1: "one", 2: "two", 3: "three", 4: "four", 5: "five", 6: "six", 7: "seven", 8: "eight", 9: "nine",
-        10: "ten", 11: "eleven", 12: "twelve", 13: "thirteen", 14: "fourteen", 15: "fifteen", 16: "sixteen",
-        17: "seventeen", 18: "eighteen", 19: "nineteen", 20: "twenty", 30: "thirty", 40: "forty", 50: "fifty",
-        60: "sixty", 70: "seventy", 80: "eighty", 90: "ninety", 100: "hundred", 1000: "thousand"
-    },
-    "yo": {  # Yoruba
-        0: "ódo", 1: "ọ̀kan", 2: "méjì", 3: "mẹ́ta", 4: "mẹ́rin", 5: "márùn", 6: "mẹ́fà", 7: "mẹ́je", 8: "mẹ́jọ", 9: "mẹ́sàn",
-        10: "ẹ̀wa", 11: "ọọkànlá", 12: "méjìlá", 13: "mẹ́tàlá", 14: "mẹ́rìnlá", 15: "árundínlógún", 16: "ẹ́rindínlógún", 17: "ẹ́rindínlógún",
-        18: "ẹ́rindínlógún", 19: "ẹ́rindínlógún", 20: "ogún", 30: "ọgbọ̀n", 40: "ogójì", 50: "àádọ́ta", 60: "ọgọ́ta", 70: "àádọ́rin",
-        80: "ọgọ́rin", 90: "àádọ́run", 100: "ọgọ́run", 1000: "ẹgbẹ̀rún"
-    },
-    "ig": {  # Igbo
-        0: "nọọ", 1: "otu", 2: "abụọ", 3: "atọ", 4: "anọ", 5: "ise", 6: "isii", 7: "asaa", 8: "asatọ", 9: "itoolu",
-        10: "iri", 11: "iri na otu", 12: "iri na abụọ", 13: "iri na atọ", 14: "iri na anọ", 15: "iri na ise",
-        16: "iri na isii", 17: "iri na asaa", 18: "iri na asatọ", 19: "iri na itoolu", 20: "iri abụọ",
-        30: "iri atọ", 40: "iri anọ", 50: "iri ise", 60: "iri isii", 70: "iri asaa", 80: "iri asatọ", 90: "iri itoolu",
-        100: "nari", 1000: "puku"
-    }
-}
-
-# Number to words function
-def number_to_words(number, lang="en"):
-    words = number_words[lang]
-
-    if number < 20:
-        return words[number]
-    elif number < 100:
-        tens, unit = divmod(number, 10)
-        return words[tens * 10] + (" " + words[unit] if unit else "")
-    elif number < 1000:
-        hundreds, remainder = divmod(number, 100)
-        return (words[hundreds] + " " + ("hundred" if lang == "en" else
-                "ọgọ́rùn" if lang == "yo" else "nari") if hundreds > 1 else
-                "hundred" if lang == "en" else "ọgọ́rùn" if lang == "yo" else "nari") + \
-               (" " + number_to_words(remainder, lang) if remainder else "")
-    elif number < 1000000:
-        thousands, remainder = divmod(number, 1000)
-        return (number_to_words(thousands, lang) + " " + ("thousand" if lang == "en" else
-                "ẹgbẹ̀rún" if lang == "yo" else "puku")) + \
-               (" " + number_to_words(remainder, lang) if remainder else "")
-    elif number < 1000000000:
-        millions, remainder = divmod(number, 1000000)
-        return number_to_words(millions, lang) + " " + ("million" if lang == "en" else
-               "mílíọ̀nù" if lang == "yo" else "nde") + \
-               (" " + number_to_words(remainder, lang) if remainder else "")
-    elif number < 1000000000000:
-        billions, remainder = divmod(number, 1000000000)
-        return number_to_words(billions, lang) + " " + ("billion" if lang == "en" else
-               "bílíọ̀nù" if lang == "yo" else "ijeri") + \
-               (" " + number_to_words(remainder, lang) if remainder else "")
-    else:
-        return str(number)
-
-# Replace numbers in text
-def replace_numbers_with_words(text, lang="en"):
-    def replace(match):
-        number = int(match.group())
-        return number_to_words(number, lang)
-
-    # Replace all numbers in the text
-    return re.sub(r'\b\d+\b', replace, text)
-
-# llm_response = generate_llm_response("Explain Deep Learning in Igbo")
-# llm_response_cleaned = normalize_text(cleanup_text(replace_numbers_with_words(llm_response, "yo"), "yo"))
-
-# print(f"LLM Response: {llm_response}")
-# print(f"LLM Response Cleaned: {llm_response_cleaned}")
-
-# returning spech from text (and bringing to CPU)
-def synthesise(text):
-    inputs = processor(text=text, return_tensors="pt")
-    speech = model_default.generate_speech(
-        inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
-    )
-    return speech.cpu()
-
-
-# putting the ST and TTS system together
+# putting the ST and TTS system together ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 import numpy as np
+def synthesise_yarn2(text):
+    # change the language and voice
+    prompt = audio_tokenizer.create_prompt(text, lang="english", speaker_name="idera")
+    input_ids = audio_tokenizer.tokenize_prompt(prompt)
+    output = tts_model.generate(
+        input_ids=input_ids,
+        temperature=0.1,
+        repetition_penalty=1.1,
+        max_length=4000,
+        num_beams=5,  # using a beam size helps for the local languages but not english
+    )
+
+    codes = audio_tokenizer.get_codes(output)
+    audio = audio_tokenizer.get_audio(codes)
+    return audio.cpu()

 target_dtype = np.int16
 max_range = np.iinfo(target_dtype).max  # Maximum value for 16-bit PCM audio conversion

-
-def speech_to_speech_translation(audio):
+def speech_to_speech_translation(audio, language="english"):
     # Speech to Text
     transcribed_text = transcribe(audio)
     print(f"Transcribed: {transcribed_text}")
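Review note: in `synthesise_yarn2`, `temperature` and `repetition_penalty` only take effect when sampling; with `num_beams=5` and no `do_sample=True`, `tts_model.generate` runs deterministic beam search, and recent transformers versions warn that `temperature` is unused. The function is also easy to smoke-test in isolation; a sketch, assuming WavTokenizer's 24 kHz output and a 1-D waveform tensor (both assumptions, not guaranteed by this diff):

import torchaudio

# Generate a short utterance with the default voice ("idera") and save it
audio = synthesise_yarn2("Hello, this is a YarnGPT test.")
waveform = audio.unsqueeze(0) if audio.dim() == 1 else audio  # torchaudio.save expects [channels, samples]
torchaudio.save("yarn_test.wav", waveform, sample_rate=24000)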
@@ -216,17 +119,63 @@ def speech_to_speech_translation(audio):
     # Generate LLM Response
     print("Now making LLM Call ~~~~~~~~~~~~~~~~~~~~~~~~")
     llm_response = generate_llm_response(transcribed_text)
-    llm_response_cleaned = normalize_text(cleanup_text(replace_numbers_with_words(llm_response, "yo"), "yo"))
     print(f"LLM Response: {llm_response}")
-
+
+    # Select a random voice based on the chosen language
+    voice_mapping = {
+        "english": ["idera", "chinenye", "jude", "emma", "umar", "joke", "zainab", "osagie", "remi", "tayo"],
+        "yoruba": ["yoruba_male2", "yoruba_female2", "yoruba_feamle1"],
+        "igbo": ["igbo_female2", "igbo_male2", "igbo_female1"],
+        "hausa": ["hausa_feamle1", "hausa_female2", "hausa_male2", "hausa_male1"]
+    }
+
+    selected_voice = random.choice(voice_mapping.get(language.lower(), voice_mapping["english"]))
+    print(f"Selected {language} voice: {selected_voice}")

     # Text to Speech
-
-    #
-
-
+    print("Synthesizing Speech ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
+    # Use the selected language and voice
+    prompt = audio_tokenizer.create_prompt(llm_response, lang=language.lower(), speaker_name=selected_voice)
+    input_ids = audio_tokenizer.tokenize_prompt(prompt)
+    output = tts_model.generate(
+        input_ids=input_ids,
+        temperature=0.1,
+        repetition_penalty=1.1,
+        max_length=4000,
+    )
+
+    codes = audio_tokenizer.get_codes(output)
+    synthesised_speech = audio_tokenizer.get_audio(codes)
+
+    # Make sure we have a NumPy array, not a tensor
+    if hasattr(synthesised_speech, 'numpy'):
+        audio_np = synthesised_speech.numpy()
+    else:
+        audio_np = synthesised_speech
+
+    # Handle NaN and Inf values
+    audio_np = np.nan_to_num(audio_np)
+
+    # Ensure audio is in [-1, 1] range
+    if np.max(np.abs(audio_np)) > 0:
+        audio_np = audio_np / np.max(np.abs(audio_np))

-
+    # Convert to signed int16 (-32768 to 32767)
+    int16_max = 32767  # Max value for signed 16-bit
+    audio_int16 = np.clip(audio_np * int16_max, -int16_max, int16_max).astype(np.int16)
+
+    # Ensure the audio is mono channel if needed
+    if len(audio_int16.shape) > 1 and audio_int16.shape[0] == 1:
+        audio_int16 = audio_int16[0]  # Convert from [1, samples] to [samples]
+
+    # Debug info
+    print(f"Audio stats - Min: {np.min(audio_int16)}, Max: {np.max(audio_int16)}, Shape: {audio_int16.shape}")
+
+    # Ensure sample rate is within valid range (1-192000)
+    sample_rate = min(max(24000, 1), 192000)
+
+    print("Speech Synthesis Completed~~~~~~~~~~~~~~~~~~~")
+    return transcribed_text, llm_response, (sample_rate, audio_int16)


 # Gradio Demo
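Two review notes on this hunk. First, the "yoruba_feamle1" and "hausa_feamle1" entries look like typos for "..._female1"; unless those exact strings are YarnGPT's registered speaker names, `random.choice` can hand `create_prompt` a voice it does not know. Second, the module-level `max_range = np.iinfo(target_dtype).max` duplicates the hard-coded `int16_max = 32767` inside the function, and `sample_rate = min(max(24000, 1), 192000)` is just 24000. The float-to-PCM block itself is the standard peak-normalize-then-scale conversion; a self-contained sketch of the same idea (names hypothetical):

import numpy as np

def float_to_pcm16(wave):
    # Replace NaN/Inf, peak-normalize to [-1, 1], then scale to signed 16-bit PCM
    wave = np.nan_to_num(np.asarray(wave, dtype=np.float32))
    peak = np.max(np.abs(wave))
    if peak > 0:
        wave = wave / peak
    return np.clip(wave * 32767, -32767, 32767).astype(np.int16)

Gradio's numpy-type audio component then consumes the result as a `(sample_rate, int16_array)` tuple, which is exactly the shape `speech_to_speech_translation` returns.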
@@ -234,29 +183,53 @@ import gradio as gr

 demo = gr.Blocks()

-
-
-    inputs=gr.Audio(sources="microphone", type="filepath"),
-    outputs=[
-        gr.Textbox(label="Transcribed Text", interactive=False),
-        gr.Textbox(label="HypaAI's Response", interactive=False),  # New Markdown output
-        # gr.Audio(label="Generated Speech", type="numpy")
-        # gr.Markdown(label="LLM Enhanced Response")  # New Markdown output
-    ]
-)
+with demo:
+    gr.Markdown("# Aware Speech-to-Speech Demo")

-
-
-
-
-
-
-
-
-    ]
-)
+    with gr.Tab("Microphone"):
+        with gr.Row():
+            mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak")
+            lang_dropdown_mic = gr.Dropdown(
+                choices=["English", "Yoruba", "Igbo", "Hausa"],
+                value="English",
+                label="Select Language"
+            )
+
+        mic_submit = gr.Button("Submit")
+
+        with gr.Row():
+            mic_transcribed = gr.Textbox(label="Transcribed Text", interactive=False)
+            mic_response = gr.Textbox(label="HypaAI's Response", interactive=False)
+
+        mic_audio_output = gr.Audio(label="Generated Speech", type="numpy")
+
+        mic_submit.click(
+            fn=speech_to_speech_translation,
+            inputs=[mic_input, lang_dropdown_mic],
+            outputs=[mic_transcribed, mic_response, mic_audio_output]
+        )
+
+    with gr.Tab("Audio File"):
+        with gr.Row():
+            file_input = gr.Audio(sources="upload", type="filepath", label="Upload Audio")
+            lang_dropdown_file = gr.Dropdown(
+                choices=["English", "Yoruba", "Igbo", "Hausa"],
+                value="English",
+                label="Select Language"
+            )
+
+        file_submit = gr.Button("Submit")
+
+        with gr.Row():
+            file_transcribed = gr.Textbox(label="Transcribed Text", interactive=False)
+            file_response = gr.Textbox(label="HypaAI's Response", interactive=False)
+
+        file_audio_output = gr.Audio(label="Generated Speech", type="numpy")
+
+        file_submit.click(
+            fn=speech_to_speech_translation,
+            inputs=[file_input, lang_dropdown_file],
+            outputs=[file_transcribed, file_response, file_audio_output]
+        )

-
-
-
 demo.launch(share=True)
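The rewritten demo replaces the old `gr.Interface` wiring with an explicit `gr.Blocks` layout: each tab pairs an audio source with a language dropdown and routes both into `speech_to_speech_translation` via `.click()`. The handler can also be exercised headlessly before launching the UI; a sketch, assuming a local test recording `sample.wav` (hypothetical file):

# Headless smoke test of the full STT -> LLM -> TTS pipeline
transcript, reply, (sample_rate, pcm) = speech_to_speech_translation("sample.wav", language="yoruba")
print(transcript, reply, sample_rate, pcm.shape)

Note that `share=True` is unnecessary on Hugging Face Spaces, where the app is already served publicly; it only matters for local runs.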