Update app.py
app.py
CHANGED
@@ -83,6 +83,25 @@ generate_llm_response("Explain Deep Learning in Igbo")
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 pipe = pipeline("automatic-speech-recognition", model="okezieowen/whisper-small-multilingual-naija-11-03-2024", device=device)
 
+# Loading the TTS and Vocoder
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from datasets import load_dataset
+
+processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+
+model_default = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+model = SpeechT5ForTextToSpeech.from_pretrained("ccibeekeoc42/speecht5_finetuned_naija_ig_yo_2025-01-20_O2")
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+
+# sending the model to device
+model_default.to(device)
+model.to(device)
+vocoder.to(device)
+
+# Loading speaker embeddings
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
 # Take audio and return translated text
 def transcribe(audio):
     outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe"})
@@ -206,26 +225,6 @@ llm_response_cleaned = normalize_text(cleanup_text(replace_numbers_with_words(ll
 print(f"LLM Response: {llm_response}")
 print(f"LLM Response Cleaned: {llm_response_cleaned}")
 
-# Loading the TTS and Vocoder
-from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-from datasets import load_dataset
-
-processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-
-model_default = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
-model = SpeechT5ForTextToSpeech.from_pretrained("ccibeekeoc42/speecht5_finetuned_naija_ig_yo_2025-01-20_O2")
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-
-# sending the model to device
-model_default.to(device)
-model.to(device)
-vocoder.to(device)
-
-# Loading speaker embeddings
-embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-
-
 # returning speech from text (and bringing to CPU)
 def synthesise(text):
     inputs = processor(text=text, return_tensors="pt")
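Likewise, the second hunk truncates synthesise() after the processor call. A minimal sketch of the usual SpeechT5 recipe wired to the objects the moved block loads (the fine-tuned model, speaker_embeddings, and vocoder); again an assumption about how the Space completes the function, not its actual code:

def synthesise(text):
    inputs = processor(text=text, return_tensors="pt")
    # generate_speech takes tokenized text, an x-vector speaker embedding, and a vocoder,
    # and returns a waveform tensor on the model's device.
    speech = model.generate_speech(
        inputs["input_ids"].to(device),
        speaker_embeddings.to(device),
        vocoder=vocoder,
    )
    # "bringing to CPU", per the comment in the diff, so downstream code (e.g. a Gradio audio output) can consume it.
    return speech.cpu()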