ccibeekeoc42 committed on
Commit 7ec764e Β· verified Β· 1 Parent(s): 8fa7e6b

Updated File with new TTS (YarnGPT)

Files changed (1)
  1. app.py +143 -170
app.py CHANGED
@@ -3,21 +3,36 @@ import torch
  from transformers import pipeline
 
  # Loading the TTS and Vocoder ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
- from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
- from datasets import load_dataset
-
- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
- model_default = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-
- # sending the model to device
- model_default.to(device)
- vocoder.to(device)
-
- # Loading speaker embeddings
- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
- speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+ !git clone https://github.com/saheedniyi02/yarngpt.git
+ !pip install -qU outetts uroman
+
+ import os
+ import re
+ import json
+ import torch
+ import inflect
+ import random
+ import uroman as ur
+ import numpy as np
+ import torchaudio
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from outetts.wav_tokenizer.decoder import WavTokenizer
+
+ !wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
+ !wget https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt
+
+ from yarngpt.audiotokenizer import AudioTokenizerV2
+
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+ tokenizer_path = "saheedniyi/YarnGPT2"
+ wav_tokenizer_config_path = "/content/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
+ wav_tokenizer_model_path = "/content/wavtokenizer_large_speech_320_24k.ckpt"
+
+ audio_tokenizer = AudioTokenizerV2(tokenizer_path, wav_tokenizer_model_path, wav_tokenizer_config_path)
+ tts_model = AutoModelForCausalLM.from_pretrained(tokenizer_path, torch_dtype="auto").to(audio_tokenizer.device)
 
  # The LLM Model ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  from huggingface_hub import HfFolder
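
Note that `!git clone`, `!pip install`, and `!wget` are notebook shell escapes, and the `/content/...` paths are Colab conventions; in a plain `app.py` these lines will not execute. A minimal pure-Python equivalent, sketched here under the assumption that the same two WavTokenizer artifacts are wanted (the `hf_hub_download` calls mirror the `!wget` URLs above):

```python
# Sketch: fetch the WavTokenizer config and checkpoint without notebook magics.
# Repo IDs and filenames are taken from the !wget URLs; everything else is illustrative.
from huggingface_hub import hf_hub_download

wav_tokenizer_config_path = hf_hub_download(
    repo_id="novateur/WavTokenizer-medium-speech-75token",
    filename="wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml",
)
wav_tokenizer_model_path = hf_hub_download(
    repo_id="novateur/WavTokenizer-large-speech-75token",
    filename="wavtokenizer_large_speech_320_24k.ckpt",
)
```

The `yarngpt` package itself would likewise need a regular install (for example `pip install git+https://github.com/saheedniyi02/yarngpt.git`, assuming the repo is pip-installable) rather than a shell-magic clone.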
@@ -75,140 +90,28 @@ def transcribe(audio):
      return outputs["text"]
 
 
- # Helper Functions to Cleanup LLM Texts ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- # Replacement rules
- import re
- # Language-specific replacements
- ig_replacements = [('a', 'ah'), ('e', 'eh'), ('i', 'ee'), ('α»‹', 'ih'), ('αΉ…', 'nn'), ('o','oh'), ('ọ','aw'), ('u','oo'), ('α»₯','uh')]
- yo_replacements = [('Ñ', 'ah'), ('é', 'eh'), ('ẹ', 'e'), ('ó', 'oh'), ('ọ', 'aw'), ('ṣ', 'sh')]
-
- # Overall Replacements Rules
- replacements = [
-     ('Β²','squared'), ('Β½','square-root'), ('ΒΎ','one quarter'), ('ΒΌ','cubeed-root'),
-     ('ā','a'), ('Γ’', 'a'), ('Γ₯','a'), ('Γ‘', 'a'), ('Γ ', 'a'), ('αΊ£', 'a'), ('Γ£', 'a'),
-     ('č', 'c'), ('ç', 'c'),
-     ('Γ«','e'), ('αΊΉΜ€','e'), ('ẹ́','e'), ('Γ©', 'e'), ('Γ¨', 'e'), ('αΊ»', 'e'), ('αΊ½', 'e'), ('αΊΉ', 'e'), ('Δ—', 'e'), ('Δ“', 'e'), ('Δ™', 'e'),
-     ('Γ―', 'i'), ('Γ¬', 'i'), ('α»‹', 'i'), ('ỉ', 'i'), ('Δ©', 'i'), ('Γ­', 'i'), ('Δ«', 'i'),
-     ('Ε„', 'n'), ('ň', 'n'), ('Ε„', 'n'), ('αΉ…', 'n'), ('Γ±', 'n'), ('ΗΉ', 'n'),
-     ('ΓΆ','o'), ('ọ̀','o'), ('Γ²', 'o'), ('Γ³', 'o'), ('Γ΄', 'o'), ('ọ', 'o'), ('Γ²','o'), ('ó','o'), ('oΜ€','o'), ('Γ΅','o'), ('ō','o'),
-     ('αΉ£', 's'), ('Ε‘', 's'),
-     ('α»₯', 'u'), ('ΓΌ', 'u'), ('ΓΊ', 'u'), ('Η”', 'u'), ('ΓΉ', 'u'), ('Ε«', 'u'), ('Ε©', 'u'),
-     ('Ο‰','omega'), ('ΞΈ','theta'), ('Ε‚','w'),
-     ('Ξ±','alpha'), ('Ξ²','beta'), ('Ξ³','gamma'), ('Ξ΄','delta'), ('Ξ΅','epsilon'), ('ΞΆ','zeta'), ('Ξ·','eta'), ('ΞΈ','theta'),
-     ('ΞΉ','iota'), ('ΞΊ','kappa'), ('Ξ»','lambda'), ('ΞΌ','mu'), ('Ξ½','nu'), ('ΞΎ','xi'), ('ΞΏ','omicron'), ('Ο€','pi'),
-     ('ρ','rho'),
-     ('_',' '),
- ]
-
- # Function to clean up text
- def cleanup_text(example, lng="en"):
-     example = example.lower()
-     if lng == "ig":
-         for src, dst in ig_replacements:
-             example = example.replace(src, dst)
-     elif lng == "yo":
-         for src, dst in yo_replacements:
-             example = example.replace(src, dst)
-     for src, dst in replacements:
-         example = example.replace(src, dst)  # Update text directly
-     return example
-
- # Normalizing the text
- def normalize_text(text):
-     text = text.lower()  # Convert to lowercase
-     text = re.sub(r'[^\w\s\']', '', text)  # Remove punctuation (except apostrophes)
-     text = ' '.join(text.split())  # Remove extra whitespace
-     return text
-
- # Language-specific number words
- number_words = {
-     "en": {  # English
-         0: "zero", 1: "one", 2: "two", 3: "three", 4: "four", 5: "five", 6: "six", 7: "seven", 8: "eight", 9: "nine",
-         10: "ten", 11: "eleven", 12: "twelve", 13: "thirteen", 14: "fourteen", 15: "fifteen", 16: "sixteen",
-         17: "seventeen", 18: "eighteen", 19: "nineteen", 20: "twenty", 30: "thirty", 40: "forty", 50: "fifty",
-         60: "sixty", 70: "seventy", 80: "eighty", 90: "ninety", 100: "hundred", 1000: "thousand"
-     },
-     "yo": {  # Yoruba
-         0: "Γ³do", 1: "ọ̀kan", 2: "mΓ©jΓ¬", 3: "mẹ́ta", 4: "mẹ́rin", 5: "mΓ‘rΓΉn", 6: "mẹ́fΓ ", 7: "mαΊΉΜ€je", 8: "mαΊΉΜ€jọ", 9: "mẹ́sΓ n",
-         10: "ẹ́wa", 11: "ọọkànlÑ", 12: "méjìlÑ", 13: "mẹ́tàlÑ", 14: "mẹ́rìnlÑ", 15: "Ñrundínlógún", 16: "ẹ́rindínlógún", 17: "ẹ́rindínlógún",
-         18: "ẹ́rindΓ­nlΓ³gΓΊn", 19: "ẹ́rindΓ­nlΓ³gΓΊn", 20: "ogΓΊn", 30: "ọgbọ̀n", 40: "ogΓ³jΓ¬", 50: "Γ Γ‘dọ́ta", 60: "ọgọ́ta", 70: "Γ Γ‘dọ́rin",
-         80: "ọgọ́rin", 90: "Γ Γ‘dọ́run", 100: "ọgọ́run", 1000: "αΊΉgbαΊΉΜ€rΓΊn"
-     },
-     "ig": {  # Igbo
-         0: "nọọ", 1: "otu", 2: "abα»₯ọ", 3: "atọ", 4: "anọ", 5: "ise", 6: "isii", 7: "asaa", 8: "asatọ", 9: "itoolu",
-         10: "iri", 11: "iri na otu", 12: "iri na abα»₯ọ", 13: "iri na atọ", 14: "iri na anọ", 15: "iri na ise",
-         16: "iri na isii", 17: "iri na asaa", 18: "iri na asatọ", 19: "iri na itoolu", 20: "iri abα»₯ọ",
-         30: "iri atọ", 40: "iri anọ", 50: "iri ise", 60: "iri isii", 70: "iri asaa", 80: "iri asatọ", 90: "iri itoolu",
-         100: "nari", 1000: "puku"
-     }
- }
-
- # Number to words function
- def number_to_words(number, lang="en"):
-     words = number_words[lang]
-
-     if number < 20:
-         return words[number]
-     elif number < 100:
-         tens, unit = divmod(number, 10)
-         return words[tens * 10] + (" " + words[unit] if unit else "")
-     elif number < 1000:
-         hundreds, remainder = divmod(number, 100)
-         return (words[hundreds] + " " + ("hundred" if lang == "en" else
-                 "ọgọ́rùn" if lang == "yo" else "nari") if hundreds > 1 else
-                 "hundred" if lang == "en" else "ọgọ́rùn" if lang == "yo" else "nari") + \
-                (" " + number_to_words(remainder, lang) if remainder else "")
-     elif number < 1000000:
-         thousands, remainder = divmod(number, 1000)
-         return (number_to_words(thousands, lang) + " " + ("thousand" if lang == "en" else
-                 "αΊΉgbαΊΉΜ€rΓΊn" if lang == "yo" else "puku")) + \
-                (" " + number_to_words(remainder, lang) if remainder else "")
-     elif number < 1000000000:
-         millions, remainder = divmod(number, 1000000)
-         return number_to_words(millions, lang) + " " + ("million" if lang == "en" else
-                "mΓ­líọ̀nΓΉ" if lang == "yo" else "nde") + \
-                (" " + number_to_words(remainder, lang) if remainder else "")
-     elif number < 1000000000000:
-         billions, remainder = divmod(number, 1000000000)
-         return number_to_words(billions, lang) + " " + ("billion" if lang == "en" else
-                "bΓ­líọ̀nΓΉ" if lang == "yo" else "ijeri") + \
-                (" " + number_to_words(remainder, lang) if remainder else "")
-     else:
-         return str(number)
-
- # Replace numbers in text
- def replace_numbers_with_words(text, lang="en"):
-     def replace(match):
-         number = int(match.group())
-         return number_to_words(number, lang)
-
-     # Replace all numbers in the text
-     return re.sub(r'\b\d+\b', replace, text)
-
- # llm_response = generate_llm_response("Explain Deep Learning in Igbo")
- # llm_response_cleaned = normalize_text(cleanup_text(replace_numbers_with_words(llm_response, "yo"), "yo"))
-
- # print(f"LLM Response: {llm_response}")
- # print(f"LLM Response Cleaned: {llm_response_cleaned}")
-
- # returning speech from text (and bringing to CPU)
- def synthesise(text):
-     inputs = processor(text=text, return_tensors="pt")
-     speech = model_default.generate_speech(
-         inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
-     )
-     return speech.cpu()
-
-
- # putting the ST and TTS system together
+ # putting the ST and TTS system together ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  import numpy as np
+
+ def synthesise_yarn2(text):
+     # change the language and voice
+     prompt = audio_tokenizer.create_prompt(text, lang="english", speaker_name="idera")
+     input_ids = audio_tokenizer.tokenize_prompt(prompt)
+     output = tts_model.generate(
+         input_ids=input_ids,
+         temperature=0.1,
+         repetition_penalty=1.1,
+         max_length=4000,
+         num_beams=5,  # using a beam size helps for the local languages but not English
+     )
+
+     codes = audio_tokenizer.get_codes(output)
+     audio = audio_tokenizer.get_audio(codes)
+     return audio.cpu()
 
  target_dtype = np.int16
  max_range = np.iinfo(target_dtype).max  # Maximum value for 16-bit PCM audio conversion
 
- # Modified speech-to-speech translation with textbox
- def speech_to_speech_translation(audio):
+ def speech_to_speech_translation(audio, language="english"):
      # Speech to Text
      transcribed_text = transcribe(audio)
      print(f"Transcribed: {transcribed_text}")
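
As a quick sanity check of the new `synthesise_yarn2` path, the returned tensor can be written straight to disk. A sketch, assuming the decoder's 24 kHz rate (inferred from the `wavtokenizer_large_speech_320_24k.ckpt` name) and that `get_audio` may return either a 1-D or `[1, samples]` tensor:

```python
# Sketch: smoke-test YarnGPT synthesis and save the result as a wav file.
import torchaudio

audio = synthesise_yarn2("Hello, this is a test of YarnGPT.")  # CPU tensor from the function above
if audio.dim() == 1:
    audio = audio.unsqueeze(0)  # torchaudio.save expects [channels, samples]
torchaudio.save("yarngpt_smoke_test.wav", audio, sample_rate=24000)  # 24 kHz is an assumption
```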
@@ -216,17 +119,63 @@ def speech_to_speech_translation(audio):
      # Generate LLM Response
      print("Now making LLM Call ~~~~~~~~~~~~~~~~~~~~~~~~")
      llm_response = generate_llm_response(transcribed_text)
-     llm_response_cleaned = normalize_text(cleanup_text(replace_numbers_with_words(llm_response, "yo"), "yo"))
      print(f"LLM Response: {llm_response}")
-     print(f"LLM Response Cleaned: {llm_response_cleaned}")
+
+     # Select a random voice based on the chosen language
+     voice_mapping = {
+         "english": ["idera", "chinenye", "jude", "emma", "umar", "joke", "zainab", "osagie", "remi", "tayo"],
+         "yoruba": ["yoruba_male2", "yoruba_female2", "yoruba_feamle1"],
+         "igbo": ["igbo_female2", "igbo_male2", "igbo_female1"],
+         "hausa": ["hausa_feamle1", "hausa_female2", "hausa_male2", "hausa_male1"]
+     }
+
+     selected_voice = random.choice(voice_mapping.get(language.lower(), voice_mapping["english"]))
+     print(f"Selected {language} voice: {selected_voice}")
 
      # Text to Speech
-     # print("Synthesizing Speech ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
-     # synthesised_speech = synthesise(llm_response_cleaned)
-     # synthesised_speech = (synthesised_speech.numpy() * max_range).astype(np.int16)
-     # print("Speech Synthesis Completed~~~~~~~~~~~~~~~~~~~")
+     print("Synthesizing Speech ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
+     # Use the selected language and voice
+     prompt = audio_tokenizer.create_prompt(llm_response, lang=language.lower(), speaker_name=selected_voice)
+     input_ids = audio_tokenizer.tokenize_prompt(prompt)
+     output = tts_model.generate(
+         input_ids=input_ids,
+         temperature=0.1,
+         repetition_penalty=1.1,
+         max_length=4000,
+     )
+
+     codes = audio_tokenizer.get_codes(output)
+     synthesised_speech = audio_tokenizer.get_audio(codes)
+
+     # Make sure we have a NumPy array, not a tensor
+     if hasattr(synthesised_speech, 'numpy'):
+         audio_np = synthesised_speech.numpy()
+     else:
+         audio_np = synthesised_speech
+
+     # Handle NaN and Inf values
+     audio_np = np.nan_to_num(audio_np)
+
+     # Ensure audio is in [-1, 1] range
+     if np.max(np.abs(audio_np)) > 0:
+         audio_np = audio_np / np.max(np.abs(audio_np))
+
+     # Convert to signed int16 (-32768 to 32767)
+     int16_max = 32767  # Max value for signed 16-bit
+     audio_int16 = np.clip(audio_np * int16_max, -int16_max, int16_max).astype(np.int16)
+
+     # Ensure the audio is mono channel if needed
+     if len(audio_int16.shape) > 1 and audio_int16.shape[0] == 1:
+         audio_int16 = audio_int16[0]  # Convert from [1, samples] to [samples]
+
+     # Debug info
+     print(f"Audio stats - Min: {np.min(audio_int16)}, Max: {np.max(audio_int16)}, Shape: {audio_int16.shape}")
+
+     # The WavTokenizer decodes at 24 kHz, well within Gradio's valid range (1-192000)
+     sample_rate = 24000
+
+     print("Speech Synthesis Completed~~~~~~~~~~~~~~~~~~~")
 
-     return transcribed_text, llm_response, #(16000, synthesised_speech)
+     return transcribed_text, llm_response, (sample_rate, audio_int16)
 
 
  # Gradio Demo
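
The NaN handling, peak normalization, and int16 quantization added above are self-contained enough to factor into a helper. A sketch of the same logic (the function name is illustrative, not part of the commit):

```python
import numpy as np

def float_to_pcm16(audio_np: np.ndarray) -> np.ndarray:
    """Peak-normalize float audio to [-1, 1], then quantize to signed 16-bit PCM."""
    audio_np = np.nan_to_num(audio_np)   # replace NaN/Inf with finite values
    peak = np.max(np.abs(audio_np))
    if peak > 0:
        audio_np = audio_np / peak       # scale so the loudest sample hits +/-1
    return np.clip(audio_np * 32767, -32767, 32767).astype(np.int16)
```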
@@ -234,29 +183,53 @@ import gradio as gr
 
  demo = gr.Blocks()
 
- mic_translate = gr.Interface(
-     fn=speech_to_speech_translation,
-     inputs=gr.Audio(sources="microphone", type="filepath"),
-     outputs=[
-         gr.Textbox(label="Transcribed Text", interactive=False),
-         gr.Textbox(label="HypaAI's Response", interactive=False),
-         # gr.Audio(label="Generated Speech", type="numpy")
-         # gr.Markdown(label="LLM Enhanced Response")  # New Markdown output
-     ]
- )
-
- file_translate = gr.Interface(
-     fn=speech_to_speech_translation,
-     inputs=gr.Audio(sources="upload", type="filepath"),
-     outputs=[
-         gr.Textbox(label="Transcribed Text", interactive=False),
-         gr.Textbox(label="HypaAI's Response", interactive=False),
-         # gr.Audio(label="Generated Speech", type="numpy")
-         # gr.Markdown(label="LLM Enhanced Response")  # New Markdown output
-     ]
- )
-
- with demo:
-     gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
+ with demo:
+     gr.Markdown("# Aware Speech-to-Speech Demo")
+
+     with gr.Tab("Microphone"):
+         with gr.Row():
+             mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak")
+             lang_dropdown_mic = gr.Dropdown(
+                 choices=["English", "Yoruba", "Igbo", "Hausa"],
+                 value="English",
+                 label="Select Language"
+             )
+
+         mic_submit = gr.Button("Submit")
+
+         with gr.Row():
+             mic_transcribed = gr.Textbox(label="Transcribed Text", interactive=False)
+             mic_response = gr.Textbox(label="HypaAI's Response", interactive=False)
+
+         mic_audio_output = gr.Audio(label="Generated Speech", type="numpy")
+
+         mic_submit.click(
+             fn=speech_to_speech_translation,
+             inputs=[mic_input, lang_dropdown_mic],
+             outputs=[mic_transcribed, mic_response, mic_audio_output]
+         )
+
+     with gr.Tab("Audio File"):
+         with gr.Row():
+             file_input = gr.Audio(sources="upload", type="filepath", label="Upload Audio")
+             lang_dropdown_file = gr.Dropdown(
+                 choices=["English", "Yoruba", "Igbo", "Hausa"],
+                 value="English",
+                 label="Select Language"
+             )
+
+         file_submit = gr.Button("Submit")
+
+         with gr.Row():
+             file_transcribed = gr.Textbox(label="Transcribed Text", interactive=False)
+             file_response = gr.Textbox(label="HypaAI's Response", interactive=False)
+
+         file_audio_output = gr.Audio(label="Generated Speech", type="numpy")
+
+         file_submit.click(
+             fn=speech_to_speech_translation,
+             inputs=[file_input, lang_dropdown_file],
+             outputs=[file_transcribed, file_response, file_audio_output]
+         )
 
  demo.launch(share=True)
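
Each `gr.Audio(type="numpy")` output consumes exactly the `(sample_rate, int16_array)` tuple that `speech_to_speech_translation` now returns, so the tab wiring can be exercised without loading any models. A stripped-down sketch with a stub in place of the real pipeline (all names here are illustrative):

```python
import numpy as np
import gradio as gr

def stub_pipeline(audio_path, language="English"):
    # Stand-in for transcribe -> LLM -> YarnGPT: returns the same three output shapes.
    t = np.arange(24000) / 24000.0
    tone = (np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)  # 1 s of 440 Hz at 24 kHz
    return "transcript goes here", f"{language} response goes here", (24000, tone)

with gr.Blocks() as sketch:
    audio_in = gr.Audio(sources="microphone", type="filepath", label="Speak")
    lang = gr.Dropdown(choices=["English", "Yoruba", "Igbo", "Hausa"], value="English")
    submit = gr.Button("Submit")
    transcribed = gr.Textbox(label="Transcribed Text")
    response = gr.Textbox(label="Response")
    audio_out = gr.Audio(label="Generated Speech", type="numpy")
    submit.click(fn=stub_pipeline, inputs=[audio_in, lang], outputs=[transcribed, response, audio_out])

# sketch.launch()
```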
 