# Aware-Demo / app.py
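"""Aware speech-to-speech demo.

Pipeline: Whisper-based ASR transcribes the user's audio, an LLM served from a
Hugging Face Inference Endpoint writes a short reply, and YarnGPT2 synthesizes
the reply as speech in English, Yoruba, Igbo, or Hausa via a Gradio UI.
"""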
import os
import torch
from transformers import pipeline
# Loading the TTS and Vocoder ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
import re
import json
import inflect
import random
import uroman as ur
import numpy as np
import torchaudio
import subprocess
import requests
from transformers import AutoModelForCausalLM, AutoTokenizer
from outetts.wav_tokenizer.decoder import WavTokenizer
# Function to execute shell commands safely
def run_command(command):
    try:
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            shell=True,
            universal_newlines=True
        )
        stdout, stderr = process.communicate()
        if process.returncode != 0:
            print(f"Error executing: {command}")
            print(stderr)
            return False
        else:
            print(stdout)
            return True
    except Exception as e:
        print(f"Exception during execution of {command}: {e}")
        return False
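# Note: shell=True is acceptable here only because every command passed to
# run_command in this script is a hard-coded string, never user input.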
# Clone the YarnGPT repository
if not os.path.exists('yarngpt'):
    print("Cloning YarnGPT repository...")
    run_command("git clone https://github.com/saheedniyi02/yarngpt.git")
else:
    print("YarnGPT repository already exists")
# Install required packages
print("Installing required packages...")
run_command("pip install -q outetts uroman")
# If you need to install the cloned package in development mode
if os.path.exists('yarngpt'):
    os.chdir('yarngpt')
    run_command("pip install -e .")
    os.chdir('..')  # Go back to the original directory
# Download files using Python's requests library instead of !wget
def download_file(url, save_path):
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        with open(save_path, 'wb') as f:
            # Write in chunks so the large checkpoint never sits fully in
            # memory (the original buffered the whole body via response.content)
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"Downloaded {save_path}")
    else:
        print(f"Failed to download {url}")
# Download required files
download_file(
    "https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml",
    "wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
)
download_file(
    "https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt",
    "wavtokenizer_large_speech_320_24k.ckpt"
)
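# This import only resolves after the clone and editable install above.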
from yarngpt.audiotokenizer import AudioTokenizerV2
device = "cuda:0" if torch.cuda.is_available() else "cpu"
tokenizer_path = "saheedniyi/YarnGPT2"
# The config and checkpoint were downloaded into the working directory above,
# so use relative paths (the original hard-coded Colab-style /content/ paths
# would only resolve inside Colab).
wav_tokenizer_config_path = "wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
wav_tokenizer_model_path = "wavtokenizer_large_speech_320_24k.ckpt"
audio_tokenizer = AudioTokenizerV2(tokenizer_path, wav_tokenizer_model_path, wav_tokenizer_config_path)
tts_model = AutoModelForCausalLM.from_pretrained(tokenizer_path, torch_dtype="auto").to(audio_tokenizer.device)
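# torch_dtype="auto" loads the weights in the dtype stored in the checkpoint;
# the model is then moved to whatever device the audio tokenizer selected.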
# The LLM Model ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
from huggingface_hub import HfFolder
from openai import OpenAI
api_key = os.getenv("API_KEY")
if api_key is None:
    raise ValueError("API_KEY is not set in the environment variables.")
print("API key successfully loaded.")
# Initialize OpenAI client for Hugging Face Inference Endpoint
client = OpenAI(
    base_url="https://y1ztgv8tu09nay6u.us-east-1.aws.endpoints.huggingface.cloud/v1/",
    # alternate endpoint: https://f2iozzwigntrzkve.us-east-1.aws.endpoints.huggingface.cloud/v1/
    api_key=api_key
)
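# The endpoint serves the model through Text Generation Inference (TGI), whose
# OpenAI-compatible API uses "tgi" as a placeholder model name, so the
# model_id parameter below is informational only.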
def generate_llm_response(text, model_id="ccibeekeoc42/Llama3.1-8b-base-SFT-2024-11-09"):
    full_response = []
    try:
        chat_completion = client.chat.completions.create(
            model="tgi",
            messages=[
                {"role": "system", "content": "You are HypaAI, a very BRIEF AND DIRECT assistant created by a Nigerian research lab called Hypa AI, led by Chris Ibe (the co-founder and CEO). You are part of a speech pipeline, so keep your responses short (under 60 words), fluent, and straight to the point. Avoid markdown or digits in responses."},
                {"role": "user", "content": text}
            ],
            top_p=0.3,
            temperature=1,
            max_tokens=150,
            stream=True,
            seed=None,
            stop=None,
            frequency_penalty=None,
            presence_penalty=None
        )
        for chunk in chat_completion:
            if chunk.choices[0].delta.content:
                full_response.append(chunk.choices[0].delta.content)
        return "".join(full_response)
    except Exception as e:
        # A 503 response means the endpoint's GPU is still booting up.
        if hasattr(e, 'response') and e.response is not None and e.response.status_code == 503:
            return "The GPU is currently booting up. Please wait about 10 minutes and try again."
        else:
            raise e
# generate_llm_response("Explain Deep Learning in Igbo")
# Loading the ST Model (Whisper) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
pipe = pipeline("automatic-speech-recognition", model="okezieowen/whisper-small-multilingual-naija-11-03-2024", device=device)
# Take audio and return the transcribed text
def transcribe(audio):
    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe"})
    return outputs["text"]
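# The pipeline accepts a file path (as supplied by Gradio) and handles audio
# decoding and resampling internally.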
# putting the ST and TTS system together ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def synthesise_yarn2(text):
    # Change the language and voice here as needed
    prompt = audio_tokenizer.create_prompt(text, lang="english", speaker_name="idera")
    input_ids = audio_tokenizer.tokenize_prompt(prompt)
    output = tts_model.generate(
        input_ids=input_ids,
        temperature=0.1,
        repetition_penalty=1.1,
        max_length=4000,
        num_beams=5,  # beam search helps for the local languages but not English
    )
    codes = audio_tokenizer.get_codes(output)
    audio = audio_tokenizer.get_audio(codes)
    return audio.cpu()
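# Standalone English TTS helper kept for reference; the Gradio app below runs
# the same synthesis inline inside speech_to_speech_translation.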
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max # Maximum value for 16-bit PCM audio conversion
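# target_dtype/max_range are kept from the original notebook; the conversion in
# speech_to_speech_translation uses the equivalent int16_max literal (32767).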
def speech_to_speech_translation(audio, language="english"):
    # Speech to Text
    transcribed_text = transcribe(audio)
    print(f"Transcribed: {transcribed_text}")
    # Generate LLM Response
    print("Now making LLM Call ~~~~~~~~~~~~~~~~~~~~~~~~")
    llm_response = generate_llm_response(transcribed_text)
    print(f"LLM Response: {llm_response}")
    # Select a random voice for the chosen language; speaker names are YarnGPT
    # voice IDs and are kept verbatim (including spellings like "feamle")
    voice_mapping = {
        "english": ["idera", "chinenye", "jude", "emma", "umar", "joke", "zainab", "osagie", "remi", "tayo"],
        "yoruba": ["yoruba_male2", "yoruba_female2", "yoruba_feamle1"],
        "igbo": ["igbo_female2", "igbo_male2", "igbo_female1"],
        "hausa": ["hausa_feamle1", "hausa_female2", "hausa_male2", "hausa_male1"]
    }
    selected_voice = random.choice(voice_mapping.get(language.lower(), voice_mapping["english"]))
    print(f"Selected {language} voice: {selected_voice}")
    # Text to Speech, using the selected language and voice
    print("Synthesizing Speech ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    prompt = audio_tokenizer.create_prompt(llm_response, lang=language.lower(), speaker_name=selected_voice)
    input_ids = audio_tokenizer.tokenize_prompt(prompt)
    output = tts_model.generate(
        input_ids=input_ids,
        temperature=0.1,
        repetition_penalty=1.1,
        max_length=4000,
    )
    codes = audio_tokenizer.get_codes(output)
    synthesised_speech = audio_tokenizer.get_audio(codes)
    # Make sure we have a NumPy array, not a tensor (detach and move off the
    # GPU first, since .numpy() fails on CUDA tensors)
    if hasattr(synthesised_speech, 'detach'):
        audio_np = synthesised_speech.detach().cpu().numpy()
    else:
        audio_np = synthesised_speech
    # Replace any NaN and Inf values
    audio_np = np.nan_to_num(audio_np)
    # Normalize audio into the [-1, 1] range
    if np.max(np.abs(audio_np)) > 0:
        audio_np = audio_np / np.max(np.abs(audio_np))
    # Convert to signed int16 (-32768 to 32767)
    int16_max = 32767  # Max value for signed 16-bit
    audio_int16 = np.clip(audio_np * int16_max, -int16_max, int16_max).astype(np.int16)
    # Squeeze a leading channel dimension so the output is mono
    if len(audio_int16.shape) > 1 and audio_int16.shape[0] == 1:
        audio_int16 = audio_int16[0]  # Convert from [1, samples] to [samples]
    # Debug info
    print(f"Audio stats - Min: {np.min(audio_int16)}, Max: {np.max(audio_int16)}, Shape: {audio_int16.shape}")
    # WavTokenizer decodes at 24 kHz (see the _24k checkpoint name), which is
    # within Gradio's accepted sample-rate range
    sample_rate = 24000
    print("Speech Synthesis Completed~~~~~~~~~~~~~~~~~~~")
    return transcribed_text, llm_response, (sample_rate, audio_int16)
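# Example (hypothetical local test, assuming a WAV file named sample.wav):
# text, reply, (sr, pcm) = speech_to_speech_translation("sample.wav", "igbo")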
# Gradio Demo
import gradio as gr
demo = gr.Blocks()
with demo:
    gr.Markdown("# Aware Speech-to-Speech Demo")
    with gr.Tab("Microphone"):
        with gr.Row():
            mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak")
            lang_dropdown_mic = gr.Dropdown(
                choices=["English", "Yoruba", "Igbo", "Hausa"],
                value="English",
                label="Select Language"
            )
        mic_submit = gr.Button("Submit")
        with gr.Row():
            mic_transcribed = gr.Textbox(label="Transcribed Text", interactive=False)
            mic_response = gr.Textbox(label="HypaAI's Response", interactive=False)
        mic_audio_output = gr.Audio(label="Generated Speech", type="numpy")
        mic_submit.click(
            fn=speech_to_speech_translation,
            inputs=[mic_input, lang_dropdown_mic],
            outputs=[mic_transcribed, mic_response, mic_audio_output]
        )
    with gr.Tab("Audio File"):
        with gr.Row():
            file_input = gr.Audio(sources="upload", type="filepath", label="Upload Audio")
            lang_dropdown_file = gr.Dropdown(
                choices=["English", "Yoruba", "Igbo", "Hausa"],
                value="English",
                label="Select Language"
            )
        file_submit = gr.Button("Submit")
        with gr.Row():
            file_transcribed = gr.Textbox(label="Transcribed Text", interactive=False)
            file_response = gr.Textbox(label="HypaAI's Response", interactive=False)
        file_audio_output = gr.Audio(label="Generated Speech", type="numpy")
        file_submit.click(
            fn=speech_to_speech_translation,
            inputs=[file_input, lang_dropdown_file],
            outputs=[file_transcribed, file_response, file_audio_output]
        )
demo.launch(share=True)