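"""AI Voice Assistant.

Records speech through a Gradio microphone input, transcribes it with Whisper,
generates a reply with Meta-Llama-3-8B-Instruct (served via the Hugging Face
Inference API through llama_index), and speaks the reply back with SpeechT5.
"""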
import os
import time

import gradio as gr
import soundfile as sf
import torch
import whisper
from datasets import load_dataset
from llama_index.core import Settings
from llama_index.llms.text_generation_inference import TextGenerationInference
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
# Load the Whisper model once at startup for speech-to-text
model = whisper.load_model("base")

# Hugging Face API token for the hosted Llama 3 inference endpoint
HF_API_TOKEN = os.getenv("HF_TOKEN")

def transcribe_audio(audio):
    # Load the audio file and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # Make a log-Mel spectrogram and move it to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Decode the audio into English text
    options = whisper.DecodingOptions(language="en", task="transcribe", temperature=0)
    result = whisper.decode(model, mel, options)
    return result.text

def audio_response(text, output_path="speech.wav"):
    # Load the processor, TTS model, and vocoder
    # (reloaded on every call; cache them at module level to avoid repeated downloads)
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    # Tokenize the input text
    inputs = processor(text=text, return_tensors="pt")

    # Load an x-vector containing the speaker's voice characteristics
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    # Generate speech
    with torch.no_grad():
        speech = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # Save the audio to a file (SpeechT5 generates 16 kHz audio)
    sf.write(output_path, speech.numpy(), samplerate=16000)

    return output_path

def messages_to_prompt(messages):
    # Default system message for the chatbot
    default_system_prompt = "You are an AI chatbot designed to assist with user queries in a friendly and conversational manner."

    prompt = ""
    for message in messages:
        if message.role == "system":
            prompt += f"<|system|>\n{message.content}</s>\n"
        elif message.role == "user":
            prompt += f"<|user|>\n{message.content}</s>\n"
        elif message.role == "assistant":
            prompt += f"<|assistant|>\n{message.content}</s>\n"

    # Ensure the prompt starts with a system block, inserting the default one if needed
    if not prompt.startswith("<|system|>\n"):
        prompt = f"<|system|>\n{default_system_prompt}</s>\n" + prompt

    # Add the final assistant prompt
    prompt = prompt + "<|assistant|>\n"

    return prompt

def completion_to_prompt(completion):
    return f"<|system|>\n</s>\n<|user|>\n{completion}</s>\n<|assistant|>\n"

# Route LLM calls through the hosted Llama 3 8B Instruct endpoint on the Hugging Face Inference API
Settings.llm = TextGenerationInference(
    model_url="https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct",
    token=HF_API_TOKEN,
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
)

def text_response(t):
    time.sleep(1)  # Brief pause before calling the endpoint; adjust the delay as needed
    response = Settings.llm.complete(t)
    return response.text

def transcribe_(a):
    t1 = transcribe_audio(a)  # speech -> text
    t2 = text_response(t1)    # text -> LLM reply
    t3 = audio_response(t2)   # LLM reply -> synthesized speech
    return (t1, t2, t3)

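# Gradio interface: microphone input; outputs the transcript, the LLM reply, and the synthesized speech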
output_1 = gr.Textbox(label="Speech to Text")
output_2 = gr.Textbox(label="LLM Output")
output_3 = gr.Audio(label="LLM output to audio")

gr.Interface(
    title='AI Voice Assistant',
    fn=transcribe_,
    inputs=[
        gr.Audio(sources="microphone", type="filepath"),
    ],
    outputs=[
        output_1, output_2, output_3
    ]
).launch(share=True)