import gradio as gr
import edge_tts
import tempfile
import numpy as np
import soxr  # sample-rate conversion used by resample()
from pydub import AudioSegment
import torch
import sentencepiece as spm
import onnxruntime as ort
from huggingface_hub import hf_hub_download
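
# Voice food-ordering demo: audio is transcribed with a Citrinet CTC model
# (ONNX), matched against the menu with simple keyword rules, and the text
# reply is spoken back with edge-tts.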

# Menu items and prices
MENU = {
    "Pizza": 10.99,
    "Burger": 6.99,
    "Pasta": 8.49,
    "Salad": 5.49,
    "Soda": 1.99,
    "Coffee": 2.99
}

cart = []  # To store cart items

# Speech Recognition Model Configuration
model_name = "neongeckocom/stt_en_citrinet_512_gamma_0_25"
sample_rate = 16000

# Download and load the three pipeline pieces: a TorchScript feature
# preprocessor, the ONNX acoustic encoder, and the SentencePiece tokenizer.
preprocessor = torch.jit.load(hf_hub_download(model_name, "preprocessor.ts", subfolder="onnx"))
encoder = ort.InferenceSession(hf_hub_download(model_name, "model.onnx", subfolder="onnx"))
tokenizer = spm.SentencePieceProcessor(hf_hub_download(model_name, "tokenizer.spm", subfolder="onnx"))

async def text_to_speech(text):
    # edge-tts produces MP3 audio by default, so use a matching suffix.
    communicate = edge_tts.Communicate(text)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path

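# Helpers to convert raw PCM samples to float32 in [-1, 1] and resample
# them to the 16 kHz rate the Citrinet model expects.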
def resample(audio_fp32, sr):
    return soxr.resample(audio_fp32, sr, sample_rate)

def to_float32(audio_buffer):
    return np.divide(audio_buffer, np.iinfo(audio_buffer.dtype).max, dtype=np.float32)

def transcribe(audio_path):
    # Decode the file with pydub and downmix to mono; a stereo recording
    # would otherwise yield interleaved samples that garble the model input.
    audio_file = AudioSegment.from_file(audio_path).set_channels(1)
    sr = audio_file.frame_rate
    audio_buffer = np.array(audio_file.get_array_of_samples())

    audio_fp32 = to_float32(audio_buffer)
    audio_16k = resample(audio_fp32, sr)

    # Extract features with the preprocessor, then run the ONNX encoder.
    input_signal = torch.tensor(audio_16k).unsqueeze(0)
    length = torch.tensor(len(audio_16k)).unsqueeze(0)
    processed_signal, _ = preprocessor.forward(input_signal=input_signal, length=length)

    logits = encoder.run(None, {'audio_signal': processed_signal.numpy(), 'length': length.numpy()})[0][0]

    # Greedy CTC decoding: take the argmax per frame, collapse consecutive
    # repeats, then drop the blank token (the id after the last vocab entry).
    blank_id = tokenizer.vocab_size()
    frame_ids = logits.argmax(axis=1).tolist()
    decoded_prediction = [
        p for i, p in enumerate(frame_ids)
        if p != blank_id and (i == 0 or p != frame_ids[i - 1])
    ]
    text = tokenizer.decode_ids(decoded_prediction)

    return text

def generate_menu():
    menu_text = "Here is our menu:\n"
    for item, price in MENU.items():
        menu_text += f"{item}: ${price:.2f}\n"
    menu_text += "What would you like to order?"
    return menu_text

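# Keyword-based command handling. Checks run in order, and later matches
# ("cart", "submit") overwrite earlier ones, so a single utterance always
# produces one final response.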
def handle_cart(command):
    global cart
    response = ""

    # Check for menu-related commands
    if "menu" in command.lower():
        response = generate_menu()

    # Check for add-to-cart commands
    else:
        for item in MENU.keys():
            if item.lower() in command.lower():
                cart.append(item)
                response = f"{item} has been added to your cart."
                break

    # If user asks for cart
    if "cart" in command.lower():
        if cart:
            response = "Your cart contains:\n" + ", ".join(cart)
        else:
            response = "Your cart is empty."

    # If user confirms order
    if "submit" in command.lower() or "done" in command.lower():
        if cart:
            response = "Your final order is:\n" + ", ".join(cart) + ". Thank you for your order!"
            cart = []  # Clear the cart
        else:
            response = "Your cart is empty. Add some items before submitting."

    # Fall back to a prompt when nothing matched, so the reply (and the TTS
    # call) is never an empty string.
    if not response:
        response = "Sorry, I didn't catch that. You can ask for the menu, order an item, check your cart, or say 'submit'."

    return response

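# Gradio callback: transcribe the recording, run it through the order logic,
# and synthesize a spoken reply.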
async def respond(audio):
    try:
        user_command = transcribe(audio)
        reply = handle_cart(user_command)
        reply_audio_path = await text_to_speech(reply)
        return user_command, reply, reply_audio_path
    except Exception as e:
        print(f"Error while handling request: {e}")
        return "Error: Could not transcribe audio.", "Error: Could not process your request.", None

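# UI: a microphone/file input and submit button on top; the transcription,
# the assistant's text reply, and the spoken reply below.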
with gr.Blocks() as demo:
    with gr.Row():
        audio_input = gr.Audio(label="Speak Here", type="filepath")
        submit = gr.Button("Submit")

    with gr.Row():
        transcribed_text = gr.Textbox(label="Transcribed Text")
        response_text = gr.Textbox(label="Assistant Response")
        response_audio = gr.Audio(label="Response Audio")

    submit.click(fn=respond, inputs=[audio_input], outputs=[transcribed_text, response_text, response_audio])

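# queue() routes events through Gradio's request queue so long-running
# transcription/TTS calls don't hit HTTP timeouts.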
if __name__ == "__main__":
    demo.queue().launch()