Voice_Menu_Ordering3

Sleeping

App Files Files Community

nagasurendra committed on Dec 28, 2024

Commit

a941958

verified ·

1 Parent(s): 67889b4

Create app.py

Browse files

Files changed (1) hide show

app.py +128 -0

app.py ADDED Viewed

	@@ -0,0 +1,128 @@

+import gradio as gr
+import edge_tts
+import asyncio
+import tempfile
+import numpy as np
+from pydub import AudioSegment
+import torch
+import sentencepiece as spm
+import onnxruntime as ort
+from huggingface_hub import hf_hub_download
+# Dynamic Menu Items
+# Maps each orderable item name to its price. generate_menu() renders the
+# prices with a leading "$", and handle_cart() matches the item names
+# case-insensitively against the transcribed command.
+MENU = {
+    "Pizza": 10.99,
+    "Burger": 6.99,
+    "Pasta": 8.49,
+    "Salad": 5.49,
+    "Soda": 1.99,
+    "Coffee": 2.99
+}
+# Module-level list of item names added via handle_cart(), in insertion
+# order; handle_cart() empties it once an order is submitted.
+cart = []  # To store cart items
+# Speech Recognition Model Configuration
+model_name = "neongeckocom/stt_en_citrinet_512_gamma_0_25"
+sample_rate = 16000
+# Download preprocessor, encoder, and tokenizer
+preprocessor = torch.jit.load(hf_hub_download(model_name, "preprocessor.ts", subfolder="onnx"))
+encoder = ort.InferenceSession(hf_hub_download(model_name, "model.onnx", subfolder="onnx"))
+tokenizer = spm.SentencePieceProcessor(hf_hub_download(model_name, "tokenizer.spm", subfolder="onnx"))
+async def text_to_speech(text):
+    communicate = edge_tts.Communicate(text)
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+        tmp_path = tmp_file.name
+        await communicate.save(tmp_path)
+    return tmp_path
+def resample(audio_fp32, sr):
+    return soxr.resample(audio_fp32, sr, sample_rate)
+def to_float32(audio_buffer):
+    return np.divide(audio_buffer, np.iinfo(audio_buffer.dtype).max, dtype=np.float32)
+def transcribe(audio_path):
+    audio_file = AudioSegment.from_file(audio_path)
+    sr = audio_file.frame_rate
+    audio_buffer = np.array(audio_file.get_array_of_samples())
+    audio_fp32 = to_float32(audio_buffer)
+    audio_16k = resample(audio_fp32, sr)
+    input_signal = torch.tensor(audio_16k).unsqueeze(0)
+    length = torch.tensor(len(audio_16k)).unsqueeze(0)
+    processed_signal, _ = preprocessor.forward(input_signal=input_signal, length=length)
+    logits = encoder.run(None, {'audio_signal': processed_signal.numpy(), 'length': length.numpy()})[0][0]
+    blank_id = tokenizer.vocab_size()
+    decoded_prediction = [p for p in logits.argmax(axis=1).tolist() if p != blank_id]
+    text = tokenizer.decode_ids(decoded_prediction)
+    return text
+def generate_menu():
+    menu_text = "Here is our menu:\n"
+    for item, price in MENU.items():
+        menu_text += f"{item}: ${price:.2f}\n"
+    menu_text += "What would you like to order?"
+    return menu_text
+def handle_cart(command):
+    global cart
+    response = ""
+    # Check for menu-related commands
+    if "menu" in command.lower():
+        response = generate_menu()
+    # Check for add-to-cart commands
+    else:
+        for item in MENU.keys():
+            if item.lower() in command.lower():
+                cart.append(item)
+                response = f"{item} has been added to your cart."
+                break
+    # If user asks for cart
+    if "cart" in command.lower():
+        if cart:
+            response = "Your cart contains:\n" + ", ".join(cart)
+        else:
+            response = "Your cart is empty."
+    # If user confirms order
+    if "submit" in command.lower() or "done" in command.lower():
+        if cart:
+            response = "Your final order is:\n" + ", ".join(cart) + ". Thank you for your order!"
+            cart = []  # Clear the cart
+        else:
+            response = "Your cart is empty. Add some items before submitting."
+    return response
+async def respond(audio):
+    try:
+        user_command = transcribe(audio)
+        reply = handle_cart(user_command)
+        reply_audio_path = await text_to_speech(reply)
+        return user_command, reply, reply_audio_path
+    except Exception as e:
+        return "Error: Could not transcribe audio.", "Error: Could not process your request.", None
+# Gradio UI: the first row takes the user's audio and a submit button, the
+# second row shows the transcript, the text reply and the spoken reply.
+with gr.Blocks() as demo:
+    with gr.Row():
+        # type="filepath": respond() receives a path, which transcribe()
+        # opens with AudioSegment.from_file().
+        audio_input = gr.Audio(label="Speak Here", type="filepath")
+        submit = gr.Button("Submit")
+    with gr.Row():
+        transcribed_text = gr.Textbox(label="Transcribed Text")
+        # NOTE(review): labelled "GPT Response", but the text shown here is
+        # the reply produced by handle_cart(); consider renaming the label.
+        response_text = gr.Textbox(label="GPT Response")
+        response_audio = gr.Audio(label="Response Audio")
+    # respond() returns (transcript, reply text, reply audio path), mapped to
+    # the three outputs in that order.
+    submit.click(fn=respond, inputs=[audio_input], outputs=[transcribed_text, response_text, response_audio])
+# Start the app only when this file is run as a script.
+if __name__ == "__main__":
+    demo.queue().launch()