nagasurendra committed a941958 (verified) · 1 parent: 67889b4

Create app.py

Files changed (1): app.py (+128, -0)
app.py ADDED
import gradio as gr
import edge_tts
import asyncio
import tempfile
import numpy as np
import soxr  # used by resample() below; missing from the original imports
from pydub import AudioSegment
import torch
import sentencepiece as spm
import onnxruntime as ort
from huggingface_hub import hf_hub_download

# Dynamic Menu Items
MENU = {
    "Pizza": 10.99,
    "Burger": 6.99,
    "Pasta": 8.49,
    "Salad": 5.49,
    "Soda": 1.99,
    "Coffee": 2.99
}

cart = []  # To store cart items

# Speech Recognition Model Configuration
model_name = "neongeckocom/stt_en_citrinet_512_gamma_0_25"
sample_rate = 16000

# Download preprocessor, encoder, and tokenizer
preprocessor = torch.jit.load(hf_hub_download(model_name, "preprocessor.ts", subfolder="onnx"))
encoder = ort.InferenceSession(hf_hub_download(model_name, "model.onnx", subfolder="onnx"))
tokenizer = spm.SentencePieceProcessor(hf_hub_download(model_name, "tokenizer.spm", subfolder="onnx"))

async def text_to_speech(text):
    communicate = edge_tts.Communicate(text)
    # edge-tts emits MP3 by default, so give the temp file a matching suffix
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
        await communicate.save(tmp_path)
    return tmp_path

def resample(audio_fp32, sr):
    return soxr.resample(audio_fp32, sr, sample_rate)

def to_float32(audio_buffer):
    return np.divide(audio_buffer, np.iinfo(audio_buffer.dtype).max, dtype=np.float32)

def transcribe(audio_path):
    # Collapse to mono: get_array_of_samples() interleaves channels on stereo input
    audio_file = AudioSegment.from_file(audio_path).set_channels(1)
    sr = audio_file.frame_rate
    audio_buffer = np.array(audio_file.get_array_of_samples())

    audio_fp32 = to_float32(audio_buffer)
    audio_16k = resample(audio_fp32, sr)

    input_signal = torch.tensor(audio_16k).unsqueeze(0)
    length = torch.tensor(len(audio_16k)).unsqueeze(0)
    processed_signal, _ = preprocessor.forward(input_signal=input_signal, length=length)

    logits = encoder.run(None, {'audio_signal': processed_signal.numpy(), 'length': length.numpy()})[0][0]

    # Greedy CTC decoding: collapse consecutive repeats, then drop blank tokens
    blank_id = tokenizer.vocab_size()
    predictions = logits.argmax(axis=1).tolist()
    decoded_prediction = [p for i, p in enumerate(predictions)
                          if p != blank_id and (i == 0 or p != predictions[i - 1])]
    text = tokenizer.decode_ids(decoded_prediction)

    return text

def generate_menu():
    menu_text = "Here is our menu:\n"
    for item, price in MENU.items():
        menu_text += f"{item}: ${price:.2f}\n"
    menu_text += "What would you like to order?"
    return menu_text

def handle_cart(command):
    global cart
    response = ""

    # Check for menu-related commands
    if "menu" in command.lower():
        response = generate_menu()

    # Check for add-to-cart commands
    else:
        for item in MENU.keys():
            if item.lower() in command.lower():
                cart.append(item)
                response = f"{item} has been added to your cart."
                break

    # If user asks for cart
    if "cart" in command.lower():
        if cart:
            response = "Your cart contains:\n" + ", ".join(cart)
        else:
            response = "Your cart is empty."

    # If user confirms order
    if "submit" in command.lower() or "done" in command.lower():
        if cart:
            response = "Your final order is:\n" + ", ".join(cart) + ". Thank you for your order!"
            cart = []  # Clear the cart
        else:
            response = "Your cart is empty. Add some items before submitting."

    return response

async def respond(audio):
    try:
        user_command = transcribe(audio)
        reply = handle_cart(user_command)
        reply_audio_path = await text_to_speech(reply)
        return user_command, reply, reply_audio_path
    except Exception as e:
        # Surface the failure in the Space logs instead of silently discarding it
        print(f"Error while handling request: {e}")
        return "Error: Could not transcribe audio.", "Error: Could not process your request.", None

with gr.Blocks() as demo:
    with gr.Row():
        audio_input = gr.Audio(label="Speak Here", type="filepath")
        submit = gr.Button("Submit")

    with gr.Row():
        transcribed_text = gr.Textbox(label="Transcribed Text")
        response_text = gr.Textbox(label="Assistant Response")  # no GPT model is involved, so label accordingly
        response_audio = gr.Audio(label="Response Audio")

    submit.click(fn=respond, inputs=[audio_input], outputs=[transcribed_text, response_text, response_audio])

if __name__ == "__main__":
    demo.queue().launch()
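
The ordering logic can be sanity-checked without any audio. Below is a minimal smoke-test sketch, not part of the commit: it assumes app.py is importable from the same directory (importing it triggers the hf_hub_download calls at module load), and that the imported packages (gradio, edge-tts, pydub, torch, sentencepiece, onnxruntime, soxr, numpy, huggingface_hub) plus ffmpeg for pydub are installed.

# Hypothetical smoke test for the cart flow; bypasses speech entirely.
from app import handle_cart

print(handle_cart("show me the menu"))             # lists every MENU item with its price
print(handle_cart("one pizza and a soda please"))  # only "Pizza" is added: the loop breaks on the first match
print(handle_cart("what's in my cart"))            # "Your cart contains:" followed by "Pizza"
print(handle_cart("submit my order"))              # final summary, then the cart is cleared

Note the second call: handle_cart matches a single menu item per command, so multi-item orders have to be spoken one item at a time.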