File size: 8,139 Bytes
9977a2c
 
ec3416f
 
9977a2c
9e304d5
ec3416f
 
 
9977a2c
134efe1
ec3416f
 
 
 
 
5da60bb
ec3416f
9977a2c
 
 
 
 
 
 
 
ec3416f
9977a2c
 
 
 
 
0ea3700
94b1983
f5a544b
 
9e304d5
f5a544b
94b1983
 
9977a2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0ea3700
 
 
 
9e304d5
0ea3700
 
 
 
 
 
 
 
9e304d5
0ea3700
 
 
9e304d5
0ea3700
 
 
 
 
 
 
adc5c1c
55525a7
 
94b1983
 
0ea3700
 
55525a7
 
0ea3700
5da60bb
 
 
 
 
 
9e304d5
 
 
 
0ea3700
 
 
a64ea5d
0ea3700
9e304d5
 
 
 
 
 
 
 
 
 
 
05a24ef
0ea3700
 
 
 
 
 
 
 
 
 
 
 
adc5c1c
 
 
 
 
 
 
 
9977a2c
 
 
 
 
 
 
5da60bb
 
9977a2c
 
 
 
 
 
 
 
 
5e00de6
 
9977a2c
adc5c1c
 
 
 
 
9977a2c
 
 
 
 
 
 
 
 
 
 
94b1983
 
 
80abd37
94b1983
 
05a24ef
 
94b1983
 
9977a2c
 
 
 
 
 
 
 
05a24ef
 
9977a2c
 
134efe1
 
 
 
 
 
9977a2c
94b1983
9977a2c
 
 
134efe1
9977a2c
 
 
 
94b1983
9977a2c
 
 
 
ec3416f
 
134efe1
adc5c1c
134efe1
ec3416f
 
134efe1
0ea3700
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
import asyncio
import base64
import json
import os
import pathlib
from typing import AsyncGenerator, Literal, Optional, List, Union

import gradio as gr
import numpy as np
from dotenv import load_dotenv
from fastapi import FastAPI
from fastapi.responses import HTMLResponse
from fastrtc import (
    AsyncStreamHandler,
    Stream,
    get_twilio_turn_credentials,
    wait_for_item,
)
from google import genai
from google.genai.types import (
    LiveConnectConfig,
    PrebuiltVoiceConfig,
    SpeechConfig,
    VoiceConfig,
)
from gradio.utils import get_space
from pydantic import BaseModel

# Directory that contains this module; the index route reads "index.html"
# relative to this path.
current_dir = pathlib.Path(__file__).parent

# Run python-dotenv's loader before anything below calls os.getenv
# (presumably so a local .env file can supply GEMINI_API_KEY — confirm
# against the deployment setup).
load_dotenv()

# Preset system prompts, keyed by the label shown in the "Preset Prompt"
# dropdown. The selected value is sent to the model verbatim as its system
# instruction, so wording (including typos) matters.
SYSTEM_PROMPTS = {
    "Default": "You are a helpful, harmless, and honest AI assistant. Provide clear, concise, and accurate information.",
    "Behavior Expert": "You are an expert in behavior analysis, Positive Behavior Interventions and Supports, and Restorative approaches and practices. Your primary role is to help educators understand and address student behavior issues. Provide evidence-based strategies, empathetic insights, and practical solutions.",
    "UDL Expert": "You are an expert in Universal Design for Learning (UDL). Your role is to help educators design units, lessons, and activities to be universally accessible. Provide detailed guidance on addressing student variability and potential learning barriers.",
    "Learning Support Expert": "You are a learning support expert. Your role is to help educators adapt and support instruction based on student needs such as dyscalculia, dyslexia, executive functioning issues, low English proficiency, or other learning needs and differences. You also help devise and implement IEPs, as well as accommodations.",
}


def encode_audio(data: np.ndarray) -> str:
    """Serialize an audio array as base64 text for sending to the server.

    The array's raw bytes (as produced by ``ndarray.tobytes``) are
    base64-encoded and the result is returned as a ``str``.
    """
    raw_bytes = data.tobytes()
    encoded = base64.b64encode(raw_bytes)
    return str(encoded, "UTF-8")


class GeminiHandler(AsyncStreamHandler):
    """Handler that relays audio between a FastRTC stream and the Gemini API.

    Incoming frames are base64-encoded and placed on ``input_queue`` for
    upload; audio returned by Gemini is placed on ``output_queue`` as
    ``(sample_rate, int16 array)`` tuples for playback.
    """

    def __init__(
        self,
        expected_layout: Literal["mono"] = "mono",
        output_sample_rate: int = 24000,
        output_frame_size: int = 480,
    ) -> None:
        """Create the handler and its queues.

        Args:
            expected_layout: Channel layout forwarded to the base handler.
            output_sample_rate: Sample rate forwarded to the base handler and
                attached to every chunk queued for playback.
            output_frame_size: Frame size forwarded to the base handler.

        The input sample rate is fixed at 16000.
        """
        super().__init__(
            expected_layout,
            output_sample_rate,
            output_frame_size,
            input_sample_rate=16000,
        )
        # Base64 strings waiting to be sent to Gemini.
        self.input_queue: asyncio.Queue = asyncio.Queue()
        # (sample_rate, np.int16 array) tuples waiting to be played back.
        self.output_queue: asyncio.Queue = asyncio.Queue()
        # Set by shutdown() to stop the upload generator.
        self.quit: asyncio.Event = asyncio.Event()

    def copy(self) -> "GeminiHandler":
        """Return a fresh handler with the same output settings.

        The copy gets its own empty queues and quit event because it goes
        through ``__init__`` again; the layout is always "mono".
        """
        return GeminiHandler(
            expected_layout="mono",
            output_sample_rate=self.output_sample_rate,
            output_frame_size=self.output_frame_size,
        )

    def prepare_system_instruction(
        self,
        prompt_key: Optional[str] = None,
        custom_prompt: Optional[str] = None,
    ) -> Optional[str]:
        """
        Prepare system instruction based on preset or custom prompt.

        A non-blank custom prompt takes precedence over the preset.

        Args:
            prompt_key: Key for preset system prompt
            custom_prompt: Custom user-defined system prompt

        Returns:
            System instruction as a string, or None when neither a usable
            custom prompt nor a known preset key was supplied.
        """
        # A prompt consisting only of whitespace counts as "not provided", so
        # it cannot silently override the selected preset and be sent to the
        # model as an empty instruction.
        if custom_prompt and custom_prompt.strip():
            return custom_prompt

        # Fall back to the preset, ignoring unknown or empty keys.
        if prompt_key and prompt_key in SYSTEM_PROMPTS:
            return SYSTEM_PROMPTS[prompt_key]

        return None

    async def start_up(self):
        """Open the Gemini live session and pump its audio into the output queue.

        Outside phone mode the API key, voice, preset key and custom prompt
        are taken from ``self.latest_args``; in phone mode the voice is "Puck"
        and no system instruction is used. When no API key is supplied the
        ``GEMINI_API_KEY`` environment variable is used instead.

        Any exception raised while connecting or streaming is caught and
        printed rather than propagated.
        """
        if not self.phone_mode:
            await self.wait_for_args()
            # The first element of latest_args is skipped (presumably the
            # audio/webrtc placeholder FastRTC prepends — confirm against the
            # FastRTC docs); the rest follow the additional-inputs order.
            api_key, voice_name, prompt_key, custom_prompt = self.latest_args[1:]

            # Prepare system instruction
            system_instruction = self.prepare_system_instruction(prompt_key, custom_prompt)
        else:
            api_key, voice_name = None, "Puck"
            system_instruction = None

        client = genai.Client(
            api_key=api_key or os.getenv("GEMINI_API_KEY"),
            http_options={"api_version": "v1alpha"},
        )

        # Create config for the connection
        config_kwargs = {
            "response_modalities": ["AUDIO"],
            "speech_config": SpeechConfig(
                voice_config=VoiceConfig(
                    prebuilt_voice_config=PrebuiltVoiceConfig(
                        voice_name=voice_name,
                    )
                )
            ),
        }

        # Add system instruction if available
        if system_instruction:
            config_kwargs["system_instruction"] = {
                "parts": [{"text": system_instruction}]
            }

        # Create the configuration
        config = LiveConnectConfig(**config_kwargs)

        try:
            async with client.aio.live.connect(
                model="gemini-2.0-flash-exp", config=config
            ) as session:
                async for audio in session.start_stream(
                    stream=self.stream(), mime_type="audio/pcm"
                ):
                    if audio.data:
                        # Response payload is interpreted as 16-bit PCM.
                        array = np.frombuffer(audio.data, dtype=np.int16)
                        self.output_queue.put_nowait((self.output_sample_rate, array))
        except Exception as e:
            # Best-effort: report the failure instead of crashing the handler.
            print(f"Error in Gemini connection: {e}")

    async def stream(self) -> AsyncGenerator[str, None]:
        """Yield queued, base64-encoded audio chunks until shutdown.

        The queue is polled with a 0.1 second timeout so the quit flag is
        re-checked regularly even when no audio is arriving.
        """
        while not self.quit.is_set():
            try:
                audio = await asyncio.wait_for(self.input_queue.get(), 0.1)
                yield audio
            except (asyncio.TimeoutError, TimeoutError):
                # Nothing queued within the poll interval; loop to re-check quit.
                pass

    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
        """Encode one incoming frame and queue it for upload.

        Args:
            frame: ``(sample_rate, samples)``; the sample rate is ignored and
                the samples are squeezed before encoding.
        """
        _, array = frame
        array = array.squeeze()
        audio_message = encode_audio(array)
        self.input_queue.put_nowait(audio_message)

    async def emit(self) -> tuple[int, np.ndarray] | None:
        """Return the next queued output item via ``wait_for_item``.

        Presumably yields None when nothing is available in time — confirm
        against the FastRTC ``wait_for_item`` documentation.
        """
        return await wait_for_item(self.output_queue)

    def shutdown(self) -> None:
        """Signal the upload generator to stop by setting the quit event."""
        self.quit.set()

# Audio send/receive stream backed by GeminiHandler.
# TURN credentials, the concurrency limit and the time limit are only applied
# when get_space() is truthy (presumably a Hugging Face Spaces deployment —
# confirm); otherwise they are left as None.
stream = Stream(
    modality="audio",
    mode="send-receive",
    handler=GeminiHandler(),
    rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
    concurrency_limit=5 if get_space() else None,
    time_limit=90 if get_space() else None,
    # The order of these inputs must match the unpacking in
    # GeminiHandler.start_up: api_key, voice_name, prompt_key, custom_prompt.
    additional_inputs=[
        gr.Textbox(
            label="API Key",
            type="password",
            # Pre-fill from the environment only when get_space() is falsy.
            value=os.getenv("GEMINI_API_KEY") if not get_space() else "",
        ),
        gr.Dropdown(
            label="Voice",
            choices=[
                "Puck",
                "Charon",
                "Kore",
                "Fenrir",
                "Aoede",
            ],
            value="Puck",
        ),
        gr.Dropdown(
            label="Preset Prompt",
            choices=list(SYSTEM_PROMPTS.keys()),
            value="Default",
        ),
        gr.Textbox(
            label="Custom Prompt",
            placeholder="Enter a custom system prompt (overrides preset if not empty)",
            value="",
        ),
    ],
)


# Request body accepted by the /input_hook endpoint.
class InputData(BaseModel):
    # Identifier of the stream connection the inputs are applied to.
    webrtc_id: str
    # Voice name passed to the handler.
    voice_name: str
    # API key passed to the handler.
    api_key: str
    # Key of a preset system prompt; optional, defaults to empty.
    prompt_key: str = ""
    # Free-form system prompt; optional, defaults to empty.
    custom_prompt: str = ""


# FastAPI application that hosts the routes defined below.
app = FastAPI()

# Attach the stream to the application (presumably registering FastRTC's own
# endpoints on it — confirm against the FastRTC docs).
stream.mount(app)


# Receives the per-connection inputs from the client and hands them to the
# stream, in the order the handler unpacks them.
@app.post("/input_hook")
async def _(body: InputData):
    handler_args = (
        body.api_key,
        body.voice_name,
        body.prompt_key,
        body.custom_prompt,
    )
    stream.set_input(body.webrtc_id, *handler_args)
    return {"status": "ok"}


# Serves index.html with the "__RTC_CONFIGURATION__" and "__SYSTEM_PROMPTS__"
# placeholders replaced by JSON-encoded values. The RTC configuration is
# fetched only when get_space() is truthy; otherwise JSON null is substituted.
@app.get("/")
async def index():
    rtc_config = get_twilio_turn_credentials() if get_space() else None

    # Decode explicitly as UTF-8 instead of the platform's locale encoding, so
    # the page is read the same way on every host.
    html_content = (current_dir / "index.html").read_text(encoding="utf-8")

    substitutions = {
        "__RTC_CONFIGURATION__": json.dumps(rtc_config),
        "__SYSTEM_PROMPTS__": json.dumps(SYSTEM_PROMPTS),
    }
    for placeholder, value in substitutions.items():
        html_content = html_content.replace(placeholder, value)

    return HTMLResponse(content=html_content)


if __name__ == "__main__":
    import os

    # MODE selects the front end: "UI" launches the built-in UI, "PHONE" the
    # phone entry point, and anything else (including unset) serves the
    # FastAPI app with uvicorn. All three use port 7860.
    mode = os.getenv("MODE")
    if mode == "UI":
        stream.ui.launch(server_port=7860)
    elif mode == "PHONE":
        stream.fastphone(host="0.0.0.0", port=7860)
    else:
        import uvicorn

        uvicorn.run(app, host="0.0.0.0", port=7860)