Nirav Madhani committed on
Commit cc7c705 · 1 Parent(s): a4541ab

First commit

__pycache__/handler.cpython-312.pyc ADDED
Binary file (6.18 kB)

__pycache__/webapp.cpython-312.pyc ADDED
Binary file (6.64 kB)

app.py ADDED
@@ -0,0 +1,302 @@
+ # -*- coding: utf-8 -*-
+ # Copyright 2023 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """
+ ## Setup
+
+ To install the dependencies for this script, run:
+
+ ```
+ pip install google-genai opencv-python pyaudio pillow mss
+ ```
+
+ Before running this script, ensure the `GOOGLE_API_KEY` environment
+ variable is set to the API key you obtained from Google AI Studio.
+
+ Important: **Use headphones**. This script uses the system default audio
+ input and output, which often won't include echo cancellation. So to prevent
+ the model from interrupting itself, it is important that you use headphones.
+
+ ## Run
+
+ To run the script:
+
+ ```
+ python app.py
+ ```
+
+ The script takes a video-mode flag `--mode`, which can be "camera", "screen", or "none".
+ The default is "none". To share your screen, run:
+
+ ```
+ python app.py --mode screen
+ ```
+ """
+
+ import asyncio
+ import base64
+ import json
+ import io
+ import os
+ import sys
+ import traceback
+
+ import cv2
+ import pyaudio
+ import PIL.Image
+ import mss
+ import mss.tools
+ import argparse
+
+ from websockets.asyncio.client import connect
+
+ if sys.version_info < (3, 11, 0):
+     # Backports of TaskGroup and ExceptionGroup for Python < 3.11
+     import taskgroup, exceptiongroup
+
+     asyncio.TaskGroup = taskgroup.TaskGroup
+     asyncio.ExceptionGroup = exceptiongroup.ExceptionGroup
+
+ FORMAT = pyaudio.paInt16
+ CHANNELS = 1
+ SEND_SAMPLE_RATE = 16000
+ RECEIVE_SAMPLE_RATE = 24000
+ CHUNK_SIZE = 512
+
+ host = "generativelanguage.googleapis.com"
+ model = "gemini-2.0-flash-exp"
+ DEFAULT_MODE = "none"
+
+
+ api_key = os.environ["GOOGLE_API_KEY"]
+ uri = f"wss://{host}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={api_key}"
+
+
+ class AudioLoop:
+     def __init__(self, video_mode=DEFAULT_MODE):
+         self.video_mode = video_mode
+         self.audio_in_queue = None
+         self.out_queue = None
+
+         self.ws = None
+         self.audio_stream = None
+
+     async def startup(self):
+         setup_msg = {"setup": {"model": f"models/{model}"}}
+         await self.ws.send(json.dumps(setup_msg))
+         raw_response = await self.ws.recv(decode=False)
+         setup_response = json.loads(raw_response.decode("ascii"))
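+         # The server acknowledges the setup message before streaming anything
+         # else; the reply is read and discarded here. (Assumption: this first
+         # reply is the API's "setupComplete" acknowledgement.)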
+
+     async def send_text(self):
+         while True:
+             text = await asyncio.to_thread(input, "message > ")
+             if text.lower() == "q":
+                 break
+
+             msg = {
+                 "client_content": {
+                     "turn_complete": True,
+                     "turns": [{"role": "user", "parts": [{"text": text}]}],
+                 }
+             }
+             await self.ws.send(json.dumps(msg))
+
+     def _get_frame(self, cap):
+         # Read the frame
+         ret, frame = cap.read()
+         # Check if the frame was read successfully
+         if not ret:
+             return None
+
+         # Convert BGR to RGB: OpenCV captures in BGR but PIL expects RGB,
+         # which prevents a blue tint in the video feed.
+         frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+         img = PIL.Image.fromarray(frame_rgb)
+         img.thumbnail([1024, 1024])
+
+         image_io = io.BytesIO()
+         img.save(image_io, format="jpeg")
+         image_io.seek(0)
+
+         mime_type = "image/jpeg"
+         image_bytes = image_io.read()
+         return {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()}
+
+     async def get_frames(self):
+         # Opening the camera takes about a second and would block the whole
+         # program, overflowing the audio pipeline, if not run in a thread.
+         cap = await asyncio.to_thread(
+             cv2.VideoCapture, 0
+         )  # 0 represents the default camera
+
+         while True:
+             frame = await asyncio.to_thread(self._get_frame, cap)
+             if frame is None:
+                 break
+             await asyncio.sleep(1.0)
+
+             msg = {"realtime_input": {"media_chunks": [frame]}}
+             await self.out_queue.put(msg)
+
+         # Release the VideoCapture object
+         cap.release()
+
+     def _get_screen(self):
+         sct = mss.mss()
+         monitor = sct.monitors[0]
+
+         i = sct.grab(monitor)
+         mime_type = "image/jpeg"
+         image_bytes = mss.tools.to_png(i.rgb, i.size)
+         img = PIL.Image.open(io.BytesIO(image_bytes))
+
+         image_io = io.BytesIO()
+         img.save(image_io, format="jpeg")
+         image_io.seek(0)
+
+         image_bytes = image_io.read()
+         return {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()}
+
+     async def get_screen(self):
+         while True:
+             frame = await asyncio.to_thread(self._get_screen)
+             if frame is None:
+                 break
+
+             await asyncio.sleep(1.0)
+
+             msg = {"realtime_input": {"media_chunks": [frame]}}
+             await self.out_queue.put(msg)
+
+     async def send_realtime(self):
+         while True:
+             msg = await self.out_queue.get()
+             await self.ws.send(json.dumps(msg))
+
+     async def listen_audio(self):
+         pya = pyaudio.PyAudio()
+
+         mic_info = pya.get_default_input_device_info()
+         self.audio_stream = pya.open(
+             format=FORMAT,
+             channels=CHANNELS,
+             rate=SEND_SAMPLE_RATE,
+             input=True,
+             input_device_index=mic_info["index"],
+             frames_per_buffer=CHUNK_SIZE,
+         )
+         while True:
+             data = await asyncio.to_thread(self.audio_stream.read, CHUNK_SIZE)
+             msg = {
+                 "realtime_input": {
+                     "media_chunks": [
+                         {
+                             "data": base64.b64encode(data).decode(),
+                             "mime_type": "audio/pcm",
+                         }
+                     ]
+                 }
+             }
+             await self.out_queue.put(msg)
+
+     async def receive_audio(self):
+         """Background task that reads from the websocket and writes pcm chunks to the output queue."""
+         async for raw_response in self.ws:
+             # Other things could be returned here, but we'll ignore those for now.
+             response = json.loads(raw_response.decode("ascii"))
+
+             try:
+                 b64data = response["serverContent"]["modelTurn"]["parts"][0][
+                     "inlineData"
+                 ]["data"]
+             except KeyError:
+                 pass
+             else:
+                 pcm_data = base64.b64decode(b64data)
+                 self.audio_in_queue.put_nowait(pcm_data)
+
+             try:
+                 turn_complete = response["serverContent"]["turnComplete"]
+             except KeyError:
+                 pass
+             else:
+                 if turn_complete:
+                     # If you interrupt the model, it sends a turnComplete.
+                     # For interruptions to work, we need to empty out the audio
+                     # queue, because it may have loaded much more audio than
+                     # has played yet.
+                     print("\nEnd of turn")
+                     while not self.audio_in_queue.empty():
+                         self.audio_in_queue.get_nowait()
+
+     async def play_audio(self):
+         pya = pyaudio.PyAudio()
+         stream = pya.open(
+             format=FORMAT, channels=CHANNELS, rate=RECEIVE_SAMPLE_RATE, output=True
+         )
+         while True:
+             bytestream = await self.audio_in_queue.get()
+             await asyncio.to_thread(stream.write, bytestream)
+
+     async def run(self):
+         """Connects to the Live API, starts the send/receive tasks, and runs
+         until the user types "q" at the prompt or an error occurs.
+         """
+         try:
+             async with (
+                 await connect(
+                     uri, additional_headers={"Content-Type": "application/json"}
+                 ) as ws,
+                 asyncio.TaskGroup() as tg,
+             ):
+                 self.ws = ws
+                 await self.startup()
+
+                 self.audio_in_queue = asyncio.Queue()
+                 self.out_queue = asyncio.Queue(maxsize=5)
+
+                 send_text_task = tg.create_task(self.send_text())
+
+                 tg.create_task(self.send_realtime())
+                 tg.create_task(self.listen_audio())
+                 if self.video_mode == "camera":
+                     tg.create_task(self.get_frames())
+                 elif self.video_mode == "screen":
+                     tg.create_task(self.get_screen())
+                 tg.create_task(self.receive_audio())
+                 tg.create_task(self.play_audio())
+
+                 await send_text_task
+                 raise asyncio.CancelledError("User requested exit")
+
+         except asyncio.CancelledError:
+             pass
+         except ExceptionGroup as EG:
+             self.audio_stream.close()
+             traceback.print_exception(EG)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--mode",
+         type=str,
+         default=DEFAULT_MODE,
+         help="pixels to stream from",
+         choices=["camera", "screen", "none"],
+     )
+     args = parser.parse_args()
+
+     main = AudioLoop(video_mode=args.mode)
+     asyncio.run(main.run())
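
Everything app.py exchanges with the Live API is plain JSON over one websocket. For reference, a minimal sketch of the message shapes used above, with field names taken directly from the code (payload values abbreviated, not a complete schema):

```
# Client -> server, once, immediately after connecting (startup):
setup_msg = {"setup": {"model": "models/gemini-2.0-flash-exp"}}

# Client -> server, streamed media (base64 16 kHz PCM, or JPEG frames):
media_msg = {
    "realtime_input": {
        "media_chunks": [{"mime_type": "audio/pcm", "data": "<base64>"}]
    }
}

# Client -> server, a typed turn (send_text):
text_msg = {
    "client_content": {
        "turn_complete": True,
        "turns": [{"role": "user", "parts": [{"text": "..."}]}],
    }
}

# Server -> client, audio parsed by receive_audio (24 kHz, 16-bit PCM):
audio_reply = {
    "serverContent": {"modelTurn": {"parts": [{"inlineData": {"data": "<base64>"}}]}}
}
```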
handler.py ADDED
@@ -0,0 +1,110 @@
+ # handler.py
+ import asyncio
+ import base64
+ import json
+ import os
+ import traceback
+ from websockets.asyncio.client import connect
+
+ host = "generativelanguage.googleapis.com"
+ model = "gemini-2.0-flash-exp"
+ api_key = os.environ["GOOGLE_API_KEY"]
+ uri = f"wss://{host}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={api_key}"
+
+ class AudioLoop:
+     def __init__(self):
+         self.ws = None
+         # Queue for messages to be sent *to* Gemini
+         self.out_queue = asyncio.Queue()
+         # Queue for PCM audio received *from* Gemini
+         self.audio_in_queue = asyncio.Queue()
+
+     async def startup(self, tools=None):
+         """Send the model setup message to Gemini.
+
+         Args:
+             tools: Optional list of tools to enable for the model.
+         """
+         setup_msg = {"setup": {"model": f"models/{model}"}}
+         if tools:
+             setup_msg["setup"]["tools"] = tools
+
+         await self.ws.send(json.dumps(setup_msg))
+
+         raw_response = await self.ws.recv()
+         setup_response = json.loads(raw_response)
+         print("[AudioLoop] Setup response from Gemini:", setup_response)
+
+     async def send_realtime(self):
+         """Read from out_queue and forward those messages to Gemini in real time."""
+         while True:
+             msg = await self.out_queue.get()
+             await self.ws.send(json.dumps(msg))
+
+     async def receive_audio(self):
+         """Read from the Gemini websocket and push PCM data into audio_in_queue."""
+         async for raw_response in self.ws:
+             response = json.loads(raw_response)
+             # Debug log all responses (optional)
+             # print("Gemini raw response:", response)
+
+             # Check if there's inline PCM data
+             try:
+                 b64data = (
+                     response["serverContent"]["modelTurn"]["parts"][0]["inlineData"]["data"]
+                 )
+                 pcm_data = base64.b64decode(b64data)
+                 await self.audio_in_queue.put(pcm_data)
+             except KeyError:
+                 # No audio in this message
+                 pass
+
+             tool_call = response.pop('toolCall', None)
+             if tool_call is not None:
+                 await self.handle_tool_call(tool_call)
+
+             # If "turnComplete" is present
+             if "serverContent" in response and response["serverContent"].get("turnComplete"):
+                 print("[AudioLoop] Gemini turn complete")
+
+     async def handle_tool_call(self, tool_call):
+         # Stub handler: acknowledge every function call with a generic "ok".
+         print(" ", tool_call)
+         for fc in tool_call['functionCalls']:
+             msg = {
+                 'tool_response': {
+                     'function_responses': [{
+                         'id': fc['id'],
+                         'name': fc['name'],
+                         'response': {'result': {'string_value': 'ok'}}
+                     }]
+                 }
+             }
+             print('>>> ', msg)
+             await self.ws.send(json.dumps(msg))
+
+     async def run(self):
+         """Main entry point: connects to Gemini and starts the send/receive tasks."""
+         try:
+             turn_on_the_lights_schema = {'name': 'turn_on_the_lights'}
+             turn_off_the_lights_schema = {'name': 'turn_off_the_lights'}
+             tools = [
+                 {'google_search': {}},
+                 {'function_declarations': [turn_on_the_lights_schema, turn_off_the_lights_schema]},
+                 {'code_execution': {}},
+             ]
+             async with connect(uri, additional_headers={"Content-Type": "application/json"}) as ws:
+                 self.ws = ws
+                 await self.startup(tools)
+
+                 async with asyncio.TaskGroup() as tg:
+                     tg.create_task(self.send_realtime())
+                     tg.create_task(self.receive_audio())
+
+                     # Keep running until canceled
+                     await asyncio.Future()
+
+         except asyncio.CancelledError:
+             pass
+         except Exception:
+             traceback.print_exc()
+             raise
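
The tool-calling round trip in handler.py is easiest to see as data. A sketch of one exchange, with field names exactly as the handler reads and writes them (the `id` value is illustrative; any server-sent arguments are ignored by this stub, which acknowledges every call with "ok"):

```
# Server -> client when the model invokes a declared function (sketch):
tool_call = {
    "functionCalls": [
        {"id": "<call id>", "name": "turn_on_the_lights"}
    ]
}

# Client -> server reply built by handle_tool_call() for each call:
tool_response = {
    "tool_response": {
        "function_responses": [
            {
                "id": "<call id>",
                "name": "turn_on_the_lights",
                "response": {"result": {"string_value": "ok"}},
            }
        ]
    }
}
```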
index copy.html ADDED
@@ -0,0 +1,202 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+   <meta charset="UTF-8" />
+   <title>Raw PCM Capture Demo</title>
+ </head>
+ <body>
+
+   <h1>Capture Raw PCM via ScriptProcessorNode</h1>
+
+   <p>
+     <button onclick="connectWebSocket()">Connect WebSocket</button>
+     <button onclick="startCapture()">Start Raw PCM</button>
+     <button onclick="stopCapture()">Stop Raw PCM</button>
+   </p>
+
+   <p>
+     <input type="text" id="textMessage" placeholder="Type your message here" />
+     <button onclick="sendText()">Send Text</button>
+   </p>
+
+   <pre id="log" style="background:#f0f0f0;padding:1em;"></pre>
+
+   <script>
+     let socket;
+     let playbackCtx = null;
+     let nextPlaybackTime = 0;
+     let audioCtx;
+     let scriptNode;
+     let micStream;
+     let isCapturing = false;
+
+     function logMessage(...args) {
+       const pre = document.getElementById("log");
+       pre.textContent += args.join(" ") + "\n";
+       console.log(...args);
+     }
+
+     function connectWebSocket() {
+       logMessage("[WebSocket] Connecting...");
+
+       // Adjust port/host if your FastAPI server is elsewhere
+       socket = new WebSocket("ws://localhost:8000/ws");
+
+       socket.onopen = () => {
+         logMessage("[WebSocket] Opened connection");
+         if (!playbackCtx) {
+           playbackCtx = new (window.AudioContext || window.webkitAudioContext)();
+         }
+         nextPlaybackTime = playbackCtx.currentTime;
+       };
+
+       socket.onerror = (err) => {
+         logMessage("[WebSocket] Error:", err);
+       };
+
+       socket.onclose = () => {
+         logMessage("[WebSocket] Closed");
+       };
+
+       socket.onmessage = (event) => {
+         try {
+           const data = JSON.parse(event.data);
+           if (data.type === "audio" && data.payload) {
+             const arrayBuffer = base64ToArrayBuffer(data.payload);
+             const int16View = new Int16Array(arrayBuffer);
+             const float32Buffer = new Float32Array(int16View.length);
+             for (let i = 0; i < int16View.length; i++) {
+               float32Buffer[i] = int16View[i] / 32768;
+             }
+             const sampleRate = 24000; // RECEIVE_SAMPLE_RATE from app.py
+             const audioBuffer = playbackCtx.createBuffer(1, float32Buffer.length, sampleRate);
+             audioBuffer.copyToChannel(float32Buffer, 0);
+             let scheduledTime = playbackCtx.currentTime > nextPlaybackTime ? playbackCtx.currentTime : nextPlaybackTime;
+             const source = playbackCtx.createBufferSource();
+             source.buffer = audioBuffer;
+             source.connect(playbackCtx.destination);
+             source.start(scheduledTime);
+             nextPlaybackTime = scheduledTime + audioBuffer.duration;
+             logMessage("[Audio] Scheduled playback. Start time:", scheduledTime, "Duration:", audioBuffer.duration);
+           } else if (data.type === "text" && data.content) {
+             logMessage("[Text] Received:", data.content);
+           } else {
+             logMessage("[WebSocket] Received:", event.data);
+           }
+         } catch (err) {
+           logMessage("[WebSocket] Error processing message:", err);
+         }
+       };
+     }
+
+     async function startCapture() {
+       if (!socket || socket.readyState !== WebSocket.OPEN) {
+         logMessage("WebSocket not connected. Click 'Connect WebSocket' first.");
+         return;
+       }
+       if (isCapturing) {
+         logMessage("Already capturing!");
+         return;
+       }
+       isCapturing = true;
+       logMessage("Starting microphone capture as raw PCM...");
+
+       try {
+         micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
+         audioCtx = new (window.AudioContext || window.webkitAudioContext)();
+
+         // Create a media source from the mic stream
+         const source = audioCtx.createMediaStreamSource(micStream);
+
+         // Create a ScriptProcessorNode
+         const bufferSize = 4096; // You can adjust this
+         const inputChannels = 1;
+         const outputChannels = 1;
+         scriptNode = audioCtx.createScriptProcessor(bufferSize, inputChannels, outputChannels);
+
+         scriptNode.onaudioprocess = (audioEvent) => {
+           if (!isCapturing) return;
+
+           // Get raw floating-point samples [ -1.0 .. +1.0 ]
+           const inputBuffer = audioEvent.inputBuffer.getChannelData(0);
+           // Convert float samples to 16-bit signed
+           const pcm16 = floatTo16BitPCM(inputBuffer);
+
+           // Encode as base64 and send over WebSocket
+           const bytes = new Uint8Array(pcm16.buffer);
+           const b64 = btoa(String.fromCharCode(...bytes));
+           socket.send(JSON.stringify({
+             type: "audio",
+             payload: b64
+           }));
+         };
+
+         // Connect the pipeline: mic -> script -> (optional) audioCtx.destination
+         source.connect(scriptNode);
+         scriptNode.connect(audioCtx.destination);
+
+         logMessage("Recording...");
+       } catch (err) {
+         logMessage("Error getting user mic:", err);
+       }
+     }
+
+     function stopCapture() {
+       if (!isCapturing) return;
+       isCapturing = false;
+       logMessage("Stopped microphone capture.");
+
+       if (scriptNode) {
+         scriptNode.disconnect();
+         scriptNode.onaudioprocess = null;
+         scriptNode = null;
+       }
+       if (micStream) {
+         // Stop all tracks
+         micStream.getTracks().forEach(track => track.stop());
+         micStream = null;
+       }
+       if (audioCtx) {
+         audioCtx.close();
+         audioCtx = null;
+       }
+     }
+
+     function floatTo16BitPCM(floatSamples) {
+       // Convert an array of floats [-1, 1] to an Int16Array
+       const out = new Int16Array(floatSamples.length);
+       for (let i = 0; i < floatSamples.length; i++) {
+         let s = Math.max(-1, Math.min(1, floatSamples[i]));
+         // scale range
+         s = s < 0 ? s * 0x8000 : s * 0x7FFF;
+         out[i] = s;
+       }
+       return out;
+     }
+
+     function sendText() {
+       const textInput = document.getElementById("textMessage");
+       const text = textInput.value.trim();
+       if (text && socket && socket.readyState === WebSocket.OPEN) {
+         socket.send(JSON.stringify({ type: "text", content: text }));
+         logMessage("[Text] Sent:", text);
+         textInput.value = "";
+       } else {
+         logMessage("WebSocket not connected or text is empty.");
+       }
+     }
+
+     function base64ToArrayBuffer(b64) {
+       const binaryString = window.atob(b64);
+       const len = binaryString.length;
+       const bytes = new Uint8Array(len);
+       for (let i = 0; i < len; i++) {
+         bytes[i] = binaryString.charCodeAt(i);
+       }
+       return bytes.buffer;
+     }
+
+   </script>
+
+ </body>
+ </html>
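
The float-to-PCM conversion above is worth a second look: negative samples are scaled by 0x8000 (32768) and positive ones by 0x7FFF (32767), so both endpoints land exactly on the int16 range. A quick Python mirror of `floatTo16BitPCM` for sanity-checking (a verification sketch, not part of the commit):

```
def float_to_16bit_pcm(samples):
    """Mirror of floatTo16BitPCM from the demo page."""
    out = []
    for s in samples:
        s = max(-1.0, min(1.0, s))                # clamp to [-1, 1]
        s = s * 0x8000 if s < 0 else s * 0x7FFF   # -1 -> -32768, +1 -> 32767
        out.append(int(s))
    return out

assert float_to_16bit_pcm([-1.0, 0.0, 1.0]) == [-32768, 0, 32767]
```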
index.html ADDED
@@ -0,0 +1,550 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+   <meta charset="UTF-8" />
+   <title>Gemini Live Chat - Voice and Text Interaction</title>
+   <style>
+     body {
+       max-width: 800px;
+       margin: 2em auto;
+       padding: 0 1em;
+       font-family: system-ui, -apple-system, sans-serif;
+     }
+     #visualizer {
+       width: 100%;
+       height: 80px;
+       background: #f0f0f0;
+       border-radius: 4px;
+       margin: 0;
+     }
+     #log {
+       background: #f0f0f0;
+       padding: 1em;
+       border-radius: 4px;
+       font-family: monospace;
+       max-height: 400px;
+       overflow-y: auto;
+     }
+     .controls {
+       margin: 1em 0;
+       padding: 1em;
+       background: #f8f8f8;
+       border-radius: 4px;
+     }
+     .function-card {
+       padding: 0.8em;
+       background: white;
+       border-radius: 4px;
+       box-shadow: 0 1px 3px rgba(0,0,0,0.1);
+     }
+     .function-card strong {
+       color: #1976d2;
+     }
+     .function-card ul {
+       color: #555;
+     }
+     button {
+       border: none;
+       padding: 0.5em 1em;
+       border-radius: 3px;
+       cursor: pointer;
+       transition: opacity 0.2s;
+     }
+     button:hover {
+       opacity: 0.9;
+     }
+     #connectButton {
+       background: #2196f3;
+       color: white;
+     }
+     .voice-start {
+       background: #4caf50;
+       color: white;
+     }
+     .voice-stop {
+       background: #f44336;
+       color: white;
+     }
+   </style>
+ </head>
+ <body>
+
+   <h1>Gemini Live Chat</h1>
+   <p>Interactive voice and text chat powered by Gemini AI, with support for server-side function calling, code execution, and Google Search.</p>
+
+   <div class="controls" style="background: #e3f2fd;">
+     <h3 style="margin-top: 0;">Available Functions:</h3>
+     <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 1em;">
+       <div class="function-card">
+         <strong>💡 Light Control</strong>
+         <ul style="margin: 0.5em 0; padding-left: 1.5em;">
+           <li>Turn lights on</li>
+           <li>Turn lights off</li>
+         </ul>
+       </div>
+       <div class="function-card">
+         <strong>🔍 Search</strong>
+         <ul style="margin: 0.5em 0; padding-left: 1.5em;">
+           <li>Google search</li>
+         </ul>
+       </div>
+       <div class="function-card">
+         <strong>💻 Code</strong>
+         <ul style="margin: 0.5em 0; padding-left: 1.5em;">
+           <li>Execute code</li>
+           <li>Run commands</li>
+         </ul>
+       </div>
+     </div>
+     <p style="margin: 0.5em 0 0 0; font-size: 0.9em; color: #666;">
+       Try saying: "Turn on the lights" or "Search for weather in London", or ask any question!
+     </p>
+   </div>
+
+   <div class="controls">
+     <div style="display: flex; align-items: center; justify-content: space-between; gap: 1em;">
+       <div style="display: flex; align-items: center; gap: 1em;">
+         <div>
+           <span style="font-weight: 500; color: #666;">Server:</span>
+           <span id="connectionStatus" style="display: inline-block; padding: 0.3em 0.6em; margin-left: 0.5em; border-radius: 3px; background: #f44336; color: white;">Not connected</span>
+         </div>
+         <div id="micStatus" style="display: none;">
+           <span style="font-weight: 500; color: #666;">Voice:</span>
+           <span style="display: inline-block; padding: 0.3em 0.6em; margin-left: 0.5em; border-radius: 3px; background: #4caf50; color: white;">Recording</span>
+         </div>
+       </div>
+       <div style="display: flex; gap: 0.5em;">
+         <button id="connectButton" onclick="toggleConnection()">
+           <span style="margin-right: 0.3em;">🔌</span> Connect to Server
+         </button>
+         <button id="voiceStartButton" class="voice-start" onclick="startCapture()">
+           <span style="margin-right: 0.3em;">🎤</span> Start Voice Chat
+         </button>
+         <button id="voiceStopButton" class="voice-stop" onclick="stopCapture()" style="display: none;">
+           <span style="margin-right: 0.3em;">⏹️</span> Stop Voice Chat
+         </button>
+       </div>
+     </div>
+     <div style="margin-top: 1em; display: flex; gap: 0.5em;">
+       <input type="text" id="textMessage" placeholder="Type your message here" style="flex: 1; padding: 0.5em; border: 1px solid #ddd; border-radius: 3px;" />
+       <button onclick="sendText()" style="white-space: nowrap;">
+         <span style="margin-right: 0.3em;">📤</span> Send
+       </button>
+     </div>
+   </div>
+
+   <div class="controls">
+     <canvas id="visualizer"></canvas>
+   </div>
+
+   <div style="margin: 1em 0;">
+     <strong>Log Settings:</strong><br>
+     <label><input type="checkbox" id="logWebSocket" checked> WebSocket Events</label>
+     <label style="margin-left: 1em"><input type="checkbox" id="logAudio" checked> Audio Events</label>
+     <label style="margin-left: 1em"><input type="checkbox" id="logText" checked> Text Events</label>
+     <label style="margin-left: 1em"><input type="checkbox" id="logError" checked> Error Events</label>
+   </div>
+
+   <pre id="log"></pre>
+
+   <script>
+     let socket;
+     let playbackCtx = null;
+     let nextPlaybackTime = 0;
+     let audioCtx;
+     let scriptNode;
+     let micStream;
+     let isCapturing = false;
+     let audioSeq = 0;
+     let scheduledSources = []; // Track scheduled audio sources
+     let analyser;
+     let visualizerCanvas;
+     let visualizerCtx;
+     let animationFrame;
+
+     function updateConnectionStatus(connected) {
+       const statusEl = document.getElementById('connectionStatus');
+       const connectButton = document.getElementById('connectButton');
+       const voiceStartButton = document.getElementById('voiceStartButton');
+
+       if (connected) {
+         statusEl.textContent = 'Connected';
+         statusEl.style.background = '#4caf50';
+         connectButton.textContent = '🔌 Disconnect Server';
+         voiceStartButton.style.display = '';
+       } else {
+         statusEl.textContent = 'Not connected';
+         statusEl.style.background = '#f44336';
+         connectButton.textContent = '🔌 Connect to Server';
+         voiceStartButton.style.display = 'none';
+         // Also stop recording if we're disconnected
+         if (isCapturing) {
+           stopCapture();
+         }
+       }
+     }
+
+     function updateMicStatus(recording) {
+       const micStatus = document.getElementById('micStatus');
+       const voiceStartButton = document.getElementById('voiceStartButton');
+       const voiceStopButton = document.getElementById('voiceStopButton');
+
+       if (recording) {
+         micStatus.style.display = '';
+         voiceStartButton.style.display = 'none';
+         voiceStopButton.style.display = '';
+       } else {
+         micStatus.style.display = 'none';
+         voiceStartButton.style.display = '';
+         voiceStopButton.style.display = 'none';
+       }
+     }
+
+     function toggleConnection() {
+       if (socket && socket.readyState === WebSocket.OPEN) {
+         socket.close();
+       } else {
+         connectWebSocket();
+       }
+     }
+
+     function logMessage(category, ...args) {
+       const pre = document.getElementById("log");
+       const logCategory = document.getElementById(`log${category.charAt(0).toUpperCase() + category.slice(1)}`);
+       const shouldLog = logCategory ? logCategory.checked : false;
+
+       if (shouldLog) {
+         const timestamp = new Date().toLocaleTimeString();
+         pre.textContent += `[${timestamp}] [${category}] ` + args.join(" ") + "\n";
+         console.log(`[${category}]`, ...args);
+       }
+     }
+
+     function clearScheduledAudio() {
+       // Stop and disconnect all scheduled audio sources
+       while (scheduledSources.length > 0) {
+         const source = scheduledSources.pop();
+         try {
+           source.stop();
+           source.disconnect();
+         } catch (err) {
+           // Ignore errors if source already finished playing
+         }
+       }
+       // Reset next playback time
+       if (playbackCtx) {
+         nextPlaybackTime = playbackCtx.currentTime;
+       }
+       logMessage("Audio", "Cleared all scheduled audio");
+     }
+
+     function setupVisualizer() {
+       visualizerCanvas = document.getElementById('visualizer');
+       visualizerCtx = visualizerCanvas.getContext('2d');
+
+       // Make canvas resolution match display size
+       const rect = visualizerCanvas.getBoundingClientRect();
+       visualizerCanvas.width = rect.width;
+       visualizerCanvas.height = rect.height;
+
+       if (!analyser && playbackCtx) {
+         analyser = playbackCtx.createAnalyser();
+         analyser.fftSize = 2048;
+       }
+     }
+
+     function drawVisualizer() {
+       if (!analyser) return;
+
+       const bufferLength = analyser.frequencyBinCount;
+       const dataArray = new Uint8Array(bufferLength);
+       analyser.getByteTimeDomainData(dataArray);
+
+       visualizerCtx.fillStyle = '#f0f0f0';
+       visualizerCtx.fillRect(0, 0, visualizerCanvas.width, visualizerCanvas.height);
+
+       visualizerCtx.lineWidth = 2;
+       visualizerCtx.strokeStyle = '#4caf50';
+       visualizerCtx.beginPath();
+
+       const sliceWidth = visualizerCanvas.width / bufferLength;
+       let x = 0;
+
+       for (let i = 0; i < bufferLength; i++) {
+         const v = dataArray[i] / 128.0;
+         const y = v * visualizerCanvas.height / 2;
+
+         if (i === 0) {
+           visualizerCtx.moveTo(x, y);
+         } else {
+           visualizerCtx.lineTo(x, y);
+         }
+
+         x += sliceWidth;
+       }
+
+       visualizerCtx.lineTo(visualizerCanvas.width, visualizerCanvas.height / 2);
+       visualizerCtx.stroke();
+
+       animationFrame = requestAnimationFrame(drawVisualizer);
+     }
+
+     function stopVisualizer() {
+       if (animationFrame) {
+         cancelAnimationFrame(animationFrame);
+         animationFrame = null;
+       }
+       if (visualizerCtx) {
+         visualizerCtx.fillStyle = '#f0f0f0';
+         visualizerCtx.fillRect(0, 0, visualizerCanvas.width, visualizerCanvas.height);
+       }
+     }
+
+     function connectWebSocket() {
+       logMessage("WebSocket", "Connecting...");
+       updateConnectionStatus(false);
+
+       // Use current origin and replace http(s) with ws(s)
+       const wsUrl = `${window.location.protocol === 'https:' ? 'wss:' : 'ws:'}//${window.location.host}/ws`;
+       socket = new WebSocket(wsUrl);
+
+       socket.onopen = () => {
+         logMessage("WebSocket", "Opened connection");
+         updateConnectionStatus(true);
+         if (!playbackCtx) {
+           playbackCtx = new (window.AudioContext || window.webkitAudioContext)();
+           setupVisualizer();
+         }
+         nextPlaybackTime = playbackCtx.currentTime;
+       };
+
+       socket.onerror = (err) => {
+         logMessage("Error", "WebSocket error:", err);
+         updateConnectionStatus(false);
+       };
+
+       socket.onclose = () => {
+         logMessage("WebSocket", "Connection closed");
+         updateConnectionStatus(false);
+         if (isCapturing) {
+           stopCapture();
+         }
+       };
+
+       socket.onmessage = (event) => {
+         try {
+           const data = JSON.parse(event.data);
+           if (data.type === "audio" && data.payload) {
+             const arrayBuffer = base64ToArrayBuffer(data.payload);
+             const int16View = new Int16Array(arrayBuffer);
+             const float32Buffer = new Float32Array(int16View.length);
+             for (let i = 0; i < int16View.length; i++) {
+               float32Buffer[i] = int16View[i] / 32768;
+             }
+             const sampleRate = 24000; // RECEIVE_SAMPLE_RATE from app.py
+             const audioBuffer = playbackCtx.createBuffer(1, float32Buffer.length, sampleRate);
+             audioBuffer.copyToChannel(float32Buffer, 0);
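+             // Schedule this chunk at max(now, end of the previously queued
+             // chunk) so consecutive buffers play back-to-back without gaps
+             // or overlap.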
+             let scheduledTime = playbackCtx.currentTime > nextPlaybackTime ? playbackCtx.currentTime : nextPlaybackTime;
+             const source = playbackCtx.createBufferSource();
+             source.buffer = audioBuffer;
+
+             // Connect through analyser for visualization
+             if (analyser) {
+               source.connect(analyser);
+               analyser.connect(playbackCtx.destination);
+               if (!animationFrame) {
+                 drawVisualizer();
+               }
+             } else {
+               source.connect(playbackCtx.destination);
+             }
+
+             source.start(scheduledTime);
+             // Add source to tracked sources
+             scheduledSources.push(source);
+             // Remove source from tracking once it finishes
+             source.onended = () => {
+               const index = scheduledSources.indexOf(source);
+               if (index > -1) {
+                 scheduledSources.splice(index, 1);
+               }
+               // Stop visualizer if no more audio
+               if (scheduledSources.length === 0) {
+                 stopVisualizer();
+               }
+             };
+             nextPlaybackTime = scheduledTime + audioBuffer.duration;
+             logMessage("Audio", "Scheduled playback. Start time:", scheduledTime, "Duration:", audioBuffer.duration);
+           } else if (data.type === "text" && data.content) {
+             logMessage("Text", "Received:", data.content);
+           } else {
+             logMessage("WebSocket", "Received message:", event.data);
+           }
+         } catch (err) {
+           logMessage("Error", "Failed to process message:", err);
+         }
+       };
+     }
+
+     async function startCapture() {
+       if (!socket || socket.readyState !== WebSocket.OPEN) {
+         logMessage("WebSocket", "Not connected. Click 'Connect to Server' first.");
+         return;
+       }
+       if (isCapturing) {
+         logMessage("Audio", "Already capturing!");
+         return;
+       }
+
+       isCapturing = true;
+       updateMicStatus(true);
+       logMessage("Audio", "Starting microphone capture...");
+
+       try {
+         micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
+         logMessage("Audio", "Got microphone access");
+         audioCtx = new (window.AudioContext || window.webkitAudioContext)();
+         logMessage("Audio", "Created AudioContext with sample rate:", audioCtx.sampleRate);
+
+         // Create a media source from the mic stream
+         const source = audioCtx.createMediaStreamSource(micStream);
+         logMessage("Audio", "Created MediaStreamSource");
+
+         // Create a ScriptProcessorNode
+         const bufferSize = 4096; // You can adjust this
+         const inputChannels = 1;
+         const outputChannels = 1;
+         scriptNode = audioCtx.createScriptProcessor(bufferSize, inputChannels, outputChannels);
+         logMessage("Audio", "Created ScriptProcessorNode with buffer size:", bufferSize);
+
+         scriptNode.onaudioprocess = (audioEvent) => {
+           if (!isCapturing) return;
+
+           // Get raw samples and resample to 16 kHz
+           const inputBuffer = audioEvent.inputBuffer.getChannelData(0);
+
+           // Check if there's actual audio input (not just silence)
+           const hasAudio = inputBuffer.some(sample => Math.abs(sample) > 0.01); // Threshold for noise
+           if (hasAudio) {
+             clearScheduledAudio(); // Only clear when we detect actual audio input
+           }
+
+           const resampled = resampleAudio(inputBuffer, audioCtx.sampleRate, 16000);
+
+           // Convert resampled audio to 16-bit PCM
+           const pcm16 = floatTo16BitPCM(resampled);
+
+           // Encode as base64 and send over WebSocket
+           const bytes = new Uint8Array(pcm16.buffer);
+           const b64 = btoa(String.fromCharCode(...bytes));
+           const audioMsg = {
+             type: "audio",
+             payload: b64,
+             seq: audioSeq++,
+             config: {
+               sampleRate: 16000,
+               bitDepth: 16,
+               channels: 1
+             }
+           };
+           logMessage("Audio", "Processing chunk. Seq:", audioMsg.seq);
+           try {
+             if (socket.readyState === WebSocket.OPEN) {
+               socket.send(JSON.stringify(audioMsg));
+             } else {
+               logMessage("WebSocket", "Not open, stopping capture");
+               stopCapture();
+             }
+           } catch (err) {
+             logMessage("Error", "Failed to send audio:", err);
+             stopCapture();
+           }
+         };
+
+         // Connect the pipeline: mic -> script -> (optional) audioCtx.destination
+         source.connect(scriptNode);
+         logMessage("Audio", "Connected audio pipeline");
+
+         logMessage("Audio", "Recording...");
+       } catch (err) {
+         logMessage("Error", "Failed to get microphone access:", err);
+         isCapturing = false;
+       }
+     }
+
+     function stopCapture() {
+       if (!isCapturing) return;
+       isCapturing = false;
+       updateMicStatus(false);
+       logMessage("Audio", "Stopped microphone capture");
+
+       if (scriptNode) {
+         scriptNode.disconnect();
+         scriptNode.onaudioprocess = null;
+         scriptNode = null;
+       }
+       if (micStream) {
+         // Stop all tracks
+         micStream.getTracks().forEach(track => track.stop());
+         micStream = null;
+       }
+       if (audioCtx) {
+         audioCtx.close();
+         audioCtx = null;
+       }
+     }
+
+     function floatTo16BitPCM(floatSamples) {
+       // Convert an array of floats [-1, 1] to an Int16Array
+       const out = new Int16Array(floatSamples.length);
+       for (let i = 0; i < floatSamples.length; i++) {
+         let s = Math.max(-1, Math.min(1, floatSamples[i]));
+         // scale range
+         s = s < 0 ? s * 0x8000 : s * 0x7FFF;
+         out[i] = s;
+       }
+       return out;
+     }
+
+     function resampleAudio(inputBuffer, fromRate, toRate) {
+       // Nearest-neighbor resampling; crude, but adequate for speech input.
+       const ratio = toRate / fromRate;
+       const newLength = Math.round(inputBuffer.length * ratio);
+       const resampled = new Float32Array(newLength);
+
+       for (let i = 0; i < newLength; i++) {
+         const index = Math.round(i / ratio);
+         resampled[i] = inputBuffer[Math.min(index, inputBuffer.length - 1)];
+       }
+       return resampled;
+     }
+
+     function sendText() {
+       const textInput = document.getElementById("textMessage");
+       const text = textInput.value.trim();
+       if (text && socket && socket.readyState === WebSocket.OPEN) {
+         // Clear any scheduled audio before sending text
+         clearScheduledAudio();
+
+         socket.send(JSON.stringify({ type: "text", content: text }));
+         logMessage("Text", "Sent:", text);
+         textInput.value = "";
+       } else {
+         logMessage("WebSocket", "Not connected or text is empty");
+       }
+     }
+
+     function base64ToArrayBuffer(b64) {
+       const binaryString = window.atob(b64);
+       const len = binaryString.length;
+       const bytes = new Uint8Array(len);
+       for (let i = 0; i < len; i++) {
+         bytes[i] = binaryString.charCodeAt(i);
+       }
+       return bytes.buffer;
+     }
+
+   </script>
+
+ </body>
+ </html>
requirements.txt ADDED
@@ -0,0 +1,138 @@
+ aiofiles==23.2.1
+ aiohappyeyeballs==2.4.4
+ aiohttp==3.11.11
+ aiohttp-retry==2.9.1
+ aioice==0.9.0
+ aiortc==1.10.0
+ aiosignal==1.3.2
+ altair==5.5.0
+ annotated-types==0.7.0
+ anyio==4.8.0
+ attrs==24.3.0
+ av==13.1.0
+ blinker==1.9.0
+ cachetools==5.5.0
+ certifi==2024.8.30
+ cffi==1.17.1
+ charset-normalizer==3.4.0
+ click==8.1.7
+ colorama==0.4.6
+ contourpy==1.3.1
+ cryptography==44.0.0
+ cycler==0.12.1
+ dnspython==2.7.0
+ exceptiongroup==1.2.0
+ fastapi==0.115.8
+ ffmpeg-python==0.2.0
+ ffmpy==0.5.0
+ filelock==3.17.0
+ fonttools==4.55.8
+ frozenlist==1.5.0
+ fsspec==2024.12.0
+ future==1.0.0
+ gitdb==4.0.11
+ GitPython==3.1.43
+ google-ai-generativelanguage==0.6.15
+ google-api-core==2.24.1
+ google-api-python-client==2.160.0
+ google-auth==2.37.0
+ google-auth-httplib2==0.2.0
+ google-crc32c==1.6.0
+ google-genai==0.2.2
+ google-generativeai==0.8.4
+ googleapis-common-protos==1.66.0
+ gradio==5.14.0
+ gradio_client==1.7.0
+ gradio_webrtc==0.0.30
+ grpcio==1.70.0
+ grpcio-status==1.70.0
+ h11==0.14.0
+ httpcore==1.0.7
+ httplib2==0.22.0
+ httpx==0.28.1
+ huggingface-hub==0.28.1
+ idna==3.10
+ ifaddr==0.2.0
+ importlib_resources==6.5.2
+ Jinja2==3.1.4
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ keyboard==0.13.5
+ kiwisolver==1.4.8
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.10.0
+ mdurl==0.1.2
+ MouseInfo==0.1.3
+ mss==10.0.0
+ multidict==6.1.0
+ narwhals==1.19.0
+ numpy==1.26.4
+ opencv-python==4.10.0.84
+ orjson==3.10.15
+ packaging==24.2
+ pandas==2.2.3
+ pillow==10.4.0
+ plotly==5.24.1
+ propcache==0.2.1
+ proto-plus==1.26.0
+ protobuf==5.29.2
+ pyarrow==18.1.0
+ pyasn1==0.6.1
+ pyasn1_modules==0.4.1
+ PyAudio==0.2.14
+ PyAutoGUI==0.9.54
+ pycparser==2.22
+ pydantic==2.10.3
+ pydantic_core==2.27.1
+ pydeck==0.9.1
+ pydub==0.25.1
+ pyee==12.1.1
+ PyGetWindow==0.0.9
+ Pygments==2.18.0
+ PyJWT==2.10.1
+ pylibsrtp==0.10.0
+ PyMsgBox==1.0.9
+ pyngrok==7.2.2
+ pyOpenSSL==25.0.0
+ pyparsing==3.2.1
+ pyperclip==1.9.0
+ PyRect==0.2.0
+ PyScreeze==1.0.1
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.0.1
+ python-multipart==0.0.20
+ pytweening==1.2.0
+ pytz==2024.2
+ PyYAML==6.0.2
+ referencing==0.35.1
+ requests==2.32.3
+ rich==13.9.4
+ rpds-py==0.22.3
+ rsa==4.9
+ ruff==0.9.4
+ safehttpx==0.1.6
+ semantic-version==2.10.0
+ setuptools==75.8.0
+ shellingham==1.5.4
+ simpleaudio==1.0.4
+ six==1.17.0
+ smmap==5.0.1
+ sniffio==1.3.1
+ starlette==0.45.3
+ streamlit==1.41.1
+ tenacity==9.0.0
+ toml==0.10.2
+ tomlkit==0.13.2
+ tornado==6.4.2
+ tqdm==4.67.1
+ twilio==9.4.4
+ typer==0.15.1
+ typing_extensions==4.12.2
+ tzdata==2024.2
+ uritemplate==4.1.1
+ urllib3==2.2.3
+ uvicorn==0.34.0
+ watchdog==6.0.0
+ websockets==14.2
+ yarl==1.18.3
webapp.py ADDED
@@ -0,0 +1,144 @@
+ # webapp.py
+
+ import asyncio
+ import base64
+ import json
+ import os
+
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
+ from fastapi.responses import HTMLResponse
+ from fastapi.staticfiles import StaticFiles
+ import uvicorn
+
+ from handler import AudioLoop  # The Gemini client defined in handler.py
+
+ app = FastAPI()
+
+ # Serve the files in this directory as static assets under /web_ui
+ current_dir = os.path.dirname(os.path.realpath(__file__))
+ app.mount("/web_ui", StaticFiles(directory=current_dir), name="web_ui")
+
+ @app.get("/")
+ async def get_index():
+     # Read and return the index.html file
+     index_path = os.path.join(current_dir, "index.html")
+     with open(index_path, "r", encoding="utf-8") as f:
+         html_content = f.read()
+     return HTMLResponse(content=html_content)
+
+ @app.websocket("/ws")
+ async def websocket_endpoint(websocket: WebSocket):
+     await websocket.accept()
+     print("[websocket_endpoint] Client connected.")
+
+     # Create a new AudioLoop instance for this client
+     audio_loop = AudioLoop()
+     audio_ordering_buffer = {}
+     expected_audio_seq = 0
+
+     # Start the AudioLoop for this client
+     loop_task = asyncio.create_task(audio_loop.run())
+     print("[websocket_endpoint] Started new AudioLoop for client")
+
+     async def from_client_to_gemini():
+         """Handles incoming messages from the client and forwards them to Gemini."""
+         nonlocal audio_ordering_buffer, expected_audio_seq
+         try:
+             while True:
+                 data = await websocket.receive_text()
+                 msg = json.loads(data)
+                 msg_type = msg.get("type")
+
+                 # print("[from_client_to_gemini] Received message from client:", msg)
+
+                 # Handle audio data from client
+                 if msg_type == "audio":
+                     raw_pcm = base64.b64decode(msg["payload"])
+                     forward_msg = {
+                         "realtime_input": {
+                             "media_chunks": [
+                                 {
+                                     "data": base64.b64encode(raw_pcm).decode(),
+                                     "mime_type": "audio/pcm"
+                                 }
+                             ]
+                         }
+                     }
+                     # Retrieve the sequence number from the message
+                     seq = msg.get("seq")
+                     if seq is not None:
+                         # Store the message in the buffer
+                         audio_ordering_buffer[seq] = forward_msg
+                         # Forward any messages in order
+                         while expected_audio_seq in audio_ordering_buffer:
+                             msg_to_forward = audio_ordering_buffer.pop(expected_audio_seq)
+                             await audio_loop.out_queue.put(msg_to_forward)
+                             expected_audio_seq += 1
+                     else:
+                         # If no sequence number is provided, forward immediately
+                         await audio_loop.out_queue.put(forward_msg)
+
+                 # Handle text data from client
+                 elif msg_type == "text":
+                     user_text = msg.get("content", "")
+                     print("[from_client_to_gemini] Forwarding user text to Gemini:", user_text)
+                     forward_msg = {
+                         "client_content": {
+                             "turn_complete": True,
+                             "turns": [
+                                 {
+                                     "role": "user",
+                                     "parts": [
+                                         {"text": user_text}
+                                     ]
+                                 }
+                             ]
+                         }
+                     }
+                     await audio_loop.out_queue.put(forward_msg)
+
+                 else:
+                     print("[from_client_to_gemini] Unknown message type:", msg_type)
+
+         except WebSocketDisconnect:
+             print("[from_client_to_gemini] Client disconnected.")
+         except Exception as e:
+             print("[from_client_to_gemini] Error:", e)
+
+     async def from_gemini_to_client():
+         """Reads PCM audio from Gemini and sends it back to the client."""
+         try:
+             while True:
+                 pcm_data = await audio_loop.audio_in_queue.get()
+                 b64_pcm = base64.b64encode(pcm_data).decode()
+
+                 out_msg = {
+                     "type": "audio",
+                     "payload": b64_pcm
+                 }
+                 print("[from_gemini_to_client] Sending audio chunk to client. Size:", len(pcm_data))
+                 await websocket.send_text(json.dumps(out_msg))
+
+         except WebSocketDisconnect:
+             print("[from_gemini_to_client] Client disconnected.")
+         except Exception as e:
+             print("[from_gemini_to_client] Error:", e)
+
+     # Launch both tasks concurrently. If either fails or disconnects, we exit.
+     try:
+         await asyncio.gather(
+             from_client_to_gemini(),
+             from_gemini_to_client(),
+         )
+     finally:
+         print("[websocket_endpoint] WebSocket handler finished.")
+         # Clean up the AudioLoop when the client disconnects
+         loop_task.cancel()
+         try:
+             await loop_task
+         except asyncio.CancelledError:
+             pass
+         print("[websocket_endpoint] Cleaned up AudioLoop for client")
+
+ if __name__ == "__main__":
+     uvicorn.run("webapp:app", host="0.0.0.0", port=8000, reload=True)
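
The browser pages above are the intended clients, but the `/ws` protocol is simple enough to exercise from a script. A minimal sketch of a text-only client, assuming the server is running locally on port 8000 (it uses the `websockets` package already pinned in requirements.txt, and simply collects about five seconds of the 24 kHz, 16-bit mono reply audio):

```
import asyncio
import base64
import json

import websockets


async def main():
    # Connect to the FastAPI endpoint defined in webapp.py
    async with websockets.connect("ws://localhost:8000/ws") as ws:
        # Send one text turn using the {"type": "text", ...} envelope
        await ws.send(json.dumps({"type": "text", "content": "Hello, Gemini!"}))

        # The server replies with {"type": "audio", "payload": <base64 PCM>}
        # messages; accumulate roughly five seconds of raw PCM.
        pcm = b""
        while len(pcm) < 24000 * 2 * 5:
            reply = json.loads(await ws.recv())
            if reply.get("type") == "audio":
                pcm += base64.b64decode(reply["payload"])

        with open("reply.pcm", "wb") as f:
            f.write(pcm)


asyncio.run(main())
```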