Commit cc7c705
Nirav Madhani committed
Parent: a4541ab

First commit
Files changed:
- __pycache__/handler.cpython-312.pyc  +0 -0
- __pycache__/webapp.cpython-312.pyc   +0 -0
- app.py            +302 -0
- handler.py        +110 -0
- index copy.html   +202 -0
- index.html        +550 -0
- requirements.txt  +138 -0
- webapp.py         +144 -0
__pycache__/handler.cpython-312.pyc
ADDED
Binary file (6.18 kB)

__pycache__/webapp.cpython-312.pyc
ADDED
Binary file (6.64 kB)
app.py
ADDED
@@ -0,0 +1,302 @@
# -*- coding: utf-8 -*-
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
## Setup

To install the dependencies for this script, run:

```
pip install google-genai opencv-python pyaudio pillow mss
```

Before running this script, ensure the `GOOGLE_API_KEY` environment
variable is set to the API key you obtained from Google AI Studio.

Important: **Use headphones**. This script uses the system default audio
input and output, which often won't include echo cancellation. So to prevent
the model from interrupting itself it is important that you use headphones.

## Run

To run the script:

```
python live_api_starter.py
```

The script takes a video-mode flag `--mode`, which can be "camera", "screen",
or "none". The default is "none". To share your screen run:

```
python live_api_starter.py --mode screen
```
"""

import asyncio
import base64
import json
import io
import os
import sys
import traceback

import cv2
import pyaudio
import PIL.Image
import mss
import mss.tools  # needed for mss.tools.to_png() in _get_screen below
import argparse

from websockets.asyncio.client import connect

if sys.version_info < (3, 11, 0):
    import taskgroup, exceptiongroup

    asyncio.TaskGroup = taskgroup.TaskGroup
    asyncio.ExceptionGroup = exceptiongroup.ExceptionGroup

FORMAT = pyaudio.paInt16
CHANNELS = 1
SEND_SAMPLE_RATE = 16000
RECEIVE_SAMPLE_RATE = 24000
CHUNK_SIZE = 512

host = "generativelanguage.googleapis.com"
model = "gemini-2.0-flash-exp"
DEFAULT_MODE = "none"


api_key = os.environ["GOOGLE_API_KEY"]
uri = f"wss://{host}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={api_key}"


class AudioLoop:
    def __init__(self, video_mode=DEFAULT_MODE):
        self.video_mode = video_mode
        self.audio_in_queue = None
        self.out_queue = None

        self.ws = None
        self.audio_stream = None

    async def startup(self):
        setup_msg = {"setup": {"model": f"models/{model}"}}
        await self.ws.send(json.dumps(setup_msg))
        raw_response = await self.ws.recv(decode=False)
        setup_response = json.loads(raw_response.decode("ascii"))

    async def send_text(self):
        while True:
            text = await asyncio.to_thread(input, "message > ")
            if text.lower() == "q":
                break

            msg = {
                "client_content": {
                    "turn_complete": True,
                    "turns": [{"role": "user", "parts": [{"text": text}]}],
                }
            }
            await self.ws.send(json.dumps(msg))

    def _get_frame(self, cap):
        # Read the frame
        ret, frame = cap.read()
        # Check if the frame was read successfully
        if not ret:
            return None

        # Convert BGR to RGB color space: OpenCV captures in BGR but PIL
        # expects RGB. This prevents a blue tint in the video feed.
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        img = PIL.Image.fromarray(frame_rgb)
        img.thumbnail([1024, 1024])

        image_io = io.BytesIO()
        img.save(image_io, format="jpeg")
        image_io.seek(0)

        mime_type = "image/jpeg"
        image_bytes = image_io.read()
        return {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()}

    async def get_frames(self):
        # Opening the camera takes about a second and would block the whole
        # program (overflowing the audio pipeline) if it weren't run in a thread.
        cap = await asyncio.to_thread(
            cv2.VideoCapture, 0
        )  # 0 represents the default camera

        while True:
            frame = await asyncio.to_thread(self._get_frame, cap)
            if frame is None:
                break
            await asyncio.sleep(1.0)

            msg = {"realtime_input": {"media_chunks": [frame]}}
            await self.out_queue.put(msg)

        # Release the VideoCapture object
        cap.release()

    def _get_screen(self):
        sct = mss.mss()
        monitor = sct.monitors[0]

        i = sct.grab(monitor)
        mime_type = "image/jpeg"
        image_bytes = mss.tools.to_png(i.rgb, i.size)
        img = PIL.Image.open(io.BytesIO(image_bytes))

        image_io = io.BytesIO()
        img.save(image_io, format="jpeg")
        image_io.seek(0)

        image_bytes = image_io.read()
        return {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()}

    async def get_screen(self):
        while True:
            frame = await asyncio.to_thread(self._get_screen)
            if frame is None:
                break

            await asyncio.sleep(1.0)

            # Wrap the frame in a list, matching get_frames above.
            msg = {"realtime_input": {"media_chunks": [frame]}}
            await self.out_queue.put(msg)

    async def send_realtime(self):
        while True:
            msg = await self.out_queue.get()
            await self.ws.send(json.dumps(msg))

    async def listen_audio(self):
        pya = pyaudio.PyAudio()

        mic_info = pya.get_default_input_device_info()
        self.audio_stream = pya.open(
            format=FORMAT,
            channels=CHANNELS,
            rate=SEND_SAMPLE_RATE,
            input=True,
            input_device_index=mic_info["index"],
            frames_per_buffer=CHUNK_SIZE,
        )
        while True:
            data = await asyncio.to_thread(self.audio_stream.read, CHUNK_SIZE)
            msg = {
                "realtime_input": {
                    "media_chunks": [
                        {
                            "data": base64.b64encode(data).decode(),
                            "mime_type": "audio/pcm",
                        }
                    ]
                }
            }
            await self.out_queue.put(msg)

    async def receive_audio(self):
        "Background task that reads from the websocket and writes PCM chunks to the output queue."
        async for raw_response in self.ws:
            # Other things could be returned here, but we'll ignore those for now.
            response = json.loads(raw_response.decode("ascii"))

            try:
                b64data = response["serverContent"]["modelTurn"]["parts"][0][
                    "inlineData"
                ]["data"]
            except KeyError:
                pass
            else:
                pcm_data = base64.b64decode(b64data)
                self.audio_in_queue.put_nowait(pcm_data)

            try:
                turn_complete = response["serverContent"]["turnComplete"]
            except KeyError:
                pass
            else:
                if turn_complete:
                    # If you interrupt the model, it sends an end_of_turn.
                    # For interruptions to work, we need to empty out the audio
                    # queue, because it may have buffered much more audio than
                    # has played yet.
                    print("\nEnd of turn")
                    while not self.audio_in_queue.empty():
                        self.audio_in_queue.get_nowait()

    async def play_audio(self):
        pya = pyaudio.PyAudio()
        stream = pya.open(
            format=FORMAT, channels=CHANNELS, rate=RECEIVE_SAMPLE_RATE, output=True
        )
        while True:
            bytestream = await self.audio_in_queue.get()
            await asyncio.to_thread(stream.write, bytestream)

    async def run(self):
        """Connects to the Live API, then runs the send/receive tasks concurrently.

        Exits when the user types "q" at the text prompt.
        """
        try:
            async with (
                await connect(
                    uri, additional_headers={"Content-Type": "application/json"}
                ) as ws,
                asyncio.TaskGroup() as tg,
            ):
                self.ws = ws
                await self.startup()

                self.audio_in_queue = asyncio.Queue()
                self.out_queue = asyncio.Queue(maxsize=5)

                send_text_task = tg.create_task(self.send_text())

                tg.create_task(self.send_realtime())
                tg.create_task(self.listen_audio())
                if self.video_mode == "camera":
                    tg.create_task(self.get_frames())
                elif self.video_mode == "screen":
                    tg.create_task(self.get_screen())
                tg.create_task(self.receive_audio())
                tg.create_task(self.play_audio())

                await send_text_task
                raise asyncio.CancelledError("User requested exit")

        except asyncio.CancelledError:
            pass
        except ExceptionGroup as EG:
            self.audio_stream.close()
            traceback.print_exception(EG)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--mode",
        type=str,
        default=DEFAULT_MODE,
        help="pixels to stream from",
        choices=["camera", "screen", "none"],
    )
    args = parser.parse_args()

    main = AudioLoop(video_mode=args.mode)
    asyncio.run(main.run())
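For quick experiments it can be handier to drive the loop from code than through the CLI flag. A minimal sketch, assuming the file is importable as a module named `app` (that module name, and the choice of "screen" mode, are assumptions for illustration; `GOOGLE_API_KEY` must be set before the import, because app.py reads it at module level):

```
# Minimal driver sketch; the module name `app` is an assumption.
import asyncio

from app import AudioLoop  # app.py must be on the import path

# Equivalent to `python live_api_starter.py --mode screen`.
asyncio.run(AudioLoop(video_mode="screen").run())
```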
handler.py
ADDED
@@ -0,0 +1,110 @@
# handler.py
import asyncio
import base64
import json
import os
import traceback
from websockets.asyncio.client import connect

host = "generativelanguage.googleapis.com"
model = "gemini-2.0-flash-exp"
api_key = os.environ["GOOGLE_API_KEY"]
uri = f"wss://{host}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={api_key}"

class AudioLoop:
    def __init__(self):
        self.ws = None
        # Queue for messages to be sent *to* Gemini
        self.out_queue = asyncio.Queue()
        # Queue for PCM audio received *from* Gemini
        self.audio_in_queue = asyncio.Queue()

    async def startup(self, tools=None):
        """Send the model setup message to Gemini.

        Args:
            tools: Optional list of tools to enable for the model
        """
        setup_msg = {"setup": {"model": f"models/{model}"}}
        if tools:
            setup_msg["setup"]["tools"] = tools

        await self.ws.send(json.dumps(setup_msg))

        raw_response = await self.ws.recv()
        setup_response = json.loads(raw_response)
        print("[AudioLoop] Setup response from Gemini:", setup_response)

    async def send_realtime(self):
        """Read from out_queue and forward those messages to Gemini in real time."""
        while True:
            msg = await self.out_queue.get()
            await self.ws.send(json.dumps(msg))

    async def receive_audio(self):
        """Read from the Gemini websocket and push PCM data into audio_in_queue."""
        async for raw_response in self.ws:
            response = json.loads(raw_response)
            # Debug log all responses (optional)
            # print("Gemini raw response:", response)

            # Check if there's inline PCM data
            try:
                b64data = (
                    response["serverContent"]["modelTurn"]["parts"][0]["inlineData"]["data"]
                )
                pcm_data = base64.b64decode(b64data)
                await self.audio_in_queue.put(pcm_data)
            except KeyError:
                # No audio in this message
                pass

            tool_call = response.pop('toolCall', None)
            if tool_call is not None:
                await self.handle_tool_call(tool_call)

            # If "turnComplete" is present
            if "serverContent" in response and response["serverContent"].get("turnComplete"):
                print("[AudioLoop] Gemini turn complete")

    async def handle_tool_call(self, tool_call):
        print(" ", tool_call)
        for fc in tool_call['functionCalls']:
            msg = {
                'tool_response': {
                    'function_responses': [{
                        'id': fc['id'],
                        'name': fc['name'],
                        'response': {'result': {'string_value': 'ok'}}
                    }]
                }
            }
            print('>>> ', msg)
            await self.ws.send(json.dumps(msg))

    async def run(self):
        """Main entry point: connects to Gemini, starts send/receive tasks."""
        try:
            turn_on_the_lights_schema = {'name': 'turn_on_the_lights'}
            turn_off_the_lights_schema = {'name': 'turn_off_the_lights'}
            tools = [
                {'google_search': {}},
                {'function_declarations': [turn_on_the_lights_schema, turn_off_the_lights_schema]},
                {'code_execution': {}},
            ]
            async with connect(uri, additional_headers={"Content-Type": "application/json"}) as ws:
                self.ws = ws
                await self.startup(tools)

                async with asyncio.TaskGroup() as tg:
                    tg.create_task(self.send_realtime())
                    tg.create_task(self.receive_audio())

                    # Keep running until canceled
                    await asyncio.Future()

        except asyncio.CancelledError:
            pass
        except Exception:
            traceback.print_exc()
            raise
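Because this headless AudioLoop owns its queues from `__init__`, any coroutine can exercise it directly: put Live API messages on `out_queue` and read 24 kHz PCM chunks off `audio_in_queue`. A rough smoke-test sketch (the script itself is hypothetical, not part of this commit, and assumes `GOOGLE_API_KEY` is set):

```
# Hypothetical smoke test for handler.AudioLoop; not part of the commit.
import asyncio

from handler import AudioLoop

async def main():
    loop = AudioLoop()
    task = asyncio.create_task(loop.run())

    # out_queue buffers until send_realtime() starts, so we can enqueue
    # immediately; this is the same client_content shape webapp.py forwards.
    await loop.out_queue.put({
        "client_content": {
            "turn_complete": True,
            "turns": [{"role": "user", "parts": [{"text": "Hello!"}]}],
        }
    })

    # Drain a few PCM chunks of the spoken reply, then shut down.
    for _ in range(5):
        pcm = await loop.audio_in_queue.get()
        print(f"received {len(pcm)} bytes of PCM")
    task.cancel()

asyncio.run(main())
```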
index copy.html
ADDED
@@ -0,0 +1,202 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8" />
  <title>Raw PCM Capture Demo</title>
</head>
<body>

<h1>Capture Raw PCM via ScriptProcessorNode</h1>

<p>
  <button onclick="connectWebSocket()">Connect WebSocket</button>
  <button onclick="startCapture()">Start Raw PCM</button>
  <button onclick="stopCapture()">Stop Raw PCM</button>
</p>

<p>
  <input type="text" id="textMessage" placeholder="Type your message here" />
  <button onclick="sendText()">Send Text</button>
</p>

<pre id="log" style="background:#f0f0f0;padding:1em;"></pre>

<script>
let socket;
let playbackCtx = null;
let nextPlaybackTime = 0;
let audioCtx;
let scriptNode;
let micStream;
let isCapturing = false;

function logMessage(...args) {
  const pre = document.getElementById("log");
  pre.textContent += args.join(" ") + "\n";
  console.log(...args);
}

function connectWebSocket() {
  logMessage("[WebSocket] Connecting...");

  // Adjust port/host if your FastAPI server is elsewhere
  socket = new WebSocket("ws://localhost:8000/ws");

  socket.onopen = () => {
    logMessage("[WebSocket] Opened connection");
    if (!playbackCtx) {
      playbackCtx = new (window.AudioContext || window.webkitAudioContext)();
    }
    nextPlaybackTime = playbackCtx.currentTime;
  };

  socket.onerror = (err) => {
    logMessage("[WebSocket] Error:", err);
  };

  socket.onclose = () => {
    logMessage("[WebSocket] Closed");
  };

  socket.onmessage = (event) => {
    try {
      const data = JSON.parse(event.data);
      if (data.type === "audio" && data.payload) {
        const arrayBuffer = base64ToArrayBuffer(data.payload);
        const int16View = new Int16Array(arrayBuffer);
        const float32Buffer = new Float32Array(int16View.length);
        for (let i = 0; i < int16View.length; i++) {
          float32Buffer[i] = int16View[i] / 32768;
        }
        const sampleRate = 24000; // RECEIVE_SAMPLE_RATE from app.py
        const audioBuffer = playbackCtx.createBuffer(1, float32Buffer.length, sampleRate);
        audioBuffer.copyToChannel(float32Buffer, 0);
        let scheduledTime = playbackCtx.currentTime > nextPlaybackTime ? playbackCtx.currentTime : nextPlaybackTime;
        const source = playbackCtx.createBufferSource();
        source.buffer = audioBuffer;
        source.connect(playbackCtx.destination);
        source.start(scheduledTime);
        nextPlaybackTime = scheduledTime + audioBuffer.duration;
        logMessage("[Audio] Scheduled playback. Start time:", scheduledTime, "Duration:", audioBuffer.duration);
      } else if (data.type === "text" && data.content) {
        logMessage("[Text] Received:", data.content);
      } else {
        logMessage("[WebSocket] Received:", event.data);
      }
    } catch (err) {
      logMessage("[WebSocket] Error processing message:", err);
    }
  };
}

async function startCapture() {
  if (!socket || socket.readyState !== WebSocket.OPEN) {
    logMessage("WebSocket not connected. Click 'Connect WebSocket' first.");
    return;
  }
  if (isCapturing) {
    logMessage("Already capturing!");
    return;
  }
  isCapturing = true;
  logMessage("Starting microphone capture as raw PCM...");

  try {
    micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
    audioCtx = new (window.AudioContext || window.webkitAudioContext)();

    // Create a media source from the mic stream
    const source = audioCtx.createMediaStreamSource(micStream);

    // Create a ScriptProcessorNode
    const bufferSize = 4096; // You can adjust this
    const inputChannels = 1;
    const outputChannels = 1;
    scriptNode = audioCtx.createScriptProcessor(bufferSize, inputChannels, outputChannels);

    scriptNode.onaudioprocess = (audioEvent) => {
      if (!isCapturing) return;

      // Get raw floating-point samples [ -1.0 .. +1.0 ]
      const inputBuffer = audioEvent.inputBuffer.getChannelData(0);
      // Convert float samples to 16-bit signed
      const pcm16 = floatTo16BitPCM(inputBuffer);

      // Encode as base64 and send over WebSocket
      const bytes = new Uint8Array(pcm16.buffer);
      const b64 = btoa(String.fromCharCode(...bytes));
      socket.send(JSON.stringify({
        type: "audio",
        payload: b64
      }));
    };

    // Connect the pipeline: mic -> script -> (optional) audioCtx.destination
    source.connect(scriptNode);
    scriptNode.connect(audioCtx.destination);

    logMessage("Recording...");
  } catch (err) {
    logMessage("Error getting user mic:", err);
  }
}

function stopCapture() {
  if (!isCapturing) return;
  isCapturing = false;
  logMessage("Stopped microphone capture.");

  if (scriptNode) {
    scriptNode.disconnect();
    scriptNode.onaudioprocess = null;
    scriptNode = null;
  }
  if (micStream) {
    // Stop all tracks
    micStream.getTracks().forEach(track => track.stop());
    micStream = null;
  }
  if (audioCtx) {
    audioCtx.close();
    audioCtx = null;
  }
}

function floatTo16BitPCM(floatSamples) {
  // Convert an array of floats [-1, 1] to an Int16Array
  const out = new Int16Array(floatSamples.length);
  for (let i = 0; i < floatSamples.length; i++) {
    let s = Math.max(-1, Math.min(1, floatSamples[i]));
    // scale range
    s = s < 0 ? s * 0x8000 : s * 0x7FFF;
    out[i] = s;
  }
  return out;
}

function sendText() {
  const textInput = document.getElementById("textMessage");
  const text = textInput.value.trim();
  if (text && socket && socket.readyState === WebSocket.OPEN) {
    socket.send(JSON.stringify({ type: "text", content: text }));
    logMessage("[Text] Sent:", text);
    textInput.value = "";
  } else {
    logMessage("WebSocket not connected or text is empty.");
  }
}

function base64ToArrayBuffer(b64) {
  const binaryString = window.atob(b64);
  const len = binaryString.length;
  const bytes = new Uint8Array(len);
  for (let i = 0; i < len; i++) {
    bytes[i] = binaryString.charCodeAt(i);
  }
  return bytes.buffer;
}

</script>

</body>
</html>
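The conversion in `floatTo16BitPCM` is the piece worth double-checking: each float sample in [-1, 1] is clamped, then scaled asymmetrically (by 0x8000 for negatives, 0x7FFF for positives) so both extremes fit the signed 16-bit range. A Python mirror of that encoding, handy for verifying the server-side decode (this helper is illustrative, not part of the commit):

```
# Illustrative Python mirror of the page's floatTo16BitPCM + base64 framing.
import base64
import struct

def float_to_16bit_pcm(samples):
    out = []
    for s in samples:
        s = max(-1.0, min(1.0, s))                # clamp, as in the JS
        s = s * 0x8000 if s < 0 else s * 0x7FFF   # asymmetric scaling
        out.append(int(s))
    # "<h" = little-endian int16, matching a typical browser Int16Array.
    return struct.pack(f"<{len(out)}h", *out)

payload = base64.b64encode(float_to_16bit_pcm([0.0, 0.5, -1.0, 1.0])).decode()
print(payload)  # what the browser puts in {"type": "audio", "payload": ...}
```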
index.html
ADDED
@@ -0,0 +1,550 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8" />
  <title>Gemini Live Chat - Voice and Text Interaction</title>
  <style>
    body {
      max-width: 800px;
      margin: 2em auto;
      padding: 0 1em;
      font-family: system-ui, -apple-system, sans-serif;
    }
    #visualizer {
      width: 100%;
      height: 80px;
      background: #f0f0f0;
      border-radius: 4px;
      margin: 0;
    }
    #log {
      background: #f0f0f0;
      padding: 1em;
      border-radius: 4px;
      font-family: monospace;
      max-height: 400px;
      overflow-y: auto;
    }
    .controls {
      margin: 1em 0;
      padding: 1em;
      background: #f8f8f8;
      border-radius: 4px;
    }
    .function-card {
      padding: 0.8em;
      background: white;
      border-radius: 4px;
      box-shadow: 0 1px 3px rgba(0,0,0,0.1);
    }
    .function-card strong {
      color: #1976d2;
    }
    .function-card ul {
      color: #555;
    }
    button {
      border: none;
      padding: 0.5em 1em;
      border-radius: 3px;
      cursor: pointer;
      transition: opacity 0.2s;
    }
    button:hover {
      opacity: 0.9;
    }
    #connectButton {
      background: #2196f3;
      color: white;
    }
    .voice-start {
      background: #4caf50;
      color: white;
    }
    .voice-stop {
      background: #f44336;
      color: white;
    }
  </style>
</head>
<body>

<h1>Gemini Live Chat</h1>
<p>Interactive voice and text chat powered by Gemini AI that supports server-side function calling, code execution, and Google search capabilities.</p>

<div class="controls" style="background: #e3f2fd;">
  <h3 style="margin-top: 0;">Available Functions:</h3>
  <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 1em;">
    <div class="function-card">
      <strong>💡 Light Control</strong>
      <ul style="margin: 0.5em 0; padding-left: 1.5em;">
        <li>Turn lights on</li>
        <li>Turn lights off</li>
      </ul>
    </div>
    <div class="function-card">
      <strong>🔍 Search</strong>
      <ul style="margin: 0.5em 0; padding-left: 1.5em;">
        <li>Google search</li>
      </ul>
    </div>
    <div class="function-card">
      <strong>💻 Code</strong>
      <ul style="margin: 0.5em 0; padding-left: 1.5em;">
        <li>Execute code</li>
        <li>Run commands</li>
      </ul>
    </div>
  </div>
  <p style="margin: 0.5em 0 0 0; font-size: 0.9em; color: #666;">
    Try saying: "Turn on the lights" or "Search for weather in London" or ask any question!
  </p>
</div>

<div class="controls">
  <div style="display: flex; align-items: center; justify-content: space-between; gap: 1em;">
    <div style="display: flex; align-items: center; gap: 1em;">
      <div>
        <span style="font-weight: 500; color: #666;">Server:</span>
        <span id="connectionStatus" style="display: inline-block; padding: 0.3em 0.6em; margin-left: 0.5em; border-radius: 3px; background: #f44336; color: white;">Not connected</span>
      </div>
      <div id="micStatus" style="display: none;">
        <span style="font-weight: 500; color: #666;">Voice:</span>
        <span style="display: inline-block; padding: 0.3em 0.6em; margin-left: 0.5em; border-radius: 3px; background: #4caf50; color: white;">Recording</span>
      </div>
    </div>
    <div style="display: flex; gap: 0.5em;">
      <button id="connectButton" onclick="toggleConnection()">
        <span style="margin-right: 0.3em;">🔌</span> Connect to Server
      </button>
      <button id="voiceStartButton" class="voice-start" onclick="startCapture()">
        <span style="margin-right: 0.3em;">🎤</span> Start Voice Chat
      </button>
      <button id="voiceStopButton" class="voice-stop" onclick="stopCapture()" style="display: none;">
        <span style="margin-right: 0.3em;">⏹️</span> Stop Voice Chat
      </button>
    </div>
  </div>
  <div style="margin-top: 1em; display: flex; gap: 0.5em;">
    <input type="text" id="textMessage" placeholder="Type your message here" style="flex: 1; padding: 0.5em; border: 1px solid #ddd; border-radius: 3px;" />
    <button onclick="sendText()" style="white-space: nowrap;">
      <span style="margin-right: 0.3em;">📤</span> Send
    </button>
  </div>
</div>

<div class="controls">
  <canvas id="visualizer"></canvas>
</div>

<div style="margin: 1em 0;">
  <strong>Log Settings:</strong><br>
  <label><input type="checkbox" id="logWebSocket" checked> WebSocket Events</label>
  <label style="margin-left: 1em"><input type="checkbox" id="logAudio" checked> Audio Events</label>
  <label style="margin-left: 1em"><input type="checkbox" id="logText" checked> Text Events</label>
  <label style="margin-left: 1em"><input type="checkbox" id="logError" checked> Error Events</label>
</div>

<pre id="log"></pre>

<script>
let socket;
let playbackCtx = null;
let nextPlaybackTime = 0;
let audioCtx;
let scriptNode;
let micStream;
let isCapturing = false;
let audioSeq = 0;
let scheduledSources = []; // Track scheduled audio sources
let analyser;
let visualizerCanvas;
let visualizerCtx;
let animationFrame;

function updateConnectionStatus(connected) {
  const statusEl = document.getElementById('connectionStatus');
  const connectButton = document.getElementById('connectButton');
  const voiceStartButton = document.getElementById('voiceStartButton');

  if (connected) {
    statusEl.textContent = 'Connected';
    statusEl.style.background = '#4caf50';
    connectButton.textContent = '🔌 Disconnect Server';
    voiceStartButton.style.display = '';
  } else {
    statusEl.textContent = 'Not connected';
    statusEl.style.background = '#f44336';
    connectButton.textContent = '🔌 Connect to Server';
    voiceStartButton.style.display = 'none';
    // Also stop recording if we're disconnected
    if (isCapturing) {
      stopCapture();
    }
  }
}

function updateMicStatus(recording) {
  const micStatus = document.getElementById('micStatus');
  const voiceStartButton = document.getElementById('voiceStartButton');
  const voiceStopButton = document.getElementById('voiceStopButton');

  if (recording) {
    micStatus.style.display = '';
    voiceStartButton.style.display = 'none';
    voiceStopButton.style.display = '';
  } else {
    micStatus.style.display = 'none';
    voiceStartButton.style.display = '';
    voiceStopButton.style.display = 'none';
  }
}

function toggleConnection() {
  if (socket && socket.readyState === WebSocket.OPEN) {
    socket.close();
  } else {
    connectWebSocket();
  }
}

function logMessage(category, ...args) {
  const pre = document.getElementById("log");
  const logCategory = document.getElementById(`log${category.charAt(0).toUpperCase() + category.slice(1)}`);
  const shouldLog = logCategory ? logCategory.checked : false;

  if (shouldLog) {
    const timestamp = new Date().toLocaleTimeString();
    pre.textContent += `[${timestamp}] [${category}] ` + args.join(" ") + "\n";
    console.log(`[${category}]`, ...args);
  }
}

function clearScheduledAudio() {
  // Stop and disconnect all scheduled audio sources
  while (scheduledSources.length > 0) {
    const source = scheduledSources.pop();
    try {
      source.stop();
      source.disconnect();
    } catch (err) {
      // Ignore errors if source already finished playing
    }
  }
  // Reset next playback time
  if (playbackCtx) {
    nextPlaybackTime = playbackCtx.currentTime;
  }
  logMessage("Audio", "Cleared all scheduled audio");
}

function setupVisualizer() {
  visualizerCanvas = document.getElementById('visualizer');
  visualizerCtx = visualizerCanvas.getContext('2d');

  // Make canvas resolution match display size
  const rect = visualizerCanvas.getBoundingClientRect();
  visualizerCanvas.width = rect.width;
  visualizerCanvas.height = rect.height;

  if (!analyser && playbackCtx) {
    analyser = playbackCtx.createAnalyser();
    analyser.fftSize = 2048;
  }
}

function drawVisualizer() {
  if (!analyser) return;

  const bufferLength = analyser.frequencyBinCount;
  const dataArray = new Uint8Array(bufferLength);
  analyser.getByteTimeDomainData(dataArray);

  visualizerCtx.fillStyle = '#f0f0f0';
  visualizerCtx.fillRect(0, 0, visualizerCanvas.width, visualizerCanvas.height);

  visualizerCtx.lineWidth = 2;
  visualizerCtx.strokeStyle = '#4caf50';
  visualizerCtx.beginPath();

  const sliceWidth = visualizerCanvas.width / bufferLength;
  let x = 0;

  for (let i = 0; i < bufferLength; i++) {
    const v = dataArray[i] / 128.0;
    const y = v * visualizerCanvas.height / 2;

    if (i === 0) {
      visualizerCtx.moveTo(x, y);
    } else {
      visualizerCtx.lineTo(x, y);
    }

    x += sliceWidth;
  }

  visualizerCtx.lineTo(visualizerCanvas.width, visualizerCanvas.height / 2);
  visualizerCtx.stroke();

  animationFrame = requestAnimationFrame(drawVisualizer);
}

function stopVisualizer() {
  if (animationFrame) {
    cancelAnimationFrame(animationFrame);
    animationFrame = null;
  }
  if (visualizerCtx) {
    visualizerCtx.fillStyle = '#f0f0f0';
    visualizerCtx.fillRect(0, 0, visualizerCanvas.width, visualizerCanvas.height);
  }
}

function connectWebSocket() {
  logMessage("WebSocket", "Connecting...");
  updateConnectionStatus(false);

  // Use current origin and replace http(s) with ws(s)
  const wsUrl = `${window.location.protocol === 'https:' ? 'wss:' : 'ws:'}//${window.location.host}/ws`;
  socket = new WebSocket(wsUrl);

  socket.onopen = () => {
    logMessage("WebSocket", "Opened connection");
    updateConnectionStatus(true);
    if (!playbackCtx) {
      playbackCtx = new (window.AudioContext || window.webkitAudioContext)();
      setupVisualizer();
    }
    nextPlaybackTime = playbackCtx.currentTime;
  };

  socket.onerror = (err) => {
    logMessage("Error", "WebSocket error:", err);
    updateConnectionStatus(false);
  };

  socket.onclose = () => {
    logMessage("WebSocket", "Connection closed");
    updateConnectionStatus(false);
    if (isCapturing) {
      stopCapture();
    }
  };

  socket.onmessage = (event) => {
    try {
      const data = JSON.parse(event.data);
      if (data.type === "audio" && data.payload) {
        const arrayBuffer = base64ToArrayBuffer(data.payload);
        const int16View = new Int16Array(arrayBuffer);
        const float32Buffer = new Float32Array(int16View.length);
        for (let i = 0; i < int16View.length; i++) {
          float32Buffer[i] = int16View[i] / 32768;
        }
        const sampleRate = 24000; // RECEIVE_SAMPLE_RATE from app.py
        const audioBuffer = playbackCtx.createBuffer(1, float32Buffer.length, sampleRate);
        audioBuffer.copyToChannel(float32Buffer, 0);
        let scheduledTime = playbackCtx.currentTime > nextPlaybackTime ? playbackCtx.currentTime : nextPlaybackTime;
        const source = playbackCtx.createBufferSource();
        source.buffer = audioBuffer;

        // Connect through analyser for visualization
        if (analyser) {
          source.connect(analyser);
          analyser.connect(playbackCtx.destination);
          if (!animationFrame) {
            drawVisualizer();
          }
        } else {
          source.connect(playbackCtx.destination);
        }

        source.start(scheduledTime);
        // Add source to tracked sources
        scheduledSources.push(source);
        // Remove source from tracking once it finishes
        source.onended = () => {
          const index = scheduledSources.indexOf(source);
          if (index > -1) {
            scheduledSources.splice(index, 1);
          }
          // Stop visualizer if no more audio
          if (scheduledSources.length === 0) {
            stopVisualizer();
          }
        };
        nextPlaybackTime = scheduledTime + audioBuffer.duration;
        logMessage("Audio", "Scheduled playback. Start time:", scheduledTime, "Duration:", audioBuffer.duration);
      } else if (data.type === "text" && data.content) {
        logMessage("Text", "Received:", data.content);
      } else {
        logMessage("WebSocket", "Received message:", event.data);
      }
    } catch (err) {
      logMessage("Error", "Failed to process message:", err);
    }
  };
}

async function startCapture() {
  if (!socket || socket.readyState !== WebSocket.OPEN) {
    logMessage("WebSocket", "Not connected. Click 'Connect to Server' first.");
    return;
  }
  if (isCapturing) {
    logMessage("Audio", "Already capturing!");
    return;
  }

  isCapturing = true;
  updateMicStatus(true);
  logMessage("Audio", "Starting microphone capture...");

  try {
    micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
    logMessage("Audio", "Got microphone access");
    audioCtx = new (window.AudioContext || window.webkitAudioContext)();
    logMessage("Audio", "Created AudioContext with sample rate:", audioCtx.sampleRate);

    // Create a media source from the mic stream
    const source = audioCtx.createMediaStreamSource(micStream);
    logMessage("Audio", "Created MediaStreamSource");

    // Create a ScriptProcessorNode
    const bufferSize = 4096; // You can adjust this
    const inputChannels = 1;
    const outputChannels = 1;
    scriptNode = audioCtx.createScriptProcessor(bufferSize, inputChannels, outputChannels);
    logMessage("Audio", "Created ScriptProcessorNode with buffer size:", bufferSize);

    scriptNode.onaudioprocess = (audioEvent) => {
      if (!isCapturing) return;

      // Get raw samples and resample to 16kHz
      const inputBuffer = audioEvent.inputBuffer.getChannelData(0);

      // Check if there's actual audio input (not just silence)
      const hasAudio = inputBuffer.some(sample => Math.abs(sample) > 0.01); // Threshold for noise
      if (hasAudio) {
        clearScheduledAudio(); // Only clear when we detect actual audio input
      }

      const resampled = resampleAudio(inputBuffer, audioCtx.sampleRate, 16000);

      // Convert resampled audio to 16-bit PCM
      const pcm16 = floatTo16BitPCM(resampled);

      // Encode as base64 and send over WebSocket
      const bytes = new Uint8Array(pcm16.buffer);
      const b64 = btoa(String.fromCharCode(...bytes));
      const audioMsg = {
        type: "audio",
        payload: b64,
        seq: audioSeq++,
        config: {
          sampleRate: 16000,
          bitDepth: 16,
          channels: 1
        }
      };
      logMessage("Audio", "Processing chunk. Seq:", audioMsg.seq);
      try {
        if (socket.readyState === WebSocket.OPEN) {
          socket.send(JSON.stringify(audioMsg));
        } else {
          logMessage("WebSocket", "Not open, stopping capture");
          stopCapture();
        }
      } catch (err) {
        logMessage("Error", "Failed to send audio:", err);
        stopCapture();
      }
    };

    // Connect the pipeline: mic -> script -> (optional) audioCtx.destination
    source.connect(scriptNode);
    logMessage("Audio", "Connected audio pipeline");

    logMessage("Audio", "Recording...");
  } catch (err) {
    logMessage("Error", "Failed to get microphone access:", err);
    isCapturing = false;
    updateMicStatus(false); // reset the UI if mic access failed
  }
}

function stopCapture() {
  if (!isCapturing) return;
  isCapturing = false;
  updateMicStatus(false);
  logMessage("Audio", "Stopped microphone capture");

  if (scriptNode) {
    scriptNode.disconnect();
    scriptNode.onaudioprocess = null;
    scriptNode = null;
  }
  if (micStream) {
    // Stop all tracks
    micStream.getTracks().forEach(track => track.stop());
    micStream = null;
  }
  if (audioCtx) {
    audioCtx.close();
    audioCtx = null;
  }
}

function floatTo16BitPCM(floatSamples) {
  // Convert an array of floats [-1, 1] to an Int16Array
  const out = new Int16Array(floatSamples.length);
  for (let i = 0; i < floatSamples.length; i++) {
    let s = Math.max(-1, Math.min(1, floatSamples[i]));
    // scale range
    s = s < 0 ? s * 0x8000 : s * 0x7FFF;
    out[i] = s;
  }
  return out;
}

function resampleAudio(inputBuffer, fromRate, toRate) {
  const ratio = toRate / fromRate;
  const newLength = Math.round(inputBuffer.length * ratio);
  const resampled = new Float32Array(newLength);

  for (let i = 0; i < newLength; i++) {
    const index = Math.round(i / ratio);
    resampled[i] = inputBuffer[Math.min(index, inputBuffer.length - 1)];
  }
  return resampled;
}

function sendText() {
  const textInput = document.getElementById("textMessage");
  const text = textInput.value.trim();
  if (text && socket && socket.readyState === WebSocket.OPEN) {
    // Clear any scheduled audio before sending text
    clearScheduledAudio();

    socket.send(JSON.stringify({ type: "text", content: text }));
    logMessage("Text", "Sent:", text);
    textInput.value = "";
  } else {
    logMessage("WebSocket", "Not connected or text is empty");
  }
}

function base64ToArrayBuffer(b64) {
  const binaryString = window.atob(b64);
  const len = binaryString.length;
  const bytes = new Uint8Array(len);
  for (let i = 0; i < len; i++) {
    bytes[i] = binaryString.charCodeAt(i);
  }
  return bytes.buffer;
}

</script>

</body>
</html>
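`resampleAudio` above is a nearest-neighbor resampler: each output sample is simply the input sample closest to `i / ratio`. That is cheap and introduces some aliasing, but it is adequate for speech headed to a 16 kHz input. The same algorithm in Python, for comparison (illustrative only):

```
# Illustrative Python version of index.html's nearest-neighbor resampleAudio.
def resample_audio(input_buffer, from_rate, to_rate):
    ratio = to_rate / from_rate
    new_length = round(len(input_buffer) * ratio)
    return [
        input_buffer[min(round(i / ratio), len(input_buffer) - 1)]
        for i in range(new_length)
    ]

# Example: 48 kHz capture down to the 16 kHz Gemini expects (6 samples -> 2).
print(resample_audio([0.1, 0.2, 0.3, 0.4, 0.5, 0.6], 48000, 16000))
```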
requirements.txt
ADDED
@@ -0,0 +1,138 @@
aiofiles==23.2.1
aiohappyeyeballs==2.4.4
aiohttp==3.11.11
aiohttp-retry==2.9.1
aioice==0.9.0
aiortc==1.10.0
aiosignal==1.3.2
altair==5.5.0
annotated-types==0.7.0
anyio==4.8.0
attrs==24.3.0
av==13.1.0
blinker==1.9.0
cachetools==5.5.0
certifi==2024.8.30
cffi==1.17.1
charset-normalizer==3.4.0
click==8.1.7
colorama==0.4.6
contourpy==1.3.1
cryptography==44.0.0
cycler==0.12.1
dnspython==2.7.0
exceptiongroup==1.2.0
fastapi==0.115.8
ffmpeg-python==0.2.0
ffmpy==0.5.0
filelock==3.17.0
fonttools==4.55.8
frozenlist==1.5.0
fsspec==2024.12.0
future==1.0.0
gitdb==4.0.11
GitPython==3.1.43
google-ai-generativelanguage==0.6.15
google-api-core==2.24.1
google-api-python-client==2.160.0
google-auth==2.37.0
google-auth-httplib2==0.2.0
google-crc32c==1.6.0
google-genai==0.2.2
google-generativeai==0.8.4
googleapis-common-protos==1.66.0
gradio==5.14.0
gradio_client==1.7.0
gradio_webrtc==0.0.30
grpcio==1.70.0
grpcio-status==1.70.0
h11==0.14.0
httpcore==1.0.7
httplib2==0.22.0
httpx==0.28.1
huggingface-hub==0.28.1
idna==3.10
ifaddr==0.2.0
importlib_resources==6.5.2
Jinja2==3.1.4
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
keyboard==0.13.5
kiwisolver==1.4.8
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.10.0
mdurl==0.1.2
MouseInfo==0.1.3
mss==10.0.0
multidict==6.1.0
narwhals==1.19.0
numpy==1.26.4
opencv-python==4.10.0.84
orjson==3.10.15
packaging==24.2
pandas==2.2.3
pillow==10.4.0
plotly==5.24.1
propcache==0.2.1
proto-plus==1.26.0
protobuf==5.29.2
pyarrow==18.1.0
pyasn1==0.6.1
pyasn1_modules==0.4.1
PyAudio==0.2.14
PyAutoGUI==0.9.54
pycparser==2.22
pydantic==2.10.3
pydantic_core==2.27.1
pydeck==0.9.1
pydub==0.25.1
pyee==12.1.1
PyGetWindow==0.0.9
Pygments==2.18.0
PyJWT==2.10.1
pylibsrtp==0.10.0
PyMsgBox==1.0.9
pyngrok==7.2.2
pyOpenSSL==25.0.0
pyparsing==3.2.1
pyperclip==1.9.0
PyRect==0.2.0
PyScreeze==1.0.1
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-multipart==0.0.20
pytweening==1.2.0
pytz==2024.2
PyYAML==6.0.2
referencing==0.35.1
requests==2.32.3
rich==13.9.4
rpds-py==0.22.3
rsa==4.9
ruff==0.9.4
safehttpx==0.1.6
semantic-version==2.10.0
setuptools==75.8.0
shellingham==1.5.4
simpleaudio==1.0.4
six==1.17.0
smmap==5.0.1
sniffio==1.3.1
starlette==0.45.3
streamlit==1.41.1
tenacity==9.0.0
toml==0.10.2
tomlkit==0.13.2
tornado==6.4.2
tqdm==4.67.1
twilio==9.4.4
typer==0.15.1
typing_extensions==4.12.2
tzdata==2024.2
uritemplate==4.1.1
urllib3==2.2.3
uvicorn==0.34.0
watchdog==6.0.0
websockets==14.2
yarl==1.18.3
webapp.py
ADDED
@@ -0,0 +1,144 @@
# webapp.py

import asyncio
import base64
import json
import os

from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
import uvicorn

from handler import AudioLoop  # Import your AudioLoop from above

app = FastAPI()

# Mount the web_ui directory to serve static files
current_dir = os.path.dirname(os.path.realpath(__file__))
app.mount("/web_ui", StaticFiles(directory=current_dir), name="web_ui")

@app.get("/")
async def get_index():
    # Read and return the index.html file
    index_path = os.path.join(current_dir, "index.html")
    with open(index_path, "r", encoding="utf-8") as f:
        html_content = f.read()
    return HTMLResponse(content=html_content)

@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    await websocket.accept()
    print("[websocket_endpoint] Client connected.")

    # Create a new AudioLoop instance for this client
    audio_loop = AudioLoop()
    audio_ordering_buffer = {}
    expected_audio_seq = 0

    # Start the AudioLoop for this client
    loop_task = asyncio.create_task(audio_loop.run())
    print("[websocket_endpoint] Started new AudioLoop for client")

    async def from_client_to_gemini():
        """Handles incoming messages from the client and forwards them to Gemini."""
        nonlocal audio_ordering_buffer, expected_audio_seq
        try:
            while True:
                data = await websocket.receive_text()
                msg = json.loads(data)
                msg_type = msg.get("type")

                # print("[from_client_to_gemini] Received message from client:", msg)

                # Handle audio data from client
                if msg_type == "audio":
                    raw_pcm = base64.b64decode(msg["payload"])
                    forward_msg = {
                        "realtime_input": {
                            "media_chunks": [
                                {
                                    "data": base64.b64encode(raw_pcm).decode(),
                                    "mime_type": "audio/pcm"
                                }
                            ]
                        }
                    }
                    # Retrieve the sequence number from the message
                    seq = msg.get("seq")
                    if seq is not None:
                        # Store the message in the buffer
                        audio_ordering_buffer[seq] = forward_msg
                        # Forward any messages in order
                        while expected_audio_seq in audio_ordering_buffer:
                            msg_to_forward = audio_ordering_buffer.pop(expected_audio_seq)
                            await audio_loop.out_queue.put(msg_to_forward)
                            expected_audio_seq += 1
                    else:
                        # If no sequence number is provided, forward immediately
                        await audio_loop.out_queue.put(forward_msg)

                # Handle text data from client
                elif msg_type == "text":
                    user_text = msg.get("content", "")
                    print("[from_client_to_gemini] Forwarding user text to Gemini:", user_text)
                    forward_msg = {
                        "client_content": {
                            "turn_complete": True,
                            "turns": [
                                {
                                    "role": "user",
                                    "parts": [
                                        {"text": user_text}
                                    ]
                                }
                            ]
                        }
                    }
                    await audio_loop.out_queue.put(forward_msg)

                else:
                    print("[from_client_to_gemini] Unknown message type:", msg_type)

        except WebSocketDisconnect:
            print("[from_client_to_gemini] Client disconnected.")
        except Exception as e:
            print("[from_client_to_gemini] Error:", e)

    async def from_gemini_to_client():
        """Reads PCM audio from Gemini and sends it back to the client."""
        try:
            while True:
                pcm_data = await audio_loop.audio_in_queue.get()
                b64_pcm = base64.b64encode(pcm_data).decode()

                out_msg = {
                    "type": "audio",
                    "payload": b64_pcm
                }
                print("[from_gemini_to_client] Sending audio chunk to client. Size:", len(pcm_data))
                await websocket.send_text(json.dumps(out_msg))

        except WebSocketDisconnect:
            print("[from_gemini_to_client] Client disconnected.")
        except Exception as e:
            print("[from_gemini_to_client] Error:", e)

    # Launch both tasks concurrently. If either fails or disconnects, we exit.
    try:
        await asyncio.gather(
            from_client_to_gemini(),
            from_gemini_to_client(),
        )
    finally:
        print("[websocket_endpoint] WebSocket handler finished.")
        # Clean up the AudioLoop when the client disconnects
        loop_task.cancel()
        try:
            await loop_task
        except asyncio.CancelledError:
            pass
        print("[websocket_endpoint] Cleaned up AudioLoop for client")

if __name__ == "__main__":
    uvicorn.run("webapp:app", host="0.0.0.0", port=8000, reload=True)
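A quick way to smoke-test the `/ws` endpoint without the browser UI is a small websocket client that sends one text turn and counts the audio chunks coming back. A sketch, assuming the server is already running on localhost:8000 (the `websockets` package is pinned in requirements.txt; this client script itself is not part of the commit):

```
# Hypothetical test client for webapp.py's /ws endpoint; run webapp.py first.
import asyncio
import json

from websockets.asyncio.client import connect

async def main():
    async with connect("ws://localhost:8000/ws") as ws:
        # Same envelope index.html sends for typed messages.
        await ws.send(json.dumps({"type": "text", "content": "Say hi in one word."}))

        # webapp.py replies with {"type": "audio", "payload": <base64 PCM>} chunks.
        for _ in range(3):
            reply = json.loads(await ws.recv())
            print(reply.get("type"), len(reply.get("payload", "")), "base64 chars")

asyncio.run(main())
```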