Nirav Madhani committed on
Commit cc7c705 · 1 Parent(s): a4541ab

First commit

__pycache__/handler.cpython-312.pyc ADDED
Binary file (6.18 kB)

__pycache__/webapp.cpython-312.pyc ADDED
Binary file (6.64 kB)

app.py ADDED
@@ -0,0 +1,302 @@
+ # -*- coding: utf-8 -*-
+ # Copyright 2023 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """
+ ## Setup
+
+ To install the dependencies for this script, run:
+
+ ```
+ pip install google-genai opencv-python pyaudio pillow mss
+ ```
+
+ Before running this script, ensure the `GOOGLE_API_KEY` environment
+ variable is set to the API key you obtained from Google AI Studio.
+
+ Important: **Use headphones**. This script uses the system default audio
+ input and output, which often won't include echo cancellation. So to prevent
+ the model from interrupting itself, it is important that you use headphones.
+
+ ## Run
+
+ To run the script:
+
+ ```
+ python app.py
+ ```
+
+ The script takes a video-mode flag `--mode`, which can be "camera", "screen", or "none".
+ The default is "none". To share your screen, run:
+
+ ```
+ python app.py --mode screen
+ ```
+ """
+
+ import asyncio
+ import base64
+ import json
+ import io
+ import os
+ import sys
+ import traceback
+
+ import cv2
+ import pyaudio
+ import PIL.Image
+ import mss
+ import mss.tools
+ import argparse
+
+ from websockets.asyncio.client import connect
+
+ if sys.version_info < (3, 11, 0):
+     # Backports of TaskGroup and ExceptionGroup for Python < 3.11
+     import taskgroup, exceptiongroup
+
+     asyncio.TaskGroup = taskgroup.TaskGroup
+     asyncio.ExceptionGroup = exceptiongroup.ExceptionGroup
+
+ FORMAT = pyaudio.paInt16
+ CHANNELS = 1
+ SEND_SAMPLE_RATE = 16000
+ RECEIVE_SAMPLE_RATE = 24000
+ CHUNK_SIZE = 512
+
+ host = "generativelanguage.googleapis.com"
+ model = "gemini-2.0-flash-exp"
+ DEFAULT_MODE = "none"
+
+
+ api_key = os.environ["GOOGLE_API_KEY"]
+ uri = f"wss://{host}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={api_key}"
+
+
+ class AudioLoop:
+     def __init__(self, video_mode=DEFAULT_MODE):
+         self.video_mode = video_mode
+         self.audio_in_queue = None
+         self.out_queue = None
+
+         self.ws = None
+         self.audio_stream = None
+
+     async def startup(self):
+         setup_msg = {"setup": {"model": f"models/{model}"}}
+         await self.ws.send(json.dumps(setup_msg))
+         raw_response = await self.ws.recv(decode=False)
+         setup_response = json.loads(raw_response.decode("ascii"))
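+         # The server acknowledges the setup message before streaming anything
+         # else; the reply is read and discarded here. (Assumption: this first
+         # reply is the API's "setupComplete" acknowledgement.)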
+
+     async def send_text(self):
+         while True:
+             text = await asyncio.to_thread(input, "message > ")
+             if text.lower() == "q":
+                 break
+
+             msg = {
+                 "client_content": {
+                     "turn_complete": True,
+                     "turns": [{"role": "user", "parts": [{"text": text}]}],
+                 }
+             }
+             await self.ws.send(json.dumps(msg))
+
+     def _get_frame(self, cap):
+         # Read the frame
+         ret, frame = cap.read()
+         # Check if the frame was read successfully
+         if not ret:
+             return None
+
+         # Convert BGR to RGB: OpenCV captures in BGR but PIL expects RGB,
+         # which prevents a blue tint in the video feed.
+         frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+         img = PIL.Image.fromarray(frame_rgb)
+         img.thumbnail([1024, 1024])
+
+         image_io = io.BytesIO()
+         img.save(image_io, format="jpeg")
+         image_io.seek(0)
+
+         mime_type = "image/jpeg"
+         image_bytes = image_io.read()
+         return {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()}
+
+     async def get_frames(self):
+         # Opening the camera takes about a second and would block the whole
+         # program, overflowing the audio pipeline, if not run in a thread.
+         cap = await asyncio.to_thread(
+             cv2.VideoCapture, 0
+         )  # 0 represents the default camera
+
+         while True:
+             frame = await asyncio.to_thread(self._get_frame, cap)
+             if frame is None:
+                 break
+             await asyncio.sleep(1.0)
+
+             msg = {"realtime_input": {"media_chunks": [frame]}}
+             await self.out_queue.put(msg)
+
+         # Release the VideoCapture object
+         cap.release()
+
+     def _get_screen(self):
+         sct = mss.mss()
+         monitor = sct.monitors[0]
+
+         i = sct.grab(monitor)
+         mime_type = "image/jpeg"
+         image_bytes = mss.tools.to_png(i.rgb, i.size)
+         img = PIL.Image.open(io.BytesIO(image_bytes))
+
+         image_io = io.BytesIO()
+         img.save(image_io, format="jpeg")
+         image_io.seek(0)
+
+         image_bytes = image_io.read()
+         return {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()}
+
+     async def get_screen(self):
+         while True:
+             frame = await asyncio.to_thread(self._get_screen)
+             if frame is None:
+                 break
+
+             await asyncio.sleep(1.0)
+
+             msg = {"realtime_input": {"media_chunks": [frame]}}
+             await self.out_queue.put(msg)
+
+     async def send_realtime(self):
+         while True:
+             msg = await self.out_queue.get()
+             await self.ws.send(json.dumps(msg))
+
+     async def listen_audio(self):
+         pya = pyaudio.PyAudio()
+
+         mic_info = pya.get_default_input_device_info()
+         self.audio_stream = pya.open(
+             format=FORMAT,
+             channels=CHANNELS,
+             rate=SEND_SAMPLE_RATE,
+             input=True,
+             input_device_index=mic_info["index"],
+             frames_per_buffer=CHUNK_SIZE,
+         )
+         while True:
+             data = await asyncio.to_thread(self.audio_stream.read, CHUNK_SIZE)
+             msg = {
+                 "realtime_input": {
+                     "media_chunks": [
+                         {
+                             "data": base64.b64encode(data).decode(),
+                             "mime_type": "audio/pcm",
+                         }
+                     ]
+                 }
+             }
+             await self.out_queue.put(msg)
+
+     async def receive_audio(self):
+         """Background task that reads from the websocket and writes pcm chunks to the output queue."""
+         async for raw_response in self.ws:
+             # Other things could be returned here, but we'll ignore those for now.
+             response = json.loads(raw_response.decode("ascii"))
+
+             try:
+                 b64data = response["serverContent"]["modelTurn"]["parts"][0][
+                     "inlineData"
+                 ]["data"]
+             except KeyError:
+                 pass
+             else:
+                 pcm_data = base64.b64decode(b64data)
+                 self.audio_in_queue.put_nowait(pcm_data)
+
+             try:
+                 turn_complete = response["serverContent"]["turnComplete"]
+             except KeyError:
+                 pass
+             else:
+                 if turn_complete:
+                     # If you interrupt the model, it sends a turnComplete.
+                     # For interruptions to work, we need to empty out the audio
+                     # queue, because it may have loaded much more audio than
+                     # has played yet.
+                     print("\nEnd of turn")
+                     while not self.audio_in_queue.empty():
+                         self.audio_in_queue.get_nowait()
+
+     async def play_audio(self):
+         pya = pyaudio.PyAudio()
+         stream = pya.open(
+             format=FORMAT, channels=CHANNELS, rate=RECEIVE_SAMPLE_RATE, output=True
+         )
+         while True:
+             bytestream = await self.audio_in_queue.get()
+             await asyncio.to_thread(stream.write, bytestream)
+
+     async def run(self):
+         """Connects to the Live API, starts the send/receive tasks, and runs
+         until the user types "q" at the prompt or an error occurs.
+         """
+         try:
+             async with (
+                 await connect(
+                     uri, additional_headers={"Content-Type": "application/json"}
+                 ) as ws,
+                 asyncio.TaskGroup() as tg,
+             ):
+                 self.ws = ws
+                 await self.startup()
+
+                 self.audio_in_queue = asyncio.Queue()
+                 self.out_queue = asyncio.Queue(maxsize=5)
+
+                 send_text_task = tg.create_task(self.send_text())
+
+                 tg.create_task(self.send_realtime())
+                 tg.create_task(self.listen_audio())
+                 if self.video_mode == "camera":
+                     tg.create_task(self.get_frames())
+                 elif self.video_mode == "screen":
+                     tg.create_task(self.get_screen())
+                 tg.create_task(self.receive_audio())
+                 tg.create_task(self.play_audio())
+
+                 await send_text_task
+                 raise asyncio.CancelledError("User requested exit")
+
+         except asyncio.CancelledError:
+             pass
+         except ExceptionGroup as EG:
+             self.audio_stream.close()
+             traceback.print_exception(EG)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--mode",
+         type=str,
+         default=DEFAULT_MODE,
+         help="pixels to stream from",
+         choices=["camera", "screen", "none"],
+     )
+     args = parser.parse_args()
+
+     main = AudioLoop(video_mode=args.mode)
+     asyncio.run(main.run())
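
Everything app.py exchanges with the Live API is plain JSON over one websocket. For reference, a minimal sketch of the message shapes used above, with field names taken directly from the code (payload values abbreviated, not a complete schema):

```
# Client -> server, once, immediately after connecting (startup):
setup_msg = {"setup": {"model": "models/gemini-2.0-flash-exp"}}

# Client -> server, streamed media (base64 16 kHz PCM, or JPEG frames):
media_msg = {
    "realtime_input": {
        "media_chunks": [{"mime_type": "audio/pcm", "data": "<base64>"}]
    }
}

# Client -> server, a typed turn (send_text):
text_msg = {
    "client_content": {
        "turn_complete": True,
        "turns": [{"role": "user", "parts": [{"text": "..."}]}],
    }
}

# Server -> client, audio parsed by receive_audio (24 kHz, 16-bit PCM):
audio_reply = {
    "serverContent": {"modelTurn": {"parts": [{"inlineData": {"data": "<base64>"}}]}}
}
```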
handler.py ADDED
@@ -0,0 +1,110 @@
+ # handler.py
+ import asyncio
+ import base64
+ import json
+ import os
+ import traceback
+ from websockets.asyncio.client import connect
+
+ host = "generativelanguage.googleapis.com"
+ model = "gemini-2.0-flash-exp"
+ api_key = os.environ["GOOGLE_API_KEY"]
+ uri = f"wss://{host}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={api_key}"
+
+ class AudioLoop:
+     def __init__(self):
+         self.ws = None
+         # Queue for messages to be sent *to* Gemini
+         self.out_queue = asyncio.Queue()
+         # Queue for PCM audio received *from* Gemini
+         self.audio_in_queue = asyncio.Queue()
+
+     async def startup(self, tools=None):
+         """Send the model setup message to Gemini.
+
+         Args:
+             tools: Optional list of tools to enable for the model.
+         """
+         setup_msg = {"setup": {"model": f"models/{model}"}}
+         if tools:
+             setup_msg["setup"]["tools"] = tools
+
+         await self.ws.send(json.dumps(setup_msg))
+
+         raw_response = await self.ws.recv()
+         setup_response = json.loads(raw_response)
+         print("[AudioLoop] Setup response from Gemini:", setup_response)
+
+     async def send_realtime(self):
+         """Read from out_queue and forward those messages to Gemini in real time."""
+         while True:
+             msg = await self.out_queue.get()
+             await self.ws.send(json.dumps(msg))
+
+     async def receive_audio(self):
+         """Read from the Gemini websocket and push PCM data into audio_in_queue."""
+         async for raw_response in self.ws:
+             response = json.loads(raw_response)
+             # Debug log all responses (optional)
+             # print("Gemini raw response:", response)
+
+             # Check if there's inline PCM data
+             try:
+                 b64data = (
+                     response["serverContent"]["modelTurn"]["parts"][0]["inlineData"]["data"]
+                 )
+                 pcm_data = base64.b64decode(b64data)
+                 await self.audio_in_queue.put(pcm_data)
+             except KeyError:
+                 # No audio in this message
+                 pass
+
+             tool_call = response.pop('toolCall', None)
+             if tool_call is not None:
+                 await self.handle_tool_call(tool_call)
+
+             # If "turnComplete" is present
+             if "serverContent" in response and response["serverContent"].get("turnComplete"):
+                 print("[AudioLoop] Gemini turn complete")
+
+     async def handle_tool_call(self, tool_call):
+         # Stub handler: acknowledge every function call with a generic "ok".
+         print(" ", tool_call)
+         for fc in tool_call['functionCalls']:
+             msg = {
+                 'tool_response': {
+                     'function_responses': [{
+                         'id': fc['id'],
+                         'name': fc['name'],
+                         'response': {'result': {'string_value': 'ok'}}
+                     }]
+                 }
+             }
+             print('>>> ', msg)
+             await self.ws.send(json.dumps(msg))
+
+     async def run(self):
+         """Main entry point: connects to Gemini and starts the send/receive tasks."""
+         try:
+             turn_on_the_lights_schema = {'name': 'turn_on_the_lights'}
+             turn_off_the_lights_schema = {'name': 'turn_off_the_lights'}
+             tools = [
+                 {'google_search': {}},
+                 {'function_declarations': [turn_on_the_lights_schema, turn_off_the_lights_schema]},
+                 {'code_execution': {}},
+             ]
+             async with connect(uri, additional_headers={"Content-Type": "application/json"}) as ws:
+                 self.ws = ws
+                 await self.startup(tools)
+
+                 async with asyncio.TaskGroup() as tg:
+                     tg.create_task(self.send_realtime())
+                     tg.create_task(self.receive_audio())
+
+                     # Keep running until canceled
+                     await asyncio.Future()
+
+         except asyncio.CancelledError:
+             pass
+         except Exception:
+             traceback.print_exc()
+             raise
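
The tool-calling round trip in handler.py is easiest to see as data. A sketch of one exchange, with field names exactly as the handler reads and writes them (the `id` value is illustrative; any server-sent arguments are ignored by this stub, which acknowledges every call with "ok"):

```
# Server -> client when the model invokes a declared function (sketch):
tool_call = {
    "functionCalls": [
        {"id": "<call id>", "name": "turn_on_the_lights"}
    ]
}

# Client -> server reply built by handle_tool_call() for each call:
tool_response = {
    "tool_response": {
        "function_responses": [
            {
                "id": "<call id>",
                "name": "turn_on_the_lights",
                "response": {"result": {"string_value": "ok"}},
            }
        ]
    }
}
```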
index copy.html ADDED
@@ -0,0 +1,202 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+   <meta charset="UTF-8" />
+   <title>Raw PCM Capture Demo</title>
+ </head>
+ <body>
+
+   <h1>Capture Raw PCM via ScriptProcessorNode</h1>
+
+   <p>
+     <button onclick="connectWebSocket()">Connect WebSocket</button>
+     <button onclick="startCapture()">Start Raw PCM</button>
+     <button onclick="stopCapture()">Stop Raw PCM</button>
+   </p>
+
+   <p>
+     <input type="text" id="textMessage" placeholder="Type your message here" />
+     <button onclick="sendText()">Send Text</button>
+   </p>
+
+   <pre id="log" style="background:#f0f0f0;padding:1em;"></pre>
+
+   <script>
+     let socket;
+     let playbackCtx = null;
+     let nextPlaybackTime = 0;
+     let audioCtx;
+     let scriptNode;
+     let micStream;
+     let isCapturing = false;
+
+     function logMessage(...args) {
+       const pre = document.getElementById("log");
+       pre.textContent += args.join(" ") + "\n";
+       console.log(...args);
+     }
+
+     function connectWebSocket() {
+       logMessage("[WebSocket] Connecting...");
+
+       // Adjust port/host if your FastAPI server is elsewhere
+       socket = new WebSocket("ws://localhost:8000/ws");
+
+       socket.onopen = () => {
+         logMessage("[WebSocket] Opened connection");
+         if (!playbackCtx) {
+           playbackCtx = new (window.AudioContext || window.webkitAudioContext)();
+         }
+         nextPlaybackTime = playbackCtx.currentTime;
+       };
+
+       socket.onerror = (err) => {
+         logMessage("[WebSocket] Error:", err);
+       };
+
+       socket.onclose = () => {
+         logMessage("[WebSocket] Closed");
+       };
+
+       socket.onmessage = (event) => {
+         try {
+           const data = JSON.parse(event.data);
+           if (data.type === "audio" && data.payload) {
+             const arrayBuffer = base64ToArrayBuffer(data.payload);
+             const int16View = new Int16Array(arrayBuffer);
+             const float32Buffer = new Float32Array(int16View.length);
+             for (let i = 0; i < int16View.length; i++) {
+               float32Buffer[i] = int16View[i] / 32768;
+             }
+             const sampleRate = 24000; // RECEIVE_SAMPLE_RATE from app.py
+             const audioBuffer = playbackCtx.createBuffer(1, float32Buffer.length, sampleRate);
+             audioBuffer.copyToChannel(float32Buffer, 0);
+             let scheduledTime = playbackCtx.currentTime > nextPlaybackTime ? playbackCtx.currentTime : nextPlaybackTime;
+             const source = playbackCtx.createBufferSource();
+             source.buffer = audioBuffer;
+             source.connect(playbackCtx.destination);
+             source.start(scheduledTime);
+             nextPlaybackTime = scheduledTime + audioBuffer.duration;
+             logMessage("[Audio] Scheduled playback. Start time:", scheduledTime, "Duration:", audioBuffer.duration);
+           } else if (data.type === "text" && data.content) {
+             logMessage("[Text] Received:", data.content);
+           } else {
+             logMessage("[WebSocket] Received:", event.data);
+           }
+         } catch (err) {
+           logMessage("[WebSocket] Error processing message:", err);
+         }
+       };
+     }
+
+     async function startCapture() {
+       if (!socket || socket.readyState !== WebSocket.OPEN) {
+         logMessage("WebSocket not connected. Click 'Connect WebSocket' first.");
+         return;
+       }
+       if (isCapturing) {
+         logMessage("Already capturing!");
+         return;
+       }
+       isCapturing = true;
+       logMessage("Starting microphone capture as raw PCM...");
+
+       try {
+         micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
+         audioCtx = new (window.AudioContext || window.webkitAudioContext)();
+
+         // Create a media source from the mic stream
+         const source = audioCtx.createMediaStreamSource(micStream);
+
+         // Create a ScriptProcessorNode
+         const bufferSize = 4096; // You can adjust this
+         const inputChannels = 1;
+         const outputChannels = 1;
+         scriptNode = audioCtx.createScriptProcessor(bufferSize, inputChannels, outputChannels);
+
+         scriptNode.onaudioprocess = (audioEvent) => {
+           if (!isCapturing) return;
+
+           // Get raw floating-point samples [ -1.0 .. +1.0 ]
+           const inputBuffer = audioEvent.inputBuffer.getChannelData(0);
+           // Convert float samples to 16-bit signed
+           const pcm16 = floatTo16BitPCM(inputBuffer);
+
+           // Encode as base64 and send over WebSocket
+           const bytes = new Uint8Array(pcm16.buffer);
+           const b64 = btoa(String.fromCharCode(...bytes));
+           socket.send(JSON.stringify({
+             type: "audio",
+             payload: b64
+           }));
+         };
+
+         // Connect the pipeline: mic -> script -> (optional) audioCtx.destination
+         source.connect(scriptNode);
+         scriptNode.connect(audioCtx.destination);
+
+         logMessage("Recording...");
+       } catch (err) {
+         logMessage("Error getting user mic:", err);
+       }
+     }
+
+     function stopCapture() {
+       if (!isCapturing) return;
+       isCapturing = false;
+       logMessage("Stopped microphone capture.");
+
+       if (scriptNode) {
+         scriptNode.disconnect();
+         scriptNode.onaudioprocess = null;
+         scriptNode = null;
+       }
+       if (micStream) {
+         // Stop all tracks
+         micStream.getTracks().forEach(track => track.stop());
+         micStream = null;
+       }
+       if (audioCtx) {
+         audioCtx.close();
+         audioCtx = null;
+       }
+     }
+
+     function floatTo16BitPCM(floatSamples) {
+       // Convert an array of floats [-1, 1] to an Int16Array
+       const out = new Int16Array(floatSamples.length);
+       for (let i = 0; i < floatSamples.length; i++) {
+         let s = Math.max(-1, Math.min(1, floatSamples[i]));
+         // scale range
+         s = s < 0 ? s * 0x8000 : s * 0x7FFF;
+         out[i] = s;
+       }
+       return out;
+     }
+
+     function sendText() {
+       const textInput = document.getElementById("textMessage");
+       const text = textInput.value.trim();
+       if (text && socket && socket.readyState === WebSocket.OPEN) {
+         socket.send(JSON.stringify({ type: "text", content: text }));
+         logMessage("[Text] Sent:", text);
+         textInput.value = "";
+       } else {
+         logMessage("WebSocket not connected or text is empty.");
+       }
+     }
+
+     function base64ToArrayBuffer(b64) {
+       const binaryString = window.atob(b64);
+       const len = binaryString.length;
+       const bytes = new Uint8Array(len);
+       for (let i = 0; i < len; i++) {
+         bytes[i] = binaryString.charCodeAt(i);
+       }
+       return bytes.buffer;
+     }
+
+   </script>
+
+ </body>
+ </html>
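
The float-to-PCM conversion above is worth a second look: negative samples are scaled by 0x8000 (32768) and positive ones by 0x7FFF (32767), so both endpoints land exactly on the int16 range. A quick Python mirror of `floatTo16BitPCM` for sanity-checking (a verification sketch, not part of the commit):

```
def float_to_16bit_pcm(samples):
    """Mirror of floatTo16BitPCM from the demo page."""
    out = []
    for s in samples:
        s = max(-1.0, min(1.0, s))                # clamp to [-1, 1]
        s = s * 0x8000 if s < 0 else s * 0x7FFF   # -1 -> -32768, +1 -> 32767
        out.append(int(s))
    return out

assert float_to_16bit_pcm([-1.0, 0.0, 1.0]) == [-32768, 0, 32767]
```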
index.html ADDED
@@ -0,0 +1,550 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+   <meta charset="UTF-8" />
+   <title>Gemini Live Chat - Voice and Text Interaction</title>
+   <style>
+     body {
+       max-width: 800px;
+       margin: 2em auto;
+       padding: 0 1em;
+       font-family: system-ui, -apple-system, sans-serif;
+     }
+     #visualizer {
+       width: 100%;
+       height: 80px;
+       background: #f0f0f0;
+       border-radius: 4px;
+       margin: 0;
+     }
+     #log {
+       background: #f0f0f0;
+       padding: 1em;
+       border-radius: 4px;
+       font-family: monospace;
+       max-height: 400px;
+       overflow-y: auto;
+     }
+     .controls {
+       margin: 1em 0;
+       padding: 1em;
+       background: #f8f8f8;
+       border-radius: 4px;
+     }
+     .function-card {
+       padding: 0.8em;
+       background: white;
+       border-radius: 4px;
+       box-shadow: 0 1px 3px rgba(0,0,0,0.1);
+     }
+     .function-card strong {
+       color: #1976d2;
+     }
+     .function-card ul {
+       color: #555;
+     }
+     button {
+       border: none;
+       padding: 0.5em 1em;
+       border-radius: 3px;
+       cursor: pointer;
+       transition: opacity 0.2s;
+     }
+     button:hover {
+       opacity: 0.9;
+     }
+     #connectButton {
+       background: #2196f3;
+       color: white;
+     }
+     .voice-start {
+       background: #4caf50;
+       color: white;
+     }
+     .voice-stop {
+       background: #f44336;
+       color: white;
+     }
+   </style>
+ </head>
+ <body>
+
+   <h1>Gemini Live Chat</h1>
+   <p>Interactive voice and text chat powered by Gemini AI, with support for server-side function calling, code execution, and Google Search.</p>
+
+   <div class="controls" style="background: #e3f2fd;">
+     <h3 style="margin-top: 0;">Available Functions:</h3>
+     <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 1em;">
+       <div class="function-card">
+         <strong>💡 Light Control</strong>
+         <ul style="margin: 0.5em 0; padding-left: 1.5em;">
+           <li>Turn lights on</li>
+           <li>Turn lights off</li>
+         </ul>
+       </div>
+       <div class="function-card">
+         <strong>🔍 Search</strong>
+         <ul style="margin: 0.5em 0; padding-left: 1.5em;">
+           <li>Google search</li>
+         </ul>
+       </div>
+       <div class="function-card">
+         <strong>💻 Code</strong>
+         <ul style="margin: 0.5em 0; padding-left: 1.5em;">
+           <li>Execute code</li>
+           <li>Run commands</li>
+         </ul>
+       </div>
+     </div>
+     <p style="margin: 0.5em 0 0 0; font-size: 0.9em; color: #666;">
+       Try saying: "Turn on the lights" or "Search for weather in London", or ask any question!
+     </p>
+   </div>
+
+   <div class="controls">
+     <div style="display: flex; align-items: center; justify-content: space-between; gap: 1em;">
+       <div style="display: flex; align-items: center; gap: 1em;">
+         <div>
+           <span style="font-weight: 500; color: #666;">Server:</span>
+           <span id="connectionStatus" style="display: inline-block; padding: 0.3em 0.6em; margin-left: 0.5em; border-radius: 3px; background: #f44336; color: white;">Not connected</span>
+         </div>
+         <div id="micStatus" style="display: none;">
+           <span style="font-weight: 500; color: #666;">Voice:</span>
+           <span style="display: inline-block; padding: 0.3em 0.6em; margin-left: 0.5em; border-radius: 3px; background: #4caf50; color: white;">Recording</span>
+         </div>
+       </div>
+       <div style="display: flex; gap: 0.5em;">
+         <button id="connectButton" onclick="toggleConnection()">
+           <span style="margin-right: 0.3em;">🔌</span> Connect to Server
+         </button>
+         <button id="voiceStartButton" class="voice-start" onclick="startCapture()">
+           <span style="margin-right: 0.3em;">🎤</span> Start Voice Chat
+         </button>
+         <button id="voiceStopButton" class="voice-stop" onclick="stopCapture()" style="display: none;">
+           <span style="margin-right: 0.3em;">⏹️</span> Stop Voice Chat
+         </button>
+       </div>
+     </div>
+     <div style="margin-top: 1em; display: flex; gap: 0.5em;">
+       <input type="text" id="textMessage" placeholder="Type your message here" style="flex: 1; padding: 0.5em; border: 1px solid #ddd; border-radius: 3px;" />
+       <button onclick="sendText()" style="white-space: nowrap;">
+         <span style="margin-right: 0.3em;">📤</span> Send
+       </button>
+     </div>
+   </div>
+
+   <div class="controls">
+     <canvas id="visualizer"></canvas>
+   </div>
+
+   <div style="margin: 1em 0;">
+     <strong>Log Settings:</strong><br>
+     <label><input type="checkbox" id="logWebSocket" checked> WebSocket Events</label>
+     <label style="margin-left: 1em"><input type="checkbox" id="logAudio" checked> Audio Events</label>
+     <label style="margin-left: 1em"><input type="checkbox" id="logText" checked> Text Events</label>
+     <label style="margin-left: 1em"><input type="checkbox" id="logError" checked> Error Events</label>
+   </div>
+
+   <pre id="log"></pre>
+
+   <script>
+     let socket;
+     let playbackCtx = null;
+     let nextPlaybackTime = 0;
+     let audioCtx;
+     let scriptNode;
+     let micStream;
+     let isCapturing = false;
+     let audioSeq = 0;
+     let scheduledSources = []; // Track scheduled audio sources
+     let analyser;
+     let visualizerCanvas;
+     let visualizerCtx;
+     let animationFrame;
+
+     function updateConnectionStatus(connected) {
+       const statusEl = document.getElementById('connectionStatus');
+       const connectButton = document.getElementById('connectButton');
+       const voiceStartButton = document.getElementById('voiceStartButton');
+
+       if (connected) {
+         statusEl.textContent = 'Connected';
+         statusEl.style.background = '#4caf50';
+         connectButton.textContent = '🔌 Disconnect Server';
+         voiceStartButton.style.display = '';
+       } else {
+         statusEl.textContent = 'Not connected';
+         statusEl.style.background = '#f44336';
+         connectButton.textContent = '🔌 Connect to Server';
+         voiceStartButton.style.display = 'none';
+         // Also stop recording if we're disconnected
+         if (isCapturing) {
+           stopCapture();
+         }
+       }
+     }
+
+     function updateMicStatus(recording) {
+       const micStatus = document.getElementById('micStatus');
+       const voiceStartButton = document.getElementById('voiceStartButton');
+       const voiceStopButton = document.getElementById('voiceStopButton');
+
+       if (recording) {
+         micStatus.style.display = '';
+         voiceStartButton.style.display = 'none';
+         voiceStopButton.style.display = '';
+       } else {
+         micStatus.style.display = 'none';
+         voiceStartButton.style.display = '';
+         voiceStopButton.style.display = 'none';
+       }
+     }
+
+     function toggleConnection() {
+       if (socket && socket.readyState === WebSocket.OPEN) {
+         socket.close();
+       } else {
+         connectWebSocket();
+       }
+     }
+
+     function logMessage(category, ...args) {
+       const pre = document.getElementById("log");
+       const logCategory = document.getElementById(`log${category.charAt(0).toUpperCase() + category.slice(1)}`);
+       const shouldLog = logCategory ? logCategory.checked : false;
+
+       if (shouldLog) {
+         const timestamp = new Date().toLocaleTimeString();
+         pre.textContent += `[${timestamp}] [${category}] ` + args.join(" ") + "\n";
+         console.log(`[${category}]`, ...args);
+       }
+     }
+
+     function clearScheduledAudio() {
+       // Stop and disconnect all scheduled audio sources
+       while (scheduledSources.length > 0) {
+         const source = scheduledSources.pop();
+         try {
+           source.stop();
+           source.disconnect();
+         } catch (err) {
+           // Ignore errors if source already finished playing
+         }
+       }
+       // Reset next playback time
+       if (playbackCtx) {
+         nextPlaybackTime = playbackCtx.currentTime;
+       }
+       logMessage("Audio", "Cleared all scheduled audio");
+     }
+
+     function setupVisualizer() {
+       visualizerCanvas = document.getElementById('visualizer');
+       visualizerCtx = visualizerCanvas.getContext('2d');
+
+       // Make canvas resolution match display size
+       const rect = visualizerCanvas.getBoundingClientRect();
+       visualizerCanvas.width = rect.width;
+       visualizerCanvas.height = rect.height;
+
+       if (!analyser && playbackCtx) {
+         analyser = playbackCtx.createAnalyser();
+         analyser.fftSize = 2048;
+       }
+     }
+
+     function drawVisualizer() {
+       if (!analyser) return;
+
+       const bufferLength = analyser.frequencyBinCount;
+       const dataArray = new Uint8Array(bufferLength);
+       analyser.getByteTimeDomainData(dataArray);
+
+       visualizerCtx.fillStyle = '#f0f0f0';
+       visualizerCtx.fillRect(0, 0, visualizerCanvas.width, visualizerCanvas.height);
+
+       visualizerCtx.lineWidth = 2;
+       visualizerCtx.strokeStyle = '#4caf50';
+       visualizerCtx.beginPath();
+
+       const sliceWidth = visualizerCanvas.width / bufferLength;
+       let x = 0;
+
+       for (let i = 0; i < bufferLength; i++) {
+         const v = dataArray[i] / 128.0;
+         const y = v * visualizerCanvas.height / 2;
+
+         if (i === 0) {
+           visualizerCtx.moveTo(x, y);
+         } else {
+           visualizerCtx.lineTo(x, y);
+         }
+
+         x += sliceWidth;
+       }
+
+       visualizerCtx.lineTo(visualizerCanvas.width, visualizerCanvas.height / 2);
+       visualizerCtx.stroke();
+
+       animationFrame = requestAnimationFrame(drawVisualizer);
+     }
+
+     function stopVisualizer() {
+       if (animationFrame) {
+         cancelAnimationFrame(animationFrame);
+         animationFrame = null;
+       }
+       if (visualizerCtx) {
+         visualizerCtx.fillStyle = '#f0f0f0';
+         visualizerCtx.fillRect(0, 0, visualizerCanvas.width, visualizerCanvas.height);
+       }
+     }
+
+     function connectWebSocket() {
+       logMessage("WebSocket", "Connecting...");
+       updateConnectionStatus(false);
+
+       // Use current origin and replace http(s) with ws(s)
+       const wsUrl = `${window.location.protocol === 'https:' ? 'wss:' : 'ws:'}//${window.location.host}/ws`;
+       socket = new WebSocket(wsUrl);
+
+       socket.onopen = () => {
+         logMessage("WebSocket", "Opened connection");
+         updateConnectionStatus(true);
+         if (!playbackCtx) {
+           playbackCtx = new (window.AudioContext || window.webkitAudioContext)();
+           setupVisualizer();
+         }
+         nextPlaybackTime = playbackCtx.currentTime;
+       };
+
+       socket.onerror = (err) => {
+         logMessage("Error", "WebSocket error:", err);
+         updateConnectionStatus(false);
+       };
+
+       socket.onclose = () => {
+         logMessage("WebSocket", "Connection closed");
+         updateConnectionStatus(false);
+         if (isCapturing) {
+           stopCapture();
+         }
+       };
+
+       socket.onmessage = (event) => {
+         try {
+           const data = JSON.parse(event.data);
+           if (data.type === "audio" && data.payload) {
+             const arrayBuffer = base64ToArrayBuffer(data.payload);
+             const int16View = new Int16Array(arrayBuffer);
+             const float32Buffer = new Float32Array(int16View.length);
+             for (let i = 0; i < int16View.length; i++) {
+               float32Buffer[i] = int16View[i] / 32768;
+             }
+             const sampleRate = 24000; // RECEIVE_SAMPLE_RATE from app.py
+             const audioBuffer = playbackCtx.createBuffer(1, float32Buffer.length, sampleRate);
+             audioBuffer.copyToChannel(float32Buffer, 0);
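+             // Schedule this chunk at max(now, end of the previously queued
+             // chunk) so consecutive buffers play back-to-back without gaps
+             // or overlap.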
+             let scheduledTime = playbackCtx.currentTime > nextPlaybackTime ? playbackCtx.currentTime : nextPlaybackTime;
+             const source = playbackCtx.createBufferSource();
+             source.buffer = audioBuffer;
+
+             // Connect through analyser for visualization
+             if (analyser) {
+               source.connect(analyser);
+               analyser.connect(playbackCtx.destination);
+               if (!animationFrame) {
+                 drawVisualizer();
+               }
+             } else {
+               source.connect(playbackCtx.destination);
+             }
+
+             source.start(scheduledTime);
+             // Add source to tracked sources
+             scheduledSources.push(source);
+             // Remove source from tracking once it finishes
+             source.onended = () => {
+               const index = scheduledSources.indexOf(source);
+               if (index > -1) {
+                 scheduledSources.splice(index, 1);
+               }
+               // Stop visualizer if no more audio
+               if (scheduledSources.length === 0) {
+                 stopVisualizer();
+               }
+             };
+             nextPlaybackTime = scheduledTime + audioBuffer.duration;
+             logMessage("Audio", "Scheduled playback. Start time:", scheduledTime, "Duration:", audioBuffer.duration);
+           } else if (data.type === "text" && data.content) {
+             logMessage("Text", "Received:", data.content);
+           } else {
+             logMessage("WebSocket", "Received message:", event.data);
+           }
+         } catch (err) {
+           logMessage("Error", "Failed to process message:", err);
+         }
+       };
+     }
+
+     async function startCapture() {
+       if (!socket || socket.readyState !== WebSocket.OPEN) {
+         logMessage("WebSocket", "Not connected. Click 'Connect to Server' first.");
+         return;
+       }
+       if (isCapturing) {
+         logMessage("Audio", "Already capturing!");
+         return;
+       }
+
+       isCapturing = true;
+       updateMicStatus(true);
+       logMessage("Audio", "Starting microphone capture...");
+
+       try {
+         micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
+         logMessage("Audio", "Got microphone access");
+         audioCtx = new (window.AudioContext || window.webkitAudioContext)();
+         logMessage("Audio", "Created AudioContext with sample rate:", audioCtx.sampleRate);
+
+         // Create a media source from the mic stream
+         const source = audioCtx.createMediaStreamSource(micStream);
+         logMessage("Audio", "Created MediaStreamSource");
+
+         // Create a ScriptProcessorNode
+         const bufferSize = 4096; // You can adjust this
+         const inputChannels = 1;
+         const outputChannels = 1;
+         scriptNode = audioCtx.createScriptProcessor(bufferSize, inputChannels, outputChannels);
+         logMessage("Audio", "Created ScriptProcessorNode with buffer size:", bufferSize);
+
+         scriptNode.onaudioprocess = (audioEvent) => {
+           if (!isCapturing) return;
+
+           // Get raw samples and resample to 16 kHz
+           const inputBuffer = audioEvent.inputBuffer.getChannelData(0);
+
+           // Check if there's actual audio input (not just silence)
+           const hasAudio = inputBuffer.some(sample => Math.abs(sample) > 0.01); // Threshold for noise
+           if (hasAudio) {
+             clearScheduledAudio(); // Only clear when we detect actual audio input
+           }
+
+           const resampled = resampleAudio(inputBuffer, audioCtx.sampleRate, 16000);
+
+           // Convert resampled audio to 16-bit PCM
+           const pcm16 = floatTo16BitPCM(resampled);
+
+           // Encode as base64 and send over WebSocket
+           const bytes = new Uint8Array(pcm16.buffer);
+           const b64 = btoa(String.fromCharCode(...bytes));
+           const audioMsg = {
+             type: "audio",
+             payload: b64,
+             seq: audioSeq++,
+             config: {
+               sampleRate: 16000,
+               bitDepth: 16,
+               channels: 1
+             }
+           };
+           logMessage("Audio", "Processing chunk. Seq:", audioMsg.seq);
+           try {
+             if (socket.readyState === WebSocket.OPEN) {
+               socket.send(JSON.stringify(audioMsg));
+             } else {
+               logMessage("WebSocket", "Not open, stopping capture");
+               stopCapture();
+             }
+           } catch (err) {
+             logMessage("Error", "Failed to send audio:", err);
+             stopCapture();
+           }
+         };
+
+         // Connect the pipeline: mic -> script -> (optional) audioCtx.destination
+         source.connect(scriptNode);
+         logMessage("Audio", "Connected audio pipeline");
+
+         logMessage("Audio", "Recording...");
+       } catch (err) {
+         logMessage("Error", "Failed to get microphone access:", err);
+         isCapturing = false;
+       }
+     }
+
+     function stopCapture() {
+       if (!isCapturing) return;
+       isCapturing = false;
+       updateMicStatus(false);
+       logMessage("Audio", "Stopped microphone capture");
+
+       if (scriptNode) {
+         scriptNode.disconnect();
+         scriptNode.onaudioprocess = null;
+         scriptNode = null;
+       }
+       if (micStream) {
+         // Stop all tracks
+         micStream.getTracks().forEach(track => track.stop());
+         micStream = null;
+       }
+       if (audioCtx) {
+         audioCtx.close();
+         audioCtx = null;
+       }
+     }
+
+     function floatTo16BitPCM(floatSamples) {
+       // Convert an array of floats [-1, 1] to an Int16Array
+       const out = new Int16Array(floatSamples.length);
+       for (let i = 0; i < floatSamples.length; i++) {
+         let s = Math.max(-1, Math.min(1, floatSamples[i]));
+         // scale range
+         s = s < 0 ? s * 0x8000 : s * 0x7FFF;
+         out[i] = s;
+       }
+       return out;
+     }
+
+     function resampleAudio(inputBuffer, fromRate, toRate) {
+       // Nearest-neighbor resampling; crude, but adequate for speech input.
+       const ratio = toRate / fromRate;
+       const newLength = Math.round(inputBuffer.length * ratio);
+       const resampled = new Float32Array(newLength);
+
+       for (let i = 0; i < newLength; i++) {
+         const index = Math.round(i / ratio);
+         resampled[i] = inputBuffer[Math.min(index, inputBuffer.length - 1)];
+       }
+       return resampled;
+     }
+
+     function sendText() {
+       const textInput = document.getElementById("textMessage");
+       const text = textInput.value.trim();
+       if (text && socket && socket.readyState === WebSocket.OPEN) {
+         // Clear any scheduled audio before sending text
+         clearScheduledAudio();
+
+         socket.send(JSON.stringify({ type: "text", content: text }));
+         logMessage("Text", "Sent:", text);
+         textInput.value = "";
+       } else {
+         logMessage("WebSocket", "Not connected or text is empty");
+       }
+     }
+
+     function base64ToArrayBuffer(b64) {
+       const binaryString = window.atob(b64);
+       const len = binaryString.length;
+       const bytes = new Uint8Array(len);
+       for (let i = 0; i < len; i++) {
+         bytes[i] = binaryString.charCodeAt(i);
+       }
+       return bytes.buffer;
+     }
+
+   </script>
+
+ </body>
+ </html>
requirements.txt ADDED
@@ -0,0 +1,138 @@
+ aiofiles==23.2.1
+ aiohappyeyeballs==2.4.4
+ aiohttp==3.11.11
+ aiohttp-retry==2.9.1
+ aioice==0.9.0
+ aiortc==1.10.0
+ aiosignal==1.3.2
+ altair==5.5.0
+ annotated-types==0.7.0
+ anyio==4.8.0
+ attrs==24.3.0
+ av==13.1.0
+ blinker==1.9.0
+ cachetools==5.5.0
+ certifi==2024.8.30
+ cffi==1.17.1
+ charset-normalizer==3.4.0
+ click==8.1.7
+ colorama==0.4.6
+ contourpy==1.3.1
+ cryptography==44.0.0
+ cycler==0.12.1
+ dnspython==2.7.0
+ exceptiongroup==1.2.0
+ fastapi==0.115.8
+ ffmpeg-python==0.2.0
+ ffmpy==0.5.0
+ filelock==3.17.0
+ fonttools==4.55.8
+ frozenlist==1.5.0
+ fsspec==2024.12.0
+ future==1.0.0
+ gitdb==4.0.11
+ GitPython==3.1.43
+ google-ai-generativelanguage==0.6.15
+ google-api-core==2.24.1
+ google-api-python-client==2.160.0
+ google-auth==2.37.0
+ google-auth-httplib2==0.2.0
+ google-crc32c==1.6.0
+ google-genai==0.2.2
+ google-generativeai==0.8.4
+ googleapis-common-protos==1.66.0
+ gradio==5.14.0
+ gradio_client==1.7.0
+ gradio_webrtc==0.0.30
+ grpcio==1.70.0
+ grpcio-status==1.70.0
+ h11==0.14.0
+ httpcore==1.0.7
+ httplib2==0.22.0
+ httpx==0.28.1
+ huggingface-hub==0.28.1
+ idna==3.10
+ ifaddr==0.2.0
+ importlib_resources==6.5.2
+ Jinja2==3.1.4
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ keyboard==0.13.5
+ kiwisolver==1.4.8
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.10.0
+ mdurl==0.1.2
+ MouseInfo==0.1.3
+ mss==10.0.0
+ multidict==6.1.0
+ narwhals==1.19.0
+ numpy==1.26.4
+ opencv-python==4.10.0.84
+ orjson==3.10.15
+ packaging==24.2
+ pandas==2.2.3
+ pillow==10.4.0
+ plotly==5.24.1
+ propcache==0.2.1
+ proto-plus==1.26.0
+ protobuf==5.29.2
+ pyarrow==18.1.0
+ pyasn1==0.6.1
+ pyasn1_modules==0.4.1
+ PyAudio==0.2.14
+ PyAutoGUI==0.9.54
+ pycparser==2.22
+ pydantic==2.10.3
+ pydantic_core==2.27.1
+ pydeck==0.9.1
+ pydub==0.25.1
+ pyee==12.1.1
+ PyGetWindow==0.0.9
+ Pygments==2.18.0
+ PyJWT==2.10.1
+ pylibsrtp==0.10.0
+ PyMsgBox==1.0.9
+ pyngrok==7.2.2
+ pyOpenSSL==25.0.0
+ pyparsing==3.2.1
+ pyperclip==1.9.0
+ PyRect==0.2.0
+ PyScreeze==1.0.1
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.0.1
+ python-multipart==0.0.20
+ pytweening==1.2.0
+ pytz==2024.2
+ PyYAML==6.0.2
+ referencing==0.35.1
+ requests==2.32.3
+ rich==13.9.4
+ rpds-py==0.22.3
+ rsa==4.9
+ ruff==0.9.4
+ safehttpx==0.1.6
+ semantic-version==2.10.0
+ setuptools==75.8.0
+ shellingham==1.5.4
+ simpleaudio==1.0.4
+ six==1.17.0
+ smmap==5.0.1
+ sniffio==1.3.1
+ starlette==0.45.3
+ streamlit==1.41.1
+ tenacity==9.0.0
+ toml==0.10.2
+ tomlkit==0.13.2
+ tornado==6.4.2
+ tqdm==4.67.1
+ twilio==9.4.4
+ typer==0.15.1
+ typing_extensions==4.12.2
+ tzdata==2024.2
+ uritemplate==4.1.1
+ urllib3==2.2.3
+ uvicorn==0.34.0
+ watchdog==6.0.0
+ websockets==14.2
+ yarl==1.18.3
webapp.py ADDED
@@ -0,0 +1,144 @@
+ # webapp.py
+
+ import asyncio
+ import base64
+ import json
+ import os
+
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
+ from fastapi.responses import HTMLResponse
+ from fastapi.staticfiles import StaticFiles
+ import uvicorn
+
+ from handler import AudioLoop  # The Gemini client defined in handler.py
+
+ app = FastAPI()
+
+ # Serve the files in this directory as static assets under /web_ui
+ current_dir = os.path.dirname(os.path.realpath(__file__))
+ app.mount("/web_ui", StaticFiles(directory=current_dir), name="web_ui")
+
+ @app.get("/")
+ async def get_index():
+     # Read and return the index.html file
+     index_path = os.path.join(current_dir, "index.html")
+     with open(index_path, "r", encoding="utf-8") as f:
+         html_content = f.read()
+     return HTMLResponse(content=html_content)
+
+ @app.websocket("/ws")
+ async def websocket_endpoint(websocket: WebSocket):
+     await websocket.accept()
+     print("[websocket_endpoint] Client connected.")
+
+     # Create a new AudioLoop instance for this client
+     audio_loop = AudioLoop()
+     audio_ordering_buffer = {}
+     expected_audio_seq = 0
+
+     # Start the AudioLoop for this client
+     loop_task = asyncio.create_task(audio_loop.run())
+     print("[websocket_endpoint] Started new AudioLoop for client")
+
+     async def from_client_to_gemini():
+         """Handles incoming messages from the client and forwards them to Gemini."""
+         nonlocal audio_ordering_buffer, expected_audio_seq
+         try:
+             while True:
+                 data = await websocket.receive_text()
+                 msg = json.loads(data)
+                 msg_type = msg.get("type")
+
+                 # print("[from_client_to_gemini] Received message from client:", msg)
+
+                 # Handle audio data from client
+                 if msg_type == "audio":
+                     raw_pcm = base64.b64decode(msg["payload"])
+                     forward_msg = {
+                         "realtime_input": {
+                             "media_chunks": [
+                                 {
+                                     "data": base64.b64encode(raw_pcm).decode(),
+                                     "mime_type": "audio/pcm"
+                                 }
+                             ]
+                         }
+                     }
+                     # Retrieve the sequence number from the message
+                     seq = msg.get("seq")
+                     if seq is not None:
+                         # Store the message in the buffer
+                         audio_ordering_buffer[seq] = forward_msg
+                         # Forward any messages in order
+                         while expected_audio_seq in audio_ordering_buffer:
+                             msg_to_forward = audio_ordering_buffer.pop(expected_audio_seq)
+                             await audio_loop.out_queue.put(msg_to_forward)
+                             expected_audio_seq += 1
+                     else:
+                         # If no sequence number is provided, forward immediately
+                         await audio_loop.out_queue.put(forward_msg)
+
+                 # Handle text data from client
+                 elif msg_type == "text":
+                     user_text = msg.get("content", "")
+                     print("[from_client_to_gemini] Forwarding user text to Gemini:", user_text)
+                     forward_msg = {
+                         "client_content": {
+                             "turn_complete": True,
+                             "turns": [
+                                 {
+                                     "role": "user",
+                                     "parts": [
+                                         {"text": user_text}
+                                     ]
+                                 }
+                             ]
+                         }
+                     }
+                     await audio_loop.out_queue.put(forward_msg)
+
+                 else:
+                     print("[from_client_to_gemini] Unknown message type:", msg_type)
+
+         except WebSocketDisconnect:
+             print("[from_client_to_gemini] Client disconnected.")
+         except Exception as e:
+             print("[from_client_to_gemini] Error:", e)
+
+     async def from_gemini_to_client():
+         """Reads PCM audio from Gemini and sends it back to the client."""
+         try:
+             while True:
+                 pcm_data = await audio_loop.audio_in_queue.get()
+                 b64_pcm = base64.b64encode(pcm_data).decode()
+
+                 out_msg = {
+                     "type": "audio",
+                     "payload": b64_pcm
+                 }
+                 print("[from_gemini_to_client] Sending audio chunk to client. Size:", len(pcm_data))
+                 await websocket.send_text(json.dumps(out_msg))
+
+         except WebSocketDisconnect:
+             print("[from_gemini_to_client] Client disconnected.")
+         except Exception as e:
+             print("[from_gemini_to_client] Error:", e)
+
+     # Launch both tasks concurrently. If either fails or disconnects, we exit.
+     try:
+         await asyncio.gather(
+             from_client_to_gemini(),
+             from_gemini_to_client(),
+         )
+     finally:
+         print("[websocket_endpoint] WebSocket handler finished.")
+         # Clean up the AudioLoop when the client disconnects
+         loop_task.cancel()
+         try:
+             await loop_task
+         except asyncio.CancelledError:
+             pass
+         print("[websocket_endpoint] Cleaned up AudioLoop for client")
+
+ if __name__ == "__main__":
+     uvicorn.run("webapp:app", host="0.0.0.0", port=8000, reload=True)
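
The browser pages above are the intended clients, but the `/ws` protocol is simple enough to exercise from a script. A minimal sketch of a text-only client, assuming the server is running locally on port 8000 (it uses the `websockets` package already pinned in requirements.txt, and simply collects about five seconds of the 24 kHz, 16-bit mono reply audio):

```
import asyncio
import base64
import json

import websockets


async def main():
    # Connect to the FastAPI endpoint defined in webapp.py
    async with websockets.connect("ws://localhost:8000/ws") as ws:
        # Send one text turn using the {"type": "text", ...} envelope
        await ws.send(json.dumps({"type": "text", "content": "Hello, Gemini!"}))

        # The server replies with {"type": "audio", "payload": <base64 PCM>}
        # messages; accumulate roughly five seconds of raw PCM.
        pcm = b""
        while len(pcm) < 24000 * 2 * 5:
            reply = json.loads(await ws.recv())
            if reply.get("type") == "audio":
                pcm += base64.b64decode(reply["payload"])

        with open("reply.pcm", "wb") as f:
            f.write(pcm)


asyncio.run(main())
```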