Nirav Madhani committed on
Commit
265e66c
·
1 Parent(s): 87717b1

Fixed requirements and image

Browse files
Files changed (4) hide show
  1. Dockerfile +23 -5
  2. app.py +0 -302
  3. requirements.txt +4 -0
  4. webapp.py +1 -1
Dockerfile CHANGED
@@ -1,14 +1,32 @@
1
  FROM python:3.12.6
2
- RUN apt-get update && apt-get install -y portaudio19-dev
 
 
 
 
 
 
 
 
3
  RUN useradd -m -u 1000 user
4
- USER user
5
- ENV PATH="/home/user/.local/bin:$PATH"
6
 
7
  WORKDIR /app
8
 
9
- COPY --chown=user ./requirements.txt requirements.txt
 
10
 
 
 
 
 
 
11
  RUN pip install --no-cache-dir -r requirements.txt
12
 
 
13
  COPY --chown=user . /app
14
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
 
 
 
 
1
  FROM python:3.12.6
2
+
3
+ # Install OpenGL dependencies and PortAudio
4
+ USER root
5
+ RUN apt-get update && apt-get install -y \
6
+ libgl1 \
7
+ libglib2.0-0 \
8
+ portaudio19-dev
9
+
10
+ # Create non-root user
11
  RUN useradd -m -u 1000 user
 
 
12
 
13
  WORKDIR /app
14
 
15
+ # Set PATH for the non-root user
16
+ ENV PATH="/home/user/.local/bin:$PATH"
17
 
18
+ # Switch to non-root user
19
+ USER user
20
+
21
+ # Copy and install Python dependencies
22
+ COPY --chown=user ./requirements.txt requirements.txt
23
  RUN pip install --no-cache-dir -r requirements.txt
24
 
25
+ # Copy application files
26
  COPY --chown=user . /app
27
+
28
+ # Expose the Hugging Face Spaces default port
29
+ EXPOSE 7860
30
+
31
+ # Set the command to run the app
32
+ CMD ["python", "webapp.py"]
app.py DELETED
@@ -1,302 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- # Copyright 2023 Google LLC
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- """
17
- ## Setup
18
-
19
- To install the dependencies for this script, run:
20
-
21
- ```
22
- pip install google-genai opencv-python pyaudio pillow mss
23
- ```
24
-
25
- Before running this script, ensure the `GOOGLE_API_KEY` environment
26
- variable is set to the api-key you obtained from Google AI Studio.
27
-
28
- Important: **Use headphones**. This script uses the system default audio
29
- input and output, which often won't include echo cancellation. So to prevent
30
- the model from interrupting itself it is important that you use headphones.
31
-
32
- ## Run
33
-
34
- To run the script:
35
-
36
- ```
37
- python live_api_starter.py
38
- ```
39
-
40
- The script takes a video-mode flag `--mode`, this can be "camera", "screen", or "none".
41
- The default is "camera". To share your screen run:
42
-
43
- ```
44
- python live_api_starter.py --mode screen
45
- ```
46
- """
47
-
48
- import asyncio
49
- import base64
50
- import json
51
- import io
52
- import os
53
- import sys
54
- import traceback
55
-
56
- import cv2
57
- import pyaudio
58
- import PIL.Image
59
- import mss
60
- import argparse
61
-
62
- from websockets.asyncio.client import connect
63
-
64
- if sys.version_info < (3, 11, 0):
65
- import taskgroup, exceptiongroup
66
-
67
- asyncio.TaskGroup = taskgroup.TaskGroup
68
- asyncio.ExceptionGroup = exceptiongroup.ExceptionGroup
69
-
70
- FORMAT = pyaudio.paInt16
71
- CHANNELS = 1
72
- SEND_SAMPLE_RATE = 16000
73
- RECEIVE_SAMPLE_RATE = 24000
74
- CHUNK_SIZE = 512
75
-
76
- host = "generativelanguage.googleapis.com"
77
- model = "gemini-2.0-flash-exp"
78
- DEFAULT_MODE="none"
79
-
80
-
81
- api_key = os.environ["GOOGLE_API_KEY"]
82
- uri = f"wss://{host}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={api_key}"
83
-
84
-
85
- class AudioLoop:
86
- def __init__(self, video_mode=DEFAULT_MODE):
87
- self.video_mode=video_mode
88
- self.audio_in_queue = None
89
- self.out_queue = None
90
-
91
- self.ws = None
92
- self.audio_stream = None
93
-
94
- async def startup(self):
95
- setup_msg = {"setup": {"model": f"models/{model}"}}
96
- await self.ws.send(json.dumps(setup_msg))
97
- raw_response = await self.ws.recv(decode=False)
98
- setup_response = json.loads(raw_response.decode("ascii"))
99
-
100
- async def send_text(self):
101
- while True:
102
- text = await asyncio.to_thread(input, "message > ")
103
- if text.lower() == "q":
104
- break
105
-
106
- msg = {
107
- "client_content": {
108
- "turn_complete": True,
109
- "turns": [{"role": "user", "parts": [{"text": text}]}],
110
- }
111
- }
112
- await self.ws.send(json.dumps(msg))
113
-
114
- def _get_frame(self, cap):
115
- # Read the frame
116
- ret, frame = cap.read()
117
- # Check if the frame was read successfully
118
- if not ret:
119
- return None
120
-
121
- # Fix: Convert BGR to RGB color space
122
- # OpenCV captures in BGR but PIL expects RGB format
123
- # This prevents the blue tint in the video feed
124
- frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
125
- img = PIL.Image.fromarray(frame_rgb) # Now using RGB frame
126
- img.thumbnail([1024, 1024])
127
-
128
- image_io = io.BytesIO()
129
- img.save(image_io, format="jpeg")
130
- image_io.seek(0)
131
-
132
- mime_type = "image/jpeg"
133
- image_bytes = image_io.read()
134
- return {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()}
135
-
136
- async def get_frames(self):
137
- # This takes about a second, and will block the whole program
138
- # causing the audio pipeline to overflow if you don't to_thread it.
139
- cap = await asyncio.to_thread(
140
- cv2.VideoCapture, 0
141
- ) # 0 represents the default camera
142
-
143
- while True:
144
- frame = await asyncio.to_thread(self._get_frame, cap)
145
- if frame is None:
146
- break
147
- await asyncio.sleep(1.0)
148
-
149
- msg = {"realtime_input": {"media_chunks": [frame]}}
150
- await self.out_queue.put(msg)
151
-
152
- # Release the VideoCapture object
153
- cap.release()
154
-
155
- def _get_screen(self):
156
- sct = mss.mss()
157
- monitor = sct.monitors[0]
158
-
159
- i = sct.grab(monitor)
160
- mime_type = "image/jpeg"
161
- image_bytes = mss.tools.to_png(i.rgb, i.size)
162
- img = PIL.Image.open(io.BytesIO(image_bytes))
163
-
164
- image_io = io.BytesIO()
165
- img.save(image_io, format="jpeg")
166
- image_io.seek(0)
167
-
168
- image_bytes = image_io.read()
169
- return {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()}
170
-
171
- async def get_screen(self):
172
- while True:
173
- frame = await asyncio.to_thread(self._get_screen)
174
- if frame is None:
175
- break
176
-
177
- await asyncio.sleep(1.0)
178
-
179
- msg = {"realtime_input": {"media_chunks": frame}}
180
- await self.out_queue.put(msg)
181
-
182
- async def send_realtime(self):
183
- while True:
184
- msg = await self.out_queue.get()
185
- await self.ws.send(json.dumps(msg))
186
-
187
- async def listen_audio(self):
188
- pya = pyaudio.PyAudio()
189
-
190
- mic_info = pya.get_default_input_device_info()
191
- self.audio_stream = pya.open(
192
- format=FORMAT,
193
- channels=CHANNELS,
194
- rate=SEND_SAMPLE_RATE,
195
- input=True,
196
- input_device_index=mic_info["index"],
197
- frames_per_buffer=CHUNK_SIZE,
198
- )
199
- while True:
200
- data = await asyncio.to_thread(self.audio_stream.read, CHUNK_SIZE)
201
- msg = {
202
- "realtime_input": {
203
- "media_chunks": [
204
- {
205
- "data": base64.b64encode(data).decode(),
206
- "mime_type": "audio/pcm",
207
- }
208
- ]
209
- }
210
- }
211
- await self.out_queue.put(msg)
212
-
213
- async def receive_audio(self):
214
- "Background task to reads from the websocket and write pcm chunks to the output queue"
215
- async for raw_response in self.ws:
216
- # Other things could be returned here, but we'll ignore those for now.
217
- response = json.loads(raw_response.decode("ascii"))
218
-
219
- try:
220
- b64data = response["serverContent"]["modelTurn"]["parts"][0][
221
- "inlineData"
222
- ]["data"]
223
- except KeyError:
224
- pass
225
- else:
226
- pcm_data = base64.b64decode(b64data)
227
- self.audio_in_queue.put_nowait(pcm_data)
228
-
229
- try:
230
- turn_complete = response["serverContent"]["turnComplete"]
231
- except KeyError:
232
- pass
233
- else:
234
- if turn_complete:
235
- # If you interrupt the model, it sends an end_of_turn.
236
- # For interruptions to work, we need to empty out the audio queue
237
- # Because it may have loaded much more audio than has played yet.
238
- print("\nEnd of turn")
239
- while not self.audio_in_queue.empty():
240
- self.audio_in_queue.get_nowait()
241
-
242
- async def play_audio(self):
243
- pya = pyaudio.PyAudio()
244
- stream = pya.open(
245
- format=FORMAT, channels=CHANNELS, rate=RECEIVE_SAMPLE_RATE, output=True
246
- )
247
- while True:
248
- bytestream = await self.audio_in_queue.get()
249
- await asyncio.to_thread(stream.write, bytestream)
250
-
251
- async def run(self):
252
- """Takes audio chunks off the input queue, and writes them to files.
253
-
254
- Splits and displays files if the queue pauses for more than `max_pause`.
255
- """
256
- try:
257
- async with (
258
- await connect(
259
- uri, additional_headers={"Content-Type": "application/json"}
260
- ) as ws,
261
- asyncio.TaskGroup() as tg,
262
- ):
263
- self.ws = ws
264
- await self.startup()
265
-
266
- self.audio_in_queue = asyncio.Queue()
267
- self.out_queue = asyncio.Queue(maxsize=5)
268
-
269
- send_text_task = tg.create_task(self.send_text())
270
-
271
- tg.create_task(self.send_realtime())
272
- tg.create_task(self.listen_audio())
273
- if self.video_mode == "camera":
274
- tg.create_task(self.get_frames())
275
- elif self.video_mode == "screen":
276
- tg.create_task(self.get_screen())
277
- tg.create_task(self.receive_audio())
278
- tg.create_task(self.play_audio())
279
-
280
- await send_text_task
281
- raise asyncio.CancelledError("User requested exit")
282
-
283
- except asyncio.CancelledError:
284
- pass
285
- except ExceptionGroup as EG:
286
- self.audio_stream.close()
287
- traceback.print_exception(EG)
288
-
289
-
290
- if __name__ == "__main__":
291
- parser = argparse.ArgumentParser()
292
- parser.add_argument(
293
- "--mode",
294
- type=str,
295
- default=DEFAULT_MODE,
296
- help="pixels to stream from",
297
- choices=["camera", "screen", "none"],
298
- )
299
- args = parser.parse_args()
300
-
301
- main = AudioLoop(video_mode=args.mode)
302
- asyncio.run(main.run())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,8 +1,10 @@
1
  annotated-types==0.7.0
 
2
  cachetools==5.5.1
3
  certifi==2025.1.31
4
  charset-normalizer==3.4.1
5
  click==8.1.8
 
6
  google-auth==2.38.0
7
  google-genai==1.0.0
8
  h11==0.14.0
@@ -18,6 +20,8 @@ pydantic==2.10.6
18
  pydantic_core==2.27.2
19
  requests==2.32.3
20
  rsa==4.9
 
 
21
  typing_extensions==4.12.2
22
  urllib3==2.3.0
23
  uvicorn==0.34.0
 
1
  annotated-types==0.7.0
2
+ anyio==4.8.0
3
  cachetools==5.5.1
4
  certifi==2025.1.31
5
  charset-normalizer==3.4.1
6
  click==8.1.8
7
+ fastapi==0.115.8
8
  google-auth==2.38.0
9
  google-genai==1.0.0
10
  h11==0.14.0
 
20
  pydantic_core==2.27.2
21
  requests==2.32.3
22
  rsa==4.9
23
+ sniffio==1.3.1
24
+ starlette==0.45.3
25
  typing_extensions==4.12.2
26
  urllib3==2.3.0
27
  uvicorn==0.34.0
webapp.py CHANGED
@@ -141,4 +141,4 @@ async def websocket_endpoint(websocket: WebSocket):
141
  print("[websocket_endpoint] Cleaned up AudioLoop for client")
142
 
143
  if __name__ == "__main__":
144
- uvicorn.run("webapp:app", host="0.0.0.0", port=8000, reload=True)
 
141
  print("[websocket_endpoint] Cleaned up AudioLoop for client")
142
 
143
  if __name__ == "__main__":
144
+ uvicorn.run("webapp:app", host="0.0.0.0", port=7860, reload=True)