Prathamesh1420 committed on
Commit
1fa177b
·
verified ·
1 Parent(s): dd4f83b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -495
app.py CHANGED
@@ -1,220 +1,3 @@
1
- '''import cv2
2
- import numpy as np
3
- from ultralytics import YOLO
4
- import cvzone
5
- import base64
6
- import os
7
- import gradio as gr
8
- from langchain_core.messages import HumanMessage
9
- from langchain_google_genai import ChatGoogleGenerativeAI
10
-
11
- # βœ… Set up Google API Key (Avoid hardcoding in production)
12
- os.environ["GOOGLE_API_KEY"] = "AIzaSyCC-QiN5S42PQDxH6HUg-d-jye-jgc2_oM"
13
-
14
- # βœ… Initialize the Gemini model
15
- gemini_model = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
16
-
17
- # βœ… Load the YOLO model
18
- yolo_model = YOLO("best.pt")
19
- names = yolo_model.names # Class names from the YOLO model
20
-
21
- def encode_image_to_base64(image):
22
- _, img_buffer = cv2.imencode('.jpg', image)
23
- return base64.b64encode(img_buffer).decode('utf-8')
24
-
25
- def analyze_image_with_gemini(image):
26
- if image is None or image.shape[0] == 0 or image.shape[1] == 0:
27
- return "Error: Invalid image."
28
-
29
- image_data = encode_image_to_base64(image)
30
- message = HumanMessage(content=[
31
- {"type": "text", "text": """
32
- Analyze this image and determine if the label is present on the bottle.
33
- Return the result strictly in a structured table format:
34
-
35
- | Label Present | Damage |
36
- |--------------|--------|
37
- | Yes/No | Yes/No |
38
- """},
39
- {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}, "description": "Detected product"}
40
- ])
41
- try:
42
- response = gemini_model.invoke([message])
43
- return response.content
44
- except Exception as e:
45
- return f"Error processing image: {e}"
46
-
47
- def process_video(video_path):
48
- cap = cv2.VideoCapture(video_path)
49
- if not cap.isOpened():
50
- return "Error: Could not open video file."
51
-
52
- width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
53
- height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
54
- fps = int(cap.get(cv2.CAP_PROP_FPS))
55
-
56
- fourcc = cv2.VideoWriter_fourcc(*"mp4v")
57
- output_video_path = "output.mp4"
58
- out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
59
-
60
- vertical_center = width // 2
61
-
62
- while True:
63
- ret, frame = cap.read()
64
- if not ret:
65
- break
66
-
67
- frame = cv2.resize(frame, (width, height))
68
- results = yolo_model.track(frame, persist=True)
69
-
70
- if results[0].boxes is not None:
71
- boxes = results[0].boxes.xyxy.int().cpu().tolist()
72
- class_ids = results[0].boxes.cls.int().cpu().tolist()
73
- track_ids = results[0].boxes.id.int().cpu().tolist() if results[0].boxes.id is not None else [-1] * len(boxes)
74
-
75
- for box, track_id, class_id in zip(boxes, track_ids, class_ids):
76
- x1, y1, x2, y2 = box
77
- center_x = (x1 + x2) // 2
78
- center_y = (y1 + y2) // 2
79
-
80
- cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
81
- cvzone.putTextRect(frame, f'ID: {track_id}', (x2, y2), 1, 1)
82
- cvzone.putTextRect(frame, f'{names[class_id]}', (x1, y1), 1, 1)
83
-
84
- if abs(center_x - vertical_center) < 10: # If the center of the box is near the vertical center
85
- crop = frame[y1:y2, x1:x2]
86
- response = analyze_image_with_gemini(crop)
87
-
88
- cvzone.putTextRect(frame, response, (x1, y1 - 10), 1, 1, colorT=(255, 255, 255), colorR=(0, 0, 255))
89
-
90
- out.write(frame)
91
-
92
- cap.release()
93
- out.release()
94
-
95
- return output_video_path
96
-
97
- def gradio_interface(video_path):
98
- if video_path is None:
99
- return "Error: No video uploaded."
100
- return process_video(video_path)
101
-
102
- # βœ… Gradio UI setup
103
- iface = gr.Interface(
104
- fn=gradio_interface,
105
- inputs=gr.File(type="filepath", label="Upload Video"),
106
- outputs=gr.Video(label="Processed Video"),
107
- title="YOLO + Gemini AI Video Analysis",
108
- description="Upload a video to detect objects and analyze them using Gemini AI.",
109
- )
110
-
111
- if __name__ == "__main__":
112
- iface.launch(share=True)'''
113
-
114
- '''import cv2
115
- import numpy as np
116
- from ultralytics import YOLO
117
- import cvzone
118
- import base64
119
- import os
120
- import gradio as gr
121
- from langchain_core.messages import HumanMessage
122
- from langchain_google_genai import ChatGoogleGenerativeAI
123
-
124
- # βœ… Set up Google API Key (Avoid hardcoding in production)
125
- os.environ["GOOGLE_API_KEY"] = "AIzaSyCC-QiN5S42PQDxH6HUg-d-jye-jgc2_oM" # Replace with your actual API Key
126
-
127
- # βœ… Initialize the Gemini model
128
- gemini_model = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
129
-
130
- # βœ… Load the YOLO model
131
- yolo_model = YOLO("best.pt")
132
- names = yolo_model.names # Class names from the YOLO model
133
-
134
- def encode_image_to_base64(image):
135
- """Encodes an image to a base64 string."""
136
- _, img_buffer = cv2.imencode('.jpg', image)
137
- return base64.b64encode(img_buffer).decode('utf-8')
138
-
139
- def analyze_image_with_gemini(image):
140
- """Sends an image to Gemini AI for analysis."""
141
- if image is None:
142
- return "No image available for analysis."
143
-
144
- image_data = encode_image_to_base64(image)
145
- message = HumanMessage(content=[
146
- {"type": "text", "text": """
147
- Analyze this image and determine if the label is present on the bottle.
148
- Return the result strictly in a structured table format:
149
-
150
- | Label Present | Damage |
151
- |--------------|--------|
152
- | Yes/No | Yes/No |
153
- """},
154
- {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}, "description": "Detected product"}
155
- ])
156
- try:
157
- response = gemini_model.invoke([message])
158
- return response.content
159
- except Exception as e:
160
- return f"Error processing image: {e}"
161
-
162
- def process_video(video_path):
163
- """Processes the uploaded video frame by frame using YOLO and Gemini AI."""
164
- cap = cv2.VideoCapture(video_path)
165
- if not cap.isOpened():
166
- return "Error: Could not open video file."
167
-
168
- frame_list = []
169
- while True:
170
- ret, frame = cap.read()
171
- if not ret:
172
- break
173
-
174
- frame = cv2.resize(frame, (1020, 500)) # Resize for processing
175
- results = yolo_model.track(frame, persist=True)
176
-
177
- if results[0].boxes is not None:
178
- boxes = results[0].boxes.xyxy.int().cpu().tolist()
179
- class_ids = results[0].boxes.cls.int().cpu().tolist()
180
- track_ids = results[0].boxes.id.int().cpu().tolist() if results[0].boxes.id is not None else [-1] * len(boxes)
181
-
182
- for box, track_id, class_id in zip(boxes, track_ids, class_ids):
183
- x1, y1, x2, y2 = box
184
- cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
185
- cvzone.putTextRect(frame, f'ID: {track_id}', (x2, y2), 1, 1)
186
- cvzone.putTextRect(frame, f'{names[class_id]}', (x1, y1), 1, 1)
187
-
188
- # Extract and analyze detected object
189
- crop = frame[y1:y2, x1:x2]
190
- response = analyze_image_with_gemini(crop)
191
- print(response) # Log Gemini AI response
192
-
193
- frame_list.append(frame)
194
-
195
- cap.release() # Free resources
196
- return frame_list[0] if frame_list else "Error: No frames processed."
197
-
198
- def gradio_interface(video_path):
199
- """Handles Gradio video input and processes it."""
200
- if video_path is None:
201
- return "Error: No video uploaded."
202
- return process_video(video_path)
203
-
204
- # βœ… Gradio UI setup
205
- iface = gr.Interface(
206
- fn=gradio_interface,
207
- inputs=gr.File(type="filepath", label="Upload Video"), # Accepts video files
208
- outputs=gr.Image(label="Processed Frame"), # Shows a single processed frame
209
- title="YOLO + Gemini AI Video Analysis",
210
- description="Upload a video to detect objects and analyze them using Gemini AI.",
211
- )
212
-
213
- if __name__ == "__main__":
214
- iface.launch(share=True) # Enables a public link for testing
215
- '''
216
-
217
- '''
218
  import cv2
219
  import numpy as np
220
  from ultralytics import YOLO
@@ -225,15 +8,15 @@ import gradio as gr
225
  from langchain_core.messages import HumanMessage
226
  from langchain_google_genai import ChatGoogleGenerativeAI
227
 
228
- # βœ… Set up Google API Key (Avoid hardcoding in production)
229
- os.environ["GOOGLE_API_KEY"] = "AIzaSyCC-QiN5S42PQDxH6HUg-d-jye-jgc2_oM" # Replace with your actual API Key
230
 
231
  # βœ… Initialize the Gemini model
232
  gemini_model = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
233
 
234
  # βœ… Load the YOLO model
235
- yolo_model = YOLO("best.pt") # Ensure "best.pt" is in the working directory
236
- names = yolo_model.names # Class names from the YOLO model
237
 
238
  def encode_image_to_base64(image):
239
  """Encodes an image to a base64 string."""
@@ -268,263 +51,8 @@ def process_video(video_path):
268
  """Processes the uploaded video frame by frame using YOLO and Gemini AI."""
269
  cap = cv2.VideoCapture(video_path)
270
  if not cap.isOpened():
271
- return "Error: Could not open video file."
272
-
273
- width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
274
- height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
275
- fps = int(cap.get(cv2.CAP_PROP_FPS))
276
-
277
- fourcc = cv2.VideoWriter_fourcc(*"mp4v")
278
- output_video_path = "output.mp4"
279
- out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
280
-
281
- vertical_center = width // 2
282
-
283
- frame_count = 0
284
- while True:
285
- ret, frame = cap.read()
286
- if not ret:
287
- break
288
-
289
- frame_count += 1
290
- frame = cv2.resize(frame, (width, height))
291
- results = yolo_model.track(frame, persist=True)
292
 
293
- if results and results[0].boxes is not None and results[0].boxes.xyxy is not None:
294
- boxes = results[0].boxes.xyxy.int().cpu().tolist()
295
- class_ids = results[0].boxes.cls.int().cpu().tolist()
296
- track_ids = results[0].boxes.id.int().cpu().tolist() if results[0].boxes.id is not None else [-1] * len(boxes)
297
-
298
- for box, track_id, class_id in zip(boxes, track_ids, class_ids):
299
- x1, y1, x2, y2 = box
300
- center_x = (x1 + x2) // 2
301
-
302
- # Draw detection box and label
303
- cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
304
- cvzone.putTextRect(frame, f'ID: {track_id}', (x2, y2), 1, 1)
305
- cvzone.putTextRect(frame, f'{names[class_id]}', (x1, y1), 1, 1)
306
-
307
- # If object is near vertical center, analyze
308
- if abs(center_x - vertical_center) < 10:
309
- crop = frame[y1:y2, x1:x2]
310
- response = analyze_image_with_gemini(crop)
311
-
312
- # Log response and display on frame
313
- print(f"Frame {frame_count}, Object {track_id}: {response}")
314
- cvzone.putTextRect(frame, response, (x1, y1 - 10), 1, 1, colorT=(255, 255, 255), colorR=(0, 0, 255))
315
-
316
- out.write(frame)
317
-
318
- cap.release()
319
- out.release()
320
-
321
- return output_video_path
322
-
323
- def gradio_interface(video_path):
324
- """Handles Gradio video input and processes it."""
325
- if video_path is None:
326
- return "Error: No video uploaded."
327
- return process_video(video_path)
328
-
329
- # βœ… Gradio UI setup
330
- iface = gr.Interface(
331
- fn=gradio_interface,
332
- inputs=gr.File(type="filepath", label="Upload Video"), # Accepts video files
333
- outputs=gr.Video(label="Processed Video"), # Outputs processed video
334
- title="YOLO + Gemini AI Video Analysis",
335
- description="Upload a video to detect objects and analyze them using Gemini AI.",
336
- )
337
-
338
- if __name__ == "__main__":
339
- iface.launch(share=True)
340
- '''
341
-
342
- '''
343
- import cv2
344
- import numpy as np
345
- from ultralytics import YOLO
346
- import cvzone
347
- import base64
348
- import os
349
- import gradio as gr
350
- from langchain_core.messages import HumanMessage
351
- from langchain_google_genai import ChatGoogleGenerativeAI
352
-
353
- # βœ… Set up Google API Key securely (Avoid hardcoding in production)
354
- os.environ["GOOGLE_API_KEY"] = "AIzaSyCC-QiN5S42PQDxH6HUg-d-jye-jgc2_oM" # Replace with your actual API Key
355
-
356
- # βœ… Initialize the Gemini model
357
- gemini_model = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
358
-
359
- # βœ… Load the YOLO model
360
- yolo_model = YOLO("best.pt") # Ensure "best.pt" is in the working directory
361
- names = yolo_model.names # Class names from the YOLO model
362
-
363
- def encode_image_to_base64(image):
364
- """Encodes an image to a base64 string."""
365
- _, img_buffer = cv2.imencode('.jpg', image)
366
- return base64.b64encode(img_buffer).decode('utf-8')
367
-
368
- def analyze_image_with_gemini(image):
369
- """Sends an image to Gemini AI for analysis."""
370
- if image is None or image.shape[0] == 0 or image.shape[1] == 0:
371
- return "Error: Invalid image."
372
-
373
- image_data = encode_image_to_base64(image)
374
- message = HumanMessage(content=[
375
- {"type": "text", "text": """
376
- Analyze this image and determine if the label is present on the bottle.
377
- Return the result strictly in a structured table format:
378
-
379
- | Label Present | Damage |
380
- |--------------|--------|
381
- | Yes/No | Yes/No |
382
- """},
383
- {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}, "description": "Detected product"}
384
- ])
385
-
386
- try:
387
- response = gemini_model.invoke([message])
388
- return response.content
389
- except Exception as e:
390
- return f"Error processing image: {e}"
391
-
392
- def process_video(video_path):
393
- """Processes the uploaded video frame by frame using YOLO and Gemini AI."""
394
- cap = cv2.VideoCapture(video_path)
395
- if not cap.isOpened():
396
- return "Error: Could not open video file."
397
-
398
- width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
399
- height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
400
- fps = int(cap.get(cv2.CAP_PROP_FPS))
401
-
402
- fourcc = cv2.VideoWriter_fourcc(*"mp4v")
403
- output_video_path = "/tmp/output.mp4" # Use /tmp for Hugging Face Spaces
404
- out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
405
-
406
- vertical_center = width // 2
407
-
408
- frame_count = 0
409
- while True:
410
- ret, frame = cap.read()
411
- if not ret:
412
- break
413
-
414
- frame_count += 1
415
- frame = cv2.resize(frame, (width, height))
416
- results = yolo_model.track(frame, persist=True)
417
-
418
- if results and results[0].boxes is not None and results[0].boxes.xyxy is not None:
419
- boxes = results[0].boxes.xyxy.int().cpu().tolist()
420
- class_ids = results[0].boxes.cls.int().cpu().tolist()
421
- track_ids = results[0].boxes.id.int().cpu().tolist() if results[0].boxes.id is not None else [-1] * len(boxes)
422
-
423
- for box, track_id, class_id in zip(boxes, track_ids, class_ids):
424
- x1, y1, x2, y2 = box
425
- center_x = (x1 + x2) // 2
426
-
427
- # Draw detection box and label
428
- cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
429
- cvzone.putTextRect(frame, f'ID: {track_id}', (x2, y2), 1, 1)
430
- cvzone.putTextRect(frame, f'{names[class_id]}', (x1, y1), 1, 1)
431
-
432
- # If object is near vertical center, analyze
433
- if abs(center_x - vertical_center) < 10:
434
- crop = frame[y1:y2, x1:x2]
435
- response = analyze_image_with_gemini(crop)
436
-
437
- # Log response and display on frame
438
- print(f"Frame {frame_count}, Object {track_id}: {response}")
439
- cvzone.putTextRect(frame, response, (x1, y1 - 10), 1, 1, colorT=(255, 255, 255), colorR=(0, 0, 255))
440
-
441
- out.write(frame)
442
-
443
- cap.release()
444
- out.release()
445
-
446
- return output_video_path
447
-
448
- def gradio_interface(video_file):
449
- """Handles Gradio video input and processes it."""
450
- if video_file is None:
451
- return "Error: No video uploaded."
452
-
453
- processed_video = process_video(video_file)
454
- return processed_video # Return the processed video file
455
-
456
- # βœ… Gradio UI setup
457
- iface = gr.Interface(
458
- fn=gradio_interface,
459
- inputs=gr.File(type="filepath", label="Upload Video"), # Accepts video files
460
- outputs=gr.Video(label="Processed Video"), # Outputs processed video
461
- title="YOLO + Gemini AI Video Analysis",
462
- description="Upload a video to detect objects and analyze them using Gemini AI.",
463
- )
464
-
465
- if __name__ == "__main__":
466
- iface.launch(share=True)
467
-
468
- #working
469
- '''
470
-
471
-
472
-
473
- import cv2
474
- import numpy as np
475
- from ultralytics import YOLO
476
- import cvzone
477
- import base64
478
- import os
479
- import gradio as gr
480
- from langchain_core.messages import HumanMessage
481
- from langchain_google_genai import ChatGoogleGenerativeAI
482
-
483
- # βœ… Set up Google API Key (Avoid hardcoding in production)
484
- os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY") # Replace with your actual API Key
485
-
486
- # βœ… Initialize the Gemini model
487
- gemini_model = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
488
-
489
- # βœ… Load the YOLO model
490
- yolo_model = YOLO("best.pt") # Ensure "best.pt" is in the working directory
491
- names = yolo_model.names # Class names from the YOLO model
492
-
493
- def encode_image_to_base64(image):
494
- """Encodes an image to a base64 string."""
495
- _, img_buffer = cv2.imencode('.jpg', image)
496
- return base64.b64encode(img_buffer).decode('utf-8')
497
-
498
- def analyze_image_with_gemini(image):
499
- """Sends an image to Gemini AI for analysis."""
500
- if image is None or image.shape[0] == 0 or image.shape[1] == 0:
501
- return "Error: Invalid image."
502
-
503
- image_data = encode_image_to_base64(image)
504
- message = HumanMessage(content=[
505
- {"type": "text", "text": """
506
- Analyze this image and determine if the label is present on the bottle.
507
- Return the result strictly in a structured table format:
508
-
509
- | Label Present | Damage |
510
- |--------------|--------|
511
- | Yes/No | Yes/No |
512
- """},
513
- {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}, "description": "Detected product"}
514
- ])
515
-
516
- try:
517
- response = gemini_model.invoke([message])
518
- return response.content
519
- except Exception as e:
520
- return f"Error processing image: {e}"
521
-
522
- def process_video(video_path):
523
- """Processes the uploaded video frame by frame using YOLO and Gemini AI."""
524
- cap = cv2.VideoCapture(video_path)
525
- if not cap.isOpened():
526
- return "Error: Could not open video file."
527
-
528
  width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
529
  height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
530
  fps = int(cap.get(cv2.CAP_PROP_FPS))
@@ -534,7 +62,8 @@ def process_video(video_path):
534
  out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
535
 
536
  vertical_center = width // 2
537
- analyzed_objects = {} # Dictionary to store analyzed objects
 
538
 
539
  while True:
540
  ret, frame = cap.read()
@@ -552,45 +81,59 @@ def process_video(video_path):
552
  x1, y1, x2, y2 = box
553
  center_x = (x1 + x2) // 2
554
 
555
- # Draw detection box and label
 
 
 
 
556
  cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
557
  cvzone.putTextRect(frame, f'ID: {track_id}', (x2, y2), 1, 1)
558
  cvzone.putTextRect(frame, f'{names[class_id]}', (x1, y1), 1, 1)
559
 
560
- # If object is near vertical center and hasn't been analyzed yet
561
- if abs(center_x - vertical_center) < 10 and track_id not in analyzed_objects:
562
  crop = frame[y1:y2, x1:x2]
563
  response = analyze_image_with_gemini(crop)
564
-
565
- # Store analyzed object to prevent duplicate analysis
566
  analyzed_objects[track_id] = response
567
 
568
- # Log response and display on frame
569
- print(f"Object {track_id}: {response}")
570
- cvzone.putTextRect(frame, response, (x1, y1 - 10), 1, 1, colorT=(255, 255, 255), colorR=(0, 0, 255))
571
-
 
 
 
 
 
 
572
  out.write(frame)
573
 
574
  cap.release()
575
  out.release()
576
 
577
- return output_video_path
578
 
579
  def gradio_interface(video_path):
580
  """Handles Gradio video input and processes it."""
581
  if video_path is None:
582
- return "Error: No video uploaded."
 
583
  return process_video(video_path)
584
 
585
- # βœ… Gradio UI setup
 
 
 
586
  iface = gr.Interface(
587
  fn=gradio_interface,
588
- inputs=gr.File(type="filepath", label="Upload Video"), # Accepts video files
589
- outputs=gr.Video(label="Processed Video"), # Outputs processed video
 
 
 
590
  title="YOLO + Gemini AI Video Analysis",
591
- description="Upload a video to detect objects and analyze them using Gemini AI.",
592
  )
593
 
594
  if __name__ == "__main__":
595
- iface.launch(share=True)
596
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import cv2
2
  import numpy as np
3
  from ultralytics import YOLO
 
8
  from langchain_core.messages import HumanMessage
9
  from langchain_google_genai import ChatGoogleGenerativeAI
10
 
11
+ # βœ… Set up Google API Key
12
+ os.environ["GOOGLE_API_KEY"] = "AIzaSyDT0y1kJqgGKiOYiYFMXc-2kTgV_WLbOpA"#os.getenv("GOOGLE_API_KEY")
13
 
14
  # βœ… Initialize the Gemini model
15
  gemini_model = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
16
 
17
  # βœ… Load the YOLO model
18
+ yolo_model = YOLO("/content/Bottle_lable_checking_using_gemini_yolo/best.pt")
19
+ names = yolo_model.names
20
 
21
  def encode_image_to_base64(image):
22
  """Encodes an image to a base64 string."""
 
51
  """Processes the uploaded video frame by frame using YOLO and Gemini AI."""
52
  cap = cv2.VideoCapture(video_path)
53
  if not cap.isOpened():
54
+ return "Error: Could not open video file.", ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
57
  height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
58
  fps = int(cap.get(cv2.CAP_PROP_FPS))
 
62
  out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
63
 
64
  vertical_center = width // 2
65
+ analyzed_objects = {}
66
+ log_messages = []
67
 
68
  while True:
69
  ret, frame = cap.read()
 
81
  x1, y1, x2, y2 = box
82
  center_x = (x1 + x2) // 2
83
 
84
+ # βœ… Apply bounding box only after the bottle reaches the left half of the frame
85
+ if center_x > vertical_center:
86
+ continue # Skip drawing before it crosses the center to the left side
87
+
88
+ # Draw detection box
89
  cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
90
  cvzone.putTextRect(frame, f'ID: {track_id}', (x2, y2), 1, 1)
91
  cvzone.putTextRect(frame, f'{names[class_id]}', (x1, y1), 1, 1)
92
 
93
+ # βœ… Ensure label (analysis result) remains visible after detection
94
+ if track_id not in analyzed_objects:
95
  crop = frame[y1:y2, x1:x2]
96
  response = analyze_image_with_gemini(crop)
 
 
97
  analyzed_objects[track_id] = response
98
 
99
+ log_messages.append(f"Object {track_id}: {response}") # βœ… Add log
100
+ print(f"Object {track_id}: {response}") # βœ… Print log for debugging
101
+
102
+ # πŸ› οΈ Keep analysis text on screen for each analyzed object
103
+ if track_id in analyzed_objects:
104
+ response_text = analyzed_objects[track_id]
105
+ text_x = 50 # Left side
106
+ text_y = height // 2 # Middle of the frame
107
+ cvzone.putTextRect(frame, response_text, (text_x, text_y), 2, 2, colorT=(255, 255, 255), colorR=(0, 0, 255))
108
+
109
  out.write(frame)
110
 
111
  cap.release()
112
  out.release()
113
 
114
+ return output_video_path, "\n".join(log_messages) # βœ… Return logs along with the processed video
115
 
116
  def gradio_interface(video_path):
117
  """Handles Gradio video input and processes it."""
118
  if video_path is None:
119
+ return "Error: No video uploaded.", ""
120
+
121
  return process_video(video_path)
122
 
123
+ # βœ… Sample video file
124
+ sample_video_path = "/content/Bottle_lable_checking_using_gemini_yolo/vid4.mp4" # Make sure this file is available in the working directory
125
+
126
+ # βœ… Gradio UI setup with sample video
127
  iface = gr.Interface(
128
  fn=gradio_interface,
129
+ inputs=gr.File(value=sample_video_path, type="filepath", label="Upload Video (Sample Included)"),
130
+ outputs=[
131
+ gr.Video(label="Processed Video"),
132
+ gr.Textbox(label="Processing Logs", lines=10, interactive=False)
133
+ ],
134
  title="YOLO + Gemini AI Video Analysis",
135
+ description="Upload a video to detect objects and analyze them using Gemini AI.\nA sample video is preloaded for quick testing.",
136
  )
137
 
138
  if __name__ == "__main__":
139
+ iface.launch(share=True)