reab5555 committed on
Commit
343407e
·
verified ·
1 Parent(s): 53eff3d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -17
app.py CHANGED
@@ -5,6 +5,8 @@ import torch
5
  from transformers import Owlv2Processor, Owlv2ForObjectDetection
6
  import numpy as np
7
  import os
 
 
8
 
9
  # Check if CUDA is available, otherwise use CPU
10
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
@@ -24,7 +26,7 @@ def detect_objects_in_frame(image, target):
24
  color_map = {target: "red"}
25
 
26
  try:
27
- font = ImageFont.truetype("arial.ttf", 15)
28
  except IOError:
29
  font = ImageFont.load_default()
30
 
@@ -32,6 +34,7 @@ def detect_objects_in_frame(image, target):
32
  text = texts[i]
33
  boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
34
 
 
35
  for box, score, label in zip(boxes, scores, labels):
36
  if score.item() >= 0.25:
37
  box = [round(i, 2) for i in box.tolist()]
@@ -39,42 +42,64 @@ def detect_objects_in_frame(image, target):
39
  confidence = round(score.item(), 3)
40
  annotation = f"{object_label}: {confidence}"
41
 
42
- draw.rectangle(box, outline=color_map.get(object_label, "red"), width=2)
43
- text_position = (box[0], box[1] - 10)
44
  draw.text(text_position, annotation, fill="white", font=font)
45
 
46
- return image
 
 
47
 
48
  def process_video(video_path, target, progress=gr.Progress()):
49
  if video_path is None:
50
- return None, "Error: No video uploaded"
51
 
52
  if not os.path.exists(video_path):
53
- return None, f"Error: Video file not found at {video_path}"
54
 
55
  cap = cv2.VideoCapture(video_path)
56
  if not cap.isOpened():
57
- return None, f"Error: Unable to open video file at {video_path}"
58
 
59
  frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
60
  original_fps = int(cap.get(cv2.CAP_PROP_FPS))
61
  output_fps = 3
 
 
62
 
63
  processed_frames = []
64
- frame_interval = max(1, round(original_fps / output_fps))
65
 
66
- for frame in progress.tqdm(range(0, frame_count, frame_interval)):
67
- cap.set(cv2.CAP_PROP_POS_FRAMES, frame)
 
68
  ret, img = cap.read()
69
  if not ret:
70
  break
71
 
72
  pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
73
- annotated_img = detect_objects_in_frame(pil_img, target)
74
  processed_frames.append(np.array(annotated_img))
 
75
 
76
  cap.release()
77
- return processed_frames, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  def load_sample_frame(video_path):
80
  cap = cv2.VideoCapture(video_path)
@@ -95,18 +120,21 @@ def gradio_app():
95
  target_input = gr.Textbox(label="Target Object", value="Elephant")
96
  frame_slider = gr.Slider(minimum=0, maximum=100, step=1, label="Frame", value=0)
97
  output_image = gr.Image(label="Processed Frame")
 
98
  error_output = gr.Textbox(label="Error Messages", visible=False)
99
  sample_video_frame = gr.Image(value=load_sample_frame("Drone Video of African Wildlife Wild Botswan.mp4"), label="Sample Video Frame")
100
  use_sample_button = gr.Button("Use Sample Video")
101
  progress_bar = gr.Progress()
102
 
103
  processed_frames = gr.State([])
 
104
 
105
  def process_and_update(video, target):
106
- frames, error = process_video(video, target, progress_bar)
107
  if frames is not None:
108
- return frames, frames[0], error, gr.Slider(maximum=len(frames) - 1, value=0)
109
- return None, None, error, gr.Slider(maximum=100, value=0)
 
110
 
111
  def update_frame(frame_index, frames):
112
  if frames and 0 <= frame_index < len(frames):
@@ -115,7 +143,7 @@ def gradio_app():
115
 
116
  video_input.upload(process_and_update,
117
  inputs=[video_input, target_input],
118
- outputs=[processed_frames, output_image, error_output, frame_slider])
119
 
120
  frame_slider.change(update_frame,
121
  inputs=[frame_slider, processed_frames],
@@ -127,7 +155,7 @@ def gradio_app():
127
 
128
  use_sample_button.click(use_sample_video,
129
  inputs=None,
130
- outputs=[processed_frames, output_image, error_output, frame_slider])
131
 
132
  return app
133
 
 
5
  from transformers import Owlv2Processor, Owlv2ForObjectDetection
6
  import numpy as np
7
  import os
8
+ import matplotlib.pyplot as plt
9
+ from io import BytesIO
10
 
11
  # Check if CUDA is available, otherwise use CPU
12
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
26
  color_map = {target: "red"}
27
 
28
  try:
29
+ font = ImageFont.truetype("arial.ttf", 30)
30
  except IOError:
31
  font = ImageFont.load_default()
32
 
 
34
  text = texts[i]
35
  boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
36
 
37
+ max_score = 0
38
  for box, score, label in zip(boxes, scores, labels):
39
  if score.item() >= 0.25:
40
  box = [round(i, 2) for i in box.tolist()]
 
42
  confidence = round(score.item(), 3)
43
  annotation = f"{object_label}: {confidence}"
44
 
45
+ draw.rectangle(box, outline=color_map.get(object_label, "red"), width=4)
46
+ text_position = (box[0], box[1] - 30)
47
  draw.text(text_position, annotation, fill="white", font=font)
48
 
49
+ max_score = max(max_score, confidence)
50
+
51
+ return image, max_score
52
 
53
def process_video(video_path, target, progress=gr.Progress()):
    """Sample a video at ~3 FPS and run object detection on each sampled frame.

    Args:
        video_path: Path to the input video file (may be None when nothing
            was uploaded).
        target: Text label of the object class to detect in each frame.
        progress: Gradio progress tracker used to report per-frame progress.

    Returns:
        A 3-tuple ``(processed_frames, frame_scores, error_message)``:
        lists of annotated RGB frames (numpy arrays) and of the max detection
        confidence per frame, with ``error_message`` None on success. On
        failure returns ``(None, None, "Error: ...")``.
    """
    if video_path is None:
        return None, None, "Error: No video uploaded"

    if not os.path.exists(video_path):
        return None, None, f"Error: Video file not found at {video_path}"

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return None, None, f"Error: Unable to open video file at {video_path}"

    try:
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        original_fps = int(cap.get(cv2.CAP_PROP_FPS))
        # Some containers report 0 FPS; fall back to a common default so the
        # duration math below cannot raise ZeroDivisionError.
        if original_fps <= 0:
            original_fps = 30
        output_fps = 3  # sample the source at roughly 3 frames per second
        frame_duration = 1 / output_fps
        video_duration = frame_count / original_fps

        processed_frames = []
        frame_scores = []

        # Seek by timestamp-derived frame index so sampling stays uniform
        # regardless of the source frame rate. ("t" rather than "time" to
        # avoid shadowing the stdlib module name.)
        for t in progress.tqdm(np.arange(0, video_duration, frame_duration)):
            frame_number = int(t * original_fps)
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
            ret, img = cap.read()
            if not ret:
                break

            # OpenCV decodes frames as BGR; PIL and the detector expect RGB.
            pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
            annotated_img, max_score = detect_objects_in_frame(pil_img, target)
            processed_frames.append(np.array(annotated_img))
            frame_scores.append(max_score)

        return processed_frames, frame_scores, None
    finally:
        # Release the capture even if detection raises mid-loop.
        cap.release()
87
+
88
def create_heatmap(frame_scores):
    """Render a one-row heatmap of per-frame detection confidences.

    Args:
        frame_scores: Sequence of max confidence values, one per sampled frame.

    Returns:
        An RGB(A) numpy array of the rendered heatmap image, a type that
        ``gr.Image`` can display directly.
    """
    plt.figure(figsize=(10, 2))
    plt.imshow([frame_scores], cmap='hot', aspect='auto')
    plt.colorbar(label='Confidence')
    plt.title('Object Detection Heatmap')
    plt.xlabel('Frame')
    plt.yticks([])  # single row: the y axis carries no information
    plt.tight_layout()

    buf = BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    plt.close()  # free the figure so repeated calls do not leak memory

    # gr.Image accepts numpy arrays, PIL images, or file paths — not raw
    # BytesIO buffers — so decode the PNG before returning.
    return np.array(Image.open(buf))
103
 
104
  def load_sample_frame(video_path):
105
  cap = cv2.VideoCapture(video_path)
 
120
  target_input = gr.Textbox(label="Target Object", value="Elephant")
121
  frame_slider = gr.Slider(minimum=0, maximum=100, step=1, label="Frame", value=0)
122
  output_image = gr.Image(label="Processed Frame")
123
+ heatmap_output = gr.Image(label="Detection Heatmap")
124
  error_output = gr.Textbox(label="Error Messages", visible=False)
125
  sample_video_frame = gr.Image(value=load_sample_frame("Drone Video of African Wildlife Wild Botswan.mp4"), label="Sample Video Frame")
126
  use_sample_button = gr.Button("Use Sample Video")
127
  progress_bar = gr.Progress()
128
 
129
  processed_frames = gr.State([])
130
+ frame_scores = gr.State([])
131
 
132
def process_and_update(video, target):
    """Run detection over the uploaded video and refresh every output widget.

    Returns values for: processed_frames state, frame_scores state, the
    first annotated frame, the heatmap image, the error textbox, and an
    updated frame slider.
    """
    frames, scores, error = process_video(video, target, progress_bar)
    # Guard against both a failed run (frames is None) and a run that
    # produced zero frames — frames[0] would raise IndexError and the
    # slider maximum would become -1.
    if frames:
        heatmap = create_heatmap(scores)
        return frames, scores, frames[0], heatmap, error, gr.Slider(maximum=len(frames) - 1, value=0)
    if error is None:
        error = "Error: No frames could be processed from the video"
    return None, None, None, None, error, gr.Slider(maximum=100, value=0)
138
 
139
  def update_frame(frame_index, frames):
140
  if frames and 0 <= frame_index < len(frames):
 
143
 
144
  video_input.upload(process_and_update,
145
  inputs=[video_input, target_input],
146
+ outputs=[processed_frames, frame_scores, output_image, heatmap_output, error_output, frame_slider])
147
 
148
  frame_slider.change(update_frame,
149
  inputs=[frame_slider, processed_frames],
 
155
 
156
  use_sample_button.click(use_sample_video,
157
  inputs=None,
158
+ outputs=[processed_frames, frame_scores, output_image, heatmap_output, error_output, frame_slider])
159
 
160
  return app
161