File size: 6,469 Bytes
81e2598
36dd82f
81e2598
 
 
36dd82f
 
343407e
 
2934aa1
81e2598
ebc376e
 
81e2598
 
 
 
 
36dd82f
343407e
36dd82f
 
343407e
36dd82f
 
 
343407e
36dd82f
3f2cadc
69dc1f7
 
343407e
 
69dc1f7
7f942f1
343407e
dec322a
4f98062
 
f05ca8c
343407e
 
 
36dd82f
 
 
3f2cadc
81e2598
4f98062
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a58e878
4f98062
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3f2cadc
36dd82f
343407e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2934aa1
36dd82f
 
 
 
 
 
 
 
 
 
 
 
 
 
a58e878
81e2598
36dd82f
a702d47
7f942f1
 
343407e
36dd82f
81e2598
 
 
 
7f942f1
343407e
7f942f1
81e2598
343407e
7f942f1
343407e
 
 
7f942f1
 
 
 
 
36dd82f
 
81e2598
343407e
7f942f1
 
 
 
36dd82f
 
81e2598
53eff3d
36dd82f
 
 
343407e
36dd82f
 
 
 
 
7f942f1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import gradio as gr
import cv2
from PIL import Image, ImageDraw, ImageFont
import torch
from transformers import Owlv2Processor, Owlv2ForObjectDetection
import numpy as np
import os
import matplotlib.pyplot as plt
from io import BytesIO
import base64

# Run inference on the GPU when PyTorch reports one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load the OWLv2 processor and detection model once at import time so every
# request reuses the same objects; the model is then moved to `device`.
processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16")
model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16")
model = model.to(device)

def _load_annotation_font(size=30):
    """Return Arial at *size* points, or PIL's built-in font if Arial cannot be loaded."""
    try:
        return ImageFont.truetype("arial.ttf", size)
    except OSError:
        return ImageFont.load_default()


def _detect_and_annotate(frames, target, font, score_threshold):
    """Run the detector on a batch of PIL frames and draw the detections in place.

    Args:
        frames: Non-empty list of RGB ``PIL.Image`` frames; they are drawn on.
        target: Text query describing the object to look for.
        font: PIL font used for the box labels.
        score_threshold: Detections scoring below this value are ignored.

    Returns:
        A list with one ``(frame_as_ndarray, max_score)`` tuple per input
        frame, where ``max_score`` is the highest kept confidence (rounded to
        three decimals) or ``0`` when nothing passed the threshold.
    """
    texts = [[target]] * len(frames)
    inputs = processor(text=texts, images=frames, return_tensors="pt", padding=True).to(device)

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)

    # One (height, width) row per frame so the post-processing step receives
    # as many target sizes as there are images in the batch.
    target_sizes = torch.Tensor([frame.size[::-1] for frame in frames])
    results = processor.post_process_object_detection(outputs, target_sizes=target_sizes)

    annotated = []
    for frame, result in zip(frames, results):
        draw = ImageDraw.Draw(frame)
        max_score = 0

        for box, score in zip(result["boxes"], result["scores"]):
            raw_score = score.item()
            if raw_score < score_threshold:
                continue

            corners = [round(coord, 2) for coord in box.tolist()]
            confidence = round(raw_score, 3)

            draw.rectangle(corners, outline="red", width=4)
            # Clamp so the label stays inside the image for boxes at the top edge.
            text_position = (corners[0], max(corners[1] - 30, 0))
            draw.text(text_position, f"{target}: {confidence}", fill="white", font=font)

            max_score = max(max_score, confidence)

        annotated.append((np.array(frame), max_score))

    return annotated


def process_video(video_path, target, progress=gr.Progress(), *,
                  output_fps=3, score_threshold=0.5, batch_size=1):
    """Sample frames from a video, detect *target* in each, and annotate them.

    Frames are sampled at ``output_fps`` frames per second of video time,
    passed through the module-level OWLv2 ``processor``/``model``, and every
    detection scoring at least ``score_threshold`` is drawn as a red box with
    a ``"<target>: <confidence>"`` label.

    Args:
        video_path: Path of the video file, or ``None`` if nothing was uploaded.
        target: Text query describing the object to detect.
        progress: Gradio progress tracker whose ``tqdm`` wraps the sampling loop.
        output_fps: Number of frames to sample per second of video (must be > 0).
        score_threshold: Minimum confidence for a detection to be drawn.
        batch_size: Number of frames sent to the model per forward pass.

    Returns:
        ``(processed_frames, frame_scores, error)``. On success
        ``processed_frames`` is a list of RGB ``numpy`` arrays, ``frame_scores``
        the per-frame maximum confidence, and ``error`` is ``None``. On failure
        the first two items are ``None`` and ``error`` is a message string.
    """
    if video_path is None:
        return None, None, "Error: No video uploaded"

    if not os.path.exists(video_path):
        return None, None, f"Error: Video file not found at {video_path}"

    cap = cv2.VideoCapture(video_path)
    annotated = []
    try:
        if not cap.isOpened():
            return None, None, f"Error: Unable to open video file at {video_path}"

        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Keep the frame rate as a float: truncating to int distorts fractional
        # rates and turns rates below 1 into a division by zero.
        original_fps = cap.get(cv2.CAP_PROP_FPS)
        if not original_fps > 0 or frame_count <= 0:
            return None, None, f"Error: Unable to determine frame rate or length of {video_path}"

        frame_duration = 1 / output_fps
        video_duration = frame_count / original_fps
        font = _load_annotation_font()  # loaded once, reused for every frame

        batch_frames = []
        for timestamp in progress.tqdm(np.arange(0, video_duration, frame_duration)):
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(timestamp * original_fps))
            ret, img = cap.read()
            if not ret:
                break

            batch_frames.append(Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)))

            if len(batch_frames) >= batch_size:
                annotated.extend(
                    _detect_and_annotate(batch_frames, target, font, score_threshold)
                )
                batch_frames = []

        # Flush a final partial batch (end of video or a failed read mid-batch).
        if batch_frames:
            annotated.extend(
                _detect_and_annotate(batch_frames, target, font, score_threshold)
            )
    finally:
        # Release the capture on every exit path, including exceptions.
        cap.release()

    if not annotated:
        return None, None, f"Error: No frames could be read from {video_path}"

    processed_frames = [frame for frame, _ in annotated]
    frame_scores = [score for _, score in annotated]
    return processed_frames, frame_scores, None

def create_heatmap(frame_scores):
    """Render per-frame confidence scores as a one-row heatmap image.

    Args:
        frame_scores: Sequence of numeric scores, one per processed frame.

    Returns:
        The PNG image of the heatmap, base64-encoded as a UTF-8 string.
    """
    # Use an explicit figure rather than pyplot's implicit "current figure",
    # so drawing and closing always refer to the same figure.
    fig, ax = plt.subplots(figsize=(10, 2))
    try:
        image = ax.imshow([frame_scores], cmap='hot', aspect='auto')
        fig.colorbar(image, ax=ax, label='Confidence')
        ax.set_title('Object Detection Heatmap')
        ax.set_xlabel('Frame')
        ax.set_yticks([])
        fig.tight_layout()

        buf = BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # Close the figure even if drawing or saving raised, so it is not leaked.
        plt.close(fig)

    return base64.b64encode(buf.getvalue()).decode('utf-8')

def load_sample_frame(video_path):
    """Return the first frame of the video at *video_path* as an RGB array.

    Returns ``None`` when the video cannot be opened or its first frame
    cannot be read.
    """
    capture = cv2.VideoCapture(video_path)
    if capture.isOpened():
        ok, first_frame = capture.read()
        capture.release()
        if ok:
            # OpenCV decodes to BGR; convert so the frame displays correctly as RGB.
            return cv2.cvtColor(first_frame, cv2.COLOR_BGR2RGB)
    return None

def gradio_app():
    """Build and return the Gradio Blocks interface for video object detection.

    The UI lets the user upload a video (or use the bundled sample), runs
    ``process_video`` on it, and shows the annotated frames through a slider
    together with a confidence heatmap. Errors reported by ``process_video``
    are shown in a textbox that is hidden while there is nothing to report.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    sample_video_path = "Drone Video of African Wildlife Wild Botswan.mp4"

    with gr.Blocks() as app:
        gr.Markdown("# Video Object Detection with Owlv2")

        video_input = gr.Video(label="Upload Video")
        target_input = gr.Textbox(label="Target Object", value="Elephant")
        frame_slider = gr.Slider(minimum=0, maximum=100, step=1, label="Frame", value=0)
        output_image = gr.Image(label="Processed Frame")
        heatmap_output = gr.Image(label="Detection Heatmap")
        error_output = gr.Textbox(label="Error Messages", visible=False)
        gr.Image(value=load_sample_frame(sample_video_path), label="Sample Video Frame")
        use_sample_button = gr.Button("Use Sample Video")

        processed_frames = gr.State([])
        frame_scores = gr.State([])

        result_outputs = [processed_frames, frame_scores, output_image,
                          heatmap_output, error_output, frame_slider]

        def failure(message):
            # Reset state to empty lists and reveal the error textbox.
            return ([], [], None, None,
                    gr.Textbox(value=message, visible=True),
                    gr.Slider(maximum=100, value=0))

        # The progress tracker is declared as a default argument so Gradio can
        # supply it when the event fires, instead of a tracker built at layout time.
        def process_and_update(video, target, progress=gr.Progress()):
            frames, scores, error = process_video(video, target, progress)
            if error is None and not frames:
                error = "Error: No frames could be processed"
            if error is not None:
                return failure(error)

            # create_heatmap returns a base64-encoded PNG string; decode it to
            # a PIL image, which gr.Image can display directly.
            heatmap = Image.open(BytesIO(base64.b64decode(create_heatmap(scores))))
            heatmap.load()
            return (frames, scores, frames[0], heatmap,
                    gr.Textbox(value="", visible=False),
                    gr.Slider(maximum=len(frames) - 1, value=0))

        def update_frame(frame_index, frames):
            if not frames or frame_index is None:
                return None
            index = int(frame_index)
            if 0 <= index < len(frames):
                return frames[index]
            return None

        def use_sample_video(progress=gr.Progress()):
            return process_and_update(sample_video_path, "Elephant", progress)

        video_input.upload(process_and_update,
                           inputs=[video_input, target_input],
                           outputs=result_outputs)

        frame_slider.change(update_frame,
                            inputs=[frame_slider, processed_frames],
                            outputs=[output_image])

        use_sample_button.click(use_sample_video,
                                inputs=None,
                                outputs=result_outputs)

    return app

if __name__ == "__main__":
    # Build the interface and serve it, requesting a public share link.
    gradio_app().launch(share=True)