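"""
OWLv2 zero-shot object detection on video.

Splits a video into frames, runs batched OWLv2 text-prompted detection on every
`fps_processed`-th frame, annotates the frames with the predictions, saves
per-frame results to a CSV, and stitches the annotated frames back into a video.
"""
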
import torch
import torch.quantization
from tqdm import tqdm
import cv2
import os
import numpy as np
import pandas as pd

from datetime import datetime
from typing import Tuple
from PIL import Image
from utils import plot_predictions, mp4_to_png, vid_stitcher
from transformers import Owlv2Processor, Owlv2ForObjectDetection


def preprocess_text(text_prompt: str, num_prompts: int = 1):
    """
    Takes a string of text prompts and returns a list of lists of text prompts for each image. 
    i.e. text_prompt = "a, b, c" -> [["a", "b", "c"], ["a", "b", "c"]]
    """ 
    text_prompt = [s.strip() for s in text_prompt.split(",")]
    text_queries = [text_prompt] * num_prompts
    # print("text_queries:", text_queries)
    return text_queries


def owl_batch_prediction(
        images: list[Image.Image],
        text_queries: list[list[str]],  # assuming that every image is queried with the same text prompts
        threshold: float,
        processor,
        model,
        device: str = 'cuda'
    ):

    inputs = processor(text=text_queries, images=images, return_tensors="pt").to(device)
    # cast pixel values to the model's dtype so a model loaded with .half() accepts them
    inputs["pixel_values"] = inputs["pixel_values"].to(model.dtype)
    with torch.no_grad():
        outputs = model(**inputs)

    # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
    target_sizes = torch.Tensor([img.size[::-1] for img in images]).to(device)
    # Convert outputs (bounding boxes and class logits) to COCO API format, rescale to the original image size, and filter by threshold
    results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=threshold)

    return results
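
# Illustrative standalone use of owl_batch_prediction (a sketch only; "frame.png" and the
# prompt/threshold values are placeholder assumptions, not part of the pipeline):
#   processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
#   model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble").to("cuda")
#   image = Image.open("frame.png")
#   results = owl_batch_prediction([image], preprocess_text("baboon", 1), 0.3, processor, model)
#   results[0]["boxes"], results[0]["scores"], results[0]["labels"]  # per-image detection tensors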

def owl_full_video(
        vid_path: str,
        text_prompt: str,
        threshold: float,
        fps_processed: int = 1,
        scaling_factor: float = 0.5,
        device: str = 'cuda',
        batch_size: int = 6,
        ):
    """ Same as owl_video, but processes the entire video regardless of whether anything is detected.
        Runs detection on every `fps_processed`-th frame, annotates the frames in place,
        and saves per-frame results (boxes, scores, labels) to a DataFrame/CSV.
    """

    processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
    model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble").half().to(device)

    # create new dirs and paths for results
    filename = os.path.splitext(os.path.basename(vid_path))[0]
    # results_dir = f'../temp/{filename}_{datetime.now().strftime("%H%M%S")}' # run this on server
    results_dir = f'temp/{filename}_{datetime.now().strftime("%H%M%S")}' # run this on local
    frames_dir = os.path.join(results_dir, "frames")

    # if the frames directory does not exist, create it and get the frames from the video
    if not os.path.exists(results_dir):
        os.makedirs(results_dir, exist_ok=True)
        os.makedirs(frames_dir, exist_ok=True)
        # process video and create a directory of video frames
        fps = mp4_to_png(vid_path, frames_dir, scaling_factor)

    # get all frame paths
    frame_filenames = sorted(os.listdir(frames_dir))  # sort so frames are processed in order

    frame_paths = []  # list of frame paths to process based on fps_processed

    # TESTING OUT FADING OUT THE ANNOTATED BOX BETWEEN FRAMES
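    # annotation_guide maps each processed frame's path to the skipped frames that follow it,
    # so that frame's detections can later be re-drawn onto the skipped frames at lower opacity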
    annotation_guide = {}
    last_frame_run = frame_filenames[0]

    # for every frame processed, add to frame_paths
    for i, frame in enumerate(frame_filenames):
        path = os.path.join(frames_dir, frame)
        if i % fps_processed == 0:
            last_frame_run = path
            frame_paths.append(path)
            annotation_guide[path] = [] # TESTING
        else:
            annotation_guide[last_frame_run].append(path) # TESTING

    # set up df for results
    df = pd.DataFrame(columns=["frame", "boxes", "scores", "labels"])
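    # "frame" holds the frame path; "boxes"/"scores" hold numpy arrays from the model;
    # "labels" holds the matched text phrases (or None when nothing was detected)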

    # run owl in batches
    for i in tqdm(range(0, len(frame_paths), batch_size), desc="Running batches"):
        batch_paths = frame_paths[i:i+batch_size]  # paths for this batch
        images = [Image.open(image_path) for image_path in batch_paths]

        # run owl on this batch of frames
        text_queries = preprocess_text(text_prompt, len(batch_paths))
        results = owl_batch_prediction(images, text_queries, threshold, processor, model, device)
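        # each entry of `results` corresponds to one image in the batch and holds
        # "boxes", "scores", and "labels" tensors (label indices into the text queries)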

        # get the predicted label indices for each image (None if no detections)
        label_ids = []
        for entry in results:
            if entry['labels'].numel() > 0:
                label_ids.append(entry['labels'].tolist())
            else:
                label_ids.append(None)

        text = text_queries[0]  # assuming that all images use the same text queries
        labels = []
        # convert label indices to their text phrases; if no detections, append None
        for ids in label_ids:
            if ids is not None:
                labels.append([text[label_id] for label_id in ids])
            else:
                labels.append(None)

        for j, image in enumerate(batch_paths):
            boxes = results[j]['boxes'].cpu().numpy()
            scores = results[j]['scores'].cpu().numpy()
            row = pd.DataFrame({"frame": [image], "boxes": [boxes], "scores": [scores], "labels": [labels[j]]})
            df = pd.concat([df, row], ignore_index=True)
            
            # if there are detections, save the frame replacing the original frame
            if labels[j] is not None:
                annotated_frame = plot_predictions(image, labels[j], scores, boxes)
                cv2.imwrite(image, annotated_frame)

    # annotate the skipped (unprocessed) frames with the detections from the last processed frame
    for key in annotation_guide:
        labels = df[df["frame"] == key]["labels"].tolist()[0]
        boxes = df[df["frame"] == key]["boxes"].tolist()[0]
        scores = df[df["frame"] == key]["scores"].tolist()[0]

        # skip if the processed frame had no detections
        if not labels:
            continue

        for frame in annotation_guide[key]:
            annotated_frame = plot_predictions(frame, labels, scores, boxes, opacity=0.3)
            cv2.imwrite(frame, annotated_frame)
                
    # save the df to a csv
    csv_path = os.path.join(results_dir, f"{filename}_{threshold}.csv")
    df.to_csv(csv_path, index=False)

    # stitch the frames into a video
    save_path = vid_stitcher(frames_dir, output_path=os.path.join(results_dir, "output.mp4"), fps=fps)

    return csv_path, save_path


# # DEBUGGING
# if __name__ == "__main__":
#     owl_full_video('baboon_15s.mp4', 'baboon', 0.3, fps_processed=1, scaling_factor=4)
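#     # -> returns (csv_path, save_path): the per-frame detections CSV and the stitched annotated video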