from typing import Dict
import random

from torchvision import transforms

from .utils import sample_video

def return_same(x):
    """Identity transform for datasets whose frames need no preprocessing."""
    return x

def _bbox_transform_for_padding(bbox, frame):
    """Normalize a pixel-space bbox, assuming the frame is padded to a square.

    The shorter side is padded equally on both ends, so coordinates along that
    side are shifted by half the padding before normalizing to [0, 1].
    """
    w1, h1, w2, h2 = bbox
    width, height = frame.size
    if width == height:
        pass
    elif width > height:
        # Pad top/bottom: shift y coordinates by half the added height.
        h1 += (width - height) // 2
        h2 += (width - height) // 2
        height = width
    else:
        # Pad left/right: shift x coordinates by half the added width.
        w1 += (height - width) // 2
        w2 += (height - width) // 2
        width = height
    new_bbox = [w1 / width, h1 / height, w2 / width, h2 / height]
    new_bbox = [round(i, 2) for i in new_bbox]
    return new_bbox
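# Worked example for the padding transform (illustrative numbers only):
# padding a 256x192 frame to a 256x256 square shifts y by (256 - 192) // 2 = 32,
# so bbox [10, 20, 200, 150] becomes [10, 52, 200, 182], which normalizes to
# [0.04, 0.2, 0.78, 0.71].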

def _bbox_transform_for_resize(bbox, frame):
    """Normalize a pixel-space bbox to [0, 1] relative to the frame size."""
    w1, h1, w2, h2 = bbox
    width, height = frame.size
    new_bbox = [w1 / width, h1 / height, w2 / width, h2 / height]
    new_bbox = [round(i, 2) for i in new_bbox]
    return new_bbox

class InAndOutCropAndResize(object):
    """Crop and resize for the in_and_out boxes data.

    Crops a horizontally centered square spanning 75% of the frame height,
    then resizes it to the target size.

    Args:
        size: tuple of (width, height)
    """

    def __init__(self, size):
        self.size = size

    def __call__(self, img):
        """
        Args:
            img (PIL Image): input frame
        Returns:
            PIL Image: cropped and resized frame
        """
        w = img.width
        h = img.height
        # Crop a horizontally centered square spanning rows [0.125h, 0.875h]
        # (a 0.75h x 0.75h region), then resize to the target size.
        x0 = int(w * 0.5 - h * 0.375)
        y0 = int(h * 0.125)
        x1 = int(w * 0.5 + h * 0.375)
        y1 = int(h * 0.875)
        img = img.crop((x0, y0, x1, y1)).resize(self.size)
        return img

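# Minimal usage sketch for InAndOutCropAndResize (illustration only; the
# 480x360 input size is a made-up example):
#
#   from PIL import Image
#   crop = InAndOutCropAndResize(size=(256, 256))
#   frame = Image.new("RGB", (480, 360))
#   out = crop(frame)  # crops the centered 270x270 square, resizes to 256x256
#   assert out.size == (256, 256)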

class ObjectTrackingParser:
    def __init__(
        self,
        n_frames=8,
        max_objects=3,
        is_training=True,
    ):
        # n_frames may be a fixed int or a sequence of candidate frame counts
        # (one is sampled per call in transform()).
        self.n_frames = n_frames
        self.max_objects = max_objects
        self.is_training = is_training
        self.img_transform = self.get_img_transform()
        # fmt: off
        self.data_temp = {
            "video_file": "/mnt/bn/llmdatalq/jiaxin/hdvila/20230926/saved/saved_video_clips/0076/lOjn__YCec4.624.1104.mp4",
            "frame_indices": [154, 157, 160, 163, 166, 169, 172, 175, 178, 181, 184, 187, 190, 193, 196, 199, 202],
            "objects": {
                "0": {
                    "phrase": "person",
                    "all_frame_bounding_boxes": [[2, 0, 255, 250], [17, 0, 255, 251], [35, 0, 255, 253], [44, 0, 255, 255], [52, 0, 255, 255], [54, 0, 255, 255], [63, 0, 255, 255], [60, 0, 255, 255], [54, 0, 253, 255], [43, 0, 250, 255], [36, 1, 249, 255], [36, 0, 252, 254], [41, 0, 252, 254], [61, 0, 255, 253], [68, 4, 255, 255], [74, 8, 255, 255], [91, 3, 255, 255]]
                }
            },
            "task": "object_tracking",
            "dataset": "hdvila"
        }
        # fmt: on

    def check_format(self, data_dict: Dict, image_processing_config: Dict):
        # Box tracking data does not support do_crop!
        if image_processing_config.get('do_crop', False):
            raise ValueError('do_crop is not supported in ObjectTrackingParser!')

    def transform(self, data_dict: Dict, image_processing_config: Dict) -> list:
        self.check_format(data_dict, image_processing_config)

        # do_padding selects whether bboxes are normalized against a padded
        # square frame or the original frame size.
        bbox_transform = (
            _bbox_transform_for_padding
            if image_processing_config['do_padding']
            else _bbox_transform_for_resize
        )

        # sample n_frames
        if isinstance(self.n_frames, int):
            n_frames = self.n_frames
        else:
            n_frames = random.choice(self.n_frames)
        total_frames = list(range(len(data_dict['frame_indices'])))
        idxs = random.sample(total_frames, min(n_frames, len(total_frames)))
        idxs.sort()

        frame_indices = [data_dict['frame_indices'][i] for i in idxs]
        frames = sample_video(data_dict['video_file'], frame_indices=frame_indices)
        img_transform = self.img_transform[data_dict['dataset']]
        frames = [img_transform(f) for f in frames]

        # Collect per-object normalized bbox tracks, up to max_objects.
        objects = []
        for _, o in data_dict['objects'].items():
            if o is None:
                continue
            all_frame_bounding_boxes = [o['all_frame_bounding_boxes'][i] for i in idxs]
            all_frame_bounding_boxes_t = []
            for bbox, frame in zip(all_frame_bounding_boxes, frames):
                all_frame_bounding_boxes_t.append(bbox_transform(bbox, frame))
            objects.append(all_frame_bounding_boxes_t)
            if len(objects) >= self.max_objects:
                break

        prompt = "Given the bounding box coordinates of these objects in the first frame, output the bounding box coordinates in the following frames.\n{}"
        response = ''

        # The first frame's boxes go into the prompt; boxes from the remaining
        # frames form the expected assistant response.
        object_info = ''
        for i, o in enumerate(objects):
            object_info += f'object {i+1}: {o[0]}\n'
            response += f'object {i+1}: {o[1:]}\n'
        response = response.strip()
        prompt = prompt.format(object_info)

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": frames},
                    {"type": "text", "text": prompt}
                ]
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": response}
                ]
            }
        ]
        
        return messages

    def get_img_transform(self):
        # Per-dataset frame preprocessing; keys must match data_dict['dataset'].
        return {
            'webvid': return_same,
            'hdvila': transforms.Compose([
                transforms.Resize(size=256),
                transforms.CenterCrop(size=(256, 256))
            ]),
            'hdvila_in_and_out_boxes': InAndOutCropAndResize(size=(256, 256))
        }
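
# Minimal usage sketch (illustration only: the image_processing_config keys
# match what transform() reads; nothing else about the call site is implied):
#
#   parser = ObjectTrackingParser(n_frames=8, max_objects=3)
#   messages = parser.transform(
#       parser.data_temp,
#       image_processing_config={"do_padding": False},
#   )
#   # messages[0] is the user turn (video frames + prompt listing each
#   # object's first-frame box); messages[1] is the assistant turn with the
#   # boxes for the remaining frames.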