"""LVIS bounding-box datasets streamed from tar shards, plus prompt helpers.

Two dataset classes are defined:

* ``LVISBBOXDataset`` asks the model to list every object with its box.
* ``LVISBboxToObjectDataset`` shows one box and asks which object is in it.
"""

import itertools
import json
import os
import pickle
import random
import time

import matplotlib.pyplot as plt
import numpy as np
import skimage.io as io
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon, Rectangle
from PIL import Image
from torch.utils.data import Dataset

from minigpt4.datasets.datasets.base_dataset import BaseDataset
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset

# Box coordinates are multiplied by this value and truncated to integers
# before being rendered as "<x1><y1><x2><y2>".  The range check in
# ``_bbox_tokens`` implies the incoming values are fractions of the image
# size -- NOTE(review): confirm against the shard writer.
_BBOX_GRID = 100


def _bbox_tokens(boxes):
    """Render every ``[x, y, w, h]`` box as a ``<x1><y1><x2><y2>`` string.

    Each component is scaled by ``_BBOX_GRID`` and truncated with ``int``;
    the far corner is obtained by adding the scaled width/height to the
    scaled origin.

    Raises:
        ValueError: if any resulting coordinate falls outside
            ``[0, _BBOX_GRID]``.  An explicit raise is used instead of
            ``assert`` so the check survives ``python -O``; inside the
            dataset pipeline the error is handled like any other per-sample
            failure.
    """
    rendered = []
    for box in boxes:
        x1 = int(box[0] * _BBOX_GRID)
        y1 = int(box[1] * _BBOX_GRID)
        x2 = x1 + int(box[2] * _BBOX_GRID)
        y2 = y1 + int(box[3] * _BBOX_GRID)
        if not all(0 <= value <= _BBOX_GRID for value in (x1, x2, y1, y2)):
            raise ValueError(
                "bounding box {!r} maps outside [0, {}]: {}".format(
                    box, _BBOX_GRID, (x1, y1, x2, y2)
                )
            )
        rendered.append("<{}><{}><{}><{}>".format(x1, y1, x2, y2))
    return rendered


def _build_pipeline(location, vis_processor, to_dict):
    """Create the shard pipeline shared by both dataset classes.

    Stages: resampled shards -> tar samples -> shuffle buffer of 1000 ->
    decode images as RGB PIL -> ``(jpg, json)`` tuples -> ``vis_processor``
    applied to the image -> ``to_dict`` applied to the tuple.  Every stage
    uses ``warn_and_continue`` so a failing sample is reported and skipped.
    """
    # The module never imported ``wds`` although both classes use it, which
    # made instantiation fail with NameError.  Imported here, at the point of
    # use, so merely importing this module still does not require the package.
    import webdataset as wds

    return wds.DataPipeline(
        wds.ResampledShards(location),
        wds.tarfile_to_samples(handler=wds.warn_and_continue),
        wds.shuffle(1000, handler=wds.warn_and_continue),
        wds.decode("pilrgb", handler=wds.warn_and_continue),
        wds.to_tuple("jpg", "json", handler=wds.warn_and_continue),
        wds.map_tuple(vis_processor, handler=wds.warn_and_continue),
        wds.map(to_dict, handler=wds.warn_and_continue),
    )


def sample_object_bbox(objects, bbox):
    """Return all ``{object,bbox}`` pairs in random order, joined by ``", "``.

    Args:
        objects: object names.
        bbox: box strings, parallel to ``objects``; surrounding whitespace
            on each one is stripped.

    Returns:
        A string such as ``"{cat,<1><2><3><4>}, {dog,<5><6><7><8>}"``, or
        ``""`` when there are no pairs.  Extra items in the longer of the two
        inputs are ignored (``zip`` semantics).
    """
    pairs = list(zip(objects, bbox))
    random.shuffle(pairs)
    # Built directly rather than by post-processing ``str(list_of_sets)``:
    # that approach leaked double quotes for names containing an apostrophe
    # and repr-escaped backslashes/newlines.
    return ", ".join(
        "{" + "{},{}".format(name, box.strip()) + "}" for name, box in pairs
    )


def bbox_to_object(objects, bbox):
    """Pick one random object and return ``("{<bbox>}", "<object>")``.

    Raises:
        ValueError: if ``objects`` is empty.
    """
    chosen = random.randrange(len(objects))
    sample_object = str(objects[chosen])
    sample_bbox = r"{" + str(bbox[chosen]) + "}"
    return sample_bbox, sample_object


def object_to_bbox(objects, bbox, center_point):
    """Build a question about a random object's centre point and its answer.

    Returns:
        ``(instruction, answer)`` where ``answer`` is ``"{object,bbox}"``.

    Raises:
        ValueError: if ``objects`` is empty.
    """
    chosen = random.randrange(len(objects))
    # NOTE(review): the reviewed source shows a line break inside this
    # literal, right before the closing quote; the exact character is not
    # recoverable, so only the trailing space is kept -- confirm.
    instruction = (
        "what is object and the bounding box in the center coordinate of "
        + str(center_point[chosen])
        + "? "
    )
    answer = "{" + str(objects[chosen]) + "," + str(bbox[chosen]) + "}"
    return instruction, answer


class LVISBBOXDataset(BaseDataset):
    """Samples that ask for every object in the image with its bounding box."""

    def __init__(self, vis_processor, text_processor, location):
        """Store the processors and build the shard pipeline over ``location``."""
        super().__init__(vis_processor=vis_processor, text_processor=text_processor)

        self.inner_dataset = _build_pipeline(
            location, self.vis_processor, self.to_dict
        )

    def to_dict(self, sample):
        """Convert an ``(image, annotation)`` tuple into a training record.

        ``annotation`` must provide ``"objects"`` and ``"bbox"``.  The answer
        lists every object/box pair in shuffled order.
        """
        image, annotation = sample[0], sample[1]
        objects = annotation["objects"]
        new_bboxes = _bbox_tokens(annotation["bbox"])

        # NOTE(review): the reviewed source shows a line break inside this
        # literal, right before the closing quote; the exact character is not
        # recoverable, so only the trailing space is kept -- confirm.
        instruction = r"Given an image, identify the objects and their bounding boxes in the format of {object,x1 y1 x2 y2}. "
        instruction = " {}".format(self.text_processor(instruction))

        answer = sample_object_bbox(objects, new_bboxes)

        return {
            "image": image,
            "instruction_input": instruction,
            "answer": self.text_processor(answer),
            "data_type": "bbox",
            "question_split": True,
        }


class LVISBboxToObjectDataset(BaseDataset):
    """Samples that show one bounding box and ask which object it contains."""

    def __init__(self, vis_processor, text_processor, location):
        """Store the processors, build the pipeline and the question templates."""
        super().__init__(vis_processor=vis_processor, text_processor=text_processor)

        self.inner_dataset = _build_pipeline(
            location, self.vis_processor, self.to_dict
        )

        # Bare question templates; ``{}`` receives the "{<x1><y1><x2><y2>}"
        # string.  No "###Human:/###Assistant:" wrapper is added here.
        self.instruction_pool = [
            "what object is in this bounding box location {} ",
            "what object is in this location {} ",
            "identify the object present at this location {} ",
            "what is it in bounding box location{} ",
            "describe this object in {} ",
            "this {} is ",
            "the object in {} is ",
            "please tell me what is inside the bounding box position {} ",
            "what can you find in the bounding box area at position {}? ",
            "what is the object occupying this area {} ",
            "could you identify the content within the bounding box located at {} ",
        ]

    def to_dict(self, sample):
        """Convert an ``(image, annotation)`` tuple into a training record.

        One object is drawn at random; a random template is filled with its
        box to form the instruction and the object name is the answer.
        """
        image, annotation = sample[0], sample[1]
        objects = annotation["objects"]
        new_bboxes = _bbox_tokens(annotation["bbox"])

        bbox, object_name = bbox_to_object(objects, new_bboxes)

        instruction = random.choice(self.instruction_pool).format(bbox)
        instruction = " {} ".format(instruction)

        return {
            "image": image,
            "instruction_input": instruction,
            "answer": self.text_processor(object_name),
            "data_type": "bbox",
            "question_split": True,
        }