File size: 3,297 Bytes
3e1d9f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import json
import os

import numpy as np
from torch.utils.data import Dataset

from .io import read_img_general


class QuestionTemplateMixin:
    """Mixin that equips a class with a pool of question templates.

    The pool comes from exactly one of two sources: a single inline
    ``template_string`` or a JSON ``template_file``. It is stored on
    ``self.templates`` and every entry is checked to contain each
    placeholder exactly once.
    """

    def __init__(
            self,
            *args,
            template_string=None,
            template_file=None,
            max_dynamic_size=None,
            placeholders=None,
            **kwargs
    ):
        """Load the templates and validate them against ``placeholders``.

        Args:
            *args: Forwarded unchanged to the next ``__init__`` in the MRO.
            template_string: A single template; mutually exclusive with
                ``template_file``.
            template_file: Path to a UTF-8 JSON file whose content is used
                as the template pool; mutually exclusive with
                ``template_string``.
            max_dynamic_size: If not ``None``, only the first
                ``max_dynamic_size`` templates are kept.
            placeholders: Iterable of substrings; each must occur exactly
                once in the string form of every kept template.
            **kwargs: Forwarded unchanged to the next ``__init__`` in the MRO.

        Raises:
            ValueError: If neither or both of ``template_string`` and
                ``template_file`` are given.
            AssertionError: If ``placeholders`` is ``None`` or a template
                does not contain a placeholder exactly once.
        """
        super().__init__(*args, **kwargs)
        self.template_string = template_string
        self.template_file = template_file
        self.max_dynamic_size = max_dynamic_size
        self.placeholders = placeholders
        if template_string is None and template_file is None:
            raise ValueError("assign either template_string or template_file")
        if template_string is not None and template_file is not None:
            raise ValueError(f"assign both template_string and template_file:\nstring:{template_string}\nfile:{template_file}")
        if template_string is not None:
            self.templates = [self.template_string]
        else:
            # The context manager closes the handle deterministically instead
            # of leaving it open until the file object is garbage collected.
            with open(template_file, 'r', encoding='utf8') as f:
                self.templates = json.load(f)
        if self.max_dynamic_size is not None:
            self.templates = self.templates[: self.max_dynamic_size]

        # sanity check: every placeholder appears exactly once per template
        assert self.placeholders is not None
        for template in self.templates:
            for placeholder in self.placeholders:
                assert str(template).count(placeholder) == 1, f"template: {template}\nplaceholder:{placeholder}"

    def get_template(self):
        """Return one template picked at random from ``self.templates``."""
        import random
        return random.choice(self.templates)

    def template_nums(self):
        """Return the number of templates currently held."""
        return len(self.templates)


class MInstrDataset(QuestionTemplateMixin, Dataset):
    """Base dataset backed by an annotation file with one JSON document per line.

    Lines are kept as raw text and decoded on demand by ``get_raw_item``.
    Subclasses must implement ``__getitem__``.
    """

    _repr_indent = 4

    def __init__(self, filename, image_folder=None, seed=None, **kwargs):
        """Read the annotation file and set up the random generator.

        Args:
            filename: Path to the UTF-8 annotation file (one JSON document
                per line).
            image_folder: Optional directory that image paths are joined
                onto in ``get_image``.
            seed: Seed passed to ``np.random.default_rng``.
            **kwargs: Forwarded to the next ``__init__`` in the MRO
                (template options of ``QuestionTemplateMixin``).
        """
        super().__init__(**kwargs)
        self.filename = filename
        self.image_folder = image_folder
        self.rng = np.random.default_rng(seed)

        # Whitespace-only lines are skipped: they cannot be decoded as JSON,
        # so keeping them would inflate __len__ and make get_raw_item fail
        # on those indices.
        with open(filename, 'r', encoding='utf8') as f:
            self.data = [line for line in f if line.strip()]

    def get_raw_item(self, index):
        """Decode and return the JSON document stored at ``index``."""
        return json.loads(self.data[index])

    def get_image(self, image_path):
        """Load an image via ``read_img_general``.

        ``image_path`` is joined onto ``self.image_folder`` when a folder
        was configured; otherwise it is used as given.
        """
        if self.image_folder is not None:
            image_path = os.path.join(self.image_folder, image_path)
        image = read_img_general(image_path)
        return image

    def get_template(self):
        """Return one template drawn with this dataset's own generator ``self.rng``."""
        return self.rng.choice(self.templates)

    def __getitem__(self, index):
        """Not implemented here; subclasses must override this."""
        raise NotImplementedError

    def __len__(self):
        """Return the number of annotation lines held in ``self.data``."""
        return len(self.data)

    def __repr__(self) -> str:
        """Return a multi-line summary: class name, size, files and ``extra_repr`` lines."""
        head = "Dataset " + self.__class__.__name__
        body = [
            f"Number of datapoints: {self.__len__()}",
            f"ann file: {self.filename}"
        ]
        if self.image_folder is not None:
            body.append(f"image folder: {self.image_folder}")
        body += self.extra_repr().splitlines()
        lines = [head] + [" " * self._repr_indent + line for line in body]
        return "\n".join(lines)

    # noinspection PyMethodMayBeStatic
    def extra_repr(self) -> str:
        """Hook for subclasses to add lines to ``__repr__``; empty by default."""
        return ""