Spaces:
Sleeping
Sleeping
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import os | |
from lavis.common.registry import registry | |
from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder | |
from lavis.datasets.datasets.image_text_pair_datasets import ImageTextPairDataset | |
from lavis.datasets.datasets.laion_dataset import LaionDataset | |
class ConceptualCaption3MBuilder(BaseDatasetBuilder):
    """Builder settings that point at the ``conceptual_caption`` 3M config.

    The class only declares two attributes; how they are consumed is decided
    by ``BaseDatasetBuilder`` (not visible here -- confirm against the base).
    """

    # Named config -> YAML path; only a "default" entry is declared.
    DATASET_CONFIG_DICT = dict(
        default="configs/datasets/conceptual_caption/defaults_3m.yaml",
    )

    # Dataset class associated with the train split.
    train_dataset_cls = ImageTextPairDataset
class ConceptualCaption12MBuilder(BaseDatasetBuilder):
    """Builder settings that point at the ``conceptual_caption`` 12M config.

    The class only declares two attributes; how they are consumed is decided
    by ``BaseDatasetBuilder`` (not visible here -- confirm against the base).
    """

    # Named config -> YAML path; only a "default" entry is declared.
    DATASET_CONFIG_DICT = dict(
        default="configs/datasets/conceptual_caption/defaults_12m.yaml",
    )

    # Dataset class associated with the train split.
    train_dataset_cls = ImageTextPairDataset
class SBUCaptionBuilder(BaseDatasetBuilder):
    """Builder settings that point at the ``sbu_caption`` default config.

    The class only declares two attributes; how they are consumed is decided
    by ``BaseDatasetBuilder`` (not visible here -- confirm against the base).
    """

    # Named config -> YAML path; only a "default" entry is declared.
    DATASET_CONFIG_DICT = dict(
        default="configs/datasets/sbu_caption/defaults.yaml",
    )

    # Dataset class associated with the train split.
    train_dataset_cls = ImageTextPairDataset
class VGCaptionBuilder(BaseDatasetBuilder):
    """Builder settings that point at the ``vg`` caption config.

    The class only declares two attributes; how they are consumed is decided
    by ``BaseDatasetBuilder`` (not visible here -- confirm against the base).
    """

    # Named config -> YAML path; only a "default" entry is declared.
    DATASET_CONFIG_DICT = dict(
        default="configs/datasets/vg/defaults_caption.yaml",
    )

    # Dataset class associated with the train split.
    train_dataset_cls = ImageTextPairDataset
class Laion2BMultiBuilder(BaseDatasetBuilder):
    """Builder for the LAION 2B-multi config that yields a train split only.

    ``build`` constructs ``train_dataset_cls`` from the configured storage
    location and exposes its ``inner_dataset`` attribute under ``"train"``.
    """

    # Named config -> YAML path; only a "default" entry is declared.
    DATASET_CONFIG_DICT = dict(
        default="configs/datasets/laion/defaults_2B_multi.yaml",
    )

    # Dataset class instantiated by ``build`` for the train split.
    train_dataset_cls = LaionDataset

    def _download_ann(self):
        """Do nothing.

        NOTE(review): presumably overrides an annotation-download hook on
        ``BaseDatasetBuilder`` -- confirm against the base class.
        """

    def _download_vis(self):
        """Do nothing.

        NOTE(review): presumably overrides a visual-data download hook on
        ``BaseDatasetBuilder`` -- confirm against the base class.
        """

    def build(self):
        """Create the train dataset and return it keyed by split name.

        Returns:
            dict: ``{"train": <inner_dataset>}`` where the value is the
            ``inner_dataset`` attribute of the constructed dataset object.
        """
        self.build_processors()

        build_info = self.config.build_info

        # laion dataset only has train split
        split_name = "train"

        wrapper = self.train_dataset_cls(
            vis_processor=self.vis_processors[split_name],
            text_processor=self.text_processors[split_name],
            location=build_info.storage,
        )

        # [NOTE] return inner_datasets (wds.DataPipeline), not the wrapper
        return {split_name: wrapper.inner_dataset}