taming_transformer
This view is limited to 50 files because it contains too many changes.
- .gitattributes +1 -0
- =1.0.8 +0 -0
- =2.0.0 +0 -0
- License.txt +19 -0
- __pycache__/main.cpython-312.pyc +0 -0
- environment.yaml +25 -0
- main.py +585 -0
- scripts/extract_depth.py +112 -0
- scripts/extract_segmentation.py +130 -0
- scripts/extract_submodel.py +17 -0
- scripts/make_samples.py +292 -0
- scripts/make_scene_samples.py +198 -0
- scripts/sample_conditional.py +355 -0
- scripts/sample_fast.py +260 -0
- scripts/taming-transformers.ipynb +0 -0
- setup.py +13 -0
- taming/__pycache__/lr_scheduler.cpython-312.pyc +0 -0
- taming/__pycache__/util.cpython-312.pyc +0 -0
- taming/data/.ipynb_checkpoints/utils-checkpoint.py +171 -0
- taming/data/__pycache__/helper_types.cpython-312.pyc +0 -0
- taming/data/__pycache__/utils.cpython-312.pyc +0 -0
- taming/data/ade20k.py +124 -0
- taming/data/annotated_objects_coco.py +139 -0
- taming/data/annotated_objects_dataset.py +218 -0
- taming/data/annotated_objects_open_images.py +137 -0
- taming/data/base.py +70 -0
- taming/data/coco.py +176 -0
- taming/data/conditional_builder/objects_bbox.py +60 -0
- taming/data/conditional_builder/objects_center_points.py +168 -0
- taming/data/conditional_builder/utils.py +105 -0
- taming/data/custom.py +38 -0
- taming/data/faceshq.py +134 -0
- taming/data/helper_types.py +49 -0
- taming/data/image_transforms.py +132 -0
- taming/data/imagenet.py +558 -0
- taming/data/open_images_helper.py +379 -0
- taming/data/sflckr.py +91 -0
- taming/data/utils.py +171 -0
- taming/lr_scheduler.py +34 -0
- taming/models/__pycache__/vqgan.cpython-312.pyc +0 -0
- taming/models/cond_transformer.py +352 -0
- taming/models/dummy_cond_stage.py +22 -0
- taming/models/vqgan.py +404 -0
- taming/modules/__pycache__/util.cpython-312.pyc +0 -0
- taming/modules/diffusionmodules/__pycache__/model.cpython-312.pyc +0 -0
- taming/modules/diffusionmodules/model.py +776 -0
- taming/modules/discriminator/__pycache__/model.cpython-312.pyc +0 -0
- taming/modules/discriminator/model.py +67 -0
- taming/modules/losses/__init__.py +2 -0
- taming/modules/losses/__pycache__/__init__.cpython-312.pyc +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+scripts/reconstruction_usage.ipynb filter=lfs diff=lfs merge=lfs -text
=1.0.8
ADDED
The diff for this file is too large to render.
=2.0.0
ADDED
File without changes
License.txt
ADDED
@@ -0,0 +1,19 @@
Copyright (c) 2020 Patrick Esser and Robin Rombach and Björn Ommer

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
OR OTHER DEALINGS IN THE SOFTWARE.
__pycache__/main.cpython-312.pyc
ADDED
Binary file (27.2 kB).
environment.yaml
ADDED
@@ -0,0 +1,25 @@
name: taming
channels:
  - pytorch
  - defaults
dependencies:
  - python=3.8.5
  - pip=20.3
  - cudatoolkit=10.2
  - pytorch=1.7.0
  - torchvision=0.8.1
  - numpy=1.19.2
  - pip:
    - albumentations==0.4.3
    - opencv-python==4.1.2.30
    - pudb==2019.2
    - imageio==2.9.0
    - imageio-ffmpeg==0.4.2
    - pytorch-lightning==1.0.8
    - omegaconf==2.0.0
    - test-tube>=0.7.5
    - streamlit>=0.73.1
    - einops==0.3.0
    - more-itertools>=8.0.0
    - transformers==4.3.1
    - -e .
main.py
ADDED
@@ -0,0 +1,585 @@
import argparse, os, sys, datetime, glob, importlib
from omegaconf import OmegaConf
import numpy as np
from PIL import Image
import torch
import torchvision
from torch.utils.data import random_split, DataLoader, Dataset
import pytorch_lightning as pl
from pytorch_lightning import seed_everything
from pytorch_lightning.trainer import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, Callback, LearningRateMonitor
from pytorch_lightning.utilities import rank_zero_only

from taming.data.utils import custom_collate


def get_obj_from_str(string, reload=False):
    module, cls = string.rsplit(".", 1)
    if reload:
        module_imp = importlib.import_module(module)
        importlib.reload(module_imp)
    return getattr(importlib.import_module(module, package=None), cls)


def get_parser(**parser_kwargs):
    def str2bool(v):
        if isinstance(v, bool):
            return v
        if v.lower() in ("yes", "true", "t", "y", "1"):
            return True
        elif v.lower() in ("no", "false", "f", "n", "0"):
            return False
        else:
            raise argparse.ArgumentTypeError("Boolean value expected.")

    parser = argparse.ArgumentParser(**parser_kwargs)
    parser.add_argument(
        "-n",
        "--name",
        type=str,
        const=True,
        default="",
        nargs="?",
        help="postfix for logdir",
    )
    parser.add_argument(
        "-r",
        "--resume",
        type=str,
        const=True,
        default="",
        nargs="?",
        help="resume from logdir or checkpoint in logdir",
    )
    parser.add_argument(
        "-b",
        "--base",
        nargs="*",
        metavar="base_config.yaml",
        help="paths to base configs. Loaded from left-to-right. "
        "Parameters can be overwritten or added with command-line options of the form `--key value`.",
        default=list(),
    )
    parser.add_argument(
        "-t",
        "--train",
        type=str2bool,
        const=True,
        default=False,
        nargs="?",
        help="train",
    )
    parser.add_argument(
        "--no-test",
        type=str2bool,
        const=True,
        default=False,
        nargs="?",
        help="disable test",
    )
    parser.add_argument("-p", "--project", help="name of new or path to existing project")
    parser.add_argument(
        "-d",
        "--debug",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="enable post-mortem debugging",
    )
    parser.add_argument(
        "-s",
        "--seed",
        type=int,
        default=23,
        help="seed for seed_everything",
    )
    parser.add_argument(
        "-f",
        "--postfix",
        type=str,
        default="",
        help="post-postfix for default name",
    )

    return parser


def nondefault_trainer_args(opt):
    parser = argparse.ArgumentParser()
    parser = Trainer.add_argparse_args(parser)
    args = parser.parse_args([])
    return sorted(k for k in vars(args) if getattr(opt, k) != getattr(args, k))


def instantiate_from_config(config):
    if not "target" in config:
        raise KeyError("Expected key `target` to instantiate.")
    return get_obj_from_str(config["target"])(**config.get("params", dict()))


class WrappedDataset(Dataset):
    """Wraps an arbitrary object with __len__ and __getitem__ into a pytorch dataset"""
    def __init__(self, dataset):
        self.data = dataset

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]


class DataModuleFromConfig(pl.LightningDataModule):
    def __init__(self, batch_size, train=None, validation=None, test=None,
                 wrap=False, num_workers=None):
        super().__init__()
        self.batch_size = batch_size
        self.dataset_configs = dict()
        self.num_workers = num_workers if num_workers is not None else batch_size*2
        if train is not None:
            self.dataset_configs["train"] = train
            self.train_dataloader = self._train_dataloader
        if validation is not None:
            self.dataset_configs["validation"] = validation
            self.val_dataloader = self._val_dataloader
        if test is not None:
            self.dataset_configs["test"] = test
            self.test_dataloader = self._test_dataloader
        self.wrap = wrap

    def prepare_data(self):
        for data_cfg in self.dataset_configs.values():
            instantiate_from_config(data_cfg)

    def setup(self, stage=None):
        self.datasets = dict(
            (k, instantiate_from_config(self.dataset_configs[k]))
            for k in self.dataset_configs)
        if self.wrap:
            for k in self.datasets:
                self.datasets[k] = WrappedDataset(self.datasets[k])

    def _train_dataloader(self):
        return DataLoader(self.datasets["train"], batch_size=self.batch_size,
                          num_workers=self.num_workers, shuffle=True, collate_fn=custom_collate)

    def _val_dataloader(self):
        return DataLoader(self.datasets["validation"],
                          batch_size=self.batch_size,
                          num_workers=self.num_workers, collate_fn=custom_collate)

    def _test_dataloader(self):
        return DataLoader(self.datasets["test"], batch_size=self.batch_size,
                          num_workers=self.num_workers, collate_fn=custom_collate)


class SetupCallback(Callback):
    def __init__(self, resume, now, logdir, ckptdir, cfgdir, config, lightning_config):
        super().__init__()
        self.resume = resume
        self.now = now
        self.logdir = logdir
        self.ckptdir = ckptdir
        self.cfgdir = cfgdir
        self.config = config
        self.lightning_config = lightning_config

    def on_pretrain_routine_start(self, trainer, pl_module):
        if trainer.global_rank == 0:
            # Create logdirs and save configs
            os.makedirs(self.logdir, exist_ok=True)
            os.makedirs(self.ckptdir, exist_ok=True)
            os.makedirs(self.cfgdir, exist_ok=True)

            print("Project config")
            print(self.config.pretty())
            OmegaConf.save(self.config,
                           os.path.join(self.cfgdir, "{}-project.yaml".format(self.now)))

            print("Lightning config")
            print(self.lightning_config.pretty())
            OmegaConf.save(OmegaConf.create({"lightning": self.lightning_config}),
                           os.path.join(self.cfgdir, "{}-lightning.yaml".format(self.now)))

        else:
            # ModelCheckpoint callback created log directory --- remove it
            if not self.resume and os.path.exists(self.logdir):
                dst, name = os.path.split(self.logdir)
                dst = os.path.join(dst, "child_runs", name)
                os.makedirs(os.path.split(dst)[0], exist_ok=True)
                try:
                    os.rename(self.logdir, dst)
                except FileNotFoundError:
                    pass


class ImageLogger(Callback):
    def __init__(self, batch_frequency, max_images, clamp=True, increase_log_steps=True):
        super().__init__()
        self.batch_freq = batch_frequency
        self.max_images = max_images
        self.logger_log_images = {
            pl.loggers.WandbLogger: self._wandb,
            pl.loggers.TestTubeLogger: self._testtube,
        }
        self.log_steps = [2 ** n for n in range(int(np.log2(self.batch_freq)) + 1)]
        if not increase_log_steps:
            self.log_steps = [self.batch_freq]
        self.clamp = clamp

    @rank_zero_only
    def _wandb(self, pl_module, images, batch_idx, split):
        raise ValueError("No way wandb")
        grids = dict()
        for k in images:
            grid = torchvision.utils.make_grid(images[k])
            grids[f"{split}/{k}"] = wandb.Image(grid)
        pl_module.logger.experiment.log(grids)

    @rank_zero_only
    def _testtube(self, pl_module, images, batch_idx, split):
        for k in images:
            grid = torchvision.utils.make_grid(images[k])
            grid = (grid+1.0)/2.0  # -1,1 -> 0,1; c,h,w

            tag = f"{split}/{k}"
            pl_module.logger.experiment.add_image(
                tag, grid,
                global_step=pl_module.global_step)

    @rank_zero_only
    def log_local(self, save_dir, split, images,
                  global_step, current_epoch, batch_idx):
        root = os.path.join(save_dir, "images", split)
        for k in images:
            grid = torchvision.utils.make_grid(images[k], nrow=4)

            grid = (grid+1.0)/2.0  # -1,1 -> 0,1; c,h,w
            grid = grid.transpose(0,1).transpose(1,2).squeeze(-1)
            grid = grid.numpy()
            grid = (grid*255).astype(np.uint8)
            filename = "{}_gs-{:06}_e-{:06}_b-{:06}.png".format(
                k,
                global_step,
                current_epoch,
                batch_idx)
            path = os.path.join(root, filename)
            os.makedirs(os.path.split(path)[0], exist_ok=True)
            Image.fromarray(grid).save(path)

    def log_img(self, pl_module, batch, batch_idx, split="train"):
        if (self.check_frequency(batch_idx) and  # batch_idx % self.batch_freq == 0
                hasattr(pl_module, "log_images") and
                callable(pl_module.log_images) and
                self.max_images > 0):
            logger = type(pl_module.logger)

            is_train = pl_module.training
            if is_train:
                pl_module.eval()

            with torch.no_grad():
                images = pl_module.log_images(batch, split=split, pl_module=pl_module)

            for k in images:
                N = min(images[k].shape[0], self.max_images)
                images[k] = images[k][:N]
                if isinstance(images[k], torch.Tensor):
                    images[k] = images[k].detach().cpu()
                    if self.clamp:
                        images[k] = torch.clamp(images[k], -1., 1.)

            self.log_local(pl_module.logger.save_dir, split, images,
                           pl_module.global_step, pl_module.current_epoch, batch_idx)

            logger_log_images = self.logger_log_images.get(logger, lambda *args, **kwargs: None)
            logger_log_images(pl_module, images, pl_module.global_step, split)

            if is_train:
                pl_module.train()

    def check_frequency(self, batch_idx):
        if (batch_idx % self.batch_freq) == 0 or (batch_idx in self.log_steps):
            try:
                self.log_steps.pop(0)
            except IndexError:
                pass
            return True
        return False

    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
        self.log_img(pl_module, batch, batch_idx, split="train")

    def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
        self.log_img(pl_module, batch, batch_idx, split="val")



if __name__ == "__main__":
    # custom parser to specify config files, train, test and debug mode,
    # postfix, resume.
    # `--key value` arguments are interpreted as arguments to the trainer.
    # `nested.key=value` arguments are interpreted as config parameters.
    # configs are merged from left-to-right followed by command line parameters.

    # model:
    #   base_learning_rate: float
    #   target: path to lightning module
    #   params:
    #       key: value
    # data:
    #   target: main.DataModuleFromConfig
    #   params:
    #      batch_size: int
    #      wrap: bool
    #      train:
    #          target: path to train dataset
    #          params:
    #              key: value
    #      validation:
    #          target: path to validation dataset
    #          params:
    #              key: value
    #      test:
    #          target: path to test dataset
    #          params:
    #              key: value
    # lightning: (optional, has sane defaults and can be specified on cmdline)
    #   trainer:
    #       additional arguments to trainer
    #   logger:
    #       logger to instantiate
    #   modelcheckpoint:
    #       modelcheckpoint to instantiate
    #   callbacks:
    #       callback1:
    #           target: importpath
    #           params:
    #               key: value

    now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")

    # add cwd for convenience and to make classes in this file available when
    # running as `python main.py`
    # (in particular `main.DataModuleFromConfig`)
    sys.path.append(os.getcwd())

    parser = get_parser()
    parser = Trainer.add_argparse_args(parser)

    opt, unknown = parser.parse_known_args()
    if opt.name and opt.resume:
        raise ValueError(
            "-n/--name and -r/--resume cannot be specified both."
            "If you want to resume training in a new log folder, "
            "use -n/--name in combination with --resume_from_checkpoint"
        )
    if opt.resume:
        if not os.path.exists(opt.resume):
            raise ValueError("Cannot find {}".format(opt.resume))
        if os.path.isfile(opt.resume):
            paths = opt.resume.split("/")
            idx = len(paths)-paths[::-1].index("logs")+1
            logdir = "/".join(paths[:idx])
            ckpt = opt.resume
        else:
            assert os.path.isdir(opt.resume), opt.resume
            logdir = opt.resume.rstrip("/")
            ckpt = os.path.join(logdir, "checkpoints", "last.ckpt")

        opt.resume_from_checkpoint = ckpt
        base_configs = sorted(glob.glob(os.path.join(logdir, "configs/*.yaml")))
        opt.base = base_configs+opt.base
        _tmp = logdir.split("/")
        nowname = _tmp[_tmp.index("logs")+1]
    else:
        if opt.name:
            name = "_"+opt.name
        elif opt.base:
            cfg_fname = os.path.split(opt.base[0])[-1]
            cfg_name = os.path.splitext(cfg_fname)[0]
            name = "_"+cfg_name
        else:
            name = ""
        nowname = now+name+opt.postfix
        logdir = os.path.join("logs", nowname)

    ckptdir = os.path.join(logdir, "checkpoints")
    cfgdir = os.path.join(logdir, "configs")
    seed_everything(opt.seed)

    try:
        # init and save configs
        configs = [OmegaConf.load(cfg) for cfg in opt.base]
        cli = OmegaConf.from_dotlist(unknown)
        config = OmegaConf.merge(*configs, cli)
        lightning_config = config.pop("lightning", OmegaConf.create())
        # merge trainer cli with config
        trainer_config = lightning_config.get("trainer", OmegaConf.create())
        # default to ddp
        trainer_config["distributed_backend"] = "ddp"
        for k in nondefault_trainer_args(opt):
            trainer_config[k] = getattr(opt, k)
        if not "gpus" in trainer_config:
            del trainer_config["distributed_backend"]
            cpu = True
        else:
            gpuinfo = trainer_config["gpus"]
            print(f"Running on GPUs {gpuinfo}")
            cpu = False
        trainer_opt = argparse.Namespace(**trainer_config)
        lightning_config.trainer = trainer_config

        # model
        model = instantiate_from_config(config.model)

        # trainer and callbacks
        trainer_kwargs = dict()

        # default logger configs
        # NOTE wandb < 0.10.0 interferes with shutdown
        # wandb >= 0.10.0 seems to fix it but still interferes with pudb
        # debugging (wrongly sized pudb ui)
        # thus prefer testtube for now
        default_logger_cfgs = {
            "wandb": {
                "target": "pytorch_lightning.loggers.WandbLogger",
                "params": {
                    "name": nowname,
                    "save_dir": logdir,
                    "offline": opt.debug,
                    "id": nowname,
                }
            },
            "testtube": {
                "target": "pytorch_lightning.loggers.TestTubeLogger",
                "params": {
                    "name": "testtube",
                    "save_dir": logdir,
                }
            },
        }
        default_logger_cfg = default_logger_cfgs["testtube"]
        logger_cfg = lightning_config.logger or OmegaConf.create()
        logger_cfg = OmegaConf.merge(default_logger_cfg, logger_cfg)
        trainer_kwargs["logger"] = instantiate_from_config(logger_cfg)

        # modelcheckpoint - use TrainResult/EvalResult(checkpoint_on=metric) to
        # specify which metric is used to determine best models
        default_modelckpt_cfg = {
            "target": "pytorch_lightning.callbacks.ModelCheckpoint",
            "params": {
                "dirpath": ckptdir,
                "filename": "{epoch:06}",
                "verbose": True,
                "save_last": True,
            }
        }
        if hasattr(model, "monitor"):
            print(f"Monitoring {model.monitor} as checkpoint metric.")
            default_modelckpt_cfg["params"]["monitor"] = model.monitor
            default_modelckpt_cfg["params"]["save_top_k"] = 3

        modelckpt_cfg = lightning_config.modelcheckpoint or OmegaConf.create()
        modelckpt_cfg = OmegaConf.merge(default_modelckpt_cfg, modelckpt_cfg)
        trainer_kwargs["checkpoint_callback"] = instantiate_from_config(modelckpt_cfg)

        # add callback which sets up log directory
        default_callbacks_cfg = {
            "setup_callback": {
                "target": "main.SetupCallback",
                "params": {
                    "resume": opt.resume,
                    "now": now,
                    "logdir": logdir,
                    "ckptdir": ckptdir,
                    "cfgdir": cfgdir,
                    "config": config,
                    "lightning_config": lightning_config,
                }
            },
            "image_logger": {
                "target": "main.ImageLogger",
                "params": {
                    "batch_frequency": 750,
                    "max_images": 4,
                    "clamp": True
                }
            },
            "learning_rate_logger": {
                "target": "main.LearningRateMonitor",
                "params": {
                    "logging_interval": "step",
                    #"log_momentum": True
                }
            },
        }
        callbacks_cfg = lightning_config.callbacks or OmegaConf.create()
        callbacks_cfg = OmegaConf.merge(default_callbacks_cfg, callbacks_cfg)
        trainer_kwargs["callbacks"] = [instantiate_from_config(callbacks_cfg[k]) for k in callbacks_cfg]

        trainer = Trainer.from_argparse_args(trainer_opt, **trainer_kwargs)

        # data
        data = instantiate_from_config(config.data)
        # NOTE according to https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
        # calling these ourselves should not be necessary but it is.
        # lightning still takes care of proper multiprocessing though
        data.prepare_data()
        data.setup()

        # configure learning rate
        bs, base_lr = config.data.params.batch_size, config.model.base_learning_rate
        if not cpu:
            ngpu = len(lightning_config.trainer.gpus.strip(",").split(','))
        else:
            ngpu = 1
        accumulate_grad_batches = lightning_config.trainer.accumulate_grad_batches or 1
        print(f"accumulate_grad_batches = {accumulate_grad_batches}")
        lightning_config.trainer.accumulate_grad_batches = accumulate_grad_batches
        model.learning_rate = accumulate_grad_batches * ngpu * bs * base_lr
        print("Setting learning rate to {:.2e} = {} (accumulate_grad_batches) * {} (num_gpus) * {} (batchsize) * {:.2e} (base_lr)".format(
            model.learning_rate, accumulate_grad_batches, ngpu, bs, base_lr))

        # allow checkpointing via USR1
        def melk(*args, **kwargs):
            # run all checkpoint hooks
            if trainer.global_rank == 0:
                print("Summoning checkpoint.")
                ckpt_path = os.path.join(ckptdir, "last.ckpt")
                trainer.save_checkpoint(ckpt_path)

        def divein(*args, **kwargs):
            if trainer.global_rank == 0:
                import pudb; pudb.set_trace()

        import signal
        signal.signal(signal.SIGUSR1, melk)
        signal.signal(signal.SIGUSR2, divein)

        # run
        if opt.train:
            try:
                trainer.fit(model, data)
            except Exception:
                melk()
                raise
        if not opt.no_test and not trainer.interrupted:
            trainer.test(model, data)
    except Exception:
        if opt.debug and trainer.global_rank==0:
            try:
                import pudb as debugger
            except ImportError:
                import pdb as debugger
            debugger.post_mortem()
        raise
    finally:
        # move newly created debug project to debug_runs
        if opt.debug and not opt.resume and trainer.global_rank==0:
            dst, name = os.path.split(logdir)
            dst = os.path.join(dst, "debug_runs", name)
            os.makedirs(os.path.split(dst)[0], exist_ok=True)
            os.rename(logdir, dst)
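
The instantiate_from_config helper above is the glue between the YAML configs and the Python classes: any config node with a `target` import path and an optional `params` dict is turned into an object. A minimal sketch of that convention, using torch.nn.Linear as a hypothetical stand-in target rather than one of this repository's classes:

# Sketch only: torch.nn.Linear is a stand-in target, not part of the repo's configs.
from omegaconf import OmegaConf
from main import instantiate_from_config

cfg = OmegaConf.create({
    "target": "torch.nn.Linear",
    "params": {"in_features": 8, "out_features": 4},
})
layer = instantiate_from_config(cfg)  # imports torch.nn, then calls Linear(in_features=8, out_features=4)

The real configs point `target` at classes such as taming.models.vqgan.VQModel or main.DataModuleFromConfig, with their constructor arguments under `params`.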
scripts/extract_depth.py
ADDED
@@ -0,0 +1,112 @@
import os
import torch
import numpy as np
from tqdm import trange
from PIL import Image


def get_state(gpu):
    import torch
    midas = torch.hub.load("intel-isl/MiDaS", "MiDaS")
    if gpu:
        midas.cuda()
    midas.eval()

    midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
    transform = midas_transforms.default_transform

    state = {"model": midas,
             "transform": transform}
    return state


def depth_to_rgba(x):
    assert x.dtype == np.float32
    assert len(x.shape) == 2
    y = x.copy()
    y.dtype = np.uint8
    y = y.reshape(x.shape+(4,))
    return np.ascontiguousarray(y)


def rgba_to_depth(x):
    assert x.dtype == np.uint8
    assert len(x.shape) == 3 and x.shape[2] == 4
    y = x.copy()
    y.dtype = np.float32
    y = y.reshape(x.shape[:2])
    return np.ascontiguousarray(y)


def run(x, state):
    model = state["model"]
    transform = state["transform"]
    hw = x.shape[:2]
    with torch.no_grad():
        prediction = model(transform((x + 1.0) * 127.5).cuda())
        prediction = torch.nn.functional.interpolate(
            prediction.unsqueeze(1),
            size=hw,
            mode="bicubic",
            align_corners=False,
        ).squeeze()
    output = prediction.cpu().numpy()
    return output


def get_filename(relpath, level=-2):
    # save class folder structure and filename:
    fn = relpath.split(os.sep)[level:]
    folder = fn[-2]
    file = fn[-1].split('.')[0]
    return folder, file


def save_depth(dataset, path, debug=False):
    os.makedirs(path)
    N = len(dset)
    if debug:
        N = 10
    state = get_state(gpu=True)
    for idx in trange(N, desc="Data"):
        ex = dataset[idx]
        image, relpath = ex["image"], ex["relpath"]
        folder, filename = get_filename(relpath)
        # prepare
        folderabspath = os.path.join(path, folder)
        os.makedirs(folderabspath, exist_ok=True)
        savepath = os.path.join(folderabspath, filename)
        # run model
        xout = run(image, state)
        I = depth_to_rgba(xout)
        Image.fromarray(I).save("{}.png".format(savepath))


if __name__ == "__main__":
    from taming.data.imagenet import ImageNetTrain, ImageNetValidation
    out = "data/imagenet_depth"
    if not os.path.exists(out):
        print("Please create a folder or symlink '{}' to extract depth data ".format(out) +
              "(be prepared that the output size will be larger than ImageNet itself).")
        exit(1)

    # go
    dset = ImageNetValidation()
    abspath = os.path.join(out, "val")
    if os.path.exists(abspath):
        print("{} exists - not doing anything.".format(abspath))
    else:
        print("preparing {}".format(abspath))
        save_depth(dset, abspath)
        print("done with validation split")

    dset = ImageNetTrain()
    abspath = os.path.join(out, "train")
    if os.path.exists(abspath):
        print("{} exists - not doing anything.".format(abspath))
    else:
        print("preparing {}".format(abspath))
        save_depth(dset, abspath)
        print("done with train split")

    print("done done.")
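
The depth_to_rgba / rgba_to_depth pair above stores the raw float32 bits of each MiDaS prediction in a 4-channel uint8 image, so the PNGs written by this script are a lossless container for the depth maps. A small round-trip check, a sketch assuming the repository root is on PYTHONPATH so the script can be imported as a module:

# Sketch: verify the float32 <-> RGBA uint8 packing is bit-exact.
import numpy as np
from scripts.extract_depth import depth_to_rgba, rgba_to_depth

depth = np.random.rand(32, 48).astype(np.float32)   # fake depth map
packed = depth_to_rgba(depth)                        # (32, 48, 4) uint8 view of the float bits
restored = rgba_to_depth(packed)                     # back to (32, 48) float32
assert np.array_equal(depth, restored)               # lossless round trip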
scripts/extract_segmentation.py
ADDED
@@ -0,0 +1,130 @@
import sys, os
import numpy as np
import scipy
import torch
import torch.nn as nn
from scipy import ndimage
from tqdm import tqdm, trange
from PIL import Image
import torch.hub
import torchvision
import torch.nn.functional as F

# download deeplabv2_resnet101_msc-cocostuff164k-100000.pth from
# https://github.com/kazuto1011/deeplab-pytorch/releases/download/v1.0/deeplabv2_resnet101_msc-cocostuff164k-100000.pth
# and put the path here
CKPT_PATH = "TODO"

rescale = lambda x: (x + 1.) / 2.

def rescale_bgr(x):
    x = (x+1)*127.5
    x = torch.flip(x, dims=[0])
    return x


class COCOStuffSegmenter(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.n_labels = 182
        model = torch.hub.load("kazuto1011/deeplab-pytorch", "deeplabv2_resnet101", n_classes=self.n_labels)
        ckpt_path = CKPT_PATH
        model.load_state_dict(torch.load(ckpt_path))
        self.model = model

        normalize = torchvision.transforms.Normalize(mean=self.mean, std=self.std)
        self.image_transform = torchvision.transforms.Compose([
            torchvision.transforms.Lambda(lambda image: torch.stack(
                [normalize(rescale_bgr(x)) for x in image]))
        ])

    def forward(self, x, upsample=None):
        x = self._pre_process(x)
        x = self.model(x)
        if upsample is not None:
            x = torch.nn.functional.upsample_bilinear(x, size=upsample)
        return x

    def _pre_process(self, x):
        x = self.image_transform(x)
        return x

    @property
    def mean(self):
        # bgr
        return [104.008, 116.669, 122.675]

    @property
    def std(self):
        return [1.0, 1.0, 1.0]

    @property
    def input_size(self):
        return [3, 224, 224]


def run_model(img, model):
    model = model.eval()
    with torch.no_grad():
        segmentation = model(img, upsample=(img.shape[2], img.shape[3]))
        segmentation = torch.argmax(segmentation, dim=1, keepdim=True)
    return segmentation.detach().cpu()


def get_input(batch, k):
    x = batch[k]
    if len(x.shape) == 3:
        x = x[..., None]
    x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format)
    return x.float()


def save_segmentation(segmentation, path):
    # --> class label to uint8, save as png
    os.makedirs(os.path.dirname(path), exist_ok=True)
    assert len(segmentation.shape)==4
    assert segmentation.shape[0]==1
    for seg in segmentation:
        seg = seg.permute(1,2,0).numpy().squeeze().astype(np.uint8)
        seg = Image.fromarray(seg)
        seg.save(path)


def iterate_dataset(dataloader, destpath, model):
    os.makedirs(destpath, exist_ok=True)
    num_processed = 0
    for i, batch in tqdm(enumerate(dataloader), desc="Data"):
        try:
            img = get_input(batch, "image")
            img = img.cuda()
            seg = run_model(img, model)

            path = batch["relative_file_path_"][0]
            path = os.path.splitext(path)[0]

            path = os.path.join(destpath, path + ".png")
            save_segmentation(seg, path)
            num_processed += 1
        except Exception as e:
            print(e)
            print("but anyhow..")

    print("Processed {} files. Bye.".format(num_processed))


from taming.data.sflckr import Examples
from torch.utils.data import DataLoader

if __name__ == "__main__":
    dest = sys.argv[1]
    batchsize = 1
    print("Running with batch-size {}, saving to {}...".format(batchsize, dest))

    model = COCOStuffSegmenter({}).cuda()
    print("Instantiated model.")

    dataset = Examples()
    dloader = DataLoader(dataset, batch_size=batchsize)
    iterate_dataset(dataloader=dloader, destpath=dest, model=model)
    print("done.")
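
The pre-processing in COCOStuffSegmenter takes images in the [-1, 1] range used elsewhere in the repo and converts them to mean-subtracted BGR in [0, 255], which is what the deeplab-pytorch COCO-Stuff checkpoint expects. A standalone sketch of that transform on a dummy tensor (illustration only, not part of the script):

# Sketch: the rescale_bgr + Normalize pipeline applied to one fake image.
import torch
import torchvision

bgr_mean = [104.008, 116.669, 122.675]   # same BGR means as COCOStuffSegmenter.mean
normalize = torchvision.transforms.Normalize(mean=bgr_mean, std=[1.0, 1.0, 1.0])

x = torch.rand(3, 8, 8) * 2 - 1                 # dummy RGB image in [-1, 1], CHW
bgr = torch.flip((x + 1) * 127.5, dims=[0])     # rescale_bgr: to [0, 255], RGB -> BGR
model_input = normalize(bgr)                    # zero-centered BGR, ready for the segmenter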
scripts/extract_submodel.py
ADDED
@@ -0,0 +1,17 @@
import torch
import sys

if __name__ == "__main__":
    inpath = sys.argv[1]
    outpath = sys.argv[2]
    submodel = "cond_stage_model"
    if len(sys.argv) > 3:
        submodel = sys.argv[3]

    print("Extracting {} from {} to {}.".format(submodel, inpath, outpath))

    sd = torch.load(inpath, map_location="cpu")
    new_sd = {"state_dict": dict((k.split(".", 1)[-1],v)
                                 for k,v in sd["state_dict"].items()
                                 if k.startswith("cond_stage_model"))}
    torch.save(new_sd, outpath)
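
The dictionary comprehension in extract_submodel.py keeps only the entries of the chosen submodule and strips the first prefix component from each key; note that as written it filters on the literal prefix "cond_stage_model" even when a different submodel name is passed as the third argument. A toy illustration with hypothetical keys:

# Sketch: the key filtering on a made-up state dict, not part of the script.
sd = {"state_dict": {
    "cond_stage_model.encoder.weight": 1,
    "first_stage_model.decoder.weight": 2,
}}
new_sd = {"state_dict": dict((k.split(".", 1)[-1], v)
                             for k, v in sd["state_dict"].items()
                             if k.startswith("cond_stage_model"))}
print(new_sd)  # {'state_dict': {'encoder.weight': 1}}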
scripts/make_samples.py
ADDED
@@ -0,0 +1,292 @@
import argparse, os, sys, glob, math, time
import torch
import numpy as np
from omegaconf import OmegaConf
from PIL import Image
from main import instantiate_from_config, DataModuleFromConfig
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
from tqdm import trange


def save_image(x, path):
    c,h,w = x.shape
    assert c==3
    x = ((x.detach().cpu().numpy().transpose(1,2,0)+1.0)*127.5).clip(0,255).astype(np.uint8)
    Image.fromarray(x).save(path)


@torch.no_grad()
def run_conditional(model, dsets, outdir, top_k, temperature, batch_size=1):
    if len(dsets.datasets) > 1:
        split = sorted(dsets.datasets.keys())[0]
        dset = dsets.datasets[split]
    else:
        dset = next(iter(dsets.datasets.values()))
    print("Dataset: ", dset.__class__.__name__)
    for start_idx in trange(0,len(dset)-batch_size+1,batch_size):
        indices = list(range(start_idx, start_idx+batch_size))
        example = default_collate([dset[i] for i in indices])

        x = model.get_input("image", example).to(model.device)
        for i in range(x.shape[0]):
            save_image(x[i], os.path.join(outdir, "originals",
                                          "{:06}.png".format(indices[i])))

        cond_key = model.cond_stage_key
        c = model.get_input(cond_key, example).to(model.device)

        scale_factor = 1.0
        quant_z, z_indices = model.encode_to_z(x)
        quant_c, c_indices = model.encode_to_c(c)

        cshape = quant_z.shape

        xrec = model.first_stage_model.decode(quant_z)
        for i in range(xrec.shape[0]):
            save_image(xrec[i], os.path.join(outdir, "reconstructions",
                                             "{:06}.png".format(indices[i])))

        if cond_key == "segmentation":
            # get image from segmentation mask
            num_classes = c.shape[1]
            c = torch.argmax(c, dim=1, keepdim=True)
            c = torch.nn.functional.one_hot(c, num_classes=num_classes)
            c = c.squeeze(1).permute(0, 3, 1, 2).float()
            c = model.cond_stage_model.to_rgb(c)

        idx = z_indices

        half_sample = False
        if half_sample:
            start = idx.shape[1]//2
        else:
            start = 0

        idx[:,start:] = 0
        idx = idx.reshape(cshape[0],cshape[2],cshape[3])
        start_i = start//cshape[3]
        start_j = start %cshape[3]

        cidx = c_indices
        cidx = cidx.reshape(quant_c.shape[0],quant_c.shape[2],quant_c.shape[3])

        sample = True

        for i in range(start_i,cshape[2]-0):
            if i <= 8:
                local_i = i
            elif cshape[2]-i < 8:
                local_i = 16-(cshape[2]-i)
            else:
                local_i = 8
            for j in range(start_j,cshape[3]-0):
                if j <= 8:
                    local_j = j
                elif cshape[3]-j < 8:
                    local_j = 16-(cshape[3]-j)
                else:
                    local_j = 8

                i_start = i-local_i
                i_end = i_start+16
                j_start = j-local_j
                j_end = j_start+16
                patch = idx[:,i_start:i_end,j_start:j_end]
                patch = patch.reshape(patch.shape[0],-1)
                cpatch = cidx[:, i_start:i_end, j_start:j_end]
                cpatch = cpatch.reshape(cpatch.shape[0], -1)
                patch = torch.cat((cpatch, patch), dim=1)
                logits,_ = model.transformer(patch[:,:-1])
                logits = logits[:, -256:, :]
                logits = logits.reshape(cshape[0],16,16,-1)
                logits = logits[:,local_i,local_j,:]

                logits = logits/temperature

                if top_k is not None:
                    logits = model.top_k_logits(logits, top_k)
                # apply softmax to convert to probabilities
                probs = torch.nn.functional.softmax(logits, dim=-1)
                # sample from the distribution or take the most likely
                if sample:
                    ix = torch.multinomial(probs, num_samples=1)
                else:
                    _, ix = torch.topk(probs, k=1, dim=-1)
                idx[:,i,j] = ix

        xsample = model.decode_to_img(idx[:,:cshape[2],:cshape[3]], cshape)
        for i in range(xsample.shape[0]):
            save_image(xsample[i], os.path.join(outdir, "samples",
                                                "{:06}.png".format(indices[i])))


def get_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-r",
        "--resume",
        type=str,
        nargs="?",
        help="load from logdir or checkpoint in logdir",
    )
    parser.add_argument(
        "-b",
        "--base",
        nargs="*",
        metavar="base_config.yaml",
        help="paths to base configs. Loaded from left-to-right. "
        "Parameters can be overwritten or added with command-line options of the form `--key value`.",
        default=list(),
    )
    parser.add_argument(
        "-c",
        "--config",
        nargs="?",
        metavar="single_config.yaml",
        help="path to single config. If specified, base configs will be ignored "
        "(except for the last one if left unspecified).",
        const=True,
        default="",
    )
    parser.add_argument(
        "--ignore_base_data",
        action="store_true",
        help="Ignore data specification from base configs. Useful if you want "
        "to specify a custom datasets on the command line.",
    )
    parser.add_argument(
        "--outdir",
        required=True,
        type=str,
        help="Where to write outputs to.",
    )
    parser.add_argument(
        "--top_k",
        type=int,
        default=100,
        help="Sample from among top-k predictions.",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=1.0,
        help="Sampling temperature.",
    )
    return parser


def load_model_from_config(config, sd, gpu=True, eval_mode=True):
    if "ckpt_path" in config.params:
        print("Deleting the restore-ckpt path from the config...")
        config.params.ckpt_path = None
    if "downsample_cond_size" in config.params:
        print("Deleting downsample-cond-size from the config and setting factor=0.5 instead...")
        config.params.downsample_cond_size = -1
        config.params["downsample_cond_factor"] = 0.5
    try:
        if "ckpt_path" in config.params.first_stage_config.params:
            config.params.first_stage_config.params.ckpt_path = None
            print("Deleting the first-stage restore-ckpt path from the config...")
        if "ckpt_path" in config.params.cond_stage_config.params:
            config.params.cond_stage_config.params.ckpt_path = None
            print("Deleting the cond-stage restore-ckpt path from the config...")
    except:
        pass

    model = instantiate_from_config(config)
    if sd is not None:
        missing, unexpected = model.load_state_dict(sd, strict=False)
        print(f"Missing Keys in State Dict: {missing}")
        print(f"Unexpected Keys in State Dict: {unexpected}")
    if gpu:
        model.cuda()
    if eval_mode:
        model.eval()
    return {"model": model}


def get_data(config):
    # get data
    data = instantiate_from_config(config.data)
    data.prepare_data()
    data.setup()
    return data


def load_model_and_dset(config, ckpt, gpu, eval_mode):
    # get data
    dsets = get_data(config)   # calls data.config ...

    # now load the specified checkpoint
    if ckpt:
        pl_sd = torch.load(ckpt, map_location="cpu")
        global_step = pl_sd["global_step"]
    else:
        pl_sd = {"state_dict": None}
        global_step = None
    model = load_model_from_config(config.model,
                                   pl_sd["state_dict"],
                                   gpu=gpu,
                                   eval_mode=eval_mode)["model"]
    return dsets, model, global_step


if __name__ == "__main__":
    sys.path.append(os.getcwd())

    parser = get_parser()

    opt, unknown = parser.parse_known_args()

    ckpt = None
    if opt.resume:
        if not os.path.exists(opt.resume):
            raise ValueError("Cannot find {}".format(opt.resume))
        if os.path.isfile(opt.resume):
            paths = opt.resume.split("/")
            try:
                idx = len(paths)-paths[::-1].index("logs")+1
            except ValueError:
                idx = -2  # take a guess: path/to/logdir/checkpoints/model.ckpt
            logdir = "/".join(paths[:idx])
            ckpt = opt.resume
        else:
            assert os.path.isdir(opt.resume), opt.resume
            logdir = opt.resume.rstrip("/")
            ckpt = os.path.join(logdir, "checkpoints", "last.ckpt")
        print(f"logdir:{logdir}")
        base_configs = sorted(glob.glob(os.path.join(logdir, "configs/*-project.yaml")))
        opt.base = base_configs+opt.base

    if opt.config:
        if type(opt.config) == str:
            opt.base = [opt.config]
        else:
            opt.base = [opt.base[-1]]

    configs = [OmegaConf.load(cfg) for cfg in opt.base]
    cli = OmegaConf.from_dotlist(unknown)
    if opt.ignore_base_data:
        for config in configs:
            if hasattr(config, "data"): del config["data"]
    config = OmegaConf.merge(*configs, cli)

    print(ckpt)
    gpu = True
    eval_mode = True
    show_config = False
    if show_config:
        print(OmegaConf.to_container(config))

    dsets, model, global_step = load_model_and_dset(config, ckpt, gpu, eval_mode)
    print(f"Global step: {global_step}")

    outdir = os.path.join(opt.outdir, "{:06}_{}_{}".format(global_step,
                                                           opt.top_k,
                                                           opt.temperature))
    os.makedirs(outdir, exist_ok=True)
    print("Writing samples to ", outdir)
    for k in ["originals", "reconstructions", "samples"]:
        os.makedirs(os.path.join(outdir, k), exist_ok=True)
    run_conditional(model, dsets, outdir, opt.top_k, opt.temperature)
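
The inner loop of run_conditional performs sliding-window autoregressive sampling: at each code location it takes the transformer logits, divides by the temperature, keeps the top-k entries, and samples an index from the resulting softmax. A generic, standalone sketch of that per-position step (with a local top_k_logits helper standing in for model.top_k_logits, which lives on the model class):

# Sketch: one top-k sampling step over a single codebook position.
import torch

def top_k_logits(logits, k):
    # keep the k largest logits per row, set the rest to -inf
    v, _ = torch.topk(logits, k)
    out = logits.clone()
    out[out < v[..., [-1]]] = -float("inf")
    return out

logits = torch.randn(1, 1024)                       # one position over a 1024-entry codebook
logits = logits / 1.0                               # temperature
probs = torch.softmax(top_k_logits(logits, 100), dim=-1)
ix = torch.multinomial(probs, num_samples=1)        # sampled codebook index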
scripts/make_scene_samples.py
ADDED
@@ -0,0 +1,198 @@
import glob
import os
import sys
from itertools import product
from pathlib import Path
from typing import Literal, List, Optional, Tuple

import numpy as np
import torch
from omegaconf import OmegaConf
from pytorch_lightning import seed_everything
from torch import Tensor
from torchvision.utils import save_image
from tqdm import tqdm

from scripts.make_samples import get_parser, load_model_and_dset
from taming.data.conditional_builder.objects_center_points import ObjectsCenterPointsConditionalBuilder
from taming.data.helper_types import BoundingBox, Annotation
from taming.data.annotated_objects_dataset import AnnotatedObjectsDataset
from taming.models.cond_transformer import Net2NetTransformer

seed_everything(42424242)
device: Literal['cuda', 'cpu'] = 'cuda'
first_stage_factor = 16
trained_on_res = 256


def _helper(coord: int, coord_max: int, coord_window: int) -> (int, int):
    assert 0 <= coord < coord_max
    coord_desired_center = (coord_window - 1) // 2
    return np.clip(coord - coord_desired_center, 0, coord_max - coord_window)


def get_crop_coordinates(x: int, y: int) -> BoundingBox:
    WIDTH, HEIGHT = desired_z_shape[1], desired_z_shape[0]
    x0 = _helper(x, WIDTH, first_stage_factor) / WIDTH
    y0 = _helper(y, HEIGHT, first_stage_factor) / HEIGHT
    w = first_stage_factor / WIDTH
    h = first_stage_factor / HEIGHT
    return x0, y0, w, h


def get_z_indices_crop_out(z_indices: Tensor, predict_x: int, predict_y: int) -> Tensor:
    WIDTH, HEIGHT = desired_z_shape[1], desired_z_shape[0]
    x0 = _helper(predict_x, WIDTH, first_stage_factor)
    y0 = _helper(predict_y, HEIGHT, first_stage_factor)
    no_images = z_indices.shape[0]
    cut_out_1 = z_indices[:, y0:predict_y, x0:x0+first_stage_factor].reshape((no_images, -1))
    cut_out_2 = z_indices[:, predict_y, x0:predict_x]
    return torch.cat((cut_out_1, cut_out_2), dim=1)


@torch.no_grad()
def sample(model: Net2NetTransformer, annotations: List[Annotation], dataset: AnnotatedObjectsDataset,
           conditional_builder: ObjectsCenterPointsConditionalBuilder, no_samples: int,
           temperature: float, top_k: int) -> Tensor:
    x_max, y_max = desired_z_shape[1], desired_z_shape[0]

    annotations = [a._replace(category_no=dataset.get_category_number(a.category_id)) for a in annotations]

    recompute_conditional = any((desired_resolution[0] > trained_on_res, desired_resolution[1] > trained_on_res))
    if not recompute_conditional:
        crop_coordinates = get_crop_coordinates(0, 0)
        conditional_indices = conditional_builder.build(annotations, crop_coordinates)
        c_indices = conditional_indices.to(device).repeat(no_samples, 1)
        z_indices = torch.zeros((no_samples, 0), device=device).long()
        output_indices = model.sample(z_indices, c_indices, steps=x_max*y_max, temperature=temperature,
                                      sample=True, top_k=top_k)
    else:
        output_indices = torch.zeros((no_samples, y_max, x_max), device=device).long()
        for predict_y, predict_x in tqdm(product(range(y_max), range(x_max)), desc='sampling_image', total=x_max*y_max):
            crop_coordinates = get_crop_coordinates(predict_x, predict_y)
            z_indices = get_z_indices_crop_out(output_indices, predict_x, predict_y)
            conditional_indices = conditional_builder.build(annotations, crop_coordinates)
            c_indices = conditional_indices.to(device).repeat(no_samples, 1)
            new_index = model.sample(z_indices, c_indices, steps=1, temperature=temperature, sample=True, top_k=top_k)
            output_indices[:, predict_y, predict_x] = new_index[:, -1]
    z_shape = (
        no_samples,
        model.first_stage_model.quantize.e_dim,  # codebook embed_dim
        desired_z_shape[0],  # z_height
        desired_z_shape[1]  # z_width
    )
    x_sample = model.decode_to_img(output_indices, z_shape) * 0.5 + 0.5
    x_sample = x_sample.to('cpu')

    plotter = conditional_builder.plot
    figure_size = (x_sample.shape[2], x_sample.shape[3])
    scene_graph = conditional_builder.build(annotations, (0., 0., 1., 1.))
    plot = plotter(scene_graph, dataset.get_textual_label_for_category_no, figure_size)
    return torch.cat((x_sample, plot.unsqueeze(0)))


def get_resolution(resolution_str: str) -> (Tuple[int, int], Tuple[int, int]):
    if not resolution_str.count(',') == 1:
        raise ValueError("Give resolution as in 'height,width'")
    res_h, res_w = resolution_str.split(',')
    res_h = max(int(res_h), trained_on_res)
    res_w = max(int(res_w), trained_on_res)
    z_h = int(round(res_h/first_stage_factor))
    z_w = int(round(res_w/first_stage_factor))
    return (z_h, z_w), (z_h*first_stage_factor, z_w*first_stage_factor)


def add_arg_to_parser(parser):
    parser.add_argument(
        "-R",
        "--resolution",
        type=str,
        default='256,256',
        help=f"give resolution in multiples of {first_stage_factor}, default is '256,256'",
    )
    parser.add_argument(
        "-C",
        "--conditional",
        type=str,
        default='objects_bbox',
        help=f"objects_bbox or objects_center_points",
    )
    parser.add_argument(
        "-N",
        "--n_samples_per_layout",
        type=int,
        default=4,
        help=f"how many samples to generate per layout",
    )
    return parser


if __name__ == "__main__":
    sys.path.append(os.getcwd())

    parser = get_parser()
    parser = add_arg_to_parser(parser)

    opt, unknown = parser.parse_known_args()

    ckpt = None
    if opt.resume:
        if not os.path.exists(opt.resume):
            raise ValueError("Cannot find {}".format(opt.resume))
        if os.path.isfile(opt.resume):
            paths = opt.resume.split("/")
            try:
                idx = len(paths)-paths[::-1].index("logs")+1
            except ValueError:
                idx = -2  # take a guess: path/to/logdir/checkpoints/model.ckpt
            logdir = "/".join(paths[:idx])
            ckpt = opt.resume
        else:
            assert os.path.isdir(opt.resume), opt.resume
            logdir = opt.resume.rstrip("/")
            ckpt = os.path.join(logdir, "checkpoints", "last.ckpt")
        print(f"logdir:{logdir}")
        base_configs = sorted(glob.glob(os.path.join(logdir, "configs/*-project.yaml")))
        opt.base = base_configs+opt.base

    if opt.config:
        if type(opt.config) == str:
            opt.base = [opt.config]
        else:
            opt.base = [opt.base[-1]]

    configs = [OmegaConf.load(cfg) for cfg in opt.base]
    cli = OmegaConf.from_dotlist(unknown)
    if opt.ignore_base_data:
        for config in configs:
            if hasattr(config, "data"):
                del config["data"]
    config = OmegaConf.merge(*configs, cli)
    desired_z_shape, desired_resolution = get_resolution(opt.resolution)
    conditional = opt.conditional

    print(ckpt)
    gpu = True
    eval_mode = True
    show_config = False
    if show_config:
        print(OmegaConf.to_container(config))

    dsets, model, global_step = load_model_and_dset(config, ckpt, gpu, eval_mode)
    print(f"Global step: {global_step}")

    data_loader = dsets.val_dataloader()
    print(dsets.datasets["validation"].conditional_builders)
    conditional_builder = dsets.datasets["validation"].conditional_builders[conditional]

    outdir = Path(opt.outdir).joinpath(f"{global_step:06}_{opt.top_k}_{opt.temperature}")
    outdir.mkdir(exist_ok=True, parents=True)
    print("Writing samples to ", outdir)

    p_bar_1 = tqdm(enumerate(iter(data_loader)), desc='batch', total=len(data_loader))
    for batch_no, batch in p_bar_1:
        save_img: Optional[Tensor] = None
        for i, annotations in tqdm(enumerate(batch['annotations']), desc='within_batch', total=data_loader.batch_size):
|
196 |
+
imgs = sample(model, annotations, dsets.datasets["validation"], conditional_builder,
|
197 |
+
opt.n_samples_per_layout, opt.temperature, opt.top_k)
|
198 |
+
save_image(imgs, outdir.joinpath(f'{batch_no:04}_{i:02}.png'), n_row=opt.n_samples_per_layout+1)
|
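For orientation, a small self-contained sketch (not part of the diff) of what the get_resolution helper above computes. The constants first_stage_factor = 16 and trained_on_res = 256 are assumptions standing in for module-level values defined earlier in the script.

# Standalone illustration of get_resolution; both constants are assumed, not taken from the diff.
first_stage_factor = 16   # assumed downsampling factor of the first-stage VQ model
trained_on_res = 256      # assumed resolution the transformer was trained on

def get_resolution_demo(resolution_str: str):
    res_h, res_w = (max(int(v), trained_on_res) for v in resolution_str.split(','))
    z_h, z_w = int(round(res_h / first_stage_factor)), int(round(res_w / first_stage_factor))
    return (z_h, z_w), (z_h * first_stage_factor, z_w * first_stage_factor)

print(get_resolution_demo('512,384'))  # ((32, 24), (512, 384)): latent grid and effective pixel size
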
scripts/sample_conditional.py
ADDED
@@ -0,0 +1,355 @@
1 |
+
import argparse, os, sys, glob, math, time
|
2 |
+
import torch
|
3 |
+
import numpy as np
|
4 |
+
from omegaconf import OmegaConf
|
5 |
+
import streamlit as st
|
6 |
+
from streamlit import caching
|
7 |
+
from PIL import Image
|
8 |
+
from main import instantiate_from_config, DataModuleFromConfig
|
9 |
+
from torch.utils.data import DataLoader
|
10 |
+
from torch.utils.data.dataloader import default_collate
|
11 |
+
|
12 |
+
|
13 |
+
rescale = lambda x: (x + 1.) / 2.
|
14 |
+
|
15 |
+
|
16 |
+
def bchw_to_st(x):
|
17 |
+
return rescale(x.detach().cpu().numpy().transpose(0,2,3,1))
|
18 |
+
|
19 |
+
def save_img(xstart, fname):
|
20 |
+
I = (xstart.clip(0,1)[0]*255).astype(np.uint8)
|
21 |
+
Image.fromarray(I).save(fname)
|
22 |
+
|
23 |
+
|
24 |
+
|
25 |
+
def get_interactive_image(resize=False):
|
26 |
+
image = st.file_uploader("Input", type=["jpg", "JPEG", "png"])
|
27 |
+
if image is not None:
|
28 |
+
image = Image.open(image)
|
29 |
+
if not image.mode == "RGB":
|
30 |
+
image = image.convert("RGB")
|
31 |
+
image = np.array(image).astype(np.uint8)
|
32 |
+
print("upload image shape: {}".format(image.shape))
|
33 |
+
img = Image.fromarray(image)
|
34 |
+
if resize:
|
35 |
+
img = img.resize((256, 256))
|
36 |
+
image = np.array(img)
|
37 |
+
return image
|
38 |
+
|
39 |
+
|
40 |
+
def single_image_to_torch(x, permute=True):
|
41 |
+
assert x is not None, "Please provide an image through the upload function"
|
42 |
+
x = np.array(x)
|
43 |
+
x = torch.FloatTensor(x/255.*2. - 1.)[None,...]
|
44 |
+
if permute:
|
45 |
+
x = x.permute(0, 3, 1, 2)
|
46 |
+
return x
|
47 |
+
|
48 |
+
|
49 |
+
def pad_to_M(x, M):
|
50 |
+
hp = math.ceil(x.shape[2]/M)*M-x.shape[2]
|
51 |
+
wp = math.ceil(x.shape[3]/M)*M-x.shape[3]
|
52 |
+
x = torch.nn.functional.pad(x, (0,wp,0,hp,0,0,0,0))
|
53 |
+
return x
|
54 |
+
|
55 |
+
@torch.no_grad()
|
56 |
+
def run_conditional(model, dsets):
|
57 |
+
if len(dsets.datasets) > 1:
|
58 |
+
split = st.sidebar.radio("Split", sorted(dsets.datasets.keys()))
|
59 |
+
dset = dsets.datasets[split]
|
60 |
+
else:
|
61 |
+
dset = next(iter(dsets.datasets.values()))
|
62 |
+
batch_size = 1
|
63 |
+
start_index = st.sidebar.number_input("Example Index (Size: {})".format(len(dset)), value=0,
|
64 |
+
min_value=0,
|
65 |
+
max_value=len(dset)-batch_size)
|
66 |
+
indices = list(range(start_index, start_index+batch_size))
|
67 |
+
|
68 |
+
example = default_collate([dset[i] for i in indices])
|
69 |
+
|
70 |
+
x = model.get_input("image", example).to(model.device)
|
71 |
+
|
72 |
+
cond_key = model.cond_stage_key
|
73 |
+
c = model.get_input(cond_key, example).to(model.device)
|
74 |
+
|
75 |
+
scale_factor = st.sidebar.slider("Scale Factor", min_value=0.5, max_value=4.0, step=0.25, value=1.00)
|
76 |
+
if scale_factor != 1.0:
|
77 |
+
x = torch.nn.functional.interpolate(x, scale_factor=scale_factor, mode="bicubic")
|
78 |
+
c = torch.nn.functional.interpolate(c, scale_factor=scale_factor, mode="bicubic")
|
79 |
+
|
80 |
+
quant_z, z_indices = model.encode_to_z(x)
|
81 |
+
quant_c, c_indices = model.encode_to_c(c)
|
82 |
+
|
83 |
+
cshape = quant_z.shape
|
84 |
+
|
85 |
+
xrec = model.first_stage_model.decode(quant_z)
|
86 |
+
st.write("image: {}".format(x.shape))
|
87 |
+
st.image(bchw_to_st(x), clamp=True, output_format="PNG")
|
88 |
+
st.write("image reconstruction: {}".format(xrec.shape))
|
89 |
+
st.image(bchw_to_st(xrec), clamp=True, output_format="PNG")
|
90 |
+
|
91 |
+
if cond_key == "segmentation":
|
92 |
+
# get image from segmentation mask
|
93 |
+
num_classes = c.shape[1]
|
94 |
+
c = torch.argmax(c, dim=1, keepdim=True)
|
95 |
+
c = torch.nn.functional.one_hot(c, num_classes=num_classes)
|
96 |
+
c = c.squeeze(1).permute(0, 3, 1, 2).float()
|
97 |
+
c = model.cond_stage_model.to_rgb(c)
|
98 |
+
|
99 |
+
st.write(f"{cond_key}: {tuple(c.shape)}")
|
100 |
+
st.image(bchw_to_st(c), clamp=True, output_format="PNG")
|
101 |
+
|
102 |
+
idx = z_indices
|
103 |
+
|
104 |
+
half_sample = st.sidebar.checkbox("Image Completion", value=False)
|
105 |
+
if half_sample:
|
106 |
+
start = idx.shape[1]//2
|
107 |
+
else:
|
108 |
+
start = 0
|
109 |
+
|
110 |
+
idx[:,start:] = 0
|
111 |
+
idx = idx.reshape(cshape[0],cshape[2],cshape[3])
|
112 |
+
start_i = start//cshape[3]
|
113 |
+
start_j = start %cshape[3]
|
114 |
+
|
115 |
+
if not half_sample and quant_z.shape == quant_c.shape:
|
116 |
+
st.info("Setting idx to c_indices")
|
117 |
+
idx = c_indices.clone().reshape(cshape[0],cshape[2],cshape[3])
|
118 |
+
|
119 |
+
cidx = c_indices
|
120 |
+
cidx = cidx.reshape(quant_c.shape[0],quant_c.shape[2],quant_c.shape[3])
|
121 |
+
|
122 |
+
xstart = model.decode_to_img(idx[:,:cshape[2],:cshape[3]], cshape)
|
123 |
+
st.image(bchw_to_st(xstart), clamp=True, output_format="PNG")
|
124 |
+
|
125 |
+
temperature = st.number_input("Temperature", value=1.0)
|
126 |
+
top_k = st.number_input("Top k", value=100)
|
127 |
+
sample = st.checkbox("Sample", value=True)
|
128 |
+
update_every = st.number_input("Update every", value=75)
|
129 |
+
|
130 |
+
st.text(f"Sampling shape ({cshape[2]},{cshape[3]})")
|
131 |
+
|
132 |
+
animate = st.checkbox("animate")
|
133 |
+
if animate:
|
134 |
+
import imageio
|
135 |
+
outvid = "sampling.mp4"
|
136 |
+
writer = imageio.get_writer(outvid, fps=25)
|
137 |
+
elapsed_t = st.empty()
|
138 |
+
info = st.empty()
|
139 |
+
st.text("Sampled")
|
140 |
+
if st.button("Sample"):
|
141 |
+
output = st.empty()
|
142 |
+
start_t = time.time()
|
143 |
+
for i in range(start_i,cshape[2]-0):
|
144 |
+
if i <= 8:
|
145 |
+
local_i = i
|
146 |
+
elif cshape[2]-i < 8:
|
147 |
+
local_i = 16-(cshape[2]-i)
|
148 |
+
else:
|
149 |
+
local_i = 8
|
150 |
+
for j in range(start_j,cshape[3]-0):
|
151 |
+
if j <= 8:
|
152 |
+
local_j = j
|
153 |
+
elif cshape[3]-j < 8:
|
154 |
+
local_j = 16-(cshape[3]-j)
|
155 |
+
else:
|
156 |
+
local_j = 8
|
157 |
+
|
158 |
+
i_start = i-local_i
|
159 |
+
i_end = i_start+16
|
160 |
+
j_start = j-local_j
|
161 |
+
j_end = j_start+16
|
162 |
+
elapsed_t.text(f"Time: {time.time() - start_t} seconds")
|
163 |
+
info.text(f"Step: ({i},{j}) | Local: ({local_i},{local_j}) | Crop: ({i_start}:{i_end},{j_start}:{j_end})")
|
164 |
+
patch = idx[:,i_start:i_end,j_start:j_end]
|
165 |
+
patch = patch.reshape(patch.shape[0],-1)
|
166 |
+
cpatch = cidx[:, i_start:i_end, j_start:j_end]
|
167 |
+
cpatch = cpatch.reshape(cpatch.shape[0], -1)
|
168 |
+
patch = torch.cat((cpatch, patch), dim=1)
|
169 |
+
logits,_ = model.transformer(patch[:,:-1])
|
170 |
+
logits = logits[:, -256:, :]
|
171 |
+
logits = logits.reshape(cshape[0],16,16,-1)
|
172 |
+
logits = logits[:,local_i,local_j,:]
|
173 |
+
|
174 |
+
logits = logits/temperature
|
175 |
+
|
176 |
+
if top_k is not None:
|
177 |
+
logits = model.top_k_logits(logits, top_k)
|
178 |
+
# apply softmax to convert to probabilities
|
179 |
+
probs = torch.nn.functional.softmax(logits, dim=-1)
|
180 |
+
# sample from the distribution or take the most likely
|
181 |
+
if sample:
|
182 |
+
ix = torch.multinomial(probs, num_samples=1)
|
183 |
+
else:
|
184 |
+
_, ix = torch.topk(probs, k=1, dim=-1)
|
185 |
+
idx[:,i,j] = ix
|
186 |
+
|
187 |
+
if (i*cshape[3]+j)%update_every==0:
|
188 |
+
xstart = model.decode_to_img(idx[:, :cshape[2], :cshape[3]], cshape,)
|
189 |
+
|
190 |
+
xstart = bchw_to_st(xstart)
|
191 |
+
output.image(xstart, clamp=True, output_format="PNG")
|
192 |
+
|
193 |
+
if animate:
|
194 |
+
writer.append_data((xstart[0]*255).clip(0, 255).astype(np.uint8))
|
195 |
+
|
196 |
+
xstart = model.decode_to_img(idx[:,:cshape[2],:cshape[3]], cshape)
|
197 |
+
xstart = bchw_to_st(xstart)
|
198 |
+
output.image(xstart, clamp=True, output_format="PNG")
|
199 |
+
#save_img(xstart, "full_res_sample.png")
|
200 |
+
if animate:
|
201 |
+
writer.close()
|
202 |
+
st.video(outvid)
|
203 |
+
|
204 |
+
|
205 |
+
def get_parser():
|
206 |
+
parser = argparse.ArgumentParser()
|
207 |
+
parser.add_argument(
|
208 |
+
"-r",
|
209 |
+
"--resume",
|
210 |
+
type=str,
|
211 |
+
nargs="?",
|
212 |
+
help="load from logdir or checkpoint in logdir",
|
213 |
+
)
|
214 |
+
parser.add_argument(
|
215 |
+
"-b",
|
216 |
+
"--base",
|
217 |
+
nargs="*",
|
218 |
+
metavar="base_config.yaml",
|
219 |
+
help="paths to base configs. Loaded from left-to-right. "
|
220 |
+
"Parameters can be overwritten or added with command-line options of the form `--key value`.",
|
221 |
+
default=list(),
|
222 |
+
)
|
223 |
+
parser.add_argument(
|
224 |
+
"-c",
|
225 |
+
"--config",
|
226 |
+
nargs="?",
|
227 |
+
metavar="single_config.yaml",
|
228 |
+
help="path to single config. If specified, base configs will be ignored "
|
229 |
+
"(except for the last one if left unspecified).",
|
230 |
+
const=True,
|
231 |
+
default="",
|
232 |
+
)
|
233 |
+
parser.add_argument(
|
234 |
+
"--ignore_base_data",
|
235 |
+
action="store_true",
|
236 |
+
help="Ignore data specification from base configs. Useful if you want "
|
237 |
+
"to specify a custom datasets on the command line.",
|
238 |
+
)
|
239 |
+
return parser
|
240 |
+
|
241 |
+
|
242 |
+
def load_model_from_config(config, sd, gpu=True, eval_mode=True):
|
243 |
+
if "ckpt_path" in config.params:
|
244 |
+
st.warning("Deleting the restore-ckpt path from the config...")
|
245 |
+
config.params.ckpt_path = None
|
246 |
+
if "downsample_cond_size" in config.params:
|
247 |
+
st.warning("Deleting downsample-cond-size from the config and setting factor=0.5 instead...")
|
248 |
+
config.params.downsample_cond_size = -1
|
249 |
+
config.params["downsample_cond_factor"] = 0.5
|
250 |
+
try:
|
251 |
+
if "ckpt_path" in config.params.first_stage_config.params:
|
252 |
+
config.params.first_stage_config.params.ckpt_path = None
|
253 |
+
st.warning("Deleting the first-stage restore-ckpt path from the config...")
|
254 |
+
if "ckpt_path" in config.params.cond_stage_config.params:
|
255 |
+
config.params.cond_stage_config.params.ckpt_path = None
|
256 |
+
st.warning("Deleting the cond-stage restore-ckpt path from the config...")
|
257 |
+
except:
|
258 |
+
pass
|
259 |
+
|
260 |
+
model = instantiate_from_config(config)
|
261 |
+
if sd is not None:
|
262 |
+
missing, unexpected = model.load_state_dict(sd, strict=False)
|
263 |
+
st.info(f"Missing Keys in State Dict: {missing}")
|
264 |
+
st.info(f"Unexpected Keys in State Dict: {unexpected}")
|
265 |
+
if gpu:
|
266 |
+
model.cuda()
|
267 |
+
if eval_mode:
|
268 |
+
model.eval()
|
269 |
+
return {"model": model}
|
270 |
+
|
271 |
+
|
272 |
+
def get_data(config):
|
273 |
+
# get data
|
274 |
+
data = instantiate_from_config(config.data)
|
275 |
+
data.prepare_data()
|
276 |
+
data.setup()
|
277 |
+
return data
|
278 |
+
|
279 |
+
|
280 |
+
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
|
281 |
+
def load_model_and_dset(config, ckpt, gpu, eval_mode):
|
282 |
+
# get data
|
283 |
+
dsets = get_data(config) # calls data.config ...
|
284 |
+
|
285 |
+
# now load the specified checkpoint
|
286 |
+
if ckpt:
|
287 |
+
pl_sd = torch.load(ckpt, map_location="cpu")
|
288 |
+
global_step = pl_sd["global_step"]
|
289 |
+
else:
|
290 |
+
pl_sd = {"state_dict": None}
|
291 |
+
global_step = None
|
292 |
+
model = load_model_from_config(config.model,
|
293 |
+
pl_sd["state_dict"],
|
294 |
+
gpu=gpu,
|
295 |
+
eval_mode=eval_mode)["model"]
|
296 |
+
return dsets, model, global_step
|
297 |
+
|
298 |
+
|
299 |
+
if __name__ == "__main__":
|
300 |
+
sys.path.append(os.getcwd())
|
301 |
+
|
302 |
+
parser = get_parser()
|
303 |
+
|
304 |
+
opt, unknown = parser.parse_known_args()
|
305 |
+
|
306 |
+
ckpt = None
|
307 |
+
if opt.resume:
|
308 |
+
if not os.path.exists(opt.resume):
|
309 |
+
raise ValueError("Cannot find {}".format(opt.resume))
|
310 |
+
if os.path.isfile(opt.resume):
|
311 |
+
paths = opt.resume.split("/")
|
312 |
+
try:
|
313 |
+
idx = len(paths)-paths[::-1].index("logs")+1
|
314 |
+
except ValueError:
|
315 |
+
idx = -2 # take a guess: path/to/logdir/checkpoints/model.ckpt
|
316 |
+
logdir = "/".join(paths[:idx])
|
317 |
+
ckpt = opt.resume
|
318 |
+
else:
|
319 |
+
assert os.path.isdir(opt.resume), opt.resume
|
320 |
+
logdir = opt.resume.rstrip("/")
|
321 |
+
ckpt = os.path.join(logdir, "checkpoints", "last.ckpt")
|
322 |
+
print(f"logdir:{logdir}")
|
323 |
+
base_configs = sorted(glob.glob(os.path.join(logdir, "configs/*-project.yaml")))
|
324 |
+
opt.base = base_configs+opt.base
|
325 |
+
|
326 |
+
if opt.config:
|
327 |
+
if type(opt.config) == str:
|
328 |
+
opt.base = [opt.config]
|
329 |
+
else:
|
330 |
+
opt.base = [opt.base[-1]]
|
331 |
+
|
332 |
+
configs = [OmegaConf.load(cfg) for cfg in opt.base]
|
333 |
+
cli = OmegaConf.from_dotlist(unknown)
|
334 |
+
if opt.ignore_base_data:
|
335 |
+
for config in configs:
|
336 |
+
if hasattr(config, "data"): del config["data"]
|
337 |
+
config = OmegaConf.merge(*configs, cli)
|
338 |
+
|
339 |
+
st.sidebar.text(ckpt)
|
340 |
+
gs = st.sidebar.empty()
|
341 |
+
gs.text(f"Global step: ?")
|
342 |
+
st.sidebar.text("Options")
|
343 |
+
#gpu = st.sidebar.checkbox("GPU", value=True)
|
344 |
+
gpu = True
|
345 |
+
#eval_mode = st.sidebar.checkbox("Eval Mode", value=True)
|
346 |
+
eval_mode = True
|
347 |
+
#show_config = st.sidebar.checkbox("Show Config", value=False)
|
348 |
+
show_config = False
|
349 |
+
if show_config:
|
350 |
+
st.info("Checkpoint: {}".format(ckpt))
|
351 |
+
st.json(OmegaConf.to_container(config))
|
352 |
+
|
353 |
+
dsets, model, global_step = load_model_and_dset(config, ckpt, gpu, eval_mode)
|
354 |
+
gs.text(f"Global step: {global_step}")
|
355 |
+
run_conditional(model, dsets)
|
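As a sanity check on the tensor conventions used by the Streamlit helpers above, a minimal self-contained sketch (not part of the diff): images are mapped into [-1, 1] BCHW tensors for the model and back to [0, 1] HWC numpy arrays for display. The two helpers are trimmed copies of bchw_to_st and single_image_to_torch from the listing.

import numpy as np
import torch

rescale = lambda x: (x + 1.) / 2.

def bchw_to_st(x):
    # [-1, 1] BCHW tensor -> [0, 1] HWC numpy array, as expected by st.image
    return rescale(x.detach().cpu().numpy().transpose(0, 2, 3, 1))

def single_image_to_torch(x, permute=True):
    x = torch.FloatTensor(np.array(x) / 255. * 2. - 1.)[None, ...]
    return x.permute(0, 3, 1, 2) if permute else x

img = (np.random.rand(64, 64, 3) * 255).astype(np.uint8)  # stand-in for an uploaded image
t = single_image_to_torch(img)   # shape (1, 3, 64, 64), values in [-1, 1]
back = bchw_to_st(t)             # shape (1, 64, 64, 3), values in [0, 1]
assert back.shape == (1, 64, 64, 3)
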
scripts/sample_fast.py
ADDED
@@ -0,0 +1,260 @@
1 |
+
import argparse, os, sys, glob
|
2 |
+
import torch
|
3 |
+
import time
|
4 |
+
import numpy as np
|
5 |
+
from omegaconf import OmegaConf
|
6 |
+
from PIL import Image
|
7 |
+
from tqdm import tqdm, trange
|
8 |
+
from einops import repeat
|
9 |
+
|
10 |
+
from main import instantiate_from_config
|
11 |
+
from taming.modules.transformer.mingpt import sample_with_past
|
12 |
+
|
13 |
+
|
14 |
+
rescale = lambda x: (x + 1.) / 2.
|
15 |
+
|
16 |
+
|
17 |
+
def chw_to_pillow(x):
|
18 |
+
return Image.fromarray((255*rescale(x.detach().cpu().numpy().transpose(1,2,0))).clip(0,255).astype(np.uint8))
|
19 |
+
|
20 |
+
|
21 |
+
@torch.no_grad()
|
22 |
+
def sample_classconditional(model, batch_size, class_label, steps=256, temperature=None, top_k=None, callback=None,
|
23 |
+
dim_z=256, h=16, w=16, verbose_time=False, top_p=None):
|
24 |
+
log = dict()
|
25 |
+
assert type(class_label) == int, f'expecting type int but type is {type(class_label)}'
|
26 |
+
qzshape = [batch_size, dim_z, h, w]
|
27 |
+
assert not model.be_unconditional, 'Expecting a class-conditional Net2NetTransformer.'
|
28 |
+
c_indices = repeat(torch.tensor([class_label]), '1 -> b 1', b=batch_size).to(model.device) # class token
|
29 |
+
t1 = time.time()
|
30 |
+
index_sample = sample_with_past(c_indices, model.transformer, steps=steps,
|
31 |
+
sample_logits=True, top_k=top_k, callback=callback,
|
32 |
+
temperature=temperature, top_p=top_p)
|
33 |
+
if verbose_time:
|
34 |
+
sampling_time = time.time() - t1
|
35 |
+
print(f"Full sampling takes about {sampling_time:.2f} seconds.")
|
36 |
+
x_sample = model.decode_to_img(index_sample, qzshape)
|
37 |
+
log["samples"] = x_sample
|
38 |
+
log["class_label"] = c_indices
|
39 |
+
return log
|
40 |
+
|
41 |
+
|
42 |
+
@torch.no_grad()
|
43 |
+
def sample_unconditional(model, batch_size, steps=256, temperature=None, top_k=None, top_p=None, callback=None,
|
44 |
+
dim_z=256, h=16, w=16, verbose_time=False):
|
45 |
+
log = dict()
|
46 |
+
qzshape = [batch_size, dim_z, h, w]
|
47 |
+
assert model.be_unconditional, 'Expecting an unconditional model.'
|
48 |
+
c_indices = repeat(torch.tensor([model.sos_token]), '1 -> b 1', b=batch_size).to(model.device) # sos token
|
49 |
+
t1 = time.time()
|
50 |
+
index_sample = sample_with_past(c_indices, model.transformer, steps=steps,
|
51 |
+
sample_logits=True, top_k=top_k, callback=callback,
|
52 |
+
temperature=temperature, top_p=top_p)
|
53 |
+
if verbose_time:
|
54 |
+
sampling_time = time.time() - t1
|
55 |
+
print(f"Full sampling takes about {sampling_time:.2f} seconds.")
|
56 |
+
x_sample = model.decode_to_img(index_sample, qzshape)
|
57 |
+
log["samples"] = x_sample
|
58 |
+
return log
|
59 |
+
|
60 |
+
|
61 |
+
@torch.no_grad()
|
62 |
+
def run(logdir, model, batch_size, temperature, top_k, unconditional=True, num_samples=50000,
|
63 |
+
given_classes=None, top_p=None):
|
64 |
+
batches = [batch_size for _ in range(num_samples//batch_size)] + [num_samples % batch_size]
|
65 |
+
if not unconditional:
|
66 |
+
assert given_classes is not None
|
67 |
+
print("Running in pure class-conditional sampling mode. I will produce "
|
68 |
+
f"{num_samples} samples for each of the {len(given_classes)} classes, "
|
69 |
+
f"i.e. {num_samples*len(given_classes)} in total.")
|
70 |
+
for class_label in tqdm(given_classes, desc="Classes"):
|
71 |
+
for n, bs in tqdm(enumerate(batches), desc="Sampling Class"):
|
72 |
+
if bs == 0: break
|
73 |
+
logs = sample_classconditional(model, batch_size=bs, class_label=class_label,
|
74 |
+
temperature=temperature, top_k=top_k, top_p=top_p)
|
75 |
+
save_from_logs(logs, logdir, base_count=n * batch_size, cond_key=logs["class_label"])
|
76 |
+
else:
|
77 |
+
print(f"Running in unconditional sampling mode, producing {num_samples} samples.")
|
78 |
+
for n, bs in tqdm(enumerate(batches), desc="Sampling"):
|
79 |
+
if bs == 0: break
|
80 |
+
logs = sample_unconditional(model, batch_size=bs, temperature=temperature, top_k=top_k, top_p=top_p)
|
81 |
+
save_from_logs(logs, logdir, base_count=n * batch_size)
|
82 |
+
|
83 |
+
|
84 |
+
def save_from_logs(logs, logdir, base_count, key="samples", cond_key=None):
|
85 |
+
xx = logs[key]
|
86 |
+
for i, x in enumerate(xx):
|
87 |
+
x = chw_to_pillow(x)
|
88 |
+
count = base_count + i
|
89 |
+
if cond_key is None:
|
90 |
+
x.save(os.path.join(logdir, f"{count:06}.png"))
|
91 |
+
else:
|
92 |
+
condlabel = cond_key[i]
|
93 |
+
if type(condlabel) == torch.Tensor: condlabel = condlabel.item()
|
94 |
+
os.makedirs(os.path.join(logdir, str(condlabel)), exist_ok=True)
|
95 |
+
x.save(os.path.join(logdir, str(condlabel), f"{count:06}.png"))
|
96 |
+
|
97 |
+
|
98 |
+
def get_parser():
|
99 |
+
def str2bool(v):
|
100 |
+
if isinstance(v, bool):
|
101 |
+
return v
|
102 |
+
if v.lower() in ("yes", "true", "t", "y", "1"):
|
103 |
+
return True
|
104 |
+
elif v.lower() in ("no", "false", "f", "n", "0"):
|
105 |
+
return False
|
106 |
+
else:
|
107 |
+
raise argparse.ArgumentTypeError("Boolean value expected.")
|
108 |
+
|
109 |
+
parser = argparse.ArgumentParser()
|
110 |
+
parser.add_argument(
|
111 |
+
"-r",
|
112 |
+
"--resume",
|
113 |
+
type=str,
|
114 |
+
nargs="?",
|
115 |
+
help="load from logdir or checkpoint in logdir",
|
116 |
+
)
|
117 |
+
parser.add_argument(
|
118 |
+
"-o",
|
119 |
+
"--outdir",
|
120 |
+
type=str,
|
121 |
+
nargs="?",
|
122 |
+
help="path where the samples will be logged to.",
|
123 |
+
default=""
|
124 |
+
)
|
125 |
+
parser.add_argument(
|
126 |
+
"-b",
|
127 |
+
"--base",
|
128 |
+
nargs="*",
|
129 |
+
metavar="base_config.yaml",
|
130 |
+
help="paths to base configs. Loaded from left-to-right. "
|
131 |
+
"Parameters can be overwritten or added with command-line options of the form `--key value`.",
|
132 |
+
default=list(),
|
133 |
+
)
|
134 |
+
parser.add_argument(
|
135 |
+
"-n",
|
136 |
+
"--num_samples",
|
137 |
+
type=int,
|
138 |
+
nargs="?",
|
139 |
+
help="num_samples to draw",
|
140 |
+
default=50000
|
141 |
+
)
|
142 |
+
parser.add_argument(
|
143 |
+
"--batch_size",
|
144 |
+
type=int,
|
145 |
+
nargs="?",
|
146 |
+
help="the batch size",
|
147 |
+
default=25
|
148 |
+
)
|
149 |
+
parser.add_argument(
|
150 |
+
"-k",
|
151 |
+
"--top_k",
|
152 |
+
type=int,
|
153 |
+
nargs="?",
|
154 |
+
help="top-k value to sample with",
|
155 |
+
default=250,
|
156 |
+
)
|
157 |
+
parser.add_argument(
|
158 |
+
"-t",
|
159 |
+
"--temperature",
|
160 |
+
type=float,
|
161 |
+
nargs="?",
|
162 |
+
help="temperature value to sample with",
|
163 |
+
default=1.0
|
164 |
+
)
|
165 |
+
parser.add_argument(
|
166 |
+
"-p",
|
167 |
+
"--top_p",
|
168 |
+
type=float,
|
169 |
+
nargs="?",
|
170 |
+
help="top-p value to sample with",
|
171 |
+
default=1.0
|
172 |
+
)
|
173 |
+
parser.add_argument(
|
174 |
+
"--classes",
|
175 |
+
type=str,
|
176 |
+
nargs="?",
|
177 |
+
help="specify comma-separated classes to sample from. Uses 1000 classes per default.",
|
178 |
+
default="imagenet"
|
179 |
+
)
|
180 |
+
return parser
|
181 |
+
|
182 |
+
|
183 |
+
def load_model_from_config(config, sd, gpu=True, eval_mode=True):
|
184 |
+
model = instantiate_from_config(config)
|
185 |
+
if sd is not None:
|
186 |
+
model.load_state_dict(sd)
|
187 |
+
if gpu:
|
188 |
+
model.cuda()
|
189 |
+
if eval_mode:
|
190 |
+
model.eval()
|
191 |
+
return {"model": model}
|
192 |
+
|
193 |
+
|
194 |
+
def load_model(config, ckpt, gpu, eval_mode):
|
195 |
+
# load the specified checkpoint
|
196 |
+
if ckpt:
|
197 |
+
pl_sd = torch.load(ckpt, map_location="cpu")
|
198 |
+
global_step = pl_sd["global_step"]
|
199 |
+
print(f"loaded model from global step {global_step}.")
|
200 |
+
else:
|
201 |
+
pl_sd = {"state_dict": None}
|
202 |
+
global_step = None
|
203 |
+
model = load_model_from_config(config.model, pl_sd["state_dict"], gpu=gpu, eval_mode=eval_mode)["model"]
|
204 |
+
return model, global_step
|
205 |
+
|
206 |
+
|
207 |
+
if __name__ == "__main__":
|
208 |
+
sys.path.append(os.getcwd())
|
209 |
+
parser = get_parser()
|
210 |
+
|
211 |
+
opt, unknown = parser.parse_known_args()
|
212 |
+
assert opt.resume
|
213 |
+
|
214 |
+
ckpt = None
|
215 |
+
|
216 |
+
if not os.path.exists(opt.resume):
|
217 |
+
raise ValueError("Cannot find {}".format(opt.resume))
|
218 |
+
if os.path.isfile(opt.resume):
|
219 |
+
paths = opt.resume.split("/")
|
220 |
+
try:
|
221 |
+
idx = len(paths)-paths[::-1].index("logs")+1
|
222 |
+
except ValueError:
|
223 |
+
idx = -2 # take a guess: path/to/logdir/checkpoints/model.ckpt
|
224 |
+
logdir = "/".join(paths[:idx])
|
225 |
+
ckpt = opt.resume
|
226 |
+
else:
|
227 |
+
assert os.path.isdir(opt.resume), opt.resume
|
228 |
+
logdir = opt.resume.rstrip("/")
|
229 |
+
ckpt = os.path.join(logdir, "checkpoints", "last.ckpt")
|
230 |
+
|
231 |
+
base_configs = sorted(glob.glob(os.path.join(logdir, "configs/*-project.yaml")))
|
232 |
+
opt.base = base_configs+opt.base
|
233 |
+
|
234 |
+
configs = [OmegaConf.load(cfg) for cfg in opt.base]
|
235 |
+
cli = OmegaConf.from_dotlist(unknown)
|
236 |
+
config = OmegaConf.merge(*configs, cli)
|
237 |
+
|
238 |
+
model, global_step = load_model(config, ckpt, gpu=True, eval_mode=True)
|
239 |
+
|
240 |
+
if opt.outdir:
|
241 |
+
print(f"Switching logdir from '{logdir}' to '{opt.outdir}'")
|
242 |
+
logdir = opt.outdir
|
243 |
+
|
244 |
+
if opt.classes == "imagenet":
|
245 |
+
given_classes = [i for i in range(1000)]
|
246 |
+
else:
|
247 |
+
cls_str = opt.classes
|
248 |
+
assert not cls_str.endswith(","), 'class string should not end with a ","'
|
249 |
+
given_classes = [int(c) for c in cls_str.split(",")]
|
250 |
+
|
251 |
+
logdir = os.path.join(logdir, "samples", f"top_k_{opt.top_k}_temp_{opt.temperature:.2f}_top_p_{opt.top_p}",
|
252 |
+
f"{global_step}")
|
253 |
+
|
254 |
+
print(f"Logging to {logdir}")
|
255 |
+
os.makedirs(logdir, exist_ok=True)
|
256 |
+
|
257 |
+
run(logdir, model, opt.batch_size, opt.temperature, opt.top_k, unconditional=model.be_unconditional,
|
258 |
+
given_classes=given_classes, num_samples=opt.num_samples, top_p=opt.top_p)
|
259 |
+
|
260 |
+
print("done.")
|
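For clarity, a tiny self-contained sketch (not part of the diff) of how run() above splits num_samples into batches; the example values are arbitrary, not the script defaults.

num_samples, batch_size = 103, 25  # arbitrary example values
batches = [batch_size for _ in range(num_samples // batch_size)] + [num_samples % batch_size]
print(batches)  # [25, 25, 25, 25, 3]; a trailing 0 would simply be skipped by the sampling loop
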
scripts/taming-transformers.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
setup.py
ADDED
@@ -0,0 +1,13 @@
1 |
+
from setuptools import setup, find_packages
|
2 |
+
|
3 |
+
setup(
|
4 |
+
name='taming-transformers',
|
5 |
+
version='0.0.1',
|
6 |
+
description='Taming Transformers for High-Resolution Image Synthesis',
|
7 |
+
packages=find_packages(),
|
8 |
+
install_requires=[
|
9 |
+
'torch',
|
10 |
+
'numpy',
|
11 |
+
'tqdm',
|
12 |
+
],
|
13 |
+
)
|
taming/__pycache__/lr_scheduler.cpython-312.pyc
ADDED
Binary file (2.19 kB). View file
|
|
taming/__pycache__/util.cpython-312.pyc
ADDED
Binary file (6.33 kB). View file
|
|
taming/data/.ipynb_checkpoints/utils-checkpoint.py
ADDED
@@ -0,0 +1,171 @@
1 |
+
import collections
|
2 |
+
import os
|
3 |
+
import tarfile
|
4 |
+
import urllib
|
5 |
+
import zipfile
|
6 |
+
from pathlib import Path
|
7 |
+
|
8 |
+
import numpy as np
|
9 |
+
import torch
|
10 |
+
from taming.data.helper_types import Annotation
|
11 |
+
#from torch._six import string_classes
|
12 |
+
from torch.utils.data._utils.collate import np_str_obj_array_pattern, default_collate_err_msg_format
|
13 |
+
from tqdm import tqdm
|
14 |
+
|
15 |
+
string_classes = (str,bytes)
|
16 |
+
|
17 |
+
|
18 |
+
def unpack(path):
|
19 |
+
if path.endswith("tar.gz"):
|
20 |
+
with tarfile.open(path, "r:gz") as tar:
|
21 |
+
tar.extractall(path=os.path.split(path)[0])
|
22 |
+
elif path.endswith("tar"):
|
23 |
+
with tarfile.open(path, "r:") as tar:
|
24 |
+
tar.extractall(path=os.path.split(path)[0])
|
25 |
+
elif path.endswith("zip"):
|
26 |
+
with zipfile.ZipFile(path, "r") as f:
|
27 |
+
f.extractall(path=os.path.split(path)[0])
|
28 |
+
else:
|
29 |
+
raise NotImplementedError(
|
30 |
+
"Unknown file extension: {}".format(os.path.splitext(path)[1])
|
31 |
+
)
|
32 |
+
|
33 |
+
|
34 |
+
def reporthook(bar):
|
35 |
+
"""tqdm progress bar for downloads."""
|
36 |
+
|
37 |
+
def hook(b=1, bsize=1, tsize=None):
|
38 |
+
if tsize is not None:
|
39 |
+
bar.total = tsize
|
40 |
+
bar.update(b * bsize - bar.n)
|
41 |
+
|
42 |
+
return hook
|
43 |
+
|
44 |
+
|
45 |
+
def get_root(name):
|
46 |
+
base = "data/"
|
47 |
+
root = os.path.join(base, name)
|
48 |
+
os.makedirs(root, exist_ok=True)
|
49 |
+
return root
|
50 |
+
|
51 |
+
|
52 |
+
def is_prepared(root):
|
53 |
+
return Path(root).joinpath(".ready").exists()
|
54 |
+
|
55 |
+
|
56 |
+
def mark_prepared(root):
|
57 |
+
Path(root).joinpath(".ready").touch()
|
58 |
+
|
59 |
+
|
60 |
+
def prompt_download(file_, source, target_dir, content_dir=None):
|
61 |
+
targetpath = os.path.join(target_dir, file_)
|
62 |
+
while not os.path.exists(targetpath):
|
63 |
+
if content_dir is not None and os.path.exists(
|
64 |
+
os.path.join(target_dir, content_dir)
|
65 |
+
):
|
66 |
+
break
|
67 |
+
print(
|
68 |
+
"Please download '{}' from '{}' to '{}'.".format(file_, source, targetpath)
|
69 |
+
)
|
70 |
+
if content_dir is not None:
|
71 |
+
print(
|
72 |
+
"Or place its content into '{}'.".format(
|
73 |
+
os.path.join(target_dir, content_dir)
|
74 |
+
)
|
75 |
+
)
|
76 |
+
input("Press Enter when done...")
|
77 |
+
return targetpath
|
78 |
+
|
79 |
+
|
80 |
+
def download_url(file_, url, target_dir):
|
81 |
+
targetpath = os.path.join(target_dir, file_)
|
82 |
+
os.makedirs(target_dir, exist_ok=True)
|
83 |
+
with tqdm(
|
84 |
+
unit="B", unit_scale=True, unit_divisor=1024, miniters=1, desc=file_
|
85 |
+
) as bar:
|
86 |
+
urllib.request.urlretrieve(url, targetpath, reporthook=reporthook(bar))
|
87 |
+
return targetpath
|
88 |
+
|
89 |
+
|
90 |
+
def download_urls(urls, target_dir):
|
91 |
+
paths = dict()
|
92 |
+
for fname, url in urls.items():
|
93 |
+
outpath = download_url(fname, url, target_dir)
|
94 |
+
paths[fname] = outpath
|
95 |
+
return paths
|
96 |
+
|
97 |
+
|
98 |
+
def quadratic_crop(x, bbox, alpha=1.0):
|
99 |
+
"""bbox is xmin, ymin, xmax, ymax"""
|
100 |
+
im_h, im_w = x.shape[:2]
|
101 |
+
bbox = np.array(bbox, dtype=np.float32)
|
102 |
+
bbox = np.clip(bbox, 0, max(im_h, im_w))
|
103 |
+
center = 0.5 * (bbox[0] + bbox[2]), 0.5 * (bbox[1] + bbox[3])
|
104 |
+
w = bbox[2] - bbox[0]
|
105 |
+
h = bbox[3] - bbox[1]
|
106 |
+
l = int(alpha * max(w, h))
|
107 |
+
l = max(l, 2)
|
108 |
+
|
109 |
+
required_padding = -1 * min(
|
110 |
+
center[0] - l, center[1] - l, im_w - (center[0] + l), im_h - (center[1] + l)
|
111 |
+
)
|
112 |
+
required_padding = int(np.ceil(required_padding))
|
113 |
+
if required_padding > 0:
|
114 |
+
padding = [
|
115 |
+
[required_padding, required_padding],
|
116 |
+
[required_padding, required_padding],
|
117 |
+
]
|
118 |
+
padding += [[0, 0]] * (len(x.shape) - 2)
|
119 |
+
x = np.pad(x, padding, "reflect")
|
120 |
+
center = center[0] + required_padding, center[1] + required_padding
|
121 |
+
xmin = int(center[0] - l / 2)
|
122 |
+
ymin = int(center[1] - l / 2)
|
123 |
+
return np.array(x[ymin : ymin + l, xmin : xmin + l, ...])
|
124 |
+
|
125 |
+
|
126 |
+
def custom_collate(batch):
|
127 |
+
r"""source: pytorch 1.9.0, only one modification to original code """
|
128 |
+
|
129 |
+
elem = batch[0]
|
130 |
+
elem_type = type(elem)
|
131 |
+
if isinstance(elem, torch.Tensor):
|
132 |
+
out = None
|
133 |
+
if torch.utils.data.get_worker_info() is not None:
|
134 |
+
# If we're in a background process, concatenate directly into a
|
135 |
+
# shared memory tensor to avoid an extra copy
|
136 |
+
numel = sum([x.numel() for x in batch])
|
137 |
+
storage = elem.storage()._new_shared(numel)
|
138 |
+
out = elem.new(storage)
|
139 |
+
return torch.stack(batch, 0, out=out)
|
140 |
+
elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
|
141 |
+
and elem_type.__name__ != 'string_':
|
142 |
+
if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
|
143 |
+
# array of string classes and object
|
144 |
+
if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
|
145 |
+
raise TypeError(default_collate_err_msg_format.format(elem.dtype))
|
146 |
+
|
147 |
+
return custom_collate([torch.as_tensor(b) for b in batch])
|
148 |
+
elif elem.shape == (): # scalars
|
149 |
+
return torch.as_tensor(batch)
|
150 |
+
elif isinstance(elem, float):
|
151 |
+
return torch.tensor(batch, dtype=torch.float64)
|
152 |
+
elif isinstance(elem, int):
|
153 |
+
return torch.tensor(batch)
|
154 |
+
elif isinstance(elem, string_classes):
|
155 |
+
return batch
|
156 |
+
elif isinstance(elem, collections.abc.Mapping):
|
157 |
+
return {key: custom_collate([d[key] for d in batch]) for key in elem}
|
158 |
+
elif isinstance(elem, tuple) and hasattr(elem, '_fields'): # namedtuple
|
159 |
+
return elem_type(*(custom_collate(samples) for samples in zip(*batch)))
|
160 |
+
if isinstance(elem, collections.abc.Sequence) and isinstance(elem[0], Annotation): # added
|
161 |
+
return batch # added
|
162 |
+
elif isinstance(elem, collections.abc.Sequence):
|
163 |
+
# check to make sure that the elements in batch have consistent size
|
164 |
+
it = iter(batch)
|
165 |
+
elem_size = len(next(it))
|
166 |
+
if not all(len(elem) == elem_size for elem in it):
|
167 |
+
raise RuntimeError('each element in list of batch should be of equal size')
|
168 |
+
transposed = zip(*batch)
|
169 |
+
return [custom_collate(samples) for samples in transposed]
|
170 |
+
|
171 |
+
raise TypeError(default_collate_err_msg_format.format(elem_type))
|
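A small usage sketch (not part of the diff) of the custom_collate function above, exercised on plain Python data; it assumes custom_collate from this module is in scope (for example imported from taming.data.utils).

import torch
# with custom_collate as defined above in scope:
batch = [
    {"image": torch.zeros(3, 4, 4), "label": 1, "name": "a"},
    {"image": torch.ones(3, 4, 4),  "label": 2, "name": "b"},
]
out = custom_collate(batch)
print(out["image"].shape)  # torch.Size([2, 3, 4, 4])  -- tensors are stacked
print(out["label"])        # tensor([1, 2])             -- ints become a tensor
print(out["name"])         # ['a', 'b']                 -- strings are passed through as a list
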
taming/data/__pycache__/helper_types.cpython-312.pyc
ADDED
Binary file (2.43 kB). View file
|
|
taming/data/__pycache__/utils.cpython-312.pyc
ADDED
Binary file (10.6 kB). View file
|
|
taming/data/ade20k.py
ADDED
@@ -0,0 +1,124 @@
1 |
+
import os
|
2 |
+
import numpy as np
|
3 |
+
import cv2
|
4 |
+
import albumentations
|
5 |
+
from PIL import Image
|
6 |
+
from torch.utils.data import Dataset
|
7 |
+
|
8 |
+
from taming.data.sflckr import SegmentationBase # for examples included in repo
|
9 |
+
|
10 |
+
|
11 |
+
class Examples(SegmentationBase):
|
12 |
+
def __init__(self, size=256, random_crop=False, interpolation="bicubic"):
|
13 |
+
super().__init__(data_csv="data/ade20k_examples.txt",
|
14 |
+
data_root="data/ade20k_images",
|
15 |
+
segmentation_root="data/ade20k_segmentations",
|
16 |
+
size=size, random_crop=random_crop,
|
17 |
+
interpolation=interpolation,
|
18 |
+
n_labels=151, shift_segmentation=False)
|
19 |
+
|
20 |
+
|
21 |
+
# With semantic map and scene label
|
22 |
+
class ADE20kBase(Dataset):
|
23 |
+
def __init__(self, config=None, size=None, random_crop=False, interpolation="bicubic", crop_size=None):
|
24 |
+
self.split = self.get_split()
|
25 |
+
self.n_labels = 151 # unknown + 150
|
26 |
+
self.data_csv = {"train": "data/ade20k_train.txt",
|
27 |
+
"validation": "data/ade20k_test.txt"}[self.split]
|
28 |
+
self.data_root = "data/ade20k_root"
|
29 |
+
with open(os.path.join(self.data_root, "sceneCategories.txt"), "r") as f:
|
30 |
+
self.scene_categories = f.read().splitlines()
|
31 |
+
self.scene_categories = dict(line.split() for line in self.scene_categories)
|
32 |
+
with open(self.data_csv, "r") as f:
|
33 |
+
self.image_paths = f.read().splitlines()
|
34 |
+
self._length = len(self.image_paths)
|
35 |
+
self.labels = {
|
36 |
+
"relative_file_path_": [l for l in self.image_paths],
|
37 |
+
"file_path_": [os.path.join(self.data_root, "images", l)
|
38 |
+
for l in self.image_paths],
|
39 |
+
"relative_segmentation_path_": [l.replace(".jpg", ".png")
|
40 |
+
for l in self.image_paths],
|
41 |
+
"segmentation_path_": [os.path.join(self.data_root, "annotations",
|
42 |
+
l.replace(".jpg", ".png"))
|
43 |
+
for l in self.image_paths],
|
44 |
+
"scene_category": [self.scene_categories[l.split("/")[1].replace(".jpg", "")]
|
45 |
+
for l in self.image_paths],
|
46 |
+
}
|
47 |
+
|
48 |
+
size = None if size is not None and size<=0 else size
|
49 |
+
self.size = size
|
50 |
+
if crop_size is None:
|
51 |
+
self.crop_size = size if size is not None else None
|
52 |
+
else:
|
53 |
+
self.crop_size = crop_size
|
54 |
+
if self.size is not None:
|
55 |
+
self.interpolation = interpolation
|
56 |
+
self.interpolation = {
|
57 |
+
"nearest": cv2.INTER_NEAREST,
|
58 |
+
"bilinear": cv2.INTER_LINEAR,
|
59 |
+
"bicubic": cv2.INTER_CUBIC,
|
60 |
+
"area": cv2.INTER_AREA,
|
61 |
+
"lanczos": cv2.INTER_LANCZOS4}[self.interpolation]
|
62 |
+
self.image_rescaler = albumentations.SmallestMaxSize(max_size=self.size,
|
63 |
+
interpolation=self.interpolation)
|
64 |
+
self.segmentation_rescaler = albumentations.SmallestMaxSize(max_size=self.size,
|
65 |
+
interpolation=cv2.INTER_NEAREST)
|
66 |
+
|
67 |
+
if crop_size is not None:
|
68 |
+
self.center_crop = not random_crop
|
69 |
+
if self.center_crop:
|
70 |
+
self.cropper = albumentations.CenterCrop(height=self.crop_size, width=self.crop_size)
|
71 |
+
else:
|
72 |
+
self.cropper = albumentations.RandomCrop(height=self.crop_size, width=self.crop_size)
|
73 |
+
self.preprocessor = self.cropper
|
74 |
+
|
75 |
+
def __len__(self):
|
76 |
+
return self._length
|
77 |
+
|
78 |
+
def __getitem__(self, i):
|
79 |
+
example = dict((k, self.labels[k][i]) for k in self.labels)
|
80 |
+
image = Image.open(example["file_path_"])
|
81 |
+
if not image.mode == "RGB":
|
82 |
+
image = image.convert("RGB")
|
83 |
+
image = np.array(image).astype(np.uint8)
|
84 |
+
if self.size is not None:
|
85 |
+
image = self.image_rescaler(image=image)["image"]
|
86 |
+
segmentation = Image.open(example["segmentation_path_"])
|
87 |
+
segmentation = np.array(segmentation).astype(np.uint8)
|
88 |
+
if self.size is not None:
|
89 |
+
segmentation = self.segmentation_rescaler(image=segmentation)["image"]
|
90 |
+
if self.size is not None:
|
91 |
+
processed = self.preprocessor(image=image, mask=segmentation)
|
92 |
+
else:
|
93 |
+
processed = {"image": image, "mask": segmentation}
|
94 |
+
example["image"] = (processed["image"]/127.5 - 1.0).astype(np.float32)
|
95 |
+
segmentation = processed["mask"]
|
96 |
+
onehot = np.eye(self.n_labels)[segmentation]
|
97 |
+
example["segmentation"] = onehot
|
98 |
+
return example
|
99 |
+
|
100 |
+
|
101 |
+
class ADE20kTrain(ADE20kBase):
|
102 |
+
# default to random_crop=True
|
103 |
+
def __init__(self, config=None, size=None, random_crop=True, interpolation="bicubic", crop_size=None):
|
104 |
+
super().__init__(config=config, size=size, random_crop=random_crop,
|
105 |
+
interpolation=interpolation, crop_size=crop_size)
|
106 |
+
|
107 |
+
def get_split(self):
|
108 |
+
return "train"
|
109 |
+
|
110 |
+
|
111 |
+
class ADE20kValidation(ADE20kBase):
|
112 |
+
def get_split(self):
|
113 |
+
return "validation"
|
114 |
+
|
115 |
+
|
116 |
+
if __name__ == "__main__":
|
117 |
+
dset = ADE20kValidation()
|
118 |
+
ex = dset[0]
|
119 |
+
for k in ["image", "scene_category", "segmentation"]:
|
120 |
+
print(type(ex[k]))
|
121 |
+
try:
|
122 |
+
print(ex[k].shape)
|
123 |
+
except:
|
124 |
+
print(ex[k])
|
taming/data/annotated_objects_coco.py
ADDED
@@ -0,0 +1,139 @@
1 |
+
import json
|
2 |
+
from itertools import chain
|
3 |
+
from pathlib import Path
|
4 |
+
from typing import Iterable, Dict, List, Callable, Any
|
5 |
+
from collections import defaultdict
|
6 |
+
|
7 |
+
from tqdm import tqdm
|
8 |
+
|
9 |
+
from taming.data.annotated_objects_dataset import AnnotatedObjectsDataset
|
10 |
+
from taming.data.helper_types import Annotation, ImageDescription, Category
|
11 |
+
|
12 |
+
COCO_PATH_STRUCTURE = {
|
13 |
+
'train': {
|
14 |
+
'top_level': '',
|
15 |
+
'instances_annotations': 'annotations/instances_train2017.json',
|
16 |
+
'stuff_annotations': 'annotations/stuff_train2017.json',
|
17 |
+
'files': 'train2017'
|
18 |
+
},
|
19 |
+
'validation': {
|
20 |
+
'top_level': '',
|
21 |
+
'instances_annotations': 'annotations/instances_val2017.json',
|
22 |
+
'stuff_annotations': 'annotations/stuff_val2017.json',
|
23 |
+
'files': 'val2017'
|
24 |
+
}
|
25 |
+
}
|
26 |
+
|
27 |
+
|
28 |
+
def load_image_descriptions(description_json: List[Dict]) -> Dict[str, ImageDescription]:
|
29 |
+
return {
|
30 |
+
str(img['id']): ImageDescription(
|
31 |
+
id=img['id'],
|
32 |
+
license=img.get('license'),
|
33 |
+
file_name=img['file_name'],
|
34 |
+
coco_url=img['coco_url'],
|
35 |
+
original_size=(img['width'], img['height']),
|
36 |
+
date_captured=img.get('date_captured'),
|
37 |
+
flickr_url=img.get('flickr_url')
|
38 |
+
)
|
39 |
+
for img in description_json
|
40 |
+
}
|
41 |
+
|
42 |
+
|
43 |
+
def load_categories(category_json: Iterable) -> Dict[str, Category]:
|
44 |
+
return {str(cat['id']): Category(id=str(cat['id']), super_category=cat['supercategory'], name=cat['name'])
|
45 |
+
for cat in category_json if cat['name'] != 'other'}
|
46 |
+
|
47 |
+
|
48 |
+
def load_annotations(annotations_json: List[Dict], image_descriptions: Dict[str, ImageDescription],
|
49 |
+
category_no_for_id: Callable[[str], int], split: str) -> Dict[str, List[Annotation]]:
|
50 |
+
annotations = defaultdict(list)
|
51 |
+
total = sum(len(a) for a in annotations_json)
|
52 |
+
for ann in tqdm(chain(*annotations_json), f'Loading {split} annotations', total=total):
|
53 |
+
image_id = str(ann['image_id'])
|
54 |
+
if image_id not in image_descriptions:
|
55 |
+
raise ValueError(f'image_id [{image_id}] has no image description.')
|
56 |
+
category_id = ann['category_id']
|
57 |
+
try:
|
58 |
+
category_no = category_no_for_id(str(category_id))
|
59 |
+
except KeyError:
|
60 |
+
continue
|
61 |
+
|
62 |
+
width, height = image_descriptions[image_id].original_size
|
63 |
+
bbox = (ann['bbox'][0] / width, ann['bbox'][1] / height, ann['bbox'][2] / width, ann['bbox'][3] / height)
|
64 |
+
|
65 |
+
annotations[image_id].append(
|
66 |
+
Annotation(
|
67 |
+
id=ann['id'],
|
68 |
+
area=bbox[2]*bbox[3], # use bbox area
|
69 |
+
is_group_of=ann['iscrowd'],
|
70 |
+
image_id=ann['image_id'],
|
71 |
+
bbox=bbox,
|
72 |
+
category_id=str(category_id),
|
73 |
+
category_no=category_no
|
74 |
+
)
|
75 |
+
)
|
76 |
+
return dict(annotations)
|
77 |
+
|
78 |
+
|
79 |
+
class AnnotatedObjectsCoco(AnnotatedObjectsDataset):
|
80 |
+
def __init__(self, use_things: bool = True, use_stuff: bool = True, **kwargs):
|
81 |
+
"""
|
82 |
+
@param data_path: is the path to the following folder structure:
|
83 |
+
coco/
|
84 |
+
├── annotations
|
85 |
+
│ ├── instances_train2017.json
|
86 |
+
│ ├── instances_val2017.json
|
87 |
+
│ ├── stuff_train2017.json
|
88 |
+
│ └── stuff_val2017.json
|
89 |
+
├── train2017
|
90 |
+
│ ├── 000000000009.jpg
|
91 |
+
│ ├── 000000000025.jpg
|
92 |
+
│ └── ...
|
93 |
+
├── val2017
|
94 |
+
│ ├── 000000000139.jpg
|
95 |
+
│ ├── 000000000285.jpg
|
96 |
+
│ └── ...
|
97 |
+
@param: split: one of 'train' or 'validation'
|
98 |
+
@param: desired image size (give square images)
|
99 |
+
"""
|
100 |
+
super().__init__(**kwargs)
|
101 |
+
self.use_things = use_things
|
102 |
+
self.use_stuff = use_stuff
|
103 |
+
|
104 |
+
with open(self.paths['instances_annotations']) as f:
|
105 |
+
inst_data_json = json.load(f)
|
106 |
+
with open(self.paths['stuff_annotations']) as f:
|
107 |
+
stuff_data_json = json.load(f)
|
108 |
+
|
109 |
+
category_jsons = []
|
110 |
+
annotation_jsons = []
|
111 |
+
if self.use_things:
|
112 |
+
category_jsons.append(inst_data_json['categories'])
|
113 |
+
annotation_jsons.append(inst_data_json['annotations'])
|
114 |
+
if self.use_stuff:
|
115 |
+
category_jsons.append(stuff_data_json['categories'])
|
116 |
+
annotation_jsons.append(stuff_data_json['annotations'])
|
117 |
+
|
118 |
+
self.categories = load_categories(chain(*category_jsons))
|
119 |
+
self.filter_categories()
|
120 |
+
self.setup_category_id_and_number()
|
121 |
+
|
122 |
+
self.image_descriptions = load_image_descriptions(inst_data_json['images'])
|
123 |
+
annotations = load_annotations(annotation_jsons, self.image_descriptions, self.get_category_number, self.split)
|
124 |
+
self.annotations = self.filter_object_number(annotations, self.min_object_area,
|
125 |
+
self.min_objects_per_image, self.max_objects_per_image)
|
126 |
+
self.image_ids = list(self.annotations.keys())
|
127 |
+
self.clean_up_annotations_and_image_descriptions()
|
128 |
+
|
129 |
+
def get_path_structure(self) -> Dict[str, str]:
|
130 |
+
if self.split not in COCO_PATH_STRUCTURE:
|
131 |
+
raise ValueError(f'Split [{self.split} does not exist for COCO data.]')
|
132 |
+
return COCO_PATH_STRUCTURE[self.split]
|
133 |
+
|
134 |
+
def get_image_path(self, image_id: str) -> Path:
|
135 |
+
return self.paths['files'].joinpath(self.image_descriptions[str(image_id)].file_name)
|
136 |
+
|
137 |
+
def get_image_description(self, image_id: str) -> Dict[str, Any]:
|
138 |
+
# noinspection PyProtectedMember
|
139 |
+
return self.image_descriptions[image_id]._asdict()
|
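A hedged construction sketch (not part of the diff): the keyword arguments are those of the AnnotatedObjectsDataset base class shown in the next file; the concrete values are illustrative only and assume a local COCO tree laid out as in the docstring above.

# Illustrative values only; running this requires the COCO folder structure from the docstring.
dataset = AnnotatedObjectsCoco(
    data_path="data/coco",            # assumed local path
    split="validation",
    keys=["image", "annotations"],
    target_image_size=256,
    min_object_area=0.0,
    min_objects_per_image=2,
    max_objects_per_image=30,
    crop_method="center",
    random_flip=False,
    no_tokens=1024,
    use_group_parameter=True,
    encode_crop=False,
    use_things=True,
    use_stuff=True,
)
example = dataset[0]  # dict with the image tensor, its annotations and builder-specific conditioning
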
taming/data/annotated_objects_dataset.py
ADDED
@@ -0,0 +1,218 @@
1 |
+
from pathlib import Path
|
2 |
+
from typing import Optional, List, Callable, Dict, Any, Union
|
3 |
+
import warnings
|
4 |
+
|
5 |
+
import PIL.Image as pil_image
|
6 |
+
from torch import Tensor
|
7 |
+
from torch.utils.data import Dataset
|
8 |
+
from torchvision import transforms
|
9 |
+
|
10 |
+
from taming.data.conditional_builder.objects_bbox import ObjectsBoundingBoxConditionalBuilder
|
11 |
+
from taming.data.conditional_builder.objects_center_points import ObjectsCenterPointsConditionalBuilder
|
12 |
+
from taming.data.conditional_builder.utils import load_object_from_string
|
13 |
+
from taming.data.helper_types import BoundingBox, CropMethodType, Image, Annotation, SplitType
|
14 |
+
from taming.data.image_transforms import CenterCropReturnCoordinates, RandomCrop1dReturnCoordinates, \
|
15 |
+
Random2dCropReturnCoordinates, RandomHorizontalFlipReturn, convert_pil_to_tensor
|
16 |
+
|
17 |
+
|
18 |
+
class AnnotatedObjectsDataset(Dataset):
|
19 |
+
def __init__(self, data_path: Union[str, Path], split: SplitType, keys: List[str], target_image_size: int,
|
20 |
+
min_object_area: float, min_objects_per_image: int, max_objects_per_image: int,
|
21 |
+
crop_method: CropMethodType, random_flip: bool, no_tokens: int, use_group_parameter: bool,
|
22 |
+
encode_crop: bool, category_allow_list_target: str = "", category_mapping_target: str = "",
|
23 |
+
no_object_classes: Optional[int] = None):
|
24 |
+
self.data_path = data_path
|
25 |
+
self.split = split
|
26 |
+
self.keys = keys
|
27 |
+
self.target_image_size = target_image_size
|
28 |
+
self.min_object_area = min_object_area
|
29 |
+
self.min_objects_per_image = min_objects_per_image
|
30 |
+
self.max_objects_per_image = max_objects_per_image
|
31 |
+
self.crop_method = crop_method
|
32 |
+
self.random_flip = random_flip
|
33 |
+
self.no_tokens = no_tokens
|
34 |
+
self.use_group_parameter = use_group_parameter
|
35 |
+
self.encode_crop = encode_crop
|
36 |
+
|
37 |
+
        self.annotations = None
        self.image_descriptions = None
        self.categories = None
        self.category_ids = None
        self.category_number = None
        self.image_ids = None
        self.transform_functions: List[Callable] = self.setup_transform(target_image_size, crop_method, random_flip)
        self.paths = self.build_paths(self.data_path)
        self._conditional_builders = None
        self.category_allow_list = None
        if category_allow_list_target:
            allow_list = load_object_from_string(category_allow_list_target)
            self.category_allow_list = {name for name, _ in allow_list}
        self.category_mapping = {}
        if category_mapping_target:
            self.category_mapping = load_object_from_string(category_mapping_target)
        self.no_object_classes = no_object_classes

    def build_paths(self, top_level: Union[str, Path]) -> Dict[str, Path]:
        top_level = Path(top_level)
        sub_paths = {name: top_level.joinpath(sub_path) for name, sub_path in self.get_path_structure().items()}
        for path in sub_paths.values():
            if not path.exists():
                raise FileNotFoundError(f'{type(self).__name__} data structure error: [{path}] does not exist.')
        return sub_paths

    @staticmethod
    def load_image_from_disk(path: Path) -> Image:
        return pil_image.open(path).convert('RGB')

    @staticmethod
    def setup_transform(target_image_size: int, crop_method: CropMethodType, random_flip: bool):
        transform_functions = []
        if crop_method == 'none':
            transform_functions.append(transforms.Resize((target_image_size, target_image_size)))
        elif crop_method == 'center':
            transform_functions.extend([
                transforms.Resize(target_image_size),
                CenterCropReturnCoordinates(target_image_size)
            ])
        elif crop_method == 'random-1d':
            transform_functions.extend([
                transforms.Resize(target_image_size),
                RandomCrop1dReturnCoordinates(target_image_size)
            ])
        elif crop_method == 'random-2d':
            transform_functions.extend([
                Random2dCropReturnCoordinates(target_image_size),
                transforms.Resize(target_image_size)
            ])
        elif crop_method is None:
            return None
        else:
            raise ValueError(f'Received invalid crop method [{crop_method}].')
        if random_flip:
            transform_functions.append(RandomHorizontalFlipReturn())
        transform_functions.append(transforms.Lambda(lambda x: x / 127.5 - 1.))
        return transform_functions

    def image_transform(self, x: Tensor) -> (Optional[BoundingBox], Optional[bool], Tensor):
        crop_bbox = None
        flipped = None
        for t in self.transform_functions:
            if isinstance(t, (RandomCrop1dReturnCoordinates, CenterCropReturnCoordinates, Random2dCropReturnCoordinates)):
                crop_bbox, x = t(x)
            elif isinstance(t, RandomHorizontalFlipReturn):
                flipped, x = t(x)
            else:
                x = t(x)
        return crop_bbox, flipped, x

    @property
    def no_classes(self) -> int:
        return self.no_object_classes if self.no_object_classes else len(self.categories)

    @property
    def conditional_builders(self) -> ObjectsCenterPointsConditionalBuilder:
        # cannot set this up in init because no_classes is only known after loading data in init of superclass
        if self._conditional_builders is None:
            self._conditional_builders = {
                'objects_center_points': ObjectsCenterPointsConditionalBuilder(
                    self.no_classes,
                    self.max_objects_per_image,
                    self.no_tokens,
                    self.encode_crop,
                    self.use_group_parameter,
                    getattr(self, 'use_additional_parameters', False)
                ),
                'objects_bbox': ObjectsBoundingBoxConditionalBuilder(
                    self.no_classes,
                    self.max_objects_per_image,
                    self.no_tokens,
                    self.encode_crop,
                    self.use_group_parameter,
                    getattr(self, 'use_additional_parameters', False)
                )
            }
        return self._conditional_builders

    def filter_categories(self) -> None:
        if self.category_allow_list:
            self.categories = {id_: cat for id_, cat in self.categories.items() if cat.name in self.category_allow_list}
        if self.category_mapping:
            self.categories = {id_: cat for id_, cat in self.categories.items() if cat.id not in self.category_mapping}

    def setup_category_id_and_number(self) -> None:
        self.category_ids = list(self.categories.keys())
        self.category_ids.sort()
        if '/m/01s55n' in self.category_ids:
            self.category_ids.remove('/m/01s55n')
            self.category_ids.append('/m/01s55n')
        self.category_number = {category_id: i for i, category_id in enumerate(self.category_ids)}
        if self.category_allow_list is not None and self.category_mapping is None \
                and len(self.category_ids) != len(self.category_allow_list):
            warnings.warn('Unexpected number of categories: Mismatch with category_allow_list. '
                          'Make sure all names in category_allow_list exist.')

    def clean_up_annotations_and_image_descriptions(self) -> None:
        image_id_set = set(self.image_ids)
        self.annotations = {k: v for k, v in self.annotations.items() if k in image_id_set}
        self.image_descriptions = {k: v for k, v in self.image_descriptions.items() if k in image_id_set}

    @staticmethod
    def filter_object_number(all_annotations: Dict[str, List[Annotation]], min_object_area: float,
                             min_objects_per_image: int, max_objects_per_image: int) -> Dict[str, List[Annotation]]:
        filtered = {}
        for image_id, annotations in all_annotations.items():
            annotations_with_min_area = [a for a in annotations if a.area > min_object_area]
            if min_objects_per_image <= len(annotations_with_min_area) <= max_objects_per_image:
                filtered[image_id] = annotations_with_min_area
        return filtered

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, n: int) -> Dict[str, Any]:
        image_id = self.get_image_id(n)
        sample = self.get_image_description(image_id)
        sample['annotations'] = self.get_annotation(image_id)

        if 'image' in self.keys:
            sample['image_path'] = str(self.get_image_path(image_id))
            sample['image'] = self.load_image_from_disk(sample['image_path'])
            sample['image'] = convert_pil_to_tensor(sample['image'])
            sample['crop_bbox'], sample['flipped'], sample['image'] = self.image_transform(sample['image'])
            sample['image'] = sample['image'].permute(1, 2, 0)

        for conditional, builder in self.conditional_builders.items():
            if conditional in self.keys:
                sample[conditional] = builder.build(sample['annotations'], sample['crop_bbox'], sample['flipped'])

        if self.keys:
            # only return specified keys
            sample = {key: sample[key] for key in self.keys}
        return sample

    def get_image_id(self, no: int) -> str:
        return self.image_ids[no]

    def get_annotation(self, image_id: str) -> str:
        return self.annotations[image_id]

    def get_textual_label_for_category_id(self, category_id: str) -> str:
        return self.categories[category_id].name

    def get_textual_label_for_category_no(self, category_no: int) -> str:
        return self.categories[self.get_category_id(category_no)].name

    def get_category_number(self, category_id: str) -> int:
        return self.category_number[category_id]

    def get_category_id(self, category_no: int) -> str:
        return self.category_ids[category_no]

    def get_image_description(self, image_id: str) -> Dict[str, Any]:
        raise NotImplementedError()

    def get_path_structure(self):
        raise NotImplementedError

    def get_image_path(self, image_id: str) -> Path:
        raise NotImplementedError
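
The snippet below is an illustrative sketch, not part of the diff: it builds the transform list with the setup_transform staticmethod and applies it the same way image_transform does. The 256-pixel target size, the 'random-1d' crop method, and the random input tensor are arbitrary example values.

import torch
from taming.data.annotated_objects_dataset import AnnotatedObjectsDataset
from taming.data.image_transforms import (RandomCrop1dReturnCoordinates, CenterCropReturnCoordinates,
                                          Random2dCropReturnCoordinates, RandomHorizontalFlipReturn)

# Example values only: 256 target size, 'random-1d' crop, random flip enabled.
transform_functions = AnnotatedObjectsDataset.setup_transform(
    target_image_size=256, crop_method='random-1d', random_flip=True)

x = torch.randint(0, 256, (3, 512, 384)).float()  # stand-in for a decoded RGB image (C, H, W)
crop_bbox, flipped = None, None
for t in transform_functions:
    if isinstance(t, (RandomCrop1dReturnCoordinates, CenterCropReturnCoordinates, Random2dCropReturnCoordinates)):
        crop_bbox, x = t(x)   # relative (x0, y0, w, h) of the crop
    elif isinstance(t, RandomHorizontalFlipReturn):
        flipped, x = t(x)     # whether the image was flipped
    else:
        x = t(x)
print(x.shape, crop_bbox, flipped)  # 3 x 256 x 256 tensor scaled to [-1, 1]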
taming/data/annotated_objects_open_images.py
ADDED
@@ -0,0 +1,137 @@
from collections import defaultdict
from csv import DictReader, reader as TupleReader
from pathlib import Path
from typing import Dict, List, Any
import warnings

from taming.data.annotated_objects_dataset import AnnotatedObjectsDataset
from taming.data.helper_types import Annotation, Category
from tqdm import tqdm

OPEN_IMAGES_STRUCTURE = {
    'train': {
        'top_level': '',
        'class_descriptions': 'class-descriptions-boxable.csv',
        'annotations': 'oidv6-train-annotations-bbox.csv',
        'file_list': 'train-images-boxable.csv',
        'files': 'train'
    },
    'validation': {
        'top_level': '',
        'class_descriptions': 'class-descriptions-boxable.csv',
        'annotations': 'validation-annotations-bbox.csv',
        'file_list': 'validation-images.csv',
        'files': 'validation'
    },
    'test': {
        'top_level': '',
        'class_descriptions': 'class-descriptions-boxable.csv',
        'annotations': 'test-annotations-bbox.csv',
        'file_list': 'test-images.csv',
        'files': 'test'
    }
}


def load_annotations(descriptor_path: Path, min_object_area: float, category_mapping: Dict[str, str],
                     category_no_for_id: Dict[str, int]) -> Dict[str, List[Annotation]]:
    annotations: Dict[str, List[Annotation]] = defaultdict(list)
    with open(descriptor_path) as file:
        reader = DictReader(file)
        for i, row in tqdm(enumerate(reader), total=14620000, desc='Loading OpenImages annotations'):
            width = float(row['XMax']) - float(row['XMin'])
            height = float(row['YMax']) - float(row['YMin'])
            area = width * height
            category_id = row['LabelName']
            if category_id in category_mapping:
                category_id = category_mapping[category_id]
            if area >= min_object_area and category_id in category_no_for_id:
                annotations[row['ImageID']].append(
                    Annotation(
                        id=i,
                        image_id=row['ImageID'],
                        source=row['Source'],
                        category_id=category_id,
                        category_no=category_no_for_id[category_id],
                        confidence=float(row['Confidence']),
                        bbox=(float(row['XMin']), float(row['YMin']), width, height),
                        area=area,
                        is_occluded=bool(int(row['IsOccluded'])),
                        is_truncated=bool(int(row['IsTruncated'])),
                        is_group_of=bool(int(row['IsGroupOf'])),
                        is_depiction=bool(int(row['IsDepiction'])),
                        is_inside=bool(int(row['IsInside']))
                    )
                )
        if 'train' in str(descriptor_path) and i < 14000000:
            warnings.warn(f'Running with subset of Open Images. Train dataset has length [{len(annotations)}].')
    return dict(annotations)


def load_image_ids(csv_path: Path) -> List[str]:
    with open(csv_path) as file:
        reader = DictReader(file)
        return [row['image_name'] for row in reader]


def load_categories(csv_path: Path) -> Dict[str, Category]:
    with open(csv_path) as file:
        reader = TupleReader(file)
        return {row[0]: Category(id=row[0], name=row[1], super_category=None) for row in reader}


class AnnotatedObjectsOpenImages(AnnotatedObjectsDataset):
    def __init__(self, use_additional_parameters: bool, **kwargs):
        """
        @param data_path: is the path to the following folder structure:
                          open_images/
                          ├── class-descriptions-boxable.csv
                          ├── oidv6-train-annotations-bbox.csv
                          ├── test
                          │   ├── 000026e7ee790996.jpg
                          │   ├── 000062a39995e348.jpg
                          │   └── ...
                          ├── test-annotations-bbox.csv
                          ├── test-images.csv
                          ├── train
                          │   ├── 000002b66c9c498e.jpg
                          │   ├── 000002b97e5471a0.jpg
                          │   └── ...
                          ├── train-images-boxable.csv
                          ├── validation
                          │   ├── 0001eeaf4aed83f9.jpg
                          │   ├── 0004886b7d043cfd.jpg
                          │   └── ...
                          ├── validation-annotations-bbox.csv
                          └── validation-images.csv
        @param split: one of 'train', 'validation' or 'test'
        @param target_image_size: desired image size (returns square images)
        """

        super().__init__(**kwargs)
        self.use_additional_parameters = use_additional_parameters

        self.categories = load_categories(self.paths['class_descriptions'])
        self.filter_categories()
        self.setup_category_id_and_number()

        self.image_descriptions = {}
        annotations = load_annotations(self.paths['annotations'], self.min_object_area, self.category_mapping,
                                       self.category_number)
        self.annotations = self.filter_object_number(annotations, self.min_object_area, self.min_objects_per_image,
                                                     self.max_objects_per_image)
        self.image_ids = list(self.annotations.keys())
        self.clean_up_annotations_and_image_descriptions()

    def get_path_structure(self) -> Dict[str, str]:
        if self.split not in OPEN_IMAGES_STRUCTURE:
            raise ValueError(f'Split [{self.split}] does not exist for Open Images data.')
        return OPEN_IMAGES_STRUCTURE[self.split]

    def get_image_path(self, image_id: str) -> Path:
        return self.paths['files'].joinpath(f'{image_id:0>16}.jpg')

    def get_image_description(self, image_id: str) -> Dict[str, Any]:
        image_path = self.get_image_path(image_id)
        return {'file_path': str(image_path), 'file_name': image_path.name}
taming/data/base.py
ADDED
@@ -0,0 +1,70 @@
import bisect
import numpy as np
import albumentations
from PIL import Image
from torch.utils.data import Dataset, ConcatDataset


class ConcatDatasetWithIndex(ConcatDataset):
    """Modified from original pytorch code to return dataset idx"""
    def __getitem__(self, idx):
        if idx < 0:
            if -idx > len(self):
                raise ValueError("absolute value of index should not exceed dataset length")
            idx = len(self) + idx
        dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
        if dataset_idx == 0:
            sample_idx = idx
        else:
            sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
        return self.datasets[dataset_idx][sample_idx], dataset_idx


class ImagePaths(Dataset):
    def __init__(self, paths, size=None, random_crop=False, labels=None):
        self.size = size
        self.random_crop = random_crop

        self.labels = dict() if labels is None else labels
        self.labels["file_path_"] = paths
        self._length = len(paths)

        if self.size is not None and self.size > 0:
            self.rescaler = albumentations.SmallestMaxSize(max_size=self.size)
            if not self.random_crop:
                self.cropper = albumentations.CenterCrop(height=self.size, width=self.size)
            else:
                self.cropper = albumentations.RandomCrop(height=self.size, width=self.size)
            self.preprocessor = albumentations.Compose([self.rescaler, self.cropper])
        else:
            self.preprocessor = lambda **kwargs: kwargs

    def __len__(self):
        return self._length

    def preprocess_image(self, image_path):
        image = Image.open(image_path)
        if not image.mode == "RGB":
            image = image.convert("RGB")
        image = np.array(image).astype(np.uint8)
        image = self.preprocessor(image=image)["image"]
        image = (image / 127.5 - 1.0).astype(np.float32)
        return image

    def __getitem__(self, i):
        example = dict()
        example["image"] = self.preprocess_image(self.labels["file_path_"][i])
        for k in self.labels:
            example[k] = self.labels[k][i]
        return example


class NumpyPaths(ImagePaths):
    def preprocess_image(self, image_path):
        image = np.load(image_path).squeeze(0)  # 3 x 1024 x 1024
        image = np.transpose(image, (1, 2, 0))
        image = Image.fromarray(image, mode="RGB")
        image = np.array(image).astype(np.uint8)
        image = self.preprocessor(image=image)["image"]
        image = (image / 127.5 - 1.0).astype(np.float32)
        return image
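
A minimal usage sketch for ImagePaths (not part of the diff; the two .jpg paths are hypothetical): the smaller edge is rescaled to `size`, the image is center-cropped, and the result is an HxWxC float32 array in [-1, 1].

from taming.data.base import ImagePaths

dataset = ImagePaths(paths=["photos/cat.jpg", "photos/dog.jpg"], size=256, random_crop=False)  # hypothetical files
example = dataset[0]
print(example["image"].shape, example["image"].dtype)  # (256, 256, 3) float32 in [-1, 1]
print(example["file_path_"])                           # the originating path is kept alongside the image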
taming/data/coco.py
ADDED
@@ -0,0 +1,176 @@
import os
import json
import albumentations
import numpy as np
from PIL import Image
from tqdm import tqdm
from torch.utils.data import Dataset

from taming.data.sflckr import SegmentationBase  # for examples included in repo


class Examples(SegmentationBase):
    def __init__(self, size=256, random_crop=False, interpolation="bicubic"):
        super().__init__(data_csv="data/coco_examples.txt",
                         data_root="data/coco_images",
                         segmentation_root="data/coco_segmentations",
                         size=size, random_crop=random_crop,
                         interpolation=interpolation,
                         n_labels=183, shift_segmentation=True)


class CocoBase(Dataset):
    """needed for (image, caption, segmentation) pairs"""
    def __init__(self, size=None, dataroot="", datajson="", onehot_segmentation=False, use_stuffthing=False,
                 crop_size=None, force_no_crop=False, given_files=None):
        self.split = self.get_split()
        self.size = size
        if crop_size is None:
            self.crop_size = size
        else:
            self.crop_size = crop_size

        self.onehot = onehot_segmentation       # return segmentation as rgb or one hot
        self.stuffthing = use_stuffthing        # include thing in segmentation
        if self.onehot and not self.stuffthing:
            raise NotImplementedError("One hot mode is only supported for the "
                                      "stuffthings version because labels are stored "
                                      "a bit differently.")

        data_json = datajson
        with open(data_json) as json_file:
            self.json_data = json.load(json_file)
            self.img_id_to_captions = dict()
            self.img_id_to_filepath = dict()
            self.img_id_to_segmentation_filepath = dict()

        assert data_json.split("/")[-1] in ["captions_train2017.json",
                                            "captions_val2017.json"]
        if self.stuffthing:
            self.segmentation_prefix = (
                "data/cocostuffthings/val2017" if
                data_json.endswith("captions_val2017.json") else
                "data/cocostuffthings/train2017")
        else:
            self.segmentation_prefix = (
                "data/coco/annotations/stuff_val2017_pixelmaps" if
                data_json.endswith("captions_val2017.json") else
                "data/coco/annotations/stuff_train2017_pixelmaps")

        imagedirs = self.json_data["images"]
        self.labels = {"image_ids": list()}
        for imgdir in tqdm(imagedirs, desc="ImgToPath"):
            self.img_id_to_filepath[imgdir["id"]] = os.path.join(dataroot, imgdir["file_name"])
            self.img_id_to_captions[imgdir["id"]] = list()
            pngfilename = imgdir["file_name"].replace("jpg", "png")
            self.img_id_to_segmentation_filepath[imgdir["id"]] = os.path.join(
                self.segmentation_prefix, pngfilename)
            if given_files is not None:
                if pngfilename in given_files:
                    self.labels["image_ids"].append(imgdir["id"])
            else:
                self.labels["image_ids"].append(imgdir["id"])

        capdirs = self.json_data["annotations"]
        for capdir in tqdm(capdirs, desc="ImgToCaptions"):
            # there are on average 5 captions per image
            self.img_id_to_captions[capdir["image_id"]].append(np.array([capdir["caption"]]))

        self.rescaler = albumentations.SmallestMaxSize(max_size=self.size)
        if self.split == "validation":
            self.cropper = albumentations.CenterCrop(height=self.crop_size, width=self.crop_size)
        else:
            self.cropper = albumentations.RandomCrop(height=self.crop_size, width=self.crop_size)
        self.preprocessor = albumentations.Compose(
            [self.rescaler, self.cropper],
            additional_targets={"segmentation": "image"})
        if force_no_crop:
            self.rescaler = albumentations.Resize(height=self.size, width=self.size)
            self.preprocessor = albumentations.Compose(
                [self.rescaler],
                additional_targets={"segmentation": "image"})

    def __len__(self):
        return len(self.labels["image_ids"])

    def preprocess_image(self, image_path, segmentation_path):
        image = Image.open(image_path)
        if not image.mode == "RGB":
            image = image.convert("RGB")
        image = np.array(image).astype(np.uint8)

        segmentation = Image.open(segmentation_path)
        if not self.onehot and not segmentation.mode == "RGB":
            segmentation = segmentation.convert("RGB")
        segmentation = np.array(segmentation).astype(np.uint8)
        if self.onehot:
            assert self.stuffthing
            # stored in caffe format: unlabeled==255. stuff and thing from
            # 0-181. to be compatible with the labels in
            # https://github.com/nightrome/cocostuff/blob/master/labels.txt
            # we shift stuffthing one to the right and put unlabeled in zero
            # as long as segmentation is uint8 shifting to right handles the
            # latter too
            assert segmentation.dtype == np.uint8
            segmentation = segmentation + 1

        processed = self.preprocessor(image=image, segmentation=segmentation)
        image, segmentation = processed["image"], processed["segmentation"]
        image = (image / 127.5 - 1.0).astype(np.float32)

        if self.onehot:
            assert segmentation.dtype == np.uint8
            # make it one hot
            n_labels = 183
            flatseg = np.ravel(segmentation)
            onehot = np.zeros((flatseg.size, n_labels), dtype=bool)
            onehot[np.arange(flatseg.size), flatseg] = True
            onehot = onehot.reshape(segmentation.shape + (n_labels,)).astype(int)
            segmentation = onehot
        else:
            segmentation = (segmentation / 127.5 - 1.0).astype(np.float32)
        return image, segmentation

    def __getitem__(self, i):
        img_path = self.img_id_to_filepath[self.labels["image_ids"][i]]
        seg_path = self.img_id_to_segmentation_filepath[self.labels["image_ids"][i]]
        image, segmentation = self.preprocess_image(img_path, seg_path)
        captions = self.img_id_to_captions[self.labels["image_ids"][i]]
        # randomly draw one of all available captions per image
        caption = captions[np.random.randint(0, len(captions))]
        example = {"image": image,
                   "caption": [str(caption[0])],
                   "segmentation": segmentation,
                   "img_path": img_path,
                   "seg_path": seg_path,
                   "filename_": img_path.split(os.sep)[-1]
                   }
        return example


class CocoImagesAndCaptionsTrain(CocoBase):
    """returns a pair of (image, caption)"""
    def __init__(self, size, onehot_segmentation=False, use_stuffthing=False, crop_size=None, force_no_crop=False):
        super().__init__(size=size,
                         dataroot="data/coco/train2017",
                         datajson="data/coco/annotations/captions_train2017.json",
                         onehot_segmentation=onehot_segmentation,
                         use_stuffthing=use_stuffthing, crop_size=crop_size, force_no_crop=force_no_crop)

    def get_split(self):
        return "train"


class CocoImagesAndCaptionsValidation(CocoBase):
    """returns a pair of (image, caption)"""
    def __init__(self, size, onehot_segmentation=False, use_stuffthing=False, crop_size=None, force_no_crop=False,
                 given_files=None):
        super().__init__(size=size,
                         dataroot="data/coco/val2017",
                         datajson="data/coco/annotations/captions_val2017.json",
                         onehot_segmentation=onehot_segmentation,
                         use_stuffthing=use_stuffthing, crop_size=crop_size, force_no_crop=force_no_crop,
                         given_files=given_files)

    def get_split(self):
        return "validation"
taming/data/conditional_builder/objects_bbox.py
ADDED
@@ -0,0 +1,60 @@
from itertools import cycle
from typing import List, Tuple, Callable, Optional

from PIL import Image as pil_image, ImageDraw as pil_img_draw, ImageFont
from more_itertools.recipes import grouper
from taming.data.image_transforms import convert_pil_to_tensor
from torch import LongTensor, Tensor

from taming.data.helper_types import BoundingBox, Annotation
from taming.data.conditional_builder.objects_center_points import ObjectsCenterPointsConditionalBuilder
from taming.data.conditional_builder.utils import COLOR_PALETTE, WHITE, GRAY_75, BLACK, additional_parameters_string, \
    pad_list, get_plot_font_size, absolute_bbox


class ObjectsBoundingBoxConditionalBuilder(ObjectsCenterPointsConditionalBuilder):
    @property
    def object_descriptor_length(self) -> int:
        return 3

    def _make_object_descriptors(self, annotations: List[Annotation]) -> List[Tuple[int, ...]]:
        object_triples = [
            (self.object_representation(ann), *self.token_pair_from_bbox(ann.bbox))
            for ann in annotations
        ]
        empty_triple = (self.none, self.none, self.none)
        object_triples = pad_list(object_triples, empty_triple, self.no_max_objects)
        return object_triples

    def inverse_build(self, conditional: LongTensor) -> Tuple[List[Tuple[int, BoundingBox]], Optional[BoundingBox]]:
        conditional_list = conditional.tolist()
        crop_coordinates = None
        if self.encode_crop:
            crop_coordinates = self.bbox_from_token_pair(conditional_list[-2], conditional_list[-1])
            conditional_list = conditional_list[:-2]
        object_triples = grouper(conditional_list, 3)
        assert conditional.shape[0] == self.embedding_dim
        return [
            (object_triple[0], self.bbox_from_token_pair(object_triple[1], object_triple[2]))
            for object_triple in object_triples if object_triple[0] != self.none
        ], crop_coordinates

    def plot(self, conditional: LongTensor, label_for_category_no: Callable[[int], str], figure_size: Tuple[int, int],
             line_width: int = 3, font_size: Optional[int] = None) -> Tensor:
        plot = pil_image.new('RGB', figure_size, WHITE)
        draw = pil_img_draw.Draw(plot)
        font = ImageFont.truetype(
            "/usr/share/fonts/truetype/lato/Lato-Regular.ttf",
            size=get_plot_font_size(font_size, figure_size)
        )
        width, height = plot.size
        description, crop_coordinates = self.inverse_build(conditional)
        for (representation, bbox), color in zip(description, cycle(COLOR_PALETTE)):
            annotation = self.representation_to_annotation(representation)
            class_label = label_for_category_no(annotation.category_no) + ' ' + additional_parameters_string(annotation)
            bbox = absolute_bbox(bbox, width, height)
            draw.rectangle(bbox, outline=color, width=line_width)
            draw.text((bbox[0] + line_width, bbox[1] + line_width), class_label, anchor='la', fill=BLACK, font=font)
        if crop_coordinates is not None:
            draw.rectangle(absolute_bbox(crop_coordinates, width, height), outline=GRAY_75, width=line_width)
        return convert_pil_to_tensor(plot) / 127.5 - 1.
taming/data/conditional_builder/objects_center_points.py
ADDED
@@ -0,0 +1,168 @@
import math
import random
import warnings
from itertools import cycle
from typing import List, Optional, Tuple, Callable

from PIL import Image as pil_image, ImageDraw as pil_img_draw, ImageFont
from more_itertools.recipes import grouper
from taming.data.conditional_builder.utils import COLOR_PALETTE, WHITE, GRAY_75, BLACK, FULL_CROP, filter_annotations, \
    additional_parameters_string, horizontally_flip_bbox, pad_list, get_circle_size, get_plot_font_size, \
    absolute_bbox, rescale_annotations
from taming.data.helper_types import BoundingBox, Annotation
from taming.data.image_transforms import convert_pil_to_tensor
from torch import LongTensor, Tensor


class ObjectsCenterPointsConditionalBuilder:
    def __init__(self, no_object_classes: int, no_max_objects: int, no_tokens: int, encode_crop: bool,
                 use_group_parameter: bool, use_additional_parameters: bool):
        self.no_object_classes = no_object_classes
        self.no_max_objects = no_max_objects
        self.no_tokens = no_tokens
        self.encode_crop = encode_crop
        self.no_sections = int(math.sqrt(self.no_tokens))
        self.use_group_parameter = use_group_parameter
        self.use_additional_parameters = use_additional_parameters

    @property
    def none(self) -> int:
        return self.no_tokens - 1

    @property
    def object_descriptor_length(self) -> int:
        return 2

    @property
    def embedding_dim(self) -> int:
        extra_length = 2 if self.encode_crop else 0
        return self.no_max_objects * self.object_descriptor_length + extra_length

    def tokenize_coordinates(self, x: float, y: float) -> int:
        """
        Express 2d coordinates with one number.
        Example: assume self.no_tokens = 16, then no_sections = 4:
        0  0  0  0
        0  0  #  0
        0  0  0  0
        0  0  0  x
        Then the # position corresponds to token 6, the x position to token 15.
        @param x: float in [0, 1]
        @param y: float in [0, 1]
        @return: discrete tokenized coordinate
        """
        x_discrete = int(round(x * (self.no_sections - 1)))
        y_discrete = int(round(y * (self.no_sections - 1)))
        return y_discrete * self.no_sections + x_discrete

    def coordinates_from_token(self, token: int) -> (float, float):
        x = token % self.no_sections
        y = token // self.no_sections
        return x / (self.no_sections - 1), y / (self.no_sections - 1)

    def bbox_from_token_pair(self, token1: int, token2: int) -> BoundingBox:
        x0, y0 = self.coordinates_from_token(token1)
        x1, y1 = self.coordinates_from_token(token2)
        return x0, y0, x1 - x0, y1 - y0

    def token_pair_from_bbox(self, bbox: BoundingBox) -> Tuple[int, int]:
        return self.tokenize_coordinates(bbox[0], bbox[1]), \
               self.tokenize_coordinates(bbox[0] + bbox[2], bbox[1] + bbox[3])

    def inverse_build(self, conditional: LongTensor) \
            -> Tuple[List[Tuple[int, Tuple[float, float]]], Optional[BoundingBox]]:
        conditional_list = conditional.tolist()
        crop_coordinates = None
        if self.encode_crop:
            crop_coordinates = self.bbox_from_token_pair(conditional_list[-2], conditional_list[-1])
            conditional_list = conditional_list[:-2]
        table_of_content = grouper(conditional_list, self.object_descriptor_length)
        assert conditional.shape[0] == self.embedding_dim
        return [
            (object_tuple[0], self.coordinates_from_token(object_tuple[1]))
            for object_tuple in table_of_content if object_tuple[0] != self.none
        ], crop_coordinates

    def plot(self, conditional: LongTensor, label_for_category_no: Callable[[int], str], figure_size: Tuple[int, int],
             line_width: int = 3, font_size: Optional[int] = None) -> Tensor:
        plot = pil_image.new('RGB', figure_size, WHITE)
        draw = pil_img_draw.Draw(plot)
        circle_size = get_circle_size(figure_size)
        font = ImageFont.truetype('/usr/share/fonts/truetype/lato/Lato-Regular.ttf',
                                  size=get_plot_font_size(font_size, figure_size))
        width, height = plot.size
        description, crop_coordinates = self.inverse_build(conditional)
        for (representation, (x, y)), color in zip(description, cycle(COLOR_PALETTE)):
            x_abs, y_abs = x * width, y * height
            ann = self.representation_to_annotation(representation)
            label = label_for_category_no(ann.category_no) + ' ' + additional_parameters_string(ann)
            ellipse_bbox = [x_abs - circle_size, y_abs - circle_size, x_abs + circle_size, y_abs + circle_size]
            draw.ellipse(ellipse_bbox, fill=color, width=0)
            draw.text((x_abs, y_abs), label, anchor='md', fill=BLACK, font=font)
        if crop_coordinates is not None:
            draw.rectangle(absolute_bbox(crop_coordinates, width, height), outline=GRAY_75, width=line_width)
        return convert_pil_to_tensor(plot) / 127.5 - 1.

    def object_representation(self, annotation: Annotation) -> int:
        modifier = 0
        if self.use_group_parameter:
            modifier |= 1 * (annotation.is_group_of is True)
        if self.use_additional_parameters:
            modifier |= 2 * (annotation.is_occluded is True)
            modifier |= 4 * (annotation.is_depiction is True)
            modifier |= 8 * (annotation.is_inside is True)
        return annotation.category_no + self.no_object_classes * modifier

    def representation_to_annotation(self, representation: int) -> Annotation:
        category_no = representation % self.no_object_classes
        modifier = representation // self.no_object_classes
        # noinspection PyTypeChecker
        return Annotation(
            area=None, image_id=None, bbox=None, category_id=None, id=None, source=None, confidence=None,
            category_no=category_no,
            is_group_of=bool((modifier & 1) * self.use_group_parameter),
            is_occluded=bool((modifier & 2) * self.use_additional_parameters),
            is_depiction=bool((modifier & 4) * self.use_additional_parameters),
            is_inside=bool((modifier & 8) * self.use_additional_parameters)
        )

    def _crop_encoder(self, crop_coordinates: BoundingBox) -> List[int]:
        return list(self.token_pair_from_bbox(crop_coordinates))

    def _make_object_descriptors(self, annotations: List[Annotation]) -> List[Tuple[int, ...]]:
        object_tuples = [
            (self.object_representation(a),
             self.tokenize_coordinates(a.bbox[0] + a.bbox[2] / 2, a.bbox[1] + a.bbox[3] / 2))
            for a in annotations
        ]
        empty_tuple = (self.none, self.none)
        object_tuples = pad_list(object_tuples, empty_tuple, self.no_max_objects)
        return object_tuples

    def build(self, annotations: List, crop_coordinates: Optional[BoundingBox] = None, horizontal_flip: bool = False) \
            -> LongTensor:
        if len(annotations) == 0:
            warnings.warn('Did not receive any annotations.')
        if len(annotations) > self.no_max_objects:
            warnings.warn('Received more annotations than allowed.')
            annotations = annotations[:self.no_max_objects]

        if not crop_coordinates:
            crop_coordinates = FULL_CROP

        random.shuffle(annotations)
        annotations = filter_annotations(annotations, crop_coordinates)
        if self.encode_crop:
            annotations = rescale_annotations(annotations, FULL_CROP, horizontal_flip)
            if horizontal_flip:
                crop_coordinates = horizontally_flip_bbox(crop_coordinates)
            extra = self._crop_encoder(crop_coordinates)
        else:
            annotations = rescale_annotations(annotations, crop_coordinates, horizontal_flip)
            extra = []

        object_tuples = self._make_object_descriptors(annotations)
        flattened = [token for tuple_ in object_tuples for token in tuple_] + extra
        assert len(flattened) == self.embedding_dim
        assert all(0 <= value < self.no_tokens for value in flattened)
        return LongTensor(flattened)
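
A small worked example of the coordinate tokenization described in the tokenize_coordinates docstring; the builder arguments below are illustrative values, not settings taken from any config in this diff.

from taming.data.conditional_builder.objects_center_points import ObjectsCenterPointsConditionalBuilder

builder = ObjectsCenterPointsConditionalBuilder(
    no_object_classes=10, no_max_objects=3, no_tokens=16, encode_crop=False,
    use_group_parameter=False, use_additional_parameters=False)  # example values

assert builder.no_sections == 4                          # 16 tokens -> 4x4 grid over the unit square
assert builder.tokenize_coordinates(2 / 3, 1 / 3) == 6   # the '#' cell in the docstring sketch
assert builder.tokenize_coordinates(1.0, 1.0) == 15      # the bottom-right 'x' cell
assert builder.coordinates_from_token(6) == (2 / 3, 1 / 3)
assert builder.embedding_dim == 6                        # 3 objects x 2 tokens each, no crop tokens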
taming/data/conditional_builder/utils.py
ADDED
@@ -0,0 +1,105 @@
import importlib
from typing import List, Any, Tuple, Optional

from taming.data.helper_types import BoundingBox, Annotation

# source: seaborn, color palette tab10
COLOR_PALETTE = [(30, 118, 179), (255, 126, 13), (43, 159, 43), (213, 38, 39), (147, 102, 188),
                 (139, 85, 74), (226, 118, 193), (126, 126, 126), (187, 188, 33), (22, 189, 206)]
BLACK = (0, 0, 0)
GRAY_75 = (63, 63, 63)
GRAY_50 = (127, 127, 127)
GRAY_25 = (191, 191, 191)
WHITE = (255, 255, 255)
FULL_CROP = (0., 0., 1., 1.)


def intersection_area(rectangle1: BoundingBox, rectangle2: BoundingBox) -> float:
    """
    Give intersection area of two rectangles.
    @param rectangle1: (x0, y0, w, h) of first rectangle
    @param rectangle2: (x0, y0, w, h) of second rectangle
    """
    rectangle1 = rectangle1[0], rectangle1[1], rectangle1[0] + rectangle1[2], rectangle1[1] + rectangle1[3]
    rectangle2 = rectangle2[0], rectangle2[1], rectangle2[0] + rectangle2[2], rectangle2[1] + rectangle2[3]
    x_overlap = max(0., min(rectangle1[2], rectangle2[2]) - max(rectangle1[0], rectangle2[0]))
    y_overlap = max(0., min(rectangle1[3], rectangle2[3]) - max(rectangle1[1], rectangle2[1]))
    return x_overlap * y_overlap


def horizontally_flip_bbox(bbox: BoundingBox) -> BoundingBox:
    return 1 - (bbox[0] + bbox[2]), bbox[1], bbox[2], bbox[3]


def absolute_bbox(relative_bbox: BoundingBox, width: int, height: int) -> Tuple[int, int, int, int]:
    bbox = relative_bbox
    bbox = bbox[0] * width, bbox[1] * height, (bbox[0] + bbox[2]) * width, (bbox[1] + bbox[3]) * height
    return int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])


def pad_list(list_: List, pad_element: Any, pad_to_length: int) -> List:
    return list_ + [pad_element for _ in range(pad_to_length - len(list_))]


def rescale_annotations(annotations: List[Annotation], crop_coordinates: BoundingBox, flip: bool) -> \
        List[Annotation]:
    def clamp(x: float):
        return max(min(x, 1.), 0.)

    def rescale_bbox(bbox: BoundingBox) -> BoundingBox:
        x0 = clamp((bbox[0] - crop_coordinates[0]) / crop_coordinates[2])
        y0 = clamp((bbox[1] - crop_coordinates[1]) / crop_coordinates[3])
        w = min(bbox[2] / crop_coordinates[2], 1 - x0)
        h = min(bbox[3] / crop_coordinates[3], 1 - y0)
        if flip:
            x0 = 1 - (x0 + w)
        return x0, y0, w, h

    return [a._replace(bbox=rescale_bbox(a.bbox)) for a in annotations]


def filter_annotations(annotations: List[Annotation], crop_coordinates: BoundingBox) -> List:
    return [a for a in annotations if intersection_area(a.bbox, crop_coordinates) > 0.0]


def additional_parameters_string(annotation: Annotation, short: bool = True) -> str:
    sl = slice(1) if short else slice(None)
    string = ''
    if not (annotation.is_group_of or annotation.is_occluded or annotation.is_depiction or annotation.is_inside):
        return string
    if annotation.is_group_of:
        string += 'group'[sl] + ','
    if annotation.is_occluded:
        string += 'occluded'[sl] + ','
    if annotation.is_depiction:
        string += 'depiction'[sl] + ','
    if annotation.is_inside:
        string += 'inside'[sl]
    return '(' + string.strip(",") + ')'


def get_plot_font_size(font_size: Optional[int], figure_size: Tuple[int, int]) -> int:
    if font_size is None:
        font_size = 10
        if max(figure_size) >= 256:
            font_size = 12
        if max(figure_size) >= 512:
            font_size = 15
    return font_size


def get_circle_size(figure_size: Tuple[int, int]) -> int:
    circle_size = 2
    if max(figure_size) >= 256:
        circle_size = 3
    if max(figure_size) >= 512:
        circle_size = 4
    return circle_size


def load_object_from_string(object_string: str) -> Any:
    """
    Source: https://stackoverflow.com/a/10773699
    """
    module_name, class_name = object_string.rsplit(".", 1)
    return getattr(importlib.import_module(module_name), class_name)
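
Two quick illustrations of the helpers above (values chosen only for the example): intersection_area works on relative (x0, y0, w, h) boxes, and load_object_from_string resolves a dotted import path.

from taming.data.conditional_builder.utils import intersection_area, load_object_from_string

assert intersection_area((0., 0., .5, .5), (.25, .25, .5, .5)) == 0.0625  # 0.25 x 0.25 overlap
OrderedDict = load_object_from_string("collections.OrderedDict")          # same mechanism as category_allow_list_target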
taming/data/custom.py
ADDED
@@ -0,0 +1,38 @@
import os
import numpy as np
import albumentations
from torch.utils.data import Dataset

from taming.data.base import ImagePaths, NumpyPaths, ConcatDatasetWithIndex


class CustomBase(Dataset):
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.data = None

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        example = self.data[i]
        return example


class CustomTrain(CustomBase):
    def __init__(self, size, training_images_list_file):
        super().__init__()
        with open(training_images_list_file, "r") as f:
            paths = f.read().splitlines()
        self.data = ImagePaths(paths=paths, size=size, random_crop=False)


class CustomTest(CustomBase):
    def __init__(self, size, test_images_list_file):
        super().__init__()
        with open(test_images_list_file, "r") as f:
            paths = f.read().splitlines()
        self.data = ImagePaths(paths=paths, size=size, random_crop=False)
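
A usage sketch for CustomTrain (the list-file name is hypothetical): each line of the text file is a path to an image, and samples come back as size x size crops scaled to [-1, 1].

from taming.data.custom import CustomTrain

train_data = CustomTrain(size=256, training_images_list_file="data/my_train_images.txt")  # hypothetical file
sample = train_data[0]
print(sample["image"].shape)  # (256, 256, 3)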
taming/data/faceshq.py
ADDED
@@ -0,0 +1,134 @@
import os
import numpy as np
import albumentations
from torch.utils.data import Dataset

from taming.data.base import ImagePaths, NumpyPaths, ConcatDatasetWithIndex


class FacesBase(Dataset):
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.data = None
        self.keys = None

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        example = self.data[i]
        ex = {}
        if self.keys is not None:
            for k in self.keys:
                ex[k] = example[k]
        else:
            ex = example
        return ex


class CelebAHQTrain(FacesBase):
    def __init__(self, size, keys=None):
        super().__init__()
        root = "data/celebahq"
        with open("data/celebahqtrain.txt", "r") as f:
            relpaths = f.read().splitlines()
        paths = [os.path.join(root, relpath) for relpath in relpaths]
        self.data = NumpyPaths(paths=paths, size=size, random_crop=False)
        self.keys = keys


class CelebAHQValidation(FacesBase):
    def __init__(self, size, keys=None):
        super().__init__()
        root = "data/celebahq"
        with open("data/celebahqvalidation.txt", "r") as f:
            relpaths = f.read().splitlines()
        paths = [os.path.join(root, relpath) for relpath in relpaths]
        self.data = NumpyPaths(paths=paths, size=size, random_crop=False)
        self.keys = keys


class FFHQTrain(FacesBase):
    def __init__(self, size, keys=None):
        super().__init__()
        root = "data/ffhq"
        with open("data/ffhqtrain.txt", "r") as f:
            relpaths = f.read().splitlines()
        paths = [os.path.join(root, relpath) for relpath in relpaths]
        self.data = ImagePaths(paths=paths, size=size, random_crop=False)
        self.keys = keys


class FFHQValidation(FacesBase):
    def __init__(self, size, keys=None):
        super().__init__()
        root = "data/ffhq"
        with open("data/ffhqvalidation.txt", "r") as f:
            relpaths = f.read().splitlines()
        paths = [os.path.join(root, relpath) for relpath in relpaths]
        self.data = ImagePaths(paths=paths, size=size, random_crop=False)
        self.keys = keys


class FacesHQTrain(Dataset):
    # CelebAHQ [0] + FFHQ [1]
    def __init__(self, size, keys=None, crop_size=None, coord=False):
        d1 = CelebAHQTrain(size=size, keys=keys)
        d2 = FFHQTrain(size=size, keys=keys)
        self.data = ConcatDatasetWithIndex([d1, d2])
        self.coord = coord
        if crop_size is not None:
            self.cropper = albumentations.RandomCrop(height=crop_size, width=crop_size)
            if self.coord:
                self.cropper = albumentations.Compose([self.cropper],
                                                      additional_targets={"coord": "image"})

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        ex, y = self.data[i]
        if hasattr(self, "cropper"):
            if not self.coord:
                out = self.cropper(image=ex["image"])
                ex["image"] = out["image"]
            else:
                h, w, _ = ex["image"].shape
                coord = np.arange(h * w).reshape(h, w, 1) / (h * w)
                out = self.cropper(image=ex["image"], coord=coord)
                ex["image"] = out["image"]
                ex["coord"] = out["coord"]
        ex["class"] = y
        return ex


class FacesHQValidation(Dataset):
    # CelebAHQ [0] + FFHQ [1]
    def __init__(self, size, keys=None, crop_size=None, coord=False):
        d1 = CelebAHQValidation(size=size, keys=keys)
        d2 = FFHQValidation(size=size, keys=keys)
        self.data = ConcatDatasetWithIndex([d1, d2])
        self.coord = coord
        if crop_size is not None:
            self.cropper = albumentations.CenterCrop(height=crop_size, width=crop_size)
            if self.coord:
                self.cropper = albumentations.Compose([self.cropper],
                                                      additional_targets={"coord": "image"})

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        ex, y = self.data[i]
        if hasattr(self, "cropper"):
            if not self.coord:
                out = self.cropper(image=ex["image"])
                ex["image"] = out["image"]
            else:
                h, w, _ = ex["image"].shape
                coord = np.arange(h * w).reshape(h, w, 1) / (h * w)
                out = self.cropper(image=ex["image"], coord=coord)
                ex["image"] = out["image"]
                ex["coord"] = out["coord"]
        ex["class"] = y
        return ex
taming/data/helper_types.py
ADDED
@@ -0,0 +1,49 @@
from typing import Dict, Tuple, Optional, NamedTuple, Union
from PIL.Image import Image as pil_image
from torch import Tensor

try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal

Image = Union[Tensor, pil_image]
BoundingBox = Tuple[float, float, float, float]  # x0, y0, w, h
CropMethodType = Literal['none', 'random', 'center', 'random-2d']
SplitType = Literal['train', 'validation', 'test']


class ImageDescription(NamedTuple):
    id: int
    file_name: str
    original_size: Tuple[int, int]  # w, h
    url: Optional[str] = None
    license: Optional[int] = None
    coco_url: Optional[str] = None
    date_captured: Optional[str] = None
    flickr_url: Optional[str] = None
    flickr_id: Optional[str] = None
    coco_id: Optional[str] = None


class Category(NamedTuple):
    id: str
    super_category: Optional[str]
    name: str


class Annotation(NamedTuple):
    area: float
    image_id: str
    bbox: BoundingBox
    category_no: int
    category_id: str
    id: Optional[int] = None
    source: Optional[str] = None
    confidence: Optional[float] = None
    is_group_of: Optional[bool] = None
    is_truncated: Optional[bool] = None
    is_occluded: Optional[bool] = None
    is_depiction: Optional[bool] = None
    is_inside: Optional[bool] = None
    segmentation: Optional[Dict] = None
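
An illustrative Annotation record (example values only): because it is a NamedTuple, the conditional-builder utilities can swap in a rescaled bbox immutably via _replace.

from taming.data.helper_types import Annotation

ann = Annotation(area=0.04, image_id='img0', bbox=(0.1, 0.2, 0.2, 0.2),
                 category_no=3, category_id='/m/01s55n')    # example values
ann2 = ann._replace(bbox=(0.0, 0.1, 0.4, 0.4))              # what rescale_annotations does per box
print(ann2.bbox, ann2.is_group_of)                          # optional flags default to None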
taming/data/image_transforms.py
ADDED
@@ -0,0 +1,132 @@
import random
import warnings
from typing import Union

import torch
from torch import Tensor
from torchvision.transforms import RandomCrop, functional as F, CenterCrop, RandomHorizontalFlip, PILToTensor
from torchvision.transforms.functional import _get_image_size as get_image_size

from taming.data.helper_types import BoundingBox, Image

pil_to_tensor = PILToTensor()


def convert_pil_to_tensor(image: Image) -> Tensor:
    with warnings.catch_warnings():
        # to filter PyTorch UserWarning as described here: https://github.com/pytorch/vision/issues/2194
        warnings.simplefilter("ignore")
        return pil_to_tensor(image)


class RandomCrop1dReturnCoordinates(RandomCrop):
    def forward(self, img: Image) -> (BoundingBox, Image):
        """
        Additionally to cropping, returns the relative coordinates of the crop bounding box.
        Args:
            img (PIL Image or Tensor): Image to be cropped.

        Returns:
            Bounding box: x0, y0, w, h
            PIL Image or Tensor: Cropped image.

        Based on:
            torchvision.transforms.RandomCrop, torchvision 1.7.0
        """
        if self.padding is not None:
            img = F.pad(img, self.padding, self.fill, self.padding_mode)

        width, height = get_image_size(img)
        # pad the width if needed
        if self.pad_if_needed and width < self.size[1]:
            padding = [self.size[1] - width, 0]
            img = F.pad(img, padding, self.fill, self.padding_mode)
        # pad the height if needed
        if self.pad_if_needed and height < self.size[0]:
            padding = [0, self.size[0] - height]
            img = F.pad(img, padding, self.fill, self.padding_mode)

        i, j, h, w = self.get_params(img, self.size)
        bbox = (j / width, i / height, w / width, h / height)  # x0, y0, w, h
        return bbox, F.crop(img, i, j, h, w)


class Random2dCropReturnCoordinates(torch.nn.Module):
    """
    Additionally to cropping, returns the relative coordinates of the crop bounding box.
    Args:
        img (PIL Image or Tensor): Image to be cropped.

    Returns:
        Bounding box: x0, y0, w, h
        PIL Image or Tensor: Cropped image.

    Based on:
        torchvision.transforms.RandomCrop, torchvision 1.7.0
    """

    def __init__(self, min_size: int):
        super().__init__()
        self.min_size = min_size

    def forward(self, img: Image) -> (BoundingBox, Image):
        width, height = get_image_size(img)
        max_size = min(width, height)
        if max_size <= self.min_size:
            size = max_size
        else:
            size = random.randint(self.min_size, max_size)
        top = random.randint(0, height - size)
        left = random.randint(0, width - size)
        bbox = left / width, top / height, size / width, size / height
        return bbox, F.crop(img, top, left, size, size)


class CenterCropReturnCoordinates(CenterCrop):
    @staticmethod
    def get_bbox_of_center_crop(width: int, height: int) -> BoundingBox:
        if width > height:
            w = height / width
            h = 1.0
            x0 = 0.5 - w / 2
            y0 = 0.
        else:
            w = 1.0
            h = width / height
            x0 = 0.
            y0 = 0.5 - h / 2
        return x0, y0, w, h

    def forward(self, img: Union[Image, Tensor]) -> (BoundingBox, Union[Image, Tensor]):
        """
        Additionally to cropping, returns the relative coordinates of the crop bounding box.
        Args:
            img (PIL Image or Tensor): Image to be cropped.

        Returns:
            Bounding box: x0, y0, w, h
            PIL Image or Tensor: Cropped image.
        Based on:
            torchvision.transforms.CenterCrop (version 1.7.0)
        """
        width, height = get_image_size(img)
        return self.get_bbox_of_center_crop(width, height), F.center_crop(img, self.size)


class RandomHorizontalFlipReturn(RandomHorizontalFlip):
    def forward(self, img: Image) -> (bool, Image):
        """
        Additionally to flipping, returns a boolean whether it was flipped or not.
        Args:
            img (PIL Image or Tensor): Image to be flipped.

        Returns:
            flipped: whether the image was flipped or not
            PIL Image or Tensor: Randomly flipped image.

        Based on:
            torchvision.transforms.RandomHorizontalFlip (version 1.7.0)
        """
        if torch.rand(1) < self.p:
            return True, F.hflip(img)
        return False, img
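
A worked example for CenterCropReturnCoordinates.get_bbox_of_center_crop (the image dimensions are illustrative): a 640x480 landscape image keeps its full height, so the relative crop box is (0.125, 0.0, 0.75, 1.0).

from taming.data.image_transforms import CenterCropReturnCoordinates

bbox = CenterCropReturnCoordinates.get_bbox_of_center_crop(width=640, height=480)
assert bbox == (0.125, 0.0, 0.75, 1.0)  # x0, y0, w, h relative to the original image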
taming/data/imagenet.py
ADDED
@@ -0,0 +1,558 @@
1 |
+
import os, tarfile, glob, shutil
|
2 |
+
import yaml
|
3 |
+
import numpy as np
|
4 |
+
from tqdm import tqdm
|
5 |
+
from PIL import Image
|
6 |
+
import albumentations
|
7 |
+
from omegaconf import OmegaConf
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
|
10 |
+
from taming.data.base import ImagePaths
|
11 |
+
from taming.util import download, retrieve
|
12 |
+
import taming.data.utils as bdu
|
13 |
+
|
14 |
+
|
15 |
+
def give_synsets_from_indices(indices, path_to_yaml="data/imagenet_idx_to_synset.yaml"):
|
16 |
+
synsets = []
|
17 |
+
with open(path_to_yaml) as f:
|
18 |
+
di2s = yaml.load(f)
|
19 |
+
for idx in indices:
|
20 |
+
synsets.append(str(di2s[idx]))
|
21 |
+
print("Using {} different synsets for construction of Restriced Imagenet.".format(len(synsets)))
|
22 |
+
return synsets
|
23 |
+
|
24 |
+
|
25 |
+
def str_to_indices(string):
|
26 |
+
"""Expects a string in the format '32-123, 256, 280-321'"""
|
27 |
+
assert not string.endswith(","), "provided string '{}' ends with a comma, pls remove it".format(string)
|
28 |
+
subs = string.split(",")
|
29 |
+
indices = []
|
30 |
+
for sub in subs:
|
31 |
+
subsubs = sub.split("-")
|
32 |
+
assert len(subsubs) > 0
|
33 |
+
if len(subsubs) == 1:
|
34 |
+
indices.append(int(subsubs[0]))
|
35 |
+
else:
|
36 |
+
rang = [j for j in range(int(subsubs[0]), int(subsubs[1]))]
|
37 |
+
indices.extend(rang)
|
38 |
+
return sorted(indices)
|
39 |
+
|
40 |
+
|
41 |
+
class ImageNetBase(Dataset):
|
42 |
+
def __init__(self, config=None):
|
43 |
+
self.config = config or OmegaConf.create()
|
44 |
+
if not type(self.config)==dict:
|
45 |
+
self.config = OmegaConf.to_container(self.config)
|
46 |
+
self._prepare()
|
47 |
+
self._prepare_synset_to_human()
|
48 |
+
self._prepare_idx_to_synset()
|
49 |
+
self._load()
|
50 |
+
|
51 |
+
def __len__(self):
|
52 |
+
return len(self.data)
|
53 |
+
|
54 |
+
def __getitem__(self, i):
|
55 |
+
return self.data[i]
|
56 |
+
|
57 |
+
def _prepare(self):
|
58 |
+
raise NotImplementedError()
|
59 |
+
|
60 |
+
def _filter_relpaths(self, relpaths):
|
61 |
+
ignore = set([
|
62 |
+
"n06596364_9591.JPEG",
|
63 |
+
])
|
64 |
+
relpaths = [rpath for rpath in relpaths if not rpath.split("/")[-1] in ignore]
|
65 |
+
if "sub_indices" in self.config:
|
66 |
+
indices = str_to_indices(self.config["sub_indices"])
|
67 |
+
synsets = give_synsets_from_indices(indices, path_to_yaml=self.idx2syn) # returns a list of strings
|
68 |
+
files = []
|
69 |
+
for rpath in relpaths:
|
70 |
+
syn = rpath.split("/")[0]
|
71 |
+
if syn in synsets:
|
72 |
+
files.append(rpath)
|
73 |
+
return files
|
74 |
+
else:
|
75 |
+
return relpaths
|
76 |
+
|
77 |
+
def _prepare_synset_to_human(self):
|
78 |
+
SIZE = 2655750
|
79 |
+
URL = "https://heibox.uni-heidelberg.de/f/9f28e956cd304264bb82/?dl=1"
|
80 |
+
self.human_dict = os.path.join(self.root, "synset_human.txt")
|
81 |
+
if (not os.path.exists(self.human_dict) or
|
82 |
+
not os.path.getsize(self.human_dict)==SIZE):
|
83 |
+
download(URL, self.human_dict)
|
84 |
+
|
85 |
+
def _prepare_idx_to_synset(self):
|
86 |
+
URL = "https://heibox.uni-heidelberg.de/f/d835d5b6ceda4d3aa910/?dl=1"
|
87 |
+
self.idx2syn = os.path.join(self.root, "index_synset.yaml")
|
88 |
+
if (not os.path.exists(self.idx2syn)):
|
89 |
+
download(URL, self.idx2syn)
|
90 |
+
|
91 |
+
def _load(self):
|
92 |
+
with open(self.txt_filelist, "r") as f:
|
93 |
+
self.relpaths = f.read().splitlines()
|
94 |
+
l1 = len(self.relpaths)
|
95 |
+
self.relpaths = self._filter_relpaths(self.relpaths)
|
96 |
+
print("Removed {} files from filelist during filtering.".format(l1 - len(self.relpaths)))
|
97 |
+
|
98 |
+
self.synsets = [p.split("/")[0] for p in self.relpaths]
|
99 |
+
self.abspaths = [os.path.join(self.datadir, p) for p in self.relpaths]
|
100 |
+
|
101 |
+
unique_synsets = np.unique(self.synsets)
|
102 |
+
class_dict = dict((synset, i) for i, synset in enumerate(unique_synsets))
|
103 |
+
self.class_labels = [class_dict[s] for s in self.synsets]
|
104 |
+
|
105 |
+
with open(self.human_dict, "r") as f:
|
106 |
+
human_dict = f.read().splitlines()
|
107 |
+
human_dict = dict(line.split(maxsplit=1) for line in human_dict)
|
108 |
+
|
109 |
+
self.human_labels = [human_dict[s] for s in self.synsets]
|
110 |
+
|
111 |
+
labels = {
|
112 |
+
"relpath": np.array(self.relpaths),
|
113 |
+
"synsets": np.array(self.synsets),
|
114 |
+
"class_label": np.array(self.class_labels),
|
115 |
+
"human_label": np.array(self.human_labels),
|
116 |
+
}
|
117 |
+
self.data = ImagePaths(self.abspaths,
|
118 |
+
labels=labels,
|
119 |
+
size=retrieve(self.config, "size", default=0),
|
120 |
+
random_crop=self.random_crop)
|
121 |
+
|
122 |
+
|
123 |
+
class ImageNetTrain(ImageNetBase):
|
124 |
+
NAME = "ILSVRC2012_train"
|
125 |
+
URL = "http://www.image-net.org/challenges/LSVRC/2012/"
|
126 |
+
AT_HASH = "a306397ccf9c2ead27155983c254227c0fd938e2"
|
127 |
+
FILES = [
|
128 |
+
"ILSVRC2012_img_train.tar",
|
129 |
+
]
|
130 |
+
SIZES = [
|
131 |
+
147897477120,
|
132 |
+
]
|
133 |
+
|
134 |
+
def _prepare(self):
|
135 |
+
self.random_crop = retrieve(self.config, "ImageNetTrain/random_crop",
|
136 |
+
default=True)
|
137 |
+
cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
|
138 |
+
self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
|
139 |
+
self.datadir = os.path.join(self.root, "data")
|
140 |
+
self.txt_filelist = os.path.join(self.root, "filelist.txt")
|
141 |
+
self.expected_length = 1281167
|
142 |
+
if not bdu.is_prepared(self.root):
|
143 |
+
# prep
|
144 |
+
print("Preparing dataset {} in {}".format(self.NAME, self.root))
|
145 |
+
|
146 |
+
datadir = self.datadir
|
147 |
+
if not os.path.exists(datadir):
|
148 |
+
path = os.path.join(self.root, self.FILES[0])
|
149 |
+
if not os.path.exists(path) or not os.path.getsize(path)==self.SIZES[0]:
|
150 |
+
import academictorrents as at
|
151 |
+
atpath = at.get(self.AT_HASH, datastore=self.root)
|
152 |
+
assert atpath == path
|
153 |
+
|
154 |
+
print("Extracting {} to {}".format(path, datadir))
|
155 |
+
os.makedirs(datadir, exist_ok=True)
|
156 |
+
with tarfile.open(path, "r:") as tar:
|
157 |
+
tar.extractall(path=datadir)
|
158 |
+
|
159 |
+
print("Extracting sub-tars.")
|
160 |
+
subpaths = sorted(glob.glob(os.path.join(datadir, "*.tar")))
|
161 |
+
for subpath in tqdm(subpaths):
|
162 |
+
subdir = subpath[:-len(".tar")]
|
163 |
+
os.makedirs(subdir, exist_ok=True)
|
164 |
+
with tarfile.open(subpath, "r:") as tar:
|
165 |
+
tar.extractall(path=subdir)
|
166 |
+
|
167 |
+
|
168 |
+
filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
|
169 |
+
filelist = [os.path.relpath(p, start=datadir) for p in filelist]
|
170 |
+
filelist = sorted(filelist)
|
171 |
+
filelist = "\n".join(filelist)+"\n"
|
172 |
+
with open(self.txt_filelist, "w") as f:
|
173 |
+
f.write(filelist)
|
174 |
+
|
175 |
+
bdu.mark_prepared(self.root)
|
176 |
+
|
177 |
+
|
178 |
+
class ImageNetValidation(ImageNetBase):
|
179 |
+
NAME = "ILSVRC2012_validation"
|
180 |
+
URL = "http://www.image-net.org/challenges/LSVRC/2012/"
|
181 |
+
AT_HASH = "5d6d0df7ed81efd49ca99ea4737e0ae5e3a5f2e5"
|
182 |
+
VS_URL = "https://heibox.uni-heidelberg.de/f/3e0f6e9c624e45f2bd73/?dl=1"
|
183 |
+
FILES = [
|
184 |
+
"ILSVRC2012_img_val.tar",
|
185 |
+
"validation_synset.txt",
|
186 |
+
]
|
187 |
+
SIZES = [
|
188 |
+
6744924160,
|
189 |
+
1950000,
|
190 |
+
]
|
191 |
+
|
192 |
+
def _prepare(self):
|
193 |
+
self.random_crop = retrieve(self.config, "ImageNetValidation/random_crop",
|
194 |
+
default=False)
|
195 |
+
cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
|
196 |
+
self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
|
197 |
+
self.datadir = os.path.join(self.root, "data")
|
198 |
+
self.txt_filelist = os.path.join(self.root, "filelist.txt")
|
199 |
+
self.expected_length = 50000
|
200 |
+
if not bdu.is_prepared(self.root):
|
201 |
+
# prep
|
202 |
+
print("Preparing dataset {} in {}".format(self.NAME, self.root))
|
203 |
+
|
204 |
+
datadir = self.datadir
|
205 |
+
if not os.path.exists(datadir):
|
206 |
+
path = os.path.join(self.root, self.FILES[0])
|
207 |
+
if not os.path.exists(path) or not os.path.getsize(path)==self.SIZES[0]:
|
208 |
+
import academictorrents as at
|
209 |
+
atpath = at.get(self.AT_HASH, datastore=self.root)
|
210 |
+
assert atpath == path
|
211 |
+
|
212 |
+
print("Extracting {} to {}".format(path, datadir))
|
213 |
+
os.makedirs(datadir, exist_ok=True)
|
214 |
+
with tarfile.open(path, "r:") as tar:
|
215 |
+
tar.extractall(path=datadir)
|
216 |
+
|
217 |
+
vspath = os.path.join(self.root, self.FILES[1])
|
218 |
+
if not os.path.exists(vspath) or not os.path.getsize(vspath)==self.SIZES[1]:
|
219 |
+
download(self.VS_URL, vspath)
|
220 |
+
|
221 |
+
with open(vspath, "r") as f:
|
222 |
+
synset_dict = f.read().splitlines()
|
223 |
+
synset_dict = dict(line.split() for line in synset_dict)
|
224 |
+
|
225 |
+
print("Reorganizing into synset folders")
|
226 |
+
synsets = np.unique(list(synset_dict.values()))
|
227 |
+
for s in synsets:
|
228 |
+
os.makedirs(os.path.join(datadir, s), exist_ok=True)
|
229 |
+
for k, v in synset_dict.items():
|
230 |
+
src = os.path.join(datadir, k)
|
231 |
+
dst = os.path.join(datadir, v)
|
232 |
+
shutil.move(src, dst)
|
233 |
+
|
234 |
+
filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
|
235 |
+
filelist = [os.path.relpath(p, start=datadir) for p in filelist]
|
236 |
+
filelist = sorted(filelist)
|
237 |
+
filelist = "\n".join(filelist)+"\n"
|
238 |
+
with open(self.txt_filelist, "w") as f:
|
239 |
+
f.write(filelist)
|
240 |
+
|
241 |
+
bdu.mark_prepared(self.root)
|
242 |
+
|
243 |
+
|
244 |
+
def get_preprocessor(size=None, random_crop=False, additional_targets=None,
|
245 |
+
crop_size=None):
|
246 |
+
if size is not None and size > 0:
|
247 |
+
transforms = list()
|
248 |
+
rescaler = albumentations.SmallestMaxSize(max_size = size)
|
249 |
+
transforms.append(rescaler)
|
250 |
+
if not random_crop:
|
251 |
+
cropper = albumentations.CenterCrop(height=size,width=size)
|
252 |
+
transforms.append(cropper)
|
253 |
+
else:
|
254 |
+
cropper = albumentations.RandomCrop(height=size,width=size)
|
255 |
+
transforms.append(cropper)
|
256 |
+
flipper = albumentations.HorizontalFlip()
|
257 |
+
transforms.append(flipper)
|
258 |
+
preprocessor = albumentations.Compose(transforms,
|
259 |
+
additional_targets=additional_targets)
|
260 |
+
elif crop_size is not None and crop_size > 0:
|
261 |
+
if not random_crop:
|
262 |
+
cropper = albumentations.CenterCrop(height=crop_size,width=crop_size)
|
263 |
+
else:
|
264 |
+
cropper = albumentations.RandomCrop(height=crop_size,width=crop_size)
|
265 |
+
transforms = [cropper]
|
266 |
+
preprocessor = albumentations.Compose(transforms,
|
267 |
+
additional_targets=additional_targets)
|
268 |
+
else:
|
269 |
+
preprocessor = lambda **kwargs: kwargs
|
270 |
+
return preprocessor
|
271 |
+
|
272 |
+
|
273 |
+
def rgba_to_depth(x):
|
274 |
+
assert x.dtype == np.uint8
|
275 |
+
assert len(x.shape) == 3 and x.shape[2] == 4
|
276 |
+
y = x.copy()
|
277 |
+
y.dtype = np.float32
|
278 |
+
y = y.reshape(x.shape[:2])
|
279 |
+
return np.ascontiguousarray(y)
|
280 |
+
|
281 |
+
|
282 |
+
class BaseWithDepth(Dataset):
|
283 |
+
DEFAULT_DEPTH_ROOT="data/imagenet_depth"
|
284 |
+
|
285 |
+
def __init__(self, config=None, size=None, random_crop=False,
|
286 |
+
crop_size=None, root=None):
|
287 |
+
self.config = config
|
288 |
+
self.base_dset = self.get_base_dset()
|
289 |
+
self.preprocessor = get_preprocessor(
|
290 |
+
size=size,
|
291 |
+
crop_size=crop_size,
|
292 |
+
random_crop=random_crop,
|
293 |
+
additional_targets={"depth": "image"})
|
294 |
+
self.crop_size = crop_size
|
295 |
+
if self.crop_size is not None:
|
296 |
+
self.rescaler = albumentations.Compose(
|
297 |
+
[albumentations.SmallestMaxSize(max_size = self.crop_size)],
|
298 |
+
additional_targets={"depth": "image"})
|
299 |
+
if root is not None:
|
300 |
+
self.DEFAULT_DEPTH_ROOT = root
|
301 |
+
|
302 |
+
def __len__(self):
|
303 |
+
return len(self.base_dset)
|
304 |
+
|
305 |
+
def preprocess_depth(self, path):
|
306 |
+
rgba = np.array(Image.open(path))
|
307 |
+
depth = rgba_to_depth(rgba)
|
308 |
+
depth = (depth - depth.min())/max(1e-8, depth.max()-depth.min())
|
309 |
+
depth = 2.0*depth-1.0
|
310 |
+
return depth
|
311 |
+
|
312 |
+
def __getitem__(self, i):
|
313 |
+
e = self.base_dset[i]
|
314 |
+
e["depth"] = self.preprocess_depth(self.get_depth_path(e))
|
315 |
+
# up if necessary
|
316 |
+
h,w,c = e["image"].shape
|
317 |
+
if self.crop_size and min(h,w) < self.crop_size:
|
318 |
+
# have to upscale to be able to crop - this just uses bilinear
|
319 |
+
out = self.rescaler(image=e["image"], depth=e["depth"])
|
320 |
+
e["image"] = out["image"]
|
321 |
+
e["depth"] = out["depth"]
|
322 |
+
transformed = self.preprocessor(image=e["image"], depth=e["depth"])
|
323 |
+
e["image"] = transformed["image"]
|
324 |
+
e["depth"] = transformed["depth"]
|
325 |
+
return e
|
326 |
+
|
327 |
+
|
328 |
+
class ImageNetTrainWithDepth(BaseWithDepth):
|
329 |
+
# default to random_crop=True
|
330 |
+
def __init__(self, random_crop=True, sub_indices=None, **kwargs):
|
331 |
+
self.sub_indices = sub_indices
|
332 |
+
super().__init__(random_crop=random_crop, **kwargs)
|
333 |
+
|
334 |
+
def get_base_dset(self):
|
335 |
+
if self.sub_indices is None:
|
336 |
+
return ImageNetTrain()
|
337 |
+
else:
|
338 |
+
return ImageNetTrain({"sub_indices": self.sub_indices})
|
339 |
+
|
340 |
+
def get_depth_path(self, e):
|
341 |
+
fid = os.path.splitext(e["relpath"])[0]+".png"
|
342 |
+
fid = os.path.join(self.DEFAULT_DEPTH_ROOT, "train", fid)
|
343 |
+
return fid
|
344 |
+
|
345 |
+
|
346 |
+
class ImageNetValidationWithDepth(BaseWithDepth):
|
347 |
+
def __init__(self, sub_indices=None, **kwargs):
|
348 |
+
self.sub_indices = sub_indices
|
349 |
+
super().__init__(**kwargs)
|
350 |
+
|
351 |
+
def get_base_dset(self):
|
352 |
+
if self.sub_indices is None:
|
353 |
+
return ImageNetValidation()
|
354 |
+
else:
|
355 |
+
return ImageNetValidation({"sub_indices": self.sub_indices})
|
356 |
+
|
357 |
+
def get_depth_path(self, e):
|
358 |
+
fid = os.path.splitext(e["relpath"])[0]+".png"
|
359 |
+
fid = os.path.join(self.DEFAULT_DEPTH_ROOT, "val", fid)
|
360 |
+
return fid
|
361 |
+
|
362 |
+
|
363 |
+
class RINTrainWithDepth(ImageNetTrainWithDepth):
|
364 |
+
def __init__(self, config=None, size=None, random_crop=True, crop_size=None):
|
365 |
+
sub_indices = "30-32, 33-37, 151-268, 281-285, 80-100, 365-382, 389-397, 118-121, 300-319"
|
366 |
+
super().__init__(config=config, size=size, random_crop=random_crop,
|
367 |
+
sub_indices=sub_indices, crop_size=crop_size)
|
368 |
+
|
369 |
+
|
370 |
+
class RINValidationWithDepth(ImageNetValidationWithDepth):
|
371 |
+
def __init__(self, config=None, size=None, random_crop=False, crop_size=None):
|
372 |
+
sub_indices = "30-32, 33-37, 151-268, 281-285, 80-100, 365-382, 389-397, 118-121, 300-319"
|
373 |
+
super().__init__(config=config, size=size, random_crop=random_crop,
|
374 |
+
sub_indices=sub_indices, crop_size=crop_size)
|
375 |
+
|
376 |
+
|
377 |
+
class DRINExamples(Dataset):
|
378 |
+
def __init__(self):
|
379 |
+
self.preprocessor = get_preprocessor(size=256, additional_targets={"depth": "image"})
|
380 |
+
with open("data/drin_examples.txt", "r") as f:
|
381 |
+
relpaths = f.read().splitlines()
|
382 |
+
self.image_paths = [os.path.join("data/drin_images",
|
383 |
+
relpath) for relpath in relpaths]
|
384 |
+
self.depth_paths = [os.path.join("data/drin_depth",
|
385 |
+
relpath.replace(".JPEG", ".png")) for relpath in relpaths]
|
386 |
+
|
387 |
+
def __len__(self):
|
388 |
+
return len(self.image_paths)
|
389 |
+
|
390 |
+
def preprocess_image(self, image_path):
|
391 |
+
image = Image.open(image_path)
|
392 |
+
if not image.mode == "RGB":
|
393 |
+
image = image.convert("RGB")
|
394 |
+
image = np.array(image).astype(np.uint8)
|
395 |
+
image = self.preprocessor(image=image)["image"]
|
396 |
+
image = (image/127.5 - 1.0).astype(np.float32)
|
397 |
+
return image
|
398 |
+
|
399 |
+
def preprocess_depth(self, path):
|
400 |
+
rgba = np.array(Image.open(path))
|
401 |
+
depth = rgba_to_depth(rgba)
|
402 |
+
depth = (depth - depth.min())/max(1e-8, depth.max()-depth.min())
|
403 |
+
depth = 2.0*depth-1.0
|
404 |
+
return depth
|
405 |
+
|
406 |
+
def __getitem__(self, i):
|
407 |
+
e = dict()
|
408 |
+
e["image"] = self.preprocess_image(self.image_paths[i])
|
409 |
+
e["depth"] = self.preprocess_depth(self.depth_paths[i])
|
410 |
+
transformed = self.preprocessor(image=e["image"], depth=e["depth"])
|
411 |
+
e["image"] = transformed["image"]
|
412 |
+
e["depth"] = transformed["depth"]
|
413 |
+
return e
|
414 |
+
|
415 |
+
|
416 |
+
def imscale(x, factor, keepshapes=False, keepmode="bicubic"):
|
417 |
+
if factor is None or factor==1:
|
418 |
+
return x
|
419 |
+
|
420 |
+
dtype = x.dtype
|
421 |
+
assert dtype in [np.float32, np.float64]
|
422 |
+
assert x.min() >= -1
|
423 |
+
assert x.max() <= 1
|
424 |
+
|
425 |
+
keepmode = {"nearest": Image.NEAREST, "bilinear": Image.BILINEAR,
|
426 |
+
"bicubic": Image.BICUBIC}[keepmode]
|
427 |
+
|
428 |
+
lr = (x+1.0)*127.5
|
429 |
+
lr = lr.clip(0,255).astype(np.uint8)
|
430 |
+
lr = Image.fromarray(lr)
|
431 |
+
|
432 |
+
h, w, _ = x.shape
|
433 |
+
nh = h//factor
|
434 |
+
nw = w//factor
|
435 |
+
assert nh > 0 and nw > 0, (nh, nw)
|
436 |
+
|
437 |
+
lr = lr.resize((nw,nh), Image.BICUBIC)
|
438 |
+
if keepshapes:
|
439 |
+
lr = lr.resize((w,h), keepmode)
|
440 |
+
lr = np.array(lr)/127.5-1.0
|
441 |
+
lr = lr.astype(dtype)
|
442 |
+
|
443 |
+
return lr
|
444 |
+
|
445 |
+
|
446 |
+
class ImageNetScale(Dataset):
|
447 |
+
def __init__(self, size=None, crop_size=None, random_crop=False,
|
448 |
+
up_factor=None, hr_factor=None, keep_mode="bicubic"):
|
449 |
+
self.base = self.get_base()
|
450 |
+
|
451 |
+
self.size = size
|
452 |
+
self.crop_size = crop_size if crop_size is not None else self.size
|
453 |
+
self.random_crop = random_crop
|
454 |
+
self.up_factor = up_factor
|
455 |
+
self.hr_factor = hr_factor
|
456 |
+
self.keep_mode = keep_mode
|
457 |
+
|
458 |
+
transforms = list()
|
459 |
+
|
460 |
+
if self.size is not None and self.size > 0:
|
461 |
+
rescaler = albumentations.SmallestMaxSize(max_size = self.size)
|
462 |
+
self.rescaler = rescaler
|
463 |
+
transforms.append(rescaler)
|
464 |
+
|
465 |
+
if self.crop_size is not None and self.crop_size > 0:
|
466 |
+
if len(transforms) == 0:
|
467 |
+
self.rescaler = albumentations.SmallestMaxSize(max_size = self.crop_size)
|
468 |
+
|
469 |
+
if not self.random_crop:
|
470 |
+
cropper = albumentations.CenterCrop(height=self.crop_size,width=self.crop_size)
|
471 |
+
else:
|
472 |
+
cropper = albumentations.RandomCrop(height=self.crop_size,width=self.crop_size)
|
473 |
+
transforms.append(cropper)
|
474 |
+
|
475 |
+
if len(transforms) > 0:
|
476 |
+
if self.up_factor is not None:
|
477 |
+
additional_targets = {"lr": "image"}
|
478 |
+
else:
|
479 |
+
additional_targets = None
|
480 |
+
self.preprocessor = albumentations.Compose(transforms,
|
481 |
+
additional_targets=additional_targets)
|
482 |
+
else:
|
483 |
+
self.preprocessor = lambda **kwargs: kwargs
|
484 |
+
|
485 |
+
def __len__(self):
|
486 |
+
return len(self.base)
|
487 |
+
|
488 |
+
def __getitem__(self, i):
|
489 |
+
example = self.base[i]
|
490 |
+
image = example["image"]
|
491 |
+
# adjust resolution
|
492 |
+
image = imscale(image, self.hr_factor, keepshapes=False)
|
493 |
+
h,w,c = image.shape
|
494 |
+
if self.crop_size and min(h,w) < self.crop_size:
|
495 |
+
# have to upscale to be able to crop - this just uses bilinear
|
496 |
+
image = self.rescaler(image=image)["image"]
|
497 |
+
if self.up_factor is None:
|
498 |
+
image = self.preprocessor(image=image)["image"]
|
499 |
+
example["image"] = image
|
500 |
+
else:
|
501 |
+
lr = imscale(image, self.up_factor, keepshapes=True,
|
502 |
+
keepmode=self.keep_mode)
|
503 |
+
|
504 |
+
out = self.preprocessor(image=image, lr=lr)
|
505 |
+
example["image"] = out["image"]
|
506 |
+
example["lr"] = out["lr"]
|
507 |
+
|
508 |
+
return example
|
509 |
+
|
510 |
+
class ImageNetScaleTrain(ImageNetScale):
|
511 |
+
def __init__(self, random_crop=True, **kwargs):
|
512 |
+
super().__init__(random_crop=random_crop, **kwargs)
|
513 |
+
|
514 |
+
def get_base(self):
|
515 |
+
return ImageNetTrain()
|
516 |
+
|
517 |
+
class ImageNetScaleValidation(ImageNetScale):
|
518 |
+
def get_base(self):
|
519 |
+
return ImageNetValidation()
|
520 |
+
|
521 |
+
|
522 |
+
from skimage.feature import canny
|
523 |
+
from skimage.color import rgb2gray
|
524 |
+
|
525 |
+
|
526 |
+
class ImageNetEdges(ImageNetScale):
|
527 |
+
def __init__(self, up_factor=1, **kwargs):
|
528 |
+
super().__init__(up_factor=1, **kwargs)
|
529 |
+
|
530 |
+
def __getitem__(self, i):
|
531 |
+
example = self.base[i]
|
532 |
+
image = example["image"]
|
533 |
+
h,w,c = image.shape
|
534 |
+
if self.crop_size and min(h,w) < self.crop_size:
|
535 |
+
# have to upscale to be able to crop - this just uses bilinear
|
536 |
+
image = self.rescaler(image=image)["image"]
|
537 |
+
|
538 |
+
lr = canny(rgb2gray(image), sigma=2)
|
539 |
+
lr = lr.astype(np.float32)
|
540 |
+
lr = lr[:,:,None][:,:,[0,0,0]]
|
541 |
+
|
542 |
+
out = self.preprocessor(image=image, lr=lr)
|
543 |
+
example["image"] = out["image"]
|
544 |
+
example["lr"] = out["lr"]
|
545 |
+
|
546 |
+
return example
|
547 |
+
|
548 |
+
|
549 |
+
class ImageNetEdgesTrain(ImageNetEdges):
|
550 |
+
def __init__(self, random_crop=True, **kwargs):
|
551 |
+
super().__init__(random_crop=random_crop, **kwargs)
|
552 |
+
|
553 |
+
def get_base(self):
|
554 |
+
return ImageNetTrain()
|
555 |
+
|
556 |
+
class ImageNetEdgesValidation(ImageNetEdges):
|
557 |
+
def get_base(self):
|
558 |
+
return ImageNetValidation()
|
taming/data/open_images_helper.py
ADDED
@@ -0,0 +1,379 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
open_images_unify_categories_for_coco = {
|
2 |
+
'/m/03bt1vf': '/m/01g317',
|
3 |
+
'/m/04yx4': '/m/01g317',
|
4 |
+
'/m/05r655': '/m/01g317',
|
5 |
+
'/m/01bl7v': '/m/01g317',
|
6 |
+
'/m/0cnyhnx': '/m/01xq0k1',
|
7 |
+
'/m/01226z': '/m/018xm',
|
8 |
+
'/m/05ctyq': '/m/018xm',
|
9 |
+
'/m/058qzx': '/m/04ctx',
|
10 |
+
'/m/06pcq': '/m/0l515',
|
11 |
+
'/m/03m3pdh': '/m/02crq1',
|
12 |
+
'/m/046dlr': '/m/01x3z',
|
13 |
+
'/m/0h8mzrc': '/m/01x3z',
|
14 |
+
}
|
15 |
+
|
16 |
+
|
17 |
+
top_300_classes_plus_coco_compatibility = [
|
18 |
+
('Man', 1060962),
|
19 |
+
('Clothing', 986610),
|
20 |
+
('Tree', 748162),
|
21 |
+
('Woman', 611896),
|
22 |
+
('Person', 610294),
|
23 |
+
('Human face', 442948),
|
24 |
+
('Girl', 175399),
|
25 |
+
('Building', 162147),
|
26 |
+
('Car', 159135),
|
27 |
+
('Plant', 155704),
|
28 |
+
('Human body', 137073),
|
29 |
+
('Flower', 133128),
|
30 |
+
('Window', 127485),
|
31 |
+
('Human arm', 118380),
|
32 |
+
('House', 114365),
|
33 |
+
('Wheel', 111684),
|
34 |
+
('Suit', 99054),
|
35 |
+
('Human hair', 98089),
|
36 |
+
('Human head', 92763),
|
37 |
+
('Chair', 88624),
|
38 |
+
('Boy', 79849),
|
39 |
+
('Table', 73699),
|
40 |
+
('Jeans', 57200),
|
41 |
+
('Tire', 55725),
|
42 |
+
('Skyscraper', 53321),
|
43 |
+
('Food', 52400),
|
44 |
+
('Footwear', 50335),
|
45 |
+
('Dress', 50236),
|
46 |
+
('Human leg', 47124),
|
47 |
+
('Toy', 46636),
|
48 |
+
('Tower', 45605),
|
49 |
+
('Boat', 43486),
|
50 |
+
('Land vehicle', 40541),
|
51 |
+
('Bicycle wheel', 34646),
|
52 |
+
('Palm tree', 33729),
|
53 |
+
('Fashion accessory', 32914),
|
54 |
+
('Glasses', 31940),
|
55 |
+
('Bicycle', 31409),
|
56 |
+
('Furniture', 30656),
|
57 |
+
('Sculpture', 29643),
|
58 |
+
('Bottle', 27558),
|
59 |
+
('Dog', 26980),
|
60 |
+
('Snack', 26796),
|
61 |
+
('Human hand', 26664),
|
62 |
+
('Bird', 25791),
|
63 |
+
('Book', 25415),
|
64 |
+
('Guitar', 24386),
|
65 |
+
('Jacket', 23998),
|
66 |
+
('Poster', 22192),
|
67 |
+
('Dessert', 21284),
|
68 |
+
('Baked goods', 20657),
|
69 |
+
('Drink', 19754),
|
70 |
+
('Flag', 18588),
|
71 |
+
('Houseplant', 18205),
|
72 |
+
('Tableware', 17613),
|
73 |
+
('Airplane', 17218),
|
74 |
+
('Door', 17195),
|
75 |
+
('Sports uniform', 17068),
|
76 |
+
('Shelf', 16865),
|
77 |
+
('Drum', 16612),
|
78 |
+
('Vehicle', 16542),
|
79 |
+
('Microphone', 15269),
|
80 |
+
('Street light', 14957),
|
81 |
+
('Cat', 14879),
|
82 |
+
('Fruit', 13684),
|
83 |
+
('Fast food', 13536),
|
84 |
+
('Animal', 12932),
|
85 |
+
('Vegetable', 12534),
|
86 |
+
('Train', 12358),
|
87 |
+
('Horse', 11948),
|
88 |
+
('Flowerpot', 11728),
|
89 |
+
('Motorcycle', 11621),
|
90 |
+
('Fish', 11517),
|
91 |
+
('Desk', 11405),
|
92 |
+
('Helmet', 10996),
|
93 |
+
('Truck', 10915),
|
94 |
+
('Bus', 10695),
|
95 |
+
('Hat', 10532),
|
96 |
+
('Auto part', 10488),
|
97 |
+
('Musical instrument', 10303),
|
98 |
+
('Sunglasses', 10207),
|
99 |
+
('Picture frame', 10096),
|
100 |
+
('Sports equipment', 10015),
|
101 |
+
('Shorts', 9999),
|
102 |
+
('Wine glass', 9632),
|
103 |
+
('Duck', 9242),
|
104 |
+
('Wine', 9032),
|
105 |
+
('Rose', 8781),
|
106 |
+
('Tie', 8693),
|
107 |
+
('Butterfly', 8436),
|
108 |
+
('Beer', 7978),
|
109 |
+
('Cabinetry', 7956),
|
110 |
+
('Laptop', 7907),
|
111 |
+
('Insect', 7497),
|
112 |
+
('Goggles', 7363),
|
113 |
+
('Shirt', 7098),
|
114 |
+
('Dairy Product', 7021),
|
115 |
+
('Marine invertebrates', 7014),
|
116 |
+
('Cattle', 7006),
|
117 |
+
('Trousers', 6903),
|
118 |
+
('Van', 6843),
|
119 |
+
('Billboard', 6777),
|
120 |
+
('Balloon', 6367),
|
121 |
+
('Human nose', 6103),
|
122 |
+
('Tent', 6073),
|
123 |
+
('Camera', 6014),
|
124 |
+
('Doll', 6002),
|
125 |
+
('Coat', 5951),
|
126 |
+
('Mobile phone', 5758),
|
127 |
+
('Swimwear', 5729),
|
128 |
+
('Strawberry', 5691),
|
129 |
+
('Stairs', 5643),
|
130 |
+
('Goose', 5599),
|
131 |
+
('Umbrella', 5536),
|
132 |
+
('Cake', 5508),
|
133 |
+
('Sun hat', 5475),
|
134 |
+
('Bench', 5310),
|
135 |
+
('Bookcase', 5163),
|
136 |
+
('Bee', 5140),
|
137 |
+
('Computer monitor', 5078),
|
138 |
+
('Hiking equipment', 4983),
|
139 |
+
('Office building', 4981),
|
140 |
+
('Coffee cup', 4748),
|
141 |
+
('Curtain', 4685),
|
142 |
+
('Plate', 4651),
|
143 |
+
('Box', 4621),
|
144 |
+
('Tomato', 4595),
|
145 |
+
('Coffee table', 4529),
|
146 |
+
('Office supplies', 4473),
|
147 |
+
('Maple', 4416),
|
148 |
+
('Muffin', 4365),
|
149 |
+
('Cocktail', 4234),
|
150 |
+
('Castle', 4197),
|
151 |
+
('Couch', 4134),
|
152 |
+
('Pumpkin', 3983),
|
153 |
+
('Computer keyboard', 3960),
|
154 |
+
('Human mouth', 3926),
|
155 |
+
('Christmas tree', 3893),
|
156 |
+
('Mushroom', 3883),
|
157 |
+
('Swimming pool', 3809),
|
158 |
+
('Pastry', 3799),
|
159 |
+
('Lavender (Plant)', 3769),
|
160 |
+
('Football helmet', 3732),
|
161 |
+
('Bread', 3648),
|
162 |
+
('Traffic sign', 3628),
|
163 |
+
('Common sunflower', 3597),
|
164 |
+
('Television', 3550),
|
165 |
+
('Bed', 3525),
|
166 |
+
('Cookie', 3485),
|
167 |
+
('Fountain', 3484),
|
168 |
+
('Paddle', 3447),
|
169 |
+
('Bicycle helmet', 3429),
|
170 |
+
('Porch', 3420),
|
171 |
+
('Deer', 3387),
|
172 |
+
('Fedora', 3339),
|
173 |
+
('Canoe', 3338),
|
174 |
+
('Carnivore', 3266),
|
175 |
+
('Bowl', 3202),
|
176 |
+
('Human eye', 3166),
|
177 |
+
('Ball', 3118),
|
178 |
+
('Pillow', 3077),
|
179 |
+
('Salad', 3061),
|
180 |
+
('Beetle', 3060),
|
181 |
+
('Orange', 3050),
|
182 |
+
('Drawer', 2958),
|
183 |
+
('Platter', 2937),
|
184 |
+
('Elephant', 2921),
|
185 |
+
('Seafood', 2921),
|
186 |
+
('Monkey', 2915),
|
187 |
+
('Countertop', 2879),
|
188 |
+
('Watercraft', 2831),
|
189 |
+
('Helicopter', 2805),
|
190 |
+
('Kitchen appliance', 2797),
|
191 |
+
('Personal flotation device', 2781),
|
192 |
+
('Swan', 2739),
|
193 |
+
('Lamp', 2711),
|
194 |
+
('Boot', 2695),
|
195 |
+
('Bronze sculpture', 2693),
|
196 |
+
('Chicken', 2677),
|
197 |
+
('Taxi', 2643),
|
198 |
+
('Juice', 2615),
|
199 |
+
('Cowboy hat', 2604),
|
200 |
+
('Apple', 2600),
|
201 |
+
('Tin can', 2590),
|
202 |
+
('Necklace', 2564),
|
203 |
+
('Ice cream', 2560),
|
204 |
+
('Human beard', 2539),
|
205 |
+
('Coin', 2536),
|
206 |
+
('Candle', 2515),
|
207 |
+
('Cart', 2512),
|
208 |
+
('High heels', 2441),
|
209 |
+
('Weapon', 2433),
|
210 |
+
('Handbag', 2406),
|
211 |
+
('Penguin', 2396),
|
212 |
+
('Rifle', 2352),
|
213 |
+
('Violin', 2336),
|
214 |
+
('Skull', 2304),
|
215 |
+
('Lantern', 2285),
|
216 |
+
('Scarf', 2269),
|
217 |
+
('Saucer', 2225),
|
218 |
+
('Sheep', 2215),
|
219 |
+
('Vase', 2189),
|
220 |
+
('Lily', 2180),
|
221 |
+
('Mug', 2154),
|
222 |
+
('Parrot', 2140),
|
223 |
+
('Human ear', 2137),
|
224 |
+
('Sandal', 2115),
|
225 |
+
('Lizard', 2100),
|
226 |
+
('Kitchen & dining room table', 2063),
|
227 |
+
('Spider', 1977),
|
228 |
+
('Coffee', 1974),
|
229 |
+
('Goat', 1926),
|
230 |
+
('Squirrel', 1922),
|
231 |
+
('Cello', 1913),
|
232 |
+
('Sushi', 1881),
|
233 |
+
('Tortoise', 1876),
|
234 |
+
('Pizza', 1870),
|
235 |
+
('Studio couch', 1864),
|
236 |
+
('Barrel', 1862),
|
237 |
+
('Cosmetics', 1841),
|
238 |
+
('Moths and butterflies', 1841),
|
239 |
+
('Convenience store', 1817),
|
240 |
+
('Watch', 1792),
|
241 |
+
('Home appliance', 1786),
|
242 |
+
('Harbor seal', 1780),
|
243 |
+
('Luggage and bags', 1756),
|
244 |
+
('Vehicle registration plate', 1754),
|
245 |
+
('Shrimp', 1751),
|
246 |
+
('Jellyfish', 1730),
|
247 |
+
('French fries', 1723),
|
248 |
+
('Egg (Food)', 1698),
|
249 |
+
('Football', 1697),
|
250 |
+
('Musical keyboard', 1683),
|
251 |
+
('Falcon', 1674),
|
252 |
+
('Candy', 1660),
|
253 |
+
('Medical equipment', 1654),
|
254 |
+
('Eagle', 1651),
|
255 |
+
('Dinosaur', 1634),
|
256 |
+
('Surfboard', 1630),
|
257 |
+
('Tank', 1628),
|
258 |
+
('Grape', 1624),
|
259 |
+
('Lion', 1624),
|
260 |
+
('Owl', 1622),
|
261 |
+
('Ski', 1613),
|
262 |
+
('Waste container', 1606),
|
263 |
+
('Frog', 1591),
|
264 |
+
('Sparrow', 1585),
|
265 |
+
('Rabbit', 1581),
|
266 |
+
('Pen', 1546),
|
267 |
+
('Sea lion', 1537),
|
268 |
+
('Spoon', 1521),
|
269 |
+
('Sink', 1512),
|
270 |
+
('Teddy bear', 1507),
|
271 |
+
('Bull', 1495),
|
272 |
+
('Sofa bed', 1490),
|
273 |
+
('Dragonfly', 1479),
|
274 |
+
('Brassiere', 1478),
|
275 |
+
('Chest of drawers', 1472),
|
276 |
+
('Aircraft', 1466),
|
277 |
+
('Human foot', 1463),
|
278 |
+
('Pig', 1455),
|
279 |
+
('Fork', 1454),
|
280 |
+
('Antelope', 1438),
|
281 |
+
('Tripod', 1427),
|
282 |
+
('Tool', 1424),
|
283 |
+
('Cheese', 1422),
|
284 |
+
('Lemon', 1397),
|
285 |
+
('Hamburger', 1393),
|
286 |
+
('Dolphin', 1390),
|
287 |
+
('Mirror', 1390),
|
288 |
+
('Marine mammal', 1387),
|
289 |
+
('Giraffe', 1385),
|
290 |
+
('Snake', 1368),
|
291 |
+
('Gondola', 1364),
|
292 |
+
('Wheelchair', 1360),
|
293 |
+
('Piano', 1358),
|
294 |
+
('Cupboard', 1348),
|
295 |
+
('Banana', 1345),
|
296 |
+
('Trumpet', 1335),
|
297 |
+
('Lighthouse', 1333),
|
298 |
+
('Invertebrate', 1317),
|
299 |
+
('Carrot', 1268),
|
300 |
+
('Sock', 1260),
|
301 |
+
('Tiger', 1241),
|
302 |
+
('Camel', 1224),
|
303 |
+
('Parachute', 1224),
|
304 |
+
('Bathroom accessory', 1223),
|
305 |
+
('Earrings', 1221),
|
306 |
+
('Headphones', 1218),
|
307 |
+
('Skirt', 1198),
|
308 |
+
('Skateboard', 1190),
|
309 |
+
('Sandwich', 1148),
|
310 |
+
('Saxophone', 1141),
|
311 |
+
('Goldfish', 1136),
|
312 |
+
('Stool', 1104),
|
313 |
+
('Traffic light', 1097),
|
314 |
+
('Shellfish', 1081),
|
315 |
+
('Backpack', 1079),
|
316 |
+
('Sea turtle', 1078),
|
317 |
+
('Cucumber', 1075),
|
318 |
+
('Tea', 1051),
|
319 |
+
('Toilet', 1047),
|
320 |
+
('Roller skates', 1040),
|
321 |
+
('Mule', 1039),
|
322 |
+
('Bust', 1031),
|
323 |
+
('Broccoli', 1030),
|
324 |
+
('Crab', 1020),
|
325 |
+
('Oyster', 1019),
|
326 |
+
('Cannon', 1012),
|
327 |
+
('Zebra', 1012),
|
328 |
+
('French horn', 1008),
|
329 |
+
('Grapefruit', 998),
|
330 |
+
('Whiteboard', 997),
|
331 |
+
('Zucchini', 997),
|
332 |
+
('Crocodile', 992),
|
333 |
+
|
334 |
+
('Clock', 960),
|
335 |
+
('Wall clock', 958),
|
336 |
+
|
337 |
+
('Doughnut', 869),
|
338 |
+
('Snail', 868),
|
339 |
+
|
340 |
+
('Baseball glove', 859),
|
341 |
+
|
342 |
+
('Panda', 830),
|
343 |
+
('Tennis racket', 830),
|
344 |
+
|
345 |
+
('Pear', 652),
|
346 |
+
|
347 |
+
('Bagel', 617),
|
348 |
+
('Oven', 616),
|
349 |
+
('Ladybug', 615),
|
350 |
+
('Shark', 615),
|
351 |
+
('Polar bear', 614),
|
352 |
+
('Ostrich', 609),
|
353 |
+
|
354 |
+
('Hot dog', 473),
|
355 |
+
('Microwave oven', 467),
|
356 |
+
('Fire hydrant', 20),
|
357 |
+
('Stop sign', 20),
|
358 |
+
('Parking meter', 20),
|
359 |
+
('Bear', 20),
|
360 |
+
('Flying disc', 20),
|
361 |
+
('Snowboard', 20),
|
362 |
+
('Tennis ball', 20),
|
363 |
+
('Kite', 20),
|
364 |
+
('Baseball bat', 20),
|
365 |
+
('Kitchen knife', 20),
|
366 |
+
('Knife', 20),
|
367 |
+
('Submarine sandwich', 20),
|
368 |
+
('Computer mouse', 20),
|
369 |
+
('Remote control', 20),
|
370 |
+
('Toaster', 20),
|
371 |
+
('Sink', 20),
|
372 |
+
('Refrigerator', 20),
|
373 |
+
('Alarm clock', 20),
|
374 |
+
('Wall clock', 20),
|
375 |
+
('Scissors', 20),
|
376 |
+
('Hair dryer', 20),
|
377 |
+
('Toothbrush', 20),
|
378 |
+
('Suitcase', 20)
|
379 |
+
]
|
taming/data/sflckr.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import numpy as np
|
3 |
+
import cv2
|
4 |
+
import albumentations
|
5 |
+
from PIL import Image
|
6 |
+
from torch.utils.data import Dataset
|
7 |
+
|
8 |
+
|
9 |
+
class SegmentationBase(Dataset):
|
10 |
+
def __init__(self,
|
11 |
+
data_csv, data_root, segmentation_root,
|
12 |
+
size=None, random_crop=False, interpolation="bicubic",
|
13 |
+
n_labels=182, shift_segmentation=False,
|
14 |
+
):
|
15 |
+
self.n_labels = n_labels
|
16 |
+
self.shift_segmentation = shift_segmentation
|
17 |
+
self.data_csv = data_csv
|
18 |
+
self.data_root = data_root
|
19 |
+
self.segmentation_root = segmentation_root
|
20 |
+
with open(self.data_csv, "r") as f:
|
21 |
+
self.image_paths = f.read().splitlines()
|
22 |
+
self._length = len(self.image_paths)
|
23 |
+
self.labels = {
|
24 |
+
"relative_file_path_": [l for l in self.image_paths],
|
25 |
+
"file_path_": [os.path.join(self.data_root, l)
|
26 |
+
for l in self.image_paths],
|
27 |
+
"segmentation_path_": [os.path.join(self.segmentation_root, l.replace(".jpg", ".png"))
|
28 |
+
for l in self.image_paths]
|
29 |
+
}
|
30 |
+
|
31 |
+
size = None if size is not None and size<=0 else size
|
32 |
+
self.size = size
|
33 |
+
if self.size is not None:
|
34 |
+
self.interpolation = interpolation
|
35 |
+
self.interpolation = {
|
36 |
+
"nearest": cv2.INTER_NEAREST,
|
37 |
+
"bilinear": cv2.INTER_LINEAR,
|
38 |
+
"bicubic": cv2.INTER_CUBIC,
|
39 |
+
"area": cv2.INTER_AREA,
|
40 |
+
"lanczos": cv2.INTER_LANCZOS4}[self.interpolation]
|
41 |
+
self.image_rescaler = albumentations.SmallestMaxSize(max_size=self.size,
|
42 |
+
interpolation=self.interpolation)
|
43 |
+
self.segmentation_rescaler = albumentations.SmallestMaxSize(max_size=self.size,
|
44 |
+
interpolation=cv2.INTER_NEAREST)
|
45 |
+
self.center_crop = not random_crop
|
46 |
+
if self.center_crop:
|
47 |
+
self.cropper = albumentations.CenterCrop(height=self.size, width=self.size)
|
48 |
+
else:
|
49 |
+
self.cropper = albumentations.RandomCrop(height=self.size, width=self.size)
|
50 |
+
self.preprocessor = self.cropper
|
51 |
+
|
52 |
+
def __len__(self):
|
53 |
+
return self._length
|
54 |
+
|
55 |
+
def __getitem__(self, i):
|
56 |
+
example = dict((k, self.labels[k][i]) for k in self.labels)
|
57 |
+
image = Image.open(example["file_path_"])
|
58 |
+
if not image.mode == "RGB":
|
59 |
+
image = image.convert("RGB")
|
60 |
+
image = np.array(image).astype(np.uint8)
|
61 |
+
if self.size is not None:
|
62 |
+
image = self.image_rescaler(image=image)["image"]
|
63 |
+
segmentation = Image.open(example["segmentation_path_"])
|
64 |
+
assert segmentation.mode == "L", segmentation.mode
|
65 |
+
segmentation = np.array(segmentation).astype(np.uint8)
|
66 |
+
if self.shift_segmentation:
|
67 |
+
# used to support segmentations containing unlabeled==255 label
|
68 |
+
segmentation = segmentation+1
|
69 |
+
if self.size is not None:
|
70 |
+
segmentation = self.segmentation_rescaler(image=segmentation)["image"]
|
71 |
+
if self.size is not None:
|
72 |
+
processed = self.preprocessor(image=image,
|
73 |
+
mask=segmentation
|
74 |
+
)
|
75 |
+
else:
|
76 |
+
processed = {"image": image,
|
77 |
+
"mask": segmentation
|
78 |
+
}
|
79 |
+
example["image"] = (processed["image"]/127.5 - 1.0).astype(np.float32)
|
80 |
+
segmentation = processed["mask"]
|
81 |
+
onehot = np.eye(self.n_labels)[segmentation]
|
82 |
+
example["segmentation"] = onehot
|
83 |
+
return example
|
84 |
+
|
85 |
+
|
86 |
+
class Examples(SegmentationBase):
|
87 |
+
def __init__(self, size=None, random_crop=False, interpolation="bicubic"):
|
88 |
+
super().__init__(data_csv="data/sflckr_examples.txt",
|
89 |
+
data_root="data/sflckr_images",
|
90 |
+
segmentation_root="data/sflckr_segmentations",
|
91 |
+
size=size, random_crop=random_crop, interpolation=interpolation)
|
taming/data/utils.py
ADDED
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import collections
|
2 |
+
import os
|
3 |
+
import tarfile
|
4 |
+
import urllib
|
5 |
+
import zipfile
|
6 |
+
from pathlib import Path
|
7 |
+
|
8 |
+
import numpy as np
|
9 |
+
import torch
|
10 |
+
from taming.data.helper_types import Annotation
|
11 |
+
#from torch._six import string_classes
|
12 |
+
from torch.utils.data._utils.collate import np_str_obj_array_pattern, default_collate_err_msg_format
|
13 |
+
from tqdm import tqdm
|
14 |
+
|
15 |
+
string_classes = (str,bytes)
|
16 |
+
|
17 |
+
|
18 |
+
def unpack(path):
|
19 |
+
if path.endswith("tar.gz"):
|
20 |
+
with tarfile.open(path, "r:gz") as tar:
|
21 |
+
tar.extractall(path=os.path.split(path)[0])
|
22 |
+
elif path.endswith("tar"):
|
23 |
+
with tarfile.open(path, "r:") as tar:
|
24 |
+
tar.extractall(path=os.path.split(path)[0])
|
25 |
+
elif path.endswith("zip"):
|
26 |
+
with zipfile.ZipFile(path, "r") as f:
|
27 |
+
f.extractall(path=os.path.split(path)[0])
|
28 |
+
else:
|
29 |
+
raise NotImplementedError(
|
30 |
+
"Unknown file extension: {}".format(os.path.splitext(path)[1])
|
31 |
+
)
|
32 |
+
|
33 |
+
|
34 |
+
def reporthook(bar):
|
35 |
+
"""tqdm progress bar for downloads."""
|
36 |
+
|
37 |
+
def hook(b=1, bsize=1, tsize=None):
|
38 |
+
if tsize is not None:
|
39 |
+
bar.total = tsize
|
40 |
+
bar.update(b * bsize - bar.n)
|
41 |
+
|
42 |
+
return hook
|
43 |
+
|
44 |
+
|
45 |
+
def get_root(name):
|
46 |
+
base = "data/"
|
47 |
+
root = os.path.join(base, name)
|
48 |
+
os.makedirs(root, exist_ok=True)
|
49 |
+
return root
|
50 |
+
|
51 |
+
|
52 |
+
def is_prepared(root):
|
53 |
+
return Path(root).joinpath(".ready").exists()
|
54 |
+
|
55 |
+
|
56 |
+
def mark_prepared(root):
|
57 |
+
Path(root).joinpath(".ready").touch()
|
58 |
+
|
59 |
+
|
60 |
+
def prompt_download(file_, source, target_dir, content_dir=None):
|
61 |
+
targetpath = os.path.join(target_dir, file_)
|
62 |
+
while not os.path.exists(targetpath):
|
63 |
+
if content_dir is not None and os.path.exists(
|
64 |
+
os.path.join(target_dir, content_dir)
|
65 |
+
):
|
66 |
+
break
|
67 |
+
print(
|
68 |
+
"Please download '{}' from '{}' to '{}'.".format(file_, source, targetpath)
|
69 |
+
)
|
70 |
+
if content_dir is not None:
|
71 |
+
print(
|
72 |
+
"Or place its content into '{}'.".format(
|
73 |
+
os.path.join(target_dir, content_dir)
|
74 |
+
)
|
75 |
+
)
|
76 |
+
input("Press Enter when done...")
|
77 |
+
return targetpath
|
78 |
+
|
79 |
+
|
80 |
+
def download_url(file_, url, target_dir):
|
81 |
+
targetpath = os.path.join(target_dir, file_)
|
82 |
+
os.makedirs(target_dir, exist_ok=True)
|
83 |
+
with tqdm(
|
84 |
+
unit="B", unit_scale=True, unit_divisor=1024, miniters=1, desc=file_
|
85 |
+
) as bar:
|
86 |
+
urllib.request.urlretrieve(url, targetpath, reporthook=reporthook(bar))
|
87 |
+
return targetpath
|
88 |
+
|
89 |
+
|
90 |
+
def download_urls(urls, target_dir):
|
91 |
+
paths = dict()
|
92 |
+
for fname, url in urls.items():
|
93 |
+
outpath = download_url(fname, url, target_dir)
|
94 |
+
paths[fname] = outpath
|
95 |
+
return paths
|
96 |
+
|
97 |
+
|
98 |
+
def quadratic_crop(x, bbox, alpha=1.0):
|
99 |
+
"""bbox is xmin, ymin, xmax, ymax"""
|
100 |
+
im_h, im_w = x.shape[:2]
|
101 |
+
bbox = np.array(bbox, dtype=np.float32)
|
102 |
+
bbox = np.clip(bbox, 0, max(im_h, im_w))
|
103 |
+
center = 0.5 * (bbox[0] + bbox[2]), 0.5 * (bbox[1] + bbox[3])
|
104 |
+
w = bbox[2] - bbox[0]
|
105 |
+
h = bbox[3] - bbox[1]
|
106 |
+
l = int(alpha * max(w, h))
|
107 |
+
l = max(l, 2)
|
108 |
+
|
109 |
+
required_padding = -1 * min(
|
110 |
+
center[0] - l, center[1] - l, im_w - (center[0] + l), im_h - (center[1] + l)
|
111 |
+
)
|
112 |
+
required_padding = int(np.ceil(required_padding))
|
113 |
+
if required_padding > 0:
|
114 |
+
padding = [
|
115 |
+
[required_padding, required_padding],
|
116 |
+
[required_padding, required_padding],
|
117 |
+
]
|
118 |
+
padding += [[0, 0]] * (len(x.shape) - 2)
|
119 |
+
x = np.pad(x, padding, "reflect")
|
120 |
+
center = center[0] + required_padding, center[1] + required_padding
|
121 |
+
xmin = int(center[0] - l / 2)
|
122 |
+
ymin = int(center[1] - l / 2)
|
123 |
+
return np.array(x[ymin : ymin + l, xmin : xmin + l, ...])
|
124 |
+
|
125 |
+
|
126 |
+
def custom_collate(batch):
|
127 |
+
r"""source: pytorch 1.9.0, only one modification to original code """
|
128 |
+
|
129 |
+
elem = batch[0]
|
130 |
+
elem_type = type(elem)
|
131 |
+
if isinstance(elem, torch.Tensor):
|
132 |
+
out = None
|
133 |
+
if torch.utils.data.get_worker_info() is not None:
|
134 |
+
# If we're in a background process, concatenate directly into a
|
135 |
+
# shared memory tensor to avoid an extra copy
|
136 |
+
numel = sum([x.numel() for x in batch])
|
137 |
+
storage = elem.storage()._new_shared(numel)
|
138 |
+
out = elem.new(storage)
|
139 |
+
return torch.stack(batch, 0, out=out)
|
140 |
+
elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
|
141 |
+
and elem_type.__name__ != 'string_':
|
142 |
+
if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
|
143 |
+
# array of string classes and object
|
144 |
+
if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
|
145 |
+
raise TypeError(default_collate_err_msg_format.format(elem.dtype))
|
146 |
+
|
147 |
+
return custom_collate([torch.as_tensor(b) for b in batch])
|
148 |
+
elif elem.shape == (): # scalars
|
149 |
+
return torch.as_tensor(batch)
|
150 |
+
elif isinstance(elem, float):
|
151 |
+
return torch.tensor(batch, dtype=torch.float64)
|
152 |
+
elif isinstance(elem, int):
|
153 |
+
return torch.tensor(batch)
|
154 |
+
elif isinstance(elem, string_classes):
|
155 |
+
return batch
|
156 |
+
elif isinstance(elem, collections.abc.Mapping):
|
157 |
+
return {key: custom_collate([d[key] for d in batch]) for key in elem}
|
158 |
+
elif isinstance(elem, tuple) and hasattr(elem, '_fields'): # namedtuple
|
159 |
+
return elem_type(*(custom_collate(samples) for samples in zip(*batch)))
|
160 |
+
if isinstance(elem, collections.abc.Sequence) and isinstance(elem[0], Annotation): # added
|
161 |
+
return batch # added
|
162 |
+
elif isinstance(elem, collections.abc.Sequence):
|
163 |
+
# check to make sure that the elements in batch have consistent size
|
164 |
+
it = iter(batch)
|
165 |
+
elem_size = len(next(it))
|
166 |
+
if not all(len(elem) == elem_size for elem in it):
|
167 |
+
raise RuntimeError('each element in list of batch should be of equal size')
|
168 |
+
transposed = zip(*batch)
|
169 |
+
return [custom_collate(samples) for samples in transposed]
|
170 |
+
|
171 |
+
raise TypeError(default_collate_err_msg_format.format(elem_type))
|
taming/lr_scheduler.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
|
3 |
+
|
4 |
+
class LambdaWarmUpCosineScheduler:
|
5 |
+
"""
|
6 |
+
note: use with a base_lr of 1.0
|
7 |
+
"""
|
8 |
+
def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0):
|
9 |
+
self.lr_warm_up_steps = warm_up_steps
|
10 |
+
self.lr_start = lr_start
|
11 |
+
self.lr_min = lr_min
|
12 |
+
self.lr_max = lr_max
|
13 |
+
self.lr_max_decay_steps = max_decay_steps
|
14 |
+
self.last_lr = 0.
|
15 |
+
self.verbosity_interval = verbosity_interval
|
16 |
+
|
17 |
+
def schedule(self, n):
|
18 |
+
if self.verbosity_interval > 0:
|
19 |
+
if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_lr}")
|
20 |
+
if n < self.lr_warm_up_steps:
|
21 |
+
lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * n + self.lr_start
|
22 |
+
self.last_lr = lr
|
23 |
+
return lr
|
24 |
+
else:
|
25 |
+
t = (n - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps)
|
26 |
+
t = min(t, 1.0)
|
27 |
+
lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * (
|
28 |
+
1 + np.cos(t * np.pi))
|
29 |
+
self.last_lr = lr
|
30 |
+
return lr
|
31 |
+
|
32 |
+
def __call__(self, n):
|
33 |
+
return self.schedule(n)
|
34 |
+
|
taming/models/__pycache__/vqgan.cpython-312.pyc
ADDED
Binary file (21.7 kB). View file
|
|
taming/models/cond_transformer.py
ADDED
@@ -0,0 +1,352 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os, math
|
2 |
+
import torch
|
3 |
+
import torch.nn.functional as F
|
4 |
+
import pytorch_lightning as pl
|
5 |
+
|
6 |
+
from main import instantiate_from_config
|
7 |
+
from taming.modules.util import SOSProvider
|
8 |
+
|
9 |
+
|
10 |
+
def disabled_train(self, mode=True):
|
11 |
+
"""Overwrite model.train with this function to make sure train/eval mode
|
12 |
+
does not change anymore."""
|
13 |
+
return self
|
14 |
+
|
15 |
+
|
16 |
+
class Net2NetTransformer(pl.LightningModule):
|
17 |
+
def __init__(self,
|
18 |
+
transformer_config,
|
19 |
+
first_stage_config,
|
20 |
+
cond_stage_config,
|
21 |
+
permuter_config=None,
|
22 |
+
ckpt_path=None,
|
23 |
+
ignore_keys=[],
|
24 |
+
first_stage_key="image",
|
25 |
+
cond_stage_key="depth",
|
26 |
+
downsample_cond_size=-1,
|
27 |
+
pkeep=1.0,
|
28 |
+
sos_token=0,
|
29 |
+
unconditional=False,
|
30 |
+
):
|
31 |
+
super().__init__()
|
32 |
+
self.be_unconditional = unconditional
|
33 |
+
self.sos_token = sos_token
|
34 |
+
self.first_stage_key = first_stage_key
|
35 |
+
self.cond_stage_key = cond_stage_key
|
36 |
+
self.init_first_stage_from_ckpt(first_stage_config)
|
37 |
+
self.init_cond_stage_from_ckpt(cond_stage_config)
|
38 |
+
if permuter_config is None:
|
39 |
+
permuter_config = {"target": "taming.modules.transformer.permuter.Identity"}
|
40 |
+
self.permuter = instantiate_from_config(config=permuter_config)
|
41 |
+
self.transformer = instantiate_from_config(config=transformer_config)
|
42 |
+
|
43 |
+
if ckpt_path is not None:
|
44 |
+
self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
|
45 |
+
self.downsample_cond_size = downsample_cond_size
|
46 |
+
self.pkeep = pkeep
|
47 |
+
|
48 |
+
def init_from_ckpt(self, path, ignore_keys=list()):
|
49 |
+
sd = torch.load(path, map_location="cpu")["state_dict"]
|
50 |
+
for k in sd.keys():
|
51 |
+
for ik in ignore_keys:
|
52 |
+
if k.startswith(ik):
|
53 |
+
self.print("Deleting key {} from state_dict.".format(k))
|
54 |
+
del sd[k]
|
55 |
+
self.load_state_dict(sd, strict=False)
|
56 |
+
print(f"Restored from {path}")
|
57 |
+
|
58 |
+
def init_first_stage_from_ckpt(self, config):
|
59 |
+
model = instantiate_from_config(config)
|
60 |
+
model = model.eval()
|
61 |
+
model.train = disabled_train
|
62 |
+
self.first_stage_model = model
|
63 |
+
|
64 |
+
def init_cond_stage_from_ckpt(self, config):
|
65 |
+
if config == "__is_first_stage__":
|
66 |
+
print("Using first stage also as cond stage.")
|
67 |
+
self.cond_stage_model = self.first_stage_model
|
68 |
+
elif config == "__is_unconditional__" or self.be_unconditional:
|
69 |
+
print(f"Using no cond stage. Assuming the training is intended to be unconditional. "
|
70 |
+
f"Prepending {self.sos_token} as a sos token.")
|
71 |
+
self.be_unconditional = True
|
72 |
+
self.cond_stage_key = self.first_stage_key
|
73 |
+
self.cond_stage_model = SOSProvider(self.sos_token)
|
74 |
+
else:
|
75 |
+
model = instantiate_from_config(config)
|
76 |
+
model = model.eval()
|
77 |
+
model.train = disabled_train
|
78 |
+
self.cond_stage_model = model
|
79 |
+
|
80 |
+
def forward(self, x, c):
|
81 |
+
# one step to produce the logits
|
82 |
+
_, z_indices = self.encode_to_z(x)
|
83 |
+
_, c_indices = self.encode_to_c(c)
|
84 |
+
|
85 |
+
if self.training and self.pkeep < 1.0:
|
86 |
+
mask = torch.bernoulli(self.pkeep*torch.ones(z_indices.shape,
|
87 |
+
device=z_indices.device))
|
88 |
+
mask = mask.round().to(dtype=torch.int64)
|
89 |
+
r_indices = torch.randint_like(z_indices, self.transformer.config.vocab_size)
|
90 |
+
a_indices = mask*z_indices+(1-mask)*r_indices
|
91 |
+
else:
|
92 |
+
a_indices = z_indices
|
93 |
+
|
94 |
+
cz_indices = torch.cat((c_indices, a_indices), dim=1)
|
95 |
+
|
96 |
+
# target includes all sequence elements (no need to handle first one
|
97 |
+
# differently because we are conditioning)
|
98 |
+
target = z_indices
|
99 |
+
# make the prediction
|
100 |
+
logits, _ = self.transformer(cz_indices[:, :-1])
|
101 |
+
# cut off conditioning outputs - output i corresponds to p(z_i | z_{<i}, c)
|
102 |
+
logits = logits[:, c_indices.shape[1]-1:]
|
103 |
+
|
104 |
+
return logits, target
|
105 |
+
|
106 |
+
def top_k_logits(self, logits, k):
|
107 |
+
v, ix = torch.topk(logits, k)
|
108 |
+
out = logits.clone()
|
109 |
+
out[out < v[..., [-1]]] = -float('Inf')
|
110 |
+
return out
|
111 |
+
|
112 |
+
@torch.no_grad()
|
113 |
+
def sample(self, x, c, steps, temperature=1.0, sample=False, top_k=None,
|
114 |
+
callback=lambda k: None):
|
115 |
+
x = torch.cat((c,x),dim=1)
|
116 |
+
block_size = self.transformer.get_block_size()
|
117 |
+
assert not self.transformer.training
|
118 |
+
if self.pkeep <= 0.0:
|
119 |
+
# one pass suffices since input is pure noise anyway
|
120 |
+
assert len(x.shape)==2
|
121 |
+
noise_shape = (x.shape[0], steps-1)
|
122 |
+
#noise = torch.randint(self.transformer.config.vocab_size, noise_shape).to(x)
|
123 |
+
noise = c.clone()[:,x.shape[1]-c.shape[1]:-1]
|
124 |
+
x = torch.cat((x,noise),dim=1)
|
125 |
+
logits, _ = self.transformer(x)
|
126 |
+
# take all logits for now and scale by temp
|
127 |
+
logits = logits / temperature
|
128 |
+
# optionally crop probabilities to only the top k options
|
129 |
+
if top_k is not None:
|
130 |
+
logits = self.top_k_logits(logits, top_k)
|
131 |
+
# apply softmax to convert to probabilities
|
132 |
+
probs = F.softmax(logits, dim=-1)
|
133 |
+
# sample from the distribution or take the most likely
|
134 |
+
if sample:
|
135 |
+
shape = probs.shape
|
136 |
+
probs = probs.reshape(shape[0]*shape[1],shape[2])
|
137 |
+
ix = torch.multinomial(probs, num_samples=1)
|
138 |
+
probs = probs.reshape(shape[0],shape[1],shape[2])
|
139 |
+
ix = ix.reshape(shape[0],shape[1])
|
140 |
+
else:
|
141 |
+
_, ix = torch.topk(probs, k=1, dim=-1)
|
142 |
+
# cut off conditioning
|
143 |
+
x = ix[:, c.shape[1]-1:]
|
144 |
+
else:
|
145 |
+
for k in range(steps):
|
146 |
+
callback(k)
|
147 |
+
assert x.size(1) <= block_size # make sure model can see conditioning
|
148 |
+
x_cond = x if x.size(1) <= block_size else x[:, -block_size:] # crop context if needed
|
149 |
+
logits, _ = self.transformer(x_cond)
|
150 |
+
# pluck the logits at the final step and scale by temperature
|
151 |
+
logits = logits[:, -1, :] / temperature
|
152 |
+
# optionally crop probabilities to only the top k options
|
153 |
+
if top_k is not None:
|
154 |
+
logits = self.top_k_logits(logits, top_k)
|
155 |
+
# apply softmax to convert to probabilities
|
156 |
+
probs = F.softmax(logits, dim=-1)
|
157 |
+
# sample from the distribution or take the most likely
|
158 |
+
if sample:
|
159 |
+
ix = torch.multinomial(probs, num_samples=1)
|
160 |
+
else:
|
161 |
+
_, ix = torch.topk(probs, k=1, dim=-1)
|
162 |
+
# append to the sequence and continue
|
163 |
+
x = torch.cat((x, ix), dim=1)
|
164 |
+
# cut off conditioning
|
165 |
+
x = x[:, c.shape[1]:]
|
166 |
+
return x
|
167 |
+
|
168 |
+
@torch.no_grad()
|
169 |
+
def encode_to_z(self, x):
|
170 |
+
quant_z, _, info = self.first_stage_model.encode(x)
|
171 |
+
indices = info[2].view(quant_z.shape[0], -1)
|
172 |
+
indices = self.permuter(indices)
|
173 |
+
return quant_z, indices
|
174 |
+
|
175 |
+
@torch.no_grad()
|
176 |
+
def encode_to_c(self, c):
|
177 |
+
if self.downsample_cond_size > -1:
|
178 |
+
c = F.interpolate(c, size=(self.downsample_cond_size, self.downsample_cond_size))
|
179 |
+
quant_c, _, [_,_,indices] = self.cond_stage_model.encode(c)
|
180 |
+
if len(indices.shape) > 2:
|
181 |
+
indices = indices.view(c.shape[0], -1)
|
182 |
+
return quant_c, indices
|
183 |
+
|
184 |
+
@torch.no_grad()
|
185 |
+
def decode_to_img(self, index, zshape):
|
186 |
+
index = self.permuter(index, reverse=True)
|
187 |
+
bhwc = (zshape[0],zshape[2],zshape[3],zshape[1])
|
188 |
+
quant_z = self.first_stage_model.quantize.get_codebook_entry(
|
189 |
+
index.reshape(-1), shape=bhwc)
|
190 |
+
x = self.first_stage_model.decode(quant_z)
|
191 |
+
return x
|
192 |
+
|
193 |
+
@torch.no_grad()
|
194 |
+
def log_images(self, batch, temperature=None, top_k=None, callback=None, lr_interface=False, **kwargs):
|
195 |
+
log = dict()
|
196 |
+
|
197 |
+
N = 4
|
198 |
+
if lr_interface:
|
199 |
+
x, c = self.get_xc(batch, N, diffuse=False, upsample_factor=8)
|
200 |
+
else:
|
201 |
+
x, c = self.get_xc(batch, N)
|
202 |
+
x = x.to(device=self.device)
|
203 |
+
c = c.to(device=self.device)
|
204 |
+
|
205 |
+
quant_z, z_indices = self.encode_to_z(x)
|
206 |
+
quant_c, c_indices = self.encode_to_c(c)
|
207 |
+
|
208 |
+
# create a "half"" sample
|
209 |
+
z_start_indices = z_indices[:,:z_indices.shape[1]//2]
|
210 |
+
index_sample = self.sample(z_start_indices, c_indices,
|
211 |
+
steps=z_indices.shape[1]-z_start_indices.shape[1],
|
212 |
+
temperature=temperature if temperature is not None else 1.0,
|
213 |
+
sample=True,
|
214 |
+
top_k=top_k if top_k is not None else 100,
|
215 |
+
callback=callback if callback is not None else lambda k: None)
|
216 |
+
x_sample = self.decode_to_img(index_sample, quant_z.shape)
|
217 |
+
|
218 |
+
# sample
|
219 |
+
z_start_indices = z_indices[:, :0]
|
220 |
+
index_sample = self.sample(z_start_indices, c_indices,
|
221 |
+
steps=z_indices.shape[1],
|
222 |
+
temperature=temperature if temperature is not None else 1.0,
|
223 |
+
sample=True,
|
224 |
+
top_k=top_k if top_k is not None else 100,
|
225 |
+
callback=callback if callback is not None else lambda k: None)
|
226 |
+
x_sample_nopix = self.decode_to_img(index_sample, quant_z.shape)
|
227 |
+
|
228 |
+
# det sample
|
229 |
+
z_start_indices = z_indices[:, :0]
|
230 |
+
index_sample = self.sample(z_start_indices, c_indices,
|
231 |
+
steps=z_indices.shape[1],
|
232 |
+
sample=False,
|
233 |
+
callback=callback if callback is not None else lambda k: None)
|
234 |
+
x_sample_det = self.decode_to_img(index_sample, quant_z.shape)
|
235 |
+
|
236 |
+
# reconstruction
|
237 |
+
x_rec = self.decode_to_img(z_indices, quant_z.shape)
|
238 |
+
|
239 |
+
log["inputs"] = x
|
240 |
+
log["reconstructions"] = x_rec
|
241 |
+
|
242 |
+
if self.cond_stage_key in ["objects_bbox", "objects_center_points"]:
|
243 |
+
figure_size = (x_rec.shape[2], x_rec.shape[3])
|
244 |
+
dataset = kwargs["pl_module"].trainer.datamodule.datasets["validation"]
|
245 |
+
label_for_category_no = dataset.get_textual_label_for_category_no
|
246 |
+
plotter = dataset.conditional_builders[self.cond_stage_key].plot
|
247 |
+
log["conditioning"] = torch.zeros_like(log["reconstructions"])
|
248 |
+
for i in range(quant_c.shape[0]):
|
249 |
+
log["conditioning"][i] = plotter(quant_c[i], label_for_category_no, figure_size)
|
250 |
+
log["conditioning_rec"] = log["conditioning"]
|
251 |
+
elif self.cond_stage_key != "image":
|
252 |
+
cond_rec = self.cond_stage_model.decode(quant_c)
|
253 |
+
if self.cond_stage_key == "segmentation":
|
254 |
+
# get image from segmentation mask
|
255 |
+
num_classes = cond_rec.shape[1]
|
256 |
+
|
257 |
+
c = torch.argmax(c, dim=1, keepdim=True)
|
258 |
+
c = F.one_hot(c, num_classes=num_classes)
|
259 |
+
c = c.squeeze(1).permute(0, 3, 1, 2).float()
|
260 |
+
c = self.cond_stage_model.to_rgb(c)
|
261 |
+
|
262 |
+
cond_rec = torch.argmax(cond_rec, dim=1, keepdim=True)
|
263 |
+
cond_rec = F.one_hot(cond_rec, num_classes=num_classes)
|
264 |
+
cond_rec = cond_rec.squeeze(1).permute(0, 3, 1, 2).float()
|
265 |
+
cond_rec = self.cond_stage_model.to_rgb(cond_rec)
|
266 |
+
log["conditioning_rec"] = cond_rec
|
267 |
+
log["conditioning"] = c
|
268 |
+
|
269 |
+
log["samples_half"] = x_sample
|
270 |
+
log["samples_nopix"] = x_sample_nopix
|
271 |
+
log["samples_det"] = x_sample_det
|
272 |
+
return log
|
273 |
+
|
274 |
+
def get_input(self, key, batch):
|
275 |
+
x = batch[key]
|
276 |
+
if len(x.shape) == 3:
|
277 |
+
x = x[..., None]
|
278 |
+
if len(x.shape) == 4:
|
279 |
+
x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format)
|
280 |
+
if x.dtype == torch.double:
|
281 |
+
x = x.float()
|
282 |
+
return x
|
283 |
+
|
284 |
+
def get_xc(self, batch, N=None):
|
285 |
+
x = self.get_input(self.first_stage_key, batch)
|
286 |
+
c = self.get_input(self.cond_stage_key, batch)
|
287 |
+
if N is not None:
|
288 |
+
x = x[:N]
|
289 |
+
c = c[:N]
|
290 |
+
return x, c
|
291 |
+
|
292 |
+
def shared_step(self, batch, batch_idx):
|
293 |
+
x, c = self.get_xc(batch)
|
294 |
+
logits, target = self(x, c)
|
295 |
+
loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), target.reshape(-1))
|
296 |
+
return loss
|
297 |
+
|
298 |
+
def training_step(self, batch, batch_idx):
|
299 |
+
loss = self.shared_step(batch, batch_idx)
|
300 |
+
self.log("train/loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
|
301 |
+
return loss
|
302 |
+
|
303 |
+
def validation_step(self, batch, batch_idx):
|
304 |
+
loss = self.shared_step(batch, batch_idx)
|
305 |
+
self.log("val/loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
|
306 |
+
return loss
|
307 |
+
|
308 |
+
def configure_optimizers(self):
|
309 |
+
"""
|
310 |
+
Following minGPT:
|
311 |
+
This long function is unfortunately doing something very simple and is being very defensive:
|
312 |
+
We are separating out all parameters of the model into two buckets: those that will experience
|
313 |
+
weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
|
314 |
+
We are then returning the PyTorch optimizer object.
|
315 |
+
"""
|
316 |
+
# separate out all parameters to those that will and won't experience regularizing weight decay
|
317 |
+
decay = set()
|
318 |
+
no_decay = set()
|
319 |
+
whitelist_weight_modules = (torch.nn.Linear, )
|
320 |
+
blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
|
321 |
+
for mn, m in self.transformer.named_modules():
|
322 |
+
for pn, p in m.named_parameters():
|
323 |
+
fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
|
324 |
+
|
325 |
+
if pn.endswith('bias'):
|
326 |
+
# all biases will not be decayed
|
327 |
+
no_decay.add(fpn)
|
328 |
+
elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
|
329 |
+
# weights of whitelist modules will be weight decayed
|
330 |
+
decay.add(fpn)
|
331 |
+
elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
|
332 |
+
# weights of blacklist modules will NOT be weight decayed
|
333 |
+
no_decay.add(fpn)
|
334 |
+
|
335 |
+
# special case the position embedding parameter in the root GPT module as not decayed
|
336 |
+
no_decay.add('pos_emb')
|
337 |
+
|
338 |
+
# validate that we considered every parameter
|
339 |
+
param_dict = {pn: p for pn, p in self.transformer.named_parameters()}
|
340 |
+
inter_params = decay & no_decay
|
341 |
+
union_params = decay | no_decay
|
342 |
+
assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
|
343 |
+
assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
|
344 |
+
% (str(param_dict.keys() - union_params), )
|
345 |
+
|
346 |
+
# create the pytorch optimizer object
|
347 |
+
optim_groups = [
|
348 |
+
{"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": 0.01},
|
349 |
+
{"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
|
350 |
+
]
|
351 |
+
optimizer = torch.optim.AdamW(optim_groups, lr=self.learning_rate, betas=(0.9, 0.95))
|
352 |
+
return optimizer
|
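The sample() method above filters logits with top_k_logits before applying softmax and drawing from a multinomial distribution. A minimal standalone sketch of that top-k filtering and sampling step, mirroring top_k_logits with plain PyTorch on dummy logits (the batch size and codebook size are arbitrary placeholders):

    import torch
    import torch.nn.functional as F

    def top_k_filter(logits, k):
        # keep the k largest logits per row, push everything else to -inf
        v, _ = torch.topk(logits, k)
        out = logits.clone()
        out[out < v[..., [-1]]] = -float('inf')
        return out

    logits = torch.randn(2, 1024)                      # batch of 2 over a 1024-entry codebook
    probs = F.softmax(top_k_filter(logits, 100), dim=-1)
    ix = torch.multinomial(probs, num_samples=1)       # one sampled code index per row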
taming/models/dummy_cond_stage.py
ADDED
@@ -0,0 +1,22 @@
1 |
+
from torch import Tensor
|
2 |
+
|
3 |
+
|
4 |
+
class DummyCondStage:
|
5 |
+
def __init__(self, conditional_key):
|
6 |
+
self.conditional_key = conditional_key
|
7 |
+
self.train = None
|
8 |
+
|
9 |
+
def eval(self):
|
10 |
+
return self
|
11 |
+
|
12 |
+
@staticmethod
|
13 |
+
def encode(c: Tensor):
|
14 |
+
return c, None, (None, None, c)
|
15 |
+
|
16 |
+
@staticmethod
|
17 |
+
def decode(c: Tensor):
|
18 |
+
return c
|
19 |
+
|
20 |
+
@staticmethod
|
21 |
+
def to_rgb(c: Tensor):
|
22 |
+
return c
|
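DummyCondStage mirrors the minimal interface the transformer wrapper expects from a conditioning stage (eval, encode, decode, to_rgb) while passing tensors through unchanged. A small usage sketch (the tensor shape and conditional_key are arbitrary placeholders):

    import torch
    from taming.models.dummy_cond_stage import DummyCondStage

    stage = DummyCondStage(conditional_key="objects_bbox").eval()
    c = torch.zeros(4, 1, 16, 16)
    quant_c, _, (_, _, indices) = stage.encode(c)   # everything returned is just the input tensor
    assert quant_c is c and indices is c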
taming/models/vqgan.py
ADDED
@@ -0,0 +1,404 @@
1 |
+
import torch
|
2 |
+
import torch.nn.functional as F
|
3 |
+
import pytorch_lightning as pl
|
4 |
+
|
5 |
+
from main import instantiate_from_config
|
6 |
+
|
7 |
+
from taming.modules.diffusionmodules.model import Encoder, Decoder
|
8 |
+
from taming.modules.vqvae.quantize import VectorQuantizer2 as VectorQuantizer
|
9 |
+
from taming.modules.vqvae.quantize import GumbelQuantize
|
10 |
+
from taming.modules.vqvae.quantize import EMAVectorQuantizer
|
11 |
+
|
12 |
+
class VQModel(pl.LightningModule):
|
13 |
+
def __init__(self,
|
14 |
+
ddconfig,
|
15 |
+
lossconfig,
|
16 |
+
n_embed,
|
17 |
+
embed_dim,
|
18 |
+
ckpt_path=None,
|
19 |
+
ignore_keys=[],
|
20 |
+
image_key="image",
|
21 |
+
colorize_nlabels=None,
|
22 |
+
monitor=None,
|
23 |
+
remap=None,
|
24 |
+
sane_index_shape=False, # tell vector quantizer to return indices as bhw
|
25 |
+
):
|
26 |
+
super().__init__()
|
27 |
+
self.image_key = image_key
|
28 |
+
self.encoder = Encoder(**ddconfig)
|
29 |
+
self.decoder = Decoder(**ddconfig)
|
30 |
+
self.loss = instantiate_from_config(lossconfig)
|
31 |
+
self.quantize = VectorQuantizer(n_embed, embed_dim, beta=0.25,
|
32 |
+
remap=remap, sane_index_shape=sane_index_shape)
|
33 |
+
self.quant_conv = torch.nn.Conv2d(ddconfig["z_channels"], embed_dim, 1)
|
34 |
+
self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
|
35 |
+
if ckpt_path is not None:
|
36 |
+
self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
|
37 |
+
self.image_key = image_key
|
38 |
+
if colorize_nlabels is not None:
|
39 |
+
assert type(colorize_nlabels)==int
|
40 |
+
self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
|
41 |
+
if monitor is not None:
|
42 |
+
self.monitor = monitor
|
43 |
+
|
44 |
+
def init_from_ckpt(self, path, ignore_keys=list()):
|
45 |
+
sd = torch.load(path, map_location="cpu")["state_dict"]
|
46 |
+
keys = list(sd.keys())
|
47 |
+
for k in keys:
|
48 |
+
for ik in ignore_keys:
|
49 |
+
if k.startswith(ik):
|
50 |
+
print("Deleting key {} from state_dict.".format(k))
|
51 |
+
del sd[k]
|
52 |
+
self.load_state_dict(sd, strict=False)
|
53 |
+
print(f"Restored from {path}")
|
54 |
+
|
55 |
+
def encode(self, x):
|
56 |
+
h = self.encoder(x)
|
57 |
+
h = self.quant_conv(h)
|
58 |
+
quant, emb_loss, info = self.quantize(h)
|
59 |
+
return quant, emb_loss, info
|
60 |
+
|
61 |
+
def decode(self, quant):
|
62 |
+
quant = self.post_quant_conv(quant)
|
63 |
+
dec = self.decoder(quant)
|
64 |
+
return dec
|
65 |
+
|
66 |
+
def decode_code(self, code_b):
|
67 |
+
quant_b = self.quantize.embed_code(code_b)
|
68 |
+
dec = self.decode(quant_b)
|
69 |
+
return dec
|
70 |
+
|
71 |
+
def forward(self, input):
|
72 |
+
quant, diff, _ = self.encode(input)
|
73 |
+
dec = self.decode(quant)
|
74 |
+
return dec, diff
|
75 |
+
|
76 |
+
def get_input(self, batch, k):
|
77 |
+
x = batch[k]
|
78 |
+
if len(x.shape) == 3:
|
79 |
+
x = x[..., None]
|
80 |
+
x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format)
|
81 |
+
return x.float()
|
82 |
+
|
83 |
+
def training_step(self, batch, batch_idx, optimizer_idx):
|
84 |
+
x = self.get_input(batch, self.image_key)
|
85 |
+
xrec, qloss = self(x)
|
86 |
+
|
87 |
+
if optimizer_idx == 0:
|
88 |
+
# autoencode
|
89 |
+
aeloss, log_dict_ae = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
|
90 |
+
last_layer=self.get_last_layer(), split="train")
|
91 |
+
|
92 |
+
self.log("train/aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
|
93 |
+
self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
|
94 |
+
return aeloss
|
95 |
+
|
96 |
+
if optimizer_idx == 1:
|
97 |
+
# discriminator
|
98 |
+
discloss, log_dict_disc = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
|
99 |
+
last_layer=self.get_last_layer(), split="train")
|
100 |
+
self.log("train/discloss", discloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
|
101 |
+
self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=True)
|
102 |
+
return discloss
|
103 |
+
|
104 |
+
def validation_step(self, batch, batch_idx):
|
105 |
+
x = self.get_input(batch, self.image_key)
|
106 |
+
xrec, qloss = self(x)
|
107 |
+
aeloss, log_dict_ae = self.loss(qloss, x, xrec, 0, self.global_step,
|
108 |
+
last_layer=self.get_last_layer(), split="val")
|
109 |
+
|
110 |
+
discloss, log_dict_disc = self.loss(qloss, x, xrec, 1, self.global_step,
|
111 |
+
last_layer=self.get_last_layer(), split="val")
|
112 |
+
rec_loss = log_dict_ae["val/rec_loss"]
|
113 |
+
self.log("val/rec_loss", rec_loss,
|
114 |
+
prog_bar=True, logger=True, on_step=True, on_epoch=True, sync_dist=True)
|
115 |
+
self.log("val/aeloss", aeloss,
|
116 |
+
prog_bar=True, logger=True, on_step=True, on_epoch=True, sync_dist=True)
|
117 |
+
self.log_dict(log_dict_ae)
|
118 |
+
self.log_dict(log_dict_disc)
|
119 |
+
return self.log_dict
|
120 |
+
|
121 |
+
def configure_optimizers(self):
|
122 |
+
lr = self.learning_rate
|
123 |
+
opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
|
124 |
+
list(self.decoder.parameters())+
|
125 |
+
list(self.quantize.parameters())+
|
126 |
+
list(self.quant_conv.parameters())+
|
127 |
+
list(self.post_quant_conv.parameters()),
|
128 |
+
lr=lr, betas=(0.5, 0.9))
|
129 |
+
opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
|
130 |
+
lr=lr, betas=(0.5, 0.9))
|
131 |
+
return [opt_ae, opt_disc], []
|
132 |
+
|
133 |
+
def get_last_layer(self):
|
134 |
+
return self.decoder.conv_out.weight
|
135 |
+
|
136 |
+
def log_images(self, batch, **kwargs):
|
137 |
+
log = dict()
|
138 |
+
x = self.get_input(batch, self.image_key)
|
139 |
+
x = x.to(self.device)
|
140 |
+
xrec, _ = self(x)
|
141 |
+
if x.shape[1] > 3:
|
142 |
+
# colorize with random projection
|
143 |
+
assert xrec.shape[1] > 3
|
144 |
+
x = self.to_rgb(x)
|
145 |
+
xrec = self.to_rgb(xrec)
|
146 |
+
log["inputs"] = x
|
147 |
+
log["reconstructions"] = xrec
|
148 |
+
return log
|
149 |
+
|
150 |
+
def to_rgb(self, x):
|
151 |
+
assert self.image_key == "segmentation"
|
152 |
+
if not hasattr(self, "colorize"):
|
153 |
+
self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
|
154 |
+
x = F.conv2d(x, weight=self.colorize)
|
155 |
+
x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
|
156 |
+
return x
|
157 |
+
|
158 |
+
|
159 |
+
class VQSegmentationModel(VQModel):
|
160 |
+
def __init__(self, n_labels, *args, **kwargs):
|
161 |
+
super().__init__(*args, **kwargs)
|
162 |
+
self.register_buffer("colorize", torch.randn(3, n_labels, 1, 1))
|
163 |
+
|
164 |
+
def configure_optimizers(self):
|
165 |
+
lr = self.learning_rate
|
166 |
+
opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
|
167 |
+
list(self.decoder.parameters())+
|
168 |
+
list(self.quantize.parameters())+
|
169 |
+
list(self.quant_conv.parameters())+
|
170 |
+
list(self.post_quant_conv.parameters()),
|
171 |
+
lr=lr, betas=(0.5, 0.9))
|
172 |
+
return opt_ae
|
173 |
+
|
174 |
+
def training_step(self, batch, batch_idx):
|
175 |
+
x = self.get_input(batch, self.image_key)
|
176 |
+
xrec, qloss = self(x)
|
177 |
+
aeloss, log_dict_ae = self.loss(qloss, x, xrec, split="train")
|
178 |
+
self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
|
179 |
+
return aeloss
|
180 |
+
|
181 |
+
def validation_step(self, batch, batch_idx):
|
182 |
+
x = self.get_input(batch, self.image_key)
|
183 |
+
xrec, qloss = self(x)
|
184 |
+
aeloss, log_dict_ae = self.loss(qloss, x, xrec, split="val")
|
185 |
+
self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
|
186 |
+
total_loss = log_dict_ae["val/total_loss"]
|
187 |
+
self.log("val/total_loss", total_loss,
|
188 |
+
prog_bar=True, logger=True, on_step=True, on_epoch=True, sync_dist=True)
|
189 |
+
return aeloss
|
190 |
+
|
191 |
+
@torch.no_grad()
|
192 |
+
def log_images(self, batch, **kwargs):
|
193 |
+
log = dict()
|
194 |
+
x = self.get_input(batch, self.image_key)
|
195 |
+
x = x.to(self.device)
|
196 |
+
xrec, _ = self(x)
|
197 |
+
if x.shape[1] > 3:
|
198 |
+
# colorize with random projection
|
199 |
+
assert xrec.shape[1] > 3
|
200 |
+
# convert logits to indices
|
201 |
+
xrec = torch.argmax(xrec, dim=1, keepdim=True)
|
202 |
+
xrec = F.one_hot(xrec, num_classes=x.shape[1])
|
203 |
+
xrec = xrec.squeeze(1).permute(0, 3, 1, 2).float()
|
204 |
+
x = self.to_rgb(x)
|
205 |
+
xrec = self.to_rgb(xrec)
|
206 |
+
log["inputs"] = x
|
207 |
+
log["reconstructions"] = xrec
|
208 |
+
return log
|
209 |
+
|
210 |
+
|
211 |
+
class VQNoDiscModel(VQModel):
|
212 |
+
def __init__(self,
|
213 |
+
ddconfig,
|
214 |
+
lossconfig,
|
215 |
+
n_embed,
|
216 |
+
embed_dim,
|
217 |
+
ckpt_path=None,
|
218 |
+
ignore_keys=[],
|
219 |
+
image_key="image",
|
220 |
+
colorize_nlabels=None
|
221 |
+
):
|
222 |
+
super().__init__(ddconfig=ddconfig, lossconfig=lossconfig, n_embed=n_embed, embed_dim=embed_dim,
|
223 |
+
ckpt_path=ckpt_path, ignore_keys=ignore_keys, image_key=image_key,
|
224 |
+
colorize_nlabels=colorize_nlabels)
|
225 |
+
|
226 |
+
def training_step(self, batch, batch_idx):
|
227 |
+
x = self.get_input(batch, self.image_key)
|
228 |
+
xrec, qloss = self(x)
|
229 |
+
# autoencode
|
230 |
+
aeloss, log_dict_ae = self.loss(qloss, x, xrec, self.global_step, split="train")
|
231 |
+
output = pl.TrainResult(minimize=aeloss)
|
232 |
+
output.log("train/aeloss", aeloss,
|
233 |
+
prog_bar=True, logger=True, on_step=True, on_epoch=True)
|
234 |
+
output.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
|
235 |
+
return output
|
236 |
+
|
237 |
+
def validation_step(self, batch, batch_idx):
|
238 |
+
x = self.get_input(batch, self.image_key)
|
239 |
+
xrec, qloss = self(x)
|
240 |
+
aeloss, log_dict_ae = self.loss(qloss, x, xrec, self.global_step, split="val")
|
241 |
+
rec_loss = log_dict_ae["val/rec_loss"]
|
242 |
+
output = pl.EvalResult(checkpoint_on=rec_loss)
|
243 |
+
output.log("val/rec_loss", rec_loss,
|
244 |
+
prog_bar=True, logger=True, on_step=True, on_epoch=True)
|
245 |
+
output.log("val/aeloss", aeloss,
|
246 |
+
prog_bar=True, logger=True, on_step=True, on_epoch=True)
|
247 |
+
output.log_dict(log_dict_ae)
|
248 |
+
|
249 |
+
return output
|
250 |
+
|
251 |
+
def configure_optimizers(self):
|
252 |
+
optimizer = torch.optim.Adam(list(self.encoder.parameters())+
|
253 |
+
list(self.decoder.parameters())+
|
254 |
+
list(self.quantize.parameters())+
|
255 |
+
list(self.quant_conv.parameters())+
|
256 |
+
list(self.post_quant_conv.parameters()),
|
257 |
+
lr=self.learning_rate, betas=(0.5, 0.9))
|
258 |
+
return optimizer
|
259 |
+
|
260 |
+
|
261 |
+
class GumbelVQ(VQModel):
|
262 |
+
def __init__(self,
|
263 |
+
ddconfig,
|
264 |
+
lossconfig,
|
265 |
+
n_embed,
|
266 |
+
embed_dim,
|
267 |
+
temperature_scheduler_config,
|
268 |
+
ckpt_path=None,
|
269 |
+
ignore_keys=[],
|
270 |
+
image_key="image",
|
271 |
+
colorize_nlabels=None,
|
272 |
+
monitor=None,
|
273 |
+
kl_weight=1e-8,
|
274 |
+
remap=None,
|
275 |
+
):
|
276 |
+
|
277 |
+
z_channels = ddconfig["z_channels"]
|
278 |
+
super().__init__(ddconfig,
|
279 |
+
lossconfig,
|
280 |
+
n_embed,
|
281 |
+
embed_dim,
|
282 |
+
ckpt_path=None,
|
283 |
+
ignore_keys=ignore_keys,
|
284 |
+
image_key=image_key,
|
285 |
+
colorize_nlabels=colorize_nlabels,
|
286 |
+
monitor=monitor,
|
287 |
+
)
|
288 |
+
|
289 |
+
self.loss.n_classes = n_embed
|
290 |
+
self.vocab_size = n_embed
|
291 |
+
|
292 |
+
self.quantize = GumbelQuantize(z_channels, embed_dim,
|
293 |
+
n_embed=n_embed,
|
294 |
+
kl_weight=kl_weight, temp_init=1.0,
|
295 |
+
remap=remap)
|
296 |
+
|
297 |
+
self.temperature_scheduler = instantiate_from_config(temperature_scheduler_config) # annealing of temp
|
298 |
+
|
299 |
+
if ckpt_path is not None:
|
300 |
+
self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
|
301 |
+
|
302 |
+
def temperature_scheduling(self):
|
303 |
+
self.quantize.temperature = self.temperature_scheduler(self.global_step)
|
304 |
+
|
305 |
+
def encode_to_prequant(self, x):
|
306 |
+
h = self.encoder(x)
|
307 |
+
h = self.quant_conv(h)
|
308 |
+
return h
|
309 |
+
|
310 |
+
def decode_code(self, code_b):
|
311 |
+
raise NotImplementedError
|
312 |
+
|
313 |
+
def training_step(self, batch, batch_idx, optimizer_idx):
|
314 |
+
self.temperature_scheduling()
|
315 |
+
x = self.get_input(batch, self.image_key)
|
316 |
+
xrec, qloss = self(x)
|
317 |
+
|
318 |
+
if optimizer_idx == 0:
|
319 |
+
# autoencode
|
320 |
+
aeloss, log_dict_ae = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
|
321 |
+
last_layer=self.get_last_layer(), split="train")
|
322 |
+
|
323 |
+
self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
|
324 |
+
self.log("temperature", self.quantize.temperature, prog_bar=False, logger=True, on_step=True, on_epoch=True)
|
325 |
+
return aeloss
|
326 |
+
|
327 |
+
if optimizer_idx == 1:
|
328 |
+
# discriminator
|
329 |
+
discloss, log_dict_disc = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
|
330 |
+
last_layer=self.get_last_layer(), split="train")
|
331 |
+
self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=True)
|
332 |
+
return discloss
|
333 |
+
|
334 |
+
def validation_step(self, batch, batch_idx):
|
335 |
+
x = self.get_input(batch, self.image_key)
|
336 |
+
xrec, qloss = self(x, return_pred_indices=True)
|
337 |
+
aeloss, log_dict_ae = self.loss(qloss, x, xrec, 0, self.global_step,
|
338 |
+
last_layer=self.get_last_layer(), split="val")
|
339 |
+
|
340 |
+
discloss, log_dict_disc = self.loss(qloss, x, xrec, 1, self.global_step,
|
341 |
+
last_layer=self.get_last_layer(), split="val")
|
342 |
+
rec_loss = log_dict_ae["val/rec_loss"]
|
343 |
+
self.log("val/rec_loss", rec_loss,
|
344 |
+
prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
|
345 |
+
self.log("val/aeloss", aeloss,
|
346 |
+
prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
|
347 |
+
self.log_dict(log_dict_ae)
|
348 |
+
self.log_dict(log_dict_disc)
|
349 |
+
return self.log_dict
|
350 |
+
|
351 |
+
def log_images(self, batch, **kwargs):
|
352 |
+
log = dict()
|
353 |
+
x = self.get_input(batch, self.image_key)
|
354 |
+
x = x.to(self.device)
|
355 |
+
# encode
|
356 |
+
h = self.encoder(x)
|
357 |
+
h = self.quant_conv(h)
|
358 |
+
quant, _, _ = self.quantize(h)
|
359 |
+
# decode
|
360 |
+
x_rec = self.decode(quant)
|
361 |
+
log["inputs"] = x
|
362 |
+
log["reconstructions"] = x_rec
|
363 |
+
return log
|
364 |
+
|
365 |
+
|
366 |
+
class EMAVQ(VQModel):
|
367 |
+
def __init__(self,
|
368 |
+
ddconfig,
|
369 |
+
lossconfig,
|
370 |
+
n_embed,
|
371 |
+
embed_dim,
|
372 |
+
ckpt_path=None,
|
373 |
+
ignore_keys=[],
|
374 |
+
image_key="image",
|
375 |
+
colorize_nlabels=None,
|
376 |
+
monitor=None,
|
377 |
+
remap=None,
|
378 |
+
sane_index_shape=False, # tell vector quantizer to return indices as bhw
|
379 |
+
):
|
380 |
+
super().__init__(ddconfig,
|
381 |
+
lossconfig,
|
382 |
+
n_embed,
|
383 |
+
embed_dim,
|
384 |
+
ckpt_path=None,
|
385 |
+
ignore_keys=ignore_keys,
|
386 |
+
image_key=image_key,
|
387 |
+
colorize_nlabels=colorize_nlabels,
|
388 |
+
monitor=monitor,
|
389 |
+
)
|
390 |
+
self.quantize = EMAVectorQuantizer(n_embed=n_embed,
|
391 |
+
embedding_dim=embed_dim,
|
392 |
+
beta=0.25,
|
393 |
+
remap=remap)
|
394 |
+
def configure_optimizers(self):
|
395 |
+
lr = self.learning_rate
|
396 |
+
# Remove self.quantize from parameter list since it is updated via EMA
|
397 |
+
opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
|
398 |
+
list(self.decoder.parameters())+
|
399 |
+
list(self.quant_conv.parameters())+
|
400 |
+
list(self.post_quant_conv.parameters()),
|
401 |
+
lr=lr, betas=(0.5, 0.9))
|
402 |
+
opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
|
403 |
+
lr=lr, betas=(0.5, 0.9))
|
404 |
+
return [opt_ae, opt_disc], []
|
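VQModel.encode returns the quantized latent together with the codebook loss and an info tuple whose last entry holds the code indices (the same unpacking used by encode_to_c in cond_transformer.py above), and decode maps quantized latents back to image space. A hedged round-trip sketch, assuming `model` is a VQModel already built elsewhere (e.g. via instantiate_from_config plus init_from_ckpt; it is a placeholder here, not something defined in this diff):

    import torch

    model.eval()                                   # `model`: an already-constructed VQModel (placeholder)
    x = torch.randn(1, 3, 256, 256)                # dummy image batch scaled to roughly [-1, 1]
    with torch.no_grad():
        quant, emb_loss, (_, _, indices) = model.encode(x)
        x_rec = model.decode(quant)                # reconstruction from the quantized latent
    print(quant.shape, x_rec.shape)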
taming/modules/__pycache__/util.cpython-312.pyc
ADDED
Binary file (7.4 kB).
|
|
taming/modules/diffusionmodules/__pycache__/model.cpython-312.pyc
ADDED
Binary file (34.6 kB).
|
|
taming/modules/diffusionmodules/model.py
ADDED
@@ -0,0 +1,776 @@
1 |
+
# pytorch_diffusion + derived encoder decoder
|
2 |
+
import math
|
3 |
+
import torch
|
4 |
+
import torch.nn as nn
|
5 |
+
import numpy as np
|
6 |
+
|
7 |
+
|
8 |
+
def get_timestep_embedding(timesteps, embedding_dim):
|
9 |
+
"""
|
10 |
+
This matches the implementation in Denoising Diffusion Probabilistic Models:
|
11 |
+
From Fairseq.
|
12 |
+
Build sinusoidal embeddings.
|
13 |
+
This matches the implementation in tensor2tensor, but differs slightly
|
14 |
+
from the description in Section 3.5 of "Attention Is All You Need".
|
15 |
+
"""
|
16 |
+
assert len(timesteps.shape) == 1
|
17 |
+
|
18 |
+
half_dim = embedding_dim // 2
|
19 |
+
emb = math.log(10000) / (half_dim - 1)
|
20 |
+
emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb)
|
21 |
+
emb = emb.to(device=timesteps.device)
|
22 |
+
emb = timesteps.float()[:, None] * emb[None, :]
|
23 |
+
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
|
24 |
+
if embedding_dim % 2 == 1: # zero pad
|
25 |
+
emb = torch.nn.functional.pad(emb, (0,1,0,0))
|
26 |
+
return emb
|
27 |
+
|
28 |
+
|
29 |
+
def nonlinearity(x):
|
30 |
+
# swish
|
31 |
+
return x*torch.sigmoid(x)
|
32 |
+
|
33 |
+
|
34 |
+
def Normalize(in_channels):
|
35 |
+
return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
|
36 |
+
|
37 |
+
|
38 |
+
class Upsample(nn.Module):
|
39 |
+
def __init__(self, in_channels, with_conv):
|
40 |
+
super().__init__()
|
41 |
+
self.with_conv = with_conv
|
42 |
+
if self.with_conv:
|
43 |
+
self.conv = torch.nn.Conv2d(in_channels,
|
44 |
+
in_channels,
|
45 |
+
kernel_size=3,
|
46 |
+
stride=1,
|
47 |
+
padding=1)
|
48 |
+
|
49 |
+
def forward(self, x):
|
50 |
+
x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
|
51 |
+
if self.with_conv:
|
52 |
+
x = self.conv(x)
|
53 |
+
return x
|
54 |
+
|
55 |
+
|
56 |
+
class Downsample(nn.Module):
|
57 |
+
def __init__(self, in_channels, with_conv):
|
58 |
+
super().__init__()
|
59 |
+
self.with_conv = with_conv
|
60 |
+
if self.with_conv:
|
61 |
+
# no asymmetric padding in torch conv, must do it ourselves
|
62 |
+
self.conv = torch.nn.Conv2d(in_channels,
|
63 |
+
in_channels,
|
64 |
+
kernel_size=3,
|
65 |
+
stride=2,
|
66 |
+
padding=0)
|
67 |
+
|
68 |
+
def forward(self, x):
|
69 |
+
if self.with_conv:
|
70 |
+
pad = (0,1,0,1)
|
71 |
+
x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
|
72 |
+
x = self.conv(x)
|
73 |
+
else:
|
74 |
+
x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
|
75 |
+
return x
|
76 |
+
|
77 |
+
|
78 |
+
class ResnetBlock(nn.Module):
|
79 |
+
def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
|
80 |
+
dropout, temb_channels=512):
|
81 |
+
super().__init__()
|
82 |
+
self.in_channels = in_channels
|
83 |
+
out_channels = in_channels if out_channels is None else out_channels
|
84 |
+
self.out_channels = out_channels
|
85 |
+
self.use_conv_shortcut = conv_shortcut
|
86 |
+
|
87 |
+
self.norm1 = Normalize(in_channels)
|
88 |
+
self.conv1 = torch.nn.Conv2d(in_channels,
|
89 |
+
out_channels,
|
90 |
+
kernel_size=3,
|
91 |
+
stride=1,
|
92 |
+
padding=1)
|
93 |
+
if temb_channels > 0:
|
94 |
+
self.temb_proj = torch.nn.Linear(temb_channels,
|
95 |
+
out_channels)
|
96 |
+
self.norm2 = Normalize(out_channels)
|
97 |
+
self.dropout = torch.nn.Dropout(dropout)
|
98 |
+
self.conv2 = torch.nn.Conv2d(out_channels,
|
99 |
+
out_channels,
|
100 |
+
kernel_size=3,
|
101 |
+
stride=1,
|
102 |
+
padding=1)
|
103 |
+
if self.in_channels != self.out_channels:
|
104 |
+
if self.use_conv_shortcut:
|
105 |
+
self.conv_shortcut = torch.nn.Conv2d(in_channels,
|
106 |
+
out_channels,
|
107 |
+
kernel_size=3,
|
108 |
+
stride=1,
|
109 |
+
padding=1)
|
110 |
+
else:
|
111 |
+
self.nin_shortcut = torch.nn.Conv2d(in_channels,
|
112 |
+
out_channels,
|
113 |
+
kernel_size=1,
|
114 |
+
stride=1,
|
115 |
+
padding=0)
|
116 |
+
|
117 |
+
def forward(self, x, temb):
|
118 |
+
h = x
|
119 |
+
h = self.norm1(h)
|
120 |
+
h = nonlinearity(h)
|
121 |
+
h = self.conv1(h)
|
122 |
+
|
123 |
+
if temb is not None:
|
124 |
+
h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None]
|
125 |
+
|
126 |
+
h = self.norm2(h)
|
127 |
+
h = nonlinearity(h)
|
128 |
+
h = self.dropout(h)
|
129 |
+
h = self.conv2(h)
|
130 |
+
|
131 |
+
if self.in_channels != self.out_channels:
|
132 |
+
if self.use_conv_shortcut:
|
133 |
+
x = self.conv_shortcut(x)
|
134 |
+
else:
|
135 |
+
x = self.nin_shortcut(x)
|
136 |
+
|
137 |
+
return x+h
|
138 |
+
|
139 |
+
|
140 |
+
class AttnBlock(nn.Module):
|
141 |
+
def __init__(self, in_channels):
|
142 |
+
super().__init__()
|
143 |
+
self.in_channels = in_channels
|
144 |
+
|
145 |
+
self.norm = Normalize(in_channels)
|
146 |
+
self.q = torch.nn.Conv2d(in_channels,
|
147 |
+
in_channels,
|
148 |
+
kernel_size=1,
|
149 |
+
stride=1,
|
150 |
+
padding=0)
|
151 |
+
self.k = torch.nn.Conv2d(in_channels,
|
152 |
+
in_channels,
|
153 |
+
kernel_size=1,
|
154 |
+
stride=1,
|
155 |
+
padding=0)
|
156 |
+
self.v = torch.nn.Conv2d(in_channels,
|
157 |
+
in_channels,
|
158 |
+
kernel_size=1,
|
159 |
+
stride=1,
|
160 |
+
padding=0)
|
161 |
+
self.proj_out = torch.nn.Conv2d(in_channels,
|
162 |
+
in_channels,
|
163 |
+
kernel_size=1,
|
164 |
+
stride=1,
|
165 |
+
padding=0)
|
166 |
+
|
167 |
+
|
168 |
+
def forward(self, x):
|
169 |
+
h_ = x
|
170 |
+
h_ = self.norm(h_)
|
171 |
+
q = self.q(h_)
|
172 |
+
k = self.k(h_)
|
173 |
+
v = self.v(h_)
|
174 |
+
|
175 |
+
# compute attention
|
176 |
+
b,c,h,w = q.shape
|
177 |
+
q = q.reshape(b,c,h*w)
|
178 |
+
q = q.permute(0,2,1) # b,hw,c
|
179 |
+
k = k.reshape(b,c,h*w) # b,c,hw
|
180 |
+
w_ = torch.bmm(q,k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
|
181 |
+
w_ = w_ * (int(c)**(-0.5))
|
182 |
+
w_ = torch.nn.functional.softmax(w_, dim=2)
|
183 |
+
|
184 |
+
# attend to values
|
185 |
+
v = v.reshape(b,c,h*w)
|
186 |
+
w_ = w_.permute(0,2,1) # b,hw,hw (first hw of k, second of q)
|
187 |
+
h_ = torch.bmm(v,w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
|
188 |
+
h_ = h_.reshape(b,c,h,w)
|
189 |
+
|
190 |
+
h_ = self.proj_out(h_)
|
191 |
+
|
192 |
+
return x+h_
|
193 |
+
|
194 |
+
|
195 |
+
class Model(nn.Module):
|
196 |
+
def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
|
197 |
+
attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
|
198 |
+
resolution, use_timestep=True):
|
199 |
+
super().__init__()
|
200 |
+
self.ch = ch
|
201 |
+
self.temb_ch = self.ch*4
|
202 |
+
self.num_resolutions = len(ch_mult)
|
203 |
+
self.num_res_blocks = num_res_blocks
|
204 |
+
self.resolution = resolution
|
205 |
+
self.in_channels = in_channels
|
206 |
+
|
207 |
+
self.use_timestep = use_timestep
|
208 |
+
if self.use_timestep:
|
209 |
+
# timestep embedding
|
210 |
+
self.temb = nn.Module()
|
211 |
+
self.temb.dense = nn.ModuleList([
|
212 |
+
torch.nn.Linear(self.ch,
|
213 |
+
self.temb_ch),
|
214 |
+
torch.nn.Linear(self.temb_ch,
|
215 |
+
self.temb_ch),
|
216 |
+
])
|
217 |
+
|
218 |
+
# downsampling
|
219 |
+
self.conv_in = torch.nn.Conv2d(in_channels,
|
220 |
+
self.ch,
|
221 |
+
kernel_size=3,
|
222 |
+
stride=1,
|
223 |
+
padding=1)
|
224 |
+
|
225 |
+
curr_res = resolution
|
226 |
+
in_ch_mult = (1,)+tuple(ch_mult)
|
227 |
+
self.down = nn.ModuleList()
|
228 |
+
for i_level in range(self.num_resolutions):
|
229 |
+
block = nn.ModuleList()
|
230 |
+
attn = nn.ModuleList()
|
231 |
+
block_in = ch*in_ch_mult[i_level]
|
232 |
+
block_out = ch*ch_mult[i_level]
|
233 |
+
for i_block in range(self.num_res_blocks):
|
234 |
+
block.append(ResnetBlock(in_channels=block_in,
|
235 |
+
out_channels=block_out,
|
236 |
+
temb_channels=self.temb_ch,
|
237 |
+
dropout=dropout))
|
238 |
+
block_in = block_out
|
239 |
+
if curr_res in attn_resolutions:
|
240 |
+
attn.append(AttnBlock(block_in))
|
241 |
+
down = nn.Module()
|
242 |
+
down.block = block
|
243 |
+
down.attn = attn
|
244 |
+
if i_level != self.num_resolutions-1:
|
245 |
+
down.downsample = Downsample(block_in, resamp_with_conv)
|
246 |
+
curr_res = curr_res // 2
|
247 |
+
self.down.append(down)
|
248 |
+
|
249 |
+
# middle
|
250 |
+
self.mid = nn.Module()
|
251 |
+
self.mid.block_1 = ResnetBlock(in_channels=block_in,
|
252 |
+
out_channels=block_in,
|
253 |
+
temb_channels=self.temb_ch,
|
254 |
+
dropout=dropout)
|
255 |
+
self.mid.attn_1 = AttnBlock(block_in)
|
256 |
+
self.mid.block_2 = ResnetBlock(in_channels=block_in,
|
257 |
+
out_channels=block_in,
|
258 |
+
temb_channels=self.temb_ch,
|
259 |
+
dropout=dropout)
|
260 |
+
|
261 |
+
# upsampling
|
262 |
+
self.up = nn.ModuleList()
|
263 |
+
for i_level in reversed(range(self.num_resolutions)):
|
264 |
+
block = nn.ModuleList()
|
265 |
+
attn = nn.ModuleList()
|
266 |
+
block_out = ch*ch_mult[i_level]
|
267 |
+
skip_in = ch*ch_mult[i_level]
|
268 |
+
for i_block in range(self.num_res_blocks+1):
|
269 |
+
if i_block == self.num_res_blocks:
|
270 |
+
skip_in = ch*in_ch_mult[i_level]
|
271 |
+
block.append(ResnetBlock(in_channels=block_in+skip_in,
|
272 |
+
out_channels=block_out,
|
273 |
+
temb_channels=self.temb_ch,
|
274 |
+
dropout=dropout))
|
275 |
+
block_in = block_out
|
276 |
+
if curr_res in attn_resolutions:
|
277 |
+
attn.append(AttnBlock(block_in))
|
278 |
+
up = nn.Module()
|
279 |
+
up.block = block
|
280 |
+
up.attn = attn
|
281 |
+
if i_level != 0:
|
282 |
+
up.upsample = Upsample(block_in, resamp_with_conv)
|
283 |
+
curr_res = curr_res * 2
|
284 |
+
self.up.insert(0, up) # prepend to get consistent order
|
285 |
+
|
286 |
+
# end
|
287 |
+
self.norm_out = Normalize(block_in)
|
288 |
+
self.conv_out = torch.nn.Conv2d(block_in,
|
289 |
+
out_ch,
|
290 |
+
kernel_size=3,
|
291 |
+
stride=1,
|
292 |
+
padding=1)
|
293 |
+
|
294 |
+
|
295 |
+
def forward(self, x, t=None):
|
296 |
+
#assert x.shape[2] == x.shape[3] == self.resolution
|
297 |
+
|
298 |
+
if self.use_timestep:
|
299 |
+
# timestep embedding
|
300 |
+
assert t is not None
|
301 |
+
temb = get_timestep_embedding(t, self.ch)
|
302 |
+
temb = self.temb.dense[0](temb)
|
303 |
+
temb = nonlinearity(temb)
|
304 |
+
temb = self.temb.dense[1](temb)
|
305 |
+
else:
|
306 |
+
temb = None
|
307 |
+
|
308 |
+
# downsampling
|
309 |
+
hs = [self.conv_in(x)]
|
310 |
+
for i_level in range(self.num_resolutions):
|
311 |
+
for i_block in range(self.num_res_blocks):
|
312 |
+
h = self.down[i_level].block[i_block](hs[-1], temb)
|
313 |
+
if len(self.down[i_level].attn) > 0:
|
314 |
+
h = self.down[i_level].attn[i_block](h)
|
315 |
+
hs.append(h)
|
316 |
+
if i_level != self.num_resolutions-1:
|
317 |
+
hs.append(self.down[i_level].downsample(hs[-1]))
|
318 |
+
|
319 |
+
# middle
|
320 |
+
h = hs[-1]
|
321 |
+
h = self.mid.block_1(h, temb)
|
322 |
+
h = self.mid.attn_1(h)
|
323 |
+
h = self.mid.block_2(h, temb)
|
324 |
+
|
325 |
+
# upsampling
|
326 |
+
for i_level in reversed(range(self.num_resolutions)):
|
327 |
+
for i_block in range(self.num_res_blocks+1):
|
328 |
+
h = self.up[i_level].block[i_block](
|
329 |
+
torch.cat([h, hs.pop()], dim=1), temb)
|
330 |
+
if len(self.up[i_level].attn) > 0:
|
331 |
+
h = self.up[i_level].attn[i_block](h)
|
332 |
+
if i_level != 0:
|
333 |
+
h = self.up[i_level].upsample(h)
|
334 |
+
|
335 |
+
# end
|
336 |
+
h = self.norm_out(h)
|
337 |
+
h = nonlinearity(h)
|
338 |
+
h = self.conv_out(h)
|
339 |
+
return h
|
340 |
+
|
341 |
+
|
342 |
+
class Encoder(nn.Module):
|
343 |
+
def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
|
344 |
+
attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
|
345 |
+
resolution, z_channels, double_z=True, **ignore_kwargs):
|
346 |
+
super().__init__()
|
347 |
+
self.ch = ch
|
348 |
+
self.temb_ch = 0
|
349 |
+
self.num_resolutions = len(ch_mult)
|
350 |
+
self.num_res_blocks = num_res_blocks
|
351 |
+
self.resolution = resolution
|
352 |
+
self.in_channels = in_channels
|
353 |
+
|
354 |
+
# downsampling
|
355 |
+
self.conv_in = torch.nn.Conv2d(in_channels,
|
356 |
+
self.ch,
|
357 |
+
kernel_size=3,
|
358 |
+
stride=1,
|
359 |
+
padding=1)
|
360 |
+
|
361 |
+
curr_res = resolution
|
362 |
+
in_ch_mult = (1,)+tuple(ch_mult)
|
363 |
+
self.down = nn.ModuleList()
|
364 |
+
for i_level in range(self.num_resolutions):
|
365 |
+
block = nn.ModuleList()
|
366 |
+
attn = nn.ModuleList()
|
367 |
+
block_in = ch*in_ch_mult[i_level]
|
368 |
+
block_out = ch*ch_mult[i_level]
|
369 |
+
for i_block in range(self.num_res_blocks):
|
370 |
+
block.append(ResnetBlock(in_channels=block_in,
|
371 |
+
out_channels=block_out,
|
372 |
+
temb_channels=self.temb_ch,
|
373 |
+
dropout=dropout))
|
374 |
+
block_in = block_out
|
375 |
+
if curr_res in attn_resolutions:
|
376 |
+
attn.append(AttnBlock(block_in))
|
377 |
+
down = nn.Module()
|
378 |
+
down.block = block
|
379 |
+
down.attn = attn
|
380 |
+
if i_level != self.num_resolutions-1:
|
381 |
+
down.downsample = Downsample(block_in, resamp_with_conv)
|
382 |
+
curr_res = curr_res // 2
|
383 |
+
self.down.append(down)
|
384 |
+
|
385 |
+
# middle
|
386 |
+
self.mid = nn.Module()
|
387 |
+
self.mid.block_1 = ResnetBlock(in_channels=block_in,
|
388 |
+
out_channels=block_in,
|
389 |
+
temb_channels=self.temb_ch,
|
390 |
+
dropout=dropout)
|
391 |
+
self.mid.attn_1 = AttnBlock(block_in)
|
392 |
+
self.mid.block_2 = ResnetBlock(in_channels=block_in,
|
393 |
+
out_channels=block_in,
|
394 |
+
temb_channels=self.temb_ch,
|
395 |
+
dropout=dropout)
|
396 |
+
|
397 |
+
# end
|
398 |
+
self.norm_out = Normalize(block_in)
|
399 |
+
self.conv_out = torch.nn.Conv2d(block_in,
|
400 |
+
2*z_channels if double_z else z_channels,
|
401 |
+
kernel_size=3,
|
402 |
+
stride=1,
|
403 |
+
padding=1)
|
404 |
+
|
405 |
+
|
406 |
+
def forward(self, x):
|
407 |
+
#assert x.shape[2] == x.shape[3] == self.resolution, "{}, {}, {}".format(x.shape[2], x.shape[3], self.resolution)
|
408 |
+
|
409 |
+
# timestep embedding
|
410 |
+
temb = None
|
411 |
+
|
412 |
+
# downsampling
|
413 |
+
hs = [self.conv_in(x)]
|
414 |
+
for i_level in range(self.num_resolutions):
|
415 |
+
for i_block in range(self.num_res_blocks):
|
416 |
+
h = self.down[i_level].block[i_block](hs[-1], temb)
|
417 |
+
if len(self.down[i_level].attn) > 0:
|
418 |
+
h = self.down[i_level].attn[i_block](h)
|
419 |
+
hs.append(h)
|
420 |
+
if i_level != self.num_resolutions-1:
|
421 |
+
hs.append(self.down[i_level].downsample(hs[-1]))
|
422 |
+
|
423 |
+
# middle
|
424 |
+
h = hs[-1]
|
425 |
+
h = self.mid.block_1(h, temb)
|
426 |
+
h = self.mid.attn_1(h)
|
427 |
+
h = self.mid.block_2(h, temb)
|
428 |
+
|
429 |
+
# end
|
430 |
+
h = self.norm_out(h)
|
431 |
+
h = nonlinearity(h)
|
432 |
+
h = self.conv_out(h)
|
433 |
+
return h
|
434 |
+
|
435 |
+
|
436 |
+
class Decoder(nn.Module):
|
437 |
+
def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
|
438 |
+
attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
|
439 |
+
resolution, z_channels, give_pre_end=False, **ignorekwargs):
|
440 |
+
super().__init__()
|
441 |
+
self.ch = ch
|
442 |
+
self.temb_ch = 0
|
443 |
+
self.num_resolutions = len(ch_mult)
|
444 |
+
self.num_res_blocks = num_res_blocks
|
445 |
+
self.resolution = resolution
|
446 |
+
self.in_channels = in_channels
|
447 |
+
self.give_pre_end = give_pre_end
|
448 |
+
|
449 |
+
# compute in_ch_mult, block_in and curr_res at lowest res
|
450 |
+
in_ch_mult = (1,)+tuple(ch_mult)
|
451 |
+
block_in = ch*ch_mult[self.num_resolutions-1]
|
452 |
+
curr_res = resolution // 2**(self.num_resolutions-1)
|
453 |
+
self.z_shape = (1,z_channels,curr_res,curr_res)
|
454 |
+
print("Working with z of shape {} = {} dimensions.".format(
|
455 |
+
self.z_shape, np.prod(self.z_shape)))
|
456 |
+
|
457 |
+
# z to block_in
|
458 |
+
self.conv_in = torch.nn.Conv2d(z_channels,
|
459 |
+
block_in,
|
460 |
+
kernel_size=3,
|
461 |
+
stride=1,
|
462 |
+
padding=1)
|
463 |
+
|
464 |
+
# middle
|
465 |
+
self.mid = nn.Module()
|
466 |
+
self.mid.block_1 = ResnetBlock(in_channels=block_in,
|
467 |
+
out_channels=block_in,
|
468 |
+
temb_channels=self.temb_ch,
|
469 |
+
dropout=dropout)
|
470 |
+
self.mid.attn_1 = AttnBlock(block_in)
|
471 |
+
self.mid.block_2 = ResnetBlock(in_channels=block_in,
|
472 |
+
out_channels=block_in,
|
473 |
+
temb_channels=self.temb_ch,
|
474 |
+
dropout=dropout)
|
475 |
+
|
476 |
+
# upsampling
|
477 |
+
self.up = nn.ModuleList()
|
478 |
+
for i_level in reversed(range(self.num_resolutions)):
|
479 |
+
block = nn.ModuleList()
|
480 |
+
attn = nn.ModuleList()
|
481 |
+
block_out = ch*ch_mult[i_level]
|
482 |
+
for i_block in range(self.num_res_blocks+1):
|
483 |
+
block.append(ResnetBlock(in_channels=block_in,
|
484 |
+
out_channels=block_out,
|
485 |
+
temb_channels=self.temb_ch,
|
486 |
+
dropout=dropout))
|
487 |
+
block_in = block_out
|
488 |
+
if curr_res in attn_resolutions:
|
489 |
+
attn.append(AttnBlock(block_in))
|
490 |
+
up = nn.Module()
|
491 |
+
up.block = block
|
492 |
+
up.attn = attn
|
493 |
+
if i_level != 0:
|
494 |
+
up.upsample = Upsample(block_in, resamp_with_conv)
|
495 |
+
curr_res = curr_res * 2
|
496 |
+
self.up.insert(0, up) # prepend to get consistent order
|
497 |
+
|
498 |
+
# end
|
499 |
+
self.norm_out = Normalize(block_in)
|
500 |
+
self.conv_out = torch.nn.Conv2d(block_in,
|
501 |
+
out_ch,
|
502 |
+
kernel_size=3,
|
503 |
+
stride=1,
|
504 |
+
padding=1)
|
505 |
+
|
506 |
+
def forward(self, z):
|
507 |
+
#assert z.shape[1:] == self.z_shape[1:]
|
508 |
+
self.last_z_shape = z.shape
|
509 |
+
|
510 |
+
# timestep embedding
|
511 |
+
temb = None
|
512 |
+
|
513 |
+
# z to block_in
|
514 |
+
h = self.conv_in(z)
|
515 |
+
|
516 |
+
# middle
|
517 |
+
h = self.mid.block_1(h, temb)
|
518 |
+
h = self.mid.attn_1(h)
|
519 |
+
h = self.mid.block_2(h, temb)
|
520 |
+
|
521 |
+
# upsampling
|
522 |
+
for i_level in reversed(range(self.num_resolutions)):
|
523 |
+
for i_block in range(self.num_res_blocks+1):
|
524 |
+
h = self.up[i_level].block[i_block](h, temb)
|
525 |
+
if len(self.up[i_level].attn) > 0:
|
526 |
+
h = self.up[i_level].attn[i_block](h)
|
527 |
+
if i_level != 0:
|
528 |
+
h = self.up[i_level].upsample(h)
|
529 |
+
|
530 |
+
# end
|
531 |
+
if self.give_pre_end:
|
532 |
+
return h
|
533 |
+
|
534 |
+
h = self.norm_out(h)
|
535 |
+
h = nonlinearity(h)
|
536 |
+
h = self.conv_out(h)
|
537 |
+
return h
|
538 |
+
|
539 |
+
|
540 |
+
class VUNet(nn.Module):
|
541 |
+
def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
|
542 |
+
attn_resolutions, dropout=0.0, resamp_with_conv=True,
|
543 |
+
in_channels, c_channels,
|
544 |
+
resolution, z_channels, use_timestep=False, **ignore_kwargs):
|
545 |
+
super().__init__()
|
546 |
+
self.ch = ch
|
547 |
+
self.temb_ch = self.ch*4
|
548 |
+
self.num_resolutions = len(ch_mult)
|
549 |
+
self.num_res_blocks = num_res_blocks
|
550 |
+
self.resolution = resolution
|
551 |
+
|
552 |
+
self.use_timestep = use_timestep
|
553 |
+
if self.use_timestep:
|
554 |
+
# timestep embedding
|
555 |
+
self.temb = nn.Module()
|
556 |
+
self.temb.dense = nn.ModuleList([
|
557 |
+
torch.nn.Linear(self.ch,
|
558 |
+
self.temb_ch),
|
559 |
+
torch.nn.Linear(self.temb_ch,
|
560 |
+
self.temb_ch),
|
561 |
+
])
|
562 |
+
|
563 |
+
# downsampling
|
564 |
+
self.conv_in = torch.nn.Conv2d(c_channels,
|
565 |
+
self.ch,
|
566 |
+
kernel_size=3,
|
567 |
+
stride=1,
|
568 |
+
padding=1)
|
569 |
+
|
570 |
+
curr_res = resolution
|
571 |
+
in_ch_mult = (1,)+tuple(ch_mult)
|
572 |
+
self.down = nn.ModuleList()
|
573 |
+
for i_level in range(self.num_resolutions):
|
574 |
+
block = nn.ModuleList()
|
575 |
+
attn = nn.ModuleList()
|
576 |
+
block_in = ch*in_ch_mult[i_level]
|
577 |
+
block_out = ch*ch_mult[i_level]
|
578 |
+
for i_block in range(self.num_res_blocks):
|
579 |
+
block.append(ResnetBlock(in_channels=block_in,
|
580 |
+
out_channels=block_out,
|
581 |
+
temb_channels=self.temb_ch,
|
582 |
+
dropout=dropout))
|
583 |
+
block_in = block_out
|
584 |
+
if curr_res in attn_resolutions:
|
585 |
+
attn.append(AttnBlock(block_in))
|
586 |
+
down = nn.Module()
|
587 |
+
down.block = block
|
588 |
+
down.attn = attn
|
589 |
+
if i_level != self.num_resolutions-1:
|
590 |
+
down.downsample = Downsample(block_in, resamp_with_conv)
|
591 |
+
curr_res = curr_res // 2
|
592 |
+
self.down.append(down)
|
593 |
+
|
594 |
+
self.z_in = torch.nn.Conv2d(z_channels,
|
595 |
+
block_in,
|
596 |
+
kernel_size=1,
|
597 |
+
stride=1,
|
598 |
+
padding=0)
|
599 |
+
# middle
|
600 |
+
self.mid = nn.Module()
|
601 |
+
self.mid.block_1 = ResnetBlock(in_channels=2*block_in,
|
602 |
+
out_channels=block_in,
|
603 |
+
temb_channels=self.temb_ch,
|
604 |
+
dropout=dropout)
|
605 |
+
self.mid.attn_1 = AttnBlock(block_in)
|
606 |
+
self.mid.block_2 = ResnetBlock(in_channels=block_in,
|
607 |
+
out_channels=block_in,
|
608 |
+
temb_channels=self.temb_ch,
|
609 |
+
dropout=dropout)
|
610 |
+
|
611 |
+
# upsampling
|
612 |
+
self.up = nn.ModuleList()
|
613 |
+
for i_level in reversed(range(self.num_resolutions)):
|
614 |
+
block = nn.ModuleList()
|
615 |
+
attn = nn.ModuleList()
|
616 |
+
block_out = ch*ch_mult[i_level]
|
617 |
+
skip_in = ch*ch_mult[i_level]
|
618 |
+
for i_block in range(self.num_res_blocks+1):
|
619 |
+
if i_block == self.num_res_blocks:
|
620 |
+
skip_in = ch*in_ch_mult[i_level]
|
621 |
+
block.append(ResnetBlock(in_channels=block_in+skip_in,
|
622 |
+
out_channels=block_out,
|
623 |
+
temb_channels=self.temb_ch,
|
624 |
+
dropout=dropout))
|
625 |
+
block_in = block_out
|
626 |
+
if curr_res in attn_resolutions:
|
627 |
+
attn.append(AttnBlock(block_in))
|
628 |
+
up = nn.Module()
|
629 |
+
up.block = block
|
630 |
+
up.attn = attn
|
631 |
+
if i_level != 0:
|
632 |
+
up.upsample = Upsample(block_in, resamp_with_conv)
|
633 |
+
curr_res = curr_res * 2
|
634 |
+
self.up.insert(0, up) # prepend to get consistent order
|
635 |
+
|
636 |
+
# end
|
637 |
+
self.norm_out = Normalize(block_in)
|
638 |
+
self.conv_out = torch.nn.Conv2d(block_in,
|
639 |
+
out_ch,
|
640 |
+
kernel_size=3,
|
641 |
+
stride=1,
|
642 |
+
padding=1)
|
643 |
+
|
644 |
+
|
645 |
+
def forward(self, x, z):
|
646 |
+
#assert x.shape[2] == x.shape[3] == self.resolution
|
647 |
+
|
648 |
+
if self.use_timestep:
|
649 |
+
# timestep embedding
|
650 |
+
assert t is not None
|
651 |
+
temb = get_timestep_embedding(t, self.ch)
|
652 |
+
temb = self.temb.dense[0](temb)
|
653 |
+
temb = nonlinearity(temb)
|
654 |
+
temb = self.temb.dense[1](temb)
|
655 |
+
else:
|
656 |
+
temb = None
|
657 |
+
|
658 |
+
# downsampling
|
659 |
+
hs = [self.conv_in(x)]
|
660 |
+
for i_level in range(self.num_resolutions):
|
661 |
+
for i_block in range(self.num_res_blocks):
|
662 |
+
h = self.down[i_level].block[i_block](hs[-1], temb)
|
663 |
+
if len(self.down[i_level].attn) > 0:
|
664 |
+
h = self.down[i_level].attn[i_block](h)
|
665 |
+
hs.append(h)
|
666 |
+
if i_level != self.num_resolutions-1:
|
667 |
+
hs.append(self.down[i_level].downsample(hs[-1]))
|
668 |
+
|
669 |
+
# middle
|
670 |
+
h = hs[-1]
|
671 |
+
z = self.z_in(z)
|
672 |
+
h = torch.cat((h,z),dim=1)
|
673 |
+
h = self.mid.block_1(h, temb)
|
674 |
+
h = self.mid.attn_1(h)
|
675 |
+
h = self.mid.block_2(h, temb)
|
676 |
+
|
677 |
+
# upsampling
|
678 |
+
for i_level in reversed(range(self.num_resolutions)):
|
679 |
+
for i_block in range(self.num_res_blocks+1):
|
680 |
+
h = self.up[i_level].block[i_block](
|
681 |
+
torch.cat([h, hs.pop()], dim=1), temb)
|
682 |
+
if len(self.up[i_level].attn) > 0:
|
683 |
+
h = self.up[i_level].attn[i_block](h)
|
684 |
+
if i_level != 0:
|
685 |
+
h = self.up[i_level].upsample(h)
|
686 |
+
|
687 |
+
# end
|
688 |
+
h = self.norm_out(h)
|
689 |
+
h = nonlinearity(h)
|
690 |
+
h = self.conv_out(h)
|
691 |
+
return h
|
692 |
+
|
693 |
+
|
694 |
+
class SimpleDecoder(nn.Module):
|
695 |
+
def __init__(self, in_channels, out_channels, *args, **kwargs):
|
696 |
+
super().__init__()
|
697 |
+
self.model = nn.ModuleList([nn.Conv2d(in_channels, in_channels, 1),
|
698 |
+
ResnetBlock(in_channels=in_channels,
|
699 |
+
out_channels=2 * in_channels,
|
700 |
+
temb_channels=0, dropout=0.0),
|
701 |
+
ResnetBlock(in_channels=2 * in_channels,
|
702 |
+
out_channels=4 * in_channels,
|
703 |
+
temb_channels=0, dropout=0.0),
|
704 |
+
ResnetBlock(in_channels=4 * in_channels,
|
705 |
+
out_channels=2 * in_channels,
|
706 |
+
temb_channels=0, dropout=0.0),
|
707 |
+
nn.Conv2d(2*in_channels, in_channels, 1),
|
708 |
+
Upsample(in_channels, with_conv=True)])
|
709 |
+
# end
|
710 |
+
self.norm_out = Normalize(in_channels)
|
711 |
+
self.conv_out = torch.nn.Conv2d(in_channels,
|
712 |
+
out_channels,
|
713 |
+
kernel_size=3,
|
714 |
+
stride=1,
|
715 |
+
padding=1)
|
716 |
+
|
717 |
+
def forward(self, x):
|
718 |
+
for i, layer in enumerate(self.model):
|
719 |
+
if i in [1,2,3]:
|
720 |
+
x = layer(x, None)
|
721 |
+
else:
|
722 |
+
x = layer(x)
|
723 |
+
|
724 |
+
h = self.norm_out(x)
|
725 |
+
h = nonlinearity(h)
|
726 |
+
x = self.conv_out(h)
|
727 |
+
return x
|
728 |
+
|
729 |
+
|
730 |
+
class UpsampleDecoder(nn.Module):
|
731 |
+
def __init__(self, in_channels, out_channels, ch, num_res_blocks, resolution,
|
732 |
+
ch_mult=(2,2), dropout=0.0):
|
733 |
+
super().__init__()
|
734 |
+
# upsampling
|
735 |
+
self.temb_ch = 0
|
736 |
+
self.num_resolutions = len(ch_mult)
|
737 |
+
self.num_res_blocks = num_res_blocks
|
738 |
+
block_in = in_channels
|
739 |
+
curr_res = resolution // 2 ** (self.num_resolutions - 1)
|
740 |
+
self.res_blocks = nn.ModuleList()
|
741 |
+
self.upsample_blocks = nn.ModuleList()
|
742 |
+
for i_level in range(self.num_resolutions):
|
743 |
+
res_block = []
|
744 |
+
block_out = ch * ch_mult[i_level]
|
745 |
+
for i_block in range(self.num_res_blocks + 1):
|
746 |
+
res_block.append(ResnetBlock(in_channels=block_in,
|
747 |
+
out_channels=block_out,
|
748 |
+
temb_channels=self.temb_ch,
|
749 |
+
dropout=dropout))
|
750 |
+
block_in = block_out
|
751 |
+
self.res_blocks.append(nn.ModuleList(res_block))
|
752 |
+
if i_level != self.num_resolutions - 1:
|
753 |
+
self.upsample_blocks.append(Upsample(block_in, True))
|
754 |
+
curr_res = curr_res * 2
|
755 |
+
|
756 |
+
# end
|
757 |
+
self.norm_out = Normalize(block_in)
|
758 |
+
self.conv_out = torch.nn.Conv2d(block_in,
|
759 |
+
out_channels,
|
760 |
+
kernel_size=3,
|
761 |
+
stride=1,
|
762 |
+
padding=1)
|
763 |
+
|
764 |
+
def forward(self, x):
|
765 |
+
# upsampling
|
766 |
+
h = x
|
767 |
+
for k, i_level in enumerate(range(self.num_resolutions)):
|
768 |
+
for i_block in range(self.num_res_blocks + 1):
|
769 |
+
h = self.res_blocks[i_level][i_block](h, None)
|
770 |
+
if i_level != self.num_resolutions - 1:
|
771 |
+
h = self.upsample_blocks[k](h)
|
772 |
+
h = self.norm_out(h)
|
773 |
+
h = nonlinearity(h)
|
774 |
+
h = self.conv_out(h)
|
775 |
+
return h
|
776 |
+
|
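get_timestep_embedding at the top of this file builds the sinusoidal timestep embedding described in its docstring; a quick standalone shape check, importing the function as defined above:

    import torch
    from taming.modules.diffusionmodules.model import get_timestep_embedding

    t = torch.arange(8)                               # eight integer timesteps
    emb = get_timestep_embedding(t, embedding_dim=64)
    print(emb.shape)                                  # torch.Size([8, 64]): sin and cos halves concatenated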
taming/modules/discriminator/__pycache__/model.cpython-312.pyc
ADDED
Binary file (3.81 kB).
|
|
taming/modules/discriminator/model.py
ADDED
@@ -0,0 +1,67 @@
1 |
+
import functools
|
2 |
+
import torch.nn as nn
|
3 |
+
|
4 |
+
|
5 |
+
from taming.modules.util import ActNorm
|
6 |
+
|
7 |
+
|
8 |
+
def weights_init(m):
|
9 |
+
classname = m.__class__.__name__
|
10 |
+
if classname.find('Conv') != -1:
|
11 |
+
nn.init.normal_(m.weight.data, 0.0, 0.02)
|
12 |
+
elif classname.find('BatchNorm') != -1:
|
13 |
+
nn.init.normal_(m.weight.data, 1.0, 0.02)
|
14 |
+
nn.init.constant_(m.bias.data, 0)
|
15 |
+
|
16 |
+
|
17 |
+
class NLayerDiscriminator(nn.Module):
|
18 |
+
"""Defines a PatchGAN discriminator as in Pix2Pix
|
19 |
+
--> see https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py
|
20 |
+
"""
|
21 |
+
def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False):
|
22 |
+
"""Construct a PatchGAN discriminator
|
23 |
+
Parameters:
|
24 |
+
input_nc (int) -- the number of channels in input images
|
25 |
+
ndf (int) -- the number of filters in the last conv layer
|
26 |
+
n_layers (int) -- the number of conv layers in the discriminator
|
27 |
+
norm_layer -- normalization layer
|
28 |
+
"""
|
29 |
+
super(NLayerDiscriminator, self).__init__()
|
30 |
+
if not use_actnorm:
|
31 |
+
norm_layer = nn.BatchNorm2d
|
32 |
+
else:
|
33 |
+
norm_layer = ActNorm
|
34 |
+
if type(norm_layer) == functools.partial: # no need to use bias as BatchNorm2d has affine parameters
|
35 |
+
use_bias = norm_layer.func != nn.BatchNorm2d
|
36 |
+
else:
|
37 |
+
use_bias = norm_layer != nn.BatchNorm2d
|
38 |
+
|
39 |
+
kw = 4
|
40 |
+
padw = 1
|
41 |
+
sequence = [nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)]
|
42 |
+
nf_mult = 1
|
43 |
+
nf_mult_prev = 1
|
44 |
+
for n in range(1, n_layers): # gradually increase the number of filters
|
45 |
+
nf_mult_prev = nf_mult
|
46 |
+
nf_mult = min(2 ** n, 8)
|
47 |
+
sequence += [
|
48 |
+
nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=2, padding=padw, bias=use_bias),
|
49 |
+
norm_layer(ndf * nf_mult),
|
50 |
+
nn.LeakyReLU(0.2, True)
|
51 |
+
]
|
52 |
+
|
53 |
+
nf_mult_prev = nf_mult
|
54 |
+
nf_mult = min(2 ** n_layers, 8)
|
55 |
+
sequence += [
|
56 |
+
nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=1, padding=padw, bias=use_bias),
|
57 |
+
norm_layer(ndf * nf_mult),
|
58 |
+
nn.LeakyReLU(0.2, True)
|
59 |
+
]
|
60 |
+
|
61 |
+
sequence += [
|
62 |
+
nn.Conv2d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw)] # output 1 channel prediction map
|
63 |
+
self.main = nn.Sequential(*sequence)
|
64 |
+
|
65 |
+
def forward(self, input):
|
66 |
+
"""Standard forward."""
|
67 |
+
return self.main(input)
|
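NLayerDiscriminator is a PatchGAN: rather than a single real/fake score it outputs a grid of per-patch logits. A quick shape check with the default arguments (the input resolution is an arbitrary placeholder):

    import torch
    from taming.modules.discriminator.model import NLayerDiscriminator, weights_init

    disc = NLayerDiscriminator(input_nc=3, ndf=64, n_layers=3).apply(weights_init)
    x = torch.randn(2, 3, 256, 256)
    logits = disc(x)
    print(logits.shape)   # torch.Size([2, 1, 30, 30]): one logit per image patch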
taming/modules/losses/__init__.py
ADDED
@@ -0,0 +1,2 @@
1 |
+
from taming.modules.losses.vqperceptual import DummyLoss
|
2 |
+
|
taming/modules/losses/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (253 Bytes).
|
|