Alberto Carmona committed on
Commit 23d1bbb · 1 Parent(s): ebd4e51

Track error cloning the repo

Files changed (46)
  1. clip/__init__.py +0 -1
  2. clip/bpe_simple_vocab_16e6.txt.gz +0 -3
  3. clip/clip.py +0 -193
  4. clip/model.py +0 -437
  5. clip/simple_tokenizer.py +0 -132
  6. configs/phase1/FineCapEval_clipRN50_mle.yml +0 -60
  7. configs/phase1/clipRN50_mle.yml +0 -52
  8. configs/phase1/transformer.yml +0 -41
  9. configs/phase2/FineCapEval_clipRN50_cider.yml +0 -61
  10. configs/phase2/FineCapEval_clipRN50_cider_clips.yml +0 -65
  11. configs/phase2/FineCapEval_clipRN50_clips.yml +0 -64
  12. configs/phase2/FineCapEval_clipRN50_clips_grammar.yml +0 -64
  13. configs/phase2/clipRN50_cider.yml +0 -58
  14. configs/phase2/clipRN50_cider_clips.yml +0 -61
  15. configs/phase2/clipRN50_clips.yml +0 -58
  16. configs/phase2/clipRN50_clips_grammar.yml +0 -64
  17. configs/phase2/transformer.yml +0 -41
  18. data/README.md +0 -1
  19. retrieval/README.md +0 -5
  20. retrieval/caption_data.py +0 -500
  21. retrieval/clip_model.py +0 -350
  22. retrieval/configs/clip_negative_text.yaml +0 -14
  23. retrieval/param.py +0 -209
  24. retrieval/pth_loader.py +0 -334
  25. retrieval/text_utils.py +0 -74
  26. retrieval/train_pl.py +0 -661
  27. save/README.md +0 -1
  28. scripts/build_bpe_subword_nmt.py +0 -214
  29. scripts/clip_prepro_feats.py +0 -170
  30. scripts/clipscore_prepro_feats.py +0 -162
  31. scripts/copy_model.sh +0 -9
  32. scripts/dump_to_h5df.py +0 -56
  33. scripts/dump_to_lmdb.py +0 -241
  34. scripts/make_bu_data.py +0 -52
  35. scripts/prepro_feats.py +0 -103
  36. scripts/prepro_labels.py +0 -206
  37. scripts/prepro_ngrams.py +0 -94
  38. scripts/prepro_reference_json.py +0 -69
  39. scripts_FineCapEval/clip_prepro_feats.py +0 -163
  40. scripts_FineCapEval/clipscore_prepro_feats.py +0 -154
  41. scripts_FineCapEval/prepro_labels.py +0 -209
  42. tools/eval.py +0 -125
  43. tools/eval_clip_retrieval.py +0 -231
  44. tools/eval_finecapeval.py +0 -204
  45. tools/finecapeval_inference.py +0 -186
  46. tools/train_pl.py +0 -709
clip/__init__.py DELETED
@@ -1 +0,0 @@
- from .clip import *
 
 
clip/bpe_simple_vocab_16e6.txt.gz DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
- size 1356917
 
 
clip/clip.py DELETED
@@ -1,193 +0,0 @@
1
- import hashlib
2
- import os
3
- import urllib
4
- import warnings
5
- from typing import Union, List
6
-
7
- import torch
8
- from PIL import Image
9
- from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
10
- from tqdm import tqdm
11
-
12
- from .model import build_model
13
- from .simple_tokenizer import SimpleTokenizer as _Tokenizer
14
-
15
- __all__ = ["available_models", "load", "tokenize"]
16
- _tokenizer = _Tokenizer()
17
-
18
- _MODELS = {
19
- "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt",
20
- "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt",
21
- "RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt",
22
- "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
23
- }
24
-
25
-
26
- def _download(url: str, root: str = os.path.expanduser("~/.cache/clip")):
27
- os.makedirs(root, exist_ok=True)
28
- filename = os.path.basename(url)
29
-
30
- expected_sha256 = url.split("/")[-2]
31
- download_target = os.path.join(root, filename)
32
-
33
- if os.path.exists(download_target) and not os.path.isfile(download_target):
34
- raise RuntimeError(f"{download_target} exists and is not a regular file")
35
-
36
- if os.path.isfile(download_target):
37
- if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256:
38
- return download_target
39
- else:
40
- warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
41
-
42
- with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
43
- with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True) as loop:
44
- while True:
45
- buffer = source.read(8192)
46
- if not buffer:
47
- break
48
-
49
- output.write(buffer)
50
- loop.update(len(buffer))
51
-
52
- if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
53
- raise RuntimeError(f"Model has been downloaded but the SHA256 checksum does not match")
54
-
55
- return download_target
56
-
57
-
58
- def _transform(n_px):
59
- return Compose([
60
- Resize(n_px, interpolation=Image.BICUBIC),
61
- CenterCrop(n_px),
62
- lambda image: image.convert("RGB"),
63
- ToTensor(),
64
- Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
65
- ])
66
-
67
-
68
- def available_models() -> List[str]:
69
- """Returns the names of available CLIP models"""
70
- return list(_MODELS.keys())
71
-
72
-
73
- def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", jit=True):
74
- """Load a CLIP model
75
-
76
- Parameters
77
- ----------
78
- name : str
79
- A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict
80
-
81
- device : Union[str, torch.device]
82
- The device to put the loaded model
83
-
84
- jit : bool
85
- Whether to load the optimized JIT model (default) or more hackable non-JIT model.
86
-
87
- Returns
88
- -------
89
- model : torch.nn.Module
90
- The CLIP model
91
-
92
- preprocess : Callable[[PIL.Image], torch.Tensor]
93
- A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input
94
- """
95
- if name in _MODELS:
96
- model_path = _download(_MODELS[name])
97
- elif os.path.isfile(name):
98
- model_path = name
99
- else:
100
- raise RuntimeError(f"Model {name} not found; available models = {available_models()}")
101
-
102
- try:
103
- # loading JIT archive
104
- model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval()
105
- state_dict = None
106
- except RuntimeError:
107
- # loading saved state dict
108
- if jit:
109
- warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead")
110
- jit = False
111
- state_dict = torch.load(model_path, map_location="cpu")
112
-
113
- if not jit:
114
- model = build_model(state_dict or model.state_dict()).to(device)
115
- if str(device) == "cpu":
116
- model.float()
117
- return model, _transform(model.visual.input_resolution)
118
-
119
- # patch the device names
120
- device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
121
- device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1]
122
-
123
- def patch_device(module):
124
- graphs = [module.graph] if hasattr(module, "graph") else []
125
- if hasattr(module, "forward1"):
126
- graphs.append(module.forward1.graph)
127
-
128
- for graph in graphs:
129
- for node in graph.findAllNodes("prim::Constant"):
130
- if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"):
131
- node.copyAttributes(device_node)
132
-
133
- model.apply(patch_device)
134
- patch_device(model.encode_image)
135
- patch_device(model.encode_text)
136
-
137
- # patch dtype to float32 on CPU
138
- if str(device) == "cpu":
139
- float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[])
140
- float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
141
- float_node = float_input.node()
142
-
143
- def patch_float(module):
144
- graphs = [module.graph] if hasattr(module, "graph") else []
145
- if hasattr(module, "forward1"):
146
- graphs.append(module.forward1.graph)
147
-
148
- for graph in graphs:
149
- for node in graph.findAllNodes("aten::to"):
150
- inputs = list(node.inputs())
151
- for i in [1, 2]: # dtype can be the second or third argument to aten::to()
152
- if inputs[i].node()["value"] == 5:
153
- inputs[i].node().copyAttributes(float_node)
154
-
155
- model.apply(patch_float)
156
- patch_float(model.encode_image)
157
- patch_float(model.encode_text)
158
-
159
- model.float()
160
-
161
- return model, _transform(model.input_resolution.item())
162
-
163
-
164
- def tokenize(texts: Union[str, List[str]], context_length: int = 77) -> torch.LongTensor:
165
- """
166
- Returns the tokenized representation of given input string(s)
167
-
168
- Parameters
169
- ----------
170
- texts : Union[str, List[str]]
171
- An input string or a list of input strings to tokenize
172
-
173
- context_length : int
174
- The context length to use; all CLIP models use 77 as the context length
175
-
176
- Returns
177
- -------
178
- A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
179
- """
180
- if isinstance(texts, str):
181
- texts = [texts]
182
-
183
- sot_token = _tokenizer.encoder["<|startoftext|>"]
184
- eot_token = _tokenizer.encoder["<|endoftext|>"]
185
- all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts]
186
- result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
187
-
188
- for i, tokens in enumerate(all_tokens):
189
- if len(tokens) > context_length:
190
- raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
191
- result[i, :len(tokens)] = torch.tensor(tokens)
192
-
193
- return result
 
 
clip/model.py DELETED
@@ -1,437 +0,0 @@
1
- from collections import OrderedDict
2
- from typing import Tuple, Union
3
-
4
- import torch
5
- import torch.nn.functional as F
6
- from torch import nn
7
-
8
-
9
- class Bottleneck(nn.Module):
10
- expansion = 4
11
-
12
- def __init__(self, inplanes, planes, stride=1):
13
- super().__init__()
14
-
15
- # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
16
- self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
17
- self.bn1 = nn.BatchNorm2d(planes)
18
-
19
- self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
20
- self.bn2 = nn.BatchNorm2d(planes)
21
-
22
- self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
23
-
24
- self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
25
- self.bn3 = nn.BatchNorm2d(planes * self.expansion)
26
-
27
- self.relu = nn.ReLU(inplace=True)
28
- self.downsample = None
29
- self.stride = stride
30
-
31
- if stride > 1 or inplanes != planes * Bottleneck.expansion:
32
- # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
33
- self.downsample = nn.Sequential(OrderedDict([
34
- ("-1", nn.AvgPool2d(stride)),
35
- ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
36
- ("1", nn.BatchNorm2d(planes * self.expansion))
37
- ]))
38
-
39
- def forward(self, x: torch.Tensor):
40
- identity = x
41
-
42
- out = self.relu(self.bn1(self.conv1(x)))
43
- out = self.relu(self.bn2(self.conv2(out)))
44
- out = self.avgpool(out)
45
- out = self.bn3(self.conv3(out))
46
-
47
- if self.downsample is not None:
48
- identity = self.downsample(x)
49
-
50
- out += identity
51
- out = self.relu(out)
52
- return out
53
-
54
-
55
- class AttentionPool2d(nn.Module):
56
- def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
57
- super().__init__()
58
- self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
59
- self.k_proj = nn.Linear(embed_dim, embed_dim)
60
- self.q_proj = nn.Linear(embed_dim, embed_dim)
61
- self.v_proj = nn.Linear(embed_dim, embed_dim)
62
- self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
63
- self.num_heads = num_heads
64
-
65
- def forward(self, x):
66
- x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1) # NCHW -> (HW)NC
67
- # print(x.shape, self.positional_embedding.shape)
68
- x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
69
- x = x + self.positional_embedding[0, :, None, :].to(x.dtype) # (HW+1)NC
70
- x, _ = F.multi_head_attention_forward(
71
- query=x, key=x, value=x,
72
- embed_dim_to_check=x.shape[-1],
73
- num_heads=self.num_heads,
74
- q_proj_weight=self.q_proj.weight,
75
- k_proj_weight=self.k_proj.weight,
76
- v_proj_weight=self.v_proj.weight,
77
- in_proj_weight=None,
78
- in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
79
- bias_k=None,
80
- bias_v=None,
81
- add_zero_attn=False,
82
- dropout_p=0,
83
- out_proj_weight=torch.ones_like(self.q_proj.weight),
84
- out_proj_bias=torch.zeros_like(self.q_proj.bias),
85
- # out_proj_weight=self.c_proj.weight,
86
- # out_proj_bias=self.c_proj.bias,
87
- use_separate_proj_weight=True,
88
- training=self.training,
89
- need_weights=False
90
- )
91
-
92
- return x[0]
93
-
94
-
95
- class ModifiedResNet(nn.Module):
96
- """
97
- A ResNet class that is similar to torchvision's but contains the following changes:
98
- - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
99
- - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
100
- - The final pooling layer is a QKV attention instead of an average pool
101
- """
102
-
103
- def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
104
- super().__init__()
105
- self.output_dim = output_dim
106
- self.input_resolution = input_resolution
107
-
108
- # the 3-layer stem
109
- self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
110
- self.bn1 = nn.BatchNorm2d(width // 2)
111
- self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
112
- self.bn2 = nn.BatchNorm2d(width // 2)
113
- self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
114
- self.bn3 = nn.BatchNorm2d(width)
115
- self.avgpool = nn.AvgPool2d(2)
116
- self.relu = nn.ReLU(inplace=True)
117
-
118
- # residual layers
119
- self._inplanes = width # this is a *mutable* variable used during construction
120
- self.layer1 = self._make_layer(width, layers[0])
121
- self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
122
- self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
123
- self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
124
-
125
- embed_dim = width * 32 # the ResNet feature dimension
126
- self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)
127
-
128
- def _make_layer(self, planes, blocks, stride=1):
129
- layers = [Bottleneck(self._inplanes, planes, stride)]
130
-
131
- self._inplanes = planes * Bottleneck.expansion
132
- for _ in range(1, blocks):
133
- layers.append(Bottleneck(self._inplanes, planes))
134
-
135
- return nn.Sequential(*layers)
136
-
137
- def forward(self, x):
138
- def stem(x):
139
- for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), (self.conv3, self.bn3)]:
140
- x = self.relu(bn(conv(x)))
141
- x = self.avgpool(x)
142
- return x
143
-
144
- x = x.type(self.conv1.weight.dtype)
145
- x = stem(x)
146
- x = self.layer1(x)
147
- x = self.layer2(x)
148
- x = self.layer3(x)
149
- x = self.layer4(x)
150
- # print(x.shape)
151
- # x = self.attnpool(x)
152
- attnpool = self.attnpool(x)
153
-
154
- return (x, attnpool)
155
-
156
-
157
- class LayerNorm(nn.LayerNorm):
158
- """Subclass torch's LayerNorm to handle fp16."""
159
-
160
- def forward(self, x: torch.Tensor):
161
- orig_type = x.dtype
162
- ret = super().forward(x.type(torch.float32))
163
- return ret.type(orig_type)
164
-
165
-
166
- class QuickGELU(nn.Module):
167
- def forward(self, x: torch.Tensor):
168
- return x * torch.sigmoid(1.702 * x)
169
-
170
-
171
- class ResidualAttentionBlock(nn.Module):
172
- def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
173
- super().__init__()
174
-
175
- self.attn = nn.MultiheadAttention(d_model, n_head)
176
- self.ln_1 = LayerNorm(d_model)
177
- self.mlp = nn.Sequential(OrderedDict([
178
- ("c_fc", nn.Linear(d_model, d_model * 4)),
179
- ("gelu", QuickGELU()),
180
- ("c_proj", nn.Linear(d_model * 4, d_model))
181
- ]))
182
- self.ln_2 = LayerNorm(d_model)
183
- self.attn_mask = attn_mask
184
-
185
- def attention(self, x: torch.Tensor):
186
- self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
187
- return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
188
-
189
- def forward(self, x: torch.Tensor):
190
- x = x + self.attention(self.ln_1(x))
191
- x = x + self.mlp(self.ln_2(x))
192
- return x
193
-
194
-
195
- class Transformer(nn.Module):
196
- def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None):
197
- super().__init__()
198
- self.width = width
199
- self.layers = layers
200
- self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])
201
-
202
- def forward(self, x: torch.Tensor):
203
- return self.resblocks(x)
204
-
205
-
206
- class VisualTransformer(nn.Module):
207
- def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int):
208
- super().__init__()
209
- self.input_resolution = input_resolution
210
- self.output_dim = output_dim
211
- self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
212
-
213
- scale = width ** -0.5
214
- self.class_embedding = nn.Parameter(scale * torch.randn(width))
215
- self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
216
- self.ln_pre = LayerNorm(width)
217
-
218
- self.transformer = Transformer(width, layers, heads)
219
-
220
- self.ln_post = LayerNorm(width)
221
- self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
222
-
223
- def forward(self, x: torch.Tensor):
224
- x = self.conv1(x) # shape = [*, width, grid, grid]
225
- x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
226
- x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
227
- x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width]
228
- x = x + self.positional_embedding.to(x.dtype)
229
- x = self.ln_pre(x)
230
-
231
- x = x.permute(1, 0, 2) # NLD -> LND
232
- x = self.transformer(x)
233
- x = x.permute(1, 0, 2) # LND -> NLD
234
-
235
- # x = self.ln_post(x[:, 0, :])
236
-
237
- x = self.ln_post(x)
238
- # if self.proj is not None:
239
- # x = x @ self.proj
240
-
241
- return x
242
-
243
-
244
- class CLIP(nn.Module):
245
- def __init__(self,
246
- embed_dim: int,
247
- # vision
248
- image_resolution: int,
249
- vision_layers: Union[Tuple[int, int, int, int], int],
250
- vision_width: int,
251
- vision_patch_size: int,
252
- # text
253
- context_length: int,
254
- vocab_size: int,
255
- transformer_width: int,
256
- transformer_heads: int,
257
- transformer_layers: int
258
- ):
259
- super().__init__()
260
-
261
- self.context_length = context_length
262
-
263
- if isinstance(vision_layers, (tuple, list)):
264
- vision_heads = vision_width * 32 // 64
265
- self.visual = ModifiedResNet(
266
- layers=vision_layers,
267
- output_dim=embed_dim,
268
- heads=vision_heads,
269
- input_resolution=image_resolution,
270
- width=vision_width
271
- )
272
- else:
273
- vision_heads = vision_width // 64
274
- self.visual = VisualTransformer(
275
- input_resolution=image_resolution,
276
- patch_size=vision_patch_size,
277
- width=vision_width,
278
- layers=vision_layers,
279
- heads=vision_heads,
280
- output_dim=embed_dim
281
- )
282
-
283
- self.transformer = Transformer(
284
- width=transformer_width,
285
- layers=transformer_layers,
286
- heads=transformer_heads,
287
- attn_mask=self.build_attention_mask()
288
- )
289
-
290
- self.vocab_size = vocab_size
291
- self.token_embedding = nn.Embedding(vocab_size, transformer_width)
292
- self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
293
- self.ln_final = LayerNorm(transformer_width)
294
-
295
- self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
296
- self.logit_scale = nn.Parameter(torch.ones([]))
297
-
298
- self.initialize_parameters()
299
-
300
- def initialize_parameters(self):
301
- nn.init.normal_(self.token_embedding.weight, std=0.02)
302
- nn.init.normal_(self.positional_embedding, std=0.01)
303
-
304
- if isinstance(self.visual, ModifiedResNet):
305
- if self.visual.attnpool is not None:
306
- std = self.visual.attnpool.c_proj.in_features ** -0.5
307
- nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
308
- nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
309
- nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
310
- nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
311
-
312
- for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
313
- for name, param in resnet_block.named_parameters():
314
- if name.endswith("bn3.weight"):
315
- nn.init.zeros_(param)
316
-
317
- proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
318
- attn_std = self.transformer.width ** -0.5
319
- fc_std = (2 * self.transformer.width) ** -0.5
320
- for block in self.transformer.resblocks:
321
- nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
322
- nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
323
- nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
324
- nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
325
-
326
- if self.text_projection is not None:
327
- nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
328
-
329
- def build_attention_mask(self):
330
- # lazily create causal attention mask, with full attention between the vision tokens
331
- # pytorch uses additive attention mask; fill with -inf
332
- mask = torch.empty(self.context_length, self.context_length)
333
- mask.fill_(float("-inf"))
334
- mask.triu_(1) # zero out the lower diagonal
335
- return mask
336
-
337
- @property
338
- def dtype(self):
339
- return self.visual.conv1.weight.dtype
340
-
341
- def encode_image(self, image):
342
- return self.visual(image.type(self.dtype))
343
-
344
- def encode_text(self, text):
345
- x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model]
346
-
347
- x = x + self.positional_embedding.type(self.dtype)
348
- x = x.permute(1, 0, 2) # NLD -> LND
349
- x = self.transformer(x)
350
- x = x.permute(1, 0, 2) # LND -> NLD
351
- x = self.ln_final(x).type(self.dtype)
352
-
353
- # x.shape = [batch_size, n_ctx, transformer.width]
354
- # take features from the eot embedding (eot_token is the highest number in each sequence)
355
- x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
356
-
357
- return x
358
-
359
- def forward(self, image, text):
360
- image_features = self.encode_image(image)
361
- text_features = self.encode_text(text)
362
-
363
- # normalized features
364
- image_features = image_features / image_features.norm(dim=-1, keepdim=True)
365
- text_features = text_features / text_features.norm(dim=-1, keepdim=True)
366
-
367
- # cosine similarity as logits
368
- logit_scale = self.logit_scale.exp()
369
- logits_per_image = logit_scale * image_features @ text_features.t()
370
- logits_per_text = logit_scale * text_features @ image_features.t()
371
-
372
- # shape = [global_batch_size, global_batch_size]
373
- return logits_per_image, logits_per_text
374
-
375
-
376
- def convert_weights(model: nn.Module):
377
- """Convert applicable model parameters to fp16"""
378
-
379
- def _convert_weights_to_fp16(l):
380
- if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
381
- l.weight.data = l.weight.data.half()
382
- if l.bias is not None:
383
- l.bias.data = l.bias.data.half()
384
-
385
- if isinstance(l, nn.MultiheadAttention):
386
- for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
387
- tensor = getattr(l, attr)
388
- if tensor is not None:
389
- tensor.data = tensor.data.half()
390
-
391
- for name in ["text_projection", "proj"]:
392
- if hasattr(l, name):
393
- attr = getattr(l, name)
394
- if attr is not None:
395
- attr.data = attr.data.half()
396
-
397
- model.apply(_convert_weights_to_fp16)
398
-
399
-
400
- def build_model(state_dict: dict):
401
- vit = "visual.proj" in state_dict
402
-
403
- if vit:
404
- vision_width = state_dict["visual.conv1.weight"].shape[0]
405
- vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
406
- vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
407
- grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
408
- image_resolution = vision_patch_size * grid_size
409
- else:
410
- counts: list = [len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
411
- vision_layers = tuple(counts)
412
- vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
413
- output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
414
- vision_patch_size = None
415
- assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
416
- image_resolution = output_width * 32
417
-
418
- embed_dim = state_dict["text_projection"].shape[1]
419
- context_length = state_dict["positional_embedding"].shape[0]
420
- vocab_size = state_dict["token_embedding.weight"].shape[0]
421
- transformer_width = state_dict["ln_final.weight"].shape[0]
422
- transformer_heads = transformer_width // 64
423
- transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith(f"transformer.resblocks")))
424
-
425
- model = CLIP(
426
- embed_dim,
427
- image_resolution, vision_layers, vision_width, vision_patch_size,
428
- context_length, vocab_size, transformer_width, transformer_heads, transformer_layers
429
- )
430
-
431
- for key in ["input_resolution", "context_length", "vocab_size"]:
432
- if key in state_dict:
433
- del state_dict[key]
434
-
435
- convert_weights(model)
436
- model.load_state_dict(state_dict)
437
- return model.eval()
 
 
clip/simple_tokenizer.py DELETED
@@ -1,132 +0,0 @@
1
- import gzip
2
- import html
3
- import os
4
- from functools import lru_cache
5
-
6
- import ftfy
7
- import regex as re
8
-
9
-
10
- @lru_cache()
11
- def default_bpe():
12
- return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")
13
-
14
-
15
- @lru_cache()
16
- def bytes_to_unicode():
17
- """
18
- Returns list of utf-8 byte and a corresponding list of unicode strings.
19
- The reversible bpe codes work on unicode strings.
20
- This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
21
- When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
22
- This is a significant percentage of your normal, say, 32K bpe vocab.
23
- To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
24
- And avoids mapping to whitespace/control characters the bpe code barfs on.
25
- """
26
- bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
27
- cs = bs[:]
28
- n = 0
29
- for b in range(2**8):
30
- if b not in bs:
31
- bs.append(b)
32
- cs.append(2**8+n)
33
- n += 1
34
- cs = [chr(n) for n in cs]
35
- return dict(zip(bs, cs))
36
-
37
-
38
- def get_pairs(word):
39
- """Return set of symbol pairs in a word.
40
- Word is represented as tuple of symbols (symbols being variable-length strings).
41
- """
42
- pairs = set()
43
- prev_char = word[0]
44
- for char in word[1:]:
45
- pairs.add((prev_char, char))
46
- prev_char = char
47
- return pairs
48
-
49
-
50
- def basic_clean(text):
51
- text = ftfy.fix_text(text)
52
- text = html.unescape(html.unescape(text))
53
- return text.strip()
54
-
55
-
56
- def whitespace_clean(text):
57
- text = re.sub(r'\s+', ' ', text)
58
- text = text.strip()
59
- return text
60
-
61
-
62
- class SimpleTokenizer(object):
63
- def __init__(self, bpe_path: str = default_bpe()):
64
- self.byte_encoder = bytes_to_unicode()
65
- self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
66
- merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
67
- merges = merges[1:49152-256-2+1]
68
- merges = [tuple(merge.split()) for merge in merges]
69
- vocab = list(bytes_to_unicode().values())
70
- vocab = vocab + [v+'</w>' for v in vocab]
71
- for merge in merges:
72
- vocab.append(''.join(merge))
73
- vocab.extend(['<|startoftext|>', '<|endoftext|>'])
74
- self.encoder = dict(zip(vocab, range(len(vocab))))
75
- self.decoder = {v: k for k, v in self.encoder.items()}
76
- self.bpe_ranks = dict(zip(merges, range(len(merges))))
77
- self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
78
- self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
79
-
80
- def bpe(self, token):
81
- if token in self.cache:
82
- return self.cache[token]
83
- word = tuple(token[:-1]) + ( token[-1] + '</w>',)
84
- pairs = get_pairs(word)
85
-
86
- if not pairs:
87
- return token+'</w>'
88
-
89
- while True:
90
- bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
91
- if bigram not in self.bpe_ranks:
92
- break
93
- first, second = bigram
94
- new_word = []
95
- i = 0
96
- while i < len(word):
97
- try:
98
- j = word.index(first, i)
99
- new_word.extend(word[i:j])
100
- i = j
101
- except:
102
- new_word.extend(word[i:])
103
- break
104
-
105
- if word[i] == first and i < len(word)-1 and word[i+1] == second:
106
- new_word.append(first+second)
107
- i += 2
108
- else:
109
- new_word.append(word[i])
110
- i += 1
111
- new_word = tuple(new_word)
112
- word = new_word
113
- if len(word) == 1:
114
- break
115
- else:
116
- pairs = get_pairs(word)
117
- word = ' '.join(word)
118
- self.cache[token] = word
119
- return word
120
-
121
- def encode(self, text):
122
- bpe_tokens = []
123
- text = whitespace_clean(basic_clean(text)).lower()
124
- for token in re.findall(self.pat, text):
125
- token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
126
- bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
127
- return bpe_tokens
128
-
129
- def decode(self, tokens):
130
- text = ''.join([self.decoder[token] for token in tokens])
131
- text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
132
- return text
 
 
configs/phase1/FineCapEval_clipRN50_mle.yml DELETED
@@ -1,60 +0,0 @@
1
- caption_model: transformer
2
- noamopt: true
3
- noamopt_warmup: 20000
4
- label_smoothing: 0.0
5
- input_json: data/FineCapEval.json
6
- input_label_h5: none
7
- input_fc_dir: data/FineCapEval_clip_RN50_fc
8
- input_att_dir: data/FineCapEval_clip_RN50_att
9
- input_clipscore_vis_dir: data/FineCapEval_clipscore_vis
10
-
11
- seq_per_img: 5
12
- batch_size: 200
13
- learning_rate: 0.0005
14
-
15
- checkpoint_path: ./save/clipRN50_mle/clipRN50_mle
16
-
17
- # clip_load_path: '/scratch-space/retrieval/save/clip_negative_text/clip_negative_text-epoch=10.ckpt'
18
-
19
- # Notice: because I'm too lazy, I reuse the option name for RNNs to set the hyperparameters for transformer:
20
- # N=num_layers
21
- # d_model=input_encoding_size
22
- # d_ff=rnn_size
23
-
24
- # will be ignored
25
- num_layers: 6
26
- input_encoding_size: 512
27
- rnn_size: 2048
28
-
29
- # Transformer config
30
- N_enc: 6
31
- N_dec: 6
32
- d_model: 512
33
- d_ff: 2048
34
- num_att_heads: 8
35
- dropout: 0.1
36
-
37
-
38
- learning_rate_decay_start: 0
39
- scheduled_sampling_start: -1
40
- save_checkpoint_every: 3000
41
- language_eval: 1
42
- val_images_use: 5000
43
- max_epochs: 15
44
- train_sample_n: 5
45
-
46
- REFORWARD: false
47
-
48
- # _BASE_: transformer.yml
49
- reduce_on_plateau: false
50
- noamopt: false
51
- learning_rate: 0.000005
52
- learning_rate_decay_start: -1
53
-
54
- self_critical_after: 15
55
- max_epochs: 50
56
-
57
- verbose: false
58
- precision: 32
59
-
60
- use_clipscore: false
 
 
configs/phase1/clipRN50_mle.yml DELETED
@@ -1,52 +0,0 @@
1
- caption_model: transformer
2
- noamopt: true
3
- # noamopt: false
4
- noamopt_warmup: 20000
5
- label_smoothing: 0.0
6
- input_json: data/cocotalk.json
7
- input_label_h5: data/cocotalk_label.h5
8
- input_fc_dir: data/cocotalk_clip_RN50_fc
9
- input_att_dir: data/cocotalk_clip_RN50_att
10
- input_clipscore_vis_dir: data/cocotalk_clipscore_vis
11
- seq_per_img: 5
12
- # batch_size: 600
13
- batch_size: 200
14
-
15
- learning_rate: 0.0005
16
-
17
- # checkpoint_path: ./save/trans_clip_rn50_sc_pl
18
- checkpoint_path: save/clipRN50_mle/clipRN50_mle
19
-
20
- # Notice: because I'm too lazy, I reuse the option name for RNNs to set the hyperparameters for transformer:
21
- # N=num_layers
22
- # d_model=input_encoding_size
23
- # d_ff=rnn_size
24
-
25
- # will be ignored
26
- num_layers: 6
27
- input_encoding_size: 512
28
- rnn_size: 2048
29
-
30
- # Transformer config
31
- N_enc: 6
32
- N_dec: 6
33
- d_model: 512
34
- d_ff: 2048
35
- num_att_heads: 8
36
- dropout: 0.1
37
-
38
-
39
- learning_rate_decay_start: 0
40
- scheduled_sampling_start: -1
41
- save_checkpoint_every: 3000
42
- language_eval: 1
43
- val_images_use: 5000
44
- # max_epochs: 15
45
- max_epochs: 25
46
- train_sample_n: 5
47
-
48
- REFORWARD: false
49
-
50
-
51
- verbose: false
52
- precision: 16
 
 
configs/phase1/transformer.yml DELETED
@@ -1,41 +0,0 @@
1
- caption_model: transformer
2
- noamopt: true
3
- noamopt_warmup: 20000
4
- label_smoothing: 0.0
5
- input_json: data/cocotalk.json
6
- input_label_h5: data/cocotalk_label.h5
7
- input_att_dir: data/cocotalk_att
8
- seq_per_img: 5
9
- batch_size: 10
10
- learning_rate: 0.0005
11
-
12
- checkpoint_path: ./save/trans_rn50_sc
13
-
14
- # Notice: because I'm to lazy, I reuse the option name for RNNs to set the hyperparameters for transformer:
15
- # N=num_layers
16
- # d_model=input_encoding_size
17
- # d_ff=rnn_size
18
-
19
- # will be ignored
20
- num_layers: 6
21
- input_encoding_size: 512
22
- rnn_size: 2048
23
-
24
- # Transformer config
25
- N_enc: 6
26
- N_dec: 6
27
- d_model: 512
28
- d_ff: 2048
29
- num_att_heads: 8
30
- dropout: 0.1
31
-
32
-
33
- learning_rate_decay_start: 0
34
- scheduled_sampling_start: -1
35
- save_checkpoint_every: 3000
36
- language_eval: 1
37
- val_images_use: 5000
38
- max_epochs: 15
39
- train_sample_n: 5
40
-
41
- REFORWARD: false
 
 
configs/phase2/FineCapEval_clipRN50_cider.yml DELETED
@@ -1,61 +0,0 @@
1
- caption_model: transformer
2
- noamopt: true
3
- noamopt_warmup: 20000
4
- label_smoothing: 0.0
5
- input_json: data/FineCapEval.json
6
- input_label_h5: none
7
- input_fc_dir: data/FineCapEval_clip_RN50_fc
8
- input_att_dir: data/FineCapEval_clip_RN50_att
9
- input_clipscore_vis_dir: data/FineCapEval_clipscore_vis
10
-
11
- seq_per_img: 5
12
- batch_size: 200
13
- learning_rate: 0.0005
14
-
15
- checkpoint_path: ./save/clipRN50_cider/clipRN50_cider
16
-
17
- # clip_load_path: '/scratch-space/retrieval/save/clip_negative_text/clip_negative_text-epoch=10.ckpt'
18
-
19
- # Notice: because I'm to lazy, I reuse the option name for RNNs to set the hyperparameters for transformer:
20
- # N=num_layers
21
- # d_model=input_encoding_size
22
- # d_ff=rnn_size
23
-
24
- # will be ignored
25
- num_layers: 6
26
- input_encoding_size: 512
27
- rnn_size: 2048
28
-
29
- # Transformer config
30
- N_enc: 6
31
- N_dec: 6
32
- d_model: 512
33
- d_ff: 2048
34
- num_att_heads: 8
35
- dropout: 0.1
36
-
37
-
38
- learning_rate_decay_start: 0
39
- scheduled_sampling_start: -1
40
- save_checkpoint_every: 3000
41
- language_eval: 1
42
- val_images_use: 5000
43
- max_epochs: 15
44
- train_sample_n: 5
45
-
46
- REFORWARD: false
47
-
48
- # _BASE_: transformer.yml
49
- reduce_on_plateau: false
50
- noamopt: false
51
- learning_rate: 0.000005
52
- learning_rate_decay_start: -1
53
-
54
- self_critical_after: 15
55
- max_epochs: 50
56
-
57
- verbose: false
58
- precision: 32
59
-
60
- # use_clipscore: true
61
- use_clipscore: false
 
 
configs/phase2/FineCapEval_clipRN50_cider_clips.yml DELETED
@@ -1,65 +0,0 @@
1
- caption_model: transformer
2
- noamopt: true
3
- noamopt_warmup: 20000
4
- label_smoothing: 0.0
5
- input_json: data/FineCapEval.json
6
- input_label_h5: none
7
- input_fc_dir: data/FineCapEval_clip_RN50_fc
8
- input_att_dir: data/FineCapEval_clip_RN50_att
9
- input_clipscore_vis_dir: data/FineCapEval_clipscore_vis
10
-
11
- seq_per_img: 5
12
- batch_size: 200
13
- learning_rate: 0.0005
14
-
15
- checkpoint_path: ./save/clipRN50_cider_clips/clipRN50_cider_clips
16
-
17
- # clip_load_path: '/scratch-space/retrieval/save/clip_negative_text/clip_negative_text-epoch=10.ckpt'
18
-
19
- # Notice: because I'm to lazy, I reuse the option name for RNNs to set the hyperparameters for transformer:
20
- # N=num_layers
21
- # d_model=input_encoding_size
22
- # d_ff=rnn_size
23
-
24
- # will be ignored
25
- num_layers: 6
26
- input_encoding_size: 512
27
- rnn_size: 2048
28
-
29
- # Transformer config
30
- N_enc: 6
31
- N_dec: 6
32
- d_model: 512
33
- d_ff: 2048
34
- num_att_heads: 8
35
- dropout: 0.1
36
-
37
-
38
- learning_rate_decay_start: 0
39
- scheduled_sampling_start: -1
40
- save_checkpoint_every: 3000
41
- language_eval: 1
42
- val_images_use: 5000
43
- max_epochs: 15
44
- train_sample_n: 5
45
-
46
- REFORWARD: false
47
-
48
- # _BASE_: transformer.yml
49
- reduce_on_plateau: false
50
- noamopt: false
51
- learning_rate: 0.000005
52
- learning_rate_decay_start: -1
53
-
54
- self_critical_after: 15
55
- max_epochs: 50
56
-
57
- verbose: false
58
- precision: 32
59
-
60
- # use_clipscore: true
61
- use_clipscore: false
62
- clipscore_reward_weight: 2.0
63
- clipscore_mode: clip_s
64
-
65
- use_multi_rewards: true
 
 
configs/phase2/FineCapEval_clipRN50_clips.yml DELETED
@@ -1,64 +0,0 @@
1
- caption_model: transformer
2
- noamopt: true
3
- noamopt_warmup: 20000
4
- label_smoothing: 0.0
5
- input_json: data/FineCapEval.json
6
- input_label_h5: none
7
- input_fc_dir: data/FineCapEval_clip_RN50_fc
8
- input_att_dir: data/FineCapEval_clip_RN50_att
9
- input_clipscore_vis_dir: data/FineCapEval_clipscore_vis
10
- seq_per_img: 5
11
- batch_size: 160
12
- learning_rate: 0.0005
13
-
14
- checkpoint_path: ./save/clipRN50_clips/clipRN50_clips
15
-
16
- use_multi_rewards: false
17
- use_grammar: false
18
- use_grammar_baseline: false
19
- # clip_load_path: '/scratch-space/retrieval/save/clip_negative_text/clip_negative_text-epoch=10.ckpt'
20
-
21
- # Notice: because I'm to lazy, I reuse the option name for RNNs to set the hyperparameters for transformer:
22
- # N=num_layers
23
- # d_model=input_encoding_size
24
- # d_ff=rnn_size
25
-
26
- # will be ignored
27
- num_layers: 6
28
- input_encoding_size: 512
29
- rnn_size: 2048
30
-
31
- # Transformer config
32
- N_enc: 6
33
- N_dec: 6
34
- d_model: 512
35
- d_ff: 2048
36
- num_att_heads: 8
37
- dropout: 0.1
38
-
39
-
40
- learning_rate_decay_start: 0
41
- scheduled_sampling_start: -1
42
- save_checkpoint_every: 3000
43
- language_eval: 0
44
- val_images_use: 5000
45
- max_epochs: 15
46
- train_sample_n: 5
47
-
48
- REFORWARD: false
49
-
50
- # _BASE_: transformer.yml
51
- reduce_on_plateau: false
52
- noamopt: false
53
- learning_rate: 0.000005
54
- learning_rate_decay_start: -1
55
-
56
- self_critical_after: 15
57
- max_epochs: 50
58
-
59
- verbose: false
60
- precision: 32
61
-
62
- # use_clipscore: true
63
- use_clipscore: false
64
- clipscore_reward_weight: 2.0
 
 
configs/phase2/FineCapEval_clipRN50_clips_grammar.yml DELETED
@@ -1,64 +0,0 @@
1
- caption_model: transformer
2
- noamopt: true
3
- noamopt_warmup: 20000
4
- label_smoothing: 0.0
5
- input_json: data/FineCapEval.json
6
- input_label_h5: none
7
- input_fc_dir: data/FineCapEval_clip_RN50_fc
8
- input_att_dir: data/FineCapEval_clip_RN50_att
9
- input_clipscore_vis_dir: data/FineCapEval_clipscore_vis
10
- seq_per_img: 5
11
- batch_size: 160
12
- learning_rate: 0.0005
13
-
14
- checkpoint_path: ./save/clipRN50_clips_grammar/clipRN50_clips_grammar
15
-
16
- use_multi_rewards: true
17
- use_grammar: true
18
- use_grammar_baseline: true
19
- # clip_load_path: '/scratch-space/retrieval/save/clip_negative_text/clip_negative_text-epoch=10.ckpt'
20
-
21
- # Notice: because I'm too lazy, I reuse the option name for RNNs to set the hyperparameters for transformer:
22
- # N=num_layers
23
- # d_model=input_encoding_size
24
- # d_ff=rnn_size
25
-
26
- # will be ignored
27
- num_layers: 6
28
- input_encoding_size: 512
29
- rnn_size: 2048
30
-
31
- # Transformer config
32
- N_enc: 6
33
- N_dec: 6
34
- d_model: 512
35
- d_ff: 2048
36
- num_att_heads: 8
37
- dropout: 0.1
38
-
39
-
40
- learning_rate_decay_start: 0
41
- scheduled_sampling_start: -1
42
- save_checkpoint_every: 3000
43
- language_eval: 0
44
- val_images_use: 5000
45
- max_epochs: 15
46
- train_sample_n: 5
47
-
48
- REFORWARD: false
49
-
50
- # _BASE_: transformer.yml
51
- reduce_on_plateau: false
52
- noamopt: false
53
- learning_rate: 0.000005
54
- learning_rate_decay_start: -1
55
-
56
- self_critical_after: 15
57
- max_epochs: 50
58
-
59
- verbose: false
60
- precision: 32
61
-
62
- # use_clipscore: true
63
- use_clipscore: false
64
- clipscore_reward_weight: 2.0
 
 
configs/phase2/clipRN50_cider.yml DELETED
@@ -1,58 +0,0 @@
1
- caption_model: transformer
2
- noamopt: true
3
- noamopt_warmup: 20000
4
- label_smoothing: 0.0
5
- input_json: data/cocotalk.json
6
- input_label_h5: data/cocotalk_label.h5
7
- input_fc_dir: data/cocotalk_clip_RN50_fc
8
- input_att_dir: data/cocotalk_clip_RN50_att
9
- # used only for evaluation
10
- input_clipscore_vis_dir: data/cocotalk_clipscore_vis
11
-
12
- seq_per_img: 5
13
- batch_size: 200
14
- learning_rate: 0.0005
15
-
16
- # checkpoint_path: ./save/trans_clip_rn50_sc_pl_scst_cider
17
- checkpoint_path: save/clipRN50_cider/clipRN50_cider
18
-
19
- # Notice: because I'm to lazy, I reuse the option name for RNNs to set the hyperparameters for transformer:
20
- # N=num_layers
21
- # d_model=input_encoding_size
22
- # d_ff=rnn_size
23
-
24
- # will be ignored
25
- num_layers: 6
26
- input_encoding_size: 512
27
- rnn_size: 2048
28
-
29
- # Transformer config
30
- N_enc: 6
31
- N_dec: 6
32
- d_model: 512
33
- d_ff: 2048
34
- num_att_heads: 8
35
- dropout: 0.1
36
-
37
-
38
- learning_rate_decay_start: 0
39
- scheduled_sampling_start: -1
40
- save_checkpoint_every: 3000
41
- language_eval: 1
42
- val_images_use: 5000
43
- max_epochs: 15
44
- train_sample_n: 5
45
-
46
- REFORWARD: false
47
-
48
- # _BASE_: transformer.yml
49
- reduce_on_plateau: false
50
- noamopt: false
51
- learning_rate: 0.000005
52
- learning_rate_decay_start: -1
53
-
54
- self_critical_after: 15
55
- max_epochs: 40
56
-
57
- verbose: false
58
- precision: 32
 
 
configs/phase2/clipRN50_cider_clips.yml DELETED
@@ -1,61 +0,0 @@
1
- caption_model: transformer
2
- noamopt: true
3
- noamopt_warmup: 20000
4
- label_smoothing: 0.0
5
- input_json: data/cocotalk.json
6
- input_label_h5: data/cocotalk_label.h5
7
- input_fc_dir: data/cocotalk_clip_RN50_fc
8
- input_att_dir: data/cocotalk_clip_RN50_att
9
- input_clipscore_vis_dir: data/cocotalk_clipscore_vis
10
- seq_per_img: 5
11
- batch_size: 160
12
- learning_rate: 0.0005
13
-
14
- checkpoint_path: save/clipRN50_cider_clips/clipRN50_cider_clips
15
-
16
- # Notice: because I'm too lazy, I reuse the option name for RNNs to set the hyperparameters for transformer:
17
- # N=num_layers
18
- # d_model=input_encoding_size
19
- # d_ff=rnn_size
20
-
21
- # will be ignored
22
- num_layers: 6
23
- input_encoding_size: 512
24
- rnn_size: 2048
25
-
26
- # Transformer config
27
- N_enc: 6
28
- N_dec: 6
29
- d_model: 512
30
- d_ff: 2048
31
- num_att_heads: 8
32
- dropout: 0.1
33
-
34
-
35
- learning_rate_decay_start: 0
36
- scheduled_sampling_start: -1
37
- save_checkpoint_every: 3000
38
- language_eval: 1
39
- val_images_use: 5000
40
- max_epochs: 15
41
- train_sample_n: 5
42
-
43
- REFORWARD: false
44
-
45
- # _BASE_: transformer.yml
46
- reduce_on_plateau: false
47
- noamopt: false
48
- learning_rate: 0.000005
49
- learning_rate_decay_start: -1
50
-
51
- self_critical_after: 15
52
- max_epochs: 40
53
-
54
- verbose: false
55
- precision: 32
56
-
57
- use_clipscore: true
58
- clipscore_reward_weight: 2.0
59
- clipscore_mode: clip_s
60
-
61
- use_multi_rewards: true
 
 
configs/phase2/clipRN50_clips.yml DELETED
@@ -1,58 +0,0 @@
1
- caption_model: transformer
2
- noamopt: true
3
- noamopt_warmup: 20000
4
- label_smoothing: 0.0
5
- input_json: data/cocotalk.json
6
- input_label_h5: data/cocotalk_label.h5
7
- input_fc_dir: data/cocotalk_clip_RN50_fc
8
- input_att_dir: data/cocotalk_clip_RN50_att
9
- input_clipscore_vis_dir: data/cocotalk_clipscore_vis
10
- seq_per_img: 5
11
- batch_size: 160
12
- learning_rate: 0.0005
13
-
14
- checkpoint_path: save/clipRN50_clips/clipRN50_clips
15
-
16
- # Notice: because I'm too lazy, I reuse the option name for RNNs to set the hyperparameters for transformer:
17
- # N=num_layers
18
- # d_model=input_encoding_size
19
- # d_ff=rnn_size
20
-
21
- # will be ignored
22
- num_layers: 6
23
- input_encoding_size: 512
24
- rnn_size: 2048
25
-
26
- # Transformer config
27
- N_enc: 6
28
- N_dec: 6
29
- d_model: 512
30
- d_ff: 2048
31
- num_att_heads: 8
32
- dropout: 0.1
33
-
34
-
35
- learning_rate_decay_start: 0
36
- scheduled_sampling_start: -1
37
- save_checkpoint_every: 3000
38
- language_eval: 1
39
- val_images_use: 5000
40
- max_epochs: 15
41
- train_sample_n: 5
42
-
43
- REFORWARD: false
44
-
45
- # _BASE_: transformer.yml
46
- reduce_on_plateau: false
47
- noamopt: false
48
- learning_rate: 0.000005
49
- learning_rate_decay_start: -1
50
-
51
- self_critical_after: 15
52
- max_epochs: 40
53
-
54
- verbose: false
55
- precision: 32
56
-
57
- use_clipscore: true
58
- clipscore_reward_weight: 2.0
 
 
configs/phase2/clipRN50_clips_grammar.yml DELETED
@@ -1,64 +0,0 @@
1
- caption_model: transformer
2
- noamopt: true
3
- noamopt_warmup: 20000
4
- label_smoothing: 0.0
5
- input_json: data/cocotalk.json
6
- input_label_h5: data/cocotalk_label.h5
7
- input_fc_dir: data/cocotalk_clip_RN50_fc
8
- input_att_dir: data/cocotalk_clip_RN50_att
9
- input_clipscore_vis_dir: data/cocotalk_clipscore_vis
10
- seq_per_img: 5
11
- batch_size: 160
12
- learning_rate: 0.0005
13
-
14
- checkpoint_path: save/clipRN50_clips_grammar/clipRN50_clips_grammar
15
-
16
- use_multi_rewards: true
17
- use_grammar: true
18
- use_grammar_baseline: true
19
- # clip_load_path: '/scratch-space/retrieval/save/clip_negative_text/clip_negative_text-epoch=10.ckpt'
20
- clip_load_path: 'retrieval/save/clip_negative_text/clip_negative_text-epoch=12.ckpt'
21
-
22
- # Notice: because I'm too lazy, I reuse the option name for RNNs to set the hyperparameters for transformer:
23
- # N=num_layers
24
- # d_model=input_encoding_size
25
- # d_ff=rnn_size
26
-
27
- # will be ignored
28
- num_layers: 6
29
- input_encoding_size: 512
30
- rnn_size: 2048
31
-
32
- # Transformer config
33
- N_enc: 6
34
- N_dec: 6
35
- d_model: 512
36
- d_ff: 2048
37
- num_att_heads: 8
38
- dropout: 0.1
39
-
40
-
41
- learning_rate_decay_start: 0
42
- scheduled_sampling_start: -1
43
- save_checkpoint_every: 3000
44
- language_eval: 1
45
- val_images_use: 5000
46
- max_epochs: 15
47
- train_sample_n: 5
48
-
49
- REFORWARD: false
50
-
51
- # _BASE_: transformer.yml
52
- reduce_on_plateau: false
53
- noamopt: false
54
- learning_rate: 0.000005
55
- learning_rate_decay_start: -1
56
-
57
- self_critical_after: 15
58
- max_epochs: 40
59
-
60
- verbose: false
61
- precision: 32
62
-
63
- use_clipscore: true
64
- clipscore_reward_weight: 2.0
 
 
configs/phase2/transformer.yml DELETED
@@ -1,41 +0,0 @@
1
- caption_model: transformer
2
- noamopt: true
3
- noamopt_warmup: 20000
4
- label_smoothing: 0.0
5
- input_json: data/cocotalk.json
6
- input_label_h5: data/cocotalk_label.h5
7
- input_att_dir: data/cocotalk_att
8
- seq_per_img: 5
9
- batch_size: 10
10
- learning_rate: 0.0005
11
-
12
- checkpoint_path: ./save/trans_rn50_sc
13
-
14
- # Notice: because I'm too lazy, I reuse the option name for RNNs to set the hyperparameters for transformer:
15
- # N=num_layers
16
- # d_model=input_encoding_size
17
- # d_ff=rnn_size
18
-
19
- # will be ignored
20
- num_layers: 6
21
- input_encoding_size: 512
22
- rnn_size: 2048
23
-
24
- # Transformer config
25
- N_enc: 6
26
- N_dec: 6
27
- d_model: 512
28
- d_ff: 2048
29
- num_att_heads: 8
30
- dropout: 0.1
31
-
32
-
33
- learning_rate_decay_start: 0
34
- scheduled_sampling_start: -1
35
- save_checkpoint_every: 3000
36
- language_eval: 1
37
- val_images_use: 5000
38
- max_epochs: 15
39
- train_sample_n: 5
40
-
41
- REFORWARD: false
 
 
data/README.md DELETED
@@ -1 +0,0 @@
- directory to store preprocessed files
 
 
retrieval/README.md DELETED
@@ -1,5 +0,0 @@
- # Finetuning CLIP reward model
-
- ```bash
- python train_pl.py --cfg clip_negative_text --id clip_negative_text
- ```
 
 
retrieval/caption_data.py DELETED
@@ -1,500 +0,0 @@
1
- from torch.utils.data import DataLoader, Dataset, Sampler
2
- from pathlib import Path
3
- import json
4
- from multiprocessing import Pool
5
- from tqdm import tqdm
6
- from PIL import Image
7
- import random
8
- import numpy as np
9
- import torch
10
- import torchvision
11
- import torchvision.transforms as T
12
-
13
- from torch.utils.data.distributed import DistributedSampler
14
-
15
- from transformers import T5Tokenizer, BertTokenizer, BertTokenizerFast, CLIPTokenizer
16
-
17
- import text_utils
18
-
19
- project_dir = Path(__file__).parent.resolve()
20
- workspace_dir = project_dir.parent.parent
21
- dataset_dir = workspace_dir.joinpath('datasets/').resolve()
22
- # coco_dir = dataset_dir.joinpath('COCO')
23
- # vg_dir = dataset_dir.joinpath('VG')
24
- coco_img_dir = dataset_dir.joinpath('COCO/images/')
25
- coco_data_dir = project_dir.parent.joinpath('CLIP-ViL/CLIP-ViL-Direct/caption/data/')
26
- # coco_feature_dir = coco_dir.joinpath('features')
27
-
28
-
29
- class COCORetrievalDataset(Dataset):
30
- def __init__(self, split='karpathy_train', rank=-1, topk=-1, verbose=True, args=None, mode='train'):
31
- super().__init__()
32
-
33
- self.topk = topk
34
- self.verbose = verbose
35
- self.args = args
36
- self.rank = rank
37
- self.mode = mode
38
-
39
- # Loading datasets to data
40
- self.source = split
41
- if self.verbose:
42
- print('Data source: ', self.source)
43
-
44
- # if self.args.tokenizer is None:
45
- # self.args.tokenizer = self.args.decoder_backbone
46
-
47
- # if 'bert' in self.args.tokenizer:
48
- # self.tokenizer = BertTokenizerFast.from_pretrained(
49
- # self.args.tokenizer,
50
- # # max_length=self.args.max_text_length,
51
- # # do_lower_case=self.args.do_lower_case
52
- # )
53
- # elif 'clip' in self.args.tokenizer:
54
- # self.tokenizer = CLIPTokenizer.from_pretrained(
55
- # self.args.tokenizer,
56
- # # max_length=self.args.max_text_length,
57
- # # do_lower_case=self.args.do_lower_case
58
- # )
59
-
60
- self.tokenizer = CLIPTokenizer.from_pretrained(
61
- self.args.tokenizer,
62
- # max_length=self.args.max_text_length,
63
- # do_lower_case=self.args.do_lower_case
64
- )
65
-
66
- with open(coco_data_dir.joinpath('cocotalk.json')) as f:
67
- self.vocab = list(json.load(f)['ix_to_word'].values())
68
- popped = self.vocab.pop(-1)
69
- assert popped == 'UNK'
70
- if self.verbose:
71
- print('vocab size: ', len(self.vocab))
72
-
73
-
74
- data_info_path = coco_data_dir.joinpath('dataset_coco.json')
75
- with open(data_info_path) as f:
76
- karpathy_data = json.load(f)
77
-
78
- split_rename = {
79
- 'train': 'train',
80
- 'restval': 'train',
81
- 'val': 'val',
82
- 'test': 'test'
83
- }
84
-
85
- n_images = 0
86
-
87
- data = []
88
- # self.vocab = set()
89
- for datum in karpathy_data['images']:
90
- re_split = split_rename[datum['split']]
91
-
92
- # if re_split == 'train':
93
- # for d in datum['sentences']:
94
- # self.vocab = self.vocab.union(set(d['tokens']))
95
-
96
- if re_split != self.source.split('_')[-1]:
97
- continue
98
-
99
- if re_split == 'train':
100
- # for d in datum['sentences']:
101
- # img_id = datum['filename'].split('.')[0]
102
- # new_datum = {
103
- # 'filename': datum['filename'],
104
- # 'img_id': img_id,
105
- # 'sent': d['raw'].strip(),
106
- # 'targets': [d['raw'].strip() for d in datum['sentences']],
107
- # 'is_train': True,
108
- # 'cocoid': datum['cocoid']
109
- # }
110
- # data.append(new_datum)
111
- img_id = datum['filename'].split('.')[0]
112
- new_datum = {
113
- 'filename': datum['filename'],
114
- 'img_id': img_id,
115
- # 'sent': d['raw'],
116
- # 'targets': [d['raw'].strip() for d in datum['sentences']],
117
- 'targets': [" ".join(d['tokens']) for d in datum['sentences']],
118
- 'is_train': True,
119
- 'cocoid': datum['cocoid']
120
- }
121
- data.append(new_datum)
122
-
123
- else:
124
- img_id = datum['filename'].split('.')[0]
125
- new_datum = {
126
- 'filename': datum['filename'],
127
- 'img_id': img_id,
128
- # 'sent': d['raw'],
129
- # 'targets': [d['raw'].strip() for d in datum['sentences']],
130
- 'targets': [" ".join(d['tokens']) for d in datum['sentences']],
131
- 'is_train': False,
132
- 'cocoid': datum['cocoid']
133
- }
134
- data.append(new_datum)
135
-
136
- n_images += 1
137
-
138
- if self.verbose:
139
- print(f"{self.source} has {n_images} images")
140
- # print(f"Loaded {len(data)} data from", split)
141
-
142
- self.n_gpus = torch.cuda.device_count()
143
-
144
- if self.topk > 0:
145
- data = data[:self.topk]
146
- if self.verbose:
147
- print(f"Use only {self.topk} data")
148
-
149
- self.data = data
150
-
151
- # if self.verbose:
152
- # print("# all sentences:", len(self.data))
153
-
154
- if self.args.load_feat:
155
- # feat_dir = coco_dir.joinpath(''
156
- # self.feat_loader = HybridLoader('/scratch-space/CLIP-ViL/CLIP-ViL-Direct/caption/data/cocotalk_clipscore_vis', ext='.npy', in_memory=False)
157
- self.feat_loader = HybridLoader(
158
- coco_data_dir.joinpath('cocotalk_clipscore_vis'),
159
- ext='.npy', in_memory=False)
160
- else:
161
- if 'openai/clip' in self.args.encoder_backbone:
162
- # from transformers import CLIPProcessor
163
- # self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32",
164
- # size=args.image_size,
165
- # do_resize=True,
166
- # do_center_crop=False,
167
- # )
168
- # self.img_transform = lambda image: self.processor.feature_extractor(
169
- # image,
170
- # return_tensors='pt')['pixel_values'][0]
171
-
172
- self.image_mean = [0.48145466, 0.4578275, 0.40821073]
173
- self.image_std = [0.26862954, 0.26130258, 0.27577711]
174
-
175
- # captioning
176
- # self.img_transform = T.Compose([
177
- # T.Resize((self.args.image_size, self.args.image_size))
178
- # ])
179
-
180
- # retrieval
181
- self.img_transform = T.Compose([
182
- T.Resize(self.args.image_size, interpolation=T.functional.InterpolationMode.BICUBIC),
183
- T.CenterCrop(self.args.image_size)
184
- ])
185
-
186
- self.img_tensor_transform = T.Compose([
187
- # T.RandomCrop(224),
188
- # T.RandomHorizontalFlip(p=0.3),
189
- T.ConvertImageDtype(torch.float),
190
- T.Normalize(self.image_mean, self.image_std)
191
- ]
192
- )
193
- # elif 'google/vit' in self.args.encoder_backbone:
194
- # self.image_mean = [0.5, 0.5, 0.5]
195
- # self.image_std = [0.5, 0.5, 0.5]
196
-
197
- # self.img_transform = T.Compose([
198
- # # T.PILToTensor(),
199
- # T.Resize((self.args.image_size, self.args.image_size))
200
- # ])
201
-
202
- # self.img_tensor_transform = T.Compose([
203
- # # T.RandomCrop(224),
204
- # # T.RandomHorizontalFlip(p=0.3),
205
- # T.ConvertImageDtype(torch.float),
206
- # T.Normalize(self.image_mean, self.image_std)
207
- # ]
208
- # )
209
-
210
- def get_negative_text(self, text):
211
- neg_type = random.choice(['repeat', 'remove', 'insert', 'swap', 'shuffle'])
212
-
213
- if neg_type == 'repeat':
214
- text = text_utils.repeat(text)
215
- elif neg_type == 'remove':
216
- text = text_utils.remove(text)
217
- elif neg_type == 'insert':
218
- text = text_utils.insert(text, self.vocab)
219
- elif neg_type == 'swap':
220
- text = text_utils.swap(text, self.vocab)
221
- elif neg_type == 'shuffle':
222
- text = text_utils.shuffle(text)
223
-
224
- return text, neg_type
225
-
226
- def __len__(self):
227
- return len(self.data)
228
-
229
- def __getitem__(self, idx):
230
- datum = self.data[idx]
231
- return self.process_datum(datum)
232
-
233
- def process_datum(self, datum):
234
- out_dict = {}
235
-
236
- ###### Image ######
237
-
238
- if self.args.load_feat:
239
- cocoid = datum['cocoid']
240
- out_dict['cocoid'] = str(cocoid)
241
- img_feat = self.feat_loader.get(str(cocoid))
242
- out_dict['img_feat'] = torch.from_numpy(img_feat)
243
-
244
- else:
245
- img_id = datum['img_id']
246
- out_dict['img_id'] = img_id
247
-
248
- if 'train' in datum['filename']:
249
- img_split = 'train2014'
250
- elif 'val' in datum['filename']:
251
- img_split = 'val2014'
252
- img_path = coco_img_dir.joinpath(img_split).joinpath(datum['filename']).with_suffix('.jpg')
253
- assert img_path.exists()
254
- img_path = str(img_path)
255
- out_dict['img_path'] = img_path
256
-
257
- img_tensor = torchvision.io.read_image(img_path)
258
- # out_dict['img_tensor'] = img
259
-
260
- # img = Image.open(img_path).convert('RGB')
261
- # img_tensor = torch.as_tensor(np.asarray(img))
262
- out_dict['img_tensor'] = self.img_transform(img_tensor)
263
- # self.img_transform(img_tensor)
264
- # out_dict['img_tensor'] = self.img_transform(img)
265
-
266
- ###### Text #####
267
- # if datum['is_train']:
268
- # sent = datum['sent'].strip()
269
-
270
- sent = random.choice(datum['targets'])
271
-
272
- # target_ids = self.tokenizer.encode(
273
- # sent, max_length=self.args.gen_max_length, truncation=True)
274
-
275
- # assert len(target_ids) <= self.args.gen_max_length, len(target_ids)
276
- out_dict['sent'] = sent
277
- # out_dict['target_ids'] = torch.LongTensor(target_ids)
278
- # out_dict['target_length'] = len(target_ids)
279
-
280
-
281
- # negative sample
282
- neg_sent, neg_type = self.get_negative_text(sent)
283
-
284
- # neg_target_ids = self.tokenizer.encode(
285
- # neg_sent, max_length=self.args.gen_max_length, truncation=True)
286
-
287
- # assert len(neg_target_ids) <= self.args.gen_max_length, len(neg_target_ids)
288
- out_dict['neg_sent'] = neg_sent
289
- out_dict['neg_type'] = neg_type
290
- # out_dict['neg_target_ids'] = torch.LongTensor(neg_target_ids)
291
- # out_dict['neg_target_length'] = len(neg_target_ids)
292
-
293
-
294
- if 'targets' in datum:
295
- out_dict['targets'] = datum['targets']
296
-
297
- return out_dict
298
-
299
- def collate_fn(self, batch):
300
- batch_entry = {}
301
-
302
- B = len(batch)
303
-
304
- # if 'target_ids' in batch[0]:
305
- # T_W_L = max(entry['target_length'] for entry in batch)
306
- # target_ids = torch.ones(
307
- # B, T_W_L, dtype=torch.long) * self.tokenizer.pad_token_id
308
-
309
- # if 'target_ids' in batch[0]:
310
- # T_W_L = max(entry['target_length'] for entry in batch)
311
- # target_ids = torch.ones(
312
- # B, T_W_L, dtype=torch.long) * self.tokenizer.pad_token_id
313
-
314
-
315
-
316
- targets = []
317
- img_ids = []
318
- img_paths = []
319
-
320
- coco_ids = []
321
-
322
- if self.args.load_feat:
323
- img_feats = torch.zeros(B, 512, dtype=torch.float)
324
- else:
325
- # imgs = []
326
- img_tensor = torch.zeros(B, 3, self.args.image_size, self.args.image_size, dtype=torch.uint8)
327
-
328
- for i, entry in enumerate(batch):
329
-
330
- if self.args.load_feat:
331
- coco_ids.append(entry['cocoid'])
332
- img_feats[i] = entry['img_feat']
333
-
334
- else:
335
-
336
- img_ids.append(entry['img_id'])
337
- img_paths.append(entry['img_path'])
338
- img_tensor[i] = entry['img_tensor']
339
-
340
- # if 'target_ids' in entry:
341
- # target_ids[i, :entry['target_length']] = entry['target_ids']
342
-
343
- if 'targets' in entry:
344
- targets.append(entry['targets'])
345
-
346
- if 'sent' in batch[0]:
347
- # word_mask = target_ids != self.tokenizer.pad_token_id
348
- # target_ids[~word_mask] = -100
349
- # batch_entry['target_ids'] = target_ids
350
-
351
- tokenized = self.tokenizer([entry['sent'] for entry in batch], truncation=True, padding=True, return_tensors='pt')
352
- neg_tokenized = self.tokenizer([entry['neg_sent'] for entry in batch], truncation=True, padding=True, return_tensors='pt')
353
- # sent, max_length=self.args.gen_max_length, truncation=True)
354
-
355
- batch_entry['text'] = (tokenized.input_ids, tokenized.attention_mask)
356
- batch_entry['neg_text'] = (neg_tokenized.input_ids, neg_tokenized.attention_mask)
357
-
358
-
359
- if self.args.load_feat:
360
- batch_entry['coco_ids'] = coco_ids
361
- batch_entry['img_feats'] = img_feats
362
-
363
- else:
364
-
365
- img_tensor = self.img_tensor_transform(img_tensor)
366
-
367
- batch_entry['img_id'] = img_ids
368
- batch_entry['img_paths'] = img_paths
369
- batch_entry['img_tensor'] = img_tensor
370
-
371
- batch_entry['targets'] = targets
372
-
373
- # print('batch created')
374
-
375
- # batch_entry['task'] = 'caption'
376
-
377
- return batch_entry
378
-
379
-
380
- # def get_loader(args, split='karpathy_train', mode='train',
381
- # batch_size=32, workers=4, distributed=False, gpu=0,
382
- # topk=-1):
383
-
384
- # verbose = (gpu == 0)
385
-
386
- # dataset = COCORetrievalDataset(
387
- # split,
388
- # rank=gpu,
389
- # topk=topk,
390
- # verbose=verbose,
391
- # args=args,
392
- # mode=mode)
393
-
394
- # # if distributed:
395
- # # sampler = DistributedSampler(dataset)
396
- # # else:
397
- # # sampler = None
398
-
399
- # if mode == 'train':
400
- # loader = DataLoader(
401
- # dataset, batch_size=batch_size, shuffle=(sampler is None),
402
- # num_workers=workers, pin_memory=True, sampler=sampler,
403
- # collate_fn=dataset.collate_fn)
404
- # else:
405
- # loader = DataLoader(
406
- # dataset,
407
- # batch_size=batch_size, shuffle=False,
408
- # num_workers=workers, pin_memory=True,
409
- # sampler=sampler,
410
- # collate_fn=dataset.collate_fn,
411
- # drop_last=False)
412
-
413
- # # if verbose:
414
- # # loader.evaluator = COCOCaptionEvaluator()
415
-
416
- # # loader.task = 'caption'
417
-
418
- # return loader
419
-
420
-
421
- # class COCOCaptionEvaluator:
422
- # def __init__(self):
423
- # import language_evaluation
424
- # self.evaluator = language_evaluation.CocoEvaluator(verbose=False)
425
-
426
- # def evaluate(self, predicts, answers):
427
-
428
- # results = self.evaluator.run_evaluation(predicts, answers)
429
-
430
- # return results
431
-
432
- import six
433
- import os
434
- import h5py
435
-
436
- class HybridLoader:
437
- """
438
- If db_path is a directory, then use normal file loading
439
- If lmdb, then load from lmdb
440
- The loading method depends on the extension.
441
-
442
- in_memory: if in_memory is True, we save all the features in memory
443
- For individual np(y|z)s, we don't need to do that because the system will do this for us.
444
- Should be useful for lmdb or h5.
445
- (Copied this idea from vilbert)
446
- """
447
-
448
- def __init__(self, db_path, ext='.npy', in_memory=False):
449
- self.db_path = db_path
450
- self.ext = ext
451
- if self.ext == '.npy':
452
- self.loader = lambda x: np.load(six.BytesIO(x))
453
- else:
454
- self.loader = lambda x: np.load(six.BytesIO(x))['feat']
455
- # if db_path.endswith('.lmdb'):
456
- # self.db_type = 'lmdb'
457
- # self.lmdb = lmdbdict(db_path, unsafe=True)
458
- # self.lmdb._key_dumps = DUMPS_FUNC['ascii']
459
- # self.lmdb._value_loads = LOADS_FUNC['identity']
460
- # elif db_path.endswith('.pth'): # Assume a key,value dictionary
461
- # self.db_type = 'pth'
462
- # self.feat_file = torch.load(db_path)
463
- # self.loader = lambda x: x
464
- # print('HybridLoader: ext is ignored')
465
- # elif db_path.endswith('h5'):
466
- # self.db_type = 'h5'
467
- # self.loader = lambda x: np.array(x).astype('float32')
468
- # else:
469
- # self.db_type = 'dir'
470
-
471
- self.in_memory = in_memory
472
- if self.in_memory:
473
- self.features = {}
474
-
475
- def get(self, key):
476
-
477
- # if self.in_memory and key in self.features:
478
- # # We save f_input because we want to save the
479
- # # compressed bytes to save memory
480
- # f_input = self.features[key]
481
- # elif self.db_type == 'lmdb':
482
- # f_input = self.lmdb[key]
483
- # elif self.db_type == 'pth':
484
- # f_input = self.feat_file[key]
485
- # elif self.db_type == 'h5':
486
- # f_input = h5py.File(self.db_path, 'r')[key]
487
- # else:
488
- # f_input = open(os.path.join(
489
- # self.db_path, key + self.ext), 'rb').read()
490
-
491
- f_input = open(os.path.join(
492
- self.db_path, key + self.ext), 'rb').read()
493
-
494
- if self.in_memory and key not in self.features:
495
- self.features[key] = f_input
496
-
497
- # load image
498
- feat = self.loader(f_input)
499
-
500
- return feat
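
Editor's note: the deleted COCORetrievalDataset above pairs each COCO image (or its precomputed CLIP visual feature) with one reference caption and one automatically corrupted negative caption. Below is a minimal usage sketch, not code from the repository; it assumes the COCO annotation and feature files referenced above exist on disk, and the hypothetical `Args` stand-in only carries the attributes the class actually reads.

# --- editor's sketch (not part of the diff) ---------------------------------
import torch
from torch.utils.data import DataLoader

class Args:                                   # hypothetical minimal config object
    load_feat = True                          # read precomputed CLIP features (.npy)
    tokenizer = 'openai/clip-vit-base-patch32'
    image_size = 224
    encoder_backbone = 'openai/clip-vit-base-patch32'

dataset = COCORetrievalDataset(split='karpathy_val', args=Args(), mode='val', verbose=True)
loader = DataLoader(dataset, batch_size=8, shuffle=False, collate_fn=dataset.collate_fn)

batch = next(iter(loader))
input_ids, attention_mask = batch['text']     # tokenized positive captions
neg_ids, neg_mask = batch['neg_text']         # tokenized corrupted captions
print(batch['img_feats'].shape)               # (8, 512) when load_feat is True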
 
retrieval/clip_model.py DELETED
@@ -1,350 +0,0 @@
1
- from transformers import CLIPModel, CLIPTokenizer
2
- import os
3
- import json
4
- import argparse
5
- from random import shuffle, seed
6
- import string
7
- # non-standard dependencies:
8
- import h5py
9
- from six.moves import cPickle
10
- import numpy as np
11
- import torch
12
- import torchvision.models as models
13
- import skimage.io
14
-
15
- from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
16
- from PIL import Image
17
- from torch import nn
18
-
19
-
20
- class CLIPScore(nn.Module):
21
- def __init__(self, clipscore_w=2.5, image_size=224, mode='clip_s', use_grammar=False, joint_out=False):
22
- super(CLIPScore, self).__init__()
23
- # from transformers import CLIPModel, CLIPTokenizer
24
- self.clip_model = CLIPModel.from_pretrained(
25
- 'openai/clip-vit-base-patch32')
26
- self.tokenizer = CLIPTokenizer.from_pretrained(
27
- 'openai/clip-vit-base-patch32')
28
-
29
- self.clip_model.eval()
30
-
31
- self.clipscore_w = clipscore_w
32
-
33
- self.image_transform = self._transform(image_size)
34
-
35
- self.mode = mode
36
- assert mode in ['clip_s', 'refclip_s']
37
-
38
- self.use_grammar = use_grammar
39
- self.joint_out = joint_out
40
-
41
- if self.use_grammar and self.joint_out is False:
42
- self.grammar_score_head = nn.Sequential(
43
- nn.Linear(self.clip_model.text_embed_dim, self.clip_model.projection_dim, bias=False),
44
- nn.ReLU(),
45
- nn.Linear(self.clip_model.projection_dim, 2, bias=False)
46
- )
47
-
48
- def _transform(self, n_px):
49
- return Compose([
50
- Resize(n_px, interpolation=Image.BICUBIC),
51
- CenterCrop(n_px),
52
- lambda image: image.convert("RGB"),
53
- ToTensor(),
54
- Normalize((0.48145466, 0.4578275, 0.40821073),
55
- (0.26862954, 0.26130258, 0.27577711)),
56
- ])
57
-
58
- def load_image(self, image_path):
59
- image = Image.open(image_path)
60
- return image
61
-
62
- # @torch.no_grad()
63
- def image_extract(self, image):
64
- if isinstance(image, str):
65
- image = self.load_image(image)
66
- if not isinstance(image, torch.Tensor):
67
- image = self.image_transform(image)
68
-
69
- img_tensor = image.view(-1, 3, 224, 224)
70
- device = next(self.clip_model.parameters()).device
71
- img_tensor = img_tensor.to(device)
72
-
73
- clip_model = self.clip_model
74
-
75
- img_feat = clip_model.vision_model(img_tensor).pooler_output
76
- img_feat = clip_model.visual_projection(img_feat)
77
- img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
78
-
79
- return img_feat
80
-
81
- # @torch.no_grad()
82
- def text_extract(self, text, prompt="A photo depicts", proj_norm=True):
83
- if isinstance(text, str):
84
- text_batch = [" ".join([prompt, text])]
85
- elif isinstance(text, list):
86
- text_batch = [" ".join([prompt, txt]) for txt in text]
87
-
88
- if isinstance(text, tuple) and isinstance(text[0], torch.Tensor):
89
- input_ids, attention_mask = text
90
- else:
91
- input_text = text_batch
92
-
93
- tokenized = self.tokenizer(
94
- input_text, return_tensors='pt', padding=True)
95
-
96
- input_ids = tokenized.input_ids
97
- attention_mask = tokenized.attention_mask
98
-
99
- clip_model = self.clip_model
100
- device = next(self.clip_model.parameters()).device
101
- input_ids = input_ids.to(device)
102
- attention_mask = attention_mask.to(device)
103
-
104
- text_feat = clip_model.text_model(input_ids, attention_mask).pooler_output
105
-
106
- if proj_norm:
107
- text_feat = clip_model.text_projection(text_feat)
108
- text_feat = text_feat / text_feat.norm(dim=-1, keepdim=True)
109
-
110
- return text_feat
111
-
112
- # @torch.no_grad()
113
- def calc_clip_s(self, img_feat, text_feat):
114
- return self.clipscore_w * torch.relu((img_feat * text_feat).sum(dim=-1))
115
-
116
- # @torch.no_grad()
117
- def calc_refclip_s(self, img_feat=None, text_feat=None, ref_text_feat=None, ref_text_mask=None, clip_s=None):
118
-
119
- if clip_s is None:
120
- clip_s = self.calc_clip_s(img_feat, text_feat)
121
-
122
- B, dim = img_feat.size()
123
-
124
- ref_text_feat = ref_text_feat.view(B, -1, dim)
125
-
126
- K = ref_text_feat.size(1)
127
-
128
- text_feat = text_feat.view(B, 1, dim).expand(-1, K, -1)
129
- assert ref_text_feat.size() == text_feat.size(
130
- ), (ref_text_feat.size(), text_feat.size())
131
-
132
- ref_score = self.calc_clip_s(text_feat, ref_text_feat)
133
- if ref_text_mask is not None:
134
- if not isinstance(ref_text_mask, torch.Tensor):
135
- ref_text_mask = torch.tensor(
136
- ref_text_mask, dtype=ref_score.dtype, device=ref_score.device)
137
- ref_score = ref_score.view(B, K) * ref_text_mask.view(B, K)
138
-
139
- ref_score = ref_score.view(B, K).max(dim=1).values
140
-
141
- assert clip_s.size() == (B,)
142
- assert clip_s.size() == ref_score.size()
143
-
144
- # harmonic mean
145
- refclip_s = 2 / (1 / clip_s + 1 / ref_score)
146
- return refclip_s
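
Editor's note: calc_refclip_s scores a candidate caption by the harmonic mean of its CLIP-S (image-text similarity) and its best similarity against the reference captions. A self-contained sketch of that final combination with dummy unit-norm features (shapes and the clamp are illustrative only; real CLIP features give positive similarities):

# --- editor's sketch (not part of the diff) ---------------------------------
import torch
import torch.nn.functional as F

B, K, d, w = 4, 5, 512, 2.5                          # batch, #refs, feat dim, clipscore_w
img  = F.normalize(torch.randn(B, d), dim=-1)
txt  = F.normalize(torch.randn(B, d), dim=-1)
refs = F.normalize(torch.randn(B, K, d), dim=-1)

clip_s    = (w * torch.relu((img * txt).sum(-1))).clamp(min=1e-8)
ref_score = (w * torch.relu((txt.unsqueeze(1) * refs).sum(-1))).max(dim=1).values.clamp(min=1e-8)
refclip_s = 2 / (1 / clip_s + 1 / ref_score)         # harmonic mean, as in calc_refclip_s
print(refclip_s.shape)                               # torch.Size([4])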
147
-
148
- # # @torch.no_grad()
149
- # def forward(self,
150
- # images=None, text=None,
151
- # img_feat=None, text_feat=None,
152
- # ref_text=None, ref_text_feat=None, ref_text_mask=None,
153
- # prompt="A photo depicts",
154
- # mode=None):
155
- # if img_feat is None:
156
- # img_feat = self.image_extract(images)
157
- # img_feat = img_feat.view(-1, 512)
158
-
159
- # if text_feat is None:
160
- # text_feat = self.text_extract(text, prompt=prompt)
161
- # text_feat = text_feat.view(-1, 512)
162
-
163
- # if mode is None:
164
- # mode = self.mode
165
- # assert mode in ['clip_s', 'refclip_s']
166
-
167
- # if mode == 'clip_s':
168
- # clip_s = self.calc_clip_s(img_feat, text_feat)
169
- # return clip_s
170
- # elif mode == 'refclip_s':
171
- # if ref_text_feat is None:
172
- # ref_text_feat = self.text_extract(ref_text, prompt=prompt)
173
- # ref_text_feat = ref_text_feat.view(-1, 512)
174
-
175
- # refclip_s = self.calc_refclip_s(
176
- # img_feat, text_feat, ref_text_feat, ref_text_mask=ref_text_mask)
177
- # return refclip_s
178
-
179
-
180
- def train_step(self,
181
- images=None, text=None,
182
- img_feat=None, text_feat=None,
183
- neg_text=None, neg_text_feat=None,
184
- # ref_text=None, ref_text_feat=None, ref_text_mask=None,
185
- prompt="A photo depicts",
186
- # return_loss=True,
187
- **kwargs):
188
-
189
- if img_feat is None:
190
- img_feat = self.image_extract(images)
191
- img_feat = img_feat.view(-1, 512)
192
-
193
- B = img_feat.size(0)
194
-
195
- if self.joint_out:
196
- pos_text_feat = self.text_extract(text, prompt=prompt, proj_norm=False).view(B, 512)
197
- neg_text_feat = self.text_extract(neg_text, prompt=prompt, proj_norm=False).view(-1, 512)
198
- neg_B = neg_text_feat.size(0)
199
-
200
- # [B+neg_B, 512]
201
- text_feat = torch.cat([pos_text_feat, neg_text_feat], dim=0)
202
-
203
- text_cont_feat = self.clip_model.text_projection(text_feat)
204
- text_cont_feat = text_cont_feat / text_cont_feat.norm(dim=-1, keepdim=True)
205
-
206
- text_cont_feat = text_cont_feat.view(B+neg_B, 512)
207
-
208
- logit_scale = self.clip_model.logit_scale.exp()
209
-
210
- # [B+neg_B * B]
211
- logits_per_text = torch.matmul(text_cont_feat, img_feat.t()) * logit_scale
212
-
213
- # image-to-text label: positive text
214
- caption_loss = -torch.diag(nn.functional.log_softmax(logits_per_text, dim=0)[:B]).mean()
215
-
216
- # calculate text-to-image only on positive text
217
- image_loss = -torch.diag(nn.functional.log_softmax(logits_per_text[:B], dim=1)).mean()
218
-
219
- clip_loss = (caption_loss + image_loss) / 2.0
220
-
221
- out = {
222
- 'clip_loss': clip_loss,
223
- 'img_feat': img_feat,
224
- 'text_feat': text_cont_feat[:B].detach(),
225
- # 'neg_text_feat': neg_text_feat,
226
- }
227
-
228
- return out
229
-
230
-
231
- else:
232
- if text_feat is None:
233
- text_feat = self.text_extract(text, prompt=prompt, proj_norm=False)
234
-
235
- text_cont_feat = self.clip_model.text_projection(text_feat)
236
- text_cont_feat = text_cont_feat / \
237
- text_cont_feat.norm(dim=-1, keepdim=True)
238
-
239
- text_cont_feat = text_cont_feat.view(B, 512)
240
-
241
-
242
- # cosine similarity as logits
243
- logit_scale = self.clip_model.logit_scale.exp()
244
- logits_per_text = torch.matmul(text_cont_feat, img_feat.t()) * logit_scale
245
- # logits_per_image = logits_per_text.T
246
-
247
- clip_loss = clip_loss_fn(logits_per_text)
248
-
249
-
250
- # negative sampling
251
- pos_text_feat = text_feat.view(B, 512)
252
- neg_text_feat = self.text_extract(neg_text, prompt=prompt, proj_norm=False).view(B, 512)
253
-
254
- grammar_text_feat = torch.cat([pos_text_feat, neg_text_feat], dim=0)
255
-
256
- # 2B, 1
257
- grammar_text_logit = self.grammar_score_head(grammar_text_feat)
258
- grammar_labels = torch.LongTensor([1] * B + [0] * B).to(grammar_text_logit.device).view(2 * B)
259
-
260
- grammar_loss = torch.nn.functional.cross_entropy(grammar_text_logit, grammar_labels)
261
-
262
- grammar_pred = grammar_text_logit.argmax(dim=1, keepdim=False)
263
- grammar_pos_pred = grammar_pred[:B]
264
- grammar_neg_pred = grammar_pred[B:]
265
- # grammar_acc = (grammar_pred == grammar_labels).float().mean()
266
-
267
- out = {
268
- 'clip_loss': clip_loss,
269
- 'grammar_loss': grammar_loss,
270
- 'img_feat': img_feat,
271
- 'text_feat': text_cont_feat,
272
- 'neg_text_feat': neg_text_feat,
273
- 'grammar_pos_pred': grammar_pos_pred,
274
- 'grammar_neg_pred': grammar_neg_pred,
275
- }
276
-
277
- return out
278
-
279
- def train_step_old(self,
280
- images=None, text=None,
281
- img_feat=None, text_feat=None,
282
- neg_text=None, neg_text_feat=None,
283
- # ref_text=None, ref_text_feat=None, ref_text_mask=None,
284
- prompt="A photo depicts",
285
- # return_loss=True,
286
- **kwargs):
287
-
288
- if img_feat is None:
289
- img_feat = self.image_extract(images)
290
- img_feat = img_feat.view(-1, 512)
291
-
292
- B = img_feat.size(0)
293
-
294
-
295
-
296
- if text_feat is None:
297
- text_feat = self.text_extract(text, prompt=prompt, proj_norm=False)
298
-
299
- text_cont_feat = self.clip_model.text_projection(text_feat)
300
- text_cont_feat = text_cont_feat / text_cont_feat.norm(dim=-1, keepdim=True)
301
- text_cont_feat = text_cont_feat.view(B, 512)
302
-
303
- # cosine similarity as logits
304
- logit_scale = self.clip_model.logit_scale.exp()
305
- logits_per_text = torch.matmul(text_cont_feat, img_feat.t()) * logit_scale
306
- # logits_per_image = logits_per_text.T
307
-
308
- clip_loss = clip_loss_fn(logits_per_text)
309
-
310
-
311
- # negative sampling
312
- pos_text_feat = text_feat.view(B, 512)
313
- neg_text_feat = self.text_extract(neg_text, prompt=prompt, proj_norm=False).view(B, 512)
314
-
315
- grammar_text_feat = torch.cat([pos_text_feat, neg_text_feat], dim=0)
316
-
317
- # 2B, 1
318
- grammar_text_logit = self.grammar_score_head(grammar_text_feat)
319
- grammar_labels = torch.LongTensor([1] * B + [0] * B).to(grammar_text_logit.device).view(2 * B)
320
-
321
- grammar_loss = torch.nn.functional.cross_entropy(grammar_text_logit, grammar_labels)
322
-
323
- grammar_pred = grammar_text_logit.argmax(dim=1, keepdim=False)
324
- grammar_pos_pred = grammar_pred[:B]
325
- grammar_neg_pred = grammar_pred[B:]
326
- # grammar_acc = (grammar_pred == grammar_labels).float().mean()
327
-
328
- out = {
329
- 'clip_loss': clip_loss,
330
- 'grammar_loss': grammar_loss,
331
- 'img_feat': img_feat,
332
- 'text_feat': text_cont_feat,
333
- 'neg_text_feat': neg_text_feat,
334
- 'grammar_pos_pred': grammar_pos_pred,
335
- 'grammar_neg_pred': grammar_neg_pred,
336
- }
337
-
338
- return out
339
-
340
- # contrastive loss function, adapted from
341
- # https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html
342
- def contrastive_loss(logits: torch.Tensor, dim: int) -> torch.Tensor:
343
- neg_ce = torch.diag(nn.functional.log_softmax(logits, dim=dim))
344
- return -neg_ce.mean()
345
-
346
-
347
- def clip_loss_fn(similarity: torch.Tensor) -> torch.Tensor:
348
- caption_loss = contrastive_loss(similarity, dim=0)
349
- image_loss = contrastive_loss(similarity, dim=1)
350
- return (caption_loss + image_loss) / 2.0
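
Editor's note: clip_loss_fn above is the standard symmetric contrastive (InfoNCE) objective: cross-entropy over the text-to-image similarity matrix in both directions, averaged. A quick sanity check, assuming the two functions above are in scope:

# --- editor's sketch (not part of the diff) ---------------------------------
import torch
from torch import nn

B, d = 8, 512
text  = nn.functional.normalize(torch.randn(B, d), dim=-1)
image = nn.functional.normalize(torch.randn(B, d), dim=-1)

logits_per_text = text @ image.t() * 100.0    # diagonal entries are the matched pairs
print(clip_loss_fn(logits_per_text))          # random features -> large loss
print(clip_loss_fn(torch.eye(B) * 100.0))     # perfectly aligned batch -> near zero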
 
retrieval/configs/clip_negative_text.yaml DELETED
@@ -1,14 +0,0 @@
1
- checkpoint_dir: ./save/clip_negative_text/
2
-
3
- losses_log_every: 25
4
- precision: 32
5
- load_feat: true
6
- data_in_memory: false
7
-
8
- batch_size: 1600
9
- valid_batch_size: 200
10
- clip_grad_norm: 0
11
-
12
- epochs: 30
13
- use_grammar: true
14
- joint_out: false
 
retrieval/param.py DELETED
@@ -1,209 +0,0 @@
1
- import argparse
2
- import random
3
-
4
- import numpy as np
5
- import torch
6
-
7
- import pprint
8
- import yaml
9
-
10
-
11
- def str2bool(v):
12
- if v.lower() in ('yes', 'true', 't', 'y', '1'):
13
- return True
14
- elif v.lower() in ('no', 'false', 'f', 'n', '0'):
15
- return False
16
- else:
17
- raise argparse.ArgumentTypeError('Boolean value expected.')
18
-
19
-
20
- def is_interactive():
21
- import __main__ as main
22
- return not hasattr(main, '__file__')
23
-
24
-
25
- def get_optimizer(optim, verbose=False):
26
- # Bind the optimizer
27
- if optim == 'rms':
28
- if verbose:
29
- print("Optimizer: Using RMSProp")
30
- optimizer = torch.optim.RMSprop
31
- elif optim == 'adam':
32
- if verbose:
33
- print("Optimizer: Using Adam")
34
- optimizer = torch.optim.Adam
35
- elif optim == 'adamw':
36
- if verbose:
37
- print("Optimizer: Using AdamW")
38
- # optimizer = torch.optim.AdamW
39
- optimizer = 'adamw'
40
- elif optim == 'adamax':
41
- if verbose:
42
- print("Optimizer: Using Adamax")
43
- optimizer = torch.optim.Adamax
44
- elif optim == 'sgd':
45
- if verbose:
46
- print("Optimizer: SGD")
47
- optimizer = torch.optim.SGD
48
- else:
49
- assert False, "Please add your optimizer %s in the list." % optim
50
-
51
- return optimizer
52
-
53
-
54
- def parse_args(parse=True, **optional_kwargs):
55
- parser = argparse.ArgumentParser()
56
-
57
- parser.add_argument('--seed', type=int, default=9595, help='random seed')
58
-
59
- # Data Splits
60
- parser.add_argument("--train", default='karpathy_train')
61
- parser.add_argument("--valid", default='karpathy_val')
62
- parser.add_argument("--test", default='karpathy_test')
63
- # parser.add_argument('--test_only', action='store_true')
64
-
65
- # Quick experiments
66
- parser.add_argument('--train_topk', type=int, default=-1)
67
- parser.add_argument('--valid_topk', type=int, default=-1)
68
-
69
- # Checkpoint
70
- parser.add_argument('--output', type=str, default='snap/test')
71
- parser.add_argument('--load', type=str, default=None, help='Load the model (usually the fine-tuned model).')
72
- parser.add_argument('--from_scratch', action='store_true')
73
-
74
- # CPU/GPU
75
- parser.add_argument("--multiGPU", action='store_const', default=False, const=True)
76
- parser.add_argument('--fp16', action='store_true')
77
- parser.add_argument("--distributed", action='store_true')
78
- parser.add_argument("--num_workers", default=0, type=int)
79
- parser.add_argument('--local_rank', type=int, default=-1)
80
- # parser.add_argument('--rank', type=int, default=-1)
81
-
82
- # Model Config
83
- # parser.add_argument('--encoder_backbone', type=str, default='openai/clip-vit-base-patch32')
84
- # parser.add_argument('--decoder_backbone', type=str, default='bert-base-uncased')
85
- parser.add_argument('--tokenizer', type=str, default='openai/clip-vit-base-patch32')
86
-
87
- # parser.add_argument('--position_embedding_type', type=str, default='absolute')
88
-
89
- # parser.add_argument('--encoder_transform', action='store_true')
90
-
91
- parser.add_argument('--max_text_length', type=int, default=40)
92
-
93
- # parser.add_argument('--image_size', type=int, default=224)
94
- # parser.add_argument('--patch_size', type=int, default=32)
95
-
96
- # parser.add_argument('--decoder_num_layers', type=int, default=12)
97
-
98
- # Training
99
- parser.add_argument('--batch_size', type=int, default=256)
100
- parser.add_argument('--valid_batch_size', type=int, default=None)
101
-
102
- parser.add_argument('--optim', default='adamw')
103
-
104
- parser.add_argument('--warmup_ratio', type=float, default=0.05)
105
- parser.add_argument('--weight_decay', type=float, default=0.01)
106
- parser.add_argument('--clip_grad_norm', type=float, default=-1.0)
107
- parser.add_argument('--gradient_accumulation_steps', type=int, default=1)
108
- parser.add_argument('--lr', type=float, default=1e-4)
109
- parser.add_argument('--adam_eps', type=float, default=1e-6)
110
- parser.add_argument('--adam_beta1', type=float, default=0.9)
111
- parser.add_argument('--adam_beta2', type=float, default=0.999)
112
-
113
- parser.add_argument('--epochs', type=int, default=20)
114
- # parser.add_argument('--dropout', type=float, default=0.1)
115
-
116
-
117
- # Inference
118
- # parser.add_argument('--num_beams', type=int, default=1)
119
- # parser.add_argument('--gen_max_length', type=int, default=20)
120
-
121
- parser.add_argument('--start_from', type=str, default=None)
122
-
123
- # Data
124
- # parser.add_argument('--do_lower_case', type=str2bool, default=None)
125
-
126
- # parser.add_argument('--prefix', type=str, default=None)
127
-
128
-
129
- # COCO Caption
130
- # parser.add_argument('--no_prefix', action='store_true')
131
-
132
- parser.add_argument('--no_cls', action='store_true')
133
-
134
- parser.add_argument('--cfg', type=str, default=None)
135
- parser.add_argument('--id', type=str, default=None)
136
-
137
- # Etc.
138
- parser.add_argument('--comment', type=str, default='')
139
- parser.add_argument("--dry", action='store_true')
140
-
141
- # Parse the arguments.
142
- if parse:
143
- args = parser.parse_args()
144
- # For interactive environments (e.g., Jupyter)
145
- else:
146
- args = parser.parse_known_args()[0]
147
-
148
- loaded_kwargs = {}
149
- if args.cfg is not None:
150
- cfg_path = f'configs/{args.cfg}.yaml'
151
- with open(cfg_path, 'r') as f:
152
- loaded_kwargs = yaml.safe_load(f)
153
-
154
- # Namespace => Dictionary
155
- parsed_kwargs = vars(args)
156
- parsed_kwargs.update(optional_kwargs)
157
-
158
- kwargs = {}
159
- kwargs.update(parsed_kwargs)
160
- kwargs.update(loaded_kwargs)
161
-
162
- args = Config(**kwargs)
163
-
164
- # Bind optimizer class.
165
- verbose = False
166
- args.optimizer = get_optimizer(args.optim, verbose=verbose)
167
-
168
- # Set seeds
169
- torch.manual_seed(args.seed)
170
- random.seed(args.seed)
171
- np.random.seed(args.seed)
172
-
173
- return args
174
-
175
-
176
- class Config(object):
177
- def __init__(self, **kwargs):
178
- """Configuration Class: set kwargs as class attributes with setattr"""
179
- for k, v in kwargs.items():
180
- setattr(self, k, v)
181
-
182
- @property
183
- def config_str(self):
184
- return pprint.pformat(self.__dict__)
185
-
186
- def __repr__(self):
187
- """Pretty-print configurations in alphabetical order"""
188
- config_str = 'Configurations\n'
189
- config_str += self.config_str
190
- return config_str
191
-
192
- # def update(self, **kwargs):
193
- # for k, v in kwargs.items():
194
- # setattr(self, k, v)
195
-
196
- # def save(self, path):
197
- # with open(path, 'w') as f:
198
- # yaml.dump(self.__dict__, f, default_flow_style=False)
199
-
200
- # @classmethod
201
- # def load(cls, path):
202
- # with open(path, 'r') as f:
203
- # kwargs = yaml.load(f)
204
-
205
- # return Config(**kwargs)
206
-
207
-
208
- if __name__ == '__main__':
209
- args = parse_args(True)
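
Editor's note: in parse_args, the YAML file named by --cfg is merged after the CLI namespace, so keys in configs/<cfg>.yaml (such as clip_negative_text.yaml above) override the argparse defaults. A small sketch of that precedence, with an inline YAML string standing in for the file and the Config class above assumed in scope:

# --- editor's sketch (not part of the diff) ---------------------------------
import yaml

parsed_kwargs = {'batch_size': 256, 'use_grammar': False, 'epochs': 20}   # argparse defaults
loaded_kwargs = yaml.safe_load("""
batch_size: 1600
use_grammar: true
joint_out: false
""")

kwargs = {}
kwargs.update(parsed_kwargs)      # command-line values first ...
kwargs.update(loaded_kwargs)      # ... then the YAML config wins, as in parse_args
args = Config(**kwargs)
print(args.batch_size, args.use_grammar)      # 1600 True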
 
retrieval/pth_loader.py DELETED
@@ -1,334 +0,0 @@
1
- from __future__ import absolute_import
2
- from __future__ import division
3
- from __future__ import print_function
4
-
5
- import json
6
- import h5py
7
- from lmdbdict import lmdbdict
8
- from lmdbdict.methods import DUMPS_FUNC, LOADS_FUNC
9
- import os
10
- import numpy as np
11
- import numpy.random as npr
12
- import random
13
-
14
- import torch
15
- import torch.utils.data as data
16
-
17
- import multiprocessing
18
- import six
19
-
20
- verbose = True
21
- # import torch
22
- # if torch.cuda.current_device() in [0, -1]:
23
- if 'LOCAL_RANK' in os.environ and os.environ['LOCAL_RANK'] != '0':
24
- verbose = False
25
-
26
- class HybridLoader:
27
- """
28
- If db_path is a directory, then use normal file loading
29
- If lmdb, then load from lmdb
30
- The loading method depends on the extension.
31
-
32
- in_memory: if in_memory is True, we save all the features in memory
33
- For individual np(y|z)s, we don't need to do that because the system will do this for us.
34
- Should be useful for lmdb or h5.
35
- (Copied this idea from vilbert)
36
- """
37
- def __init__(self, db_path, ext, in_memory=False):
38
- self.db_path = db_path
39
- self.ext = ext
40
- if self.ext == '.npy':
41
- self.loader = lambda x: np.load(six.BytesIO(x))
42
- else:
43
- self.loader = lambda x: np.load(six.BytesIO(x))['feat']
44
- if db_path.endswith('.lmdb'):
45
- self.db_type = 'lmdb'
46
- self.lmdb = lmdbdict(db_path, unsafe=True)
47
- self.lmdb._key_dumps = DUMPS_FUNC['ascii']
48
- self.lmdb._value_loads = LOADS_FUNC['identity']
49
- elif db_path.endswith('.pth'): # Assume a key,value dictionary
50
- self.db_type = 'pth'
51
- self.feat_file = torch.load(db_path)
52
- self.loader = lambda x: x
53
- print('HybridLoader: ext is ignored')
54
- elif db_path.endswith('h5'):
55
- self.db_type = 'h5'
56
- self.loader = lambda x: np.array(x).astype('float32')
57
- else:
58
- self.db_type = 'dir'
59
-
60
- self.in_memory = in_memory
61
- if self.in_memory:
62
- self.features = {}
63
-
64
- def get(self, key):
65
-
66
- if self.in_memory and key in self.features:
67
- # We save f_input because we want to save the
68
- # compressed bytes to save memory
69
- f_input = self.features[key]
70
- elif self.db_type == 'lmdb':
71
- f_input = self.lmdb[key]
72
- elif self.db_type == 'pth':
73
- f_input = self.feat_file[key]
74
- elif self.db_type == 'h5':
75
- f_input = h5py.File(self.db_path, 'r')[key]
76
- else:
77
- f_input = open(os.path.join(self.db_path, key + self.ext), 'rb').read()
78
-
79
- if self.in_memory and key not in self.features:
80
- self.features[key] = f_input
81
-
82
- # load image
83
- feat = self.loader(f_input)
84
-
85
- return feat
86
-
87
- class CaptionDataset(data.Dataset):
88
-
89
- def get_vocab_size(self):
90
- return self.vocab_size
91
-
92
- def get_vocab(self):
93
- return self.ix_to_word
94
-
95
- def get_seq_length(self):
96
- return self.seq_length
97
-
98
- def __init__(self, opt):
99
- self.opt = opt
100
- self.seq_per_img = opt.seq_per_img
101
-
102
- # feature related options
103
- self.use_fc = getattr(opt, 'use_fc', True)
104
- self.use_att = getattr(opt, 'use_att', True)
105
- self.use_box = getattr(opt, 'use_box', 0)
106
- self.norm_att_feat = getattr(opt, 'norm_att_feat', 0)
107
- self.norm_box_feat = getattr(opt, 'norm_box_feat', 0)
108
-
109
- # load the json file which contains additional information about the dataset
110
- if verbose:
111
- print('DataLoader loading json file: ', opt.input_json)
112
- self.info = json.load(open(self.opt.input_json))
113
- if 'ix_to_word' in self.info:
114
- self.ix_to_word = self.info['ix_to_word']
115
- self.vocab_size = len(self.ix_to_word)
116
- if verbose:
117
- print('vocab size is ', self.vocab_size)
118
-
119
- # open the hdf5 file
120
- if verbose:
121
- print('DataLoader loading h5 file: ', opt.input_fc_dir, opt.input_att_dir, opt.input_box_dir, opt.input_label_h5)
122
- """
123
- Setting input_label_h5 to none is used when only doing generation.
124
- For example, when you need to test on coco test set.
125
- """
126
- if self.opt.input_label_h5 != 'none':
127
- self.h5_label_file = h5py.File(self.opt.input_label_h5, 'r', driver='core')
128
- # load in the sequence data
129
- seq_size = self.h5_label_file['labels'].shape
130
- self.label = self.h5_label_file['labels'][:]
131
- self.seq_length = seq_size[1]
132
- if verbose:
133
- print('max sequence length in data is', self.seq_length)
134
- # load the pointers in full to RAM (should be small enough)
135
- self.label_start_ix = self.h5_label_file['label_start_ix'][:]
136
- self.label_end_ix = self.h5_label_file['label_end_ix'][:]
137
- else:
138
- self.seq_length = 1
139
-
140
- self.data_in_memory = getattr(opt, 'data_in_memory', False)
141
- self.fc_loader = HybridLoader(self.opt.input_fc_dir, '.npy', in_memory=self.data_in_memory)
142
- self.att_loader = HybridLoader(self.opt.input_att_dir, '.npz', in_memory=self.data_in_memory)
143
- self.box_loader = HybridLoader(self.opt.input_box_dir, '.npy', in_memory=self.data_in_memory)
144
-
145
- self.use_clipscore = getattr(opt, 'use_clipscore', False)
146
- if self.use_clipscore:
147
- self.clipscore_loader = HybridLoader(self.opt.input_clipscore_vis_dir, '.npy', in_memory=self.data_in_memory)
148
-
149
-
150
- self.num_images = len(self.info['images']) # self.label_start_ix.shape[0]
151
- if verbose:
152
- print('read %d image features' %(self.num_images))
153
-
154
- # separate out indexes for each of the provided splits
155
- self.split_ix = {'train': [], 'val': [], 'test': []}
156
- for ix in range(len(self.info['images'])):
157
- img = self.info['images'][ix]
158
- if not 'split' in img:
159
- self.split_ix['train'].append(ix)
160
- self.split_ix['val'].append(ix)
161
- self.split_ix['test'].append(ix)
162
- elif img['split'] == 'train':
163
- self.split_ix['train'].append(ix)
164
- elif img['split'] == 'val':
165
- self.split_ix['val'].append(ix)
166
- elif img['split'] == 'test':
167
- self.split_ix['test'].append(ix)
168
- elif opt.train_only == 0: # restval
169
- self.split_ix['train'].append(ix)
170
-
171
- if verbose:
172
- print('assigned %d images to split train' %len(self.split_ix['train']))
173
- print('assigned %d images to split val' %len(self.split_ix['val']))
174
- print('assigned %d images to split test' %len(self.split_ix['test']))
175
-
176
- def get_captions(self, ix, seq_per_img):
177
- # fetch the sequence labels
178
- ix1 = self.label_start_ix[ix] - 1 #label_start_ix starts from 1
179
- ix2 = self.label_end_ix[ix] - 1
180
- ncap = ix2 - ix1 + 1 # number of captions available for this image
181
- assert ncap > 0, 'an image does not have any label. this can be handled but right now isn\'t'
182
-
183
- if ncap < seq_per_img:
184
- # we need to subsample (with replacement)
185
- seq = np.zeros([seq_per_img, self.seq_length], dtype = 'int')
186
- for q in range(seq_per_img):
187
- ixl = random.randint(ix1,ix2)
188
- seq[q, :] = self.label[ixl, :self.seq_length]
189
- else:
190
- ixl = random.randint(ix1, ix2 - seq_per_img + 1)
191
- seq = self.label[ixl: ixl + seq_per_img, :self.seq_length]
192
-
193
- return seq
194
-
195
- def collate_func(self, batch):
196
- seq_per_img = self.seq_per_img
197
-
198
- fc_batch = []
199
- att_batch = []
200
- label_batch = []
201
-
202
- clip_vis_feat_batch = []
203
-
204
- wrapped = False
205
-
206
- infos = []
207
- gts = []
208
-
209
- for sample in batch:
210
- # fetch image
211
- if self.use_clipscore:
212
- tmp_fc, tmp_att, tmp_seq, \
213
- ix, tmp_clip_vis_feat = sample
214
-
215
- clip_vis_feat_batch.append(tmp_clip_vis_feat)
216
- else:
217
- tmp_fc, tmp_att, tmp_seq, \
218
- ix = sample
219
-
220
- fc_batch.append(tmp_fc)
221
- att_batch.append(tmp_att)
222
-
223
- tmp_label = np.zeros([seq_per_img, self.seq_length + 2], dtype = 'int')
224
- if hasattr(self, 'h5_label_file'):
225
- # if there is ground truth
226
- tmp_label[:, 1 : self.seq_length + 1] = tmp_seq
227
- label_batch.append(tmp_label)
228
-
229
- # Used for reward evaluation
230
- if hasattr(self, 'h5_label_file'):
231
- # if there is ground truth
232
- gts.append(self.label[self.label_start_ix[ix] - 1: self.label_end_ix[ix]])
233
- else:
234
- gts.append([])
235
-
236
- # record associated info as well
237
- info_dict = {}
238
- info_dict['ix'] = ix
239
- info_dict['id'] = self.info['images'][ix]['id']
240
- info_dict['file_path'] = self.info['images'][ix].get('file_path', '')
241
- infos.append(info_dict)
242
-
243
- # #sort by att_feat length
244
- # fc_batch, att_batch, label_batch, gts, infos = \
245
- # zip(*sorted(zip(fc_batch, att_batch, np.vsplit(label_batch, batch_size), gts, infos), key=lambda x: len(x[1]), reverse=True))
246
- if self.use_clipscore:
247
- fc_batch, att_batch, label_batch, clip_vis_feat_batch, gts, infos = \
248
- zip(*sorted(zip(fc_batch, att_batch, label_batch, clip_vis_feat_batch, gts, infos), key=lambda x: 0, reverse=True))
249
- else:
250
- fc_batch, att_batch, label_batch, gts, infos = \
251
- zip(*sorted(zip(fc_batch, att_batch, label_batch, gts, infos), key=lambda x: 0, reverse=True))
252
- data = {}
253
- data['fc_feats'] = np.stack(fc_batch)
254
- # merge att_feats
255
- max_att_len = max([_.shape[0] for _ in att_batch])
256
- data['att_feats'] = np.zeros([len(att_batch), max_att_len, att_batch[0].shape[1]], dtype = 'float32')
257
- for i in range(len(att_batch)):
258
- data['att_feats'][i, :att_batch[i].shape[0]] = att_batch[i]
259
- data['att_masks'] = np.zeros(data['att_feats'].shape[:2], dtype='float32')
260
- for i in range(len(att_batch)):
261
- data['att_masks'][i, :att_batch[i].shape[0]] = 1
262
- # set att_masks to None if attention features have same length
263
- if data['att_masks'].sum() == data['att_masks'].size:
264
- data['att_masks'] = None
265
-
266
- if self.use_clipscore:
267
- data['clip_vis_feats'] = np.stack(clip_vis_feat_batch)
268
-
269
- data['labels'] = np.vstack(label_batch)
270
- # generate mask
271
- nonzeros = np.array(list(map(lambda x: (x != 0).sum()+2, data['labels'])))
272
- mask_batch = np.zeros([data['labels'].shape[0], self.seq_length + 2], dtype = 'float32')
273
- for ix, row in enumerate(mask_batch):
274
- row[:nonzeros[ix]] = 1
275
- data['masks'] = mask_batch
276
- data['labels'] = data['labels'].reshape(len(batch), seq_per_img, -1)
277
- data['masks'] = data['masks'].reshape(len(batch), seq_per_img, -1)
278
-
279
- data['gts'] = gts # all ground truth captions of each images
280
- data['infos'] = infos
281
-
282
- data = {k:torch.from_numpy(v) if type(v) is np.ndarray else v for k,v in data.items()} # Turn all ndarray to torch tensor
283
-
284
- return data
285
-
286
- def __getitem__(self, ix):
287
- """This function returns a tuple that is further passed to collate_fn
288
- """
289
- if self.use_att:
290
- att_feat = self.att_loader.get(str(self.info['images'][ix]['id']))
291
- # Reshape to K x C
292
- att_feat = att_feat.reshape(-1, att_feat.shape[-1])
293
- if self.norm_att_feat:
294
- att_feat = att_feat / np.linalg.norm(att_feat, 2, 1, keepdims=True)
295
- if self.use_box:
296
- box_feat = self.box_loader.get(str(self.info['images'][ix]['id']))
297
- # divided by image width and height
298
- x1,y1,x2,y2 = np.hsplit(box_feat, 4)
299
- h,w = self.info['images'][ix]['height'], self.info['images'][ix]['width']
300
- box_feat = np.hstack((x1/w, y1/h, x2/w, y2/h, (x2-x1)*(y2-y1)/(w*h))) # question? x2-x1+1??
301
- if self.norm_box_feat:
302
- box_feat = box_feat / np.linalg.norm(box_feat, 2, 1, keepdims=True)
303
- att_feat = np.hstack([att_feat, box_feat])
304
- # sort the features by the size of boxes
305
- att_feat = np.stack(sorted(att_feat, key=lambda x:x[-1], reverse=True))
306
- else:
307
- att_feat = np.zeros((0,0), dtype='float32')
308
- if self.use_fc:
309
- try:
310
- fc_feat = self.fc_loader.get(str(self.info['images'][ix]['id']))
311
- except:
312
- # Use average of attention when there is no fc provided (For bottomup feature)
313
- fc_feat = att_feat.mean(0)
314
- else:
315
- fc_feat = np.zeros((0), dtype='float32')
316
- if hasattr(self, 'h5_label_file'):
317
- seq = self.get_captions(ix, self.seq_per_img)
318
- else:
319
- seq = None
320
-
321
- if self.use_clipscore:
322
- clip_vis_feat = self.clipscore_loader.get(
323
- str(self.info['images'][ix]['id']))
324
-
325
- return (fc_feat,
326
- att_feat, seq,
327
- ix, clip_vis_feat)
328
-
329
- return (fc_feat,
330
- att_feat, seq,
331
- ix)
332
-
333
- def __len__(self):
334
- return len(self.info['images'])
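
Editor's note: HybridLoader dispatches on the storage backend (directory of .npy/.npz files, .lmdb, .pth dictionary, or HDF5) and can optionally cache the raw bytes in memory. A self-contained sketch of the plain-directory branch it falls back to, using a temporary folder and a made-up key:

# --- editor's sketch (not part of the diff) ---------------------------------
import io, os, tempfile
import numpy as np

feat_dir = tempfile.mkdtemp()
np.save(os.path.join(feat_dir, '12345.npy'), np.random.rand(512).astype('float32'))

raw = open(os.path.join(feat_dir, '12345' + '.npy'), 'rb').read()   # bytes, as cached when in_memory=True
feat = np.load(io.BytesIO(raw))                                     # the '.npy' loader used above
print(feat.shape)                                                   # (512,)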
 
retrieval/text_utils.py DELETED
@@ -1,74 +0,0 @@
1
- import random
2
-
3
- def repeat(text, n_max_gram=3, n_max_repeat=3):
4
- """repeat n-grams"""
5
- tokens = text.split()
6
-
7
- n_gram = random.randint(1, n_max_gram)
8
-
9
- repeat_token_idx = random.randint(0, len(tokens) - n_gram)
10
-
11
- repeated_tokens = tokens[repeat_token_idx:repeat_token_idx+n_gram]
12
-
13
- n_repeat = random.randint(1, n_max_repeat)
14
- for _ in range(n_repeat):
15
- insert_idx = random.randint(0, len(tokens))
16
- tokens = tokens[:insert_idx] + \
17
- repeated_tokens + tokens[insert_idx:]
18
-
19
- new_text = " ".join(tokens)
20
- return new_text
21
-
22
- def remove(text, n_max_gram=3):
23
- """remove n-grams"""
24
- tokens = text.split()
25
-
26
- n_gram = random.randint(1, n_max_gram)
27
-
28
- remove_token_idx = random.randint(0, len(tokens) - n_gram)
29
-
30
- tokens = tokens[:remove_token_idx] + tokens[remove_token_idx + n_gram:]
31
-
32
- new_text = " ".join(tokens)
33
- return new_text
34
-
35
- def insert(text, vocab, n_max_tokens=3):
36
- """Insert tokens"""
37
- tokens = text.split()
38
-
39
- n_insert_token = random.randint(1, n_max_tokens)
40
-
41
- for _ in range(n_insert_token):
42
- insert_token_idx = random.randint(0, len(tokens) - 1)
43
- insert_token = random.choice(vocab)
44
- tokens = tokens[:insert_token_idx] + [insert_token] + tokens[insert_token_idx:]
45
-
46
- new_text = " ".join(tokens)
47
- return new_text
48
-
49
- def swap(text, vocab, n_max_tokens=3):
50
- """Swap tokens"""
51
- tokens = text.split()
52
-
53
- n_swap_tokens = random.randint(1, n_max_tokens)
54
-
55
- for _ in range(n_swap_tokens):
56
- swap_token_idx = random.randint(0, len(tokens) - 1)
57
-
58
- swap_token = random.choice(vocab)
59
- while swap_token == tokens[swap_token_idx]:
60
- swap_token = random.choice(vocab)
61
-
62
- tokens[swap_token_idx] = swap_token
63
-
64
- new_text = " ".join(tokens)
65
- return new_text
66
-
67
- def shuffle(text):
68
- """shuffle tokens"""
69
- tokens = text.split()
70
-
71
- random.shuffle(tokens)
72
-
73
- new_text = " ".join(tokens)
74
- return new_text
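
Editor's note: the five perturbations above manufacture ungrammatical or mismatched negatives (repeat an n-gram, drop an n-gram, insert random vocabulary tokens, swap tokens, shuffle the word order) for the grammar head and the negative-text contrastive setup. A quick demonstration, assuming these functions are in scope and using a tiny made-up vocabulary:

# --- editor's sketch (not part of the diff) ---------------------------------
import random
random.seed(0)

caption = "a man riding a horse on the beach"
vocab = ["dog", "red", "running", "table"]    # illustrative only

print(repeat(caption))                        # an n-gram repeated up to 3 extra times
print(remove(caption))                        # an n-gram dropped
print(insert(caption, vocab))                 # random vocab tokens spliced in
print(swap(caption, vocab))                   # tokens replaced by vocab tokens
print(shuffle(caption))                       # word order destroyed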
 
retrieval/train_pl.py DELETED
@@ -1,661 +0,0 @@
1
- from ast import parse
2
- import torch
3
- import torch.nn as nn
4
- import torch.nn.functional as F
5
- import torch.optim as optim
6
-
7
- import numpy as np
8
-
9
- import time
10
- import os
11
- from collections import defaultdict
12
-
13
- # import captioning.utils.opts as opts
14
- # import captioning.models as models
15
- # from captioning.data.pth_loader import CaptionDataset
16
- # import captioning.utils.eval_utils as eval_utils
17
- # import captioning.utils.misc as utils
18
- # from captioning.utils.rewards import init_scorer, get_self_critical_reward
19
- # from captioning.modules.loss_wrapper import LossWrapper
20
-
21
- from clip_model import CLIPScore
22
- from caption_data import COCORetrievalDataset
23
-
24
- import pytorch_lightning as pl
25
-
26
- import detectron2.utils.comm as d2comm
27
- from detectron2.utils.env import seed_all_rng
28
- seed_all_rng(1234)
29
-
30
-
31
- class LitModel(pl.LightningModule):
32
- def __init__(self, opt):
33
- super().__init__()
34
- self.opt = opt
35
- self.args = args
36
- # Initialize dataset
37
- # self.dataset = CaptionDataset(opt)
38
-
39
- # self.dataset =
40
-
41
- # opt.vocab_size = self.dataset.vocab_size
42
- # opt.seq_length = self.dataset.seq_length
43
- # self.batch_size = opt.batch_size
44
-
45
- # Build model
46
- # opt.vocab = self.dataset.get_vocab()
47
- # model = models.setup(opt)
48
- # print(model)
49
- # del opt.vocab
50
-
51
- # wrapper with loss in it.
52
- # lw_model = LossWrapper(model, opt)
53
-
54
- self.model = CLIPScore(use_grammar=opt.use_grammar, joint_out=opt.joint_out)
55
- # self.lw_model = lw_model
56
-
57
- for p in self.model.clip_model.vision_model.parameters():
58
- p.requires_grad = False
59
- for p in self.model.clip_model.visual_projection.parameters():
60
- p.requires_grad = False
61
-
62
- # self.struc_flag = None
63
- # self.sc_flag = None
64
-
65
-
66
- def forward(self, *args, **kwargs):
67
- """
68
- I hate this design. Never treat it as a plain nn.Module.
69
- """
70
- raise NotImplementedError
71
-
72
- def train_dataloader(self):
73
- # train_dataset = torch.utils.data.Subset(
74
- # self.dataset,
75
- # self.dataset.split_ix['train']
76
- # )
77
-
78
- # train_loader = torch.utils.data.DataLoader(
79
- # dataset=train_dataset,
80
- # batch_size=self.batch_size,
81
- # shuffle=True,
82
- # num_workers=4,
83
- # collate_fn=self.dataset.collate_func
84
- # )
85
-
86
- train_dataset = COCORetrievalDataset(
87
- split='karpathy_train', mode='train',
88
- args=opt,
89
- verbose=verbose
90
- )
91
-
92
- train_loader = torch.utils.data.DataLoader(
93
- dataset=train_dataset,
94
- batch_size=opt.batch_size,
95
- shuffle=True,
96
- num_workers=4,
97
- collate_fn=train_dataset.collate_fn
98
- )
99
-
100
- return train_loader
101
-
102
- def val_dataloader(self, split='karpathy_val'):
103
- # val_dataset = torch.utils.data.Subset(
104
- # self.dataset,
105
- # self.dataset.split_ix[split]
106
- # )
107
- # val_loader = torch.utils.data.DataLoader(
108
- # val_dataset,
109
- # batch_size=self.batch_size,
110
- # shuffle=False,
111
- # num_workers=4,
112
- # drop_last=False,
113
- # collate_fn=self.dataset.collate_func
114
- # )
115
-
116
- val_dataset = COCORetrievalDataset(
117
- split=split, mode='val',
118
- args=opt,
119
- verbose=verbose
120
- )
121
-
122
- val_loader = torch.utils.data.DataLoader(
123
- dataset=val_dataset,
124
- batch_size=opt.valid_batch_size,
125
- shuffle=False,
126
- num_workers=4,
127
- drop_last=False,
128
- collate_fn=val_dataset.collate_fn
129
- )
130
-
131
- return val_loader
132
-
133
- def test_dataloader(self):
134
-
135
- return self.val_dataloader('karpathy_test')
136
-
137
- def training_step(self, data, batch_idx):
138
-
139
-
140
- batch = data
141
- self.model.train()
142
-
143
- model_out = self.model.train_step(
144
- img_feat=batch['img_feats'],
145
- text=batch['text'],
146
- neg_text=batch['neg_text'],
147
- )
148
-
149
- clip_loss = model_out['clip_loss']
150
-
151
- if self.opt.joint_out:
152
- loss = clip_loss
153
- else:
154
- grammar_loss = model_out['grammar_loss']
155
- loss = clip_loss + grammar_loss
156
-
157
-
158
- data_time = self.trainer.profiler.recorded_durations["get_train_batch"][-1]
159
- data_time = torch.tensor(data_time)
160
-
161
- # print('batch_idx', batch_idx)
162
- # print('loss:', loss)
163
-
164
- # logger_logs = model_out.copy()
165
- logger_logs = {}
166
-
167
- logger_logs['loss'] = loss.detach()
168
-
169
- logger_logs['clip_loss'] = clip_loss.detach()
170
-
171
- if not self.opt.joint_out:
172
- logger_logs['grammar_loss'] = grammar_loss.detach()
173
-
174
- logger_logs['data_time'] = data_time.detach()
175
-
176
- # UserWarning: The {progress_bar:dict keyword} was deprecated in 0.9.1 and will be removed in 1.0.0
177
- # Please use self.log(...) inside the lightningModule instead.
178
-
179
- # # log on a step or aggregate epoch metric to the logger and/or progress bar
180
- # # (inside LightningModule)
181
- # self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
182
- # warnings.warn(*args, **kwargs)
183
- # UserWarning: The {log:dict keyword} was deprecated in 0.9.1 and will be removed in 1.0.0
184
- # Please use self.log(...) inside the lightningModule instead.
185
-
186
- # output = {
187
- # 'loss': loss,
188
- # 'log': logger_logs,
189
- # 'progress_bar': {'data_time': data_time}
190
- # }
191
-
192
- for k, v in logger_logs.items():
193
- if k in ['data_time', 'clip_loss', 'grammar_loss']:
194
- self.log('train/'+k, v, prog_bar=True)
195
- else:
196
- self.log('train/'+k, v)
197
-
198
- # print('training step logged')
199
-
200
- return loss
201
-
202
- def validation_step(self, data, batch_idx):
203
-
204
- batch = data
205
- self.model.eval()
206
-
207
- with torch.no_grad():
208
- model_out = self.model.train_step(
209
- img_feat=batch['img_feats'],
210
- text=batch['text'],
211
- neg_text=batch['neg_text'],
212
- )
213
-
214
- if self.opt.joint_out:
215
- clip_loss = model_out['clip_loss']
216
- loss = clip_loss
217
-
218
- output = {
219
- # 'val_loss': loss,
220
- 'loss': loss.detach(),
221
- 'clip_loss': clip_loss.detach(),
222
- # 'grammar_loss': grammar_loss.detach(),
223
-
224
- 'img_feat': model_out['img_feat'].detach(),
225
- 'text_feat': model_out['text_feat'].detach(),
226
- # 'neg_text_feat': model_out['neg_text_feat'].detach(),
227
- # 'grammar_pos_pred': model_out['grammar_pos_pred'].detach(),
228
- # 'grammar_neg_pred': model_out['grammar_neg_pred'].detach(),
229
- # 'predictions': predictions,
230
- # 'n_predictions': n_predictions,
231
- }
232
- else:
233
- clip_loss = model_out['clip_loss']
234
- grammar_loss = model_out['grammar_loss']
235
- loss = clip_loss + grammar_loss
236
-
237
- output = {
238
- # 'val_loss': loss,
239
- 'loss': loss.detach(),
240
- 'clip_loss': clip_loss.detach(),
241
- 'grammar_loss': grammar_loss.detach(),
242
-
243
- 'img_feat': model_out['img_feat'].detach(),
244
- 'text_feat': model_out['text_feat'].detach(),
245
- # 'neg_text_feat': model_out['neg_text_feat'].detach(),
246
- 'grammar_pos_pred': model_out['grammar_pos_pred'].detach(),
247
- 'grammar_neg_pred': model_out['grammar_neg_pred'].detach(),
248
- # 'predictions': predictions,
249
- # 'n_predictions': n_predictions,
250
- }
251
- return output
252
-
253
- def test_step(self, *args, **kwargs):
254
- return self.validation_step(*args, **kwargs)
255
-
256
- def validation_epoch_end(self, outputs, split='val'):
257
- outputs = d2comm.gather(outputs)
258
- # master node
259
- if d2comm.is_main_process():
260
- assert self.trainer.node_rank == 0 and self.trainer.local_rank == 0
261
- outputs = sum(outputs, [])
262
-
263
- out = {}
264
-
265
- val_loss_mean = sum([_['loss'].cpu() for _ in outputs]) / len(outputs)
266
- val_clip_loss_mean = sum([_['clip_loss'].cpu() for _ in outputs]) / len(outputs)
267
- if not self.opt.joint_out:
268
- val_grammar_loss_mean = sum([_['grammar_loss'].cpu() for _ in outputs]) / len(outputs)
269
-
270
- print('loss', val_loss_mean.item())
271
- print('clip_loss', val_clip_loss_mean.item())
272
- if not self.opt.joint_out:
273
- print('grammar_loss', val_grammar_loss_mean.item())
274
-
275
- logit_scale = self.model.clip_model.logit_scale.exp().cpu()
276
-
277
- text_feats = torch.cat([_['text_feat'].cpu() for _ in outputs], dim=0)
278
- img_feats = torch.cat([_['img_feat'].cpu() for _ in outputs], dim=0)
279
-
280
- assert text_feats.size() == (5000, 512), text_feats.size()
281
- assert img_feats.size() == (5000, 512), img_feats.size()
282
-
283
- logits_per_text = torch.matmul(text_feats, img_feats.t()) * logit_scale
284
- logits_per_image = logits_per_text.T
285
-
286
- # text-to-image retrieval
287
- print('Text-to-Image retrieval')
288
- for k in [1, 5, 10]:
289
- text_to_image_topk = logits_per_text.topk(k, dim=1).indices
290
-
291
- n_text = len(text_to_image_topk)
292
-
293
- labels = torch.arange(0, n_text).view(-1, 1)
294
-
295
- n_retrieved = ((text_to_image_topk == labels).sum(dim=1) > 0).sum()
296
-
297
- recall_k = n_retrieved / n_text * 100
298
-
299
- out[f'text_to_image_recall_{k}'] = recall_k.item()
300
-
301
- print(f'R@{k}: {recall_k.item():.2f}%')
302
-
303
- # image-to-text retrieval
304
- print('Image-to-Text retrieval')
305
- for k in [1, 5, 10]:
306
- image_to_text_topk = logits_per_image.topk(k, dim=1).indices
307
-
308
- n_image = len(image_to_text_topk)
309
-
310
- labels = torch.arange(0, n_image).view(-1, 1)
311
-
312
- n_retrieved = ((image_to_text_topk == labels).sum(dim=1) > 0).sum()
313
-
314
- recall_k = n_retrieved / n_image * 100
315
-
316
- out[f'image_to_text_recall_{k}'] = recall_k.item()
317
-
318
- print(f'R@{k}: {recall_k.item():.2f}%')
319
-
320
- out.update({
321
- 'loss': val_loss_mean.item(),
322
- 'clip_loss': val_clip_loss_mean.item()
323
- })
324
-
325
- if not self.opt.joint_out:
326
- # grammar scoring
327
- grammar_pos_pred = torch.cat([_['grammar_pos_pred'].cpu() for _ in outputs], dim=0)
328
- grammar_neg_pred = torch.cat([_['grammar_neg_pred'].cpu() for _ in outputs], dim=0)
329
-
330
- TP = (grammar_pos_pred == 1).sum().item()
331
- FP = (grammar_pos_pred == 0).sum().item()
332
- FN = (grammar_neg_pred == 1).sum().item()
333
- TN = (grammar_neg_pred == 0).sum().item()
334
- print('Grammar check')
335
- print(f'TP: {TP} FP: {FP} FN: {FN} TN: {TN}')
336
-
337
- precision = TP / (TP + FP) * 100
338
- recall = TP / (TP + FN) * 100
339
- accuracy = (TP + TN) / (TP + FP + FN + TN) * 100
340
- f1 = 2 * precision * recall / (precision + recall)
341
- print(f'Precision: {precision:.2f}%')
342
- print(f'Recall: {recall:.2f}%')
343
- print(f'Accuracy: {accuracy:.2f}%')
344
- print(f'F1: {f1:.2f}%')
345
- print('Total: {}'.format(len(grammar_pos_pred)))
346
-
347
- out.update({
348
- 'grammar_loss': val_grammar_loss_mean,
349
-
350
- 'grammar_precision': precision,
351
- 'grammar_recall': recall,
352
- 'grammar_accuracy': accuracy,
353
- 'grammar_f1': f1,
354
-
355
- })
356
-
357
- else:
358
- out = {}
359
-
360
- out = d2comm.all_gather(out)[0] # Only the one from master node
361
- assert len(out) > 0 # make sure the head has index 0
362
-
363
- # must all be tensors
364
- out = {k: torch.tensor(v) if not torch.is_tensor(
365
- v) else v for k, v in out.items()}
366
-
367
- for k, v in out.items():
368
- self.log(f'{split}/{k}', v)
369
-
370
- def test_epoch_end(self, outputs):
371
-
372
- self.validation_epoch_end(outputs, 'test')
373
-
374
- def configure_optimizers(self):
375
- # opt = self.opt
376
- # model = self.model
377
-
378
- # parameters = [p for p in model.parameters() if p.requires_grad]
379
-
380
- # if opt.noamopt:
381
- # # assert opt.caption_model in ['transformer', 'bert', 'm2transformer'], 'noamopt can only work with transformer'
382
- # optimizer = utils.get_std_opt(
383
- # model, optim_func=opt.optim, factor=opt.noamopt_factor, warmup=opt.noamopt_warmup)
384
- # elif opt.reduce_on_plateau:
385
- # # optimizer = utils.build_optimizer(model.parameters(), opt)
386
- # optimizer = utils.build_optimizer(parameters, opt)
387
- # optimizer = utils.ReduceLROnPlateau(optimizer,
388
- # factor=opt.reduce_on_plateau_factor,
389
- # patience=opt.reduce_on_plateau_patience)
390
- # else:
391
- # # optimizer = utils.build_optimizer(model.parameters(), opt)
392
- # optimizer = utils.build_optimizer(parameters, opt)
393
-
394
-
395
- # from transformers.optimization import AdamW, get_linear_schedule_with_warmup
396
- # batch_per_epoch = len(self.train_loader)
397
- # t_total = batch_per_epoch // self.args.gradient_accumulation_steps * self.args.epochs
398
- # warmup_ratio = self.args.warmup_ratio
399
- # warmup_iters = int(t_total * warmup_ratio)
400
- # if self.verbose:
401
- # print("Batch per epoch: %d" % batch_per_epoch)
402
- # print("Total Iters: %d" % t_total)
403
- # print('Warmup ratio:', warmup_ratio)
404
- # print("Warm up Iters: %d" % warmup_iters)
405
-
406
- if self.args.optim == 'adamw':
407
- no_decay = ["bias", "LayerNorm.weight"]
408
- optimizer_grouped_parameters = [
409
- {
410
- "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
411
- "weight_decay": self.args.weight_decay,
412
- },
413
- {
414
- "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
415
- "weight_decay": 0.0,
416
- },
417
- ]
418
-
419
- for group in optimizer_grouped_parameters:
420
- group['params'] = [p for p in group['params'] if p.requires_grad]
421
-
422
- from transformers.optimization import AdamW
423
- optim = AdamW(optimizer_grouped_parameters,
424
- lr=self.args.lr, eps=self.args.adam_eps)
425
- # lr_scheduler = get_linear_schedule_with_warmup(
426
- # optim, warmup_iters, t_total)
427
-
428
- # optimizers = []
429
- optimizers = [optim]
430
- lr_schedulers = []
431
-
432
- return optimizers, lr_schedulers
433
-
434
- def optimizer_step(self, epoch, batch_idx, optimizer,
435
- optimizer_idx, *args, **kwargs):
436
- # # warm up lr
437
- # opt = self.opt
438
- # iteration = self.trainer.global_step
439
- # if opt.use_warmup and (iteration < opt.noamopt_warmup):
440
- # opt.current_lr = opt.learning_rate * \
441
- # (iteration+1) / opt.noamopt_warmup
442
- # utils.set_lr(optimizer, opt.current_lr)
443
-
444
- super().optimizer_step(epoch, batch_idx, optimizer,
445
- optimizer_idx, *args, **kwargs)
446
-
447
- # print('optimizer step')
448
-
449
- def state_dict(self):
450
- """
451
- Save the model state dict as well as opt and vocab
452
- """
453
- state_dict = self.model.state_dict()
454
- device = next(iter(state_dict.values())).device
455
- assert '_vocab' not in state_dict and '_opt' not in state_dict, 'Just in case'
456
- # state_dict.update({
457
- # '_vocab': utils.serialize_to_tensor(self.model.vocab).to(device),
458
- # '_opt': utils.serialize_to_tensor(self.opt).to(device)
459
- # })
460
- return state_dict
461
-
462
- def load_state_dict(self, state_dict=None, strict=True):
463
- # if '_vocab' in state_dict:
464
- # self.model.vocab = utils.deserialize(state_dict['_vocab'])
465
- # del state_dict['_vocab']
466
- # elif strict:
467
- # raise KeyError
468
- # if '_opt' in state_dict:
469
- # saved_model_opt = utils.deserialize(state_dict['_opt'])
470
- # del state_dict['_opt']
471
- # opt = self.opt
472
- # # Make sure the saved opt is compatible with the current opt
473
- # need_be_same = ["caption_model",
474
- # "rnn_type", "rnn_size", "num_layers"]
475
- # for checkme in need_be_same:
476
- # if getattr(saved_model_opt, checkme) in ['updown', 'topdown'] and \
477
- # getattr(opt, checkme) in ['updown', 'topdown']:
478
- # continue
479
- # assert getattr(saved_model_opt, checkme) == getattr(
480
- # opt, checkme), "Command line argument and saved model disagree on '%s' " % checkme
481
- # elif strict:
482
- # raise KeyError
483
- self.model.load_state_dict(state_dict, strict)
484
-
485
-
486
- class OnEpochStartCallback(pl.Callback):
487
-
488
- def on_epoch_start(self, trainer, pl_module):
489
- # Update lr/training stage/scheduled sampling prob etc.
490
- opt = pl_module.opt
491
- model = pl_module.model
492
- epoch = trainer.current_epoch
493
- optimizer = trainer.optimizers[0]
494
-
495
- # if not opt.noamopt and not opt.reduce_on_plateau:
496
- # # Assign the learning rate
497
- # if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
498
- # frac = (
499
- # epoch - opt.learning_rate_decay_start) // opt.learning_rate_decay_every
500
- # decay_factor = opt.learning_rate_decay_rate ** frac
501
- # opt.current_lr = opt.learning_rate * decay_factor
502
- # else:
503
- # opt.current_lr = opt.learning_rate
504
- # utils.set_lr(optimizer, opt.current_lr) # set the decayed rate
505
- # # Assign the scheduled sampling prob
506
- # if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
507
- # frac = (
508
- # epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every
509
- # opt.ss_prob = min(opt.scheduled_sampling_increase_prob *
510
- # frac, opt.scheduled_sampling_max_prob)
511
- # model.ss_prob = opt.ss_prob
512
-
513
- # # If start self critical training
514
- # if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
515
- # sc_flag = True
516
- # init_scorer(opt.cached_tokens)
517
- # else:
518
- # sc_flag = False
519
-
520
- # # If start structure loss training
521
- # if opt.structure_after != -1 and epoch >= opt.structure_after:
522
- # struc_flag = True
523
- # init_scorer(opt.cached_tokens)
524
- # else:
525
- # struc_flag = False
526
-
527
- # pl_module.struc_flag = struc_flag
528
- # pl_module.sc_flag = sc_flag
529
-
530
-
531
- class ModelCheckpoint(pl.callbacks.ModelCheckpoint):
532
-
533
- def on_keyboard_interrupt(self, trainer, pl_module):
534
- # Save model when keyboard interrupt
535
- filepath = os.path.join(self.dirpath, self.prefix + 'interrupt.ckpt')
536
- self._save_model(filepath)
537
-
538
- from param import parse_args
539
- # opt = opts.parse_opt()
540
- args = parse_args()
541
- opt = args
542
-
543
- checkpoint_callback = ModelCheckpoint(
544
- filepath=opt.checkpoint_dir + '{epoch:02d}',
545
- # dirpath=opt.checkpoint_path,
546
- save_last=True,
547
- save_top_k=1,
548
- verbose=True,
549
- # monitor='to_monitor',
550
- # monitor='val/to_monitor',
551
- # monitor='val/CIDEr',
552
- monitor='val/loss',
553
- mode='min',
554
- # prefix=opt.id+'_',
555
- prefix=opt.id,
556
- # filename=f'{opt.id}_',
557
- )
558
-
559
- verbose = True
560
- # import torch
561
- # if torch.cuda.current_device() in [0, -1]:
562
- if 'LOCAL_RANK' in os.environ and os.environ['LOCAL_RANK'] != '0':
563
- verbose = False
564
-
565
- # if verbose:
566
- # print(opt)
567
- # print("""
568
- # val_image_use,
569
- # save_checkpoint_very
570
- # save_every_epoch,
571
- # save_history-ckpt will be ignored.
572
- # """)
573
-
574
- # Lightning defines batch size as batch size per gpu
575
- assert opt.batch_size % torch.cuda.device_count() == 0
576
- opt.batch_size = opt.batch_size // torch.cuda.device_count()
577
- opt.valid_batch_size = opt.valid_batch_size // torch.cuda.device_count()
578
-
579
- # If resume from last checkpoint
580
- # if opt.start_from is not None and os.path.isfile(os.path.join(opt.start_from, f'{opt.id}_last.ckpt')):
581
- # resume_from = os.path.join(opt.start_from, f'{opt.id}_last.ckpt')
582
- if opt.start_from is not None and os.path.isfile(os.path.join(opt.start_from, f'{opt.id}-last.ckpt')):
583
- resume_from = os.path.join(opt.start_from, f'{opt.id}-last.ckpt')
584
- if verbose:
585
- print('resume from', resume_from)
586
- else:
587
- resume_from = None
588
-
589
- from pytorch_lightning.loggers import WandbLogger
590
- wandb_logger = WandbLogger(
591
- # project='CLIP-ViL-COCOCaption',
592
- project='CLIP-Finetune-COCO',
593
- name=opt.id,
594
- )
595
-
596
- if verbose:
597
- wandb_logger.experiment.config.update(opt)
598
- from pathlib import Path
599
- import glob
600
- import wandb
601
- # src_dir = Path(__file__).resolve().parent.parent
602
- glob_str = "*.py"
603
- base_path = './'
604
- wandb.save(glob_str=glob_str, base_path=base_path)
605
-
606
- glob_str = "**/*.yaml"
607
- base_path = './'
608
- wandb.save(glob_str=glob_str, base_path=base_path)
609
-
610
- # code = wandb.Artifact('project-source', type='code')
611
- # for path in glob.glob('**/*.py', recursive=True):
612
- # code.add_file(path, name='source/'+path)
613
- # print(path)
614
- # wandb.run.use_artifact(code)
615
-
616
-
617
-
618
-
619
- lit = LitModel(opt)
620
- # warning grad_clip_mode is ignored.
621
- trainer = pl.Trainer(
622
- callbacks=[
623
- OnEpochStartCallback(),
624
- # pl.callbacks.lr_logger.LearningRateLogger()
625
- pl.callbacks.LearningRateMonitor()
626
- ],
627
- default_root_dir=opt.checkpoint_dir,
628
- resume_from_checkpoint=resume_from,
629
-
630
- distributed_backend='ddp',
631
- gpus=torch.cuda.device_count(),
632
-
633
- # gpus=1,
634
-
635
- check_val_every_n_epoch=1,
636
- # max_epochs=opt.max_epochs,
637
- max_epochs=opt.epochs,
638
- # gradient_clip_val=opt.grad_clip_value,
639
- gradient_clip_val=opt.clip_grad_norm,
640
-
641
- checkpoint_callback=checkpoint_callback,
642
- log_gpu_memory='min_max',
643
- # log_save_interval=opt.losses_log_every,
644
- log_every_n_steps=opt.losses_log_every,
645
- profiler=True,
646
- # profiler='simple',
647
- # row_log_interval=10, # what is it?
648
- flush_logs_every_n_steps=10,
649
- num_sanity_val_steps=0,
650
- # val_check_interval=0.01,
651
- # limit_train_batches=500,
652
- # progress_bar_refresh_rate=0,
653
- # fast_dev_run=True,
654
- precision=opt.precision,
655
- logger=wandb_logger
656
- )
657
-
658
- if os.getenv('EVALUATE', '0') == '1':
659
- trainer.test(lit)
660
- else:
661
- trainer.fit(lit)
 
save/README.md DELETED
@@ -1 +0,0 @@
1
- Directory for checkpoints
 
scripts/build_bpe_subword_nmt.py DELETED
@@ -1,214 +0,0 @@
1
- """
2
- Preprocess a raw json dataset into hdf5/json files for use in data_loader.lua
3
-
4
- Input: json file that has the form
5
- [{ file_path: 'path/img.jpg', captions: ['a caption', ...] }, ...]
6
- example element in this list would look like
7
- {'captions': [u'A man with a red helmet on a small moped on a dirt road. ', u'Man riding a motor bike on a dirt road on the countryside.', u'A man riding on the back of a motorcycle.', u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', u'A man in a red shirt and a red hat is on a motorcycle on a hill side.'], 'file_path': u'val2014/COCO_val2014_000000391895.jpg', 'id': 391895}
8
-
9
- This script reads this json, does some basic preprocessing on the captions
10
- (e.g. lowercase, etc.), creates a special UNK token, and encodes everything to arrays
11
-
12
- Output: a json file and an hdf5 file
13
- The hdf5 file contains several fields:
14
- /labels is (M,max_length) uint32 array of encoded labels, zero padded
15
- /label_start_ix and /label_end_ix are (N,) uint32 arrays of pointers to the
16
- first and last indices (in range 1..M) of labels for each image
17
- /label_length stores the length of the sequence for each of the M sequences
18
-
19
- The json file has a dict that contains:
20
- - an 'ix_to_word' field storing the vocab in form {ix:'word'}, where ix is 1-indexed
21
- - an 'images' field that is a list holding auxiliary information for each image,
22
- such as in particular the 'split' it was assigned to.
23
- """
24
-
25
- from __future__ import absolute_import
26
- from __future__ import division
27
- from __future__ import print_function
28
-
29
- import os
30
- import json
31
- import argparse
32
- from random import shuffle, seed
33
- import string
34
- # non-standard dependencies:
35
- import h5py
36
- import numpy as np
37
- import torch
38
- import torchvision.models as models
39
- import skimage.io
40
- from PIL import Image
41
-
42
- import codecs
43
- import tempfile
44
- from subword_nmt import learn_bpe, apply_bpe
45
-
46
- # python scripts/build_bpe_subword_nmt.py --input_json data/dataset_coco.json --output_json data/cocotalkbpe.json --output_h5 data/cocotalkbpe
47
-
48
- def build_vocab(imgs, params):
49
- # count up the number of words
50
- captions = []
51
- for img in imgs:
52
- for sent in img['sentences']:
53
- captions.append(' '.join(sent['tokens']))
54
- captions='\n'.join(captions)
55
- all_captions = tempfile.NamedTemporaryFile(delete=False)
56
- all_captions.close()
57
- with open(all_captions.name, 'w') as txt_file:
58
- txt_file.write(captions)
59
-
60
- #
61
- codecs_output = tempfile.NamedTemporaryFile(delete=False)
62
- codecs_output.close()
63
- with codecs.open(codecs_output.name, 'w', encoding='UTF-8') as output:
64
- learn_bpe.learn_bpe(codecs.open(all_captions.name, encoding='UTF-8'), output, params['symbol_count'])
65
-
66
- with codecs.open(codecs_output.name, encoding='UTF-8') as codes:
67
- bpe = apply_bpe.BPE(codes)
68
-
69
- tmp = tempfile.NamedTemporaryFile(delete=False)
70
- tmp.close()
71
-
72
- tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8')
73
-
74
- for _, img in enumerate(imgs):
75
- img['final_captions'] = []
76
- for sent in img['sentences']:
77
- txt = ' '.join(sent['tokens'])
78
- txt = bpe.segment(txt).strip()
79
- img['final_captions'].append(txt.split(' '))
80
- tmpout.write(txt)
81
- tmpout.write('\n')
82
- if _ < 20:
83
- print(txt)
84
-
85
- tmpout.close()
86
- tmpin = codecs.open(tmp.name, encoding='UTF-8')
87
-
88
- vocab = learn_bpe.get_vocabulary(tmpin)
89
- vocab = sorted(vocab.keys(), key=lambda x: vocab[x], reverse=True)
90
-
91
- # Always insert UNK
92
- print('inserting the special UNK token')
93
- vocab.append('UNK')
94
-
95
- print('Vocab size:', len(vocab))
96
-
97
- os.remove(all_captions.name)
98
- with open(codecs_output.name, 'r') as codes:
99
- bpe = codes.read()
100
- os.remove(codecs_output.name)
101
- os.remove(tmp.name)
102
-
103
- return vocab, bpe
104
-
105
- def encode_captions(imgs, params, wtoi):
106
- """
107
- encode all captions into one large array, which will be 1-indexed.
108
- also produces label_start_ix and label_end_ix which store 1-indexed
109
- and inclusive (Lua-style) pointers to the first and last caption for
110
- each image in the dataset.
111
- """
112
-
113
- max_length = params['max_length']
114
- N = len(imgs)
115
- M = sum(len(img['final_captions']) for img in imgs) # total number of captions
116
-
117
- label_arrays = []
118
- label_start_ix = np.zeros(N, dtype='uint32') # note: these will be one-indexed
119
- label_end_ix = np.zeros(N, dtype='uint32')
120
- label_length = np.zeros(M, dtype='uint32')
121
- caption_counter = 0
122
- counter = 1
123
- for i,img in enumerate(imgs):
124
- n = len(img['final_captions'])
125
- assert n > 0, 'error: some image has no captions'
126
-
127
- Li = np.zeros((n, max_length), dtype='uint32')
128
- for j,s in enumerate(img['final_captions']):
129
- label_length[caption_counter] = min(max_length, len(s)) # record the length of this sequence
130
- caption_counter += 1
131
- for k,w in enumerate(s):
132
- if k < max_length:
133
- Li[j,k] = wtoi[w]
134
-
135
- # note: word indices are 1-indexed, and captions are padded with zeros
136
- label_arrays.append(Li)
137
- label_start_ix[i] = counter
138
- label_end_ix[i] = counter + n - 1
139
-
140
- counter += n
141
-
142
- L = np.concatenate(label_arrays, axis=0) # put all the labels together
143
- assert L.shape[0] == M, 'lengths don\'t match? that\'s weird'
144
- assert np.all(label_length > 0), 'error: some caption had no words?'
145
-
146
- print('encoded captions to array of size ', L.shape)
147
- return L, label_start_ix, label_end_ix, label_length
148
-
149
- def main(params):
150
-
151
- imgs = json.load(open(params['input_json'], 'r'))
152
- imgs = imgs['images']
153
-
154
- seed(123) # make reproducible
155
-
156
- # create the vocab
157
- vocab, bpe = build_vocab(imgs, params)
158
- itow = {i+1:w for i,w in enumerate(vocab)} # a 1-indexed vocab translation table
159
- wtoi = {w:i+1 for i,w in enumerate(vocab)} # inverse table
160
-
161
- # encode captions in large arrays, ready to ship to hdf5 file
162
- L, label_start_ix, label_end_ix, label_length = encode_captions(imgs, params, wtoi)
163
-
164
- # create output h5 file
165
- N = len(imgs)
166
- f_lb = h5py.File(params['output_h5']+'_label.h5', "w")
167
- f_lb.create_dataset("labels", dtype='uint32', data=L)
168
- f_lb.create_dataset("label_start_ix", dtype='uint32', data=label_start_ix)
169
- f_lb.create_dataset("label_end_ix", dtype='uint32', data=label_end_ix)
170
- f_lb.create_dataset("label_length", dtype='uint32', data=label_length)
171
- f_lb.close()
172
-
173
- # create output json file
174
- out = {}
175
- out['ix_to_word'] = itow # encode the (1-indexed) vocab
176
- out['images'] = []
177
- out['bpe'] = bpe
178
- for i,img in enumerate(imgs):
179
-
180
- jimg = {}
181
- jimg['split'] = img['split']
182
- if 'filename' in img: jimg['file_path'] = os.path.join(img['filepath'], img['filename']) # copy it over, might need
183
- if 'cocoid' in img: jimg['id'] = img['cocoid'] # copy over & maintain an id, if present (e.g. coco ids, useful)
184
-
185
- if params['images_root'] != '':
186
- with Image.open(os.path.join(params['images_root'], img['filepath'], img['filename'])) as _img:
187
- jimg['width'], jimg['height'] = _img.size
188
-
189
- out['images'].append(jimg)
190
-
191
- json.dump(out, open(params['output_json'], 'w'))
192
- print('wrote ', params['output_json'])
193
-
194
- if __name__ == "__main__":
195
-
196
- parser = argparse.ArgumentParser()
197
-
198
- # input json
199
- parser.add_argument('--input_json', required=True, help='input json file to process into hdf5')
200
- parser.add_argument('--output_json', default='data.json', help='output json file')
201
- parser.add_argument('--output_h5', default='data', help='output h5 file')
202
- parser.add_argument('--images_root', default='', help='root location in which images are stored, to be prepended to file_path in input json')
203
-
204
- # options
205
- parser.add_argument('--max_length', default=16, type=int, help='max length of a caption, in number of words. captions longer than this get clipped.')
206
- parser.add_argument('--symbol_count', default=10000, type=int, help='only words that occur more than this number of times will be put in vocab')
207
-
208
- args = parser.parse_args()
209
- params = vars(args) # convert to ordinary dict
210
- print('parsed input parameters:')
211
- print(json.dumps(params, indent = 2))
212
- main(params)
213
-
214
-
 
scripts/clip_prepro_feats.py DELETED
@@ -1,170 +0,0 @@
1
- """
2
- Preprocess a raw json dataset into features files for use in data_loader.py
3
-
4
- Input: json file that has the form
5
- [{ file_path: 'path/img.jpg', captions: ['a caption', ...] }, ...]
6
- example element in this list would look like
7
- {'captions': [u'A man with a red helmet on a small moped on a dirt road. ', u'Man riding a motor bike on a dirt road on the countryside.', u'A man riding on the back of a motorcycle.', u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', u'A man in a red shirt and a red hat is on a motorcycle on a hill side.'], 'file_path': u'val2014/COCO_val2014_000000391895.jpg', 'id': 391895}
8
-
9
- This script reads this json, does some basic preprocessing on the captions
10
- (e.g. lowercase, etc.), creates a special UNK token, and encodes everything to arrays
11
-
12
- Output: two folders of features
13
- """
14
-
15
- from __future__ import absolute_import
16
- from __future__ import division
17
- from __future__ import print_function
18
-
19
- import os
20
- import json
21
- import argparse
22
- from random import shuffle, seed
23
- import string
24
- # non-standard dependencies:
25
- import h5py
26
- from six.moves import cPickle
27
- import numpy as np
28
- import torch
29
- import torchvision.models as models
30
- import skimage.io
31
-
32
- from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
33
- from PIL import Image
34
- from torch import nn
35
-
36
- preprocess = Compose([
37
- Resize((448, 448), interpolation=Image.BICUBIC),
38
- CenterCrop((448, 448)),
39
- ToTensor()
40
- ])
41
-
42
-
43
- from clip.clip import load
44
- from timm.models.vision_transformer import resize_pos_embed
45
- import timm
46
-
47
- from captioning.utils.resnet_utils import myResnet
48
- import captioning.utils.resnet as resnet
49
-
50
- from tqdm import tqdm
51
-
52
-
53
- def main(params):
54
- if params["model_type"] != 'vit_base_patch32_224_in21k':
55
- model, transform = load(params["model_type"], jit=False)
56
- else:
57
- model = timm.create_model(params["model_type"], pretrained=True)
58
- model = model.cuda()
59
-
60
- if params["model_type"] != 'vit_base_patch32_224_in21k':
61
- save_model_type = params["model_type"].split("-")[0]
62
- mean = torch.Tensor([0.48145466, 0.4578275, 0.40821073]).to("cuda").reshape(3, 1, 1)
63
- std = torch.Tensor([0.26862954, 0.26130258, 0.27577711]).to("cuda").reshape(3, 1, 1)
64
-
65
- if "RN" in params["model_type"]:
66
- num_patches = 196 #600 * 1000 // 32 // 32
67
- pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, model.visual.attnpool.positional_embedding.shape[-1], device='cuda'),)
68
- pos_embed.weight = resize_pos_embed(model.visual.attnpool.positional_embedding.unsqueeze(0), pos_embed)
69
- model.visual.attnpool.positional_embedding = pos_embed
70
-
71
- else:
72
- save_model_type = 'vit_base'
73
- mean = torch.Tensor([0.5, 0.5, 0.5]).to("cuda").reshape(3, 1, 1)
74
- std = torch.Tensor([0.5, 0.5, 0.5]).to("cuda").reshape(3, 1, 1)
75
-
76
- num_patches = 196 #600 * 1000 // 32 // 32
77
- pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, 768, device='cuda'),)
78
- pos_embed.weight = resize_pos_embed(model.pos_embed, pos_embed)
79
- model.pos_embed = pos_embed
80
-
81
- if params["model_type"] == "ViT-B/32":
82
- num_patches = 196 #600 * 1000 // 32 // 32
83
- pos_embed = nn.Parameter(torch.zeros(num_patches + 1, 768, device='cuda'),)
84
- pos_embed.weight = resize_pos_embed(model.visual.positional_embedding.unsqueeze(0), pos_embed.unsqueeze(0))
85
- model.visual.positional_embedding = pos_embed
86
- imgs = json.load(open(params['input_json'], 'r'))
87
-
88
- imgs = imgs['images']
89
-
90
- if args.n_jobs > 1:
91
- print('Total imgs:', len(imgs))
92
- print('Using {} jobs'.format(args.n_jobs))
93
- print('job id:', args.job_id)
94
- imgs = imgs[args.job_id::args.n_jobs]
95
-
96
- N = len(imgs)
97
-
98
- seed(123) # make reproducible
99
-
100
- dir_fc = params['output_dir']+'_clip_'+save_model_type+'_fc'
101
- dir_att = params['output_dir']+'_clip_'+save_model_type+'_att'
102
- if not os.path.isdir(dir_fc):
103
- os.mkdir(dir_fc)
104
- if not os.path.isdir(dir_att):
105
- os.mkdir(dir_att)
106
-
107
- for i,img in enumerate(tqdm(imgs)):
108
- # load the image
109
- with torch.no_grad():
110
-
111
- image = preprocess(Image.open(os.path.join(params['images_root'], img['filepath'], img['filename']) ).convert("RGB"))
112
- image = torch.tensor(np.stack([image])).cuda()
113
- image -= mean
114
- image /= std
115
- if "RN" in params["model_type"]:
116
- tmp_att, tmp_fc = model.encode_image(image)
117
- tmp_att = tmp_att[0].permute(1, 2, 0)
118
- tmp_fc = tmp_fc[0]
119
- elif params["model_type"] == 'vit_base_patch32_224_in21k':
120
- x = model(image)
121
- tmp_fc = x[0, 0, :]
122
- tmp_att = x[0, 1:, :].reshape( 14, 14, 768 )
123
- else:
124
- x = model.visual.conv1(image.half()) # shape = [*, width, grid, grid]
125
- x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
126
- x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
127
- x = torch.cat([model.visual.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width]
128
- x = x + model.visual.positional_embedding.to(x.dtype)[:x.shape[1], :]
129
- x = model.visual.ln_pre(x)
130
-
131
- x = x.permute(1, 0, 2) # NLD -> LND
132
-
133
- for layer_idx, layer in enumerate(model.visual.transformer.resblocks):
134
- x = layer(x)
135
-
136
- x = x.permute(1, 0, 2)
137
- tmp_fc = x[0, 0, :]
138
- tmp_att = x[0, 1:, :].reshape( 14, 14, 768 )
139
-
140
- np.save(os.path.join(dir_fc, str(img['cocoid'])), tmp_fc.data.cpu().float().numpy())
141
- np.savez_compressed(os.path.join(dir_att, str(img['cocoid'])), feat=tmp_att.data.cpu().float().numpy())
142
-
143
-
144
- # if i % 1000 == 0:
145
- # print('processing %d/%d (%.2f%% done)' % (i, N, i*100.0/N))
146
- print('wrote ', dir_fc, dir_att)
147
-
148
- if __name__ == "__main__":
149
-
150
- parser = argparse.ArgumentParser()
151
-
152
- # input json
153
- parser.add_argument('--input_json', required=True, help='input json file to process into hdf5')
154
- parser.add_argument('--output_dir', default='data', help='output h5 file')
155
-
156
- # options
157
- parser.add_argument('--images_root', default='', help='root location in which images are stored, to be prepended to file_path in input json')
158
- parser.add_argument('--att_size', default=14, type=int, help='14x14 or 7x7')
159
- parser.add_argument('--model_type', default='RN50', type=str, help='RN50, RN101, RN50x4, ViT-B/32, vit_base_patch32_224_in21k')
160
-
161
- parser.add_argument('--n_jobs', default=-1, type=int, help='number of jobs to run in parallel')
162
- parser.add_argument('--job_id', default=0, type=int, help='job id')
163
- parser.add_argument('--batch_size', default=1, type=int, help='batch size')
164
-
165
-
166
- args = parser.parse_args()
167
- params = vars(args) # convert to ordinary dict
168
- print('parsed input parameters:')
169
- print(json.dumps(params, indent = 2))
170
- main(params)
 
scripts/clipscore_prepro_feats.py DELETED
@@ -1,162 +0,0 @@
1
- """
2
- Preprocess a raw json dataset into features files for use in data_loader.py
3
-
4
- Input: json file that has the form
5
- [{ file_path: 'path/img.jpg', captions: ['a caption', ...] }, ...]
6
- example element in this list would look like
7
- {'captions': [u'A man with a red helmet on a small moped on a dirt road. ', u'Man riding a motor bike on a dirt road on the countryside.', u'A man riding on the back of a motorcycle.', u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', u'A man in a red shirt and a red hat is on a motorcycle on a hill side.'], 'file_path': u'val2014/COCO_val2014_000000391895.jpg', 'id': 391895}
8
-
9
- This script reads this json, does some basic preprocessing on the captions
10
- (e.g. lowercase, etc.), creates a special UNK token, and encodes everything to arrays
11
-
12
- Output: two folders of features
13
- """
14
-
15
- from __future__ import absolute_import
16
- from __future__ import division
17
- from __future__ import print_function
18
-
19
- import os
20
- import json
21
- import argparse
22
- from random import shuffle, seed
23
- import string
24
- # non-standard dependencies:
25
- import h5py
26
- from six.moves import cPickle
27
- import numpy as np
28
- import torch
29
- import torchvision.models as models
30
- import skimage.io
31
-
32
- from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
33
- from PIL import Image
34
- from torch import nn
35
-
36
- # preprocess = Compose([
37
- # Resize((448, 448), interpolation=Image.BICUBIC),
38
- # CenterCrop((448, 448)),
39
- # ToTensor()
40
- # ])
41
-
42
-
43
- # from clip.clip import load
44
- # from timm.models.vision_transformer import resize_pos_embed
45
- # import timm
46
-
47
- # from captioning.utils.resnet_utils import myResnet
48
- # import captioning.utils.resnet as resnet
49
-
50
- from captioning.utils.clipscore import CLIPScore
51
-
52
- from tqdm import tqdm
53
-
54
-
55
-
56
- def main(params):
57
-
58
- clipscore_model = CLIPScore()
59
- clipscore_model.to('cuda')
60
-
61
- imgs = json.load(open(params['input_json'], 'r'))
62
- imgs = imgs['images']
63
-
64
- if args.n_jobs > 1:
65
- print('Total imgs:', len(imgs))
66
- print('Using {} jobs'.format(args.n_jobs))
67
- print('job id:', args.job_id)
68
- imgs = imgs[args.job_id::args.n_jobs]
69
-
70
- N = len(imgs)
71
-
72
- seed(123) # make reproducible
73
-
74
- # dir_fc = params['output_dir']+'_clip_'+save_model_type+'_fc'
75
- # dir_att = params['output_dir']+'_clip_'+save_model_type+'_att'
76
-
77
- vis_dir_fc = params['output_dir']+'_clipscore_vis'
78
- if not os.path.isdir(vis_dir_fc):
79
- os.mkdir(vis_dir_fc)
80
-
81
- # text_dir_fc = params['output_dir']+'_clipscore_text'
82
- # if not os.path.isdir(text_dir_fc):
83
- # os.mkdir(text_dir_fc)
84
-
85
- # if not os.path.isdir(dir_att):
86
- # os.mkdir(dir_att)
87
-
88
- for i, img in enumerate(tqdm(imgs)):
89
- # load the image
90
-
91
- img_path = os.path.join(params['images_root'], img['filepath'], img['filename'])
92
- img_feat = clipscore_model.image_extract(img_path)
93
- img_feat = img_feat.view(512)
94
-
95
- # for d in img['sentences']:
96
- # text = d['raw'].strip()
97
- # text_feat = clipscore_model.text_extract(text)
98
-
99
-
100
- # with torch.no_grad():
101
-
102
- # image = preprocess(Image.open(os.path.join(params['images_root'], img['filepath'], img['filename']) ).convert("RGB"))
103
- # image = torch.tensor(np.stack([image])).cuda()
104
- # image -= mean
105
- # image /= std
106
- # if "RN" in params["model_type"]:
107
- # tmp_att, tmp_fc = model.encode_image(image)
108
- # tmp_att = tmp_att[0].permute(1, 2, 0)
109
- # tmp_fc = tmp_fc[0]
110
- # elif params["model_type"] == 'vit_base_patch32_224_in21k':
111
- # x = model(image)
112
- # tmp_fc = x[0, 0, :]
113
- # tmp_att = x[0, 1:, :].reshape( 14, 14, 768 )
114
- # else:
115
- # x = model.visual.conv1(image.half()) # shape = [*, width, grid, grid]
116
- # x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
117
- # x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
118
- # x = torch.cat([model.visual.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width]
119
- # x = x + model.visual.positional_embedding.to(x.dtype)[:x.shape[1], :]
120
- # x = model.visual.ln_pre(x)
121
-
122
- # x = x.permute(1, 0, 2) # NLD -> LND
123
-
124
- # for layer_idx, layer in enumerate(model.visual.transformer.resblocks):
125
- # x = layer(x)
126
-
127
- # x = x.permute(1, 0, 2)
128
- # tmp_fc = x[0, 0, :]
129
- # tmp_att = x[0, 1:, :].reshape( 14, 14, 768 )
130
-
131
- np.save(os.path.join(vis_dir_fc, str(img['cocoid'])), img_feat.data.cpu().float().numpy())
132
- # np.save(os.path.join(text_dir_fc, str(img['cocoid'])), tmp_fc.data.cpu().float().numpy())
133
-
134
-
135
- # np.savez_compressed(os.path.join(dir_att, str(img['cocoid'])), feat=tmp_att.data.cpu().float().numpy())
136
-
137
- if i % 1000 == 0:
138
- print('processing %d/%d (%.2f%% done)' % (i, N, i*100.0/N))
139
- print('wrote ', vis_dir_fc)
140
-
141
- if __name__ == "__main__":
142
-
143
- parser = argparse.ArgumentParser()
144
-
145
- # input json
146
- # dataset_coco.json
147
- parser.add_argument('--input_json', required=True, help='input json file to process into hdf5')
148
- parser.add_argument('--output_dir', default='data', help='output h5 file')
149
-
150
- # options
151
- parser.add_argument('--images_root', default='', help='root location in which images are stored, to be prepended to file_path in input json')
152
- # parser.add_argument('--att_size', default=14, type=int, help='14x14 or 7x7')
153
- # parser.add_argument('--model_type', default='RN50', type=str, help='RN50, RN101, RN50x4, ViT-B/32, vit_base_patch32_224_in21k')
154
-
155
- parser.add_argument('--n_jobs', default=-1, type=int, help='number of jobs to run in parallel')
156
- parser.add_argument('--job_id', default=0, type=int, help='job id')
157
-
158
- args = parser.parse_args()
159
- params = vars(args) # convert to ordinary dict
160
- print('parsed input parameters:')
161
- print(json.dumps(params, indent = 2))
162
- main(params)
 
scripts/copy_model.sh DELETED
@@ -1,9 +0,0 @@
1
- #!/bin/sh
2
-
3
- if [ ! -d log_$2 ]; then
4
- cp -r log_$1 log_$2
5
- cd log_$2
6
- mv infos_$1-best.pkl infos_$2-best.pkl
7
- mv infos_$1.pkl infos_$2.pkl
8
- cd ../
9
- fi
 
scripts/dump_to_h5df.py DELETED
@@ -1,56 +0,0 @@
1
- import argparse
2
- import h5py
3
- import os
4
- import numpy as np
5
- import json
6
- from tqdm import tqdm
7
-
8
-
9
- def main(params):
10
-
11
- imgs = json.load(open(params['input_json'], 'r'))
12
- imgs = imgs['images']
13
- N = len(imgs)
14
-
15
- if params['fc_input_dir'] is not None:
16
- print('processing fc')
17
- with h5py.File(params['fc_output']) as file_fc:
18
- for i, img in enumerate(tqdm(imgs)):
19
- npy_fc_path = os.path.join(
20
- params['fc_input_dir'],
21
- str(img['cocoid']) + '.npy')
22
-
23
- d_set_fc = file_fc.create_dataset(
24
- str(img['cocoid']), data=np.load(npy_fc_path))
25
- file_fc.close()
26
-
27
- if params['att_input_dir'] is not None:
28
- print('processing att')
29
- with h5py.File(params['att_output']) as file_att:
30
- for i, img in enumerate(tqdm(imgs)):
31
- npy_att_path = os.path.join(
32
- params['att_input_dir'],
33
- str(img['cocoid']) + '.npz')
34
-
35
- d_set_att = file_att.create_dataset(
36
- str(img['cocoid']),
37
- data=np.load(npy_att_path)['feat'])
38
- file_att.close()
39
-
40
-
41
- if __name__ == "__main__":
42
-
43
- parser = argparse.ArgumentParser()
44
-
45
- parser.add_argument('--input_json', required=True, help='input json file to process into hdf5')
46
- parser.add_argument('--fc_output', default='data', help='output h5 filename for fc')
47
- parser.add_argument('--att_output', default='data', help='output h5 file for att')
48
- parser.add_argument('--fc_input_dir', default=None, help='input directory for numpy fc files')
49
- parser.add_argument('--att_input_dir', default=None, help='input directory for numpy att files')
50
-
51
- args = parser.parse_args()
52
- params = vars(args) # convert to ordinary dict
53
- print('parsed input parameters:')
54
- print(json.dumps(params, indent=2))
55
-
56
- main(params)
 
scripts/dump_to_lmdb.py DELETED
@@ -1,241 +0,0 @@
1
- # copy from https://github.com/Lyken17/Efficient-PyTorch/tools
2
-
3
- from __future__ import absolute_import
4
- from __future__ import division
5
- from __future__ import print_function
6
-
7
- import os
8
- import os.path as osp
9
- import os, sys
10
- import os.path as osp
11
- from PIL import Image
12
- import six
13
- import string
14
-
15
- from lmdbdict import lmdbdict
16
- from lmdbdict.methods import DUMPS_FUNC, LOADS_FUNC
17
- import pickle
18
- import tqdm
19
- import numpy as np
20
- import argparse
21
- import json
22
-
23
- import torch
24
- import torch.utils.data as data
25
- from torch.utils.data import DataLoader
26
-
27
- import csv
28
- csv.field_size_limit(sys.maxsize)
29
- FIELDNAMES = ['image_id', 'status']
30
-
31
- class FolderLMDB(data.Dataset):
32
- def __init__(self, db_path, fn_list=None):
33
- self.db_path = db_path
34
- self.lmdb = lmdbdict(db_path, unsafe=True)
35
- self.lmdb._key_dumps = DUMPS_FUNC['ascii']
36
- self.lmdb._value_loads = LOADS_FUNC['identity']
37
- if fn_list is not None:
38
- self.length = len(fn_list)
39
- self.keys = fn_list
40
- else:
41
- raise Error
42
-
43
- def __getitem__(self, index):
44
- byteflow = self.lmdb[self.keys[index]]
45
-
46
- # load image
47
- imgbuf = byteflow
48
- buf = six.BytesIO()
49
- buf.write(imgbuf)
50
- buf.seek(0)
51
- try:
52
- if args.extension == '.npz':
53
- feat = np.load(buf)['feat']
54
- else:
55
- feat = np.load(buf)
56
- except Exception as e:
57
- print(self.keys[index], e)
58
- return None
59
-
60
- return feat
61
-
62
- def __len__(self):
63
- return self.length
64
-
65
- def __repr__(self):
66
- return self.__class__.__name__ + ' (' + self.db_path + ')'
67
-
68
-
69
- def make_dataset(dir, extension):
70
- images = []
71
- dir = os.path.expanduser(dir)
72
- for root, _, fnames in sorted(os.walk(dir)):
73
- for fname in sorted(fnames):
74
- if has_file_allowed_extension(fname, [extension]):
75
- path = os.path.join(root, fname)
76
- images.append(path)
77
-
78
- return images
79
-
80
-
81
- def raw_reader(path):
82
- with open(path, 'rb') as f:
83
- bin_data = f.read()
84
- return bin_data
85
-
86
-
87
- def raw_npz_reader(path):
88
- with open(path, 'rb') as f:
89
- bin_data = f.read()
90
- try:
91
- npz_data = np.load(six.BytesIO(bin_data))['feat']
92
- except Exception as e:
93
- print(path)
94
- npz_data = None
95
- print(e)
96
- return bin_data, npz_data
97
-
98
-
99
- def raw_npy_reader(path):
100
- with open(path, 'rb') as f:
101
- bin_data = f.read()
102
- try:
103
- npy_data = np.load(six.BytesIO(bin_data))
104
- except Exception as e:
105
- print(path)
106
- npy_data = None
107
- print(e)
108
- return bin_data, npy_data
109
-
110
-
111
- class Folder(data.Dataset):
112
-
113
- def __init__(self, root, loader, extension, fn_list=None):
114
- super(Folder, self).__init__()
115
- self.root = root
116
- if fn_list:
117
- samples = [os.path.join(root, str(_)+extension) for _ in fn_list]
118
- else:
119
- samples = make_dataset(self.root, extension)
120
-
121
- self.loader = loader
122
- self.extension = extension
123
- self.samples = samples
124
-
125
- def __getitem__(self, index):
126
- """
127
- Args:
128
- index (int): Index
129
- Returns:
130
- tuple: (sample, target) where target is class_index of the target class.
131
- """
132
- path = self.samples[index]
133
- sample = self.loader(path)
134
-
135
- return (path.split('/')[-1].split('.')[0],) + sample
136
-
137
- def __len__(self):
138
- return len(self.samples)
139
-
140
-
141
- def folder2lmdb(dpath, fn_list, write_frequency=5000):
142
- directory = osp.expanduser(osp.join(dpath))
143
- print("Loading dataset from %s" % directory)
144
- if args.extension == '.npz':
145
- dataset = Folder(directory, loader=raw_npz_reader, extension='.npz',
146
- fn_list=fn_list)
147
- else:
148
- dataset = Folder(directory, loader=raw_npy_reader, extension='.npy',
149
- fn_list=fn_list)
150
- data_loader = DataLoader(dataset, num_workers=16, collate_fn=lambda x: x)
151
-
152
- # lmdb_path = osp.join(dpath, "%s.lmdb" % (directory.split('/')[-1]))
153
- lmdb_path = osp.join("%s.lmdb" % (directory))
154
- isdir = os.path.isdir(lmdb_path)
155
-
156
- print("Generate LMDB to %s" % lmdb_path)
157
- db = lmdbdict(lmdb_path, mode='w', key_method='ascii', value_method='identity')
158
-
159
- tsvfile = open(args.output_file, 'a')
160
- writer = csv.DictWriter(tsvfile, delimiter='\t', fieldnames=FIELDNAMES)
161
- names = []
162
- all_keys = []
163
- for idx, data in enumerate(tqdm.tqdm(data_loader)):
164
- # print(type(data), data)
165
- name, byte, npz = data[0]
166
- if npz is not None:
167
- db[name] = byte
168
- all_keys.append(name)
169
- names.append({'image_id': name, 'status': str(npz is not None)})
170
- if idx % write_frequency == 0:
171
- print("[%d/%d]" % (idx, len(data_loader)))
172
- print('writing')
173
- db.flush()
174
- # write in tsv
175
- for name in names:
176
- writer.writerow(name)
177
- names = []
178
- tsvfile.flush()
179
- print('writing finished')
180
- # write all keys
181
- # txn.put("keys".encode(), pickle.dumps(all_keys))
182
- # # finish iterating through dataset
183
- # txn.commit()
184
- for name in names:
185
- writer.writerow(name)
186
- tsvfile.flush()
187
- tsvfile.close()
188
-
189
- print("Flushing database ...")
190
- db.flush()
191
- del db
192
-
193
- def parse_args():
194
- """
195
- Parse input arguments
196
- """
197
- parser = argparse.ArgumentParser(description='Generate bbox output from a Fast R-CNN network')
198
- # parser.add_argument('--json)
199
- parser.add_argument('--input_json', default='./data/dataset_coco.json', type=str)
200
- parser.add_argument('--output_file', default='.dump_cache.tsv', type=str)
201
- parser.add_argument('--folder', default='./data/cocobu_att', type=str)
202
- parser.add_argument('--extension', default='.npz', type=str)
203
-
204
- args = parser.parse_args()
205
- return args
206
-
207
- if __name__ == "__main__":
208
- global args
209
- args = parse_args()
210
-
211
- args.output_file += args.folder.split('/')[-1]
212
- if args.folder.find('/') > 0:
213
- args.output_file = args.folder[:args.folder.rfind('/')+1]+args.output_file
214
- print(args.output_file)
215
-
216
- img_list = json.load(open(args.input_json, 'r'))['images']
217
- fn_list = [str(_['cocoid']) for _ in img_list]
218
- found_ids = set()
219
- try:
220
- with open(args.output_file, 'r') as tsvfile:
221
- reader = csv.DictReader(tsvfile, delimiter='\t', fieldnames=FIELDNAMES)
222
- for item in reader:
223
- if item['status'] == 'True':
224
- found_ids.add(item['image_id'])
225
- except:
226
- pass
227
- fn_list = [_ for _ in fn_list if _ not in found_ids]
228
- folder2lmdb(args.folder, fn_list)
229
-
230
- # Test existing.
231
- found_ids = set()
232
- with open(args.output_file, 'r') as tsvfile:
233
- reader = csv.DictReader(tsvfile, delimiter='\t', fieldnames=FIELDNAMES)
234
- for item in reader:
235
- if item['status'] == 'True':
236
- found_ids.add(item['image_id'])
237
-
238
- folder_dataset = FolderLMDB(args.folder+'.lmdb', list(found_ids))
239
- data_loader = DataLoader(folder_dataset, num_workers=16, collate_fn=lambda x: x)
240
- for data in tqdm.tqdm(data_loader):
241
- assert data[0] is not None
 
scripts/make_bu_data.py DELETED
@@ -1,52 +0,0 @@
1
- from __future__ import absolute_import
2
- from __future__ import division
3
- from __future__ import print_function
4
-
5
- import os
6
- import base64
7
- import numpy as np
8
- import csv
9
- import sys
10
- import zlib
11
- import time
12
- import mmap
13
- import argparse
14
-
15
- parser = argparse.ArgumentParser()
16
-
17
- # output_dir
18
- parser.add_argument('--downloaded_feats', default='data/bu_data', help='downloaded feature directory')
19
- parser.add_argument('--output_dir', default='data/cocobu', help='output feature files')
20
-
21
- args = parser.parse_args()
22
-
23
- csv.field_size_limit(sys.maxsize)
24
-
25
-
26
- FIELDNAMES = ['image_id', 'image_w','image_h','num_boxes', 'boxes', 'features']
27
- infiles = ['trainval/karpathy_test_resnet101_faster_rcnn_genome.tsv',
28
- 'trainval/karpathy_val_resnet101_faster_rcnn_genome.tsv',\
29
- 'trainval/karpathy_train_resnet101_faster_rcnn_genome.tsv.0', \
30
- 'trainval/karpathy_train_resnet101_faster_rcnn_genome.tsv.1']
31
-
32
- os.makedirs(args.output_dir+'_att')
33
- os.makedirs(args.output_dir+'_fc')
34
- os.makedirs(args.output_dir+'_box')
35
-
36
- for infile in infiles:
37
- print('Reading ' + infile)
38
- with open(os.path.join(args.downloaded_feats, infile), "r") as tsv_in_file:
39
- reader = csv.DictReader(tsv_in_file, delimiter='\t', fieldnames = FIELDNAMES)
40
- for item in reader:
41
- item['image_id'] = int(item['image_id'])
42
- item['num_boxes'] = int(item['num_boxes'])
43
- for field in ['boxes', 'features']:
44
- item[field] = np.frombuffer(base64.decodestring(item[field].encode('ascii')),
45
- dtype=np.float32).reshape((item['num_boxes'],-1))
46
- np.savez_compressed(os.path.join(args.output_dir+'_att', str(item['image_id'])), feat=item['features'])
47
- np.save(os.path.join(args.output_dir+'_fc', str(item['image_id'])), item['features'].mean(0))
48
- np.save(os.path.join(args.output_dir+'_box', str(item['image_id'])), item['boxes'])
49
-
50
-
51
-
52
-
 
scripts/prepro_feats.py DELETED
@@ -1,103 +0,0 @@
1
- """
2
- Preprocess a raw json dataset into features files for use in data_loader.py
3
-
4
- Input: json file that has the form
5
- [{ file_path: 'path/img.jpg', captions: ['a caption', ...] }, ...]
6
- example element in this list would look like
7
- {'captions': [u'A man with a red helmet on a small moped on a dirt road. ', u'Man riding a motor bike on a dirt road on the countryside.', u'A man riding on the back of a motorcycle.', u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', u'A man in a red shirt and a red hat is on a motorcycle on a hill side.'], 'file_path': u'val2014/COCO_val2014_000000391895.jpg', 'id': 391895}
8
-
9
- This script reads this json, does some basic preprocessing on the captions
10
- (e.g. lowercase, etc.), creates a special UNK token, and encodes everything to arrays
11
-
12
- Output: two folders of features
13
- """
14
-
15
- from __future__ import absolute_import
16
- from __future__ import division
17
- from __future__ import print_function
18
-
19
- import os
20
- import json
21
- import argparse
22
- from random import shuffle, seed
23
- import string
24
- # non-standard dependencies:
25
- import h5py
26
- from six.moves import cPickle
27
- import numpy as np
28
- import torch
29
- import torchvision.models as models
30
- import skimage.io
31
-
32
- from torchvision import transforms as trn
33
- preprocess = trn.Compose([
34
- #trn.ToTensor(),
35
- trn.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
36
- ])
37
-
38
- from captioning.utils.resnet_utils import myResnet
39
- import captioning.utils.resnet as resnet
40
-
41
-
42
- def main(params):
43
- net = getattr(resnet, params['model'])()
44
- net.load_state_dict(torch.load(os.path.join(params['model_root'],params['model']+'.pth')))
45
- my_resnet = myResnet(net)
46
- my_resnet.cuda()
47
- my_resnet.eval()
48
-
49
- imgs = json.load(open(params['input_json'], 'r'))
50
- imgs = imgs['images']
51
- N = len(imgs)
52
-
53
- seed(123) # make reproducible
54
-
55
- dir_fc = params['output_dir']+'_fc'
56
- dir_att = params['output_dir']+'_att'
57
- if not os.path.isdir(dir_fc):
58
- os.mkdir(dir_fc)
59
- if not os.path.isdir(dir_att):
60
- os.mkdir(dir_att)
61
-
62
- for i,img in enumerate(imgs):
63
- # load the image
64
- I = skimage.io.imread(os.path.join(params['images_root'], img['filepath'], img['filename']))
65
- # handle grayscale input images
66
- if len(I.shape) == 2:
67
- I = I[:,:,np.newaxis]
68
- I = np.concatenate((I,I,I), axis=2)
69
-
70
- I = I.astype('float32')/255.0
71
- I = torch.from_numpy(I.transpose([2,0,1])).cuda()
72
- I = preprocess(I)
73
- with torch.no_grad():
74
- tmp_fc, tmp_att = my_resnet(I, params['att_size'])
75
- # write to pkl
76
- # print(dir_fc, str(img['cocoid']), tmp_fc.shape, tmp_att.shape, dir_att)
77
- # exit()
78
- np.save(os.path.join(dir_fc, str(img['cocoid'])), tmp_fc.data.cpu().float().numpy())
79
- np.savez_compressed(os.path.join(dir_att, str(img['cocoid'])), feat=tmp_att.data.cpu().float().numpy())
80
-
81
- if i % 1000 == 0:
82
- print('processing %d/%d (%.2f%% done)' % (i, N, i*100.0/N))
83
- print('wrote ', params['output_dir'])
84
-
85
- if __name__ == "__main__":
86
-
87
- parser = argparse.ArgumentParser()
88
-
89
- # input json
90
- parser.add_argument('--input_json', required=True, help='input json file to process into hdf5')
91
- parser.add_argument('--output_dir', default='data', help='output h5 file')
92
-
93
- # options
94
- parser.add_argument('--images_root', default='', help='root location in which images are stored, to be prepended to file_path in input json')
95
- parser.add_argument('--att_size', default=14, type=int, help='14x14 or 7x7')
96
- parser.add_argument('--model', default='resnet101', type=str, help='resnet101, resnet152')
97
- parser.add_argument('--model_root', default='./data/imagenet_weights', type=str, help='model root')
98
-
99
- args = parser.parse_args()
100
- params = vars(args) # convert to ordinary dict
101
- print('parsed input parameters:')
102
- print(json.dumps(params, indent = 2))
103
- main(params)
 
scripts/prepro_labels.py DELETED
@@ -1,206 +0,0 @@
1
- """
2
- Preprocess a raw json dataset into hdf5/json files for use in data_loader.py
3
-
4
- Input: json file that has the form
5
- [{ file_path: 'path/img.jpg', captions: ['a caption', ...] }, ...]
6
- example element in this list would look like
7
- {'captions': [u'A man with a red helmet on a small moped on a dirt road. ', u'Man riding a motor bike on a dirt road on the countryside.', u'A man riding on the back of a motorcycle.', u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', u'A man in a red shirt and a red hat is on a motorcycle on a hill side.'], 'file_path': u'val2014/COCO_val2014_000000391895.jpg', 'id': 391895}
8
-
9
- This script reads this json, does some basic preprocessing on the captions
10
- (e.g. lowercase, etc.), creates a special UNK token, and encodes everything to arrays
11
-
12
- Output: a json file and an hdf5 file
13
- The hdf5 file contains several fields:
14
- /labels is (M,max_length) uint32 array of encoded labels, zero padded
15
- /label_start_ix and /label_end_ix are (N,) uint32 arrays of pointers to the
16
- first and last indices (in range 1..M) of labels for each image
17
- /label_length stores the length of the sequence for each of the M sequences
18
-
19
- The json file has a dict that contains:
20
- - an 'ix_to_word' field storing the vocab in form {ix:'word'}, where ix is 1-indexed
21
- - an 'images' field that is a list holding auxiliary information for each image,
22
- such as in particular the 'split' it was assigned to.
23
- """
24
-
25
- from __future__ import absolute_import
26
- from __future__ import division
27
- from __future__ import print_function
28
-
29
- import os
30
- import json
31
- import argparse
32
- from random import shuffle, seed
33
- import string
34
- # non-standard dependencies:
35
- import h5py
36
- import numpy as np
37
- import torch
38
- import torchvision.models as models
39
- import skimage.io
40
- from PIL import Image
41
-
42
-
43
- def build_vocab(imgs, params):
44
- count_thr = params['word_count_threshold']
45
-
46
- # count up the number of words
47
- counts = {}
48
- for img in imgs:
49
- for sent in img['sentences']:
50
- for w in sent['tokens']:
51
- counts[w] = counts.get(w, 0) + 1
52
- cw = sorted([(count,w) for w,count in counts.items()], reverse=True)
53
- print('top words and their counts:')
54
- print('\n'.join(map(str,cw[:20])))
55
-
56
- # print some stats
57
- total_words = sum(counts.values())
58
- print('total words:', total_words)
59
- bad_words = [w for w,n in counts.items() if n <= count_thr]
60
- vocab = [w for w,n in counts.items() if n > count_thr]
61
- bad_count = sum(counts[w] for w in bad_words)
62
- print('number of bad words: %d/%d = %.2f%%' % (len(bad_words), len(counts), len(bad_words)*100.0/len(counts)))
63
- print('number of words in vocab would be %d' % (len(vocab), ))
64
- print('number of UNKs: %d/%d = %.2f%%' % (bad_count, total_words, bad_count*100.0/total_words))
65
-
66
- # lets look at the distribution of lengths as well
67
- sent_lengths = {}
68
- for img in imgs:
69
- for sent in img['sentences']:
70
- txt = sent['tokens']
71
- nw = len(txt)
72
- sent_lengths[nw] = sent_lengths.get(nw, 0) + 1
73
- max_len = max(sent_lengths.keys())
74
- print('max length sentence in raw data: ', max_len)
75
- print('sentence length distribution (count, number of words):')
76
- sum_len = sum(sent_lengths.values())
77
- for i in range(max_len+1):
78
- print('%2d: %10d %f%%' % (i, sent_lengths.get(i,0), sent_lengths.get(i,0)*100.0/sum_len))
79
-
80
- # lets now produce the final annotations
81
- if bad_count > 0:
82
- # additional special UNK token we will use below to map infrequent words to
83
- print('inserting the special UNK token')
84
- vocab.append('UNK')
85
-
86
- for img in imgs:
87
- img['final_captions'] = []
88
- for sent in img['sentences']:
89
- txt = sent['tokens']
90
- caption = [w if counts.get(w,0) > count_thr else 'UNK' for w in txt]
91
- img['final_captions'].append(caption)
92
-
93
- return vocab
94
-
95
-
96
- def encode_captions(imgs, params, wtoi):
97
- """
98
- encode all captions into one large array, which will be 1-indexed.
99
- also produces label_start_ix and label_end_ix which store 1-indexed
100
- and inclusive (Lua-style) pointers to the first and last caption for
101
- each image in the dataset.
102
- """
103
-
104
- max_length = params['max_length']
105
- N = len(imgs)
106
- M = sum(len(img['final_captions']) for img in imgs) # total number of captions
107
-
108
- label_arrays = []
109
- label_start_ix = np.zeros(N, dtype='uint32') # note: these will be one-indexed
110
- label_end_ix = np.zeros(N, dtype='uint32')
111
- label_length = np.zeros(M, dtype='uint32')
112
- caption_counter = 0
113
- counter = 1
114
- for i,img in enumerate(imgs):
115
- n = len(img['final_captions'])
116
- assert n > 0, 'error: some image has no captions'
117
-
118
- Li = np.zeros((n, max_length), dtype='uint32')
119
- for j,s in enumerate(img['final_captions']):
120
- label_length[caption_counter] = min(max_length, len(s)) # record the length of this sequence
121
- caption_counter += 1
122
- for k,w in enumerate(s):
123
- if k < max_length:
124
- Li[j,k] = wtoi[w]
125
-
126
- # note: word indices are 1-indexed, and captions are padded with zeros
127
- label_arrays.append(Li)
128
- label_start_ix[i] = counter
129
- label_end_ix[i] = counter + n - 1
130
-
131
- counter += n
132
-
133
- L = np.concatenate(label_arrays, axis=0) # put all the labels together
134
- assert L.shape[0] == M, 'lengths don\'t match? that\'s weird'
135
- assert np.all(label_length > 0), 'error: some caption had no words?'
136
-
137
- print('encoded captions to array of size ', L.shape)
138
- return L, label_start_ix, label_end_ix, label_length
139
-
140
-
141
- def main(params):
142
-
143
- imgs = json.load(open(params['input_json'], 'r'))
144
- imgs = imgs['images']
145
-
146
- seed(123) # make reproducible
147
-
148
- # create the vocab
149
- vocab = build_vocab(imgs, params)
150
- itow = {i+1:w for i,w in enumerate(vocab)} # a 1-indexed vocab translation table
151
- wtoi = {w:i+1 for i,w in enumerate(vocab)} # inverse table
152
-
153
- # encode captions in large arrays, ready to ship to hdf5 file
154
- L, label_start_ix, label_end_ix, label_length = encode_captions(imgs, params, wtoi)
155
-
156
- # create output h5 file
157
- N = len(imgs)
158
- f_lb = h5py.File(params['output_h5']+'_label.h5', "w")
159
- f_lb.create_dataset("labels", dtype='uint32', data=L)
160
- f_lb.create_dataset("label_start_ix", dtype='uint32', data=label_start_ix)
161
- f_lb.create_dataset("label_end_ix", dtype='uint32', data=label_end_ix)
162
- f_lb.create_dataset("label_length", dtype='uint32', data=label_length)
163
- f_lb.close()
164
-
165
- # create output json file
166
- out = {}
167
- out['ix_to_word'] = itow # encode the (1-indexed) vocab
168
- out['images'] = []
169
- for i,img in enumerate(imgs):
170
-
171
- jimg = {}
172
- jimg['split'] = img['split']
173
- if 'filename' in img: jimg['file_path'] = os.path.join(img.get('filepath', ''), img['filename']) # copy it over, might need
174
- if 'cocoid' in img:
175
- jimg['id'] = img['cocoid'] # copy over & maintain an id, if present (e.g. coco ids, useful)
176
- elif 'imgid' in img:
177
- jimg['id'] = img['imgid']
178
-
179
- if params['images_root'] != '':
180
- with Image.open(os.path.join(params['images_root'], img['filepath'], img['filename'])) as _img:
181
- jimg['width'], jimg['height'] = _img.size
182
-
183
- out['images'].append(jimg)
184
-
185
- json.dump(out, open(params['output_json'], 'w'))
186
- print('wrote ', params['output_json'])
187
-
188
- if __name__ == "__main__":
189
-
190
- parser = argparse.ArgumentParser()
191
-
192
- # input json
193
- parser.add_argument('--input_json', required=True, help='input json file to process into hdf5')
194
- parser.add_argument('--output_json', default='data.json', help='output json file')
195
- parser.add_argument('--output_h5', default='data', help='output h5 file')
196
- parser.add_argument('--images_root', default='', help='root location in which images are stored, to be prepended to file_path in input json')
197
-
198
- # options
199
- parser.add_argument('--max_length', default=16, type=int, help='max length of a caption, in number of words. captions longer than this get clipped.')
200
- parser.add_argument('--word_count_threshold', default=5, type=int, help='only words that occur more than this number of times will be put in vocab')
201
-
202
- args = parser.parse_args()
203
- params = vars(args) # convert to ordinary dict
204
- print('parsed input parameters:')
205
- print(json.dumps(params, indent = 2))
206
- main(params)
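The docstring above fixes the on-disk layout: /labels is zero padded, and /label_start_ix, /label_end_ix are 1-indexed, inclusive pointers into it. A minimal sketch of reading one image's captions back; the file names data/cocotalk_label.h5 and data/cocotalk.json are assumptions that follow the --output_h5 / --output_json conventions shown above.

import json
import h5py

# Hedged sketch: decode the stored captions for image i from the files written above.
lbl = h5py.File('data/cocotalk_label.h5', 'r')       # <output_h5>_label.h5
info = json.load(open('data/cocotalk.json'))         # <output_json>
ix_to_word = info['ix_to_word']                      # 1-indexed vocab; keys are strings after the json round-trip

i = 0                                                # any image index
start = int(lbl['label_start_ix'][i]) - 1            # pointers are 1-indexed and inclusive
end = int(lbl['label_end_ix'][i])
for row in lbl['labels'][start:end]:
    print(' '.join(ix_to_word[str(ix)] for ix in row if ix > 0))   # zeros are padding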
scripts/prepro_ngrams.py DELETED
@@ -1,94 +0,0 @@
1
- """
2
- Precompute ngram counts of captions, to accelerate cider computation during training time.
3
- """
4
-
5
- import os
6
- import json
7
- import argparse
8
- from six.moves import cPickle
9
- import captioning.utils.misc as utils
10
- from collections import defaultdict
11
-
12
- import sys
13
- sys.path.append("cider")
14
- from pyciderevalcap.ciderD.ciderD_scorer import CiderScorer
15
-
16
-
17
- def get_doc_freq(refs, params):
18
- tmp = CiderScorer(df_mode="corpus")
19
- for ref in refs:
20
- tmp.cook_append(None, ref)
21
- tmp.compute_doc_freq()
22
- return tmp.document_frequency, len(tmp.crefs)
23
-
24
-
25
- def build_dict(imgs, wtoi, params):
26
- wtoi['<eos>'] = 0
27
-
28
- count_imgs = 0
29
-
30
- refs_words = []
31
- refs_idxs = []
32
- for img in imgs:
33
- if (params['split'] == img['split']) or \
34
- (params['split'] == 'train' and img['split'] == 'restval') or \
35
- (params['split'] == 'all'):
36
- #(params['split'] == 'val' and img['split'] == 'restval') or \
37
- ref_words = []
38
- ref_idxs = []
39
- for sent in img['sentences']:
40
- if hasattr(params, 'bpe'):
41
- sent['tokens'] = params.bpe.segment(' '.join(sent['tokens'])).strip().split(' ')
42
- tmp_tokens = sent['tokens'] + ['<eos>']
43
- tmp_tokens = [_ if _ in wtoi else 'UNK' for _ in tmp_tokens]
44
- ref_words.append(' '.join(tmp_tokens))
45
- ref_idxs.append(' '.join([str(wtoi[_]) for _ in tmp_tokens]))
46
- refs_words.append(ref_words)
47
- refs_idxs.append(ref_idxs)
48
- count_imgs += 1
49
- print('total imgs:', count_imgs)
50
-
51
- ngram_words, count_refs = get_doc_freq(refs_words, params)
52
- ngram_idxs, count_refs = get_doc_freq(refs_idxs, params)
53
- print('count_refs:', count_refs)
54
- return ngram_words, ngram_idxs, count_refs
55
-
56
- def main(params):
57
-
58
- imgs = json.load(open(params['input_json'], 'r'))
59
- dict_json = json.load(open(params['dict_json'], 'r'))
60
- itow = dict_json['ix_to_word']
61
- wtoi = {w:i for i,w in itow.items()}
62
-
63
- # Load bpe
64
- if 'bpe' in dict_json:
65
- import tempfile
66
- import codecs
67
- codes_f = tempfile.NamedTemporaryFile(delete=False)
68
- codes_f.close()
69
- with open(codes_f.name, 'w') as f:
70
- f.write(dict_json['bpe'])
71
- with codecs.open(codes_f.name, encoding='UTF-8') as codes:
72
- bpe = apply_bpe.BPE(codes)
73
- params.bpe = bpe
74
-
75
- imgs = imgs['images']
76
-
77
- ngram_words, ngram_idxs, ref_len = build_dict(imgs, wtoi, params)
78
-
79
- utils.pickle_dump({'document_frequency': ngram_words, 'ref_len': ref_len}, open(params['output_pkl']+'-words.p','wb'))
80
- utils.pickle_dump({'document_frequency': ngram_idxs, 'ref_len': ref_len}, open(params['output_pkl']+'-idxs.p','wb'))
81
-
82
- if __name__ == "__main__":
83
-
84
- parser = argparse.ArgumentParser()
85
-
86
- # input json
87
- parser.add_argument('--input_json', default='data/dataset_coco.json', help='input json file to process into hdf5')
88
- parser.add_argument('--dict_json', default='data/cocotalk.json', help='output json file')
89
- parser.add_argument('--output_pkl', default='data/coco-all', help='output pickle file')
90
- parser.add_argument('--split', default='all', help='test, val, train, all')
91
- args = parser.parse_args()
92
- params = vars(args) # convert to ordinary dict
93
-
94
- main(params)
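Two caveats about the BPE branch above as written: apply_bpe is referenced without an import (something like from subword_nmt import apply_bpe would be needed), and params is the plain dict returned by vars(args), so params.bpe and hasattr(params, 'bpe') cannot work without switching to dictionary access (params['bpe'], 'bpe' in params). Separately, a minimal sketch of consuming the document-frequency pickles the script writes; the default --output_pkl prefix is assumed and the files are treated as ordinary pickles:

from six.moves import cPickle

# Hedged sketch: load the n-gram document frequencies written above
# (default output_pkl prefix 'data/coco-all' assumed).
with open('data/coco-all-words.p', 'rb') as f:
    stats = cPickle.load(f)
document_frequency = stats['document_frequency']     # ngram -> number of references containing it
ref_len = stats['ref_len']                           # reference-set count used to normalize CIDEr
print(len(document_frequency), 'distinct ngrams over', ref_len, 'reference sets')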
scripts/prepro_reference_json.py DELETED
@@ -1,69 +0,0 @@
1
- # coding: utf-8
2
- """
3
- Create a reference json file used for evaluation with `coco-caption` repo.
4
- Used when a reference json is not provided (e.g., flickr30k, or you have your own train/val/test split)
5
- """
6
-
7
- from __future__ import absolute_import
8
- from __future__ import division
9
- from __future__ import print_function
10
-
11
- import os
12
- import json
13
- import argparse
14
- import sys
15
- import hashlib
16
- from random import shuffle, seed
17
-
18
-
19
- def main(params):
20
-
21
- imgs = json.load(open(params['input_json'][0], 'r'))['images']
22
- # tmp = []
23
- # for k in imgs.keys():
24
- # for img in imgs[k]:
25
- # img['filename'] = img['image_id'] # k+'/'+img['image_id']
26
- # img['image_id'] = int(
27
- # int(hashlib.sha256(img['image_id']).hexdigest(), 16) % sys.maxint)
28
- # tmp.append(img)
29
- # imgs = tmp
30
-
31
- # create output json file
32
- out = {'info': {'description': 'This is stable 1.0 version of the 2014 MS COCO dataset.', 'url': 'http://mscoco.org', 'version': '1.0', 'year': 2014, 'contributor': 'Microsoft COCO group', 'date_created': '2015-01-27 09:11:52.357475'}, 'licenses': [{'url': 'http://creativecommons.org/licenses/by-nc-sa/2.0/', 'id': 1, 'name': 'Attribution-NonCommercial-ShareAlike License'}, {'url': 'http://creativecommons.org/licenses/by-nc/2.0/', 'id': 2, 'name': 'Attribution-NonCommercial License'}, {'url': 'http://creativecommons.org/licenses/by-nc-nd/2.0/', 'id': 3, 'name': 'Attribution-NonCommercial-NoDerivs License'}, {'url': 'http://creativecommons.org/licenses/by/2.0/', 'id': 4, 'name': 'Attribution License'}, {'url': 'http://creativecommons.org/licenses/by-sa/2.0/', 'id': 5, 'name': 'Attribution-ShareAlike License'}, {'url': 'http://creativecommons.org/licenses/by-nd/2.0/', 'id': 6, 'name': 'Attribution-NoDerivs License'}, {'url': 'http://flickr.com/commons/usage/', 'id': 7, 'name': 'No known copyright restrictions'}, {'url': 'http://www.usa.gov/copyright.shtml', 'id': 8, 'name': 'United States Government Work'}], 'type': 'captions'}
33
- out.update({'images': [], 'annotations': []})
34
-
35
- cnt = 0
36
- empty_cnt = 0
37
- for i, img in enumerate(imgs):
38
- if img['split'] == 'train':
39
- continue
40
- out['images'].append(
41
- {'id': img.get('cocoid', img['imgid'])})
42
- for j, s in enumerate(img['sentences']):
43
- if len(s) == 0:
44
- continue
45
- s = ' '.join(s['tokens'])
46
- out['annotations'].append(
47
- {'image_id': out['images'][-1]['id'], 'caption': s, 'id': cnt})
48
- cnt += 1
49
-
50
- json.dump(out, open(params['output_json'], 'w'))
51
- print('wrote ', params['output_json'])
52
-
53
-
54
- if __name__ == "__main__":
55
-
56
- parser = argparse.ArgumentParser()
57
-
58
- # input json
59
- parser.add_argument('--input_json', nargs='+', required=True,
60
- help='input json file to process into hdf5')
61
- parser.add_argument('--output_json', default='data.json',
62
- help='output json file')
63
-
64
- args = parser.parse_args()
65
- params = vars(args) # convert to ordinary dict
66
- print('parsed input parameters:')
67
- print(json.dumps(params, indent=2))
68
- main(params)
69
-
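A quick sanity check of the reference file produced above before handing it to coco-caption: every annotation should point at a listed image id, and the 'type' field must be 'captions'. The file name below is the default --output_json.

import json

# Hedged sketch: validate the reference json written above.
ref = json.load(open('data.json'))                   # default --output_json
assert ref['type'] == 'captions'
img_ids = {img['id'] for img in ref['images']}
assert all(ann['image_id'] in img_ids for ann in ref['annotations'])
print(len(ref['images']), 'images,', len(ref['annotations']), 'reference captions')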
scripts_FineCapEval/clip_prepro_feats.py DELETED
@@ -1,163 +0,0 @@
1
- """
2
- Preprocess a raw json dataset into features files for use in data_loader.py
3
-
4
- Input: json file that has the form
5
- [{ file_path: 'path/img.jpg', captions: ['a caption', ...] }, ...]
6
- example element in this list would look like
7
- {'captions': [u'A man with a red helmet on a small moped on a dirt road. ', u'Man riding a motor bike on a dirt road on the countryside.', u'A man riding on the back of a motorcycle.', u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', u'A man in a red shirt and a red hat is on a motorcycle on a hill side.'], 'file_path': u'val2014/COCO_val2014_000000391895.jpg', 'id': 391895}
8
-
9
- This script reads this json, does some basic preprocessing on the captions
10
- (e.g. lowercase, etc.), creates a special UNK token, and encodes everything to arrays
11
-
12
- Output: two folders of features
13
- """
14
-
15
- from __future__ import absolute_import
16
- from __future__ import division
17
- from __future__ import print_function
18
-
19
- import os
20
- import json
21
- import argparse
22
- from random import shuffle, seed
23
- import string
24
- # non-standard dependencies:
25
- import h5py
26
- from six.moves import cPickle
27
- import numpy as np
28
- import torch
29
- import torchvision.models as models
30
- import skimage.io
31
-
32
- from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
33
- from PIL import Image
34
- from torch import nn
35
-
36
- preprocess = Compose([
37
- Resize((448, 448), interpolation=Image.BICUBIC),
38
- CenterCrop((448, 448)),
39
- ToTensor()
40
- ])
41
-
42
-
43
- from clip.clip import load
44
- from timm.models.vision_transformer import resize_pos_embed
45
- import timm
46
-
47
- from captioning.utils.resnet_utils import myResnet
48
- import captioning.utils.resnet as resnet
49
-
50
- from tqdm import tqdm
51
-
52
-
53
- def main(params):
54
- if params["model_type"] != 'vit_base_patch32_224_in21k':
55
- model, transform = load(params["model_type"], jit=False)
56
- else:
57
- model = timm.create_model(params["model_type"], pretrained=True)
58
- model = model.cuda()
59
-
60
- if params["model_type"] != 'vit_base_patch32_224_in21k':
61
- save_model_type = params["model_type"].split("-")[0]
62
- mean = torch.Tensor([0.48145466, 0.4578275, 0.40821073]).to("cuda").reshape(3, 1, 1)
63
- std = torch.Tensor([0.26862954, 0.26130258, 0.27577711]).to("cuda").reshape(3, 1, 1)
64
-
65
- if "RN" in params["model_type"]:
66
- num_patches = 196 #600 * 1000 // 32 // 32
67
- pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, model.visual.attnpool.positional_embedding.shape[-1], device='cuda'),)
68
- pos_embed.weight = resize_pos_embed(model.visual.attnpool.positional_embedding.unsqueeze(0), pos_embed)
69
- model.visual.attnpool.positional_embedding = pos_embed
70
-
71
- else:
72
- save_model_type = 'vit_base'
73
- mean = torch.Tensor([0.5, 0.5, 0.5]).to("cuda").reshape(3, 1, 1)
74
- std = torch.Tensor([0.5, 0.5, 0.5]).to("cuda").reshape(3, 1, 1)
75
-
76
- num_patches = 196 #600 * 1000 // 32 // 32
77
- pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, 768, device='cuda'),)
78
- pos_embed.weight = resize_pos_embed(model.pos_embed, pos_embed)
79
- model.pos_embed = pos_embed
80
-
81
- if params["model_type"] == "ViT-B/32":
82
- num_patches = 196 #600 * 1000 // 32 // 32
83
- pos_embed = nn.Parameter(torch.zeros(num_patches + 1, 768, device='cuda'),)
84
- pos_embed.weight = resize_pos_embed(model.visual.positional_embedding.unsqueeze(0), pos_embed.unsqueeze(0))
85
- model.visual.positional_embedding = pos_embed
86
- imgs = json.load(open(params['input_json'], 'r'))
87
- imgs = imgs['images']
88
- N = len(imgs)
89
-
90
- seed(123) # make reproducible
91
-
92
- dir_fc = params['output_dir']+'_clip_'+save_model_type+'_fc'
93
- dir_att = params['output_dir']+'_clip_'+save_model_type+'_att'
94
- if not os.path.isdir(dir_fc):
95
- os.mkdir(dir_fc)
96
- if not os.path.isdir(dir_att):
97
- os.mkdir(dir_att)
98
-
99
- for i, img in enumerate(tqdm(imgs)):
100
- with torch.no_grad():
101
-
102
- # img_path = os.path.join(params['images_root'], img['filepath'], img['filename'])
103
- # img_path = os.path.join(params['images_root'], img['file_name'])
104
-
105
- img_path = os.path.join(params['images_root'], img['file_path'])
106
-
107
- image = preprocess(Image.open( img_path ).convert("RGB"))
108
- image = torch.tensor(np.stack([image])).cuda()
109
- image -= mean
110
- image /= std
111
- if "RN" in params["model_type"]:
112
- tmp_att, tmp_fc = model.encode_image(image)
113
- tmp_att = tmp_att[0].permute(1, 2, 0)
114
- tmp_fc = tmp_fc[0]
115
- elif params["model_type"] == 'vit_base_patch32_224_in21k':
116
- x = model(image)
117
- tmp_fc = x[0, 0, :]
118
- tmp_att = x[0, 1:, :].reshape( 14, 14, 768 )
119
- else:
120
- x = model.visual.conv1(image.half()) # shape = [*, width, grid, grid]
121
- x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
122
- x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
123
- x = torch.cat([model.visual.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width]
124
- x = x + model.visual.positional_embedding.to(x.dtype)[:x.shape[1], :]
125
- x = model.visual.ln_pre(x)
126
-
127
- x = x.permute(1, 0, 2) # NLD -> LND
128
-
129
- for layer_idx, layer in enumerate(model.visual.transformer.resblocks):
130
- x = layer(x)
131
-
132
- x = x.permute(1, 0, 2)
133
- tmp_fc = x[0, 0, :]
134
- tmp_att = x[0, 1:, :].reshape( 14, 14, 768 )
135
-
136
- # np.save(os.path.join(dir_fc, str(img['cocoid'])), tmp_fc.data.cpu().float().numpy())
137
- # np.savez_compressed(os.path.join(dir_att, str(img['cocoid'])), feat=tmp_att.data.cpu().float().numpy())
138
- np.save(os.path.join(dir_fc, str(img['id'])), tmp_fc.data.cpu().float().numpy())
139
- np.savez_compressed(os.path.join(dir_att, str(img['id'])), feat=tmp_att.data.cpu().float().numpy())
140
-
141
-
142
- # if i % 1000 == 0:
143
- # print('processing %d/%d (%.2f%% done)' % (i, N, i*100.0/N))
144
- print('wrote ', dir_fc, dir_att)
145
-
146
- if __name__ == "__main__":
147
-
148
- parser = argparse.ArgumentParser()
149
-
150
- # input json
151
- parser.add_argument('--input_json', required=True, help='input json file to process into hdf5')
152
- parser.add_argument('--output_dir', default='data', help='output h5 file')
153
-
154
- # options
155
- parser.add_argument('--images_root', default='', help='root location in which images are stored, to be prepended to file_path in input json')
156
- parser.add_argument('--att_size', default=14, type=int, help='14x14 or 7x7')
157
- parser.add_argument('--model_type', default='RN50', type=str, help='RN50, RN101, RN50x4, ViT-B/32, vit_base_patch32_224_in21k')
158
-
159
- args = parser.parse_args()
160
- params = vars(args) # convert to ordinary dict
161
- print('parsed input parameters:')
162
- print(json.dumps(params, indent = 2))
163
- main(params)
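The script above writes one pooled vector per image id under <output_dir>_clip_<model>_fc and one spatial grid under <output_dir>_clip_<model>_att. A minimal sketch of reading them back, where output_dir 'data', model_type 'RN50', and the image id are all assumptions:

import numpy as np

# Hedged sketch: load the per-image features saved by the loop above.
img_id = 391895                                              # any id present in the input json
fc = np.load('data_clip_RN50_fc/%d.npy' % img_id)            # pooled feature vector
att = np.load('data_clip_RN50_att/%d.npz' % img_id)['feat']  # spatial grid, e.g. 14x14xC
print(fc.shape, att.shape)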
scripts_FineCapEval/clipscore_prepro_feats.py DELETED
@@ -1,154 +0,0 @@
1
- """
2
- Preprocess a raw json dataset into features files for use in data_loader.py
3
-
4
- Input: json file that has the form
5
- [{ file_path: 'path/img.jpg', captions: ['a caption', ...] }, ...]
6
- example element in this list would look like
7
- {'captions': [u'A man with a red helmet on a small moped on a dirt road. ', u'Man riding a motor bike on a dirt road on the countryside.', u'A man riding on the back of a motorcycle.', u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', u'A man in a red shirt and a red hat is on a motorcycle on a hill side.'], 'file_path': u'val2014/COCO_val2014_000000391895.jpg', 'id': 391895}
8
-
9
- This script reads this json, does some basic preprocessing on the captions
10
- (e.g. lowercase, etc.), creates a special UNK token, and encodes everything to arrays
11
-
12
- Output: two folders of features
13
- """
14
-
15
- from __future__ import absolute_import
16
- from __future__ import division
17
- from __future__ import print_function
18
-
19
- import os
20
- import json
21
- import argparse
22
- from random import shuffle, seed
23
- import string
24
- # non-standard dependencies:
25
- import h5py
26
- from six.moves import cPickle
27
- import numpy as np
28
- import torch
29
- import torchvision.models as models
30
- import skimage.io
31
-
32
- from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
33
- from PIL import Image
34
- from torch import nn
35
-
36
- # preprocess = Compose([
37
- # Resize((448, 448), interpolation=Image.BICUBIC),
38
- # CenterCrop((448, 448)),
39
- # ToTensor()
40
- # ])
41
-
42
-
43
- # from clip.clip import load
44
- # from timm.models.vision_transformer import resize_pos_embed
45
- # import timm
46
-
47
- # from captioning.utils.resnet_utils import myResnet
48
- # import captioning.utils.resnet as resnet
49
-
50
- from captioning.utils.clipscore import CLIPScore
51
-
52
- from tqdm import tqdm
53
-
54
-
55
- def main(params):
56
-
57
- clipscore_model = CLIPScore()
58
- clipscore_model.to('cuda')
59
-
60
- imgs = json.load(open(params['input_json'], 'r'))
61
- imgs = imgs['images']
62
- N = len(imgs)
63
-
64
- seed(123) # make reproducible
65
-
66
- # dir_fc = params['output_dir']+'_clip_'+save_model_type+'_fc'
67
- # dir_att = params['output_dir']+'_clip_'+save_model_type+'_att'
68
-
69
- vis_dir_fc = params['output_dir']+'_clipscore_vis'
70
- if not os.path.isdir(vis_dir_fc):
71
- os.mkdir(vis_dir_fc)
72
-
73
- # text_dir_fc = params['output_dir']+'_clipscore_text'
74
- # if not os.path.isdir(text_dir_fc):
75
- # os.mkdir(text_dir_fc)
76
-
77
- # if not os.path.isdir(dir_att):
78
- # os.mkdir(dir_att)
79
-
80
- for i,img in enumerate(tqdm(imgs)):
81
- # load the image
82
-
83
- # img_path = os.path.join(params['images_root'], img['filepath'], img['filename'])
84
- # img_path = os.path.join(params['images_root'], img['file_name'])
85
- img_path = os.path.join(params['images_root'], img['file_path'])
86
-
87
- img_feat = clipscore_model.image_extract(img_path)
88
- img_feat = img_feat.view(512)
89
-
90
- # for d in img['sentences']:
91
- # text = d['raw'].strip()
92
- # text_feat = clipscore_model.text_extract(text)
93
-
94
-
95
- # with torch.no_grad():
96
-
97
- # image = preprocess(Image.open(os.path.join(params['images_root'], img['filepath'], img['filename']) ).convert("RGB"))
98
- # image = torch.tensor(np.stack([image])).cuda()
99
- # image -= mean
100
- # image /= std
101
- # if "RN" in params["model_type"]:
102
- # tmp_att, tmp_fc = model.encode_image(image)
103
- # tmp_att = tmp_att[0].permute(1, 2, 0)
104
- # tmp_fc = tmp_fc[0]
105
- # elif params["model_type"] == 'vit_base_patch32_224_in21k':
106
- # x = model(image)
107
- # tmp_fc = x[0, 0, :]
108
- # tmp_att = x[0, 1:, :].reshape( 14, 14, 768 )
109
- # else:
110
- # x = model.visual.conv1(image.half()) # shape = [*, width, grid, grid]
111
- # x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
112
- # x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
113
- # x = torch.cat([model.visual.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width]
114
- # x = x + model.visual.positional_embedding.to(x.dtype)[:x.shape[1], :]
115
- # x = model.visual.ln_pre(x)
116
-
117
- # x = x.permute(1, 0, 2) # NLD -> LND
118
-
119
- # for layer_idx, layer in enumerate(model.visual.transformer.resblocks):
120
- # x = layer(x)
121
-
122
- # x = x.permute(1, 0, 2)
123
- # tmp_fc = x[0, 0, :]
124
- # tmp_att = x[0, 1:, :].reshape( 14, 14, 768 )
125
-
126
- np.save(os.path.join(vis_dir_fc, str(img['id'])), img_feat.data.cpu().float().numpy())
127
- # np.save(os.path.join(text_dir_fc, str(img['cocoid'])), tmp_fc.data.cpu().float().numpy())
128
-
129
-
130
- # np.savez_compressed(os.path.join(dir_att, str(img['cocoid'])), feat=tmp_att.data.cpu().float().numpy())
131
-
132
- # if i % 1000 == 0:
133
- # print('processing %d/%d (%.2f%% done)' % (i, N, i*100.0/N))
134
- print('wrote ', vis_dir_fc)
135
-
136
- if __name__ == "__main__":
137
-
138
- parser = argparse.ArgumentParser()
139
-
140
- # input json
141
- # dataset_coco.json
142
- parser.add_argument('--input_json', required=True, help='input json file to process into hdf5')
143
- parser.add_argument('--output_dir', default='data', help='output h5 file')
144
-
145
- # options
146
- parser.add_argument('--images_root', default='', help='root location in which images are stored, to be prepended to file_path in input json')
147
- # parser.add_argument('--att_size', default=14, type=int, help='14x14 or 7x7')
148
- # parser.add_argument('--model_type', default='RN50', type=str, help='RN50, RN101, RN50x4, ViT-B/32, vit_base_patch32_224_in21k')
149
-
150
- args = parser.parse_args()
151
- params = vars(args) # convert to ordinary dict
152
- print('parsed input parameters:')
153
- print(json.dumps(params, indent = 2))
154
- main(params)
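Here only one 512-dimensional CLIP image embedding per id is stored, under <output_dir>_clipscore_vis; a sketch of loading it back, with output_dir 'data' and the id assumed:

import numpy as np

# Hedged sketch: read back a CLIPScore visual feature written above.
img_id = 391895
vis_feat = np.load('data_clipscore_vis/%d.npy' % img_id)
assert vis_feat.shape == (512,)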
scripts_FineCapEval/prepro_labels.py DELETED
@@ -1,209 +0,0 @@
1
- """
2
- Preprocess a raw json dataset into hdf5/json files for use in data_loader.py
3
-
4
- Input: json file that has the form
5
- [{ file_path: 'path/img.jpg', captions: ['a caption', ...] }, ...]
6
- example element in this list would look like
7
- {'captions': [u'A man with a red helmet on a small moped on a dirt road. ', u'Man riding a motor bike on a dirt road on the countryside.', u'A man riding on the back of a motorcycle.', u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', u'A man in a red shirt and a red hat is on a motorcycle on a hill side.'], 'file_path': u'val2014/COCO_val2014_000000391895.jpg', 'id': 391895}
8
-
9
- This script reads this json, does some basic preprocessing on the captions
10
- (e.g. lowercase, etc.), creates a special UNK token, and encodes everything to arrays
11
-
12
- Output: a json file and an hdf5 file
13
- The hdf5 file contains several fields:
14
- /labels is (M,max_length) uint32 array of encoded labels, zero padded
15
- /label_start_ix and /label_end_ix are (N,) uint32 arrays of pointers to the
16
- first and last indices (in range 1..M) of labels for each image
17
- /label_length stores the length of the sequence for each of the M sequences
18
-
19
- The json file has a dict that contains:
20
- - an 'ix_to_word' field storing the vocab in form {ix:'word'}, where ix is 1-indexed
21
- - an 'images' field that is a list holding auxiliary information for each image,
22
- such as in particular the 'split' it was assigned to.
23
- """
24
-
25
- from __future__ import absolute_import
26
- from __future__ import division
27
- from __future__ import print_function
28
-
29
- import os
30
- import json
31
- import argparse
32
- from random import shuffle, seed
33
- import string
34
- # non-standard dependencies:
35
- import h5py
36
- import numpy as np
37
- import torch
38
- import torchvision.models as models
39
- import skimage.io
40
- from PIL import Image
41
-
42
-
43
- def build_vocab(imgs, params):
44
- count_thr = params['word_count_threshold']
45
-
46
- # count up the number of words
47
- counts = {}
48
- for img in imgs:
49
- for sent in img['sentences']:
50
- for w in sent['tokens']:
51
- counts[w] = counts.get(w, 0) + 1
52
- cw = sorted([(count,w) for w,count in counts.items()], reverse=True)
53
- print('top words and their counts:')
54
- print('\n'.join(map(str,cw[:20])))
55
-
56
- # print some stats
57
- total_words = sum(counts.values())
58
- print('total words:', total_words)
59
- bad_words = [w for w,n in counts.items() if n <= count_thr]
60
- vocab = [w for w,n in counts.items() if n > count_thr]
61
- bad_count = sum(counts[w] for w in bad_words)
62
- print('number of bad words: %d/%d = %.2f%%' % (len(bad_words), len(counts), len(bad_words)*100.0/len(counts)))
63
- print('number of words in vocab would be %d' % (len(vocab), ))
64
- print('number of UNKs: %d/%d = %.2f%%' % (bad_count, total_words, bad_count*100.0/total_words))
65
-
66
- # lets look at the distribution of lengths as well
67
- sent_lengths = {}
68
- for img in imgs:
69
- for sent in img['sentences']:
70
- txt = sent['tokens']
71
- nw = len(txt)
72
- sent_lengths[nw] = sent_lengths.get(nw, 0) + 1
73
- max_len = max(sent_lengths.keys())
74
- print('max length sentence in raw data: ', max_len)
75
- print('sentence length distribution (count, number of words):')
76
- sum_len = sum(sent_lengths.values())
77
- for i in range(max_len+1):
78
- print('%2d: %10d %f%%' % (i, sent_lengths.get(i,0), sent_lengths.get(i,0)*100.0/sum_len))
79
-
80
- # lets now produce the final annotations
81
- if bad_count > 0:
82
- # additional special UNK token we will use below to map infrequent words to
83
- print('inserting the special UNK token')
84
- vocab.append('UNK')
85
-
86
- for img in imgs:
87
- img['final_captions'] = []
88
- for sent in img['sentences']:
89
- txt = sent['tokens']
90
- caption = [w if counts.get(w,0) > count_thr else 'UNK' for w in txt]
91
- img['final_captions'].append(caption)
92
-
93
- return vocab
94
-
95
-
96
- def encode_captions(imgs, params, wtoi):
97
- """
98
- encode all captions into one large array, which will be 1-indexed.
99
- also produces label_start_ix and label_end_ix which store 1-indexed
100
- and inclusive (Lua-style) pointers to the first and last caption for
101
- each image in the dataset.
102
- """
103
-
104
- max_length = params['max_length']
105
- N = len(imgs)
106
- M = sum(len(img['final_captions']) for img in imgs) # total number of captions
107
-
108
- label_arrays = []
109
- label_start_ix = np.zeros(N, dtype='uint32') # note: these will be one-indexed
110
- label_end_ix = np.zeros(N, dtype='uint32')
111
- label_length = np.zeros(M, dtype='uint32')
112
- caption_counter = 0
113
- counter = 1
114
- for i,img in enumerate(imgs):
115
- n = len(img['final_captions'])
116
- assert n > 0, 'error: some image has no captions'
117
-
118
- Li = np.zeros((n, max_length), dtype='uint32')
119
- for j,s in enumerate(img['final_captions']):
120
- label_length[caption_counter] = min(max_length, len(s)) # record the length of this sequence
121
- caption_counter += 1
122
- for k,w in enumerate(s):
123
- if k < max_length:
124
- Li[j,k] = wtoi[w]
125
-
126
- # note: word indices are 1-indexed, and captions are padded with zeros
127
- label_arrays.append(Li)
128
- label_start_ix[i] = counter
129
- label_end_ix[i] = counter + n - 1
130
-
131
- counter += n
132
-
133
- L = np.concatenate(label_arrays, axis=0) # put all the labels together
134
- assert L.shape[0] == M, 'lengths don\'t match? that\'s weird'
135
- assert np.all(label_length > 0), 'error: some caption had no words?'
136
-
137
- print('encoded captions to array of size ', L.shape)
138
- return L, label_start_ix, label_end_ix, label_length
139
-
140
-
141
- def main(params):
142
-
143
- imgs = json.load(open(params['input_json'], 'r'))
144
- imgs = imgs['images']
145
-
146
- seed(123) # make reproducible
147
-
148
- # # create the vocab
149
- # vocab = build_vocab(imgs, params)
150
- # itow = {i+1:w for i,w in enumerate(vocab)} # a 1-indexed vocab translation table
151
- # wtoi = {w:i+1 for i,w in enumerate(vocab)} # inverse table
152
-
153
- itow = imgs['ix_to_word']
154
- wtoi = {w:i for i, w in itow.items()}
155
-
156
- # encode captions in large arrays, ready to ship to hdf5 file
157
- L, label_start_ix, label_end_ix, label_length = encode_captions(imgs, params, wtoi)
158
-
159
- # create output h5 file
160
- N = len(imgs)
161
- f_lb = h5py.File(params['output_h5']+'_label.h5', "w")
162
- f_lb.create_dataset("labels", dtype='uint32', data=L)
163
- f_lb.create_dataset("label_start_ix", dtype='uint32', data=label_start_ix)
164
- f_lb.create_dataset("label_end_ix", dtype='uint32', data=label_end_ix)
165
- f_lb.create_dataset("label_length", dtype='uint32', data=label_length)
166
- f_lb.close()
167
-
168
- # create output json file
169
- out = {}
170
- out['ix_to_word'] = itow # encode the (1-indexed) vocab
171
- out['images'] = []
172
- for i,img in enumerate(imgs):
173
-
174
- jimg = {}
175
- jimg['split'] = img['split']
176
- if 'filename' in img: jimg['file_path'] = os.path.join(img.get('filepath', ''), img['filename']) # copy it over, might need
177
- if 'cocoid' in img:
178
- jimg['id'] = img['cocoid'] # copy over & maintain an id, if present (e.g. coco ids, useful)
179
- elif 'imgid' in img:
180
- jimg['id'] = img['imgid']
181
-
182
- if params['images_root'] != '':
183
- with Image.open(os.path.join(params['images_root'], img['filepath'], img['filename'])) as _img:
184
- jimg['width'], jimg['height'] = _img.size
185
-
186
- out['images'].append(jimg)
187
-
188
- json.dump(out, open(params['output_json'], 'w'))
189
- print('wrote ', params['output_json'])
190
-
191
- if __name__ == "__main__":
192
-
193
- parser = argparse.ArgumentParser()
194
-
195
- # input json
196
- parser.add_argument('--input_json', required=True, help='input json file to process into hdf5')
197
- parser.add_argument('--output_json', default='data.json', help='output json file')
198
- parser.add_argument('--output_h5', default='data', help='output h5 file')
199
- parser.add_argument('--images_root', default='', help='root location in which images are stored, to be prepended to file_path in input json')
200
-
201
- # options
202
- parser.add_argument('--max_length', default=16, type=int, help='max length of a caption, in number of words. captions longer than this get clipped.')
203
- parser.add_argument('--word_count_threshold', default=5, type=int, help='only words that occur more than this number of times will be put in vocab')
204
-
205
- args = parser.parse_args()
206
- params = vars(args) # convert to ordinary dict
207
- print('parsed input parameters:')
208
- print(json.dumps(params, indent = 2))
209
- main(params)
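One fix worth flagging in the script above: main() reassigns imgs = imgs['images'] and then reads imgs['ix_to_word'], which fails because imgs is a list at that point (and since the build_vocab call is commented out, 'final_captions' is never populated either). The vocabulary is evidently meant to come from an existing cocotalk-style json instead; a hedged sketch of that intended loading, with the path assumed:

import json

# Hedged sketch: take the vocab from a previously built cocotalk-style json
# rather than indexing the image list.
info = json.load(open('data/cocotalk.json', 'r'))    # assumed vocab source
itow = info['ix_to_word']                            # 1-indexed, string keys
wtoi = {w: int(i) for i, w in itow.items()}          # word -> integer index for encode_captions()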
tools/eval.py DELETED
@@ -1,125 +0,0 @@
1
- from __future__ import absolute_import
2
- from __future__ import division
3
- from __future__ import print_function
4
-
5
- import json
6
- import numpy as np
7
-
8
- import time
9
- import os
10
- from six.moves import cPickle
11
-
12
- import captioning.utils.opts as opts
13
- import captioning.models as models
14
- from captioning.data.dataloader import *
15
- # from captioning.data.dataloaderraw import *
16
- import captioning.utils.eval_utils as eval_utils
17
- import argparse
18
- import captioning.utils.misc as utils
19
- import captioning.modules.losses as losses
20
- import torch
21
-
22
- # Input arguments and options
23
- parser = argparse.ArgumentParser()
24
- # Input paths
25
- parser.add_argument('--model', type=str, default='',
26
- help='path to model to evaluate')
27
- parser.add_argument('--cnn_model', type=str, default='resnet101',
28
- help='resnet101, resnet152')
29
- parser.add_argument('--infos_path', type=str, default='',
30
- help='path to infos to evaluate')
31
- parser.add_argument('--only_lang_eval', type=int, default=0,
32
- help='lang eval on saved results')
33
- parser.add_argument('--force', type=int, default=0,
34
- help='force to evaluate no matter if there are results available')
35
- parser.add_argument('--device', type=str, default='cuda',
36
- help='cpu or cuda')
37
- opts.add_eval_options(parser)
38
- opts.add_diversity_opts(parser)
39
- opt = parser.parse_args()
40
-
41
- # Load infos
42
- with open(opt.infos_path, 'rb') as f:
43
- infos = utils.pickle_load(f)
44
-
45
- # override and collect parameters
46
- replace = ['input_fc_dir', 'input_att_dir', 'input_box_dir', 'input_label_h5', 'input_json', 'batch_size', 'id']
47
- ignore = ['start_from']
48
-
49
- for k in vars(infos['opt']).keys():
50
- if k in replace:
51
- setattr(opt, k, getattr(opt, k) or getattr(infos['opt'], k, ''))
52
- elif k not in ignore:
53
- if not k in vars(opt):
54
- vars(opt).update({k: vars(infos['opt'])[k]}) # copy over options from model
55
-
56
- vocab = infos['vocab'] # ix -> word mapping
57
-
58
- pred_fn = os.path.join('eval_results/', '.saved_pred_'+ opt.id + '_' + opt.split + '.pth')
59
- result_fn = os.path.join('eval_results/', opt.id + '_' + opt.split + '.json')
60
-
61
- if opt.only_lang_eval == 1 or (not opt.force and os.path.isfile(pred_fn)):
62
- # if results existed, then skip, unless force is on
63
- if not opt.force:
64
- try:
65
- if os.path.isfile(result_fn):
66
- print(result_fn)
67
- json.load(open(result_fn, 'r'))
68
- print('already evaluated')
69
- os._exit(0)
70
- except:
71
- pass
72
-
73
- predictions, n_predictions = torch.load(pred_fn)
74
- lang_stats = eval_utils.language_eval(opt.input_json, predictions, n_predictions, vars(opt), opt.split)
75
- print(lang_stats)
76
- os._exit(0)
77
-
78
- # At this point only_lang_eval is 0
79
- if not opt.force:
80
- # Check whether cached predictions (and, if language_eval is on, results) already exist
81
- try:
82
- # if no pred exists, then continue
83
- tmp = torch.load(pred_fn)
84
- # if language_eval == 1, and no pred exists, then continue
85
- if opt.language_eval == 1:
86
- json.load(open(result_fn, 'r'))
87
- print('Result is already there')
88
- os._exit(0)
89
- except:
90
- pass
91
-
92
- # Setup the model
93
- opt.vocab = vocab
94
- model = models.setup(opt)
95
- del opt.vocab
96
- model.load_state_dict(torch.load(opt.model, map_location='cpu'))
97
- model.to(opt.device)
98
- model.eval()
99
- crit = losses.LanguageModelCriterion()
100
-
101
- # Create the Data Loader instance
102
- if len(opt.image_folder) == 0:
103
- loader = DataLoader(opt)
104
- else:
105
- loader = DataLoaderRaw({'folder_path': opt.image_folder,
106
- 'coco_json': opt.coco_json,
107
- 'batch_size': opt.batch_size,
108
- 'cnn_model': opt.cnn_model})
109
- # When eval using provided pretrained model, the vocab may be different from what you have in your cocotalk.json
110
- # So make sure to use the vocab in infos file.
111
- loader.dataset.ix_to_word = infos['vocab']
112
-
113
-
114
- # Set sample options
115
- opt.dataset = opt.input_json
116
- loss, split_predictions, lang_stats = eval_utils.eval_split(model, crit, loader,
117
- vars(opt))
118
-
119
- print('loss: ', loss)
120
- if lang_stats:
121
- print(lang_stats)
122
-
123
- if opt.dump_json == 1:
124
- # dump the json
125
- json.dump(split_predictions, open('vis/vis.json', 'w'))
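The script caches raw predictions under eval_results/.saved_pred_<id>_<split>.pth and the language metrics under eval_results/<id>_<split>.json. A minimal sketch of inspecting those artifacts offline; the id/split pair (clipRN50_cider, test) is an assumption that matches the default result path used elsewhere in this repo:

import json
import torch

# Hedged sketch: reload the cached evaluation outputs written by the script above.
pred_fn = 'eval_results/.saved_pred_clipRN50_cider_test.pth'
result_fn = 'eval_results/clipRN50_cider_test.json'

predictions, n_predictions = torch.load(pred_fn)     # same tuple the script itself reloads
print(predictions[0])                                # per-image dict with the generated caption
print(len(json.load(open(result_fn))['imgToEval']), 'images scored')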
tools/eval_clip_retrieval.py DELETED
@@ -1,231 +0,0 @@
1
-
2
- from PIL import Image
3
- # import requests
4
-
5
- from transformers import CLIPProcessor, CLIPModel
6
-
7
- import torch
8
- from torch.utils.data import DataLoader, Dataset
9
-
10
- from pathlib import Path
11
- from tqdm import tqdm
12
- import json
13
- import argparse
14
- import numpy as np
15
-
16
- class COCODataset(Dataset):
17
- def __init__(self,
18
- coco_root="/nas-ssd/jmincho/datasets/COCO/",
19
- gen_caption_path=None,
20
- is_gt=True):
21
- super().__init__()
22
-
23
- self.coco_root = Path(coco_root)
24
-
25
- self.image_dir = self.coco_root.joinpath('images/val2014')
26
-
27
- if is_gt:
28
- print("Loading karpathy splits")
29
- data_info_path = self.coco_root.joinpath('dataset_coco.json')
30
- with open(data_info_path) as f:
31
- karpathy_data = json.load(f)
32
-
33
- data = []
34
- for datum in karpathy_data['images']:
35
- # karpathy test split
36
- if datum['split'] == 'test':
37
- img_id = datum['filename'].split('.')[0]
38
- new_datum = {
39
- 'img_id': img_id,
40
- 'captions': [d['raw'].strip() for d in datum['sentences']],
41
- }
42
- data.append(new_datum)
43
- else:
44
- print("Loading generated captions")
45
- gen_caption_path = Path(gen_caption_path)
46
- with open(gen_caption_path) as f:
47
- # karpathy_data = json.load(f)
48
- imgTogen_results = json.load(f)['imgToEval']
49
- data = []
50
- for img_id, img_data in imgTogen_results.items():
51
- new_datum = {
52
- 'img_id': img_id,
53
- 'captions': [img_data['caption']],
54
- }
55
- data.append(new_datum)
56
-
57
- self.data = data
58
- print('# images:', len(self.data))
59
-
60
- self.img_transform = processor.feature_extractor
61
- self.tokenizer = processor.tokenizer
62
-
63
- def __len__(self):
64
- return len(self.data)
65
-
66
- def __getitem__(self, idx):
67
- datum = self.data[idx]
68
- img_id = datum['img_id']
69
- if 'COCO' not in img_id:
70
- img_id = f'COCO_val2014_{str(img_id).zfill(12)}'
71
- img_fname = f"{img_id}.jpg"
72
- # COCO_val2014_000000522418.jpg
73
- img_path = self.image_dir.joinpath(img_fname)
74
- img = Image.open(img_path).convert("RGB")
75
-
76
- # take first caption
77
- caption = datum['captions'][0]
78
-
79
- return {
80
- "img": img,
81
- "caption": caption,
82
- }
83
-
84
- def collate_fn(self, datum_list):
85
- B = len(datum_list)
86
- imgs = [datum['img'] for datum in datum_list]
87
- images = self.img_transform(imgs, return_tensors="pt")
88
-
89
- captions = [datum['caption'] for datum in datum_list]
90
-
91
- text_tokens = self.tokenizer(captions, return_tensors="pt", padding=True)
92
- batch = {
93
- 'images': images,
94
- 'captions': text_tokens,
95
- }
96
- return batch
97
-
98
-
99
- def compute_similarity(image_features, text_features, bs = 1000):
100
- # compute similarity
101
- max_pairs = image_features.shape[0]
102
- similarity_scores = torch.zeros(max_pairs, max_pairs)
103
- for v in range(0, max_pairs, bs):
104
- for t in range(0, max_pairs, bs):
105
- # print('Processing Visual '+str(v)+' Text '+str(t), end='\r')
106
- batch_visual_emb = image_features[v:v+bs]
107
- batch_caption_emb = text_features[t:t+bs]
108
-
109
- logits = batch_visual_emb @ batch_caption_emb.t()
110
- similarity_scores[v:v+bs,t:t+bs] = logits
111
-
112
- print('Done similarity')
113
- return similarity_scores
114
-
115
- def compute_retrieval(a2b_sims, return_ranks=True):
116
- """
117
- Args:
118
- a2b_sims: Result of computing similarity between two sets of embeddings (emb1 @ emb2.T)
119
- with shape (num_datapoints, num_datapoints).
120
-
121
- Returns:
122
- Retrieval metrics for that similarity.
123
- """
124
- npts = a2b_sims.shape[0]
125
- ranks = np.zeros(npts)
126
- top1 = np.zeros(npts)
127
- # loop source embedding indices
128
- for index in range(npts):
129
- # get order of similarities to target embeddings
130
- inds = np.argsort(a2b_sims[index])[::-1]
131
- # find where the correct embedding is ranked
132
- where = np.where(inds == index)
133
- rank = where[0][0]
134
- ranks[index] = rank
135
- # save the top1 result as well
136
- top1[index] = inds[0]
137
-
138
- # Compute metrics
139
- r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
140
- r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
141
- r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
142
- r50 = 100.0 * len(np.where(ranks < 50)[0]) / len(ranks)
143
- medr = np.floor(np.median(ranks)) + 1
144
- meanr = ranks.mean() + 1
145
-
146
- report_dict = {"r1": r1, "r5": r5, "r10": r10, "r50": r50, "medr": medr, "meanr": meanr, "sum": r1 + r5 + r10}
147
-
148
- if return_ranks:
149
- return report_dict, (ranks, top1)
150
- else:
151
- return report_dict
152
-
153
-
154
- if __name__ == '__main__':
155
-
156
- parser = argparse.ArgumentParser()
157
- parser.add_argument('--coco_root', type=str, default="/nas-ssd/jmincho/datasets/COCO/")
158
- parser.add_argument('--gt', action='store_true')
159
- parser.add_argument('--gen_caption_path', type=str, default="./eval_results/clipRN50_cider_test.json")
160
- args = parser.parse_args()
161
-
162
- model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
163
- processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
164
-
165
- device = "cuda"
166
- model = model.to(device)
167
- model.eval()
168
- print(f"Loaded CLIP at {device}")
169
-
170
- batch_size = 1000
171
-
172
- dataset = COCODataset(
173
- coco_root="/nas-ssd/jmincho/datasets/COCO/",
174
- gen_caption_path=args.gen_caption_path,
175
- is_gt=args.gt
176
- )
177
- data_loader = DataLoader(
178
- dataset,
179
- batch_size=batch_size,
180
- collate_fn=dataset.collate_fn,
181
- shuffle=False,
182
- num_workers=8)
183
-
184
- # fwd all samples
185
- image_features = []
186
- text_features = []
187
- for batch_idx, batch in enumerate(tqdm(data_loader)):
188
- # print('Evaluating batch {}/{}'.format(batch_idx, len(data_loader)), end="\r")
189
- # images, texts = batch
190
-
191
- with torch.no_grad():
192
- images = batch["images"].to(device)
193
- texts = batch["captions"].to(device)
194
-
195
- vision_outputs = model.vision_model(**batch['images'])
196
- text_outputs = model.text_model(**batch['captions'])
197
-
198
- image_embeds = vision_outputs[1]
199
- image_embeds = model.visual_projection(image_embeds)
200
-
201
- text_embeds = text_outputs[1]
202
- text_embeds = model.text_projection(text_embeds)
203
-
204
- # normalized features
205
- image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
206
- text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
207
-
208
- text_features.append(text_embeds.detach().cpu())
209
- image_features.append(image_embeds.detach().cpu())
210
-
211
- image_features = torch.cat(image_features, 0)
212
- text_features = torch.cat(text_features, 0)
213
- print('Done forward')
214
-
215
- # normalized features
216
- image_features = image_features / image_features.norm(dim=-1, keepdim=True)
217
- text_features = text_features / text_features.norm(dim=-1, keepdim=True)
218
-
219
- # if not single_caption:
220
- # for cap_idx in range(text_features.shape[1]):
221
- # similarity_scores = compute_similarity(image_features, text_features[:,cap_idx,:])
222
- # i2t_dict = compute_retrieval(similarity_scores.numpy())
223
- # t2i_dict = compute_retrieval(similarity_scores.t().numpy())
224
- # print(cap_idx, 'i2t', i2t_dict)
225
- # print(cap_idx, 't2i', t2i_dict)
226
- # else:
227
- similarity_scores = compute_similarity(image_features, text_features)
228
- i2t_dict = compute_retrieval(similarity_scores.numpy())
229
- t2i_dict = compute_retrieval(similarity_scores.t().numpy())
230
- print('i2t', i2t_dict)
231
- print('t2i', t2i_dict)
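compute_retrieval() above only needs a square similarity matrix, so it is easy to sanity-check in isolation: with the identity matrix every query ranks its own pair first. A minimal sketch, assuming the function defined above is in scope:

import numpy as np

# Hedged sketch: identity similarities -> perfect retrieval.
sims = np.eye(5)
report, (ranks, top1) = compute_retrieval(sims, return_ranks=True)
print(report['r1'], report['medr'])                  # expected: 100.0 1.0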
tools/eval_finecapeval.py DELETED
@@ -1,204 +0,0 @@
1
-
2
- from tqdm import tqdm
3
- from pprint import pprint
4
- import pandas as pd
5
- import argparse
6
- import re
7
- import json
8
- import nltk
9
- from nltk.tokenize import word_tokenize
10
- from nltk.stem.porter import PorterStemmer
11
- p_stemmer = PorterStemmer()
12
-
13
- # nltk.download('punkt')
14
- # nltk.download('wordnet')
15
- # nltk.download('stopwords')
16
-
17
- import language_evaluation
18
- evaluator = language_evaluation.CocoEvaluator()
19
-
20
-
21
- def nltk_process(text):
22
- # Tokenization
23
- nltk_tokenList = word_tokenize(text)
24
-
25
- # Stemming
26
- nltk_stemedList = []
27
- for word in nltk_tokenList:
28
- nltk_stemedList.append(p_stemmer.stem(word))
29
-
30
- filtered_sentence = nltk_stemedList
31
-
32
- # Removing Punctuation
33
-
34
- tokens = [re.sub(r'[^a-zA-Z0-9]', '', tok) for tok in filtered_sentence]
35
-
36
- text = " ".join(tokens)
37
-
38
- return text
39
-
40
-
41
- def calculate_finegrained_scores(pred_id2sent, id2caption, use_coco_eval=False):
42
- if use_coco_eval:
43
- n_total = 0
44
- refs = []
45
- hyps = []
46
- for id, gt_captions in id2caption.items():
47
- pred_sent = pred_id2sent[id]
48
-
49
- refs.append(gt_captions)
50
- hyps.append(pred_sent)
51
-
52
- n_total += 1
53
-
54
- print('caption')
55
- results = evaluator.run_evaluation(hyps, refs)
56
- pprint(results)
57
-
58
- n_total = 0
59
- total_score = 0
60
- for id, gt_phrases in id2background.items():
61
- pred_sent = pred_id2sent[id]
62
-
63
- score = 0
64
- n_phrases = len(gt_phrases)
65
-
66
- for gt_phrase in gt_phrases:
67
- word_score = 0
68
- for gt_word in gt_phrase.split():
69
- if gt_word in pred_sent:
70
- word_score += 1
71
- if len(gt_phrase.split()) > 0:
72
- score += word_score / len(gt_phrase.split())
73
-
74
- if n_phrases > 0:
75
- score /= n_phrases
76
-
77
- total_score += score
78
- n_total += 1
79
- print('background')
80
- # print('# retrieved words:', n_retrieved)
81
- print(f'Acc: {total_score / n_total * 100:.2f}')
82
-
83
- n_total = 0
84
- total_score = 0
85
- for id, gt_phrases in id2object.items():
86
- pred_sent = pred_id2sent[id]
87
-
88
- score = 0
89
- n_phrases = len(gt_phrases)
90
-
91
- for gt_phrase in gt_phrases:
92
- word_score = 0
93
- for gt_word in gt_phrase.split():
94
- if gt_word in pred_sent:
95
- word_score += 1
96
- if len(gt_phrase.split()) > 0:
97
- score += word_score / len(gt_phrase.split())
98
-
99
- if n_phrases > 0:
100
- score /= n_phrases
101
-
102
- total_score += score
103
- n_total += 1
104
- print('object')
105
- # print('# retrieved words:', n_retrieved)
106
- print(f'Acc: {total_score / n_total * 100:.2f}')
107
-
108
- n_total = 0
109
- total_score = 0
110
- for id, gt_phrases in id2relation.items():
111
- pred_sent = pred_id2sent[id]
112
-
113
- score = 0
114
- n_phrases = len(gt_phrases)
115
-
116
- for gt_phrase in gt_phrases:
117
- word_score = 0
118
- for gt_word in gt_phrase.split():
119
- if gt_word in pred_sent:
120
- word_score += 1
121
- if len(gt_phrase.split()) > 0:
122
- score += word_score / len(gt_phrase.split())
123
-
124
- if n_phrases > 0:
125
- score /= n_phrases
126
-
127
- total_score += score
128
- n_total += 1
129
- print('relation')
130
- # print('# retrieved words:', n_retrieved)
131
- print(f'Acc: {total_score / n_total * 100:.2f}')
132
-
133
-
134
- if __name__ == '__main__':
135
- parser = argparse.ArgumentParser()
136
- parser.add_argument('--finecapeval_path', type=str, default="data/FineCapEval.csv")
137
- parser.add_argument('--generated_id2caption', type=str, default="FineCapEval_results/mle.json")
138
- args = parser.parse_args()
139
-
140
- df = pd.read_csv(args.finecapeval_path)
141
- assert df.shape == (5000, 5)
142
-
143
- generated_id2caption = json.load(open(args.generated_id2caption, 'r'))
144
-
145
- print("Preprocessing GT FineCapEval data...")
146
- id2caption = {}
147
- id2background = {}
148
- id2object = {}
149
- id2relation = {}
150
-
151
- for row in tqdm(df.itertuples(), total=len(df)):
152
-
153
- id = row.image.split('.')[0]
154
- caption = row.caption
155
- background = row.background
156
- object = row.object
157
- relation = row.relation
158
-
159
- if not isinstance(caption, str):
160
- continue
161
- if not isinstance(background, str):
162
- continue
163
- if not isinstance(object, str):
164
- continue
165
- if not isinstance(relation, str):
166
- continue
167
-
168
- if id not in id2caption:
169
- id2caption[id] = []
170
- id2background[id] = []
171
- id2object[id] = []
172
- id2relation[id] = []
173
-
174
- id2caption[id].append(caption)
175
-
176
- phrases = []
177
- for phrase in background.lower().split('\;'):
178
- if len(phrase) > 1:
179
- phrase = nltk_process(phrase)
180
- phrases.append(phrase)
181
- id2background[id].extend(phrases)
182
-
183
- phrases = []
184
- for phrase in object.lower().split('\;'):
185
- if len(phrase) > 1:
186
- phrase = nltk_process(phrase)
187
- phrases.append(phrase)
188
- id2object[id].extend(phrases)
189
-
190
- phrases = []
191
- for phrase in relation.lower().split('\;'):
192
- if len(phrase) > 1:
193
- phrase = nltk_process(phrase)
194
- phrases.append(phrase)
195
- id2relation[id].extend(phrases)
196
-
197
- print("Calculating scores...")
198
- calculate_finegrained_scores(
199
- generated_id2caption,
200
- id2caption,
201
- use_coco_eval=True)
202
-
203
-
204
-
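The background/object/relation scores above are plain word recall: each ground-truth phrase contributes the fraction of its words found (as substrings) in the prediction, averaged over the phrases of that image. Note that the ground-truth phrases are stemmed via nltk_process while the predictions are not, which can undercount inflected matches. A toy example of the scoring rule:

# Hedged toy example of the per-image phrase scoring used above.
pred_sent = 'a red helmet on a dirt road'
gt_phrases = ['red helmet', 'dirt road', 'verdant hillside']

score = 0.0
for gt_phrase in gt_phrases:
    words = gt_phrase.split()
    score += sum(1 for w in words if w in pred_sent) / len(words)
score /= len(gt_phrases)
print(round(score, 3))                               # (1.0 + 1.0 + 0.0) / 3 = 0.667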
tools/finecapeval_inference.py DELETED
@@ -1,186 +0,0 @@
1
- import sys
2
- import torch
3
- import torch.nn as nn
4
- import torch.nn.functional as F
5
- import torch.optim as optim
6
-
7
- import numpy as np
8
-
9
- import time
10
- import os
11
- from collections import defaultdict
12
- import json
13
-
14
- import captioning.utils.opts as opts
15
- import captioning.models as models
16
- from captioning.data.pth_loader import CaptionDataset
17
- import captioning.utils.eval_utils as eval_utils
18
- # import captioning.utils.vizwiz_eval_utils as vizwiz_eval_utils
19
- import captioning.utils.misc as utils
20
- from captioning.utils.rewards import init_scorer, get_self_critical_reward
21
- from captioning.modules.loss_wrapper import LossWrapper
22
-
23
- import pytorch_lightning as pl
24
-
25
-
26
- class ModelCheckpoint(pl.callbacks.ModelCheckpoint):
27
-
28
- def on_keyboard_interrupt(self, trainer, pl_module):
29
- # Save model when keyboard interrupt
30
- filepath = os.path.join(self.dirpath, self.prefix + 'interrupt.ckpt')
31
- self._save_model(filepath)
32
-
33
-
34
- if __name__ == '__main__':
35
-
36
- device = 'cuda'
37
-
38
- import argparse
39
- parser = argparse.ArgumentParser()
40
- parser.add_argument('--reward', type=str, default='mle')
41
- args = parser.parse_args()
42
-
43
- if args.reward == 'mle':
44
- cfg = f'configs/phase1/fg_clipRN50_{args.reward}.yml'
45
- else:
46
- cfg = f'configs/phase2/fg_clipRN50_{args.reward}.yml'
47
-
48
- print("Loading cfg from", cfg)
49
-
50
- opt = opts.parse_opt(parse=False, cfg=cfg)
51
-
52
- dataset = CaptionDataset(opt)
53
-
54
- opt.vocab_size = dataset.vocab_size
55
- opt.seq_length = dataset.seq_length
56
-
57
- opt.batch_size = 40
58
-
59
- opt.vocab = dataset.get_vocab()
60
-
61
- model = models.setup(opt)
62
- del opt.vocab
63
-
64
- ckpt_path = opt.checkpoint_path + '-last.ckpt'
65
-
66
- print("Loading checkpoint from", ckpt_path)
67
- raw_state_dict = torch.load(
68
- ckpt_path,
69
- map_location=device)
70
-
71
- strict = True
72
-
73
- state_dict = raw_state_dict['state_dict']
74
-
75
- if '_vocab' in state_dict:
76
- model.vocab = utils.deserialize(state_dict['_vocab'])
77
- del state_dict['_vocab']
78
- elif strict:
79
- raise KeyError
80
- if '_opt' in state_dict:
81
- saved_model_opt = utils.deserialize(state_dict['_opt'])
82
- del state_dict['_opt']
83
- # Make sure the saved opt is compatible with the current opt
84
- need_be_same = ["caption_model",
85
- "rnn_type", "rnn_size", "num_layers"]
86
- for checkme in need_be_same:
87
- if getattr(saved_model_opt, checkme) in ['updown', 'topdown'] and \
88
- getattr(opt, checkme) in ['updown', 'topdown']:
89
- continue
90
- assert getattr(saved_model_opt, checkme) == getattr(
91
- opt, checkme), "Command line argument and saved model disagree on '%s' " % checkme
92
- elif strict:
93
- raise KeyError
94
- res = model.load_state_dict(state_dict, strict)
95
- print(res)
96
-
97
- opt.use_grammar = False
98
-
99
- lw_model = LossWrapper(model, opt)
100
-
101
- split = 'test'
102
-
103
- print("Building dataloader...")
104
-
105
- test_dataset = torch.utils.data.Subset(
106
- dataset,
107
- dataset.split_ix[split]
108
- )
109
- test_loader = torch.utils.data.DataLoader(
110
- test_dataset,
111
- batch_size=opt.batch_size,
112
- shuffle=False,
113
- num_workers=4,
114
- drop_last=False,
115
- collate_fn=dataset.collate_func
116
- )
117
-
118
- eval_kwargs = {'dataset': opt.input_json}
119
- eval_kwargs.update(vars(opt))
120
-
121
- verbose = eval_kwargs.get('verbose', True)
122
- verbose_beam = eval_kwargs.get('verbose_beam', 0)
123
- verbose_loss = eval_kwargs.get('verbose_loss', 1)
124
- # num_images = eval_kwargs.get('num_images', eval_kwargs.get('val_images_use', -1))
125
- # lang_eval = eval_kwargs.get('language_eval', 0)
126
- dataset = eval_kwargs.get('dataset', 'coco')
127
- beam_size = eval_kwargs.get('beam_size', 1)
128
- sample_n = eval_kwargs.get('sample_n', 1)
129
- remove_bad_endings = eval_kwargs.get('remove_bad_endings', 0)
130
-
131
- crit = lw_model.crit
132
-
133
- model = model.to(device)
134
-
135
- from tqdm import tqdm
136
-
137
- test_id2sent = {}
138
-
139
- model.eval()
140
-
141
- print("running inference...")
142
-
143
- for data in tqdm(test_loader):
144
- with torch.no_grad():
145
- # forward the model to get loss
146
- tmp = [data['fc_feats'], data['att_feats'],
147
- data['labels'], data['masks'], data['att_masks']]
148
- tmp = [d.to(device) if isinstance(d, torch.Tensor) else d for d in tmp]
149
-
150
- fc_feats, att_feats, labels, masks, att_masks = tmp
151
-
152
- loss = crit(model(fc_feats, att_feats,
153
- labels[..., :-1], att_masks), labels[..., 1:], masks[..., 1:])
154
-
155
- # forward the model to also get generated samples for each image
156
- # Only leave one feature for each image, in case of duplicate samples
157
- tmp_eval_kwargs = eval_kwargs.copy()
158
- tmp_eval_kwargs.update({'sample_n': 1})
159
- seq, seq_logprobs = model(
160
- fc_feats, att_feats, att_masks, opt=tmp_eval_kwargs, mode='sample')
161
- seq = seq.data
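- # Average per-token entropy and negative log-likelihood over the sampled sequence; (seq > 0) counts non-pad tokens and the +1 presumably accounts for the EOS token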
162
- entropy = - (F.softmax(seq_logprobs, dim=2) *
163
- seq_logprobs).sum(2).sum(1) / ((seq > 0).to(seq_logprobs).sum(1)+1)
164
- perplexity = - \
165
- seq_logprobs.gather(2, seq.unsqueeze(2)).squeeze(
166
- 2).sum(1) / ((seq > 0).to(seq_logprobs).sum(1)+1)
167
-
168
- # Print beam search
169
- if beam_size > 1 and verbose_beam:
170
- for i in range(fc_feats.shape[0]):
171
- print('\n'.join([utils.decode_sequence(model.vocab, _[
172
- 'seq'].unsqueeze(0))[0] for _ in model.done_beams[i]]))
173
- print('--' * 10)
174
- sents = utils.decode_sequence(model.vocab, seq)
175
-
176
- for d, sent in zip(data['infos'], sents):
177
- test_id2sent[d['id']] = sent
178
-
179
- res_path = f'FineCapEval_results/clipRN50_{args.reward}.json'
180
-
181
- print("Results saved to {}".format(res_path))
182
-
183
- with open(res_path, 'w') as f:
184
- json.dump(test_id2sent, f)
185
-
186
-
 
tools/train_pl.py DELETED
@@ -1,709 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
- import torch.nn.functional as F
4
- import torch.optim as optim
5
-
6
- import numpy as np
7
-
8
- import time
9
- import os
10
- from collections import defaultdict
11
-
12
- import captioning.utils.opts as opts
13
- import captioning.models as models
14
- from captioning.data.pth_loader import CaptionDataset
15
- import captioning.utils.eval_utils as eval_utils
16
- import captioning.utils.misc as utils
17
- from captioning.utils.rewards import init_scorer, get_self_critical_reward
18
- from captioning.modules.loss_wrapper import LossWrapper
19
-
20
- import pytorch_lightning as pl
21
-
22
- import detectron2.utils.comm as d2comm
23
- from detectron2.utils.env import seed_all_rng
24
- seed_all_rng(1234)
25
-
26
-
27
- class LitModel(pl.LightningModule):
28
- def __init__(self, opt):
29
- super().__init__()
30
- self.opt = opt
31
- # Initialize dataset
32
- self.dataset = CaptionDataset(opt)
33
- opt.vocab_size = self.dataset.vocab_size
34
- opt.seq_length = self.dataset.seq_length
35
- self.batch_size = opt.batch_size
36
-
37
- # Build model
38
- opt.vocab = self.dataset.get_vocab()
39
- model = models.setup(opt)
40
- # print(model)
41
- del opt.vocab
42
-
43
- # wrapper with loss in it.
44
- lw_model = LossWrapper(model, opt)
45
-
46
- self.model = model
47
- self.lw_model = lw_model
48
-
49
- self.struc_flag = None
50
- self.sc_flag = None
51
-
52
- # if self.opt.use_clipscore:
53
- # if self.opt.use_clipscore or os.getenv('EVALUATE', '0') == '1':
54
- # if CLIP-S+Grammar is used in the reward -> launch a separate CLIP-S whose parameters stay frozen
55
- if getattr(self.opt, 'use_grammar', False):
56
- from captioning.utils.clipscore import CLIPScore
57
- self.val_clipscore_model = CLIPScore(
58
- mode=opt.clipscore_mode, use_grammar=False)
59
- for p in self.val_clipscore_model.parameters():
60
- p.requires_grad = False
61
- else:
62
- if self.lw_model.clipscore_model is not None:
63
- self.val_clipscore_model = self.lw_model.clipscore_model
64
- else:
65
- from captioning.utils.clipscore import CLIPScore
66
- self.val_clipscore_model = CLIPScore(
67
- mode=opt.clipscore_mode, use_grammar=False)
68
- for p in self.val_clipscore_model.parameters():
69
- p.requires_grad = False
70
- self.val_clipscore_model.eval()
71
-
72
- # BERTSCORE
73
- from bert_score import BERTScorer
74
- self.bert_scorer = BERTScorer(
75
- lang="en",
76
- # rescale_with_baseline=True,
77
- rescale_with_baseline=False,
78
- device='cpu'
79
- )
80
-
81
- def forward(self, *args, **kwargs):
82
- """
83
- I hate this design. Never treat it as a regular nn.Module
84
- """
85
- raise NotImplementedError
86
-
87
- def train_dataloader(self):
88
- train_dataset = torch.utils.data.Subset(
89
- self.dataset,
90
- self.dataset.split_ix['train']
91
- )
92
-
93
- train_loader = torch.utils.data.DataLoader(
94
- dataset=train_dataset,
95
- batch_size=self.batch_size,
96
- shuffle=True,
97
- num_workers=4,
98
- collate_fn=self.dataset.collate_func
99
- )
100
- return train_loader
101
-
102
- def val_dataloader(self, split='val'):
103
- val_dataset = torch.utils.data.Subset(
104
- self.dataset,
105
- self.dataset.split_ix[split]
106
- )
107
- val_loader = torch.utils.data.DataLoader(
108
- val_dataset,
109
- batch_size=self.batch_size,
110
- shuffle=False,
111
- num_workers=4,
112
- drop_last=False,
113
- collate_fn=self.dataset.collate_func
114
- )
115
- return val_loader
116
-
117
- def test_dataloader(self):
118
- return self.val_dataloader('test')
119
-
120
- def training_step(self, data, batch_idx):
121
- sc_flag, struc_flag = self.sc_flag, self.struc_flag
122
-
123
- tmp = [data['fc_feats'], data['att_feats'],
124
- data['labels'], data['masks'], data['att_masks']]
125
- fc_feats, att_feats, labels, masks, att_masks = tmp
126
- if int(os.getenv('M2_cider', '0')) != 0:
127
- data['gts'] = data['rawgts']
128
-
129
- if self.opt.use_clipscore:
130
- clip_vis_feats = data['clip_vis_feats']
131
- model_out = self.lw_model(fc_feats, att_feats, labels, masks, att_masks,
132
- data['gts'], torch.arange(0, len(data['gts'])), sc_flag, struc_flag,
133
- clip_vis_feats=clip_vis_feats)
134
- else:
135
- model_out = self.lw_model(fc_feats, att_feats, labels, masks, att_masks,
136
- data['gts'], torch.arange(0, len(data['gts'])), sc_flag, struc_flag)
137
- loss = model_out['loss']
138
-
139
- data_time = self.trainer.profiler.recorded_durations["get_train_batch"][-1]
140
- data_time = torch.tensor(data_time)
141
-
142
- logger_logs = model_out.copy()
143
- # if struc_flag or sc_flag:
144
- # logger_logs['reward'] = model_out['reward'].mean()
145
- # logger_logs['reward_var'] = model_out['reward'].var(1).mean()
146
- if struc_flag or sc_flag:
147
- logger_logs['reward'] = model_out['reward'].mean()
148
- for k in ['CLIP-S', 'RefCLIP-S', 'CIDEr', 'grammar_reward']:
149
- if k in model_out:
150
- logger_logs[k] = model_out[k]
151
- if struc_flag:
152
- logger_logs['reward_var'] = model_out['reward'].var(1).mean()
153
-
154
- logger_logs['scheduled_sampling_prob'] = torch.tensor(
155
- self.model.ss_prob)
156
- # logger_logs['training_loss'] = loss
157
- logger_logs['loss'] = loss
158
- logger_logs['data_time'] = data_time
159
-
160
- # UserWarning: The {progress_bar:dict keyword} was deprecated in 0.9.1 and will be removed in 1.0.0
161
- # Please use self.log(...) inside the lightningModule instead.
162
-
163
- # # log on a step or aggregate epoch metric to the logger and/or progress bar
164
- # # (inside LightningModule)
165
- # self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
166
- # warnings.warn(*args, **kwargs)
167
- # UserWarning: The {log:dict keyword} was deprecated in 0.9.1 and will be removed in 1.0.0
168
- # Please use self.log(...) inside the lightningModule instead.
169
-
170
- # output = {
171
- # 'loss': loss,
172
- # 'log': logger_logs,
173
- # 'progress_bar': {'data_time': data_time}
174
- # }
175
-
176
- for k, v in logger_logs.items():
177
- if k in ['reward', 'reward_var', 'data_time', 'CLIP-S', 'RefCLIP-S', 'CIDEr', 'grammar_reward']:
178
- self.log('train/'+k, v, prog_bar=True)
179
- else:
180
- self.log('train/'+k, v)
181
-
182
- return loss
183
-
184
- def validation_step(self, data, batch_idx):
185
- model = self.model
186
- crit = self.lw_model.crit
187
-
188
- opt = self.opt
189
- eval_kwargs = {'dataset': opt.input_json}
190
- eval_kwargs.update(vars(opt))
191
-
192
- # CLIPScore
193
- use_grammar = getattr(self.opt, 'use_grammar', False)
194
- joint_out = getattr(self.opt, 'joint_out', False)
195
-
196
- verbose = eval_kwargs.get('verbose', True)
197
- verbose_beam = eval_kwargs.get('verbose_beam', 0)
198
- verbose_loss = eval_kwargs.get('verbose_loss', 1)
199
- # num_images = eval_kwargs.get('num_images', eval_kwargs.get('val_images_use', -1))
200
- # lang_eval = eval_kwargs.get('language_eval', 0)
201
- dataset = eval_kwargs.get('dataset', 'coco')
202
- beam_size = eval_kwargs.get('beam_size', 1)
203
- sample_n = eval_kwargs.get('sample_n', 1)
204
- remove_bad_endings = eval_kwargs.get('remove_bad_endings', 0)
205
- # Use this nasty way to make other code clean since it's a global configuration
206
- os.environ["REMOVE_BAD_ENDINGS"] = str(remove_bad_endings)
207
-
208
- predictions = []
209
- n_predictions = []
210
-
211
- loss = torch.tensor(0)
212
- if data.get('labels', None) is not None and verbose_loss:
213
- # forward the model to get loss
214
- tmp = [data['fc_feats'], data['att_feats'],
215
- data['labels'], data['masks'], data['att_masks']]
216
- fc_feats, att_feats, labels, masks, att_masks = tmp
217
-
218
- loss = crit(model(fc_feats, att_feats,
219
- labels[..., :-1], att_masks), labels[..., 1:], masks[..., 1:])
220
-
221
- # forward the model to also get generated samples for each image
222
- # Only leave one feature for each image, in case of duplicate samples
223
- tmp_eval_kwargs = eval_kwargs.copy()
224
- tmp_eval_kwargs.update({'sample_n': 1})
225
- seq, seq_logprobs = model(
226
- fc_feats, att_feats, att_masks, opt=tmp_eval_kwargs, mode='sample')
227
- seq = seq.data
228
- entropy = - (F.softmax(seq_logprobs, dim=2) *
229
- seq_logprobs).sum(2).sum(1) / ((seq > 0).to(seq_logprobs).sum(1)+1)
230
- perplexity = - \
231
- seq_logprobs.gather(2, seq.unsqueeze(2)).squeeze(
232
- 2).sum(1) / ((seq > 0).to(seq_logprobs).sum(1)+1)
233
-
234
- # Print beam search
235
- if beam_size > 1 and verbose_beam:
236
- for i in range(fc_feats.shape[0]):
237
- print('\n'.join([utils.decode_sequence(model.vocab, _[
238
- 'seq'].unsqueeze(0))[0] for _ in model.done_beams[i]]))
239
- print('--' * 10)
240
- sents = utils.decode_sequence(model.vocab, seq)
241
-
242
- # if self.opt.use_clipscore or os.getenv('EVALUATE', '0') == '1':
243
- # text_feat = self.lw_model.clipscore_model.text_extract(sents)
244
- text_feat = self.val_clipscore_model.text_extract(sents, proj_norm=False)
245
-
246
- text_cont_feat = self.val_clipscore_model.clip_model.text_projection(text_feat)
247
- text_cont_feat = text_cont_feat / text_cont_feat.norm(dim=-1, keepdim=True)
248
-
249
- vis_feat = data['clip_vis_feats']
250
- # if self.opt.clipscore_mode == 'clip_s':
251
- # clip_s = self.val_clipscore_model(text_feat=text_cont_feat, img_feat=vis_feat, mode='clip_s')
252
-
253
- # elif self.opt.clipscore_mode == 'refclip_s':
254
- clip_s = self.val_clipscore_model(text_feat=text_cont_feat, img_feat=vis_feat, mode='clip_s')
255
- # ref_text = utils.decode_sequence(model.vocab, data['gts'])
256
-
257
- gt_indices = torch.arange(0, len(data['gts']))
258
- data_gts = [data['gts'][_] for _ in gt_indices.tolist()]
259
-
260
- B = len(data_gts)
261
-
262
- gts = []
263
- gts_valid_mask = []
264
- max_n_refs = max([len(_gts) for _gts in data_gts])
265
- for i in range(len(data_gts)):
266
- _gts = utils.decode_sequence(model.vocab, data_gts[i])
267
- # pad references
268
- n_ref = len(_gts)
269
- _gts.extend([''] * (max_n_refs - n_ref))
270
- gts.extend(_gts)
271
- gts_valid_mask.extend([1] * n_ref + [0] * (max_n_refs - n_ref))
272
- assert len(gts) == B * max_n_refs
273
- assert len(gts_valid_mask) == B * max_n_refs
274
-
275
- ref_text = gts
276
- ref_text_mask = gts_valid_mask
277
-
278
- refclip_s = self.val_clipscore_model(
279
- text_feat=text_cont_feat, img_feat=vis_feat,
280
- ref_text=ref_text, ref_text_mask=ref_text_mask, mode='refclip_s')
281
-
282
- # use_grammar = getattr(self.opt, 'use_grammar', False)
283
- # joint_out = getattr(self.opt, 'joint_out', False)
284
- if use_grammar and not joint_out:
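- # Score each generated caption with the grammar head; grammar_prob is the softmax probability of class 1, presumably the "grammatical" class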
285
- with torch.no_grad():
286
- # grammar_logit = self.val_clipscore_model.grammar_score_head(text_feat.view(-1, 512))
287
- grammar_logit = self.lw_model.clipscore_model.grammar_score_head(text_feat.view(-1, 512))
288
- grammar_prob = torch.softmax(grammar_logit, dim=-1)[:, 1]
289
-
290
-
291
- # BERTScore
292
- if next(self.bert_scorer._model.parameters()).device != self.device:
293
- self.bert_scorer._model.to(self.device)
294
- self.bert_scorer.device = self.device
295
-
296
-
297
- # [B*K] -> [B, K]
298
- ref_text_per_example = []
299
- for i in range(B):
300
- ref_text_list_example = []
301
- for k in range(max_n_refs):
302
- ref = ref_text[i * max_n_refs + k]
303
- if len(ref) > 0:
304
- ref_text_list_example.append(ref)
305
- # assert len(ref_text_list_example) == max_n_refs
306
- ref_text_per_example.append(ref_text_list_example)
307
- assert len(ref_text_per_example) == B
308
-
309
- P, R, F1 = self.bert_scorer.score(
310
- sents,
311
- ref_text_per_example,
312
- )
313
- bertscore_f1 = F1
314
- # print('Example 5:')
315
- # for i in range(5):
316
- # print('Generated:', sents[i])
317
- # print('ref_text:', ref_text_per_example[i])
318
- # print('BERT-Score:', F1[i].item())
319
-
320
-
321
- for k, sent in enumerate(sents):
322
- entry = {'image_id': data['infos'][k]['id'], 'caption': sent,
323
- 'perplexity': perplexity[k].item(), 'entropy': entropy[k].item()}
324
- if self.opt.use_clipscore or os.getenv('EVALUATE', '0') == '1':
325
- # if self.opt.clipscore_mode == 'clip_s':
326
- # entry['clipscore'] = clipscore[k].item()
327
- # entry['CLIP-S'] = clip_s[k].item()
328
- # elif self.opt.clipscore_mode == 'refclip_s':
329
- entry['CLIP-S'] = clip_s[k].item()
330
- entry['RefCLIP-S'] = refclip_s[k].item()
331
-
332
- if use_grammar and not joint_out:
333
- entry['grammar_prob'] = grammar_prob[k].item()
334
-
335
- # BERT-S
336
- entry['BERT-S'] = bertscore_f1[k].item()
337
-
338
- if eval_kwargs.get('dump_path', 0) == 1:
339
- entry['file_name'] = data['infos'][k]['file_path']
340
- predictions.append(entry)
341
- if eval_kwargs.get('dump_images', 0) == 1:
342
- # dump the raw image to vis/ folder
343
- cmd = 'cp "' + os.path.join(eval_kwargs['image_root'], data['infos'][k]['file_path']) + \
344
- '" vis/imgs/img' + \
345
- str(len(predictions)) + '.jpg' # bit gross
346
- print(cmd)
347
- os.system(cmd)
348
-
349
- if verbose:
350
- print('image %s: %s' %
351
- (entry['image_id'], entry['caption']))
352
-
353
- if sample_n > 1:
354
- eval_utils.eval_split_n(model, n_predictions, [
355
- fc_feats, att_feats, att_masks, data], eval_kwargs)
356
-
357
- output = {
358
- # 'val_loss': loss,
359
- 'loss': loss,
360
- 'predictions': predictions,
361
- 'n_predictions': n_predictions,
362
- }
363
- return output
364
-
365
- def test_step(self, *args, **kwargs):
366
- return self.validation_step(*args, **kwargs)
367
-
368
- def validation_epoch_end(self, outputs, split='val'):
369
- outputs = d2comm.gather(outputs)
370
- # master node
371
- if d2comm.is_main_process():
372
- assert self.trainer.node_rank == 0 and self.trainer.local_rank == 0
373
- outputs = sum(outputs, [])
374
-
375
- opt = self.opt
376
- # val_loss_mean = sum([_['val_loss']
377
- # val_loss_mean = sum([_['val_loss'].cpu()
378
- val_loss_mean = sum([_['loss'].cpu()
379
- for _ in outputs]) / len(outputs)
380
-
381
- predictions = sum([_['predictions'] for _ in outputs], [])
382
- if len(outputs[0]['n_predictions']) != 0:
383
- n_predictions = sum([_['n_predictions'] for _ in outputs], [])
384
- else:
385
- n_predictions = []
386
-
387
- lang_stats = None
388
- if len(n_predictions) > 0 and 'perplexity' in n_predictions[0]:
389
- n_predictions = sorted(
390
- n_predictions, key=lambda x: x['perplexity'])
391
-
392
- if not os.path.isdir('eval_results'):
393
- os.mkdir('eval_results')
394
- torch.save((predictions, n_predictions), os.path.join(
395
- 'eval_results/', '.saved_pred_' + opt.id + '_' + split + '.pth'))
396
-
397
- if opt.language_eval:
398
- lang_stats = eval_utils.language_eval(
399
- opt.input_json, predictions, n_predictions, vars(opt), split)
400
-
401
- if opt.reduce_on_plateau:
402
- optimizer = self.trainer.optimizers[0]
403
- if lang_stats is not None and 'CIDEr' in lang_stats:
404
- optimizer.scheduler_step(-lang_stats['CIDEr'])
405
- else:
406
- optimizer.scheduler_step(val_loss_mean)
407
-
408
- # out = {
409
- # 'val_loss': val_loss_mean
410
- # }
411
- out = {
412
- 'loss': val_loss_mean
413
- }
414
- out.update(lang_stats or {})
415
- # out['to_monitor'] = lang_stats['CIDEr'] if lang_stats is not None else -val_loss_mean
416
- if self.opt.use_clipscore or os.getenv('EVALUATE', '0') == '1':
417
- # if self.opt.clipscore_mode == 'clip_s':
418
- # out['clipscore'] = sum([p['clipscore'] for p in predictions]) / len(predictions)
419
- # print('CLIPScore', out['clipscore'])
420
- # out['CLIP-S'] = sum([p['CLIP-S'] for p in predictions]) / len(predictions)
421
- # print('CLIP-S', out['CLIP-S'])
422
- # elif self.opt.clipscore_mode == 'refclip_s':
423
- out['CLIP-S'] = sum([p['CLIP-S'] for p in predictions]) / len(predictions)
424
- print('CLIP-S', out['CLIP-S'])
425
-
426
- out['RefCLIP-S'] = sum([p['RefCLIP-S'] for p in predictions]) / len(predictions)
427
- print('RefCLIP-S', out['RefCLIP-S'])
428
-
429
- if getattr(self.opt, 'use_grammar', False) and not getattr(self.opt, 'joint_out', False):
430
- out['grammar_prob'] = sum([p['grammar_prob'] for p in predictions]) / len(predictions)
431
- print('grammar_prob', out['grammar_prob'])
432
-
433
- out['BERT-S'] = sum([p['BERT-S'] for p in predictions]) / len(predictions)
434
- print('BERT-S', out['BERT-S'])
435
- else:
436
- out = {}
437
-
438
- out = d2comm.all_gather(out)[0] # Only the one from master node
439
- assert len(out) > 0 # make sure the head has index 0
440
-
441
- # must all be tensors
442
- out = {k: torch.tensor(v) if not torch.is_tensor(
443
- v) else v for k, v in out.items()}
444
-
445
- # return {
446
- # 'progress_bar': {'val_loss': out['val_loss']},
447
- # 'log': out,
448
- # }
449
- for k, v in out.items():
450
- # if k in ['loss', 'clipscore', 'RefCLIP-S', 'CIDEr']:
451
- # if split != 'test':
452
- # self.log(f'{split}/{k}', v, prog_bar=True)
453
- # elif k == 'to_monitor':
454
- # if split != 'test':
455
- # self.log(f'{split}/{k}', v)
456
- # else:
457
- self.log(f'{split}/{k}', v)
458
-
459
- def test_epoch_end(self, outputs):
460
- # out = self.validation_epoch_end(outputs, 'test')
461
- # out['progress_bar'] = {
462
- # # 'test_loss': out['progress_bar']['val_loss']
463
- # 'test_loss': out['progress_bar']['loss']
464
- # }
465
- # out['log']['test_loss'] = out['log']['val_loss']
466
- # del out['log']['val_loss']
467
- # del out['log']['to_monitor']
468
-
469
- # out['log'] = {'test_'+k if 'test' not in k else k:v \
470
- # for k,v in out['log'].items()}
471
-
472
- # return out
473
- self.validation_epoch_end(outputs, 'test')
474
-
475
- def configure_optimizers(self):
476
- opt = self.opt
477
- model = self.model
478
-
479
- parameters = [p for p in model.parameters() if p.requires_grad]
480
-
481
- if opt.noamopt:
482
- # assert opt.caption_model in ['transformer', 'bert', 'm2transformer'], 'noamopt can only work with transformer'
483
- optimizer = utils.get_std_opt(
484
- model, optim_func=opt.optim, factor=opt.noamopt_factor, warmup=opt.noamopt_warmup)
485
- elif opt.reduce_on_plateau:
486
- # optimizer = utils.build_optimizer(model.parameters(), opt)
487
- optimizer = utils.build_optimizer(parameters, opt)
488
- optimizer = utils.ReduceLROnPlateau(optimizer,
489
- factor=opt.reduce_on_plateau_factor,
490
- patience=opt.reduce_on_plateau_patience)
491
- else:
492
- # optimizer = utils.build_optimizer(model.parameters(), opt)
493
- optimizer = utils.build_optimizer(parameters, opt)
494
- return [optimizer], []
495
-
496
- def optimizer_step(self, epoch, batch_idx, optimizer,
497
- optimizer_idx, *args, **kwargs):
498
- # warm up lr
499
- opt = self.opt
500
- iteration = self.trainer.global_step
501
- if opt.use_warmup and (iteration < opt.noamopt_warmup):
502
- opt.current_lr = opt.learning_rate * \
503
- (iteration+1) / opt.noamopt_warmup
504
- utils.set_lr(optimizer, opt.current_lr)
505
-
506
- super().optimizer_step(epoch, batch_idx, optimizer,
507
- optimizer_idx, *args, **kwargs)
508
-
509
- def state_dict(self):
510
- """
511
- Save the model state dict as well as opt and vocab
512
- """
513
- state_dict = self.model.state_dict()
514
- device = next(iter(state_dict.values())).device
515
- assert '_vocab' not in state_dict and '_opt' not in state_dict, 'Just in case'
516
- state_dict.update({
517
- '_vocab': utils.serialize_to_tensor(self.model.vocab).to(device),
518
- '_opt': utils.serialize_to_tensor(self.opt).to(device)
519
- })
520
- return state_dict
521
-
522
- def load_state_dict(self, state_dict=None, strict=True):
523
- if '_vocab' in state_dict:
524
- self.model.vocab = utils.deserialize(state_dict['_vocab'])
525
- del state_dict['_vocab']
526
- # elif strict:
527
- # raise KeyError
528
- if '_opt' in state_dict:
529
- saved_model_opt = utils.deserialize(state_dict['_opt'])
530
- del state_dict['_opt']
531
- opt = self.opt
532
- # Make sure the saved opt is compatible with the current opt
533
- need_be_same = ["caption_model",
534
- "rnn_type", "rnn_size", "num_layers"]
535
- for checkme in need_be_same:
536
- if getattr(saved_model_opt, checkme) in ['updown', 'topdown'] and \
537
- getattr(opt, checkme) in ['updown', 'topdown']:
538
- continue
539
- assert getattr(saved_model_opt, checkme) == getattr(
540
- opt, checkme), "Command line argument and saved model disagree on '%s' " % checkme
541
- # elif strict:
542
- # raise KeyError
543
- self.model.load_state_dict(state_dict, strict)
544
-
545
-
546
- class OnEpochStartCallback(pl.Callback):
547
-
548
- def on_epoch_start(self, trainer, pl_module):
549
- # Update lr/training stage/scheduled sampling prob etc.
550
- opt = pl_module.opt
551
- model = pl_module.model
552
- epoch = trainer.current_epoch
553
- optimizer = trainer.optimizers[0]
554
-
555
- if not opt.noamopt and not opt.reduce_on_plateau:
556
- # Assign the learning rate
557
- if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
558
- frac = (
559
- epoch - opt.learning_rate_decay_start) // opt.learning_rate_decay_every
560
- decay_factor = opt.learning_rate_decay_rate ** frac
561
- opt.current_lr = opt.learning_rate * decay_factor
562
- else:
563
- opt.current_lr = opt.learning_rate
564
- utils.set_lr(optimizer, opt.current_lr) # set the decayed rate
565
- # Assign the scheduled sampling prob
566
- if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
567
- frac = (
568
- epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every
569
- opt.ss_prob = min(opt.scheduled_sampling_increase_prob *
570
- frac, opt.scheduled_sampling_max_prob)
571
- model.ss_prob = opt.ss_prob
572
-
573
- # If start self critical training
574
- if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
575
- sc_flag = True
576
- init_scorer(opt.cached_tokens)
577
- else:
578
- sc_flag = False
579
-
580
- # If start structure loss training
581
- if opt.structure_after != -1 and epoch >= opt.structure_after:
582
- struc_flag = True
583
- init_scorer(opt.cached_tokens)
584
- else:
585
- struc_flag = False
586
-
587
- pl_module.struc_flag = struc_flag
588
- pl_module.sc_flag = sc_flag
589
-
590
-
591
- class ModelCheckpoint(pl.callbacks.ModelCheckpoint):
592
-
593
- def on_keyboard_interrupt(self, trainer, pl_module):
594
- # Save the model on keyboard interrupt
595
- filepath = os.path.join(self.dirpath, self.prefix + 'interrupt.ckpt')
596
- self._save_model(filepath)
597
-
598
-
599
- opt = opts.parse_opt()
600
-
601
- checkpoint_callback = ModelCheckpoint(
602
- filepath=opt.checkpoint_path,
603
- # dirpath=opt.checkpoint_path,
604
- save_last=True,
605
- save_top_k=1,
606
- verbose=True,
607
- # monitor='to_monitor',
608
- # monitor='val/to_monitor',
609
- monitor='val/CIDEr',
610
- mode='max',
611
- # prefix=opt.id+'_',
612
- prefix=opt.id,
613
- # filename=f'{opt.id}_',
614
- )
615
-
616
- verbose = True
617
- # import torch
618
- # if torch.cuda.current_device() in [0, -1]:
619
- if 'LOCAL_RANK' in os.environ and os.environ['LOCAL_RANK'] != '0':
620
- verbose = False
621
-
622
- if verbose:
623
- print(opt)
624
- print("""
625
- val_images_use,
626
- save_checkpoint_every,
627
- save_every_epoch,
628
- save_history_ckpt will be ignored.
629
- """)
630
-
631
- # Lightning defines batch size as batch size per gpu
632
- assert opt.batch_size % torch.cuda.device_count() == 0
633
- opt.batch_size = opt.batch_size // torch.cuda.device_count()
634
-
635
- # If resume from last checkpoint
636
- # if opt.start_from is not None and os.path.isfile(os.path.join(opt.start_from, f'{opt.id}_last.ckpt')):
637
- # resume_from = os.path.join(opt.start_from, f'{opt.id}_last.ckpt')
638
- if opt.start_from is not None:
639
- resume_from = os.path.join(opt.start_from, f'{opt.id}-last.ckpt')
640
- if os.path.isfile(resume_from):
641
- if verbose:
642
- print('Loading checkpoint from', resume_from)
643
- else:
644
- print("Checkpoint not found:", resume_from)
645
- resume_from = None
646
- else:
647
- resume_from = None
648
-
649
- from pytorch_lightning.loggers import WandbLogger
650
- wandb_logger = WandbLogger(
651
- project='CLIP-ViL-COCOCaption',
652
- name=opt.id,
653
- )
654
-
655
- if verbose:
656
- wandb_logger.experiment.config.update(opt)
657
- from pathlib import Path
658
- import glob
659
- import wandb
660
- # src_dir = Path(__file__).resolve().parent.parent
661
- glob_str = "**/*.py"
662
- base_path = './'
663
- wandb.save(glob_str=glob_str, base_path=base_path)
664
-
665
- # code = wandb.Artifact('project-source', type='code')
666
- # for path in glob.glob('**/*.py', recursive=True):
667
- # code.add_file(path, name='source/'+path)
668
- # print(path)
669
- # wandb.run.use_artifact(code)
670
-
671
-
672
-
673
-
674
- lit = LitModel(opt)
675
- # warning grad_clip_mode is ignored.
676
- trainer = pl.Trainer(
677
- callbacks=[
678
- OnEpochStartCallback(),
679
- # pl.callbacks.lr_logger.LearningRateLogger()
680
- pl.callbacks.LearningRateMonitor()
681
- ],
682
- default_root_dir=opt.checkpoint_path,
683
- resume_from_checkpoint=resume_from,
684
- distributed_backend='ddp',
685
- check_val_every_n_epoch=1,
686
- max_epochs=opt.max_epochs,
687
- gradient_clip_val=opt.grad_clip_value,
688
- gpus=torch.cuda.device_count(),
689
- checkpoint_callback=checkpoint_callback,
690
- log_gpu_memory='min_max',
691
- # log_save_interval=opt.losses_log_every,
692
- log_every_n_steps=opt.losses_log_every,
693
- profiler=True,
694
- # profiler='simple',
695
- # row_log_interval=10, # what is it?
696
- flush_logs_every_n_steps=10,
697
- num_sanity_val_steps=0,
698
- # val_check_interval=0.01,
699
- # limit_train_batches=500,
700
- # progress_bar_refresh_rate=0,
701
- # fast_dev_run=True,
702
- precision=opt.precision,
703
- logger=wandb_logger
704
- )
705
-
706
- if os.getenv('EVALUATE', '0') == '1':
707
- trainer.test(lit)
708
- else:
709
- trainer.fit(lit)