Upload 29 files
- README.md +6 -6
- app_mir.py +115 -0
- cache/cache.txt +0 -0
- configs/base.yml +23 -0
- configs/ek100_mir/egovpa.yml +39 -0
- configs/ek100_mir/zeroshot.yml +21 -0
- demo.py +461 -0
- lavila/data/datasets.py +542 -0
- lavila/data/video_transforms.py +186 -0
- lavila/models/bpe_simple_vocab_16e6.txt.gz +3 -0
- lavila/models/distributed_utils.py +89 -0
- lavila/models/models.py +252 -0
- lavila/models/openai_clip.py +237 -0
- lavila/models/openai_model.py +535 -0
- lavila/models/prompt_tuning.py +291 -0
- lavila/models/timesformer.py +650 -0
- lavila/models/tokenizer.py +239 -0
- lavila/models/utils.py +110 -0
- lavila/utils/config.py +18 -0
- lavila/utils/evaluation.py +36 -0
- lavila/utils/evaluation_charades.py +56 -0
- lavila/utils/evaluation_ek100mir.py +201 -0
- lavila/utils/preprocess.py +86 -0
- meta/ek100_mir/EPIC_100_retrieval_test_sentence.csv +0 -0
- meta/ek100_mir/relevancy_sel_t2v.npy +3 -0
- meta/ek100_mir/relevancy_sel_v2t.npy +3 -0
- meta/ek100_mir/sel_t2v.csv +10 -0
- meta/ek100_mir/sel_v2t.csv +3 -0
- requirements.txt +14 -0
README.md
CHANGED
@@ -1,13 +1,13 @@
 ---
-title:
+title: Ego VPA
-emoji:
+emoji: 📉
-colorFrom:
+colorFrom: blue
-colorTo:
+colorTo: blue
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.17.0
 app_file: app.py
 pinned: false
-license:
+license: apache-2.0
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app_mir.py
ADDED
@@ -0,0 +1,115 @@
### app_mir.py
# User interface for the demo.
###

import os, pdb
import pandas as pd
import gradio as gr
from gradio_rich_textbox import RichTextbox

from demo import VideoMIRModel


def load_v2t_samples(data_root):
    sample_videos = []
    df = pd.read_csv("meta/ek100_mir/sel_v2t.csv", header=None)
    idx2sid = {}
    for i, x in enumerate(df[0].values):
        sample_videos.append(f'{data_root}/video/gif/{x}.gif')
        idx2sid[i] = x

    return sample_videos, idx2sid

def load_t2v_samples(data_root):
    sample_text = ['cut the sausage', 'stir vegetables into salmon', 'rinse cutting board']
    idx2sid = {0: 2119, 1: 1730, 2: 1276}

    return sample_text, idx2sid


def format_pred(pred, gt):
    tp = '[color=green]{}[/color]'
    fp = '[color=red]{}[/color]'
    fmt_pred = []
    for x in pred:
        if x in gt:
            fmt_pred.append(tp.format(x))
        else:
            fmt_pred.append(fp.format(x))

    return ', '.join(fmt_pred)

def main():
    lavila = VideoMIRModel("configs/ek100_mir/zeroshot.yml")
    egovpa = VideoMIRModel("configs/ek100_mir/egovpa.yml")
    v2t_samples, idx2sid_v2t = load_v2t_samples('data/ek100_mir')
    t2v_samples, idx2sid_t2v = load_t2v_samples('data/ek100_mir')
    print(v2t_samples)

    def predict_v2t(idx):
        sid = idx2sid_v2t[idx]
        zeroshot_action, gt_action = lavila.predict_v2t(idx, sid)
        egovpa_action, gt_action = egovpa.predict_v2t(idx, sid)
        zeroshot_action = format_pred(zeroshot_action, gt_action)
        egovpa_action = format_pred(egovpa_action, gt_action)

        return gt_action, zeroshot_action, egovpa_action

    def predict_t2v(idx):
        sid = idx2sid_t2v[idx]
        zeroshot_video, gt_video = lavila.predict_t2v(idx, sid)
        egovpa_video, gt_video = egovpa.predict_t2v(idx, sid)

        return gt_video, zeroshot_video, egovpa_video

    with gr.Blocks() as demo:
        with gr.Tab("Video-to-text retrieval"):
            gr.Markdown(
                """
                # Ego-VPA Demo
                Choose a sample video and click predict to view the text queried by the selected video
                (<span style="color:green">correct</span>/<span style="color:red">incorrect</span>).
                """
            )

            with gr.Row():
                with gr.Column():
                    video = gr.Image(label="video query", height='300px', interactive=False)
                with gr.Column():
                    idx = gr.Number(label="Idx", visible=False)
                    label = RichTextbox(label="Ground Truth", visible=False)
                    zeroshot = RichTextbox(label="LaViLa (zero-shot) prediction")
                    ours = RichTextbox(label="Ego-VPA prediction")
                    btn = gr.Button("Predict", variant="primary")
                    btn.click(predict_v2t, inputs=[idx], outputs=[label, zeroshot, ours])
            gr.Examples(examples=[[i, x] for i, x in enumerate(v2t_samples)], inputs=[idx, video])

        with gr.Tab("Text-to-video retrieval"):
            gr.Markdown(
                """
                # Ego-VPA Demo
                Choose a sample narration and click predict to view the video queried by the selected text.
                """
            )

            with gr.Row():
                with gr.Column():
                    text = gr.Text(label="text query")
                with gr.Column():
                    idx = gr.Number(label="Idx", visible=False)
                    zeroshot = gr.Textbox(label="LaViLa (zero-shot) prediction")
                    #zeroshot = gr.Gallery(label="LaViLa (zero-shot) prediction", columns=[3], rows=[1], object_fit="contain", height="auto")
                    ours = gr.Textbox(label="Ego-VPA prediction")
                    #ours = gr.Gallery(label="Ego-VPA prediction", columns=[3], rows=[1], object_fit="contain", height="auto")
                    btn = gr.Button("Predict", variant="primary")
                    btn.click(predict_t2v, inputs=[idx], outputs=[label, zeroshot, ours])
            gr.Examples(examples=[[i, x] for i, x in enumerate(t2v_samples)], inputs=[idx, text])


    demo.launch(share=True)


if __name__ == "__main__":
    main()
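A quick sketch of what format_pred above produces (the inputs here are made-up narrations, not rows from sel_v2t.csv): predictions found in the ground truth are wrapped in green color tags and the rest in red, which RichTextbox renders in the UI.

    pred = ['cut the sausage', 'open the fridge']
    gt = ['cut the sausage']
    print(format_pred(pred, gt))
    # -> [color=green]cut the sausage[/color], [color=red]open the fridge[/color]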
cache/cache.txt
ADDED
File without changes
configs/base.yml
ADDED
@@ -0,0 +1,23 @@
model:
  pretrain: ""
  resume: ""
  timesformer_freeze_space: false
  drop_path_rate: 0.1
  dropout_ratio: 0.5
  freeze_vis_backbone: false
  freeze_txt_backbone: false
  use_vn_classifier: false

data:
  dataset: ek100_mir
  root: datasets/EK100/video_ht256px
  metadata: datasets/EK100/epic-kitchens-100-annotations/retrieval_annotations/EPIC_100_retrieval_train.csv
  metadata_val: datasets/EK100/epic-kitchens-100-annotations/retrieval_annotations/EPIC_100_retrieval_test.csv
  relevancy_path: datasets/EK100/epic-kitchens-100-annotations/retrieval_annotations/relevancy/caption_relevancy_EPIC_100_retrieval_test.pkl
  clip_length: 16
  clip_stride: 4
  sparse_sample: false
  num_crops: 1
  num_clips: 1
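The per-task configs below override fields in this base file. lavila/utils/config.py (part of this upload but not shown here) provides load_cfg; a minimal sketch of such an overlay loader, assuming PyYAML and a simple section-wise merge, might look like the following. The actual implementation may differ.

    import yaml

    def load_cfg_sketch(task_yml, base_yml="configs/base.yml"):
        # Hypothetical loader: the real load_cfg in lavila/utils/config.py may differ.
        with open(base_yml) as f:
            cfg = yaml.safe_load(f) or {}
        with open(task_yml) as f:
            task = yaml.safe_load(f) or {}
        for section, values in task.items():
            if isinstance(values, dict):
                cfg.setdefault(section, {}).update(values)  # task values override base defaults
            else:
                cfg[section] = values
        return cfg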
configs/ek100_mir/egovpa.yml
ADDED
@@ -0,0 +1,39 @@
model:
  pretrain: ../ckpt/ek100mir.pt
  freeze_vis_backbone: true
  freeze_txt_backbone: true
  inflat_posemb: true  # false for cascade models; true for single-stage models (default: true)
  num_frames: 16
  text_prompt:
    n_ctx: 8
    use_bank: true
  visual_prompt:
    num_layers: 12
    prompt_dim: 512
    num_tokens: 128
    deep: true
    deep_shared: false
    split_st: false
    pt_spt: true
    pt_tmp: false
    style: VoP_c_pool
    n_seg: 16  # number of segments per video (n_seg=clip_length -> 1 frame/seg)
    K_s: 8  # boundary of intra-frame/inter-frame attention (VoP_f+c)
  pool:
    size: 10


data:
  dataset: ek100_mir
  #root: /data/EK100/video_ht256px
  #metadata: /data/EK100/epic-kitchens-100-annotations/retrieval_annotations/EPIC_100_retrieval_train.csv
  #metadata_val: /data/EK100/epic-kitchens-100-annotations/retrieval_annotations/EPIC_100_retrieval_test.csv
  #relevancy_path: /data/EK100/epic-kitchens-100-annotations/retrieval_annotations/relevancy/caption_relevancy_EPIC_100_retrieval_test.pkl
  root: data/ek100_mir/video
  metadata_val: data/ek100_mir/csv/{}.csv
  relevancy_path: meta/ek100_mir/relevancy_sel.npy
  narrations: meta/ek100_mir/EPIC_100_retrieval_test_sentence.csv
  clip_length: 16
configs/ek100_mir/zeroshot.yml
ADDED
@@ -0,0 +1,21 @@
model:
  pretrain: /store/nosnap/results/LaViLa/checkpoints/pt/TSF-B/lavila_best.pth
  freeze_vis_backbone: true
  freeze_txt_backbone: true
  inflat_posemb: true  # false for cascade models; true for single-stage models (default: true)
  num_frames: 16

data:
  dataset: ek100_mir
  #root: /data/EK100/video_ht256px
  #metadata: /data/EK100/epic-kitchens-100-annotations/retrieval_annotations/EPIC_100_retrieval_train.csv
  #metadata_val: /data/EK100/epic-kitchens-100-annotations/retrieval_annotations/EPIC_100_retrieval_test.csv
  #relevancy_path: /data/EK100/epic-kitchens-100-annotations/retrieval_annotations/relevancy/caption_relevancy_EPIC_100_retrieval_test.pkl
  root: data/ek100_mir/video
  metadata_val: data/ek100_mir/csv/{}.csv
  relevancy_path: meta/ek100_mir/relevancy_sel.npy
  narrations: meta/ek100_mir/EPIC_100_retrieval_test_sentence.csv
  clip_length: 16
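Both demo configs point metadata_val at data/ek100_mir/csv/{}.csv; VideoMIRModel.load_data in demo.py below fills the {} placeholder with the selected sample id, so each retrieval query reads its own small CSV. For example (the sample id 2119 comes from load_t2v_samples in app_mir.py):

    data_cfg = {'metadata_val': 'data/ek100_mir/csv/{}.csv'}
    metadata_val = data_cfg['metadata_val'].format(2119)
    # -> 'data/ek100_mir/csv/2119.csv'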
demo.py
ADDED
@@ -0,0 +1,461 @@
### demo.py
# Define model classes for inference.
###

import argparse
from collections import OrderedDict
import json
import numpy as np
import os
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torchvision.transforms as transforms
import torchvision.transforms._transforms_video as transforms_video
from sklearn.metrics import confusion_matrix

from lavila.data import datasets
from lavila.data.video_transforms import Permute, SpatialCrop, TemporalCrop
from lavila.models import models
from lavila.models.tokenizer import (MyBertTokenizer, MyDistilBertTokenizer, MyGPT2Tokenizer, SimpleTokenizer)
from lavila.models.utils import inflate_positional_embeds
from lavila.utils.config import load_cfg
from lavila.utils.evaluation_charades import charades_map
from lavila.utils.evaluation import get_mean_accuracy
from lavila.utils.evaluation_ek100mir import (calculate_k_counts, calculate_IDCG, calculate_mAP, calculate_nDCG)


class VideoModel(nn.Module):
    """ Base model for video understanding based on LaViLa architecture. """
    def __init__(self, config):
        """ Initializes the model.
        Parameters:
            config: config file
        """
        super(VideoModel, self).__init__()
        self.cfg = load_cfg(config)
        self.model = self.build_model()
        self.tokenizer = self.get_tokenizer()
        self.templates = ['{}']
        self.dataset = self.cfg['data']['dataset']
        self.eval()

    def build_model(self):
        cfg = self.cfg
        if cfg['model'].get('pretrain', False):
            ckpt_path = cfg['model']['pretrain']
        else:
            raise Exception('no checkpoint found')
        ckpt = torch.load(ckpt_path, map_location='cpu')

        state_dict = OrderedDict()
        for k, v in ckpt['state_dict'].items():
            state_dict[k.replace('module.', '')] = v

        old_args = vars(ckpt['args'])
        arch = old_args.get('model', 'CLIP_OPENAI_TIMESFORMER_BASE')
        self.arch = arch
        cfg['model']['arch'] = arch
        cfg['model']['norm_embed'] = old_args.get('norm_embed', True)
        print("=> creating model: {}".format(arch))
        model = getattr(models, arch)(
            pretrained=old_args.get('load_visual_pretrained', None),
            pretrained2d=old_args.get('load_visual_pretrained', None) is not None,
            text_use_cls_token=old_args.get('use_cls_token', False),
            project_embed_dim=old_args.get('project_embed_dim', 256),
            timesformer_gated_xattn=False,
            num_frames=cfg['model'].get('num_frames', cfg['data']['clip_length']),
            model_cfg=cfg['model']
        )
        model.logit_scale.requires_grad = False

        if torch.cuda.is_available():
            model.cuda()

        if ('TIMESFORMER' in arch or 'EGOVLP' in arch) and cfg['model'].get('inflat_posemb', True):
            # inflate weight
            print('=> inflating PE in models due to different frame numbers')
            state_dict = inflate_positional_embeds(
                model.state_dict(), state_dict,
                num_frames=cfg['model'].get('num_frames', cfg['data']['clip_length']),
                load_temporal_fix='bilinear',
            )
        model.load_state_dict(state_dict, strict=True)
        print("=> loaded resume checkpoint '{}' (epoch {})".format(ckpt_path, ckpt['epoch']))

        return model

    def eval(self):
        cudnn.benchmark = True
        for p in self.model.parameters():
            p.requires_grad = False
        self.model.eval()

    def get_tokenizer(self):
        arch = self.arch
        if arch.endswith('DISTILBERT_BASE'):
            tokenizer = MyDistilBertTokenizer('distilbert-base-uncased')
        elif arch.endswith('BERT_BASE'):
            tokenizer = MyBertTokenizer('bert-base-uncased')
        elif arch.endswith('BERT_LARGE'):
            tokenizer = MyBertTokenizer('bert-large-uncased')
        elif arch.endswith('GPT2'):
            tokenizer = MyGPT2Tokenizer('gpt2')
        elif arch.endswith('GPT2_MEDIUM'):
            tokenizer = MyGPT2Tokenizer('gpt2-medium')
        elif arch.endswith('GPT2_LARGE'):
            tokenizer = MyGPT2Tokenizer('gpt2-large')
        elif arch.endswith('GPT2_XL'):
            tokenizer = MyGPT2Tokenizer('gpt2-xl')
        else:
            print("Using SimpleTokenizer because of model '{}'. "
                  "Please check if this is what you want".format(arch))
            tokenizer = SimpleTokenizer()

        return tokenizer


class VideoCLSModel(VideoModel):
    """ Video model for video classification tasks (Charades-Ego, EGTEA). """
    def __init__(self, config):
        super(VideoCLSModel, self).__init__(config)
        self.labels, self.mapping_vn2act = self.gen_label_map()
        self.text_features = self.get_text_features()

    def gen_label_map(self):
        labelmap = self.cfg.get('label_map', 'meta/charades_ego/label_map.json')
        if os.path.isfile(labelmap):
            print(f"=> Loading label maps from {labelmap}")
            meta = json.load(open(labelmap, 'r'))
            labels, mapping_vn2act = meta['labels'], meta['mapping_vn2act']
        else:
            from lavila.utils.preprocess import generate_label_map
            labels, mapping_vn2act = generate_label_map(self.dataset)
            meta = {'labels': labels, 'mapping_vn2act': mapping_vn2act}
            meta_dir = f'meta/{self.dataset}'
            if not os.path.exists(meta_dir):
                os.makedirs(meta_dir)
            json.dump(meta, open(f'{meta_dir}/label_map.json', 'w'))
            print(f"=> Label map is generated and saved to {meta_dir}/label_map.json")

        return labels, mapping_vn2act

    def load_data(self, idx=None):
        print(f"=> Creating dataset")
        cfg, dataset = self.cfg, self.dataset
        data_cfg = cfg['data']
        crop_size = 224 if '336PX' not in self.arch else 336
        val_transform = transforms.Compose([
            Permute([3, 0, 1, 2]),  # T H W C -> C T H W
            transforms.Resize(crop_size),
            transforms.CenterCrop(crop_size),
            transforms_video.NormalizeVideo(mean=[108.3272985, 116.7460125, 104.09373615000001], std=[68.5005327, 66.6321579, 70.32316305]),
        ])

        if idx is None:
            metadata_val = data_cfg['metadata_val']
        else:
            metadata_val = data_cfg['metadata_val'].format(idx)
        if dataset in ['charades_ego', 'egtea']:
            val_dataset = datasets.VideoClassyDataset(
                dataset, data_cfg['root'], metadata_val,
                transform=val_transform, is_training=False,
                label_mapping=self.mapping_vn2act, is_trimmed=False,
                num_clips=1, clip_length=data_cfg['clip_length'], clip_stride=data_cfg['clip_stride'],
                sparse_sample=data_cfg['sparse_sample']
            )
        else:
            raise NotImplementedError

        val_loader = torch.utils.data.DataLoader(
            val_dataset, batch_size=8, shuffle=False,
            num_workers=4, pin_memory=True, sampler=None, drop_last=False
        )

        return val_loader

    @torch.no_grad()
    def get_text_features(self):
        print('=> Extracting text features')
        text_features = []
        for label in self.labels:
            if isinstance(label, list):
                texts = [tmpl.format(lbl) for tmpl in self.templates for lbl in label]
            else:
                texts = [tmpl.format(label) for tmpl in self.templates]
            texts = self.tokenizer(texts)
            if isinstance(texts, tuple):
                # Bert-style tokenizer will output both ids and mask
                texts, masks = texts
                texts = texts.cuda(non_blocking=True)
                masks = masks.cuda(non_blocking=True)
            else:
                texts = texts.cuda(non_blocking=True)
                masks = None
            texts = texts.view(-1, 77).contiguous()
            masks = masks.view(-1, 77).contiguous() if masks is not None else None
            if masks is not None:
                class_embeddings, _ = self.model.encode_text(texts, attention_mask=masks)
            else:
                class_embeddings, _ = self.model.encode_text(texts)
            class_embeddings = class_embeddings / class_embeddings.norm(dim=-1, keepdim=True)
            class_embeddings = class_embeddings.mean(dim=0)
            class_embeddings = class_embeddings / class_embeddings.norm(dim=-1, keepdim=True)

            text_features.append(class_embeddings)
        text_features = torch.stack(text_features, dim=0)

        return text_features

    @torch.no_grad()
    def forward(self, idx=None):
        print('=> Start forwarding')
        val_loader = self.load_data(idx)
        all_outputs = []
        all_targets = []
        for i, values in enumerate(val_loader):
            images = values[0]
            target = values[1]

            images = images.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

            # encode images
            image_features, _ = self.model.encode_image(images)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            # cosine similarity as logits
            logits_per_image = image_features @ self.text_features.t()
            logits_per_image = torch.softmax(logits_per_image, dim=1)

            all_outputs.append(logits_per_image.cpu())
            all_targets.append(target.cpu())

        all_outputs = torch.cat(all_outputs)
        all_targets = torch.cat(all_targets)

        return all_outputs, all_targets

    @torch.no_grad()
    def predict(self, idx=0):
        all_outputs, all_targets = self.forward(idx)
        preds, targets = all_outputs.numpy(), all_targets.numpy()
        sel = np.where(np.cumsum(sorted(preds[0].tolist(), reverse=True)) > 0.055)[0][0]
        #sel = 5
        df = pd.DataFrame(self.labels)
        pred_action = df.iloc[preds[0].argsort()[-sel:]].values.tolist()
        gt_action = df.iloc[np.where(targets[0])[0]].values.tolist()
        pred_action = sorted([x[0] for x in pred_action])
        gt_action = sorted([x[0] for x in gt_action])
        return pred_action, gt_action

    @torch.no_grad()
    def evaluate(self):
        all_outputs, all_targets = self.forward()
        preds, targets = all_outputs.numpy(), all_targets.numpy()
        if self.dataset == 'charades_ego':
            m_ap, _, m_aps = charades_map(preds, targets)
            print('mAP = {:.3f}'.format(m_ap))
        elif self.dataset == 'egtea':
            cm = confusion_matrix(targets, preds.argmax(axis=1))
            mean_class_acc, acc = get_mean_accuracy(cm)
            print('Mean Acc. = {:.3f}, Top-1 Acc. = {:.3f}'.format(mean_class_acc, acc))
        else:
            raise NotImplementedError


class VideoMIRModel(VideoModel):
    """ Video model for video multi-instance retrieval tasks (EK100_MIR). """
    def __init__(self, config):
        super(VideoMIRModel, self).__init__(config)
        self.narrations = pd.read_csv(self.cfg['data']['narrations']).values[:, 1]
        self.text_features = self.get_text_features()
        self.video_samples = pd.read_csv('meta/ek100_mir/sel_t2v.csv').values[:, 0]

    def load_data(self, idx=None, t2v=False):
        print(f"=> Creating dataset")
        cfg, dataset = self.cfg, self.dataset
        data_cfg = cfg['data']
        crop_size = 224 if '336PX' not in self.arch else 336
        val_transform = transforms.Compose([
            Permute([3, 0, 1, 2]),  # T H W C -> C T H W
            transforms.Resize(crop_size),
            transforms.CenterCrop(crop_size),
            transforms_video.NormalizeVideo(mean=[108.3272985, 116.7460125, 104.09373615000001], std=[68.5005327, 66.6321579, 70.32316305]),
        ])

        if dataset == 'ek100_mir':
            if t2v:
                metadata_val = 'meta/ek100_mir/sel_t2v.csv'
                self.relevancy_mat_v2t = np.load(data_cfg['relevancy_path'].replace('sel', 'sel_v2t'))
                self.relevancy_mat_t2v = np.load(data_cfg['relevancy_path'].replace('sel', 'sel_t2v'))
                val_dataset = datasets.VideoCaptionDatasetCLIP(
                    'ek100_mir_demo', data_cfg['root'], metadata_val, val_transform,
                    is_training=False, tokenizer=self.tokenizer,
                    clip_length=data_cfg['clip_length'], clip_stride=data_cfg['clip_stride']
                )
            elif idx is None:
                metadata_val = data_cfg['metadata_val']
                val_dataset = datasets.get_dataset(val_transform, self.tokenizer, cfg, is_training=False)
            else:
                metadata_val = data_cfg['metadata_val'].format(idx)
                self.relevancy_mat_v2t = np.load(data_cfg['relevancy_path'].replace('sel', 'sel_v2t'))
                self.relevancy_mat_t2v = np.load(data_cfg['relevancy_path'].replace('sel', 'sel_t2v'))
                val_dataset = datasets.VideoCaptionDatasetCLIP(
                    'ek100_mir_demo', data_cfg['root'], metadata_val, val_transform,
                    is_training=False, tokenizer=self.tokenizer,
                    clip_length=data_cfg['clip_length'], clip_stride=data_cfg['clip_stride']
                )
        else:
            raise NotImplementedError

        val_loader = torch.utils.data.DataLoader(
            val_dataset, batch_size=8, shuffle=False,
            num_workers=4, pin_memory=True, sampler=None, drop_last=False
        )

        return val_loader

    @torch.no_grad()
    def get_text_features(self):
        print('=> Extracting text features')
        text_features = []
        for text in self.narrations:
            text = self.tokenizer(text)
            text = text.cuda(non_blocking=True)
            text = text.view(-1, 77).contiguous()
            text_embed, _ = self.model.encode_text(text)
            text_embed = F.normalize(text_embed, dim=-1).squeeze()
            text_features.append(text_embed)

        text_features = torch.stack(text_features, dim=0)

        return text_features

    @torch.no_grad()
    def forward_video(self, text_features=None, idx=None, t2v=False):
        print('=> Start forwarding')
        if t2v:
            val_loader = self.load_data(t2v=t2v)
        else:
            val_loader = self.load_data(idx=idx)
        all_outputs = []
        for i, values in enumerate(val_loader):
            images = values[0].cuda(non_blocking=True)

            # encode images
            image_features, _ = self.model.encode_image(images)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            if t2v:
                all_outputs.append(image_features)
            else:
                # cosine similarity as logits
                logits_per_image = image_features @ text_features.t()
                logits_per_image = torch.softmax(logits_per_image, dim=1)
                all_outputs.append(logits_per_image.cpu())

        all_outputs = torch.cat(all_outputs)
        if t2v:
            all_outputs = torch.softmax(text_features @ all_outputs.t(), dim=1).cpu()

        return all_outputs

    @torch.no_grad()
    def predict_v2t(self, idx=0, sid=0):
        all_outputs = self.forward_video(self.text_features, sid)
        preds = all_outputs.numpy()
        relevancy = self.relevancy_mat_v2t[idx]
        sel = 3
        pred_action = self.narrations[(-preds[0]).argsort()[:sel]]
        gt_action = self.narrations[np.where(relevancy == 1)[0]]
        return pred_action, gt_action

    @torch.no_grad()
    def predict_t2v(self, idx=0, sid=0):
        text_features = self.text_features[sid].unsqueeze(0)
        all_outputs = self.forward_video(text_features, t2v=True)
        preds = all_outputs.numpy()
        relevancy = self.relevancy_mat_t2v[idx]
        sel = 3
        pred_video = self.video_samples[(-preds[0]).argsort()[:sel]]
        gt_video = np.where(relevancy == 1)[0]
        return pred_video, gt_video

    @torch.no_grad()
    def evaluate(self):
        val_loader = self.load_data()
        cfg, dataset = self.cfg, self.dataset
        if self.dataset == 'ek100_mir':
            all_video_embed = []
            all_text_embed = []
            for i, inputs in enumerate(val_loader):
                inputs = [tensor.cuda(non_blocking=True) for tensor in inputs]
                relevancies = inputs.pop()

                # compute output
                outputs = self.model(
                    *inputs,
                    use_checkpoint=True,
                    norm_embed=cfg['model']['norm_embed']
                )

                image_features = outputs['image_embed']
                text_features = outputs['text_embed']
                all_video_embed.append(image_features.cpu().numpy())
                all_text_embed.append(text_features.cpu().numpy())

            all_text_embed = np.vstack(all_text_embed)
            all_video_embed = np.vstack(all_video_embed)
            similarity_matrix = np.matmul(all_video_embed, all_text_embed.T)
            similarity_matrix = (similarity_matrix + 1) / 2
            video_id = pd.read_csv(cfg['data']['metadata'].replace('train', 'test')).values[:, 0]
            text_id = pd.read_csv(cfg['data']['metadata'].replace('train', 'test_sentence')).values[:, 0]
            indexes = [video_id.tolist().index(elem) for elem in text_id]
            similarity_matrix = similarity_matrix[:, indexes]
            print(similarity_matrix.shape)
            rel_matrix = pd.read_pickle(
                cfg['data']['relevancy_path']
            )
            vis_map = calculate_mAP(similarity_matrix, rel_matrix)
            txt_map = calculate_mAP(similarity_matrix.T, rel_matrix.T)
            avg_map = (vis_map + txt_map) / 2
            print('mAP: V->T: {:.3f} T->V: {:.3f} AVG: {:.3f}'.format(vis_map, txt_map, avg_map))
            vis_k_counts = calculate_k_counts(rel_matrix)
            txt_k_counts = calculate_k_counts(rel_matrix.T)
            vis_IDCG = calculate_IDCG(rel_matrix, vis_k_counts)
            txt_IDCG = calculate_IDCG(rel_matrix.T, txt_k_counts)
            vis_nDCG = calculate_nDCG(similarity_matrix, rel_matrix, k_counts=vis_k_counts, IDCG=vis_IDCG)
            txt_nDCG = calculate_nDCG(similarity_matrix.T, rel_matrix.T, k_counts=txt_k_counts, IDCG=txt_IDCG)
            avg_nDCG = (vis_nDCG + txt_nDCG) / 2
            print('nDCG: V->T: {:.3f} T->V: {:.3f} AVG: {:.3f}'.format(vis_nDCG, txt_nDCG, avg_nDCG))

        else:
            raise NotImplementedError


def main():
    parser = argparse.ArgumentParser(description='Ego-VPA inference', add_help=False)
    parser.add_argument('--dataset',
                        default='charades_ego',
                        type=str, help='charades_ego/ek100_mir')
    args = parser.parse_args()

    if args.dataset in ['charades_ego']:
        lavila = VideoCLSModel(f"configs/{args.dataset}/zeroshot.yml")
        egovpa = VideoCLSModel(f"configs/{args.dataset}/egovpa.yml")
    elif args.dataset == 'ek100_mir':
        #lavila = VideoMIRModel(f"configs/{args.dataset}/zeroshot.yml")
        egovpa = VideoMIRModel(f"configs/{args.dataset}/egovpa.yml")
    else:
        raise NotImplementedError

    #lavila.evaluate()
    #egovpa.evaluate()
    egovpa.predict_t2v(idx=0, sid=2119)


if __name__ == '__main__':
    main()
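As a standalone script, demo.py exposes a single --dataset flag (default charades_ego). A programmatic usage sketch mirroring main() above, assuming the checkpoint paths referenced in the YAML configs exist locally:

    # Sketch only: builds the Ego-VPA retrieval model and runs the same call as main().
    from demo import VideoMIRModel

    egovpa = VideoMIRModel("configs/ek100_mir/egovpa.yml")
    pred_video, gt_video = egovpa.predict_t2v(idx=0, sid=2119)  # top-3 clips for the text query with sid 2119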
lavila/data/datasets.py
ADDED
@@ -0,0 +1,542 @@
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
|
4 |
+
# This source code is licensed under the license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
|
7 |
+
import csv
|
8 |
+
import glob
|
9 |
+
import json
|
10 |
+
import numpy as np
|
11 |
+
import os.path as osp
|
12 |
+
import pickle
|
13 |
+
import random
|
14 |
+
|
15 |
+
import decord
|
16 |
+
import pandas as pd
|
17 |
+
import torch
|
18 |
+
|
19 |
+
|
20 |
+
def datetime2sec(str):
|
21 |
+
hh, mm, ss = str.split(':')
|
22 |
+
return int(hh) * 3600 + int(mm) * 60 + float(ss)
|
23 |
+
|
24 |
+
|
25 |
+
def video_loader(root, vid, second, end_second=None, chunk_len=300, fps=30, clip_length=32, jitter=False):
|
26 |
+
if chunk_len == -1:
|
27 |
+
vr = decord.VideoReader(osp.join(root, '{}.mp4'.format(vid)))
|
28 |
+
second_offset = second
|
29 |
+
if end_second is not None:
|
30 |
+
end_second = min(end_second, len(vr) / vr.get_avg_fps())
|
31 |
+
else:
|
32 |
+
end_second = len(vr) / vr.get_avg_fps()
|
33 |
+
else:
|
34 |
+
chunk_start = int(second) // chunk_len * chunk_len
|
35 |
+
second_offset = second - chunk_start
|
36 |
+
vr = decord.VideoReader(osp.join(root, '{}.mp4'.format(vid), '{}.mp4'.format(chunk_start)))
|
37 |
+
if fps == -1:
|
38 |
+
fps = vr.get_avg_fps()
|
39 |
+
|
40 |
+
# calculate frame_ids
|
41 |
+
frame_offset = int(np.round(second_offset * fps))
|
42 |
+
total_duration = max(int((end_second - second) * fps), clip_length)
|
43 |
+
if chunk_len == -1:
|
44 |
+
if end_second <= second:
|
45 |
+
raise ValueError("end_second should be greater than second")
|
46 |
+
else:
|
47 |
+
frame_ids = get_frame_ids(frame_offset, min(frame_offset + total_duration, len(vr)), num_segments=clip_length, jitter=jitter)
|
48 |
+
else:
|
49 |
+
frame_ids = get_frame_ids(frame_offset, frame_offset + total_duration, num_segments=clip_length, jitter=jitter)
|
50 |
+
|
51 |
+
# load frames
|
52 |
+
if max(frame_ids) < len(vr):
|
53 |
+
try:
|
54 |
+
frames = vr.get_batch(frame_ids).asnumpy()
|
55 |
+
except decord.DECORDError as error:
|
56 |
+
print(error)
|
57 |
+
frames = vr.get_batch([0] * len(frame_ids)).asnumpy()
|
58 |
+
else:
|
59 |
+
# find the remaining frames in the next chunk
|
60 |
+
try:
|
61 |
+
frame_ids_part1 = list(filter(lambda frame_id: frame_id < len(vr), frame_ids))
|
62 |
+
frames_part1 = vr.get_batch(frame_ids_part1).asnumpy()
|
63 |
+
vr2 = decord.VideoReader(osp.join(root, '{}.mp4'.format(vid), '{}.mp4'.format(chunk_start + chunk_len)))
|
64 |
+
frame_ids_part2 = list(filter(lambda frame_id: frame_id >= len(vr), frame_ids))
|
65 |
+
frame_ids_part2 = [min(frame_id % len(vr), len(vr2) - 1) for frame_id in frame_ids_part2]
|
66 |
+
frames_part2 = vr2.get_batch(frame_ids_part2).asnumpy()
|
67 |
+
frames = np.concatenate([frames_part1, frames_part2], axis=0)
|
68 |
+
# the next chunk does not exist; the current chunk is the last one
|
69 |
+
except (RuntimeError, decord.DECORDError) as error:
|
70 |
+
print(error)
|
71 |
+
frame_ids = get_frame_ids(min(frame_offset, len(vr) - 1), len(vr), num_segments=clip_length, jitter=jitter)
|
72 |
+
frames = vr.get_batch(frame_ids).asnumpy()
|
73 |
+
|
74 |
+
frames = [torch.tensor(frame, dtype=torch.float32) for frame in frames]
|
75 |
+
return torch.stack(frames, dim=0)
|
76 |
+
|
77 |
+
|
78 |
+
def get_frame_ids(start_frame, end_frame, num_segments=32, jitter=True):
|
79 |
+
seg_size = float(end_frame - start_frame - 1) / num_segments
|
80 |
+
seq = []
|
81 |
+
for i in range(num_segments):
|
82 |
+
start = int(np.round(seg_size * i) + start_frame)
|
83 |
+
end = int(np.round(seg_size * (i + 1)) + start_frame)
|
84 |
+
end = min(end, end_frame)
|
85 |
+
if jitter:
|
86 |
+
frame_id = np.random.randint(low=start, high=(end + 1))
|
87 |
+
else:
|
88 |
+
frame_id = (start + end) // 2
|
89 |
+
seq.append(frame_id)
|
90 |
+
return seq
|
91 |
+
|
92 |
+
|
93 |
+
def video_loader_by_frames(root, vid, frame_ids):
|
94 |
+
vr = decord.VideoReader(osp.join(root, vid))
|
95 |
+
try:
|
96 |
+
frames = vr.get_batch(frame_ids).asnumpy()
|
97 |
+
frames = [torch.tensor(frame, dtype=torch.float32) for frame in frames]
|
98 |
+
except (IndexError, decord.DECORDError) as error:
|
99 |
+
print(error)
|
100 |
+
print("Erroneous video: ", vid)
|
101 |
+
frames = [torch.zeros((240, 320, 3)) for _ in range(len(frame_ids))]
|
102 |
+
return torch.stack(frames, dim=0)
|
103 |
+
|
104 |
+
|
105 |
+
class VideoCaptionDatasetBase(torch.utils.data.Dataset):
|
106 |
+
def __init__(self, dataset, root, metadata, is_trimmed=True):
|
107 |
+
self.dataset = dataset
|
108 |
+
self.root = root
|
109 |
+
self.is_trimmed = is_trimmed
|
110 |
+
|
111 |
+
if self.dataset == 'ego4d':
|
112 |
+
with open(metadata, 'rb') as f:
|
113 |
+
self.samples = pickle.load(f)
|
114 |
+
elif self.dataset == 'ego4d_mcq':
|
115 |
+
with open(metadata, 'r') as f:
|
116 |
+
self.samples = json.load(f)
|
117 |
+
elif self.dataset in ['ek100_cls', 'ek100_mir']:
|
118 |
+
video_list = glob.glob(osp.join(self.root, '*/*.MP4'))
|
119 |
+
fps_dict = {video: decord.VideoReader(video).get_avg_fps() for video in video_list}
|
120 |
+
self.samples = []
|
121 |
+
with open(metadata) as f:
|
122 |
+
csv_reader = csv.reader(f)
|
123 |
+
_ = next(csv_reader) # skip the header
|
124 |
+
for row in csv_reader:
|
125 |
+
pid, vid = row[1:3]
|
126 |
+
# start_frame, end_frame = int(row[6]), int(row[7])
|
127 |
+
# Deprecated: some videos might have fps mismatch issue
|
128 |
+
start_timestamp, end_timestamp = datetime2sec(row[4]), datetime2sec(row[5])
|
129 |
+
narration = row[8]
|
130 |
+
verb, noun = int(row[10]), int(row[12])
|
131 |
+
vid_path = '{}/{}.MP4'.format(pid, vid)
|
132 |
+
fps = fps_dict[osp.join(self.root, vid_path)]
|
133 |
+
start_frame = int(np.round(fps * start_timestamp))
|
134 |
+
end_frame = int(np.ceil(fps * end_timestamp))
|
135 |
+
self.samples.append((vid_path, start_frame, end_frame, narration, verb, noun))
|
136 |
+
if self.dataset == 'ek100_mir':
|
137 |
+
self.metadata_sentence = pd.read_csv(metadata[:metadata.index('.csv')] + '_sentence.csv')
|
138 |
+
if 'train' in metadata:
|
139 |
+
self.relevancy_mat = pickle.load(open(osp.join(osp.dirname(metadata), 'relevancy', 'caption_relevancy_EPIC_100_retrieval_train.pkl'), 'rb'))
|
140 |
+
elif 'test' in metadata:
|
141 |
+
self.relevancy_mat = pickle.load(open(osp.join(osp.dirname(metadata), 'relevancy', 'caption_relevancy_EPIC_100_retrieval_test.pkl'), 'rb'))
|
142 |
+
else:
|
143 |
+
raise ValueError('{} should contain either "train" or "test"!'.format(metadata))
|
144 |
+
self.relevancy = .1
|
145 |
+
elif self.dataset == 'ek100_mir_demo':
|
146 |
+
df = pd.read_csv(metadata, header=None)
|
147 |
+
fps = 59.94
|
148 |
+
self.samples = []
|
149 |
+
for i in range(len(df)):
|
150 |
+
vid_path, start_timestamp, end_timestamp, narration, verb, noun = df.iloc[i:i+1].values[0].tolist()[1:]
|
151 |
+
start_frame = int(np.round(fps * start_timestamp))
|
152 |
+
end_frame = int(np.ceil(fps * end_timestamp))
|
153 |
+
self.samples.append((vid_path, start_frame, end_frame, narration, verb, noun))
|
154 |
+
|
155 |
+
elif self.dataset == 'egtea':
|
156 |
+
video_list = glob.glob(osp.join(self.root, '*/*'))
|
157 |
+
len_dict = {video: len(decord.VideoReader(video)) for video in video_list}
|
158 |
+
|
159 |
+
vn_list, labels = [], []
|
160 |
+
for row in open(osp.join(osp.dirname(metadata), 'action_idx.txt')):
|
161 |
+
row = row.strip()
|
162 |
+
vn = int(row.split(' ')[-1])
|
163 |
+
vn_list.append(vn)
|
164 |
+
narration = ' '.join(row.split(' ')[:-1])
|
165 |
+
labels.append(narration.replace('_', ' ').lower())
|
166 |
+
# labels.append(narration)
|
167 |
+
mapping_act2narration = {vn: narration for vn, narration in zip(vn_list, labels)}
|
168 |
+
|
169 |
+
self.samples = []
|
170 |
+
with open(metadata) as f:
|
171 |
+
for row in f:
|
172 |
+
clip_id, action_idx = row.strip().split(' ')[:2]
|
173 |
+
video_id = '-'.join(clip_id.split('-')[:3])
|
174 |
+
vid_relpath = osp.join(video_id, '{}.mp4'.format(clip_id))
|
175 |
+
vid_fullpath = osp.join(self.root, video_id, '{}.mp4'.format(clip_id))
|
176 |
+
self.samples.append((vid_relpath, 0, len_dict[vid_fullpath], mapping_act2narration[int(action_idx)]))
|
177 |
+
elif self.dataset == 'charades_ego':
|
178 |
+
video_list = glob.glob(osp.join(self.root, '*.mp4'))
|
179 |
+
fps_dict = {video: decord.VideoReader(video).get_avg_fps() for video in video_list}
|
180 |
+
self.samples = []
|
181 |
+
with open(metadata) as f:
|
182 |
+
csv_reader = csv.reader(f)
|
183 |
+
_ = next(csv_reader) # skip the header
|
184 |
+
for row in csv_reader:
|
185 |
+
video_id = row[0]
|
186 |
+
if self.is_trimmed:
|
187 |
+
for action_tuple in row[9].split(';'):
|
188 |
+
if not action_tuple:
|
189 |
+
continue
|
190 |
+
action, start_timestamp, end_timestamp = action_tuple.split(' ')
|
191 |
+
start_timestamp, end_timestamp = float(start_timestamp), float(end_timestamp)
|
192 |
+
vid_path = '{}.mp4'.format(video_id)
|
193 |
+
fps = fps_dict[osp.join(self.root, vid_path)]
|
194 |
+
start_frame = int(np.round(fps * start_timestamp))
|
195 |
+
end_frame = int(np.ceil(fps * end_timestamp))
|
196 |
+
self.samples.append((vid_path, start_frame, end_frame, action))
|
197 |
+
else:
|
198 |
+
if not row[9]:
|
199 |
+
action_list = []
|
200 |
+
else:
|
201 |
+
action_list = [action_tuple.split(' ')[0] for action_tuple in row[9].split(';')]
|
202 |
+
vid_path = '{}.mp4'.format(video_id)
|
203 |
+
fps = fps_dict[osp.join(self.root, vid_path)]
|
204 |
+
duration = fps * float(row[10])
|
205 |
+
self.samples.append((vid_path, 0, duration, action_list))
|
206 |
+
elif self.dataset == 'charades_ego_trimmed':
|
207 |
+
with open(metadata, 'rb') as f:
|
208 |
+
self.samples = pickle.load(f)
|
209 |
+
else:
|
210 |
+
raise NotImplementedError
|
211 |
+
|
212 |
+
def get_raw_item(self, i, is_training=True, num_clips=1, clip_length=32, clip_stride=2, sparse_sample=False,
|
213 |
+
narration_selection='random'):
|
214 |
+
if self.dataset == 'ego4d':
|
215 |
+
if len(self.samples[i]) == 4:
|
216 |
+
vid, start_second, end_second, narration = self.samples[i]
|
217 |
+
frames = video_loader(self.root, vid, start_second,
|
218 |
+
end_second=end_second,
|
219 |
+
clip_length=clip_length,
|
220 |
+
jitter=is_training)
|
221 |
+
if isinstance(narration, list):
|
222 |
+
if narration_selection == 'random':
|
223 |
+
narration = random.choice(narration)
|
224 |
+
elif narration_selection == 'concat':
|
225 |
+
narration = '. '.join(narration)
|
226 |
+
elif narration_selection == 'list':
|
227 |
+
narration = narration
|
228 |
+
else:
|
229 |
+
raise ValueError
|
230 |
+
return frames, narration
|
231 |
+
elif len(self.samples[i]) == 5:
|
232 |
+
# TODO: need better filtering strategy based on nll
|
233 |
+
vid, start_second, end_second, narration, _ = self.samples[i]
|
234 |
+
frames = video_loader(self.root, vid, start_second,
|
235 |
+
end_second=end_second,
|
236 |
+
clip_length=clip_length,
|
237 |
+
jitter=is_training)
|
238 |
+
if isinstance(narration, list):
|
239 |
+
if narration_selection == 'random':
|
240 |
+
narration = random.choice(narration)
|
241 |
+
elif narration_selection == 'concat':
|
242 |
+
narration = '. '.join(narration)
|
243 |
+
elif narration_selection == 'list':
|
244 |
+
narration = narration
|
245 |
+
else:
|
246 |
+
raise ValueError
|
247 |
+
return frames, narration
|
248 |
+
elif self.dataset == 'ego4d_mcq':
|
249 |
+
itemMCQ = self.samples[str(i)]
|
250 |
+
answerIndex = itemMCQ['answer']
|
251 |
+
textQuery = itemMCQ['query']['clip_text']
|
252 |
+
sampleOptions = itemMCQ['choices']
|
253 |
+
frames_options = []
|
254 |
+
narration_options = []
|
255 |
+
for option_id in range(len(sampleOptions)):
|
256 |
+
option = sampleOptions[str(option_id)]
|
257 |
+
frames = video_loader(self.root, option['video_uid'],
|
258 |
+
float(option['clip_start']), end_second=float(option['clip_end']),
|
259 |
+
clip_length=clip_length,
|
260 |
+
jitter=is_training)
|
261 |
+
frames_options.append(frames)
|
262 |
+
narration_options.append(option['clip_text'])
|
263 |
+
return textQuery, frames_options, narration_options, answerIndex, itemMCQ['types']
|
264 |
+
elif self.dataset == 'ek100_mir':
|
265 |
+
vid_path, start_frame, end_frame, narration, verb, noun = self.samples[i]
|
266 |
+
# from third_party.EgoVLP.base.base_dataset import sample_frames_start_end
|
267 |
+
# frame_ids = sample_frames_start_end(clip_length, start_frame, end_frame, sample='uniform', fix_start=None)
|
268 |
+
frame_ids = get_frame_ids(start_frame, end_frame, num_segments=clip_length, jitter=is_training)
|
269 |
+
frames = video_loader_by_frames(self.root, vid_path, frame_ids)
|
270 |
+
if is_training:
|
271 |
+
positive_list = np.where(self.relevancy_mat[i] > self.relevancy)[0].tolist()
|
272 |
+
if positive_list != []:
|
273 |
+
pos = random.sample(positive_list, min(len(positive_list), 1))[0]
|
274 |
+
if pos < len(self.metadata_sentence) and pos < self.relevancy_mat.shape[1]:
|
275 |
+
return frames, (self.metadata_sentence.iloc[pos][1], self.relevancy_mat[i][pos])
|
276 |
+
else:
|
277 |
+
return frames, (narration, 1)
|
278 |
+
elif self.dataset == 'ek100_mir_demo':
|
279 |
+
vid_path, start_frame, end_frame, narration, verb, noun = self.samples[i]
|
280 |
+
frame_ids = get_frame_ids(start_frame, end_frame, num_segments=clip_length, jitter=is_training)
|
281 |
+
frames = video_loader_by_frames(self.root, vid_path, frame_ids)
|
282 |
+
return frames, (narration, 1)
|
283 |
+
|
284 |
+
elif self.dataset == 'ek100_cls':
|
285 |
+
vid_path, start_frame, end_frame, narration, verb, noun = self.samples[i]
|
286 |
+
frame_ids = get_frame_ids(start_frame, end_frame, num_segments=clip_length, jitter=is_training)
|
287 |
+
frames = video_loader_by_frames(self.root, vid_path, frame_ids)
|
288 |
+
return frames, '{}:{}'.format(verb, noun)
|
289 |
+
elif self.dataset == 'egtea':
|
290 |
+
vid_path, start_frame, end_frame, sentence = self.samples[i]
|
291 |
+
if is_training:
|
292 |
+
assert num_clips == 1
|
293 |
+
if end_frame < clip_length * clip_stride:
|
294 |
+
frames = video_loader_by_frames(self.root, vid_path, list(np.arange(0, end_frame)))
|
295 |
+
zeros = torch.zeros((clip_length * clip_stride - end_frame, *frames.shape[1:]))
|
296 |
+
frames = torch.cat((frames, zeros), dim=0)
|
297 |
+
frames = frames[::clip_stride]
|
298 |
+
else:
|
299 |
+
start_id = np.random.randint(0, end_frame - clip_length * clip_stride + 1)
|
300 |
+
frame_ids = np.arange(start_id, start_id + clip_length * clip_stride, clip_stride)
|
301 |
+
frames = video_loader_by_frames(self.root, vid_path, frame_ids)
|
302 |
+
else:
|
303 |
+
if end_frame < clip_length * clip_stride:
|
304 |
+
frames = video_loader_by_frames(self.root, vid_path, list(np.arange(0, end_frame)))
|
305 |
+
zeros = torch.zeros((clip_length * clip_stride - end_frame, *frames.shape[1:]))
|
306 |
+
frames = torch.cat((frames, zeros), dim=0)
|
307 |
+
frames = frames[::clip_stride]
|
308 |
+
frames = frames.repeat(num_clips, 1, 1, 1)
|
309 |
+
else:
|
310 |
+
frame_ids = []
|
311 |
+
for start_id in np.linspace(0, end_frame - clip_length * clip_stride, num_clips, dtype=int):
|
312 |
+
frame_ids.extend(np.arange(start_id, start_id + clip_length * clip_stride, clip_stride))
|
313 |
+
frames = video_loader_by_frames(self.root, vid_path, frame_ids)
|
314 |
+
return frames, sentence
|
315 |
+
elif self.dataset == 'charades_ego':
|
316 |
+
vid_path, start_frame, end_frame, action_list = self.samples[i]
|
317 |
+
if sparse_sample:
|
318 |
+
frame_ids = get_frame_ids(start_frame, end_frame, num_segments=num_clips * clip_length, jitter=is_training)
|
319 |
+
frames = video_loader_by_frames(self.root, vid_path, frame_ids)
|
320 |
+
else:
|
321 |
+
if end_frame < clip_length * clip_stride:
|
322 |
+
frames = video_loader_by_frames(self.root, vid_path, list(np.arange(0, end_frame)))
|
323 |
+
zeros = torch.zeros((clip_length * clip_stride - end_frame, *frames.shape[1:]))
|
324 |
+
frames = torch.cat((frames, zeros), dim=0)
|
325 |
+
frames = frames[::clip_stride]
|
326 |
+
frames = frames.repeat(num_clips, 1, 1, 1)
|
327 |
+
else:
|
328 |
+
frame_ids = []
|
329 |
+
for start_id in np.linspace(0, end_frame - clip_length * clip_stride, num_clips, dtype=int):
|
330 |
+
frame_ids.extend(np.arange(start_id, start_id + clip_length * clip_stride, clip_stride))
|
331 |
+
#print('frame_ids:', frame_ids)
|
332 |
+
frames = video_loader_by_frames(self.root, vid_path, frame_ids)
|
333 |
+
return frames, action_list, vid_path
|
334 |
+
elif self.dataset == 'charades_ego_trimmed':
|
335 |
+
vid, start_second, end_second, narration = self.samples[i]
|
336 |
+
frames = video_loader(self.root, vid, start_second,
|
337 |
+
end_second=end_second,
|
338 |
+
chunk_len=-1, # no chunk for CharadesEgo
|
339 |
+
fps=-1, # could be variable fps
|
340 |
+
clip_length=clip_length,
|
341 |
+
jitter=is_training)
|
342 |
+
return frames, narration
|
343 |
+
else:
|
344 |
+
raise NotImplementedError
|
345 |
+
|
346 |
+
def __getitem__(self, i):
|
347 |
+
raise NotImplementedError
|
348 |
+
|
349 |
+
def __len__(self):
|
350 |
+
return len(self.samples)
|
351 |
+
|
352 |
+
|
353 |
+
class VideoCaptionDatasetCLIP(VideoCaptionDatasetBase):
|
354 |
+
def __init__(self, dataset, root, metadata, transform=None,
|
355 |
+
is_training=True, tokenizer=None,
|
356 |
+
clip_length=32, clip_stride=2, sparse_sample=False,
|
357 |
+
narration_selection='random',
|
358 |
+
num_hard_negatives=0,
|
359 |
+
subsample_stride=None):
|
360 |
+
super().__init__(dataset, root, metadata)
|
361 |
+
|
362 |
+
self.full_samples = self.samples.copy()
|
363 |
+
if isinstance(subsample_stride, int):
|
364 |
+
self.samples = self.samples[::subsample_stride]
|
365 |
+
self.transform = transform
|
366 |
+
self.is_training = is_training
|
367 |
+
self.tokenizer = tokenizer
|
368 |
+
self.clip_length = clip_length
|
369 |
+
self.clip_stride = clip_stride
|
370 |
+
self.sparse_sample = sparse_sample
|
371 |
+
self.narration_selection = narration_selection
|
372 |
+
self.num_hard_negatives = num_hard_negatives
|
373 |
+
if num_hard_negatives > 0:
|
374 |
+
assert self.dataset == 'htm_aa'
|
375 |
+
|
376 |
+
def __getitem__(self, i):
|
377 |
+
frames, caption = self.get_raw_item(
|
378 |
+
i, is_training=self.is_training,
|
379 |
+
clip_length=self.clip_length,
|
380 |
+
clip_stride=self.clip_stride,
|
381 |
+
sparse_sample=self.sparse_sample,
|
382 |
+
narration_selection=self.narration_selection,
|
383 |
+
)
|
384 |
+
|
385 |
+
        # ek100_mir will also output relevancy value
        if isinstance(caption, tuple):
            caption, relevancy = caption
        else:
            relevancy = 0.

        # apply transformation
        if self.transform is not None:
            frames = self.transform(frames)

        # tokenize caption
        if self.tokenizer is not None:
            caption = self.tokenizer(caption)

        if isinstance(caption, tuple):
            caption, mask = caption
            return frames, caption, mask, relevancy
        else:
            return frames, caption, relevancy


class VideoCaptionDatasetMCQ(VideoCaptionDatasetBase):
    def __init__(self, dataset, root, metadata, transform=None,
                 is_training=True, tokenizer=None,
                 clip_length=32, clip_stride=2, sparse_sample=False,
                 narration_selection='random'):
        super().__init__(dataset, root, metadata)

        self.full_samples = self.samples.copy()
        self.transform = transform
        self.is_training = is_training
        self.tokenizer = tokenizer
        self.clip_length = clip_length
        self.clip_stride = clip_stride
        self.sparse_sample = sparse_sample
        self.narration_selection = narration_selection

    def __getitem__(self, i):

        textQuery, frames_options, narration_options, answerIndex, q_type = self.get_raw_item(
            i, is_training=self.is_training,
            clip_length=self.clip_length,
            clip_stride=self.clip_stride,
            sparse_sample=self.sparse_sample,
            narration_selection=self.narration_selection,
        )

        # apply transformation
        if self.transform is not None:
            frames_options = [self.transform(frames) for frames in frames_options]

        # tokenize caption
        if self.tokenizer is not None:
            textQuery = self.tokenizer(textQuery)
            narration_options = self.tokenizer(narration_options)
            if isinstance(textQuery, tuple):
                textQuery, mask_query = textQuery
                narration_options, mask_options = narration_options
                return (
                    textQuery, torch.stack(frames_options, dim=0),
                    narration_options, answerIndex, q_type,
                    mask_query, mask_options
                )
            else:
                return textQuery, torch.stack(frames_options, dim=0), narration_options, answerIndex, q_type


class VideoClassyDataset(VideoCaptionDatasetBase):
    def __init__(
        self, dataset, root, metadata, transform=None,
        is_training=True, label_mapping=None,
        num_clips=1,
        clip_length=32, clip_stride=2,
        sparse_sample=False,
        is_trimmed=True,
    ):
        super().__init__(dataset, root, metadata, is_trimmed=is_trimmed)

        self.transform = transform
        self.is_training = is_training
        self.label_mapping = label_mapping
        self.num_clips = num_clips
        self.clip_length = clip_length
        self.clip_stride = clip_stride
        self.sparse_sample = sparse_sample

    def __getitem__(self, i):
        frames, label, vid_path = self.get_raw_item(
            i, is_training=self.is_training,
            num_clips=self.num_clips,
            clip_length=self.clip_length,
            clip_stride=self.clip_stride,
            sparse_sample=self.sparse_sample,
        )

        # apply transformation
        if self.transform is not None:
            frames = self.transform(frames)

        if self.label_mapping is not None:
            if isinstance(label, list):
                # multi-label case
                res_array = np.zeros(len(self.label_mapping))
                for lbl in label:
                    res_array[self.label_mapping[lbl]] = 1.
                label = res_array
            else:
                label = self.label_mapping[label]

        return frames, label, vid_path


def get_dataset(train_transform, tokenizer, cfg, is_training=True):
    narration_selection = cfg.get('narration_selection', 'random')
    num_hard_neg = cfg.get('num_hard_neg', 0)
    data_cfg = cfg['data']
    if cfg['model']['arch'].startswith('CLIP') or cfg['model']['arch'].startswith('VCLM'):
        if is_training:
            metadata = data_cfg['metadata']
        else:
            metadata = data_cfg['metadata_val']

        return VideoCaptionDatasetCLIP(
            data_cfg['dataset'], data_cfg['root'], metadata, train_transform,
            is_training=is_training,
            tokenizer=tokenizer,
            clip_length=data_cfg['clip_length'], clip_stride=data_cfg['clip_stride'],
            sparse_sample=data_cfg['sparse_sample'],
            narration_selection=narration_selection,
            num_hard_negatives=num_hard_neg
        )
    else:
        raise NotImplementedError


def get_downstream_dataset(transform, tokenizer, cfg, is_training=True, num_clips=0, label_mapping=None):
    data_cfg = cfg['data']
    n_clips = num_clips if num_clips > 0 else data_cfg['num_clips']
    if is_training:
        metadata = data_cfg['metadata']
        return VideoClassyDataset(
            data_cfg['dataset'], data_cfg['root'], metadata, transform,
            is_training=True, label_mapping=label_mapping,
            num_clips=n_clips,
            clip_length=data_cfg['clip_length'], clip_stride=data_cfg['clip_stride'],
            sparse_sample=data_cfg['sparse_sample'],
        )
    else:
        metadata = data_cfg['metadata_val']
        return VideoClassyDataset(
            data_cfg['dataset'], data_cfg['root'], metadata, transform,
            is_training=False, label_mapping=label_mapping,
            num_clips=n_clips,
            clip_length=data_cfg['clip_length'], clip_stride=data_cfg['clip_stride'],
            sparse_sample=data_cfg['sparse_sample'],
            is_trimmed=not data_cfg['dataset'] == 'charades_ego'
        )
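Usage note (not part of the diff): a minimal sketch of how `get_dataset` above can be wired up for EK100-MIR retrieval. The config keys mirror the `data_cfg[...]` lookups in the function; the literal paths and clip settings below are illustrative assumptions, not values taken from this repo.

# Hypothetical sketch; the paths and numeric values here are assumptions for illustration only.
from lavila.data.datasets import get_dataset

cfg = {
    'model': {'arch': 'CLIP_OPENAI_TIMESFORMER_BASE'},
    'data': {
        'dataset': 'ek100_mir',
        'root': 'datasets/EK100/videos',                                # assumed path
        'metadata': 'datasets/EK100/EPIC_100_retrieval_train.csv',      # assumed path
        'metadata_val': 'datasets/EK100/EPIC_100_retrieval_test.csv',   # assumed path
        'clip_length': 16, 'clip_stride': 4, 'sparse_sample': False,    # assumed values
    },
}
# transform and tokenizer may be None; the dataset then returns raw frames and the caption string
val_set = get_dataset(train_transform=None, tokenizer=None, cfg=cfg, is_training=False)
frames, caption, relevancy = val_set[0]   # ek100_mir items also carry a relevancy score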
lavila/data/video_transforms.py
ADDED
@@ -0,0 +1,186 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import math
from typing import Sequence
import torch
import torch.nn as nn
from torchvision import transforms


class Permute(nn.Module):
    """
    Permutation as an op
    """

    def __init__(self, ordering):
        super().__init__()
        self.ordering = ordering

    def forward(self, frames):
        """
        Args:
            frames in some ordering, by default (C, T, H, W)
        Returns:
            frames in the ordering that was specified
        """
        return frames.permute(self.ordering)


class TemporalCrop(nn.Module):
    """
    Convert the video into smaller clips temporally.
    """

    def __init__(
        self, frames_per_clip: int = 8, stride: int = 8, frame_stride: int = 1
    ):
        super().__init__()
        self.frames = frames_per_clip
        self.stride = stride
        self.frame_stride = frame_stride

    def forward(self, video):
        assert video.ndim == 4, "Must be (C, T, H, W)"
        res = []
        for start in range(
            0, video.size(1) - (self.frames * self.frame_stride) + 1, self.stride
        ):
            end = start + (self.frames) * self.frame_stride
            res.append(video[:, start: end: self.frame_stride, ...])
        return res


def crop_boxes(boxes, x_offset, y_offset):
    """
    Peform crop on the bounding boxes given the offsets.
    Args:
        boxes (ndarray or None): bounding boxes to peform crop. The dimension
            is `num boxes` x 4.
        x_offset (int): cropping offset in the x axis.
        y_offset (int): cropping offset in the y axis.
    Returns:
        cropped_boxes (ndarray or None): the cropped boxes with dimension of
            `num boxes` x 4.
    """
    cropped_boxes = boxes.copy()
    cropped_boxes[:, [0, 2]] = boxes[:, [0, 2]] - x_offset
    cropped_boxes[:, [1, 3]] = boxes[:, [1, 3]] - y_offset

    return cropped_boxes


def uniform_crop(images, size, spatial_idx, boxes=None, scale_size=None):
    """
    Perform uniform spatial sampling on the images and corresponding boxes.
    Args:
        images (tensor): images to perform uniform crop. The dimension is
            `num frames` x `channel` x `height` x `width`.
        size (int): size of height and weight to crop the images.
        spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width
            is larger than height. Or 0, 1, or 2 for top, center, and bottom
            crop if height is larger than width.
        boxes (ndarray or None): optional. Corresponding boxes to images.
            Dimension is `num boxes` x 4.
        scale_size (int): optinal. If not None, resize the images to scale_size before
            performing any crop.
    Returns:
        cropped (tensor): images with dimension of
            `num frames` x `channel` x `size` x `size`.
        cropped_boxes (ndarray or None): the cropped boxes with dimension of
            `num boxes` x 4.
    """
    assert spatial_idx in [0, 1, 2]
    ndim = len(images.shape)
    if ndim == 3:
        images = images.unsqueeze(0)
    height = images.shape[2]
    width = images.shape[3]

    if scale_size is not None:
        if width <= height:
            width, height = scale_size, int(height / width * scale_size)
        else:
            width, height = int(width / height * scale_size), scale_size
        images = torch.nn.functional.interpolate(
            images,
            size=(height, width),
            mode="bilinear",
            align_corners=False,
        )

    y_offset = int(math.ceil((height - size) / 2))
    x_offset = int(math.ceil((width - size) / 2))

    if height > width:
        if spatial_idx == 0:
            y_offset = 0
        elif spatial_idx == 2:
            y_offset = height - size
    else:
        if spatial_idx == 0:
            x_offset = 0
        elif spatial_idx == 2:
            x_offset = width - size
    cropped = images[:, :, y_offset: y_offset + size, x_offset: x_offset + size]
    cropped_boxes = crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None
    if ndim == 3:
        cropped = cropped.squeeze(0)
    return cropped, cropped_boxes


class SpatialCrop(nn.Module):
    """
    Convert the video into 3 smaller clips spatially. Must be used after the
        temporal crops to get spatial crops, and should be used with
        -2 in the spatial crop at the slowfast augmentation stage (so full
        frames are passed in here). Will return a larger list with the
        3x spatial crops as well. It's useful for 3x4 testing (eg in SwinT)
        or 3x10 testing in SlowFast etc.
    """

    def __init__(self, crop_size: int = 224, num_crops: int = 3):
        super().__init__()
        self.crop_size = crop_size
        if num_crops == 6:
            self.crops_to_ext = [0, 1, 2]
            # I guess Swin uses 5 crops without flipping, but that doesn't
            # make sense given they first resize to 224 and take 224 crops.
            # (pg 6 of https://arxiv.org/pdf/2106.13230.pdf)
            # So I'm assuming we can use flipped crops and that will add sth..
            self.flipped_crops_to_ext = [0, 1, 2]
        elif num_crops == 3:
            self.crops_to_ext = [0, 1, 2]
            self.flipped_crops_to_ext = []
        elif num_crops == 1:
            self.crops_to_ext = [1]
            self.flipped_crops_to_ext = []
        else:
            raise NotImplementedError(
                "Nothing else supported yet, "
                "slowfast only takes 0, 1, 2 as arguments"
            )

    def forward(self, videos: Sequence[torch.Tensor]):
        """
        Args:
            videos: A list of C, T, H, W videos.
        Returns:
            videos: A list with 3x the number of elements. Each video converted
                to C, T, H', W' by spatial cropping.
        """
        assert isinstance(videos, list), "Must be a list of videos after temporal crops"
        assert all([video.ndim == 4 for video in videos]), "Must be (C,T,H,W)"
        res = []
        for video in videos:
            for spatial_idx in self.crops_to_ext:
                res.append(uniform_crop(video, self.crop_size, spatial_idx)[0])
            if not self.flipped_crops_to_ext:
                continue
            flipped_video = transforms.functional.hflip(video)
            for spatial_idx in self.flipped_crops_to_ext:
                res.append(uniform_crop(flipped_video, self.crop_size, spatial_idx)[0])
        return res
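Usage note (not part of the diff): the two crop modules above compose into a test-time cropping pipeline; a quick sketch on a dummy tensor:

# Split a dummy (C, T, H, W) video into 16-frame clips, then take 3 spatial crops of each clip.
import torch
from lavila.data.video_transforms import TemporalCrop, SpatialCrop

video = torch.randn(3, 32, 256, 320)                            # (C, T, H, W)
clips = TemporalCrop(frames_per_clip=16, stride=16)(video)      # 2 clips of shape (3, 16, 256, 320)
crops = SpatialCrop(crop_size=224, num_crops=3)(clips)          # 6 clips of shape (3, 16, 224, 224)
print(len(clips), len(crops), crops[0].shape)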
lavila/models/bpe_simple_vocab_16e6.txt.gz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
size 1356917
lavila/models/distributed_utils.py
ADDED
@@ -0,0 +1,89 @@
# Copyright (c) Facebook, Inc. and its affiliates.

# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Part of the code is from
# `https://github.com/facebookresearch/vissl/blob/main/vissl/utils/distributed_utils.py` and
# `https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/generic/distributed_util.py`
# Modified by Yue Zhao
# The original code is under MIT License

import torch
import torch.distributed as dist
from typing import Tuple


def convert_to_distributed_tensor(tensor: torch.Tensor) -> Tuple[torch.Tensor, str]:
    """
    For some backends, such as NCCL, communication only works if the
    tensor is on the GPU. This helper function converts to the correct
    device and returns the tensor + original device.
    """
    orig_device = "cpu" if not tensor.is_cuda else "gpu"
    if (
        torch.distributed.is_available()
        and torch.distributed.get_backend() == torch.distributed.Backend.NCCL
        and not tensor.is_cuda
    ):
        tensor = tensor.cuda()
    return (tensor, orig_device)


def convert_to_normal_tensor(tensor: torch.Tensor, orig_device: str) -> torch.Tensor:
    """
    For some backends, such as NCCL, communication only works if the
    tensor is on the GPU. This converts the tensor back to original device.
    """
    if tensor.is_cuda and orig_device == "cpu":
        tensor = tensor.cpu()
    return tensor


def is_distributed_training_run() -> bool:
    return (
        torch.distributed.is_available()
        and torch.distributed.is_initialized()
        and (torch.distributed.get_world_size() > 1)
    )


class GatherLayer(torch.autograd.Function):
    """
    Gather tensors from all workers with support for backward propagation:
    This implementation does not cut the gradients as torch.distributed.all_gather does.
    """

    @staticmethod
    def forward(ctx, x):
        output = [torch.zeros_like(x) for _ in range(dist.get_world_size())]
        dist.all_gather(output, x)
        return tuple(output)

    @staticmethod
    def backward(ctx, *grads):
        all_gradients = torch.stack(grads)
        dist.all_reduce(all_gradients)
        return all_gradients[dist.get_rank()]


def gather_from_all(tensor: torch.Tensor) -> torch.Tensor:
    """
    Similar to classy_vision.generic.distributed_util.gather_from_all
    except that it does not cut the gradients
    """
    if tensor.ndim == 0:
        # 0 dim tensors cannot be gathered. so unsqueeze
        tensor = tensor.unsqueeze(0)

    if is_distributed_training_run():
        tensor, orig_device = convert_to_distributed_tensor(tensor)
        gathered_tensors = GatherLayer.apply(tensor)
        gathered_tensors = [
            convert_to_normal_tensor(_tensor, orig_device)
            for _tensor in gathered_tensors
        ]
    else:
        gathered_tensors = [tensor]
    gathered_tensor = torch.cat(gathered_tensors, 0)
    return gathered_tensor
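Usage note (not part of the diff): `gather_from_all` keeps the autograd graph intact and degrades to a plain concat when no process group is initialized, so the same code path also works in a single process:

# Single-process sketch: embeddings pass through unchanged, and gradients still reach the local shard.
import torch
from lavila.models.distributed_utils import gather_from_all

local_embeds = torch.randn(8, 256, requires_grad=True)
all_embeds = gather_from_all(local_embeds)   # (world_size * 8, 256); here just (8, 256)
all_embeds.pow(2).mean().backward()
print(local_embeds.grad.shape)               # torch.Size([8, 256])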
lavila/models/models.py
ADDED
@@ -0,0 +1,252 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
import timm
import torch
import torch.nn as nn
import torch.nn.functional as F

from lavila.models.openai_clip import load as load_openai_clip
from lavila.models.openai_model import QuickGELU, Transformer
from lavila.models.timesformer import SpaceTimeTransformer
from lavila.models.utils import remap_keys, rsetattr
from lavila.models.prompt_tuning import PromptLearner


class CLIP(nn.Module):
    def __init__(self,
                 cfg,
                 embed_dim: int,
                 # vision
                 vision_width: int,
                 vision_model: nn.Module,
                 # text
                 context_length: int,
                 vocab_size: int,
                 transformer_width: int,
                 transformer_heads: int,
                 transformer_layers: int,
                 tempearture_init=0.07,
                 **kwargs,
                 ):
        super().__init__()

        self.context_length = context_length
        self.vision_width = vision_width
        self.tune_bias = cfg.get('tune_bias', False)
        self.freeze_vis_backbone = cfg.get('freeze_vis_backbone', False)
        self.freeze_txt_backbone = cfg.get('freeze_txt_backbone', False)

        self.visual = vision_model
        self.t_step = cfg.get('t_step', self.visual.num_frames)
        txt_prompt_cfg = cfg.get('text_prompt', {})
        self.n_ctx = txt_prompt_cfg.get('n_ctx', 0)
        self.txt_use_bank = txt_prompt_cfg.get('use_bank', False)
        if self.txt_use_bank:
            self.transformer = Transformer(
                width=transformer_width,
                layers=transformer_layers,
                heads=transformer_heads,
                attn_mask=self.build_attention_mask(),
                prompt_cfg=txt_prompt_cfg,
                prompt_learner=PromptLearner(transformer_width, self.n_ctx),
                prompt_generator=self.visual.prompt_generator
            )
        else:
            self.transformer = Transformer(
                width=transformer_width,
                layers=transformer_layers,
                heads=transformer_heads,
                attn_mask=self.build_attention_mask(),
                prompt_cfg=txt_prompt_cfg,
                prompt_learner=PromptLearner(transformer_width, self.n_ctx)
            )

        self.vocab_size = vocab_size
        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
        self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
        self.ln_final = nn.LayerNorm(transformer_width)  # used to be `models.transformer.LayerNorm``

        self.image_projection = nn.Parameter(torch.empty(vision_width, embed_dim))
        self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
        print("=> initialize initial temperature with {}".format(tempearture_init))
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / tempearture_init))

        self.initialize_parameters()

        freeze_list = []
        if self.freeze_vis_backbone:
            print("=> Freeze visual backbone")
            freeze_list += self.visual.param_list + [self.image_projection]

        if self.freeze_txt_backbone:
            print("=> Freeze text backbone")
            if self.tune_bias:
                freeze_list += [m for n, m in self.transformer.named_parameters() if 'prompt' not in n and 'bias' not in n]
                freeze_list += [m for n, m in self.ln_final.named_parameters() if 'bias' not in n]
            else:
                freeze_list += [m for n, m in self.transformer.named_parameters() if 'prompt' not in n]
                freeze_list += list(self.ln_final.parameters())
            freeze_list += list(self.token_embedding.parameters())
            freeze_list += [self.positional_embedding] + [self.text_projection]

        for p in freeze_list:
            p.requires_grad = False

        # text prompts
        if self.n_ctx > 0:
            if self.txt_use_bank:
                prompt_dim = self.visual.prompt_dim
                if prompt_dim != transformer_width:
                    self.transformer.prompt_inproj = nn.Linear(transformer_width, prompt_dim, bias=False)
                else:
                    self.transformer.prompt_inproj = nn.Identity()
                self.transformer.prompt_outproj = nn.Linear(prompt_dim, transformer_width, bias=False)
                nn.init.kaiming_normal_(
                    self.transformer.prompt_outproj.weight, a=0, mode='fan_out')

        params_to_update = [n for n, m in self.named_parameters() if m.requires_grad]
        num_opt_params = sum([m.numel() for m in self.parameters() if m.requires_grad])
        num_fz_params = sum([m.numel() for m in self.parameters() if not m.requires_grad])
        print("=> Params to update: {}".format(params_to_update))
        print("=> Update/Frozen: {}/{}".format(num_opt_params, num_fz_params))

    def initialize_parameters(self):
        nn.init.normal_(self.token_embedding.weight, std=0.02)
        nn.init.normal_(self.positional_embedding, std=0.01)

        proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
        attn_std = self.transformer.width ** -0.5
        fc_std = (2 * self.transformer.width) ** -0.5
        for block in self.transformer.resblocks:
            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)

        nn.init.normal_(self.image_projection, std=self.vision_width ** -0.5)
        nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)

    def build_attention_mask(self):
        # lazily create causal attention mask, with full attention between the vision tokens
        # pytorch uses additive attention mask; fill with -inf
        mask = torch.empty(self.context_length, self.context_length)
        mask.fill_(float("-inf"))
        mask.triu_(1)  # zero out the lower diagonal
        return mask

    def encode_image(self, image, use_checkpoint=False, apply_project=True, istrain=False, gamma=1.0):
        x, ps_loss = self.visual(image, use_checkpoint=use_checkpoint, istrain=istrain, gamma=gamma)

        if isinstance(x, list):
            assert len(x) == 1
            x = x[0]
        if apply_project:
            x = x @ self.image_projection

        return x, ps_loss

    def encode_text(self, text, use_checkpoint=False, istrain=False, gamma=1.0):
        x = self.token_embedding(text)  # [batch_size, n_ctx, d_model]
        B = x.shape[0]
        eot = text.argmax(dim=-1)

        x = x.permute(1, 0, 2)  # NLD -> LND
        x, ps_loss = self.transformer(x, self.positional_embedding, use_checkpoint=use_checkpoint, istrain=istrain, gamma=gamma, eot=eot)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x)

        # x.shape = [batch_size, n_ctx, transformer.width]
        # take features from the eot embedding (eot_token is the highest number in each sequence)
        x = x[torch.arange(x.shape[0]), self.n_ctx + eot] @ self.text_projection

        return x, ps_loss

    def forward(self, image, text, use_checkpoint=False, norm_embed=False, istrain=False, gamma=1.0):
        image_embed, ps_loss_img = self.encode_image(image, use_checkpoint=use_checkpoint, istrain=istrain, gamma=gamma)
        text_embed, ps_loss_txt = self.encode_text(text, use_checkpoint=use_checkpoint, istrain=istrain, gamma=gamma)

        if norm_embed:
            image_embed = F.normalize(image_embed, dim=-1)
            text_embed = F.normalize(text_embed, dim=-1)
        return {'image_embed': image_embed,
                'text_embed': text_embed,
                'logit_scale': self.logit_scale.exp(),
                'ps_loss': ps_loss_img + ps_loss_txt}

    def train(self, mode=True):
        if not isinstance(mode, bool):
            raise ValueError("training mode is expected to be boolean")
        self.training = mode
        for m in self.modules():
            m.training = mode

        if mode:
            if self.freeze_vis_backbone and not self.tune_bias:
                for n, m in self.visual.named_modules():
                    if 'prompt' not in n:
                        m.training = False

            if self.freeze_txt_backbone and not self.tune_bias:
                for n, m in self.transformer.named_modules():
                    if 'prompt' not in n:
                        m.training = False

                self.token_embedding.training = False
                self.ln_final.training = False


def CLIP_OPENAI_TIMESFORMER_BASE(
    num_frames=4, timesformer_gated_xattn=False, temperature_init=0.07,
    project_embed_dim=256, **kwargs
):
    cfg = kwargs.pop('model_cfg', {})
    vision_model = SpaceTimeTransformer(
        num_frames=num_frames,
        time_init='zeros',
        attention_style='frozen-in-time',
        ln_pre=True,
        act_layer=QuickGELU,
        is_tanh_gating=timesformer_gated_xattn,
        drop_path_rate=cfg.get('drop_path_rate', 0),
        tune_bias=cfg.get('tune_bias', False),
        prompt_cfg=cfg.get('visual_prompt', {})
    )
    clip_model, _ = load_openai_clip('ViT-B/16', 'cpu')
    print("=> Loading CLIP (ViT-B/16) weights")
    remapped_state_dict = remap_keys(clip_model.visual.state_dict(), transformer_layers=12)
    res = vision_model.load_state_dict(remapped_state_dict, strict=False)
    print(res)

    vision_model.head = nn.Identity()
    vision_model.pre_logits = nn.Identity()
    vision_model.fc = nn.Identity()
    model = CLIP(
        cfg,
        embed_dim=project_embed_dim,
        vision_width=768,
        vision_model=vision_model,
        context_length=77,
        vocab_size=49408,
        transformer_width=512,
        transformer_heads=8,
        transformer_layers=12,
        tempearture_init=temperature_init,
        **kwargs
    )
    model.transformer.load_state_dict(clip_model.transformer.state_dict(), strict=False)
    model.token_embedding.load_state_dict(clip_model.token_embedding.state_dict())
    model.positional_embedding.data.copy_(clip_model.positional_embedding.data)
    model.ln_final.load_state_dict(clip_model.ln_final.state_dict())
    if project_embed_dim == clip_model.text_projection.shape[1]:
        print("=> Loading CLIP's text_projection, image_projection and logit_scale directly")
        model.image_projection.data.copy_(clip_model.visual.proj.data)
        model.text_projection.data.copy_(clip_model.text_projection.data)
        model.logit_scale.data.copy_(clip_model.logit_scale.data)
    return model
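Usage note (not part of the diff): `build_attention_mask` in the wrapper above builds the standard additive causal mask used by the text transformer; a tiny standalone sketch of the same construction (context length 5 instead of 77):

# Rows are query positions; -inf entries are added to the attention logits and mask out future tokens.
import torch

context_length = 5
mask = torch.empty(context_length, context_length)
mask.fill_(float("-inf"))
mask.triu_(1)    # keep the lower triangle (including the diagonal) at 0
print(mask)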
lavila/models/openai_clip.py
ADDED
@@ -0,0 +1,237 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Part of the code is from https://github.com/openai/CLIP/blob/main/clip/clip.py
# Modified by Yue Zhao
# The original code is under MIT License

import hashlib
import os
import urllib
import warnings
from typing import Union, List
from pkg_resources import packaging

import torch
from PIL import Image
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from tqdm import tqdm

from .openai_model import build_model
from .tokenizer import SimpleTokenizer as _Tokenizer

try:
    from torchvision.transforms import InterpolationMode
    BICUBIC = InterpolationMode.BICUBIC
except ImportError:
    BICUBIC = Image.BICUBIC


if packaging.version.parse(torch.__version__) < packaging.version.parse("1.7.1"):
    warnings.warn("PyTorch version 1.7.1 or higher is recommended")


__all__ = ["available_models", "load", "tokenize"]
_tokenizer = _Tokenizer()

_MODELS = {
    "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt",
    "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt",
    "RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt",
    "RN50x16": "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt",
    "RN50x64": "https://openaipublic.azureedge.net/clip/models/be1cfb55d75a9666199fb2206c106743da0f6468c9d327f3e0d0a543a9919d9c/RN50x64.pt",
    "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
    "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
    "ViT-L/14": "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt",
    "ViT-L/14@336px": "https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt",
}


def _download(url: str, root: str):
    os.makedirs(root, exist_ok=True)
    filename = os.path.basename(url)

    expected_sha256 = url.split("/")[-2]
    download_target = os.path.join(root, filename)

    if os.path.exists(download_target) and not os.path.isfile(download_target):
        raise RuntimeError(f"{download_target} exists and is not a regular file")

    if os.path.isfile(download_target):
        if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256:
            return download_target
        else:
            warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")

    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
        with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop:
            while True:
                buffer = source.read(8192)
                if not buffer:
                    break

                output.write(buffer)
                loop.update(len(buffer))

    if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
        raise RuntimeError("Model has been downloaded but the SHA256 checksum does not not match")

    return download_target


def _convert_image_to_rgb(image):
    return image.convert("RGB")


def _transform(n_px):
    return Compose([
        Resize(n_px, interpolation=BICUBIC),
        CenterCrop(n_px),
        _convert_image_to_rgb,
        ToTensor(),
        Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
    ])


def available_models() -> List[str]:
    """Returns the names of available CLIP models"""
    return list(_MODELS.keys())


def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", jit: bool = False, download_root: str = None):
    """Load a CLIP model
    Parameters
    ----------
    name : str
        A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict
    device : Union[str, torch.device]
        The device to put the loaded model
    jit : bool
        Whether to load the optimized JIT model or more hackable non-JIT model (default).
    download_root: str
        path to download the model files; by default, it uses "~/.cache/clip"
    Returns
    -------
    model : torch.nn.Module
        The CLIP model
    preprocess : Callable[[PIL.Image], torch.Tensor]
        A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input
    """
    if name in _MODELS:
        model_path = _download(_MODELS[name], download_root or os.path.expanduser("cache/clip"))
    elif os.path.isfile(name):
        model_path = name
    else:
        raise RuntimeError(f"Model {name} not found; available models = {available_models()}")

    with open(model_path, 'rb') as opened_file:
        try:
            # loading JIT archive
            model = torch.jit.load(opened_file, map_location=device if jit else "cpu").eval()
            state_dict = None
        except RuntimeError:
            # loading saved state dict
            if jit:
                warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead")
                jit = False
            state_dict = torch.load(opened_file, map_location="cpu")

    if not jit:
        model = build_model(state_dict or model.state_dict()).to(device)
        if str(device) == "cpu":
            model.float()
        return model, _transform(model.visual.input_resolution)

    # patch the device names
    device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
    device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1]

    def patch_device(module):
        try:
            graphs = [module.graph] if hasattr(module, "graph") else []
        except RuntimeError:
            graphs = []

        if hasattr(module, "forward1"):
            graphs.append(module.forward1.graph)

        for graph in graphs:
            for node in graph.findAllNodes("prim::Constant"):
                if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"):
                    node.copyAttributes(device_node)

    model.apply(patch_device)
    patch_device(model.encode_image)
    patch_device(model.encode_text)

    # patch dtype to float32 on CPU
    if str(device) == "cpu":
        float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[])
        float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
        float_node = float_input.node()

        def patch_float(module):
            try:
                graphs = [module.graph] if hasattr(module, "graph") else []
            except RuntimeError:
                graphs = []

            if hasattr(module, "forward1"):
                graphs.append(module.forward1.graph)

            for graph in graphs:
                for node in graph.findAllNodes("aten::to"):
                    inputs = list(node.inputs())
                    for i in [1, 2]:  # dtype can be the second or third argument to aten::to()
                        if inputs[i].node()["value"] == 5:
                            inputs[i].node().copyAttributes(float_node)

        model.apply(patch_float)
        patch_float(model.encode_image)
        patch_float(model.encode_text)

        model.float()

    return model, _transform(model.input_resolution.item())


def tokenize(texts: Union[str, List[str]], context_length: int = 77, truncate: bool = False) -> Union[torch.IntTensor, torch.LongTensor]:
    """
    Returns the tokenized representation of given input string(s)
    Parameters
    ----------
    texts : Union[str, List[str]]
        An input string or a list of input strings to tokenize
    context_length : int
        The context length to use; all CLIP models use 77 as the context length
    truncate: bool
        Whether to truncate the text in case its encoding is longer than the context length
    Returns
    -------
    A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length].
    We return LongTensor when torch version is <1.8.0, since older index_select requires indices to be long.
    """
    if isinstance(texts, str):
        texts = [texts]

    sot_token = _tokenizer.encoder["<|startoftext|>"]
    eot_token = _tokenizer.encoder["<|endoftext|>"]
    all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts]
    if packaging.version.parse(torch.__version__) < packaging.version.parse("1.8.0"):
        result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
    else:
        result = torch.zeros(len(all_tokens), context_length, dtype=torch.int)

    for i, tokens in enumerate(all_tokens):
        if len(tokens) > context_length:
            if truncate:
                tokens = tokens[:context_length]
                tokens[-1] = eot_token
            else:
                raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
        result[i, :len(tokens)] = torch.tensor(tokens)

    return result
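Usage note (not part of the diff): the loader above caches checkpoints under cache/clip and pairs with `tokenize`; a short sketch (the first call downloads the ViT-B/16 weights):

# Load OpenAI CLIP and encode a couple of captions with the bundled BPE vocabulary.
import torch
from lavila.models import openai_clip

model, preprocess = openai_clip.load("ViT-B/16", device="cpu")
tokens = openai_clip.tokenize(["cut the sausage", "rinse cutting board"])   # (2, 77)
with torch.no_grad():
    text_feat = model.encode_text(tokens)
print(text_feat.shape)   # torch.Size([2, 512]) for ViT-B/16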
lavila/models/openai_model.py
ADDED
@@ -0,0 +1,535 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Part of the code is from https://github.com/openai/CLIP/blob/main/clip/model.py
# Modified by Yue Zhao
# The original code is under MIT License

from collections import OrderedDict
from typing import Tuple, Union
from einops import rearrange

import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from torch import nn
import pdb


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1):
        super().__init__()

        # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu1 = nn.ReLU(inplace=True)

        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.relu2 = nn.ReLU(inplace=True)

        self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()

        self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.relu3 = nn.ReLU(inplace=True)

        self.downsample = None
        self.stride = stride

        if stride > 1 or inplanes != planes * Bottleneck.expansion:
            # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
            self.downsample = nn.Sequential(OrderedDict([
                ("-1", nn.AvgPool2d(stride)),
                ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
                ("1", nn.BatchNorm2d(planes * self.expansion))
            ]))

    def forward(self, x: torch.Tensor):
        identity = x

        out = self.relu1(self.bn1(self.conv1(x)))
        out = self.relu2(self.bn2(self.conv2(out)))
        out = self.avgpool(out)
        out = self.bn3(self.conv3(out))

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu3(out)
        return out


class AttentionPool2d(nn.Module):
    def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
        super().__init__()
        self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
        self.num_heads = num_heads

    def forward(self, x):
        x = x.flatten(start_dim=2).permute(2, 0, 1)  # NCHW -> (HW)NC
        x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)  # (HW+1)NC
        x = x + self.positional_embedding[:, None, :].to(x.dtype)  # (HW+1)NC
        x, _ = F.multi_head_attention_forward(
            query=x[:1], key=x, value=x,
            embed_dim_to_check=x.shape[-1],
            num_heads=self.num_heads,
            q_proj_weight=self.q_proj.weight,
            k_proj_weight=self.k_proj.weight,
            v_proj_weight=self.v_proj.weight,
            in_proj_weight=None,
            in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
            bias_k=None,
            bias_v=None,
            add_zero_attn=False,
            dropout_p=0,
            out_proj_weight=self.c_proj.weight,
            out_proj_bias=self.c_proj.bias,
            use_separate_proj_weight=True,
            training=self.training,
            need_weights=False
        )
        return x.squeeze(0)


class ModifiedResNet(nn.Module):
    """
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer is a QKV attention instead of an average pool
    """

    def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
        super().__init__()
        self.output_dim = output_dim
        self.input_resolution = input_resolution

        # the 3-layer stem
        self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(width // 2)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(width // 2)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(width)
        self.relu3 = nn.ReLU(inplace=True)
        self.avgpool = nn.AvgPool2d(2)

        # residual layers
        self._inplanes = width  # this is a *mutable* variable used during construction
        self.layer1 = self._make_layer(width, layers[0])
        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

        embed_dim = width * 32  # the ResNet feature dimension
        self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)

    def _make_layer(self, planes, blocks, stride=1):
        layers = [Bottleneck(self._inplanes, planes, stride)]

        self._inplanes = planes * Bottleneck.expansion
        for _ in range(1, blocks):
            layers.append(Bottleneck(self._inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        def stem(x):
            x = self.relu1(self.bn1(self.conv1(x)))
            x = self.relu2(self.bn2(self.conv2(x)))
            x = self.relu3(self.bn3(self.conv3(x)))
            x = self.avgpool(x)
            return x

        x = x.type(self.conv1.weight.dtype)
        x = stem(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.attnpool(x)

        return x


class LayerNorm(nn.LayerNorm):
    """Subclass torch's LayerNorm to handle fp16."""

    def forward(self, x: torch.Tensor):
        orig_type = x.dtype
        ret = super().forward(x.type(torch.float32))
        return ret.type(orig_type)


class QuickGELU(nn.Module):
    def forward(self, x: torch.Tensor):
        return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):
    def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
        super().__init__()

        self.attn = nn.MultiheadAttention(d_model, n_head)
        self.ln_1 = nn.LayerNorm(d_model)  # used to be `models.transformer.LayerNorm`
        self.mlp = nn.Sequential(OrderedDict([
            ("c_fc", nn.Linear(d_model, d_model * 4)),
            ("gelu", QuickGELU()),
            ("c_proj", nn.Linear(d_model * 4, d_model))
        ]))
        self.ln_2 = nn.LayerNorm(d_model)  # used to be `models.transformer.LayerNorm`
        self.attn_mask = attn_mask

    def attention(self, x: torch.Tensor):
        self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
        return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

    def forward_part1(self, x):
        return self.attention(self.ln_1(x))

    def forward_part2(self, x):
        return self.mlp(self.ln_2(x))

    def forward(self, x: torch.Tensor, use_checkpoint=False):
        if use_checkpoint:
            x = x + checkpoint.checkpoint(self.forward_part1, x)
        else:
            x = x + self.forward_part1(x)

        if use_checkpoint:
            x = x + checkpoint.checkpoint(self.forward_part2, x)
        else:
            x = x + self.forward_part2(x)
        return x


class Transformer(nn.Module):
    def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, prompt_cfg={}, prompt_learner=None, prompt_generator=None):
        super().__init__()
        self.width = width
        self.layers = layers
        self.resblocks = nn.ModuleList([ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])
        self.num_tokens = prompt_cfg.pop('n_ctx', 0)
        self.use_bank = prompt_cfg.pop('use_bank', False)
        if self.num_tokens > 0:
            self.prompt_learner = prompt_learner
            self.prompt_generator = prompt_generator
            self.k_s = 0
            if self.prompt_generator is not None:
                if self.prompt_generator.use_bank:
                    self.k_s = len(self.prompt_generator.prompt_pool)
            self.prompt_inproj = None
            self.prompt_outproj = None

    def forward(self, x: torch.Tensor, pos_emb, use_checkpoint=False, istrain=False, gamma=1.0, eot=None):
        ps_loss = x.new_zeros([1])
        BZ = x.size(1)
        if not self.use_bank:
            if self.num_tokens > 0:
                ctx = self.prompt_learner()
                ctx = ctx.unsqueeze(1).expand(-1, BZ, -1)
                x = torch.cat((
                    x[:1, :, :],  # SOT
                    ctx,
                    x[1:, :, :]
                ), dim=0)
            x = x[:pos_emb.size(0)] + pos_emb.unsqueeze(1)

        for i, blk in enumerate(self.resblocks):
            if self.num_tokens > 0 and self.use_bank:
                k = self.num_tokens
                num_tokens = 0 if i == 0 else self.num_tokens
                x = torch.cat((x[:1, :, :], x[num_tokens+1:, :, :]), dim=0)
                query = self.prompt_inproj(x[eot, torch.arange(BZ), :].detach())
                if i < self.k_s:
                    out = self.prompt_generator.prompt_pool[i](query, k, istrain=istrain, gamma=gamma)
                    ctx = self.prompt_outproj(out['prompts'])
                    ctx = ctx.transpose(1, 0) + pos_emb.unsqueeze(1)[1:self.num_tokens+1, :]
                    ps_loss += out.get('ps_loss', 0)
                else:
                    ctx = self.prompt_learner()
                    ctx = ctx.unsqueeze(1).expand(-1, BZ, -1)
                    ctx = ctx + pos_emb.unsqueeze(1)[1:self.num_tokens+1, :]

                x = torch.cat((
                    x[:1, :, :],  # SOT
                    ctx,
                    x[1:, :, :]
                ), dim=0)
                x = x[:pos_emb.size(0)]

            if use_checkpoint:
                x = checkpoint.checkpoint(blk, x)
            else:
                x = blk(x)

        return x, ps_loss


class VisionTransformer(nn.Module):
    def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int):
        super().__init__()
        self.input_resolution = input_resolution
        self.output_dim = output_dim
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)

        scale = width ** -0.5
        self.class_embedding = nn.Parameter(scale * torch.randn(width))
        self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
        self.ln_pre = LayerNorm(width)

        self.transformer = Transformer(width, layers, heads)

        self.ln_post = LayerNorm(width)
        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

    def forward(self, x: torch.Tensor, apply_project=True, use_checkpoint=False, cls_at_last=True):
        x = self.conv1(x)  # shape = [*, width, grid, grid]
        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
        x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1)  # shape = [*, grid ** 2 + 1, width]
        x = x + self.positional_embedding.to(x.dtype)
        x = self.ln_pre(x)

        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x, use_checkpoint=use_checkpoint)
        x = x.permute(1, 0, 2)  # LND -> NLD

        if cls_at_last:
            x = self.ln_post(x[:, 0, :])

            if self.proj is not None and apply_project:
                x = x @ self.proj

            return x
        else:
            return x[:, 1:, :]


class CLIP(nn.Module):
    def __init__(self,
                 embed_dim: int,
                 # vision
                 image_resolution: int,
                 vision_layers: Union[Tuple[int, int, int, int], int],
                 vision_width: int,
                 vision_patch_size: int,
                 # text
                 context_length: int,
                 vocab_size: int,
                 transformer_width: int,
                 transformer_heads: int,
                 transformer_layers: int
                 ):
        super().__init__()

        self.context_length = context_length

        if isinstance(vision_layers, (tuple, list)):
            vision_heads = vision_width * 32 // 64
            self.visual = ModifiedResNet(
                layers=vision_layers,
                output_dim=embed_dim,
                heads=vision_heads,
                input_resolution=image_resolution,
                width=vision_width
            )
        else:
            vision_heads = vision_width // 64
            self.visual = VisionTransformer(
                input_resolution=image_resolution,
                patch_size=vision_patch_size,
                width=vision_width,
                layers=vision_layers,
                heads=vision_heads,
                output_dim=embed_dim
            )

        self.transformer = Transformer(
            width=transformer_width,
            layers=transformer_layers,
            heads=transformer_heads,
            attn_mask=self.build_attention_mask()
        )

        self.vocab_size = vocab_size
        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
        self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
        self.ln_final = LayerNorm(transformer_width)

        self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

        self.initialize_parameters()

    def initialize_parameters(self):
        nn.init.normal_(self.token_embedding.weight, std=0.02)
        nn.init.normal_(self.positional_embedding, std=0.01)

        if isinstance(self.visual, ModifiedResNet):
            if self.visual.attnpool is not None:
                std = self.visual.attnpool.c_proj.in_features ** -0.5
                nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
                nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
                nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
                nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)

            for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
                for name, param in resnet_block.named_parameters():
                    if name.endswith("bn3.weight"):
                        nn.init.zeros_(param)

        proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
        attn_std = self.transformer.width ** -0.5
        fc_std = (2 * self.transformer.width) ** -0.5
        for block in self.transformer.resblocks:
            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)

        if self.text_projection is not None:
            nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)

    def build_attention_mask(self):
        # lazily create causal attention mask, with full attention between the vision tokens
        # pytorch uses additive attention mask; fill with -inf
        mask = torch.empty(self.context_length, self.context_length)
        mask.fill_(float("-inf"))
        mask.triu_(1)  # zero out the lower diagonal
        return mask

    @property
    def dtype(self):
        return self.visual.conv1.weight.dtype

    def encode_image(self, image, apply_project=True, use_checkpoint=False):
        if image.ndim == 4:
            return self.visual(image.type(self.dtype))
        else:
            image = image.permute(0, 2, 1, 3, 4)  # BCTHW -> BTCHW
            bb, tt, _, _, _ = image.shape
            x = self.visual(image.reshape(-1, *image.shape[2:]), apply_project=apply_project, use_checkpoint=use_checkpoint)  # ND
            x = x.view(bb, tt, -1)
            image_features = x.mean(1)
            # image_features = x.max(1).values
            return image_features

    def encode_text(self, text, use_checkpoint=False):
        x = self.token_embedding(text).type(self.dtype)  # [batch_size, n_ctx, d_model]

        x = x + self.positional_embedding.type(self.dtype)
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x, use_checkpoint=use_checkpoint)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x).type(self.dtype)

        # x.shape = [batch_size, n_ctx, transformer.width]
        # take features from the eot embedding (eot_token is the highest number in each sequence)
        x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection

        return x

    def forward(self, image, text, use_checkpoint=False, norm_embed=True):
        image_features = self.encode_image(image, use_checkpoint=use_checkpoint)
        text_features = self.encode_text(text, use_checkpoint=use_checkpoint)

        # normalized features
        image_features = image_features / image_features.norm(dim=1, keepdim=True)
        text_features = text_features / text_features.norm(dim=1, keepdim=True)

        # # cosine similarity as logits
        # logit_scale = self.logit_scale.exp()
        # logits_per_image = logit_scale * image_features @ text_features.t()
        # logits_per_text = logits_per_image.t()

        # # shape = [global_batch_size, global_batch_size]
        # return logits_per_image, logits_per_text

        return {'image_embed': image_features,
|
466 |
+
'text_embed': text_features,
|
467 |
+
'logit_scale': self.logit_scale.exp()}
|
468 |
+
|
469 |
+
|
470 |
+
def convert_weights(model: nn.Module):
|
471 |
+
"""Convert applicable model parameters to fp16"""
|
472 |
+
|
473 |
+
def _convert_weights_to_fp16(l):
|
474 |
+
if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
|
475 |
+
l.weight.data = l.weight.data.half()
|
476 |
+
if l.bias is not None:
|
477 |
+
l.bias.data = l.bias.data.half()
|
478 |
+
|
479 |
+
if isinstance(l, nn.MultiheadAttention):
|
480 |
+
for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
|
481 |
+
tensor = getattr(l, attr)
|
482 |
+
if tensor is not None:
|
483 |
+
tensor.data = tensor.data.half()
|
484 |
+
|
485 |
+
for name in ["text_projection", "proj"]:
|
486 |
+
if hasattr(l, name):
|
487 |
+
attr = getattr(l, name)
|
488 |
+
if attr is not None:
|
489 |
+
attr.data = attr.data.half()
|
490 |
+
|
491 |
+
model.apply(_convert_weights_to_fp16)
|
492 |
+
|
493 |
+
|
494 |
+
def build_model(state_dict: dict):
|
495 |
+
vit = "visual.proj" in state_dict
|
496 |
+
|
497 |
+
if vit:
|
498 |
+
vision_width = state_dict["visual.conv1.weight"].shape[0]
|
499 |
+
vision_layers = len(
|
500 |
+
[k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")]
|
501 |
+
)
|
502 |
+
vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
|
503 |
+
grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
|
504 |
+
image_resolution = vision_patch_size * grid_size
|
505 |
+
else:
|
506 |
+
counts: list = [
|
507 |
+
len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]
|
508 |
+
]
|
509 |
+
vision_layers = tuple(counts)
|
510 |
+
vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
|
511 |
+
output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
|
512 |
+
vision_patch_size = None
|
513 |
+
assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
|
514 |
+
image_resolution = output_width * 32
|
515 |
+
|
516 |
+
embed_dim = state_dict["text_projection"].shape[1]
|
517 |
+
context_length = state_dict["positional_embedding"].shape[0]
|
518 |
+
vocab_size = state_dict["token_embedding.weight"].shape[0]
|
519 |
+
transformer_width = state_dict["ln_final.weight"].shape[0]
|
520 |
+
transformer_heads = transformer_width // 64
|
521 |
+
transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith("transformer.resblocks")))
|
522 |
+
|
523 |
+
model = CLIP(
|
524 |
+
embed_dim,
|
525 |
+
image_resolution, vision_layers, vision_width, vision_patch_size,
|
526 |
+
context_length, vocab_size, transformer_width, transformer_heads, transformer_layers
|
527 |
+
)
|
528 |
+
|
529 |
+
for key in ["input_resolution", "context_length", "vocab_size"]:
|
530 |
+
if key in state_dict:
|
531 |
+
del state_dict[key]
|
532 |
+
|
533 |
+
convert_weights(model)
|
534 |
+
model.load_state_dict(state_dict)
|
535 |
+
return model.eval()
|
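For orientation, the retrieval interface of this CLIP wrapper is small: `encode_image` accepts either images or 5-D video tensors (B, C, T, H, W) and mean-pools the per-frame features, `encode_text` takes the feature at the EOT position, and `forward` returns L2-normalized embeddings together with the learned `logit_scale`. The sketch below shows how those outputs turn into a similarity matrix; the tiny hyperparameters and random inputs are illustrative assumptions, not the configuration this Space actually loads.

```python
# Illustrative only: toy hyperparameters and random inputs, not the repo's real config.
import torch
from lavila.models.openai_model import CLIP

model = CLIP(
    embed_dim=64,
    image_resolution=32, vision_layers=2, vision_width=64, vision_patch_size=16,
    context_length=77, vocab_size=49408,
    transformer_width=64, transformer_heads=1, transformer_layers=2,
).eval()

video = torch.randn(2, 3, 4, 32, 32)          # B x C x T x H x W
text = torch.zeros(2, 77, dtype=torch.long)   # stand-in for tokenized captions
text[:, 0], text[:, 1] = 49406, 49407         # <|startoftext|>, <|endoftext|>

with torch.no_grad():
    out = model(video, text)
    # cosine similarity scaled by the learned temperature
    sim = out['logit_scale'] * out['image_embed'] @ out['text_embed'].t()
print(sim.shape)  # torch.Size([2, 2])
```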
lavila/models/prompt_tuning.py
ADDED
@@ -0,0 +1,291 @@
1 |
+
|
2 |
+
import math
|
3 |
+
from functools import reduce
|
4 |
+
from operator import mul
|
5 |
+
from einops import rearrange, repeat
|
6 |
+
import pdb
|
7 |
+
|
8 |
+
import torch
|
9 |
+
import torch.nn as nn
|
10 |
+
import torch.nn.functional as F
|
11 |
+
|
12 |
+
class PromptLearner(nn.Module):
|
13 |
+
def __init__(self, ctx_dim=512, n_ctx=16):
|
14 |
+
super(PromptLearner, self).__init__()
|
15 |
+
self.n_ctx = n_ctx
|
16 |
+
self.ctx_dim = ctx_dim
|
17 |
+
|
18 |
+
# initialize prompts
|
19 |
+
ctx_vectors = torch.empty(n_ctx, ctx_dim)
|
20 |
+
nn.init.normal_(ctx_vectors, std=0.02)
|
21 |
+
prompt_prefix = " ".join(["X"] * n_ctx)
|
22 |
+
self.ctx = nn.Parameter(ctx_vectors) # to be optimized
|
23 |
+
print(f'Initial context: "{prompt_prefix}"')
|
24 |
+
print(f"Number of context words (tokens): {n_ctx}")
|
25 |
+
|
26 |
+
def forward(self):
|
27 |
+
return self.ctx
|
28 |
+
|
29 |
+
class PromptPoolLearner(nn.Module):
|
30 |
+
def __init__(self, prompt_dim=256, size=128, length=1):
|
31 |
+
super(PromptPoolLearner, self).__init__()
|
32 |
+
self.prompt_dim = prompt_dim
|
33 |
+
self.length = length
|
34 |
+
self.size = size
|
35 |
+
|
36 |
+
# initiate prompt
|
37 |
+
self.prompt_values = nn.Parameter(torch.zeros(size, length, prompt_dim))
|
38 |
+
self.id_table = torch.ones([size]).cuda()
|
39 |
+
|
40 |
+
# xavier_uniform initialization
|
41 |
+
nn.init.uniform_(self.prompt_values.data, -1, 1)
|
42 |
+
|
43 |
+
def l2_normalize(self, x, dim=None, epsilon=1e-12):
|
44 |
+
"""Normalizes a given vector or matrix."""
|
45 |
+
square_sum = torch.sum(x ** 2, dim=dim, keepdim=True)
|
46 |
+
x_inv_norm = torch.rsqrt(torch.maximum(square_sum, torch.tensor(epsilon, device=x.device)))
|
47 |
+
return x * x_inv_norm
|
48 |
+
|
49 |
+
def forward(self, query, k=0, istrain=False, gamma=1.0):
|
50 |
+
BZ = query.shape[0]
|
51 |
+
out = dict()
|
52 |
+
query = self.l2_normalize(query.squeeze(1), dim=1)
|
53 |
+
keys = self.prompt_values.mean(dim=1)
|
54 |
+
keys = self.l2_normalize(keys, dim=1)
|
55 |
+
similarity = torch.matmul(query, keys.t())
|
56 |
+
|
57 |
+
if k > 0 and k < self.size:
|
58 |
+
|
59 |
+
if istrain:
|
60 |
+
inv_freq = self.id_table.sum() / self.id_table.float()
|
61 |
+
weights = (similarity + 1) / 2 * gamma + (1 - gamma) * torch.softmax(inv_freq, dim=-1)
|
62 |
+
idx = torch.multinomial(weights, k, replacement=False)
|
63 |
+
else:
|
64 |
+
idx = torch.argsort(similarity, dim=-1, descending=True)[:, :k]
|
65 |
+
|
66 |
+
prompt_id, id_counts = torch.unique(idx, return_counts=True, sorted=True)
|
67 |
+
self.id_table[prompt_id] += id_counts
|
68 |
+
prompts = self.prompt_values[idx.flatten(), ...].view(BZ, k * self.length, self.prompt_dim)
|
69 |
+
else:
|
70 |
+
idx = torch.arange(self.size).unsqueeze(0).expand(BZ, -1)
|
71 |
+
prompts = self.prompt_values.flatten(0, 1).unsqueeze(0).expand(BZ, -1, -1)
|
72 |
+
|
73 |
+
prompts = self.l2_normalize(prompts, dim=-1)
|
74 |
+
out['prompts'] = prompts
|
75 |
+
sel_sim = similarity[torch.arange(BZ).view(-1, 1), idx]
|
76 |
+
sel_key = keys[idx.flatten(), ...].view(BZ, k, self.prompt_dim)
|
77 |
+
diff = F.mse_loss((sel_sim.unsqueeze(1) @ sel_key).squeeze(1), query.detach(), reduction='sum') / BZ
|
78 |
+
ksim = torch.sum(torch.abs(torch.matmul(keys, keys.t()) - torch.eye(self.size).to(keys.device))) / BZ
|
79 |
+
out['ps_loss'] = diff + ksim
|
80 |
+
|
81 |
+
return out
|
82 |
+
|
83 |
+
|
84 |
+
class VisualPromptLearner(nn.Module):
|
85 |
+
def __init__(self, patch_size=16, embed_dim=768, num_layers=12, prompt_dim=256, num_tokens=5, deep=False,
|
86 |
+
deep_shared=False, split_st=False, dropout=0.1, pool={}):
|
87 |
+
super(VisualPromptLearner, self).__init__()
|
88 |
+
self.num_layers = num_layers
|
89 |
+
self.embed_dim = embed_dim
|
90 |
+
self.prompt_dim = prompt_dim
|
91 |
+
self.num_tokens = num_tokens # number of prompted tokens
|
92 |
+
self.prompt_dropout = nn.Dropout(dropout)
|
93 |
+
pool_size = pool.get('size', 0)
|
94 |
+
self.pool_length = pool.get('length', 1)
|
95 |
+
self.use_bank = True if pool_size > 0 and num_tokens <= (pool_size * self.pool_length) else False
|
96 |
+
if self.use_bank:
|
97 |
+
print(f'Using feature bank with size {pool_size} (dimension: {prompt_dim})')
|
98 |
+
|
99 |
+
if prompt_dim != embed_dim:
|
100 |
+
self.prompt_inproj = nn.Linear(embed_dim, prompt_dim, bias=False)
|
101 |
+
else:
|
102 |
+
self.prompt_inproj = nn.Identity()
|
103 |
+
|
104 |
+
if self.use_bank:
|
105 |
+
self.prompt_outproj = nn.Linear(prompt_dim, embed_dim, bias=False)
|
106 |
+
nn.init.kaiming_normal_(
|
107 |
+
self.prompt_outproj.weight, a=0, mode='fan_out')
|
108 |
+
else:
|
109 |
+
self.prompt_outproj = nn.Identity()
|
110 |
+
|
111 |
+
self.split_st = split_st # split spatial and temporal prompts
|
112 |
+
|
113 |
+
# initiate prompt:
|
114 |
+
val = math.sqrt(6. / float(3 * reduce(mul, (patch_size, patch_size), 1) + prompt_dim))
|
115 |
+
if split_st:
|
116 |
+
if self.use_bank:
|
117 |
+
pool['size'] //= 2
|
118 |
+
self.spatial_prompt_pool = PromptPoolLearner(prompt_dim, **pool)
|
119 |
+
self.temporal_prompt_pool = PromptPoolLearner(prompt_dim, **pool)
|
120 |
+
else:
|
121 |
+
self.spatial_prompt_embeddings = nn.Parameter(torch.zeros(
|
122 |
+
1, num_tokens // 2, prompt_dim))
|
123 |
+
self.temporal_prompt_embeddings = nn.Parameter(torch.zeros(
|
124 |
+
1, num_tokens // 2, prompt_dim))
|
125 |
+
# xavier_uniform initialization
|
126 |
+
nn.init.uniform_(self.spatial_prompt_embeddings.data, -val, val)
|
127 |
+
nn.init.uniform_(self.temporal_prompt_embeddings.data, -val, val)
|
128 |
+
else:
|
129 |
+
if self.use_bank:
|
130 |
+
self.prompt_pool = PromptPoolLearner(prompt_dim, **pool)
|
131 |
+
else:
|
132 |
+
self.prompt_embeddings = nn.Parameter(torch.zeros(
|
133 |
+
1, num_tokens, prompt_dim))
|
134 |
+
# xavier_uniform initialization
|
135 |
+
nn.init.uniform_(self.prompt_embeddings.data, -val, val)
|
136 |
+
|
137 |
+
self.deep = deep or deep_shared
|
138 |
+
self.deep_shared = deep_shared
|
139 |
+
if deep and (not deep_shared):
|
140 |
+
total_d_layer = num_layers - 1
|
141 |
+
if split_st:
|
142 |
+
if self.use_bank:
|
143 |
+
self.spatial_deep_prompt_pool = nn.ModuleList([
|
144 |
+
PromptPoolLearner(prompt_dim, **pool)
|
145 |
+
for i in range(total_d_layer)])
|
146 |
+
self.temporal_deep_prompt_pool = nn.ModuleList([
|
147 |
+
PromptPoolLearner(prompt_dim, **pool)
|
148 |
+
for i in range(total_d_layer)])
|
149 |
+
else:
|
150 |
+
self.spatial_deep_prompt_embeddings = nn.Parameter(torch.zeros(
|
151 |
+
total_d_layer, num_tokens // 2, prompt_dim))
|
152 |
+
self.temporal_deep_prompt_embeddings = nn.Parameter(torch.zeros(
|
153 |
+
total_d_layer, num_tokens // 2, prompt_dim))
|
154 |
+
# xavier_uniform initialization
|
155 |
+
nn.init.uniform_(self.spatial_deep_prompt_embeddings.data, -val, val)
|
156 |
+
nn.init.uniform_(self.temporal_deep_prompt_embeddings.data, -val, val)
|
157 |
+
else:
|
158 |
+
if self.use_bank:
|
159 |
+
self.deep_prompt_pool = nn.ModuleList([
|
160 |
+
PromptPoolLearner(prompt_dim, **pool)
|
161 |
+
for i in range(total_d_layer)])
|
162 |
+
else:
|
163 |
+
self.deep_prompt_embeddings = nn.Parameter(torch.zeros(
|
164 |
+
total_d_layer, num_tokens, prompt_dim))
|
165 |
+
# xavier_uniform initialization
|
166 |
+
nn.init.uniform_(self.deep_prompt_embeddings.data, -val, val)
|
167 |
+
|
168 |
+
def forward(self, query=None, layer=0, istrain=False, gamma=1.0):
|
169 |
+
query = query.detach()
|
170 |
+
query = self.prompt_inproj(query)
|
171 |
+
ps_loss = query.new_zeros([1])
|
172 |
+
if self.split_st:
|
173 |
+
if self.deep and (not self.deep_shared) and layer > 0:
|
174 |
+
if self.use_bank:
|
175 |
+
k = (self.num_tokens // 2) // self.pool_length
|
176 |
+
spatial_out = self.spatial_deep_prompt_pool[layer-1](query, k, istrain, gamma)
|
177 |
+
spatial_prompts = spatial_out['prompts']
|
178 |
+
temporal_out = self.temporal_deep_prompt_pool[layer-1](query, k, istrain, gamma)
|
179 |
+
temporal_prompts = temporal_out['prompts']
|
180 |
+
ps_loss += spatial_out.get('ps_loss', 0) + temporal_out.get('ps_loss', 0)
|
181 |
+
else:
|
182 |
+
spatial_prompts = self.spatial_deep_prompt_embeddings[layer-1]
|
183 |
+
temporal_prompts = self.temporal_deep_prompt_embeddings[layer-1]
|
184 |
+
else:
|
185 |
+
if self.use_bank:
|
186 |
+
k = (self.num_tokens // 2) // self.pool_length
|
187 |
+
spatial_out = self.spatial_prompt_pool(query, k, istrain, gamma)
|
188 |
+
spatial_prompts = spatial_out['prompts']
|
189 |
+
temporal_out = self.temporal_prompt_pool(query, k, istrain, gamma)
|
190 |
+
temporal_prompts = temporal_out['prompts']
|
191 |
+
ps_loss += spatial_out.get('ps_loss', 0) + temporal_out.get('ps_loss', 0)
|
192 |
+
else:
|
193 |
+
spatial_prompts = self.spatial_prompt_embeddings
|
194 |
+
temporal_prompts = self.temporal_prompt_embeddings
|
195 |
+
|
196 |
+
prompts = torch.cat((spatial_prompts, temporal_prompts), dim=1)
|
197 |
+
|
198 |
+
else:
|
199 |
+
if self.deep and (not self.deep_shared) and layer > 0:
|
200 |
+
if self.use_bank:
|
201 |
+
k = self.num_tokens // self.pool_length
|
202 |
+
out = self.deep_prompt_pool[layer-1](query, k, istrain, gamma)
|
203 |
+
prompts = out['prompts']
|
204 |
+
ps_loss += out.get('ps_loss', 0)
|
205 |
+
else:
|
206 |
+
prompts = self.deep_prompt_embeddings[layer-1]
|
207 |
+
else:
|
208 |
+
if self.use_bank:
|
209 |
+
k = self.num_tokens // self.pool_length
|
210 |
+
out = self.prompt_pool(query, k, istrain, gamma)
|
211 |
+
prompts = out['prompts']
|
212 |
+
ps_loss += out.get('ps_loss', 0)
|
213 |
+
else:
|
214 |
+
prompts = self.prompt_embeddings
|
215 |
+
|
216 |
+
prompts = self.prompt_dropout(self.prompt_outproj(prompts))
|
217 |
+
return prompts, ps_loss
|
218 |
+
|
219 |
+
|
220 |
+
class CMM(nn.Module):
|
221 |
+
'''Context modeling module'''
|
222 |
+
def __init__(self, num_tokens=8, num_frames=16, embed_dim=768, prompt_dim=256, dropout=0., num_layer=1, shared=False, pool={}):
|
223 |
+
super(CMM, self).__init__()
|
224 |
+
self.num_tokens = num_tokens
|
225 |
+
self.num_frames = num_frames
|
226 |
+
self.embed_dim = embed_dim
|
227 |
+
self.prompt_dim = prompt_dim
|
228 |
+
self.pool_size = pool.get('size', 0)
|
229 |
+
self.pool_length = pool.get('length', 1)
|
230 |
+
self.use_bank = True if self.pool_size > 0 else False
|
231 |
+
self.use_rnn = not self.use_bank
|
232 |
+
if self.use_rnn:
|
233 |
+
self.rnn = nn.LSTM(input_size=embed_dim, hidden_size=embed_dim,
|
234 |
+
num_layers=1, batch_first=True, dropout=dropout, bidirectional=True)
|
235 |
+
self.shared = shared
|
236 |
+
self.prompt_dropout = nn.Dropout(dropout)
|
237 |
+
|
238 |
+
if self.use_bank:
|
239 |
+
print(f'Using feature bank with size {self.pool_size} (dimension: {prompt_dim})')
|
240 |
+
if self.use_rnn:
|
241 |
+
self.prompt_inproj = nn.Linear(embed_dim * 2, prompt_dim)
|
242 |
+
nn.init.kaiming_normal_(
|
243 |
+
self.prompt_inproj.weight, a=0, mode='fan_out')
|
244 |
+
else:
|
245 |
+
if embed_dim != prompt_dim:
|
246 |
+
self.prompt_inproj = nn.Linear(embed_dim, prompt_dim, bias=False)
|
247 |
+
else:
|
248 |
+
self.prompt_inproj = nn.Identity()
|
249 |
+
|
250 |
+
self.prompt_outproj = nn.Linear(prompt_dim, embed_dim, bias=False)
|
251 |
+
nn.init.kaiming_normal_(
|
252 |
+
self.prompt_outproj.weight, a=0, mode='fan_out')
|
253 |
+
|
254 |
+
if shared:
|
255 |
+
self.prompt_pool = PromptPoolLearner(prompt_dim, **pool)
|
256 |
+
else:
|
257 |
+
self.prompt_pool = nn.ModuleList([
|
258 |
+
PromptPoolLearner(prompt_dim, **pool)
|
259 |
+
for i in range(num_layer)])
|
260 |
+
else:
|
261 |
+
self.fc = nn.Linear(embed_dim * 2, embed_dim * num_tokens)
|
262 |
+
|
263 |
+
def forward(self, x, layer=0, istrain=False, gamma=1.0):
|
264 |
+
BZ = x.size(0)
|
265 |
+
x = x.detach()
|
266 |
+
x = rearrange(x, 'b (f n) d -> b f n d', f=self.num_frames)
|
267 |
+
x = torch.mean(x, dim=2)
|
268 |
+
|
269 |
+
if self.use_rnn:
|
270 |
+
x, _ = self.rnn(x)
|
271 |
+
|
272 |
+
ps_loss = x.new_zeros([1])
|
273 |
+
if self.use_bank:
|
274 |
+
query = self.prompt_inproj(x).flatten(0, 1)
|
275 |
+
k = self.num_tokens // self.pool_length
|
276 |
+
if self.shared:
|
277 |
+
out = self.prompt_pool(query, k, istrain, gamma)
|
278 |
+
else:
|
279 |
+
out = self.prompt_pool[layer](query, k, istrain, gamma)
|
280 |
+
|
281 |
+
prompts = rearrange(out['prompts'], '(b f) p d -> b (f p) d', f=self.num_frames)
|
282 |
+
prompts = self.prompt_outproj(prompts)
|
283 |
+
ps_loss += out.get('ps_loss', 0) * self.num_frames
|
284 |
+
|
285 |
+
else:
|
286 |
+
prompts = self.fc(x)
|
287 |
+
prompts = rearrange(prompts, 'b f (p d) -> b (f p) d', p=self.num_tokens)
|
288 |
+
|
289 |
+
return prompts, ps_loss
|
290 |
+
|
291 |
+
|
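`PromptPoolLearner` is the prompt-bank lookup used by the prompt modules above: each bank entry's key is the mean of its `length` prompt vectors, keys and query are L2-normalized, and the top-k most similar entries are gathered (during training they are instead sampled with an extra usage-balancing weight, and a query-alignment plus key-decorrelation penalty is returned as `ps_loss`). The snippet below re-implements only the inference-time selection step with made-up shapes; it deliberately avoids instantiating the class itself, whose usage table is allocated with `.cuda()` and therefore assumes a GPU.

```python
# Standalone illustration of the top-k prompt selection (shapes are made up).
import torch
import torch.nn.functional as F

B, pool_size, length, prompt_dim, k = 4, 16, 1, 256, 3

prompt_values = torch.randn(pool_size, length, prompt_dim)   # the learnable bank
query = torch.randn(B, prompt_dim)                           # e.g. frame/CLS features

keys = F.normalize(prompt_values.mean(dim=1), dim=1)         # one key per bank entry
q = F.normalize(query, dim=1)
similarity = q @ keys.t()                                    # B x pool_size

idx = similarity.argsort(dim=-1, descending=True)[:, :k]     # eval-time: plain top-k
prompts = prompt_values[idx.flatten()].view(B, k * length, prompt_dim)
print(prompts.shape)  # torch.Size([4, 3, 256])
```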
lavila/models/timesformer.py
ADDED
@@ -0,0 +1,650 @@
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
|
4 |
+
# This source code is licensed under the license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
|
7 |
+
# Part of the code is from https://github.com/m-bain/frozen-in-time/blob/main/model/video_transformer.py
|
8 |
+
# Modified by Yue Zhao
|
9 |
+
# The original code is under MIT License
|
10 |
+
|
11 |
+
"""
|
12 |
+
Implementations of Video Transformers in PyTorch
|
13 |
+
A PyTorch implementation of space-time transformer as described in
|
14 |
+
'Frozen in Time: A Joint Image and Video Encoder for End-to-End Retrieval' - https://arxiv.org/abs/2104.00650
|
15 |
+
A PyTorch implementation of timesformer as described in
|
16 |
+
'Is Space-Time Attention All You Need for Video Understanding?' - https://arxiv.org/abs/2102.05095
|
17 |
+
Acknowledgments:
|
18 |
+
- This code builds on Ross Wightman's vision_transformer code in pytorch-image-models:
|
19 |
+
https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
|
20 |
+
- It is also inspired by lucidrains timesformer implementation:
|
21 |
+
https://github.com/lucidrains/TimeSformer-pytorch
|
22 |
+
Hacked together by Max Bain
|
23 |
+
"""
|
24 |
+
|
25 |
+
from collections import OrderedDict, defaultdict
|
26 |
+
from functools import partial, reduce
|
27 |
+
import operator
|
28 |
+
import copy
|
29 |
+
|
30 |
+
import torch
|
31 |
+
import torch.utils.checkpoint as checkpoint
|
32 |
+
from einops import rearrange, repeat
|
33 |
+
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
|
34 |
+
from torch import einsum, nn
|
35 |
+
import torch.nn.functional as F
|
36 |
+
import pdb
|
37 |
+
|
38 |
+
from lavila.models.prompt_tuning import VisualPromptLearner, CMM
|
39 |
+
|
40 |
+
|
41 |
+
def attn(q, k, v):
|
42 |
+
sim = einsum('b i d, b j d -> b i j', q, k)
|
43 |
+
attn = sim.softmax(dim=-1)
|
44 |
+
out = einsum('b i j, b j d -> b i d', attn, v)
|
45 |
+
return out
|
46 |
+
|
47 |
+
|
48 |
+
class Mlp(nn.Module):
|
49 |
+
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
|
50 |
+
super().__init__()
|
51 |
+
out_features = out_features or in_features
|
52 |
+
hidden_features = hidden_features or in_features
|
53 |
+
self.fc1 = nn.Linear(in_features, hidden_features)
|
54 |
+
self.act = act_layer()
|
55 |
+
self.fc2 = nn.Linear(hidden_features, out_features)
|
56 |
+
self.drop = nn.Dropout(drop)
|
57 |
+
|
58 |
+
def forward(self, x):
|
59 |
+
x = self.fc1(x)
|
60 |
+
x = self.act(x)
|
61 |
+
x = self.drop(x)
|
62 |
+
x = self.fc2(x)
|
63 |
+
x = self.drop(x)
|
64 |
+
return x
|
65 |
+
|
66 |
+
|
67 |
+
class VideoPatchEmbed(nn.Module):
|
68 |
+
""" Video to Patch Embedding
|
69 |
+
"""
|
70 |
+
|
71 |
+
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768,
|
72 |
+
num_frames=8, ln_pre=False):
|
73 |
+
super().__init__()
|
74 |
+
img_size = to_2tuple(img_size)
|
75 |
+
patch_size = to_2tuple(patch_size)
|
76 |
+
num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) * num_frames
|
77 |
+
self.img_size = img_size
|
78 |
+
self.patch_size = patch_size
|
79 |
+
self.num_patches = num_patches
|
80 |
+
self.num_frames = num_frames
|
81 |
+
self.embed_dim = embed_dim
|
82 |
+
# ln_pre is inserted to be compatible with CLIP-style model
|
83 |
+
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=not ln_pre)
|
84 |
+
|
85 |
+
def forward(self, x):
|
86 |
+
B, F, C, H, W = x.shape
|
87 |
+
assert F <= self.num_frames
|
88 |
+
x = x.view(-1, C, H, W)
|
89 |
+
x = self.proj(x)
|
90 |
+
return x
|
91 |
+
|
92 |
+
|
93 |
+
class VarAttention(nn.Module):
|
94 |
+
def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.,
|
95 |
+
initialize='random', num_tokens=0):
|
96 |
+
super().__init__()
|
97 |
+
self.num_heads = num_heads
|
98 |
+
head_dim = dim // num_heads
|
99 |
+
# NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
|
100 |
+
self.scale = qk_scale or head_dim ** -0.5
|
101 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
102 |
+
self.proj = nn.Linear(dim, dim)
|
103 |
+
if initialize == 'zeros':
|
104 |
+
self.qkv.weight.data.fill_(0)
|
105 |
+
self.qkv.bias.data.fill_(0)
|
106 |
+
# fill proj weight with 1 here to improve training dynamics. Otherwise temporal attention inputs
|
107 |
+
# are multiplied by 0*0, which is hard for the model to move out of.
|
108 |
+
self.proj.weight.data.fill_(1)
|
109 |
+
self.proj.bias.data.fill_(0)
|
110 |
+
self.attn_drop = nn.Dropout(attn_drop)
|
111 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
112 |
+
self.num_tokens = num_tokens
|
113 |
+
|
114 |
+
def forward(self, x, einops_from, einops_to, einops_dims, cfg):
|
115 |
+
style = cfg.get('style', 'default')
|
116 |
+
pt_att = cfg.get('pt_att', True)
|
117 |
+
n_seg = cfg.get('n_seg', 4)
|
118 |
+
if 'VoP' in style:
|
119 |
+
return self.forward_VoP(x, einops_from, einops_to, einops_dims, n_seg)
|
120 |
+
elif style == 'attall':
|
121 |
+
return self.forward_attall(x, pt_att)
|
122 |
+
else:
|
123 |
+
return self.forward_features(x, einops_from, einops_to, einops_dims, pt_att)
|
124 |
+
|
125 |
+
def forward_features(self, x, einops_from, einops_to, einops_dims, pt_att=True):
|
126 |
+
h = self.num_heads
|
127 |
+
num_tokens = self.num_tokens
|
128 |
+
if self.num_tokens > 0 and not pt_att:
|
129 |
+
prompts = x[:, 1:self.num_tokens+1, :]
|
130 |
+
x = torch.cat((
|
131 |
+
x[:, :1, :], # cls_token
|
132 |
+
x[:, self.num_tokens+1:, :] # patch embeddings
|
133 |
+
), dim=1)
|
134 |
+
num_tokens = 0
|
135 |
+
|
136 |
+
# project x to q, k, v values
|
137 |
+
q, k, v = self.qkv(x).chunk(3, dim=-1)
|
138 |
+
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
|
139 |
+
|
140 |
+
q *= self.scale
|
141 |
+
|
142 |
+
# splice out CLS token at index 1 (and prompts)
|
143 |
+
(cls_q, q_), (cls_k, k_), (cls_v, v_) = map(lambda t: (t[:, 0:num_tokens+1], t[:, num_tokens+1:]), (q, k, v)) # Bh x () x d
|
144 |
+
|
145 |
+
# let CLS token attend to key / values of all patches across time and space
|
146 |
+
cls_out = attn(cls_q, k, v) # Bh x (1 + p) x d
|
147 |
+
# rearrange across time or space
|
148 |
+
q_, k_, v_ = map(lambda t: rearrange(t, f'{einops_from} -> {einops_to}', **einops_dims), (q_, k_, v_)) # Bh x NT x d -> Bhr x s x d
|
149 |
+
|
150 |
+
# expand cls token keys and values across time or space and concat
|
151 |
+
r = q_.shape[0] // cls_k.shape[0]
|
152 |
+
cls_k, cls_v = map(lambda t: repeat(t, 'b p d -> (b r) p d', r=r), (cls_k, cls_v)) # Bhr x (1 + p) x d
|
153 |
+
k_ = torch.cat((cls_k, k_), dim=1)
|
154 |
+
v_ = torch.cat((cls_v, v_), dim=1)
|
155 |
+
|
156 |
+
# attention
|
157 |
+
out = attn(q_, k_, v_)
|
158 |
+
|
159 |
+
# merge back time or space
|
160 |
+
out = rearrange(out, f'{einops_to} -> {einops_from}', **einops_dims) # Bh x NT x d
|
161 |
+
|
162 |
+
# concat back the cls token
|
163 |
+
out = torch.cat((cls_out, out), dim=1) # Bh x (1 + p + NT) x d
|
164 |
+
|
165 |
+
# merge back the heads
|
166 |
+
out = rearrange(out, '(b h) n d -> b n (h d)', h=h) # B x (1 + p + NT) x hd
|
167 |
+
if self.num_tokens > 0 and not pt_att:
|
168 |
+
out = torch.cat((
|
169 |
+
out[:, :1, :], # cls_tokens
|
170 |
+
prompts,
|
171 |
+
out[:, 1:, :] # patch embeddings
|
172 |
+
), dim=1)
|
173 |
+
|
174 |
+
# to out
|
175 |
+
x = self.proj(out)
|
176 |
+
x = self.proj_drop(x)
|
177 |
+
return x
|
178 |
+
|
179 |
+
def forward_VoP(self, x, einops_from, einops_to, einops_dims, n_seg=4):
|
180 |
+
# position-specific prompts for spatial attention
|
181 |
+
h = self.num_heads
|
182 |
+
num_tokens = self.num_tokens
|
183 |
+
|
184 |
+
# project x to q, k, v values
|
185 |
+
q, k, v = self.qkv(x).chunk(3, dim=-1) # B x (1+p+NT) x hd
|
186 |
+
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v)) # Bh x (1+p+NT) x d
|
187 |
+
|
188 |
+
q *= self.scale
|
189 |
+
|
190 |
+
# splice out CLS token at index 1 and prompts
|
191 |
+
(cls_q, q_), (cls_k, k_), (cls_v, v_) = map(lambda t: (t[:, 0:num_tokens+1], t[:, num_tokens+1:]), (q, k, v)) # Bh x () x d
|
192 |
+
# let CLS token attend to key / values of all patches across time and space
|
193 |
+
cls_out = attn(cls_q[:, :1, :], k, v) # cls token: Bh x 1 x d
|
194 |
+
|
195 |
+
# segment prompts into s segments in time
|
196 |
+
pstep = num_tokens // n_seg
|
197 |
+
pseg = [range(st, en) for st, en in zip(range(1, num_tokens+1, pstep), range(pstep+1, num_tokens+2, pstep))]
|
198 |
+
p_q, p_k, p_v = map(lambda t: rearrange(t[:, pseg, :], 'b s p d -> (b s) p d'), (cls_q, cls_k, cls_v)) # prompt query: (Bh x n_seg) x p_per_seg x d
|
199 |
+
|
200 |
+
# segment patch embeddings into s segments in time
|
201 |
+
q_, k_, v_ = map(lambda t: rearrange(t, 'b (f n) d -> b f n d', **einops_dims), (q_, k_, v_)) # Bh x T x N x d
|
202 |
+
num_frames = k_.size(1)
|
203 |
+
tstep = num_frames // n_seg
|
204 |
+
tseg = [range(st, en) for st, en in zip(range(0, num_frames, tstep), range(tstep, num_frames+1, tstep))]
|
205 |
+
q_, k_, v_ = map(lambda t: t[:, tseg, ...], (q_, k_, v_)) # Bh x n_seg x f_per_seg x n x d
|
206 |
+
q_, k_, v_ = map(lambda t: rearrange(t, 'b s f n d -> (b s) (f n) d'), (q_, k_, v_)) # (Bh x n_seg) x (f_per_seg x n) x d
|
207 |
+
|
208 |
+
# concatenate prompts and patch embeddings
|
209 |
+
k_, v_ = map(lambda t: torch.cat((t[0], t[1]), dim=1), ((p_k, k_), (p_v, v_)))
|
210 |
+
p_out = attn(p_q, k_, v_) # (Bh x n_seg) x p_per_seg x d
|
211 |
+
out = attn(q_, k_, v_) # (Bh x n_seg) x (f_per_seg x n) x d
|
212 |
+
p_out = rearrange(p_out, '(b s) p d -> b (s p) d', s=n_seg) # Bh x p x d
|
213 |
+
out = rearrange(out, '(b s) (f n) d -> b (s f n) d', s=n_seg, f=tstep) # Bh x NT x d
|
214 |
+
|
215 |
+
# merge tokens
|
216 |
+
out = torch.cat((cls_out, p_out, out), dim=1) # Bh x (1+p+NT) x d
|
217 |
+
out = rearrange(out, '(b h) n d -> b n (h d)', h=h) # B x (NT+1) x hd
|
218 |
+
|
219 |
+
# to out
|
220 |
+
x = self.proj(out)
|
221 |
+
x = self.proj_drop(x)
|
222 |
+
return x
|
223 |
+
|
224 |
+
def forward_attall(self, x, pt_att=True):
|
225 |
+
h = self.num_heads
|
226 |
+
if self.num_tokens > 0 and not pt_att:
|
227 |
+
prompts = x[:, 1:self.num_tokens+1, :]
|
228 |
+
x = torch.cat((
|
229 |
+
x[:, :1, :], # cls_token
|
230 |
+
x[:, self.num_tokens+1:, :] # patch embeddings
|
231 |
+
), dim=1)
|
232 |
+
|
233 |
+
# project x to q, k, v values
|
234 |
+
q, k, v = self.qkv(x).chunk(3, dim=-1)
|
235 |
+
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
|
236 |
+
|
237 |
+
q *= self.scale
|
238 |
+
|
239 |
+
# all tokens attend to all tokens
|
240 |
+
out = attn(q, k, v)
|
241 |
+
|
242 |
+
# merge back the heads
|
243 |
+
out = rearrange(out, '(b h) n d -> b n (h d)', h=h) # B x (1 + p + NT) x hd
|
244 |
+
if self.num_tokens > 0 and not pt_att:
|
245 |
+
out = torch.cat((
|
246 |
+
out[:, :1, :], # cls_tokens
|
247 |
+
prompts,
|
248 |
+
out[:, 1:, :] # patch embeddings
|
249 |
+
), dim=1)
|
250 |
+
|
251 |
+
# to out
|
252 |
+
x = self.proj(out)
|
253 |
+
x = self.proj_drop(x)
|
254 |
+
return x
|
255 |
+
|
256 |
+
|
257 |
+
class SpaceTimeBlock(nn.Module):
|
258 |
+
|
259 |
+
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
|
260 |
+
drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, time_init='zeros',
|
261 |
+
attention_style='frozen-in-time', is_tanh_gating=False, num_tokens=0, split_st=False):
|
262 |
+
super().__init__()
|
263 |
+
|
264 |
+
self.split_st = split_st # split spatial and temporal prompts
|
265 |
+
if split_st:
|
266 |
+
num_tokens = num_tokens // 2
|
267 |
+
self.num_tokens = num_tokens # learnable prompts
|
268 |
+
|
269 |
+
self.norm1 = norm_layer(dim)
|
270 |
+
self.attn = VarAttention(
|
271 |
+
dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, num_tokens=num_tokens)
|
272 |
+
|
273 |
+
self.timeattn = VarAttention(
|
274 |
+
dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, num_tokens=num_tokens,
|
275 |
+
initialize=time_init)
|
276 |
+
|
277 |
+
if is_tanh_gating:
|
278 |
+
self.alpha_timeattn = nn.Parameter(torch.zeros([]))
|
279 |
+
|
280 |
+
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
|
281 |
+
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
|
282 |
+
self.norm2 = norm_layer(dim)
|
283 |
+
mlp_hidden_dim = int(dim * mlp_ratio)
|
284 |
+
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
|
285 |
+
self.norm3 = norm_layer(dim)
|
286 |
+
|
287 |
+
self.attention_style = attention_style
|
288 |
+
|
289 |
+
def forward(self, x, einops_from_space, einops_to_space, einops_from_time, einops_to_time,
|
290 |
+
time_n, space_f, use_checkpoint=False, pt_spt=True, pt_tmp=True, style='default', n_seg=4):
|
291 |
+
if self.split_st:
|
292 |
+
spatial_prompts = x[:, 1:self.num_tokens+1, :]
|
293 |
+
x = torch.cat((
|
294 |
+
x[:, :1, :], # cls_token
|
295 |
+
x[:, self.num_tokens+1:, :] # temporal prompts and patch embeddings
|
296 |
+
), dim=1)
|
297 |
+
|
298 |
+
if use_checkpoint:
|
299 |
+
time_output = checkpoint.checkpoint(
|
300 |
+
self.timeattn, self.norm3(x), einops_from_time, einops_to_time, {"n": time_n}, {'pt_att': pt_tmp}
|
301 |
+
)
|
302 |
+
else:
|
303 |
+
time_output = self.timeattn(self.norm3(x), einops_from_time, einops_to_time, {"n": time_n}, {'pt_att': pt_tmp})
|
304 |
+
if hasattr(self, "alpha_timeattn"):
|
305 |
+
time_output = torch.tanh(self.alpha_timeattn) * time_output
|
306 |
+
time_residual = x + time_output
|
307 |
+
|
308 |
+
if self.split_st:
|
309 |
+
temporal_prompts = time_residual[:, 1:self.num_tokens+1, :]
|
310 |
+
time_residual = torch.cat((
|
311 |
+
time_residual[:, :1, :], # cls_token
|
312 |
+
spatial_prompts,
|
313 |
+
time_residual[:, self.num_tokens+1:, :] # patch embeddings
|
314 |
+
), dim=1)
|
315 |
+
|
316 |
+
cfg = {'style': style, 'pt_att': pt_spt, 'n_seg': n_seg}
|
317 |
+
if use_checkpoint:
|
318 |
+
space_output = checkpoint.checkpoint(
|
319 |
+
self.attn, self.norm1(time_residual), einops_from_space, einops_to_space, {"f": space_f}, cfg
|
320 |
+
)
|
321 |
+
else:
|
322 |
+
space_output = self.attn(self.norm1(time_residual), einops_from_space,
|
323 |
+
einops_to_space, {"f": space_f}, cfg)
|
324 |
+
if self.attention_style == 'frozen-in-time':
|
325 |
+
space_residual = x + self.drop_path(space_output)
|
326 |
+
else:
|
327 |
+
raise NotImplementedError
|
328 |
+
|
329 |
+
if self.split_st:
|
330 |
+
space_residual = torch.cat((
|
331 |
+
                space_residual[:, :self.num_tokens+1, :], # cls_token and spatial prompts
|
332 |
+
temporal_prompts,
|
333 |
+
space_residual[:, self.num_tokens+1:, :] # patch embeddings
|
334 |
+
), dim=1)
|
335 |
+
|
336 |
+
x = space_residual + self.drop_path(self.mlp(self.norm2(space_residual)))
|
337 |
+
|
338 |
+
return x
|
339 |
+
|
340 |
+
|
341 |
+
class SpaceTimeTransformer(nn.Module):
|
342 |
+
""" Vision Transformer
|
343 |
+
A PyTorch impl of : `Space-Time Transformer` from Frozen-in-time - by Max Bain.
|
344 |
+
https://arxiv.org/abs/2104.00650
|
345 |
+
Based off:
|
346 |
+
- ViT implementation from the timm library [https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py]
|
347 |
+
lucidrains timesformer implementation [https://github.com/lucidrains/TimeSformer-pytorch].
|
348 |
+
Notable differences:
|
349 |
+
- allows for variable length input frames (<= num_frames)
|
350 |
+
- allows for variable length input resolution (<= (img_size, img_size)) [UNTESTED]
|
351 |
+
- different attention block mechanism
|
352 |
+
"""
|
353 |
+
|
354 |
+
def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
|
355 |
+
num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, representation_size=None,
|
356 |
+
drop_rate=0., attn_drop_rate=0., drop_path_rate=0., hybrid_backbone=None, norm_layer=None,
|
357 |
+
num_frames=8, time_init='rand', attention_style='frozen-in-time', ln_pre=False,
|
358 |
+
act_layer=nn.GELU, is_tanh_gating=False, tune_bias=False, prompt_cfg={}):
|
359 |
+
"""
|
360 |
+
Args:
|
361 |
+
img_size (int, tuple): input image size
|
362 |
+
patch_size (int, tuple): patch size
|
363 |
+
in_chans (int): number of input channels
|
364 |
+
num_classes (int): number of classes for classification head
|
365 |
+
embed_dim (int): embedding dimension
|
366 |
+
depth (int): depth of transformer
|
367 |
+
num_heads (int): number of attention heads
|
368 |
+
mlp_ratio (int): ratio of mlp hidden dim to embedding dim
|
369 |
+
qkv_bias (bool): enable bias for qkv if True
|
370 |
+
qk_scale (float): override default qk scale of head_dim ** -0.5 if set
|
371 |
+
representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set
|
372 |
+
drop_rate (float): dropout rate
|
373 |
+
attn_drop_rate (float): attention dropout rate
|
374 |
+
drop_path_rate (float): stochastic depth rate
|
375 |
+
hybrid_backbone (nn.Module): CNN backbone to use in-place of PatchEmbed module
|
376 |
+
norm_layer: (nn.Module): normalization layer
|
377 |
+
num_frames: (int) maximum number of frames expected as input
|
378 |
+
time_init: (str) how to initialise the time attention layer, 'zeros' allows for the timesformer to start off
|
379 |
+
as ViT.
|
380 |
+
attention_style: (str) how to attend to space and time.
|
381 |
+
"""
|
382 |
+
super().__init__()
|
383 |
+
self.num_classes = num_classes
|
384 |
+
self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
|
385 |
+
self.num_frames = num_frames
|
386 |
+
self.embed_dim = embed_dim
|
387 |
+
self.tune_bias = tune_bias
|
388 |
+
norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
|
389 |
+
print("######USING ATTENTION STYLE: ", attention_style)
|
390 |
+
self.param_list = []
|
391 |
+
if hybrid_backbone is not None:
|
392 |
+
raise NotImplementedError('hybrid backbone not implemented')
|
393 |
+
else:
|
394 |
+
self.patch_embed = VideoPatchEmbed(
|
395 |
+
img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, num_frames=num_frames, ln_pre=ln_pre)
|
396 |
+
self.param_list += list(self.patch_embed.parameters())
|
397 |
+
num_patches = self.patch_embed.num_patches
|
398 |
+
self.patches_per_frame = num_patches // num_frames
|
399 |
+
|
400 |
+
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
|
401 |
+
self.pos_embed = nn.Parameter(
|
402 |
+
torch.zeros(1, self.patches_per_frame + 1,
|
403 |
+
embed_dim)) # remember to take pos_embed[1:] for tiling over time
|
404 |
+
self.temporal_embed = nn.Parameter(torch.zeros(1, num_frames, embed_dim))
|
405 |
+
self.param_list += [self.cls_token, self.pos_embed, self.temporal_embed]
|
406 |
+
|
407 |
+
if ln_pre:
|
408 |
+
self.ln_pre = nn.LayerNorm(embed_dim)
|
409 |
+
if self.tune_bias:
|
410 |
+
self.param_list += [m for n, m in self.ln_pre.named_parameters() if 'bias' not in n]
|
411 |
+
else:
|
412 |
+
self.param_list += list(self.ln_pre.parameters())
|
413 |
+
else:
|
414 |
+
self.ln_pre = None
|
415 |
+
|
416 |
+
self.pos_drop = nn.Dropout(p=drop_rate)
|
417 |
+
|
418 |
+
# config for prompts
|
419 |
+
self.num_tokens = prompt_cfg.get('num_tokens', 0)
|
420 |
+
self.prompt_dim = prompt_cfg.get('prompt_dim', 768)
|
421 |
+
self.pt_spt = prompt_cfg.pop('pt_spt', True)
|
422 |
+
self.pt_tmp = prompt_cfg.pop('pt_tmp', True)
|
423 |
+
self.style = prompt_cfg.pop('style', 'default')
|
424 |
+
self.query = prompt_cfg.pop('query', 'cls')
|
425 |
+
self.n_seg = prompt_cfg.pop('n_seg', 4)
|
426 |
+
self.k_s = prompt_cfg.pop('K_s', depth)
|
427 |
+
self.st = prompt_cfg.pop('st', 0)
|
428 |
+
self.end = prompt_cfg.pop('end', depth)
|
429 |
+
assert self.st <= self.end
|
430 |
+
if self.style == 'default':
|
431 |
+
print(f'Prompting {self.st}-{self.end} layer of the visual backbone')
|
432 |
+
elif self.style == 'VoP_c' and self.k_s < depth:
|
433 |
+
self.prompt_embed = nn.Parameter(torch.zeros(1, num_frames, embed_dim))
|
434 |
+
elif self.style == 'VoP_c_pool':
|
435 |
+
self.prompt_temp_embed = nn.Parameter(torch.zeros(1, self.n_seg, embed_dim))
|
436 |
+
trunc_normal_(self.prompt_temp_embed, std=.02)
|
437 |
+
|
438 |
+
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
|
439 |
+
|
440 |
+
blocks = []
|
441 |
+
for i in range(depth):
|
442 |
+
stblk_cfg = {}
|
443 |
+
if self.num_tokens > 0:
|
444 |
+
stblk_cfg = {'num_tokens': prompt_cfg['num_tokens'], 'split_st': prompt_cfg.get('split_st', False)}
|
445 |
+
blocks.append(
|
446 |
+
SpaceTimeBlock(
|
447 |
+
dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
|
448 |
+
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, time_init=time_init,
|
449 |
+
attention_style=attention_style, act_layer=act_layer, is_tanh_gating=is_tanh_gating, **stblk_cfg)
|
450 |
+
)
|
451 |
+
|
452 |
+
self.blocks = nn.ModuleList(blocks)
|
453 |
+
self.norm = norm_layer(embed_dim)
|
454 |
+
if self.tune_bias:
|
455 |
+
self.param_list += reduce(operator.add, [[m for n, m in x.named_parameters() if 'bias' not in n] for x in self.blocks])
|
456 |
+
self.param_list += [m for n, m in self.norm.named_parameters() if 'bias' not in n]
|
457 |
+
else:
|
458 |
+
self.param_list += reduce(operator.add, [list(x.parameters()) for x in self.blocks])
|
459 |
+
self.param_list += list(self.norm.parameters())
|
460 |
+
|
461 |
+
# Representation layer
|
462 |
+
if representation_size:
|
463 |
+
self.num_features = representation_size
|
464 |
+
self.pre_logits = nn.Sequential(OrderedDict([
|
465 |
+
('fc', nn.Linear(embed_dim, representation_size)),
|
466 |
+
('act', nn.Tanh())
|
467 |
+
]))
|
468 |
+
if self.tune_bias:
|
469 |
+
self.param_list += [m for n, m in self.pre_logits.named_parameters() if 'bias' not in n]
|
470 |
+
else:
|
471 |
+
self.param_list += list(self.pre_logits.parameters())
|
472 |
+
else:
|
473 |
+
self.pre_logits = nn.Identity()
|
474 |
+
|
475 |
+
# Classifier head
|
476 |
+
self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
|
477 |
+
|
478 |
+
trunc_normal_(self.pos_embed, std=.02)
|
479 |
+
trunc_normal_(self.cls_token, std=.02)
|
480 |
+
|
481 |
+
# if num_frames > 1, then we perform ViT inflation and initialise time attention to zero so not necessary.
|
482 |
+
if num_frames == 1:
|
483 |
+
self.apply(self._init_weights)
|
484 |
+
|
485 |
+
# einops transformations
|
486 |
+
self.einops_from_space = 'b (f n) d'
|
487 |
+
self.einops_to_space = '(b f) n d'
|
488 |
+
self.einops_from_time = 'b (f n) d'
|
489 |
+
self.einops_to_time = '(b n) f d'
|
490 |
+
|
491 |
+
# freeze the backbone and only learn the prompts
|
492 |
+
self.prompt_learner = None
|
493 |
+
if self.num_tokens > 0:
|
494 |
+
if 'VoP_c' in self.style:
|
495 |
+
pool = prompt_cfg.pop('pool', {}) if 'pool' in self.style else {}
|
496 |
+
if self.k_s > 0:
|
497 |
+
self.prompt_generator = CMM(self.num_tokens // self.n_seg, self.n_seg, embed_dim, self.prompt_dim, num_layer=self.k_s, \
|
498 |
+
shared=prompt_cfg.get('deep_shared', False), pool=pool)
|
499 |
+
n_prompt_layer = depth - self.k_s
|
500 |
+
|
501 |
+
else:
|
502 |
+
n_prompt_layer = self.end - self.st
|
503 |
+
|
504 |
+
if n_prompt_layer > 0:
|
505 |
+
prompt_cfg['num_layers'] = n_prompt_layer
|
506 |
+
prompt_cfg['prompt_dim'] = embed_dim
|
507 |
+
self.prompt_learner = VisualPromptLearner(patch_size, embed_dim, **prompt_cfg)
|
508 |
+
|
509 |
+
for p in self.param_list:
|
510 |
+
            p.requires_grad = False
|
511 |
+
|
512 |
+
def _init_weights(self, m):
|
513 |
+
if isinstance(m, nn.Linear):
|
514 |
+
trunc_normal_(m.weight, std=.02)
|
515 |
+
if isinstance(m, nn.Linear) and m.bias is not None:
|
516 |
+
nn.init.constant_(m.bias, 0)
|
517 |
+
elif isinstance(m, nn.LayerNorm):
|
518 |
+
nn.init.constant_(m.bias, 0)
|
519 |
+
nn.init.constant_(m.weight, 1.0)
|
520 |
+
|
521 |
+
@torch.jit.ignore
|
522 |
+
def no_weight_decay(self):
|
523 |
+
return {'pos_embed', 'cls_token'}
|
524 |
+
|
525 |
+
def get_classifier(self):
|
526 |
+
return self.head
|
527 |
+
|
528 |
+
def reset_classifier(self, num_classes, global_pool=''):
|
529 |
+
self.num_classes = num_classes
|
530 |
+
self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
|
531 |
+
|
532 |
+
def forward_features(self, x, use_checkpoint=False, cls_at_last=True, istrain=False, gamma=1.0):
|
533 |
+
# print(x.shape)
|
534 |
+
b, curr_frames, channels, _, _ = x.shape
|
535 |
+
x = self.patch_embed(x)
|
536 |
+
x = x.flatten(2).transpose(2, 1)
|
537 |
+
x = x.reshape(b, -1, self.patch_embed.embed_dim)
|
538 |
+
|
539 |
+
BF = x.shape[0]
|
540 |
+
cls_tokens = self.cls_token.expand(BF, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
|
541 |
+
x = torch.cat((cls_tokens, x), dim=1)
|
542 |
+
# positional embed needs to be tiled for each frame (this does [1,2,3] --> [1,2,3,1,2,3]...)
|
543 |
+
cls_embed = self.pos_embed[:, 0, :].unsqueeze(1)
|
544 |
+
tile_pos_embed = self.pos_embed[:, 1:, :].repeat(1, self.num_frames, 1)
|
545 |
+
# temporal embed needs to be repeated within each frame (this does [1,2,3] --> [1,1,1,2,2,2,3,3,3]...)
|
546 |
+
tile_temporal_embed = self.temporal_embed.repeat_interleave(self.patches_per_frame, 1)
|
547 |
+
total_pos_embed = tile_pos_embed + tile_temporal_embed
|
548 |
+
total_pos_embed = torch.cat([cls_embed, total_pos_embed], dim=1) # 1 x (NT + 1) x D
|
549 |
+
|
550 |
+
curr_patches = x.shape[1]
|
551 |
+
x = x + total_pos_embed[:, :curr_patches] # B x (NT + 1) x D
|
552 |
+
ps_loss = x.new_zeros([1])
|
553 |
+
# incorporate prompts
|
554 |
+
if self.num_tokens > 0:
|
555 |
+
if 'VoP_c' in self.style and self.k_s > 0:
|
556 |
+
ctx, ps = self.prompt_generator(x[:, 1:, :], 0, istrain=istrain, gamma=gamma)
|
557 |
+
ps_loss += ps
|
558 |
+
if self.prompt_generator.use_bank:
|
559 |
+
prompt_temp_embed = self.prompt_temp_embed.repeat_interleave(self.num_tokens // self.n_seg, 1)
|
560 |
+
ctx = ctx + prompt_temp_embed
|
561 |
+
|
562 |
+
elif self.prompt_learner is not None:
|
563 |
+
ctx, ps = self.prompt_learner(x[:, :1, :], 0, istrain=istrain, gamma=gamma)
|
564 |
+
ps_loss += ps
|
565 |
+
if ctx.size(0) != BF:
|
566 |
+
ctx = ctx.expand(BF, -1, -1)
|
567 |
+
|
568 |
+
x = torch.cat((
|
569 |
+
x[:, :1, :], # cls_token
|
570 |
+
ctx,
|
571 |
+
x[:, 1:, :]
|
572 |
+
), dim=1)
|
573 |
+
|
574 |
+
if self.ln_pre is not None:
|
575 |
+
x = self.ln_pre(x)
|
576 |
+
x = self.pos_drop(x)
|
577 |
+
n = self.patches_per_frame
|
578 |
+
f = curr_frames
|
579 |
+
|
580 |
+
for i, blk in enumerate(self.blocks):
|
581 |
+
if self.num_tokens > 0 and i > 0 and i >= self.st and i < self.end:
|
582 |
+
if 'VoP_c' in self.style:
|
583 |
+
if i < self.k_s:
|
584 |
+
ctx, ps = self.prompt_generator(x[:, self.num_tokens+1:, :], i, istrain=istrain, gamma=gamma)
|
585 |
+
ps_loss += ps
|
586 |
+
if self.prompt_generator.use_bank:
|
587 |
+
prompt_temp_embed = self.prompt_temp_embed.repeat_interleave(self.num_tokens // self.n_seg, 1)
|
588 |
+
ctx = ctx + prompt_temp_embed
|
589 |
+
else:
|
590 |
+
ctx, ps = self.prompt_learner(x[:, :1, :], i-self.k_s, istrain=istrain, gamma=gamma)
|
591 |
+
ps_loss += ps
|
592 |
+
|
593 |
+
if 'pool' in self.style:
|
594 |
+
prompt_embed = self.prompt_temp_embed.repeat_interleave(self.num_tokens // self.n_seg, 1)
|
595 |
+
else:
|
596 |
+
prompt_embed = self.prompt_embed.repeat_interleave(self.num_tokens // self.num_frames, 1)
|
597 |
+
ctx = ctx + prompt_embed
|
598 |
+
if ctx.size(0) != BF:
|
599 |
+
ctx = ctx.expand(BF, -1, -1)
|
600 |
+
|
601 |
+
elif (i - self.st) < self.prompt_learner.num_layers:
|
602 |
+
ctx, ps = self.prompt_learner(x[:, :1, :], i-self.st, istrain=istrain, gamma=gamma)
|
603 |
+
ps_loss += ps
|
604 |
+
if ctx.size(0) != BF:
|
605 |
+
ctx = ctx.expand(BF, -1, -1)
|
606 |
+
|
607 |
+
x = torch.cat((
|
608 |
+
x[:, :1, :], # cls_token
|
609 |
+
ctx,
|
610 |
+
x[:, self.num_tokens+1:, :]
|
611 |
+
), dim=1)
|
612 |
+
|
613 |
+
style = 'default' if i >= self.k_s else self.style
|
614 |
+
pt_tmp = self.pt_tmp if i >= self.st and i < self.end else False
|
615 |
+
pt_spt = self.pt_spt if i >= self.st and i < self.end else False
|
616 |
+
x = blk(x, self.einops_from_space, self.einops_to_space, self.einops_from_time,
|
617 |
+
self.einops_to_time,
|
618 |
+
time_n=n, space_f=f, use_checkpoint=use_checkpoint, pt_spt=pt_spt,
|
619 |
+
pt_tmp=pt_tmp, style=style, n_seg=self.n_seg)
|
620 |
+
|
621 |
+
if cls_at_last:
|
622 |
+
x = self.norm(x)
|
623 |
+
x = x[:, 0]
|
624 |
+
x = self.pre_logits(x)
|
625 |
+
|
626 |
+
return x, ps_loss
|
627 |
+
else:
|
628 |
+
return self.norm(x), ps_loss
|
629 |
+
|
630 |
+
def forward(self, x, use_checkpoint=False, istrain=False, gamma=1.0):
|
631 |
+
# Note: B C T H W => B T C H W
|
632 |
+
# The default input order is different from the one in Frozen-in-Time
|
633 |
+
x = x.permute(0, 2, 1, 3, 4).contiguous()
|
634 |
+
x, ps_loss = self.forward_features(x, use_checkpoint=use_checkpoint, istrain=istrain, gamma=gamma)
|
635 |
+
x = self.head(x)
|
636 |
+
|
637 |
+
return x, ps_loss
|
638 |
+
|
639 |
+
def train(self, mode=True):
|
640 |
+
if not isinstance(mode, bool):
|
641 |
+
raise ValueError("training mode is expected to be boolean")
|
642 |
+
self.training = mode
|
643 |
+
for m in self.modules():
|
644 |
+
m.training = mode
|
645 |
+
|
646 |
+
if mode and self.num_tokens > 0:
|
647 |
+
for n, m in self.named_modules():
|
648 |
+
if 'prompt' not in n:
|
649 |
+
m.training = False
|
650 |
+
|
lavila/models/tokenizer.py
ADDED
@@ -0,0 +1,239 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Part of the code is from https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py
# Modified by Yue Zhao
# The original code is under MIT License

import gzip
import html
import os
from functools import lru_cache

import ftfy
import regex as re
import torch

from transformers import (BertTokenizer, DistilBertTokenizer, GPT2Tokenizer)


@lru_cache()
def default_bpe():
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")


@lru_cache()
def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a signficant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8+n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))


def get_pairs(word):
    """Return set of symbol pairs in a word.
    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


def basic_clean(text):
    text = ftfy.fix_text(text)
    text = html.unescape(html.unescape(text))
    return text.strip()


def whitespace_clean(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text


class SimpleTokenizer(object):
    def __init__(self, bpe_path: str = default_bpe()):
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
        merges = merges[1:49152-256-2+1]
        merges = [tuple(merge.split()) for merge in merges]
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v+'</w>' for v in vocab]
        for merge in merges:
            vocab.append(''.join(merge))
        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
        self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token[:-1]) + ( token[-1] + '</w>',)
        pairs = get_pairs(word)

        if not pairs:
            return token+'</w>'

        while True:
            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        bpe_tokens = []
        text = whitespace_clean(basic_clean(text)).lower()
        for token in re.findall(self.pat, text):
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
        return text

    def __call__(self, texts, context_length=77):
        if isinstance(texts, str):
            texts = [texts]

        sot_token = self.encoder["<|startoftext|>"]
        eot_token = self.encoder["<|endoftext|>"]
        all_tokens = [[sot_token] + self.encode(text) + [eot_token] for text in texts]
        result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

        for i, tokens in enumerate(all_tokens):
            tokens = tokens[:context_length]
            result[i, :len(tokens)] = torch.tensor(tokens)

        if len(result) == 1:
            return result[0]
        return result


class MyBertTokenizer(object):
    def __init__(self, name=''):
        print('=> Initialize MyBertTokenizer ({})'.format(name))
|
168 |
+
self.tokenizer = BertTokenizer.from_pretrained(name)
|
169 |
+
self.bos_token_id, self.eos_token_id = self.tokenizer('').input_ids
|
170 |
+
self.pad_token_id = 0
|
171 |
+
|
172 |
+
def __call__(self, texts, context_length=77):
|
173 |
+
if isinstance(texts, str):
|
174 |
+
texts = [texts]
|
175 |
+
result = torch.zeros(len(texts), context_length, dtype=torch.long)
|
176 |
+
mask = torch.zeros(len(texts), context_length, dtype=torch.float32)
|
177 |
+
for i, text in enumerate(texts):
|
178 |
+
tokens = self.tokenizer(text)
|
179 |
+
input_ids = tokens.input_ids[:context_length]
|
180 |
+
attention_mask = tokens.attention_mask[:context_length]
|
181 |
+
result[i, :len(input_ids)] = torch.tensor(input_ids)
|
182 |
+
mask[i, :len(attention_mask)] = torch.tensor(attention_mask)
|
183 |
+
|
184 |
+
if len(result) == 1:
|
185 |
+
return result[0], mask[0]
|
186 |
+
return result, mask
|
187 |
+
|
188 |
+
|
189 |
+
class MyDistilBertTokenizer(object):
|
190 |
+
def __init__(self, name=''):
|
191 |
+
print('=> Initialize MyDistilBertTokenizer ({})'.format(name))
|
192 |
+
self.tokenizer = DistilBertTokenizer.from_pretrained(name)
|
193 |
+
|
194 |
+
def __call__(self, texts, context_length=77):
|
195 |
+
if isinstance(texts, str):
|
196 |
+
texts = [texts]
|
197 |
+
result = torch.zeros(len(texts), context_length, dtype=torch.long)
|
198 |
+
mask = torch.zeros(len(texts), context_length, dtype=torch.float32)
|
199 |
+
for i, text in enumerate(texts):
|
200 |
+
tokens = self.tokenizer(text)
|
201 |
+
input_ids = tokens.input_ids[:context_length]
|
202 |
+
attention_mask = tokens.attention_mask[:context_length]
|
203 |
+
result[i, :len(input_ids)] = torch.tensor(input_ids)
|
204 |
+
mask[i, :len(attention_mask)] = torch.tensor(attention_mask)
|
205 |
+
|
206 |
+
if len(result) == 1:
|
207 |
+
return result[0], mask[0]
|
208 |
+
return result, mask
|
209 |
+
|
210 |
+
|
211 |
+
class MyGPT2Tokenizer(object):
|
212 |
+
def __init__(self, name='', add_bos=False):
|
213 |
+
print('=> Initialize MyGPT2Tokenizer ({})'.format(name))
|
214 |
+
self.tokenizer = GPT2Tokenizer.from_pretrained(name)
|
215 |
+
self.bos_token_id, self.eos_token_id = self.tokenizer.bos_token_id, self.tokenizer.eos_token_id
|
216 |
+
self.pad_token_id = 0
|
217 |
+
self.add_bos = add_bos
|
218 |
+
# num_added_tokens = self.tokenizer.add_special_tokens({'pad_token': "[PAD]"})
|
219 |
+
# print('num_added_tokens={}'.format(len(num_added_tokens)))
|
220 |
+
|
221 |
+
def __call__(self, texts, context_length=77):
|
222 |
+
if isinstance(texts, str):
|
223 |
+
texts = [texts]
|
224 |
+
result = torch.zeros(len(texts), context_length, dtype=torch.long)
|
225 |
+
for i, text in enumerate(texts):
|
226 |
+
tokens = self.tokenizer(text)
|
227 |
+
if not self.add_bos:
|
228 |
+
input_ids = tokens.input_ids[:context_length - 1]
|
229 |
+
input_ids = input_ids + [self.tokenizer.eos_token_id] # add [EOS]
|
230 |
+
else:
|
231 |
+
input_ids = tokens.input_ids[:context_length - 2]
|
232 |
+
input_ids = [self.tokenizer.bos_token_id] + input_ids + [self.tokenizer.eos_token_id] # add [EOS]
|
233 |
+
# attention_mask = tokens.attention_mask[:context_length]
|
234 |
+
# attention_mask = attention_mask + [0.] * pad_length
|
235 |
+
result[i, :len(input_ids)] = torch.tensor(input_ids)
|
236 |
+
|
237 |
+
if len(result) == 1:
|
238 |
+
return result[0]
|
239 |
+
return result
|
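Not part of the upload: a minimal usage sketch of the SimpleTokenizer above, assuming the repo root is on PYTHONPATH so lavila.models.tokenizer resolves and the bundled bpe_simple_vocab_16e6.txt.gz sits next to tokenizer.py.

from lavila.models.tokenizer import SimpleTokenizer

tokenizer = SimpleTokenizer()                      # loads the bundled BPE vocab via default_bpe()
ids = tokenizer('cut the sausage')                 # LongTensor of shape (77,), zero-padded
print(ids[:6])                                     # <|startoftext|> id, BPE ids, <|endoftext|> id, 0, ...
print(tokenizer.decode(tokenizer.encode('cut the sausage')))   # round-trips to 'cut the sausage '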
lavila/models/utils.py
ADDED
@@ -0,0 +1,110 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from collections import OrderedDict
import functools
import torch
import torch.nn.functional as F


def inflate_positional_embeds(
    current_model_state_dict, new_state_dict,
    num_frames=4,
    load_temporal_fix='bilinear',
):
    # allow loading of timesformer with fewer num_frames
    curr_keys = list(current_model_state_dict.keys())
    temporal_embed = ['visual.temporal_embed', 'visual.prompt_embed']
    for x in temporal_embed:
        if x in new_state_dict and x in curr_keys:
            load_temporal_embed = new_state_dict[x]
            load_num_frames = load_temporal_embed.shape[1]
            curr_num_frames = num_frames
            embed_dim = load_temporal_embed.shape[2]

            if load_num_frames != curr_num_frames:
                if load_num_frames > curr_num_frames:
                    print(f'### loaded SpaceTimeTransformer model has MORE frames than current...'
                          f'### loading {x} weights, filling in the extras via {load_temporal_fix}')
                    new_temporal_embed = load_temporal_embed[:, :curr_num_frames, :]
                else:
                    print(f'### loaded SpaceTimeTransformer model has FEWER frames than current...'
                          f'### loading {x} weights, filling in the extras via {load_temporal_fix}')
                    if load_temporal_fix == 'zeros':
                        new_temporal_embed = torch.zeros([load_temporal_embed.shape[0], curr_num_frames, embed_dim])
                        new_temporal_embed[:, :load_num_frames] = load_temporal_embed
                    elif load_temporal_fix in ['interp', 'bilinear']:
                        # interpolate
                        # unsqueeze so pytorch thinks its an image
                        mode = 'nearest'
                        if load_temporal_fix == 'bilinear':
                            mode = 'bilinear'
                        load_temporal_embed = load_temporal_embed.unsqueeze(0)
                        new_temporal_embed = F.interpolate(load_temporal_embed,
                                                           (curr_num_frames, embed_dim), mode=mode).squeeze(0)
                    else:
                        raise NotImplementedError
                new_state_dict[x] = new_temporal_embed
    # allow loading with smaller spatial patches. assumes custom border crop, to append the
    # border patches to the input sequence
    if 'visual.pos_embed' in new_state_dict and 'visual.pos_embed' in curr_keys:
        load_pos_embed = new_state_dict['visual.pos_embed']
        load_num_patches = load_pos_embed.shape[1]
        curr_pos_embed = current_model_state_dict['visual.pos_embed']
        if load_num_patches != curr_pos_embed.shape[1]:
            raise NotImplementedError(
                'Loading models with different spatial resolution / patch number not yet implemented, sorry.')

    return new_state_dict


def rsetattr(obj, attr, val):
    pre, _, post = attr.rpartition('.')
    return setattr(rgetattr(obj, pre) if pre else obj, post, val)


def rgetattr(obj, attr, *args):
    def _getattr(obj, attr):
        return getattr(obj, attr, *args)
    return functools.reduce(_getattr, [obj] + attr.split('.'))


# util functions to convert CLIP-style model keys to TimeSformer-style
def remap_keys(clip_state_dict, transformer_layers=12):
    remapped_state_dict = OrderedDict()
    key_mapping = {
        "class_embedding": "cls_token",
        "positional_embedding": "pos_embed",
        "conv1.weight": "patch_embed.proj.weight",
        "ln_pre.weight": "ln_pre.weight",
        "ln_pre.bias": "ln_pre.bias",
        "ln_post.weight": "norm.weight",
        "ln_post.bias": "norm.bias",
    }
    for layer in range(transformer_layers):
        key_mapping[f"transformer.resblocks.{layer}.attn.in_proj_weight"] = f"blocks.{layer}.attn.qkv.weight"
        key_mapping[f"transformer.resblocks.{layer}.attn.in_proj_bias"] = f"blocks.{layer}.attn.qkv.bias"
        key_mapping[f"transformer.resblocks.{layer}.attn.out_proj.weight"] = f"blocks.{layer}.attn.proj.weight"
        key_mapping[f"transformer.resblocks.{layer}.attn.out_proj.bias"] = f"blocks.{layer}.attn.proj.bias"
        key_mapping[f"transformer.resblocks.{layer}.ln_1.weight"] = f"blocks.{layer}.norm1.weight"
        key_mapping[f"transformer.resblocks.{layer}.ln_1.bias"] = f"blocks.{layer}.norm1.bias"
        key_mapping[f"transformer.resblocks.{layer}.mlp.c_fc.weight"] = f"blocks.{layer}.mlp.fc1.weight"
        key_mapping[f"transformer.resblocks.{layer}.mlp.c_fc.bias"] = f"blocks.{layer}.mlp.fc1.bias"
        key_mapping[f"transformer.resblocks.{layer}.mlp.c_proj.weight"] = f"blocks.{layer}.mlp.fc2.weight"
        key_mapping[f"transformer.resblocks.{layer}.mlp.c_proj.bias"] = f"blocks.{layer}.mlp.fc2.bias"
        key_mapping[f"transformer.resblocks.{layer}.ln_2.weight"] = f"blocks.{layer}.norm2.weight"
        key_mapping[f"transformer.resblocks.{layer}.ln_2.bias"] = f"blocks.{layer}.norm2.bias"

    for key in clip_state_dict:
        if key == 'proj':
            continue  # due to possible dim mismatch, we load this later
        if key == "class_embedding":
            clip_state_dict[key] = clip_state_dict[key].unsqueeze(0).unsqueeze(0)
        if key == "positional_embedding":
            clip_state_dict[key] = clip_state_dict[key].unsqueeze(0)
        remapped_state_dict[key_mapping[key]] = clip_state_dict[key]

    return remapped_state_dict
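Not part of the upload: a toy check of inflate_positional_embeds above with hypothetical shapes, showing how a 4-frame temporal embedding from a checkpoint is interpolated for a 16-frame model.

import torch
from lavila.models.utils import inflate_positional_embeds

curr = {'visual.temporal_embed': torch.zeros(1, 16, 768)}   # model built for 16 frames
ckpt = {'visual.temporal_embed': torch.randn(1, 4, 768)}    # checkpoint trained with 4 frames
out = inflate_positional_embeds(curr, ckpt, num_frames=16, load_temporal_fix='bilinear')
print(out['visual.temporal_embed'].shape)                   # torch.Size([1, 16, 768])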
lavila/utils/config.py
ADDED
@@ -0,0 +1,18 @@
import yaml

def load_base_cfg():
    with open('configs/base.yml', 'r') as fp:
        cfg = yaml.load(fp, Loader=yaml.SafeLoader)
    return cfg

def load_cfg(cfg_file):
    cfg = load_base_cfg()
    with open(cfg_file, 'r') as fp:
        exp_cfg = yaml.load(fp, Loader=yaml.SafeLoader)

    cfg['model'].update(exp_cfg.get('model', {}))
    cfg['data'].update(exp_cfg.get('data', {}))
    dataset = cfg['data'].get('dataset')
    return cfg
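Not part of the upload: how load_cfg is typically called, assuming the working directory is the repo root so the relative paths to configs/base.yml and the experiment YAML resolve.

from lavila.utils.config import load_cfg

cfg = load_cfg('configs/ek100_mir/zeroshot.yml')   # base.yml defaults overridden by the experiment file
print(cfg['model'])
print(cfg['data'])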
lavila/utils/evaluation.py
ADDED
@@ -0,0 +1,36 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
import torch


def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k"""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.reshape(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res


def get_mean_accuracy(cm):
    list_acc = []
    for i in range(len(cm)):
        acc = 0
        if cm[i, :].sum() > 0:
            acc = cm[i, i] / cm[i, :].sum()
        list_acc.append(acc)

    return 100 * np.mean(list_acc), 100 * np.trace(cm) / np.sum(cm)
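Not part of the upload: a toy run of accuracy() on a 3-sample, 2-class batch.

import torch
from lavila.utils.evaluation import accuracy

logits = torch.tensor([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]])
target = torch.tensor([1, 0, 0])
top1 = accuracy(logits, target, topk=(1,))[0]
print(top1.item())   # ~66.67: two of the three argmax predictions match the targets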
lavila/utils/evaluation_charades.py
ADDED
@@ -0,0 +1,56 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np


def compute_map(submission_array, gt_array):
    """ Returns mAP, weighted mAP, and AP array """
    m_aps = []
    n_classes = submission_array.shape[1]
    for oc_i in range(n_classes):
        sorted_idxs = np.argsort(-submission_array[:, oc_i])
        tp = gt_array[:, oc_i][sorted_idxs] == 1
        fp = np.invert(tp)
        n_pos = tp.sum()
        if n_pos < 0.1:
            m_aps.append(float('nan'))
            continue
        fp.sum()
        f_pcs = np.cumsum(fp)
        t_pcs = np.cumsum(tp)
        prec = t_pcs / (f_pcs+t_pcs).astype(float)
        avg_prec = 0
        for i in range(submission_array.shape[0]):
            if tp[i]:
                avg_prec += prec[i]
        m_aps.append(avg_prec / n_pos.astype(float))
    m_aps = np.array(m_aps)
    # m_ap = np.mean(m_aps)
    m_ap = m_aps[~np.isnan(m_aps)]
    print(f'num of available classes: {len(m_ap)}')
    m_ap = m_ap.mean()  # compute mean w/o nan
    w_ap = (m_aps * gt_array.sum(axis=0) / gt_array.sum().sum().astype(float))
    return m_ap, w_ap, m_aps


def charades_map(submission_array, gt_array):
    """
    Approximate version of the charades evaluation function
    For precise numbers, use the submission file with the official matlab script
    """
    fix = submission_array.copy()
    empty = np.sum(gt_array, axis=1) == 0
    fix[empty, :] = np.NINF
    return compute_map(fix, gt_array)


def create_submission(video_list, predictions, out_file):
    assert len(video_list) == predictions.shape[0]
    with open(out_file, 'w') as f:
        for i, video_id in enumerate(video_list):
            pred_str = ' '.join(map(lambda x: str(x), predictions[i].tolist()))
            f.write('{} {}\n\n'.format(video_id, pred_str))
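Not part of the upload: a toy call of charades_map on made-up scores for 4 clips and 3 classes; every clip has at least one ground-truth label, so no row is masked out.

import numpy as np
from lavila.utils.evaluation_charades import charades_map

scores = np.random.rand(4, 3)                                  # per-clip class scores
gt = np.array([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 1]])    # multi-label ground truth
m_ap, w_ap, ap_per_class = charades_map(scores, gt)
print(m_ap, ap_per_class)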
lavila/utils/evaluation_ek100mir.py
ADDED
@@ -0,0 +1,201 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Part of the code is from
# `https://github.com/mwray/Joint-Part-of-Speech-Embeddings/tree/main/src/evaluation/NDCG.py`
# and
# `https://github.com/mwray/Joint-Part-of-Speech-Embeddings/tree/main/src/evaluation/mAP.py`
# Modified by Yue Zhao

import numpy as np


def calculate_DCG(similarity_matrix, relevancy_matrix, k_counts):
    """
    Calculates the Discounted Cumulative Gain (DCG) between two modalities for
    the first modality.
    DCG = \sum_{i=1}^k \frac{rel_i}{log_2(i + 1)}
    i.e. the sum of the k relevant retrievals which is calculated as the scaled
    relevancy for the ith item. The scale is designed such that early
    retrievals are more important than later retrievals.
    Params:
        - similarity_matrix: matrix of size n1 x n2 where n1 is the number of
          items in the first modality and n2 is the number of items in the
          second modality. The [ith,jth] element is the predicted similarity
          between the ith item from the first modality and the jth item from
          the second modality.
        - relevancy_matrix: matrix of size n1 x n2 (see similarity_matrix
          above). The [ith, jth] element is the semantic relevancy between the
          ith item from the first modality and the jth item from the second
          modality.
        - k_counts: matrix of size n1 x n2 (see similarity_matrix above) which
          includes information on which items to use to calculate the DCG for
          (see calculate_k_counts for more info on this matrix).
    Returns:
        - The DCG for each item in the first modality, a n1 length vector.
    """
    x_sz, y_sz = similarity_matrix.shape
    ranks = np.argsort(similarity_matrix)[:, ::-1]
    # Create vector of size (n,) where n is the length of the last dimension in
    # similarity matrix
    # This vector is of the form log(i+1)
    logs = np.log2(np.arange(y_sz) + 2)
    # Convert logs into the divisor for the DCG calculation, of size similarity
    # matrix
    divisors = np.repeat(np.expand_dims(logs, axis=0), x_sz, axis=0)

    # mask out the sorted relevancy matrix to only use the first k relevant
    # retrievals for each item.
    columns = np.repeat(np.expand_dims(np.arange(x_sz), axis=1), y_sz, axis=1)
    numerators = relevancy_matrix[columns, ranks] * k_counts
    # Calculate the final DCG score (note that this isn't expected to sum to 1)
    return np.sum(numerators / divisors, axis=1)


def calculate_k_counts(relevancy_matrix):
    """
    Works out the maximum number of allowed retrievals when working out the
    Discounted Cumulative Gain. For each query the DCG only uses the first k
    items retrieved which constitute the k relevant items for that query
    (otherwise the nDCG scores can be deceptively high for bad rankings).
    Params:
        - relevancy_matrix: matrix of size n1 x n2 where n1 is the number of
          items in the first modality and n2 is the number of items in the
          second modality. The [ith, jth] element is the semantic relevancy
          between the ith item from the first modality and the jth item from
          the second modality.
    Returns:
        - Matrix of size n1 x n2 (see relevancy matrix for more info). This is
          created as a mask such that if the [ith, jth] element is 1 it
          represents a valid item to use for the calculation of DCG for the
          ith item after sorting. For example, if relevancy matrix of:
            [[1, 0.5, 0],
             [0, 0  , 1]]
          is given, then the k_counts matrix will be:
            [[1, 1, 0],
             [1, 0, 0]]
          i.e. the first row has 2 non-zero items, so the first two retrieved
          items should be used in the calculation. In the second row there is
          only 1 relevant item, therefore only the first retrieved item should
          be used for the DCG calculation.
    """
    return (np.sort(relevancy_matrix)[:, ::-1] > 0).astype(int)


def calculate_IDCG(relevancy_matrix, k_counts):
    """
    Calculates the Ideal Discounted Cumulative Gain (IDCG) which is the value
    of the Discounted Cumulative Gain (DCG) for a perfect retrieval, i.e. the
    items in the second modality were retrieved in order of their descending
    relevancy.
    Params:
        - relevancy_matrix: matrix of size n1 x n2 where n1 is the number of
          items in the first modality and n2 is the number of items in the
          second modality. The [ith, jth] element is the semantic relevancy
          between the ith item from the first modality and the jth item from
          the second modality.
        - k_counts: matrix of size n1 x n2 (see similarity_matrix above) which
          includes information on which items to use to calculate the DCG for
          (see calculate_k_counts for more info on this matrix).
    """
    return calculate_DCG(relevancy_matrix, relevancy_matrix, k_counts)


def calculate_nDCG(similarity_matrix, relevancy_matrix, k_counts=None, IDCG=None, reduction='mean'):
    """
    Calculates the normalised Discounted Cumulative Gain (nDCG) between two
    modalities for the first modality using the Discounted Cumulative Gain
    (DCG) and the Ideal Discounted Cumulative Gain (IDCG).
    nDCG = \frac{DCG}{IDCG}
    Params:
        - similarity_matrix: matrix of size n1 x n2 where n1 is the number of
          items in the first modality and n2 is the number of items in the second
          modality. The [ith,jth] element is the predicted similarity between
          the ith item from the first modality and the jth item from the second
          modality.
        - relevancy_matrix: matrix of size n1 x n2 (see similarity_matrix
          above). The [ith, jth] element is the semantic relevancy between the
          ith item from the first modality and the jth item from the second
          modality.
        - k_counts: optional parameter: matrix of size n1 x n2 (see
          similarity_matrix above) which includes information on which items to
          use to calculate the DCG for (see calculate_k_counts for more info on
          this matrix). This will be calculated using calculate_IDCG if not
          present, but should be pre-processed for efficiency.
        - IDCG: Optional parameter which includes the pre-processed Ideal
          Discounted Cumulative Gain (IDCG). This is a vector of size n1 (see
          similarity_matrix above) which contains the IDCG value for each item
          from the first modality. This will be calculated using calculate_IDCG
          if not present, but should be pre-processed for efficiency.
        - reduction: what to use to reduce the different nDCG scores. By
          default this applies np.mean across all different queries.
    Returns:
        - The nDCG values for the first modality.
    """
    if k_counts is None:
        k_counts = calculate_k_counts(relevancy_matrix)
    DCG = calculate_DCG(similarity_matrix, relevancy_matrix, k_counts)
    if IDCG is None:
        IDCG = calculate_IDCG(relevancy_matrix, k_counts)
    if reduction == 'mean':
        return np.mean(DCG / IDCG)
    elif reduction is None:
        return DCG / IDCG


def calculate_mAP(sim_mat, relevancy_matrix):
    """
    Computes the mean average precision according to the following formula of
    average precision:
    \frac{\sum_{k=1}^n p(k) x rel(k)}{num_rel_docs}
    where p(k) is the precision at k, rel(k) is an indicator function
    determining whether the kth returned item is relevant or not and
    num_rel_docs is the number of relevant items to find within the search.
    The mean average precision is the mean of the average precision for each
    query item (i.e row in the matrix)
    This function takes in two parameters:
        - sim_mat: a NxM matrix which represents the similarity between two
          modalities (with modality 1 being of size N and modality 2 of size M).
        - relevancy_matrix: an NxM matrix which represents the relevancy between two
          modalities of items (with modality 1 being of size N and modality 2 of
          size M).
    """
    # Find the order of the items in modality 2 according to modality 1
    ranked_order = (-sim_mat).argsort()
    ranked_sim_mat = sim_mat[np.arange(sim_mat.shape[0])[:, None], ranked_order]
    # re-order the relevancy matrix to accommodate the proposals
    ranked_rel_mat = relevancy_matrix[np.arange(relevancy_matrix.shape[0])[:, None], ranked_order]

    # find the number of relevant items found at each k
    cumulative_rel_mat = np.cumsum(ranked_rel_mat, axis=1)
    # Mask this ensuring that it is non zero if the kth term is 1 (rel(k) above)
    cumulative_rel_mat[ranked_rel_mat != 1] = 0
    # find the divisor for p(k)
    divisor = np.arange(ranked_rel_mat.shape[1]) + 1

    # find the number of relevant docs per query item
    number_rel_docs = np.sum(ranked_rel_mat == 1, axis=1)

    # find the average precision per query, within np.sum finds p(k) * rel(k)
    avg_precision = np.sum(cumulative_rel_mat / divisor, axis=1) / number_rel_docs
    mAP = np.mean(avg_precision)
    return mAP


def get_mAP(similarity_matrix, rel_matrix):
    vis_map = calculate_mAP(similarity_matrix, rel_matrix)
    txt_map = calculate_mAP(similarity_matrix.T, rel_matrix.T)
    return vis_map, txt_map, (vis_map + txt_map) / 2


def get_nDCG(similarity_matrix, rel_matrix):
    vis_k_counts = calculate_k_counts(rel_matrix)
    txt_k_counts = calculate_k_counts(rel_matrix.T)
    vis_IDCG = calculate_IDCG(rel_matrix, vis_k_counts)
    txt_IDCG = calculate_IDCG(rel_matrix.T, txt_k_counts)
    vis_nDCG = calculate_nDCG(similarity_matrix, rel_matrix, k_counts=vis_k_counts, IDCG=vis_IDCG)
    txt_nDCG = calculate_nDCG(similarity_matrix.T, rel_matrix.T, k_counts=txt_k_counts, IDCG=txt_IDCG)
    return vis_nDCG, txt_nDCG, (vis_nDCG + txt_nDCG) / 2
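Not part of the upload: a toy sanity check of get_mAP and get_nDCG above on a hypothetical 3-clip x 4-caption similarity matrix with graded relevancy; every row and column contains at least one relevant pair, so no division by zero occurs.

import numpy as np
from lavila.utils.evaluation_ek100mir import get_mAP, get_nDCG

sim = np.random.rand(3, 4)                  # e.g. cosine similarities between clips and captions
rel = np.array([[1.0, 0.5, 0.0, 0.0],       # graded semantic relevancy in [0, 1]
                [0.0, 1.0, 0.0, 1.0],
                [0.0, 0.0, 1.0, 0.5]])
v2t_map, t2v_map, avg_map = get_mAP(sim, rel)
v2t_ndcg, t2v_ndcg, avg_ndcg = get_nDCG(sim, rel)
print(f'mAP={avg_map:.3f}  nDCG={avg_ndcg:.3f}')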
lavila/utils/preprocess.py
ADDED
@@ -0,0 +1,86 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import csv

from lavila.models.tokenizer import MyBertTokenizer, MyDistilBertTokenizer, MyGPT2Tokenizer, SimpleTokenizer


def generate_label_map(dataset):
    if dataset == 'ek100_cls':
        print("Preprocess ek100 action label space")
        vn_list = []
        mapping_vn2narration = {}
        for f in [
            '/data/EK100/epic-kitchens-100-annotations/EPIC_100_train.csv',
            '/data/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv',
        ]:
            csv_reader = csv.reader(open(f))
            _ = next(csv_reader)  # skip the header
            for row in csv_reader:
                vn = '{}:{}'.format(int(row[10]), int(row[12]))
                narration = row[8]
                if vn not in vn_list:
                    vn_list.append(vn)
                if vn not in mapping_vn2narration:
                    mapping_vn2narration[vn] = [narration]
                else:
                    mapping_vn2narration[vn].append(narration)
                # mapping_vn2narration[vn] = [narration]
        vn_list = sorted(vn_list)
        print('# of action= {}'.format(len(vn_list)))
        mapping_vn2act = {vn: i for i, vn in enumerate(vn_list)}
        labels = [list(set(mapping_vn2narration[vn_list[i]])) for i in range(len(mapping_vn2act))]
        print(labels[:5])
    elif dataset == 'charades_ego':
        print("=> preprocessing charades_ego action label space")
        vn_list = []
        labels = []
        with open('/data/CharadesEgo/CharadesEgo/Charades_v1_classes.txt') as f:
            csv_reader = csv.reader(f)
            for row in csv_reader:
                vn = row[0][:4]
                vn_list.append(vn)
                narration = row[0][5:]
                labels.append(narration)
        mapping_vn2act = {vn: i for i, vn in enumerate(vn_list)}
        print(labels[:5])
    elif dataset == 'egtea':
        print("=> preprocessing egtea action label space")
        labels = []
        with open('/data/EGTEA/action_idx.txt') as f:
            for row in f:
                row = row.strip()
                narration = ' '.join(row.split(' ')[:-1])
                labels.append(narration.replace('_', ' ').lower())
                # labels.append(narration)
        mapping_vn2act = {label: i for i, label in enumerate(labels)}
        print(len(labels), labels[:5])
    else:
        raise NotImplementedError
    return labels, mapping_vn2act


def generate_tokenizer(model):
    if model.endswith('DISTILBERT_BASE'):
        tokenizer = MyDistilBertTokenizer('distilbert-base-uncased')
    elif model.endswith('BERT_BASE'):
        tokenizer = MyBertTokenizer('bert-base-uncased')
    elif model.endswith('BERT_LARGE'):
        tokenizer = MyBertTokenizer('bert-large-uncased')
    elif model.endswith('GPT2'):
        tokenizer = MyGPT2Tokenizer('gpt2', add_bos=True)
    elif model.endswith('GPT2_MEDIUM'):
        tokenizer = MyGPT2Tokenizer('gpt2-medium', add_bos=True)
    elif model.endswith('GPT2_LARGE'):
        tokenizer = MyGPT2Tokenizer('gpt2-large', add_bos=True)
    elif model.endswith('GPT2_XL'):
        tokenizer = MyGPT2Tokenizer('gpt2-xl', add_bos=True)
    else:
        print("Using SimpleTokenizer because of model '{}'. "
              "Please check if this is what you want".format(model))
        tokenizer = SimpleTokenizer()
    return tokenizer
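Not part of the upload: how generate_tokenizer dispatches on the model-name suffix; names matching none of the BERT/GPT-2 suffixes fall back to the CLIP SimpleTokenizer. The model name below is only an illustrative example.

from lavila.utils.preprocess import generate_tokenizer

tok = generate_tokenizer('CLIP_OPENAI_TIMESFORMER_BASE')   # no BERT/GPT-2 suffix -> SimpleTokenizer (with a notice)
ids = tok('rinse cutting board')
print(ids.shape)                                           # torch.Size([77])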
meta/ek100_mir/EPIC_100_retrieval_test_sentence.csv
ADDED
The diff for this file is too large to render.
See raw diff
meta/ek100_mir/relevancy_sel_t2v.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6eed7e20dbe71de579e467ca9ab340154ba434461c34a7d089f9291c90739d9f
size 232160
meta/ek100_mir/relevancy_sel_v2t.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f4054061cdc842aa090de3e5bca380af61be9e2e7d93cdacca0aa13237026ce4
size 92336
meta/ek100_mir/sel_t2v.csv
ADDED
@@ -0,0 +1,10 @@
4465,P16/P16_04.MP4,307.63,322.87,cut the sausage,7,86
4466,P16/P16_04.MP4,317.15,338.93,cut the sausage,7,86
4467,P16/P16_04.MP4,333.73,367.15,cut the sausage,7,86
3552,P11/P11_17.MP4,462.77,473.31,stir vegetables into salmon,10,94
3555,P11/P11_17.MP4,481.05,492.14,stir vegetables into salmon,10,94
3557,P11/P11_17.MP4,492.24,502.75,stir vegetables into salmon,10,94
730,P01/P01_15.MP4,605.88,619.71,rinse cutting board,2,18
2526,P07/P07_15.MP4,72.62,82.25,rinse board,2,18
4494,P16/P16_04.MP4,25.09,30.59,wash the cutting board,2,18
1760,P04/P04_28.MP4,9.03,12.69,pour coconut milk into pan,9,64
meta/ek100_mir/sel_v2t.csv
ADDED
@@ -0,0 +1,3 @@
762,P01/P01_15.MP4,72.97,73.87,take sponge,0,9
9051,P30/P30_08.MP4,203.34,204.35,open microwave,3,90
5920,P22/P22_01.MP4,224.87,226.46,rinse sponge,2,9
requirements.txt
ADDED
@@ -0,0 +1,14 @@
timm==0.5.4
torch==1.11.0
torchvision==0.12.0
decord==0.6.0
einops==0.4.1
pandas==1.4.2
pytorchvideo==0.1.5
transformers==4.27
ftfy==4.4.3
spacy==3.4.1
scikit-learn==1.1.1
numpy==1.22.3
gradio==4.19.1
gradio_rich_textbox==0.4.2