yeliudev committed
Commit 23fdbc0 · verified · 1 Parent(s): 198751c

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See the raw diff for the full changeset.
Files changed (50)
  1. .gitattributes +16 -0
  2. .gitignore +9 -0
  3. app.py +640 -0
  4. assets/bot.png +0 -0
  5. assets/user.png +0 -0
  6. data/10309844035.mp4 +3 -0
  7. data/13887487955.mp4 +3 -0
  8. data/4167294363.mp4 +3 -0
  9. data/4742652230.mp4 +3 -0
  10. data/4766274786.mp4 +3 -0
  11. data/5012237466.mp4 +3 -0
  12. data/5188348585.mp4 +3 -0
  13. data/9383140374.mp4 +3 -0
  14. data/DTInxNfWXVc_210.0_360.0.mp4 +3 -0
  15. data/RoripwjYFp8_210.0_360.0.mp4 +3 -0
  16. data/UFWQKrcbhjI_360.0_510.0.mp4 +3 -0
  17. data/Z3-IZ3HAmIA_60.0_210.0.mp4 +3 -0
  18. data/h6QKDqomIPk_210.0_360.0.mp4 +3 -0
  19. data/pA6Z-qYhSNg_60.0_210.0.mp4 +3 -0
  20. data/rrTIeJRVGjg_60.0_210.0.mp4 +3 -0
  21. data/yId2wIocTys_210.0_360.0.mp4 +3 -0
  22. requirements.txt +26 -0
  23. setup.cfg +16 -0
  24. videomind/constants.py +42 -0
  25. videomind/conversation.py +49 -0
  26. videomind/dataset/__init__.py +61 -0
  27. videomind/dataset/collator.py +40 -0
  28. videomind/dataset/hybrid.py +180 -0
  29. videomind/dataset/sub_classes/__init__.py +69 -0
  30. videomind/dataset/sub_classes/activitynet_captions.py +96 -0
  31. videomind/dataset/sub_classes/activitynet_rtl.py +68 -0
  32. videomind/dataset/sub_classes/cgbench.py +47 -0
  33. videomind/dataset/sub_classes/charades_sta.py +45 -0
  34. videomind/dataset/sub_classes/cosmo_cap.py +37 -0
  35. videomind/dataset/sub_classes/didemo.py +59 -0
  36. videomind/dataset/sub_classes/ego4d_naq.py +81 -0
  37. videomind/dataset/sub_classes/ego4d_nlq.py +41 -0
  38. videomind/dataset/sub_classes/ego_timeqa.py +93 -0
  39. videomind/dataset/sub_classes/hirest.py +150 -0
  40. videomind/dataset/sub_classes/internvit_vtime.py +45 -0
  41. videomind/dataset/sub_classes/longvideobench.py +53 -0
  42. videomind/dataset/sub_classes/lvbench.py +52 -0
  43. videomind/dataset/sub_classes/mlvu.py +55 -0
  44. videomind/dataset/sub_classes/mvbench.py +74 -0
  45. videomind/dataset/sub_classes/nextgqa.py +87 -0
  46. videomind/dataset/sub_classes/nextqa.py +63 -0
  47. videomind/dataset/sub_classes/qa_ego4d.py +98 -0
  48. videomind/dataset/sub_classes/queryd.py +49 -0
  49. videomind/dataset/sub_classes/qvhighlights.py +78 -0
  50. videomind/dataset/sub_classes/rextime.py +81 -0
.gitattributes CHANGED
@@ -33,3 +33,19 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/10309844035.mp4 filter=lfs diff=lfs merge=lfs -text
37
+ data/13887487955.mp4 filter=lfs diff=lfs merge=lfs -text
38
+ data/4167294363.mp4 filter=lfs diff=lfs merge=lfs -text
39
+ data/4742652230.mp4 filter=lfs diff=lfs merge=lfs -text
40
+ data/4766274786.mp4 filter=lfs diff=lfs merge=lfs -text
41
+ data/5012237466.mp4 filter=lfs diff=lfs merge=lfs -text
42
+ data/5188348585.mp4 filter=lfs diff=lfs merge=lfs -text
43
+ data/9383140374.mp4 filter=lfs diff=lfs merge=lfs -text
44
+ data/DTInxNfWXVc_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
45
+ data/RoripwjYFp8_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
46
+ data/UFWQKrcbhjI_360.0_510.0.mp4 filter=lfs diff=lfs merge=lfs -text
47
+ data/Z3-IZ3HAmIA_60.0_210.0.mp4 filter=lfs diff=lfs merge=lfs -text
48
+ data/h6QKDqomIPk_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
49
+ data/pA6Z-qYhSNg_60.0_210.0.mp4 filter=lfs diff=lfs merge=lfs -text
50
+ data/rrTIeJRVGjg_60.0_210.0.mp4 filter=lfs diff=lfs merge=lfs -text
51
+ data/yId2wIocTys_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__
3
+ *.egg-info
4
+ *.py[cod]
5
+ *$py.class
6
+
7
+ # Temporary data
8
+ .DS_Store
9
+ ._*
app.py ADDED
@@ -0,0 +1,640 @@
1
+ # Copyright (c) 2024 Ye Liu. Licensed under the BSD-3-Clause license.
2
+
3
+ import html
4
+ import json
5
+ import os
6
+ import random
7
+ import time
8
+ from functools import partial
9
+ from threading import Thread
10
+
11
+ import gradio as gr
12
+ import nncore
13
+ import torch
14
+ from huggingface_hub import snapshot_download
15
+ from transformers import TextIteratorStreamer
16
+
17
+ from videomind.constants import GROUNDER_PROMPT, PLANNER_PROMPT, VERIFIER_PROMPT
18
+ from videomind.dataset.utils import process_vision_info
19
+ from videomind.model.builder import build_model
20
+ from videomind.utils.io import get_duration
21
+ from videomind.utils.parser import parse_query, parse_span
22
+
23
+ BASE_MODEL = 'model_zoo/Qwen2-VL-2B-Instruct'
24
+ BASE_MODEL_HF = 'Qwen/Qwen2-VL-2B-Instruct'
25
+
26
+ MODEL = 'model_zoo/VideoMind-2B'
27
+ MODEL_HF = 'yeliudev/VideoMind-2B'
28
+
29
+ TITLE = 'VideoMind: A Chain-of-LoRA Agent for Long Video Reasoning'
30
+
31
+ TITLE_MD = f'<h1 align="center">💡 {TITLE}</h1>'
32
+ DESCRIPTION_MD = """VideoMind is a multi-modal agent framework that enhances video reasoning by emulating *human-like* processes, such as *breaking down tasks*, *localizing and verifying moments*, and *synthesizing answers*. This approach addresses the unique challenges of temporal-grounded reasoning in a progressive strategy. Please find more details at our <a href="https://videomind.github.io/" target="_blank">Project Page</a>, <a href="https://arxiv.org/abs/2503.13444" target="_blank">Tech Report</a> and <a href="https://github.com/yeliudev/VideoMind" target="_blank">GitHub Repo</a>.""" # noqa
33
+
34
+ # yapf:disable
35
+ EXAMPLES = [
36
+ ('data/4167294363.mp4', 'Why did the old man stand up?', ['pla', 'gnd', 'ver', 'ans']),
37
+ ('data/5012237466.mp4', 'How does the child in stripes react about the fountain?', ['pla', 'gnd', 'ver', 'ans']),
38
+ ('data/13887487955.mp4', 'What did the excavator do after it pushed the cement forward?', ['pla', 'gnd', 'ver', 'ans']),
39
+ ('data/5188348585.mp4', 'What did the person do before pouring the liquor?', ['pla', 'gnd', 'ver', 'ans']),
40
+ ('data/4766274786.mp4', 'What did the girl do after the baby lost the balloon?', ['pla', 'gnd', 'ver', 'ans']),
41
+ ('data/4742652230.mp4', 'Why is the girl pushing the boy only around the toy but not to other places?', ['pla', 'gnd', 'ver', 'ans']),
42
+ ('data/9383140374.mp4', 'How does the girl in pink control the movement of the claw?', ['pla', 'gnd', 'ver', 'ans']),
43
+ ('data/10309844035.mp4', 'Why are they holding up the phones?', ['pla', 'gnd', 'ver', 'ans']),
44
+ ('data/pA6Z-qYhSNg_60.0_210.0.mp4', 'Different types of meat products are being cut, shaped and prepared', ['gnd', 'ver']),
45
+ ('data/UFWQKrcbhjI_360.0_510.0.mp4', 'A man talks to the camera whilst walking along a roadside in a rural area', ['gnd', 'ver']),
46
+ ('data/RoripwjYFp8_210.0_360.0.mp4', 'A woman wearing glasses eating something at a street market', ['gnd', 'ver']),
47
+ ('data/h6QKDqomIPk_210.0_360.0.mp4', 'A toddler sits in his car seat, holding his yellow tablet', ['gnd', 'ver']),
48
+ ('data/Z3-IZ3HAmIA_60.0_210.0.mp4', 'A view from the window as the plane accelerates and takes off from the runway', ['gnd', 'ver']),
49
+ ('data/yId2wIocTys_210.0_360.0.mp4', "Temporally locate the visual content mentioned in the text query 'kids exercise in front of parked cars' within the video.", ['pla', 'gnd', 'ver']),
50
+ ('data/rrTIeJRVGjg_60.0_210.0.mp4', "Localize the moment that provides relevant context about 'man stands in front of a white building monologuing'.", ['pla', 'gnd', 'ver']),
51
+ ('data/DTInxNfWXVc_210.0_360.0.mp4', "Find the video segment that corresponds to the given textual query 'man with headphones talking'.", ['pla', 'gnd', 'ver']),
52
+ ]
53
+ # yapf:enable
54
+
55
+ CSS = """button .box { text-align: left }"""
56
+
57
+ JS = """
58
+ function init() {
59
+ var info = document.getElementById('role').querySelectorAll('[class^="svelte"]')[1]
60
+ info.innerHTML = info.innerHTML.replace(/&lt;/g, '<').replace(/&gt;/g, '>')
61
+ }
62
+ """
63
+
64
+
65
+ class CustomStreamer(TextIteratorStreamer):
66
+
67
+ def put(self, value):
68
+ if len(value.shape) > 1 and value.shape[0] > 1:
69
+ raise ValueError('TextStreamer only supports batch size 1')
70
+ elif len(value.shape) > 1:
71
+ value = value[0]
72
+
73
+ if self.skip_prompt and self.next_tokens_are_prompt:
74
+ self.next_tokens_are_prompt = False
75
+ return
76
+
77
+ self.token_cache.extend(value.tolist())
78
+
79
+ # force skipping eos token
80
+ if self.token_cache[-1] == self.tokenizer.eos_token_id:
81
+ self.token_cache = self.token_cache[:-1]
82
+
83
+ text = self.tokenizer.decode(self.token_cache, **self.decode_kwargs)
84
+
85
+ # cache decoded text for future use
86
+ self.text_cache = text
87
+
88
+ if text.endswith('\n'):
89
+ printable_text = text[self.print_len:]
90
+ self.token_cache = []
91
+ self.print_len = 0
92
+ elif len(text) > 0 and self._is_chinese_char(ord(text[-1])):
93
+ printable_text = text[self.print_len:]
94
+ self.print_len += len(printable_text)
95
+ else:
96
+ printable_text = text[self.print_len:text.rfind(' ') + 1]
97
+ self.print_len += len(printable_text)
98
+
99
+ self.on_finalized_text(printable_text)
100
+
101
+
102
+ def seconds_to_hms(seconds):
103
+ hours, remainder = divmod(round(seconds), 3600)
104
+ minutes, seconds = divmod(remainder, 60)
105
+ return f'{hours:02}:{minutes:02}:{seconds:02}'
106
+
107
+
108
+ def enable_btns():
109
+ return (gr.Button(interactive=True), ) * 3
110
+
111
+
112
+ def disable_btns():
113
+ return (gr.Button(interactive=False), ) * 3
114
+
115
+
116
+ def update_placeholder(role):
117
+ placeholder = 'Ask a question about the video...' if 'ans' in role else 'Write a query to search for a moment...'
118
+ return gr.Textbox(placeholder=placeholder)
119
+
120
+
121
+ def main(video, prompt, role, temperature, max_new_tokens, model, processor, streamer, device):
122
+ history = []
123
+
124
+ if not video:
125
+ gr.Warning('Please upload a video or click [Random] to sample one.')
126
+ return history
127
+
128
+ if not prompt:
129
+ gr.Warning('Please provide a prompt or click [Random] to sample one.')
130
+ return history
131
+
132
+ if 'gnd' not in role and 'ans' not in role:
133
+ gr.Warning('Please select at least Grounder or Answerer.')
134
+ return history
135
+
136
+ if 'ver' in role and 'gnd' not in role:
137
+ gr.Warning('Verifier cannot be used without Grounder.')
138
+ return history
139
+
140
+ if 'pla' in role and any(k not in role for k in ('gnd', 'ver', 'ans')):
141
+ gr.Warning('Planner can only be used when all other roles are selected.')
142
+ return history
143
+
144
+ history.append({'role': 'user', 'content': prompt})
145
+ yield history
146
+
147
+ duration = get_duration(video)
148
+
149
+ # do grounding and answering by default
150
+ do_grounding = True
151
+ do_answering = True
152
+
153
+ # initialize grounding query as prompt
154
+ query = prompt
155
+
156
+ if 'pla' in role:
157
+ text = PLANNER_PROMPT.format(prompt)
158
+
159
+ history.append({
160
+ 'metadata': {
161
+ 'title': '🗺️ Working as Planner...'
162
+ },
163
+ 'role': 'assistant',
164
+ 'content': f'##### Planner Prompt:\n\n{html.escape(text)}\n\n##### Planner Response:\n\n...'
165
+ })
166
+ yield history
167
+
168
+ start_time = time.perf_counter()
169
+
170
+ messages = [{
171
+ 'role':
172
+ 'user',
173
+ 'content': [{
174
+ 'type': 'video',
175
+ 'video': video,
176
+ 'num_threads': 1,
177
+ 'min_pixels': 36 * 28 * 28,
178
+ 'max_pixels': 64 * 28 * 28,
179
+ 'max_frames': 100,
180
+ 'fps': 1.0
181
+ }, {
182
+ 'type': 'text',
183
+ 'text': text
184
+ }]
185
+ }]
186
+
187
+ text = processor.apply_chat_template(messages, add_generation_prompt=True)
188
+
189
+ images, videos = process_vision_info(messages)
190
+ data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
191
+ data = data.to(device)
192
+
193
+ model.base_model.disable_adapter_layers()
194
+ model.base_model.enable_adapter_layers()
195
+ model.set_adapter('planner')
196
+
197
+ generation_kwargs = dict(
198
+ **data,
199
+ streamer=streamer,
200
+ do_sample=temperature > 0,
201
+ temperature=temperature if temperature > 0 else None,
202
+ top_p=None,
203
+ top_k=None,
204
+ repetition_penalty=None,
205
+ max_new_tokens=max_new_tokens)
206
+
207
+ t = Thread(target=model.generate, kwargs=generation_kwargs)
208
+ t.start()
209
+
210
+ skipped = False
211
+ for i, text in enumerate(streamer):
212
+ if text and not skipped:
213
+ history[-1]['content'] = history[-1]['content'].rstrip('.')
214
+ skipped = True
215
+ history[-1]['content'] += text
216
+ yield history
217
+
218
+ elapsed_time = round(time.perf_counter() - start_time, 1)
219
+ history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
220
+ yield history
221
+
222
+ try:
223
+ parsed = json.loads(streamer.text_cache)
224
+ action = parsed[0] if isinstance(parsed, list) else parsed
225
+ if action['type'].lower() == 'grounder' and action['value']:
226
+ query = action['value']
227
+ elif action['type'].lower() == 'answerer':
228
+ do_grounding = False
229
+ do_answering = True
230
+ except Exception:
231
+ pass
232
+
233
+ response = 'After browsing the video and the question, my plan to figure out the answer is as follows:\n'
234
+ step_idx = 1
235
+ if 'gnd' in role and do_grounding:
236
+ response += f'\n{step_idx}. Localize the relevant moment in this video using the query "<span style="color:red">{query}</span>".'
237
+ step_idx += 1
238
+ if 'ver' in role and do_grounding:
239
+ response += f'\n{step_idx}. Verify the grounded moments one-by-one and select the best candidate.'
240
+ step_idx += 1
241
+ if 'ans' in role and do_answering:
242
+ if step_idx > 1:
243
+ response += f'\n{step_idx}. Crop the video segment and zoom in to a higher resolution.'
244
+ else:
245
+ response += f'\n{step_idx}. Analyze the whole video directly without cropping.'
246
+
247
+ history.append({'role': 'assistant', 'content': ''})
248
+ for i, text in enumerate(response.split(' ')):
249
+ history[-1]['content'] += ' ' + text if i > 0 else text
250
+ yield history
251
+
252
+ if 'gnd' in role and do_grounding:
253
+ query = parse_query(query)
254
+
255
+ text = GROUNDER_PROMPT.format(query)
256
+
257
+ history.append({
258
+ 'metadata': {
259
+ 'title': '🔍 Working as Grounder...'
260
+ },
261
+ 'role': 'assistant',
262
+ 'content': f'##### Grounder Prompt:\n\n{html.escape(text)}\n\n##### Grounder Response:\n\n...'
263
+ })
264
+ yield history
265
+
266
+ start_time = time.perf_counter()
267
+
268
+ messages = [{
269
+ 'role':
270
+ 'user',
271
+ 'content': [{
272
+ 'type': 'video',
273
+ 'video': video,
274
+ 'num_threads': 1,
275
+ 'min_pixels': 36 * 28 * 28,
276
+ 'max_pixels': 64 * 28 * 28,
277
+ 'max_frames': 150,
278
+ 'fps': 1.0
279
+ }, {
280
+ 'type': 'text',
281
+ 'text': text
282
+ }]
283
+ }]
284
+
285
+ text = processor.apply_chat_template(messages, add_generation_prompt=True)
286
+ images, videos = process_vision_info(messages)
287
+ data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
288
+ data = data.to(device)
289
+
290
+ model.base_model.disable_adapter_layers()
291
+ model.base_model.enable_adapter_layers()
292
+ model.set_adapter('grounder')
293
+
294
+ generation_kwargs = dict(
295
+ **data,
296
+ streamer=streamer,
297
+ do_sample=temperature > 0,
298
+ temperature=temperature if temperature > 0 else None,
299
+ top_p=None,
300
+ top_k=None,
301
+ repetition_penalty=None,
302
+ max_new_tokens=max_new_tokens)
303
+
304
+ t = Thread(target=model.generate, kwargs=generation_kwargs)
305
+ t.start()
306
+
307
+ skipped = False
308
+ for i, text in enumerate(streamer):
309
+ if text and not skipped:
310
+ history[-1]['content'] = history[-1]['content'].rstrip('.')
311
+ skipped = True
312
+ history[-1]['content'] += text
313
+ yield history
314
+
315
+ elapsed_time = round(time.perf_counter() - start_time, 1)
316
+ history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
317
+ yield history
318
+
319
+ if len(model.reg) > 0:
320
+ # 1. extract timestamps and confidences
321
+ blob = model.reg[0].cpu().float()
322
+ pred, conf = blob[:, :2] * duration, blob[:, -1].tolist()
323
+
324
+ # 2. clamp timestamps
325
+ pred = pred.clamp(min=0, max=duration)
326
+
327
+ # 3. sort timestamps
328
+ inds = (pred[:, 1] - pred[:, 0] < 0).nonzero()[:, 0]
329
+ pred[inds] = pred[inds].roll(1)
330
+
331
+ # 4. convert timestamps to list
332
+ pred = pred.tolist()
333
+ else:
334
+ if 'ver' in role:
335
+ pred = [[i * duration / 6, (i + 2) * duration / 6] for i in range(5)]
336
+ conf = [0] * 5
337
+ else:
338
+ pred = [[0, duration]]
339
+ conf = [0]
340
+
341
+ response = 'The candidate moments and confidence scores are as follows:\n'
342
+ response += '\n| ID | Start Time | End Time | Confidence |'
343
+ response += '\n| :-: | :-: | :-: | :-: |'
344
+
345
+ # using top-5 predictions
346
+ for i, (p, c) in enumerate(zip(pred[:5], conf[:5])):
347
+ response += f'\n| {i} | {seconds_to_hms(p[0])} | {seconds_to_hms(p[1])} | {c:.2f} |'
348
+
349
+ response += f'\n\nTherefore, the target moment might happen from <span style="color:red">{seconds_to_hms(pred[0][0])}</span> to <span style="color:red">{seconds_to_hms(pred[0][1])}</span>.'
350
+
351
+ history.append({'role': 'assistant', 'content': ''})
352
+ for i, text in enumerate(response.split(' ')):
353
+ history[-1]['content'] += ' ' + text if i > 0 else text
354
+ yield history
355
+
356
+ if 'ver' in role and do_grounding:
357
+ text = VERIFIER_PROMPT.format(query)
358
+
359
+ history.append({
360
+ 'metadata': {
361
+ 'title': '📊 Working as Verifier...'
362
+ },
363
+ 'role': 'assistant',
364
+ 'content': f'##### Verifier Prompt:\n\n{html.escape(text)}\n\n##### Verifier Response:\n\n...'
365
+ })
366
+ yield history
367
+
368
+ start_time = time.perf_counter()
369
+
370
+ # using top-5 predictions
371
+ prob = []
372
+ for i, cand in enumerate(pred[:5]):
373
+ s0, e0 = parse_span(cand, duration, 2)
374
+ offset = (e0 - s0) / 2
375
+ s1, e1 = parse_span([s0 - offset, e0 + offset], duration)
376
+
377
+ # percentage of s0, e0 within s1, e1
378
+ s = (s0 - s1) / (e1 - s1)
379
+ e = (e0 - s1) / (e1 - s1)
380
+
381
+ messages = [{
382
+ 'role':
383
+ 'user',
384
+ 'content': [{
385
+ 'type': 'video',
386
+ 'video': video,
387
+ 'num_threads': 1,
388
+ 'video_start': s1,
389
+ 'video_end': e1,
390
+ 'min_pixels': 36 * 28 * 28,
391
+ 'max_pixels': 64 * 28 * 28,
392
+ 'max_frames': 64,
393
+ 'fps': 2.0
394
+ }, {
395
+ 'type': 'text',
396
+ 'text': text
397
+ }]
398
+ }]
399
+
400
+ text = processor.apply_chat_template(messages, add_generation_prompt=True)
401
+ images, videos = process_vision_info(messages)
402
+ data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
403
+
404
+ # ===== insert segment start/end tokens =====
405
+ video_grid_thw = data['video_grid_thw'][0]
406
+ num_frames, window = int(video_grid_thw[0]), int(video_grid_thw[1] * video_grid_thw[2] / 4)
407
+ assert num_frames * window * 4 == data['pixel_values_videos'].size(0)
408
+
409
+ pos_s, pos_e = round(s * num_frames), round(e * num_frames)
410
+ pos_s, pos_e = min(max(0, pos_s), num_frames), min(max(0, pos_e), num_frames)
411
+ assert pos_s <= pos_e, (num_frames, s, e)
412
+
413
+ base_idx = torch.nonzero(data['input_ids'][0] == model.config.vision_start_token_id).item()
414
+ pos_s, pos_e = pos_s * window + base_idx + 1, pos_e * window + base_idx + 2
415
+
416
+ input_ids = data['input_ids'][0].tolist()
417
+ input_ids.insert(pos_s, model.config.seg_s_token_id)
418
+ input_ids.insert(pos_e, model.config.seg_e_token_id)
419
+ data['input_ids'] = torch.LongTensor([input_ids])
420
+ data['attention_mask'] = torch.ones_like(data['input_ids'])
421
+ # ===========================================
422
+
423
+ data = data.to(device)
424
+
425
+ model.base_model.disable_adapter_layers()
426
+ model.base_model.enable_adapter_layers()
427
+ model.set_adapter('verifier')
428
+
429
+ with torch.inference_mode():
430
+ logits = model(**data).logits[0, -1].softmax(dim=-1)
431
+
432
+ # NOTE: magic numbers here
433
+ # In Qwen2-VL vocab: 9454 -> Yes, 2753 -> No
434
+ score = (logits[9454] - logits[2753]).sigmoid().item()
435
+ prob.append(score)
436
+
437
+ if i == 0:
438
+ history[-1]['content'] = history[-1]['content'].rstrip('.')[:-1]
439
+
440
+ response = f'\nCandidate ID {i}: P(Yes) = {score:.2f}'
441
+ for j, text in enumerate(response.split(' ')):
442
+ history[-1]['content'] += ' ' + text if j > 0 else text
443
+ yield history
444
+
445
+ elapsed_time = round(time.perf_counter() - start_time, 1)
446
+ history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
447
+ yield history
448
+
449
+ ranks = torch.Tensor(prob).argsort(descending=True).tolist()
450
+
451
+ prob = [prob[idx] for idx in ranks]
452
+ pred = [pred[idx] for idx in ranks]
453
+ conf = [conf[idx] for idx in ranks]
454
+
455
+ response = 'After verification, the candidate moments are re-ranked as follows:\n'
456
+ response += '\n| ID | Start Time | End Time | Score |'
457
+ response += '\n| :-: | :-: | :-: | :-: |'
458
+
459
+ ids = list(range(len(ranks)))
460
+ for r, p, c in zip(ranks, pred, prob):
461
+ response += f'\n| {ids[r]} | {seconds_to_hms(p[0])} | {seconds_to_hms(p[1])} | {c:.2f} |'
462
+
463
+ response += f'\n\nTherefore, the target moment should be from <span style="color:red">{seconds_to_hms(pred[0][0])}</span> to <span style="color:red">{seconds_to_hms(pred[0][1])}</span>.'
464
+
465
+ history.append({'role': 'assistant', 'content': ''})
466
+ for i, text in enumerate(response.split(' ')):
467
+ history[-1]['content'] += ' ' + text if i > 0 else text
468
+ yield history
469
+
470
+ if 'ans' in role and do_answering:
471
+ text = f'{prompt} Please think step by step and provide your response.'
472
+
473
+ history.append({
474
+ 'metadata': {
475
+ 'title': '📝 Working as Answerer...'
476
+ },
477
+ 'role': 'assistant',
478
+ 'content': f'##### Answerer Prompt:\n\n{html.escape(text)}\n\n##### Answerer Response:\n\n...'
479
+ })
480
+ yield history
481
+
482
+ start_time = time.perf_counter()
483
+
484
+ # choose the potential best moment
485
+ selected = pred[0] if 'gnd' in role and do_grounding else [0, duration]
486
+ s, e = parse_span(selected, duration, 32)
487
+
488
+ messages = [{
489
+ 'role':
490
+ 'user',
491
+ 'content': [{
492
+ 'type': 'video',
493
+ 'video': video,
494
+ 'num_threads': 1,
495
+ 'video_start': s,
496
+ 'video_end': e,
497
+ 'min_pixels': 128 * 28 * 28,
498
+ 'max_pixels': 256 * 28 * 28,
499
+ 'max_frames': 32,
500
+ 'fps': 2.0
501
+ }, {
502
+ 'type': 'text',
503
+ 'text': text
504
+ }]
505
+ }]
506
+
507
+ text = processor.apply_chat_template(messages, add_generation_prompt=True)
508
+ images, videos = process_vision_info(messages)
509
+ data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
510
+ data = data.to(device)
511
+
512
+ with model.disable_adapter():
513
+ generation_kwargs = dict(
514
+ **data,
515
+ streamer=streamer,
516
+ do_sample=temperature > 0,
517
+ temperature=temperature if temperature > 0 else None,
518
+ top_p=None,
519
+ top_k=None,
520
+ repetition_penalty=None,
521
+ max_new_tokens=max_new_tokens)
522
+
523
+ t = Thread(target=model.generate, kwargs=generation_kwargs)
524
+ t.start()
525
+
526
+ skipped = False
527
+ for i, text in enumerate(streamer):
528
+ if text and not skipped:
529
+ history[-1]['content'] = history[-1]['content'].rstrip('.')
530
+ skipped = True
531
+ history[-1]['content'] += text
532
+ yield history
533
+
534
+ elapsed_time = round(time.perf_counter() - start_time, 1)
535
+ history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
536
+ yield history
537
+
538
+ if 'gnd' in role and do_grounding:
539
+ response = f'After zooming in and analyzing the target moment, I finalize my answer: <span style="color:green">{streamer.text_cache}</span>'
540
+ else:
541
+ response = f'After watching the whole video, my answer is: <span style="color:green">{streamer.text_cache}</span>'
542
+
543
+ history.append({'role': 'assistant', 'content': ''})
544
+ for i, text in enumerate(response.split(' ')):
545
+ history[-1]['content'] += ' ' + text if i > 0 else text
546
+ yield history
547
+
548
+
549
+ if __name__ == '__main__':
550
+ if not nncore.is_dir(BASE_MODEL):
551
+ snapshot_download(BASE_MODEL_HF, local_dir=BASE_MODEL)
552
+
553
+ if not nncore.is_dir(MODEL):
554
+ snapshot_download(MODEL_HF, local_dir=MODEL)
555
+
556
+ print('Initializing role *grounder*')
557
+ model, processor = build_model(MODEL)
558
+
559
+ print('Initializing role *planner*')
560
+ model.load_adapter(nncore.join(MODEL, 'planner'), adapter_name='planner')
561
+
562
+ print('Initializing role *verifier*')
563
+ model.load_adapter(nncore.join(MODEL, 'verifier'), adapter_name='verifier')
564
+
565
+ streamer = CustomStreamer(processor.tokenizer, skip_prompt=True)
566
+
567
+ device = next(model.parameters()).device
568
+
569
+ main = partial(main, model=model, processor=processor, streamer=streamer, device=device)
570
+
571
+ path = os.path.dirname(os.path.realpath(__file__))
572
+
573
+ chat = gr.Chatbot(
574
+ type='messages',
575
+ height='70vh',
576
+ avatar_images=[f'{path}/assets/user.png', f'{path}/assets/bot.png'],
577
+ placeholder='A conversation with VideoMind',
578
+ label='VideoMind')
579
+
580
+ prompt = gr.Textbox(label='Text Prompt', placeholder='Ask a question about the video...')
581
+
582
+ with gr.Blocks(title=TITLE, css=CSS, js=JS) as demo:
583
+ gr.Markdown(TITLE_MD)
584
+ gr.Markdown(DESCRIPTION_MD)
585
+
586
+ with gr.Row():
587
+ with gr.Column(scale=3):
588
+ video = gr.Video()
589
+
590
+ with gr.Group():
591
+ role = gr.CheckboxGroup(
592
+ choices=[('🗺️ Planner', 'pla'), ('🔍 Grounder', 'gnd'), ('📊 Verifier', 'ver'),
593
+ ('📝 Answerer', 'ans')],
594
+ value=['pla', 'gnd', 'ver', 'ans'],
595
+ interactive=True,
596
+ elem_id='role',
597
+ label='Role(s) To Use',
598
+ info='[Auto Planning]: Planner + Grounder + Verifier + Answerer<br>'
599
+ '[Grounded Video Question-Answering]: Grounder + Verifier + Answerer<br>'
600
+ '[Video Temporal Grounding]: Grounder + Verifier<br>'
601
+ '[Direct Video Question-Answering]: Answerer<br>')
602
+ role.change(update_placeholder, role, prompt)
603
+
604
+ with gr.Accordion(label='Hyperparameters', open=False):
605
+ temperature = gr.Slider(
606
+ 0,
607
+ 1,
608
+ value=0,
609
+ step=0.1,
610
+ interactive=True,
611
+ label='Temperature',
612
+ info='Higher value leads to more creativity and randomness (Default: 0)')
613
+ max_new_tokens = gr.Slider(
614
+ 1,
615
+ 1024,
616
+ value=256,
617
+ interactive=True,
618
+ label='Max Output Tokens',
619
+ info='The maximum number of output tokens for each role (Default: 256)')
620
+
621
+ prompt.render()
622
+
623
+ with gr.Row():
624
+ random_btn = gr.Button(value='🔮 Random')
625
+ random_btn.click(lambda: random.choice(EXAMPLES), None, [video, prompt, role])
626
+
627
+ reset_btn = gr.ClearButton([video, prompt, chat], value='🗑️ Reset')
628
+ reset_btn.click(lambda: (['pla', 'gnd', 'ver', 'ans'], 0, 256), None,
629
+ [role, temperature, max_new_tokens])
630
+
631
+ submit_btn = gr.Button(value='🚀 Submit', variant='primary')
632
+ submit_ctx = submit_btn.click(disable_btns, None, [random_btn, reset_btn, submit_btn])
633
+ submit_ctx = submit_ctx.then(main, [video, prompt, role, temperature, max_new_tokens], chat)
634
+ submit_ctx.then(enable_btns, None, [random_btn, reset_btn, submit_btn])
635
+
636
+ with gr.Column(scale=5):
637
+ chat.render()
638
+
639
+ demo.queue()
640
+ demo.launch(server_name='0.0.0.0')
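
Note on the verifier step in app.py above: the token ids 9454 ('Yes') and 2753 ('No') are hard-coded for the Qwen2-VL vocabulary (see the "magic numbers" comment). A minimal sketch of how those ids could instead be resolved from the tokenizer at runtime, assuming each word encodes to a single token (worth verifying for any other tokenizer):

# Sketch only: look up the 'Yes'/'No' token ids rather than hard-coding them.
# Assumes both words map to exactly one token, as they do in the Qwen2-VL vocab.
yes_ids = processor.tokenizer.encode('Yes', add_special_tokens=False)
no_ids = processor.tokenizer.encode('No', add_special_tokens=False)
assert len(yes_ids) == 1 and len(no_ids) == 1
score = (logits[yes_ids[0]] - logits[no_ids[0]]).sigmoid().item()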
assets/bot.png ADDED
assets/user.png ADDED
data/10309844035.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8996ff134787d6b769c2491b9079a02c05953465ad770f07a8d9138e2668d24f
3
+ size 4041678
data/13887487955.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5fecab1076ee42b3804718f9f64bef06cbfafd6995ad5f5ee42ba6354721429
3
+ size 5544739
data/4167294363.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d0e0a4a381836f68e16a816d87f241fed3e31ea321f544b921743d6c1c50666
3
+ size 6611151
data/4742652230.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8733ab4b0716d13ea7a79fc4ddacaf9eede567db364f0ecddfa4582c2f237f82
3
+ size 2200304
data/4766274786.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afa38a9ce9e89f934293214d79755c89159664223b3ca366813fd5fe524ed013
3
+ size 3395545
data/5012237466.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd1929aa93d037f809f402e9801047125dc9fe8060301e69ded9ba1f2d785cc8
3
+ size 4822293
data/5188348585.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b225f448a546ba2f65958f18c6731a6dde9b1f437014e90036b22eb40e9ad0a5
3
+ size 5051675
data/9383140374.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30b6b3eb43f711bef194150d473a59850ff5d7fec0f5cc30e7526aa9e382303f
3
+ size 2518081
data/DTInxNfWXVc_210.0_360.0.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a09eee0dc404688731fb768c120d3519605f2343376b9bd727a71b91379fd9a9
3
+ size 4999970
data/RoripwjYFp8_210.0_360.0.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b39b15158dc20c0bc6f1758a9239c8f3eed20ba4a90953338eec2246fa8f1f0
3
+ size 9287252
data/UFWQKrcbhjI_360.0_510.0.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8669153d9ffac4b5534c20fab8d795347f5babe588da9b8330e049d623ebb443
3
+ size 14510618
data/Z3-IZ3HAmIA_60.0_210.0.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b3a342993ee61efc5f3b859cd9c1e0d360b3331eed9deb8466891e4bcacc554
3
+ size 14397799
data/h6QKDqomIPk_210.0_360.0.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:103820de2b8a1a3935b39ed80d91cd08e546e5617310b3d1bb3dadb06b2ffb95
3
+ size 13485144
data/pA6Z-qYhSNg_60.0_210.0.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c84660fd4ebd8c23a2a7364174b1e819fec8b0e1cb8b9d9cd86f9e429cbdf66c
3
+ size 8658509
data/rrTIeJRVGjg_60.0_210.0.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efe6f48a49963bd4880ef5065840e05dd25e2aa975870140bcdaf4220bbd2827
3
+ size 11410412
data/yId2wIocTys_210.0_360.0.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:447fcb1fd1f94ed6a88d56dd0f6f859646cb8c58ed8e3b7a82f374e2cfee1646
3
+ size 14769130
requirements.txt ADDED
@@ -0,0 +1,26 @@
1
+ accelerate==1.2.1
2
+ decord==0.6.0
3
+ gradio==4.44.1
4
+ pandas==2.2.3
5
+ peft==0.14.0
6
+ pysrt==1.1.2
7
+ scikit-image==0.25.0
8
+ scikit-learn==1.6.1
9
+ sentencepiece==0.2.0
10
+ termplotlib==0.3.9
11
+ triton==3.0.0
12
+
13
+ # our codebase contains necessary patches for 4.45.2
14
+ transformers==4.45.2
15
+
16
+ # https://github.com/microsoft/DeepSpeed/issues/6793
17
+ deepspeed==0.15.4
18
+
19
+ # https://github.com/pytorch/pytorch/issues/138386
20
+ torch==2.4.1
21
+ torchvision==0.19.1
22
+
23
+ # torch-npu only supports torch 2.4.0
24
+ # torch==2.4.0+cpu
25
+ # torch-npu==2.4.0.post2
26
+ # torchvision==0.19.0+cpu
setup.cfg ADDED
@@ -0,0 +1,16 @@
1
+ [yapf]
2
+ column_limit = 120
3
+ based_on_style = pep8
4
+ blank_line_before_nested_class_or_def = true
5
+ split_before_expression_after_opening_paren = true
6
+
7
+ [isort]
8
+ line_length = 120
9
+ multi_line_output = 0
10
+ known_third_party = decord,deepspeed,gradio,huggingface_hub,nncore,numpy,pandas,peft,PIL,pysrt,safetensors,tabulate,termplotlib,torch,torchvision,transformers
11
+ no_lines_before = STDLIB,LOCALFOLDER
12
+ default_section = FIRSTPARTY
13
+
14
+ [flake8]
15
+ max-line-length = 500
16
+ extend-ignore = E741
videomind/constants.py ADDED
@@ -0,0 +1,42 @@
1
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
2
+
3
+ IGNORE_INDEX = -100
4
+
5
+ REG_TOKEN = '<|reg|>'
6
+
7
+ SEG_S_TOKEN = '<|seg_start|>'
8
+ SEG_E_TOKEN = '<|seg_end|>'
9
+
10
+ PLANNER_PROMPT = (
11
+ 'You are acting as the planner now. '
12
+ 'Given a question about the video, your task is to analyze the question and identify the best way to answer this question. '
13
+ 'You have access to the following tools:\n\n'
14
+ 'Grounder: Accepts a text query and localize the relevant video segment according to the query.\n'
15
+ 'Verifier: A tool supporting grounder by verifying the reliability of its outputs.\n'
16
+ 'Answerer: Answer a given question directly based on the whole video or a cropped video segment.\n\n'
17
+ 'Your response must be a list in JSON format. '
18
+ 'A valid plan for reasoning could be "grounder, verifier, answerer", "grounder, verifier", or "answerer", depending on the given question. '
19
+ 'Please see an example for the format below.\n\n'
20
+ '[{{"type": "grounder", "value": "<text query>"}}, {{"type": "verifier"}}, {{"type": "answerer"}}]\n\n'
21
+ 'Note that only the grounder can accept an argument called "value", which is the text query used for grounding. '
22
+ "Now I give you the question: '{}'. "
23
+ 'Please think carefully and respond with your plan in JSON directly.')
24
+
25
+ GROUNDER_PROMPT = (
26
+ 'You are acting as the grounder now. '
27
+ 'Given a video and a text query, your goal is to temporally localize the video moment described by the query. '
28
+ 'If the query is directly describing a moment, simply localize it according to its content. '
29
+ "Otherwise, if the moment is described as 'before/after a pivotal event', you need to determine the actual event it refers to. "
30
+ 'The localized moment should only cover the target event. '
31
+ "Now I give you the query: '{}'. "
32
+ 'Please think carefully and provide your response.')
33
+
34
+ VERIFIER_PROMPT = (
35
+ 'You are acting as the verifier now. '
36
+ 'You will be presented with a text query describing a moment that potentially happens in the given video. '
37
+ f'Your task is to identify whether the video segment between {SEG_S_TOKEN} and {SEG_E_TOKEN} perfectly covers the moment. '
38
+ f'If the described moment can be seen in the video, please focus on verifying whether the moment starts at {SEG_S_TOKEN} and ends at {SEG_E_TOKEN}. '
39
+ "Respond with 'Yes' if you think the moment boundaries are correct, otherwise 'No'. "
40
+ "If the described moment cannot be seen in the video, respond with 'No' directly. "
41
+ "Now I give you the query: '{}'. "
42
+ "Please think carefully and respond with 'Yes' or 'No' directly.")
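
For reference, the templates above are plain str.format strings with a single '{}' slot that app.py fills with the user's question or query. A minimal usage sketch; the question comes from the demo data, while the grounding query and the raw planner reply are illustrative placeholders rather than real model outputs:

import json

from videomind.constants import GROUNDER_PROMPT, PLANNER_PROMPT

question = 'Why did the old man stand up?'  # one of the demo questions in data/
planner_prompt = PLANNER_PROMPT.format(question)
grounder_prompt = GROUNDER_PROMPT.format('the old man stands up')  # hypothetical grounding query

# The planner is expected to reply with a JSON list following the example in the template.
raw_reply = '[{"type": "grounder", "value": "the old man stands up"}, {"type": "verifier"}, {"type": "answerer"}]'
plan = json.loads(raw_reply)
assert plan[0]['type'] == 'grounder'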
videomind/conversation.py ADDED
@@ -0,0 +1,49 @@
1
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
2
+
3
+ from dataclasses import dataclass
4
+ from typing import List
5
+
6
+
7
+ @dataclass
8
+ class Conversation:
9
+ style: str
10
+ system: str
11
+ roles: List[str]
12
+ seps: List[str]
13
+ messages: List[str]
14
+
15
+ def append_message(self, role, msg):
16
+ self.messages.append([role, msg])
17
+
18
+ def clear(self):
19
+ self.messages = []
20
+
21
+ def get_prompt(self):
22
+ assert self.style in ('chatml', )
23
+
24
+ prompt = self.system + self.seps[0] if self.system is not None else ''
25
+
26
+ for i, (role, msg) in enumerate(self.messages):
27
+ prompt += role
28
+ sep = self.seps[i % 2]
29
+ if msg is not None:
30
+ prompt += msg
31
+ if not prompt.endswith(sep):
32
+ prompt += sep
33
+
34
+ prompt = prompt.lstrip('\n')
35
+ return prompt
36
+
37
+
38
+ def get_conv(conv_type):
39
+ if conv_type == 'chatml':
40
+ conv = Conversation(
41
+ style='chatml',
42
+ system='<|im_start|>system\nYou are a helpful assistant.',
43
+ roles=('\n<|im_start|>user\n', '\n<|im_start|>assistant\n'),
44
+ seps=('<|im_end|>', '<|im_end|>'),
45
+ messages=[])
46
+ else:
47
+ raise ValueError(f'unknown conversation type: {conv_type}')
48
+
49
+ return conv
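
A minimal usage sketch of the helper above, matching the chatml template it defines (the message text is illustrative):

from videomind.conversation import get_conv

conv = get_conv('chatml')
conv.append_message(conv.roles[0], 'Describe the video.')  # user turn
conv.append_message(conv.roles[1], None)  # leave the assistant turn open for generation
prompt = conv.get_prompt()
# prompt ends with '<|im_start|>assistant\n', ready to be completed by the model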
videomind/dataset/__init__.py ADDED
@@ -0,0 +1,61 @@
1
+ from .collator import HybridDataCollator
2
+ from .hybrid import HybridDataset
3
+ from .sub_classes import (ActivitynetCaptionsBiasDataset, ActivitynetCaptionsDataset, ActivitynetRTLDataset,
4
+ CGBenchDataset, CharadesSTADataset, CosMoCapDataset, DiDeMoDataset, Ego4DNaQDataset,
5
+ Ego4DNLQDataset, EgoTimeQACropDataset, EgoTimeQADataset, EgoTimeQAGroundingDataset,
6
+ HiRESTGroundingDataset, HiRESTStepBiasDataset, HiRESTStepDataset, InternVidVTimeDataset,
7
+ LongVideoBenchDataset, LVBenchDataset, MLVUDataset, MVBenchDataset, NExTGQACropDataset,
8
+ NExTGQADataset, NExTGQAGroundingDataset, NExTQADataset, QAEgo4DCropDataset, QAEgo4DDataset,
9
+ QAEgo4DGroundingDataset, QuerYDDataset, QVHighlightsDataset, ReXTimeCropDataset,
10
+ ReXTimeDataset, ReXTimeGroundingDataset, STARDataset, TACoSDataset, VideoMMEDataset,
11
+ VideoXumDataset, VidMorpDataset, YouCook2BiasDataset, YouCook2Dataset)
12
+ from .wrappers import AnsweringCropDataset, AnsweringDataset, GroundingDataset, PlanningDataset, VerifyingDataset
13
+
14
+ __all__ = [
15
+ 'HybridDataCollator',
16
+ 'HybridDataset',
17
+ 'ActivitynetCaptionsBiasDataset',
18
+ 'ActivitynetCaptionsDataset',
19
+ 'ActivitynetRTLDataset',
20
+ 'CGBenchDataset',
21
+ 'CharadesSTADataset',
22
+ 'CosMoCapDataset',
23
+ 'DiDeMoDataset',
24
+ 'Ego4DNaQDataset',
25
+ 'Ego4DNLQDataset',
26
+ 'EgoTimeQACropDataset',
27
+ 'EgoTimeQADataset',
28
+ 'EgoTimeQAGroundingDataset',
29
+ 'HiRESTGroundingDataset',
30
+ 'HiRESTStepBiasDataset',
31
+ 'HiRESTStepDataset',
32
+ 'InternVidVTimeDataset',
33
+ 'LongVideoBenchDataset',
34
+ 'LVBenchDataset',
35
+ 'MLVUDataset',
36
+ 'MVBenchDataset',
37
+ 'NExTGQACropDataset',
38
+ 'NExTGQADataset',
39
+ 'NExTGQAGroundingDataset',
40
+ 'NExTQADataset',
41
+ 'QAEgo4DCropDataset',
42
+ 'QAEgo4DDataset',
43
+ 'QAEgo4DGroundingDataset',
44
+ 'QuerYDDataset',
45
+ 'QVHighlightsDataset',
46
+ 'ReXTimeCropDataset',
47
+ 'ReXTimeDataset',
48
+ 'ReXTimeGroundingDataset',
49
+ 'STARDataset',
50
+ 'TACoSDataset',
51
+ 'VideoMMEDataset',
52
+ 'VideoXumDataset',
53
+ 'VidMorpDataset',
54
+ 'YouCook2BiasDataset',
55
+ 'YouCook2Dataset',
56
+ 'AnsweringCropDataset',
57
+ 'AnsweringDataset',
58
+ 'GroundingDataset',
59
+ 'PlanningDataset',
60
+ 'VerifyingDataset',
61
+ ]
videomind/dataset/collator.py ADDED
@@ -0,0 +1,40 @@
1
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
2
+
3
+ import warnings
4
+
5
+ import torch
6
+ from torch.nn.utils.rnn import pad_sequence
7
+
8
+ from videomind.constants import IGNORE_INDEX
9
+
10
+
11
+ class HybridDataCollator(object):
12
+
13
+ def __init__(self, tokenizer):
14
+ self.tokenizer = tokenizer
15
+
16
+ def __call__(self, batch):
17
+ input_ids = [d['input_ids'] for d in batch]
18
+ input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
19
+
20
+ labels = [d['labels'] for d in batch]
21
+ labels = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
22
+
23
+ assert input_ids.size() == labels.size()
24
+
25
+ seq_len, max_len = input_ids.size(1), self.tokenizer.model_max_length
26
+ if seq_len > max_len:
27
+ warnings.warn(f'The length of the input sequence exceeds the model max length: {seq_len} > {max_len}')
28
+ input_ids, labels = input_ids[:, :max_len], labels[:, :max_len]
29
+
30
+ data = dict(input_ids=input_ids, labels=labels, attention_mask=input_ids != self.tokenizer.pad_token_id)
31
+
32
+ for key in ('pixel_values', 'pixel_values_videos', 'image_grid_thw', 'video_grid_thw'):
33
+ if key in batch[0]:
34
+ data[key] = torch.cat([d[key] for d in batch])
35
+
36
+ for key in ('timestamps', 'saliency', 'pos_clip'):
37
+ if key in batch[0]:
38
+ data[key] = [d[key] for d in batch]
39
+
40
+ return data
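
For context, this collator plugs into a standard PyTorch DataLoader. A minimal sketch, assuming `dataset` is a HybridDataset instance and `processor` is the processor returned by build_model (both names are placeholders here):

from torch.utils.data import DataLoader

collator = HybridDataCollator(processor.tokenizer)
loader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collator)
batch = next(iter(loader))
# batch['input_ids'] and batch['labels'] are padded to the longest sample in the batch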
videomind/dataset/hybrid.py ADDED
@@ -0,0 +1,180 @@
1
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
2
+
3
+ import math
4
+ import random
5
+ from collections import defaultdict
6
+ from itertools import accumulate
7
+
8
+ import nncore
9
+ import numpy as np
10
+ import termplotlib as tpl
11
+ import torch
12
+ from tabulate import tabulate
13
+ from torch.utils.data import Dataset
14
+
15
+ from videomind.constants import IGNORE_INDEX
16
+ from videomind.dataset.utils import preprocess, process_vision_info
17
+ from videomind.utils.parser import parse_span
18
+
19
+ DATASETS = nncore.Registry('datasets')
20
+
21
+
22
+ class HybridDataset(Dataset):
23
+
24
+ def __init__(self, processor, model_config, model_args, data_args, training_args):
25
+ super().__init__()
26
+
27
+ datasets = []
28
+ for key in data_args.datasets.split(','):
29
+ datasets.append(DATASETS.get(key)(processor, model_args, data_args, training_args))
30
+
31
+ data_types = [a['data_type'] for d in datasets for a in d.annos]
32
+
33
+ cum_length = [0] + list(accumulate([len(d) for d in datasets]))
34
+ idx_ranges = [[cum_length[i], cum_length[i + 1]] for i in range(len(cum_length) - 1)]
35
+
36
+ if training_args.local_rank in (0, -1):
37
+ raw_length = sum(d.raw_length for d in datasets)
38
+ cur_length = idx_ranges[-1][-1]
39
+
40
+ ratio = round(cur_length / raw_length * 100, 2)
41
+ print(f'Number of samples: {raw_length} (original) -> {cur_length} (filtered) {ratio}%')
42
+
43
+ data_type_cnt = ' '.join([f'{data_types.count(t)} ({t})' for t in list(set(data_types))])
44
+ print(f'Data types: {data_type_cnt}')
45
+
46
+ tab = defaultdict(int)
47
+ for dataset in datasets:
48
+ for anno in dataset.annos:
49
+ tab[anno.get('source', 'unknown')] += 1
50
+
51
+ tab = [[k, v, round(v / cur_length, 3)] for k, v in tab.items()]
52
+ print(tabulate(tab, headers=['Source', '#Samples', 'Ratio'], tablefmt='pretty', stralign='left'))
53
+
54
+ d, _ = torch.Tensor([a['duration'] for d in datasets for a in d.annos if 'duration' in a]).sort()
55
+ if d.size(0) > 0:
56
+ n, r = min(d.size(0), 10), d.flip(0)
57
+ print(f'Top-{n} max video durations: {[round(r[i].item(), 1) for i in range(n)]}')
58
+ print(f'Top-{n} min video durations: {[round(d[i].item(), 1) for i in range(n)]}')
59
+ print(f'Average video duration ({d.size(0)} samples): {round(d.mean().item(), 1)}s')
60
+
61
+ print('Video duration histogram:')
62
+ counts, edges = np.histogram(d)
63
+ labels = [f'{edges[i]:.2f}s - {edges[i + 1]:.2f}s' for i in range(len(edges) - 1)]
64
+ fig = tpl.figure()
65
+ fig.barh(counts, labels)
66
+ fig.show()
67
+
68
+ d, _ = torch.Tensor([abs(b[0] - b[1]) for d in datasets for a in d.annos if 'span' in a
69
+ for b in a['span']]).sort()
70
+ if d.size(0) > 0:
71
+ n, r = min(d.size(0), 10), d.flip(0)
72
+ print(f'Top-{n} max span durations: {[round(r[i].item(), 1) for i in range(n)]}')
73
+ print(f'Top-{n} min span durations: {[round(d[i].item(), 1) for i in range(n)]}')
74
+ print(f'Average span duration ({d.size(0)} samples): {round(d.mean().item(), 1)}s')
75
+
76
+ print('Span duration histogram:')
77
+ counts, edges = np.histogram(d)
78
+ labels = [f'{edges[i]:.2f}s - {edges[i + 1]:.2f}s' for i in range(len(edges) - 1)]
79
+ fig = tpl.figure()
80
+ fig.barh(counts, labels)
81
+ fig.show()
82
+
83
+ self.datasets = datasets
84
+ self.data_types = data_types
85
+ self.idx_ranges = idx_ranges
86
+ self.processor = processor
87
+ self.model_config = model_config
88
+ self.model_args = model_args
89
+ self.data_args = data_args
90
+ self.training_args = training_args
91
+
92
+ def __len__(self):
93
+ return self.idx_ranges[-1][-1]
94
+
95
+ def __getitem__(self, idx):
96
+ for retry in range(self.data_args.max_retries + 1):
97
+ try:
98
+ return self.fetch_data(idx)
99
+ except Exception as e:
100
+ print(f'Error in loading {idx}: {type(e).__name__}({e})')
101
+ idx = random.choice([i for i, t in enumerate(self.data_types) if t == self.data_types[idx]])
102
+
103
+ raise RuntimeError(f'Data loading failed after {retry} retries')
104
+
105
+ def map(self, *args, **kwargs):
106
+ return self
107
+
108
+ def fetch_data(self, idx):
109
+ for (s, e), dataset in zip(self.idx_ranges, self.datasets):
110
+ if s <= idx < e:
111
+ meta = dataset[idx - s]
112
+ break
113
+
114
+ text = self.processor.apply_chat_template(meta['messages'])
115
+ text = [text.strip()]
116
+
117
+ images, videos = process_vision_info(meta['messages'], sanity_check=True)
118
+
119
+ data = self.processor(text=text, images=images, videos=videos, return_tensors='pt')
120
+ assert data['input_ids'].size(0) == 1
121
+
122
+ data['input_ids'] = data['input_ids'][0]
123
+ data['labels'] = preprocess(data['input_ids'], text[0], self.processor.tokenizer, self.model_args.conv_type)
124
+
125
+ # insert segment start/end tokens
126
+ if 'ss' in meta and 'se' in meta:
127
+ video_grid_thw = data['video_grid_thw'][0]
128
+ num_frames, window = int(video_grid_thw[0]), int(video_grid_thw[1] * video_grid_thw[2] / 4)
129
+ assert num_frames * window * 4 == data['pixel_values_videos'].size(0)
130
+
131
+ pos_s, pos_e = round(meta['ss'] * num_frames), round(meta['se'] * num_frames)
132
+ pos_s, pos_e = min(max(0, pos_s), num_frames), min(max(0, pos_e), num_frames)
133
+ assert pos_s <= pos_e, (num_frames, meta['ss'], meta['se'])
134
+
135
+ base_idx = torch.nonzero(data['input_ids'] == self.model_config.vision_start_token_id).item()
136
+ pos_s, pos_e = pos_s * window + base_idx + 1, pos_e * window + base_idx + 2
137
+
138
+ input_ids = data['input_ids'].tolist()
139
+ input_ids.insert(pos_s, self.model_config.seg_s_token_id)
140
+ input_ids.insert(pos_e, self.model_config.seg_e_token_id)
141
+ data['input_ids'] = torch.LongTensor(input_ids)
142
+
143
+ labels = data['labels'].tolist()
144
+ labels.insert(pos_s, IGNORE_INDEX)
145
+ labels.insert(pos_e, IGNORE_INDEX)
146
+ data['labels'] = torch.LongTensor(labels)
147
+
148
+ if 'span' in meta:
149
+ span, duration = meta['span'], meta['duration']
150
+
151
+ pixel_values_videos, video_grid_thw = data['pixel_values_videos'], data['video_grid_thw']
152
+ num_frames = int(video_grid_thw[0][0])
153
+
154
+ assert video_grid_thw.size(0) == 1
155
+ assert video_grid_thw.prod() == pixel_values_videos.size(0)
156
+
157
+ # actual fps would be 1/2 of config (temporal patch size = 2)
158
+ fps = num_frames / duration
159
+
160
+ safe_span = [parse_span(b, duration, 1 / fps) for b in span]
161
+
162
+ # num_reg_tokens -> num_bnds -> s & e
163
+ timestamps = [[[s / duration, e / duration] for s, e in safe_span]]
164
+
165
+ saliency, pos_inds = torch.zeros(num_frames), []
166
+ for s, e in safe_span:
167
+ span_ind = max(0, s * fps), min(e * fps, num_frames)
168
+ pos_inds = list(range(math.ceil(span_ind[0]), math.ceil(span_ind[1])))
169
+ assert len(pos_inds) > 0, f'empty pos_inds ({idx}): {fps} {num_frames} {duration} {span}'
170
+ saliency[pos_inds] = 1
171
+
172
+ assert saliency.any(), f'empty saliency ({idx}): {pos_inds} {fps} {num_frames} {duration} {span}'
173
+ pos_clip = random.sample(saliency.nonzero()[:, 0].tolist(), 1)
174
+ pos_clip = torch.LongTensor(pos_clip)
175
+
176
+ data['timestamps'] = timestamps
177
+ data['saliency'] = saliency
178
+ data['pos_clip'] = pos_clip
179
+
180
+ return data
videomind/dataset/sub_classes/__init__.py ADDED
@@ -0,0 +1,69 @@
1
+ from .activitynet_captions import ActivitynetCaptionsBiasDataset, ActivitynetCaptionsDataset
2
+ from .activitynet_rtl import ActivitynetRTLDataset
3
+ from .cgbench import CGBenchDataset
4
+ from .charades_sta import CharadesSTADataset
5
+ from .cosmo_cap import CosMoCapDataset
6
+ from .didemo import DiDeMoDataset
7
+ from .ego4d_naq import Ego4DNaQDataset
8
+ from .ego4d_nlq import Ego4DNLQDataset
9
+ from .ego_timeqa import EgoTimeQACropDataset, EgoTimeQADataset, EgoTimeQAGroundingDataset
10
+ from .hirest import HiRESTGroundingDataset, HiRESTStepBiasDataset, HiRESTStepDataset
11
+ from .internvit_vtime import InternVidVTimeDataset
12
+ from .longvideobench import LongVideoBenchDataset
13
+ from .lvbench import LVBenchDataset
14
+ from .mlvu import MLVUDataset
15
+ from .mvbench import MVBenchDataset
16
+ from .nextgqa import NExTGQACropDataset, NExTGQADataset, NExTGQAGroundingDataset
17
+ from .nextqa import NExTQADataset
18
+ from .qa_ego4d import QAEgo4DCropDataset, QAEgo4DDataset, QAEgo4DGroundingDataset
19
+ from .queryd import QuerYDDataset
20
+ from .qvhighlights import QVHighlightsDataset
21
+ from .rextime import ReXTimeCropDataset, ReXTimeDataset, ReXTimeGroundingDataset
22
+ from .star import STARDataset
23
+ from .tacos import TACoSDataset
24
+ from .vid_morp import VidMorpDataset
25
+ from .videomme import VideoMMEDataset
26
+ from .videoxum import VideoXumDataset
27
+ from .youcook2 import YouCook2BiasDataset, YouCook2Dataset
28
+
29
+ __all__ = [
30
+ 'ActivitynetCaptionsBiasDataset',
31
+ 'ActivitynetCaptionsDataset',
32
+ 'ActivitynetRTLDataset',
33
+ 'CGBenchDataset',
34
+ 'CharadesSTADataset',
35
+ 'CosMoCapDataset',
36
+ 'DiDeMoDataset',
37
+ 'Ego4DNaQDataset',
38
+ 'Ego4DNLQDataset',
39
+ 'EgoTimeQACropDataset',
40
+ 'EgoTimeQADataset',
41
+ 'EgoTimeQAGroundingDataset',
42
+ 'HiRESTGroundingDataset',
43
+ 'HiRESTStepBiasDataset',
44
+ 'HiRESTStepDataset',
45
+ 'InternVidVTimeDataset',
46
+ 'LongVideoBenchDataset',
47
+ 'LVBenchDataset',
48
+ 'MLVUDataset',
49
+ 'MVBenchDataset',
50
+ 'NExTGQACropDataset',
51
+ 'NExTGQADataset',
52
+ 'NExTGQAGroundingDataset',
53
+ 'NExTQADataset',
54
+ 'QAEgo4DCropDataset',
55
+ 'QAEgo4DDataset',
56
+ 'QAEgo4DGroundingDataset',
57
+ 'QuerYDDataset',
58
+ 'QVHighlightsDataset',
59
+ 'ReXTimeCropDataset',
60
+ 'ReXTimeDataset',
61
+ 'ReXTimeGroundingDataset',
62
+ 'STARDataset',
63
+ 'TACoSDataset',
64
+ 'VidMorpDataset',
65
+ 'VideoMMEDataset',
66
+ 'VideoXumDataset',
67
+ 'YouCook2BiasDataset',
68
+ 'YouCook2Dataset',
69
+ ]
videomind/dataset/sub_classes/activitynet_captions.py ADDED
@@ -0,0 +1,96 @@
1
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
2
+
3
+ from collections import OrderedDict
4
+
5
+ import nncore
6
+
7
+ from videomind.dataset.hybrid import DATASETS
8
+ from videomind.dataset.wrappers import GroundingDataset
9
+ from videomind.utils.parser import parse_query
10
+
11
+
12
+ @DATASETS.register(name='activitynet_captions')
13
+ class ActivitynetCaptionsDataset(GroundingDataset):
14
+
15
+ ANNO_PATH_TRAIN = 'data/activitynet_captions/train.json'
16
+ ANNO_PATH_VALID = 'data/activitynet_captions/val_1.json'
17
+ ANNO_PATH_TEST = 'data/activitynet_captions/val_2.json'
18
+
19
+ VIDEO_ROOT = 'data/activitynet/videos_3fps_480_noaudio'
20
+ DURATIONS = 'data/activitynet/durations.json'
21
+
22
+ UNIT = 0.01
23
+
24
+ @classmethod
25
+ def load_annos(self, split='train'):
26
+ if split == 'train':
27
+ raw_annos = nncore.load(self.ANNO_PATH_TRAIN, object_pairs_hook=OrderedDict)
28
+ elif split == 'valid':
29
+ raw_annos = nncore.load(self.ANNO_PATH_VALID, object_pairs_hook=OrderedDict)
30
+ else:
31
+ raw_annos = nncore.load(self.ANNO_PATH_TEST, object_pairs_hook=OrderedDict)
32
+
33
+ durations = nncore.load(self.DURATIONS)
34
+
35
+ annos = []
36
+ for vid, raw_anno in raw_annos.items():
37
+ for query, span in zip(raw_anno['sentences'], raw_anno['timestamps']):
38
+ anno = dict(
39
+ source='activitynet_captions',
40
+ data_type='grounding',
41
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
42
+ duration=durations[vid],
43
+ query=parse_query(query),
44
+ span=[span])
45
+
46
+ annos.append(anno)
47
+
48
+ return annos
49
+
50
+
51
+ @DATASETS.register(name='activitynet_captions_bias')
52
+ class ActivitynetCaptionsBiasDataset(ActivitynetCaptionsDataset):
53
+
54
+ @classmethod
55
+ def load_annos(self, split='train'):
56
+ if split == 'train':
57
+ raw_annos = nncore.load(self.ANNO_PATH_TRAIN, object_pairs_hook=OrderedDict)
58
+ elif split == 'valid':
59
+ raw_annos = nncore.load(self.ANNO_PATH_VALID, object_pairs_hook=OrderedDict)
60
+ else:
61
+ raw_annos = nncore.load(self.ANNO_PATH_TEST, object_pairs_hook=OrderedDict)
62
+
63
+ durations = nncore.load(self.DURATIONS)
64
+
65
+ annos = []
66
+ for vid, raw_anno in raw_annos.items():
67
+ assert len(raw_anno['sentences']) == len(raw_anno['timestamps'])
68
+
69
+ for i in range(len(raw_anno['sentences']) - 1):
70
+ span_a = raw_anno['timestamps'][i]
71
+ span_b = raw_anno['timestamps'][i + 1]
72
+
73
+ if span_b[0] - span_a[1] < 3:
74
+ query_a = parse_query(f"The moment before {raw_anno['sentences'][i + 1]}")
75
+ query_b = parse_query(f"The moment after {raw_anno['sentences'][i]}")
76
+
77
+ anno_a = dict(
78
+ source='activitynet_captions_bias',
79
+ data_type='grounding',
80
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
81
+ duration=durations[vid],
82
+ query=query_a,
83
+ span=[span_a])
84
+
85
+ anno_b = dict(
86
+ source='activitynet_captions_bias',
87
+ data_type='grounding',
88
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
89
+ duration=durations[vid],
90
+ query=query_b,
91
+ span=[span_b])
92
+
93
+ annos.append(anno_a)
94
+ annos.append(anno_b)
95
+
96
+ return annos
videomind/dataset/sub_classes/activitynet_rtl.py ADDED
@@ -0,0 +1,68 @@
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+ import re
+ from collections import OrderedDict
+
+ import nncore
+
+ from videomind.dataset.hybrid import DATASETS
+ from videomind.dataset.wrappers import GroundingDataset
+ from videomind.utils.parser import parse_query
+
+
+ @DATASETS.register(name='activitynet_rtl')
+ class ActivitynetRTLDataset(GroundingDataset):
+
+     ANNO_PATH_TRAIN = 'data/activitynet_rtl/activitynet_train_gpt-4-0613_temp_6_f10009.json'
+     ANNO_PATH_TEST = 'data/activitynet_rtl/annot_val_1_q229.json'
+
+     VIDEO_ROOT = 'data/activitynet/videos_3fps_480_noaudio'
+
+     UNIT = 0.01
+
+     @classmethod
+     def load_annos(self, split='train'):
+         if split == 'train':
+             raw_annos = nncore.load(self.ANNO_PATH_TRAIN, object_pairs_hook=OrderedDict)
+
+             annos = []
+             for vid, raw_anno in raw_annos.items():
+                 for meta in raw_anno['QA']:
+                     match = re.findall(r'<(\d+(\.\d+)?)>', meta['a'])
+                     span = [float(m[0]) for m in match[:2]]
+
+                     # some samples do not have timestamps
+                     if len(span) != 2:
+                         continue
+
+                     anno = dict(
+                         source='activitynet_rtl',
+                         data_type='grounding',
+                         video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+                         duration=raw_anno['duration'],
+                         query=parse_query(meta['q']),
+                         span=[span])
+
+                     annos.append(anno)
+         else:
+             raw_annos = nncore.load(self.ANNO_PATH_TEST, object_pairs_hook=OrderedDict)
+
+             annos = []
+             for raw_anno in raw_annos:
+                 vid = f"v_{raw_anno['vid']}"
+
+                 match = re.findall(r'<(\d+(\.\d+)?)>', raw_anno['answer'])
+                 span = [float(m[0]) for m in match[:2]]
+                 assert len(span) == 2
+
+                 anno = dict(
+                     source='activitynet_rtl',
+                     data_type='grounding',
+                     video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+                     duration=raw_anno['duration'],
+                     query=parse_query(raw_anno['question']),
+                     span=[span])
+
+                 annos.append(anno)
+
+         return annos
videomind/dataset/sub_classes/cgbench.py ADDED
@@ -0,0 +1,47 @@
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+ import nncore
+ from torch.utils.data import Dataset
+
+ from videomind.dataset.hybrid import DATASETS
+ from videomind.utils.parser import parse_query, parse_question
+
+
+ @DATASETS.register(name='cgbench')
+ class CGBenchDataset(Dataset):
+
+     ANNO_PATH_TEST = 'data/cgbench/cgbench_mini.json'
+
+     VIDEO_ROOT = 'data/cgbench/videos_3fps_480_noaudio'
+     SUBTITLE_ROOT = 'data/cgbench/subtitles'
+
+     UNIT = 0.001
+
+     @classmethod
+     def load_annos(self, split='test'):
+         assert split == 'test'
+
+         raw_annos = nncore.load(self.ANNO_PATH_TEST)
+
+         annos = []
+         for raw_anno in raw_annos:
+             vid = raw_anno['video_uid']
+
+             anno = dict(
+                 source='cgbench',
+                 data_type='multimodal',
+                 video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+                 subtitle_path=nncore.join(self.SUBTITLE_ROOT, vid + '.srt'),
+                 duration=raw_anno['duration'],
+                 query=parse_query(raw_anno['question']),
+                 question=parse_question(raw_anno['question']),
+                 options=[o.capitalize() for o in raw_anno['choices']],
+                 answer=raw_anno['answer'].capitalize(),
+                 ans=raw_anno['right_answer'],
+                 span=raw_anno['clue_intervals'],
+                 task=raw_anno['sub_category'],
+                 domain=raw_anno['domain'])
+
+             annos.append(anno)
+
+         return annos
videomind/dataset/sub_classes/charades_sta.py ADDED
@@ -0,0 +1,45 @@
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+ import nncore
+
+ from videomind.dataset.hybrid import DATASETS
+ from videomind.dataset.wrappers import GroundingDataset
+ from videomind.utils.parser import parse_query
+
+
+ @DATASETS.register(name='charades_sta')
+ class CharadesSTADataset(GroundingDataset):
+
+     ANNO_PATH_TRAIN = 'data/charades_sta/charades_sta_train.txt'
+     ANNO_PATH_TEST = 'data/charades_sta/charades_sta_test.txt'
+
+     VIDEO_ROOT = 'data/charades_sta/videos_3fps_480_noaudio'
+     DURATIONS = 'data/charades_sta/durations.json'
+
+     UNIT = 0.1
+
+     @classmethod
+     def load_annos(self, split='train'):
+         if split == 'train':
+             raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
+         else:
+             raw_annos = nncore.load(self.ANNO_PATH_TEST)
+
+         durations = nncore.load(self.DURATIONS)
+
+         annos = []
+         for raw_anno in raw_annos:
+             info, query = raw_anno.split('##')
+             vid, s, e = info.split()
+
+             anno = dict(
+                 source='charades_sta',
+                 data_type='grounding',
+                 video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+                 duration=durations[vid],
+                 query=parse_query(query),
+                 span=[[float(s), float(e)]])
+
+             annos.append(anno)
+
+         return annos
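Each line of the Charades-STA annotation files follows the 'VID START END##sentence' convention, which is what the split('##') above relies on; a small sketch with an assumed example line:

    # Illustrative line only; the real files live under data/charades_sta/.
    line = 'AO8RW 24.3 30.4##person is putting a book on a shelf.'
    info, query = line.split('##')
    vid, s, e = info.split()          # 'AO8RW', '24.3', '30.4'
    span = [[float(s), float(e)]]     # [[24.3, 30.4]]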
videomind/dataset/sub_classes/cosmo_cap.py ADDED
@@ -0,0 +1,37 @@
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+ import nncore
+
+ from videomind.dataset.hybrid import DATASETS
+ from videomind.dataset.wrappers import GroundingDataset
+ from videomind.utils.parser import parse_query
+
+
+ @DATASETS.register(name='cosmo_cap')
+ class CosMoCapDataset(GroundingDataset):
+
+     ANNO_PATH = 'data/cosmo_cap/anno_cosmo_cap.jsonl'
+
+     VIDEO_ROOT = 'data/cosmo_cap/videos_3fps_480_noaudio'
+
+     UNIT = 1.0
+
+     @classmethod
+     def load_annos(self, split='train'):
+         assert split == 'train'
+
+         raw_annos = nncore.load(self.ANNO_PATH)
+
+         annos = []
+         for raw_anno in raw_annos:
+             anno = dict(
+                 source='cosmo_cap',
+                 data_type='grounding',
+                 video_path=nncore.join(self.VIDEO_ROOT, raw_anno['vid'] + '.mp4'),
+                 duration=raw_anno['duration'],
+                 query=parse_query(raw_anno['query']),
+                 span=[raw_anno['span']])
+
+             annos.append(anno)
+
+         return annos
videomind/dataset/sub_classes/didemo.py ADDED
@@ -0,0 +1,59 @@
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+ import random
+
+ import nncore
+ import numpy as np
+
+ from videomind.dataset.hybrid import DATASETS
+ from videomind.dataset.wrappers import GroundingDataset
+ from videomind.utils.parser import parse_query
+
+
+ @DATASETS.register(name='didemo')
+ class DiDeMoDataset(GroundingDataset):
+
+     ANNO_PATH_TRAIN = 'data/didemo/train_data.json'
+     ANNO_PATH_VALID = 'data/didemo/val_data.json'
+     ANNO_PATH_TEST = 'data/didemo/test_data.json'
+
+     VIDEO_ROOT = 'data/didemo/videos_3fps_480_noaudio'
+     DURATIONS = 'data/didemo/durations.json'
+
+     UNIT = 1.0
+
+     @classmethod
+     def load_annos(self, split='train'):
+         if split == 'train':
+             raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
+         elif split == 'valid':
+             raw_annos = nncore.load(self.ANNO_PATH_VALID)
+         else:
+             raw_annos = nncore.load(self.ANNO_PATH_TEST)
+
+         durations = nncore.load(self.DURATIONS)
+
+         annos = []
+         for raw_anno in raw_annos:
+             vid = raw_anno['video'].split('.')[0]
+
+             # apply mean on multiple spans
+             span = np.array(raw_anno['times']).mean(axis=0).tolist()
+             span = [round(span[0] * 5), round((span[1] + 1) * 5)]
+
+             # augment spans during training
+             if split == 'train':
+                 offset = random.randint(-2, 2)
+                 span = [span[0] + offset, span[1] + offset]
+
+             anno = dict(
+                 source='didemo',
+                 data_type='grounding',
+                 video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+                 duration=durations[vid],
+                 query=parse_query(raw_anno['description']),
+                 span=[span])
+
+             annos.append(anno)
+
+         return annos
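DiDeMo describes each moment as indices of 5-second segments from several annotators; the loader above averages the annotator spans and converts segment indices to seconds. A worked example with an assumed annotation:

    import numpy as np

    times = [[2, 3], [2, 2], [3, 3]]                          # assumed annotator segment indices
    span = np.array(times).mean(axis=0).tolist()              # [2.33..., 2.67...]
    span = [round(span[0] * 5), round((span[1] + 1) * 5)]     # [12, 18] seconds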
videomind/dataset/sub_classes/ego4d_naq.py ADDED
@@ -0,0 +1,81 @@
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+ from collections import OrderedDict
+
+ import nncore
+
+ from videomind.dataset.hybrid import DATASETS
+ from videomind.dataset.wrappers import GroundingDataset
+ from videomind.utils.parser import parse_query
+
+
+ @DATASETS.register(name='ego4d_naq')
+ class Ego4DNaQDataset(GroundingDataset):
+
+     ANNO_PATH_TRAIN = 'data/ego4d_naq/train.json'
+     ANNO_PATH_VALID = 'data/ego4d_naq/val.json'
+     ANNO_PATH_TEST = 'data/ego4d_naq/test.json'
+
+     VIDEO_ROOT = 'data/ego4d/v2/videos_3fps_480_noaudio'
+
+     UNIT = 0.001
+
+     @classmethod
+     def load_annos(self, split='train'):
+         if split == 'train':
+             raw_annos = nncore.load(self.ANNO_PATH_TRAIN, object_pairs_hook=OrderedDict)
+         elif split == 'valid':
+             raw_annos = nncore.load(self.ANNO_PATH_VALID, object_pairs_hook=OrderedDict)
+         else:
+             raw_annos = nncore.load(self.ANNO_PATH_TEST, object_pairs_hook=OrderedDict)
+
+         annos = []
+         for vid, raw_anno in raw_annos.items():
+             duration = raw_anno['num_frames'] / raw_anno['fps']
+
+             # 300s: 254k samples (dropped 121k samples merged 156k samples)
+             # 480s: 567k samples (dropped 249k samples merged 328k samples)
+             if split == 'train' and (duration < 10 or duration > 600):
+                 continue
+
+             meta = dict()
+             for span, query in zip(raw_anno['exact_times'], raw_anno['sentences']):
+                 span = [round(span[0], 3), round(span[1], 3)]
+
+                 query = parse_query(query)
+
+                 # these annotations might be from nlq
+                 nlq_keys = ('who', 'what', 'when', 'in what', 'did', 'where', 'how', 'i what')
+                 if split == 'train' and any(query.startswith(k) for k in nlq_keys):
+                     continue
+
+                 # bad samples
+                 if split == 'train' and '#unsure' in query:
+                     continue
+
+                 # too short or too long samples
+                 num_words = len(query.split(' '))
+                 if split == 'train' and (num_words < 3 or num_words > 30):
+                     continue
+
+                 if query not in meta:
+                     meta[query] = []
+
+                 meta[query].append(span)
+
+             for query, span in meta.items():
+                 # skip samples with multiple moments
+                 if len(span) > 1:
+                     continue
+
+                 anno = dict(
+                     source='ego4d_naq',
+                     data_type='grounding',
+                     video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+                     duration=duration,
+                     query=query,
+                     span=span)
+
+                 annos.append(anno)
+
+         return annos
videomind/dataset/sub_classes/ego4d_nlq.py ADDED
@@ -0,0 +1,41 @@
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+ import nncore
+
+ from videomind.dataset.hybrid import DATASETS
+ from videomind.dataset.wrappers import GroundingDataset
+ from videomind.utils.parser import parse_query
+
+
+ @DATASETS.register(name='ego4d_nlq')
+ class Ego4DNLQDataset(GroundingDataset):
+
+     ANNO_PATH_TRAIN = 'data/ego4d_nlq/nlq_train.jsonl'
+     ANNO_PATH_VALID = 'data/ego4d_nlq/nlq_val.jsonl'
+
+     VIDEO_ROOT = 'data/ego4d/v2/videos_3fps_480_noaudio'
+
+     UNIT = 0.001
+
+     @classmethod
+     def load_annos(self, split='train'):
+         if split == 'train':
+             raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
+         else:
+             raw_annos = nncore.load(self.ANNO_PATH_VALID)
+
+         annos = []
+         for raw_anno in raw_annos:
+             assert len(raw_anno['relevant_windows']) == 1
+
+             anno = dict(
+                 source='ego4d_nlq',
+                 data_type='grounding',
+                 video_path=nncore.join(self.VIDEO_ROOT, raw_anno['vid'] + '.mp4'),
+                 duration=raw_anno['duration'],
+                 query=parse_query(raw_anno['query']),
+                 span=raw_anno['relevant_windows'])
+
+             annos.append(anno)
+
+         return annos
videomind/dataset/sub_classes/ego_timeqa.py ADDED
@@ -0,0 +1,93 @@
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+ import random
+
+ import nncore
+
+ from videomind.dataset.hybrid import DATASETS
+ from videomind.dataset.wrappers import AnsweringCropDataset, AnsweringDataset, GroundingDataset
+ from videomind.utils.parser import parse_query, parse_question
+
+
+ @DATASETS.register(name='ego_timeqa')
+ class EgoTimeQADataset(AnsweringDataset):
+
+     ANNO_PATH_TRAIN = 'data/ego_timeqa/annotations.EgoTimeQA.json'
+
+     VIDEO_ROOT = 'data/ego4d/v2/videos_3fps_480_noaudio'
+     DURATIONS = 'data/ego4d/v2/durations.json'
+
+     SOURCE = 'ego_timeqa'
+     DATA_TYPE = 'multimodal'
+
+     UNIT = 0.001
+
+     @classmethod
+     def load_annos(self, split='train'):
+         assert split == 'train'
+
+         raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
+         durations = nncore.load(self.DURATIONS)
+
+         annos = []
+         for raw_anno in raw_annos:
+             vid = raw_anno['video_id']
+
+             duration = durations[vid]
+
+             # 303k -> 284k (to be verified)
+             if duration < 10 or duration > 600:
+                 continue
+
+             span = [raw_anno['moment_start_frame'] / 30, raw_anno['moment_end_frame'] / 30]
+             span = [round(span[0], 3), round(span[1], 3)]
+
+             # this would remove many samples (284k -> 37k)
+             # if span[1] - span[0] < 2:
+             #     continue
+
+             question = raw_anno['question'].replace(' l ', ' I ').capitalize()
+             question = parse_question(question)
+             query = parse_query(question)
+
+             # too short or too long samples
+             num_words = len(query.split(' '))
+             if split == 'train' and (num_words < 3 or num_words > 30):
+                 continue
+
+             answer = raw_anno['answer'].capitalize()
+
+             assert len(raw_anno['wrong_answers']) == 3
+             idx = random.randint(0, 3)
+             ans = chr(ord('A') + idx)
+             options = [o.capitalize() for o in raw_anno['wrong_answers']]
+             options.insert(idx, answer)
+
+             anno = dict(
+                 source=self.SOURCE,
+                 data_type=self.DATA_TYPE,
+                 video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+                 duration=duration,
+                 query=query,
+                 question=question,
+                 options=options,
+                 answer=answer,
+                 ans=ans,
+                 span=[span])
+
+             annos.append(anno)
+
+         return annos
+
+
+ @DATASETS.register(name='ego_timeqa_crop')
+ class EgoTimeQACropDataset(AnsweringCropDataset, EgoTimeQADataset):
+
+     SOURCE = 'ego_timeqa_crop'
+
+
+ @DATASETS.register(name='ego_timeqa_grounding')
+ class EgoTimeQAGroundingDataset(GroundingDataset, EgoTimeQADataset):
+
+     SOURCE = 'ego_timeqa_grounding'
+     DATA_TYPE = 'grounding'
videomind/dataset/sub_classes/hirest.py ADDED
@@ -0,0 +1,150 @@
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+ from collections import OrderedDict
+
+ import nncore
+
+ from videomind.dataset.hybrid import DATASETS
+ from videomind.dataset.wrappers import GroundingDataset
+ from videomind.utils.parser import parse_query
+
+
+ @DATASETS.register(name='hirest_grounding')
+ class HiRESTGroundingDataset(GroundingDataset):
+
+     ANNO_PATH_TRAIN = 'data/hirest/all_data_train.json'
+     ANNO_PATH_VALID = 'data/hirest/all_data_val.json'
+
+     VIDEO_ROOT = 'data/hirest/videos_3fps_480_noaudio'
+
+     UNIT = 1.0
+
+     @classmethod
+     def load_annos(self, split='train'):
+         if split == 'train':
+             raw_annos = nncore.load(self.ANNO_PATH_TRAIN, object_pairs_hook=OrderedDict)
+         else:
+             raw_annos = nncore.load(self.ANNO_PATH_VALID, object_pairs_hook=OrderedDict)
+
+         all_videos = nncore.ls(self.VIDEO_ROOT, ext='.mp4')
+         all_videos = set(v[:11] for v in all_videos)
+
+         annos = []
+         for query, videos in raw_annos.items():
+             for video_name, raw_anno in videos.items():
+                 if not raw_anno['relevant'] or not raw_anno['clip']:
+                     continue
+
+                 assert len(raw_anno['bounds']) == 2
+
+                 vid = video_name.split('.')[0]
+
+                 if vid not in all_videos:
+                     continue
+
+                 anno = dict(
+                     source='hirest_grounding',
+                     data_type='grounding',
+                     video_path=nncore.join(self.VIDEO_ROOT, video_name),
+                     duration=raw_anno['v_duration'],
+                     query=parse_query(query),
+                     span=[raw_anno['bounds']])
+
+                 annos.append(anno)
+
+         return annos
+
+
+ @DATASETS.register(name='hirest_step')
+ class HiRESTStepDataset(HiRESTGroundingDataset):
+
+     @classmethod
+     def load_annos(self, split='train'):
+         if split == 'train':
+             raw_annos = nncore.load(self.ANNO_PATH_TRAIN, object_pairs_hook=OrderedDict)
+         else:
+             raw_annos = nncore.load(self.ANNO_PATH_VALID, object_pairs_hook=OrderedDict)
+
+         all_videos = nncore.ls(self.VIDEO_ROOT, ext='.mp4')
+         all_videos = set(v[:11] for v in all_videos)
+
+         annos = []
+         for query, videos in raw_annos.items():
+             for video_name, raw_anno in videos.items():
+                 if not raw_anno['relevant'] or not raw_anno['clip'] or len(raw_anno['steps']) == 0:
+                     continue
+
+                 vid = video_name.split('.')[0]
+
+                 if vid not in all_videos:
+                     continue
+
+                 for step in raw_anno['steps']:
+                     assert len(step['absolute_bounds']) == 2
+
+                     anno = dict(
+                         source='hirest_step',
+                         data_type='grounding',
+                         video_path=nncore.join(self.VIDEO_ROOT, video_name),
+                         duration=raw_anno['v_duration'],
+                         query=parse_query(step['heading']),
+                         span=[step['absolute_bounds']])
+
+                     annos.append(anno)
+
+         return annos
+
+
+ @DATASETS.register(name='hirest_step_bias')
+ class HiRESTStepBiasDataset(HiRESTStepDataset):
+
+     @classmethod
+     def load_annos(self, split='train'):
+         if split == 'train':
+             raw_annos = nncore.load(self.ANNO_PATH_TRAIN, object_pairs_hook=OrderedDict)
+         else:
+             raw_annos = nncore.load(self.ANNO_PATH_VALID, object_pairs_hook=OrderedDict)
+
+         all_videos = nncore.ls(self.VIDEO_ROOT, ext='.mp4')
+         all_videos = set(v[:11] for v in all_videos)
+
+         annos = []
+         for query, videos in raw_annos.items():
+             for video_name, raw_anno in videos.items():
+                 if not raw_anno['relevant'] or not raw_anno['clip'] or len(raw_anno['steps']) == 0:
+                     continue
+
+                 vid = video_name.split('.')[0]
+
+                 if vid not in all_videos:
+                     continue
+
+                 for i in range(len(raw_anno['steps']) - 1):
+                     span_a = raw_anno['steps'][i]['absolute_bounds']
+                     span_b = raw_anno['steps'][i + 1]['absolute_bounds']
+
+                     assert len(span_a) == 2 and len(span_b) == 2 and span_a[1] == span_b[0]
+
+                     query_a = parse_query(f"The moment before {raw_anno['steps'][i + 1]['heading']}")
+                     query_b = parse_query(f"The moment after {raw_anno['steps'][i]['heading']}")
+
+                     anno_a = dict(
+                         source='hirest_step_bias',
+                         data_type='grounding',
+                         video_path=nncore.join(self.VIDEO_ROOT, video_name),
+                         duration=raw_anno['v_duration'],
+                         query=query_a,
+                         span=[span_a])
+
+                     anno_b = dict(
+                         source='hirest_step_bias',
+                         data_type='grounding',
+                         video_path=nncore.join(self.VIDEO_ROOT, video_name),
+                         duration=raw_anno['v_duration'],
+                         query=query_b,
+                         span=[span_b])
+
+                     annos.append(anno_a)
+                     annos.append(anno_b)
+
+         return annos
videomind/dataset/sub_classes/internvit_vtime.py ADDED
@@ -0,0 +1,45 @@
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+ import nncore
+
+ from videomind.dataset.hybrid import DATASETS
+ from videomind.dataset.wrappers import GroundingDataset
+ from videomind.utils.parser import parse_query
+
+
+ @DATASETS.register(name='internvid_vtime')
+ class InternVidVTimeDataset(GroundingDataset):
+
+     ANNO_PATH = 'data/internvid_vtime/anno_internvid_vtime_query_gpt4o_mini.jsonl'
+
+     VIDEO_ROOT = 'data/internvid_vtime/videos_crop_3fps_480_noaudio'
+
+     UNIT = 0.1
+
+     @classmethod
+     def load_annos(self, split='train'):
+         assert split == 'train'
+
+         raw_annos = nncore.load(self.ANNO_PATH)
+
+         all_videos = nncore.ls(self.VIDEO_ROOT, ext='.mp4')
+         all_videos = set(v[:11] for v in all_videos)
+
+         annos = []
+         for raw_anno in raw_annos:
+             vid = raw_anno['vid']
+
+             if vid not in all_videos:
+                 continue
+
+             anno = dict(
+                 source='internvid_vtime',
+                 data_type='grounding',
+                 video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+                 duration=raw_anno['duration'],
+                 query=parse_query(raw_anno['query']),
+                 span=[raw_anno['span']])
+
+             annos.append(anno)
+
+         return annos
videomind/dataset/sub_classes/longvideobench.py ADDED
@@ -0,0 +1,53 @@
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+ import nncore
+ from torch.utils.data import Dataset
+
+ from videomind.dataset.hybrid import DATASETS
+ from videomind.utils.parser import parse_query, parse_question
+
+
+ @DATASETS.register(name='longvideobench')
+ class LongVideoBenchDataset(Dataset):
+
+     ANNO_PATH_VALID = 'data/longvideobench/lvb_val.json'
+     ANNO_PATH_TEST = 'data/longvideobench/lvb_test_wo_gt.json'
+
+     VIDEO_ROOT = 'data/longvideobench/videos_3fps_480_noaudio'
+
+     @classmethod
+     def load_annos(self, split='valid'):
+         if split == 'valid':
+             raw_annos = nncore.load(self.ANNO_PATH_VALID)
+         else:
+             print('WARNING: Test split does not have ground truth annotations')
+             raw_annos = nncore.load(self.ANNO_PATH_TEST)
+
+         annos = []
+         for raw_anno in raw_annos:
+             vid = raw_anno['video_id']
+
+             if vid.startswith('@'):
+                 vid = vid[-19:]
+
+             # videos might come from youtube or other sources
+             assert len(vid) in (11, 19)
+
+             anno = dict(
+                 source='longvideobench',
+                 data_type='multimodal',
+                 video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+                 query=parse_query(raw_anno['question']),
+                 question=parse_question(raw_anno['question']),
+                 options=raw_anno['candidates'],
+                 task=str(raw_anno['duration_group']),
+                 level=raw_anno['level'],
+                 question_category=raw_anno['question_category'])
+
+             if 'correct_choice' in raw_anno:
+                 anno['answer'] = raw_anno['candidates'][raw_anno['correct_choice']]
+                 anno['ans'] = chr(ord('A') + raw_anno['correct_choice'])
+
+             annos.append(anno)
+
+         return annos
videomind/dataset/sub_classes/lvbench.py ADDED
@@ -0,0 +1,52 @@
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+ import nncore
+ from torch.utils.data import Dataset
+
+ from videomind.dataset.hybrid import DATASETS
+ from videomind.utils.parser import parse_query, parse_question
+
+
+ @DATASETS.register(name='lvbench')
+ class LVBenchDataset(Dataset):
+
+     ANNO_PATH = 'data/lvbench/LVBench/video_info.meta.jsonl'
+
+     VIDEO_ROOT = 'data/lvbench/videos_3fps_480_noaudio'
+
+     @classmethod
+     def load_annos(self, split='test'):
+         assert split == 'test'
+
+         raw_annos = nncore.load(self.ANNO_PATH)
+
+         annos = []
+         for raw_anno in raw_annos:
+             vid = raw_anno['key']
+
+             for meta in raw_anno['qa']:
+                 tok = meta['question'].split('\n')
+
+                 assert len(tok) == 5
+                 assert all(any(o.startswith(k) for k in ('(A) ', '(B) ', '(C) ', '(D) ')) for o in tok[1:])
+
+                 options = [o[4:] for o in tok[1:]]
+                 ans = meta['answer']
+                 answer = options[ord(ans) - ord('A')]
+                 assert ans in 'ABCD'
+
+                 anno = dict(
+                     source='lvbench',
+                     data_type='multimodal',
+                     video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+                     query=parse_query(tok[0]),
+                     question=parse_question(tok[0]),
+                     options=options,
+                     answer=answer,
+                     ans=ans,
+                     task=meta['question_type'],
+                     time_reference=meta['time_reference'])
+
+                 annos.append(anno)
+
+         return annos
videomind/dataset/sub_classes/mlvu.py ADDED
@@ -0,0 +1,55 @@
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+ import nncore
+ from torch.utils.data import Dataset
+
+ from videomind.dataset.hybrid import DATASETS
+ from videomind.utils.parser import parse_query, parse_question
+
+
+ @DATASETS.register(name='mlvu')
+ class MLVUDataset(Dataset):
+
+     TASK_TO_DIR_MAP = {
+         'plotQA': '1_plotQA',
+         'findNeedle': '2_needle',
+         'ego': '3_ego',
+         'count': '4_count',
+         'order': '5_order',
+         'anomaly_reco': '6_anomaly_reco',
+         'topic_reasoning': '7_topic_reasoning'
+     }
+
+     DATA_ROOT = 'data/mlvu'
+
+     @classmethod
+     def load_annos(self, split='test'):
+         assert split == 'test'
+
+         paths = [nncore.join(self.DATA_ROOT, 'json', f'{n}.json') for n in self.TASK_TO_DIR_MAP.values()]
+
+         raw_annos = nncore.flatten([nncore.load(p) for p in paths])
+
+         annos = []
+         for raw_anno in raw_annos:
+             task = raw_anno['question_type']
+             video_name = nncore.join(self.TASK_TO_DIR_MAP[task], raw_anno['video'])
+
+             options = raw_anno['candidates']
+             answer = raw_anno['answer']
+             ans = chr(ord('A') + options.index(answer))
+
+             anno = dict(
+                 source='mlvu',
+                 data_type='multimodal',
+                 video_path=nncore.join(self.DATA_ROOT, 'video', video_name),
+                 query=parse_query(raw_anno['question']),
+                 question=parse_question(raw_anno['question']),
+                 options=options,
+                 answer=answer,
+                 ans=ans,
+                 task=task)
+
+             annos.append(anno)
+
+         return annos
videomind/dataset/sub_classes/mvbench.py ADDED
@@ -0,0 +1,74 @@
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+ import nncore
+ from torch.utils.data import Dataset
+
+ from videomind.dataset.hybrid import DATASETS
+ from videomind.utils.parser import parse_query, parse_question
+
+
+ @DATASETS.register(name='mvbench')
+ class MVBenchDataset(Dataset):
+
+     META_DATA = [('Episodic Reasoning', 'episodic_reasoning.json', 'tvqa/frames_fps3_hq', 'frame'),
+                  ('Action Sequence', 'action_sequence.json', 'star/Charades_v1_480', 'video'),
+                  ('Action Prediction', 'action_prediction.json', 'star/Charades_v1_480', 'video'),
+                  ('Action Antonym', 'action_antonym.json', 'ssv2_video', 'video'),
+                  ('Fine-grained Action', 'fine_grained_action.json', 'Moments_in_Time_Raw/videos', 'video'),
+                  ('Unexpected Action', 'unexpected_action.json', 'FunQA_test/test', 'video'),
+                  ('Object Existence', 'object_existence.json', 'clevrer/video_validation', 'video'),
+                  ('Object Interaction', 'object_interaction.json', 'star/Charades_v1_480', 'video'),
+                  ('Object Shuffle', 'object_shuffle.json', 'perception/videos', 'video'),
+                  ('Moving Direction', 'moving_direction.json', 'clevrer/video_validation', 'video'),
+                  ('Action Localization', 'action_localization.json', 'sta/sta_video', 'video'),
+                  ('Scene Transition', 'scene_transition.json', 'scene_qa/video', 'video'),
+                  ('Action Count', 'action_count.json', 'perception/videos', 'video'),
+                  ('Moving Count', 'moving_count.json', 'clevrer/video_validation', 'video'),
+                  ('Moving Attribute', 'moving_attribute.json', 'clevrer/video_validation', 'video'),
+                  ('State Change', 'state_change.json', 'perception/videos', 'video'),
+                  ('Fine-grained Pose', 'fine_grained_pose.json', 'nturgbd', 'video'),
+                  ('Character Order', 'character_order.json', 'perception/videos', 'video'),
+                  ('Egocentric Navigation', 'egocentric_navigation.json', 'vlnqa', 'video'),
+                  ('Counterfactual Inference', 'counterfactual_inference.json', 'clevrer/video_validation', 'video')]
+
+     DATA_ROOT = 'data/mvbench'
+
+     MIN_LEN = 64
+
+     @classmethod
+     def load_annos(self, split='test', sample_frames=32):
+         assert split == 'test'
+
+         annos = []
+         for meta in self.META_DATA:
+             raw_annos = nncore.load(nncore.join(self.DATA_ROOT, 'json', meta[1]))
+
+             for raw_anno in raw_annos:
+                 video_name = nncore.join(meta[2], raw_anno['video'])
+                 video_path = nncore.join(self.DATA_ROOT, 'video', video_name)
+
+                 if meta[3] == 'frame':
+                     num_frames = len(nncore.ls(video_path, ext='.jpg'))
+                     video_path = [
+                         nncore.join(video_path, f'{i:0>5}.jpg')
+                         for i in range(1, num_frames + 1, num_frames // (sample_frames - 1))
+                     ][:sample_frames]
+
+                 options = raw_anno['candidates']
+                 answer = raw_anno['answer']
+                 ans = chr(ord('A') + options.index(answer))
+
+                 anno = dict(
+                     source='mvbench',
+                     data_type='multimodal',
+                     video_path=video_path,
+                     query=parse_query(raw_anno['question']),
+                     question=parse_question(raw_anno['question']),
+                     options=options,
+                     answer=answer,
+                     ans=ans,
+                     task=meta[0])
+
+                 annos.append(anno)
+
+         return annos
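For the frame-based episodic_reasoning task above, the loader builds a list of at most sample_frames image paths with a fixed stride over the extracted frames; a worked example of the index arithmetic (frame count assumed for illustration):

    num_frames, sample_frames = 90, 32
    step = num_frames // (sample_frames - 1)                     # 90 // 31 = 2
    idx = list(range(1, num_frames + 1, step))[:sample_frames]   # [1, 3, 5, ..., 63]
    names = [f'{i:0>5}.jpg' for i in idx]                        # '00001.jpg', '00003.jpg', ...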
videomind/dataset/sub_classes/nextgqa.py ADDED
@@ -0,0 +1,87 @@
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+ import csv
+
+ import nncore
+
+ from videomind.dataset.hybrid import DATASETS
+ from videomind.dataset.wrappers import AnsweringCropDataset, AnsweringDataset, GroundingDataset
+ from videomind.utils.parser import parse_query, parse_question
+
+
+ @DATASETS.register(name='nextgqa')
+ class NExTGQADataset(AnsweringDataset):
+
+     ANNO_PATH_VALID = 'data/nextgqa/val.csv'
+     ANNO_PATH_TEST = 'data/nextgqa/test.csv'
+
+     SPAN_PATH_VALID = 'data/nextgqa/gsub_val.json'
+     SPAN_PATH_TEST = 'data/nextgqa/gsub_test.json'
+
+     VIDEO_ID_MAP = 'data/nextgqa/map_vid_vidorID.json'
+     VIDEO_ROOT = 'data/nextqa/videos'
+
+     SOURCE = 'nextgqa'
+     DATA_TYPE = 'multimodal'
+
+     UNIT = 0.1
+
+     @classmethod
+     def load_annos(self, split='valid'):
+         assert split in ('valid', 'test')
+
+         if split == 'valid':
+             anno_path = self.ANNO_PATH_VALID
+             raw_spans = nncore.load(self.SPAN_PATH_VALID)
+         else:
+             anno_path = self.ANNO_PATH_TEST
+             raw_spans = nncore.load(self.SPAN_PATH_TEST)
+
+         with open(anno_path, mode='r') as f:
+             reader = csv.DictReader(f)
+             raw_annos = [d for d in reader]
+
+         video_id_map = nncore.load(self.VIDEO_ID_MAP)
+
+         annos = []
+         for raw_anno in raw_annos:
+             vid = raw_anno['video_id']
+             qid = raw_anno['qid']
+
+             video_id = video_id_map[vid]
+
+             query = parse_query(raw_anno['question'].capitalize() + '?')
+             question = parse_question(raw_anno['question'].capitalize() + '?')
+             options = [raw_anno[k].capitalize() for k in ('a0', 'a1', 'a2', 'a3', 'a4')]
+             answer = raw_anno['answer'].capitalize()
+             ans = chr(ord('A') + options.index(answer))
+
+             anno = dict(
+                 source=self.SOURCE,
+                 data_type=self.DATA_TYPE,
+                 video_path=nncore.join(self.VIDEO_ROOT, video_id + '.mp4'),
+                 duration=raw_spans[vid]['duration'],
+                 query=query,
+                 question=question,
+                 options=options,
+                 answer=answer,
+                 ans=ans,
+                 span=raw_spans[vid]['location'][qid],
+                 task=raw_anno['type'])
+
+             annos.append(anno)
+
+         return annos
+
+
+ @DATASETS.register(name='nextgqa_crop')
+ class NExTGQACropDataset(AnsweringCropDataset, NExTGQADataset):
+
+     SOURCE = 'nextgqa_crop'
+
+
+ @DATASETS.register(name='nextgqa_grounding')
+ class NExTGQAGroundingDataset(GroundingDataset, NExTGQADataset):
+
+     SOURCE = 'nextgqa_grounding'
+     DATA_TYPE = 'grounding'
videomind/dataset/sub_classes/nextqa.py ADDED
@@ -0,0 +1,63 @@
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+ import csv
+
+ import nncore
+
+ from videomind.dataset.hybrid import DATASETS
+ from videomind.dataset.wrappers import AnsweringDataset
+ from videomind.utils.parser import parse_query, parse_question
+
+
+ @DATASETS.register(name='nextqa')
+ class NExTQADataset(AnsweringDataset):
+
+     ANNO_PATH_TRAIN = 'data/nextqa/train.csv'
+     ANNO_PATH_VALID = 'data/nextqa/val.csv'
+     ANNO_PATH_TEST = 'data/nextqa/test.csv'
+
+     VIDEO_ID_MAP = 'data/nextqa/map_vid_vidorID.json'
+     VIDEO_ROOT = 'data/nextqa/NExTVideo'
+
+     @classmethod
+     def load_annos(self, split='train'):
+         if split == 'train':
+             anno_path = self.ANNO_PATH_TRAIN
+         elif split == 'valid':
+             anno_path = self.ANNO_PATH_VALID
+         else:
+             anno_path = self.ANNO_PATH_TEST
+
+         with open(anno_path, mode='r') as f:
+             reader = csv.DictReader(f)
+             raw_annos = [d for d in reader]
+
+         video_id_map = nncore.load(self.VIDEO_ID_MAP)
+
+         annos = []
+         for raw_anno in raw_annos:
+             vid = raw_anno['video']
+             qid = raw_anno['qid']
+
+             video_id = video_id_map[vid]
+             query = parse_query(raw_anno['question'].capitalize() + '?')
+             question = parse_question(raw_anno['question'].capitalize() + '?')
+             options = [raw_anno[k].capitalize() for k in ('a0', 'a1', 'a2', 'a3', 'a4')]
+             ans = chr(ord('A') + int(raw_anno['answer']))
+             answer = options[int(raw_anno['answer'])]
+
+             anno = dict(
+                 source='nextqa',
+                 data_type='multimodal',
+                 uid=f'{vid}_{qid}',
+                 video_path=nncore.join(self.VIDEO_ROOT, video_id + '.mp4'),
+                 query=query,
+                 question=question,
+                 options=options,
+                 answer=answer,
+                 ans=ans,
+                 task=raw_anno['type'])
+
+             annos.append(anno)
+
+         return annos
videomind/dataset/sub_classes/qa_ego4d.py ADDED
@@ -0,0 +1,98 @@
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+ import random
+
+ import nncore
+
+ from videomind.dataset.hybrid import DATASETS
+ from videomind.dataset.wrappers import AnsweringCropDataset, AnsweringDataset, GroundingDataset
+ from videomind.utils.parser import parse_query, parse_question
+
+
+ @DATASETS.register(name='qa_ego4d')
+ class QAEgo4DDataset(AnsweringDataset):
+
+     ANNO_PATH_TRAIN = 'data/qa_ego4d/annotations.QaEgo4D_train.json'
+     ANNO_PATH_VALID = 'data/qa_ego4d/annotations.QaEgo4D_val_options.json'
+     ANNO_PATH_TEST = 'data/qa_ego4d/annotations.QaEgo4D_test_options.json'
+
+     VIDEO_ROOT = 'data/ego4d/v1/videos_3fps_480_noaudio'
+     DURATIONS = 'data/ego4d/v1/durations.json'
+
+     SOURCE = 'qa_ego4d'
+     DATA_TYPE = 'multimodal'
+
+     UNIT = 0.001
+
+     @classmethod
+     def load_annos(self, split='train'):
+         if split == 'train':
+             raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
+         elif split == 'valid':
+             raw_annos = nncore.load(self.ANNO_PATH_VALID)
+         else:
+             raw_annos = nncore.load(self.ANNO_PATH_TEST)
+
+         durations = nncore.load(self.DURATIONS)
+
+         annos = []
+         for raw_anno in raw_annos:
+             vid = raw_anno['video_id']
+
+             duration = durations[vid]
+
+             # too short or too long samples
+             if split == 'train' and (duration < 10 or duration > 600):
+                 continue
+
+             span = [raw_anno['moment_start_frame'] / 30, raw_anno['moment_end_frame'] / 30]
+             span = [round(span[0], 3), round(span[1], 3)]
+
+             # skip samples with too short moments
+             # if split == 'train' and span[1] - span[0] < 2:
+             #     continue
+
+             answer = raw_anno['answer'].capitalize()
+
+             if 'options' in raw_anno:
+                 options = [o.capitalize() for o in raw_anno['options']]
+                 idx = options.index(answer)
+                 ans = chr(ord('A') + idx)
+             else:
+                 # NOTE: indeterministic evaluation
+                 assert len(raw_anno['wrong_answers']) == 3
+                 idx = random.randint(0, 3)
+                 ans = chr(ord('A') + idx)
+                 options = [o.capitalize() for o in raw_anno['wrong_answers']]
+                 options.insert(idx, answer)
+
+             assert len(options) == 4, options
+
+             anno = dict(
+                 source=self.SOURCE,
+                 data_type=self.DATA_TYPE,
+                 video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+                 duration=duration,
+                 query=parse_query(raw_anno['question'].capitalize()),
+                 question=parse_question(raw_anno['question'].capitalize()),
+                 options=options,
+                 answer=answer,
+                 ans=ans,
+                 span=[span])
+
+             annos.append(anno)
+
+         return annos
+
+
+ @DATASETS.register(name='qa_ego4d_crop')
+ class QAEgo4DCropDataset(AnsweringCropDataset, QAEgo4DDataset):
+
+     SOURCE = 'qa_ego4d_crop'
+
+
+ @DATASETS.register(name='qa_ego4d_grounding')
+ class QAEgo4DGroundingDataset(GroundingDataset, QAEgo4DDataset):
+
+     SOURCE = 'qa_ego4d_grounding'
+     DATA_TYPE = 'grounding'
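When a sample has no precomputed options, the loader above inserts the correct answer at a random position among the three wrong answers, which is why the comment flags the evaluation as indeterministic; a sketch of that step with placeholder strings:

    import random

    answer = 'He washed the dishes'                              # placeholder answer
    wrong_answers = ['he swept the floor', 'he opened the fridge', 'he read a book']

    idx = random.randint(0, 3)                                   # 0..3 inclusive
    ans = chr(ord('A') + idx)                                    # letter of the correct option
    options = [o.capitalize() for o in wrong_answers]
    options.insert(idx, answer)                                  # 4 options, answer at slot idx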
videomind/dataset/sub_classes/queryd.py ADDED
@@ -0,0 +1,49 @@
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+ import nncore
+
+ from videomind.dataset.hybrid import DATASETS
+ from videomind.dataset.wrappers import GroundingDataset
+ from videomind.utils.parser import parse_query
+
+
+ @DATASETS.register(name='queryd')
+ class QuerYDDataset(GroundingDataset):
+
+     VID_PATH = 'data/queryd/train_list.txt'
+     QUERY_PATH = 'data/queryd/raw_captions_combined_filtered-v2.pkl'
+     SPAN_PATH = 'data/queryd/times_captions_combined_filtered-v2.pkl'
+
+     VIDEO_ROOT = 'data/queryd/videos_3fps_480_noaudio'
+     DURATIONS = 'data/queryd/durations.json'
+
+     UNIT = 0.001
+
+     @classmethod
+     def load_annos(self, split='train'):
+         assert split == 'train'
+
+         vids = nncore.load(self.VID_PATH)
+         queries = nncore.load(self.QUERY_PATH)
+         spans = nncore.load(self.SPAN_PATH)
+         durations = nncore.load(self.DURATIONS)
+
+         annos = []
+         for vid in vids:
+             for query, span in zip(queries[vid], spans[vid]):
+                 video_name = vid[6:]
+
+                 if video_name not in durations:
+                     continue
+
+                 anno = dict(
+                     source='queryd',
+                     data_type='grounding',
+                     video_path=nncore.join(self.VIDEO_ROOT, video_name + '.mp4'),
+                     duration=durations[video_name],
+                     query=parse_query(' '.join(query)),
+                     span=[span])
+
+                 annos.append(anno)
+
+         return annos
videomind/dataset/sub_classes/qvhighlights.py ADDED
@@ -0,0 +1,78 @@
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+ import nncore
+
+ from videomind.dataset.hybrid import DATASETS
+ from videomind.dataset.wrappers import GroundingDataset
+ from videomind.utils.parser import parse_query
+
+
+ @DATASETS.register(name='qvhighlights')
+ class QVHighlightsDataset(GroundingDataset):
+
+     ANNO_PATH_TRAIN = 'data/qvhighlights/highlight_train_release.jsonl'
+     ANNO_PATH_VALID = 'data/qvhighlights/highlight_val_release.jsonl'
+     ANNO_PATH_TEST = 'data/qvhighlights/highlight_test_release.jsonl'
+
+     VIDEO_ROOT = 'data/qvhighlights/videos_3fps_480_noaudio'
+
+     UNIT = 2.0
+
+     @classmethod
+     def load_annos(self, split='train'):
+         if split == 'train':
+             raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
+         elif split == 'valid':
+             raw_annos = nncore.load(self.ANNO_PATH_VALID)
+         else:
+             print('WARNING: Test split does not have ground truth annotations')
+             raw_annos = nncore.load(self.ANNO_PATH_TEST)
+
+         annos = []
+         for raw_anno in raw_annos:
+             vid = raw_anno['vid']
+             qid = raw_anno['qid']
+
+             anno = dict(
+                 source='qvhighlights',
+                 data_type='grounding',
+                 video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+                 duration=raw_anno['duration'],
+                 query=parse_query(raw_anno['query']),
+                 span=raw_anno.get('relevant_windows'),
+                 vid=vid,
+                 qid=qid)
+
+             annos.append(anno)
+
+         return annos
+
+
+ @DATASETS.register(name='qvhighlights_single')
+ class QVHighlightsSingleDataset(QVHighlightsDataset):
+
+     @classmethod
+     def load_annos(self, split='train'):
+         assert split == 'train'
+
+         raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
+
+         annos = []
+         for raw_anno in raw_annos:
+             # skip samples with multiple moments
+             if len(raw_anno['relevant_windows']) > 1:
+                 continue
+
+             vid = raw_anno['vid']
+
+             anno = dict(
+                 source='qvhighlights_single',
+                 data_type='grounding',
+                 video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+                 duration=raw_anno['duration'],
+                 query=parse_query(raw_anno['query']),
+                 span=raw_anno.get('relevant_windows'))
+
+             annos.append(anno)
+
+         return annos
videomind/dataset/sub_classes/rextime.py ADDED
@@ -0,0 +1,81 @@
+ # Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+ import nncore
+
+ from videomind.dataset.hybrid import DATASETS
+ from videomind.dataset.wrappers import AnsweringCropDataset, AnsweringDataset, GroundingDataset
+ from videomind.utils.parser import parse_query, parse_question
+
+
+ @DATASETS.register(name='rextime')
+ class ReXTimeDataset(AnsweringDataset):
+
+     ANNO_PATH_TRAIN = 'data/rextime/rextime_train.json'
+     ANNO_PATH_VALID = 'data/rextime/rextime_val.json'
+     ANNO_PATH_TEST = 'data/rextime/rextime_test_release.json'
+
+     VIDEO_ROOT_ANET = 'data/activitynet/videos_3fps_480_noaudio'
+     VIDEO_ROOT_QVHL = 'data/qvhighlights/videos_3fps_480_noaudio'
+
+     DURATIONS_ANET = 'data/activitynet/durations.json'
+     DURATIONS_QVHL = 'data/qvhighlights/durations.json'
+
+     SOURCE = 'rextime'
+     DATA_TYPE = 'multimodal'
+
+     UNIT = 1.0
+     MIN_LEN = 64
+
+     @classmethod
+     def load_annos(self, split='train'):
+         if split == 'train':
+             raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
+         elif split == 'valid':
+             raw_annos = nncore.load(self.ANNO_PATH_VALID)
+         else:
+             print('WARNING: Test split does not have ground truth annotations')
+             raw_annos = nncore.load(self.ANNO_PATH_TEST)
+
+         durations_anet = nncore.load(self.DURATIONS_ANET)
+         durations_qvhl = nncore.load(self.DURATIONS_QVHL)
+
+         annos = []
+         for raw_anno in raw_annos:
+             vid = raw_anno['vid']
+
+             if len(vid) == 13:
+                 video_path = nncore.join(self.VIDEO_ROOT_ANET, vid + '.mp4')
+                 duration = durations_anet[vid]
+             else:
+                 video_path = nncore.join(self.VIDEO_ROOT_QVHL, vid + '.mp4')
+                 duration = durations_qvhl[vid]
+
+             anno = dict(
+                 source=self.SOURCE,
+                 data_type=self.DATA_TYPE,
+                 video_path=video_path,
+                 duration=duration,
+                 query=parse_query(raw_anno['question']),
+                 question=parse_question(raw_anno['question']),
+                 options=[o.capitalize() for o in raw_anno['options']],
+                 answer=raw_anno['answer'].replace('From <s0> to <e0>, ', '').capitalize(),
+                 ans=raw_anno['ans'],
+                 span=[raw_anno['span']],
+                 task=raw_anno['category'])
+
+             annos.append(anno)
+
+         return annos
+
+
+ @DATASETS.register(name='rextime_crop')
+ class ReXTimeCropDataset(AnsweringCropDataset, ReXTimeDataset):
+
+     SOURCE = 'rextime_crop'
+
+
+ @DATASETS.register(name='rextime_grounding')
+ class ReXTimeGroundingDataset(GroundingDataset, ReXTimeDataset):
+
+     SOURCE = 'rextime_grounding'
+     DATA_TYPE = 'grounding'
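A minimal sketch of inspecting the ReXTime annotations above (illustrative only, assuming the ActivityNet/QVHighlights duration files and annotation JSONs are prepared as configured in the class attributes and the videomind package is importable):

    from collections import Counter

    from videomind.dataset.sub_classes.rextime import ReXTimeDataset

    annos = ReXTimeDataset.load_annos(split='valid')
    print(Counter(a['task'] for a in annos))                     # per-category question counts
    print(annos[0]['question'], annos[0]['options'], annos[0]['span'])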