diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..f9ec6e2fdd9173a6886cdac2517aaf8375240d0f 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,19 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+data/10309844035.mp4 filter=lfs diff=lfs merge=lfs -text
+data/13887487955.mp4 filter=lfs diff=lfs merge=lfs -text
+data/4167294363.mp4 filter=lfs diff=lfs merge=lfs -text
+data/4742652230.mp4 filter=lfs diff=lfs merge=lfs -text
+data/4766274786.mp4 filter=lfs diff=lfs merge=lfs -text
+data/5012237466.mp4 filter=lfs diff=lfs merge=lfs -text
+data/5188348585.mp4 filter=lfs diff=lfs merge=lfs -text
+data/9383140374.mp4 filter=lfs diff=lfs merge=lfs -text
+data/DTInxNfWXVc_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
+data/RoripwjYFp8_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
+data/UFWQKrcbhjI_360.0_510.0.mp4 filter=lfs diff=lfs merge=lfs -text
+data/Z3-IZ3HAmIA_60.0_210.0.mp4 filter=lfs diff=lfs merge=lfs -text
+data/h6QKDqomIPk_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
+data/pA6Z-qYhSNg_60.0_210.0.mp4 filter=lfs diff=lfs merge=lfs -text
+data/rrTIeJRVGjg_60.0_210.0.mp4 filter=lfs diff=lfs merge=lfs -text
+data/yId2wIocTys_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..fe47536fab4a3c41da6f2b55620f067d0b32fd66
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+# Byte-compiled / optimized / DLL files
+__pycache__
+*.egg-info
+*.py[cod]
+*$py.class
+
+# Temporary data
+.DS_Store
+._*
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a5a997f4880eb71330bf2f833ae821a45ef2fc0
--- /dev/null
+++ b/app.py
@@ -0,0 +1,640 @@
+# Copyright (c) 2024 Ye Liu. Licensed under the BSD-3-Clause license.
+
+import html
+import json
+import os
+import random
+import time
+from functools import partial
+from threading import Thread
+
+import gradio as gr
+import nncore
+import torch
+from huggingface_hub import snapshot_download
+from transformers import TextIteratorStreamer
+
+from videomind.constants import GROUNDER_PROMPT, PLANNER_PROMPT, VERIFIER_PROMPT
+from videomind.dataset.utils import process_vision_info
+from videomind.model.builder import build_model
+from videomind.utils.io import get_duration
+from videomind.utils.parser import parse_query, parse_span
+
+BASE_MODEL = 'model_zoo/Qwen2-VL-2B-Instruct'
+BASE_MODEL_HF = 'Qwen/Qwen2-VL-2B-Instruct'
+
+MODEL = 'model_zoo/VideoMind-2B'
+MODEL_HF = 'yeliudev/VideoMind-2B'
+
+TITLE = 'VideoMind: A Chain-of-LoRA Agent for Long Video Reasoning'
+
+TITLE_MD = f'<h1 align="center">💡 {TITLE}</h1>'
+DESCRIPTION_MD = """VideoMind is a multi-modal agent framework that enhances video reasoning by emulating *human-like* processes, such as *breaking down tasks*, *localizing and verifying moments*, and *synthesizing answers*. This approach addresses the unique challenges of temporal-grounded reasoning in a progressive strategy. Please find more details at our Project Page, Tech Report and GitHub Repo.""" # noqa
+
+# yapf:disable
+EXAMPLES = [
+ ('data/4167294363.mp4', 'Why did the old man stand up?', ['pla', 'gnd', 'ver', 'ans']),
+ ('data/5012237466.mp4', 'How does the child in stripes react about the fountain?', ['pla', 'gnd', 'ver', 'ans']),
+ ('data/13887487955.mp4', 'What did the excavator do after it pushed the cement forward?', ['pla', 'gnd', 'ver', 'ans']),
+ ('data/5188348585.mp4', 'What did the person do before pouring the liquor?', ['pla', 'gnd', 'ver', 'ans']),
+ ('data/4766274786.mp4', 'What did the girl do after the baby lost the balloon?', ['pla', 'gnd', 'ver', 'ans']),
+ ('data/4742652230.mp4', 'Why is the girl pushing the boy only around the toy but not to other places?', ['pla', 'gnd', 'ver', 'ans']),
+ ('data/9383140374.mp4', 'How does the girl in pink control the movement of the claw?', ['pla', 'gnd', 'ver', 'ans']),
+ ('data/10309844035.mp4', 'Why are they holding up the phones?', ['pla', 'gnd', 'ver', 'ans']),
+ ('data/pA6Z-qYhSNg_60.0_210.0.mp4', 'Different types of meat products are being cut, shaped and prepared', ['gnd', 'ver']),
+ ('data/UFWQKrcbhjI_360.0_510.0.mp4', 'A man talks to the camera whilst walking along a roadside in a rural area', ['gnd', 'ver']),
+ ('data/RoripwjYFp8_210.0_360.0.mp4', 'A woman wearing glasses eating something at a street market', ['gnd', 'ver']),
+ ('data/h6QKDqomIPk_210.0_360.0.mp4', 'A toddler sits in his car seat, holding his yellow tablet', ['gnd', 'ver']),
+ ('data/Z3-IZ3HAmIA_60.0_210.0.mp4', 'A view from the window as the plane accelerates and takes off from the runway', ['gnd', 'ver']),
+ ('data/yId2wIocTys_210.0_360.0.mp4', "Temporally locate the visual content mentioned in the text query 'kids exercise in front of parked cars' within the video.", ['pla', 'gnd', 'ver']),
+ ('data/rrTIeJRVGjg_60.0_210.0.mp4', "Localize the moment that provides relevant context about 'man stands in front of a white building monologuing'.", ['pla', 'gnd', 'ver']),
+ ('data/DTInxNfWXVc_210.0_360.0.mp4', "Find the video segment that corresponds to the given textual query 'man with headphones talking'.", ['pla', 'gnd', 'ver']),
+]
+# yapf:enable
+
+CSS = """button .box { text-align: left }"""
+
+JS = """
+function init() {
+ var info = document.getElementById('role').querySelectorAll('[class^="svelte"]')[1]
+    info.innerHTML = info.innerHTML.replace(/&lt;/g, '<').replace(/&gt;/g, '>')
+}
+"""
+
+
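+# TextIteratorStreamer variant that skips the EOS token and keeps the fully
+# decoded text in `self.text_cache`, so the complete response (e.g. the
+# planner's JSON plan) can be reused after streaming finishes.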
+class CustomStreamer(TextIteratorStreamer):
+
+ def put(self, value):
+ if len(value.shape) > 1 and value.shape[0] > 1:
+ raise ValueError('TextStreamer only supports batch size 1')
+ elif len(value.shape) > 1:
+ value = value[0]
+
+ if self.skip_prompt and self.next_tokens_are_prompt:
+ self.next_tokens_are_prompt = False
+ return
+
+ self.token_cache.extend(value.tolist())
+
+ # force skipping eos token
+ if self.token_cache[-1] == self.tokenizer.eos_token_id:
+ self.token_cache = self.token_cache[:-1]
+
+ text = self.tokenizer.decode(self.token_cache, **self.decode_kwargs)
+
+ # cache decoded text for future use
+ self.text_cache = text
+
+ if text.endswith('\n'):
+ printable_text = text[self.print_len:]
+ self.token_cache = []
+ self.print_len = 0
+ elif len(text) > 0 and self._is_chinese_char(ord(text[-1])):
+ printable_text = text[self.print_len:]
+ self.print_len += len(printable_text)
+ else:
+ printable_text = text[self.print_len:text.rfind(' ') + 1]
+ self.print_len += len(printable_text)
+
+ self.on_finalized_text(printable_text)
+
+
+def seconds_to_hms(seconds):
+ hours, remainder = divmod(round(seconds), 3600)
+ minutes, seconds = divmod(remainder, 60)
+ return f'{hours:02}:{minutes:02}:{seconds:02}'
+
+
+def enable_btns():
+ return (gr.Button(interactive=True), ) * 3
+
+
+def disable_btns():
+ return (gr.Button(interactive=False), ) * 3
+
+
+def update_placeholder(role):
+ placeholder = 'Ask a question about the video...' if 'ans' in role else 'Write a query to search for a moment...'
+ return gr.Textbox(placeholder=placeholder)
+
+
+def main(video, prompt, role, temperature, max_new_tokens, model, processor, streamer, device):
+ history = []
+
+ if not video:
+ gr.Warning('Please upload a video or click [Random] to sample one.')
+ return history
+
+ if not prompt:
+ gr.Warning('Please provide a prompt or click [Random] to sample one.')
+ return history
+
+ if 'gnd' not in role and 'ans' not in role:
+        gr.Warning('Please select at least one of Grounder or Answerer.')
+ return history
+
+ if 'ver' in role and 'gnd' not in role:
+ gr.Warning('Verifier cannot be used without Grounder.')
+ return history
+
+ if 'pla' in role and any(k not in role for k in ('gnd', 'ver', 'ans')):
+ gr.Warning('Planner can only be used when all other roles are selected.')
+ return history
+
+ history.append({'role': 'user', 'content': prompt})
+ yield history
+
+ duration = get_duration(video)
+
+ # do grounding and answering by default
+ do_grounding = True
+ do_answering = True
+
+ # initialize grounding query as prompt
+ query = prompt
+
+ if 'pla' in role:
+ text = PLANNER_PROMPT.format(prompt)
+
+ history.append({
+ 'metadata': {
+ 'title': '🗺️ Working as Planner...'
+ },
+ 'role': 'assistant',
+ 'content': f'##### Planner Prompt:\n\n{html.escape(text)}\n\n##### Planner Response:\n\n...'
+ })
+ yield history
+
+ start_time = time.perf_counter()
+
+ messages = [{
+ 'role':
+ 'user',
+ 'content': [{
+ 'type': 'video',
+ 'video': video,
+ 'num_threads': 1,
+ 'min_pixels': 36 * 28 * 28,
+ 'max_pixels': 64 * 28 * 28,
+ 'max_frames': 100,
+ 'fps': 1.0
+ }, {
+ 'type': 'text',
+ 'text': text
+ }]
+ }]
+
+ text = processor.apply_chat_template(messages, add_generation_prompt=True)
+
+ images, videos = process_vision_info(messages)
+ data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
+ data = data.to(device)
+
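+        # make sure all LoRA layers are enabled, then activate the planner adapter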
+ model.base_model.disable_adapter_layers()
+ model.base_model.enable_adapter_layers()
+ model.set_adapter('planner')
+
+ generation_kwargs = dict(
+ **data,
+ streamer=streamer,
+ do_sample=temperature > 0,
+ temperature=temperature if temperature > 0 else None,
+ top_p=None,
+ top_k=None,
+ repetition_penalty=None,
+ max_new_tokens=max_new_tokens)
+
+ t = Thread(target=model.generate, kwargs=generation_kwargs)
+ t.start()
+
+ skipped = False
+ for i, text in enumerate(streamer):
+ if text and not skipped:
+ history[-1]['content'] = history[-1]['content'].rstrip('.')
+ skipped = True
+ history[-1]['content'] += text
+ yield history
+
+ elapsed_time = round(time.perf_counter() - start_time, 1)
+ history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
+ yield history
+
+ try:
+ parsed = json.loads(streamer.text_cache)
+ action = parsed[0] if isinstance(parsed, list) else parsed
+ if action['type'].lower() == 'grounder' and action['value']:
+ query = action['value']
+ elif action['type'].lower() == 'answerer':
+ do_grounding = False
+ do_answering = True
+ except Exception:
+ pass
+
+    response = 'After browsing the video and the question, my plan to figure out the answer is as follows:\n'
+ step_idx = 1
+ if 'gnd' in role and do_grounding:
+ response += f'\n{step_idx}. Localize the relevant moment in this video using the query "{query}".'
+ step_idx += 1
+ if 'ver' in role and do_grounding:
+        response += f'\n{step_idx}. Verify the grounded moments one-by-one and select the best candidate.'
+ step_idx += 1
+ if 'ans' in role and do_answering:
+ if step_idx > 1:
+            response += f'\n{step_idx}. Crop the video segment and zoom in to a higher resolution.'
+ else:
+ response += f'\n{step_idx}. Analyze the whole video directly without cropping.'
+
+ history.append({'role': 'assistant', 'content': ''})
+ for i, text in enumerate(response.split(' ')):
+ history[-1]['content'] += ' ' + text if i > 0 else text
+ yield history
+
+ if 'gnd' in role and do_grounding:
+ query = parse_query(query)
+
+ text = GROUNDER_PROMPT.format(query)
+
+ history.append({
+ 'metadata': {
+ 'title': '🔍 Working as Grounder...'
+ },
+ 'role': 'assistant',
+ 'content': f'##### Grounder Prompt:\n\n{html.escape(text)}\n\n##### Grounder Response:\n\n...'
+ })
+ yield history
+
+ start_time = time.perf_counter()
+
+ messages = [{
+ 'role':
+ 'user',
+ 'content': [{
+ 'type': 'video',
+ 'video': video,
+ 'num_threads': 1,
+ 'min_pixels': 36 * 28 * 28,
+ 'max_pixels': 64 * 28 * 28,
+ 'max_frames': 150,
+ 'fps': 1.0
+ }, {
+ 'type': 'text',
+ 'text': text
+ }]
+ }]
+
+ text = processor.apply_chat_template(messages, add_generation_prompt=True)
+ images, videos = process_vision_info(messages)
+ data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
+ data = data.to(device)
+
+ model.base_model.disable_adapter_layers()
+ model.base_model.enable_adapter_layers()
+ model.set_adapter('grounder')
+
+ generation_kwargs = dict(
+ **data,
+ streamer=streamer,
+ do_sample=temperature > 0,
+ temperature=temperature if temperature > 0 else None,
+ top_p=None,
+ top_k=None,
+ repetition_penalty=None,
+ max_new_tokens=max_new_tokens)
+
+ t = Thread(target=model.generate, kwargs=generation_kwargs)
+ t.start()
+
+ skipped = False
+ for i, text in enumerate(streamer):
+ if text and not skipped:
+ history[-1]['content'] = history[-1]['content'].rstrip('.')
+ skipped = True
+ history[-1]['content'] += text
+ yield history
+
+ elapsed_time = round(time.perf_counter() - start_time, 1)
+ history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
+ yield history
+
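+        # model.reg holds the grounder's candidate moments: normalized (0-1) start/end
+        # boundaries (first two columns) and a confidence score (last column)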
+ if len(model.reg) > 0:
+ # 1. extract timestamps and confidences
+ blob = model.reg[0].cpu().float()
+ pred, conf = blob[:, :2] * duration, blob[:, -1].tolist()
+
+ # 2. clamp timestamps
+ pred = pred.clamp(min=0, max=duration)
+
+            # 3. swap the start/end timestamps of reversed spans
+            inds = (pred[:, 1] - pred[:, 0] < 0).nonzero()[:, 0]
+            pred[inds] = pred[inds].roll(1, dims=-1)
+
+ # 4. convert timestamps to list
+ pred = pred.tolist()
+ else:
+ if 'ver' in role:
+ pred = [[i * duration / 6, (i + 2) * duration / 6] for i in range(5)]
+ conf = [0] * 5
+ else:
+ pred = [[0, duration]]
+ conf = [0]
+
+ response = 'The candidate moments and confidence scores are as follows:\n'
+ response += '\n| ID | Start Time | End Time | Confidence |'
+ response += '\n| :-: | :-: | :-: | :-: |'
+
+ # using top-5 predictions
+ for i, (p, c) in enumerate(zip(pred[:5], conf[:5])):
+ response += f'\n| {i} | {seconds_to_hms(p[0])} | {seconds_to_hms(p[1])} | {c:.2f} |'
+
+        response += f'\n\nTherefore, the target moment might happen from {seconds_to_hms(pred[0][0])} to {seconds_to_hms(pred[0][1])}.'
+
+ history.append({'role': 'assistant', 'content': ''})
+ for i, text in enumerate(response.split(' ')):
+ history[-1]['content'] += ' ' + text if i > 0 else text
+ yield history
+
+ if 'ver' in role and do_grounding:
+ text = VERIFIER_PROMPT.format(query)
+
+ history.append({
+ 'metadata': {
+ 'title': '📊 Working as Verifier...'
+ },
+ 'role': 'assistant',
+ 'content': f'##### Verifier Prompt:\n\n{html.escape(text)}\n\n##### Verifier Response:\n\n...'
+ })
+ yield history
+
+ start_time = time.perf_counter()
+
+ # using top-5 predictions
+ prob = []
+ for i, cand in enumerate(pred[:5]):
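+            # probe a window larger than the candidate (extended by half of its
+            # length on each side) so the verifier sees the surrounding context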
+ s0, e0 = parse_span(cand, duration, 2)
+ offset = (e0 - s0) / 2
+ s1, e1 = parse_span([s0 - offset, e0 + offset], duration)
+
+ # percentage of s0, e0 within s1, e1
+ s = (s0 - s1) / (e1 - s1)
+ e = (e0 - s1) / (e1 - s1)
+
+ messages = [{
+ 'role':
+ 'user',
+ 'content': [{
+ 'type': 'video',
+ 'video': video,
+ 'num_threads': 1,
+ 'video_start': s1,
+ 'video_end': e1,
+ 'min_pixels': 36 * 28 * 28,
+ 'max_pixels': 64 * 28 * 28,
+ 'max_frames': 64,
+ 'fps': 2.0
+ }, {
+ 'type': 'text',
+ 'text': text
+ }]
+ }]
+
+ text = processor.apply_chat_template(messages, add_generation_prompt=True)
+ images, videos = process_vision_info(messages)
+ data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
+
+ # ===== insert segment start/end tokens =====
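+            # window = visual tokens per frame after the 2x2 spatial patch merge,
+            # so the video spans num_frames * window tokens after <|vision_start|>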
+ video_grid_thw = data['video_grid_thw'][0]
+ num_frames, window = int(video_grid_thw[0]), int(video_grid_thw[1] * video_grid_thw[2] / 4)
+ assert num_frames * window * 4 == data['pixel_values_videos'].size(0)
+
+ pos_s, pos_e = round(s * num_frames), round(e * num_frames)
+ pos_s, pos_e = min(max(0, pos_s), num_frames), min(max(0, pos_e), num_frames)
+ assert pos_s <= pos_e, (num_frames, s, e)
+
+ base_idx = torch.nonzero(data['input_ids'][0] == model.config.vision_start_token_id).item()
+ pos_s, pos_e = pos_s * window + base_idx + 1, pos_e * window + base_idx + 2
+
+ input_ids = data['input_ids'][0].tolist()
+ input_ids.insert(pos_s, model.config.seg_s_token_id)
+ input_ids.insert(pos_e, model.config.seg_e_token_id)
+ data['input_ids'] = torch.LongTensor([input_ids])
+ data['attention_mask'] = torch.ones_like(data['input_ids'])
+ # ===========================================
+
+ data = data.to(device)
+
+ model.base_model.disable_adapter_layers()
+ model.base_model.enable_adapter_layers()
+ model.set_adapter('verifier')
+
+ with torch.inference_mode():
+ logits = model(**data).logits[0, -1].softmax(dim=-1)
+
+ # NOTE: magic numbers here
+ # In Qwen2-VL vocab: 9454 -> Yes, 2753 -> No
+ score = (logits[9454] - logits[2753]).sigmoid().item()
+ prob.append(score)
+
+ if i == 0:
+ history[-1]['content'] = history[-1]['content'].rstrip('.')[:-1]
+
+ response = f'\nCandidate ID {i}: P(Yes) = {score:.2f}'
+ for j, text in enumerate(response.split(' ')):
+ history[-1]['content'] += ' ' + text if j > 0 else text
+ yield history
+
+ elapsed_time = round(time.perf_counter() - start_time, 1)
+ history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
+ yield history
+
+ ranks = torch.Tensor(prob).argsort(descending=True).tolist()
+
+ prob = [prob[idx] for idx in ranks]
+ pred = [pred[idx] for idx in ranks]
+ conf = [conf[idx] for idx in ranks]
+
+ response = 'After verification, the candidate moments are re-ranked as follows:\n'
+ response += '\n| ID | Start Time | End Time | Score |'
+ response += '\n| :-: | :-: | :-: | :-: |'
+
+ ids = list(range(len(ranks)))
+ for r, p, c in zip(ranks, pred, prob):
+ response += f'\n| {ids[r]} | {seconds_to_hms(p[0])} | {seconds_to_hms(p[1])} | {c:.2f} |'
+
+ response += f'\n\nTherefore, the target moment should be from {seconds_to_hms(pred[0][0])} to {seconds_to_hms(pred[0][1])}.'
+
+ history.append({'role': 'assistant', 'content': ''})
+ for i, text in enumerate(response.split(' ')):
+ history[-1]['content'] += ' ' + text if i > 0 else text
+ yield history
+
+ if 'ans' in role and do_answering:
+ text = f'{prompt} Please think step by step and provide your response.'
+
+ history.append({
+ 'metadata': {
+ 'title': '📝 Working as Answerer...'
+ },
+ 'role': 'assistant',
+ 'content': f'##### Answerer Prompt:\n\n{html.escape(text)}\n\n##### Answerer Response:\n\n...'
+ })
+ yield history
+
+ start_time = time.perf_counter()
+
+ # choose the potential best moment
+ selected = pred[0] if 'gnd' in role and do_grounding else [0, duration]
+ s, e = parse_span(selected, duration, 32)
+
+ messages = [{
+ 'role':
+ 'user',
+ 'content': [{
+ 'type': 'video',
+ 'video': video,
+ 'num_threads': 1,
+ 'video_start': s,
+ 'video_end': e,
+ 'min_pixels': 128 * 28 * 28,
+ 'max_pixels': 256 * 28 * 28,
+ 'max_frames': 32,
+ 'fps': 2.0
+ }, {
+ 'type': 'text',
+ 'text': text
+ }]
+ }]
+
+ text = processor.apply_chat_template(messages, add_generation_prompt=True)
+ images, videos = process_vision_info(messages)
+ data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
+ data = data.to(device)
+
+ with model.disable_adapter():
+ generation_kwargs = dict(
+ **data,
+ streamer=streamer,
+ do_sample=temperature > 0,
+ temperature=temperature if temperature > 0 else None,
+ top_p=None,
+ top_k=None,
+ repetition_penalty=None,
+ max_new_tokens=max_new_tokens)
+
+ t = Thread(target=model.generate, kwargs=generation_kwargs)
+ t.start()
+
+ skipped = False
+ for i, text in enumerate(streamer):
+ if text and not skipped:
+ history[-1]['content'] = history[-1]['content'].rstrip('.')
+ skipped = True
+ history[-1]['content'] += text
+ yield history
+
+ elapsed_time = round(time.perf_counter() - start_time, 1)
+ history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
+ yield history
+
+ if 'gnd' in role and do_grounding:
+ response = f'After zooming in and analyzing the target moment, I finalize my answer: {streamer.text_cache}'
+ else:
+ response = f'After watching the whole video, my answer is: {streamer.text_cache}'
+
+ history.append({'role': 'assistant', 'content': ''})
+ for i, text in enumerate(response.split(' ')):
+ history[-1]['content'] += ' ' + text if i > 0 else text
+ yield history
+
+
+if __name__ == '__main__':
+ if not nncore.is_dir(BASE_MODEL):
+ snapshot_download(BASE_MODEL_HF, local_dir=BASE_MODEL)
+
+ if not nncore.is_dir(MODEL):
+ snapshot_download(MODEL_HF, local_dir=MODEL)
+
+ print('Initializing role *grounder*')
+ model, processor = build_model(MODEL)
+
+ print('Initializing role *planner*')
+ model.load_adapter(nncore.join(MODEL, 'planner'), adapter_name='planner')
+
+ print('Initializing role *verifier*')
+ model.load_adapter(nncore.join(MODEL, 'verifier'), adapter_name='verifier')
+
+ streamer = CustomStreamer(processor.tokenizer, skip_prompt=True)
+
+ device = next(model.parameters()).device
+
+ main = partial(main, model=model, processor=processor, streamer=streamer, device=device)
+
+ path = os.path.dirname(os.path.realpath(__file__))
+
+ chat = gr.Chatbot(
+ type='messages',
+ height='70vh',
+ avatar_images=[f'{path}/assets/user.png', f'{path}/assets/bot.png'],
+ placeholder='A conversation with VideoMind',
+ label='VideoMind')
+
+ prompt = gr.Textbox(label='Text Prompt', placeholder='Ask a question about the video...')
+
+ with gr.Blocks(title=TITLE, css=CSS, js=JS) as demo:
+ gr.Markdown(TITLE_MD)
+ gr.Markdown(DESCRIPTION_MD)
+
+ with gr.Row():
+ with gr.Column(scale=3):
+ video = gr.Video()
+
+ with gr.Group():
+ role = gr.CheckboxGroup(
+ choices=[('🗺️ Planner', 'pla'), ('🔍 Grounder', 'gnd'), ('📊 Verifier', 'ver'),
+ ('📝 Answerer', 'ans')],
+ value=['pla', 'gnd', 'ver', 'ans'],
+ interactive=True,
+ elem_id='role',
+ label='Role(s) To Use',
+                    info='[Auto Planning]: Planner + Grounder + Verifier + Answerer<br>'
+                    '[Grounded Video Question-Answering]: Grounder + Verifier + Answerer<br>'
+                    '[Video Temporal Grounding]: Grounder + Verifier<br>'
+                    '[Direct Video Question-Answering]: Answerer<br>')
+ role.change(update_placeholder, role, prompt)
+
+ with gr.Accordion(label='Hyperparameters', open=False):
+ temperature = gr.Slider(
+ 0,
+ 1,
+ value=0,
+ step=0.1,
+ interactive=True,
+ label='Temperature',
+ info='Higher value leads to more creativity and randomness (Default: 0)')
+ max_new_tokens = gr.Slider(
+ 1,
+ 1024,
+ value=256,
+ interactive=True,
+ label='Max Output Tokens',
+ info='The maximum number of output tokens for each role (Default: 256)')
+
+ prompt.render()
+
+ with gr.Row():
+ random_btn = gr.Button(value='🔮 Random')
+ random_btn.click(lambda: random.choice(EXAMPLES), None, [video, prompt, role])
+
+ reset_btn = gr.ClearButton([video, prompt, chat], value='🗑️ Reset')
+ reset_btn.click(lambda: (['pla', 'gnd', 'ver', 'ans'], 0, 256), None,
+ [role, temperature, max_new_tokens])
+
+ submit_btn = gr.Button(value='🚀 Submit', variant='primary')
+ submit_ctx = submit_btn.click(disable_btns, None, [random_btn, reset_btn, submit_btn])
+ submit_ctx = submit_ctx.then(main, [video, prompt, role, temperature, max_new_tokens], chat)
+ submit_ctx.then(enable_btns, None, [random_btn, reset_btn, submit_btn])
+
+ with gr.Column(scale=5):
+ chat.render()
+
+ demo.queue()
+ demo.launch(server_name='0.0.0.0')
diff --git a/assets/bot.png b/assets/bot.png
new file mode 100644
index 0000000000000000000000000000000000000000..696a3bb5c360358ca7174f49746001cba331e6b4
Binary files /dev/null and b/assets/bot.png differ
diff --git a/assets/user.png b/assets/user.png
new file mode 100644
index 0000000000000000000000000000000000000000..e43cee4e6edd51e83ab35aa6270cf16e80e7622d
Binary files /dev/null and b/assets/user.png differ
diff --git a/data/10309844035.mp4 b/data/10309844035.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..76e90e5f9f6c97f6f5dfe19242227a489a981169
--- /dev/null
+++ b/data/10309844035.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8996ff134787d6b769c2491b9079a02c05953465ad770f07a8d9138e2668d24f
+size 4041678
diff --git a/data/13887487955.mp4 b/data/13887487955.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..1e83efb63125b1ca29cd6162eaa5843b034f5961
--- /dev/null
+++ b/data/13887487955.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e5fecab1076ee42b3804718f9f64bef06cbfafd6995ad5f5ee42ba6354721429
+size 5544739
diff --git a/data/4167294363.mp4 b/data/4167294363.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..339380896781b1ec77eaea1122a2dbcb0b91fc99
--- /dev/null
+++ b/data/4167294363.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3d0e0a4a381836f68e16a816d87f241fed3e31ea321f544b921743d6c1c50666
+size 6611151
diff --git a/data/4742652230.mp4 b/data/4742652230.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..0e45cf00cf375d5eb97ab9fd179854daae792e27
--- /dev/null
+++ b/data/4742652230.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8733ab4b0716d13ea7a79fc4ddacaf9eede567db364f0ecddfa4582c2f237f82
+size 2200304
diff --git a/data/4766274786.mp4 b/data/4766274786.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..a8e42d09d51e795f05f895e788f9022470691f3f
--- /dev/null
+++ b/data/4766274786.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:afa38a9ce9e89f934293214d79755c89159664223b3ca366813fd5fe524ed013
+size 3395545
diff --git a/data/5012237466.mp4 b/data/5012237466.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..1a96cf099d8d4baabc7ee45ebb6f78a5ad11fe48
--- /dev/null
+++ b/data/5012237466.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd1929aa93d037f809f402e9801047125dc9fe8060301e69ded9ba1f2d785cc8
+size 4822293
diff --git a/data/5188348585.mp4 b/data/5188348585.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..9e55d094776b4f180cc0fb008d1a9ac171ebd316
--- /dev/null
+++ b/data/5188348585.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b225f448a546ba2f65958f18c6731a6dde9b1f437014e90036b22eb40e9ad0a5
+size 5051675
diff --git a/data/9383140374.mp4 b/data/9383140374.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..b6899d90c9ecb2e4dfc82f003c270a84e877e59d
--- /dev/null
+++ b/data/9383140374.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:30b6b3eb43f711bef194150d473a59850ff5d7fec0f5cc30e7526aa9e382303f
+size 2518081
diff --git a/data/DTInxNfWXVc_210.0_360.0.mp4 b/data/DTInxNfWXVc_210.0_360.0.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..a4d851d6b9c28fef1e4a5b0feba8462faacf5ed1
--- /dev/null
+++ b/data/DTInxNfWXVc_210.0_360.0.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a09eee0dc404688731fb768c120d3519605f2343376b9bd727a71b91379fd9a9
+size 4999970
diff --git a/data/RoripwjYFp8_210.0_360.0.mp4 b/data/RoripwjYFp8_210.0_360.0.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..7f04c0f8971239598ee92cdafbf4ba601c8e7a9c
--- /dev/null
+++ b/data/RoripwjYFp8_210.0_360.0.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b39b15158dc20c0bc6f1758a9239c8f3eed20ba4a90953338eec2246fa8f1f0
+size 9287252
diff --git a/data/UFWQKrcbhjI_360.0_510.0.mp4 b/data/UFWQKrcbhjI_360.0_510.0.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..e87ba5ae900d689cc0ad626b4599af6ac25cd28d
--- /dev/null
+++ b/data/UFWQKrcbhjI_360.0_510.0.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8669153d9ffac4b5534c20fab8d795347f5babe588da9b8330e049d623ebb443
+size 14510618
diff --git a/data/Z3-IZ3HAmIA_60.0_210.0.mp4 b/data/Z3-IZ3HAmIA_60.0_210.0.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..739af1eea71252773de9c6392548a1ef086b8ad8
--- /dev/null
+++ b/data/Z3-IZ3HAmIA_60.0_210.0.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b3a342993ee61efc5f3b859cd9c1e0d360b3331eed9deb8466891e4bcacc554
+size 14397799
diff --git a/data/h6QKDqomIPk_210.0_360.0.mp4 b/data/h6QKDqomIPk_210.0_360.0.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..819d3e86ec306105087248a3983c5c86cc0bd3cd
--- /dev/null
+++ b/data/h6QKDqomIPk_210.0_360.0.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:103820de2b8a1a3935b39ed80d91cd08e546e5617310b3d1bb3dadb06b2ffb95
+size 13485144
diff --git a/data/pA6Z-qYhSNg_60.0_210.0.mp4 b/data/pA6Z-qYhSNg_60.0_210.0.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..2ba3f64450c0f778e547d7ce2d7279bae48e52cc
--- /dev/null
+++ b/data/pA6Z-qYhSNg_60.0_210.0.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c84660fd4ebd8c23a2a7364174b1e819fec8b0e1cb8b9d9cd86f9e429cbdf66c
+size 8658509
diff --git a/data/rrTIeJRVGjg_60.0_210.0.mp4 b/data/rrTIeJRVGjg_60.0_210.0.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..9adaea8c1071a4b2e1caab23223b057bc8a9d052
--- /dev/null
+++ b/data/rrTIeJRVGjg_60.0_210.0.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efe6f48a49963bd4880ef5065840e05dd25e2aa975870140bcdaf4220bbd2827
+size 11410412
diff --git a/data/yId2wIocTys_210.0_360.0.mp4 b/data/yId2wIocTys_210.0_360.0.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..ad9d951b134ed06f0473f0cd5da48bdd404d66a5
--- /dev/null
+++ b/data/yId2wIocTys_210.0_360.0.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:447fcb1fd1f94ed6a88d56dd0f6f859646cb8c58ed8e3b7a82f374e2cfee1646
+size 14769130
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..77959fa0157d314d3505add89f3372c8e7a27662
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,26 @@
+accelerate==1.2.1
+decord==0.6.0
+gradio==4.44.1
+pandas==2.2.3
+peft==0.14.0
+pysrt==1.1.2
+scikit-image==0.25.0
+scikit-learn==1.6.1
+sentencepiece==0.2.0
+termplotlib==0.3.9
+triton==3.0.0
+
+# our codebase contains necessary patches for 4.45.2
+transformers==4.45.2
+
+# https://github.com/microsoft/DeepSpeed/issues/6793
+deepspeed==0.15.4
+
+# https://github.com/pytorch/pytorch/issues/138386
+torch==2.4.1
+torchvision==0.19.1
+
+# torch-npu only supports torch 2.4.0
+# torch==2.4.0+cpu
+# torch-npu==2.4.0.post2
+# torchvision==0.19.0+cpu
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..3e1da030391c89b484ebd58e303514e1e409af80
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,16 @@
+[yapf]
+column_limit = 120
+based_on_style = pep8
+blank_line_before_nested_class_or_def = true
+split_before_expression_after_opening_paren = true
+
+[isort]
+line_length = 120
+multi_line_output = 0
+known_third_party = decord,deepspeed,gradio,huggingface_hub,nncore,numpy,pandas,peft,PIL,pysrt,safetensors,tabulate,termplotlib,torch,torchvision,transformers
+no_lines_before = STDLIB,LOCALFOLDER
+default_section = FIRSTPARTY
+
+[flake8]
+max-line-length = 500
+extend-ignore = E741
diff --git a/videomind/constants.py b/videomind/constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..31da716b64740f8e5574765328c67ee7f42ce1e9
--- /dev/null
+++ b/videomind/constants.py
@@ -0,0 +1,42 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+IGNORE_INDEX = -100
+
+REG_TOKEN = '<|reg|>'
+
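+# segment boundary tokens wrapped around the candidate moment that the verifier
+# is asked to check (see VERIFIER_PROMPT below)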
+SEG_S_TOKEN = '<|seg_start|>'
+SEG_E_TOKEN = '<|seg_end|>'
+
+PLANNER_PROMPT = (
+ 'You are acting as the planner now. '
+ 'Given a question about the video, your task is to analyze the question and identify the best way to answer this question. '
+ 'You have access to the following tools:\n\n'
+    'Grounder: Accepts a text query and localizes the relevant video segment according to the query.\n'
+ 'Verifier: A tool supporting grounder by verifying the reliability of its outputs.\n'
+ 'Answerer: Answer a given question directly based on the whole video or a cropped video segment.\n\n'
+ 'Your response must be a list in JSON format. '
+    'A valid plan for reasoning could be "grounder, verifier, answerer", "grounder, verifier", or "answerer", depending on the given question. '
+ 'Please see an example for the format below.\n\n'
+ '[{{"type": "grounder", "value": ""}}, {{"type": "verifier"}}, {{"type": "answerer"}}]\n\n'
+ 'Note that only the grounder can accept an argument called "value", which is the text query used for grounding. '
+ "Now I give you the question: '{}'. "
+ 'Please think carefully and respond with your plan in JSON directly.')
+
+GROUNDER_PROMPT = (
+ 'You are acting as the grounder now. '
+ 'Given a video and a text query, your goal is to temporally localize the video moment described by the query. '
+ 'If the query is directly describing a moment, simply localize it according to its content. '
+ "Otherwise, if the moment is described as 'before/after a pivotal event', you need to determine the actual event it refers to. "
+ 'The localized moment should only cover the target event. '
+ "Now I give you the query: '{}'. "
+ 'Please think carefully and provide your response.')
+
+VERIFIER_PROMPT = (
+ 'You are acting as the verifier now. '
+    'You will be presented with a text query describing a moment that potentially happens in the given video. '
+ f'Your task is to identify whether the video segment between {SEG_S_TOKEN} and {SEG_E_TOKEN} perfectly covers the moment. '
+ f'If the described moment can be seen in the video, please focus on verifying whether the moment starts at {SEG_S_TOKEN} and ends at {SEG_E_TOKEN}. '
+ "Respond with 'Yes' if you think the moment boundaries are correct, otherwise 'No'. "
+ "If the described moment cannot be seen in the video, respond with 'No' directly. "
+ "Now I give you the query: '{}'. "
+ "Please think carefully and respond with 'Yes' or 'No' directly.")
diff --git a/videomind/conversation.py b/videomind/conversation.py
new file mode 100644
index 0000000000000000000000000000000000000000..04e8f645099b8cc0838ac8e1acde1180a771158e
--- /dev/null
+++ b/videomind/conversation.py
@@ -0,0 +1,49 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+from dataclasses import dataclass
+from typing import List
+
+
+@dataclass
+class Conversation:
+ style: str
+ system: str
+ roles: List[str]
+ seps: List[str]
+ messages: List[str]
+
+ def append_message(self, role, msg):
+ self.messages.append([role, msg])
+
+ def clear(self):
+ self.messages = []
+
+ def get_prompt(self):
+ assert self.style in ('chatml', )
+
+ prompt = self.system + self.seps[0] if self.system is not None else ''
+
+ for i, (role, msg) in enumerate(self.messages):
+ prompt += role
+ sep = self.seps[i % 2]
+ if msg is not None:
+ prompt += msg
+ if not prompt.endswith(sep):
+ prompt += sep
+
+ prompt = prompt.lstrip('\n')
+ return prompt
+
+
+def get_conv(conv_type):
+ if conv_type == 'chatml':
+ conv = Conversation(
+ style='chatml',
+ system='<|im_start|>system\nYou are a helpful assistant.',
+ roles=('\n<|im_start|>user\n', '\n<|im_start|>assistant\n'),
+ seps=('<|im_end|>', '<|im_end|>'),
+ messages=[])
+ else:
+ raise ValueError(f'unknown conversation type: {conv_type}')
+
+ return conv
diff --git a/videomind/dataset/__init__.py b/videomind/dataset/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7a88064ccc36a04f9e565bfe7ba1d850509c655
--- /dev/null
+++ b/videomind/dataset/__init__.py
@@ -0,0 +1,61 @@
+from .collator import HybridDataCollator
+from .hybrid import HybridDataset
+from .sub_classes import (ActivitynetCaptionsBiasDataset, ActivitynetCaptionsDataset, ActivitynetRTLDataset,
+ CGBenchDataset, CharadesSTADataset, CosMoCapDataset, DiDeMoDataset, Ego4DNaQDataset,
+ Ego4DNLQDataset, EgoTimeQACropDataset, EgoTimeQADataset, EgoTimeQAGroundingDataset,
+ HiRESTGroundingDataset, HiRESTStepBiasDataset, HiRESTStepDataset, InternVidVTimeDataset,
+ LongVideoBenchDataset, LVBenchDataset, MLVUDataset, MVBenchDataset, NExTGQACropDataset,
+ NExTGQADataset, NExTGQAGroundingDataset, NExTQADataset, QAEgo4DCropDataset, QAEgo4DDataset,
+ QAEgo4DGroundingDataset, QuerYDDataset, QVHighlightsDataset, ReXTimeCropDataset,
+ ReXTimeDataset, ReXTimeGroundingDataset, STARDataset, TACoSDataset, VideoMMEDataset,
+ VideoXumDataset, VidMorpDataset, YouCook2BiasDataset, YouCook2Dataset)
+from .wrappers import AnsweringCropDataset, AnsweringDataset, GroundingDataset, PlanningDataset, VerifyingDataset
+
+__all__ = [
+ 'HybridDataCollator',
+ 'HybridDataset',
+ 'ActivitynetCaptionsBiasDataset',
+ 'ActivitynetCaptionsDataset',
+ 'ActivitynetRTLDataset',
+ 'CGBenchDataset',
+ 'CharadesSTADataset',
+ 'CosMoCapDataset',
+ 'DiDeMoDataset',
+ 'Ego4DNaQDataset',
+ 'Ego4DNLQDataset',
+ 'EgoTimeQACropDataset',
+ 'EgoTimeQADataset',
+ 'EgoTimeQAGroundingDataset',
+ 'HiRESTGroundingDataset',
+ 'HiRESTStepBiasDataset',
+ 'HiRESTStepDataset',
+ 'InternVidVTimeDataset',
+ 'LongVideoBenchDataset',
+ 'LVBenchDataset',
+ 'MLVUDataset',
+ 'MVBenchDataset',
+ 'NExTGQACropDataset',
+ 'NExTGQADataset',
+ 'NExTGQAGroundingDataset',
+ 'NExTQADataset',
+ 'QAEgo4DCropDataset',
+ 'QAEgo4DDataset',
+ 'QAEgo4DGroundingDataset',
+ 'QuerYDDataset',
+ 'QVHighlightsDataset',
+ 'ReXTimeCropDataset',
+ 'ReXTimeDataset',
+ 'ReXTimeGroundingDataset',
+ 'STARDataset',
+ 'TACoSDataset',
+ 'VideoMMEDataset',
+ 'VideoXumDataset',
+ 'VidMorpDataset',
+ 'YouCook2BiasDataset',
+ 'YouCook2Dataset',
+ 'AnsweringCropDataset',
+ 'AnsweringDataset',
+ 'GroundingDataset',
+ 'PlanningDataset',
+ 'VerifyingDataset',
+]
diff --git a/videomind/dataset/collator.py b/videomind/dataset/collator.py
new file mode 100644
index 0000000000000000000000000000000000000000..de6d2f34d36c9a1b1970d6f5035b5f10e1e3557f
--- /dev/null
+++ b/videomind/dataset/collator.py
@@ -0,0 +1,40 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import warnings
+
+import torch
+from torch.nn.utils.rnn import pad_sequence
+
+from videomind.constants import IGNORE_INDEX
+
+
+class HybridDataCollator(object):
+
+ def __init__(self, tokenizer):
+ self.tokenizer = tokenizer
+
+ def __call__(self, batch):
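+        # right-pad input_ids with the pad token and labels with IGNORE_INDEX so
+        # that padded positions are masked out of the loss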
+ input_ids = [d['input_ids'] for d in batch]
+ input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
+
+ labels = [d['labels'] for d in batch]
+ labels = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
+
+ assert input_ids.size() == labels.size()
+
+ seq_len, max_len = input_ids.size(1), self.tokenizer.model_max_length
+ if seq_len > max_len:
+            warnings.warn(f'Input sequence length exceeds model max length: {seq_len} > {max_len}')
+ input_ids, labels = input_ids[:, :max_len], labels[:, :max_len]
+
+ data = dict(input_ids=input_ids, labels=labels, attention_mask=input_ids != self.tokenizer.pad_token_id)
+
+ for key in ('pixel_values', 'pixel_values_videos', 'image_grid_thw', 'video_grid_thw'):
+ if key in batch[0]:
+ data[key] = torch.cat([d[key] for d in batch])
+
+ for key in ('timestamps', 'saliency', 'pos_clip'):
+ if key in batch[0]:
+ data[key] = [d[key] for d in batch]
+
+ return data
diff --git a/videomind/dataset/hybrid.py b/videomind/dataset/hybrid.py
new file mode 100644
index 0000000000000000000000000000000000000000..69f7ea6677f0d6cc057a31642995b7a1b4762739
--- /dev/null
+++ b/videomind/dataset/hybrid.py
@@ -0,0 +1,180 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import math
+import random
+from collections import defaultdict
+from itertools import accumulate
+
+import nncore
+import numpy as np
+import termplotlib as tpl
+import torch
+from tabulate import tabulate
+from torch.utils.data import Dataset
+
+from videomind.constants import IGNORE_INDEX
+from videomind.dataset.utils import preprocess, process_vision_info
+from videomind.utils.parser import parse_span
+
+DATASETS = nncore.Registry('datasets')
+
+
+class HybridDataset(Dataset):
+
+ def __init__(self, processor, model_config, model_args, data_args, training_args):
+ super().__init__()
+
+ datasets = []
+ for key in data_args.datasets.split(','):
+ datasets.append(DATASETS.get(key)(processor, model_args, data_args, training_args))
+
+ data_types = [a['data_type'] for d in datasets for a in d.annos]
+
+ cum_length = [0] + list(accumulate([len(d) for d in datasets]))
+ idx_ranges = [[cum_length[i], cum_length[i + 1]] for i in range(len(cum_length) - 1)]
+
+ if training_args.local_rank in (0, -1):
+ raw_length = sum(d.raw_length for d in datasets)
+ cur_length = idx_ranges[-1][-1]
+
+ ratio = round(cur_length / raw_length * 100, 2)
+ print(f'Number of samples: {raw_length} (original) -> {cur_length} (filtered) {ratio}%')
+
+ data_type_cnt = ' '.join([f'{data_types.count(t)} ({t})' for t in list(set(data_types))])
+ print(f'Data types: {data_type_cnt}')
+
+ tab = defaultdict(int)
+ for dataset in datasets:
+ for anno in dataset.annos:
+ tab[anno.get('source', 'unknown')] += 1
+
+ tab = [[k, v, round(v / cur_length, 3)] for k, v in tab.items()]
+ print(tabulate(tab, headers=['Source', '#Samples', 'Ratio'], tablefmt='pretty', stralign='left'))
+
+ d, _ = torch.Tensor([a['duration'] for d in datasets for a in d.annos if 'duration' in a]).sort()
+ if d.size(0) > 0:
+ n, r = min(d.size(0), 10), d.flip(0)
+ print(f'Top-{n} max video durations: {[round(r[i].item(), 1) for i in range(n)]}')
+ print(f'Top-{n} min video durations: {[round(d[i].item(), 1) for i in range(n)]}')
+ print(f'Average video duration ({d.size(0)} samples): {round(d.mean().item(), 1)}s')
+
+ print('Video duration histogram:')
+ counts, edges = np.histogram(d)
+ labels = [f'{edges[i]:.2f}s - {edges[i + 1]:.2f}s' for i in range(len(edges) - 1)]
+ fig = tpl.figure()
+ fig.barh(counts, labels)
+ fig.show()
+
+ d, _ = torch.Tensor([abs(b[0] - b[1]) for d in datasets for a in d.annos if 'span' in a
+ for b in a['span']]).sort()
+ if d.size(0) > 0:
+ n, r = min(d.size(0), 10), d.flip(0)
+ print(f'Top-{n} max span durations: {[round(r[i].item(), 1) for i in range(n)]}')
+ print(f'Top-{n} min span durations: {[round(d[i].item(), 1) for i in range(n)]}')
+ print(f'Average span duration ({d.size(0)} samples): {round(d.mean().item(), 1)}s')
+
+ print('Span duration histogram:')
+ counts, edges = np.histogram(d)
+ labels = [f'{edges[i]:.2f}s - {edges[i + 1]:.2f}s' for i in range(len(edges) - 1)]
+ fig = tpl.figure()
+ fig.barh(counts, labels)
+ fig.show()
+
+ self.datasets = datasets
+ self.data_types = data_types
+ self.idx_ranges = idx_ranges
+ self.processor = processor
+ self.model_config = model_config
+ self.model_args = model_args
+ self.data_args = data_args
+ self.training_args = training_args
+
+ def __len__(self):
+ return self.idx_ranges[-1][-1]
+
+ def __getitem__(self, idx):
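+        # if a sample fails to load, retry with another random sample of the same data type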
+ for retry in range(self.data_args.max_retries + 1):
+ try:
+ return self.fetch_data(idx)
+ except Exception as e:
+ print(f'Error in loading {idx}: {type(e).__name__}({e})')
+ idx = random.choice([i for i, t in enumerate(self.data_types) if t == self.data_types[idx]])
+
+ raise RuntimeError(f'Data loading failed after {retry} retries')
+
+ def map(self, *args, **kwargs):
+ return self
+
+ def fetch_data(self, idx):
+ for (s, e), dataset in zip(self.idx_ranges, self.datasets):
+ if s <= idx < e:
+ meta = dataset[idx - s]
+ break
+
+ text = self.processor.apply_chat_template(meta['messages'])
+ text = [text.strip()]
+
+ images, videos = process_vision_info(meta['messages'], sanity_check=True)
+
+ data = self.processor(text=text, images=images, videos=videos, return_tensors='pt')
+ assert data['input_ids'].size(0) == 1
+
+ data['input_ids'] = data['input_ids'][0]
+ data['labels'] = preprocess(data['input_ids'], text[0], self.processor.tokenizer, self.model_args.conv_type)
+
+ # insert segment start/end tokens
+ if 'ss' in meta and 'se' in meta:
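+            # 'ss' / 'se' are normalized (0-1) positions of the target segment; map
+            # them to visual token indices and insert the boundary tokens there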
+ video_grid_thw = data['video_grid_thw'][0]
+ num_frames, window = int(video_grid_thw[0]), int(video_grid_thw[1] * video_grid_thw[2] / 4)
+ assert num_frames * window * 4 == data['pixel_values_videos'].size(0)
+
+ pos_s, pos_e = round(meta['ss'] * num_frames), round(meta['se'] * num_frames)
+ pos_s, pos_e = min(max(0, pos_s), num_frames), min(max(0, pos_e), num_frames)
+ assert pos_s <= pos_e, (num_frames, meta['ss'], meta['se'])
+
+ base_idx = torch.nonzero(data['input_ids'] == self.model_config.vision_start_token_id).item()
+ pos_s, pos_e = pos_s * window + base_idx + 1, pos_e * window + base_idx + 2
+
+ input_ids = data['input_ids'].tolist()
+ input_ids.insert(pos_s, self.model_config.seg_s_token_id)
+ input_ids.insert(pos_e, self.model_config.seg_e_token_id)
+ data['input_ids'] = torch.LongTensor(input_ids)
+
+ labels = data['labels'].tolist()
+ labels.insert(pos_s, IGNORE_INDEX)
+ labels.insert(pos_e, IGNORE_INDEX)
+ data['labels'] = torch.LongTensor(labels)
+
+ if 'span' in meta:
+ span, duration = meta['span'], meta['duration']
+
+ pixel_values_videos, video_grid_thw = data['pixel_values_videos'], data['video_grid_thw']
+ num_frames = int(video_grid_thw[0][0])
+
+ assert video_grid_thw.size(0) == 1
+ assert video_grid_thw.prod() == pixel_values_videos.size(0)
+
+ # actual fps would be 1/2 of config (temporal patch size = 2)
+ fps = num_frames / duration
+
+ safe_span = [parse_span(b, duration, 1 / fps) for b in span]
+
+ # num_reg_tokens -> num_bnds -> s & e
+ timestamps = [[[s / duration, e / duration] for s, e in safe_span]]
+
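+            # per-frame binary saliency labels over the target span(s); pos_clip is
+            # one randomly sampled positive frame index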
+ saliency, pos_inds = torch.zeros(num_frames), []
+ for s, e in safe_span:
+ span_ind = max(0, s * fps), min(e * fps, num_frames)
+ pos_inds = list(range(math.ceil(span_ind[0]), math.ceil(span_ind[1])))
+ assert len(pos_inds) > 0, f'empty pos_inds ({idx}): {fps} {num_frames} {duration} {span}'
+ saliency[pos_inds] = 1
+
+ assert saliency.any(), f'empty saliency ({idx}): {pos_inds} {fps} {num_frames} {duration} {span}'
+ pos_clip = random.sample(saliency.nonzero()[:, 0].tolist(), 1)
+ pos_clip = torch.LongTensor(pos_clip)
+
+ data['timestamps'] = timestamps
+ data['saliency'] = saliency
+ data['pos_clip'] = pos_clip
+
+ return data
diff --git a/videomind/dataset/sub_classes/__init__.py b/videomind/dataset/sub_classes/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..54279430775aef6395c99f636cbc2a2ac9d0d883
--- /dev/null
+++ b/videomind/dataset/sub_classes/__init__.py
@@ -0,0 +1,69 @@
+from .activitynet_captions import ActivitynetCaptionsBiasDataset, ActivitynetCaptionsDataset
+from .activitynet_rtl import ActivitynetRTLDataset
+from .cgbench import CGBenchDataset
+from .charades_sta import CharadesSTADataset
+from .cosmo_cap import CosMoCapDataset
+from .didemo import DiDeMoDataset
+from .ego4d_naq import Ego4DNaQDataset
+from .ego4d_nlq import Ego4DNLQDataset
+from .ego_timeqa import EgoTimeQACropDataset, EgoTimeQADataset, EgoTimeQAGroundingDataset
+from .hirest import HiRESTGroundingDataset, HiRESTStepBiasDataset, HiRESTStepDataset
+from .internvit_vtime import InternVidVTimeDataset
+from .longvideobench import LongVideoBenchDataset
+from .lvbench import LVBenchDataset
+from .mlvu import MLVUDataset
+from .mvbench import MVBenchDataset
+from .nextgqa import NExTGQACropDataset, NExTGQADataset, NExTGQAGroundingDataset
+from .nextqa import NExTQADataset
+from .qa_ego4d import QAEgo4DCropDataset, QAEgo4DDataset, QAEgo4DGroundingDataset
+from .queryd import QuerYDDataset
+from .qvhighlights import QVHighlightsDataset
+from .rextime import ReXTimeCropDataset, ReXTimeDataset, ReXTimeGroundingDataset
+from .star import STARDataset
+from .tacos import TACoSDataset
+from .vid_morp import VidMorpDataset
+from .videomme import VideoMMEDataset
+from .videoxum import VideoXumDataset
+from .youcook2 import YouCook2BiasDataset, YouCook2Dataset
+
+__all__ = [
+ 'ActivitynetCaptionsBiasDataset',
+ 'ActivitynetCaptionsDataset',
+ 'ActivitynetRTLDataset',
+ 'CGBenchDataset',
+ 'CharadesSTADataset',
+ 'CosMoCapDataset',
+ 'DiDeMoDataset',
+ 'Ego4DNaQDataset',
+ 'Ego4DNLQDataset',
+ 'EgoTimeQACropDataset',
+ 'EgoTimeQADataset',
+ 'EgoTimeQAGroundingDataset',
+ 'HiRESTGroundingDataset',
+ 'HiRESTStepBiasDataset',
+ 'HiRESTStepDataset',
+ 'InternVidVTimeDataset',
+ 'LongVideoBenchDataset',
+ 'LVBenchDataset',
+ 'MLVUDataset',
+ 'MVBenchDataset',
+ 'NExTGQACropDataset',
+ 'NExTGQADataset',
+ 'NExTGQAGroundingDataset',
+ 'NExTQADataset',
+ 'QAEgo4DCropDataset',
+ 'QAEgo4DDataset',
+ 'QAEgo4DGroundingDataset',
+ 'QuerYDDataset',
+ 'QVHighlightsDataset',
+ 'ReXTimeCropDataset',
+ 'ReXTimeDataset',
+ 'ReXTimeGroundingDataset',
+ 'STARDataset',
+ 'TACoSDataset',
+ 'VidMorpDataset',
+ 'VideoMMEDataset',
+ 'VideoXumDataset',
+ 'YouCook2BiasDataset',
+ 'YouCook2Dataset',
+]
diff --git a/videomind/dataset/sub_classes/activitynet_captions.py b/videomind/dataset/sub_classes/activitynet_captions.py
new file mode 100644
index 0000000000000000000000000000000000000000..84487635903dbf4665e203d143d06ed82b338228
--- /dev/null
+++ b/videomind/dataset/sub_classes/activitynet_captions.py
@@ -0,0 +1,96 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+from collections import OrderedDict
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import GroundingDataset
+from videomind.utils.parser import parse_query
+
+
+@DATASETS.register(name='activitynet_captions')
+class ActivitynetCaptionsDataset(GroundingDataset):
+
+ ANNO_PATH_TRAIN = 'data/activitynet_captions/train.json'
+ ANNO_PATH_VALID = 'data/activitynet_captions/val_1.json'
+ ANNO_PATH_TEST = 'data/activitynet_captions/val_2.json'
+
+ VIDEO_ROOT = 'data/activitynet/videos_3fps_480_noaudio'
+ DURATIONS = 'data/activitynet/durations.json'
+
+ UNIT = 0.01
+
+ @classmethod
+ def load_annos(self, split='train'):
+ if split == 'train':
+ raw_annos = nncore.load(self.ANNO_PATH_TRAIN, object_pairs_hook=OrderedDict)
+ elif split == 'valid':
+ raw_annos = nncore.load(self.ANNO_PATH_VALID, object_pairs_hook=OrderedDict)
+ else:
+ raw_annos = nncore.load(self.ANNO_PATH_TEST, object_pairs_hook=OrderedDict)
+
+ durations = nncore.load(self.DURATIONS)
+
+ annos = []
+ for vid, raw_anno in raw_annos.items():
+ for query, span in zip(raw_anno['sentences'], raw_anno['timestamps']):
+ anno = dict(
+ source='activitynet_captions',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+ duration=durations[vid],
+ query=parse_query(query),
+ span=[span])
+
+ annos.append(anno)
+
+ return annos
+
+
+@DATASETS.register(name='activitynet_captions_bias')
+class ActivitynetCaptionsBiasDataset(ActivitynetCaptionsDataset):
+
+ @classmethod
+ def load_annos(self, split='train'):
+ if split == 'train':
+ raw_annos = nncore.load(self.ANNO_PATH_TRAIN, object_pairs_hook=OrderedDict)
+ elif split == 'valid':
+ raw_annos = nncore.load(self.ANNO_PATH_VALID, object_pairs_hook=OrderedDict)
+ else:
+ raw_annos = nncore.load(self.ANNO_PATH_TEST, object_pairs_hook=OrderedDict)
+
+ durations = nncore.load(self.DURATIONS)
+
+ annos = []
+ for vid, raw_anno in raw_annos.items():
+ assert len(raw_anno['sentences']) == len(raw_anno['timestamps'])
+
+ for i in range(len(raw_anno['sentences']) - 1):
+ span_a = raw_anno['timestamps'][i]
+ span_b = raw_anno['timestamps'][i + 1]
+
+ if span_b[0] - span_a[1] < 3:
+ query_a = parse_query(f"The moment before {raw_anno['sentences'][i + 1]}")
+ query_b = parse_query(f"The moment after {raw_anno['sentences'][i]}")
+
+ anno_a = dict(
+ source='activitynet_captions_bias',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+ duration=durations[vid],
+ query=query_a,
+ span=[span_a])
+
+ anno_b = dict(
+ source='activitynet_captions_bias',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+ duration=durations[vid],
+ query=query_b,
+ span=[span_b])
+
+ annos.append(anno_a)
+ annos.append(anno_b)
+
+ return annos
diff --git a/videomind/dataset/sub_classes/activitynet_rtl.py b/videomind/dataset/sub_classes/activitynet_rtl.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d6e892d9739c8b511ce3aa493cfeda09c37f0c4
--- /dev/null
+++ b/videomind/dataset/sub_classes/activitynet_rtl.py
@@ -0,0 +1,68 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import re
+from collections import OrderedDict
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import GroundingDataset
+from videomind.utils.parser import parse_query
+
+
+@DATASETS.register(name='activitynet_rtl')
+class ActivitynetRTLDataset(GroundingDataset):
+
+ ANNO_PATH_TRAIN = 'data/activitynet_rtl/activitynet_train_gpt-4-0613_temp_6_f10009.json'
+ ANNO_PATH_TEST = 'data/activitynet_rtl/annot_val_1_q229.json'
+
+ VIDEO_ROOT = 'data/activitynet/videos_3fps_480_noaudio'
+
+ UNIT = 0.01
+
+ @classmethod
+ def load_annos(self, split='train'):
+ if split == 'train':
+ raw_annos = nncore.load(self.ANNO_PATH_TRAIN, object_pairs_hook=OrderedDict)
+
+ annos = []
+ for vid, raw_anno in raw_annos.items():
+ for meta in raw_anno['QA']:
+ match = re.findall(r'<(\d+(\.\d+)?)>', meta['a'])
+ span = [float(m[0]) for m in match[:2]]
+
+ # some samples do not have timestamps
+ if len(span) != 2:
+ continue
+
+ anno = dict(
+ source='activitynet_rtl',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+ duration=raw_anno['duration'],
+ query=parse_query(meta['q']),
+ span=[span])
+
+ annos.append(anno)
+ else:
+ raw_annos = nncore.load(self.ANNO_PATH_TEST, object_pairs_hook=OrderedDict)
+
+ annos = []
+ for raw_anno in raw_annos:
+ vid = f"v_{raw_anno['vid']}"
+
+ match = re.findall(r'<(\d+(\.\d+)?)>', raw_anno['answer'])
+ span = [float(m[0]) for m in match[:2]]
+ assert len(span) == 2
+
+ anno = dict(
+ source='activitynet_rtl',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+ duration=raw_anno['duration'],
+ query=parse_query(raw_anno['question']),
+ span=[span])
+
+ annos.append(anno)
+
+ return annos
diff --git a/videomind/dataset/sub_classes/cgbench.py b/videomind/dataset/sub_classes/cgbench.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3130038172c971b28e114f67615fbd81a863e4e
--- /dev/null
+++ b/videomind/dataset/sub_classes/cgbench.py
@@ -0,0 +1,47 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import nncore
+from torch.utils.data import Dataset
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.utils.parser import parse_query, parse_question
+
+
+@DATASETS.register(name='cgbench')
+class CGBenchDataset(Dataset):
+
+ ANNO_PATH_TEST = 'data/cgbench/cgbench_mini.json'
+
+ VIDEO_ROOT = 'data/cgbench/videos_3fps_480_noaudio'
+ SUBTITLE_ROOT = 'data/cgbench/subtitles'
+
+ UNIT = 0.001
+
+ @classmethod
+ def load_annos(self, split='test'):
+ assert split == 'test'
+
+ raw_annos = nncore.load(self.ANNO_PATH_TEST)
+
+ annos = []
+ for raw_anno in raw_annos:
+ vid = raw_anno['video_uid']
+
+ anno = dict(
+ source='cgbench',
+ data_type='multimodal',
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+ subtitle_path=nncore.join(self.SUBTITLE_ROOT, vid + '.srt'),
+ duration=raw_anno['duration'],
+ query=parse_query(raw_anno['question']),
+ question=parse_question(raw_anno['question']),
+ options=[o.capitalize() for o in raw_anno['choices']],
+ answer=raw_anno['answer'].capitalize(),
+ ans=raw_anno['right_answer'],
+ span=raw_anno['clue_intervals'],
+ task=raw_anno['sub_category'],
+ domain=raw_anno['domain'])
+
+ annos.append(anno)
+
+ return annos
diff --git a/videomind/dataset/sub_classes/charades_sta.py b/videomind/dataset/sub_classes/charades_sta.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3f4448e29a32fe69363da3efec5dbbeabde9bf7
--- /dev/null
+++ b/videomind/dataset/sub_classes/charades_sta.py
@@ -0,0 +1,45 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import GroundingDataset
+from videomind.utils.parser import parse_query
+
+
+@DATASETS.register(name='charades_sta')
+class CharadesSTADataset(GroundingDataset):
+
+ ANNO_PATH_TRAIN = 'data/charades_sta/charades_sta_train.txt'
+ ANNO_PATH_TEST = 'data/charades_sta/charades_sta_test.txt'
+
+ VIDEO_ROOT = 'data/charades_sta/videos_3fps_480_noaudio'
+ DURATIONS = 'data/charades_sta/durations.json'
+
+ UNIT = 0.1
+
+ @classmethod
+ def load_annos(self, split='train'):
+ if split == 'train':
+ raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
+ else:
+ raw_annos = nncore.load(self.ANNO_PATH_TEST)
+
+ durations = nncore.load(self.DURATIONS)
+
+ annos = []
+ for raw_anno in raw_annos:
+ info, query = raw_anno.split('##')
+ vid, s, e = info.split()
+
+ anno = dict(
+ source='charades_sta',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+ duration=durations[vid],
+ query=parse_query(query),
+ span=[[float(s), float(e)]])
+
+ annos.append(anno)
+
+ return annos
diff --git a/videomind/dataset/sub_classes/cosmo_cap.py b/videomind/dataset/sub_classes/cosmo_cap.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e7fea88b2d57310db72429a1300d1aa26c67908
--- /dev/null
+++ b/videomind/dataset/sub_classes/cosmo_cap.py
@@ -0,0 +1,37 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import GroundingDataset
+from videomind.utils.parser import parse_query
+
+
+@DATASETS.register(name='cosmo_cap')
+class CosMoCapDataset(GroundingDataset):
+
+ ANNO_PATH = 'data/cosmo_cap/anno_cosmo_cap.jsonl'
+
+ VIDEO_ROOT = 'data/cosmo_cap/videos_3fps_480_noaudio'
+
+ UNIT = 1.0
+
+ @classmethod
+ def load_annos(self, split='train'):
+ assert split == 'train'
+
+ raw_annos = nncore.load(self.ANNO_PATH)
+
+ annos = []
+ for raw_anno in raw_annos:
+ anno = dict(
+ source='cosmo_cap',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, raw_anno['vid'] + '.mp4'),
+ duration=raw_anno['duration'],
+ query=parse_query(raw_anno['query']),
+ span=[raw_anno['span']])
+
+ annos.append(anno)
+
+ return annos
diff --git a/videomind/dataset/sub_classes/didemo.py b/videomind/dataset/sub_classes/didemo.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6bdba5eb6b62c52bc57207f9b487cf10d5378c6
--- /dev/null
+++ b/videomind/dataset/sub_classes/didemo.py
@@ -0,0 +1,59 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import random
+
+import nncore
+import numpy as np
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import GroundingDataset
+from videomind.utils.parser import parse_query
+
+
+@DATASETS.register(name='didemo')
+class DiDeMoDataset(GroundingDataset):
+
+ ANNO_PATH_TRAIN = 'data/didemo/train_data.json'
+ ANNO_PATH_VALID = 'data/didemo/val_data.json'
+ ANNO_PATH_TEST = 'data/didemo/test_data.json'
+
+ VIDEO_ROOT = 'data/didemo/videos_3fps_480_noaudio'
+ DURATIONS = 'data/didemo/durations.json'
+
+ UNIT = 1.0
+
+ @classmethod
+ def load_annos(self, split='train'):
+ if split == 'train':
+ raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
+ elif split == 'valid':
+ raw_annos = nncore.load(self.ANNO_PATH_VALID)
+ else:
+ raw_annos = nncore.load(self.ANNO_PATH_TEST)
+
+ durations = nncore.load(self.DURATIONS)
+
+ annos = []
+ for raw_anno in raw_annos:
+ vid = raw_anno['video'].split('.')[0]
+
+ # apply mean on multiple spans
+ span = np.array(raw_anno['times']).mean(axis=0).tolist()
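+            # DiDeMo times are indices of 5-second segments (end index inclusive); convert them to seconds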
+ span = [round(span[0] * 5), round((span[1] + 1) * 5)]
+
+ # augment spans during training
+ if split == 'train':
+ offset = random.randint(-2, 2)
+ span = [span[0] + offset, span[1] + offset]
+
+ anno = dict(
+ source='didemo',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+ duration=durations[vid],
+ query=parse_query(raw_anno['description']),
+ span=[span])
+
+ annos.append(anno)
+
+ return annos
diff --git a/videomind/dataset/sub_classes/ego4d_naq.py b/videomind/dataset/sub_classes/ego4d_naq.py
new file mode 100644
index 0000000000000000000000000000000000000000..917e4d1fef969e12f6a6de455f9ed5d25d1c8004
--- /dev/null
+++ b/videomind/dataset/sub_classes/ego4d_naq.py
@@ -0,0 +1,81 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+from collections import OrderedDict
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import GroundingDataset
+from videomind.utils.parser import parse_query
+
+
+@DATASETS.register(name='ego4d_naq')
+class Ego4DNaQDataset(GroundingDataset):
+
+ ANNO_PATH_TRAIN = 'data/ego4d_naq/train.json'
+ ANNO_PATH_VALID = 'data/ego4d_naq/val.json'
+ ANNO_PATH_TEST = 'data/ego4d_naq/test.json'
+
+ VIDEO_ROOT = 'data/ego4d/v2/videos_3fps_480_noaudio'
+
+ UNIT = 0.001
+
+ @classmethod
+ def load_annos(self, split='train'):
+ if split == 'train':
+ raw_annos = nncore.load(self.ANNO_PATH_TRAIN, object_pairs_hook=OrderedDict)
+ elif split == 'valid':
+ raw_annos = nncore.load(self.ANNO_PATH_VALID, object_pairs_hook=OrderedDict)
+ else:
+ raw_annos = nncore.load(self.ANNO_PATH_TEST, object_pairs_hook=OrderedDict)
+
+ annos = []
+ for vid, raw_anno in raw_annos.items():
+ duration = raw_anno['num_frames'] / raw_anno['fps']
+
+            # 300s: 254k samples (dropped 121k samples, merged 156k samples)
+            # 480s: 567k samples (dropped 249k samples, merged 328k samples)
+ if split == 'train' and (duration < 10 or duration > 600):
+ continue
+
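+            # group spans by query; queries matching multiple moments are skipped below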
+ meta = dict()
+ for span, query in zip(raw_anno['exact_times'], raw_anno['sentences']):
+ span = [round(span[0], 3), round(span[1], 3)]
+
+ query = parse_query(query)
+
+ # these annotations might be from nlq
+ nlq_keys = ('who', 'what', 'when', 'in what', 'did', 'where', 'how', 'i what')
+ if split == 'train' and any(query.startswith(k) for k in nlq_keys):
+ continue
+
+ # bad samples
+ if split == 'train' and '#unsure' in query:
+ continue
+
+ # too short or too long samples
+ num_words = len(query.split(' '))
+ if split == 'train' and (num_words < 3 or num_words > 30):
+ continue
+
+ if query not in meta:
+ meta[query] = []
+
+ meta[query].append(span)
+
+ for query, span in meta.items():
+ # skip samples with multiple moments
+ if len(span) > 1:
+ continue
+
+ anno = dict(
+ source='ego4d_naq',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+ duration=duration,
+ query=query,
+ span=span)
+
+ annos.append(anno)
+
+ return annos
diff --git a/videomind/dataset/sub_classes/ego4d_nlq.py b/videomind/dataset/sub_classes/ego4d_nlq.py
new file mode 100644
index 0000000000000000000000000000000000000000..936c1f2b427e2023c37d9d8c839f906dee8e9dbc
--- /dev/null
+++ b/videomind/dataset/sub_classes/ego4d_nlq.py
@@ -0,0 +1,41 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import GroundingDataset
+from videomind.utils.parser import parse_query
+
+
+@DATASETS.register(name='ego4d_nlq')
+class Ego4DNLQDataset(GroundingDataset):
+
+ ANNO_PATH_TRAIN = 'data/ego4d_nlq/nlq_train.jsonl'
+ ANNO_PATH_VALID = 'data/ego4d_nlq/nlq_val.jsonl'
+
+ VIDEO_ROOT = 'data/ego4d/v2/videos_3fps_480_noaudio'
+
+ UNIT = 0.001
+
+ @classmethod
+ def load_annos(self, split='train'):
+ if split == 'train':
+ raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
+ else:
+ raw_annos = nncore.load(self.ANNO_PATH_VALID)
+
+ annos = []
+ for raw_anno in raw_annos:
+ assert len(raw_anno['relevant_windows']) == 1
+
+ anno = dict(
+ source='ego4d_nlq',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, raw_anno['vid'] + '.mp4'),
+ duration=raw_anno['duration'],
+ query=parse_query(raw_anno['query']),
+ span=raw_anno['relevant_windows'])
+
+ annos.append(anno)
+
+ return annos
diff --git a/videomind/dataset/sub_classes/ego_timeqa.py b/videomind/dataset/sub_classes/ego_timeqa.py
new file mode 100644
index 0000000000000000000000000000000000000000..521b02dfa13e95c9788b4bed23b3f00f6d744621
--- /dev/null
+++ b/videomind/dataset/sub_classes/ego_timeqa.py
@@ -0,0 +1,93 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import random
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import AnsweringCropDataset, AnsweringDataset, GroundingDataset
+from videomind.utils.parser import parse_query, parse_question
+
+
+@DATASETS.register(name='ego_timeqa')
+class EgoTimeQADataset(AnsweringDataset):
+
+ ANNO_PATH_TRAIN = 'data/ego_timeqa/annotations.EgoTimeQA.json'
+
+ VIDEO_ROOT = 'data/ego4d/v2/videos_3fps_480_noaudio'
+ DURATIONS = 'data/ego4d/v2/durations.json'
+
+ SOURCE = 'ego_timeqa'
+ DATA_TYPE = 'multimodal'
+
+ UNIT = 0.001
+
+ @classmethod
+ def load_annos(self, split='train'):
+ assert split == 'train'
+
+ raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
+ durations = nncore.load(self.DURATIONS)
+
+ annos = []
+ for raw_anno in raw_annos:
+ vid = raw_anno['video_id']
+
+ duration = durations[vid]
+
+ # 303k -> 284k (to be verified)
+ if duration < 10 or duration > 600:
+ continue
+
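+            # convert frame indices to seconds, assuming the videos are 30 fps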
+ span = [raw_anno['moment_start_frame'] / 30, raw_anno['moment_end_frame'] / 30]
+ span = [round(span[0], 3), round(span[1], 3)]
+
+ # this would remove many samples (284k -> 37k)
+ # if span[1] - span[0] < 2:
+ # continue
+
+            question = raw_anno['question'].capitalize().replace(' l ', ' I ')
+ question = parse_question(question)
+ query = parse_query(question)
+
+ # too short or too long samples
+ num_words = len(query.split(' '))
+ if split == 'train' and (num_words < 3 or num_words > 30):
+ continue
+
+ answer = raw_anno['answer'].capitalize()
+
+ assert len(raw_anno['wrong_answers']) == 3
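+            # insert the correct answer at a random position among the three wrong answers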
+ idx = random.randint(0, 3)
+ ans = chr(ord('A') + idx)
+ options = [o.capitalize() for o in raw_anno['wrong_answers']]
+ options.insert(idx, answer)
+
+ anno = dict(
+ source=self.SOURCE,
+ data_type=self.DATA_TYPE,
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+ duration=duration,
+ query=query,
+ question=question,
+ options=options,
+ answer=answer,
+ ans=ans,
+ span=[span])
+
+ annos.append(anno)
+
+ return annos
+
+
+@DATASETS.register(name='ego_timeqa_crop')
+class EgoTimeQACropDataset(AnsweringCropDataset, EgoTimeQADataset):
+
+ SOURCE = 'ego_timeqa_crop'
+
+
+@DATASETS.register(name='ego_timeqa_grounding')
+class EgoTimeQAGroundingDataset(GroundingDataset, EgoTimeQADataset):
+
+ SOURCE = 'ego_timeqa_grounding'
+ DATA_TYPE = 'grounding'
diff --git a/videomind/dataset/sub_classes/hirest.py b/videomind/dataset/sub_classes/hirest.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6681379e88a7005b0f255420b22b4c563219935
--- /dev/null
+++ b/videomind/dataset/sub_classes/hirest.py
@@ -0,0 +1,150 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+from collections import OrderedDict
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import GroundingDataset
+from videomind.utils.parser import parse_query
+
+
+@DATASETS.register(name='hirest_grounding')
+class HiRESTGroundingDataset(GroundingDataset):
+
+ ANNO_PATH_TRAIN = 'data/hirest/all_data_train.json'
+ ANNO_PATH_VALID = 'data/hirest/all_data_val.json'
+
+ VIDEO_ROOT = 'data/hirest/videos_3fps_480_noaudio'
+
+ UNIT = 1.0
+
+ @classmethod
+ def load_annos(self, split='train'):
+ if split == 'train':
+ raw_annos = nncore.load(self.ANNO_PATH_TRAIN, object_pairs_hook=OrderedDict)
+ else:
+ raw_annos = nncore.load(self.ANNO_PATH_VALID, object_pairs_hook=OrderedDict)
+
+ all_videos = nncore.ls(self.VIDEO_ROOT, ext='.mp4')
+ all_videos = set(v[:11] for v in all_videos)
+
+ annos = []
+ for query, videos in raw_annos.items():
+ for video_name, raw_anno in videos.items():
+ if not raw_anno['relevant'] or not raw_anno['clip']:
+ continue
+
+ assert len(raw_anno['bounds']) == 2
+
+ vid = video_name.split('.')[0]
+
+ if vid not in all_videos:
+ continue
+
+ anno = dict(
+ source='hirest_grounding',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, video_name),
+ duration=raw_anno['v_duration'],
+ query=parse_query(query),
+ span=[raw_anno['bounds']])
+
+ annos.append(anno)
+
+ return annos
+
+
+@DATASETS.register(name='hirest_step')
+class HiRESTStepDataset(HiRESTGroundingDataset):
+
+ @classmethod
+ def load_annos(self, split='train'):
+ if split == 'train':
+ raw_annos = nncore.load(self.ANNO_PATH_TRAIN, object_pairs_hook=OrderedDict)
+ else:
+ raw_annos = nncore.load(self.ANNO_PATH_VALID, object_pairs_hook=OrderedDict)
+
+ all_videos = nncore.ls(self.VIDEO_ROOT, ext='.mp4')
+ all_videos = set(v[:11] for v in all_videos)
+
+ annos = []
+ for query, videos in raw_annos.items():
+ for video_name, raw_anno in videos.items():
+ if not raw_anno['relevant'] or not raw_anno['clip'] or len(raw_anno['steps']) == 0:
+ continue
+
+ vid = video_name.split('.')[0]
+
+ if vid not in all_videos:
+ continue
+
+ for step in raw_anno['steps']:
+ assert len(step['absolute_bounds']) == 2
+
+ anno = dict(
+ source='hirest_step',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, video_name),
+ duration=raw_anno['v_duration'],
+ query=parse_query(step['heading']),
+ span=[step['absolute_bounds']])
+
+ annos.append(anno)
+
+ return annos
+
+
+@DATASETS.register(name='hirest_step_bias')
+class HiRESTStepBiasDataset(HiRESTStepDataset):
+
+ @classmethod
+ def load_annos(self, split='train'):
+ if split == 'train':
+ raw_annos = nncore.load(self.ANNO_PATH_TRAIN, object_pairs_hook=OrderedDict)
+ else:
+ raw_annos = nncore.load(self.ANNO_PATH_VALID, object_pairs_hook=OrderedDict)
+
+ all_videos = nncore.ls(self.VIDEO_ROOT, ext='.mp4')
+ all_videos = set(v[:11] for v in all_videos)
+
+ annos = []
+ for query, videos in raw_annos.items():
+ for video_name, raw_anno in videos.items():
+ if not raw_anno['relevant'] or not raw_anno['clip'] or len(raw_anno['steps']) == 0:
+ continue
+
+ vid = video_name.split('.')[0]
+
+ if vid not in all_videos:
+ continue
+
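+                # build boundary-biased 'before'/'after' queries from pairs of adjacent steps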
+ for i in range(len(raw_anno['steps']) - 1):
+ span_a = raw_anno['steps'][i]['absolute_bounds']
+ span_b = raw_anno['steps'][i + 1]['absolute_bounds']
+
+ assert len(span_a) == 2 and len(span_b) == 2 and span_a[1] == span_b[0]
+
+ query_a = parse_query(f"The moment before {raw_anno['steps'][i + 1]['heading']}")
+ query_b = parse_query(f"The moment after {raw_anno['steps'][i]['heading']}")
+
+ anno_a = dict(
+ source='hirest_step_bias',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, video_name),
+ duration=raw_anno['v_duration'],
+ query=query_a,
+ span=[span_a])
+
+ anno_b = dict(
+ source='hirest_step_bias',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, video_name),
+ duration=raw_anno['v_duration'],
+ query=query_b,
+ span=[span_b])
+
+ annos.append(anno_a)
+ annos.append(anno_b)
+
+ return annos
diff --git a/videomind/dataset/sub_classes/internvit_vtime.py b/videomind/dataset/sub_classes/internvit_vtime.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7e5abd90c86bd7447b06b732f2033d73be0f5ab
--- /dev/null
+++ b/videomind/dataset/sub_classes/internvit_vtime.py
@@ -0,0 +1,45 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import GroundingDataset
+from videomind.utils.parser import parse_query
+
+
+@DATASETS.register(name='internvid_vtime')
+class InternVidVTimeDataset(GroundingDataset):
+
+ ANNO_PATH = 'data/internvid_vtime/anno_internvid_vtime_query_gpt4o_mini.jsonl'
+
+ VIDEO_ROOT = 'data/internvid_vtime/videos_crop_3fps_480_noaudio'
+
+ UNIT = 0.1
+
+ @classmethod
+ def load_annos(self, split='train'):
+ assert split == 'train'
+
+ raw_annos = nncore.load(self.ANNO_PATH)
+
+ all_videos = nncore.ls(self.VIDEO_ROOT, ext='.mp4')
+ all_videos = set(v[:11] for v in all_videos)
+
+ annos = []
+ for raw_anno in raw_annos:
+ vid = raw_anno['vid']
+
+ if vid not in all_videos:
+ continue
+
+ anno = dict(
+ source='internvid_vtime',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+ duration=raw_anno['duration'],
+ query=parse_query(raw_anno['query']),
+ span=[raw_anno['span']])
+
+ annos.append(anno)
+
+ return annos
diff --git a/videomind/dataset/sub_classes/longvideobench.py b/videomind/dataset/sub_classes/longvideobench.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba0406555b43d76fbd12ee508c9f55983ded6337
--- /dev/null
+++ b/videomind/dataset/sub_classes/longvideobench.py
@@ -0,0 +1,53 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import nncore
+from torch.utils.data import Dataset
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.utils.parser import parse_query, parse_question
+
+
+@DATASETS.register(name='longvideobench')
+class LongVideoBenchDataset(Dataset):
+
+ ANNO_PATH_VALID = 'data/longvideobench/lvb_val.json'
+ ANNO_PATH_TEST = 'data/longvideobench/lvb_test_wo_gt.json'
+
+ VIDEO_ROOT = 'data/longvideobench/videos_3fps_480_noaudio'
+
+ @classmethod
+ def load_annos(self, split='valid'):
+ if split == 'valid':
+ raw_annos = nncore.load(self.ANNO_PATH_VALID)
+ else:
+ print('WARNING: Test split does not have ground truth annotations')
+ raw_annos = nncore.load(self.ANNO_PATH_TEST)
+
+ annos = []
+ for raw_anno in raw_annos:
+ vid = raw_anno['video_id']
+
+ if vid.startswith('@'):
+ vid = vid[-19:]
+
+            # videos might come from YouTube (11-char ids) or other platforms (19-char ids)
+ assert len(vid) in (11, 19)
+
+ anno = dict(
+ source='longvideobench',
+ data_type='multimodal',
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+ query=parse_query(raw_anno['question']),
+ question=parse_question(raw_anno['question']),
+ options=raw_anno['candidates'],
+ task=str(raw_anno['duration_group']),
+ level=raw_anno['level'],
+ question_category=raw_anno['question_category'])
+
+ if 'correct_choice' in raw_anno:
+ anno['answer'] = raw_anno['candidates'][raw_anno['correct_choice']]
+ anno['ans'] = chr(ord('A') + raw_anno['correct_choice'])
+
+ annos.append(anno)
+
+ return annos
diff --git a/videomind/dataset/sub_classes/lvbench.py b/videomind/dataset/sub_classes/lvbench.py
new file mode 100644
index 0000000000000000000000000000000000000000..0464c31d5dea502381305d128552231484623224
--- /dev/null
+++ b/videomind/dataset/sub_classes/lvbench.py
@@ -0,0 +1,52 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import nncore
+from torch.utils.data import Dataset
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.utils.parser import parse_query, parse_question
+
+
+@DATASETS.register(name='lvbench')
+class LVBenchDataset(Dataset):
+
+ ANNO_PATH = 'data/lvbench/LVBench/video_info.meta.jsonl'
+
+ VIDEO_ROOT = 'data/lvbench/videos_3fps_480_noaudio'
+
+ @classmethod
+ def load_annos(self, split='test'):
+ assert split == 'test'
+
+ raw_annos = nncore.load(self.ANNO_PATH)
+
+ annos = []
+ for raw_anno in raw_annos:
+ vid = raw_anno['key']
+
+ for meta in raw_anno['qa']:
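+                # each QA entry packs the question and four '(A) ...' style options on separate lines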
+ tok = meta['question'].split('\n')
+
+ assert len(tok) == 5
+ assert all(any(o.startswith(k) for k in ('(A) ', '(B) ', '(C) ', '(D) ')) for o in tok[1:])
+
+ options = [o[4:] for o in tok[1:]]
+                ans = meta['answer']
+                assert ans in 'ABCD'
+                answer = options[ord(ans) - ord('A')]
+
+ anno = dict(
+ source='lvbench',
+ data_type='multimodal',
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+ query=parse_query(tok[0]),
+ question=parse_question(tok[0]),
+ options=options,
+ answer=answer,
+ ans=ans,
+ task=meta['question_type'],
+ time_reference=meta['time_reference'])
+
+ annos.append(anno)
+
+ return annos
diff --git a/videomind/dataset/sub_classes/mlvu.py b/videomind/dataset/sub_classes/mlvu.py
new file mode 100644
index 0000000000000000000000000000000000000000..28dbe2bffc2ceb75d28e2b863440ce5fbac81d06
--- /dev/null
+++ b/videomind/dataset/sub_classes/mlvu.py
@@ -0,0 +1,55 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import nncore
+from torch.utils.data import Dataset
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.utils.parser import parse_query, parse_question
+
+
+@DATASETS.register(name='mlvu')
+class MLVUDataset(Dataset):
+
+ TASK_TO_DIR_MAP = {
+ 'plotQA': '1_plotQA',
+ 'findNeedle': '2_needle',
+ 'ego': '3_ego',
+ 'count': '4_count',
+ 'order': '5_order',
+ 'anomaly_reco': '6_anomaly_reco',
+ 'topic_reasoning': '7_topic_reasoning'
+ }
+
+ DATA_ROOT = 'data/mlvu'
+
+ @classmethod
+ def load_annos(self, split='test'):
+ assert split == 'test'
+
+ paths = [nncore.join(self.DATA_ROOT, 'json', f'{n}.json') for n in self.TASK_TO_DIR_MAP.values()]
+
+ raw_annos = nncore.flatten([nncore.load(p) for p in paths])
+
+ annos = []
+ for raw_anno in raw_annos:
+ task = raw_anno['question_type']
+ video_name = nncore.join(self.TASK_TO_DIR_MAP[task], raw_anno['video'])
+
+ options = raw_anno['candidates']
+ answer = raw_anno['answer']
+ ans = chr(ord('A') + options.index(answer))
+
+ anno = dict(
+ source='mlvu',
+ data_type='multimodal',
+ video_path=nncore.join(self.DATA_ROOT, 'video', video_name),
+ query=parse_query(raw_anno['question']),
+ question=parse_question(raw_anno['question']),
+ options=options,
+ answer=answer,
+ ans=ans,
+ task=task)
+
+ annos.append(anno)
+
+ return annos
diff --git a/videomind/dataset/sub_classes/mvbench.py b/videomind/dataset/sub_classes/mvbench.py
new file mode 100644
index 0000000000000000000000000000000000000000..559660cecff898640de3d429a932eb8ad8e7272f
--- /dev/null
+++ b/videomind/dataset/sub_classes/mvbench.py
@@ -0,0 +1,74 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import nncore
+from torch.utils.data import Dataset
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.utils.parser import parse_query, parse_question
+
+
+@DATASETS.register(name='mvbench')
+class MVBenchDataset(Dataset):
+
+ META_DATA = [('Episodic Reasoning', 'episodic_reasoning.json', 'tvqa/frames_fps3_hq', 'frame'),
+ ('Action Sequence', 'action_sequence.json', 'star/Charades_v1_480', 'video'),
+ ('Action Prediction', 'action_prediction.json', 'star/Charades_v1_480', 'video'),
+ ('Action Antonym', 'action_antonym.json', 'ssv2_video', 'video'),
+ ('Fine-grained Action', 'fine_grained_action.json', 'Moments_in_Time_Raw/videos', 'video'),
+ ('Unexpected Action', 'unexpected_action.json', 'FunQA_test/test', 'video'),
+ ('Object Existence', 'object_existence.json', 'clevrer/video_validation', 'video'),
+ ('Object Interaction', 'object_interaction.json', 'star/Charades_v1_480', 'video'),
+ ('Object Shuffle', 'object_shuffle.json', 'perception/videos', 'video'),
+ ('Moving Direction', 'moving_direction.json', 'clevrer/video_validation', 'video'),
+ ('Action Localization', 'action_localization.json', 'sta/sta_video', 'video'),
+ ('Scene Transition', 'scene_transition.json', 'scene_qa/video', 'video'),
+ ('Action Count', 'action_count.json', 'perception/videos', 'video'),
+ ('Moving Count', 'moving_count.json', 'clevrer/video_validation', 'video'),
+ ('Moving Attribute', 'moving_attribute.json', 'clevrer/video_validation', 'video'),
+ ('State Change', 'state_change.json', 'perception/videos', 'video'),
+ ('Fine-grained Pose', 'fine_grained_pose.json', 'nturgbd', 'video'),
+ ('Character Order', 'character_order.json', 'perception/videos', 'video'),
+ ('Egocentric Navigation', 'egocentric_navigation.json', 'vlnqa', 'video'),
+ ('Counterfactual Inference', 'counterfactual_inference.json', 'clevrer/video_validation', 'video')]
+
+ DATA_ROOT = 'data/mvbench'
+
+ MIN_LEN = 64
+
+ @classmethod
+ def load_annos(self, split='test', sample_frames=32):
+ assert split == 'test'
+
+ annos = []
+ for meta in self.META_DATA:
+ raw_annos = nncore.load(nncore.join(self.DATA_ROOT, 'json', meta[1]))
+
+ for raw_anno in raw_annos:
+ video_name = nncore.join(meta[2], raw_anno['video'])
+ video_path = nncore.join(self.DATA_ROOT, 'video', video_name)
+
+ if meta[3] == 'frame':
+ num_frames = len(nncore.ls(video_path, ext='.jpg'))
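+                    # sample at most sample_frames frames at a fixed stride (frame files are 1-indexed)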
+ video_path = [
+ nncore.join(video_path, f'{i:0>5}.jpg')
+ for i in range(1, num_frames + 1, num_frames // (sample_frames - 1))
+ ][:sample_frames]
+
+ options = raw_anno['candidates']
+ answer = raw_anno['answer']
+ ans = chr(ord('A') + options.index(answer))
+
+ anno = dict(
+ source='mvbench',
+ data_type='multimodal',
+ video_path=video_path,
+ query=parse_query(raw_anno['question']),
+ question=parse_question(raw_anno['question']),
+ options=options,
+ answer=answer,
+ ans=ans,
+ task=meta[0])
+
+ annos.append(anno)
+
+ return annos
diff --git a/videomind/dataset/sub_classes/nextgqa.py b/videomind/dataset/sub_classes/nextgqa.py
new file mode 100644
index 0000000000000000000000000000000000000000..5afbb6e8bda0d3f940814b8cbad5f24742478ec9
--- /dev/null
+++ b/videomind/dataset/sub_classes/nextgqa.py
@@ -0,0 +1,87 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import csv
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import AnsweringCropDataset, AnsweringDataset, GroundingDataset
+from videomind.utils.parser import parse_query, parse_question
+
+
+@DATASETS.register(name='nextgqa')
+class NExTGQADataset(AnsweringDataset):
+
+ ANNO_PATH_VALID = 'data/nextgqa/val.csv'
+ ANNO_PATH_TEST = 'data/nextgqa/test.csv'
+
+ SPAN_PATH_VALID = 'data/nextgqa/gsub_val.json'
+ SPAN_PATH_TEST = 'data/nextgqa/gsub_test.json'
+
+ VIDEO_ID_MAP = 'data/nextgqa/map_vid_vidorID.json'
+ VIDEO_ROOT = 'data/nextqa/videos'
+
+ SOURCE = 'nextgqa'
+ DATA_TYPE = 'multimodal'
+
+ UNIT = 0.1
+
+ @classmethod
+ def load_annos(self, split='valid'):
+ assert split in ('valid', 'test')
+
+ if split == 'valid':
+ anno_path = self.ANNO_PATH_VALID
+ raw_spans = nncore.load(self.SPAN_PATH_VALID)
+ else:
+ anno_path = self.ANNO_PATH_TEST
+ raw_spans = nncore.load(self.SPAN_PATH_TEST)
+
+ with open(anno_path, mode='r') as f:
+ reader = csv.DictReader(f)
+ raw_annos = [d for d in reader]
+
+ video_id_map = nncore.load(self.VIDEO_ID_MAP)
+
+ annos = []
+ for raw_anno in raw_annos:
+ vid = raw_anno['video_id']
+ qid = raw_anno['qid']
+
+ video_id = video_id_map[vid]
+
+ query = parse_query(raw_anno['question'].capitalize() + '?')
+ question = parse_question(raw_anno['question'].capitalize() + '?')
+ options = [raw_anno[k].capitalize() for k in ('a0', 'a1', 'a2', 'a3', 'a4')]
+ answer = raw_anno['answer'].capitalize()
+ ans = chr(ord('A') + options.index(answer))
+
+ anno = dict(
+ source=self.SOURCE,
+ data_type=self.DATA_TYPE,
+ video_path=nncore.join(self.VIDEO_ROOT, video_id + '.mp4'),
+ duration=raw_spans[vid]['duration'],
+ query=query,
+ question=question,
+ options=options,
+ answer=answer,
+ ans=ans,
+ span=raw_spans[vid]['location'][qid],
+ task=raw_anno['type'])
+
+ annos.append(anno)
+
+ return annos
+
+
+@DATASETS.register(name='nextgqa_crop')
+class NExTGQACropDataset(AnsweringCropDataset, NExTGQADataset):
+
+ SOURCE = 'nextgqa_crop'
+
+
+@DATASETS.register(name='nextgqa_grounding')
+class NExTGQAGroundingDataset(GroundingDataset, NExTGQADataset):
+
+ SOURCE = 'nextgqa_grounding'
+ DATA_TYPE = 'grounding'
diff --git a/videomind/dataset/sub_classes/nextqa.py b/videomind/dataset/sub_classes/nextqa.py
new file mode 100644
index 0000000000000000000000000000000000000000..11e46acb67fd24da9e61c0a91522c5dd0e648093
--- /dev/null
+++ b/videomind/dataset/sub_classes/nextqa.py
@@ -0,0 +1,63 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import csv
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import AnsweringDataset
+from videomind.utils.parser import parse_query, parse_question
+
+
+@DATASETS.register(name='nextqa')
+class NExTQADataset(AnsweringDataset):
+
+ ANNO_PATH_TRAIN = 'data/nextqa/train.csv'
+ ANNO_PATH_VALID = 'data/nextqa/val.csv'
+ ANNO_PATH_TEST = 'data/nextqa/test.csv'
+
+ VIDEO_ID_MAP = 'data/nextqa/map_vid_vidorID.json'
+ VIDEO_ROOT = 'data/nextqa/NExTVideo'
+
+ @classmethod
+ def load_annos(self, split='train'):
+ if split == 'train':
+ anno_path = self.ANNO_PATH_TRAIN
+ elif split == 'valid':
+ anno_path = self.ANNO_PATH_VALID
+ else:
+ anno_path = self.ANNO_PATH_TEST
+
+ with open(anno_path, mode='r') as f:
+ reader = csv.DictReader(f)
+ raw_annos = [d for d in reader]
+
+ video_id_map = nncore.load(self.VIDEO_ID_MAP)
+
+ annos = []
+ for raw_anno in raw_annos:
+ vid = raw_anno['video']
+ qid = raw_anno['qid']
+
+ video_id = video_id_map[vid]
+ query = parse_query(raw_anno['question'].capitalize() + '?')
+ question = parse_question(raw_anno['question'].capitalize() + '?')
+ options = [raw_anno[k].capitalize() for k in ('a0', 'a1', 'a2', 'a3', 'a4')]
+ ans = chr(ord('A') + int(raw_anno['answer']))
+ answer = options[int(raw_anno['answer'])]
+
+ anno = dict(
+ source='nextqa',
+ data_type='multimodal',
+ uid=f'{vid}_{qid}',
+ video_path=nncore.join(self.VIDEO_ROOT, video_id + '.mp4'),
+ query=query,
+ question=question,
+ options=options,
+ answer=answer,
+ ans=ans,
+ task=raw_anno['type'])
+
+ annos.append(anno)
+
+ return annos
diff --git a/videomind/dataset/sub_classes/qa_ego4d.py b/videomind/dataset/sub_classes/qa_ego4d.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7db13cb507cfce7cf9c038a0c38156b8e834f62
--- /dev/null
+++ b/videomind/dataset/sub_classes/qa_ego4d.py
@@ -0,0 +1,98 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import random
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import AnsweringCropDataset, AnsweringDataset, GroundingDataset
+from videomind.utils.parser import parse_query, parse_question
+
+
+@DATASETS.register(name='qa_ego4d')
+class QAEgo4DDataset(AnsweringDataset):
+
+ ANNO_PATH_TRAIN = 'data/qa_ego4d/annotations.QaEgo4D_train.json'
+ ANNO_PATH_VALID = 'data/qa_ego4d/annotations.QaEgo4D_val_options.json'
+ ANNO_PATH_TEST = 'data/qa_ego4d/annotations.QaEgo4D_test_options.json'
+
+ VIDEO_ROOT = 'data/ego4d/v1/videos_3fps_480_noaudio'
+ DURATIONS = 'data/ego4d/v1/durations.json'
+
+ SOURCE = 'qa_ego4d'
+ DATA_TYPE = 'multimodal'
+
+ UNIT = 0.001
+
+ @classmethod
+ def load_annos(self, split='train'):
+ if split == 'train':
+ raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
+ elif split == 'valid':
+ raw_annos = nncore.load(self.ANNO_PATH_VALID)
+ else:
+ raw_annos = nncore.load(self.ANNO_PATH_TEST)
+
+ durations = nncore.load(self.DURATIONS)
+
+ annos = []
+ for raw_anno in raw_annos:
+ vid = raw_anno['video_id']
+
+ duration = durations[vid]
+
+ # too short or too long samples
+ if split == 'train' and (duration < 10 or duration > 600):
+ continue
+
+ span = [raw_anno['moment_start_frame'] / 30, raw_anno['moment_end_frame'] / 30]
+ span = [round(span[0], 3), round(span[1], 3)]
+
+ # skip samples with too short moments
+ # if split == 'train' and span[1] - span[0] < 2:
+ # continue
+
+ answer = raw_anno['answer'].capitalize()
+
+ if 'options' in raw_anno:
+ options = [o.capitalize() for o in raw_anno['options']]
+ idx = options.index(answer)
+ ans = chr(ord('A') + idx)
+ else:
+                # NOTE: non-deterministic evaluation (the correct answer is inserted at a random position)
+ assert len(raw_anno['wrong_answers']) == 3
+ idx = random.randint(0, 3)
+ ans = chr(ord('A') + idx)
+ options = [o.capitalize() for o in raw_anno['wrong_answers']]
+ options.insert(idx, answer)
+
+ assert len(options) == 4, options
+
+ anno = dict(
+ source=self.SOURCE,
+ data_type=self.DATA_TYPE,
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+ duration=duration,
+ query=parse_query(raw_anno['question'].capitalize()),
+ question=parse_question(raw_anno['question'].capitalize()),
+ options=options,
+ answer=answer,
+ ans=ans,
+ span=[span])
+
+ annos.append(anno)
+
+ return annos
+
+
+@DATASETS.register(name='qa_ego4d_crop')
+class QAEgo4DCropDataset(AnsweringCropDataset, QAEgo4DDataset):
+
+ SOURCE = 'qa_ego4d_crop'
+
+
+@DATASETS.register(name='qa_ego4d_grounding')
+class QAEgo4DGroundingDataset(GroundingDataset, QAEgo4DDataset):
+
+ SOURCE = 'qa_ego4d_grounding'
+ DATA_TYPE = 'grounding'
diff --git a/videomind/dataset/sub_classes/queryd.py b/videomind/dataset/sub_classes/queryd.py
new file mode 100644
index 0000000000000000000000000000000000000000..928a4b17541b6166223542498182f2cee51145f0
--- /dev/null
+++ b/videomind/dataset/sub_classes/queryd.py
@@ -0,0 +1,49 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import GroundingDataset
+from videomind.utils.parser import parse_query
+
+
+@DATASETS.register(name='queryd')
+class QuerYDDataset(GroundingDataset):
+
+ VID_PATH = 'data/queryd/train_list.txt'
+ QUERY_PATH = 'data/queryd/raw_captions_combined_filtered-v2.pkl'
+ SPAN_PATH = 'data/queryd/times_captions_combined_filtered-v2.pkl'
+
+ VIDEO_ROOT = 'data/queryd/videos_3fps_480_noaudio'
+ DURATIONS = 'data/queryd/durations.json'
+
+ UNIT = 0.001
+
+ @classmethod
+ def load_annos(self, split='train'):
+ assert split == 'train'
+
+ vids = nncore.load(self.VID_PATH)
+ queries = nncore.load(self.QUERY_PATH)
+ spans = nncore.load(self.SPAN_PATH)
+ durations = nncore.load(self.DURATIONS)
+
+ annos = []
+ for vid in vids:
+ for query, span in zip(queries[vid], spans[vid]):
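+                # QuerYD video ids carry a 6-character prefix; strip it to get the raw video name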
+ video_name = vid[6:]
+
+ if video_name not in durations:
+ continue
+
+ anno = dict(
+ source='queryd',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, video_name + '.mp4'),
+ duration=durations[video_name],
+ query=parse_query(' '.join(query)),
+ span=[span])
+
+ annos.append(anno)
+
+ return annos
diff --git a/videomind/dataset/sub_classes/qvhighlights.py b/videomind/dataset/sub_classes/qvhighlights.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9b1cfc25bd938c1e44b627e2ac71efdea22c183
--- /dev/null
+++ b/videomind/dataset/sub_classes/qvhighlights.py
@@ -0,0 +1,78 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import GroundingDataset
+from videomind.utils.parser import parse_query
+
+
+@DATASETS.register(name='qvhighlights')
+class QVHighlightsDataset(GroundingDataset):
+
+ ANNO_PATH_TRAIN = 'data/qvhighlights/highlight_train_release.jsonl'
+ ANNO_PATH_VALID = 'data/qvhighlights/highlight_val_release.jsonl'
+ ANNO_PATH_TEST = 'data/qvhighlights/highlight_test_release.jsonl'
+
+ VIDEO_ROOT = 'data/qvhighlights/videos_3fps_480_noaudio'
+
+ UNIT = 2.0
+
+ @classmethod
+ def load_annos(self, split='train'):
+ if split == 'train':
+ raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
+ elif split == 'valid':
+ raw_annos = nncore.load(self.ANNO_PATH_VALID)
+ else:
+ print('WARNING: Test split does not have ground truth annotations')
+ raw_annos = nncore.load(self.ANNO_PATH_TEST)
+
+ annos = []
+ for raw_anno in raw_annos:
+ vid = raw_anno['vid']
+ qid = raw_anno['qid']
+
+ anno = dict(
+ source='qvhighlights',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+ duration=raw_anno['duration'],
+ query=parse_query(raw_anno['query']),
+ span=raw_anno.get('relevant_windows'),
+ vid=vid,
+ qid=qid)
+
+ annos.append(anno)
+
+ return annos
+
+
+@DATASETS.register(name='qvhighlights_single')
+class QVHighlightsSingleDataset(QVHighlightsDataset):
+
+ @classmethod
+ def load_annos(self, split='train'):
+ assert split == 'train'
+
+ raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
+
+ annos = []
+ for raw_anno in raw_annos:
+ # skip samples with multiple moments
+ if len(raw_anno['relevant_windows']) > 1:
+ continue
+
+ vid = raw_anno['vid']
+
+ anno = dict(
+ source='qvhighlights_single',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+ duration=raw_anno['duration'],
+ query=parse_query(raw_anno['query']),
+ span=raw_anno.get('relevant_windows'))
+
+ annos.append(anno)
+
+ return annos
diff --git a/videomind/dataset/sub_classes/rextime.py b/videomind/dataset/sub_classes/rextime.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d782dcb8538f0fee50fcbd47a0078b4d029b3d4
--- /dev/null
+++ b/videomind/dataset/sub_classes/rextime.py
@@ -0,0 +1,81 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import AnsweringCropDataset, AnsweringDataset, GroundingDataset
+from videomind.utils.parser import parse_query, parse_question
+
+
+@DATASETS.register(name='rextime')
+class ReXTimeDataset(AnsweringDataset):
+
+ ANNO_PATH_TRAIN = 'data/rextime/rextime_train.json'
+ ANNO_PATH_VALID = 'data/rextime/rextime_val.json'
+ ANNO_PATH_TEST = 'data/rextime/rextime_test_release.json'
+
+ VIDEO_ROOT_ANET = 'data/activitynet/videos_3fps_480_noaudio'
+ VIDEO_ROOT_QVHL = 'data/qvhighlights/videos_3fps_480_noaudio'
+
+ DURATIONS_ANET = 'data/activitynet/durations.json'
+ DURATIONS_QVHL = 'data/qvhighlights/durations.json'
+
+ SOURCE = 'rextime'
+ DATA_TYPE = 'multimodal'
+
+ UNIT = 1.0
+ MIN_LEN = 64
+
+ @classmethod
+ def load_annos(self, split='train'):
+ if split == 'train':
+ raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
+ elif split == 'valid':
+ raw_annos = nncore.load(self.ANNO_PATH_VALID)
+ else:
+ print('WARNING: Test split does not have ground truth annotations')
+ raw_annos = nncore.load(self.ANNO_PATH_TEST)
+
+ durations_anet = nncore.load(self.DURATIONS_ANET)
+ durations_qvhl = nncore.load(self.DURATIONS_QVHL)
+
+ annos = []
+ for raw_anno in raw_annos:
+ vid = raw_anno['vid']
+
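+            # 13-char ids ('v_' + 11-char YouTube id) belong to ActivityNet; longer ids come from QVHighlights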
+ if len(vid) == 13:
+ video_path = nncore.join(self.VIDEO_ROOT_ANET, vid + '.mp4')
+ duration = durations_anet[vid]
+ else:
+ video_path = nncore.join(self.VIDEO_ROOT_QVHL, vid + '.mp4')
+ duration = durations_qvhl[vid]
+
+ anno = dict(
+ source=self.SOURCE,
+ data_type=self.DATA_TYPE,
+ video_path=video_path,
+ duration=duration,
+ query=parse_query(raw_anno['question']),
+ question=parse_question(raw_anno['question']),
+ options=[o.capitalize() for o in raw_anno['options']],
+ answer=raw_anno['answer'].replace('From to , ', '').capitalize(),
+ ans=raw_anno['ans'],
+ span=[raw_anno['span']],
+ task=raw_anno['category'])
+
+ annos.append(anno)
+
+ return annos
+
+
+@DATASETS.register(name='rextime_crop')
+class ReXTimeCropDataset(AnsweringCropDataset, ReXTimeDataset):
+
+ SOURCE = 'rextime_crop'
+
+
+@DATASETS.register(name='rextime_grounding')
+class ReXTimeGroundingDataset(GroundingDataset, ReXTimeDataset):
+
+ SOURCE = 'rextime_grounding'
+ DATA_TYPE = 'grounding'
diff --git a/videomind/dataset/sub_classes/star.py b/videomind/dataset/sub_classes/star.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f1f7349fd411969ccd2c024f07939f964fffe41
--- /dev/null
+++ b/videomind/dataset/sub_classes/star.py
@@ -0,0 +1,54 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import AnsweringCropDataset
+from videomind.utils.parser import parse_query, parse_question
+
+
+@DATASETS.register(name='star')
+class STARDataset(AnsweringCropDataset):
+
+ ANNO_PATH_TRAIN = 'data/star/STAR_train.json'
+ ANNO_PATH_VALID = 'data/star/STAR_val.json'
+
+ VIDEO_ROOT = 'data/charades_sta/videos_3fps_480_noaudio'
+ DURATIONS = 'data/charades_sta/durations.json'
+
+ UNIT = 0.1
+
+ @classmethod
+ def load_annos(self, split='train'):
+ if split == 'train':
+ raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
+ else:
+ raw_annos = nncore.load(self.ANNO_PATH_VALID)
+
+ durations = nncore.load(self.DURATIONS)
+
+ annos = []
+ for raw_anno in raw_annos:
+ vid = raw_anno['video_id']
+
+ options = [c['choice'] for c in raw_anno['choices']]
+ answer = raw_anno['answer']
+ ans = chr(ord('A') + options.index(answer))
+
+ anno = dict(
+ source='star',
+ data_type='multimodal',
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+ duration=durations[vid],
+ query=parse_query(raw_anno['question']),
+ question=parse_question(raw_anno['question']),
+ options=options,
+ answer=answer,
+ ans=ans,
+ span=[[raw_anno['start'], raw_anno['end']]],
+ task=raw_anno['question_id'].split('_')[0],
+ no_aug=True)
+
+ annos.append(anno)
+
+ return annos
diff --git a/videomind/dataset/sub_classes/tacos.py b/videomind/dataset/sub_classes/tacos.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e606d77c56b787d5ed6b7b47b423ad82cc02032
--- /dev/null
+++ b/videomind/dataset/sub_classes/tacos.py
@@ -0,0 +1,46 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import GroundingDataset
+from videomind.utils.parser import parse_query
+
+
+@DATASETS.register(name='tacos')
+class TACoSDataset(GroundingDataset):
+
+ ANNO_PATH_TRAIN = 'data/tacos/train.jsonl'
+ ANNO_PATH_VALID = 'data/tacos/val.jsonl'
+ ANNO_PATH_TEST = 'data/tacos/test.jsonl'
+
+ VIDEO_ROOT = 'data/tacos/videos_3fps_480_noaudio'
+
+ UNIT = 0.001
+
+ @classmethod
+ def load_annos(self, split='train'):
+ if split == 'train':
+ raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
+        elif split in ('val', 'valid'):
+ raw_annos = nncore.load(self.ANNO_PATH_VALID)
+ else:
+ raw_annos = nncore.load(self.ANNO_PATH_TEST)
+
+ annos = []
+ for raw_anno in raw_annos:
+ assert len(raw_anno['relevant_windows']) == 1
+
+ vid = raw_anno['vid']
+
+ anno = dict(
+ source='tacos',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '-cam-002.mp4'),
+ duration=raw_anno['duration'],
+ query=parse_query(raw_anno['query']),
+ span=raw_anno['relevant_windows'])
+
+ annos.append(anno)
+
+ return annos
diff --git a/videomind/dataset/sub_classes/vid_morp.py b/videomind/dataset/sub_classes/vid_morp.py
new file mode 100644
index 0000000000000000000000000000000000000000..3992abe238f888d0ba15dfd7924d951e14803f47
--- /dev/null
+++ b/videomind/dataset/sub_classes/vid_morp.py
@@ -0,0 +1,45 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import GroundingDataset
+from videomind.utils.parser import parse_query
+
+
+@DATASETS.register(name='vid_morp')
+class VidMorpDataset(GroundingDataset):
+
+ ANNO_PATH = 'data/vid_morp/anno_vid_morp.jsonl'
+
+ VIDEO_ROOT = 'data/vid_morp/videos_3fps_480_noaudio'
+
+ UNIT = 0.001
+
+ @classmethod
+ def load_annos(self, split='train'):
+ assert split == 'train'
+
+ raw_annos = nncore.load(self.ANNO_PATH)
+
+ all_videos = nncore.ls(self.VIDEO_ROOT, ext='.mp4')
+ all_videos = set(v[:11] for v in all_videos)
+
+ annos = []
+ for raw_anno in raw_annos:
+ vid = raw_anno['vid']
+
+ if vid not in all_videos:
+ continue
+
+ anno = dict(
+ source='vid_morp',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+ duration=raw_anno['duration'],
+ query=parse_query(raw_anno['query']),
+ span=[raw_anno['span']])
+
+ annos.append(anno)
+
+ return annos
diff --git a/videomind/dataset/sub_classes/videomme.py b/videomind/dataset/sub_classes/videomme.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee7fd32f443e87ce6dbdf16222dce5efcad43058
--- /dev/null
+++ b/videomind/dataset/sub_classes/videomme.py
@@ -0,0 +1,52 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import nncore
+from torch.utils.data import Dataset
+
+import pandas as pd
+from videomind.dataset.hybrid import DATASETS
+from videomind.utils.parser import parse_query, parse_question
+
+
+@DATASETS.register(name='videomme')
+class VideoMMEDataset(Dataset):
+
+ ANNO_PATH = 'data/videomme/test-00000-of-00001.parquet'
+
+ VIDEO_ROOT = 'data/videomme/videos'
+ SUBTITLE_ROOT = 'data/videomme/subtitles'
+
+ @classmethod
+ def load_annos(self, split='test'):
+ assert split == 'test'
+
+ raw_annos = pd.read_parquet(self.ANNO_PATH).to_dict(orient='records')
+
+ annos = []
+ for raw_anno in raw_annos:
+ vid = raw_anno['videoID']
+
+ options = raw_anno['options'].tolist()
+
+ assert len(options) == 4
+ assert all(any(o.startswith(k) for k in ('A. ', 'B. ', 'C. ', 'D. ')) for o in options)
+
+ options = [o[3:] for o in options]
+            ans = raw_anno['answer']
+            assert ans in 'ABCD'
+            answer = options[ord(ans) - ord('A')]
+
+ anno = dict(
+ source='videomme',
+ data_type='multimodal',
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+ query=parse_query(raw_anno['question']),
+ question=parse_question(raw_anno['question']),
+ options=options,
+ answer=answer,
+ ans=ans,
+ task=raw_anno['duration'])
+
+ annos.append(anno)
+
+ return annos
diff --git a/videomind/dataset/sub_classes/videoxum.py b/videomind/dataset/sub_classes/videoxum.py
new file mode 100644
index 0000000000000000000000000000000000000000..16e60e8c04ea82f29e23bc81f510a58a732d6b08
--- /dev/null
+++ b/videomind/dataset/sub_classes/videoxum.py
@@ -0,0 +1,54 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import GroundingDataset
+from videomind.utils.parser import parse_query
+
+
+@DATASETS.register(name='videoxum')
+class VideoXumDataset(GroundingDataset):
+
+ ANNO_PATH_TRAIN = 'data/videoxum/train_videoxum.json'
+ ANNO_PATH_VALID = 'data/videoxum/val_videoxum.json'
+ ANNO_PATH_TEST = 'data/videoxum/test_videoxum.json'
+
+ VIDEO_ROOT = 'data/activitynet/videos_3fps_480_noaudio'
+
+ UNIT = 0.01
+
+ @classmethod
+ def load_annos(self, split='train'):
+ if split == 'train':
+ raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
+ elif split == 'valid':
+ raw_annos = nncore.load(self.ANNO_PATH_VALID)
+ else:
+ raw_annos = nncore.load(self.ANNO_PATH_TEST)
+
+ annos = []
+ for raw_anno in raw_annos:
+ vid = raw_anno['video_id']
+
+ duration = raw_anno['duration']
+
+ for query, spans in zip(raw_anno['tsum'], raw_anno['vsum']):
+ assert len(spans) == 10
+
+ # average the spans from 10 annotators
+ span = [round(sum(s[0] for s in spans) / 10, 2), round(sum(s[1] for s in spans) / 10, 2)]
+
+ anno = dict(
+ source='videoxum',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+ duration=duration,
+ query=parse_query(query),
+ span=[span])
+
+ annos.append(anno)
+
+ return annos
diff --git a/videomind/dataset/sub_classes/youcook2.py b/videomind/dataset/sub_classes/youcook2.py
new file mode 100644
index 0000000000000000000000000000000000000000..e277ebe1317c4d3d5eac37e32ba1ebe8db694a8e
--- /dev/null
+++ b/videomind/dataset/sub_classes/youcook2.py
@@ -0,0 +1,107 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+from collections import OrderedDict
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import GroundingDataset
+from videomind.utils.parser import parse_query
+
+
+@DATASETS.register(name='youcook2')
+class YouCook2Dataset(GroundingDataset):
+
+ ANNO_PATH = 'data/youcook2/youcookii_annotations_trainval.json'
+
+ VIDEO_ROOT = 'data/youcook2/videos_3fps_480_noaudio'
+
+ UNIT = 1.0
+
+ @classmethod
+ def load_annos(self, split='train'):
+ subset = 'training' if split == 'train' else 'validation'
+
+ raw_annos = nncore.load(self.ANNO_PATH, object_pairs_hook=OrderedDict)['database']
+
+ all_videos = nncore.ls(self.VIDEO_ROOT, ext='.mp4')
+ all_videos = set(v[:11] for v in all_videos)
+
+ annos = []
+ for vid, raw_anno in raw_annos.items():
+ if raw_anno['subset'] != subset:
+ continue
+
+ if vid not in all_videos:
+ continue
+
+ duration = raw_anno['duration']
+
+ for meta in raw_anno['annotations']:
+ anno = dict(
+ source='youcook2',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+ duration=duration,
+ query=parse_query(meta['sentence']),
+ span=[meta['segment']])
+
+ annos.append(anno)
+
+ return annos
+
+
+@DATASETS.register(name='youcook2_bias')
+class YouCook2BiasDataset(YouCook2Dataset):
+
+ @classmethod
+ def load_annos(self, split='train'):
+ subset = 'training' if split == 'train' else 'validation'
+
+ raw_annos = nncore.load(self.ANNO_PATH, object_pairs_hook=OrderedDict)['database']
+
+ all_videos = nncore.ls(self.VIDEO_ROOT, ext='.mp4')
+ all_videos = set(v[:11] for v in all_videos)
+
+ annos = []
+ for vid, raw_anno in raw_annos.items():
+ if raw_anno['subset'] != subset:
+ continue
+
+ if vid not in all_videos:
+ continue
+
+ duration = raw_anno['duration']
+
+ moments = raw_anno['annotations']
+
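+            # build 'before'/'after' queries from adjacent segments that are less than 3s apart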
+ for i in range(len(moments) - 1):
+ span_a = moments[i]['segment']
+ span_b = moments[i + 1]['segment']
+
+ if span_b[0] - span_a[1] < 3:
+ query_a = parse_query(f"The moment before {moments[i + 1]['sentence']}")
+ query_b = parse_query(f"The moment after {moments[i]['sentence']}")
+
+ anno_a = dict(
+ source='youcook2_bias',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+ duration=duration,
+                        query=query_a,
+ span=[span_a])
+
+ anno_b = dict(
+ source='youcook2_bias',
+ data_type='grounding',
+ video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+ duration=duration,
+                        query=query_b,
+ span=[span_b])
+
+ annos.append(anno_a)
+ annos.append(anno_b)
+
+ return annos
diff --git a/videomind/dataset/utils.py b/videomind/dataset/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..61378c8eec998aaaa1b2d5ec11d41bfff49e724b
--- /dev/null
+++ b/videomind/dataset/utils.py
@@ -0,0 +1,351 @@
+# Modified from https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py
+
+import base64
+import math
+import warnings
+from io import BytesIO
+
+import decord
+import numpy as np
+import torch
+from PIL import Image, ImageSequence
+from torchvision import transforms
+from torchvision.transforms import InterpolationMode
+
+import requests
+from videomind.constants import IGNORE_INDEX
+from videomind.conversation import get_conv
+
+IMAGE_FACTOR = 28
+MIN_PIXELS = 4 * 28 * 28
+MAX_PIXELS = 16384 * 28 * 28
+MAX_RATIO = 200
+
+VIDEO_MIN_PIXELS = 128 * 28 * 28
+VIDEO_MAX_PIXELS = 768 * 28 * 28
+VIDEO_TOTAL_PIXELS = 24576 * 28 * 28
+FRAME_FACTOR = 2
+FPS = 2.0
+FPS_MIN_FRAMES = 4
+FPS_MAX_FRAMES = 768
+
+
+def round_by_factor(number: int, factor: int) -> int:
+ """Returns the closest integer to 'number' that is divisible by 'factor'."""
+ return round(number / factor) * factor
+
+
+def ceil_by_factor(number: int, factor: int) -> int:
+ """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
+ return math.ceil(number / factor) * factor
+
+
+def floor_by_factor(number: int, factor: int) -> int:
+ """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
+ return math.floor(number / factor) * factor
+
+
+def smart_resize(height: int,
+ width: int,
+ factor: int = IMAGE_FACTOR,
+ min_pixels: int = MIN_PIXELS,
+ max_pixels: int = MAX_PIXELS) -> tuple[int, int]:
+ """
+ Rescales the image so that the following conditions are met:
+
+ 1. Both dimensions (height and width) are divisible by 'factor'.
+
+ 2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+
+ 3. The aspect ratio of the image is maintained as closely as possible.
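+
+    For example, with the default factor of 28, smart_resize(1080, 1920) returns (1092, 1932):
+    both sides are multiples of 28 and the pixel count stays within ['min_pixels', 'max_pixels'].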
+ """
+ if max(height, width) / min(height, width) > MAX_RATIO:
+ raise ValueError(
+ f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}")
+ h_bar = max(factor, round_by_factor(height, factor))
+ w_bar = max(factor, round_by_factor(width, factor))
+    # handle min_pixels before max_pixels so that the result never exceeds max_pixels
+ if h_bar * w_bar < min_pixels:
+ beta = math.sqrt(min_pixels / (height * width))
+ h_bar = ceil_by_factor(height * beta, factor)
+ w_bar = ceil_by_factor(width * beta, factor)
+ if h_bar * w_bar > max_pixels:
+ beta = math.sqrt((height * width) / max_pixels)
+ h_bar = floor_by_factor(height / beta, factor)
+ w_bar = floor_by_factor(width / beta, factor)
+ return h_bar, w_bar
+
+
+def fetch_image(ele: dict[str, str | Image.Image], size_factor: int = IMAGE_FACTOR) -> Image.Image:
+ if "image" in ele:
+ image = ele["image"]
+ else:
+ image = ele["image_url"]
+ image_obj = None
+ if isinstance(image, Image.Image):
+ image_obj = image
+ elif image.startswith("http://") or image.startswith("https://"):
+ image_obj = Image.open(requests.get(image, stream=True).raw)
+ elif image.startswith("file://"):
+ image_obj = Image.open(image[7:])
+ elif image.startswith("data:image"):
+ if "base64," in image:
+ _, base64_data = image.split("base64,", 1)
+ data = base64.b64decode(base64_data)
+ image_obj = Image.open(BytesIO(data))
+ else:
+ image_obj = Image.open(image)
+ if image_obj is None:
+ raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}")
+ image = image_obj.convert("RGB")
+
+ if "resized_height" in ele and "resized_width" in ele:
+ resized_height, resized_width = smart_resize(
+ ele["resized_height"],
+ ele["resized_width"],
+ factor=size_factor,
+ )
+ else:
+ width, height = image.size
+ min_pixels = ele.get("min_pixels", MIN_PIXELS)
+ max_pixels = ele.get("max_pixels", MAX_PIXELS)
+ resized_height, resized_width = smart_resize(
+ height,
+ width,
+ factor=size_factor,
+ min_pixels=min_pixels,
+ max_pixels=max_pixels,
+ )
+ image = image.resize((resized_width, resized_height))
+
+ return image
+
+
+def smart_nframes(
+ ele: dict,
+ total_frames: int,
+ video_fps: int | float,
+) -> int:
+ """calculate the number of frames for video used for model inputs.
+
+ Args:
+ ele (dict): a dict contains the configuration of video.
+ support either `fps` or `nframes`:
+ - nframes: the number of frames to extract for model inputs.
+ - fps: the fps to extract frames for model inputs.
+ - min_frames: the minimum number of frames of the video, only used when fps is provided.
+ - max_frames: the maximum number of frames of the video, only used when fps is provided.
+ total_frames (int): the original total number of frames of the video.
+ video_fps (int | float): the original fps of the video.
+
+ Raises:
+ ValueError: nframes should in interval [FRAME_FACTOR, total_frames].
+
+ Returns:
+ int: the number of frames for video used for model inputs.
+ """
+ assert not ("fps" in ele and "nframes" in ele), "Only accept either `fps` or `nframes`"
+ if "nframes" in ele:
+ nframes = round_by_factor(ele["nframes"], FRAME_FACTOR)
+ else:
+ fps = ele.get("fps", FPS)
+ min_frames = ceil_by_factor(ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR)
+ max_frames = floor_by_factor(ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR)
+ nframes = total_frames / video_fps * fps
+ nframes = min(max(nframes, min_frames), max_frames)
+ nframes = round_by_factor(nframes, FRAME_FACTOR)
+ if not (FRAME_FACTOR <= nframes and nframes <= total_frames):
+ raise ValueError(f"nframes should in interval [{FRAME_FACTOR}, {total_frames}], but got {nframes}.")
+ return nframes
+
+
+def _read_video_gif(path):
+ gif = Image.open(path)
+ frames = []
+ for frame in ImageSequence.Iterator(gif):
+ frames.append(np.array(frame.convert('RGB')))
+ frames = np.stack(frames, axis=0)
+ return frames
+
+
+def _read_video_decord(ele: dict) -> torch.Tensor:
+    """Read a video using decord.VideoReader.
+
+    Args:
+        ele (dict): a dict containing the video configuration.
+            Supported keys:
+                - video: the path of the video. Supports "file://", "http://", "https://" and local paths.
+                - video_start: the start time of the video.
+                - video_end: the end time of the video.
+
+    Returns:
+        torch.Tensor: the video tensor with shape (T, C, H, W).
+    """
+ video_path = ele["video"]
+ if video_path.endswith('.gif'):
+ video = _read_video_gif(video_path)
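+        # GIFs carry no reliable FPS metadata, so fall back to the requested (or default) FPS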
+ total_frames, video_fps = video.shape[0], ele.get('fps', FPS)
+ else:
+ vr = decord.VideoReader(video_path, num_threads=ele.get('num_threads', 0))
+ total_frames, video_fps = len(vr), vr.get_avg_fps()
+
+ # 1. re-calculate total frames
+ s = ele.get('video_start')
+ s = 0 if s is None else s
+ e = ele.get('video_end')
+ e = total_frames / video_fps if e is None else e
+ s_frame = min(max(0, round(s * video_fps)), total_frames - 1)
+ e_frame = min(max(0, round(e * video_fps)), total_frames - 1)
+ if s_frame > e_frame:
+ warnings.warn(f's_frame ({s_frame}) is greater than e_frame ({e_frame}), total_frames: {total_frames}')
+ s_frame, e_frame = e_frame, s_frame
+
+    # TODO: the actual total_frames should be computed as e_frame - s_frame + 1,
+    # but that would affect the verifier's performance when video_start and video_end get clamped;
+    # this should be fixed by using normalized timestamps instead of real time
+ total_frames = min(max(1, round((e - s) * video_fps)), total_frames)
+
+ if total_frames > FPS_MIN_FRAMES:
+ nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
+ else:
+ nframes = total_frames
+
+ # 2. generate frame ids
+ idx = torch.linspace(s_frame, e_frame, nframes).round().long().tolist()
+ assert len(idx) == nframes, (len(idx), nframes)
+
+ if video_path.endswith('.gif'):
+ video = video[idx]
+ else:
+ video = vr.get_batch(idx).asnumpy()
+
+ video = torch.tensor(video).permute(0, 3, 1, 2) # Convert to TCHW format
+ return video
+
+
+def fetch_video(ele: dict, image_factor: int = IMAGE_FACTOR, sanity_check=False) -> torch.Tensor | list[Image.Image]:
+ if isinstance(ele["video"], str):
+ video = _read_video_decord(ele)
+ nframes, _, height, width = video.shape
+
+ min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS)
+ total_pixels = ele.get("total_pixels", VIDEO_TOTAL_PIXELS)
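+        # budget pixels per frame so that the whole clip stays within total_pixels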
+ max_pixels = max(min(VIDEO_MAX_PIXELS, total_pixels / nframes * FRAME_FACTOR), int(min_pixels * 1.05))
+ max_pixels = ele.get("max_pixels", max_pixels)
+ if "resized_height" in ele and "resized_width" in ele:
+ resized_height, resized_width = smart_resize(
+ ele["resized_height"],
+ ele["resized_width"],
+ factor=image_factor,
+ )
+ else:
+ resized_height, resized_width = smart_resize(
+ height,
+ width,
+ factor=image_factor,
+ min_pixels=min_pixels,
+ max_pixels=max_pixels,
+ )
+ video = transforms.functional.resize(
+ video,
+ [resized_height, resized_width],
+ interpolation=InterpolationMode.BICUBIC,
+ antialias=True,
+ ).float()
+
+ if sanity_check and (video == 0).all():
+ raise ValueError("video '{}' contains all zeros".format(ele["video"]))
+
+ return video
+ else:
+ assert isinstance(ele["video"], (list, tuple))
+ process_info = ele.copy()
+ process_info.pop("type", None)
+ process_info.pop("video", None)
+ images = [
+ fetch_image({
+ "image": video_element,
+ **process_info
+ }, size_factor=image_factor) for video_element in ele["video"]
+ ]
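+        # pad the frame list to a multiple of FRAME_FACTOR by repeating the last frame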
+ nframes = ceil_by_factor(len(images), FRAME_FACTOR)
+ if len(images) < nframes:
+ images.extend([images[-1]] * (nframes - len(images)))
+ return images
+
+
+def extract_vision_info(conversations: list[dict] | list[list[dict]]) -> list[dict]:
+ vision_infos = []
+ if isinstance(conversations[0], dict):
+ conversations = [conversations]
+ for conversation in conversations:
+ for message in conversation:
+ if isinstance(message["content"], list):
+ for ele in message["content"]:
+ if ("image" in ele or "image_url" in ele or "video" in ele
+ or ele["type"] in ("image", "image_url", "video")):
+ vision_infos.append(ele)
+ return vision_infos
+
+
+def process_vision_info(
+ conversations: list[dict] | list[list[dict]],
+ sanity_check=False) -> tuple[list[Image.Image] | None, list[torch.Tensor | list[Image.Image]] | None]:
+ vision_infos = extract_vision_info(conversations)
+ # Read images or videos
+ image_inputs = []
+ video_inputs = []
+ for vision_info in vision_infos:
+ if "image" in vision_info or "image_url" in vision_info:
+ image_inputs.append(fetch_image(vision_info))
+ elif "video" in vision_info:
+ video_inputs.append(fetch_video(vision_info, sanity_check=sanity_check))
+ else:
+ raise ValueError("image, image_url or video should in content.")
+ if len(image_inputs) == 0:
+ image_inputs = None
+ if len(video_inputs) == 0:
+ video_inputs = None
+ return image_inputs, video_inputs
+
+
+def preprocess_chatml(input_ids, text, tokenizer):
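+    # build training labels by masking out everything except the assistant responses,
+    # so the loss is only computed on tokens the model should generate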
+ conv = get_conv('chatml')
+
+ rounds = [m + conv.seps[0] for m in text.split(conv.seps[0])]
+ assert (len(rounds) % 2 == 0) == (conv.system is not None)
+ assert rounds[-1] == conv.seps[0]
+ rounds = rounds[:-1]
+
+ if conv.system is None:
+ rounds = [''.join(rounds[i:i + 2]) for i in range(0, len(rounds), 2)]
+ else:
+ rounds = [''.join(rounds[:3])] + [''.join(rounds[i:i + 2]) for i in range(3, len(rounds), 2)]
+
+ labels = input_ids.clone()
+
+ sep = conv.seps[0] + conv.roles[1]
+ cur_len = 0
+
+ for i, rou in enumerate(rounds):
+ if len(rou) == 0:
+ break
+
+ ins = sep.join(rou.split(sep)[:-1]) + sep
+
+ rou_len = tokenizer(rou, return_length=True).length[0]
+ ins_len = tokenizer(ins, return_length=True).length[0]
+
+ labels[cur_len:cur_len + ins_len] = IGNORE_INDEX
+ cur_len += rou_len
+
+ if labels.size(0) != cur_len:
+ warnings.warn(f'Tokenization mismatch: {labels.size(0)} and {cur_len}')
+
+ return labels
+
+
+def preprocess(input_ids, text, tokenizer, conv_type):
+ if conv_type == 'chatml':
+ return preprocess_chatml(input_ids, text, tokenizer)
+ else:
+ raise ValueError(f'unknown conversation type: {conv_type}')
diff --git a/videomind/dataset/wrappers/__init__.py b/videomind/dataset/wrappers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..45952e692101072127f58be8115b8b6f0d0fac78
--- /dev/null
+++ b/videomind/dataset/wrappers/__init__.py
@@ -0,0 +1,6 @@
+from .answering import AnsweringCropDataset, AnsweringDataset
+from .grounding import GroundingDataset
+from .planning import PlanningDataset
+from .verifying import VerifyingDataset
+
+__all__ = ['AnsweringCropDataset', 'AnsweringDataset', 'GroundingDataset', 'PlanningDataset', 'VerifyingDataset']
diff --git a/videomind/dataset/wrappers/answering.py b/videomind/dataset/wrappers/answering.py
new file mode 100644
index 0000000000000000000000000000000000000000..42d5ad3364391b3552ff6d49020a327e487103d3
--- /dev/null
+++ b/videomind/dataset/wrappers/answering.py
@@ -0,0 +1,112 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import copy
+import random
+
+from torch.utils.data import Dataset
+
+from videomind.utils.parser import parse_span
+
+
+class AnsweringDataset(Dataset):
+
+ def __init__(self, processor, model_args, data_args, training_args):
+ super(AnsweringDataset, self).__init__()
+
+ raw_annos = self.load_annos()
+
+ annos = []
+ for anno in raw_annos:
+ num_words = len(anno['question'].split(' ')) + len(anno['answer'].split(' '))
+ if data_args.min_num_words >= 0 and num_words < data_args.min_num_words:
+ continue
+ if data_args.max_num_words >= 0 and num_words > data_args.max_num_words:
+ continue
+ if data_args.min_video_len >= 0 and anno.get('duration', float('inf')) < data_args.min_video_len:
+ continue
+ if data_args.max_video_len >= 0 and anno.get('duration', 0) > data_args.max_video_len:
+ continue
+ annos.append(anno)
+
+ self.annos = annos
+ self.raw_length = len(raw_annos)
+ self.processor = processor
+ self.model_args = model_args
+ self.data_args = data_args
+ self.training_args = training_args
+
+ def __len__(self):
+ return len(self.annos)
+
+ def __getitem__(self, idx):
+ anno = copy.deepcopy(self.annos[idx])
+
+ video_path, question, answer = anno['video_path'], anno['question'], anno['answer']
+
+ messages = [{
+ 'role':
+ 'user',
+ 'content': [{
+ 'type': 'video',
+ 'video': video_path,
+ 'min_pixels': 128 * 28 * 28,
+ 'max_pixels': 256 * 28 * 28,
+ 'max_frames': 32,
+ 'fps': 2.0
+ }, {
+ 'type': 'text',
+ 'text': question
+ }]
+ }, {
+ 'role': 'assistant',
+ 'content': answer
+ }]
+
+ meta = dict(messages=messages)
+ return meta
+
+
+class AnsweringCropDataset(AnsweringDataset):
+
+ def __getitem__(self, idx):
+ anno = copy.deepcopy(self.annos[idx])
+
+ video_path, question, answer = anno['video_path'], anno['question'], anno['answer']
+
+ if anno.get('no_aug'):
+ s, e = anno['span'][0]
+ else:
+ # max 32 frames / 2 fps
+ s, e = parse_span(anno['span'][0], anno['duration'], 16)
+
+ # apply temporal jittering
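+        # each boundary moves by at most 25% of the span length, e.g. span [10, 30] -> offset 5,
+        # s ~ U(5, 15), e ~ U(25, 35); the result is re-clamped to the video duration below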
+ offset = (e - s) / 4
+ s = random.uniform(s - offset, s + offset)
+ e = random.uniform(e - offset, e + offset)
+
+ # clamp the augmented span
+ s, e = parse_span([s, e], anno['duration'])
+
+ messages = [{
+ 'role':
+ 'user',
+ 'content': [{
+ 'type': 'video',
+ 'video': video_path,
+ 'video_start': s,
+ 'video_end': e,
+ 'min_pixels': 128 * 28 * 28,
+ 'max_pixels': 256 * 28 * 28,
+ 'max_frames': 32,
+ 'fps': 2.0
+ }, {
+ 'type': 'text',
+ 'text': question
+ }]
+ }, {
+ 'role': 'assistant',
+ 'content': answer
+ }]
+
+ meta = dict(messages=messages)
+ return meta
diff --git a/videomind/dataset/wrappers/grounding.py b/videomind/dataset/wrappers/grounding.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f0908d0ddf8f0c66f8c9cd6efe208b53e58a050
--- /dev/null
+++ b/videomind/dataset/wrappers/grounding.py
@@ -0,0 +1,65 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import copy
+
+from torch.utils.data import Dataset
+
+from videomind.constants import GROUNDER_PROMPT, REG_TOKEN
+
+
+class GroundingDataset(Dataset):
+
+ def __init__(self, processor, model_args, data_args, training_args):
+ super(GroundingDataset, self).__init__()
+
+ raw_annos = self.load_annos()
+
+ annos = []
+ for anno in raw_annos:
+ num_words = len(anno['query'].split(' '))
+ if data_args.min_num_words >= 0 and num_words < data_args.min_num_words:
+ continue
+ if data_args.max_num_words >= 0 and num_words > data_args.max_num_words:
+ continue
+ if data_args.min_video_len >= 0 and anno.get('duration', float('inf')) < data_args.min_video_len:
+ continue
+ if data_args.max_video_len >= 0 and anno.get('duration', 0) > data_args.max_video_len:
+ continue
+ annos.append(anno)
+
+ self.annos = annos
+ self.raw_length = len(raw_annos)
+ self.processor = processor
+ self.model_args = model_args
+ self.data_args = data_args
+ self.training_args = training_args
+
+ def __len__(self):
+ return len(self.annos)
+
+ def __getitem__(self, idx):
+ anno = copy.deepcopy(self.annos[idx])
+
+ video_path, duration, query, span = anno['video_path'], anno['duration'], anno['query'], anno['span']
+
+ messages = [{
+ 'role':
+ 'user',
+ 'content': [{
+ 'type': 'video',
+ 'video': video_path,
+ 'min_pixels': 36 * 28 * 28,
+ 'max_pixels': 64 * 28 * 28,
+ 'max_frames': 150,
+ 'fps': 1.0
+ }, {
+ 'type': 'text',
+ 'text': GROUNDER_PROMPT.format(query)
+ }]
+ }, {
+ 'role': 'assistant',
+ 'content': f'The relevant moment happens in {REG_TOKEN}.'
+ }]
+
+ meta = dict(messages=messages, span=span, duration=duration)
+ return meta
diff --git a/videomind/dataset/wrappers/planning.py b/videomind/dataset/wrappers/planning.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b801a79e32b8b7555913935d159f83f6e9c1160
--- /dev/null
+++ b/videomind/dataset/wrappers/planning.py
@@ -0,0 +1,94 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import copy
+
+import nncore
+from torch.utils.data import Dataset
+
+from videomind.constants import PLANNER_PROMPT
+from videomind.dataset.hybrid import DATASETS
+
+
+class PlanningDataset(Dataset):
+
+ def __init__(self, processor, model_args, data_args, training_args):
+ super(PlanningDataset, self).__init__()
+
+ raw_annos = self.load_annos()
+
+ annos = []
+ for anno in raw_annos:
+ num_words = len(anno.get('question', '').split(' ')) + len(anno.get('query', '').split(' '))
+ if data_args.min_num_words >= 0 and num_words < data_args.min_num_words:
+ continue
+ if data_args.max_num_words >= 0 and num_words > data_args.max_num_words:
+ continue
+ if data_args.min_video_len >= 0 and anno.get('duration', float('inf')) < data_args.min_video_len:
+ continue
+ if data_args.max_video_len >= 0 and anno.get('duration', 0) > data_args.max_video_len:
+ continue
+ annos.append(anno)
+
+ self.annos = annos
+ self.raw_length = len(raw_annos)
+ self.processor = processor
+ self.model_args = model_args
+ self.data_args = data_args
+ self.training_args = training_args
+
+ def __len__(self):
+ return len(self.annos)
+
+ @classmethod
+ def load_annos(self, split='train'):
+ assert split == 'train'
+ annos = nncore.load(self.ANNO_PATH)
+ return annos
+
+ def __getitem__(self, idx):
+ anno = copy.deepcopy(self.annos[idx])
+
+ video_path, route, question, query = anno['video_path'], anno['route'], anno['question'], anno.get('query')
+
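+        # the route encodes which downstream roles the planner dispatches:
+        # routes 1/3 use the rephrased query, route 2 reuses the raw question, route 4 skips grounding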
+ if route == 1:
+ # rephrasing + grounding + answering
+ response = f'[{{"type": "grounder", "value": "{query}"}}, {{"type": "verifier"}}, {{"type": "answerer"}}]'
+ elif route == 2:
+ # grounding + answering
+ response = f'[{{"type": "grounder", "value": "{question}"}}, {{"type": "verifier"}}, {{"type": "answerer"}}]'
+ elif route == 3:
+ # rephrasing + grounding
+ response = f'[{{"type": "grounder", "value": "{query}"}}, {{"type": "verifier"}}]'
+ elif route == 4:
+ # answering
+ response = '[{"type": "answerer"}]'
+ else:
+ raise KeyError(f'unknown route type: {route}')
+
+ messages = [{
+ 'role':
+ 'user',
+ 'content': [{
+ 'type': 'video',
+ 'video': video_path,
+ 'min_pixels': 36 * 28 * 28,
+ 'max_pixels': 64 * 28 * 28,
+ 'max_frames': 100,
+ 'fps': 1.0
+ }, {
+ 'type': 'text',
+ 'text': PLANNER_PROMPT.format(question)
+ }]
+ }, {
+ 'role': 'assistant',
+ 'content': response
+ }]
+
+ meta = dict(messages=messages)
+ return meta
+
+
+@DATASETS.register(name='mixed_planning')
+class MixedPlanningDataset(PlanningDataset):
+
+ ANNO_PATH = 'data/planning/planning_nextqa_qvhighlights_gpt4o_mini.jsonl'
diff --git a/videomind/dataset/wrappers/verifying.py b/videomind/dataset/wrappers/verifying.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3279ead5ffece6c9c15932057b5065669c2fbd8
--- /dev/null
+++ b/videomind/dataset/wrappers/verifying.py
@@ -0,0 +1,180 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import copy
+
+import nncore
+import torch
+from nncore.ops import temporal_iou
+from torch.utils.data import Dataset
+
+from videomind.constants import VERIFIER_PROMPT
+from videomind.dataset.hybrid import DATASETS
+from videomind.utils.parser import parse_span
+
+
+class VerifyingDataset(Dataset):
+
+ def __init__(self, processor, model_args, data_args, training_args):
+ super(VerifyingDataset, self).__init__()
+
+ raw_annos = self.load_annos()
+
+ annos = []
+ for anno in raw_annos:
+ num_words = len(anno['query'].split(' '))
+ if data_args.min_num_words >= 0 and num_words < data_args.min_num_words:
+ continue
+ if data_args.max_num_words >= 0 and num_words > data_args.max_num_words:
+ continue
+ if data_args.min_video_len >= 0 and anno.get('duration', float('inf')) < data_args.min_video_len:
+ continue
+ if data_args.max_video_len >= 0 and anno.get('duration', 0) > data_args.max_video_len:
+ continue
+ annos.append(anno)
+
+ self.annos = annos
+ self.raw_length = len(raw_annos)
+ self.processor = processor
+ self.model_args = model_args
+ self.data_args = data_args
+ self.training_args = training_args
+
+ def __len__(self):
+ return len(self.annos)
+
+ @classmethod
+ def load_annos(self, split='train'):
+ assert split == 'train'
+
+ if nncore.is_dir(self.ANNO_PATH):
+ raw_paths = nncore.ls(self.ANNO_PATH, ext='json', join_path=True, sort=True)
+ raw_annos = nncore.flatten([nncore.load(p) for p in raw_paths])
+ else:
+ raw_annos = nncore.load(self.ANNO_PATH)
+
+ annos = []
+ for raw_anno in raw_annos:
+ # using top-5 predictions
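+            # a prediction is labeled positive when its best IoU with any ground-truth span reaches 0.5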
+ for pred in raw_anno['pred'][:5]:
+ iou = temporal_iou(torch.Tensor([pred]), torch.Tensor(raw_anno['span']))
+ iou = torch.where(iou.isfinite(), iou, 0)
+ iou = iou.max().item()
+
+ positive = iou >= 0.5
+
+ anno = dict(
+ source=self.SOURCE,
+ data_type='multimodal',
+ video_path=raw_anno['video_path'],
+ duration=raw_anno['duration'],
+ query=raw_anno['query'],
+ span=raw_anno['span'],
+ pred=pred,
+ positive=positive,
+ task=raw_anno.get('task', 'unknown'))
+
+ annos.append(anno)
+
+ pos_inds = [i for i, a in enumerate(annos) if a['positive']]
+ neg_inds = [i for i, a in enumerate(annos) if not a['positive']]
+
+ num_pos = len(pos_inds)
+ num_neg = len(neg_inds)
+
+ print(f'[{self.SOURCE}] pos: {num_pos} neg: {num_neg} n/p ratio: {num_neg / num_pos}')
+
+ # filter negative samples
+ # if num_neg > num_pos * 3:
+ # neg_inds = random.sample(neg_inds, int(num_pos * 3))
+
+ # inds = pos_inds + neg_inds
+ # random.shuffle(inds)
+ # inds = comm.broadcast(inds)
+
+ # annos = [annos[i] for i in inds]
+
+ return annos
+
+ def __getitem__(self, idx):
+ anno = copy.deepcopy(self.annos[idx])
+
+ video_path, duration, query, positive = anno['video_path'], anno['duration'], anno['query'], anno['positive']
+
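+        # pad the predicted span by 50% of its length on each side to form a context window,
+        # e.g. pred=[10, 20] with duration=60 -> (s0, e0)=(10, 20), (s1, e1)=(5, 25);
+        # (s, e) below are then the normalized boundaries of the prediction within that window, i.e. (0.25, 0.75)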
+ s0, e0 = parse_span(anno['pred'], duration, 2)
+ offset = (e0 - s0) / 2
+ s1, e1 = parse_span([s0 - offset, e0 + offset], duration)
+
+ # percentage of s0, e0 within s1, e1
+ s = (s0 - s1) / (e1 - s1)
+ e = (e0 - s1) / (e1 - s1)
+
+ messages = [{
+ 'role':
+ 'user',
+ 'content': [{
+ 'type': 'video',
+ 'video': video_path,
+ 'video_start': s1,
+ 'video_end': e1,
+ 'min_pixels': 36 * 28 * 28,
+ 'max_pixels': 64 * 28 * 28,
+ 'max_frames': 64,
+ 'fps': 2.0
+ }, {
+ 'type': 'text',
+ 'text': VERIFIER_PROMPT.format(query)
+ }]
+ }]
+
+ messages = messages + [{'role': 'assistant', 'content': 'Yes.' if positive else 'No.'}]
+ meta = dict(messages=messages, ss=s, se=e)
+
+ return meta
+
+
+@DATASETS.register(name='qvhighlights_verify_2b')
+class QVHighlightsVerify2BDataset(VerifyingDataset):
+
+ ANNO_PATH = 'data/verifying/verifying_qvhighlights_2b.json'
+
+ SOURCE = 'qvhighlights_verify_2b'
+
+
+@DATASETS.register(name='didemo_verify_2b')
+class DiDeMoVerify2BDataset(VerifyingDataset):
+
+ ANNO_PATH = 'data/verifying/verifying_didemo_2b.json'
+
+ SOURCE = 'didemo_verify_2b'
+
+
+@DATASETS.register(name='tacos_verify_2b')
+class TACoSVerify2BDataset(VerifyingDataset):
+
+ ANNO_PATH = 'data/verifying/verifying_tacos_2b.json'
+
+ SOURCE = 'tacos_verify_2b'
+
+
+@DATASETS.register(name='qvhighlights_verify_7b')
+class QVHighlightsVerify7BDataset(VerifyingDataset):
+
+ ANNO_PATH = 'data/verifying/verifying_qvhighlights_7b.json'
+
+ SOURCE = 'qvhighlights_verify_7b'
+
+
+@DATASETS.register(name='didemo_verify_7b')
+class DiDeMoVerify7BDataset(VerifyingDataset):
+
+ ANNO_PATH = 'data/verifying/verifying_didemo_7b.json'
+
+ SOURCE = 'didemo_verify_7b'
+
+
+@DATASETS.register(name='tacos_verify_7b')
+class TACoSVerify7BDataset(VerifyingDataset):
+
+ ANNO_PATH = 'data/verifying/verifying_tacos_7b.json'
+
+ SOURCE = 'tacos_verify_7b'
diff --git a/videomind/eval/eval_auto.py b/videomind/eval/eval_auto.py
new file mode 100644
index 0000000000000000000000000000000000000000..e840b672d3f1dc5888e5091b154647d93f35fd1a
--- /dev/null
+++ b/videomind/eval/eval_auto.py
@@ -0,0 +1,289 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import argparse
+
+import nncore
+import torch
+from nncore.ops import temporal_area, temporal_intersection, temporal_iof, temporal_iou
+from tabulate import tabulate
+
+
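+# an int subclass whose division returns 0 instead of raising ZeroDivisionError,
+# so metric tables with empty buckets can still be formatted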
+class SafeInt(int):
+
+ def __truediv__(self, other):
+ try:
+ return SafeInt(super().__truediv__(other))
+ except ZeroDivisionError:
+ return SafeInt(0)
+
+
+def check_ans(options, ans, response):
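+    # compare the ground-truth letter with the leading option letter parsed from the response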
+ a = ans.lower()
+ b = response.lower().split(' ')[0].replace('(', '').replace(')', '').replace('.', '')
+    if len(b) == 0:
+        nncore.log(f'ERROR: empty response: {response}')
+        return
+    if len(b) != 1:
+        b = b[0]
+        nncore.log(f'WARNING: {response} -> {b}')
+ if b not in [chr(ord('a') + i) for i in range(len(options))]:
+ nncore.log(f'ERROR: {response} -> {b}')
+ return
+ return a == b
+
+
+def compute_iou(pred, span, conf, cgbench_mode, conf_thr):
+ pred_tensor = torch.Tensor(pred)
+ span_tensor = torch.Tensor(span)
+
+ if cgbench_mode:
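+        # CG-Bench computes a single IoU between the union of kept predictions (top-1 plus any
+        # above the confidence threshold) and the union of ground-truth spans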
+ if conf_thr > 0:
+ conf_tensor = torch.Tensor(conf)
+ keep = torch.cat((torch.LongTensor([0]), torch.where(conf_tensor > conf_thr)[0])).unique()
+ pred_tensor = pred_tensor[keep]
+ else:
+ pred_tensor = pred_tensor[:1]
+ pred_area = temporal_area(pred_tensor).sum()
+ span_area = temporal_area(span_tensor).sum()
+ inter = temporal_intersection(pred_tensor, span_tensor).sum()
+ iou = (inter / (pred_area + span_area - inter)).unsqueeze(0)
+ assert iou.numel() == 1
+ else:
+ iou = temporal_iou(pred_tensor, span_tensor)
+
+ iou = torch.where(iou.isfinite(), iou, 0)
+ return iou
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('pred_path')
+ parser.add_argument('--dataset')
+ parser.add_argument('--out_name', default='metrics.log')
+ parser.add_argument('--conf_thr', type=float, default=-1)
+ args = parser.parse_args()
+ return args
+
+
+if __name__ == '__main__':
+ args = parse_args()
+
+ assert nncore.is_dir(args.pred_path)
+
+ log_file = nncore.join(args.pred_path, args.out_name)
+ nncore.set_default_logger(logger='eval', fmt=None, log_file=log_file)
+
+ if args.dataset is not None:
+ cgbench_mode = args.dataset == 'cgbench'
+ nncore.log(f'CG-Bench mode: {cgbench_mode}')
+ else:
+ cgbench_mode = False
+ nncore.log('Dataset is unknown, using default mode', log_level='WARNING')
+
+ pred_paths = nncore.ls(args.pred_path, ext=['json', 'jsonl'], join_path=True)
+ nncore.log(f'Total number of files: {len(pred_paths)}')
+
+ if cgbench_mode:
+ top_k = [1]
+ thres = [0.1, 0.2, 0.3, 0.4, 0.5]
+ else:
+ top_k = [1, 3, 5]
+ thres = [0.3, 0.5, 0.7]
+
+ tab_iou, tab_iop, tab_ans = dict(), dict(), dict()
+ iou_raise, iou_lower, iop_raise, iop_lower = SafeInt(0), SafeInt(0), SafeInt(0), SafeInt(0)
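+    # table layout: [0] = #samples, [1] = #failed, [2:-1] = hit counts per (top-k, threshold) pair,
+    # [-1] = accumulated top-1 IoU/IoP (turned into mIoU/mIoP when printing); for the QA tables:
+    # [0] = #samples, [1] = #unparsable, [2] = #correct, [3]/[4] = #correct with IoU/IoP >= 0.5,
+    # [5:] = #correct at each IoU threshold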
+ tab_iou_all = [SafeInt(0) for _ in range(len(top_k) * len(thres) + 3)]
+ tab_iop_all = [SafeInt(0) for _ in range(len(top_k) * len(thres) + 3)]
+ tab_ans_all = [SafeInt(0) for _ in range(len(thres) + 5)]
+
+ for path in pred_paths:
+ data = nncore.load(path)
+
+ for sample in data:
+ task = sample.get('task', 'unknown')
+
+ # samples in lvbench might have multiple tasks
+ if isinstance(task, str):
+ task = [task]
+
+ for t in task:
+ if t not in tab_iou:
+ tab_iou[t] = [SafeInt(0) for _ in range(len(top_k) * len(thres) + 3)]
+
+ if t not in tab_iop:
+ tab_iop[t] = [SafeInt(0) for _ in range(len(top_k) * len(thres) + 3)]
+
+ if t not in tab_ans:
+ tab_ans[t] = [SafeInt(0) for _ in range(len(thres) + 5)]
+
+ iou_hit = [False for _ in range(len(thres) + 1)]
+ iop_hit = False
+
+ if 'pred' in sample and 'conf' in sample and 'span' in sample:
+ for t in task:
+ tab_iou[t][0] += 1
+ tab_iop[t][0] += 1
+ tab_iou_all[0] += 1
+ tab_iop_all[0] += 1
+
+ iou = compute_iou(sample['pred'], sample['span'], sample['conf'], cgbench_mode, args.conf_thr)
+ top = iou[0].max().item()
+
+ for t in task:
+ tab_iou[t][-1] += top
+ tab_iou_all[-1] += top
+
+ for i, k in enumerate(top_k):
+ for j, h in enumerate(thres):
+ if iou[:k].max() >= h:
+ for t in task:
+ tab_iou[t][i * len(thres) + j + 2] += 1
+ tab_iou_all[i * len(thres) + j + 2] += 1
+ if k == 1:
+ iou_hit[j + 1] = True
+ if h == 0.5:
+ iou_hit[0] = True
+
+ if sample.get('pred_ori') is not None:
+ iou = compute_iou(sample['pred_ori'], sample['span'], sample['conf_ori'], cgbench_mode,
+ args.conf_thr)
+ iou = iou[0].max().item()
+
+ if iou < top:
+ iou_raise += 1
+ if iou > top:
+ iou_lower += 1
+
+ iop = temporal_iof(torch.Tensor(sample['pred']), torch.Tensor(sample['span']))
+ iop = torch.where(iop.isfinite(), iop, 0)
+ top = iop[0].max().item()
+
+ for t in task:
+ tab_iop[t][-1] += top
+ tab_iop_all[-1] += top
+
+ for i, k in enumerate(top_k):
+ for j, h in enumerate(thres):
+ if iop[:k].max() >= h:
+ for t in task:
+ tab_iop[t][i * len(thres) + j + 2] += 1
+ tab_iop_all[i * len(thres) + j + 2] += 1
+ if k == 1 and h == 0.5:
+ iop_hit = True
+
+ if sample.get('pred_ori') is not None:
+ iop = temporal_iof(torch.Tensor(sample['pred_ori']), torch.Tensor(sample['span']))
+ iop = torch.where(iop.isfinite(), iop, 0)
+ iop = iop[0].max().item()
+
+ if iop < top:
+ iop_raise += 1
+ if iop > top:
+ iop_lower += 1
+
+ if not sample.get('grounder_success', True):
+ for t in task:
+ tab_iou[t][1] += 1
+ tab_iop[t][1] += 1
+ tab_iou_all[1] += 1
+ tab_iop_all[1] += 1
+
+ if 'question' in sample and 'response' in sample:
+ for t in task:
+ tab_ans[t][0] += 1
+ tab_ans_all[0] += 1
+
+ correct = check_ans(sample['options'], sample['ans'], sample['response'])
+
+ if correct:
+ for t in task:
+ tab_ans[t][2] += 1
+ tab_ans_all[2] += 1
+ if iou_hit[0]:
+ for t in task:
+ tab_ans[t][3] += 1
+ tab_ans_all[3] += 1
+ if iop_hit:
+ for t in task:
+ tab_ans[t][4] += 1
+ tab_ans_all[4] += 1
+ for i in range(1, len(iou_hit)):
+ if iou_hit[i]:
+ for t in task:
+ tab_ans[t][i + 4] += 1
+ tab_ans_all[i + 4] += 1
+ elif correct is None:
+ for t in task:
+ tab_ans[t][1] += 1
+ tab_ans_all[1] += 1
+
+ tasks = sorted(list(set(list(tab_iou.keys()) + list(tab_iop.keys()) + list(tab_ans.keys()))))
+
+ if cgbench_mode:
+ nncore.log('\nGrounding (IoU):')
+ tab = tabulate(
+ [[task, tab_iou[task][0], tab_iou[task][1]] +
+ [f'{tab_iou[task][i] / tab_iou[task][0] * 100:.2f}' for i in range(2, len(tab_iou[task]))] +
+ [f'{sum(tab_iou[task][i] / tab_iou[task][0] for i in range(2, 2 + len(thres))) / len(thres) * 100:.2f}']
+ for task in tasks if task in tab_iou] +
+ [['all', tab_iou_all[0], tab_iou_all[1]] +
+ [f'{tab_iou_all[i] / tab_iou_all[0] * 100:.2f}' for i in range(2, len(tab_iou_all))] +
+ [f'{sum(tab_iou_all[i] / tab_iou_all[0] for i in range(2, 2 + len(thres))) / len(thres) * 100:.2f}']],
+ headers=['Task', '#Samples', 'Failed'] + [f'R{k}@{t}' for k in top_k for t in thres] + ['mIoU', 'rec.@IoU'],
+ tablefmt='pretty',
+ stralign='left')
+ nncore.log(tab)
+
+ nncore.log(f'\nIoU Raise ({tab_iou_all[0]} Samples): {iou_raise} ({iou_raise / tab_iou_all[0] * 100:.2f}%)')
+ nncore.log(f'IoU Lower ({tab_iou_all[0]} Samples): {iou_lower} ({iou_lower / tab_iou_all[0] * 100:.2f}%)')
+
+ nncore.log('\nQA:')
+ tab = tabulate(
+ [[task, tab_ans[task][0], tab_ans[task][1], f'{tab_ans[task][2] / tab_ans[task][0] * 100:.2f}'] +
+ [f'{sum(tab_ans[task][i] / tab_ans[task][0] for i in range(5, 5 + len(thres))) / len(thres) * 100:.2f}']
+ for task in tasks if task in tab_ans] +
+ [['all', tab_ans_all[0], tab_ans_all[1], f'{tab_ans_all[2] / tab_ans_all[0] * 100:.2f}'] +
+ [f'{sum(tab_ans_all[i] / tab_ans_all[0] for i in range(5, 5 + len(thres))) / len(thres) * 100:.2f}']],
+ headers=['Task', '#Samples', 'Failed', 'long-acc.', 'acc.@IoU'],
+ tablefmt='pretty',
+ stralign='left')
+ nncore.log(tab)
+ else:
+ nncore.log('\nGrounding (IoU):')
+ tab = tabulate(
+ [[task, tab_iou[task][0], tab_iou[task][1]] +
+ [f'{tab_iou[task][i] / tab_iou[task][0] * 100:.2f}' for i in range(2, len(tab_iou[task]))]
+ for task in tasks if task in tab_iou] +
+ [['all', tab_iou_all[0], tab_iou_all[1]] +
+ [f'{tab_iou_all[i] / tab_iou_all[0] * 100:.2f}' for i in range(2, len(tab_iou_all))]],
+ headers=['Task', '#Samples', 'Failed'] + [f'R{k}@{t}' for k in top_k for t in thres] + ['mIoU'],
+ tablefmt='pretty',
+ stralign='left')
+ nncore.log(tab)
+
+ nncore.log(f'\nIoU Raise ({tab_iou_all[0]} Samples): {iou_raise} ({iou_raise / tab_iou_all[0] * 100:.2f}%)')
+ nncore.log(f'IoU Lower ({tab_iou_all[0]} Samples): {iou_lower} ({iou_lower / tab_iou_all[0] * 100:.2f}%)')
+
+ nncore.log('\nGrounding (IoP):')
+ tab = tabulate(
+ [[task, tab_iop[task][0], tab_iop[task][1]] +
+ [f'{tab_iop[task][i] / tab_iop[task][0] * 100:.2f}' for i in range(2, len(tab_iop[task]))]
+ for task in tasks if task in tab_iop] +
+ [['all', tab_iop_all[0], tab_iop_all[1]] +
+ [f'{tab_iop_all[i] / tab_iop_all[0] * 100:.2f}' for i in range(2, len(tab_iop_all))]],
+ headers=['Task', '#Samples', 'Failed'] + [f'R{k}@{t}' for k in top_k for t in thres] + ['mIoP'],
+ tablefmt='pretty',
+ stralign='left')
+ nncore.log(tab)
+
+ nncore.log(f'\nIoP Raise ({tab_iop_all[0]} Samples): {iop_raise} ({iop_raise / tab_iop_all[0] * 100:.2f}%)')
+ nncore.log(f'IoP Lower ({tab_iop_all[0]} Samples): {iop_lower} ({iop_lower / tab_iop_all[0] * 100:.2f}%)')
+
+ nncore.log('\nQA:')
+ tab = tabulate(
+ [[task, tab_ans[task][0], tab_ans[task][1]] +
+ [f'{tab_ans[task][i] / tab_ans[task][0] * 100:.2f}' for i in range(2, 5)]
+ for task in tasks if task in tab_ans] +
+ [['all', tab_ans_all[0], tab_ans_all[1]] +
+ [f'{tab_ans_all[i] / tab_ans_all[0] * 100:.2f}' for i in range(2, 5)]],
+ headers=['Task', '#Samples', 'Failed', 'Acc', 'Acc (IoU >= 0.5)', 'Acc (IoP >= 0.5)'],
+ tablefmt='pretty',
+ stralign='left')
+ nncore.log(tab)
diff --git a/videomind/eval/eval_qvhighlights.py b/videomind/eval/eval_qvhighlights.py
new file mode 100644
index 0000000000000000000000000000000000000000..34ed2128660f2765e78e3c4e99c9beb43af2d8dc
--- /dev/null
+++ b/videomind/eval/eval_qvhighlights.py
@@ -0,0 +1,413 @@
+# Modified from https://github.com/showlab/UniVTG/blob/main/eval/eval.py
+
+import argparse
+import copy
+from collections import OrderedDict, defaultdict
+from functools import partial
+
+import nncore
+import numpy as np
+
+from sklearn.metrics import precision_recall_curve
+
+
+def compute_temporal_iou_batch_paired(a, b):
+ intersection = np.maximum(0, np.minimum(a[:, 1], b[:, 1]) - np.maximum(a[:, 0], b[:, 0]))
+ union = np.maximum(a[:, 1], b[:, 1]) - np.minimum(a[:, 0], b[:, 0])
+ return np.divide(intersection, union, out=np.zeros_like(intersection), where=union != 0)
+
+
+def compute_temporal_iou_batch_cross(spans1, spans2):
+ areas1 = spans1[:, 1] - spans1[:, 0]
+ areas2 = spans2[:, 1] - spans2[:, 0]
+ l = np.maximum(spans1[:, None, 0], spans2[None, :, 0])
+ r = np.minimum(spans1[:, None, 1], spans2[None, :, 1])
+ inter = np.clip(r - l, 0, None)
+ union = areas1[:, None] + areas2[None, :] - inter
+ iou = inter / union
+ return iou, union
+
+
+def interpolated_precision_recall(prc, rec):
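+    # VOC-style interpolated AP: make the precision envelope non-increasing from right to left,
+    # then sum precision over the recall increments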
+ mprc = np.hstack([[0], prc, [0]])
+ mrec = np.hstack([[0], rec, [1]])
+ for i in range(len(mprc) - 1)[::-1]:
+ mprc[i] = max(mprc[i], mprc[i + 1])
+ idx = np.where(mrec[1::] != mrec[0:-1])[0] + 1
+ ap = np.sum((mrec[idx] - mrec[idx - 1]) * mprc[idx])
+ return ap
+
+
+def compute_average_precision_detection(annos, prediction, tiou_thresholds=np.linspace(0.5, 0.95, 10)):
+ num_thresholds = len(tiou_thresholds)
+ num_gts = len(annos)
+ num_preds = len(prediction)
+ ap = np.zeros(num_thresholds)
+ if len(prediction) == 0:
+ return ap
+
+ num_positive = float(num_gts)
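+    # lock_gt[t, g] stores the index of the prediction already matched to ground truth g at
+    # threshold t (-1 means unmatched), so each ground truth yields at most one true positive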
+ lock_gt = np.ones((num_thresholds, num_gts)) * -1
+ prediction.sort(key=lambda x: -x['score'])
+ tp = np.zeros((num_thresholds, num_preds))
+ fp = np.zeros((num_thresholds, num_preds))
+
+ ground_truth_by_videoid = dict()
+ for i, item in enumerate(annos):
+ item['index'] = i
+ ground_truth_by_videoid.setdefault(item['video-id'], []).append(item)
+
+ for idx, pred in enumerate(prediction):
+ if pred['video-id'] in ground_truth_by_videoid:
+ gts = ground_truth_by_videoid[pred['video-id']]
+ else:
+ fp[:, idx] = 1
+ continue
+
+ _pred = np.array([[pred['t-start'], pred['t-end']]])
+ _gt = np.array([[gt['t-start'], gt['t-end']] for gt in gts])
+ tiou_arr = compute_temporal_iou_batch_cross(_pred, _gt)[0]
+
+ tiou_arr = tiou_arr.reshape(-1)
+ tiou_sorted_idx = tiou_arr.argsort()[::-1]
+ for t_idx, tiou_threshold in enumerate(tiou_thresholds):
+ for j_idx in tiou_sorted_idx:
+ if tiou_arr[j_idx] < tiou_threshold:
+ fp[t_idx, idx] = 1
+ break
+ if lock_gt[t_idx, gts[j_idx]['index']] >= 0:
+ continue
+ tp[t_idx, idx] = 1
+ lock_gt[t_idx, gts[j_idx]['index']] = idx
+ break
+
+ if fp[t_idx, idx] == 0 and tp[t_idx, idx] == 0:
+ fp[t_idx, idx] = 1
+
+ tp_cumsum = np.cumsum(tp, axis=1).astype(float)
+ fp_cumsum = np.cumsum(fp, axis=1).astype(float)
+ recall_cumsum = tp_cumsum / num_positive
+
+ precision_cumsum = tp_cumsum / (tp_cumsum + fp_cumsum)
+
+ for t_idx in range(len(tiou_thresholds)):
+ ap[t_idx] = interpolated_precision_recall(precision_cumsum[t_idx, :], recall_cumsum[t_idx, :])
+
+ return ap
+
+
+def get_ap(y_true, y_pred, interpolate=True, point_11=False):
+ assert len(y_true) == len(y_pred), 'Prediction and ground truth need to be of the same length'
+ if len(set(y_true)) == 1:
+ if y_true[0] == 0:
+ return 0
+ else:
+ return 1
+ else:
+ assert sorted(set(y_true)) == [0, 1], 'Ground truth can only contain elements {0,1}'
+
+ precision, recall, _ = precision_recall_curve(y_true, y_pred)
+ recall = recall.astype(np.float32)
+
+ if interpolate:
+ for i in range(1, len(precision)):
+ precision[i] = max(precision[i - 1], precision[i])
+
+ if point_11:
+ precision_11 = [precision[np.where(recall >= t)[0][-1]] for t in np.arange(0, 1.01, 0.1)]
+ return np.mean(precision_11)
+ else:
+ indices = np.where(np.diff(recall))
+ return np.mean(precision[indices])
+
+
+def compute_average_precision_detection_wrapper(input_triple, tiou_thresholds=np.linspace(0.5, 0.95, 10)):
+ qid, annos, prediction = input_triple
+ scores = compute_average_precision_detection(annos, prediction, tiou_thresholds=tiou_thresholds)
+ return qid, scores
+
+
+def compute_mr_ap(preds, annos, iou_thds=np.linspace(0.5, 0.95, 10), max_gt_windows=None, max_pred_windows=10):
+ iou_thds = [float(f'{e:.2f}') for e in iou_thds]
+ pred_qid2data = defaultdict(list)
+ for d in preds:
+ pred_windows = d['pred_relevant_windows'][:max_pred_windows] \
+ if max_pred_windows is not None else d['pred_relevant_windows']
+ qid = d['qid']
+ for w in pred_windows:
+ pred_qid2data[qid].append({'video-id': d['qid'], 't-start': w[0], 't-end': w[1], 'score': w[2]})
+
+ gt_qid2data = defaultdict(list)
+ for d in annos:
+ gt_windows = d['relevant_windows'][:max_gt_windows] \
+ if max_gt_windows is not None else d['relevant_windows']
+ qid = d['qid']
+ for w in gt_windows:
+ gt_qid2data[qid].append({'video-id': d['qid'], 't-start': w[0], 't-end': w[1]})
+ qid2ap_list = dict()
+ data_triples = [[qid, gt_qid2data[qid], pred_qid2data[qid]] for qid in pred_qid2data]
+    compute_ap_from_triple = partial(compute_average_precision_detection_wrapper, tiou_thresholds=iou_thds)
+
+ for data_triple in data_triples:
+ qid, scores = compute_ap_from_triple(data_triple)
+ qid2ap_list[qid] = scores
+
+ ap_array = np.array(list(qid2ap_list.values()))
+ ap_thds = ap_array.mean(0)
+ iou_thd2ap = dict(zip([str(e) for e in iou_thds], ap_thds))
+ iou_thd2ap['average'] = np.mean(ap_thds)
+
+ iou_thd2ap = {k: float(f'{100 * v:.2f}') for k, v in iou_thd2ap.items()}
+ return iou_thd2ap
+
+
+def compute_mr_r1(preds, annos, iou_thds=np.linspace(0.3, 0.95, 14)):
+ iou_thds = [float(f'{e:.2f}') for e in iou_thds]
+ pred_qid2window = {d['qid']: d['pred_relevant_windows'][0][:2] for d in preds}
+ gt_qid2window = dict()
+ for d in annos:
+ cur_gt_windows = d['relevant_windows']
+ cur_qid = d['qid']
+ cur_max_iou_idx = 0
+ if len(cur_gt_windows) > 0:
+ cur_ious = compute_temporal_iou_batch_cross(
+ np.array([pred_qid2window[cur_qid]]), np.array(d['relevant_windows']))[0]
+ cur_max_iou_idx = np.argmax(cur_ious)
+ gt_qid2window[cur_qid] = cur_gt_windows[cur_max_iou_idx]
+
+ qids = list(pred_qid2window.keys())
+ pred_windows = np.array([pred_qid2window[k] for k in qids]).astype(float)
+ gt_windows = np.array([gt_qid2window[k] for k in qids]).astype(float)
+ pred_gt_iou = compute_temporal_iou_batch_paired(pred_windows, gt_windows)
+ iou_thd2recall_at_one = dict()
+ miou_at_one = float(f'{np.mean(pred_gt_iou) * 100:.2f}')
+ for thd in iou_thds:
+ iou_thd2recall_at_one[str(thd)] = float(f'{np.mean(pred_gt_iou >= thd) * 100:.2f}')
+ return iou_thd2recall_at_one, miou_at_one
+
+
+def compute_mr_r5(preds, annos, iou_thds=np.linspace(0.3, 0.95, 14)):
+ iou_thds = [float(f'{e:.2f}') for e in iou_thds]
+ pred_qid2window = {d['qid']: [x[:2] for x in d['pred_relevant_windows'][:5]] for d in preds}
+ gt_qid2window = dict()
+ pred_optimal_qid2window = dict()
+ for d in annos:
+ cur_gt_windows = d['relevant_windows']
+ cur_qid = d['qid']
+ cur_max_iou_pred = 0
+ cur_max_iou_gt = 0
+ if len(cur_gt_windows) > 0:
+ cur_ious = compute_temporal_iou_batch_cross(
+ np.array(pred_qid2window[cur_qid]), np.array(d['relevant_windows']))[0]
+ cur_ious[np.isnan(cur_ious)] = 0
+ cur_max_iou_pred, cur_max_iou_gt = np.where(cur_ious == np.max(cur_ious))
+ cur_max_iou_pred, cur_max_iou_gt = cur_max_iou_pred[0], cur_max_iou_gt[0]
+ pred_optimal_qid2window[cur_qid] = pred_qid2window[cur_qid][cur_max_iou_pred]
+ gt_qid2window[cur_qid] = cur_gt_windows[cur_max_iou_gt]
+
+ qids = list(pred_qid2window.keys())
+ pred_windows = np.array([pred_optimal_qid2window[k] for k in qids]).astype(float)
+ gt_windows = np.array([gt_qid2window[k] for k in qids]).astype(float)
+ pred_gt_iou = compute_temporal_iou_batch_paired(pred_windows, gt_windows)
+ iou_thd2recall_at_one = dict()
+ for thd in iou_thds:
+ iou_thd2recall_at_one[str(thd)] = float(f'{np.mean(pred_gt_iou >= thd) * 100:.2f}')
+ return iou_thd2recall_at_one
+
+
+def get_data_by_range(preds, annos, len_range):
+ min_l, max_l = len_range
+ if min_l == 0 and max_l == float('inf'):
+ return preds, annos
+
+ ground_truth_in_range = []
+ gt_qids_in_range = set()
+ for d in annos:
+ rel_windows_in_range = [w for w in d['relevant_windows'] if min_l < (w[1] - w[0]) <= max_l]
+ if len(rel_windows_in_range) > 0:
+ d = copy.deepcopy(d)
+ d['relevant_windows'] = rel_windows_in_range
+ ground_truth_in_range.append(d)
+ gt_qids_in_range.add(d['qid'])
+
+ submission_in_range = []
+ for d in preds:
+ if d['qid'] in gt_qids_in_range:
+ submission_in_range.append(copy.deepcopy(d))
+
+ if submission_in_range == ground_truth_in_range == []:
+ return preds, annos
+
+ return submission_in_range, ground_truth_in_range
+
+
+def eval_moment_retrieval(preds, annos):
+ length_ranges = [[0, 10], [10, 30], [30, float('inf')], [0, float('inf')]]
+ range_names = ['short', 'middle', 'long', 'full']
+
+ ret_metrics = dict()
+ for l_range, name in zip(length_ranges, range_names):
+ _submission, _ground_truth = get_data_by_range(preds, annos, l_range)
+ print(f'{name}: {l_range}, {len(_ground_truth)}/{len(annos)}={100*len(_ground_truth)/len(annos):.2f} samples')
+ iou_thd2average_precision = compute_mr_ap(_submission, _ground_truth)
+ iou_thd2recall_at_one, miou_at_one = compute_mr_r1(_submission, _ground_truth)
+ iou_thd2recall_at_five = compute_mr_r5(_submission, _ground_truth)
+ ret_metrics[name] = {
+ 'MR-mIoU': miou_at_one,
+ 'MR-mAP': iou_thd2average_precision,
+ 'MR-R1': iou_thd2recall_at_one,
+ 'MR-R5': iou_thd2recall_at_five
+ }
+
+ return ret_metrics
+
+
+def compute_hl_hit1(qid2preds, qid2gt_scores_binary):
+ qid2max_scored_clip_idx = {k: np.argmax(v['pred_saliency_scores']) for k, v in qid2preds.items()}
+ hit_scores = np.zeros((len(qid2preds), 3))
+ qids = list(qid2preds.keys())
+ for idx, qid in enumerate(qids):
+ pred_clip_idx = qid2max_scored_clip_idx[qid]
+ gt_scores_binary = qid2gt_scores_binary[qid]
+ if pred_clip_idx < len(gt_scores_binary):
+ hit_scores[idx] = gt_scores_binary[pred_clip_idx]
+ hit_at_one = float(f'{100 * np.mean(np.max(hit_scores, 1)):.2f}')
+ return hit_at_one
+
+
+def compute_hl_ap(qid2preds, qid2gt_scores_binary):
+ qid2pred_scores = {k: v['pred_saliency_scores'] for k, v in qid2preds.items()}
+ ap_scores = np.zeros((len(qid2preds), 3))
+ qids = list(qid2preds.keys())
+ input_tuples = []
+ for idx, qid in enumerate(qids):
+ for w_idx in range(3):
+ y_true = qid2gt_scores_binary[qid][:, w_idx]
+ y_pred = np.array(qid2pred_scores[qid])
+ input_tuples.append((idx, w_idx, y_true, y_pred))
+
+ for input_tuple in input_tuples:
+ idx, w_idx, score = compute_ap_from_tuple(input_tuple)
+ ap_scores[idx, w_idx] = score
+
+ mean_ap = float(f'{100 * np.mean(ap_scores):.2f}')
+ return mean_ap
+
+
+def compute_ap_from_tuple(input_tuple):
+ idx, w_idx, y_true, y_pred = input_tuple
+ if len(y_true) < len(y_pred):
+ y_pred = y_pred[:len(y_true)]
+ elif len(y_true) > len(y_pred):
+ _y_predict = np.zeros(len(y_true))
+ _y_predict[:len(y_pred)] = y_pred
+ y_pred = _y_predict
+
+ score = get_ap(y_true, y_pred)
+ return idx, w_idx, score
+
+
+def mk_gt_scores(gt_data, clip_length=2):
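+    # each 2s clip carries 3 saliency ratings; clips outside relevant_clip_ids keep a score of 0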
+ num_clips = int(gt_data['duration'] / clip_length)
+ saliency_scores_full_video = np.zeros((num_clips, 3))
+ relevant_clip_ids = np.array(gt_data['relevant_clip_ids'])
+ saliency_scores_relevant_clips = np.array(gt_data['saliency_scores'])
+ saliency_scores_full_video[relevant_clip_ids] = saliency_scores_relevant_clips
+ return saliency_scores_full_video
+
+
+def eval_highlight(preds, annos):
+ qid2preds = {d['qid']: d for d in preds}
+ qid2gt_scores_full_range = {d['qid']: mk_gt_scores(d) for d in annos}
+ gt_saliency_score_min_list = [2, 3, 4]
+ saliency_score_names = ['Fair', 'Good', 'VeryGood']
+ highlight_det_metrics = dict()
+ for gt_saliency_score_min, score_name in zip(gt_saliency_score_min_list, saliency_score_names):
+ qid2gt_scores_binary = {
+ k: (v >= gt_saliency_score_min).astype(float)
+ for k, v in qid2gt_scores_full_range.items()
+ }
+ hit_at_one = compute_hl_hit1(qid2preds, qid2gt_scores_binary)
+ mean_ap = compute_hl_ap(qid2preds, qid2gt_scores_binary)
+ highlight_det_metrics[f'HL-min-{score_name}'] = {'HL-mAP': mean_ap, 'HL-Hit1': hit_at_one}
+ return highlight_det_metrics
+
+
+def qvhighlights_eval(preds, annos):
+ pred_qids = set([e['qid'] for e in preds])
+ gt_qids = set([e['qid'] for e in annos])
+ assert pred_qids == gt_qids, 'qids in annos and preds must match'
+
+ eval_metrics = dict()
+ eval_metrics_brief = OrderedDict()
+ if 'pred_relevant_windows' in preds[0]:
+ moment_ret_scores = eval_moment_retrieval(preds, annos)
+ eval_metrics.update(moment_ret_scores)
+ moment_ret_scores_brief = {
+ 'MR-full-mAP': moment_ret_scores['full']['MR-mAP']['average'],
+ 'MR-full-mAP@0.5': moment_ret_scores['full']['MR-mAP']['0.5'],
+ 'MR-full-mAP@0.75': moment_ret_scores['full']['MR-mAP']['0.75'],
+ 'MR-short-mAP': moment_ret_scores['short']['MR-mAP']['average'],
+ 'MR-middle-mAP': moment_ret_scores['middle']['MR-mAP']['average'],
+ 'MR-long-mAP': moment_ret_scores['long']['MR-mAP']['average'],
+ 'MR-short-mIoU': moment_ret_scores['short']['MR-mIoU'],
+ 'MR-middle-mIoU': moment_ret_scores['middle']['MR-mIoU'],
+ 'MR-long-mIoU': moment_ret_scores['long']['MR-mIoU'],
+ 'MR-full-mIoU': moment_ret_scores['full']['MR-mIoU'],
+ 'MR-full-R1@0.3': moment_ret_scores['full']['MR-R1']['0.3'],
+ 'MR-full-R1@0.5': moment_ret_scores['full']['MR-R1']['0.5'],
+ 'MR-full-R1@0.7': moment_ret_scores['full']['MR-R1']['0.7'],
+ 'MR-full-R5@0.3': moment_ret_scores['full']['MR-R5']['0.3'],
+ 'MR-full-R5@0.5': moment_ret_scores['full']['MR-R5']['0.5'],
+ 'MR-full-R5@0.7': moment_ret_scores['full']['MR-R5']['0.7']
+ }
+ eval_metrics_brief.update(sorted([(k, v) for k, v in moment_ret_scores_brief.items()], key=lambda x: x[0]))
+
+ if ('pred_saliency_scores' in preds[0]) and ('saliency_scores' in annos[0]):
+ if isinstance(annos[0]['saliency_scores'], list):
+ highlight_det_scores = eval_highlight(preds, annos)
+ eval_metrics.update(highlight_det_scores)
+ highlight_det_scores_brief = dict([(f"{k}-{sub_k.split('-')[1]}", v[sub_k])
+ for k, v in highlight_det_scores.items() for sub_k in v])
+ eval_metrics_brief.update(highlight_det_scores_brief)
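+            # re-insert the VeryGood entries so they appear last in the ordered brief metrics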
+ eval_metrics_brief['HL-min-VeryGood-mAP'] = eval_metrics_brief.pop('HL-min-VeryGood-mAP')
+ eval_metrics_brief['HL-min-VeryGood-Hit1'] = eval_metrics_brief.pop('HL-min-VeryGood-Hit1')
+
+ final_eval_metrics = OrderedDict()
+ final_eval_metrics['brief'] = eval_metrics_brief
+ final_eval_metrics.update(sorted([(k, v) for k, v in eval_metrics.items()], key=lambda x: x[0]))
+ return final_eval_metrics
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('pred_path')
+ parser.add_argument('--anno_path', default='data/qvhighlights/highlight_val_release.jsonl')
+ parser.add_argument('--out_name', default='metrics.log')
+ args = parser.parse_args()
+ return args
+
+
+if __name__ == '__main__':
+ args = parse_args()
+
+ if nncore.is_dir(args.pred_path):
+ log_file = nncore.join(args.pred_path, args.out_name)
+ else:
+ log_file = nncore.same_dir(args.pred_path, args.out_name)
+
+ nncore.set_default_logger(logger='eval', fmt=None, log_file=log_file)
+
+ if nncore.is_dir(args.pred_path):
+ pred_paths = nncore.ls(args.pred_path, ext=['json', 'jsonl'], join_path=True, sort=True)
+ nncore.log(f'Total number of files: {len(pred_paths)}\n')
+ preds = nncore.flatten([nncore.load(p) for p in pred_paths])
+ else:
+ nncore.log(f'Loading predictions from {args.pred_path}')
+ preds = nncore.load(args.pred_path)
+
+ annos = nncore.load(args.anno_path)
+
+ res = qvhighlights_eval(preds, annos)['brief']
+ for k, v in res.items():
+ nncore.log(f'{k}: {v}')
diff --git a/videomind/eval/infer_auto.py b/videomind/eval/infer_auto.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5b54526f78d74e3913b6334a16596b3ae61d8aa
--- /dev/null
+++ b/videomind/eval/infer_auto.py
@@ -0,0 +1,437 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import argparse
+import copy
+import json
+from contextlib import nullcontext
+
+import nncore
+import torch
+
+from videomind.constants import GROUNDER_PROMPT, PLANNER_PROMPT, VERIFIER_PROMPT
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.utils import process_vision_info
+from videomind.model.builder import build_model
+from videomind.utils.io import get_duration, load_subtitle
+from videomind.utils.parser import parse_query, parse_span
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--dataset')
+ parser.add_argument('--pred_path')
+ parser.add_argument('--model_gnd_path')
+ parser.add_argument('--model_ver_path')
+ parser.add_argument('--model_pla_path')
+ parser.add_argument('--model_ans_path')
+ parser.add_argument('--split', default='test', choices=['train', 'valid', 'test'])
+ parser.add_argument('--style', default='mcq', choices=['mcq', 'options', 'direct'])
+ parser.add_argument('--use_subtitle', action='store_true')
+ parser.add_argument('--auto_rephrasing', action='store_true')
+ parser.add_argument('--auto_planning', action='store_true')
+ parser.add_argument('--num_threads', type=int, default=1)
+ parser.add_argument('--device', default='auto')
+ parser.add_argument('--chunk', type=int, default=1)
+ parser.add_argument('--index', type=int, default=0)
+ args = parser.parse_args()
+ return args
+
+
+if __name__ == '__main__':
+ args = parse_args()
+
+ if args.chunk > 1:
+ pred_path = nncore.join(args.pred_path, f'output_{args.index}.json')
+ else:
+ pred_path = nncore.join(args.pred_path, 'output.json')
+
+ print(f'Dataset: {args.dataset}({args.split}) Chunk: {args.chunk} Index: {args.index} Output Path: {pred_path}')
+
+    # NOTE:
+    # 1. the grounder is always loaded, so its state does not need to be stored
+    # 2. the answerer is always invoked (when its adapter is unavailable, the base model serves as the answerer)
+ adapter_state = dict(planner=False, verifier=False, answerer=False)
+
+ print('Initializing role *grounder*')
+ model, processor = build_model(args.model_gnd_path, device=args.device)
+ device = next(model.parameters()).device
+
+ if args.model_pla_path is not None:
+ adapter_path = nncore.join(args.model_pla_path, 'planner')
+ if nncore.is_dir(adapter_path):
+ print('Initializing role *planner*')
+ model.load_adapter(adapter_path, adapter_name='planner')
+ adapter_state['planner'] = True
+
+ if args.model_ver_path is not None:
+ adapter_path = nncore.join(args.model_ver_path, 'verifier')
+ if nncore.is_dir(adapter_path):
+ print('Initializing role *verifier*')
+ model.load_adapter(adapter_path, adapter_name='verifier')
+ adapter_state['verifier'] = True
+
+ if args.model_ans_path is not None:
+ adapter_path = nncore.join(args.model_ans_path, 'answerer')
+ if nncore.is_dir(adapter_path):
+ print('Initializing role *answerer*')
+ model.load_adapter(adapter_path, adapter_name='answerer')
+ adapter_state['answerer'] = True
+
+ annos = DATASETS.get(args.dataset).load_annos(split=args.split)
+ annos = [annos[i::args.chunk] for i in range(args.chunk)][args.index]
+
+ dumps = []
+ for i in nncore.ProgressBar(range(len(annos))):
+ anno = copy.deepcopy(annos[i])
+ dump = copy.deepcopy(annos[i])
+
+ video_path, duration, span = anno['video_path'], anno.get('duration'), anno.get('span')
+
+ if duration is None:
+ duration = get_duration(video_path, num_threads=args.num_threads)
+ dump['duration'] = duration
+
+ print()
+ print(video_path)
+ print(duration)
+
+ # sometimes the sample is for grounding only
+ do_answering = all(k in anno for k in ('question', 'options'))
+
+ if do_answering:
+ question, options, ans = anno['question'], anno['options'], anno['ans']
+
+ if args.style in ('mcq', 'options'):
+ prompt = question + '\nOptions:'
+ for idx, opt in enumerate(options):
+ prompt += f"\n({chr(ord('A') + idx)}) {opt.capitalize()}"
+ prompt += '\nPlease only give the best option.'
+ else:
+ prompt = question
+
+ print(prompt)
+ print(options)
+ print(ans)
+ else:
+ question = anno['query']
+ print(question)
+
+ # do grounding by default
+ do_grounding = True
+
+ # initialize grounding query as question
+ query = question
+
+ # initialize agent list
+ dump['agents'] = []
+
+ if adapter_state['planner'] and (args.auto_rephrasing or args.auto_planning):
+ print('=============== planner ===============')
+
+ dump['agents'].append('planner')
+
+ messages = [{
+ 'role':
+ 'user',
+ 'content': [{
+ 'type': 'video',
+ 'video': video_path,
+ 'num_threads': args.num_threads,
+ 'min_pixels': 36 * 28 * 28,
+ 'max_pixels': 64 * 28 * 28,
+ 'max_frames': 100,
+ 'fps': 1.0
+ }, {
+ 'type': 'text',
+ 'text': PLANNER_PROMPT.format(question)
+ }]
+ }]
+
+ text = processor.apply_chat_template(messages, add_generation_prompt=True)
+ print(text)
+ images, videos = process_vision_info(messages)
+ data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
+ data = data.to(device)
+
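+            # reset the LoRA layers and activate only the planner adapter for this call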
+ model.base_model.disable_adapter_layers()
+ model.base_model.enable_adapter_layers()
+ model.set_adapter('planner')
+
+ output_ids = model.generate(
+ **data,
+ do_sample=False,
+ temperature=None,
+ top_p=None,
+ top_k=None,
+ repetition_penalty=None,
+ max_new_tokens=256)
+
+ assert data.input_ids.size(0) == output_ids.size(0) == 1
+ output_ids = output_ids[0, data.input_ids.size(1):]
+ if output_ids[-1] == processor.tokenizer.eos_token_id:
+ output_ids = output_ids[:-1]
+ response = processor.decode(output_ids, clean_up_tokenization_spaces=False)
+ print(response)
+
+ dump['planner_response'] = response
+
+ try:
+ parsed = json.loads(response)
+ action = parsed[0] if isinstance(parsed, list) else parsed
+ if args.auto_rephrasing and action['type'].lower() == 'grounder' and action['value']:
+ query = action['value']
+ dump['planner_parsed_query'] = query
+ elif args.auto_planning and action['type'].lower() == 'answerer':
+ do_grounding = False
+ except Exception:
+ print('WARNING: Failed to parse planner response')
+
+ if do_grounding:
+ print('=============== grounder ===============')
+
+ dump['agents'].append('grounder')
+
+ query = parse_query(query)
+
+ messages = [{
+ 'role':
+ 'user',
+ 'content': [{
+ 'type': 'video',
+ 'video': video_path,
+ 'num_threads': args.num_threads,
+ 'min_pixels': 36 * 28 * 28,
+ 'max_pixels': 64 * 28 * 28,
+ 'max_frames': 150,
+ 'fps': 1.0
+ }, {
+ 'type': 'text',
+ 'text': GROUNDER_PROMPT.format(query)
+ }]
+ }]
+
+ text = processor.apply_chat_template(messages, add_generation_prompt=True)
+ print(text)
+ images, videos = process_vision_info(messages)
+ data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
+ data = data.to(device)
+
+ model.base_model.disable_adapter_layers()
+ model.base_model.enable_adapter_layers()
+ model.set_adapter('grounder')
+
+ output_ids = model.generate(
+ **data,
+ do_sample=False,
+ temperature=None,
+ top_p=None,
+ top_k=None,
+ repetition_penalty=None,
+ max_new_tokens=256)
+
+ assert data.input_ids.size(0) == output_ids.size(0) == 1
+ output_ids = output_ids[0, data.input_ids.size(1):]
+ if output_ids[-1] == processor.tokenizer.eos_token_id:
+ output_ids = output_ids[:-1]
+ response = processor.decode(output_ids, clean_up_tokenization_spaces=False)
+ print(response)
+
+ dump['grounder_response'] = response
+ dump['grounder_success'] = len(model.reg) > 0
+
+ if dump['grounder_success']:
+ # 1. extract timestamps and confidences
+ blob = model.reg[0].cpu().float()
+ pred, conf = blob[:, :2] * duration, blob[:, -1].tolist()
+
+ # 2. clamp timestamps
+ pred = pred.clamp(min=0, max=duration)
+
+ # 3. round timestamps to units
+ unit = getattr(DATASETS.get(args.dataset), 'UNIT', 0.001)
+ pred = torch.round(pred / unit).long() * unit
+
+                # 4. swap timestamps where end < start
+                inds = (pred[:, 1] - pred[:, 0] < 0).nonzero()[:, 0]
+                pred[inds] = pred[inds].roll(1, dims=1)
+
+ # 5. convert timestamps to list
+ pred = pred.tolist()
+ else:
+ print('WARNING: Failed to parse grounder response')
+
+ if adapter_state['verifier']:
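+                    # fall back to 5 overlapping windows (each spanning 1/3 of the video, stride 1/6)
+                    # for the verifier to re-rank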
+ pred = [[i * duration / 6, (i + 2) * duration / 6] for i in range(5)]
+ conf = [0] * 5
+ else:
+ pred = [[0, duration]]
+ conf = [0]
+
+ print(pred[0], span, duration)
+ dump['pred'] = pred
+ dump['conf'] = conf
+
+ if do_grounding and adapter_state['verifier'] and len(pred) > 1:
+ print('=============== verifier ===============')
+
+ dump['agents'].append('verifier')
+
+ # using top-5 predictions
+ probs = []
+ for cand in pred[:5]:
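+                # same normalization as in training: pad the candidate by 50% on each side and
+                # express its boundaries relative to the padded window, e.g. [10, 20] -> window [5, 25], (s, e)=(0.25, 0.75)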
+ s0, e0 = parse_span(cand, duration, 2)
+ offset = (e0 - s0) / 2
+ s1, e1 = parse_span([s0 - offset, e0 + offset], duration)
+
+ # percentage of s0, e0 within s1, e1
+ s = (s0 - s1) / (e1 - s1)
+ e = (e0 - s1) / (e1 - s1)
+
+ messages = [{
+ 'role':
+ 'user',
+ 'content': [{
+ 'type': 'video',
+ 'video': video_path,
+ 'num_threads': args.num_threads,
+ 'video_start': s1,
+ 'video_end': e1,
+ 'min_pixels': 36 * 28 * 28,
+ 'max_pixels': 64 * 28 * 28,
+ 'max_frames': 64,
+ 'fps': 2.0
+ }, {
+ 'type': 'text',
+ 'text': VERIFIER_PROMPT.format(question)
+ }]
+ }]
+
+ text = processor.apply_chat_template(messages, add_generation_prompt=True)
+ print(text)
+ images, videos = process_vision_info(messages)
+ data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
+
+ # ===== insert segment start/end tokens =====
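+                # each temporal group of the video occupies `window` tokens after the 2x2 spatial
+                # merge (hence grid_h * grid_w / 4), so frame group f starts at token
+                # base_idx + 1 + f * window; pos_e gets an extra +1 to account for the <seg_s>
+                # token inserted before it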
+ video_grid_thw = data['video_grid_thw'][0]
+ num_frames, window = int(video_grid_thw[0]), int(video_grid_thw[1] * video_grid_thw[2] / 4)
+ assert num_frames * window * 4 == data['pixel_values_videos'].size(0)
+
+ pos_s, pos_e = round(s * num_frames), round(e * num_frames)
+ pos_s, pos_e = min(max(0, pos_s), num_frames), min(max(0, pos_e), num_frames)
+ assert pos_s <= pos_e, (num_frames, s, e)
+
+ base_idx = torch.nonzero(data['input_ids'][0] == model.config.vision_start_token_id).item()
+ pos_s, pos_e = pos_s * window + base_idx + 1, pos_e * window + base_idx + 2
+
+ input_ids = data['input_ids'][0].tolist()
+ input_ids.insert(pos_s, model.config.seg_s_token_id)
+ input_ids.insert(pos_e, model.config.seg_e_token_id)
+ data['input_ids'] = torch.LongTensor([input_ids])
+ data['attention_mask'] = torch.ones_like(data['input_ids'])
+ # ===========================================
+
+ data = data.to(device)
+
+ model.base_model.disable_adapter_layers()
+ model.base_model.enable_adapter_layers()
+ model.set_adapter('verifier')
+
+ with torch.inference_mode():
+ logits = model(**data).logits[0, -1].softmax(dim=-1)
+
+ # NOTE: magic numbers here
+ # In Qwen2-VL vocab: 9454 -> Yes, 2753 -> No
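+                # the Yes/No logit gap is squashed into (0, 1) and used below to re-rank the candidates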
+ score = (logits[9454] - logits[2753]).sigmoid().item()
+ probs.append(score)
+
+ ranks = torch.Tensor(probs).argsort(descending=True).tolist()
+ print(probs)
+ print(ranks)
+
+ pred = [pred[idx] for idx in ranks]
+ conf = [conf[idx] for idx in ranks]
+ print(pred[0], span, duration)
+
+ dump['probs'] = probs
+ dump['ranks'] = ranks
+ dump['pred_ori'] = dump['pred']
+ dump['conf_ori'] = dump['conf']
+ dump['pred'] = pred
+ dump['conf'] = conf
+
+ if do_answering:
+ print('=============== answerer ===============')
+
+ dump['agents'].append('answerer')
+
+ # choose the potential best moment
+ selected = pred[0] if 'pred' in dump else [0, duration]
+
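+            # expand the selected moment (to at least MIN_LEN seconds, 32 by default) so the
+            # answerer has sufficient temporal context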
+ min_len = getattr(DATASETS.get(args.dataset), 'MIN_LEN', 32)
+ s, e = parse_span(selected, duration, min_len)
+ print([s, e], span, duration)
+
+ if args.use_subtitle and 'subtitle_path' in anno and nncore.is_file(anno['subtitle_path']):
+ # use only the first 100 subtitles to save memory
+ subs = load_subtitle(anno['subtitle_path'])[:100]
+ subs = [f'{round(a - s, 1)}s - {round(b - s, 1)}s, {t}\n' for a, b, t in subs if a >= s and b <= e]
+ subs = ''.join(subs)
+                prompt = f'You are given a video that is {round(e - s, 1)} seconds long.\nSubtitles:\n{subs}' + prompt
+
+ messages = [{
+ 'role':
+ 'user',
+ 'content': [{
+ 'type': 'video',
+ 'video': video_path,
+ 'num_threads': args.num_threads,
+ 'video_start': s,
+ 'video_end': e,
+ 'min_pixels': 128 * 28 * 28,
+ 'max_pixels': 256 * 28 * 28,
+ 'max_frames': 32,
+ 'fps': 2.0
+ }, {
+ 'type': 'text',
+ 'text': prompt
+ }]
+ }]
+
+ text = processor.apply_chat_template(messages, add_generation_prompt=True)
+ text += 'Best Option: (' if args.style == 'mcq' else ''
+ print(text)
+ images, videos = process_vision_info(messages)
+ data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
+ data = data.to(device)
+
+ if adapter_state['answerer']:
+ model.base_model.disable_adapter_layers()
+ model.base_model.enable_adapter_layers()
+ model.set_adapter('answerer')
+ context = nullcontext
+ else:
+ context = model.disable_adapter
+
+ with context():
+ output_ids = model.generate(
+ **data,
+ do_sample=False,
+ temperature=None,
+ top_p=None,
+ top_k=None,
+ repetition_penalty=None,
+ max_new_tokens=256)
+
+ assert data.input_ids.size(0) == output_ids.size(0) == 1
+ output_ids = output_ids[0, data.input_ids.size(1):]
+ if output_ids[-1] == processor.tokenizer.eos_token_id:
+ output_ids = output_ids[:-1]
+ response = processor.decode(output_ids, clean_up_tokenization_spaces=False)
+ print(response)
+
+ dump['answerer_response'] = response
+ dump['response'] = response
+
+ dumps.append(dump)
+
+ nncore.dump(dumps, pred_path)
diff --git a/videomind/eval/infer_qvhighlights.py b/videomind/eval/infer_qvhighlights.py
new file mode 100644
index 0000000000000000000000000000000000000000..c32c60da79f4307ca875eb50ff427800487eca30
--- /dev/null
+++ b/videomind/eval/infer_qvhighlights.py
@@ -0,0 +1,137 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import argparse
+import copy
+
+import nncore
+import torch
+
+from videomind.constants import GROUNDER_PROMPT
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.utils import process_vision_info
+from videomind.model.builder import build_model
+from videomind.utils.io import get_duration
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--dataset')
+ parser.add_argument('--pred_path')
+ parser.add_argument('--model_gnd_path')
+ parser.add_argument('--split', default='test', choices=['train', 'valid', 'test'])
+ parser.add_argument('--num_threads', type=int, default=1)
+ parser.add_argument('--device', default='auto')
+ parser.add_argument('--chunk', type=int, default=1)
+ parser.add_argument('--index', type=int, default=0)
+ args = parser.parse_args()
+ return args
+
+
+if __name__ == '__main__':
+ args = parse_args()
+
+ if args.chunk > 1:
+ pred_path = nncore.join(args.pred_path, f'output_{args.index}.jsonl')
+ else:
+ pred_path = nncore.join(args.pred_path, 'output.jsonl')
+
+ print(f'Dataset: {args.dataset}({args.split}) Chunk: {args.chunk} Index: {args.index} Output Path: {pred_path}')
+
+ model, processor = build_model(args.model_gnd_path, device=args.device)
+ device = next(model.parameters()).device
+
+ annos = DATASETS.get(args.dataset).load_annos(split=args.split)
+ annos = [annos[i::args.chunk] for i in range(args.chunk)][args.index]
+
+ dumps = []
+ for i in nncore.ProgressBar(range(len(annos))):
+ anno = copy.deepcopy(annos[i])
+ dump = dict()
+
+ video_path, query, duration, span = anno['video_path'], anno['query'], anno.get('duration'), anno.get('span')
+
+ if duration is None:
+ duration = get_duration(video_path, num_threads=args.num_threads)
+
+ print()
+ print(video_path)
+ print(duration)
+ print(query)
+
+ messages = [{
+ 'role':
+ 'user',
+ 'content': [{
+ 'type': 'video',
+ 'video': video_path,
+ 'num_threads': args.num_threads,
+ 'min_pixels': 36 * 28 * 28,
+ 'max_pixels': 64 * 28 * 28,
+ 'max_frames': 150,
+ 'fps': 1.0
+ }, {
+ 'type': 'text',
+ 'text': GROUNDER_PROMPT.format(query)
+ }]
+ }]
+
+ text = processor.apply_chat_template(messages, add_generation_prompt=True)
+ print(text)
+
+ images, videos = process_vision_info(messages)
+
+ data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
+ data = data.to(device)
+
+ output_ids = model.generate(
+ **data,
+ do_sample=False,
+ temperature=None,
+ top_p=None,
+ top_k=None,
+ repetition_penalty=None,
+ max_new_tokens=256)
+
+ assert data.input_ids.size(0) == output_ids.size(0) == 1
+ output_ids = output_ids[0, data.input_ids.size(1):]
+
+ if output_ids[-1] == processor.tokenizer.eos_token_id:
+ output_ids = output_ids[:-1]
+
+ response = processor.decode(output_ids, clean_up_tokenization_spaces=False)
+ print(response)
+
+ grounder_success = len(model.reg) > 0
+
+ if grounder_success:
+ # 1. extract timestamps and confidences
+ blob = model.reg[0].cpu().float()
+ pred, conf = blob[:, :2] * duration, blob[:, 2:]
+ print(pred[0], span, duration)
+
+ # 2. clamp timestamps
+ pred = pred.clamp(min=0, max=duration)
+
+ # 3. round timestamps to units
+ unit = getattr(DATASETS.get(args.dataset), 'UNIT', 0.001)
+ pred = torch.round(pred / unit).long() * unit
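+            # e.g. with the default unit of 0.001, a raw prediction of 13.6984s becomes
+            # 13.698s, while a coarser dataset-defined UNIT (say 2.0) would snap it to 14.0s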
+
+ # 4. sort timestamps
+ inds = (pred[:, 1] - pred[:, 0] < 0).nonzero()[:, 0]
+ pred[inds] = pred[inds].roll(1)
+
+ # 5. merge timestamps back with confidences
+ pred = torch.cat((pred, conf), dim=1)
+ else:
+ print('WARNING: Failed to parse grounder response')
+ pred = torch.Tensor([[0, duration, 1]])
+
+ print(pred[0], span, duration)
+
+ dump['vid'] = anno['vid']
+ dump['qid'] = anno['qid']
+ dump['pred_relevant_windows'] = pred.tolist()
+
+ dumps.append(dump)
+
+ nncore.dump(dumps, pred_path)
diff --git a/videomind/model/__init__.py b/videomind/model/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbdbfae1e29d7db467f06b152a9efe4b23026691
--- /dev/null
+++ b/videomind/model/__init__.py
@@ -0,0 +1,3 @@
+from .model import AgentQwen2VLConfig, AgentQwen2VLForConditionalGeneration
+
+MODELS = {'qwen2_vl': (AgentQwen2VLConfig, AgentQwen2VLForConditionalGeneration)}
diff --git a/videomind/model/blocks.py b/videomind/model/blocks.py
new file mode 100644
index 0000000000000000000000000000000000000000..c439be495330032effdf04108f65d378613d2877
--- /dev/null
+++ b/videomind/model/blocks.py
@@ -0,0 +1,93 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from nncore.nn import Parameter
+
+
+class Permute(nn.Module):
+
+ def forward(self, x):
+ return x.transpose(-1, -2)
+
+
+class LearnableEmbedding(nn.Module):
+
+ def __init__(self, dims):
+ super().__init__()
+ self.weights = Parameter(1, 1, dims)
+
+ def forward(self, x):
+ return x + self.weights.expand_as(x)
+
+
+class ConvPyramid(nn.Module):
+
+ def __init__(self, dims, strides, act_cls=nn.ReLU):
+ super().__init__()
+
+ self.blocks = nn.ModuleList()
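+        # one block per stride: stride == 1 keeps the temporal resolution (activation only),
+        # strides > 1 downsample with log2(s) strided convolutions, and fractional strides
+        # would upsample with transposed convolutions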
+ for s in strides:
+ p = int(math.log2(s))
+ if p == 0:
+ layers = act_cls()
+ else:
+ conv_cls = nn.Conv1d if p > 0 else nn.ConvTranspose1d
+ layers = nn.Sequential()
+ for _ in range(abs(p)):
+ module = [Permute(), conv_cls(dims, dims, 2, stride=2), Permute(), nn.LayerNorm(dims), act_cls()]
+ layers.extend(module)
+ self.blocks.append(layers)
+
+ self.strides = strides
+
+ def forward(self, x, mask, return_mask=False):
+ pymid, pymid_msk = [], []
+
+ for s, blk in zip(self.strides, self.blocks):
+ if x.size(1) < s:
+ continue
+
+ pymid.append(blk(x))
+
+ if return_mask:
+ if s > 1:
+ msk = F.max_pool1d(mask.float(), s, stride=s).long()
+ elif s < 1:
+ msk = mask.repeat_interleave(int(1 / s), dim=1)
+ else:
+ msk = mask
+ pymid_msk.append(msk)
+
+ return (pymid, pymid_msk) if return_mask else pymid
+
+
+class Scale(nn.Module):
+
+ def __init__(self, strides):
+ super().__init__()
+ self.scale = nn.Parameter(torch.ones(len(strides)))
+
+ def forward(self, x, i):
+ return x * self.scale[i]
+
+
+class ConvHead(nn.Module):
+
+    def __init__(self, dims, out_dims, kernel_size=3, act_cls=nn.ReLU):
+ super().__init__()
+
+ # yapf:disable
+ self.module = nn.Sequential(
+ Permute(),
+            nn.Conv1d(dims, dims, kernel_size, padding=kernel_size // 2),
+ act_cls(),
+            nn.Conv1d(dims, out_dims, kernel_size, padding=kernel_size // 2),
+ Permute())
+ # yapf:enable
+
+ def forward(self, x):
+ return self.module(x)
diff --git a/videomind/model/builder.py b/videomind/model/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbc6a2442b0b156831f4cd586b6e9c367ac4fefd
--- /dev/null
+++ b/videomind/model/builder.py
@@ -0,0 +1,108 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import warnings
+
+import nncore
+import torch
+import torch.nn as nn
+from peft import PeftModel
+from safetensors.torch import load_model
+from transformers import AutoConfig, AutoModel, AutoProcessor, GenerationConfig, Qwen2VLForConditionalGeneration
+
+
+def get_auto_device(device):
+ try:
+ import torch_npu
+ has_npu = torch_npu.npu.is_available()
+ except ImportError:
+ has_npu = False
+
+ return 'cuda' if torch.cuda.is_available() else 'npu' if has_npu else 'cpu'
+
+
+def build_model(model_path, config=None, is_trainable=False, merge_adapter=False, device='auto', dtype=torch.float16):
+ # set do_resize to false to avoid duplicated resizing
+ # https://github.com/huggingface/transformers/tree/main/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
+ processor = AutoProcessor.from_pretrained(model_path, do_resize=False)
+
+ # eager attention has known & unknown bugs
+ # [4.46.2] broken causality fp16: https://github.com/huggingface/transformers/issues/35151
+ # [4.48.1] broken sliding window: https://github.com/huggingface/transformers/issues/35924
+ attn_implementation = 'sdpa'
+
+ config = config or AutoConfig.from_pretrained(model_path)
+
+ adapter_path = nncore.join(model_path, getattr(config, 'role', 'unknown'))
+ partial_path = nncore.join(model_path, 'pytorch_model.safetensors')
+
+ if nncore.is_dir(adapter_path) or nncore.is_file(partial_path):
+ print(f'Loading base model from {config.base_model_path}...')
+ model = AutoModel.from_pretrained(
+ config.base_model_path,
+ config=config,
+ low_cpu_mem_usage=True,
+ ignore_mismatched_sizes=True,
+ attn_implementation=attn_implementation,
+ torch_dtype=dtype)
+
+ try:
+ model.generation_config = GenerationConfig.from_pretrained(model_path)
+ except OSError:
+ warnings.warn('generation_config.json not found')
+
+ meta_state_dict = {
+ n: torch.empty_like(p, device='cpu')
+ for n, p in model.named_parameters() if p.device == torch.device('meta')
+ }
+ model.load_state_dict(meta_state_dict, strict=False, assign=True)
+
+ size = (model.model.embed_tokens.num_embeddings, model.model.embed_tokens.embedding_dim)
+ if model.model.embed_tokens.weight.size() != size:
+ print(f'Resizing embed_tokens to {size}...')
+ model.model.embed_tokens.weight = nn.Parameter(model.model.embed_tokens.weight.new_empty(size))
+
+ size = (model.lm_head.out_features, model.lm_head.in_features)
+ if model.lm_head.weight.size() != size:
+ print(f'Resizing lm_head to {size}...')
+ model.lm_head.weight = nn.Parameter(model.lm_head.weight.new_empty(size))
+
+ if nncore.is_dir(adapter_path):
+ print(f'Loading adapter from {adapter_path}...')
+ # transformers integration does not support merge_and_unload, use peft instead
+ model = PeftModel.from_pretrained(
+ model,
+ adapter_path,
+ adapter_name=config.role,
+ is_trainable=is_trainable,
+ low_cpu_mem_usage=True,
+ torch_device=str(model.device))
+
+ if nncore.is_file(partial_path):
+ print(f'Loading state dict from {partial_path}...')
+ _, unexpected = load_model(model, partial_path, strict=False, device=str(model.device))
+ assert len(unexpected) == 0, f'unexpected parameters: {unexpected}'
+
+ if merge_adapter and nncore.is_dir(adapter_path):
+ print('Merging adapter and unloading...')
+ model = model.merge_and_unload()
+ model._hf_peft_config_loaded = False
+ else:
+ print(f'Loading full model from {model_path}...')
+
+ if len(config.architectures) == 1 and config.model_type == 'qwen2_vl':
+ model_cls = Qwen2VLForConditionalGeneration
+ else:
+ model_cls = AutoModel
+
+ model = model_cls.from_pretrained(
+ model_path,
+ config=config,
+ low_cpu_mem_usage=True,
+ attn_implementation=attn_implementation,
+ torch_dtype=dtype)
+
+ if not is_trainable:
+ device = get_auto_device(device) if device == 'auto' else device
+ model = model.to(device).eval()
+
+ return model, processor
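+
+
+# minimal usage sketch mirroring the eval scripts (the checkpoint path is illustrative):
+#
+#   model, processor = build_model('model_zoo/VideoMind-2B', device='auto')
+#   device = next(model.parameters()).device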
diff --git a/videomind/model/generator.py b/videomind/model/generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6666f7df404a8081f6a73e1b669809ee95e635f
--- /dev/null
+++ b/videomind/model/generator.py
@@ -0,0 +1,66 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import torch
+import torch.nn as nn
+
+
+class BufferList(nn.Module):
+
+ def __init__(self, buffers):
+ super(BufferList, self).__init__()
+ for i, buffer in enumerate(buffers):
+ self.register_buffer(str(i), buffer, persistent=False)
+
+ def __len__(self):
+ return len(self._buffers)
+
+ def __iter__(self):
+ return iter(self._buffers.values())
+
+
+class PointGenerator(nn.Module):
+
+ def __init__(self, strides, buffer_size, offset=False):
+ super(PointGenerator, self).__init__()
+
+ reg_range, last = [], 0
+ for stride in strides[1:]:
+ reg_range.append((last, stride))
+ last = stride
+ reg_range.append((last, float('inf')))
+
+ self.strides = strides
+ self.reg_range = reg_range
+ self.buffer_size = buffer_size
+ self.offset = offset
+
+ self.buffer = self._cache_points()
+
+ def _cache_points(self):
+ buffer_list = []
+ for stride, reg_range in zip(self.strides, self.reg_range):
+ reg_range = torch.Tensor([reg_range])
+ lv_stride = torch.Tensor([stride])
+ points = torch.arange(0, self.buffer_size, stride)[:, None]
+ if self.offset:
+ points += 0.5 * stride
+ reg_range = reg_range.repeat(points.size(0), 1)
+ lv_stride = lv_stride.repeat(points.size(0), 1)
+ buffer_list.append(torch.cat((points, reg_range, lv_stride), dim=1))
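+            # each row stores [point coordinate, reg range min, reg range max, level stride]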
+ buffer = BufferList(buffer_list)
+ return buffer
+
+ def forward(self, pymid):
+ assert self.strides[0] == 1
+ # video_size = pymid[0].size(1)
+ points = []
+ sizes = [p.size(1) for p in pymid] + [0] * (len(self.buffer) - len(pymid))
+ for size, buffer in zip(sizes, self.buffer):
+ if size == 0:
+ continue
+ assert size <= buffer.size(0), 'reached max buffer size'
+ point = buffer[:size, :].clone()
+ # point[:, 0] /= video_size
+ points.append(point)
+ points = torch.cat(points)
+ return points
diff --git a/videomind/model/loss.py b/videomind/model/loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..e54f46371a991919266b9b1f9f822b2f0294804e
--- /dev/null
+++ b/videomind/model/loss.py
@@ -0,0 +1,177 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import math
+import warnings
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from nncore.nn import LOSSES, Parameter, build_loss
+
+
+@LOSSES.register()
+class SampledNCELoss(nn.Module):
+
+ def __init__(self, temperature=0.07, max_scale=100, learnable=False, direction=('row', 'col'), loss_weight=1.0):
+ super().__init__()
+
+ scale = torch.Tensor([math.log(1 / temperature)])
+
+ if learnable:
+ self.scale = Parameter(scale)
+ else:
+ self.register_buffer('scale', scale)
+
+ self.temperature = temperature
+ self.max_scale = max_scale
+ self.learnable = learnable
+ self.direction = (direction, ) if isinstance(direction, str) else direction
+ self.loss_weight = loss_weight
+
+ def forward(self, video_emb, query_emb, video_msk, saliency, pos_clip):
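+        # InfoNCE-style objective: clips whose saliency does not exceed that of the positive
+        # clip act as candidates, and the positive clip should score highest among them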
+ batch_inds = torch.arange(video_emb.size(0), device=video_emb.device)
+
+ pos_scores = saliency[batch_inds, pos_clip].unsqueeze(-1)
+ loss_msk = (saliency <= pos_scores) * video_msk
+ if not loss_msk.any():
+ warnings.warn(f'loss_msk is all zeros: {loss_msk} {saliency} {video_msk} {pos_clip}')
+
+ scale = self.scale.exp().clamp(max=self.max_scale)
+ i_sim = F.cosine_similarity(video_emb, query_emb, dim=-1) * scale
+ i_sim = i_sim + torch.where(loss_msk > 0, .0, float('-inf'))
+
+ loss = 0
+
+ if 'row' in self.direction:
+ i_met = F.log_softmax(i_sim, dim=1)[batch_inds, pos_clip]
+ loss = loss - i_met.sum()
+
+ if 'col' in self.direction:
+ j_met = F.log_softmax(i_sim.t(), dim=1)[pos_clip, batch_inds]
+ loss = loss - j_met.sum() / j_met.size(0)
+
+ loss = loss * self.loss_weight
+ return loss
+
+
+@LOSSES.register()
+class BundleLoss(nn.Module):
+
+ def __init__(self, sample_radius=1.5, loss_cls=None, loss_reg=None, loss_sal=None):
+ super().__init__()
+
+ self._loss_cls = build_loss(loss_cls)
+ self._loss_reg = build_loss(loss_reg)
+ self._loss_sal = build_loss(loss_sal)
+
+ self.sample_radius = sample_radius
+
+ def get_target_single(self, point, gt_bnd, gt_cls):
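+        # point columns follow PointGenerator: [coordinate, reg range min, reg range max, stride]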
+ num_pts, num_gts = point.size(0), gt_bnd.size(0)
+
+ lens = gt_bnd[:, 1] - gt_bnd[:, 0]
+ lens = lens[None, :].repeat(num_pts, 1)
+
+ gt_seg = gt_bnd[None].expand(num_pts, num_gts, 2)
+ s = point[:, 0, None] - gt_seg[:, :, 0]
+ e = gt_seg[:, :, 1] - point[:, 0, None]
+ r_tgt = torch.stack((s, e), dim=-1)
+
+ if self.sample_radius > 0:
+ center = (gt_seg[:, :, 0] + gt_seg[:, :, 1]) / 2
+ t_mins = center - point[:, 3, None] * self.sample_radius
+ t_maxs = center + point[:, 3, None] * self.sample_radius
+ dist_s = point[:, 0, None] - torch.maximum(t_mins, gt_seg[:, :, 0])
+ dist_e = torch.minimum(t_maxs, gt_seg[:, :, 1]) - point[:, 0, None]
+ center = torch.stack((dist_s, dist_e), dim=-1)
+ cls_msk = center.min(-1)[0] >= 0
+ else:
+ cls_msk = r_tgt.min(-1)[0] >= 0
+
+ reg_dist = r_tgt.max(-1)[0]
+ reg_msk = torch.logical_and((reg_dist >= point[:, 1, None]), (reg_dist <= point[:, 2, None]))
+
+ lens.masked_fill_(cls_msk == 0, float('inf'))
+ lens.masked_fill_(reg_msk == 0, float('inf'))
+ min_len, min_len_inds = lens.min(dim=1)
+
+ min_len_mask = torch.logical_and((lens <= (min_len[:, None] + 1e-3)), (lens < float('inf'))).to(r_tgt.dtype)
+
+ label = F.one_hot(gt_cls[:, 0], 2).to(r_tgt.dtype)
+ c_tgt = torch.matmul(min_len_mask, label).clamp(min=0.0, max=1.0)[:, 1]
+ r_tgt = r_tgt[range(num_pts), min_len_inds] / point[:, 3, None]
+
+ return c_tgt, r_tgt
+
+ def get_target(self, data):
+ cls_tgt, reg_tgt = [], []
+
+ for i in range(data['boundary'].size(0)):
+ gt_bnd = data['boundary'][i] * data['video_emb'].size(1)
+ # gt_bnd = data['boundary'][i]
+ gt_cls = gt_bnd.new_ones(gt_bnd.size(0), 1).long()
+
+ c_tgt, r_tgt = self.get_target_single(data['point'], gt_bnd, gt_cls)
+
+ cls_tgt.append(c_tgt)
+ reg_tgt.append(r_tgt)
+
+ cls_tgt = torch.stack(cls_tgt)
+ reg_tgt = torch.stack(reg_tgt)
+
+ return cls_tgt, reg_tgt
+
+ def loss_cls(self, data, output, cls_tgt):
+ src = data['out_class'].squeeze(-1)
+ msk = torch.cat(data['pymid_msk'], dim=1)
+
+ cls_tgt = cls_tgt.repeat(src.size(0) // cls_tgt.size(0), 1)
+
+ loss_cls = self._loss_cls(src, cls_tgt, weight=msk)
+ loss_cls = (loss_cls.sum(dim=1) / msk.sum(dim=1)).sum()
+
+ output['loss_cls'] = loss_cls
+ return output
+
+ def loss_reg(self, data, output, cls_tgt, reg_tgt):
+ src = data['out_coord']
+ msk = cls_tgt.unsqueeze(2).repeat(1, 1, 2).bool()
+ assert msk.any(), 'empty mask in reg loss'
+
+ reg_tgt = reg_tgt.repeat(src.size(0) // reg_tgt.size(0), 1, 1)
+ msk = msk.repeat(src.size(0) // msk.size(0), 1, 1)
+
+ loss_reg = self._loss_reg(src, reg_tgt, weight=msk)
+ loss_reg = (loss_reg.sum(dim=[1, 2]) / msk.sum(dim=[1, 2])).sum()
+
+ output['loss_reg'] = loss_reg
+ return output
+
+ def loss_sal(self, data, output):
+ video_emb = data['video_emb']
+ query_emb = data['query_emb']
+ video_msk = data['video_msk']
+
+ saliency = data['saliency']
+ pos_clip = data['pos_clip'][:, 0]
+
+ saliency = saliency.repeat(video_emb.size(0) // saliency.size(0), 1)
+ pos_clip = pos_clip.repeat(video_emb.size(0) // pos_clip.size(0))
+
+ output['loss_sal'] = self._loss_sal(video_emb, query_emb, video_msk, saliency, pos_clip)
+ return output
+
+ def forward(self, data, output):
+ if self._loss_reg is not None:
+ cls_tgt, reg_tgt = self.get_target(data)
+ output = self.loss_reg(data, output, cls_tgt, reg_tgt)
+ else:
+ cls_tgt = data['saliency']
+
+ if self._loss_cls is not None:
+ output = self.loss_cls(data, output, cls_tgt)
+
+ if self._loss_sal is not None:
+ output = self.loss_sal(data, output)
+
+ return output
diff --git a/videomind/model/model.py b/videomind/model/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..2aaedf22558bbc47739bdc76f7830b3639c94c2c
--- /dev/null
+++ b/videomind/model/model.py
@@ -0,0 +1,358 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import warnings
+
+import nncore
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from nncore.nn import ModuleList, PositionalEncoding, Sequential, TransformerEncoderLayer, xavier_init_
+from nncore.ops import temporal_iou
+from transformers import AutoConfig, AutoModel, Qwen2VLConfig, Qwen2VLForConditionalGeneration, Qwen2VLModel
+from transformers.activations import ACT2CLS, ACT2FN
+from transformers.models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES
+from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VisionTransformerPretrainedModel
+
+from .blocks import ConvHead, ConvPyramid, LearnableEmbedding, Scale
+from .generator import PointGenerator
+from .loss import BundleLoss
+
+
+class AgentQwen2VLConfig(Qwen2VLConfig):
+ model_type = 'agent_qwen2_vl'
+
+
+class AgentQwen2VisionTransformerPretrainedModel(Qwen2VisionTransformerPretrainedModel):
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.gradient_checkpointing = False
+
+ # add support for gradient checkpointing
+ # https://github.com/huggingface/transformers/pull/34724
+ def forward(self, hidden_states, grid_thw):
+ hidden_states = self.patch_embed(hidden_states)
+ rotary_pos_emb = self.rot_pos_emb(grid_thw)
+ cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
+ dim=0, dtype=torch.int32)
+ cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
+
+ for blk in self.blocks:
+ if self.gradient_checkpointing and self.training:
+ hidden_states = self._gradient_checkpointing_func(blk.__call__, hidden_states, cu_seqlens,
+ rotary_pos_emb)
+ else:
+ hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb)
+
+ return self.merger(hidden_states)
+
+
+class AgentQwen2VLModel(Qwen2VLModel):
+ config_class = AgentQwen2VLConfig
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.norm.register_forward_pre_hook(lambda module, args: setattr(module, 'state', args[0]))
+
+ def forward(self, input_ids=None, inputs_embeds=None, **kwargs):
+        # ensure gradient tracking (in case embed_tokens has been frozen)
+ assert input_ids is None and inputs_embeds is not None
+ if self.training and not inputs_embeds.requires_grad:
+ inputs_embeds.requires_grad = True
+ return super().forward(input_ids=input_ids, inputs_embeds=inputs_embeds, **kwargs)
+
+
+class AgentQwen2VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
+ config_class = AgentQwen2VLConfig
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.visual = AgentQwen2VisionTransformerPretrainedModel._from_config(config.vision_config)
+ self.model = AgentQwen2VLModel(config)
+ self.vocab_size = config.vocab_size
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+ self.rope_deltas = None
+
+ if self.config.role in ('all_in_one', 'grounder'):
+ hidden_size, hidden_act = self.config.hidden_size, self.config.hidden_act
+
+ self.dims = 256
+
+ self.vis_proj = Sequential(nn.LayerNorm(hidden_size), nn.Linear(hidden_size, self.dims))
+ self.reg_proj = Sequential(nn.LayerNorm(hidden_size), nn.Linear(hidden_size, self.dims))
+ self.vis_norm = nn.LayerNorm(self.dims)
+ self.vis_fuse = ModuleList(
+ TransformerEncoderLayer(self.dims, act_cfg=ACT2FN[hidden_act]),
+ TransformerEncoderLayer(self.dims, act_cfg=ACT2FN[hidden_act]),
+ TransformerEncoderLayer(self.dims, act_cfg=ACT2FN[hidden_act]))
+
+ self.vis_pos = PositionalEncoding(self.dims, normalize=True, learnable=False)
+ self.vis_emb = LearnableEmbedding(self.dims)
+ self.reg_emb = LearnableEmbedding(self.dims)
+
+ self.strides = (1, 2, 4, 8)
+ self.vis_pad_length = self.strides[-1]
+
+ self.pyramid = ConvPyramid(self.dims, self.strides, act_cls=ACT2CLS[hidden_act])
+ self.class_head = ConvHead(self.dims, 1, act_cls=ACT2CLS[hidden_act])
+ self.coord_head = ConvHead(self.dims, 2, act_cls=ACT2CLS[hidden_act])
+
+ self.generator = PointGenerator(self.strides, 1024)
+ self.coef = Scale(self.strides)
+ self.bundle_loss = BundleLoss(
+ sample_radius=1.5,
+ loss_cls=dict(type='FocalLoss', reduction='none', loss_weight=5.0),
+ loss_reg=dict(type='L1Loss', reduction='none', loss_weight=1.0),
+ loss_sal=dict(type='SampledNCELoss', direction='row', loss_weight=0.05))
+
+ self.post_init()
+
+ def reset_conv_parameters(self):
+ for s in ('pyramid', 'class_head', 'coord_head'):
+ b = getattr(self, s, None)
+ if b is None:
+ continue
+ for n, m in b.named_modules():
+ if isinstance(m, (nn.Conv1d, nn.ConvTranspose1d)):
+ print(f'Reset parameters of {b.__class__.__name__} {n} ({m.__class__.__name__})')
+ xavier_init_(m, distribution='uniform')
+
+ def forward(self,
+ input_ids=None,
+ attention_mask=None,
+ position_ids=None,
+ past_key_values=None,
+ inputs_embeds=None,
+ labels=None,
+ use_cache=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ pixel_values=None,
+ pixel_values_videos=None,
+ image_grid_thw=None,
+ video_grid_thw=None,
+ rope_deltas=None,
+ timestamps=None,
+ saliency=None,
+ pos_clip=None):
+ mode = 'training' if self.training else 'caching' if (
+ past_key_values is None or len(past_key_values) == 0) else 'generating'
+
+ # https://github.com/huggingface/transformers/pull/33487
+ if position_ids is None and input_ids is not None:
+ position_ids, _ = self.get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask)
+
+ if mode in ('training', 'caching'):
+ vision_s_inds = torch.nonzero(input_ids == self.config.vision_start_token_id).tolist()
+ vision_e_inds = torch.nonzero(input_ids == self.config.vision_end_token_id).tolist()
+ assert len(vision_s_inds) == len(vision_e_inds)
+
+ self.cache_vision_inds = [[] for _ in range(input_ids.size(0))]
+ for i in range(len(vision_s_inds)):
+ assert vision_s_inds[i][0] == vision_e_inds[i][0]
+ self.cache_vision_inds[vision_s_inds[i][0]].append([vision_s_inds[i][1] + 1, vision_e_inds[i][1]])
+
+ outputs = super().forward(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ labels=labels,
+ use_cache=not self.training,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=True,
+ pixel_values=pixel_values,
+ pixel_values_videos=pixel_values_videos,
+ image_grid_thw=image_grid_thw,
+ video_grid_thw=video_grid_thw,
+ rope_deltas=rope_deltas)
+
+ if mode == 'caching':
+ self.cache_norm_state = self.model.norm.state
+ self.reg = []
+ self.sal = []
+
+ if mode == 'training' and timestamps is not None:
+ loss_regs, avg_factors = [], []
+ shift_labels = labels[..., 1:].contiguous()
+ for batch_idx, (vision_inds, ts) in enumerate(zip(self.cache_vision_inds, timestamps)):
+ # only consider the first video
+ s, e = vision_inds[0]
+
+ # spatial merge size set to 2
+ window = int(video_grid_thw[0][1] * video_grid_thw[0][2] / 4)
+ assert video_grid_thw[0][0] * window == e - s
+
+ inds = torch.where(shift_labels[batch_idx] == self.config.reg_token_id)[0]
+ reg_tokens = self.reg_proj(self.model.norm.state[batch_idx, inds, None])
+ # reg_tokens: num_reg_tokens * 1 * channel
+
+ vis_tokens = self.model.norm.state[batch_idx, None, s:e]
+ vis_tokens = vis_tokens.transpose(-1, -2)
+ vis_tokens = F.avg_pool1d(vis_tokens.float(), window, stride=window).to(vis_tokens.dtype)
+ vis_tokens = vis_tokens.transpose(-1, -2)
+ vis_tokens = self.vis_proj(vis_tokens).repeat(reg_tokens.size(0), 1, 1)
+ # vis_tokens: num_reg_tokens * num_frames * channel
+
+ vis_tokens = self.vis_emb(vis_tokens)
+ reg_tokens = self.reg_emb(reg_tokens)
+ pe = self.vis_pos(vis_tokens).to(vis_tokens.dtype)
+
+ joint_tokens = torch.cat((vis_tokens + pe, reg_tokens), dim=1)
+ collected = [joint_tokens]
+ for blk in self.vis_fuse:
+ collected.append(blk(collected[-1]))
+ collected = collected[1:]
+ joint_tokens = torch.cat(collected)
+ joint_tokens = self.vis_norm(joint_tokens)
+
+ video_emb = joint_tokens[:, :-1]
+ # video_emb: num_reg_tokens * num_frames * channel
+
+ query_emb = joint_tokens[:, -1:]
+ # query_emb: num_reg_tokens * 1 * channel
+
+ b, t, c = video_emb.size()
+ video_msk = video_emb.new_ones(b, t)
+
+ if t < self.vis_pad_length:
+ emb_pad = video_emb.new_zeros(b, self.vis_pad_length - t, c)
+ msk_pad = video_msk.new_zeros(b, self.vis_pad_length - t)
+ pymid_emb = torch.cat((video_emb, emb_pad), dim=1)
+ pymid_msk = torch.cat((video_msk, msk_pad), dim=1)
+ else:
+ pymid_emb, pymid_msk = video_emb, video_msk
+
+ pymid, pymid_msk = self.pyramid(pymid_emb, pymid_msk, return_mask=True)
+ if not len(pymid) == len(pymid_msk) == len(self.strides):
+ warnings.warn(f'pyramid size mismatch: {len(pymid)} {len(pymid_msk)} {len(self.strides)}')
+
+ point = self.generator(pymid)
+
+ out_class = [self.class_head(e) for e in pymid]
+ out_class = torch.cat(out_class, dim=1)
+
+ out_coord = [self.coef(self.coord_head(e).exp(), i) for i, e in enumerate(pymid)]
+ out_coord = torch.cat(out_coord, dim=1)
+
+ data = dict(
+ point=point,
+ video_emb=video_emb,
+ query_emb=query_emb,
+ video_msk=video_msk,
+ pymid_msk=pymid_msk,
+ out_class=out_class,
+ out_coord=out_coord,
+ boundary=point.new_tensor(ts),
+ saliency=saliency[batch_idx].unsqueeze(0),
+ pos_clip=pos_clip[batch_idx].unsqueeze(0))
+
+ losses = self.bundle_loss(data, dict())
+ # print({k: v.item() for k, v in losses.items()})
+
+ loss_regs.append(sum(v for v in losses.values()))
+ avg_factors.append(len(ts))
+
+ assert len(loss_regs) in (1, 2) and len(loss_regs) == len(avg_factors)
+
+ if len(loss_regs) == 2 and loss_regs[0] > loss_regs[1]:
+ loss_reg, avg_factor = loss_regs[1], avg_factors[1]
+ else:
+ loss_reg, avg_factor = loss_regs[0], avg_factors[0]
+
+ if avg_factor > 0:
+ outputs.loss = outputs.loss + loss_reg / avg_factor
+ elif mode == 'generating':
+ logits = outputs.logits[0, -1]
+ if logits.argmax() == self.config.reg_token_id:
+ assert self.model.norm.state.size() == (1, 1, self.config.hidden_size)
+
+ # only consider the first video
+ s, e = self.cache_vision_inds[0][0]
+
+ # spatial merge size set to 2
+ window = int(video_grid_thw[0][1] * video_grid_thw[0][2] / 4)
+ assert video_grid_thw[0][0] * window == e - s
+
+ reg_tokens = self.reg_proj(self.model.norm.state)
+ # reg_tokens: num_reg_tokens * 1 * channel
+
+ vis_tokens = self.cache_norm_state[:, s:e]
+ vis_tokens = vis_tokens.transpose(-1, -2)
+ vis_tokens = F.avg_pool1d(vis_tokens.float(), window, stride=window).to(vis_tokens.dtype)
+ vis_tokens = vis_tokens.transpose(-1, -2)
+ vis_tokens = self.vis_proj(vis_tokens).repeat(reg_tokens.size(0), 1, 1)
+ # vis_tokens: num_reg_tokens * num_frames * channel
+
+ vis_tokens = self.vis_emb(vis_tokens)
+ reg_tokens = self.reg_emb(reg_tokens)
+ pe = self.vis_pos(vis_tokens).to(vis_tokens.dtype)
+
+ joint_tokens = torch.cat((vis_tokens + pe, reg_tokens), dim=1)
+ for blk in self.vis_fuse:
+ joint_tokens = blk(joint_tokens)
+ joint_tokens = self.vis_norm(joint_tokens)
+
+ video_emb = joint_tokens[:, :-1]
+ # video_emb: num_reg_tokens * num_frames * channel
+
+ query_emb = joint_tokens[:, -1:]
+ # query_emb: num_reg_tokens * 1 * channel
+
+ b, t, _ = video_emb.size()
+ video_msk = video_emb.new_ones(b, t)
+
+ pymid = self.pyramid(video_emb, video_msk)
+ point = self.generator(pymid)
+
+ out_class = [self.class_head(e).sigmoid() for e in pymid]
+ out_class = torch.cat(out_class, dim=1)
+
+ out_coord = [self.coef(self.coord_head(e).exp(), i) for i, e in enumerate(pymid)]
+ out_coord = torch.cat(out_coord, dim=1)
+
+ sal = out_class[0]
+ bnd = out_coord[0]
+
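+                # decode offsets: scale by the point stride, shift by the point coordinate,
+                # then normalize by the number of frames so boundaries are expressed as
+                # fractions of the video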
+ bnd[:, 0] *= -1
+ bnd *= point[:, 3, None].repeat(1, 2)
+ bnd += point[:, 0, None].repeat(1, 2)
+ bnd /= t
+ bnd = torch.cat((bnd, sal), dim=-1)
+
+ _, inds = bnd[:, -1].sort(descending=True)
+ bnd = bnd[inds]
+
+ # hard coding nms config here
+ nms_cfg = dict(type='normal', thres=0.75)
+ assert nms_cfg['type'] in ('normal', 'linear', 'gaussian')
+
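+                # 'normal' zeroes the scores of overlapping candidates (hard NMS), while
+                # 'linear' and 'gaussian' only decay them (soft-NMS style)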
+ for i in range(bnd.size(0)):
+ max_idx = bnd[i:, -1].argmax(dim=0)
+ bnd = nncore.swap_element(bnd, i, max_idx + i)
+ iou = temporal_iou(bnd[i, None, :-1], bnd[i + 1:, :-1])[0]
+
+ if nms_cfg['type'] == 'normal':
+ bnd[i + 1:, -1][iou >= nms_cfg['thres']] = 0
+ elif nms_cfg['type'] == 'linear':
+ bnd[i + 1:, -1] *= 1 - iou
+ else:
+ bnd[i + 1:, -1] *= (-iou.pow(2) / nms_cfg['sigma']).exp()
+
+ # save top-100 predictions
+ self.reg.append(bnd[:100])
+
+ # save all saliency scores
+ self.sal.append(sal)
+
+ return outputs
+
+
+# register the patched model as a vision-to-sequence model
+MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES[AgentQwen2VLConfig.model_type] = 'AgentQwen2VLForConditionalGeneration'
+
+AutoConfig.register(AgentQwen2VLConfig.model_type, AgentQwen2VLConfig)
+AutoModel.register(AgentQwen2VLConfig, AgentQwen2VLForConditionalGeneration)
diff --git a/videomind/train/custom_trainer.py b/videomind/train/custom_trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..abe65d5b9a58552394f08ddc716e023f8930d2fc
--- /dev/null
+++ b/videomind/train/custom_trainer.py
@@ -0,0 +1,251 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import warnings
+
+import nncore
+import torch
+from deepspeed import zero
+from safetensors.torch import load_model, save_file
+from torch.utils.data import Sampler
+from transformers import Trainer, TrainerCallback
+from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
+from transformers.trainer_pt_utils import get_parameter_names
+from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
+from transformers.utils import CHAT_TEMPLATE_NAME
+
+
+def gather(param):
+ if hasattr(param, 'ds_id'):
+ with zero.GatheredParameters([param]):
+ param = param.data.detach().cpu().clone()
+ else:
+ param = param.detach().cpu().clone()
+ return param
+
+
+def gather_lora_params(model, bias):
+ assert bias in ('lora_only', 'all', 'none')
+
+ if bias == 'lora_only':
+ state_dict, maybe_lora_bias, lora_bias_names = dict(), dict(), set()
+ for n, p in model.named_parameters():
+ if 'modules_to_save' in n:
+ state_dict[n] = p
+ elif 'lora_' in n:
+ state_dict[n] = p
+ bias_name = n.split('lora_')[0] + 'bias'
+ lora_bias_names.add(bias_name)
+ elif 'bias' in n:
+ maybe_lora_bias[n] = p
+        for n, p in maybe_lora_bias.items():
+            if n in lora_bias_names:
+                state_dict[n] = p
+ else:
+ keys = ['lora_', 'modules_to_save', 'bias'] if bias == 'all' else ['lora_', 'modules_to_save']
+ state_dict = {n: p for n, p in model.named_parameters() if any(k in n for k in keys)}
+
+ state_dict = {n: gather(p) for n, p in state_dict.items()}
+ return state_dict
+
+
+def gather_key_params(model, keys):
+ state_dict = {n: p for n, p in model.named_parameters() if p.requires_grad and any(k in n for k in keys)}
+ state_dict = {n: gather(p) for n, p in state_dict.items()}
+ return state_dict
+
+
+class GroupSampler(Sampler):
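+    # groups sample indices by data type so that every yielded batch contains a single type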
+
+ def __init__(self, group_size, data_types, seed):
+ self.group_size = group_size
+ self.data_types = data_types
+ self.seed = seed
+
+ def __len__(self):
+ return len(self.data_types)
+
+ def __iter__(self):
+ g = torch.Generator()
+ g.manual_seed(self.seed + self.epoch)
+
+ # avoid using dict or set here as they are not deterministic
+ unique_types, groups = [], []
+ for i, t in enumerate(self.data_types):
+ if t not in unique_types:
+ unique_types.append(t)
+ groups.append([])
+ groups[unique_types.index(t)].append(i)
+
+ group_batches = []
+ for group in groups:
+ inds = [group[i] for i in torch.randperm(len(group), generator=g)]
+ batches = [inds[i:i + self.group_size] for i in range(0, len(inds), self.group_size)]
+
+ if len(batches[-1]) < self.group_size:
+ batches = batches[:-1]
+
+ group_batches += batches
+
+ perm_group_batches = [group_batches[i] for i in torch.randperm(len(group_batches), generator=g)]
+ inds = [i for batch in perm_group_batches for i in batch]
+
+ return iter(inds)
+
+ def set_epoch(self, epoch):
+ self.epoch = epoch
+
+
+class SetEpochCallback(TrainerCallback):
+
+ # partially fixed in https://github.com/huggingface/accelerate/pull/3246
+ # but not for the case of batch_sampler.batch_sampler.sampler
+ def on_epoch_begin(self, args, state, control, **kwargs):
+ shard_sampler = kwargs['train_dataloader'].batch_sampler
+ batch_sampler = getattr(shard_sampler, 'batch_sampler', shard_sampler)
+ batch_sampler.sampler.set_epoch(int(state.epoch))
+
+
+class CustomTrainer(Trainer):
+
+ def __init__(self, *args, processor=None, head_keys=None, **kwargs):
+ super().__init__(*args, tokenizer=processor, **kwargs)
+ self.add_callback(SetEpochCallback())
+ self.processor = processor
+ self.head_keys = head_keys
+
+ def _get_train_sampler(self):
+ if self.args.group_by_data_type:
+ return GroupSampler(self.args.train_batch_size * self.args.world_size, self.train_dataset.data_types,
+ self.args.seed)
+ else:
+ return super()._get_train_sampler()
+
+ def _load_from_checkpoint(self, resume_from_checkpoint, model=None):
+ if model is None:
+ model = self.model
+
+ super()._load_from_checkpoint(resume_from_checkpoint, model=model)
+
+ partial_path = nncore.join(resume_from_checkpoint, 'pytorch_model.safetensors')
+ if nncore.is_file(partial_path):
+ load_model(model, partial_path, strict=False, device=model.device)
+
+ def create_optimizer(self):
+ if self.optimizer is None:
+ grad_ps = [(n, p) for n, p in self.model.named_parameters() if p.requires_grad]
+
+ decay_ps = get_parameter_names(self.model, ALL_LAYERNORM_LAYERS)
+ decay_ps = [n for n in decay_ps if 'bias' not in n]
+
+ if self.args.lora_lr is None:
+ self.args.lora_lr = self.args.learning_rate
+
+ if self.args.head_lr is None:
+ self.args.head_lr = self.args.learning_rate
+
+ lora_ps = [n for n, _ in grad_ps if 'lora' in n]
+ head_ps = [n for n, _ in grad_ps if any(k in n for k in self.head_keys)]
+ assert all(n not in lora_ps for n in head_ps) and all(n not in head_ps for n in lora_ps)
+
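+            # six parameter groups: {decay, no decay} x {base, LoRA, head}, with separate
+            # learning rates for the LoRA and head parameters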
+ groups = [{
+ 'params': [p for n, p in grad_ps if (n in decay_ps and n not in lora_ps and n not in head_ps)],
+ 'weight_decay': self.args.weight_decay
+ }, {
+ 'params': [p for n, p in grad_ps if (n not in decay_ps and n not in lora_ps and n not in head_ps)],
+ 'weight_decay': 0.0
+ }, {
+ 'params': [p for n, p in grad_ps if (n in decay_ps and n in lora_ps)],
+ 'weight_decay': self.args.weight_decay,
+ 'lr': self.args.lora_lr
+ }, {
+ 'params': [p for n, p in grad_ps if (n not in decay_ps and n in lora_ps)],
+ 'weight_decay': 0.0,
+ 'lr': self.args.lora_lr
+ }, {
+ 'params': [p for n, p in grad_ps if (n in decay_ps and n in head_ps)],
+ 'weight_decay': self.args.weight_decay,
+ 'lr': self.args.head_lr
+ }, {
+ 'params': [p for n, p in grad_ps if (n not in decay_ps and n in head_ps)],
+ 'weight_decay': 0.0,
+ 'lr': self.args.head_lr
+ }]
+
+ optim_cls, kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)
+ self.optimizer = optim_cls(groups, **kwargs)
+
+ return self.optimizer
+
+ def gather_and_save_model(self):
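+        # saving strategy: either merge the LoRA weights (when possible) and write the full
+        # model, or save the adapter together with the extra head parameters separately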
+ deepspeed_zero3 = self.accelerator.deepspeed_config['zero_optimization']['stage'] == 3
+ output_dir = self.args.output_dir
+
+ if self.args.should_save:
+ print(f'Saving final model to {nncore.abs_path(output_dir)}...')
+
+ if self.processor is not None and self.args.should_save:
+ self.processor.save_pretrained(output_dir)
+
+ # https://github.com/huggingface/transformers/pull/33462
+ if self.processor.chat_template is not None:
+ chat_template = {'chat_template': self.processor.chat_template}
+ nncore.dump(chat_template, nncore.join(output_dir, CHAT_TEMPLATE_NAME), indent=2)
+
+ if self.args.save_full_model and self.args.lora_enable and deepspeed_zero3:
+ warnings.warn('LoRA models cannot be saved in full mode under zero3, saving adapters instead')
+ self.args.save_full_model = False
+
+ if self.args.save_full_model:
+ if self.args.lora_enable:
+ self.model = self.model.merge_and_unload()
+
+ if deepspeed_zero3 and not self.model_wrapped.zero_gather_16bit_weights_on_model_save():
+ warnings.warn('Saving zero checkpoint, use zero_to_fp32.py to recover weights')
+ self.model_wrapped.save_checkpoint(output_dir)
+ return
+
+ if deepspeed_zero3:
+ state_dict = self.model_wrapped._zero3_consolidated_16bit_state_dict()
+ else:
+ state_dict = self.model.state_dict()
+
+ if self.args.should_save:
+ state_dict = {k[17:] if k.startswith('base_model.model.') else k: v for k, v in state_dict.items()}
+ self._save(output_dir, state_dict=state_dict)
+ else:
+ if self.args.lora_enable:
+ state_dict = gather_lora_params(self.model, self.args.lora_bias)
+ if self.args.should_save:
+ self.model.save_pretrained(output_dir, state_dict=state_dict)
+
+ if self.args.should_save:
+ self.model.config.save_pretrained(output_dir)
+ self.model.generation_config.save_pretrained(output_dir)
+ self.tokenizer.save_pretrained(output_dir)
+
+ state_dict = gather_key_params(self.model, self.head_keys)
+ if self.args.should_save and state_dict:
+ save_file(state_dict, nncore.join(output_dir, 'pytorch_model.safetensors'))
+
+ def _save_checkpoint(self, model, trial, **kwargs):
+ output_dir = self._get_output_dir(trial)
+ output_dir = nncore.join(output_dir, f'{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}')
+
+ if self.args.should_save:
+ print(f'Saving checkpoint to {nncore.abs_path(output_dir)}...')
+
+ super()._save_checkpoint(model, trial, **kwargs)
+
+ if self.processor is not None and self.args.should_save:
+ self.processor.save_pretrained(output_dir)
+
+ # https://github.com/huggingface/transformers/pull/33462
+ if self.processor.chat_template is not None:
+ chat_template = {'chat_template': self.processor.chat_template}
+ nncore.dump(chat_template, nncore.join(output_dir, CHAT_TEMPLATE_NAME), indent=2)
+
+ if self.args.lora_enable:
+ state_dict = gather_key_params(self.model, self.head_keys)
+ if self.args.should_save:
+ self.model.config.save_pretrained(output_dir)
+ save_file(state_dict, nncore.join(output_dir, 'pytorch_model.safetensors'))
diff --git a/videomind/train/train.py b/videomind/train/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..3217a36fc1540e3dfff77b38eed05422e2a9e274
--- /dev/null
+++ b/videomind/train/train.py
@@ -0,0 +1,200 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+from dataclasses import dataclass, field
+from typing import Optional
+
+import nncore
+import torch
+import torch.nn as nn
+from peft import LoraConfig, PeftModel, get_peft_model
+from transformers import AutoProcessor, HfArgumentParser, TrainingArguments
+
+from videomind.constants import REG_TOKEN, SEG_E_TOKEN, SEG_S_TOKEN
+from videomind.dataset import HybridDataCollator, HybridDataset
+from videomind.model import MODELS
+from videomind.model.builder import build_model
+from videomind.train.custom_trainer import CustomTrainer
+
+
+@dataclass
+class ModelArguments:
+ model_name_or_path: Optional[str] = field(default=None)
+ base_model: Optional[str] = field(default=None)
+ conv_type: Optional[str] = field(default=None)
+ role: Optional[str] = field(default=None)
+
+
+@dataclass
+class DataArguments:
+ datasets: Optional[str] = field(default=None)
+ min_video_len: Optional[int] = field(default=-1)
+ max_video_len: Optional[int] = field(default=-1)
+ min_num_words: Optional[int] = field(default=-1)
+ max_num_words: Optional[int] = field(default=-1)
+ max_retries: Optional[int] = field(default=10)
+
+
+@dataclass
+class CustomArguments:
+ optim: Optional[str] = field(default='adamw_torch')
+ group_by_data_type: Optional[bool] = field(default=True)
+ merge_adapter: Optional[bool] = field(default=False)
+ lora_enable: Optional[bool] = field(default=False)
+ lora_type: Optional[str] = field(default='qkvo')
+ lora_r: Optional[int] = field(default=64)
+ lora_alpha: Optional[int] = field(default=64)
+ lora_dropout: Optional[float] = field(default=0.1)
+ lora_bias: Optional[str] = field(default='none')
+ lora_lr: Optional[float] = field(default=None)
+ head_lr: Optional[float] = field(default=None)
+ tuning_modules: Optional[str] = field(default=None)
+ save_full_model: Optional[bool] = field(default=False)
+ remove_unused_columns: Optional[bool] = field(default=False)
+
+
+@dataclass
+class TrainingArguments(CustomArguments, TrainingArguments):
+ pass
+
+
+def get_target_modules(model, lora_type, base_model):
+ lora_type = lora_type.split('_')
+ assert all(t in ('qkvo', 'linear', 'all') for t in lora_type)
+
+ if base_model == 'qwen2_vl':
+ # all qkvo layers in the visual encoder and the llm
+ qkvo_keys = ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'attn.qkv', 'attn.proj']
+
+ target_modules = set()
+ for n, m in model.named_modules():
+ if not isinstance(m, nn.Linear):
+ continue
+ if 'all' not in lora_type and 'visual' in n:
+ continue
+ if 'qkvo' in lora_type and not any(n.endswith(k) for k in qkvo_keys):
+ continue
+ target_modules.add(n)
+ else:
+ raise ValueError(f'unknown base model: {base_model}')
+
+ return target_modules
+
+
+def train(TrainingArguments, Trainer):
+ parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+ assert model_args.role in ('all_in_one', 'planner', 'grounder', 'verifier', 'answerer')
+
+ config_cls, model_cls = MODELS[model_args.base_model]
+
+ dtype = torch.bfloat16 if training_args.bf16 else torch.float32
+
+ config = config_cls.from_pretrained(model_args.model_name_or_path, torch_dtype=dtype)
+ config.update(model_args.__dict__)
+
+ if config.model_type == 'agent_qwen2_vl':
+ model, processor = build_model(
+ model_args.model_name_or_path,
+ config=config,
+ is_trainable=True,
+ merge_adapter=training_args.merge_adapter,
+ dtype=dtype)
+ else:
+ # set do_resize to false to avoid duplicated resizing
+ # https://github.com/huggingface/transformers/tree/main/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
+ processor = AutoProcessor.from_pretrained(model_args.model_name_or_path, do_resize=False)
+
+ # eager attention has known & unknown bugs
+ # [4.46.2] broken causality fp16: https://github.com/huggingface/transformers/issues/35151
+ # [4.48.1] broken sliding window: https://github.com/huggingface/transformers/issues/35924
+ model = model_cls.from_pretrained(model_args.model_name_or_path, config=config, attn_implementation='sdpa')
+
+ # save base model path for inference
+ model.config.base_model_path = model_args.model_name_or_path
+
+ # conv parameters may become inf after casting to fp16
+ model.reset_conv_parameters()
+
+ model.requires_grad_(False)
+
+ if training_args.lora_enable and not isinstance(model, PeftModel):
+ target_modules = get_target_modules(model, training_args.lora_type, model.config.base_model)
+ tune_lm_head = model.config.role in ('all_in_one', 'grounder', 'verifier')
+ print(f'LoRA target modules: {target_modules}')
+ lora_config = LoraConfig(
+ task_type='CAUSAL_LM',
+ r=training_args.lora_r,
+ lora_alpha=training_args.lora_alpha,
+ lora_dropout=training_args.lora_dropout,
+ bias=training_args.lora_bias,
+ target_modules=target_modules,
+ modules_to_save=['embed_tokens', 'lm_head'] if tune_lm_head else None)
+ # transformers integration does not support merge_and_unload, use peft instead
+ model = get_peft_model(model, lora_config, adapter_name=model_args.role)
+
+ new_tokens = processor.tokenizer.add_special_tokens(
+ dict(additional_special_tokens=[REG_TOKEN, SEG_S_TOKEN, SEG_E_TOKEN]))
+ print(f'Added {new_tokens} new token(s)')
+
+ model.config.reg_token_id = processor.tokenizer.convert_tokens_to_ids(REG_TOKEN)
+ model.config.seg_s_token_id = processor.tokenizer.convert_tokens_to_ids(SEG_S_TOKEN)
+ model.config.seg_e_token_id = processor.tokenizer.convert_tokens_to_ids(SEG_E_TOKEN)
+
+ if new_tokens > 0 and len(processor.tokenizer) > model.config.vocab_size:
+ print(f'Expanding vocab size: {model.config.vocab_size} -> {len(processor.tokenizer)}')
+ model.resize_token_embeddings(len(processor.tokenizer))
+ i_emb = model.get_input_embeddings().weight.data
+ o_emb = model.get_output_embeddings().weight.data
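+        # initialize the new special token embeddings with the mean of the existing ones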
+ i_emb[-new_tokens:] = i_emb[:-new_tokens].mean(0, keepdim=True)
+ o_emb[-new_tokens:] = o_emb[:-new_tokens].mean(0, keepdim=True)
+
+ tuning_modules = [] if training_args.tuning_modules is None else training_args.tuning_modules.split(',')
+
+ head_keys = [
+ 'vis_proj', 'reg_proj', 'vis_fuse', 'vis_norm', 'vis_pos', 'vis_emb', 'reg_emb', 'pyramid', 'class_head',
+ 'coord_head', 'coef', 'bundle_loss'
+ ]
+
+ for n, p in model.named_parameters():
+ # embed_tokens and lm_head might be handled by lora
+ if not training_args.lora_enable and new_tokens > 0 and any(k in n for k in ('embed_tokens', 'lm_head')):
+ p.requires_grad = True
+
+ if 'projector' in tuning_modules and 'visual.merger' in n:
+ p.requires_grad = True
+
+ if model_args.role in ('all_in_one', 'grounder') and any(k in n for k in head_keys):
+ p.requires_grad = True
+
+ if training_args.local_rank in (0, -1):
+ for n, p in model.named_parameters():
+ print(p.requires_grad, p.dtype, p.shape, n)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ learnable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+ ratio = round(learnable_params / total_params * 100, 2) if total_params > 0 else 0
+ print(f'Total params: {total_params} Learnable params: {learnable_params} ({ratio}%)')
+
+ i_size = model.get_input_embeddings().num_embeddings
+ o_size = model.get_output_embeddings().out_features
+ assert i_size == o_size, (i_size, o_size)
+ print(f'Tokenizer size: {len(processor.tokenizer)} Vocab size: {model.config.vocab_size} Embed size: {i_size}')
+
+ trainer = Trainer(
+ model=model,
+ args=training_args,
+ data_collator=HybridDataCollator(processor.tokenizer),
+ train_dataset=HybridDataset(processor, model.config, model_args, data_args, training_args),
+ processor=processor,
+ head_keys=head_keys)
+
+ has_ckpt = bool(nncore.find(training_args.output_dir, 'checkpoint-*'))
+ trainer.train(resume_from_checkpoint=has_ckpt)
+
+ trainer.save_state()
+ trainer.gather_and_save_model()
+
+
+if __name__ == '__main__':
+ train(TrainingArguments, CustomTrainer)
diff --git a/videomind/utils/io.py b/videomind/utils/io.py
new file mode 100644
index 0000000000000000000000000000000000000000..4040700eeba7c01c17b877f2e15dc80c90d6d21e
--- /dev/null
+++ b/videomind/utils/io.py
@@ -0,0 +1,30 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import pysrt
+from decord import VideoReader
+
+
+def time_to_seconds(t):
+ return (t.hour * 60 + t.minute) * 60 + t.second + t.microsecond / 1000000
+
+
+def load_subtitle(path):
+ subs = pysrt.open(path)
+
+ parsed = []
+ for sub in subs:
+ s = time_to_seconds(sub.start.to_time())
+ e = time_to_seconds(sub.end.to_time())
+ parsed.append((s, e, sub.text))
+
+ return parsed
+
+
+def get_duration(path, num_threads=1):
+ # sometimes the video is loaded as a list of frames
+ if isinstance(path, list):
+ return len(path)
+
+ vr = VideoReader(path, num_threads=num_threads)
+ duration = len(vr) / vr.get_avg_fps()
+ return duration
diff --git a/videomind/utils/parser.py b/videomind/utils/parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ca1a643c8e050680686ecb7ce65d1ac073de4d8
--- /dev/null
+++ b/videomind/utils/parser.py
@@ -0,0 +1,25 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import re
+
+
+def parse_span(span, duration, min_len=-1):
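+    # e.g. parse_span((5.2, 3.1), 60, 10) -> (0.0, 10.0): endpoints are clamped and
+    # re-ordered, then the span is expanded to min_len around its (clamped) center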
+ s, e = span
+ s, e = min(duration, max(0, s)), min(duration, max(0, e))
+ s, e = min(s, e), max(s, e)
+
+ if min_len != -1 and e - s < min_len:
+ h = min_len / 2
+ c = min(duration - h, max(h, (s + e) / 2))
+ s, e = c - h, c + h
+
+ s, e = min(duration, max(0, s)), min(duration, max(0, e))
+ return s, e
+
+
+def parse_query(query):
+ return re.sub(r'\s+', ' ', query).strip().strip('.').strip()
+
+
+def parse_question(question):
+ return re.sub(r'\s+', ' ', question).strip()