# Copyright (c) 2025 Ye Liu. Licensed under the BSD-3-Clause license.
import html
import json
import os
import random
import time
import gradio as gr
import nncore
import spaces
import torch
from huggingface_hub import snapshot_download
from videomind.constants import GROUNDER_PROMPT, PLANNER_PROMPT, VERIFIER_PROMPT
from videomind.dataset.utils import process_vision_info
from videomind.model.builder import build_model
from videomind.utils.io import get_duration
from videomind.utils.parser import parse_query, parse_span
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
PATH = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
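# local snapshot paths and the corresponding Hugging Face repos for the base model and the VideoMind checkpoint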
BASE_MODEL = 'model_zoo/Qwen2-VL-2B-Instruct'
BASE_MODEL_REPO = 'Qwen/Qwen2-VL-2B-Instruct'
MODEL = 'model_zoo/VideoMind-2B'
MODEL_REPO = 'yeliudev/VideoMind-2B'
TITLE = 'VideoMind: A Chain-of-LoRA Agent for Long Video Reasoning'
BADGE = """
A Chain-of-LoRA Agent for Long Video Reasoning
"""
LOGO = """
"""
DISC = 'VideoMind is a multi-modal agent framework that enhances video reasoning by emulating *human-like* processes, such as *breaking down tasks*, *localizing and verifying moments*, and *synthesizing answers*. This demo showcases how VideoMind-2B handles video-language tasks. Please open an issue if you encounter any problems.'  # noqa
# yapf:disable
EXAMPLES = [
[f'{PATH}/examples/4167294363.mp4', 'Why did the old man stand up?', ['pla', 'gnd', 'ver', 'ans']],
[f'{PATH}/examples/5012237466.mp4', 'How does the child in stripes react about the fountain?', ['pla', 'gnd', 'ver', 'ans']],
[f'{PATH}/examples/13887487955.mp4', 'What did the excavator do after it pushed the cement forward?', ['pla', 'gnd', 'ver', 'ans']],
[f'{PATH}/examples/5188348585.mp4', 'What did the person do before pouring the liquor?', ['pla', 'gnd', 'ver', 'ans']],
[f'{PATH}/examples/4766274786.mp4', 'What did the girl do after the baby lost the balloon?', ['pla', 'gnd', 'ver', 'ans']],
[f'{PATH}/examples/4742652230.mp4', 'Why is the girl pushing the boy only around the toy but not to other places?', ['pla', 'gnd', 'ver', 'ans']],
[f'{PATH}/examples/9383140374.mp4', 'How does the girl in pink control the movement of the claw?', ['pla', 'gnd', 'ver', 'ans']],
[f'{PATH}/examples/10309844035.mp4', 'Why are they holding up the phones?', ['pla', 'gnd', 'ver', 'ans']],
[f'{PATH}/examples/pA6Z-qYhSNg_60.0_210.0.mp4', 'Different types of meat products are being cut, shaped and prepared', ['gnd', 'ver']],
[f'{PATH}/examples/UFWQKrcbhjI_360.0_510.0.mp4', 'A man talks to the camera whilst walking along a roadside in a rural area', ['gnd', 'ver']],
[f'{PATH}/examples/RoripwjYFp8_210.0_360.0.mp4', 'A woman wearing glasses eating something at a street market', ['gnd', 'ver']],
[f'{PATH}/examples/h6QKDqomIPk_210.0_360.0.mp4', 'A toddler sits in his car seat, holding his yellow tablet', ['gnd', 'ver']],
[f'{PATH}/examples/Z3-IZ3HAmIA_60.0_210.0.mp4', 'A view from the window as the plane accelerates and takes off from the runway', ['gnd', 'ver']],
[f'{PATH}/examples/yId2wIocTys_210.0_360.0.mp4', "Temporally locate the visual content mentioned in the text query 'kids exercise in front of parked cars' within the video.", ['pla', 'gnd', 'ver']],
[f'{PATH}/examples/rrTIeJRVGjg_60.0_210.0.mp4', "Localize the moment that provides relevant context about 'man stands in front of a white building monologuing'.", ['pla', 'gnd', 'ver']],
[f'{PATH}/examples/DTInxNfWXVc_210.0_360.0.mp4', "Find the video segment that corresponds to the given textual query 'man with headphones talking'.", ['pla', 'gnd', 'ver']],
]
# yapf:enable
# https://github.com/gradio-app/gradio/pull/10552
JS = """
function init() {
if (window.innerWidth >= 1536) {
document.querySelector('main').style.maxWidth = '1536px'
}
}
"""
if not nncore.is_dir(BASE_MODEL):
snapshot_download(BASE_MODEL_REPO, local_dir=BASE_MODEL)
if not nncore.is_dir(MODEL):
snapshot_download(MODEL_REPO, local_dir=MODEL)
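# the grounder role is initialized by build_model(); the planner and verifier roles are attached as extra LoRA adapters below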
print('Initializing role *grounder*')
model, processor = build_model(MODEL)
print('Initializing role *planner*')
model.load_adapter(nncore.join(MODEL, 'planner'), adapter_name='planner')
print('Initializing role *verifier*')
model.load_adapter(nncore.join(MODEL, 'verifier'), adapter_name='verifier')
device = torch.device('cuda')
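# the model is moved to the GPU inside main() so that the @spaces.GPU-decorated call can request the device on demand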
def seconds_to_hms(seconds):
hours, remainder = divmod(round(seconds), 3600)
minutes, seconds = divmod(remainder, 60)
return f'{hours:02}:{minutes:02}:{seconds:02}'
def random_sample():
return random.choice(EXAMPLES)
def enable_btns():
return (gr.Button(interactive=True), ) * 3
def disable_btns():
return (gr.Button(interactive=False), ) * 3
def update_placeholder(role):
placeholder = 'Ask a question about the video...' if 'ans' in role else 'Write a query to search for a moment...'
return gr.Textbox(placeholder=placeholder)
def reset_components():
return ['pla', 'gnd', 'ver', 'ans'], 5, 0, 256
@spaces.GPU
def main(video, prompt, role, max_candidates, temperature, max_new_tokens):
global model, processor, device
history = []
if not video:
gr.Warning('Please upload a video or click [Random] to sample one.')
return history
if not prompt:
gr.Warning('Please provide a prompt or click [Random] to sample one.')
return history
if 'gnd' not in role and 'ans' not in role:
        gr.Warning('Please select at least Grounder or Answerer.')
return history
if 'ver' in role and 'gnd' not in role:
gr.Warning('Verifier cannot be used without Grounder.')
return history
if 'pla' in role and 'gnd' not in role and 'ver' not in role:
        gr.Warning('Planner can only be used together with Grounder or Verifier.')
return history
history.append({'role': 'user', 'content': prompt})
yield history
model = model.to(device)
duration = get_duration(video)
# do grounding and answering by default
do_grounding = True
do_answering = True
# initialize grounding query as prompt
query = prompt
if 'pla' in role:
text = PLANNER_PROMPT.format(prompt)
history.append({
'metadata': {
'title': '🗺️ Working as Planner...'
},
'role': 'assistant',
'content': f'##### Planner Prompt:\n\n{html.escape(text)}\n\n##### Planner Response:\n\n...'
})
yield history
start_time = time.perf_counter()
messages = [{
'role':
'user',
'content': [{
'type': 'video',
'video': video,
'num_threads': 1,
'min_pixels': 36 * 28 * 28,
'max_pixels': 64 * 28 * 28,
'max_frames': 100,
'fps': 1.0
}, {
'type': 'text',
'text': text
}]
}]
text = processor.apply_chat_template(messages, add_generation_prompt=True)
images, videos = process_vision_info(messages)
data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
data = data.to(device)
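        # reset all LoRA layers, then activate only the planner adapter for this step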
model.base_model.disable_adapter_layers()
model.base_model.enable_adapter_layers()
model.set_adapter('planner')
output_ids = model.generate(
**data,
do_sample=temperature > 0,
temperature=temperature if temperature > 0 else None,
top_p=None,
top_k=None,
repetition_penalty=None,
max_new_tokens=max_new_tokens)
assert data.input_ids.size(0) == output_ids.size(0) == 1
output_ids = output_ids[0, data.input_ids.size(1):]
if output_ids[-1] == processor.tokenizer.eos_token_id:
output_ids = output_ids[:-1]
response = processor.decode(output_ids, clean_up_tokenization_spaces=False)
for i, text in enumerate(response.split(' ')):
if i == 0:
history[-1]['content'] = history[-1]['content'].rstrip('.')
history[-1]['content'] += text
else:
history[-1]['content'] += ' ' + text
yield history
elapsed_time = round(time.perf_counter() - start_time, 1)
history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
yield history
try:
parsed = json.loads(response)
action = parsed[0] if isinstance(parsed, list) else parsed
if action['type'].lower() == 'grounder' and action['value']:
query = action['value']
elif action['type'].lower() == 'answerer':
do_grounding = False
do_answering = True
except Exception:
pass
        response = 'After browsing the video and the question, my plan to figure out the answer is as follows:\n'
step_idx = 1
if 'gnd' in role and do_grounding:
response += f'\n{step_idx}. Localize the relevant moment in this video using the query "{query}".'
step_idx += 1
if 'ver' in role and do_grounding:
            response += f'\n{step_idx}. Verify the grounded moments one-by-one and select the best candidate.'
step_idx += 1
if 'ans' in role and do_answering:
if step_idx > 1:
                response += f'\n{step_idx}. Crop the video segment and zoom in to a higher resolution.'
else:
response += f'\n{step_idx}. Analyze the whole video directly without cropping.'
history.append({'role': 'assistant', 'content': ''})
for i, text in enumerate(response.split(' ')):
history[-1]['content'] += ' ' + text if i > 0 else text
yield history
if 'gnd' in role and do_grounding:
query = parse_query(query)
text = GROUNDER_PROMPT.format(query)
history.append({
'metadata': {
'title': '🔍 Working as Grounder...'
},
'role': 'assistant',
'content': f'##### Grounder Prompt:\n\n{html.escape(text)}\n\n##### Grounder Response:\n\n...'
})
yield history
start_time = time.perf_counter()
messages = [{
'role':
'user',
'content': [{
'type': 'video',
'video': video,
'num_threads': 1,
'min_pixels': 36 * 28 * 28,
'max_pixels': 64 * 28 * 28,
'max_frames': 150,
'fps': 1.0
}, {
'type': 'text',
'text': text
}]
}]
text = processor.apply_chat_template(messages, add_generation_prompt=True)
images, videos = process_vision_info(messages)
data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
data = data.to(device)
model.base_model.disable_adapter_layers()
model.base_model.enable_adapter_layers()
model.set_adapter('grounder')
output_ids = model.generate(
**data,
do_sample=temperature > 0,
temperature=temperature if temperature > 0 else None,
top_p=None,
top_k=None,
repetition_penalty=None,
max_new_tokens=max_new_tokens)
assert data.input_ids.size(0) == output_ids.size(0) == 1
output_ids = output_ids[0, data.input_ids.size(1):]
if output_ids[-1] == processor.tokenizer.eos_token_id:
output_ids = output_ids[:-1]
response = processor.decode(output_ids, clean_up_tokenization_spaces=False)
for i, text in enumerate(response.split(' ')):
if i == 0:
history[-1]['content'] = history[-1]['content'].rstrip('.')
history[-1]['content'] += text
else:
history[-1]['content'] += ' ' + text
yield history
elapsed_time = round(time.perf_counter() - start_time, 1)
history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
yield history
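        # the grounder writes predicted spans (normalized to [0, 1]) and confidences to model.reg; if empty, fall back to fixed candidate windows or the whole video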
if len(model.reg) > 0:
# 1. extract timestamps and confidences
blob = model.reg[0].cpu().float()
pred, conf = blob[:, :2] * duration, blob[:, -1].tolist()
# 2. clamp timestamps
pred = pred.clamp(min=0, max=duration)
# 3. sort timestamps
inds = (pred[:, 1] - pred[:, 0] < 0).nonzero()[:, 0]
pred[inds] = pred[inds].roll(1)
# 4. convert timestamps to list
pred = pred.tolist()
else:
if 'ver' in role:
pred = [[i * duration / 6, (i + 2) * duration / 6] for i in range(5)]
conf = [0] * 5
else:
pred = [[0, duration]]
conf = [0]
response = 'The candidate moments and confidence scores are as follows:\n'
response += '\n| ID | Start Time | End Time | Confidence |'
response += '\n| :-: | :-: | :-: | :-: |'
for i, (p, c) in enumerate(zip(pred[:max_candidates], conf[:max_candidates])):
response += f'\n| {i} | {seconds_to_hms(p[0])} | {seconds_to_hms(p[1])} | {c:.2f} |'
        response += f'\n\nTherefore, the target moment might happen from {seconds_to_hms(pred[0][0])} to {seconds_to_hms(pred[0][1])}.'
history.append({'role': 'assistant', 'content': ''})
for i, text in enumerate(response.split(' ')):
history[-1]['content'] += ' ' + text if i > 0 else text
yield history
if 'ver' in role and do_grounding:
text = VERIFIER_PROMPT.format(query)
history.append({
'metadata': {
'title': '📊 Working as Verifier...'
},
'role': 'assistant',
'content': f'##### Verifier Prompt:\n\n{html.escape(text)}\n\n##### Verifier Response:\n\n...'
})
yield history
start_time = time.perf_counter()
prob = []
for i, cand in enumerate(pred[:max_candidates]):
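            # widen the candidate span on both sides (parse_span presumably also clamps it to the video bounds) so the verifier sees surrounding context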
s0, e0 = parse_span(cand, duration, 2)
offset = (e0 - s0) / 2
s1, e1 = parse_span([s0 - offset, e0 + offset], duration)
# percentage of s0, e0 within s1, e1
s = (s0 - s1) / (e1 - s1)
e = (e0 - s1) / (e1 - s1)
messages = [{
'role':
'user',
'content': [{
'type': 'video',
'video': video,
'num_threads': 1,
'video_start': s1,
'video_end': e1,
'min_pixels': 36 * 28 * 28,
'max_pixels': 64 * 28 * 28,
'max_frames': 64,
'fps': 2.0
}, {
'type': 'text',
'text': text
}]
}]
text = processor.apply_chat_template(messages, add_generation_prompt=True)
images, videos = process_vision_info(messages)
data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
# ===== insert segment start/end tokens =====
video_grid_thw = data['video_grid_thw'][0]
num_frames, window = int(video_grid_thw[0]), int(video_grid_thw[1] * video_grid_thw[2] / 4)
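            # each frame in the grid contributes `window` visual tokens after the 2x2 spatial merge (hence the division by 4)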
assert num_frames * window * 4 == data['pixel_values_videos'].size(0)
pos_s, pos_e = round(s * num_frames), round(e * num_frames)
pos_s, pos_e = min(max(0, pos_s), num_frames), min(max(0, pos_e), num_frames)
assert pos_s <= pos_e, (num_frames, s, e)
base_idx = torch.nonzero(data['input_ids'][0] == model.config.vision_start_token_id).item()
pos_s, pos_e = pos_s * window + base_idx + 1, pos_e * window + base_idx + 2
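            # token positions are measured from the vision-start token; pos_e uses +2 (vs +1 for pos_s) because inserting the segment-start token shifts later indices by one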
input_ids = data['input_ids'][0].tolist()
input_ids.insert(pos_s, model.config.seg_s_token_id)
input_ids.insert(pos_e, model.config.seg_e_token_id)
data['input_ids'] = torch.LongTensor([input_ids])
data['attention_mask'] = torch.ones_like(data['input_ids'])
# ===========================================
data = data.to(device)
model.base_model.disable_adapter_layers()
model.base_model.enable_adapter_layers()
model.set_adapter('verifier')
with torch.inference_mode():
logits = model(**data).logits[0, -1].softmax(dim=-1)
# NOTE: magic numbers here
# In Qwen2-VL vocab: 9454 -> Yes, 2753 -> No
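            # score = sigmoid(P("Yes") - P("No")), a soft confidence that the candidate moment matches the query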
score = (logits[9454] - logits[2753]).sigmoid().item()
prob.append(score)
if i == 0:
history[-1]['content'] = history[-1]['content'].rstrip('.')[:-1]
response = f'\nCandidate ID {i}: P(Yes) = {score:.2f}'
for j, text in enumerate(response.split(' ')):
history[-1]['content'] += ' ' + text if j > 0 else text
yield history
elapsed_time = round(time.perf_counter() - start_time, 1)
history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
yield history
ranks = torch.Tensor(prob).argsort(descending=True).tolist()
prob = [prob[idx] for idx in ranks]
pred = [pred[idx] for idx in ranks]
conf = [conf[idx] for idx in ranks]
response = 'After verification, the candidate moments are re-ranked as follows:\n'
response += '\n| ID | Start Time | End Time | Score |'
response += '\n| :-: | :-: | :-: | :-: |'
ids = list(range(len(ranks)))
for r, p, c in zip(ranks, pred, prob):
response += f'\n| {ids[r]} | {seconds_to_hms(p[0])} | {seconds_to_hms(p[1])} | {c:.2f} |'
response += f'\n\nTherefore, the target moment should be from {seconds_to_hms(pred[0][0])} to {seconds_to_hms(pred[0][1])}.'
history.append({'role': 'assistant', 'content': ''})
for i, text in enumerate(response.split(' ')):
history[-1]['content'] += ' ' + text if i > 0 else text
yield history
if 'ans' in role and do_answering:
text = f'{prompt} Please think step by step and provide your response.'
history.append({
'metadata': {
'title': '📝 Working as Answerer...'
},
'role': 'assistant',
'content': f'##### Answerer Prompt:\n\n{html.escape(text)}\n\n##### Answerer Response:\n\n...'
})
yield history
start_time = time.perf_counter()
        # choose the most promising moment (fall back to the whole video if grounding was skipped)
selected = pred[0] if 'gnd' in role and do_grounding else [0, duration]
s, e = parse_span(selected, duration, 32)
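        # widen the selected moment via parse_span (the trailing 32 is presumably a minimum span length in seconds) before answering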
messages = [{
'role':
'user',
'content': [{
'type': 'video',
'video': video,
'num_threads': 1,
'video_start': s,
'video_end': e,
'min_pixels': 128 * 28 * 28,
'max_pixels': 256 * 28 * 28,
'max_frames': 32,
'fps': 2.0
}, {
'type': 'text',
'text': text
}]
}]
text = processor.apply_chat_template(messages, add_generation_prompt=True)
images, videos = process_vision_info(messages)
data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
data = data.to(device)
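        # the answerer runs the plain base model: all LoRA adapters are disabled for this call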
with model.disable_adapter():
output_ids = model.generate(
**data,
do_sample=temperature > 0,
temperature=temperature if temperature > 0 else None,
top_p=None,
top_k=None,
repetition_penalty=None,
max_new_tokens=max_new_tokens)
assert data.input_ids.size(0) == output_ids.size(0) == 1
output_ids = output_ids[0, data.input_ids.size(1):]
if output_ids[-1] == processor.tokenizer.eos_token_id:
output_ids = output_ids[:-1]
response = processor.decode(output_ids, clean_up_tokenization_spaces=False)
for i, text in enumerate(response.split(' ')):
if i == 0:
history[-1]['content'] = history[-1]['content'].rstrip('.')
history[-1]['content'] += text
else:
history[-1]['content'] += ' ' + text
yield history
elapsed_time = round(time.perf_counter() - start_time, 1)
history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
yield history
if 'gnd' in role and do_grounding:
response = f'After zooming in and analyzing the target moment, I finalize my answer: {response}'
else:
response = f'After watching the whole video, my answer is: {response}'
history.append({'role': 'assistant', 'content': ''})
for i, text in enumerate(response.split(' ')):
history[-1]['content'] += ' ' + text if i > 0 else text
yield history
def build_demo():
chat = gr.Chatbot(
type='messages',
height='70em',
resizable=True,
avatar_images=[f'{PATH}/assets/user.png', f'{PATH}/assets/bot.png'],
placeholder='A conversation with VideoMind',
label='VideoMind')
prompt = gr.Textbox(label='Text Prompt', placeholder='Ask a question about the video...')
with gr.Blocks(title=TITLE, js=JS) as demo:
gr.HTML(LOGO)
gr.HTML(BADGE)
gr.Markdown(DISC)
with gr.Row():
with gr.Column(scale=3):
video = gr.Video()
with gr.Group():
role = gr.CheckboxGroup(
choices=[('🗺️ Planner', 'pla'), ('🔍 Grounder', 'gnd'), ('📊 Verifier', 'ver'),
('📝 Answerer', 'ans')],
value=['pla', 'gnd', 'ver', 'ans'],
interactive=True,
label='Roles',
info='Select the role(s) you would like to activate.')
role.change(update_placeholder, role, prompt)
with gr.Accordion(label='Hyperparameters', open=False):
max_candidates = gr.Slider(
1,
100,
value=5,
step=1,
interactive=True,
label='Max Candidate Moments',
info='The maximum number of candidate moments in Grounder (Default: 5)')
temperature = gr.Slider(
0,
1,
value=0,
step=0.1,
interactive=True,
label='Temperature',
info='Higher value leads to more creativity and randomness (Default: 0)')
max_new_tokens = gr.Slider(
1,
1024,
value=256,
step=1,
interactive=True,
label='Max Output Tokens',
info='The maximum number of output tokens for each role (Default: 256)')
prompt.render()
with gr.Row():
random_btn = gr.Button(value='🔮 Random')
random_btn.click(random_sample, None, [video, prompt, role])
reset_btn = gr.ClearButton([video, prompt, chat], value='🗑️ Reset')
reset_btn.click(reset_components, None, [role, max_candidates, temperature, max_new_tokens])
submit_btn = gr.Button(value='🚀 Submit', variant='primary')
ctx = submit_btn.click(disable_btns, None, [random_btn, reset_btn, submit_btn])
ctx = ctx.then(main, [video, prompt, role, max_candidates, temperature, max_new_tokens], chat)
ctx.then(enable_btns, None, [random_btn, reset_btn, submit_btn])
gr.Examples(examples=EXAMPLES, inputs=[video, prompt, role], examples_per_page=3)
with gr.Column(scale=5):
chat.render()
return demo
if __name__ == '__main__':
demo = build_demo()
demo.queue()
demo.launch(server_name='0.0.0.0', allowed_paths=[f'{PATH}/assets', f'{PATH}/examples'])