# Copyright (c) 2025 Ye Liu. Licensed under the BSD-3-Clause license.
import html
import json
import os
import random
import time
import gradio as gr
import nncore
import spaces
import torch
from huggingface_hub import snapshot_download
from videomind.constants import GROUNDER_PROMPT, PLANNER_PROMPT, VERIFIER_PROMPT
from videomind.dataset.utils import process_vision_info
from videomind.model.builder import build_model
from videomind.utils.io import get_duration
from videomind.utils.parser import parse_query, parse_span
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
PATH = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
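# local snapshot paths and the corresponding Hugging Face repos for the base model and the VideoMind checkpoint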
BASE_MODEL = 'model_zoo/Qwen2-VL-2B-Instruct'
BASE_MODEL_REPO = 'Qwen/Qwen2-VL-2B-Instruct'
MODEL = 'model_zoo/VideoMind-2B'
MODEL_REPO = 'yeliudev/VideoMind-2B'
TITLE = 'VideoMind: A Chain-of-LoRA Agent for Long Video Reasoning'
BADGE = """
A Chain-of-LoRA Agent for Long Video Reasoning
"""
LOGO = """
"""
DISC = 'VideoMind is a multi-modal agent framework that enhances video reasoning by emulating *human-like* processes, such as *breaking down tasks*, *localizing and verifying moments*, and *synthesizing answers*. This demo showcases how VideoMind-2B handles video-language tasks. Please open an issue if you encounter any problems.'  # noqa
# yapf:disable
EXAMPLES = [
[f'{PATH}/examples/4167294363.mp4', 'Why did the old man stand up?', ['pla', 'gnd', 'ver', 'ans']],
[f'{PATH}/examples/5012237466.mp4', 'How does the child in stripes react about the fountain?', ['pla', 'gnd', 'ver', 'ans']],
[f'{PATH}/examples/13887487955.mp4', 'What did the excavator do after it pushed the cement forward?', ['pla', 'gnd', 'ver', 'ans']],
[f'{PATH}/examples/5188348585.mp4', 'What did the person do before pouring the liquor?', ['pla', 'gnd', 'ver', 'ans']],
[f'{PATH}/examples/4766274786.mp4', 'What did the girl do after the baby lost the balloon?', ['pla', 'gnd', 'ver', 'ans']],
[f'{PATH}/examples/4742652230.mp4', 'Why is the girl pushing the boy only around the toy but not to other places?', ['pla', 'gnd', 'ver', 'ans']],
[f'{PATH}/examples/9383140374.mp4', 'How does the girl in pink control the movement of the claw?', ['pla', 'gnd', 'ver', 'ans']],
[f'{PATH}/examples/10309844035.mp4', 'Why are they holding up the phones?', ['pla', 'gnd', 'ver', 'ans']],
[f'{PATH}/examples/pA6Z-qYhSNg_60.0_210.0.mp4', 'Different types of meat products are being cut, shaped and prepared', ['gnd', 'ver']],
[f'{PATH}/examples/UFWQKrcbhjI_360.0_510.0.mp4', 'A man talks to the camera whilst walking along a roadside in a rural area', ['gnd', 'ver']],
[f'{PATH}/examples/RoripwjYFp8_210.0_360.0.mp4', 'A woman wearing glasses eating something at a street market', ['gnd', 'ver']],
[f'{PATH}/examples/h6QKDqomIPk_210.0_360.0.mp4', 'A toddler sits in his car seat, holding his yellow tablet', ['gnd', 'ver']],
[f'{PATH}/examples/Z3-IZ3HAmIA_60.0_210.0.mp4', 'A view from the window as the plane accelerates and takes off from the runway', ['gnd', 'ver']],
[f'{PATH}/examples/yId2wIocTys_210.0_360.0.mp4', "Temporally locate the visual content mentioned in the text query 'kids exercise in front of parked cars' within the video.", ['pla', 'gnd', 'ver']],
[f'{PATH}/examples/rrTIeJRVGjg_60.0_210.0.mp4', "Localize the moment that provides relevant context about 'man stands in front of a white building monologuing'.", ['pla', 'gnd', 'ver']],
[f'{PATH}/examples/DTInxNfWXVc_210.0_360.0.mp4', "Find the video segment that corresponds to the given textual query 'man with headphones talking'.", ['pla', 'gnd', 'ver']],
]
# yapf:enable
# https://github.com/gradio-app/gradio/pull/10552
JS = """
function init() {
if (window.innerWidth >= 1536) {
document.querySelector('main').style.maxWidth = '1536px'
}
}
"""
if not nncore.is_dir(BASE_MODEL):
snapshot_download(BASE_MODEL_REPO, local_dir=BASE_MODEL)
if not nncore.is_dir(MODEL):
snapshot_download(MODEL_REPO, local_dir=MODEL)
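# the grounder role is initialized by build_model(); the planner and verifier roles are attached as extra LoRA adapters below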
print('Initializing role *grounder*')
model, processor = build_model(MODEL)
print('Initializing role *planner*')
model.load_adapter(nncore.join(MODEL, 'planner'), adapter_name='planner')
print('Initializing role *verifier*')
model.load_adapter(nncore.join(MODEL, 'verifier'), adapter_name='verifier')
device = torch.device('cuda')
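# the model is moved to the GPU inside main() so that the @spaces.GPU-decorated call can request the device on demand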
def seconds_to_hms(seconds):
hours, remainder = divmod(round(seconds), 3600)
minutes, seconds = divmod(remainder, 60)
return f'{hours:02}:{minutes:02}:{seconds:02}'
def random_sample():
return random.choice(EXAMPLES)
def enable_btns():
return (gr.Button(interactive=True), ) * 3
def disable_btns():
return (gr.Button(interactive=False), ) * 3
def update_placeholder(role):
placeholder = 'Ask a question about the video...' if 'ans' in role else 'Write a query to search for a moment...'
return gr.Textbox(placeholder=placeholder)
def reset_components():
return ['pla', 'gnd', 'ver', 'ans'], 5, 0, 256
@spaces.GPU
def main(video, prompt, role, max_candidates, temperature, max_new_tokens):
global model, processor, device
history = []
if not video:
gr.Warning('Please upload a video or click [Random] to sample one.')
return history
if not prompt:
gr.Warning('Please provide a prompt or click [Random] to sample one.')
return history
if 'gnd' not in role and 'ans' not in role:
        gr.Warning('Please select at least Grounder or Answerer.')
return history
if 'ver' in role and 'gnd' not in role:
gr.Warning('Verifier cannot be used without Grounder.')
return history
if 'pla' in role and 'gnd' not in role and 'ver' not in role:
        gr.Warning('Planner can only be used together with Grounder or Verifier.')
return history
history.append({'role': 'user', 'content': prompt})
yield history
model = model.to(device)
duration = get_duration(video)
# do grounding and answering by default
do_grounding = True
do_answering = True
# initialize grounding query as prompt
query = prompt
if 'pla' in role:
text = PLANNER_PROMPT.format(prompt)
history.append({
'metadata': {
'title': '🗺️ Working as Planner...'
},
'role': 'assistant',
'content': f'##### Planner Prompt:\n\n{html.escape(text)}\n\n##### Planner Response:\n\n...'
})
yield history
start_time = time.perf_counter()
messages = [{
'role':
'user',
'content': [{
'type': 'video',
'video': video,
'num_threads': 1,
'min_pixels': 36 * 28 * 28,
'max_pixels': 64 * 28 * 28,
'max_frames': 100,
'fps': 1.0
}, {
'type': 'text',
'text': text
}]
}]
text = processor.apply_chat_template(messages, add_generation_prompt=True)
images, videos = process_vision_info(messages)
data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
data = data.to(device)
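        # reset all LoRA layers, then activate only the planner adapter for this step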
model.base_model.disable_adapter_layers()
model.base_model.enable_adapter_layers()
model.set_adapter('planner')
output_ids = model.generate(
**data,
do_sample=temperature > 0,
temperature=temperature if temperature > 0 else None,
top_p=None,
top_k=None,
repetition_penalty=None,
max_new_tokens=max_new_tokens)
assert data.input_ids.size(0) == output_ids.size(0) == 1
output_ids = output_ids[0, data.input_ids.size(1):]
if output_ids[-1] == processor.tokenizer.eos_token_id:
output_ids = output_ids[:-1]
response = processor.decode(output_ids, clean_up_tokenization_spaces=False)
for i, text in enumerate(response.split(' ')):
if i == 0:
history[-1]['content'] = history[-1]['content'].rstrip('.')
history[-1]['content'] += text
else:
history[-1]['content'] += ' ' + text
yield history
elapsed_time = round(time.perf_counter() - start_time, 1)
history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
yield history
try:
parsed = json.loads(response)
action = parsed[0] if isinstance(parsed, list) else parsed
if action['type'].lower() == 'grounder' and action['value']:
query = action['value']
elif action['type'].lower() == 'answerer':
do_grounding = False
do_answering = True
except Exception:
pass
        response = 'After browsing the video and the question, my plan to figure out the answer is as follows:\n'
step_idx = 1
if 'gnd' in role and do_grounding:
response += f'\n{step_idx}. Localize the relevant moment in this video using the query "{query}".'
step_idx += 1
if 'ver' in role and do_grounding:
            response += f'\n{step_idx}. Verify the grounded moments one-by-one and select the best candidate.'
step_idx += 1
if 'ans' in role and do_answering:
if step_idx > 1:
                response += f'\n{step_idx}. Crop the video segment and zoom in to a higher resolution.'
else:
response += f'\n{step_idx}. Analyze the whole video directly without cropping.'
history.append({'role': 'assistant', 'content': ''})
for i, text in enumerate(response.split(' ')):
history[-1]['content'] += ' ' + text if i > 0 else text
yield history
if 'gnd' in role and do_grounding:
query = parse_query(query)
text = GROUNDER_PROMPT.format(query)
history.append({
'metadata': {
'title': '🔍 Working as Grounder...'
},
'role': 'assistant',
'content': f'##### Grounder Prompt:\n\n{html.escape(text)}\n\n##### Grounder Response:\n\n...'
})
yield history
start_time = time.perf_counter()
messages = [{
'role':
'user',
'content': [{
'type': 'video',
'video': video,
'num_threads': 1,
'min_pixels': 36 * 28 * 28,
'max_pixels': 64 * 28 * 28,
'max_frames': 150,
'fps': 1.0
}, {
'type': 'text',
'text': text
}]
}]
text = processor.apply_chat_template(messages, add_generation_prompt=True)
images, videos = process_vision_info(messages)
data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
data = data.to(device)
model.base_model.disable_adapter_layers()
model.base_model.enable_adapter_layers()
model.set_adapter('grounder')
output_ids = model.generate(
**data,
do_sample=temperature > 0,
temperature=temperature if temperature > 0 else None,
top_p=None,
top_k=None,
repetition_penalty=None,
max_new_tokens=max_new_tokens)
assert data.input_ids.size(0) == output_ids.size(0) == 1
output_ids = output_ids[0, data.input_ids.size(1):]
if output_ids[-1] == processor.tokenizer.eos_token_id:
output_ids = output_ids[:-1]
response = processor.decode(output_ids, clean_up_tokenization_spaces=False)
for i, text in enumerate(response.split(' ')):
if i == 0:
history[-1]['content'] = history[-1]['content'].rstrip('.')
history[-1]['content'] += text
else:
history[-1]['content'] += ' ' + text
yield history
elapsed_time = round(time.perf_counter() - start_time, 1)
history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
yield history
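        # the grounder writes predicted spans (normalized to [0, 1]) and confidences to model.reg; if empty, fall back to fixed candidate windows or the whole video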
if len(model.reg) > 0:
# 1. extract timestamps and confidences
blob = model.reg[0].cpu().float()
pred, conf = blob[:, :2] * duration, blob[:, -1].tolist()
# 2. clamp timestamps
pred = pred.clamp(min=0, max=duration)
# 3. sort timestamps
inds = (pred[:, 1] - pred[:, 0] < 0).nonzero()[:, 0]
pred[inds] = pred[inds].roll(1)
# 4. convert timestamps to list
pred = pred.tolist()
else:
if 'ver' in role:
pred = [[i * duration / 6, (i + 2) * duration / 6] for i in range(5)]
conf = [0] * 5
else:
pred = [[0, duration]]
conf = [0]
response = 'The candidate moments and confidence scores are as follows:\n'
response += '\n| ID | Start Time | End Time | Confidence |'
response += '\n| :-: | :-: | :-: | :-: |'
for i, (p, c) in enumerate(zip(pred[:max_candidates], conf[:max_candidates])):
response += f'\n| {i} | {seconds_to_hms(p[0])} | {seconds_to_hms(p[1])} | {c:.2f} |'
        response += f'\n\nTherefore, the target moment might happen from {seconds_to_hms(pred[0][0])} to {seconds_to_hms(pred[0][1])}.'
history.append({'role': 'assistant', 'content': ''})
for i, text in enumerate(response.split(' ')):
history[-1]['content'] += ' ' + text if i > 0 else text
yield history
if 'ver' in role and do_grounding:
text = VERIFIER_PROMPT.format(query)
history.append({
'metadata': {
'title': '📊 Working as Verifier...'
},
'role': 'assistant',
'content': f'##### Verifier Prompt:\n\n{html.escape(text)}\n\n##### Verifier Response:\n\n...'
})
yield history
start_time = time.perf_counter()
prob = []
for i, cand in enumerate(pred[:max_candidates]):
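            # widen the candidate span on both sides (parse_span presumably also clamps it to the video bounds) so the verifier sees surrounding context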
s0, e0 = parse_span(cand, duration, 2)
offset = (e0 - s0) / 2
s1, e1 = parse_span([s0 - offset, e0 + offset], duration)
# percentage of s0, e0 within s1, e1
s = (s0 - s1) / (e1 - s1)
e = (e0 - s1) / (e1 - s1)
messages = [{
'role':
'user',
'content': [{
'type': 'video',
'video': video,
'num_threads': 1,
'video_start': s1,
'video_end': e1,
'min_pixels': 36 * 28 * 28,
'max_pixels': 64 * 28 * 28,
'max_frames': 64,
'fps': 2.0
}, {
'type': 'text',
'text': text
}]
}]
text = processor.apply_chat_template(messages, add_generation_prompt=True)
images, videos = process_vision_info(messages)
data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
# ===== insert segment start/end tokens =====
video_grid_thw = data['video_grid_thw'][0]
num_frames, window = int(video_grid_thw[0]), int(video_grid_thw[1] * video_grid_thw[2] / 4)
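            # each frame in the grid contributes `window` visual tokens after the 2x2 spatial merge (hence the division by 4)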
assert num_frames * window * 4 == data['pixel_values_videos'].size(0)
pos_s, pos_e = round(s * num_frames), round(e * num_frames)
pos_s, pos_e = min(max(0, pos_s), num_frames), min(max(0, pos_e), num_frames)
assert pos_s <= pos_e, (num_frames, s, e)
base_idx = torch.nonzero(data['input_ids'][0] == model.config.vision_start_token_id).item()
pos_s, pos_e = pos_s * window + base_idx + 1, pos_e * window + base_idx + 2
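            # token positions are measured from the vision-start token; pos_e uses +2 (vs +1 for pos_s) because inserting the segment-start token shifts later indices by one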
input_ids = data['input_ids'][0].tolist()
input_ids.insert(pos_s, model.config.seg_s_token_id)
input_ids.insert(pos_e, model.config.seg_e_token_id)
data['input_ids'] = torch.LongTensor([input_ids])
data['attention_mask'] = torch.ones_like(data['input_ids'])
# ===========================================
data = data.to(device)
model.base_model.disable_adapter_layers()
model.base_model.enable_adapter_layers()
model.set_adapter('verifier')
with torch.inference_mode():
logits = model(**data).logits[0, -1].softmax(dim=-1)
# NOTE: magic numbers here
# In Qwen2-VL vocab: 9454 -> Yes, 2753 -> No
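            # score = sigmoid(P("Yes") - P("No")), a soft confidence that the candidate moment matches the query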
score = (logits[9454] - logits[2753]).sigmoid().item()
prob.append(score)
if i == 0:
history[-1]['content'] = history[-1]['content'].rstrip('.')[:-1]
response = f'\nCandidate ID {i}: P(Yes) = {score:.2f}'
for j, text in enumerate(response.split(' ')):
history[-1]['content'] += ' ' + text if j > 0 else text
yield history
elapsed_time = round(time.perf_counter() - start_time, 1)
history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
yield history
ranks = torch.Tensor(prob).argsort(descending=True).tolist()
prob = [prob[idx] for idx in ranks]
pred = [pred[idx] for idx in ranks]
conf = [conf[idx] for idx in ranks]
response = 'After verification, the candidate moments are re-ranked as follows:\n'
response += '\n| ID | Start Time | End Time | Score |'
response += '\n| :-: | :-: | :-: | :-: |'
ids = list(range(len(ranks)))
for r, p, c in zip(ranks, pred, prob):
response += f'\n| {ids[r]} | {seconds_to_hms(p[0])} | {seconds_to_hms(p[1])} | {c:.2f} |'
response += f'\n\nTherefore, the target moment should be from {seconds_to_hms(pred[0][0])} to {seconds_to_hms(pred[0][1])}.'
history.append({'role': 'assistant', 'content': ''})
for i, text in enumerate(response.split(' ')):
history[-1]['content'] += ' ' + text if i > 0 else text
yield history
if 'ans' in role and do_answering:
text = f'{prompt} Please think step by step and provide your response.'
history.append({
'metadata': {
'title': '📝 Working as Answerer...'
},
'role': 'assistant',
'content': f'##### Answerer Prompt:\n\n{html.escape(text)}\n\n##### Answerer Response:\n\n...'
})
yield history
start_time = time.perf_counter()
        # choose the most promising moment (fall back to the whole video if grounding was skipped)
selected = pred[0] if 'gnd' in role and do_grounding else [0, duration]
s, e = parse_span(selected, duration, 32)
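        # widen the selected moment via parse_span (the trailing 32 is presumably a minimum span length in seconds) before answering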
messages = [{
'role':
'user',
'content': [{
'type': 'video',
'video': video,
'num_threads': 1,
'video_start': s,
'video_end': e,
'min_pixels': 128 * 28 * 28,
'max_pixels': 256 * 28 * 28,
'max_frames': 32,
'fps': 2.0
}, {
'type': 'text',
'text': text
}]
}]
text = processor.apply_chat_template(messages, add_generation_prompt=True)
images, videos = process_vision_info(messages)
data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
data = data.to(device)
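        # the answerer runs the plain base model: all LoRA adapters are disabled for this call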
with model.disable_adapter():
output_ids = model.generate(
**data,
do_sample=temperature > 0,
temperature=temperature if temperature > 0 else None,
top_p=None,
top_k=None,
repetition_penalty=None,
max_new_tokens=max_new_tokens)
assert data.input_ids.size(0) == output_ids.size(0) == 1
output_ids = output_ids[0, data.input_ids.size(1):]
if output_ids[-1] == processor.tokenizer.eos_token_id:
output_ids = output_ids[:-1]
response = processor.decode(output_ids, clean_up_tokenization_spaces=False)
for i, text in enumerate(response.split(' ')):
if i == 0:
history[-1]['content'] = history[-1]['content'].rstrip('.')
history[-1]['content'] += text
else:
history[-1]['content'] += ' ' + text
yield history
elapsed_time = round(time.perf_counter() - start_time, 1)
history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
yield history
if 'gnd' in role and do_grounding:
response = f'After zooming in and analyzing the target moment, I finalize my answer: {response}'
else:
response = f'After watching the whole video, my answer is: {response}'
history.append({'role': 'assistant', 'content': ''})
for i, text in enumerate(response.split(' ')):
history[-1]['content'] += ' ' + text if i > 0 else text
yield history
def build_demo():
chat = gr.Chatbot(
type='messages',
height='70em',
resizable=True,
avatar_images=[f'{PATH}/assets/user.png', f'{PATH}/assets/bot.png'],
placeholder='A conversation with VideoMind',
label='VideoMind')
prompt = gr.Textbox(label='Text Prompt', placeholder='Ask a question about the video...')
with gr.Blocks(title=TITLE, js=JS) as demo:
gr.HTML(LOGO)
gr.HTML(BADGE)
gr.Markdown(DISC)
with gr.Row():
with gr.Column(scale=3):
video = gr.Video()
with gr.Group():
role = gr.CheckboxGroup(
choices=[('🗺️ Planner', 'pla'), ('🔍 Grounder', 'gnd'), ('📊 Verifier', 'ver'),
('📝 Answerer', 'ans')],
value=['pla', 'gnd', 'ver', 'ans'],
interactive=True,
label='Roles',
info='Select the role(s) you would like to activate.')
role.change(update_placeholder, role, prompt)
with gr.Accordion(label='Hyperparameters', open=False):
max_candidates = gr.Slider(
1,
100,
value=5,
step=1,
interactive=True,
label='Max Candidate Moments',
info='The maximum number of candidate moments in Grounder (Default: 5)')
temperature = gr.Slider(
0,
1,
value=0,
step=0.1,
interactive=True,
label='Temperature',
info='Higher value leads to more creativity and randomness (Default: 0)')
max_new_tokens = gr.Slider(
1,
1024,
value=256,
step=1,
interactive=True,
label='Max Output Tokens',
info='The maximum number of output tokens for each role (Default: 256)')
prompt.render()
with gr.Row():
random_btn = gr.Button(value='🔮 Random')
random_btn.click(random_sample, None, [video, prompt, role])
reset_btn = gr.ClearButton([video, prompt, chat], value='🗑️ Reset')
reset_btn.click(reset_components, None, [role, max_candidates, temperature, max_new_tokens])
submit_btn = gr.Button(value='🚀 Submit', variant='primary')
ctx = submit_btn.click(disable_btns, None, [random_btn, reset_btn, submit_btn])
ctx = ctx.then(main, [video, prompt, role, max_candidates, temperature, max_new_tokens], chat)
ctx.then(enable_btns, None, [random_btn, reset_btn, submit_btn])
gr.Examples(examples=EXAMPLES, inputs=[video, prompt, role], examples_per_page=3)
with gr.Column(scale=5):
chat.render()
return demo
if __name__ == '__main__':
demo = build_demo()
demo.queue()
demo.launch(server_name='0.0.0.0', allowed_paths=[f'{PATH}/assets', f'{PATH}/examples'])