# Copyright (c) 2025 Ye Liu. Licensed under the BSD-3-Clause license.

import html
import json
import os
import random
import time

import gradio as gr
import nncore
import spaces
import torch
from huggingface_hub import snapshot_download

from videomind.constants import GROUNDER_PROMPT, PLANNER_PROMPT, VERIFIER_PROMPT
from videomind.dataset.utils import process_vision_info
from videomind.model.builder import build_model
from videomind.utils.io import get_duration
from videomind.utils.parser import parse_query, parse_span

os.environ['TOKENIZERS_PARALLELISM'] = 'false'
PATH = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))

BASE_MODEL = 'model_zoo/Qwen2-VL-2B-Instruct'
BASE_MODEL_REPO = 'Qwen/Qwen2-VL-2B-Instruct'

MODEL = 'model_zoo/VideoMind-2B'
MODEL_REPO = 'yeliudev/VideoMind-2B'

TITLE = 'VideoMind: A Chain-of-LoRA Agent for Long Video Reasoning'

BADGE = """
<h3 align="center" style="margin-top: -0.5em;">A Chain-of-LoRA Agent for Long Video Reasoning</h3>
<div style="display: flex; justify-content: center; gap: 5px; margin-bottom: -0.7em !important;">
<a href="https://arxiv.org/abs/2503.13444" target="_blank"><img src="https://img.shields.io/badge/arXiv-2503.13444-red"></a>
<a href="https://videomind.github.io/" target="_blank"><img src="https://img.shields.io/badge/Project-Page-brightgreen"></a>
<a href="https://huggingface.co/collections/yeliudev/videomind-67dd41f42c57f0e7433afb36" target="_blank"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue"></a>
<a href="https://huggingface.co/datasets/yeliudev/VideoMind-Dataset" target="_blank"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Dataset-orange"></a>
<a href="https://github.com/yeliudev/VideoMind/blob/main/README.md" target="_blank"><img src="https://img.shields.io/badge/License-BSD--3--Clause-purple"></a>
<a href="https://github.com/yeliudev/VideoMind" target="_blank"><img src="https://img.shields.io/github/stars/yeliudev/VideoMind"></a>
</div>
"""

LOGO = '<p align="center"><img width="350" src="https://raw.githubusercontent.com/yeliudev/VideoMind/refs/heads/main/.github/logo.png"></p>'
DISC = 'VideoMind is a multi-modal agent framework that enhances video reasoning by emulating *human-like* processes, such as *breaking down tasks*, *localizing and verifying moments*, and *synthesizing answers*. This demo showcases how VideoMind-2B handles video-language tasks. Please open an <a href="https://github.com/yeliudev/VideoMind/issues/new" target="_blank">issue</a> if you meet any problems.'  # noqa
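# Each example below is an (input video path, text prompt, roles to activate) triple; the role
# codes map to 'pla' -> Planner, 'gnd' -> Grounder, 'ver' -> Verifier, 'ans' -> Answerer, and
# match the checkbox values wired into gr.Examples() in build_demo() further down.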
# yapf:disable
EXAMPLES = [
    [f'{PATH}/examples/4167294363.mp4', 'Why did the old man stand up?', ['pla', 'gnd', 'ver', 'ans']],
    [f'{PATH}/examples/5012237466.mp4', 'How does the child in stripes react about the fountain?', ['pla', 'gnd', 'ver', 'ans']],
    [f'{PATH}/examples/13887487955.mp4', 'What did the excavator do after it pushed the cement forward?', ['pla', 'gnd', 'ver', 'ans']],
    [f'{PATH}/examples/5188348585.mp4', 'What did the person do before pouring the liquor?', ['pla', 'gnd', 'ver', 'ans']],
    [f'{PATH}/examples/4766274786.mp4', 'What did the girl do after the baby lost the balloon?', ['pla', 'gnd', 'ver', 'ans']],
    [f'{PATH}/examples/4742652230.mp4', 'Why is the girl pushing the boy only around the toy but not to other places?', ['pla', 'gnd', 'ver', 'ans']],
    [f'{PATH}/examples/9383140374.mp4', 'How does the girl in pink control the movement of the claw?', ['pla', 'gnd', 'ver', 'ans']],
    [f'{PATH}/examples/10309844035.mp4', 'Why are they holding up the phones?', ['pla', 'gnd', 'ver', 'ans']],
    [f'{PATH}/examples/pA6Z-qYhSNg_60.0_210.0.mp4', 'Different types of meat products are being cut, shaped and prepared', ['gnd', 'ver']],
    [f'{PATH}/examples/UFWQKrcbhjI_360.0_510.0.mp4', 'A man talks to the camera whilst walking along a roadside in a rural area', ['gnd', 'ver']],
    [f'{PATH}/examples/RoripwjYFp8_210.0_360.0.mp4', 'A woman wearing glasses eating something at a street market', ['gnd', 'ver']],
    [f'{PATH}/examples/h6QKDqomIPk_210.0_360.0.mp4', 'A toddler sits in his car seat, holding his yellow tablet', ['gnd', 'ver']],
    [f'{PATH}/examples/Z3-IZ3HAmIA_60.0_210.0.mp4', 'A view from the window as the plane accelerates and takes off from the runway', ['gnd', 'ver']],
    [f'{PATH}/examples/yId2wIocTys_210.0_360.0.mp4', "Temporally locate the visual content mentioned in the text query 'kids exercise in front of parked cars' within the video.", ['pla', 'gnd', 'ver']],
    [f'{PATH}/examples/rrTIeJRVGjg_60.0_210.0.mp4', "Localize the moment that provides relevant context about 'man stands in front of a white building monologuing'.", ['pla', 'gnd', 'ver']],
    [f'{PATH}/examples/DTInxNfWXVc_210.0_360.0.mp4', "Find the video segment that corresponds to the given textual query 'man with headphones talking'.", ['pla', 'gnd', 'ver']],
]
# yapf:enable
# https://github.com/gradio-app/gradio/pull/10552
JS = """
function init() {
    if (window.innerWidth >= 1536) {
        document.querySelector('main').style.maxWidth = '1536px'
    }
}
"""
if not nncore.is_dir(BASE_MODEL):
    snapshot_download(BASE_MODEL_REPO, local_dir=BASE_MODEL)

if not nncore.is_dir(MODEL):
    snapshot_download(MODEL_REPO, local_dir=MODEL)
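# Chain-of-LoRA setup: a single Qwen2-VL-2B base is loaded once and build_model(MODEL) appears
# to register the grounder as its default adapter (it is only selected, never load_adapter-ed,
# below), while the planner and verifier are attached as additional LoRA adapters. At inference
# time main() switches roles via model.set_adapter(...), and the answerer runs with all adapters
# disabled (see the `with model.disable_adapter()` block further down).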
print('Initializing role *grounder*')
model, processor = build_model(MODEL)

print('Initializing role *planner*')
model.load_adapter(nncore.join(MODEL, 'planner'), adapter_name='planner')

print('Initializing role *verifier*')
model.load_adapter(nncore.join(MODEL, 'verifier'), adapter_name='verifier')

device = torch.device('cuda')
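# The helper below converts seconds to HH:MM:SS, rounding to the nearest second,
# e.g. seconds_to_hms(3725.4) -> '01:02:05'.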
def seconds_to_hms(seconds):
    hours, remainder = divmod(round(seconds), 3600)
    minutes, seconds = divmod(remainder, 60)
    return f'{hours:02}:{minutes:02}:{seconds:02}'
def random_sample():
    return random.choice(EXAMPLES)


def enable_btns():
    return (gr.Button(interactive=True), ) * 3


def disable_btns():
    return (gr.Button(interactive=False), ) * 3


def update_placeholder(role):
    placeholder = 'Ask a question about the video...' if 'ans' in role else 'Write a query to search for a moment...'
    return gr.Textbox(placeholder=placeholder)


def reset_components():
    return ['pla', 'gnd', 'ver', 'ans'], 5, 0, 256
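# Main agent loop: streams a chat history through up to four roles in order
# (Planner -> Grounder -> Verifier -> Answerer), yielding partial histories so the Gradio
# Chatbot updates word by word. The model is moved to CUDA lazily inside this function; the
# `spaces` import suggests this Space is deployed on ZeroGPU hardware.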
def main(video, prompt, role, max_candidates, temperature, max_new_tokens):
    global model, processor, device

    history = []

    if not video:
        gr.Warning('Please upload a video or click [Random] to sample one.')
        return history

    if not prompt:
        gr.Warning('Please provide a prompt or click [Random] to sample one.')
        return history

    if 'gnd' not in role and 'ans' not in role:
        gr.Warning('Please select at least Grounder or Answerer.')
        return history

    if 'ver' in role and 'gnd' not in role:
        gr.Warning('Verifier cannot be used without Grounder.')
        return history

    if 'pla' in role and 'gnd' not in role and 'ver' not in role:
        gr.Warning('Planner can only be used with Grounder and Verifier.')
        return history

    history.append({'role': 'user', 'content': prompt})
    yield history

    model = model.to(device)

    duration = get_duration(video)

    # do grounding and answering by default
    do_grounding = True
    do_answering = True

    # initialize grounding query as prompt
    query = prompt
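    # The planner is expected to reply with a JSON plan (illustrative format inferred from the
    # parsing below), e.g. [{"type": "grounder", "value": "<moment query>"}] to trigger grounding
    # with a rewritten query, or [{"type": "answerer"}] to answer over the whole video directly.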
    if 'pla' in role:
        text = PLANNER_PROMPT.format(prompt)

        history.append({
            'metadata': {
                'title': '🗺️ Working as Planner...'
            },
            'role': 'assistant',
            'content': f'##### Planner Prompt:\n\n{html.escape(text)}\n\n##### Planner Response:\n\n...'
        })
        yield history

        start_time = time.perf_counter()

        messages = [{
            'role':
            'user',
            'content': [{
                'type': 'video',
                'video': video,
                'num_threads': 1,
                'min_pixels': 36 * 28 * 28,
                'max_pixels': 64 * 28 * 28,
                'max_frames': 100,
                'fps': 1.0
            }, {
                'type': 'text',
                'text': text
            }]
        }]

        text = processor.apply_chat_template(messages, add_generation_prompt=True)
        images, videos = process_vision_info(messages)
        data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
        data = data.to(device)

        model.base_model.disable_adapter_layers()
        model.base_model.enable_adapter_layers()
        model.set_adapter('planner')

        output_ids = model.generate(
            **data,
            do_sample=temperature > 0,
            temperature=temperature if temperature > 0 else None,
            top_p=None,
            top_k=None,
            repetition_penalty=None,
            max_new_tokens=max_new_tokens)

        assert data.input_ids.size(0) == output_ids.size(0) == 1
        output_ids = output_ids[0, data.input_ids.size(1):]

        if output_ids[-1] == processor.tokenizer.eos_token_id:
            output_ids = output_ids[:-1]

        response = processor.decode(output_ids, clean_up_tokenization_spaces=False)

        for i, text in enumerate(response.split(' ')):
            if i == 0:
                history[-1]['content'] = history[-1]['content'].rstrip('.')
                history[-1]['content'] += text
            else:
                history[-1]['content'] += ' ' + text
            yield history

        elapsed_time = round(time.perf_counter() - start_time, 1)
        history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
        yield history

        try:
            parsed = json.loads(response)
            action = parsed[0] if isinstance(parsed, list) else parsed
            if action['type'].lower() == 'grounder' and action['value']:
                query = action['value']
            elif action['type'].lower() == 'answerer':
                do_grounding = False
                do_answering = True
        except Exception:
            pass
        response = 'After browsing the video and the question, my plan to figure out the answer is as follows:\n'

        step_idx = 1

        if 'gnd' in role and do_grounding:
            response += f'\n{step_idx}. Localize the relevant moment in this video using the query "<span style="color:red">{query}</span>".'
            step_idx += 1

        if 'ver' in role and do_grounding:
            response += f'\n{step_idx}. Verify the grounded moments one-by-one and select the best candidate.'
            step_idx += 1

        if 'ans' in role and do_answering:
            if step_idx > 1:
                response += f'\n{step_idx}. Crop the video segment and zoom in to a higher resolution.'
            else:
                response += f'\n{step_idx}. Analyze the whole video directly without cropping.'

        history.append({'role': 'assistant', 'content': ''})
        for i, text in enumerate(response.split(' ')):
            history[-1]['content'] += ' ' + text if i > 0 else text
            yield history
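    # Grounder: the LoRA-adapted model localizes the query; its regression output is exposed via
    # model.reg as rows of (normalized start, normalized end, confidence), which are scaled by the
    # video duration below. If no span is produced, evenly spaced fallback windows (or the full
    # video) are used instead.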
    if 'gnd' in role and do_grounding:
        query = parse_query(query)

        text = GROUNDER_PROMPT.format(query)

        history.append({
            'metadata': {
                'title': '🔍 Working as Grounder...'
            },
            'role': 'assistant',
            'content': f'##### Grounder Prompt:\n\n{html.escape(text)}\n\n##### Grounder Response:\n\n...'
        })
        yield history

        start_time = time.perf_counter()

        messages = [{
            'role':
            'user',
            'content': [{
                'type': 'video',
                'video': video,
                'num_threads': 1,
                'min_pixels': 36 * 28 * 28,
                'max_pixels': 64 * 28 * 28,
                'max_frames': 150,
                'fps': 1.0
            }, {
                'type': 'text',
                'text': text
            }]
        }]

        text = processor.apply_chat_template(messages, add_generation_prompt=True)
        images, videos = process_vision_info(messages)
        data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
        data = data.to(device)

        model.base_model.disable_adapter_layers()
        model.base_model.enable_adapter_layers()
        model.set_adapter('grounder')

        output_ids = model.generate(
            **data,
            do_sample=temperature > 0,
            temperature=temperature if temperature > 0 else None,
            top_p=None,
            top_k=None,
            repetition_penalty=None,
            max_new_tokens=max_new_tokens)

        assert data.input_ids.size(0) == output_ids.size(0) == 1
        output_ids = output_ids[0, data.input_ids.size(1):]

        if output_ids[-1] == processor.tokenizer.eos_token_id:
            output_ids = output_ids[:-1]

        response = processor.decode(output_ids, clean_up_tokenization_spaces=False)

        for i, text in enumerate(response.split(' ')):
            if i == 0:
                history[-1]['content'] = history[-1]['content'].rstrip('.')
                history[-1]['content'] += text
            else:
                history[-1]['content'] += ' ' + text
            yield history

        elapsed_time = round(time.perf_counter() - start_time, 1)
        history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
        yield history

        if len(model.reg) > 0:
            # 1. extract timestamps and confidences
            blob = model.reg[0].cpu().float()
            pred, conf = blob[:, :2] * duration, blob[:, -1].tolist()

            # 2. clamp timestamps
            pred = pred.clamp(min=0, max=duration)

            # 3. sort timestamps
            inds = (pred[:, 1] - pred[:, 0] < 0).nonzero()[:, 0]
            pred[inds] = pred[inds].roll(1)

            # 4. convert timestamps to list
            pred = pred.tolist()
        else:
            if 'ver' in role:
                pred = [[i * duration / 6, (i + 2) * duration / 6] for i in range(5)]
                conf = [0] * 5
            else:
                pred = [[0, duration]]
                conf = [0]

        response = 'The candidate moments and confidence scores are as follows:\n'
        response += '\n| ID | Start Time | End Time | Confidence |'
        response += '\n| :-: | :-: | :-: | :-: |'

        for i, (p, c) in enumerate(zip(pred[:max_candidates], conf[:max_candidates])):
            response += f'\n| {i} | {seconds_to_hms(p[0])} | {seconds_to_hms(p[1])} | {c:.2f} |'

        response += f'\n\nTherefore, the target moment might happen from <span style="color:red">{seconds_to_hms(pred[0][0])}</span> to <span style="color:red">{seconds_to_hms(pred[0][1])}</span>.'

        history.append({'role': 'assistant', 'content': ''})
        for i, text in enumerate(response.split(' ')):
            history[-1]['content'] += ' ' + text if i > 0 else text
            yield history
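    # Verifier: each candidate moment is re-watched within a wider context window, special
    # segment start/end tokens are spliced into the token sequence around the candidate, and the
    # moment is scored by comparing the logits of 'Yes' vs 'No' at the next-token position.
    # Candidates are then re-ranked by this score.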
    if 'ver' in role and do_grounding:
        text = VERIFIER_PROMPT.format(query)

        history.append({
            'metadata': {
                'title': '📊 Working as Verifier...'
            },
            'role': 'assistant',
            'content': f'##### Verifier Prompt:\n\n{html.escape(text)}\n\n##### Verifier Response:\n\n...'
        })
        yield history

        start_time = time.perf_counter()

        prob = []
        for i, cand in enumerate(pred[:max_candidates]):
            s0, e0 = parse_span(cand, duration, 2)
            offset = (e0 - s0) / 2
            s1, e1 = parse_span([s0 - offset, e0 + offset], duration)

            # percentage of s0, e0 within s1, e1
            s = (s0 - s1) / (e1 - s1)
            e = (e0 - s1) / (e1 - s1)

            # NOTE: `text` is clobbered below (by the chat template and the word-by-word
            # streaming loop), so rebuild the verifier prompt for every candidate
            text = VERIFIER_PROMPT.format(query)

            messages = [{
                'role':
                'user',
                'content': [{
                    'type': 'video',
                    'video': video,
                    'num_threads': 1,
                    'video_start': s1,
                    'video_end': e1,
                    'min_pixels': 36 * 28 * 28,
                    'max_pixels': 64 * 28 * 28,
                    'max_frames': 64,
                    'fps': 2.0
                }, {
                    'type': 'text',
                    'text': text
                }]
            }]

            text = processor.apply_chat_template(messages, add_generation_prompt=True)
            images, videos = process_vision_info(messages)
            data = processor(text=[text], images=images, videos=videos, return_tensors='pt')

            # ===== insert segment start/end tokens =====
            video_grid_thw = data['video_grid_thw'][0]
            num_frames, window = int(video_grid_thw[0]), int(video_grid_thw[1] * video_grid_thw[2] / 4)
            assert num_frames * window * 4 == data['pixel_values_videos'].size(0)

            pos_s, pos_e = round(s * num_frames), round(e * num_frames)
            pos_s, pos_e = min(max(0, pos_s), num_frames), min(max(0, pos_e), num_frames)
            assert pos_s <= pos_e, (num_frames, s, e)

            base_idx = torch.nonzero(data['input_ids'][0] == model.config.vision_start_token_id).item()
            pos_s, pos_e = pos_s * window + base_idx + 1, pos_e * window + base_idx + 2

            input_ids = data['input_ids'][0].tolist()
            input_ids.insert(pos_s, model.config.seg_s_token_id)
            input_ids.insert(pos_e, model.config.seg_e_token_id)
            data['input_ids'] = torch.LongTensor([input_ids])
            data['attention_mask'] = torch.ones_like(data['input_ids'])
            # ===========================================

            data = data.to(device)

            model.base_model.disable_adapter_layers()
            model.base_model.enable_adapter_layers()
            model.set_adapter('verifier')

            with torch.inference_mode():
                logits = model(**data).logits[0, -1].softmax(dim=-1)

            # NOTE: magic numbers here
            # In Qwen2-VL vocab: 9454 -> Yes, 2753 -> No
            score = (logits[9454] - logits[2753]).sigmoid().item()
            prob.append(score)

            if i == 0:
                history[-1]['content'] = history[-1]['content'].rstrip('.')[:-1]

            response = f'\nCandidate ID {i}: P(Yes) = {score:.2f}'

            for j, text in enumerate(response.split(' ')):
                history[-1]['content'] += ' ' + text if j > 0 else text
                yield history

        elapsed_time = round(time.perf_counter() - start_time, 1)
        history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
        yield history

        ranks = torch.Tensor(prob).argsort(descending=True).tolist()

        prob = [prob[idx] for idx in ranks]
        pred = [pred[idx] for idx in ranks]
        conf = [conf[idx] for idx in ranks]

        response = 'After verification, the candidate moments are re-ranked as follows:\n'
        response += '\n| ID | Start Time | End Time | Score |'
        response += '\n| :-: | :-: | :-: | :-: |'

        ids = list(range(len(ranks)))
        for r, p, c in zip(ranks, pred, prob):
            response += f'\n| {ids[r]} | {seconds_to_hms(p[0])} | {seconds_to_hms(p[1])} | {c:.2f} |'

        response += f'\n\nTherefore, the target moment should be from <span style="color:red">{seconds_to_hms(pred[0][0])}</span> to <span style="color:red">{seconds_to_hms(pred[0][1])}</span>.'

        history.append({'role': 'assistant', 'content': ''})
        for i, text in enumerate(response.split(' ')):
            history[-1]['content'] += ' ' + text if i > 0 else text
            yield history
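    # Answerer: the base model (all LoRA adapters disabled) answers over the selected moment
    # only, cropped via video_start/video_end and sampled at a higher resolution than the
    # grounding passes; without a grounded moment it falls back to the whole video.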
    if 'ans' in role and do_answering:
        text = f'{prompt} Please think step by step and provide your response.'

        history.append({
            'metadata': {
                'title': '📝 Working as Answerer...'
            },
            'role': 'assistant',
            'content': f'##### Answerer Prompt:\n\n{html.escape(text)}\n\n##### Answerer Response:\n\n...'
        })
        yield history

        start_time = time.perf_counter()

        # choose the potentially best moment
        selected = pred[0] if 'gnd' in role and do_grounding else [0, duration]
        s, e = parse_span(selected, duration, 32)

        messages = [{
            'role':
            'user',
            'content': [{
                'type': 'video',
                'video': video,
                'num_threads': 1,
                'video_start': s,
                'video_end': e,
                'min_pixels': 128 * 28 * 28,
                'max_pixels': 256 * 28 * 28,
                'max_frames': 32,
                'fps': 2.0
            }, {
                'type': 'text',
                'text': text
            }]
        }]

        text = processor.apply_chat_template(messages, add_generation_prompt=True)
        images, videos = process_vision_info(messages)
        data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
        data = data.to(device)

        with model.disable_adapter():
            output_ids = model.generate(
                **data,
                do_sample=temperature > 0,
                temperature=temperature if temperature > 0 else None,
                top_p=None,
                top_k=None,
                repetition_penalty=None,
                max_new_tokens=max_new_tokens)

        assert data.input_ids.size(0) == output_ids.size(0) == 1
        output_ids = output_ids[0, data.input_ids.size(1):]

        if output_ids[-1] == processor.tokenizer.eos_token_id:
            output_ids = output_ids[:-1]

        response = processor.decode(output_ids, clean_up_tokenization_spaces=False)

        for i, text in enumerate(response.split(' ')):
            if i == 0:
                history[-1]['content'] = history[-1]['content'].rstrip('.')
                history[-1]['content'] += text
            else:
                history[-1]['content'] += ' ' + text
            yield history

        elapsed_time = round(time.perf_counter() - start_time, 1)
        history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
        yield history

        if 'gnd' in role and do_grounding:
            response = f'After zooming in and analyzing the target moment, I finalize my answer: <span style="color:green">{response}</span>'
        else:
            response = f'After watching the whole video, my answer is: <span style="color:green">{response}</span>'

        history.append({'role': 'assistant', 'content': ''})
        for i, text in enumerate(response.split(' ')):
            history[-1]['content'] += ' ' + text if i > 0 else text
            yield history
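# Gradio UI: Submit first disables the buttons, then streams main() into the Chatbot, and
# re-enables the buttons when generation finishes; Random fills the inputs from EXAMPLES and
# Reset clears them back to the defaults returned by reset_components().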
def build_demo():
    chat = gr.Chatbot(
        type='messages',
        height='70em',
        resizable=True,
        avatar_images=[f'{PATH}/assets/user.png', f'{PATH}/assets/bot.png'],
        placeholder='A conversation with VideoMind',
        label='VideoMind')

    prompt = gr.Textbox(label='Text Prompt', placeholder='Ask a question about the video...')

    with gr.Blocks(title=TITLE, js=JS) as demo:
        gr.HTML(LOGO)
        gr.HTML(BADGE)
        gr.Markdown(DISC)

        with gr.Row():
            with gr.Column(scale=3):
                video = gr.Video()

                with gr.Group():
                    role = gr.CheckboxGroup(
                        choices=[('🗺️ Planner', 'pla'), ('🔍 Grounder', 'gnd'), ('📊 Verifier', 'ver'),
                                 ('📝 Answerer', 'ans')],
                        value=['pla', 'gnd', 'ver', 'ans'],
                        interactive=True,
                        label='Roles',
                        info='Select the role(s) you would like to activate.')
                    role.change(update_placeholder, role, prompt)

                    with gr.Accordion(label='Hyperparameters', open=False):
                        max_candidates = gr.Slider(
                            1,
                            100,
                            value=5,
                            step=1,
                            interactive=True,
                            label='Max Candidate Moments',
                            info='The maximum number of candidate moments in Grounder (Default: 5)')
                        temperature = gr.Slider(
                            0,
                            1,
                            value=0,
                            step=0.1,
                            interactive=True,
                            label='Temperature',
                            info='Higher values lead to more creativity and randomness (Default: 0)')
                        max_new_tokens = gr.Slider(
                            1,
                            1024,
                            value=256,
                            step=1,
                            interactive=True,
                            label='Max Output Tokens',
                            info='The maximum number of output tokens for each role (Default: 256)')

                    prompt.render()

                with gr.Row():
                    random_btn = gr.Button(value='🔮 Random')
                    random_btn.click(random_sample, None, [video, prompt, role])

                    reset_btn = gr.ClearButton([video, prompt, chat], value='🗑️ Reset')
                    reset_btn.click(reset_components, None, [role, max_candidates, temperature, max_new_tokens])

                    submit_btn = gr.Button(value='🚀 Submit', variant='primary')
                    ctx = submit_btn.click(disable_btns, None, [random_btn, reset_btn, submit_btn])
                    ctx = ctx.then(main, [video, prompt, role, max_candidates, temperature, max_new_tokens], chat)
                    ctx.then(enable_btns, None, [random_btn, reset_btn, submit_btn])

                gr.Examples(examples=EXAMPLES, inputs=[video, prompt, role], examples_per_page=3)

            with gr.Column(scale=5):
                chat.render()

    return demo
if __name__ == '__main__':
    demo = build_demo()
    demo.queue()
    demo.launch(server_name='0.0.0.0', allowed_paths=[f'{PATH}/assets', f'{PATH}/examples'])