# Copyright (c) 2025 Ye Liu. Licensed under the BSD-3-Clause license.

import html
import json
import os
import random
import time

import gradio as gr
import nncore
import spaces
import torch
from huggingface_hub import snapshot_download

from videomind.constants import GROUNDER_PROMPT, PLANNER_PROMPT, VERIFIER_PROMPT
from videomind.dataset.utils import process_vision_info
from videomind.model.builder import build_model
from videomind.utils.io import get_duration
from videomind.utils.parser import parse_query, parse_span

os.environ['TOKENIZERS_PARALLELISM'] = 'false'

PATH = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
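
# Local checkpoint directories and the Hugging Face repos they are downloaded from on first launch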
BASE_MODEL = 'model_zoo/Qwen2-VL-2B-Instruct'
BASE_MODEL_REPO = 'Qwen/Qwen2-VL-2B-Instruct'

MODEL = 'model_zoo/VideoMind-2B'
MODEL_REPO = 'yeliudev/VideoMind-2B'

TITLE = 'VideoMind: A Chain-of-LoRA Agent for Long Video Reasoning'

BADGE = """
<h3 align="center" style="margin-top: -0.5em;">A Chain-of-LoRA Agent for Long Video Reasoning</h3>
<div style="display: flex; justify-content: center; gap: 5px; margin-bottom: -0.7em !important;">
    <a href="https://arxiv.org/abs/2503.13444" target="_blank"><img src="https://img.shields.io/badge/arXiv-2503.13444-red"></a>
    <a href="https://videomind.github.io/" target="_blank"><img src="https://img.shields.io/badge/Project-Page-brightgreen"></a>
    <a href="https://huggingface.co/collections/yeliudev/videomind-67dd41f42c57f0e7433afb36" target="_blank"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue"></a>
    <a href="https://huggingface.co/datasets/yeliudev/VideoMind-Dataset" target="_blank"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Dataset-orange"></a>
    <a href="https://github.com/yeliudev/VideoMind/blob/main/README.md" target="_blank"><img src="https://img.shields.io/badge/License-BSD--3--Clause-purple"></a>
    <a href="https://github.com/yeliudev/VideoMind" target="_blank"><img src="https://img.shields.io/github/stars/yeliudev/VideoMind"></a>
</div>
"""

LOGO = '<p align="center"><img width="350" src="https://raw.githubusercontent.com/yeliudev/VideoMind/refs/heads/main/.github/logo.png"></p>'
DISC = 'VideoMind is a multi-modal agent framework that enhances video reasoning by emulating *human-like* processes, such as *breaking down tasks*, *localizing and verifying moments*, and *synthesizing answers*. This demo showcases how VideoMind-2B handles video-language tasks. Please open an <a href="https://github.com/yeliudev/VideoMind/issues/new" target="_blank">issue</a> if you run into any problems.'  # noqa
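
# Each example is a (video path, text prompt, default roles) triplet shown in the Examples panel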
# yapf:disable
EXAMPLES = [
    [f'{PATH}/examples/4167294363.mp4', 'Why did the old man stand up?', ['pla', 'gnd', 'ver', 'ans']],
    [f'{PATH}/examples/5012237466.mp4', 'How does the child in stripes react about the fountain?', ['pla', 'gnd', 'ver', 'ans']],
    [f'{PATH}/examples/13887487955.mp4', 'What did the excavator do after it pushed the cement forward?', ['pla', 'gnd', 'ver', 'ans']],
    [f'{PATH}/examples/5188348585.mp4', 'What did the person do before pouring the liquor?', ['pla', 'gnd', 'ver', 'ans']],
    [f'{PATH}/examples/4766274786.mp4', 'What did the girl do after the baby lost the balloon?', ['pla', 'gnd', 'ver', 'ans']],
    [f'{PATH}/examples/4742652230.mp4', 'Why is the girl pushing the boy only around the toy but not to other places?', ['pla', 'gnd', 'ver', 'ans']],
    [f'{PATH}/examples/9383140374.mp4', 'How does the girl in pink control the movement of the claw?', ['pla', 'gnd', 'ver', 'ans']],
    [f'{PATH}/examples/10309844035.mp4', 'Why are they holding up the phones?', ['pla', 'gnd', 'ver', 'ans']],
    [f'{PATH}/examples/pA6Z-qYhSNg_60.0_210.0.mp4', 'Different types of meat products are being cut, shaped and prepared', ['gnd', 'ver']],
    [f'{PATH}/examples/UFWQKrcbhjI_360.0_510.0.mp4', 'A man talks to the camera whilst walking along a roadside in a rural area', ['gnd', 'ver']],
    [f'{PATH}/examples/RoripwjYFp8_210.0_360.0.mp4', 'A woman wearing glasses eating something at a street market', ['gnd', 'ver']],
    [f'{PATH}/examples/h6QKDqomIPk_210.0_360.0.mp4', 'A toddler sits in his car seat, holding his yellow tablet', ['gnd', 'ver']],
    [f'{PATH}/examples/Z3-IZ3HAmIA_60.0_210.0.mp4', 'A view from the window as the plane accelerates and takes off from the runway', ['gnd', 'ver']],
    [f'{PATH}/examples/yId2wIocTys_210.0_360.0.mp4', "Temporally locate the visual content mentioned in the text query 'kids exercise in front of parked cars' within the video.", ['pla', 'gnd', 'ver']],
    [f'{PATH}/examples/rrTIeJRVGjg_60.0_210.0.mp4', "Localize the moment that provides relevant context about 'man stands in front of a white building monologuing'.", ['pla', 'gnd', 'ver']],
    [f'{PATH}/examples/DTInxNfWXVc_210.0_360.0.mp4', "Find the video segment that corresponds to the given textual query 'man with headphones talking'.", ['pla', 'gnd', 'ver']],
]
# yapf:enable
# https://github.com/gradio-app/gradio/pull/10552
JS = """
function init() {
    if (window.innerWidth >= 1536) {
        document.querySelector('main').style.maxWidth = '1536px'
    }
}
"""
if not nncore.is_dir(BASE_MODEL):
    snapshot_download(BASE_MODEL_REPO, local_dir=BASE_MODEL)

if not nncore.is_dir(MODEL):
    snapshot_download(MODEL_REPO, local_dir=MODEL)

print('Initializing role *grounder*')
model, processor = build_model(MODEL)

print('Initializing role *planner*')
model.load_adapter(nncore.join(MODEL, 'planner'), adapter_name='planner')

print('Initializing role *verifier*')
model.load_adapter(nncore.join(MODEL, 'verifier'), adapter_name='verifier')

device = torch.device('cuda')
def seconds_to_hms(seconds):
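    """Convert a duration in seconds to an HH:MM:SS string."""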
    hours, remainder = divmod(round(seconds), 3600)
    minutes, seconds = divmod(remainder, 60)
    return f'{hours:02}:{minutes:02}:{seconds:02}'


def random_sample():
    return random.choice(EXAMPLES)


def enable_btns():
    return (gr.Button(interactive=True), ) * 3


def disable_btns():
    return (gr.Button(interactive=False), ) * 3


def update_placeholder(role):
    placeholder = 'Ask a question about the video...' if 'ans' in role else 'Write a query to search for a moment...'
    return gr.Textbox(placeholder=placeholder)


def reset_components():
    return ['pla', 'gnd', 'ver', 'ans'], 5, 0, 256
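

# NOTE (assumption): this Space runs on ZeroGPU, so `main` likely needs the `spaces.GPU`
# decorator to request a GPU allocation per call; drop or tune it (e.g. with a `duration`
# argument) to match the deployed configuration.
@spaces.GPU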
def main(video, prompt, role, max_candidates, temperature, max_new_tokens):
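    """Run the VideoMind pipeline (Planner -> Grounder -> Verifier -> Answerer) on one
    video/prompt pair, streaming intermediate messages to the Gradio Chatbot."""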
    global model, processor, device

    history = []

    if not video:
        gr.Warning('Please upload a video or click [Random] to sample one.')
        return history

    if not prompt:
        gr.Warning('Please provide a prompt or click [Random] to sample one.')
        return history

    if 'gnd' not in role and 'ans' not in role:
        gr.Warning('Please select at least Grounder or Answerer.')
        return history

    if 'ver' in role and 'gnd' not in role:
        gr.Warning('Verifier cannot be used without Grounder.')
        return history

    if 'pla' in role and 'gnd' not in role and 'ver' not in role:
        gr.Warning('Planner can only be used together with Grounder and Verifier.')
        return history

    history.append({'role': 'user', 'content': prompt})
    yield history

    model = model.to(device)

    duration = get_duration(video)

    # do grounding and answering by default
    do_grounding = True
    do_answering = True

    # initialize the grounding query as the raw prompt
    query = prompt

    if 'pla' in role:
        text = PLANNER_PROMPT.format(prompt)

        history.append({
            'metadata': {
                'title': '🗺️ Working as Planner...'
            },
            'role': 'assistant',
            'content': f'##### Planner Prompt:\n\n{html.escape(text)}\n\n##### Planner Response:\n\n...'
        })
        yield history

        start_time = time.perf_counter()

        messages = [{
            'role':
            'user',
            'content': [{
                'type': 'video',
                'video': video,
                'num_threads': 1,
                'min_pixels': 36 * 28 * 28,
                'max_pixels': 64 * 28 * 28,
                'max_frames': 100,
                'fps': 1.0
            }, {
                'type': 'text',
                'text': text
            }]
        }]

        text = processor.apply_chat_template(messages, add_generation_prompt=True)
        images, videos = process_vision_info(messages)
        data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
        data = data.to(device)
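
        # reset the LoRA adapter state and activate the planner adapter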
        model.base_model.disable_adapter_layers()
        model.base_model.enable_adapter_layers()
        model.set_adapter('planner')

        output_ids = model.generate(
            **data,
            do_sample=temperature > 0,
            temperature=temperature if temperature > 0 else None,
            top_p=None,
            top_k=None,
            repetition_penalty=None,
            max_new_tokens=max_new_tokens)

        assert data.input_ids.size(0) == output_ids.size(0) == 1
        output_ids = output_ids[0, data.input_ids.size(1):]
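        # drop the trailing EOS token (if any) before decoding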
        if output_ids[-1] == processor.tokenizer.eos_token_id:
            output_ids = output_ids[:-1]
        response = processor.decode(output_ids, clean_up_tokenization_spaces=False)

        for i, text in enumerate(response.split(' ')):
            if i == 0:
                history[-1]['content'] = history[-1]['content'].rstrip('.')
                history[-1]['content'] += text
            else:
                history[-1]['content'] += ' ' + text
            yield history

        elapsed_time = round(time.perf_counter() - start_time, 1)
        history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
        yield history

        try:
            parsed = json.loads(response)
            action = parsed[0] if isinstance(parsed, list) else parsed
            if action['type'].lower() == 'grounder' and action['value']:
                query = action['value']
            elif action['type'].lower() == 'answerer':
                do_grounding = False
                do_answering = True
        except Exception:
            pass

        response = 'After browsing the video and the question, my plan to figure out the answer is as follows:\n'

        step_idx = 1

        if 'gnd' in role and do_grounding:
            response += f'\n{step_idx}. Localize the relevant moment in this video using the query "<span style="color:red">{query}</span>".'
            step_idx += 1

        if 'ver' in role and do_grounding:
            response += f'\n{step_idx}. Verify the grounded moments one-by-one and select the best candidate.'
            step_idx += 1

        if 'ans' in role and do_answering:
            if step_idx > 1:
                response += f'\n{step_idx}. Crop the video segment and zoom in to a higher resolution.'
            else:
                response += f'\n{step_idx}. Analyze the whole video directly without cropping.'

        history.append({'role': 'assistant', 'content': ''})
        for i, text in enumerate(response.split(' ')):
            history[-1]['content'] += ' ' + text if i > 0 else text
            yield history

    if 'gnd' in role and do_grounding:
        query = parse_query(query)

        text = GROUNDER_PROMPT.format(query)

        history.append({
            'metadata': {
                'title': '🔍 Working as Grounder...'
            },
            'role': 'assistant',
            'content': f'##### Grounder Prompt:\n\n{html.escape(text)}\n\n##### Grounder Response:\n\n...'
        })
        yield history

        start_time = time.perf_counter()

        messages = [{
            'role':
            'user',
            'content': [{
                'type': 'video',
                'video': video,
                'num_threads': 1,
                'min_pixels': 36 * 28 * 28,
                'max_pixels': 64 * 28 * 28,
                'max_frames': 150,
                'fps': 1.0
            }, {
                'type': 'text',
                'text': text
            }]
        }]

        text = processor.apply_chat_template(messages, add_generation_prompt=True)
        images, videos = process_vision_info(messages)
        data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
        data = data.to(device)
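
        # reset the LoRA adapter state and activate the grounder adapter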
        model.base_model.disable_adapter_layers()
        model.base_model.enable_adapter_layers()
        model.set_adapter('grounder')

        output_ids = model.generate(
            **data,
            do_sample=temperature > 0,
            temperature=temperature if temperature > 0 else None,
            top_p=None,
            top_k=None,
            repetition_penalty=None,
            max_new_tokens=max_new_tokens)

        assert data.input_ids.size(0) == output_ids.size(0) == 1
        output_ids = output_ids[0, data.input_ids.size(1):]
        if output_ids[-1] == processor.tokenizer.eos_token_id:
            output_ids = output_ids[:-1]
        response = processor.decode(output_ids, clean_up_tokenization_spaces=False)

        for i, text in enumerate(response.split(' ')):
            if i == 0:
                history[-1]['content'] = history[-1]['content'].rstrip('.')
                history[-1]['content'] += text
            else:
                history[-1]['content'] += ' ' + text
            yield history

        elapsed_time = round(time.perf_counter() - start_time, 1)
        history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
        yield history

        if len(model.reg) > 0:
            # 1. extract timestamps and confidences
            blob = model.reg[0].cpu().float()
            pred, conf = blob[:, :2] * duration, blob[:, -1].tolist()

            # 2. clamp timestamps
            pred = pred.clamp(min=0, max=duration)

            # 3. swap timestamps where the predicted end is earlier than the start
            inds = (pred[:, 1] - pred[:, 0] < 0).nonzero()[:, 0]
            pred[inds] = pred[inds].roll(1, dims=-1)

            # 4. convert timestamps to list
            pred = pred.tolist()
        else:
            # no predictions: fall back to uniform windows (with verifier) or the whole video
            if 'ver' in role:
                pred = [[i * duration / 6, (i + 2) * duration / 6] for i in range(5)]
                conf = [0] * 5
            else:
                pred = [[0, duration]]
                conf = [0]

        response = 'The candidate moments and confidence scores are as follows:\n'
        response += '\n| ID | Start Time | End Time | Confidence |'
        response += '\n| :-: | :-: | :-: | :-: |'

        for i, (p, c) in enumerate(zip(pred[:max_candidates], conf[:max_candidates])):
            response += f'\n| {i} | {seconds_to_hms(p[0])} | {seconds_to_hms(p[1])} | {c:.2f} |'

        response += f'\n\nTherefore, the target moment might happen from <span style="color:red">{seconds_to_hms(pred[0][0])}</span> to <span style="color:red">{seconds_to_hms(pred[0][1])}</span>.'

        history.append({'role': 'assistant', 'content': ''})
        for i, text in enumerate(response.split(' ')):
            history[-1]['content'] += ' ' + text if i > 0 else text
            yield history
    if 'ver' in role and do_grounding:
        text = VERIFIER_PROMPT.format(query)

        history.append({
            'metadata': {
                'title': '📏 Working as Verifier...'
            },
            'role': 'assistant',
            'content': f'##### Verifier Prompt:\n\n{html.escape(text)}\n\n##### Verifier Response:\n\n...'
        })
        yield history

        start_time = time.perf_counter()

        prob = []
        for i, cand in enumerate(pred[:max_candidates]):
            s0, e0 = parse_span(cand, duration, 2)
            offset = (e0 - s0) / 2
            s1, e1 = parse_span([s0 - offset, e0 + offset], duration)

            # percentage of s0, e0 within s1, e1
            s = (s0 - s1) / (e1 - s1)
            e = (e0 - s1) / (e1 - s1)

            messages = [{
                'role':
                'user',
                'content': [{
                    'type': 'video',
                    'video': video,
                    'num_threads': 1,
                    'video_start': s1,
                    'video_end': e1,
                    'min_pixels': 36 * 28 * 28,
                    'max_pixels': 64 * 28 * 28,
                    'max_frames': 64,
                    'fps': 2.0
                }, {
                    'type': 'text',
                    'text': text
                }]
            }]

            # NOTE: use a separate variable so the verifier prompt in `text` is not
            # overwritten by the templated conversation on later iterations
            chat_text = processor.apply_chat_template(messages, add_generation_prompt=True)
            images, videos = process_vision_info(messages)
            data = processor(text=[chat_text], images=images, videos=videos, return_tensors='pt')

            # ===== insert segment start/end tokens =====
            video_grid_thw = data['video_grid_thw'][0]
            num_frames, window = int(video_grid_thw[0]), int(video_grid_thw[1] * video_grid_thw[2] / 4)
            assert num_frames * window * 4 == data['pixel_values_videos'].size(0)

            pos_s, pos_e = round(s * num_frames), round(e * num_frames)
            pos_s, pos_e = min(max(0, pos_s), num_frames), min(max(0, pos_e), num_frames)
            assert pos_s <= pos_e, (num_frames, s, e)

            base_idx = torch.nonzero(data['input_ids'][0] == model.config.vision_start_token_id).item()
            pos_s, pos_e = pos_s * window + base_idx + 1, pos_e * window + base_idx + 2

            input_ids = data['input_ids'][0].tolist()
            input_ids.insert(pos_s, model.config.seg_s_token_id)
            input_ids.insert(pos_e, model.config.seg_e_token_id)
            data['input_ids'] = torch.LongTensor([input_ids])
            data['attention_mask'] = torch.ones_like(data['input_ids'])
            # ===========================================

            data = data.to(device)
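
            # reset the LoRA adapter state and activate the verifier adapter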
            model.base_model.disable_adapter_layers()
            model.base_model.enable_adapter_layers()
            model.set_adapter('verifier')

            with torch.inference_mode():
                logits = model(**data).logits[0, -1].softmax(dim=-1)

            # NOTE: magic numbers here
            # In Qwen2-VL vocab: 9454 -> Yes, 2753 -> No
            score = (logits[9454] - logits[2753]).sigmoid().item()
            prob.append(score)

            if i == 0:
                history[-1]['content'] = history[-1]['content'].rstrip('.')[:-1]

            response = f'\nCandidate ID {i}: P(Yes) = {score:.2f}'

            for j, text in enumerate(response.split(' ')):
                history[-1]['content'] += ' ' + text if j > 0 else text
                yield history

        elapsed_time = round(time.perf_counter() - start_time, 1)
        history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
        yield history
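
        # re-rank the candidate moments by the verifier's scores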
        ranks = torch.Tensor(prob).argsort(descending=True).tolist()

        prob = [prob[idx] for idx in ranks]
        pred = [pred[idx] for idx in ranks]
        conf = [conf[idx] for idx in ranks]

        response = 'After verification, the candidate moments are re-ranked as follows:\n'
        response += '\n| ID | Start Time | End Time | Score |'
        response += '\n| :-: | :-: | :-: | :-: |'

        ids = list(range(len(ranks)))
        for r, p, c in zip(ranks, pred, prob):
            response += f'\n| {ids[r]} | {seconds_to_hms(p[0])} | {seconds_to_hms(p[1])} | {c:.2f} |'

        response += f'\n\nTherefore, the target moment should be from <span style="color:red">{seconds_to_hms(pred[0][0])}</span> to <span style="color:red">{seconds_to_hms(pred[0][1])}</span>.'

        history.append({'role': 'assistant', 'content': ''})
        for i, text in enumerate(response.split(' ')):
            history[-1]['content'] += ' ' + text if i > 0 else text
            yield history

    if 'ans' in role and do_answering:
        text = f'{prompt} Please think step by step and provide your response.'

        history.append({
            'metadata': {
                'title': '📝 Working as Answerer...'
            },
            'role': 'assistant',
            'content': f'##### Answerer Prompt:\n\n{html.escape(text)}\n\n##### Answerer Response:\n\n...'
        })
        yield history

        start_time = time.perf_counter()

        # choose the potential best moment
        selected = pred[0] if 'gnd' in role and do_grounding else [0, duration]
        s, e = parse_span(selected, duration, 32)

        messages = [{
            'role':
            'user',
            'content': [{
                'type': 'video',
                'video': video,
                'num_threads': 1,
                'video_start': s,
                'video_end': e,
                'min_pixels': 128 * 28 * 28,
                'max_pixels': 256 * 28 * 28,
                'max_frames': 32,
                'fps': 2.0
            }, {
                'type': 'text',
                'text': text
            }]
        }]

        text = processor.apply_chat_template(messages, add_generation_prompt=True)
        images, videos = process_vision_info(messages)
        data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
        data = data.to(device)

        # the answerer uses the base model without any LoRA adapter
        with model.disable_adapter():
            output_ids = model.generate(
                **data,
                do_sample=temperature > 0,
                temperature=temperature if temperature > 0 else None,
                top_p=None,
                top_k=None,
                repetition_penalty=None,
                max_new_tokens=max_new_tokens)

        assert data.input_ids.size(0) == output_ids.size(0) == 1
        output_ids = output_ids[0, data.input_ids.size(1):]
        if output_ids[-1] == processor.tokenizer.eos_token_id:
            output_ids = output_ids[:-1]
        response = processor.decode(output_ids, clean_up_tokenization_spaces=False)

        for i, text in enumerate(response.split(' ')):
            if i == 0:
                history[-1]['content'] = history[-1]['content'].rstrip('.')
                history[-1]['content'] += text
            else:
                history[-1]['content'] += ' ' + text
            yield history

        elapsed_time = round(time.perf_counter() - start_time, 1)
        history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
        yield history

        if 'gnd' in role and do_grounding:
            response = f'After zooming in and analyzing the target moment, I finalize my answer: <span style="color:green">{response}</span>'
        else:
            response = f'After watching the whole video, my answer is: <span style="color:green">{response}</span>'

        history.append({'role': 'assistant', 'content': ''})
        for i, text in enumerate(response.split(' ')):
            history[-1]['content'] += ' ' + text if i > 0 else text
            yield history
def build_demo():
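    """Build the Gradio Blocks interface for the VideoMind demo."""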
    chat = gr.Chatbot(
        type='messages',
        height='70em',
        resizable=True,
        avatar_images=[f'{PATH}/assets/user.png', f'{PATH}/assets/bot.png'],
        placeholder='A conversation with VideoMind',
        label='VideoMind')

    prompt = gr.Textbox(label='Text Prompt', placeholder='Ask a question about the video...')

    with gr.Blocks(title=TITLE, js=JS) as demo:
        gr.HTML(LOGO)
        gr.HTML(BADGE)
        gr.Markdown(DISC)

        with gr.Row():
            with gr.Column(scale=3):
                video = gr.Video()

                with gr.Group():
                    role = gr.CheckboxGroup(
                        choices=[('🗺️ Planner', 'pla'), ('🔍 Grounder', 'gnd'), ('📏 Verifier', 'ver'),
                                 ('📝 Answerer', 'ans')],
                        value=['pla', 'gnd', 'ver', 'ans'],
                        interactive=True,
                        label='Roles',
                        info='Select the role(s) you would like to activate.')

                    role.change(update_placeholder, role, prompt)

                with gr.Accordion(label='Hyperparameters', open=False):
                    max_candidates = gr.Slider(
                        1,
                        100,
                        value=5,
                        step=1,
                        interactive=True,
                        label='Max Candidate Moments',
                        info='The maximum number of candidate moments in Grounder (Default: 5)')
                    temperature = gr.Slider(
                        0,
                        1,
                        value=0,
                        step=0.1,
                        interactive=True,
                        label='Temperature',
                        info='Higher value leads to more creativity and randomness (Default: 0)')
                    max_new_tokens = gr.Slider(
                        1,
                        1024,
                        value=256,
                        step=1,
                        interactive=True,
                        label='Max Output Tokens',
                        info='The maximum number of output tokens for each role (Default: 256)')

                prompt.render()

                with gr.Row():
                    random_btn = gr.Button(value='🔮 Random')
                    random_btn.click(random_sample, None, [video, prompt, role])

                    reset_btn = gr.ClearButton([video, prompt, chat], value='🗑️ Reset')
                    reset_btn.click(reset_components, None, [role, max_candidates, temperature, max_new_tokens])

                    submit_btn = gr.Button(value='🚀 Submit', variant='primary')
                    ctx = submit_btn.click(disable_btns, None, [random_btn, reset_btn, submit_btn])
                    ctx = ctx.then(main, [video, prompt, role, max_candidates, temperature, max_new_tokens], chat)
                    ctx.then(enable_btns, None, [random_btn, reset_btn, submit_btn])

                gr.Examples(examples=EXAMPLES, inputs=[video, prompt, role], examples_per_page=3)

            with gr.Column(scale=5):
                chat.render()

    return demo


if __name__ == '__main__':
    demo = build_demo()
    demo.queue()
    demo.launch(server_name='0.0.0.0', allowed_paths=[f'{PATH}/assets', f'{PATH}/examples'])