Commit ba0b022 (verified) · Parent: 70cddaf
Author: yeliudev

Upload folder using huggingface_hub
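The commit message says the snapshot was pushed with `huggingface_hub`. For reference, a minimal sketch of how such an upload is typically issued is shown below; the `repo_id`, `folder_path`, and token handling are illustrative assumptions, not details taken from this commit.

```python
from huggingface_hub import HfApi

# Requires prior authentication, e.g. `huggingface-cli login` or an explicit token.
api = HfApi()

# Push a local working copy to a Space in a single commit.
# repo_id and folder_path are hypothetical placeholders.
api.upload_folder(
    repo_id='yeliudev/VideoMind-2B',  # assumed Space id
    repo_type='space',
    folder_path='.',                  # local checkout of the demo
    commit_message='Upload folder using huggingface_hub',
)
```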
.gitattributes CHANGED
@@ -49,3 +49,19 @@ data/h6QKDqomIPk_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
  data/pA6Z-qYhSNg_60.0_210.0.mp4 filter=lfs diff=lfs merge=lfs -text
  data/rrTIeJRVGjg_60.0_210.0.mp4 filter=lfs diff=lfs merge=lfs -text
  data/yId2wIocTys_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/10309844035.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/13887487955.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/4167294363.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/4742652230.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/4766274786.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/5012237466.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/5188348585.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/9383140374.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/DTInxNfWXVc_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/RoripwjYFp8_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/UFWQKrcbhjI_360.0_510.0.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/Z3-IZ3HAmIA_60.0_210.0.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/h6QKDqomIPk_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/pA6Z-qYhSNg_60.0_210.0.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/rrTIeJRVGjg_60.0_210.0.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/yId2wIocTys_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -5,5 +5,9 @@ __pycache__
  *$py.class
  
  # Temporary data
+ /data*
+ /demo/examples
+ /model_zoo
+ /work_dirs*
  .DS_Store
  ._*
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 💡
  colorFrom: red
  colorTo: blue
  sdk: gradio
- sdk_version: 4.44.1
+ sdk_version: 5.15.0
  app_file: app.py
  pinned: true
  license: bsd-3-clause
app.py CHANGED
@@ -5,60 +5,92 @@ import json
  import os
  import random
  import time
- from functools import partial
  
  import gradio as gr
  import nncore
+ import spaces
  import torch
  from huggingface_hub import snapshot_download
  
- import spaces
  from videomind.constants import GROUNDER_PROMPT, PLANNER_PROMPT, VERIFIER_PROMPT
  from videomind.dataset.utils import process_vision_info
  from videomind.model.builder import build_model
  from videomind.utils.io import get_duration
  from videomind.utils.parser import parse_query, parse_span
  
+ os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
+ PATH = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
+
  BASE_MODEL = 'model_zoo/Qwen2-VL-2B-Instruct'
- BASE_MODEL_HF = 'Qwen/Qwen2-VL-2B-Instruct'
+ BASE_MODEL_REPO = 'Qwen/Qwen2-VL-2B-Instruct'
  
  MODEL = 'model_zoo/VideoMind-2B'
- MODEL_HF = 'yeliudev/VideoMind-2B'
+ MODEL_REPO = 'yeliudev/VideoMind-2B'
  
  TITLE = 'VideoMind: A Chain-of-LoRA Agent for Long Video Reasoning'
  
- LOGO_MD = '<p align="center"><img width="350" src="https://raw.githubusercontent.com/yeliudev/VideoMind/refs/heads/main/.github/logo.png"></p>'
- DESCRIPTION_MD = """VideoMind is a multi-modal agent framework that enhances video reasoning by emulating *human-like* processes, such as *breaking down tasks*, *localizing and verifying moments*, and *synthesizing answers*. This approach addresses the unique challenges of temporal-grounded reasoning in a progressive strategy. Please find more details at our <a href="https://videomind.github.io/" target="_blank">Project Page</a>, <a href="https://arxiv.org/abs/2503.13444" target="_blank">Tech Report</a> and <a href="https://github.com/yeliudev/VideoMind" target="_blank">GitHub Repo</a>.""" # noqa
+ BADGE = """
+ <h3 align="center" style="margin-top: -0.5em;">A Chain-of-LoRA Agent for Long Video Reasoning</h3>
+ <div style="display: flex; justify-content: center; gap: 5px; margin-bottom: -0.7em !important;">
+ <a href="https://arxiv.org/abs/2503.13444" target="_blank">
+ <img src="https://img.shields.io/badge/arXiv-2503.13444-red">
+ </a>
+ <a href="https://videomind.github.io/" target="_blank">
+ <img src="https://img.shields.io/badge/Project-Page-brightgreen">
+ </a>
+ <a href="https://huggingface.co/collections/yeliudev/videomind-67dd41f42c57f0e7433afb36" target="_blank">
+ <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue">
+ </a>
+ <a href="https://huggingface.co/datasets/yeliudev/VideoMind-Dataset" target="_blank">
+ <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Dataset-orange">
+ </a>
+ <a href="https://github.com/yeliudev/VideoMind/blob/main/README.md" target="_blank">
+ <img src="https://img.shields.io/badge/License-BSD--3--Clause-purple">
+ </a>
+ </div>
+ """
+
+ LOGO = '<p align="center"><img width="350" src="https://raw.githubusercontent.com/yeliudev/VideoMind/refs/heads/main/.github/logo.png"></p>'
+ DISC = '**VideoMind** is a multi-modal agent framework that enhances video reasoning by emulating *human-like* processes, such as *breaking down tasks*, *localizing and verifying moments*, and *synthesizing answers*. This approach addresses the unique challenges of temporal-grounded reasoning in a progressive strategy. This demo showcases how VideoMind-2B handles video-language tasks. Please open an <a href="https://github.com/yeliudev/VideoMind/issues/new" target="_blank">issue</a> if you meet any problems or have any suggestions.' # noqa
  
  # yapf:disable
  EXAMPLES = [
-     ('data/4167294363.mp4', 'Why did the old man stand up?', ['pla', 'gnd', 'ver', 'ans']),
-     ('data/5012237466.mp4', 'How does the child in stripes react about the fountain?', ['pla', 'gnd', 'ver', 'ans']),
-     ('data/13887487955.mp4', 'What did the excavator do after it pushed the cement forward?', ['pla', 'gnd', 'ver', 'ans']),
-     ('data/5188348585.mp4', 'What did the person do before pouring the liquor?', ['pla', 'gnd', 'ver', 'ans']),
-     ('data/4766274786.mp4', 'What did the girl do after the baby lost the balloon?', ['pla', 'gnd', 'ver', 'ans']),
-     ('data/4742652230.mp4', 'Why is the girl pushing the boy only around the toy but not to other places?', ['pla', 'gnd', 'ver', 'ans']),
-     ('data/9383140374.mp4', 'How does the girl in pink control the movement of the claw?', ['pla', 'gnd', 'ver', 'ans']),
-     ('data/10309844035.mp4', 'Why are they holding up the phones?', ['pla', 'gnd', 'ver', 'ans']),
-     ('data/pA6Z-qYhSNg_60.0_210.0.mp4', 'Different types of meat products are being cut, shaped and prepared', ['gnd', 'ver']),
-     ('data/UFWQKrcbhjI_360.0_510.0.mp4', 'A man talks to the camera whilst walking along a roadside in a rural area', ['gnd', 'ver']),
-     ('data/RoripwjYFp8_210.0_360.0.mp4', 'A woman wearing glasses eating something at a street market', ['gnd', 'ver']),
-     ('data/h6QKDqomIPk_210.0_360.0.mp4', 'A toddler sits in his car seat, holding his yellow tablet', ['gnd', 'ver']),
-     ('data/Z3-IZ3HAmIA_60.0_210.0.mp4', 'A view from the window as the plane accelerates and takes off from the runway', ['gnd', 'ver']),
-     ('data/yId2wIocTys_210.0_360.0.mp4', "Temporally locate the visual content mentioned in the text query 'kids exercise in front of parked cars' within the video.", ['pla', 'gnd', 'ver']),
-     ('data/rrTIeJRVGjg_60.0_210.0.mp4', "Localize the moment that provides relevant context about 'man stands in front of a white building monologuing'.", ['pla', 'gnd', 'ver']),
-     ('data/DTInxNfWXVc_210.0_360.0.mp4', "Find the video segment that corresponds to the given textual query 'man with headphones talking'.", ['pla', 'gnd', 'ver']),
+     [f'{PATH}/examples/4167294363.mp4', 'Why did the old man stand up?', ['pla', 'gnd', 'ver', 'ans']],
+     [f'{PATH}/examples/5012237466.mp4', 'How does the child in stripes react about the fountain?', ['pla', 'gnd', 'ver', 'ans']],
+     [f'{PATH}/examples/13887487955.mp4', 'What did the excavator do after it pushed the cement forward?', ['pla', 'gnd', 'ver', 'ans']],
+     [f'{PATH}/examples/5188348585.mp4', 'What did the person do before pouring the liquor?', ['pla', 'gnd', 'ver', 'ans']],
+     [f'{PATH}/examples/4766274786.mp4', 'What did the girl do after the baby lost the balloon?', ['pla', 'gnd', 'ver', 'ans']],
+     [f'{PATH}/examples/4742652230.mp4', 'Why is the girl pushing the boy only around the toy but not to other places?', ['pla', 'gnd', 'ver', 'ans']],
+     [f'{PATH}/examples/9383140374.mp4', 'How does the girl in pink control the movement of the claw?', ['pla', 'gnd', 'ver', 'ans']],
+     [f'{PATH}/examples/10309844035.mp4', 'Why are they holding up the phones?', ['pla', 'gnd', 'ver', 'ans']],
+     [f'{PATH}/examples/pA6Z-qYhSNg_60.0_210.0.mp4', 'Different types of meat products are being cut, shaped and prepared', ['gnd', 'ver']],
+     [f'{PATH}/examples/UFWQKrcbhjI_360.0_510.0.mp4', 'A man talks to the camera whilst walking along a roadside in a rural area', ['gnd', 'ver']],
+     [f'{PATH}/examples/RoripwjYFp8_210.0_360.0.mp4', 'A woman wearing glasses eating something at a street market', ['gnd', 'ver']],
+     [f'{PATH}/examples/h6QKDqomIPk_210.0_360.0.mp4', 'A toddler sits in his car seat, holding his yellow tablet', ['gnd', 'ver']],
+     [f'{PATH}/examples/Z3-IZ3HAmIA_60.0_210.0.mp4', 'A view from the window as the plane accelerates and takes off from the runway', ['gnd', 'ver']],
+     [f'{PATH}/examples/yId2wIocTys_210.0_360.0.mp4', "Temporally locate the visual content mentioned in the text query 'kids exercise in front of parked cars' within the video.", ['pla', 'gnd', 'ver']],
+     [f'{PATH}/examples/rrTIeJRVGjg_60.0_210.0.mp4', "Localize the moment that provides relevant context about 'man stands in front of a white building monologuing'.", ['pla', 'gnd', 'ver']],
+     [f'{PATH}/examples/DTInxNfWXVc_210.0_360.0.mp4', "Find the video segment that corresponds to the given textual query 'man with headphones talking'.", ['pla', 'gnd', 'ver']],
  ]
  # yapf:enable
  
- CSS = """button .box { text-align: left }"""
+ if not nncore.is_dir(BASE_MODEL):
+     snapshot_download(BASE_MODEL_REPO, local_dir=BASE_MODEL)
  
- JS = """
- function init() {
-     var info = document.getElementById('role').querySelectorAll('[class^="svelte"]')[1]
-     info.innerHTML = info.innerHTML.replace(/&lt;/g, '<').replace(/&gt;/g, '>')
- }
- """
+ if not nncore.is_dir(MODEL):
+     snapshot_download(MODEL_REPO, local_dir=MODEL)
+
+ print('Initializing role *grounder*')
+ model, processor = build_model(MODEL)
+
+ print('Initializing role *planner*')
+ model.load_adapter(nncore.join(MODEL, 'planner'), adapter_name='planner')
+
+ print('Initializing role *verifier*')
+ model.load_adapter(nncore.join(MODEL, 'verifier'), adapter_name='verifier')
+
+ device = torch.device('cuda')
  
  
  def seconds_to_hms(seconds):
@@ -89,7 +121,9 @@ def reset_components():
  
  
  @spaces.GPU
- def main(video, prompt, role, temperature, max_new_tokens, model, processor, device):
+ def main(video, prompt, role, temperature, max_new_tokens):
+     global model, processor, device
+
      history = []
  
      if not video:
@@ -525,40 +559,20 @@ def main(video, prompt, role, temperature, max_new_tokens, model, processor, device):
      yield history
  
  
- if __name__ == '__main__':
-     if not nncore.is_dir(BASE_MODEL):
-         snapshot_download(BASE_MODEL_HF, local_dir=BASE_MODEL)
-
-     if not nncore.is_dir(MODEL):
-         snapshot_download(MODEL_HF, local_dir=MODEL)
-
-     print('Initializing role *grounder*')
-     model, processor = build_model(MODEL)
-
-     print('Initializing role *planner*')
-     model.load_adapter(nncore.join(MODEL, 'planner'), adapter_name='planner')
-
-     print('Initializing role *verifier*')
-     model.load_adapter(nncore.join(MODEL, 'verifier'), adapter_name='verifier')
-
-     device = torch.device('cuda')
-
-     main = partial(main, model=model, processor=processor, device=device)
-
-     path = os.path.dirname(os.path.realpath(__file__))
-
+ def build_demo():
      chat = gr.Chatbot(
          type='messages',
          height='70vh',
-         avatar_images=[f'{path}/assets/user.png', f'{path}/assets/bot.png'],
+         avatar_images=[f'{PATH}/assets/user.png', f'{PATH}/assets/bot.png'],
          placeholder='A conversation with VideoMind',
          label='VideoMind')
  
      prompt = gr.Textbox(label='Text Prompt', placeholder='Ask a question about the video...')
  
-     with gr.Blocks(title=TITLE, css=CSS, js=JS) as demo:
-         gr.Markdown(LOGO_MD)
-         gr.Markdown(DESCRIPTION_MD)
+     with gr.Blocks(title=TITLE) as demo:
+         gr.Markdown(LOGO)
+         gr.HTML(BADGE)
+         gr.Markdown(DISC)
  
          with gr.Row():
              with gr.Column(scale=3):
@@ -592,7 +606,11 @@ if __name__ == '__main__':
                      label='Max Output Tokens',
                      info='The maximum number of output tokens for each role (Default: 256)')
  
-                 prompt.render()
+                 with gr.Group():
+                     prompt.render()
+
+                 with gr.Accordion(label='Examples', open=False):
+                     gr.Examples(examples=EXAMPLES, inputs=[video, prompt, role], examples_per_page=3)
  
                  with gr.Row():
                      random_btn = gr.Button(value='🔮 Random')
@@ -606,9 +624,16 @@
                  submit_ctx = submit_ctx.then(main, [video, prompt, role, temperature, max_new_tokens], chat)
                  submit_ctx.then(enable_btns, None, [random_btn, reset_btn, submit_btn])
  
-                 gr.Markdown('##### Need an example video and question? Just click 🔮 Random to sample one!')
+                 gr.Markdown('##### Need example data? Explore examples tab or click 🔮 Random to sample one!')
  
          with gr.Column(scale=5):
              chat.render()
  
-     demo.launch(server_name='0.0.0.0')
+     return demo
+
+
+ if __name__ == '__main__':
+     demo = build_demo()
+
+     demo.queue()
+     demo.launch(server_name='0.0.0.0', allowed_paths=[f'{PATH}/assets', f'{PATH}/examples'])
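The `app.py` changes above follow the ZeroGPU Spaces pattern: weights are downloaded and loaded once at import time, only the inference entry point is decorated with `@spaces.GPU`, and the UI is assembled in a `build_demo()` factory that is launched with `allowed_paths` so bundled assets and example videos can be served. A condensed sketch of that structure, with the VideoMind model loading stubbed out as a placeholder, might look like this:

```python
import os

import gradio as gr
import spaces
import torch

PATH = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))

# Heavy initialization happens at import time so each request only runs inference.
model = torch.nn.Identity()  # placeholder for build_model(MODEL) plus LoRA adapters
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # the real demo pins 'cuda'


@spaces.GPU  # a GPU is attached only while this function runs
def main(video, prompt):
    # Placeholder inference: echo the prompt back as a chat message.
    return [{'role': 'assistant', 'content': f'Received: {prompt}'}]


def build_demo():
    with gr.Blocks(title='VideoMind') as demo:
        video = gr.Video(label='Video')
        prompt = gr.Textbox(label='Text Prompt')
        chat = gr.Chatbot(type='messages', label='VideoMind')
        gr.Button('Submit').click(main, [video, prompt], chat)
    return demo


if __name__ == '__main__':
    demo = build_demo()
    demo.queue()
    demo.launch(server_name='0.0.0.0', allowed_paths=[f'{PATH}/assets', f'{PATH}/examples'])
```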
examples/10309844035.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8996ff134787d6b769c2491b9079a02c05953465ad770f07a8d9138e2668d24f
+ size 4041678

examples/13887487955.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e5fecab1076ee42b3804718f9f64bef06cbfafd6995ad5f5ee42ba6354721429
+ size 5544739

examples/4167294363.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3d0e0a4a381836f68e16a816d87f241fed3e31ea321f544b921743d6c1c50666
+ size 6611151

examples/4742652230.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8733ab4b0716d13ea7a79fc4ddacaf9eede567db364f0ecddfa4582c2f237f82
+ size 2200304

examples/4766274786.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:afa38a9ce9e89f934293214d79755c89159664223b3ca366813fd5fe524ed013
+ size 3395545

examples/5012237466.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd1929aa93d037f809f402e9801047125dc9fe8060301e69ded9ba1f2d785cc8
+ size 4822293

examples/5188348585.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b225f448a546ba2f65958f18c6731a6dde9b1f437014e90036b22eb40e9ad0a5
+ size 5051675

examples/9383140374.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:30b6b3eb43f711bef194150d473a59850ff5d7fec0f5cc30e7526aa9e382303f
+ size 2518081

examples/DTInxNfWXVc_210.0_360.0.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a09eee0dc404688731fb768c120d3519605f2343376b9bd727a71b91379fd9a9
+ size 4999970

examples/RoripwjYFp8_210.0_360.0.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4b39b15158dc20c0bc6f1758a9239c8f3eed20ba4a90953338eec2246fa8f1f0
+ size 9287252

examples/UFWQKrcbhjI_360.0_510.0.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8669153d9ffac4b5534c20fab8d795347f5babe588da9b8330e049d623ebb443
+ size 14510618

examples/Z3-IZ3HAmIA_60.0_210.0.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4b3a342993ee61efc5f3b859cd9c1e0d360b3331eed9deb8466891e4bcacc554
+ size 14397799

examples/h6QKDqomIPk_210.0_360.0.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:103820de2b8a1a3935b39ed80d91cd08e546e5617310b3d1bb3dadb06b2ffb95
+ size 13485144

examples/pA6Z-qYhSNg_60.0_210.0.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c84660fd4ebd8c23a2a7364174b1e819fec8b0e1cb8b9d9cd86f9e429cbdf66c
+ size 8658509

examples/rrTIeJRVGjg_60.0_210.0.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:efe6f48a49963bd4880ef5065840e05dd25e2aa975870140bcdaf4220bbd2827
+ size 11410412

examples/yId2wIocTys_210.0_360.0.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:447fcb1fd1f94ed6a88d56dd0f6f859646cb8c58ed8e3b7a82f374e2cfee1646
+ size 14769130
requirements.txt CHANGED
@@ -1,10 +1,8 @@
  accelerate==1.2.1
  decord==0.6.0
- gradio==4.44.1
  nncore==0.4.5
  pandas==2.2.3
  peft==0.14.0
- pydantic==2.10.6
  pysrt==1.1.2
  scikit-image==0.25.0
  scikit-learn==1.6.1
@@ -13,6 +11,12 @@ spaces==0.34.0
  termplotlib==0.3.9
  triton==3.0.0
  
+ # gradio 5.16.0 to 5.23.1 have wrong horizontal margins
+ gradio==5.15.0
+
+ # https://github.com/gradio-app/gradio/issues/10662
+ pydantic==2.10.6
+
  # our codebase contains necessary patches for 4.45.2
  transformers==4.45.2
  
setup.cfg CHANGED
@@ -7,7 +7,7 @@ split_before_expression_after_opening_paren = true
  [isort]
  line_length = 120
  multi_line_output = 0
- known_third_party = decord,deepspeed,gradio,huggingface_hub,nncore,numpy,pandas,peft,PIL,pysrt,safetensors,tabulate,termplotlib,torch,torchvision,transformers
+ known_third_party = decord,deepspeed,gradio,huggingface_hub,nncore,numpy,pandas,peft,PIL,pysrt,safetensors,spaces,tabulate,termplotlib,torch,torchvision,transformers
  no_lines_before = STDLIB,LOCALFOLDER
  default_section = FIRSTPARTY
  