Commit ba0b022 (verified) · Parent: 70cddaf
Author: yeliudev

Upload folder using huggingface_hub
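The commit message says the snapshot was pushed with `huggingface_hub`. For reference, a minimal sketch of how such an upload is typically issued is shown below; the `repo_id`, `folder_path`, and token handling are illustrative assumptions, not details taken from this commit.

```python
from huggingface_hub import HfApi

# Requires prior authentication, e.g. `huggingface-cli login` or an explicit token.
api = HfApi()

# Push a local working copy to a Space in a single commit.
# repo_id and folder_path are hypothetical placeholders.
api.upload_folder(
    repo_id='yeliudev/VideoMind-2B',  # assumed Space id
    repo_type='space',
    folder_path='.',                  # local checkout of the demo
    commit_message='Upload folder using huggingface_hub',
)
```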
.gitattributes CHANGED
@@ -49,3 +49,19 @@ data/h6QKDqomIPk_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
  data/pA6Z-qYhSNg_60.0_210.0.mp4 filter=lfs diff=lfs merge=lfs -text
  data/rrTIeJRVGjg_60.0_210.0.mp4 filter=lfs diff=lfs merge=lfs -text
  data/yId2wIocTys_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/10309844035.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/13887487955.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/4167294363.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/4742652230.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/4766274786.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/5012237466.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/5188348585.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/9383140374.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/DTInxNfWXVc_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/RoripwjYFp8_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/UFWQKrcbhjI_360.0_510.0.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/Z3-IZ3HAmIA_60.0_210.0.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/h6QKDqomIPk_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/pA6Z-qYhSNg_60.0_210.0.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/rrTIeJRVGjg_60.0_210.0.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/yId2wIocTys_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -5,5 +5,9 @@ __pycache__
  *$py.class
  
  # Temporary data
+ /data*
+ /demo/examples
+ /model_zoo
+ /work_dirs*
  .DS_Store
  ._*
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 💡
  colorFrom: red
  colorTo: blue
  sdk: gradio
- sdk_version: 4.44.1
+ sdk_version: 5.15.0
  app_file: app.py
  pinned: true
  license: bsd-3-clause
app.py CHANGED
@@ -5,60 +5,92 @@ import json
  import os
  import random
  import time
- from functools import partial
  
  import gradio as gr
  import nncore
+ import spaces
  import torch
  from huggingface_hub import snapshot_download
  
- import spaces
  from videomind.constants import GROUNDER_PROMPT, PLANNER_PROMPT, VERIFIER_PROMPT
  from videomind.dataset.utils import process_vision_info
  from videomind.model.builder import build_model
  from videomind.utils.io import get_duration
  from videomind.utils.parser import parse_query, parse_span
  
+ os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
+ PATH = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
+
  BASE_MODEL = 'model_zoo/Qwen2-VL-2B-Instruct'
- BASE_MODEL_HF = 'Qwen/Qwen2-VL-2B-Instruct'
+ BASE_MODEL_REPO = 'Qwen/Qwen2-VL-2B-Instruct'
  
  MODEL = 'model_zoo/VideoMind-2B'
- MODEL_HF = 'yeliudev/VideoMind-2B'
+ MODEL_REPO = 'yeliudev/VideoMind-2B'
  
  TITLE = 'VideoMind: A Chain-of-LoRA Agent for Long Video Reasoning'
  
- LOGO_MD = '<p align="center"><img width="350" src="https://raw.githubusercontent.com/yeliudev/VideoMind/refs/heads/main/.github/logo.png"></p>'
- DESCRIPTION_MD = """VideoMind is a multi-modal agent framework that enhances video reasoning by emulating *human-like* processes, such as *breaking down tasks*, *localizing and verifying moments*, and *synthesizing answers*. This approach addresses the unique challenges of temporal-grounded reasoning in a progressive strategy. Please find more details at our <a href="https://videomind.github.io/" target="_blank">Project Page</a>, <a href="https://arxiv.org/abs/2503.13444" target="_blank">Tech Report</a> and <a href="https://github.com/yeliudev/VideoMind" target="_blank">GitHub Repo</a>.""" # noqa
+ BADGE = """
+ <h3 align="center" style="margin-top: -0.5em;">A Chain-of-LoRA Agent for Long Video Reasoning</h3>
+ <div style="display: flex; justify-content: center; gap: 5px; margin-bottom: -0.7em !important;">
+ <a href="https://arxiv.org/abs/2503.13444" target="_blank">
+ <img src="https://img.shields.io/badge/arXiv-2503.13444-red">
+ </a>
+ <a href="https://videomind.github.io/" target="_blank">
+ <img src="https://img.shields.io/badge/Project-Page-brightgreen">
+ </a>
+ <a href="https://huggingface.co/collections/yeliudev/videomind-67dd41f42c57f0e7433afb36" target="_blank">
+ <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue">
+ </a>
+ <a href="https://huggingface.co/datasets/yeliudev/VideoMind-Dataset" target="_blank">
+ <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Dataset-orange">
+ </a>
+ <a href="https://github.com/yeliudev/VideoMind/blob/main/README.md" target="_blank">
+ <img src="https://img.shields.io/badge/License-BSD--3--Clause-purple">
+ </a>
+ </div>
+ """
+
+ LOGO = '<p align="center"><img width="350" src="https://raw.githubusercontent.com/yeliudev/VideoMind/refs/heads/main/.github/logo.png"></p>'
+ DISC = '**VideoMind** is a multi-modal agent framework that enhances video reasoning by emulating *human-like* processes, such as *breaking down tasks*, *localizing and verifying moments*, and *synthesizing answers*. This approach addresses the unique challenges of temporal-grounded reasoning in a progressive strategy. This demo showcases how VideoMind-2B handles video-language tasks. Please open an <a href="https://github.com/yeliudev/VideoMind/issues/new" target="_blank">issue</a> if you meet any problems or have any suggestions.' # noqa
  
  # yapf:disable
  EXAMPLES = [
-     ('data/4167294363.mp4', 'Why did the old man stand up?', ['pla', 'gnd', 'ver', 'ans']),
-     ('data/5012237466.mp4', 'How does the child in stripes react about the fountain?', ['pla', 'gnd', 'ver', 'ans']),
-     ('data/13887487955.mp4', 'What did the excavator do after it pushed the cement forward?', ['pla', 'gnd', 'ver', 'ans']),
-     ('data/5188348585.mp4', 'What did the person do before pouring the liquor?', ['pla', 'gnd', 'ver', 'ans']),
-     ('data/4766274786.mp4', 'What did the girl do after the baby lost the balloon?', ['pla', 'gnd', 'ver', 'ans']),
-     ('data/4742652230.mp4', 'Why is the girl pushing the boy only around the toy but not to other places?', ['pla', 'gnd', 'ver', 'ans']),
-     ('data/9383140374.mp4', 'How does the girl in pink control the movement of the claw?', ['pla', 'gnd', 'ver', 'ans']),
-     ('data/10309844035.mp4', 'Why are they holding up the phones?', ['pla', 'gnd', 'ver', 'ans']),
-     ('data/pA6Z-qYhSNg_60.0_210.0.mp4', 'Different types of meat products are being cut, shaped and prepared', ['gnd', 'ver']),
-     ('data/UFWQKrcbhjI_360.0_510.0.mp4', 'A man talks to the camera whilst walking along a roadside in a rural area', ['gnd', 'ver']),
-     ('data/RoripwjYFp8_210.0_360.0.mp4', 'A woman wearing glasses eating something at a street market', ['gnd', 'ver']),
-     ('data/h6QKDqomIPk_210.0_360.0.mp4', 'A toddler sits in his car seat, holding his yellow tablet', ['gnd', 'ver']),
-     ('data/Z3-IZ3HAmIA_60.0_210.0.mp4', 'A view from the window as the plane accelerates and takes off from the runway', ['gnd', 'ver']),
-     ('data/yId2wIocTys_210.0_360.0.mp4', "Temporally locate the visual content mentioned in the text query 'kids exercise in front of parked cars' within the video.", ['pla', 'gnd', 'ver']),
-     ('data/rrTIeJRVGjg_60.0_210.0.mp4', "Localize the moment that provides relevant context about 'man stands in front of a white building monologuing'.", ['pla', 'gnd', 'ver']),
-     ('data/DTInxNfWXVc_210.0_360.0.mp4', "Find the video segment that corresponds to the given textual query 'man with headphones talking'.", ['pla', 'gnd', 'ver']),
+     [f'{PATH}/examples/4167294363.mp4', 'Why did the old man stand up?', ['pla', 'gnd', 'ver', 'ans']],
+     [f'{PATH}/examples/5012237466.mp4', 'How does the child in stripes react about the fountain?', ['pla', 'gnd', 'ver', 'ans']],
+     [f'{PATH}/examples/13887487955.mp4', 'What did the excavator do after it pushed the cement forward?', ['pla', 'gnd', 'ver', 'ans']],
+     [f'{PATH}/examples/5188348585.mp4', 'What did the person do before pouring the liquor?', ['pla', 'gnd', 'ver', 'ans']],
+     [f'{PATH}/examples/4766274786.mp4', 'What did the girl do after the baby lost the balloon?', ['pla', 'gnd', 'ver', 'ans']],
+     [f'{PATH}/examples/4742652230.mp4', 'Why is the girl pushing the boy only around the toy but not to other places?', ['pla', 'gnd', 'ver', 'ans']],
+     [f'{PATH}/examples/9383140374.mp4', 'How does the girl in pink control the movement of the claw?', ['pla', 'gnd', 'ver', 'ans']],
+     [f'{PATH}/examples/10309844035.mp4', 'Why are they holding up the phones?', ['pla', 'gnd', 'ver', 'ans']],
+     [f'{PATH}/examples/pA6Z-qYhSNg_60.0_210.0.mp4', 'Different types of meat products are being cut, shaped and prepared', ['gnd', 'ver']],
+     [f'{PATH}/examples/UFWQKrcbhjI_360.0_510.0.mp4', 'A man talks to the camera whilst walking along a roadside in a rural area', ['gnd', 'ver']],
+     [f'{PATH}/examples/RoripwjYFp8_210.0_360.0.mp4', 'A woman wearing glasses eating something at a street market', ['gnd', 'ver']],
+     [f'{PATH}/examples/h6QKDqomIPk_210.0_360.0.mp4', 'A toddler sits in his car seat, holding his yellow tablet', ['gnd', 'ver']],
+     [f'{PATH}/examples/Z3-IZ3HAmIA_60.0_210.0.mp4', 'A view from the window as the plane accelerates and takes off from the runway', ['gnd', 'ver']],
+     [f'{PATH}/examples/yId2wIocTys_210.0_360.0.mp4', "Temporally locate the visual content mentioned in the text query 'kids exercise in front of parked cars' within the video.", ['pla', 'gnd', 'ver']],
+     [f'{PATH}/examples/rrTIeJRVGjg_60.0_210.0.mp4', "Localize the moment that provides relevant context about 'man stands in front of a white building monologuing'.", ['pla', 'gnd', 'ver']],
+     [f'{PATH}/examples/DTInxNfWXVc_210.0_360.0.mp4', "Find the video segment that corresponds to the given textual query 'man with headphones talking'.", ['pla', 'gnd', 'ver']],
  ]
  # yapf:enable
  
- CSS = """button .box { text-align: left }"""
+ if not nncore.is_dir(BASE_MODEL):
+     snapshot_download(BASE_MODEL_REPO, local_dir=BASE_MODEL)
  
- JS = """
- function init() {
-     var info = document.getElementById('role').querySelectorAll('[class^="svelte"]')[1]
-     info.innerHTML = info.innerHTML.replace(/&lt;/g, '<').replace(/&gt;/g, '>')
- }
- """
+ if not nncore.is_dir(MODEL):
+     snapshot_download(MODEL_REPO, local_dir=MODEL)
+
+ print('Initializing role *grounder*')
+ model, processor = build_model(MODEL)
+
+ print('Initializing role *planner*')
+ model.load_adapter(nncore.join(MODEL, 'planner'), adapter_name='planner')
+
+ print('Initializing role *verifier*')
+ model.load_adapter(nncore.join(MODEL, 'verifier'), adapter_name='verifier')
+
+ device = torch.device('cuda')
  
  
  def seconds_to_hms(seconds):
@@ -89,7 +121,9 @@ def reset_components():
  
  
  @spaces.GPU
- def main(video, prompt, role, temperature, max_new_tokens, model, processor, device):
+ def main(video, prompt, role, temperature, max_new_tokens):
+     global model, processor, device
+
      history = []
  
      if not video:
@@ -525,40 +559,20 @@ def main(video, prompt, role, temperature, max_new_tokens, model, processor, device):
      yield history
  
  
- if __name__ == '__main__':
-     if not nncore.is_dir(BASE_MODEL):
-         snapshot_download(BASE_MODEL_HF, local_dir=BASE_MODEL)
-
-     if not nncore.is_dir(MODEL):
-         snapshot_download(MODEL_HF, local_dir=MODEL)
-
-     print('Initializing role *grounder*')
-     model, processor = build_model(MODEL)
-
-     print('Initializing role *planner*')
-     model.load_adapter(nncore.join(MODEL, 'planner'), adapter_name='planner')
-
-     print('Initializing role *verifier*')
-     model.load_adapter(nncore.join(MODEL, 'verifier'), adapter_name='verifier')
-
-     device = torch.device('cuda')
-
-     main = partial(main, model=model, processor=processor, device=device)
-
-     path = os.path.dirname(os.path.realpath(__file__))
-
+ def build_demo():
      chat = gr.Chatbot(
          type='messages',
          height='70vh',
-         avatar_images=[f'{path}/assets/user.png', f'{path}/assets/bot.png'],
+         avatar_images=[f'{PATH}/assets/user.png', f'{PATH}/assets/bot.png'],
          placeholder='A conversation with VideoMind',
          label='VideoMind')
  
      prompt = gr.Textbox(label='Text Prompt', placeholder='Ask a question about the video...')
  
-     with gr.Blocks(title=TITLE, css=CSS, js=JS) as demo:
-         gr.Markdown(LOGO_MD)
-         gr.Markdown(DESCRIPTION_MD)
+     with gr.Blocks(title=TITLE) as demo:
+         gr.Markdown(LOGO)
+         gr.HTML(BADGE)
+         gr.Markdown(DISC)
  
          with gr.Row():
              with gr.Column(scale=3):
@@ -592,7 +606,11 @@ if __name__ == '__main__':
                      label='Max Output Tokens',
                      info='The maximum number of output tokens for each role (Default: 256)')
  
-                 prompt.render()
+                 with gr.Group():
+                     prompt.render()
+
+                 with gr.Accordion(label='Examples', open=False):
+                     gr.Examples(examples=EXAMPLES, inputs=[video, prompt, role], examples_per_page=3)
  
                  with gr.Row():
                      random_btn = gr.Button(value='🔮 Random')
@@ -606,9 +624,16 @@
                  submit_ctx = submit_ctx.then(main, [video, prompt, role, temperature, max_new_tokens], chat)
                  submit_ctx.then(enable_btns, None, [random_btn, reset_btn, submit_btn])
  
-                 gr.Markdown('##### Need an example video and question? Just click 🔮 Random to sample one!')
+                 gr.Markdown('##### Need example data? Explore examples tab or click 🔮 Random to sample one!')
  
          with gr.Column(scale=5):
              chat.render()
  
-     demo.launch(server_name='0.0.0.0')
+     return demo
+
+
+ if __name__ == '__main__':
+     demo = build_demo()
+
+     demo.queue()
+     demo.launch(server_name='0.0.0.0', allowed_paths=[f'{PATH}/assets', f'{PATH}/examples'])
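The `app.py` changes above follow the ZeroGPU Spaces pattern: weights are downloaded and loaded once at import time, only the inference entry point is decorated with `@spaces.GPU`, and the UI is assembled in a `build_demo()` factory that is launched with `allowed_paths` so bundled assets and example videos can be served. A condensed sketch of that structure, with the VideoMind model loading stubbed out as a placeholder, might look like this:

```python
import os

import gradio as gr
import spaces
import torch

PATH = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))

# Heavy initialization happens at import time so each request only runs inference.
model = torch.nn.Identity()  # placeholder for build_model(MODEL) plus LoRA adapters
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # the real demo pins 'cuda'


@spaces.GPU  # a GPU is attached only while this function runs
def main(video, prompt):
    # Placeholder inference: echo the prompt back as a chat message.
    return [{'role': 'assistant', 'content': f'Received: {prompt}'}]


def build_demo():
    with gr.Blocks(title='VideoMind') as demo:
        video = gr.Video(label='Video')
        prompt = gr.Textbox(label='Text Prompt')
        chat = gr.Chatbot(type='messages', label='VideoMind')
        gr.Button('Submit').click(main, [video, prompt], chat)
    return demo


if __name__ == '__main__':
    demo = build_demo()
    demo.queue()
    demo.launch(server_name='0.0.0.0', allowed_paths=[f'{PATH}/assets', f'{PATH}/examples'])
```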
examples/10309844035.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8996ff134787d6b769c2491b9079a02c05953465ad770f07a8d9138e2668d24f
+ size 4041678

examples/13887487955.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e5fecab1076ee42b3804718f9f64bef06cbfafd6995ad5f5ee42ba6354721429
+ size 5544739

examples/4167294363.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3d0e0a4a381836f68e16a816d87f241fed3e31ea321f544b921743d6c1c50666
+ size 6611151

examples/4742652230.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8733ab4b0716d13ea7a79fc4ddacaf9eede567db364f0ecddfa4582c2f237f82
+ size 2200304

examples/4766274786.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:afa38a9ce9e89f934293214d79755c89159664223b3ca366813fd5fe524ed013
+ size 3395545

examples/5012237466.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd1929aa93d037f809f402e9801047125dc9fe8060301e69ded9ba1f2d785cc8
+ size 4822293

examples/5188348585.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b225f448a546ba2f65958f18c6731a6dde9b1f437014e90036b22eb40e9ad0a5
+ size 5051675

examples/9383140374.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:30b6b3eb43f711bef194150d473a59850ff5d7fec0f5cc30e7526aa9e382303f
+ size 2518081

examples/DTInxNfWXVc_210.0_360.0.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a09eee0dc404688731fb768c120d3519605f2343376b9bd727a71b91379fd9a9
+ size 4999970

examples/RoripwjYFp8_210.0_360.0.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4b39b15158dc20c0bc6f1758a9239c8f3eed20ba4a90953338eec2246fa8f1f0
+ size 9287252

examples/UFWQKrcbhjI_360.0_510.0.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8669153d9ffac4b5534c20fab8d795347f5babe588da9b8330e049d623ebb443
+ size 14510618

examples/Z3-IZ3HAmIA_60.0_210.0.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4b3a342993ee61efc5f3b859cd9c1e0d360b3331eed9deb8466891e4bcacc554
+ size 14397799

examples/h6QKDqomIPk_210.0_360.0.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:103820de2b8a1a3935b39ed80d91cd08e546e5617310b3d1bb3dadb06b2ffb95
+ size 13485144

examples/pA6Z-qYhSNg_60.0_210.0.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c84660fd4ebd8c23a2a7364174b1e819fec8b0e1cb8b9d9cd86f9e429cbdf66c
+ size 8658509

examples/rrTIeJRVGjg_60.0_210.0.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:efe6f48a49963bd4880ef5065840e05dd25e2aa975870140bcdaf4220bbd2827
+ size 11410412

examples/yId2wIocTys_210.0_360.0.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:447fcb1fd1f94ed6a88d56dd0f6f859646cb8c58ed8e3b7a82f374e2cfee1646
+ size 14769130
requirements.txt CHANGED
@@ -1,10 +1,8 @@
  accelerate==1.2.1
  decord==0.6.0
- gradio==4.44.1
  nncore==0.4.5
  pandas==2.2.3
  peft==0.14.0
- pydantic==2.10.6
  pysrt==1.1.2
  scikit-image==0.25.0
  scikit-learn==1.6.1
@@ -13,6 +11,12 @@ spaces==0.34.0
  termplotlib==0.3.9
  triton==3.0.0
  
+ # gradio 5.16.0 to 5.23.1 have wrong horizontal margins
+ gradio==5.15.0
+
+ # https://github.com/gradio-app/gradio/issues/10662
+ pydantic==2.10.6
+
  # our codebase contains necessary patches for 4.45.2
  transformers==4.45.2
  
setup.cfg CHANGED
@@ -7,7 +7,7 @@ split_before_expression_after_opening_paren = true
  [isort]
  line_length = 120
  multi_line_output = 0
- known_third_party = decord,deepspeed,gradio,huggingface_hub,nncore,numpy,pandas,peft,PIL,pysrt,safetensors,tabulate,termplotlib,torch,torchvision,transformers
+ known_third_party = decord,deepspeed,gradio,huggingface_hub,nncore,numpy,pandas,peft,PIL,pysrt,safetensors,spaces,tabulate,termplotlib,torch,torchvision,transformers
  no_lines_before = STDLIB,LOCALFOLDER
  default_section = FIRSTPARTY
  