import os
# Constants used by the V-STaR leaderboard app.
MODEL_INFO = [
    "Model Name (clickable)",
    "Sampled by",
    "Evaluated by",
    "Accessibility",
    "Date",
    "Total Score",
    "Quality Score",
    "Semantic Score",
    "Selected Score",
    ]

MODEL_INFO_TAB_QUALITY = [
    "Model Name (clickable)",
    "Quality Score",
    "Selected Score"
]

MODEL_INFO_TAB_I2V = [
    "Model Name (clickable)",
    "Sampled by",
    "Evaluated by",
    "Accessibility",
    "Date",
    "Total Score",
    "I2V Score",
    "Quality Score",
    "Selected Score"
]

TASK_INFO = [
    "subject consistency",
    "background consistency",
    "temporal flickering",
    "motion smoothness",
    "dynamic degree",
    "aesthetic quality",
    "imaging quality",
    "object class",
    "multiple objects",
    "human action",
    "color",
    "spatial relationship",
    "scene",
    "appearance style",
    "temporal style",
    "overall consistency"
]

DEFAULT_INFO = [
    "subject consistency",
    "background consistency",
    "temporal flickering",
    "motion smoothness",
    "dynamic degree",
    "aesthetic quality",
    "imaging quality",
    "object class",
    "multiple objects",
    "human action",
    "color",
    "spatial relationship",
    "scene",
    "appearance style",
    "temporal style",
    "overall consistency"
    ]

QUALITY_LIST = [ 
    "subject consistency",
    "background consistency",
    "temporal flickering",
    "motion smoothness",
    "aesthetic quality",
    "imaging quality",
    "dynamic degree",]

SEMANTIC_LIST = [
    "object class",
    "multiple objects",
    "human action",
    "color",
    "spatial relationship",
    "scene",
    "appearance style",
    "temporal style",
    "overall consistency"
]

QUALITY_TAB = [ 
    "subject consistency",
    "background consistency",
    "motion smoothness",
    "aesthetic quality",
    "imaging quality",
    "dynamic degree",]

I2V_LIST = [
    "Video-Text Camera Motion",
    "Video-Image Subject Consistency",
    "Video-Image Background Consistency",
]

I2V_QUALITY_LIST = [
    "Subject Consistency",
    "Background Consistency",
    "Motion Smoothness",
    "Dynamic Degree",
    "Aesthetic Quality",
    "Imaging Quality",
    # "Temporal Flickering"
]

I2V_TAB = [
    "Video-Text Camera Motion",
    "Video-Image Subject Consistency",
    "Video-Image Background Consistency",
    "Subject Consistency",
    "Background Consistency",
    "Motion Smoothness",
    "Dynamic Degree",
    "Aesthetic Quality",
    "Imaging Quality",
    # "Temporal Flickering"
]

DIM_WEIGHT = {
    "subject consistency": 1,
    "background consistency": 1,
    "temporal flickering": 1,
    "motion smoothness": 1,
    "aesthetic quality": 1,
    "imaging quality": 1,
    "dynamic degree": 0.5,
    "object class": 1,
    "multiple objects": 1,
    "human action": 1,
    "color": 1,
    "spatial relationship": 1,
    "scene": 1,
    "appearance style": 1,
    "temporal style": 1,
    "overall consistency": 1
}

DIM_WEIGHT_I2V = {
    "Video-Text Camera Motion": 0.1,
    "Video-Image Subject Consistency": 1,
    "Video-Image Background Consistency": 1,
    "Subject Consistency": 1,
    "Background Consistency": 1,
    "Motion Smoothness": 1,
    "Dynamic Degree": 0.5,
    "Aesthetic Quality": 1,
    "Imaging Quality": 1,
    "Temporal Flickering": 1
}

SEMANTIC_WEIGHT = 1
QUALITY_WEIGHT = 4
I2V_WEIGHT = 1.0
I2V_QUALITY_WEIGHT = 1.0

DATA_TITILE_TYPE = ['markdown'] * 5 + ['number'] * 20
I2V_TITILE_TYPE = ['markdown'] + ['number'] * 20

SUBMISSION_NAME = "vstar_leaderboard_submission"
SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/V-STaR-Bench", SUBMISSION_NAME)
CSV_DIR = "./vstar_leaderboard_submission/results.csv"
QUALITY_DIR = "./vstar_leaderboard_submission/quality.csv"
I2V_DIR = "./vstar_leaderboard_submission/i2v_results.csv"
LONG_DIR = "./vstar_leaderboard_submission/long_debug.csv"
INFO_DIR = "./vstar_leaderboard_submission/model_info.csv"
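
# Illustrative sketch, not the app's actual loader: the CSVs above are expected
# to be read with pandas (an assumed dependency of the leaderboard app), with
# the columns of `results.csv` lining up with COLUMN_NAMES defined below.
# `_example_load_results` is a hypothetical helper name.
def _example_load_results():
    import pandas as pd
    return pd.read_csv(CSV_DIR)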

COLUMN_NAMES = MODEL_INFO + TASK_INFO
COLUMN_NAMES_QUALITY = MODEL_INFO_TAB_QUALITY + QUALITY_TAB
COLUMN_NAMES_I2V = MODEL_INFO_TAB_I2V + I2V_TAB
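
# Sketch of assumed usage (hypothetical function, not taken from the app code):
# each COLUMN_NAMES* list pairs one-to-one with a *_TITILE_TYPE list above, so
# the two can be passed straight to a Gradio Dataframe, where "markdown"
# columns render links and "number" columns sort numerically.
def _example_build_table():
    import gradio as gr  # assumed dependency of the leaderboard app
    return gr.Dataframe(
        headers=COLUMN_NAMES,
        datatype=DATA_TITILE_TYPE,
        interactive=False,
    )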

LEADERBORAD_INTRODUCTION = """# V-STaR Leaderboard

    *"Can Video-LLMs reason through a sequential spatio-temporal logic in videos?"*

    🏆 Welcome to the leaderboard of **V-STaR**! 🎦 *A spatio-temporal reasoning benchmark for Video-LLMs* [![Code](https://img.shields.io/github/stars/V-STaR-Bench/V-STaR.svg?style=social&label=Official)](https://github.com/V-STaR-Bench/V-STaR)
    <div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
    <a href='https://arxiv.org/abs/2503.11495'><img src='https://img.shields.io/badge/cs.CV-Paper-b31b1b?logo=arxiv&logoColor=red'></a>
    <a href='https://v-star-bench.github.io/'><img src='https://img.shields.io/badge/V--STaR-Website-green?logo=googlechrome&logoColor=green'></a>
    </div>

    - **Comprehensive Dimensions:** We evaluate Video-LLMs' spatio-temporal reasoning ability to answer questions explicitly in the context of "when", "where", and "what".
    - **Human Alignment:** We conducted extensive experiments and human annotations to validate the robustness of V-STaR.
    - **New Metrics:** We propose using the Arithmetic Mean (AM) and a modified logarithmic Geometric Mean (LGM) to measure the spatio-temporal reasoning capability of Video-LLMs. AM and LGM are computed from the "Accuracy" of VQA, the "m_tIoU" of temporal grounding, and the "m_vIoU" of spatial grounding, and we report the mean AM (mAM) and mean LGM (mLGM) over our two proposed RSTR question chains.
    - **Valuable Insights:** V-STaR reveals a fundamental weakness in existing Video-LLMs regarding causal spatio-temporal reasoning.

    **Join the Leaderboard**: Please contact us to update your results.

    **Credits**: This leaderboard is updated and maintained by the team of [V-STaR Contributors]().
    """

SUBMIT_INTRODUCTION = """# Submitting to the V-STaR Benchmark
## 🎈
⚠️ Please note that you need to obtain the `results/*.json` files by running V-STaR from its GitHub repository. You may conduct an [offline evaluation](https://github.com/V-STaR-Bench/V-STaR) before submitting.

⚠️ Then, please contact us to update your results via [email1](mailto:[email protected]) or [email2](mailto:[email protected]).
"""

TABLE_INTRODUCTION = """
    """

LEADERBORAD_INFO = """
    V-STaR is a comprehensive spatio-temporal reasoning benchmark for video large language models (Video-LLMs). We construct a fine-grained reasoning dataset with coarse-to-fine CoT questions, enabling a structured evaluation of spatio-temporal reasoning. Specifically, we introduce a Reverse Spatio-Temporal Reasoning (RSTR) task to quantify models' spatio-temporal reasoning ability. For each dimension and each content category, we carefully design a Prompt Suite as test cases and sample Generated Videos from a set of video generation models. Experiments on V-STaR reveal that although many models perform well on "what", some struggle to ground their answers in time and location. This finding highlights a fundamental weakness in existing Video-LLMs regarding causal spatio-temporal reasoning and inspires research on improving trustworthy spatio-temporal understanding in future Video-LLMs.
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@misc{cheng2025vstarbenchmarkingvideollmsvideo,
      title={V-STaR: Benchmarking Video-LLMs on Video Spatio-Temporal Reasoning}, 
      author={Zixu Cheng and Jian Hu and Ziquan Liu and Chenyang Si and Wei Li and Shaogang Gong},
      year={2025},
      eprint={2503.11495},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2503.11495}, 
}"""

QUALITY_CLAIM_TEXT = "We use all the videos on the Sora website (https://openai.com/sora) for a preliminary evaluation, including the failure-case videos Sora provided."

I2V_CLAIM_TEXT = "Since the open-sourced SVD models do not accept text input during the I2V stage, we are unable to evaluate their `camera motion` in terms of `video-text consistency`. The total score is calculated based on all dimensions except `camera motion`."

LONG_CLAIM_TEXT = "" 

NORMALIZE_DIC = {
  "subject consistency": {"Min": 0.1462, "Max": 1.0},
  "background consistency": {"Min": 0.2615, "Max": 1.0},
  "temporal flickering": {"Min": 0.6293, "Max": 1.0},
  "motion smoothness": {"Min": 0.706, "Max": 0.9975},
  "dynamic degree": {"Min": 0.0, "Max": 1.0},
  "aesthetic quality": {"Min": 0.0, "Max": 1.0},
  "imaging quality": {"Min": 0.0, "Max": 1.0},
  "object class": {"Min": 0.0, "Max": 1.0},
  "multiple objects": {"Min": 0.0, "Max": 1.0},
  "human action": {"Min": 0.0, "Max": 1.0},
  "color": {"Min": 0.0, "Max": 1.0},
  "spatial relationship": {"Min": 0.0, "Max": 1.0},
  "scene": {"Min": 0.0, "Max": 0.8222},
  "appearance style": {"Min": 0.0009, "Max": 0.2855},
  "temporal style": {"Min": 0.0, "Max": 0.364},
  "overall consistency": {"Min": 0.0, "Max": 0.364}
}

NORMALIZE_DIC_I2V = {
    "Video-Text Camera Motion": {"Min": 0.0, "Max": 1.0},
    "Video-Image Subject Consistency": {"Min": 0.1462, "Max": 1.0},
    "Video-Image Background Consistency": {"Min": 0.2615, "Max": 1.0},
    "Subject Consistency": {"Min": 0.1462, "Max": 1.0},
    "Background Consistency": {"Min": 0.2615, "Max": 1.0},
    "Motion Smoothness": {"Min": 0.7060, "Max": 0.9975},
    "Dynamic Degree": {"Min": 0.0, "Max": 1.0},
    "Aesthetic Quality": {"Min": 0.0, "Max": 1.0},
    "Imaging Quality": {"Min": 0.0, "Max": 1.0},
    "Temporal Flickering": {"Min": 0.6293, "Max": 1.0}
}
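
# Illustrative sketch (hypothetical helper, not the app's actual scoring code)
# of how the constants above are typically combined: each raw dimension result
# is min-max normalized with NORMALIZE_DIC, averaged with DIM_WEIGHT, and the
# quality/semantic means are blended 4:1 via QUALITY_WEIGHT and SEMANTIC_WEIGHT.
def _example_total_score(raw_scores):
    def normalize(dim, value):
        bounds = NORMALIZE_DIC[dim]
        return (value - bounds["Min"]) / (bounds["Max"] - bounds["Min"])

    def weighted_mean(dims):
        total = sum(DIM_WEIGHT[d] * normalize(d, raw_scores[d]) for d in dims)
        return total / sum(DIM_WEIGHT[d] for d in dims)

    quality = weighted_mean(QUALITY_LIST)
    semantic = weighted_mean(SEMANTIC_LIST)
    return (QUALITY_WEIGHT * quality + SEMANTIC_WEIGHT * semantic) / (
        QUALITY_WEIGHT + SEMANTIC_WEIGHT
    )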