Spaces:
Running
Running
File size: 9,886 Bytes
a904bab 32e3566 1cdb1e8 5928b40 a904bab 75795d2 a904bab b73b5a7 a904bab 84ea603 75795d2 a904bab 84ea603 0df35f9 84ea603 a904bab a9bad44 a904bab 84ea603 a904bab a9bad44 a904bab ff9e632 a9bad44 8fb60c2 a904bab 84ea603 a904bab 852662b a904bab |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 |
import os
# This module stores the constants used by the leaderboard.
# Model-metadata / aggregate-score column headers. Each list is later
# prepended to a list of per-dimension columns to form a COLUMN_NAMES* list.

# Headers used for the main table.
MODEL_INFO = [
    "Model Name (clickable)",
    "Sampled by",
    "Evaluated by",
    "Accessibility",
    "Date",
    "Total Score",
    "Quality Score",
    "Semantic Score",
    "Selected Score",
]

# Reduced header set used for the quality table.
MODEL_INFO_TAB_QUALITY = ["Model Name (clickable)", "Quality Score", "Selected Score"]

# Headers used for the I2V table; note "I2V Score" replaces "Semantic Score".
MODEL_INFO_TAB_I2V = [
    "Model Name (clickable)",
    "Sampled by",
    "Evaluated by",
    "Accessibility",
    "Date",
    "Total Score",
    "I2V Score",
    "Quality Score",
    "Selected Score",
]
# Names of the sixteen evaluation dimensions, in display order.
TASK_INFO = [
    "subject consistency",
    "background consistency",
    "temporal flickering",
    "motion smoothness",
    "dynamic degree",
    "aesthetic quality",
    "imaging quality",
    "object class",
    "multiple objects",
    "human action",
    "color",
    "spatial relationship",
    "scene",
    "appearance style",
    "temporal style",
    "overall consistency",
]

# Same entries as TASK_INFO. Built as an independent copy (not an alias) so
# that mutating one list never affects the other.
DEFAULT_INFO = list(TASK_INFO)
# Dimensions grouped under the "quality" aggregate.
QUALITY_LIST = [
    "subject consistency",
    "background consistency",
    "temporal flickering",
    "motion smoothness",
    "aesthetic quality",
    "imaging quality",
    "dynamic degree",
]

# Dimensions grouped under the "semantic" aggregate.
SEMANTIC_LIST = [
    "object class",
    "multiple objects",
    "human action",
    "color",
    "spatial relationship",
    "scene",
    "appearance style",
    "temporal style",
    "overall consistency",
]

# Per-dimension columns of the quality table: every QUALITY_LIST entry, in the
# same order, except "temporal flickering".
QUALITY_TAB = [name for name in QUALITY_LIST if name != "temporal flickering"]
# I2V-specific dimensions.
I2V_LIST = [
    "Video-Text Camera Motion",
    "Video-Image Subject Consistency",
    "Video-Image Background Consistency",
]

# Quality dimensions reported for I2V.
I2V_QUALITY_LIST = [
    "Subject Consistency",
    "Background Consistency",
    "Motion Smoothness",
    "Dynamic Degree",
    "Aesthetic Quality",
    "Imaging Quality",
    # "Temporal Flickering" is currently disabled for this list.
]

# Per-dimension columns of the I2V table: the I2V-specific dimensions followed
# by the I2V quality dimensions (a new list, independent of both inputs).
I2V_TAB = I2V_LIST + I2V_QUALITY_LIST
# Per-dimension weights. Every dimension weighs 1 unless overridden below;
# assigning to an existing key keeps its original position in the dict.
DIM_WEIGHT = dict.fromkeys(
    (
        "subject consistency",
        "background consistency",
        "temporal flickering",
        "motion smoothness",
        "aesthetic quality",
        "imaging quality",
        "dynamic degree",
        "object class",
        "multiple objects",
        "human action",
        "color",
        "spatial relationship",
        "scene",
        "appearance style",
        "temporal style",
        "overall consistency",
    ),
    1,
)
DIM_WEIGHT["dynamic degree"] = 0.5

# Per-dimension weights for the I2V dimensions (same scheme as above).
DIM_WEIGHT_I2V = dict.fromkeys(
    (
        "Video-Text Camera Motion",
        "Video-Image Subject Consistency",
        "Video-Image Background Consistency",
        "Subject Consistency",
        "Background Consistency",
        "Motion Smoothness",
        "Dynamic Degree",
        "Aesthetic Quality",
        "Imaging Quality",
        "Temporal Flickering",
    ),
    1,
)
DIM_WEIGHT_I2V["Video-Text Camera Motion"] = 0.1
DIM_WEIGHT_I2V["Dynamic Degree"] = 0.5

# Weights of the aggregate scores.
# NOTE(review): presumably used when combining the aggregate scores into the
# total score -- confirm against the code that consumes them.
SEMANTIC_WEIGHT = 1
QUALITY_WEIGHT = 4
I2V_WEIGHT = 1.0
I2V_QUALITY_WEIGHT = 1.0
# Per-column type tags for the main table: the first five entries are
# 'markdown' and every remaining entry is 'number'.
# NOTE(review): presumably passed as the column datatypes of the results table
# and expected to line up with COLUMN_NAMES -- confirm against the caller.
# The "TITILE" spelling is kept as-is because other modules may import it.
DATA_TITILE_TYPE = ['markdown', 'markdown', 'markdown', 'markdown', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
# Per-column type tags for the I2V table: a single leading 'markdown' entry
# followed only by 'number' entries.
# NOTE(review): unlike DATA_TITILE_TYPE only the first entry is 'markdown';
# verify the length and layout against COLUMN_NAMES_I2V.
I2V_TITILE_TYPE = ['markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
# Name of the submission dataset; also the local directory holding the CSVs.
SUBMISSION_NAME = "vstar_leaderboard_submission"

# URL of the submission dataset on the Hugging Face Hub.
# Built with an explicit "/" join: os.path.join() is meant for filesystem
# paths and inserts "\" on Windows, which would produce a broken URL.
SUBMISSION_URL = "/".join(("https://huggingface.co/datasets/V-STaR-Bench", SUBMISSION_NAME))

# Relative paths of the CSV files inside the submission directory.
# (Despite the *_DIR suffix, each value points at a file, not a directory.)
CSV_DIR = "./vstar_leaderboard_submission/results.csv"
QUALITY_DIR = "./vstar_leaderboard_submission/quality.csv"
I2V_DIR = "./vstar_leaderboard_submission/i2v_results.csv"
LONG_DIR = "./vstar_leaderboard_submission/long_debug.csv"
INFO_DIR = "./vstar_leaderboard_submission/model_info.csv"
# Full header rows: the model-info columns followed by the per-dimension
# columns of the corresponding table. Each is a freshly built list.
COLUMN_NAMES = [*MODEL_INFO, *TASK_INFO]
COLUMN_NAMES_QUALITY = [*MODEL_INFO_TAB_QUALITY, *QUALITY_TAB]
COLUMN_NAMES_I2V = [*MODEL_INFO_TAB_I2V, *I2V_TAB]
# Markdown/HTML shown at the top of the leaderboard page.
# Fixes relative to the previous text:
#   * the paper badge had an empty href; it now points at the arXiv entry
#     that the citation snippet in this module also references;
#   * the website badge was labelled "VBench" although it links to the V-STaR
#     site; shields.io renders "--" in a static badge path as a literal "-",
#     hence "V--STaR".
# NOTE(review): the "[](...)" link after the tagline has no link text and the
# "[V-STaR Contributors]()" link has no target -- supply them if intended.
# The "LEADERBORAD" spelling is kept because other modules may import it.
LEADERBORAD_INTRODUCTION = """# V-STaR Leaderboard
*"Can Video-LLMs “reason through a sequential spatio-temporal logic” in videos?"*
🏆 Welcome to the leaderboard of the **V-STaR**! 🎦 *A spatio-temporal reasoning benchmark for Video-LLMs* [](https://github.com/V-STaR-Bench/V-STaR)
<div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
<a href='https://arxiv.org/abs/2503.11495'><img src='https://img.shields.io/badge/cs.CV-Paper-b31b1b?logo=arxiv&logoColor=red'></a>
<a href='https://v-star-bench.github.io/'><img src='https://img.shields.io/badge/V--STaR-Website-green?logo=googlechrome&logoColor=green'></a>
</div>
- **Comprehensive Dimensions:** We evaluate Video-LLM’s spatio-temporal reasoning ability in answering questions explicitly in the context of “when”, “where”, and “what”.
- **Human Alignment:** We conducted extensive experiments and human annotations to validate robustness of V-STaR.
- **New Metrics:** We proposed to use Arithmetic Mean (AM) and modified logarithmic Geometric Mean (LGM) to measure the spatio-temporal reasoning capability of Video-LLMs. We calculate AM and LGM from the "Accuracy" of VQA, "m_tIoU" of Temporal grounding and "m_vIoU" of Spatial Grounding, and we get the mean AM (mAM) and mean LGM (mLGM) from the results of our proposed 2 RSTR question chains.
- **Valuable Insights:** V-STaR reveals a fundamental weakness in existing Video-LLMs regarding causal spatio-temporal reasoning.
**Join Leaderboard**: Please contact us to update your results.
**Credits**: This leaderboard is updated and maintained by the team of [V-STaR Contributors]().
"""
# Markdown shown on the submission page: how to produce the result files and
# how to get them onto the leaderboard.
# NOTE(review): both mailto targets read "[email protected]", which looks like
# an obfuscated placeholder rather than a real address -- confirm and restore
# the intended contact addresses.
SUBMIT_INTRODUCTION = """# Submit on V-STaR Benchmark Introduction
## 🎈
⚠️ Please note that you need to obtain the file `results/*.json` by running V-STaR in Github. You may conduct an [Offline Eval](https://github.com/V-STaR-Bench/V-STaR) before submitting.
⚠️ Then, please contact us to update your results via [email1](mailto:[email protected]) or [email2](mailto:[email protected]).
"""
# Text for the table introduction; currently just a single newline.
TABLE_INTRODUCTION = """
"""
# Longer description of the benchmark.
# NOTE(review): the sentence about designing a "Prompt Suite" and sampling
# "Generated Videos from a set of video generation models" does not match the
# rest of the description (a benchmark for Video-LLMs) -- it may be a leftover
# from a template; confirm with the authors before editing the text.
LEADERBORAD_INFO = """
V-STaR, a comprehensive spatio-temporal reasoning benchmark for video large language models (Video-LLMs). We construct a fine-grained reasoning dataset with coarse-to-fine CoT questions, enabling a structured evaluation of spatio-temporal reasoning. Specifically, we introduce a Reverse Spatio-Temporal Reasoning (RSTR) task to quantify models’ spatio-temporal reasoning ability. For each dimension and each content category, we carefully design a Prompt Suite as test cases, and sample Generated Videos from a set of video generation models. Experiments on V-STaR reveal although many models perform well on “what”, some struggle to ground their answers in time and location. This finding highlights a fundamental weakness in existing Video-LLMs regarding causal spatio-temporal reasoning and inspires research in improving trustworthy spatio-temporal understanding in future Video-LLMs.
"""
# Label displayed next to the citation snippet.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

# BibTeX entry for the V-STaR paper, one field per line, no trailing newline.
CITATION_BUTTON_TEXT = "\n".join(
    [
        "@misc{cheng2025vstarbenchmarkingvideollmsvideo,",
        "title={V-STaR: Benchmarking Video-LLMs on Video Spatio-Temporal Reasoning},",
        "author={Zixu Cheng and Jian Hu and Ziquan Liu and Chenyang Si and Wei Li and Shaogang Gong},",
        "year={2025},",
        "eprint={2503.11495},",
        "archivePrefix={arXiv},",
        "primaryClass={cs.CV},",
        "url={https://arxiv.org/abs/2503.11495},",
        "}",
    ]
)

# Disclaimer strings for the quality, I2V and long tables.
QUALITY_CLAIM_TEXT = (
    "We use all the videos on Sora website (https://openai.com/sora) "
    "for a preliminary evaluation, including the failure case videos "
    "Sora provided."
)
I2V_CLAIM_TEXT = (
    "Since the open-sourced SVD models do not accept text input during the "
    "I2V stage, we are unable to evaluate its `camera motion` in terms of "
    "`video-text consistency`. The total score is calculated based on all "
    "dimensions except `camera motion`."
)
LONG_CLAIM_TEXT = ""
# Per-dimension "Min"/"Max" bounds, keyed by dimension name.
# Each row below is (dimension, Min, Max); dict order follows row order.
# NOTE(review): presumably used for min-max normalisation of raw scores --
# confirm against the code that consumes these tables.
NORMALIZE_DIC = {
    dimension: {"Min": lower, "Max": upper}
    for dimension, lower, upper in (
        ("subject consistency", 0.1462, 1.0),
        ("background consistency", 0.2615, 1.0),
        ("temporal flickering", 0.6293, 1.0),
        ("motion smoothness", 0.706, 0.9975),
        ("dynamic degree", 0.0, 1.0),
        ("aesthetic quality", 0.0, 1.0),
        ("imaging quality", 0.0, 1.0),
        ("object class", 0.0, 1.0),
        ("multiple objects", 0.0, 1.0),
        ("human action", 0.0, 1.0),
        ("color", 0.0, 1.0),
        ("spatial relationship", 0.0, 1.0),
        ("scene", 0.0, 0.8222),
        ("appearance style", 0.0009, 0.2855),
        ("temporal style", 0.0, 0.364),
        ("overall consistency", 0.0, 0.364),
    )
}

# Same structure for the I2V dimensions.
NORMALIZE_DIC_I2V = {
    dimension: {"Min": lower, "Max": upper}
    for dimension, lower, upper in (
        ("Video-Text Camera Motion", 0.0, 1.0),
        ("Video-Image Subject Consistency", 0.1462, 1.0),
        ("Video-Image Background Consistency", 0.2615, 1.0),
        ("Subject Consistency", 0.1462, 1.0),
        ("Background Consistency", 0.2615, 1.0),
        ("Motion Smoothness", 0.7060, 0.9975),
        ("Dynamic Degree", 0.0, 1.0),
        ("Aesthetic Quality", 0.0, 1.0),
        ("Imaging Quality", 0.0, 1.0),
        ("Temporal Flickering", 0.6293, 1.0),
    )
}
|