import os
# Constants used by the V-STaR leaderboard app.
MODEL_INFO = [
    "Model Name (clickable)",
    "Sampled by",
    "Evaluated by",
    "Accessibility",
    "Date",
    "Total Score",
    "Quality Score",
    "Semantic Score",
    "Selected Score",
    ]

MODEL_INFO_TAB_QUALITY = [
    "Model Name (clickable)",
    "Quality Score",
    "Selected Score"
]

MODEL_INFO_TAB_I2V = [
    "Model Name (clickable)",
    "Sampled by",
    "Evaluated by",
    "Accessibility",
    "Date",
    "Total Score",
    "I2V Score",
    "Quality Score",
    "Selected Score"
]

TASK_INFO = [
    "subject consistency",
    "background consistency",
    "temporal flickering",
    "motion smoothness",
    "dynamic degree",
    "aesthetic quality",
    "imaging quality",
    "object class",
    "multiple objects",
    "human action",
    "color",
    "spatial relationship",
    "scene",
    "appearance style",
    "temporal style",
    "overall consistency"
]

DEFAULT_INFO = [
    "subject consistency",
    "background consistency",
    "temporal flickering",
    "motion smoothness",
    "dynamic degree",
    "aesthetic quality",
    "imaging quality",
    "object class",
    "multiple objects",
    "human action",
    "color",
    "spatial relationship",
    "scene",
    "appearance style",
    "temporal style",
    "overall consistency"
    ]

QUALITY_LIST = [ 
    "subject consistency",
    "background consistency",
    "temporal flickering",
    "motion smoothness",
    "aesthetic quality",
    "imaging quality",
    "dynamic degree",]

SEMANTIC_LIST = [
    "object class",
    "multiple objects",
    "human action",
    "color",
    "spatial relationship",
    "scene",
    "appearance style",
    "temporal style",
    "overall consistency"
]

QUALITY_TAB = [ 
    "subject consistency",
    "background consistency",
    "motion smoothness",
    "aesthetic quality",
    "imaging quality",
    "dynamic degree",]

I2V_LIST = [
    "Video-Text Camera Motion",
    "Video-Image Subject Consistency",
    "Video-Image Background Consistency",
]

I2V_QUALITY_LIST = [
    "Subject Consistency",
    "Background Consistency",
    "Motion Smoothness",
    "Dynamic Degree",
    "Aesthetic Quality",
    "Imaging Quality",
    # "Temporal Flickering"
]

I2V_TAB = [
    "Video-Text Camera Motion",
    "Video-Image Subject Consistency",
    "Video-Image Background Consistency",
    "Subject Consistency",
    "Background Consistency",
    "Motion Smoothness",
    "Dynamic Degree",
    "Aesthetic Quality",
    "Imaging Quality",
    # "Temporal Flickering"
]

DIM_WEIGHT = {
    "subject consistency": 1,
    "background consistency": 1,
    "temporal flickering": 1,
    "motion smoothness": 1,
    "aesthetic quality": 1,
    "imaging quality": 1,
    "dynamic degree": 0.5,
    "object class": 1,
    "multiple objects": 1,
    "human action": 1,
    "color": 1,
    "spatial relationship": 1,
    "scene": 1,
    "appearance style": 1,
    "temporal style": 1,
    "overall consistency": 1
}

DIM_WEIGHT_I2V = {
    "Video-Text Camera Motion": 0.1,
    "Video-Image Subject Consistency": 1,
    "Video-Image Background Consistency": 1,
    "Subject Consistency": 1,
    "Background Consistency": 1,
    "Motion Smoothness": 1,
    "Dynamic Degree": 0.5,
    "Aesthetic Quality": 1,
    "Imaging Quality": 1,
    "Temporal Flickering": 1
}

SEMANTIC_WEIGHT = 1
QUALITY_WEIGHT = 4
I2V_WEIGHT = 1.0
I2V_QUALITY_WEIGHT = 1.0

DATA_TITILE_TYPE = ['markdown'] * 5 + ['number'] * 20
I2V_TITILE_TYPE = ['markdown'] + ['number'] * 20

SUBMISSION_NAME = "vstar_leaderboard_submission"
SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/V-STaR-Bench", SUBMISSION_NAME)
CSV_DIR = "./vstar_leaderboard_submission/results.csv"
QUALITY_DIR = "./vstar_leaderboard_submission/quality.csv"
I2V_DIR = "./vstar_leaderboard_submission/i2v_results.csv"
LONG_DIR = "./vstar_leaderboard_submission/long_debug.csv"
INFO_DIR = "./vstar_leaderboard_submission/model_info.csv"
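
# Illustrative sketch, not the app's actual loader: the CSVs above are expected
# to be read with pandas (an assumed dependency of the leaderboard app), with
# the columns of `results.csv` lining up with COLUMN_NAMES defined below.
# `_example_load_results` is a hypothetical helper name.
def _example_load_results():
    import pandas as pd
    return pd.read_csv(CSV_DIR)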

COLUMN_NAMES = MODEL_INFO + TASK_INFO
COLUMN_NAMES_QUALITY = MODEL_INFO_TAB_QUALITY + QUALITY_TAB
COLUMN_NAMES_I2V = MODEL_INFO_TAB_I2V + I2V_TAB
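
# Sketch of assumed usage (hypothetical function, not taken from the app code):
# each COLUMN_NAMES* list pairs one-to-one with a *_TITILE_TYPE list above, so
# the two can be passed straight to a Gradio Dataframe, where "markdown"
# columns render links and "number" columns sort numerically.
def _example_build_table():
    import gradio as gr  # assumed dependency of the leaderboard app
    return gr.Dataframe(
        headers=COLUMN_NAMES,
        datatype=DATA_TITILE_TYPE,
        interactive=False,
    )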

LEADERBORAD_INTRODUCTION = """# V-STaR Leaderboard

    *"Can Video-LLMs reason through a sequential spatio-temporal logic in videos?"*

    🏆 Welcome to the leaderboard of **V-STaR**! 🎦 *A spatio-temporal reasoning benchmark for Video-LLMs* [![Code](https://img.shields.io/github/stars/V-STaR-Bench/V-STaR.svg?style=social&label=Official)](https://github.com/V-STaR-Bench/V-STaR)
    <div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
    <a href='https://arxiv.org/abs/2503.11495'><img src='https://img.shields.io/badge/cs.CV-Paper-b31b1b?logo=arxiv&logoColor=red'></a>
    <a href='https://v-star-bench.github.io/'><img src='https://img.shields.io/badge/V--STaR-Website-green?logo=googlechrome&logoColor=green'></a>
    </div>

    - **Comprehensive Dimensions:** We evaluate Video-LLMs' spatio-temporal reasoning ability to answer questions explicitly in the context of "when", "where", and "what".
    - **Human Alignment:** We conducted extensive experiments and human annotations to validate the robustness of V-STaR.
    - **New Metrics:** We propose using the Arithmetic Mean (AM) and a modified logarithmic Geometric Mean (LGM) to measure the spatio-temporal reasoning capability of Video-LLMs. AM and LGM are computed from the "Accuracy" of VQA, the "m_tIoU" of temporal grounding, and the "m_vIoU" of spatial grounding, and we report the mean AM (mAM) and mean LGM (mLGM) over our two proposed RSTR question chains.
    - **Valuable Insights:** V-STaR reveals a fundamental weakness in existing Video-LLMs regarding causal spatio-temporal reasoning.

    **Join the Leaderboard**: Please contact us to update your results.

    **Credits**: This leaderboard is updated and maintained by the team of [V-STaR Contributors]().
    """

SUBMIT_INTRODUCTION = """# Submitting to the V-STaR Benchmark
## 🎈
⚠️ Please note that you need to obtain the `results/*.json` files by running V-STaR from its GitHub repository. You may conduct an [offline evaluation](https://github.com/V-STaR-Bench/V-STaR) before submitting.

⚠️ Then, please contact us to update your results via [email1](mailto:[email protected]) or [email2](mailto:[email protected]).
"""

TABLE_INTRODUCTION = """
    """

LEADERBORAD_INFO = """
    V-STaR is a comprehensive spatio-temporal reasoning benchmark for video large language models (Video-LLMs). We construct a fine-grained reasoning dataset with coarse-to-fine CoT questions, enabling a structured evaluation of spatio-temporal reasoning. Specifically, we introduce a Reverse Spatio-Temporal Reasoning (RSTR) task to quantify models' spatio-temporal reasoning ability. For each dimension and each content category, we carefully design a Prompt Suite as test cases and sample Generated Videos from a set of video generation models. Experiments on V-STaR reveal that although many models perform well on "what", some struggle to ground their answers in time and location. This finding highlights a fundamental weakness in existing Video-LLMs regarding causal spatio-temporal reasoning and inspires research on improving trustworthy spatio-temporal understanding in future Video-LLMs.
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@misc{cheng2025vstarbenchmarkingvideollmsvideo,
      title={V-STaR: Benchmarking Video-LLMs on Video Spatio-Temporal Reasoning}, 
      author={Zixu Cheng and Jian Hu and Ziquan Liu and Chenyang Si and Wei Li and Shaogang Gong},
      year={2025},
      eprint={2503.11495},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2503.11495}, 
}"""

QUALITY_CLAIM_TEXT = "We use all the videos on the Sora website (https://openai.com/sora) for a preliminary evaluation, including the failure-case videos Sora provided."

I2V_CLAIM_TEXT = "Since the open-sourced SVD models do not accept text input during the I2V stage, we are unable to evaluate their `camera motion` in terms of `video-text consistency`. The total score is calculated based on all dimensions except `camera motion`."

LONG_CLAIM_TEXT = "" 

NORMALIZE_DIC = {
  "subject consistency": {"Min": 0.1462, "Max": 1.0},
  "background consistency": {"Min": 0.2615, "Max": 1.0},
  "temporal flickering": {"Min": 0.6293, "Max": 1.0},
  "motion smoothness": {"Min": 0.706, "Max": 0.9975},
  "dynamic degree": {"Min": 0.0, "Max": 1.0},
  "aesthetic quality": {"Min": 0.0, "Max": 1.0},
  "imaging quality": {"Min": 0.0, "Max": 1.0},
  "object class": {"Min": 0.0, "Max": 1.0},
  "multiple objects": {"Min": 0.0, "Max": 1.0},
  "human action": {"Min": 0.0, "Max": 1.0},
  "color": {"Min": 0.0, "Max": 1.0},
  "spatial relationship": {"Min": 0.0, "Max": 1.0},
  "scene": {"Min": 0.0, "Max": 0.8222},
  "appearance style": {"Min": 0.0009, "Max": 0.2855},
  "temporal style": {"Min": 0.0, "Max": 0.364},
  "overall consistency": {"Min": 0.0, "Max": 0.364}
}

NORMALIZE_DIC_I2V = {
    "Video-Text Camera Motion": {"Min": 0.0, "Max": 1.0},
    "Video-Image Subject Consistency": {"Min": 0.1462, "Max": 1.0},
    "Video-Image Background Consistency": {"Min": 0.2615, "Max": 1.0},
    "Subject Consistency": {"Min": 0.1462, "Max": 1.0},
    "Background Consistency": {"Min": 0.2615, "Max": 1.0},
    "Motion Smoothness": {"Min": 0.7060, "Max": 0.9975},
    "Dynamic Degree": {"Min": 0.0, "Max": 1.0},
    "Aesthetic Quality": {"Min": 0.0, "Max": 1.0},
    "Imaging Quality": {"Min": 0.0, "Max": 1.0},
    "Temporal Flickering": {"Min": 0.6293, "Max": 1.0}
}
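
# Illustrative sketch (hypothetical helper, not the app's actual scoring code)
# of how the constants above are typically combined: each raw dimension result
# is min-max normalized with NORMALIZE_DIC, averaged with DIM_WEIGHT, and the
# quality/semantic means are blended 4:1 via QUALITY_WEIGHT and SEMANTIC_WEIGHT.
def _example_total_score(raw_scores):
    def normalize(dim, value):
        bounds = NORMALIZE_DIC[dim]
        return (value - bounds["Min"]) / (bounds["Max"] - bounds["Min"])

    def weighted_mean(dims):
        total = sum(DIM_WEIGHT[d] * normalize(d, raw_scores[d]) for d in dims)
        return total / sum(DIM_WEIGHT[d] for d in dims)

    quality = weighted_mean(QUALITY_LIST)
    semantic = weighted_mean(SEMANTIC_LIST)
    return (QUALITY_WEIGHT * quality + SEMANTIC_WEIGHT * semantic) / (
        QUALITY_WEIGHT + SEMANTIC_WEIGHT
    )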