File size: 9,238 Bytes
2016488
 
 
 
 
ba1590f
 
2016488
78efe8b
2016488
 
78efe8b
 
2016488
 
15e5e27
 
6ae8aba
2016488
15e5e27
6ae8aba
2016488
ba1590f
392edcb
2016488
 
df126cf
ba1590f
 
 
df126cf
ba1590f
 
df126cf
8ad4dd0
 
 
576cf58
 
 
 
df126cf
78efe8b
8ad4dd0
c987e2d
 
2016488
 
576cf58
 
2016488
78efe8b
 
 
fd717be
78efe8b
fd717be
2016488
 
 
f8121ba
 
 
 
 
 
2016488
 
 
c987e2d
f2595ef
c987e2d
15e5e27
6ae8aba
 
29ea595
 
 
 
 
6ae8aba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15e5e27
0702221
 
 
 
 
 
c987e2d
2016488
 
4b8c00a
bd24c06
5380b14
2b86120
 
 
 
4b8c00a
 
2b86120
4b8c00a
 
15e5e27
 
 
 
 
 
 
 
 
 
c304fa4
2b86120
15e5e27
2016488
6ae8aba
59091a2
4ed87c1
15e5e27
491e896
75cdf4b
2016488
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba1590f
 
 
2016488
 
 
 
 
 
 
 
9cfc538
2016488
 
 
ba1590f
 
2016488
 
 
ba1590f
2016488
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import pandas as pd
import gradio as gr
import csv
import json
import os
import requests
import io
import shutil
import pprint as pp
from huggingface_hub import Repository

from datasets import DATASETS

# Hugging Face access token read from the environment; None when HF_TOKEN is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Table layout: the fixed descriptive columns, followed by the score columns.
# "V1-Overall" is the column get_df() sorts by to assign ranks.
BASE_COLS = ["Rank", "Models", "Model Size(B)", "Data Source"]
TASKS_V1 = ["V1-Overall", "I-CLS", "I-QA", "I-RET", "I-VG"]
COLUMN_NAMES = BASE_COLS + TASKS_V1

# One type tag per entry of COLUMN_NAMES, in the same order: four tags for
# BASE_COLS, then 'number' for every column in TASKS_V1.
# NOTE(review): presumably consumed as the Gradio table's datatype list — confirm at the call site.
DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown'] + \
                    ['number'] * len(TASKS_V1)

# Markdown introduction for the leaderboard: title, benchmark description
# (MMEB-V1 and MMEB-V2), a deprecation note for the V1 board, and a row of links.
# NOTE(review): presumably rendered at the top of the Gradio page — confirm at the call site.
LEADERBOARD_INTRODUCTION = """
# 📊 **MMEB LEADERBOARD (VLM2Vec)**

## Introduction
We introduce a novel benchmark, **MMEB-V1 (Massive Multimodal Embedding Benchmark)**, 
which includes 36 datasets spanning four meta-task categories: classification, visual question answering, retrieval, and visual grounding. MMEB provides a comprehensive framework for training
and evaluating embedding models across various combinations of text and image modalities. 
All tasks are reformulated as ranking tasks, where the model follows instructions, processes a query, and selects the correct target from a set of candidates. The query and target can be an image, text,
or a combination of both. MMEB-V1 is divided into 20 in-distribution datasets, which can be used for
training, and 16 out-of-distribution datasets, reserved for evaluation.

Building upon on **MMEB-V1**, **MMEB-V2** expands the evaluation scope to include five new tasks: four video-based tasks 
— Video Retrieval, Moment Retrieval, Video Classification, and Video Question Answering — and one task focused on visual documents, Visual Document Retrieval. 
This comprehensive suite enables robust evaluation of multimodal embedding models across static, temporal, and structured visual data settings.

**IMPORTANT NOTES Regarding the old MMEB-V1 leaderboard:**
MMEB-V1 is now part of the Image section of MMEB-V2, and the results on its leaderboard have been merged into the MMEB-V2 Image leaderboard. 
For researchers relying on MMEB-V1, we recommend transitioning to MMEB-V2 for more comprehensive evaluation metrics and support. 

| [**📈Overview**](https://tiger-ai-lab.github.io/VLM2Vec/) | [**Github**](https://github.com/TIGER-AI-Lab/VLM2Vec) 
| [**📖MMEB-V2/VLM2Vec-V2 Paper**](https://arxiv.org/abs/2507.04590) 
| [**📖MMEB-V1/VLM2Vec-V1 Paper**](https://arxiv.org/abs/2410.05160) 
| [**🤗Hugging Face**](https://huggingface.co/datasets/TIGER-Lab/MMEB-V2) 
| [**Discord**](https://discord.gg/njyKubdtry) |
"""

# Markdown notice stating that this MMEB-V1 table is deprecated in favour of
# the MMEB-V2 Image section. The trailing "\n" is an escape, so the text ends
# with a newline.
TABLE_INTRODUCTION = """***Important Notes:***
This is the old MMEB-V1 leaderboard, which is now deprecated and going to be removed from this web page soon. MMEB-V1 is now the Image section of MMEB-V2, and the results on this leaderboard have been integrated into MMEB-V2 Image section. For researchers relying on MMEB-V1, we recommend transitioning to MMEB-V2 for more comprehensive evaluation metrics and support. Thank you for your collaborations and understanding! \n"""

# Markdown section that embeds a pretty-printed dump of DATASETS inside a
# fenced python block. The f-string is evaluated once, at import time, so
# later changes to DATASETS are not reflected here.
LEADERBOARD_INFO = f"""
## Dataset Overview
This is the dictionary of all datasets used in our code. Please make sure all datasets' scores are included in your submission. \n
```python
{pp.pformat(DATASETS)}
```
"""

# Label and BibTeX entry for citing the VLM2Vec paper.
# The entry is a raw string, so its braces and any backslashes are kept literally.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@article{jiang2024vlm2vec,
  title={VLM2Vec: Training Vision-Language Models for Massive Multimodal Embedding Tasks},
  author={Jiang, Ziyan and Meng, Rui and Yang, Xinyi and Yavuz, Semih and Zhou, Yingbo and Chen, Wenhu},
  journal={arXiv preprint arXiv:2410.05160},
  year={2024}
}"""

# Markdown instructions for submitting results: where to find the evaluation
# scripts, the expected JSON score-sheet layout, and how to open a pull request.
# The backslashes before the parentheses are written as "\\(" / "\\)" because
# this is a non-raw string: a bare "\(" is an invalid escape sequence (a
# SyntaxWarning on Python 3.12+). The text produced at runtime is unchanged.
SUBMIT_INTRODUCTION = """# Submit on MMEB Leaderboard Introduction

## Please refer to the [**GitHub page**](https://github.com/TIGER-AI-Lab/VLM2Vec) for detailed instructions about evaluating your model. \n
## After running the pipelines, please use the script we provided \\(e.g., [report_score_v2.py](https://github.com/TIGER-AI-Lab/VLM2Vec/blob/main/experiments/report_score_v2.py)\\) to generate the unified score sheet which is like the following format. 
## ⚠️ Please note that you need to submit the JSON file with the following format:
```json
{
    "metadata": {
        "model_name": "<Model Name>",
        "url": "<Model URL>" or null,
        "model_size": <Model Size> or null,
        "data_source": "Self-Reported",
        ... ...
    },
    "metrics": {
        "image": {
            "ImageNet-1K": {
                "hit@1": 0.5,
                "ndcg@1": 0.5,
                ... ...
            }, 
            "N24News": {
                ... ...
            }, 
            ... ...
        }, 
        "visdoc": {
            "ViDoRe": {
                "hit@1": 0.5,
                "ndcg@1": 0.5,
                ... ...
            }, 
            ... ...
        },
        "video": {
            "DiDeMo": {
                "hit@1": 0.5,
                "ndcg@1": 0.5,
                ... ...
            }, 
            "MSR-VTT": {
                ... ...
            }, 
            ... ...
        }
    }
}
```
## ⚠️ To submit, create a pull request and upload the generated JSON file to the ***scores*** folder, then inform us on [our discord server](https://discord.gg/njyKubdtry), or send us an email at [email protected], including your model's information.\n
Please email us or leave a simple message (simply @ us) in the PR to indicate that you are ready to merge your PR. We will not merge your PR without informing us since we might think you are still editing and not yet ready to merge. 
We will review your submission and update the leaderboard accordingly. \n\n
## Special Instructions for submitting to MMEB Image (Previously MMEB-V1) Leaderboard
We understand that some researchers want to exclusively submit to the Image leaderboard. To do so, just run the 36 image datasets only and simply ignore the other datasets. The leaderboard will automatically assign a 0 to the missing datasets. 
We might be able to hide your model from the other leaderboards in the next leaderboard updates, but for now your model might have a lower rank on the overall leaderboard. 
We highly recommend joining our [discord server](https://discord.gg/njyKubdtry), which provides a convenient way to stay informed with latest updates, or share any feedback you have for improving the leaderboard experience. We appreciate your contributions to the MMEB community!
"""

def create_hyperlinked_names(df):
    """Return a copy of ``df`` whose 'Models' cells link to the model's URL.

    Args:
        df: DataFrame with a 'Models' column (display names) and a 'URL' column.

    Returns:
        A new DataFrame; the input is left unmodified. Each 'Models' cell
        becomes ``<a href="URL">name</a>`` when the row has a non-empty string
        URL, and is kept as the plain name otherwise.

    Raises:
        KeyError: if 'Models' or 'URL' is missing from ``df``.
    """
    def convert_url(url, model_name):
        # A missing URL may be None or NaN. NaN is truthy, so a bare
        # truthiness test would produce href="nan"; only a non-empty string
        # counts as a usable link target.
        if isinstance(url, str) and url:
            return f'<a href="{url}">{model_name}</a>'
        return model_name

    linked = df.copy()
    # Build the column in one pass instead of a row-wise DataFrame.apply,
    # which re-creates every row as a Series.
    linked['Models'] = [
        convert_url(url, name)
        for url, name in zip(linked['URL'], linked['Models'])
    ]
    return linked

# def fetch_data(file: str) -> pd.DataFrame:
#     # fetch the leaderboard data from remote
#     if file is None:
#         raise ValueError("URL Not Provided")
#     url = f"https://huggingface.co/spaces/TIGER-Lab/MMEB/resolve/main/{file}"
#     print(f"Fetching data from {url}")
#     response = requests.get(url)
#     if response.status_code != 200:
#         raise requests.HTTPError(f"Failed to fetch data: HTTP status code {response.status_code}")
#     return pd.read_json(io.StringIO(response.text), orient='records', lines=True)

def get_df(file="results.jsonl"):
    """Load the leaderboard results and prepare them for display.

    Args:
        file: Path (or file-like object) of a JSON-lines file, one record per
            model.

    Returns:
        A DataFrame sorted by 'V1-Overall' in descending order, with
        normalized 'Model Size(B)' values, hyperlinked 'Models' cells, missing
        task scores shown as '-', and a 1-based 'Rank' column.
    """
    df = pd.read_json(file, orient='records', lines=True)
    df['Model Size(B)'] = df['Model Size(B)'].apply(process_model_size)

    # Sort while the scores are still numeric. Inserting the '-' placeholder
    # first would mix str and float in 'V1-Overall' and make sort_values raise
    # TypeError; sorted this way, rows without an overall score go last.
    df = df.sort_values(by=['V1-Overall'], ascending=False)

    def show_missing_as_dash(score):
        return '-' if pd.isna(score) else score

    for task in TASKS_V1:
        if df[task].isnull().any():
            df[task] = df[task].apply(show_missing_as_dash)

    df = create_hyperlinked_names(df)
    df['Rank'] = range(1, len(df) + 1)
    return df

def refresh_data():
    """Reload the results via get_df() and return only the COLUMN_NAMES columns."""
    return get_df()[COLUMN_NAMES]

def search_and_filter_models(df, query, min_size, max_size):
    """Filter the leaderboard by model name and by model size.

    Args:
        df: DataFrame containing 'Models', 'Model Size(B)' and every column in
            COLUMN_NAMES.
        query: Text to look for in 'Models' (case-insensitive). It is used as
            a regular expression when it is a valid pattern and as a literal
            substring otherwise. A falsy query disables the name filter.
        min_size: Inclusive lower bound on the model size.
        max_size: Inclusive upper bound on the model size.

    Returns:
        A new DataFrame with the matching rows, restricted to COLUMN_NAMES.
    """
    import re  # local import: `re` is not imported at file level

    filtered_df = df.copy()

    if query:
        # The query is free-form user input; something like "(" is not a valid
        # pattern and would otherwise raise from str.contains.
        try:
            re.compile(query)
            as_regex = True
        except re.error:
            as_regex = False
        name_mask = filtered_df['Models'].str.contains(
            query, case=False, na=False, regex=as_regex
        )
        filtered_df = filtered_df[name_mask]

    def size_in_range(size):
        # Models of unknown size are treated as size 1000.0.
        if size == 'unknown':
            return min_size <= 1000.0 <= max_size
        return min_size <= size <= max_size

    # Build an explicitly boolean mask. Series.apply on an empty column keeps
    # the column's dtype, so the result is not seen as a boolean indexer and
    # the frame loses all of its columns when no model matches the query.
    size_mask = pd.Series(
        [bool(size_in_range(size)) for size in filtered_df['Model Size(B)']],
        index=filtered_df.index,
        dtype=bool,
    )
    filtered_df = filtered_df[size_mask]

    return filtered_df[COLUMN_NAMES]


def search_models(df, query):
    """Return the rows of ``df`` whose 'Models' value contains ``query``.

    Args:
        df: DataFrame with a 'Models' column.
        query: Text to look for (case-insensitive). It is used as a regular
            expression when it is a valid pattern and as a literal substring
            otherwise. A falsy query returns ``df`` itself, unfiltered.

    Returns:
        The filtered DataFrame, or ``df`` unchanged when ``query`` is falsy.
        Rows whose 'Models' value is missing never match.
    """
    import re  # local import: `re` is not imported at file level

    if not query:
        return df

    # The query is free-form user input; something like "(" is not a valid
    # pattern and would otherwise raise from str.contains.
    try:
        re.compile(query)
        as_regex = True
    except re.error:
        as_regex = False

    matches = df['Models'].str.contains(query, case=False, na=False, regex=as_regex)
    return df[matches]

def get_size_range(df):
    """Return the (min, max) of 'Model Size(B)' as floats.

    'unknown' entries count as 0.0. When every entry is 0.0 after that
    substitution (including an empty column), the fallback range
    (0.0, 1000.0) is returned instead.
    """
    def as_number(value):
        if value == 'unknown':
            return 0.0
        return value

    numeric_sizes = df['Model Size(B)'].apply(as_number)
    nothing_known = (numeric_sizes == 0.0).all()
    if not nothing_known:
        return float(numeric_sizes.min()), float(numeric_sizes.max())
    return 0.0, 1000.0


def process_model_size(size):
    """Normalize a raw model-size value for display and filtering.

    Args:
        size: Raw size value, e.g. a number, a numeric string, None, NaN or a
            marker such as 'unk'.

    Returns:
        The size as a float rounded to 3 decimals, or the string 'unknown'
        when the value is missing, not numeric, or not finite.
    """
    import math  # local import: `math` is not imported at file level

    try:
        val = float(size)
    except (ValueError, TypeError, OverflowError):
        # None, pd.NA, 'unk', other text and non-scalar values end up here.
        return 'unknown'

    # float() accepts NaN and the strings "nan" / "inf"; none of them is a
    # usable size and they would break the later range comparisons.
    if not math.isfinite(val):
        return 'unknown'
    return round(val, 3)

def filter_columns_by_tasks(df, selected_tasks=None):
    """Restrict the leaderboard to the base columns plus the selected tasks.

    Args:
        df: Leaderboard DataFrame.
        selected_tasks: Sequence of task column names to keep. None or an
            empty sequence selects the full COLUMN_NAMES layout.

    Returns:
        A DataFrame with the base columns followed by the selected task
        columns. Columns absent from ``df`` are skipped and no column is
        listed twice.
    """
    if selected_tasks is None or len(selected_tasks) == 0:
        return df[COLUMN_NAMES]

    # The overall score column is named 'V1-Overall' in this table. 'Overall'
    # is kept for frames that still use the older name; whichever is absent is
    # dropped by the availability check below.
    base_columns = ['Models', 'Model Size(B)', 'Data Source', 'V1-Overall', 'Overall']

    # dict.fromkeys removes duplicates while keeping order, so selecting
    # 'V1-Overall' as a task does not repeat the column. Unpacking also accepts
    # tuples, which `list + tuple` would reject.
    wanted_columns = dict.fromkeys([*base_columns, *selected_tasks])

    available_columns = [col for col in wanted_columns if col in df.columns]
    return df[available_columns]