import gradio as gr
import pandas as pd
from gradio_leaderboard import Leaderboard, SelectColumns, ColumnFilter

from data_reviewer import create_data_viewer

# Define constants and enums
TITLE = "<h1>VL-RewardBench Leaderboard</h1>"
INTRODUCTION_TEXT = "Project page: [https://vl-rewardbench.github.io/](https://vl-rewardbench.github.io/)"
GOOGLE_SHEET_URL = (
    "https://docs.google.com/spreadsheets/d/1fPqZLF1FQFyy4n9I6GNk7MeDSGlJDVVes9yEBqN8RwU/export?gid=0&format=csv"
)
ABOUT_TEXT = """Welcome to VLRewardBench!

We introduce a novel benchmark VL-RewardBench, designed to expose limitations of vision-language reward models across visual perception, hallucination detection, and reasoning tasks.
Our evaluation reveals that models primarily fail at basic visual perception rather than reasoning, and that performance on our benchmark strongly correlates (r > 0.9) with downstream vision-language tasks.

The splits are:

- General (VLFeedback + WildVision)
- Hallucination (POVID, RLAIF, RLHF-V)
- Reasoning (MMMU-Pro, MathVerse)"""


class AutoEvalColumn:
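    """Column metadata for the leaderboard: display name, dtype, and visibility flags."""
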
    model = {"name": "Model", "type": "markdown", "displayed_by_default": True, "never_hidden": True}
    general = {"name": "General", "type": "float", "displayed_by_default": True, "never_hidden": False}
    hallucination = {"name": "Hallucination", "type": "float", "displayed_by_default": True, "never_hidden": False}
    reasoning = {"name": "Reasoning", "type": "float", "displayed_by_default": True, "never_hidden": False}
    overall = {"name": "Overall Consistency", "type": "float", "displayed_by_default": True, "never_hidden": False}
    macro = {"name": "Macro Average", "type": "float", "displayed_by_default": True, "never_hidden": False}

    model_size = {"name": "Model Size", "type": "str", "displayed_by_default": False, "never_hidden": False}
    opensource = {"name": "Open Source?", "type": "str", "displayed_by_default": False, "never_hidden": False}

def get_result_data():
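    """Fetch the latest results as a DataFrame from the Google Sheet CSV export."""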
    return pd.read_csv(GOOGLE_SHEET_URL)


def init_leaderboard(dataframe):
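    """Build the Leaderboard component, deriving column types and default/locked selections from AutoEvalColumn."""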
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    return Leaderboard(
        value=dataframe,
        datatype=[col["type"] for col in AutoEvalColumn.__dict__.values() if isinstance(col, dict)],
        select_columns=SelectColumns(
            default_selection=[
                col["name"]
                for col in AutoEvalColumn.__dict__.values()
                if isinstance(col, dict) and col["displayed_by_default"]
            ],
            cant_deselect=[
                col["name"]
                for col in AutoEvalColumn.__dict__.values()
                if isinstance(col, dict) and col.get("never_hidden", False)
            ],
            label="Select Columns to Display:",
        ),
        search_columns=["Model"],
        filter_columns=[
            ColumnFilter("Open Source?", type="checkboxgroup", label="Open Source?"),
            ColumnFilter("Model Size", type="checkboxgroup", label="Model Size"),
        ],
        interactive=False,
    )

def format_model_link(row):
    """Format model name as HTML link if URL is available"""
    model_name = row['Model']
    url = row.get('URL', '')
    if pd.notna(url) and url.strip():
        return f'<a href="{url}" target="_blank">{model_name}</a>'
    return model_name


# Initialize the Gradio interface
with gr.Blocks() as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT)

    with gr.Tabs() as tabs:
        with gr.TabItem("🏅 Leaderboard"):
            # Fetch results, render model names as links (dropping the raw URL column),
            # and sort by the headline metric
            df = get_result_data()
            df["Model"] = df.apply(format_model_link, axis=1)
            del df["URL"]
            df = df.sort_values("Overall Consistency", ascending=False)
            leaderboard = init_leaderboard(df)

        with gr.TabItem("📊 Data Viewer"):
            dataset_split, sample_idx = create_data_viewer()

        with gr.TabItem("ℹ️ About"):
            gr.Markdown(ABOUT_TEXT)

demo.launch()