File size: 3,414 Bytes
d18a9b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import gradio as gr
import pandas as pd
from datasets import load_dataset

# Evaluation results table. The code below reads its "task", "custom_id",
# "content_raw", "gt" and "is_correct" columns.
# NOTE(review): unpickling can execute arbitrary code, so this file must only
# ever come from a trusted source.
RESULTS_PICKLE_PATH = "./df_final.pkl"
df_final = pd.read_pickle(RESULTS_PICKLE_PATH)

# Benchmark samples. The "valid" split is indexed by custom_id further down to
# obtain each sample's "image" and "prompt".
BENCHMARK_DATASET_ID = "XAI/vlmsareblind"
dataset = load_dataset(BENCHMARK_DATASET_ID)


def show_row(row_index, selected_task):
    """Collect everything the UI displays for one result row of a task.

    Args:
        row_index: Position of the wanted row among the rows of ``df_final``
            whose ``task`` column equals ``selected_task``. It is coerced
            with ``int()`` before being used as a positional index.
        selected_task: Value of the ``task`` column to filter on.

    Returns:
        A tuple ``(image, prompt, model_output, ground_truth, task,
        is_correct)``. The image and prompt come from the ``"valid"`` split
        of ``dataset`` (looked up by the row's ``custom_id``); the remaining
        values come from the row itself.

    Raises:
        gr.Error: If ``row_index`` does not address a row of the selected
            task, which includes the case where the task has no rows.
    """
    task_df = df_final[df_final["task"] == selected_task]

    # Only the positional lookup is guarded, so unrelated errors are not masked.
    try:
        row = task_df.iloc[int(row_index)]
    except IndexError as err:
        raise gr.Error(
            f"Row {row_index} is out of range for task {selected_task!r} "
            f"({len(task_df)} rows available)."
        ) from err

    # Fetch the sample once and read both fields from it instead of indexing
    # the split a second time for the same record.
    sample = dataset["valid"][int(row["custom_id"])]

    return (
        sample["image"],
        sample["prompt"],
        row["content_raw"],
        row["gt"],
        row["task"],
        row["is_correct"],
    )


def update_slider(selected_task):
    """Build the row slider that matches the rows available for a task.

    Args:
        selected_task: Value of the ``task`` column whose rows are counted.

    Returns:
        A ``gr.Slider`` ranging from 0 to the last row index of the task,
        stepping by 1, reset to 0 and labelled with that range.
    """
    matching_rows = df_final.loc[df_final["task"] == selected_task]
    last_index = len(matching_rows) - 1

    slider_settings = {
        "minimum": 0,
        "maximum": last_index,
        "step": 1,
        "label": f"Select Row Index (0-{last_index})",
        "value": 0,
    }
    return gr.Slider(**slider_settings)


# Per-task accuracy table: the mean of "is_correct" for every task, ranked from
# best to worst, expressed in percent and rendered with two decimals.
_mean_correct_by_task = df_final.groupby("task")["is_correct"].mean()
_ranked_percentages = _mean_correct_by_task.sort_values(ascending=False) * 100
accuracy_breakdown = _ranked_percentages.apply("{:.2f}".format).reset_index()
accuracy_breakdown.columns = ["Task", "Accuracy (%)"]

# Create the Gradio interface.
# The task list, the default task (the first one in order of appearance in
# df_final) and the size of its row range are computed once and reused below.
task_choices = df_final["task"].unique().tolist()
default_task = task_choices[0]
default_last_index = len(df_final[df_final["task"] == default_task]) - 1


def show_first_row(selected_task):
    """Return the display values for row 0 of ``selected_task``."""
    return show_row(0, selected_task)


with gr.Blocks() as app:
    gr.Markdown("# VLMs Are Blind Results Review (GPT-4o-mini)")
    gr.HTML(
        """
        <p style="text-align: center;">
            This is a review of results from the GPT-4o-mini model on the VLMs Are Blind dataset.
            <br>
            <a href="https://vlmsareblind.github.io/" target="_blank">Project Website</a> | 
            <a href="https://arxiv.org/abs/2407.06581" target="_blank">arXiv Paper</a>
        </p>
    """
    )

    # Controls: which task to inspect and which of its rows to show.
    with gr.Row():
        task_dropdown = gr.Dropdown(
            choices=task_choices,
            label="Select Task",
            value=default_task,
        )
        row_selector = gr.Slider(
            minimum=0,
            maximum=default_last_index,
            step=1,
            label=f"Select Row Index (0-{default_last_index})",
            value=0,
        )

    # Sample view: image on the left, textual details on the right.
    with gr.Row():
        with gr.Column(scale=2):
            image_output = gr.Image(label="Image", type="pil")

        with gr.Column(scale=3):
            prompt_output = gr.Textbox(label="Prompt", lines=3)
            model_output = gr.Textbox(label="Model Output", lines=2)
            ground_truth = gr.Textbox(label="Ground Truth", lines=2)
            task = gr.Textbox(label="Task")
            is_correct = gr.Checkbox(label="Is Correct")

    gr.Markdown("## Accuracy Breakdown by Task")
    gr.DataFrame(accuracy_breakdown)

    # Components refreshed by show_row, in the order of its return tuple.
    row_outputs = [
        image_output,
        prompt_output,
        model_output,
        ground_truth,
        task,
        is_correct,
    ]

    # Switching task resizes and resets the slider ...
    task_dropdown.change(update_slider, inputs=task_dropdown, outputs=row_selector)
    # ... and shows row 0 of the new task explicitly, rather than relying on
    # the slider reset to emit its own change event (presumably it does not
    # when the slider is already at 0 -- TODO confirm).
    task_dropdown.change(show_first_row, inputs=task_dropdown, outputs=row_outputs)

    row_selector.change(
        show_row,
        inputs=[row_selector, task_dropdown],
        outputs=row_outputs,
    )

    # Fill the view when the page loads so that the default selection is
    # visible without the user having to touch a control first.
    app.load(
        show_row,
        inputs=[row_selector, task_dropdown],
        outputs=row_outputs,
    )

# Launch the app
app.launch()