File size: 6,381 Bytes
0547e3e
 
 
e1247b7
 
 
 
 
 
 
 
 
cacf673
e25cebf
83bd87e
f4d957a
 
 
5856002
83bd87e
5856002
f4d957a
 
5856002
83bd87e
ee8b78c
f4d957a
 
ee8b78c
304d39c
f4d957a
cacf673
e25cebf
f4d957a
 
cacf673
 
 
 
d11d433
e25cebf
d11d433
a19ed1a
55faf8a
53d5656
f4d957a
d3a0f24
793fed4
 
 
 
 
 
 
 
 
 
d3a0f24
5cc2883
d3a0f24
 
 
 
 
 
c945edb
d11d433
e25cebf
 
 
 
d11d433
 
 
 
d369ab3
a19ed1a
d11d433
 
 
c945edb
e1247b7
 
 
 
 
e25cebf
738c269
e1247b7
 
 
 
 
 
 
 
 
 
e25cebf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d3a0f24
e25cebf
 
 
 
 
 
 
 
d3a0f24
e25cebf
 
 
 
d3a0f24
e25cebf
 
 
 
d3a0f24
e25cebf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d3a0f24
e25cebf
 
 
c945edb
cacf673
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import gradio as gr
import pandas as pd

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)

# Static leaderboard results: one (method, execution-accuracy) pair per row.
# Kept as paired tuples so a method can never drift out of sync with its score.
_RESULTS = [
    ("Human Performance (Handwritten LOTUS Llama-3.1-70B)", 55.0),
    ("Zero-shot Text2SQL (Llama-3.1-70B)", 17.0),
    ("Zero-shot Text2SQL + LM Generation (Llama-3.1-70B)", 13.0),
    ("RAG (E5 + Llama-3.1-70B)", 0.0),
    ("RAG (E5) + LM Rerank (Llama-3.1-70B)", 2.0),
    ("Human Performance (Handwritten LOTUS GPT-4o)", 55.0),
    ("Zero-shot Text2SQL (GPT-4o)", 18.0),
    ("Zero-shot Text2SQL + LM Generation (GPT-4o)", 15.0),
    ("RAG (E5 + GPT-4o)", 3.0),
    ("RAG (E5) + LM Rerank (GPT-4o)", 3.0),
    ("Human Performance (Handwritten LOTUS o3-mini)", 65.0),
    ("Zero-shot Text2SQL (o3-mini)", 18.0),
    ("Zero-shot Text2SQL + LM Generation (o3-mini)", 30.0),
    ("RAG (E5 + o3-mini)", 7.0),
    ("RAG (E5) + LM Rerank (o3-mini)", 7.0),
    ("Zero-shot Text2SQL (Deepseek-R1)", 12.0),
    ("Zero-shot Text2SQL + LM Generation (Deepseek-R1)", 0.0),
]

# Column-oriented form consumed by pd.DataFrame below.
data = {
    "Method": [method for method, _ in _RESULTS],
    "Execution Accuracy": [accuracy for _, accuracy in _RESULTS],
}

# Build the leaderboard, best execution accuracy first.
leaderboard_df = pd.DataFrame(data)
leaderboard_df = leaderboard_df.sort_values(
    "Execution Accuracy", ascending=False
).reset_index(drop=True)

# Number the rows so automated methods start at rank 1; the three
# human-performance reference rows sort to the top and get a blank rank.
leaderboard_df.insert(0, "Rank", leaderboard_df.index - 2)
for _reference_row in range(3):
    leaderboard_df.loc[_reference_row, "Rank"] = ""

def hyperlink_method(method):
    """Wrap a method name in an HTML anchor to the TAG-Bench repository."""
    repo_url = "https://github.com/TAG-Research/TAG-Bench/tree/main"
    return f'<a href="{repo_url}" target="_blank">{method}</a>'

def hyperlink_model(model):
    """Wrap a model name in an HTML anchor to its Hugging Face model card.

    NOTE(review): the target is hard-coded to Llama-3.1-70B regardless of
    *model*; currently unused (its call site is commented out below).
    """
    card_url = "https://huggingface.co/meta-llama/Llama-3.1-70B"
    return f'<a href="{card_url}" target="_blank">{model}</a>'

# Render every method name as a clickable link to the benchmark repository.
leaderboard_df["Method"] = leaderboard_df["Method"].map(hyperlink_method)

def highlight_row(row):
    """Per-cell CSS for one leaderboard row.

    Rows with a blank "Rank" (the human-performance reference rows) get a
    bold green background; every other row gets no extra styling.
    """
    if row["Rank"] != "":
        return [""] * len(row)
    return ["background-color: #d4edda; font-weight: bold;"] * len(row)


# Apply the style
leaderboard_df = leaderboard_df.style.apply(highlight_row, axis=1)



# leaderboard_df["Model"] = leaderboard_df["Model"].apply(hyperlink_model)


# Gradio UI: a header, the leaderboard table, an About tab, and submission
# instructions. NOTE(review): several tabs share elem_id
# "llm-benchmark-tab-table" — duplicate DOM ids; consider unique ids.
with gr.Blocks() as demo:
    # Page header (title + tagline).
    gr.HTML(
        """
        <div style="text-align: center;">
            <h1 style="font-size: 2.5rem; margin-bottom: 0.5rem;">TAG Leaderboard</h1>
            <p style="font-size: 1.25rem; color: gray;">A benchmark for natural language queries over data</p>
        </div>
        """
    )

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Main leaderboard table. "Method" cells contain HTML anchors, hence
        # the "html" datatype for that column.
        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Row():
                gr.Dataframe(
                    value=leaderboard_df,
                    headers=["Rank", "Method", "Execution Accuracy"],
                    datatype=["str", "html", "number"],
                    row_count=(5, "dynamic"),
                    wrap=True,
                    elem_id="leaderboard",
                    type="pandas"
                )

        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("🚀 Submission Instructions", elem_id="llm-benchmark-tab-table", id=3):
            # TODO(review): the "[]" below is an empty placeholder link —
            # fill in the format-instructions URL. The "[email protected]"
            # strings also look like scraped/obfuscated placeholders; restore
            # the real contact address.
            with gr.Accordion("1️⃣ Required Materials", open=True):
                gr.Markdown(
                    """
                    Ensure the following files are included in your submission:
                    - **output.json**: File containing the evaluation outputs generated by your model. Please refer to [] for format instructions.
                    - **requirements.txt**: A list of dependencies needed to run your model or script.
                    - **README.md**: A detailed description of your submission, including:
                        - Purpose and overview of the submission.
                        - Instructions to reproduce the results.
                        - Any additional notes for evaluators.
                    - **Model/Keys**: Upload your models or API keys to [Hugging Face](https://huggingface.co/) if they are not publicly accessible.
                    
                    **Note**: Submissions missing any of these materials will not be processed.
                    """
                )

            # Section 2: Submission Frequency
            with gr.Accordion("2️⃣ Submission Frequency", open=True):
                gr.Markdown(
                    """
                    - Submissions are accepted **once a month** to ensure sufficient evaluation bandwidth.
                    - Plan your submission timeline accordingly to avoid delays.
                    """
                )

            # Section 3: How to Upload Materials
            with gr.Accordion("3️⃣ How to Upload Materials", open=True):
                gr.Markdown(
                    """
                    Follow these steps to upload your materials:
                    1. Compress all files in the code into a single `.zip` file, or provide a public repository to refer to.
                    2. Email the `.zip` file or repository link to our email [email protected].
                    """
                )

            # Section 4: Submission Process
            with gr.Accordion("4️⃣ Submission Process", open=True):
                gr.Markdown(
                    """
                    After uploading your materials:
                    - Provide accurate contact information for follow-ups.
                    - Double-check your materials for completeness to avoid processing delays.
                    
                    **Important:** Your submission will be added to the evaluation queue. Depending on the queue size, evaluations may take up to a few weeks.
                    """
                )

            # Footer
            gr.Markdown(
                """
                <div style="text-align: center; margin-top: 2rem;">
                    For further assistance, reach out to [email protected] with questions.
                </div>
                """
            )


demo.launch()