import gradio as gr
import pandas as pd
from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
# Static leaderboard data: each method and its execution accuracy (%) on the benchmark.
data = {
    "Method": [
        "Human Performance (Handwritten LOTUS Llama-3.1-70B)",
        "Zero-shot Text2SQL (Llama-3.1-70B)",
        "Zero-shot Text2SQL + LM Generation (Llama-3.1-70B)",
        "RAG (E5 + Llama-3.1-70B)",
        "RAG (E5) + LM Rerank (Llama-3.1-70B)",
        "Human Performance (Handwritten LOTUS GPT-4o)",
        "Zero-shot Text2SQL (GPT-4o)",
        "Zero-shot Text2SQL + LM Generation (GPT-4o)",
        "RAG (E5 + GPT-4o)",
        "RAG (E5) + LM Rerank (GPT-4o)",
        "Human Performance (Handwritten LOTUS o3-mini)",
        "Zero-shot Text2SQL (o3-mini)",
        "Zero-shot Text2SQL + LM Generation (o3-mini)",
        "RAG (E5 + o3-mini)",
        "RAG (E5) + LM Rerank (o3-mini)",
        "Zero-shot Text2SQL (Deepseek-R1)",
        "Zero-shot Text2SQL + LM Generation (Deepseek-R1)",
    ],
    # "Model": ["meta-llama/Llama-3.1-70B"] * 5,
    "Execution Accuracy": [55.0, 17.0, 13.0, 0.0, 2.0, 55.0, 18.0, 15.0, 3.0, 3.0, 65.0, 18.0, 30.0, 7.0, 7.0, 12.0, 0.0],
    # "Execution Accuracy": [0.0, 2.0, 55.0, 18.0, 3.0, 3.0, 65.0, 18.0, 7.0, 7.0, 12.0],
}
leaderboard_df = pd.DataFrame(data)
leaderboard_df = leaderboard_df.sort_values(
    "Execution Accuracy", ascending=False
).reset_index(drop=True)
# After sorting, the top three rows are the human-performance references.
# Rank the remaining methods starting from 1 and leave the reference rows unranked.
leaderboard_df.insert(0, "Rank", leaderboard_df.index - 2)
leaderboard_df.loc[:2, "Rank"] = ""  # rows 0-2: human-performance baselines
def hyperlink_method(method):
    # Link every method name to the TAG-Bench repository.
    base_url = "https://github.com/TAG-Research/TAG-Bench/tree/main"
    return f'<a href="{base_url}" target="_blank">{method}</a>'


def hyperlink_model(model):
    base_url = "https://huggingface.co/meta-llama/Llama-3.1-70B"
    return f'<a href="{base_url}" target="_blank">{model}</a>'
leaderboard_df["Method"] = leaderboard_df["Method"].apply(hyperlink_method)
def highlight_row(row):
    if row["Rank"] == "":  # human-performance reference rows
        return ["background-color: #d4edda; font-weight: bold;" for _ in row]
    return [""] * len(row)
# Apply the row highlighting; note this turns the DataFrame into a pandas Styler.
leaderboard_df = leaderboard_df.style.apply(highlight_row, axis=1)
# leaderboard_df["Model"] = leaderboard_df["Model"].apply(hyperlink_model)
with gr.Blocks() as demo:
    gr.HTML(
        """
        <div style="text-align: center;">
            <h1 style="font-size: 2.5rem; margin-bottom: 0.5rem;">TAG Leaderboard</h1>
            <p style="font-size: 1.25rem; color: gray;">A benchmark for natural language queries over data</p>
        </div>
        """
    )
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Row():
                gr.Dataframe(
                    value=leaderboard_df,
                    headers=["Rank", "Method", "Execution Accuracy"],
                    datatype=["str", "html", "number"],
                    row_count=(5, "dynamic"),
                    wrap=True,
                    elem_id="leaderboard",
                    type="pandas",
                )
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
with gr.TabItem("🚀 Submission Instructions ", elem_id="llm-benchmark-tab-table", id=3):
with gr.Accordion("1️⃣ Required Materials", open=True):
gr.Markdown(
"""
Ensure the following files are included in your submission:
- **output.json**: File containing the evaluation outputs generated by your model. Please refer to [] for format instructions.
- **requirements.txt**: A list of dependencies needed to run your model or script.
- **README.md**: A detailed description of your submission, including:
- Purpose and overview of the submission.
- Instructions to reproduce the results.
- Any additional notes for evaluators.
- **Model/Keys**: Upload your models or API keys to [Hugging Face](https://huggingface.co/) if they are not publicly accessible.
**Note**: Submissions missing any of these materials will not be processed.
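
                    The snippet below is a hypothetical sketch of how `output.json` might be assembled; the field names `query_id` and `prediction` are illustrative assumptions, not the required schema.

                    ```python
                    import json

                    # Hypothetical shape: one record per benchmark query.
                    predictions = [
                        {"query_id": 0, "prediction": "..."},
                    ]
                    with open("output.json", "w") as f:
                        json.dump(predictions, f, indent=2)
                    ```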
"""
)
            # Section 2: Submission Frequency
            with gr.Accordion("2️⃣ Submission Frequency", open=True):
                gr.Markdown(
                    """
                    - Submissions are accepted **once a month** to ensure sufficient evaluation bandwidth.
                    - Plan your submission timeline accordingly to avoid delays.
                    """
                )
            # Section 3: How to Upload Materials
            with gr.Accordion("3️⃣ How to Upload Materials", open=True):
                gr.Markdown(
                    """
                    Follow these steps to upload your materials:
                    1. Compress all submission files into a single `.zip` file, or provide a link to a public repository.
                    2. Email the `.zip` file or repository link to [email protected].
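
                    A minimal sketch of the packaging step, assuming `output.json`, `requirements.txt`, and `README.md` sit in the working directory:

                    ```python
                    import zipfile

                    # Bundle the required materials into submission.zip.
                    with zipfile.ZipFile("submission.zip", "w") as zf:
                        for path in ["output.json", "requirements.txt", "README.md"]:
                            zf.write(path)
                    ```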
"""
)
            # Section 4: Submission Process
            with gr.Accordion("4️⃣ Submission Process", open=True):
                gr.Markdown(
                    """
                    After uploading your materials:
                    - Provide accurate contact information for follow-ups.
                    - Double-check your materials for completeness to avoid processing delays.

                    **Important:** Your submission will be added to the evaluation queue. Depending on the queue size, evaluations may take up to a few weeks.
                    """
                )
    # Footer
    gr.Markdown(
        """
        <div style="text-align: center; margin-top: 2rem;">
            For further assistance, reach out to [email protected].
        </div>
        """
    )
demo.launch()