# NOTE(review): the three lines below were Hugging Face Spaces page chrome
# ("Spaces: / Running / Running") scraped into the file — not code.
import gradio as gr
import pandas as pd

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
# Raw leaderboard results: one method name per score, same order in both lists.
data = {
    "Method": [
        "Human Performance (Handwritten LOTUS Llama-3.1-70B)",
        "Zero-shot Text2SQL (Llama-3.1-70B)",
        "Zero-shot Text2SQL + LM Generation (Llama-3.1-70B)",
        "RAG (E5 + Llama-3.1-70B)",
        "RAG (E5) + LM Rerank (Llama-3.1-70B)",
        "Human Performance (Handwritten LOTUS GPT-4o)",
        "Zero-shot Text2SQL (GPT-4o)",
        "Zero-shot Text2SQL + LM Generation (GPT-4o)",
        "RAG (E5 + GPT-4o)",
        "RAG (E5) + LM Rerank (GPT-4o)",
        "Human Performance (Handwritten LOTUS o3-mini)",
        "Zero-shot Text2SQL (o3-mini)",
        "Zero-shot Text2SQL + LM Generation (o3-mini)",
        "RAG (E5 + o3-mini)",
        "RAG (E5) + LM Rerank (o3-mini)",
        "Zero-shot Text2SQL (Deepseek-R1)",
        "Zero-shot Text2SQL + LM Generation (Deepseek-R1)",
    ],
    "Execution Accuracy": [
        55.0, 17.0, 13.0, 0.0, 2.0,
        55.0, 18.0, 15.0, 3.0, 3.0,
        65.0, 18.0, 30.0, 7.0, 7.0,
        12.0, 0.0,
    ],
}

# Number of rows shown without a rank at the top of the sorted table.
# NOTE(review): blanking is positional — it assumes the three
# "Human Performance" baselines remain the three highest scores.
N_UNRANKED = 3

leaderboard_df = pd.DataFrame(data)
leaderboard_df = leaderboard_df.sort_values(
    "Execution Accuracy", ascending=False
).reset_index(drop=True)

# Build the Rank column directly as an object column ("" for the unranked
# baseline rows, 1..N for the rest) instead of inserting an int column and
# overwriting cells with "" afterwards, which forced a dtype upcast.
ranks = [""] * N_UNRANKED + list(range(1, len(leaderboard_df) - N_UNRANKED + 1))
leaderboard_df.insert(0, "Rank", ranks)
def hyperlink_method(method):
    """Wrap *method* in an HTML anchor pointing at the TAG-Bench repository."""
    repo_url = "https://github.com/TAG-Research/TAG-Bench/tree/main"
    return '<a href="{}" target="_blank">{}</a>'.format(repo_url, method)
def hyperlink_model(model):
    """Wrap *model* in an HTML anchor pointing at its Hugging Face model card."""
    card_url = "https://huggingface.co/meta-llama/Llama-3.1-70B"
    return '<a href="{}" target="_blank">{}</a>'.format(card_url, model)
leaderboard_df["Method"] = leaderboard_df["Method"].apply(hyperlink_method) | |
def highlight_row(row):
    """Return per-cell CSS for one table row.

    Rows with a blank Rank (the human-performance baselines pinned at the
    top of the table) are highlighted green and bolded; all other rows get
    no extra styling.
    """
    highlight = "background-color: #d4edda; font-weight: bold;"
    is_baseline = row["Rank"] == ""
    return [highlight if is_baseline else "" for _ in row]
# Apply the baseline-row highlight. NOTE: this rebinds `leaderboard_df`
# from a DataFrame to a pandas Styler, which gr.Dataframe(type="pandas")
# accepts. Any column-level edits (e.g. hyperlinking) must happen before
# this line — a previously commented-out "Model" hyperlink line that would
# have indexed the Styler like a DataFrame has been removed.
leaderboard_df = leaderboard_df.style.apply(highlight_row, axis=1)
# Build the Gradio UI: a header, the leaderboard table, an About tab, and
# submission instructions. Emoji in tab/accordion titles were mojibake in
# the scraped source and have been restored (keycaps are unambiguous; the
# tab icons follow the standard HF leaderboard template — confirm visually).
with gr.Blocks() as demo:
    # Page header.
    gr.HTML(
        """
        <div style="text-align: center;">
            <h1 style="font-size: 2.5rem; margin-bottom: 0.5rem;">TAG Leaderboard</h1>
            <p style="font-size: 1.25rem; color: gray;">A benchmark for natural language queries over data</p>
        </div>
        """
    )
    # NOTE(review): the three tabs share elem_id "llm-benchmark-tab-table";
    # element ids should be unique — kept as-is to avoid breaking existing CSS.
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Row():
                gr.Dataframe(
                    value=leaderboard_df,
                    headers=["Rank", "Method", "Execution Accuracy"],
                    datatype=["str", "html", "number"],
                    row_count=(5, "dynamic"),
                    wrap=True,
                    elem_id="leaderboard",
                    type="pandas",
                )
        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
        with gr.TabItem("🚀 Submission Instructions", elem_id="llm-benchmark-tab-table", id=3):
            # Section 1: Required Materials
            with gr.Accordion("1️⃣ Required Materials", open=True):
                gr.Markdown(
                    """
                    Ensure the following files are included in your submission:
                    - **output.json**: File containing the evaluation outputs generated by your model. Please refer to [the TAG-Bench repository](https://github.com/TAG-Research/TAG-Bench/tree/main) for format instructions.
                    - **requirements.txt**: A list of dependencies needed to run your model or script.
                    - **README.md**: A detailed description of your submission, including:
                      - Purpose and overview of the submission.
                      - Instructions to reproduce the results.
                      - Any additional notes for evaluators.
                    - **Model/Keys**: Upload your models or API keys to [Hugging Face](https://huggingface.co/) if they are not publicly accessible.

                    **Note**: Submissions missing any of these materials will not be processed.
                    """
                )
            # Section 2: Submission Frequency
            with gr.Accordion("2️⃣ Submission Frequency", open=True):
                gr.Markdown(
                    """
                    - Submissions are accepted **once a month** to ensure sufficient evaluation bandwidth.
                    - Plan your submission timeline accordingly to avoid delays.
                    """
                )
            # Section 3: How to Upload Materials
            # NOTE(review): the contact address was redacted to
            # "[email protected]" in the scraped source — restore the real one.
            with gr.Accordion("3️⃣ How to Upload Materials", open=True):
                gr.Markdown(
                    """
                    Follow these steps to upload your materials:
                    1. Compress all files in the code into a single `.zip` file, or provide a public repository to refer to.
                    2. Email the `.zip` file or repository link to our email [email protected].
                    """
                )
            # Section 4: Submission Process
            with gr.Accordion("4️⃣ Submission Process", open=True):
                gr.Markdown(
                    """
                    After uploading your materials:
                    - Provide accurate contact information for follow-ups.
                    - Double-check your materials for completeness to avoid processing delays.

                    **Important:** Your submission will be added to the evaluation queue. Depending on the queue size, evaluations may take up to a few weeks.
                    """
                )
    # Footer
    gr.Markdown(
        """
        <div style="text-align: center; margin-top: 2rem;">
        For further assistance, reach out to [email protected] with questions.
        </div>
        """
    )

demo.launch()