import gradio as gr
import pandas as pd
from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
# Static leaderboard data: each method and its execution accuracy (%) on the benchmark.
data = {
    "Method": [
        "Human Performance (Handwritten LOTUS Llama-3.1-70B)",
        "Zero-shot Text2SQL (Llama-3.1-70B)",
        "Zero-shot Text2SQL + LM Generation (Llama-3.1-70B)",
        "RAG (E5 + Llama-3.1-70B)",
        "RAG (E5) + LM Rerank (Llama-3.1-70B)",
        "Human Performance (Handwritten LOTUS GPT-4o)",
        "Zero-shot Text2SQL (GPT-4o)",
        "Zero-shot Text2SQL + LM Generation (GPT-4o)",
        "RAG (E5 + GPT-4o)",
        "RAG (E5) + LM Rerank (GPT-4o)",
        "Human Performance (Handwritten LOTUS o3-mini)",
        "Zero-shot Text2SQL (o3-mini)",
        "Zero-shot Text2SQL + LM Generation (o3-mini)",
        "RAG (E5 + o3-mini)",
        "RAG (E5) + LM Rerank (o3-mini)",
        "Zero-shot Text2SQL (Deepseek-R1)",
        "Zero-shot Text2SQL + LM Generation (Deepseek-R1)",
    ],
    # "Model": ["meta-llama/Llama-3.1-70B"] * 5,
    "Execution Accuracy": [55.0, 17.0, 13.0, 0.0, 2.0, 55.0, 18.0, 15.0, 3.0, 3.0, 65.0, 18.0, 30.0, 7.0, 7.0, 12.0, 0.0],
    # "Execution Accuracy": [0.0, 2.0, 55.0, 18.0, 3.0, 3.0, 65.0, 18.0, 7.0, 7.0, 12.0],
}
leaderboard_df = pd.DataFrame(data)
leaderboard_df = leaderboard_df.sort_values(
    "Execution Accuracy", ascending=False
).reset_index(drop=True)
# After sorting, the top three rows are the human-performance references.
# Rank the remaining methods starting from 1 and leave the reference rows unranked.
leaderboard_df.insert(0, "Rank", leaderboard_df.index - 2)
leaderboard_df.loc[:2, "Rank"] = ""  # rows 0-2: human-performance baselines
def hyperlink_method(method):
    # Link every method name to the TAG-Bench repository.
    base_url = "https://github.com/TAG-Research/TAG-Bench/tree/main"
    return f'<a href="{base_url}" target="_blank">{method}</a>'


def hyperlink_model(model):
    base_url = "https://huggingface.co/meta-llama/Llama-3.1-70B"
    return f'<a href="{base_url}" target="_blank">{model}</a>'
leaderboard_df["Method"] = leaderboard_df["Method"].apply(hyperlink_method)
def highlight_row(row):
    if row["Rank"] == "":  # human-performance reference rows
        return ["background-color: #d4edda; font-weight: bold;" for _ in row]
    return [""] * len(row)
# Apply the row highlighting; note this turns the DataFrame into a pandas Styler.
leaderboard_df = leaderboard_df.style.apply(highlight_row, axis=1)
# leaderboard_df["Model"] = leaderboard_df["Model"].apply(hyperlink_model)
with gr.Blocks() as demo:
    gr.HTML(
        """
        <div style="text-align: center;">
            <h1 style="font-size: 2.5rem; margin-bottom: 0.5rem;">TAG Leaderboard</h1>
            <p style="font-size: 1.25rem; color: gray;">A benchmark for natural language queries over data</p>
        </div>
        """
    )
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Row():
                gr.Dataframe(
                    value=leaderboard_df,
                    headers=["Rank", "Method", "Execution Accuracy"],
                    datatype=["str", "html", "number"],
                    row_count=(5, "dynamic"),
                    wrap=True,
                    elem_id="leaderboard",
                    type="pandas",
                )
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
with gr.TabItem("🚀 Submission Instructions ", elem_id="llm-benchmark-tab-table", id=3):
with gr.Accordion("1️⃣ Required Materials", open=True):
gr.Markdown(
"""
Ensure the following files are included in your submission:
- **output.json**: File containing the evaluation outputs generated by your model. Please refer to [] for format instructions.
- **requirements.txt**: A list of dependencies needed to run your model or script.
- **README.md**: A detailed description of your submission, including:
- Purpose and overview of the submission.
- Instructions to reproduce the results.
- Any additional notes for evaluators.
- **Model/Keys**: Upload your models or API keys to [Hugging Face](https://huggingface.co/) if they are not publicly accessible.
**Note**: Submissions missing any of these materials will not be processed.
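
                    The snippet below is a hypothetical sketch of how `output.json` might be assembled; the field names `query_id` and `prediction` are illustrative assumptions, not the required schema.

                    ```python
                    import json

                    # Hypothetical shape: one record per benchmark query.
                    predictions = [
                        {"query_id": 0, "prediction": "..."},
                    ]
                    with open("output.json", "w") as f:
                        json.dump(predictions, f, indent=2)
                    ```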
"""
)
            # Section 2: Submission Frequency
            with gr.Accordion("2️⃣ Submission Frequency", open=True):
                gr.Markdown(
                    """
                    - Submissions are accepted **once a month** to ensure sufficient evaluation bandwidth.
                    - Plan your submission timeline accordingly to avoid delays.
                    """
                )
            # Section 3: How to Upload Materials
            with gr.Accordion("3️⃣ How to Upload Materials", open=True):
                gr.Markdown(
                    """
                    Follow these steps to upload your materials:
                    1. Compress all submission files into a single `.zip` file, or provide a link to a public repository.
                    2. Email the `.zip` file or repository link to [email protected].
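
                    A minimal sketch of the packaging step, assuming `output.json`, `requirements.txt`, and `README.md` sit in the working directory:

                    ```python
                    import zipfile

                    # Bundle the required materials into submission.zip.
                    with zipfile.ZipFile("submission.zip", "w") as zf:
                        for path in ["output.json", "requirements.txt", "README.md"]:
                            zf.write(path)
                    ```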
"""
)
            # Section 4: Submission Process
            with gr.Accordion("4️⃣ Submission Process", open=True):
                gr.Markdown(
                    """
                    After uploading your materials:
                    - Provide accurate contact information for follow-ups.
                    - Double-check your materials for completeness to avoid processing delays.

                    **Important:** Your submission will be added to the evaluation queue. Depending on the queue size, evaluations may take up to a few weeks.
                    """
                )
    # Footer
    gr.Markdown(
        """
        <div style="text-align: center; margin-top: 2rem;">
            For further assistance, reach out to [email protected].
        </div>
        """
    )
demo.launch()