abiswal committed (commit e25cebf, parent d369ab3)

leaderboard updates

Files changed (2):
  1. app.py +74 -25
  2. src/about.py +4 -4
app.py CHANGED
@@ -10,62 +10,56 @@ from src.about import (
     TITLE,
 )
 
-# Simplified DataFrame for the leaderboard
 data = {
-    "Model": [
+    "Method": [
         "Handwritten TAG",
-        "Zero-shot Text2SQL",
-        "Zero-shot Text2SQL + LM Generation",
+        "Zero-shot Text2SQL (llama-3.1-70B)",
+        "Zero-shot Text2SQL + LM Generation (llama-3.1-70B)",
         "RAG (E5)",
         "RAG (E5) + LM Rerank",
     ],
-    "Execution Accuracy": ["55%", "17%", "13%", "0%", "2%"],
+    # "Model": ["meta-llama/Llama-3.1-70B"] * 5,
+    "Execution Accuracy": [55.0, 17.0, 13.0, 0.0, 2.0],
 }
 
-# Create a DataFrame
 leaderboard_df = pd.DataFrame(data)
 
-# Convert Execution Accuracy to numeric for sorting
-leaderboard_df["Execution Accuracy (numeric)"] = (
-    leaderboard_df["Execution Accuracy"].str.rstrip("%").astype(float)
-)
 leaderboard_df = leaderboard_df.sort_values(
-    "Execution Accuracy (numeric)", ascending=False
+    "Execution Accuracy", ascending=False
 ).reset_index(drop=True)
-
-# Add the Rank column
 leaderboard_df.insert(0, "Rank", leaderboard_df.index + 1)
 
-# Drop the numeric column for display
-leaderboard_df = leaderboard_df.drop(columns=["Execution Accuracy (numeric)"])
 
-# Add hyperlinks to the Model column
-def hyperlink_model(model):
+def hyperlink_method(method):
     base_url = "https://github.com/TAG-Research/TAG-Bench/tree/main"
+    return f'<a href="{base_url}" target="_blank">{method}</a>'
+
+def hyperlink_model(model):
+    base_url = "https://huggingface.co/meta-llama/Llama-3.1-70B"
     return f'<a href="{base_url}" target="_blank">{model}</a>'
 
-leaderboard_df["Model"] = leaderboard_df["Model"].apply(hyperlink_model)
 
-# Simplified Gradio app
+leaderboard_df["Method"] = leaderboard_df["Method"].apply(hyperlink_method)
+# leaderboard_df["Model"] = leaderboard_df["Model"].apply(hyperlink_model)
+
+
 with gr.Blocks() as demo:
     gr.HTML(
         """
         <div style="text-align: center;">
             <h1 style="font-size: 2.5rem; margin-bottom: 0.5rem;">TAG Leaderboard</h1>
-            <p style="font-size: 1.25rem; color: gray;">Comparing baseline approaches for structured data queries</p>
+            <p style="font-size: 1.25rem; color: gray;">Evaluating complex natural language queries over structured data.</p>
         </div>
         """
     )
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            # Highlight the top row in green for "Handwritten TAG"
            with gr.Row():
                 gr.Dataframe(
                     value=leaderboard_df,
-                    headers=["Model", "Code", "Execution Accuracy"],
-                    datatype=["str", "html", "str"],
+                    headers=["Rank", "Method", "Execution Accuracy"],
+                    datatype=["str", "html", "float"],
                     row_count=(5, "dynamic"),
                     wrap=True,
                     elem_id="leaderboard",
@@ -76,7 +70,62 @@ with gr.Blocks() as demo:
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
         with gr.TabItem("🚀 Submission Instructions ", elem_id="llm-benchmark-tab-table", id=3):
-            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+            with gr.Accordion("1️⃣ Required Materials", open=True):
+                gr.Markdown(
+                    """
+                    Ensure the following files are included in your submission:
+                    - **output.json**: File containing the evaluation outputs generated by your model. Please refer to [] for format instructions.
+                    - **requirements.txt**: A list of dependencies needed to run your model or script.
+                    - **README.md**: A detailed description of your submission, including:
+                        - Purpose and overview of the submission.
+                        - Instructions to reproduce the results.
+                        - Any additional notes for evaluators.
+                    - **Model/Keys**: Upload your models or API keys to [Hugging Face](https://huggingface.co/) if they are not publicly accessible.
+
+                    **Note**: Submissions missing any of these materials will not be processed.
+                    """
+                )
+
+            # Section 2: Submission Frequency
+            with gr.Accordion("2️⃣ Submission Frequency", open=False):
+                gr.Markdown(
+                    """
+                    - Submissions are accepted **once a month** to ensure sufficient evaluation bandwidth.
+                    - Plan your submission timeline accordingly to avoid delays.
+                    """
+                )
+
+            # Section 3: How to Upload Materials
+            with gr.Accordion("3️⃣ How to Upload Materials", open=False):
+                gr.Markdown(
+                    """
+                    Follow these steps to upload your materials:
+                    1. Compress your code and files into a single `.zip` file, or provide a link to a public repository.
+                    2. Email the `.zip` file or repository link to [email].
+                    """
+                )
+
+            # Section 4: Submission Process
+            with gr.Accordion("4️⃣ Submission Process", open=False):
+                gr.Markdown(
+                    """
+                    After uploading your materials:
+                    - Provide accurate contact information for follow-ups.
+                    - Double-check your materials for completeness to avoid processing delays.
+
+                    **Important:** Your submission will be added to the evaluation queue. Depending on the queue size, evaluations may take up to a few weeks.
+                    """
+                )
+
+            # Footer
+            gr.Markdown(
+                """
+                <div style="text-align: center; margin-top: 2rem;">
+                    For further assistance, reach out to [email] with questions.
+                </div>
+                """
+            )
 
 
 demo.launch()
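The new ranking logic in app.py is just a descending sort on the hard-coded accuracies followed by a 1-based Rank column. As a quick standalone check (not part of this commit, only the same few pandas calls run in isolation), the displayed order works out as follows:

import pandas as pd

# Same hard-coded data as the new app.py: sort by accuracy, then add a 1-based rank.
data = {
    "Method": [
        "Handwritten TAG",
        "Zero-shot Text2SQL (llama-3.1-70B)",
        "Zero-shot Text2SQL + LM Generation (llama-3.1-70B)",
        "RAG (E5)",
        "RAG (E5) + LM Rerank",
    ],
    "Execution Accuracy": [55.0, 17.0, 13.0, 0.0, 2.0],
}
df = pd.DataFrame(data).sort_values("Execution Accuracy", ascending=False).reset_index(drop=True)
df.insert(0, "Rank", df.index + 1)
print(df.to_string(index=False))
# Ranks 1-5: Handwritten TAG (55.0), Zero-shot Text2SQL (17.0),
# Zero-shot Text2SQL + LM Generation (13.0), RAG (E5) + LM Rerank (2.0), RAG (E5) (0.0)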
src/about.py CHANGED
@@ -30,11 +30,11 @@ Intro text
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
-## How it works
-
-## Reproducibility
-To reproduce our results, here is the commands you can run:
+## What does the TAG leaderboard evaluate?
+This leaderboard compares the execution accuracy of table question answering approaches on [TAG-Bench](https://github.com/TAG-Research/TAG-Bench/tree/main). TAG-Bench contains complex queries that require world knowledge or semantic reasoning beyond the information explicitly available in the database.
 
+## How is accuracy measured?
+Execution accuracy is measured as the percentage of answers that exactly match our annotated ground-truth answers, which are hand-labeled by experts.
 """
 
 EVALUATION_QUEUE_TEXT = """
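
The "How is accuracy measured?" text above describes exact-match scoring against expert-labeled ground truth. A minimal sketch of that metric, assuming predictions and gold answers arrive as aligned lists of strings (the actual TAG-Bench evaluation script and the expected output.json format are not shown in this commit, so treat the normalization below as an assumption):

def execution_accuracy(predictions, ground_truth):
    """Percentage of predictions that exactly match the hand-labeled answers.

    Illustrative only: the real TAG-Bench harness may normalize answers
    differently (casing, whitespace, numeric formatting, ...).
    """
    if len(predictions) != len(ground_truth):
        raise ValueError("predictions and ground_truth must have the same length")
    matches = sum(
        str(pred).strip() == str(gold).strip()
        for pred, gold in zip(predictions, ground_truth)
    )
    return 100.0 * matches / len(ground_truth)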