eggie5-adyen committed on
Commit
aa92f8e
·
1 Parent(s): 84b45f6

added validation guidelines

Browse files
Files changed (2) hide show
  1. app.py +6 -6
  2. dabstep_benchmark/content.py +17 -0
app.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  import gradio as gr
3
 
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
- from dabstep_benchmark.content import TITLE, INTRODUCTION_TEXT, SUBMISSION_TEXT, CITATION_BUTTON_TEXT, CITATION_BUTTON_LABEL
6
  from dabstep_benchmark.leaderboard import *
7
 
8
 
@@ -30,7 +30,6 @@ if __name__ == "__main__":
30
  # Generate initial leaderboard data
31
  validated, unvalidated = update_tables()
32
 
33
-
34
  with gr.Tab("Validated"):
35
  verified_table = gr.Dataframe(
36
  value=validated,
@@ -38,7 +37,7 @@ if __name__ == "__main__":
38
  interactive=False,
39
  column_widths=["20%"],
40
  wrap=True,
41
- )
42
 
43
  with gr.Tab("Unvalidated"):
44
  unverified_table = gr.Dataframe(
@@ -47,7 +46,7 @@ if __name__ == "__main__":
47
  interactive=False,
48
  column_widths=["20%"],
49
  wrap=True,
50
- )
51
  # create a Gradio event listener that runs when the page is loaded to populate the dataframe
52
  demo.load(update_tables, inputs=None, outputs=[verified_table, unverified_table])
53
 
@@ -61,6 +60,8 @@ if __name__ == "__main__":
61
  verified_table, unverified_table
62
  ],
63
  )
 
 
64
  with gr.Row():
65
  with gr.Accordion("📙 Citation", open=False):
66
  citation_button = gr.Textbox(
@@ -107,5 +108,4 @@ if __name__ == "__main__":
107
  scheduler = BackgroundScheduler()
108
  scheduler.add_job(restart_space, "interval", seconds=3600*24)
109
  scheduler.start()
110
- demo.launch(debug=True)
111
-
 
2
  import gradio as gr
3
 
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
+ from dabstep_benchmark.content import TITLE, INTRODUCTION_TEXT, SUBMISSION_TEXT, CITATION_BUTTON_TEXT, CITATION_BUTTON_LABEL, VALIDATION_GUIDELINES
6
  from dabstep_benchmark.leaderboard import *
7
 
8
 
 
30
  # Generate initial leaderboard data
31
  validated, unvalidated = update_tables()
32
 
 
33
  with gr.Tab("Validated"):
34
  verified_table = gr.Dataframe(
35
  value=validated,
 
37
  interactive=False,
38
  column_widths=["20%"],
39
  wrap=True,
40
+ )
41
 
42
  with gr.Tab("Unvalidated"):
43
  unverified_table = gr.Dataframe(
 
46
  interactive=False,
47
  column_widths=["20%"],
48
  wrap=True,
49
+ )
50
  # create a Gradio event listener that runs when the page is loaded to populate the dataframe
51
  demo.load(update_tables, inputs=None, outputs=[verified_table, unverified_table])
52
 
 
60
  verified_table, unverified_table
61
  ],
62
  )
63
+ gr.Markdown(VALIDATION_GUIDELINES, elem_classes="markdown-text")
64
+
65
  with gr.Row():
66
  with gr.Accordion("📙 Citation", open=False):
67
  citation_button = gr.Textbox(
 
108
  scheduler = BackgroundScheduler()
109
  scheduler.add_job(restart_space, "interval", seconds=3600*24)
110
  scheduler.start()
111
+ demo.launch(debug=True)
 
dabstep_benchmark/content.py CHANGED
@@ -39,3 +39,20 @@ CITATION_BUTTON_TEXT = r"""@misc{DABstep_benchmark_2025,
39
  month={February},
40
  url={https://www.adyen.com/knowledge-hub/data-agent-benchmark-for-multi-step-reasoning-dabstep}
41
  }"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  month={February},
40
  url={https://www.adyen.com/knowledge-hub/data-agent-benchmark-for-multi-step-reasoning-dabstep}
41
  }"""
42
+
43
+
44
+ VALIDATION_GUIDELINES = """
45
+ ## Benchmark Validation Standards
46
+
47
+ All submissions are initially added to the **Unvalidated Leaderboard**. The Adyen/Hugging Face team will attempt, with the participation of the respective submission team, to validate any entries that rank within the top 10.
48
+
49
+ **Validation** confirms that a submission's results were achieved using a novel approach involving data analysis agents. To support validation, participants must provide clear evidence of their methodology. This can be done in one of the following ways:
50
+
51
+ - **Preferred:** Share a research paper or blog post along with the source code to enable full reproducibility.
52
+ - Submit a complete dataset that includes **reasoning traces** demonstrating how the results were produced.
53
+ - Provide access to an **API** that the Adyen/Hugging Face team can use to independently validate and reproduce results.
54
+
55
+ Our goal with **DABStep** is to foster rapid progress and collaboration in the open research community. We strongly encourage participants to share their work and open-source their code whenever possible.
56
+
57
+ Once validated, submissions will be featured and showcased on the **Validated Leaderboard**, including annotations indicating the validation method used (e.g., `traces`, `code`, `API`).
58
+ """