Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Commit
·
aa92f8e
1
Parent(s):
84b45f6
added validation guidelines
Browse files
- app.py +6 -6
- dabstep_benchmark/content.py +17 -0
app.py
CHANGED
|
@@ -2,7 +2,7 @@ import os
|
|
| 2 |
import gradio as gr
|
| 3 |
|
| 4 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 5 |
-
from dabstep_benchmark.content import TITLE, INTRODUCTION_TEXT, SUBMISSION_TEXT, CITATION_BUTTON_TEXT, CITATION_BUTTON_LABEL
|
| 6 |
from dabstep_benchmark.leaderboard import *
|
| 7 |
|
| 8 |
|
|
@@ -30,7 +30,6 @@ if __name__ == "__main__":
|
|
| 30 |
# Generate initial leaderboard data
|
| 31 |
validated, unvalidated = update_tables()
|
| 32 |
|
| 33 |
-
|
| 34 |
with gr.Tab("Validated"):
|
| 35 |
verified_table = gr.Dataframe(
|
| 36 |
value=validated,
|
|
@@ -38,7 +37,7 @@ if __name__ == "__main__":
|
|
| 38 |
interactive=False,
|
| 39 |
column_widths=["20%"],
|
| 40 |
wrap=True,
|
| 41 |
-
|
| 42 |
|
| 43 |
with gr.Tab("Unvalidated"):
|
| 44 |
unverified_table = gr.Dataframe(
|
|
@@ -47,7 +46,7 @@ if __name__ == "__main__":
|
|
| 47 |
interactive=False,
|
| 48 |
column_widths=["20%"],
|
| 49 |
wrap=True,
|
| 50 |
-
|
| 51 |
# create a Gradio event listener that runs when the page is loaded to populate the dataframe
|
| 52 |
demo.load(update_tables, inputs=None, outputs=[verified_table, unverified_table])
|
| 53 |
|
|
@@ -61,6 +60,8 @@ if __name__ == "__main__":
|
|
| 61 |
verified_table, unverified_table
|
| 62 |
],
|
| 63 |
)
|
|
|
|
|
|
|
| 64 |
with gr.Row():
|
| 65 |
with gr.Accordion("📙 Citation", open=False):
|
| 66 |
citation_button = gr.Textbox(
|
|
@@ -107,5 +108,4 @@ if __name__ == "__main__":
|
|
| 107 |
scheduler = BackgroundScheduler()
|
| 108 |
scheduler.add_job(restart_space, "interval", seconds=3600*24)
|
| 109 |
scheduler.start()
|
| 110 |
-
demo.launch(debug=True)
|
| 111 |
-
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
|
| 4 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 5 |
+
from dabstep_benchmark.content import TITLE, INTRODUCTION_TEXT, SUBMISSION_TEXT, CITATION_BUTTON_TEXT, CITATION_BUTTON_LABEL, VALIDATION_GUIDELINES
|
| 6 |
from dabstep_benchmark.leaderboard import *
|
| 7 |
|
| 8 |
|
|
|
|
| 30 |
# Generate initial leaderboard data
|
| 31 |
validated, unvalidated = update_tables()
|
| 32 |
|
|
|
|
| 33 |
with gr.Tab("Validated"):
|
| 34 |
verified_table = gr.Dataframe(
|
| 35 |
value=validated,
|
|
|
|
| 37 |
interactive=False,
|
| 38 |
column_widths=["20%"],
|
| 39 |
wrap=True,
|
| 40 |
+
)
|
| 41 |
|
| 42 |
with gr.Tab("Unvalidated"):
|
| 43 |
unverified_table = gr.Dataframe(
|
|
|
|
| 46 |
interactive=False,
|
| 47 |
column_widths=["20%"],
|
| 48 |
wrap=True,
|
| 49 |
+
)
|
| 50 |
# create a Gradio event listener that runs when the page is loaded to populate the dataframe
|
| 51 |
demo.load(update_tables, inputs=None, outputs=[verified_table, unverified_table])
|
| 52 |
|
|
|
|
| 60 |
verified_table, unverified_table
|
| 61 |
],
|
| 62 |
)
|
| 63 |
+
gr.Markdown(VALIDATION_GUIDELINES, elem_classes="markdown-text")
|
| 64 |
+
|
| 65 |
with gr.Row():
|
| 66 |
with gr.Accordion("📙 Citation", open=False):
|
| 67 |
citation_button = gr.Textbox(
|
|
|
|
| 108 |
scheduler = BackgroundScheduler()
|
| 109 |
scheduler.add_job(restart_space, "interval", seconds=3600*24)
|
| 110 |
scheduler.start()
|
| 111 |
+
demo.launch(debug=True)
|
|
|
dabstep_benchmark/content.py
CHANGED
|
@@ -39,3 +39,20 @@ CITATION_BUTTON_TEXT = r"""@misc{DABstep_benchmark_2025,
|
|
| 39 |
month={February},
|
| 40 |
url={https://www.adyen.com/knowledge-hub/data-agent-benchmark-for-multi-step-reasoning-dabstep}
|
| 41 |
}"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
month={February},
|
| 40 |
url={https://www.adyen.com/knowledge-hub/data-agent-benchmark-for-multi-step-reasoning-dabstep}
|
| 41 |
}"""
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
VALIDATION_GUIDELINES = """
|
| 45 |
+
## Benchmark Validation Standards
|
| 46 |
+
|
| 47 |
+
All submissions are initially added to the **Unvalidated Leaderboard**. The Adyen/Hugging Face team will attempt, with the participation of the respective submission team, to validate any entries that rank within the top 10.
|
| 48 |
+
|
| 49 |
+
**Validation** confirms that a submission's results were achieved using a novel approach involving data analysis agents. To support validation, participants must provide clear evidence of their methodology. This can be done in one of the following ways:
|
| 50 |
+
|
| 51 |
+
- **Preferred:** Share a research paper or blog post along with the source code to enable full reproducibility.
|
| 52 |
+
- Submit a complete dataset that includes **reasoning traces** demonstrating how the results were produced.
|
| 53 |
+
- Provide access to an **API** that the Adyen/Hugging Face team can use to independently validate and reproduce results.
|
| 54 |
+
|
| 55 |
+
Our goal with **DABStep** is to foster rapid progress and collaboration in the open research community. We strongly encourage participants to share their work and open-source their code whenever possible.
|
| 56 |
+
|
| 57 |
+
Once validated, submissions will be featured and showcased on the **Validated Leaderboard**, including annotations indicating the validation method used (e.g., `traces`, `code`, `API`).
|
| 58 |
+
"""
|