Signed-off-by: Jonathan Bnayahu <[email protected]>
- app.py +0 -12
- src/about.py +10 -7
app.py
CHANGED
@@ -4,8 +4,6 @@ from gradio_leaderboard import Leaderboard
 from apscheduler.schedulers.background import BackgroundScheduler
 
 from src.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
@@ -70,16 +68,6 @@ with gui:
 
     download_button.click(fn=generate_csv_file, outputs=csv_output)
 
-    with gr.Row():
-        with gr.Accordion("📙 Citation", open=False):
-            citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                label=CITATION_BUTTON_LABEL,
-                lines=20,
-                elem_id="citation-button",
-                show_copy_button=True,
-            )
-
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
src/about.py
CHANGED
@@ -85,15 +85,18 @@ table th:nth-of-type(3) {
 | QA Finance | <pre><p><b>FinQA</b></p>[Dataset](https://huggingface.co/datasets/ibm/finqa), [Paper](https://arxiv.org/abs/2109.00122), [Unitxt Card](https://www.unitxt.ai/en/latest/catalog/catalog.cards.fin_qa.html)</pre> | <p>A large-scale dataset with 2.8k financial reports for 8k Q&A pairs to study numerical reasoning with structured and unstructured evidence.</p>The FinQA dataset is designed to facilitate research and development in the area of question answering (QA) using financial texts. It consists of a subset of QA pairs from a larger dataset, originally created through a collaboration between researchers from the University of Pennsylvania, J.P. Morgan, and Amazon.The original dataset includes 8,281 QA pairs built against publicly available earnings reports of S&P 500 companies from 1999 to 2019 (FinQA: A Dataset of Numerical Reasoning over Financial Data.). This subset, specifically curated by Aiera, consists of 91 QA pairs. Each entry in the dataset includes a context, a question, and an answer, with each component manually verified for accuracy and formatting consistency. |
 
 ## Reproducibility
-To reproduce our results,
-
+BlueBench is powered by the <a href="https://www.unitxt.ai">unitxt</a> library. To reproduce our results, start by installing Unitxt in a clean Python 3.10 virtual environment, along with the required dependencies:
 ```
+conda create -n bluebench python=3.10
+conda activate bluebench
 pip install unitxt[bluebench]
-
+```
+To perform the evaluation, run the following, replacing MODEL_FULL_NAME with the name of the provider and model you wish to evaluate, in LiteLLM format. Consult the LiteLLM <a href="https://docs.litellm.ai/docs/providers">providers catalog</a> for details. Make sure you set the required environment variables (e.g., API keys and credentials).
+```
+unitxt-evaluate --tasks "benchmarks.bluebench" --model cross_provider --model_args "model_name=MODEL_FULL_NAME,max_tokens=1024" --output_path ./results/bluebench --log_samples --trust_remote_code --batch_size 8
+```
+A successful run will result in two json files in the ./results/bluebench folder. To view a summary of the results, run the following:
+```
 unitxt-summarize ./results/bluebench
 ```
 """
-
-CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
-CITATION_BUTTON_TEXT = r"""
-"""
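For reference, the reproduction flow introduced in this hunk can be run end to end as in the sketch below. The model string (openai/gpt-4o-mini) and the OPENAI_API_KEY export are illustrative assumptions, not part of this change; substitute the LiteLLM provider/model you actually want to evaluate and whatever credentials that provider requires.

```
# Minimal sketch of the steps above, assuming conda and an OpenAI-hosted model.
conda create -n bluebench python=3.10
conda activate bluebench
pip install unitxt[bluebench]

# Hypothetical model choice in LiteLLM provider/model format; replace as needed,
# along with the environment variables your provider expects.
export OPENAI_API_KEY="..."
unitxt-evaluate --tasks "benchmarks.bluebench" \
    --model cross_provider \
    --model_args "model_name=openai/gpt-4o-mini,max_tokens=1024" \
    --output_path ./results/bluebench \
    --log_samples --trust_remote_code --batch_size 8

# Summarize the json result files written to ./results/bluebench
unitxt-summarize ./results/bluebench
```

Note that the directory passed to --output_path is the same one unitxt-summarize reads from, so the two commands must agree on it.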