jbnayahu committed
Commit 3ee3ca7 · unverified · 1 Parent(s): 86d72cb

Signed-off-by: Jonathan Bnayahu <[email protected]>

Files changed (2)
  1. app.py +0 -12
  2. src/about.py +10 -7
app.py CHANGED
@@ -4,8 +4,6 @@ from gradio_leaderboard import Leaderboard
 from apscheduler.schedulers.background import BackgroundScheduler
 
 from src.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
@@ -70,16 +68,6 @@ with gui:
 
     download_button.click(fn=generate_csv_file, outputs=csv_output)
 
-    with gr.Row():
-        with gr.Accordion("📙 Citation", open=False):
-            citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                label=CITATION_BUTTON_LABEL,
-                lines=20,
-                elem_id="citation-button",
-                show_copy_button=True,
-            )
-
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
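For orientation, here is a minimal, self-contained sketch of the wiring this commit leaves in app.py: a Gradio app with a CSV download button, and an APScheduler job that restarts the Space every 30 minutes. The `gr.Blocks` layout and the bodies of `generate_csv_file` and `restart_space` are assumptions stubbed in for illustration; only the names, the `click` binding, and the scheduler calls come from the diff.

```python
# Sketch only: stubs stand in for the real helpers referenced in the diff.
import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler


def generate_csv_file() -> str:
    """Stub: write the leaderboard to a CSV file and return its path."""
    path = "leaderboard.csv"
    with open(path, "w") as f:
        f.write("model,score\n")
    return path


def restart_space() -> None:
    """Stub: the real app restarts the Hugging Face Space here."""
    print("Restarting space...")


with gr.Blocks() as gui:  # assumed layout; the diff only shows `with gui:`
    download_button = gr.Button("Download results as CSV")
    csv_output = gr.File(label="CSV")
    # The citation accordion removed by this commit used to live here.
    download_button.click(fn=generate_csv_file, outputs=csv_output)

# Restart every 1800 seconds, as in the diff.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()

gui.launch()
```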
 
src/about.py CHANGED
@@ -85,15 +85,18 @@ table th:nth-of-type(3) {
 | QA Finance | <pre><p><b>FinQA</b></p>[Dataset](https://huggingface.co/datasets/ibm/finqa), [Paper](https://arxiv.org/abs/2109.00122), [Unitxt Card](https://www.unitxt.ai/en/latest/catalog/catalog.cards.fin_qa.html)</pre> | <p>A large-scale dataset with 2.8k financial reports for 8k Q&A pairs to study numerical reasoning with structured and unstructured evidence.</p>The FinQA dataset is designed to facilitate research and development in the area of question answering (QA) using financial texts. It consists of a subset of QA pairs from a larger dataset, originally created through a collaboration between researchers from the University of Pennsylvania, J.P. Morgan, and Amazon.The original dataset includes 8,281 QA pairs built against publicly available earnings reports of S&P 500 companies from 1999 to 2019 (FinQA: A Dataset of Numerical Reasoning over Financial Data.). This subset, specifically curated by Aiera, consists of 91 QA pairs. Each entry in the dataset includes a context, a question, and an answer, with each component manually verified for accuracy and formatting consistency. |
 
 ## Reproducibility
-To reproduce our results, here is the commands you can run:
-
+BlueBench is powered by the <a href="https://www.unitxt.ai">unitxt</a> library. To reproduce our results, start by installing Unitxt in a clean Python 3.10 virtual environment, along with the required dependencies:
 ```
+conda create -n bluebench python=3.10
+conda activate bluebench
 pip install unitxt[bluebench]
-unitxt-evaluate --tasks "benchmarks.bluebench" --model cross_provider --model_args "model_name=MODEL_TO_EVALUATE_IN_LITELLM_FORMAT,max_tokens=1024" --output_path ./results/bluebench --log_samples --trust_remote_code --batch_size 8
+```
+To perform the evaluation, run the following, replacing MODEL_FULL_NAME with the name of the provider and model you wish to evaluate, in LiteLLM format. Consult the LiteLLM <a href="https://docs.litellm.ai/docs/providers">providers catalog</a> for details. Make sure you set the required environment variables (e.g., API keys and credentials).
+```
+unitxt-evaluate --tasks "benchmarks.bluebench" --model cross_provider --model_args "model_name=MODEL_FULL_NAME,max_tokens=1024" --output_path ./results/bluebench --log_samples --trust_remote_code --batch_size 8
+```
+A successful run will result in two json files in the ./results/bluebench folder. To view a summary of the results, run the following:
+```
 unitxt-summarize ./results/bluebench
 ```
 """
-
-CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
-CITATION_BUTTON_TEXT = r"""
-"""