Commit 9fa29df
Parent(s): e5acaa3

style: Rename ScandEval to EuroEval

app.py CHANGED
@@ -26,18 +26,18 @@ INTRO_MARKDOWN = """
 
 This demo allows you to generate a radial plot comparing the performance of different
 language models on different tasks. It is based on the generative results from the
-[ScandEval benchmark](https://scandeval.com).
+[EuroEval benchmark](https://euroeval.com).
 """
 
 
 ABOUT_MARKDOWN = """
-## About the ScandEval Benchmark
+## About the EuroEval Benchmark
 
-The [ScandEval benchmark](https://scandeval.com) is used compare pretrained language
-models on tasks in Danish, […]
-[…]
+The [EuroEval benchmark](https://euroeval.com) is used compare pretrained language
+models on tasks in Danish, Dutch, English, Faroese, French, German, Icelandic, Italian,
+Norwegian and Swedish. The benchmark supports both encoder models (such as
 BERT) and generative models (such as GPT), and leaderboards for both kinds [are
-available](https://scandeval.com).
+available](https://euroeval.com).
 
 The generative models are evaluated using in-context learning with few-shot prompts.
 The few-shot examples are sampled randomly from the training split, and we benchmark
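The context lines above describe the evaluation protocol: generative models are scored via in-context learning, with the few-shot examples drawn at random from the training split. As a rough, hypothetical sketch of what such prompt construction can look like (the `text`/`label` field names and the prompt layout are assumptions, not EuroEval's actual templates):

```python
import random


def build_few_shot_prompt(
    train_examples: list[dict[str, str]],
    test_text: str,
    num_shots: int = 4,
    seed: int = 4242,
) -> str:
    """Assemble an in-context learning prompt from randomly sampled examples.

    The ``text``/``label`` field names and the layout are illustrative
    assumptions, not EuroEval's actual prompt templates.
    """
    rng = random.Random(seed)
    # Sample the few-shot examples randomly from the training split.
    shots = rng.sample(train_examples, k=num_shots)
    blocks = [f"Text: {ex['text']}\nLabel: {ex['label']}" for ex in shots]
    # The test document goes last, with the label left for the model to fill in.
    blocks.append(f"Text: {test_text}\nLabel:")
    return "\n\n".join(blocks)
```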
@@ -54,10 +54,8 @@ the worst performing models having rank scores close to 0.
 
 ## The Benchmark Datasets
 
-[…]
-[…]
-consists of 7 different tasks, each of which consists of 1-2 datasets. The tasks are
-the following:
+For each language, the benchmark consists of 7 different tasks, each of which consists
+of 1-2 datasets. The tasks are the following:
 
 ### Text Classification
 Given a piece of text, classify it into a number of classes. For this task we extract
@@ -110,7 +108,7 @@ Correlation Coefficient (MCC) as the evaluation metric.
 
 ## Citation
 
-If you use the ScandEval benchmark in your work, please cite [the
+If you use the EuroEval benchmark in your work, please cite [the
 paper](https://aclanthology.org/2023.nodalida-1.20):
 
 ```
@@ -741,16 +739,16 @@ def produce_radial_plot(
 
 
 def fetch_results() -> dict[Language, pd.DataFrame]:
-    """Fetch the results from the ScandEval benchmark.
+    """Fetch the results from the EuroEval benchmark.
 
     Returns:
         A dictionary of languages -> results-dataframes, whose indices are the
         models and columns are the tasks.
     """
-    logger.info("Fetching results from ScandEval benchmark...")
+    logger.info("Fetching results from EuroEval benchmark...")
 
     response = requests.get(
-        "https://raw.githubusercontent.com/[…]"
+        "https://raw.githubusercontent.com/EuroEval/leaderboards/refs/heads/main/results/results.jsonl"
    )
     response.raise_for_status()
     records = [
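The new raw URL serves a flat JSONL file, one result record per line. The hunk truncates just after `records = [`, so the parsing step itself is not visible; a minimal, self-contained reproduction of the fetch might look like this (the per-line `json.loads` is an assumption based on the `.jsonl` extension, not the app's exact code):

```python
import json

import requests

URL = (
    "https://raw.githubusercontent.com/EuroEval/leaderboards/"
    "refs/heads/main/results/results.jsonl"
)

response = requests.get(URL)
response.raise_for_status()

# One JSON object per non-empty line, as the .jsonl extension suggests.
records = [json.loads(line) for line in response.text.splitlines() if line.strip()]
print(f"Fetched {len(records)} result records")
```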
@@ -804,7 +802,7 @@ def fetch_results() -> dict[Language, pd.DataFrame]:
         ).dropna()
         results_dfs[language] = results_df
 
-    logger.info("Successfully fetched results from ScandEval benchmark.")
+    logger.info("Successfully fetched results from EuroEval benchmark.")
 
     return results_dfs
 
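The docstring in the function above promises a `dict[Language, pd.DataFrame]` with models as the index and tasks as the columns, and the `.dropna()` context line hints at a per-language pivot. A hedged sketch of that shape, assuming hypothetical `model`, `task`, `language` and `score` record fields (these names are guesses, not the app's actual schema):

```python
import pandas as pd


def records_to_results_dfs(records: list[dict]) -> dict[str, pd.DataFrame]:
    """Pivot flat result records into one models-by-tasks DataFrame per language.

    Assumes each record carries ``model``, ``task``, ``language`` and ``score``
    keys; these names are guesses, not the app's actual schema.
    """
    df = pd.DataFrame.from_records(records)
    results_dfs: dict[str, pd.DataFrame] = {}
    for language, group in df.groupby("language"):
        # Rows become models, columns become tasks; drop models with
        # missing task scores, mirroring the .dropna() in the diff above.
        results_dfs[language] = group.pivot_table(
            index="model", columns="task", values="score"
        ).dropna()
    return results_dfs
```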