Commit
·
67a665c
1
Parent(s):
7fcf611
add benchmark descriptions and links to About page
Browse files — src/about.py: +36 −10
src/about.py
CHANGED
@@ -12,13 +12,13 @@ class Task:
|
|
12 |
# ---------------------------------------------------
|
13 |
class Tasks(Enum):
|
14 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
15 |
-
task0 = Task("icelandic_winogrande_stringmatch", "exact_match,get-answer", "
|
16 |
task1 = Task("icelandic_sentences_ged_stringmatch", "exact_match,get-answer", "GED")
|
17 |
task2 = Task("icelandic_inflection_easy", "json_metric,get-answer", "Inflection (common)")
|
18 |
task3 = Task("icelandic_inflection_medium", "json_metric,get-answer", "Inflection (uncommon)")
|
19 |
task4 = Task("icelandic_inflection_hard", "json_metric,get-answer", "Inflection (rare)")
|
20 |
-
task5 = Task("icelandic_belebele", "exact_match,get-answer", "Belebele")
|
21 |
-
task6 = Task("icelandic_arc_challenge", "exact_match,get-answer", "ARC
|
22 |
|
23 |
NUM_FEWSHOT = 0 # Change with your few shot
|
24 |
# ---------------------------------------------------
|
@@ -35,10 +35,39 @@ Intro text
|
|
35 |
|
36 |
# Which evaluations are you running? how can people reproduce what you have?
|
37 |
LLM_BENCHMARKS_TEXT = f"""
|
38 |
-
##
|
39 |
-
|
40 |
-
|
41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
"""
|
44 |
|
@@ -72,6 +101,3 @@ Make sure you have followed the above steps first.
|
|
72 |
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
|
73 |
"""
|
74 |
|
75 |
-
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
76 |
-
CITATION_BUTTON_TEXT = r"""
|
77 |
-
"""
|
|
|
12 |
# ---------------------------------------------------
class Tasks(Enum):
    """Benchmark tasks shown on the Icelandic LLM leaderboard.

    Each member wraps a Task(task_key, metric_key, display_name) triple:
    - task_key:   the task's key in the results json file
    - metric_key: the metric's key in the results json file
    - display_name: the column name to display in the leaderboard
    """
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("icelandic_winogrande_stringmatch", "exact_match,get-answer", "WinoGrande-IS")
    task1 = Task("icelandic_sentences_ged_stringmatch", "exact_match,get-answer", "GED")
    # Inflection tasks are scored with a json-producing metric, not plain string match.
    task2 = Task("icelandic_inflection_easy", "json_metric,get-answer", "Inflection (common)")
    task3 = Task("icelandic_inflection_medium", "json_metric,get-answer", "Inflection (uncommon)")
    task4 = Task("icelandic_inflection_hard", "json_metric,get-answer", "Inflection (rare)")
    task5 = Task("icelandic_belebele", "exact_match,get-answer", "Belebele (IS)")
    task6 = Task("icelandic_arc_challenge", "exact_match,get-answer", "ARC-Challenge-IS")

NUM_FEWSHOT = 0  # Change with your few shot
# ---------------------------------------------------
|
|
|
35 |
|
36 |
# Text for the leaderboard's About page: which evaluations are run and how to
# reproduce them. Rendered as markdown; all tasks are generation tasks whose
# output is compared against the expected answer (see task list below).
LLM_BENCHMARKS_TEXT = f"""
## Benchmark tasks
The Icelandic LLM leaderboard evaluates models on several tasks. All of them are set up as generation tasks, where the model's output is compared to the expected output.
This means that models that have not been instruction fine-tuned might perform poorly on these tasks.

The following tasks are evaluated:

### WinoGrande-IS
The Icelandic WinoGrande task is a human-translated and localized version of the ~1000 test set examples in the WinoGrande task in English.
Each example consists of a sentence with a blank, and two answer choices for the blank. The task is to choose the correct answer choice using coreference resolution.
The benchmark is designed to test the model's ability to use knowledge and common sense reasoning in Icelandic.
The Icelandic WinoGrande dataset is described in more detail in the IceBERT paper (https://aclanthology.org/2022.lrec-1.464.pdf).
- Link to dataset: https://huggingface.co/datasets/mideind/icelandic-winogrande

### GED
This is a benchmark for binary sentence-level Icelandic grammatical error detection, adapted from the Icelandic Error Corpus (IEC) and contains 200 examples.
Each example consists of a sentence that may contain one or more grammatical errors, and the task is to predict whether the sentence contains an error.
- Link to dataset: https://huggingface.co/datasets/mideind/icelandic-sentences-gec

### Inflection benchmarks
The inflection benchmarks test the model's ability to generate inflected forms of Icelandic adjective-noun pairs. They are divided into three levels of difficulty by
commonness: common (100 examples), uncommon (100 examples), and rare (100 examples). The model gets a point for an example if it generates error-free json with the
correct inflected forms in all cases, singular and plural.
- Link to dataset (common): https://huggingface.co/datasets/mideind/icelandic-inflection-easy
- Link to dataset (uncommon): https://huggingface.co/datasets/mideind/icelandic-inflection-medium
- Link to dataset (rare): https://huggingface.co/datasets/mideind/icelandic-inflection-hard

### Belebele (IS)
This is the Icelandic subset (900 examples) of the Belebele benchmark, a multiple-choice reading comprehension task. The task is to answer questions about a given passage.
- Link to dataset: https://huggingface.co/datasets/facebook/belebele

### ARC-Challenge-IS
A machine-translated version of the ARC-Challenge multiple-choice question-answering dataset. For this benchmark, we use the test set which contains 1.23k examples.
- Link to dataset: https://huggingface.co/datasets/mideind/icelandic-arc-challenge

"""
|
73 |
|
|
|
101 |
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
|
102 |
"""
|
103 |
|
|
|
|
|
|