Commit
·
80793c6
1
Parent(s):
117d89c
add submission instructions to about page
Browse files- src/about.py +5 -2
src/about.py
CHANGED
@@ -12,7 +12,7 @@ class Task:
|
|
12 |
# ---------------------------------------------------
|
13 |
class Tasks(Enum):
|
14 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
15 |
-
task0 = Task("icelandic_winogrande_stringmatch", "exact_match,get-answer", "WinoGrande-IS")
|
16 |
task1 = Task("icelandic_sentences_ged_stringmatch", "exact_match,get-answer", "GED")
|
17 |
task2 = Task("icelandic_inflection_easy", "json_metric,get-answer", "Inflection (common)")
|
18 |
task3 = Task("icelandic_inflection_medium", "json_metric,get-answer", "Inflection (uncommon)")
|
@@ -33,6 +33,9 @@ INTRODUCTION_TEXT = """
|
|
33 |
|
34 |
# Which evaluations are you running? How can people reproduce what you have?
|
35 |
LLM_BENCHMARKS_TEXT = f"""
|
|
|
|
|
|
|
36 |
## Benchmark tasks
|
37 |
The Icelandic LLM leaderboard evaluates models on several tasks. All of them are set up as generation tasks, where the model's output is compared to the expected output.
|
38 |
This means that models that have not been instruction fine-tuned might perform poorly on these tasks.
|
@@ -42,7 +45,7 @@ The following tasks are evaluated:
|
|
42 |
### WinoGrande-IS
|
43 |
The Icelandic WinoGrande task is a human-translated and localized version of the ~1000 test set examples in the WinoGrande task in English.
|
44 |
Each example consists of a sentence with a blank, and two answer choices for the blank. The task is to choose the correct answer choice using coreference resolution.
|
45 |
-
The benchmark is designed to test the model's ability to use knowledge and common-sense reasoning in Icelandic.
|
46 |
The Icelandic WinoGrande dataset is described in more detail in the IceBERT paper (https://aclanthology.org/2022.lrec-1.464.pdf).
|
47 |
- Link to dataset: https://huggingface.co/datasets/mideind/icelandic-winogrande
|
48 |
|
|
|
12 |
# ---------------------------------------------------
|
13 |
class Tasks(Enum):
|
14 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
15 |
+
task0 = Task("icelandic_winogrande_stringmatch", "exact_match,get-answer", "WinoGrande-IS (3-shot)")
|
16 |
task1 = Task("icelandic_sentences_ged_stringmatch", "exact_match,get-answer", "GED")
|
17 |
task2 = Task("icelandic_inflection_easy", "json_metric,get-answer", "Inflection (common)")
|
18 |
task3 = Task("icelandic_inflection_medium", "json_metric,get-answer", "Inflection (uncommon)")
|
|
|
33 |
|
34 |
# Which evaluations are you running? How can people reproduce what you have?
|
35 |
LLM_BENCHMARKS_TEXT = f"""
|
36 |
+
## New submissions
|
37 |
+
Do you want your model to be included on the leaderboard? Open a discussion on this repository with the details of your model and we will get back to you.
|
38 |
+
|
39 |
## Benchmark tasks
|
40 |
The Icelandic LLM leaderboard evaluates models on several tasks. All of them are set up as generation tasks, where the model's output is compared to the expected output.
|
41 |
This means that models that have not been instruction fine-tuned might perform poorly on these tasks.
|
|
|
45 |
### WinoGrande-IS
|
46 |
The Icelandic WinoGrande task is a human-translated and localized version of the ~1000 test set examples in the WinoGrande task in English.
|
47 |
Each example consists of a sentence with a blank, and two answer choices for the blank. The task is to choose the correct answer choice using coreference resolution.
|
48 |
+
The benchmark is designed to test the model's ability to use knowledge and common-sense reasoning in Icelandic. For this benchmark, we use 3-shot evaluation.
|
49 |
The Icelandic WinoGrande dataset is described in more detail in the IceBERT paper (https://aclanthology.org/2022.lrec-1.464.pdf).
|
50 |
- Link to dataset: https://huggingface.co/datasets/mideind/icelandic-winogrande
|
51 |
|