gardarjuto committed on
Commit
67a665c
·
1 Parent(s): 7fcf611

add benchmark descriptions and links to About page

Browse files
Files changed (1) hide show
  1. src/about.py +36 -10
src/about.py CHANGED
@@ -12,13 +12,13 @@ class Task:
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
- task0 = Task("icelandic_winogrande_stringmatch", "exact_match,get-answer", "Winogrande")
16
  task1 = Task("icelandic_sentences_ged_stringmatch", "exact_match,get-answer", "GED")
17
  task2 = Task("icelandic_inflection_easy", "json_metric,get-answer", "Inflection (common)")
18
  task3 = Task("icelandic_inflection_medium", "json_metric,get-answer", "Inflection (uncommon)")
19
  task4 = Task("icelandic_inflection_hard", "json_metric,get-answer", "Inflection (rare)")
20
- task5 = Task("icelandic_belebele", "exact_match,get-answer", "Belebele")
21
- task6 = Task("icelandic_arc_challenge", "exact_match,get-answer", "ARC Challenge")
22
 
23
  NUM_FEWSHOT = 0 # Change with your few shot
24
  # ---------------------------------------------------
@@ -35,10 +35,39 @@ Intro text
35
 
36
  # Which evaluations are you running? how can people reproduce what you have?
37
  LLM_BENCHMARKS_TEXT = f"""
38
- ## How it works
39
-
40
- ## Reproducibility
41
- To reproduce our results, here is the commands you can run:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  """
44
 
@@ -72,6 +101,3 @@ Make sure you have followed the above steps first.
72
  If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
73
  """
74
 
75
- CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
76
- CITATION_BUTTON_TEXT = r"""
77
- """
 
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
+ task0 = Task("icelandic_winogrande_stringmatch", "exact_match,get-answer", "WinoGrande-IS")
16
  task1 = Task("icelandic_sentences_ged_stringmatch", "exact_match,get-answer", "GED")
17
  task2 = Task("icelandic_inflection_easy", "json_metric,get-answer", "Inflection (common)")
18
  task3 = Task("icelandic_inflection_medium", "json_metric,get-answer", "Inflection (uncommon)")
19
  task4 = Task("icelandic_inflection_hard", "json_metric,get-answer", "Inflection (rare)")
20
+ task5 = Task("icelandic_belebele", "exact_match,get-answer", "Belebele (IS)")
21
+ task6 = Task("icelandic_arc_challenge", "exact_match,get-answer", "ARC-Challenge-IS")
22
 
23
  NUM_FEWSHOT = 0 # Change with your few shot
24
  # ---------------------------------------------------
 
35
 
36
  # Which evaluations are you running? how can people reproduce what you have?
37
  LLM_BENCHMARKS_TEXT = f"""
38
+ ## Benchmark tasks
39
+ The Icelandic LLM leaderboard evaluates models on several tasks. All of them are set up as generation tasks, where the model's output is compared to the expected output.
40
+ This means that models that have not been instruction fine-tuned might perform poorly on these tasks.
41
+
42
+ The following tasks are evaluated:
43
+
44
+ ### WinoGrande-IS
45
+ The Icelandic WinoGrande task is a human-translated and localized version of the ~1000 test set examples in the WinoGrande task in English.
46
+ Each example consists of a sentence with a blank, and two answer choices for the blank. The task is to choose the correct answer choice using coreference resolution.
47
+ The benchmark is designed to test the model's ability to use knowledge and common sense reasoning in Icelandic.
48
+ The Icelandic WinoGrande dataset is described in more detail in the IceBERT paper (https://aclanthology.org/2022.lrec-1.464.pdf).
49
+ - Link to dataset: https://huggingface.co/datasets/mideind/icelandic-winogrande
50
+
51
+ ### GED
52
+ This is a benchmark for binary sentence-level Icelandic grammatical error detection, adapted from the Icelandic Error Corpus (IEC) and contains 200 examples.
53
+ Each example consists of a sentence that may contain one or more grammatical errors, and the task is to predict whether the sentence contains an error.
54
+ - Link to dataset: https://huggingface.co/datasets/mideind/icelandic-sentences-gec
55
+
56
+ ### Inflection benchmarks
57
+ The inflection benchmarks test the model's ability to generate inflected forms of Icelandic adjective-noun pairs. They are divided into three levels of difficulty by
58
+ commonness: common (100 examples), uncommon (100 examples), and rare (100 examples). The model gets a point for an example if it generates error-free JSON with the
59
+ correct inflected forms in all cases, singular and plural.
60
+ - Link to dataset (common): https://huggingface.co/datasets/mideind/icelandic-inflection-easy
61
+ - Link to dataset (uncommon): https://huggingface.co/datasets/mideind/icelandic-inflection-medium
62
+ - Link to dataset (rare): https://huggingface.co/datasets/mideind/icelandic-inflection-hard
63
+
64
+ ### Belebele (IS)
65
+ This is the Icelandic subset (900 examples) of the Belebele benchmark, a multiple-choice reading comprehension task. The task is to answer questions about a given passage.
66
+ - Link to dataset: https://huggingface.co/datasets/facebook/belebele
67
+
68
+ ### ARC-Challenge-IS
69
+ A machine-translated version of the ARC-Challenge multiple-choice question-answering dataset. For this benchmark, we use the test set which contains 1.23k examples.
70
+ - Link to dataset: https://huggingface.co/datasets/mideind/icelandic-arc-challenge
71
 
72
  """
73
 
 
101
  If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
102
  """
103