Update app.py
Browse files
app.py
CHANGED
@@ -164,7 +164,7 @@ SUMMARISATION = Task(name="summarisation", metric="bertscore")
|
|
164 |
KNOWLEDGE = Task(name="knowledge", metric="mcc")
|
165 |
REASONING = Task(name="reasoning", metric="mcc")
|
166 |
GRAMMAR = Task(name="grammar", metric="mcc")
|
167 |
-
|
168 |
TEXT_CLASSIFICATION = Task(name="text classification", metric="mcc")
|
169 |
INFORMATION_EXTRACTION = Task(name="information extraction", metric="micro_f1_no_misc")
|
170 |
ALL_TASKS = [obj for obj in globals().values() if isinstance(obj, Task)]
|
@@ -203,13 +203,13 @@ DATASETS = [
|
|
203 |
Dataset(name="scala-de", language=GERMAN, task=GRAMMAR),
|
204 |
Dataset(name="scala-nl", language=DUTCH, task=GRAMMAR),
|
205 |
Dataset(name="scala-en", language=ENGLISH, task=GRAMMAR),
|
206 |
-
Dataset(name="scandiqa-da", language=DANISH, task=
|
207 |
-
Dataset(name="norquad", language=NORWEGIAN, task=
|
208 |
-
Dataset(name="scandiqa-sv", language=SWEDISH, task=
|
209 |
-
Dataset(name="nqii", language=ICELANDIC, task=
|
210 |
-
Dataset(name="germanquad", language=GERMAN, task=
|
211 |
-
Dataset(name="squad", language=ENGLISH, task=
|
212 |
-
Dataset(name="squad-nl", language=DUTCH, task=
|
213 |
Dataset(name="nordjylland-news", language=DANISH, task=SUMMARISATION),
|
214 |
Dataset(name="mlsum", language=GERMAN, task=SUMMARISATION),
|
215 |
Dataset(name="rrn", language=ICELANDIC, task=SUMMARISATION),
|
@@ -671,11 +671,6 @@ def fetch_results() -> dict[Language, pd.DataFrame]:
|
|
671 |
for record in records:
|
672 |
model_name = record["model"]
|
673 |
|
674 |
-
# Manual fix for OpenAI models: Only keep the validation split results
|
675 |
-
if "gpt-3.5" in model_name or "gpt-4" in model_name:
|
676 |
-
if not record.get("validation_split", False):
|
677 |
-
continue
|
678 |
-
|
679 |
dataset_name = record["dataset"]
|
680 |
if dataset_name in possible_dataset_names:
|
681 |
dataset = next(
|
|
|
164 |
KNOWLEDGE = Task(name="knowledge", metric="mcc")
|
165 |
REASONING = Task(name="reasoning", metric="mcc")
|
166 |
GRAMMAR = Task(name="grammar", metric="mcc")
|
167 |
+
READING_COMPREHENSION = Task(name="reading comprehension", metric="em")
|
168 |
TEXT_CLASSIFICATION = Task(name="text classification", metric="mcc")
|
169 |
INFORMATION_EXTRACTION = Task(name="information extraction", metric="micro_f1_no_misc")
|
170 |
ALL_TASKS = [obj for obj in globals().values() if isinstance(obj, Task)]
|
|
|
203 |
Dataset(name="scala-de", language=GERMAN, task=GRAMMAR),
|
204 |
Dataset(name="scala-nl", language=DUTCH, task=GRAMMAR),
|
205 |
Dataset(name="scala-en", language=ENGLISH, task=GRAMMAR),
|
206 |
+
Dataset(name="scandiqa-da", language=DANISH, task=READING_COMPREHENSION),
|
207 |
+
Dataset(name="norquad", language=NORWEGIAN, task=READING_COMPREHENSION),
|
208 |
+
Dataset(name="scandiqa-sv", language=SWEDISH, task=READING_COMPREHENSION),
|
209 |
+
Dataset(name="nqii", language=ICELANDIC, task=READING_COMPREHENSION),
|
210 |
+
Dataset(name="germanquad", language=GERMAN, task=READING_COMPREHENSION),
|
211 |
+
Dataset(name="squad", language=ENGLISH, task=READING_COMPREHENSION),
|
212 |
+
Dataset(name="squad-nl", language=DUTCH, task=READING_COMPREHENSION),
|
213 |
Dataset(name="nordjylland-news", language=DANISH, task=SUMMARISATION),
|
214 |
Dataset(name="mlsum", language=GERMAN, task=SUMMARISATION),
|
215 |
Dataset(name="rrn", language=ICELANDIC, task=SUMMARISATION),
|
|
|
671 |
for record in records:
|
672 |
model_name = record["model"]
|
673 |
|
|
|
|
|
|
|
|
|
|
|
674 |
dataset_name = record["dataset"]
|
675 |
if dataset_name in possible_dataset_names:
|
676 |
dataset = next(
|