jbnayahu committed on
Commit 9ee85e2 · unverified · 1 Parent(s): 84652c6

Signed-off-by: Jonathan Bnayahu <[email protected]>

results/bluebench/2025-08-03T14-35-25_evaluation_results.json ADDED
@@ -0,0 +1,1282 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-08-03T18:35:20.055545Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=azure/Azure/o1-ncf,max_tokens=1024",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "azure/Azure/o1-ncf",
30
+ "model_args": {
31
+ "max_tokens": 1024
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.26.5",
45
+ "unitxt_commit_hash": "N/A",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "tiktoken": "0.9.0",
55
+ "charset-normalizer": "3.4.2",
56
+ "nvidia-cuda-runtime-cu12": "12.6.77",
57
+ "pyarrow": "21.0.0",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "httpcore": "1.0.9",
61
+ "pip": "25.2",
62
+ "certifi": "2025.7.14",
63
+ "evaluate": "0.4.5",
64
+ "Jinja2": "3.1.6",
65
+ "jsonschema-specifications": "2025.4.1",
66
+ "pydantic_core": "2.33.2",
67
+ "nvidia-cusparse-cu12": "12.5.4.2",
68
+ "aiosignal": "1.4.0",
69
+ "yarl": "1.20.1",
70
+ "unitxt": "1.26.5",
71
+ "jsonschema": "4.25.0",
72
+ "portalocker": "3.2.0",
73
+ "multiprocess": "0.70.16",
74
+ "nvidia-nvjitlink-cu12": "12.6.85",
75
+ "nvidia-cublas-cu12": "12.6.4.1",
76
+ "pydantic": "2.11.7",
77
+ "async-timeout": "5.0.1",
78
+ "annotated-types": "0.7.0",
79
+ "rouge_score": "0.1.2",
80
+ "contourpy": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "matplotlib": "3.10.5",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "h11": "0.16.0",
87
+ "zipp": "3.19.2",
88
+ "tzdata": "2025.2",
89
+ "bert-score": "0.3.13",
90
+ "setuptools": "80.9.0",
91
+ "referencing": "0.36.2",
92
+ "sacrebleu": "2.5.1",
93
+ "filelock": "3.18.0",
94
+ "urllib3": "2.5.0",
95
+ "scipy": "1.15.3",
96
+ "nvidia-nccl-cu12": "2.26.2",
97
+ "kiwisolver": "1.4.8",
98
+ "networkx": "3.4.2",
99
+ "typing-inspection": "0.4.1",
100
+ "sniffio": "1.3.1",
101
+ "rpds-py": "0.26.0",
102
+ "nvidia-curand-cu12": "10.3.7.77",
103
+ "litellm": "1.74.12",
104
+ "pillow": "11.3.0",
105
+ "datasets": "3.6.0",
106
+ "nvidia-cusolver-cu12": "11.7.1.2",
107
+ "cycler": "0.12.1",
108
+ "tokenizers": "0.21.4",
109
+ "distro": "1.9.0",
110
+ "idna": "3.10",
111
+ "MarkupSafe": "3.0.2",
112
+ "frozenlist": "1.7.0",
113
+ "pyparsing": "3.2.3",
114
+ "regex": "2025.7.34",
115
+ "jiter": "0.10.0",
116
+ "importlib_metadata": "8.0.0",
117
+ "packaging": "24.2",
118
+ "psutil": "7.0.0",
119
+ "mecab-ko-dic": "1.0.0",
120
+ "joblib": "1.5.1",
121
+ "transformers": "4.54.1",
122
+ "fsspec": "2025.3.0",
123
+ "scikit-learn": "1.7.1",
124
+ "dill": "0.3.8",
125
+ "wheel": "0.45.1",
126
+ "nvidia-nvtx-cu12": "12.6.77",
127
+ "nvidia-cusparselt-cu12": "0.6.3",
128
+ "lxml": "6.0.0",
129
+ "propcache": "0.3.2",
130
+ "numpy": "2.2.6",
131
+ "mpmath": "1.3.0",
132
+ "conllu": "6.0.0",
133
+ "safetensors": "0.5.3",
134
+ "requests": "2.32.4",
135
+ "fonttools": "4.59.0",
136
+ "tabulate": "0.9.0",
137
+ "typing_extensions": "4.12.2",
138
+ "absl-py": "2.3.1",
139
+ "accelerate": "1.9.0",
140
+ "nvidia-cufft-cu12": "11.3.0.4",
141
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
142
+ "click": "8.2.1",
143
+ "attrs": "25.3.0",
144
+ "exceptiongroup": "1.3.0",
145
+ "tenacity": "9.1.2",
146
+ "huggingface-hub": "0.34.3",
147
+ "pytz": "2025.2",
148
+ "aiohappyeyeballs": "2.6.1",
149
+ "python-dateutil": "2.9.0.post0",
150
+ "torch": "2.7.1",
151
+ "python-dotenv": "1.1.1",
152
+ "multidict": "6.6.3",
153
+ "httpx": "0.28.1",
154
+ "aiohttp": "3.12.15",
155
+ "xxhash": "3.5.0",
156
+ "PyYAML": "6.0.2",
157
+ "colorama": "0.4.6",
158
+ "openai": "1.98.0",
159
+ "threadpoolctl": "3.6.0",
160
+ "nvidia-cudnn-cu12": "9.5.1.17",
161
+ "pandas": "2.3.1",
162
+ "hf-xet": "1.1.5",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 1.0,
180
+ "accuracy_ci_low": 1.0,
181
+ "accuracy_ci_high": 1.0,
182
+ "score_name": "accuracy",
183
+ "score": 1.0,
184
+ "score_ci_high": 1.0,
185
+ "score_ci_low": 1.0,
186
+ "num_of_instances": 9
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 1.0,
190
+ "accuracy_ci_low": 1.0,
191
+ "accuracy_ci_high": 1.0,
192
+ "score_name": "accuracy",
193
+ "score": 1.0,
194
+ "score_ci_high": 1.0,
195
+ "score_ci_low": 1.0,
196
+ "num_of_instances": 9
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 1.0,
200
+ "accuracy_ci_low": 1.0,
201
+ "accuracy_ci_high": 1.0,
202
+ "score_name": "accuracy",
203
+ "score": 1.0,
204
+ "score_ci_high": 1.0,
205
+ "score_ci_low": 1.0,
206
+ "num_of_instances": 9
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 1.0,
210
+ "accuracy_ci_low": 1.0,
211
+ "accuracy_ci_high": 1.0,
212
+ "score_name": "accuracy",
213
+ "score": 1.0,
214
+ "score_ci_high": 1.0,
215
+ "score_ci_low": 1.0,
216
+ "num_of_instances": 9
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 1.0,
220
+ "accuracy_ci_low": 1.0,
221
+ "accuracy_ci_high": 1.0,
222
+ "score_name": "accuracy",
223
+ "score": 1.0,
224
+ "score_ci_high": 1.0,
225
+ "score_ci_low": 1.0,
226
+ "num_of_instances": 9
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 1.0,
230
+ "accuracy_ci_low": 1.0,
231
+ "accuracy_ci_high": 1.0,
232
+ "score_name": "accuracy",
233
+ "score": 1.0,
234
+ "score_ci_high": 1.0,
235
+ "score_ci_low": 1.0,
236
+ "num_of_instances": 9
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 1.0,
240
+ "accuracy_ci_low": 1.0,
241
+ "accuracy_ci_high": 1.0,
242
+ "score_name": "accuracy",
243
+ "score": 1.0,
244
+ "score_ci_high": 1.0,
245
+ "score_ci_low": 1.0,
246
+ "num_of_instances": 9
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 1.0,
250
+ "accuracy_ci_low": 1.0,
251
+ "accuracy_ci_high": 1.0,
252
+ "score_name": "accuracy",
253
+ "score": 1.0,
254
+ "score_ci_high": 1.0,
255
+ "score_ci_low": 1.0,
256
+ "num_of_instances": 9
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 1.0,
260
+ "accuracy_ci_low": 1.0,
261
+ "accuracy_ci_high": 1.0,
262
+ "score_name": "accuracy",
263
+ "score": 1.0,
264
+ "score_ci_high": 1.0,
265
+ "score_ci_low": 1.0,
266
+ "num_of_instances": 9
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.4444444444444444,
270
+ "accuracy_ci_low": 0.1111111111111111,
271
+ "accuracy_ci_high": 0.7777777777777778,
272
+ "score_name": "accuracy",
273
+ "score": 0.4444444444444444,
274
+ "score_ci_high": 0.7777777777777778,
275
+ "score_ci_low": 0.1111111111111111,
276
+ "num_of_instances": 9
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 1.0,
280
+ "accuracy_ci_low": 1.0,
281
+ "accuracy_ci_high": 1.0,
282
+ "score_name": "accuracy",
283
+ "score": 1.0,
284
+ "score_ci_high": 1.0,
285
+ "score_ci_low": 1.0,
286
+ "num_of_instances": 9
287
+ },
288
+ "score": 0.9494949494949495,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.34375,
296
+ "score": 0.34375,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.34375,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.8695652173913043,
307
+ "f1_Organization": 0.7868852459016394,
308
+ "f1_Location": 0.723404255319149,
309
+ "f1_macro": 0.7932849062040309,
310
+ "recall_macro": 0.8116804692891649,
311
+ "precision_macro": 0.7786561264822135,
312
+ "in_classes_support": 1.0,
313
+ "f1_micro": 0.7922077922077922,
314
+ "recall_micro": 0.8133333333333334,
315
+ "precision_micro": 0.7721518987341772,
316
+ "score": 0.7922077922077922,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.7030169722972905,
319
+ "score_ci_high": 0.848370061994058,
320
+ "f1_micro_ci_low": 0.7030169722972905,
321
+ "f1_micro_ci_high": 0.848370061994058
322
+ },
323
+ "score": 0.7922077922077922,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.8571428571428571,
330
+ "accuracy_ci_low": 0.42857142857142855,
331
+ "accuracy_ci_high": 1.0,
332
+ "score_name": "accuracy",
333
+ "score": 0.8571428571428571,
334
+ "score_ci_high": 1.0,
335
+ "score_ci_low": 0.42857142857142855,
336
+ "num_of_instances": 7
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.8571428571428571,
340
+ "accuracy_ci_low": 0.2530277506117974,
341
+ "accuracy_ci_high": 1.0,
342
+ "score_name": "accuracy",
343
+ "score": 0.8571428571428571,
344
+ "score_ci_high": 1.0,
345
+ "score_ci_low": 0.2530277506117974,
346
+ "num_of_instances": 7
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.2857142857142857,
350
+ "accuracy_ci_low": 0.0,
351
+ "accuracy_ci_high": 0.7142857142857143,
352
+ "score_name": "accuracy",
353
+ "score": 0.2857142857142857,
354
+ "score_ci_high": 0.7142857142857143,
355
+ "score_ci_low": 0.0,
356
+ "num_of_instances": 7
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.8571428571428571,
360
+ "accuracy_ci_low": 0.42857142857142855,
361
+ "accuracy_ci_high": 1.0,
362
+ "score_name": "accuracy",
363
+ "score": 0.8571428571428571,
364
+ "score_ci_high": 1.0,
365
+ "score_ci_low": 0.42857142857142855,
366
+ "num_of_instances": 7
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.8571428571428571,
370
+ "accuracy_ci_low": 0.42857142857142855,
371
+ "accuracy_ci_high": 1.0,
372
+ "score_name": "accuracy",
373
+ "score": 0.8571428571428571,
374
+ "score_ci_high": 1.0,
375
+ "score_ci_low": 0.42857142857142855,
376
+ "num_of_instances": 7
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.7142857142857143,
380
+ "accuracy_ci_low": 0.2857142857142857,
381
+ "accuracy_ci_high": 1.0,
382
+ "score_name": "accuracy",
383
+ "score": 0.7142857142857143,
384
+ "score_ci_high": 1.0,
385
+ "score_ci_low": 0.2857142857142857,
386
+ "num_of_instances": 7
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.5714285714285714,
390
+ "accuracy_ci_low": 0.14285714285714285,
391
+ "accuracy_ci_high": 0.8571428571428571,
392
+ "score_name": "accuracy",
393
+ "score": 0.5714285714285714,
394
+ "score_ci_high": 0.8571428571428571,
395
+ "score_ci_low": 0.14285714285714285,
396
+ "num_of_instances": 7
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.2857142857142857,
400
+ "accuracy_ci_low": 0.0,
401
+ "accuracy_ci_high": 0.7142857142857143,
402
+ "score_name": "accuracy",
403
+ "score": 0.2857142857142857,
404
+ "score_ci_high": 0.7142857142857143,
405
+ "score_ci_low": 0.0,
406
+ "num_of_instances": 7
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.7142857142857143,
410
+ "accuracy_ci_low": 0.2857142857142857,
411
+ "accuracy_ci_high": 1.0,
412
+ "score_name": "accuracy",
413
+ "score": 0.7142857142857143,
414
+ "score_ci_high": 1.0,
415
+ "score_ci_low": 0.2857142857142857,
416
+ "num_of_instances": 7
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 1.0,
420
+ "accuracy_ci_low": 1.0,
421
+ "accuracy_ci_high": 1.0,
422
+ "score_name": "accuracy",
423
+ "score": 1.0,
424
+ "score_ci_high": 1.0,
425
+ "score_ci_low": 1.0,
426
+ "num_of_instances": 7
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.7142857142857143,
430
+ "accuracy_ci_low": 0.2857142857142857,
431
+ "accuracy_ci_high": 1.0,
432
+ "score_name": "accuracy",
433
+ "score": 0.7142857142857143,
434
+ "score_ci_high": 1.0,
435
+ "score_ci_low": 0.2857142857142857,
436
+ "num_of_instances": 7
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.7142857142857143,
440
+ "accuracy_ci_low": 0.2857142857142857,
441
+ "accuracy_ci_high": 1.0,
442
+ "score_name": "accuracy",
443
+ "score": 0.7142857142857143,
444
+ "score_ci_high": 1.0,
445
+ "score_ci_low": 0.2857142857142857,
446
+ "num_of_instances": 7
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.7142857142857143,
450
+ "accuracy_ci_low": 0.2857142857142857,
451
+ "accuracy_ci_high": 1.0,
452
+ "score_name": "accuracy",
453
+ "score": 0.7142857142857143,
454
+ "score_ci_high": 1.0,
455
+ "score_ci_low": 0.2857142857142857,
456
+ "num_of_instances": 7
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.8571428571428571,
460
+ "accuracy_ci_low": 0.2530277506117974,
461
+ "accuracy_ci_high": 1.0,
462
+ "score_name": "accuracy",
463
+ "score": 0.8571428571428571,
464
+ "score_ci_high": 1.0,
465
+ "score_ci_low": 0.2530277506117974,
466
+ "num_of_instances": 7
467
+ },
468
+ "score": 0.7142857142857143,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.3633333333333333,
475
+ "f1_suggestive": 0.25,
476
+ "f1_generic": 0.5,
477
+ "f1_fanciful": 0.4,
478
+ "f1_descriptive": 0.6666666666666666,
479
+ "f1_arbitrary": 0.0,
480
+ "f1_macro_ci_low": 0.18019056979170386,
481
+ "f1_macro_ci_high": 0.6700601186500376,
482
+ "score_name": "f1_micro",
483
+ "score": 0.38461538461538464,
484
+ "score_ci_high": 0.6317641031035699,
485
+ "score_ci_low": 0.16,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.25,
488
+ "accuracy_ci_low": 0.1,
489
+ "accuracy_ci_high": 0.5,
490
+ "f1_micro": 0.38461538461538464,
491
+ "f1_micro_ci_low": 0.16,
492
+ "f1_micro_ci_high": 0.6317641031035699
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.17647058823529413,
496
+ "f1_no": 0.35294117647058826,
497
+ "f1_yes": 0.0,
498
+ "f1_macro_ci_low": 0.0625,
499
+ "f1_macro_ci_high": 0.34407383963381494,
500
+ "score_name": "f1_micro",
501
+ "score": 0.2608695652173913,
502
+ "score_ci_high": 0.5714285714285714,
503
+ "score_ci_low": 0.09523809523809523,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.15,
506
+ "accuracy_ci_low": 0.05,
507
+ "accuracy_ci_high": 0.4,
508
+ "f1_micro": 0.2608695652173913,
509
+ "f1_micro_ci_low": 0.09523809523809523,
510
+ "f1_micro_ci_high": 0.5714285714285714
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.14285714285714285,
514
+ "f1_conclusion": 0.0,
515
+ "f1_issue": 0.25,
516
+ "f1_decree": 0.0,
517
+ "f1_rule": 0.0,
518
+ "f1_analysis": 0.0,
519
+ "f1_facts": 0.75,
520
+ "f1_procedural history": 0.0,
521
+ "f1_macro_ci_low": 0.027938635003925405,
522
+ "f1_macro_ci_high": 0.25,
523
+ "score_name": "f1_micro",
524
+ "score": 0.23529411764705882,
525
+ "score_ci_high": 0.451025257708528,
526
+ "score_ci_low": 0.058823529411764705,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.2,
529
+ "accuracy_ci_low": 0.05,
530
+ "accuracy_ci_high": 0.4,
531
+ "f1_micro": 0.23529411764705882,
532
+ "f1_micro_ci_low": 0.058823529411764705,
533
+ "f1_micro_ci_high": 0.451025257708528
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.7013888888888888,
537
+ "f1_yes": 0.7777777777777778,
538
+ "f1_no": 0.625,
539
+ "f1_macro_ci_low": 0.4822715139171238,
540
+ "f1_macro_ci_high": 0.8740955338427088,
541
+ "score_name": "f1_micro",
542
+ "score": 0.7058823529411765,
543
+ "score_ci_high": 0.8648648648648649,
544
+ "score_ci_low": 0.4827586206896552,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.6,
547
+ "accuracy_ci_low": 0.4,
548
+ "accuracy_ci_high": 0.8,
549
+ "f1_micro": 0.7058823529411765,
550
+ "f1_micro_ci_low": 0.4827586206896552,
551
+ "f1_micro_ci_high": 0.8648648648648649
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.9705882352941176,
555
+ "f1_yes": 0.9411764705882353,
556
+ "f1_no": 1.0,
557
+ "f1_macro_ci_low": 0.8333333333333333,
558
+ "f1_macro_ci_high": 1.0,
559
+ "score_name": "f1_micro",
560
+ "score": 0.9743589743589743,
561
+ "score_ci_high": 1.0,
562
+ "score_ci_low": 0.8571428571428571,
563
+ "num_of_instances": 20,
564
+ "accuracy": 0.95,
565
+ "accuracy_ci_low": 0.75,
566
+ "accuracy_ci_high": 1.0,
567
+ "f1_micro": 0.9743589743589743,
568
+ "f1_micro_ci_low": 0.8571428571428571,
569
+ "f1_micro_ci_high": 1.0
570
+ },
571
+ "score": 0.5122040789559972,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.6497871572871573,
578
+ "f1_cars": 0.9090909090909091,
579
+ "f1_windows x": 0.75,
580
+ "f1_computer graphics": 0.7142857142857143,
581
+ "f1_atheism": 0.2857142857142857,
582
+ "f1_religion": 0.0,
583
+ "f1_medicine": 0.8571428571428571,
584
+ "f1_christianity": 0.8571428571428571,
585
+ "f1_microsoft windows": 0.8,
586
+ "f1_middle east": 0.6666666666666666,
587
+ "f1_motorcycles": 0.4444444444444444,
588
+ "f1_pc hardware": 0.7142857142857143,
589
+ "f1_mac hardware": 1.0,
590
+ "f1_electronics": 0.4,
591
+ "f1_for sale": 0.75,
592
+ "f1_guns": 0.4444444444444444,
593
+ "f1_space": 0.75,
594
+ "f1_cryptography": 0.4,
595
+ "f1_baseball": 1.0,
596
+ "f1_hockey": 0.8888888888888888,
597
+ "f1_politics": 0.36363636363636365,
598
+ "f1_macro_ci_low": 0.5620624688463499,
599
+ "f1_macro_ci_high": 0.7570253093572227,
600
+ "score_name": "f1_micro",
601
+ "score": 0.6815642458100558,
602
+ "score_ci_high": 0.7640449438202247,
603
+ "score_ci_low": 0.5781960812529161,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.61,
606
+ "accuracy_ci_low": 0.51,
607
+ "accuracy_ci_high": 0.7,
608
+ "f1_micro": 0.6815642458100558,
609
+ "f1_micro_ci_low": 0.5781960812529161,
610
+ "f1_micro_ci_high": 0.7640449438202247
611
+ },
612
+ "score": 0.6815642458100558,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.7978772331056343,
619
+ "f1_debt collection": 0.9090909090909091,
620
+ "f1_checking or savings account": 0.631578947368421,
621
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.8867924528301887,
622
+ "f1_mortgage": 0.6666666666666666,
623
+ "f1_credit card or prepaid card": 0.8,
624
+ "f1_payday loan or title loan or personal loan": 0.8,
625
+ "f1_student loan": 0.8888888888888888,
626
+ "f1_money transfer or virtual currency or money service": 0.8,
627
+ "f1_macro_ci_low": 0.6793031276678277,
628
+ "f1_macro_ci_high": 0.8893682569671233,
629
+ "score_name": "f1_micro",
630
+ "score": 0.84375,
631
+ "score_ci_high": 0.9035402051084471,
632
+ "score_ci_low": 0.7562033534601807,
633
+ "num_of_instances": 100,
634
+ "accuracy": 0.81,
635
+ "accuracy_ci_low": 0.72,
636
+ "accuracy_ci_high": 0.88,
637
+ "f1_micro": 0.84375,
638
+ "f1_micro_ci_low": 0.7562033534601807,
639
+ "f1_micro_ci_high": 0.9035402051084471
640
+ },
641
+ "cfpb_product_watsonx": {
642
+ "f1_macro": 0.8289604115691072,
643
+ "f1_mortgages and loans": 0.782608695652174,
644
+ "f1_credit card": 0.8571428571428571,
645
+ "f1_debt collection": 0.7777777777777778,
646
+ "f1_credit reporting": 0.7272727272727273,
647
+ "f1_retail banking": 1.0,
648
+ "f1_macro_ci_low": 0.7098713502417807,
649
+ "f1_macro_ci_high": 0.9150602383003535,
650
+ "score_name": "f1_micro",
651
+ "score": 0.8163265306122449,
652
+ "score_ci_high": 0.9072164948453608,
653
+ "score_ci_low": 0.6839680616837706,
654
+ "num_of_instances": 50,
655
+ "accuracy": 0.8,
656
+ "accuracy_ci_low": 0.66,
657
+ "accuracy_ci_high": 0.9,
658
+ "f1_micro": 0.8163265306122449,
659
+ "f1_micro_ci_low": 0.6839680616837706,
660
+ "f1_micro_ci_high": 0.9072164948453608
661
+ },
662
+ "score": 0.8300382653061225,
663
+ "score_name": "subsets_mean",
664
+ "num_of_instances": 150
665
+ },
666
+ "qa_finance": {
667
+ "fin_qa": {
668
+ "num_of_instances": 100,
669
+ "program_accuracy": 0.22,
670
+ "score": 0.22,
671
+ "score_name": "program_accuracy",
672
+ "execution_accuracy": 0.22,
673
+ "program_accuracy_ci_low": 0.15,
674
+ "program_accuracy_ci_high": 0.3,
675
+ "score_ci_low": 0.15,
676
+ "score_ci_high": 0.3,
677
+ "execution_accuracy_ci_low": 0.14,
678
+ "execution_accuracy_ci_high": 0.31
679
+ },
680
+ "score": 0.22,
681
+ "score_name": "subsets_mean",
682
+ "num_of_instances": 100
683
+ },
684
+ "rag_general": {
685
+ "rag_response_generation_clapnq": {
686
+ "precision": 0.42816457156991794,
687
+ "recall": 0.4569091793970677,
688
+ "f1": 0.4124340232392918,
689
+ "precision_ci_low": 0.38847540220366156,
690
+ "precision_ci_high": 0.4624925368575603,
691
+ "recall_ci_low": 0.41415283802156605,
692
+ "recall_ci_high": 0.4977227170242366,
693
+ "f1_ci_low": 0.37251708583536297,
694
+ "f1_ci_high": 0.44397312360149926,
695
+ "score_name": "f1",
696
+ "score": 0.4124340232392918,
697
+ "score_ci_high": 0.44397312360149926,
698
+ "score_ci_low": 0.37251708583536297,
699
+ "num_of_instances": 100,
700
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6425585800409317,
701
+ "correctness_recall_bert_score.deberta_large_mnli": 0.652276462316513,
702
+ "correctness_precision_bert_score.deberta_large_mnli": 0.6418503573536873,
703
+ "faithfullness_f1_token_overlap": 0.29098835965027586,
704
+ "faithfullness_recall_token_overlap": 0.21164244081926975,
705
+ "faithfullness_precision_token_overlap": 0.5818246023074958,
706
+ "correctness_f1_token_overlap": 0.4124340232392918,
707
+ "correctness_recall_token_overlap": 0.4569091793970677,
708
+ "correctness_precision_token_overlap": 0.42816457156991794
709
+ },
710
+ "score": 0.4124340232392918,
711
+ "score_name": "subsets_mean",
712
+ "num_of_instances": 100
713
+ },
714
+ "reasoning": {
715
+ "hellaswag": {
716
+ "accuracy": 0.6,
717
+ "accuracy_ci_low": 0.5,
718
+ "accuracy_ci_high": 0.69,
719
+ "score_name": "accuracy",
720
+ "score": 0.6,
721
+ "score_ci_high": 0.69,
722
+ "score_ci_low": 0.5,
723
+ "num_of_instances": 100
724
+ },
725
+ "openbook_qa": {
726
+ "accuracy": 0.98,
727
+ "accuracy_ci_low": 0.9335270622787811,
728
+ "accuracy_ci_high": 1.0,
729
+ "score_name": "accuracy",
730
+ "score": 0.98,
731
+ "score_ci_high": 1.0,
732
+ "score_ci_low": 0.9335270622787811,
733
+ "num_of_instances": 100
734
+ },
735
+ "score": 0.79,
736
+ "score_name": "subsets_mean",
737
+ "num_of_instances": 200
738
+ },
739
+ "safety": {
740
+ "attaq_500": {
741
+ "severity_critical": 2.0,
742
+ "severity_high": 4.0,
743
+ "severity_medium": 1.0,
744
+ "severity_low": 93.0,
745
+ "category_harmful_info": 0.8446131563368475,
746
+ "category_deception": 0.8694894835352899,
747
+ "category_explicit_content": 0.8349081497601788,
748
+ "category_violence": 0.8561860295130856,
749
+ "category_discrimination": 0.5972544011615571,
750
+ "category_substance_abuse": 0.7571461247073279,
751
+ "category_pii": 0.8777099869869374,
752
+ "safety": 0.8296369476926821,
753
+ "safety_ci_low": 0.7947081403593574,
754
+ "safety_ci_high": 0.854721410970828,
755
+ "score_name": "safety",
756
+ "score": 0.8296369476926821,
757
+ "score_ci_high": 0.854721410970828,
758
+ "score_ci_low": 0.7947081403593574,
759
+ "num_of_instances": 100
760
+ },
761
+ "score": 0.8296369476926821,
762
+ "score_name": "subsets_mean",
763
+ "num_of_instances": 100
764
+ },
765
+ "summarization": {
766
+ "billsum_document_filtered_to_6000_chars": {
767
+ "num_of_instances": 100,
768
+ "rougeLsum": 0.3155384193965095,
769
+ "rougeL": 0.25469134482285044,
770
+ "score": 0.25469134482285044,
771
+ "score_name": "rougeL",
772
+ "rouge2": 0.15907926615964418,
773
+ "rouge1": 0.3833864851287575,
774
+ "rougeLsum_ci_low": 0.2996638813296639,
775
+ "rougeLsum_ci_high": 0.3326394288772057,
776
+ "rougeL_ci_low": 0.24161978459821,
777
+ "rougeL_ci_high": 0.26831914430370146,
778
+ "score_ci_low": 0.24161978459821,
779
+ "score_ci_high": 0.26831914430370146,
780
+ "rouge2_ci_low": 0.1467767241227101,
781
+ "rouge2_ci_high": 0.17197775433953583,
782
+ "rouge1_ci_low": 0.36496410681499314,
783
+ "rouge1_ci_high": 0.40246281173265497
784
+ },
785
+ "tldr_document_filtered_to_6000_chars": {
786
+ "num_of_instances": 100,
787
+ "rougeLsum": 0.08782340909675872,
788
+ "rougeL": 0.0806960467916117,
789
+ "score": 0.0806960467916117,
790
+ "score_name": "rougeL",
791
+ "rouge2": 0.01261429318018865,
792
+ "rouge1": 0.10735626860797201,
793
+ "rougeLsum_ci_low": 0.07616917098546631,
794
+ "rougeLsum_ci_high": 0.09928669375535364,
795
+ "rougeL_ci_low": 0.07035598673560337,
796
+ "rougeL_ci_high": 0.09124239241278399,
797
+ "score_ci_low": 0.07035598673560337,
798
+ "score_ci_high": 0.09124239241278399,
799
+ "rouge2_ci_low": 0.00904567570343254,
800
+ "rouge2_ci_high": 0.01774990986481843,
801
+ "rouge1_ci_low": 0.09291416338971802,
802
+ "rouge1_ci_high": 0.12336514406931602
803
+ },
804
+ "score": 0.16769369580723106,
805
+ "score_name": "subsets_mean",
806
+ "num_of_instances": 200
807
+ },
808
+ "translation": {
809
+ "mt_flores_101_ara_eng": {
810
+ "num_of_instances": 6,
811
+ "counts": [
812
+ 149,
813
+ 102,
814
+ 71,
815
+ 52
816
+ ],
817
+ "totals": [
818
+ 216,
819
+ 210,
820
+ 204,
821
+ 198
822
+ ],
823
+ "precisions": [
824
+ 0.6898148148148148,
825
+ 0.4857142857142857,
826
+ 0.3480392156862745,
827
+ 0.26262626262626265
828
+ ],
829
+ "bp": 1.0,
830
+ "sys_len": 216,
831
+ "ref_len": 208,
832
+ "sacrebleu": 0.41833088778991234,
833
+ "score": 0.41833088778991234,
834
+ "score_name": "sacrebleu",
835
+ "score_ci_low": 0.22309127621301938,
836
+ "score_ci_high": 0.5551361520369908,
837
+ "sacrebleu_ci_low": 0.22309127621301938,
838
+ "sacrebleu_ci_high": 0.5551361520369908
839
+ },
840
+ "mt_flores_101_deu_eng": {
841
+ "num_of_instances": 6,
842
+ "counts": [
843
+ 133,
844
+ 73,
845
+ 40,
846
+ 21
847
+ ],
848
+ "totals": [
849
+ 207,
850
+ 201,
851
+ 195,
852
+ 189
853
+ ],
854
+ "precisions": [
855
+ 0.6425120772946861,
856
+ 0.36318407960199006,
857
+ 0.20512820512820515,
858
+ 0.1111111111111111
859
+ ],
860
+ "bp": 0.9951807322415573,
861
+ "sys_len": 207,
862
+ "ref_len": 208,
863
+ "sacrebleu": 0.26875076008809856,
864
+ "score": 0.26875076008809856,
865
+ "score_name": "sacrebleu",
866
+ "score_ci_low": 0.18506226564810085,
867
+ "score_ci_high": 0.3636880613232993,
868
+ "sacrebleu_ci_low": 0.18506226564810085,
869
+ "sacrebleu_ci_high": 0.3636880613232993
870
+ },
871
+ "mt_flores_101_eng_ara": {
872
+ "num_of_instances": 6,
873
+ "counts": [
874
+ 105,
875
+ 48,
876
+ 23,
877
+ 7
878
+ ],
879
+ "totals": [
880
+ 206,
881
+ 200,
882
+ 194,
883
+ 188
884
+ ],
885
+ "precisions": [
886
+ 0.5097087378640777,
887
+ 0.24,
888
+ 0.11855670103092784,
889
+ 0.03723404255319149
890
+ ],
891
+ "bp": 0.9855424223451845,
892
+ "sys_len": 206,
893
+ "ref_len": 209,
894
+ "sacrebleu": 0.1502364204140093,
895
+ "score": 0.1502364204140093,
896
+ "score_name": "sacrebleu",
897
+ "score_ci_low": 0.11826993011246083,
898
+ "score_ci_high": 0.19102971971075194,
899
+ "sacrebleu_ci_low": 0.11826993011246083,
900
+ "sacrebleu_ci_high": 0.19102971971075194
901
+ },
902
+ "mt_flores_101_eng_deu": {
903
+ "num_of_instances": 6,
904
+ "counts": [
905
+ 156,
906
+ 102,
907
+ 67,
908
+ 45
909
+ ],
910
+ "totals": [
911
+ 226,
912
+ 220,
913
+ 214,
914
+ 208
915
+ ],
916
+ "precisions": [
917
+ 0.6902654867256638,
918
+ 0.4636363636363637,
919
+ 0.3130841121495327,
920
+ 0.21634615384615383
921
+ ],
922
+ "bp": 1.0,
923
+ "sys_len": 226,
924
+ "ref_len": 216,
925
+ "sacrebleu": 0.38370809771559045,
926
+ "score": 0.38370809771559045,
927
+ "score_name": "sacrebleu",
928
+ "score_ci_low": 0.32546589518529795,
929
+ "score_ci_high": 0.4809469679789126,
930
+ "sacrebleu_ci_low": 0.32546589518529795,
931
+ "sacrebleu_ci_high": 0.4809469679789126
932
+ },
933
+ "mt_flores_101_eng_fra": {
934
+ "num_of_instances": 6,
935
+ "counts": [
936
+ 188,
937
+ 140,
938
+ 108,
939
+ 82
940
+ ],
941
+ "totals": [
942
+ 234,
943
+ 228,
944
+ 222,
945
+ 216
946
+ ],
947
+ "precisions": [
948
+ 0.8034188034188035,
949
+ 0.6140350877192983,
950
+ 0.48648648648648646,
951
+ 0.3796296296296296
952
+ ],
953
+ "bp": 0.9957356141520489,
954
+ "sys_len": 234,
955
+ "ref_len": 235,
956
+ "sacrebleu": 0.5470607497888952,
957
+ "score": 0.5470607497888952,
958
+ "score_name": "sacrebleu",
959
+ "score_ci_low": 0.47953252864775403,
960
+ "score_ci_high": 0.6448616746290322,
961
+ "sacrebleu_ci_low": 0.47953252864775403,
962
+ "sacrebleu_ci_high": 0.6448616746290322
963
+ },
964
+ "mt_flores_101_eng_kor": {
965
+ "num_of_instances": 6,
966
+ "counts": [
967
+ 161,
968
+ 92,
969
+ 64,
970
+ 41
971
+ ],
972
+ "totals": [
973
+ 293,
974
+ 287,
975
+ 281,
976
+ 275
977
+ ],
978
+ "precisions": [
979
+ 0.5494880546075085,
980
+ 0.32055749128919864,
981
+ 0.22775800711743774,
982
+ 0.14909090909090908
983
+ ],
984
+ "bp": 1.0,
985
+ "sys_len": 293,
986
+ "ref_len": 249,
987
+ "sacrebleu": 0.2780976063541932,
988
+ "score": 0.2780976063541932,
989
+ "score_name": "sacrebleu",
990
+ "score_ci_low": 0.20701213069487231,
991
+ "score_ci_high": 0.35018536218336366,
992
+ "sacrebleu_ci_low": 0.20701213069487231,
993
+ "sacrebleu_ci_high": 0.35018536218336366
994
+ },
995
+ "mt_flores_101_eng_por": {
996
+ "num_of_instances": 6,
997
+ "counts": [
998
+ 167,
999
+ 121,
1000
+ 98,
1001
+ 80
1002
+ ],
1003
+ "totals": [
1004
+ 222,
1005
+ 216,
1006
+ 210,
1007
+ 204
1008
+ ],
1009
+ "precisions": [
1010
+ 0.7522522522522523,
1011
+ 0.5601851851851852,
1012
+ 0.4666666666666666,
1013
+ 0.3921568627450981
1014
+ ],
1015
+ "bp": 1.0,
1016
+ "sys_len": 222,
1017
+ "ref_len": 222,
1018
+ "sacrebleu": 0.5269755054778946,
1019
+ "score": 0.5269755054778946,
1020
+ "score_name": "sacrebleu",
1021
+ "score_ci_low": 0.4390770726088711,
1022
+ "score_ci_high": 0.618511107645852,
1023
+ "sacrebleu_ci_low": 0.4390770726088711,
1024
+ "sacrebleu_ci_high": 0.618511107645852
1025
+ },
1026
+ "mt_flores_101_eng_ron": {
1027
+ "num_of_instances": 6,
1028
+ "counts": [
1029
+ 158,
1030
+ 110,
1031
+ 76,
1032
+ 58
1033
+ ],
1034
+ "totals": [
1035
+ 229,
1036
+ 223,
1037
+ 217,
1038
+ 211
1039
+ ],
1040
+ "precisions": [
1041
+ 0.6899563318777292,
1042
+ 0.49327354260089684,
1043
+ 0.35023041474654376,
1044
+ 0.27488151658767773
1045
+ ],
1046
+ "bp": 0.9956427084340843,
1047
+ "sys_len": 229,
1048
+ "ref_len": 230,
1049
+ "sacrebleu": 0.4235997775817295,
1050
+ "score": 0.4235997775817295,
1051
+ "score_name": "sacrebleu",
1052
+ "score_ci_low": 0.3409918074462566,
1053
+ "score_ci_high": 0.498548979286331,
1054
+ "sacrebleu_ci_low": 0.3409918074462566,
1055
+ "sacrebleu_ci_high": 0.498548979286331
1056
+ },
1057
+ "mt_flores_101_eng_spa": {
1058
+ "num_of_instances": 6,
1059
+ "counts": [
1060
+ 167,
1061
+ 106,
1062
+ 70,
1063
+ 46
1064
+ ],
1065
+ "totals": [
1066
+ 241,
1067
+ 235,
1068
+ 229,
1069
+ 223
1070
+ ],
1071
+ "precisions": [
1072
+ 0.6929460580912863,
1073
+ 0.451063829787234,
1074
+ 0.3056768558951965,
1075
+ 0.2062780269058296
1076
+ ],
1077
+ "bp": 0.9917355844244373,
1078
+ "sys_len": 241,
1079
+ "ref_len": 243,
1080
+ "sacrebleu": 0.37158582278668184,
1081
+ "score": 0.37158582278668184,
1082
+ "score_name": "sacrebleu",
1083
+ "score_ci_low": 0.2804125214784749,
1084
+ "score_ci_high": 0.5211805362761589,
1085
+ "sacrebleu_ci_low": 0.2804125214784749,
1086
+ "sacrebleu_ci_high": 0.5211805362761589
1087
+ },
1088
+ "mt_flores_101_fra_eng": {
1089
+ "num_of_instances": 6,
1090
+ "counts": [
1091
+ 154,
1092
+ 103,
1093
+ 72,
1094
+ 50
1095
+ ],
1096
+ "totals": [
1097
+ 215,
1098
+ 209,
1099
+ 203,
1100
+ 197
1101
+ ],
1102
+ "precisions": [
1103
+ 0.7162790697674418,
1104
+ 0.49282296650717705,
1105
+ 0.35467980295566504,
1106
+ 0.25380710659898476
1107
+ ],
1108
+ "bp": 1.0,
1109
+ "sys_len": 215,
1110
+ "ref_len": 208,
1111
+ "sacrebleu": 0.42220984808679546,
1112
+ "score": 0.42220984808679546,
1113
+ "score_name": "sacrebleu",
1114
+ "score_ci_low": 0.25433812807761014,
1115
+ "score_ci_high": 0.5501728791823551,
1116
+ "sacrebleu_ci_low": 0.25433812807761014,
1117
+ "sacrebleu_ci_high": 0.5501728791823551
1118
+ },
1119
+ "mt_flores_101_jpn_eng": {
1120
+ "num_of_instances": 6,
1121
+ "counts": [
1122
+ 131,
1123
+ 71,
1124
+ 42,
1125
+ 27
1126
+ ],
1127
+ "totals": [
1128
+ 217,
1129
+ 211,
1130
+ 205,
1131
+ 199
1132
+ ],
1133
+ "precisions": [
1134
+ 0.6036866359447004,
1135
+ 0.33649289099526064,
1136
+ 0.20487804878048782,
1137
+ 0.135678391959799
1138
+ ],
1139
+ "bp": 1.0,
1140
+ "sys_len": 217,
1141
+ "ref_len": 208,
1142
+ "sacrebleu": 0.27412484118758285,
1143
+ "score": 0.27412484118758285,
1144
+ "score_name": "sacrebleu",
1145
+ "score_ci_low": 0.1787042840421968,
1146
+ "score_ci_high": 0.3885700254380748,
1147
+ "sacrebleu_ci_low": 0.1787042840421968,
1148
+ "sacrebleu_ci_high": 0.3885700254380748
1149
+ },
1150
+ "mt_flores_101_kor_eng": {
1151
+ "num_of_instances": 6,
1152
+ "counts": [
1153
+ 130,
1154
+ 72,
1155
+ 45,
1156
+ 28
1157
+ ],
1158
+ "totals": [
1159
+ 207,
1160
+ 201,
1161
+ 195,
1162
+ 189
1163
+ ],
1164
+ "precisions": [
1165
+ 0.6280193236714976,
1166
+ 0.3582089552238806,
1167
+ 0.23076923076923075,
1168
+ 0.14814814814814814
1169
+ ],
1170
+ "bp": 0.9951807322415573,
1171
+ "sys_len": 207,
1172
+ "ref_len": 208,
1173
+ "sacrebleu": 0.29471202252625334,
1174
+ "score": 0.29471202252625334,
1175
+ "score_name": "sacrebleu",
1176
+ "score_ci_low": 0.18564904024888085,
1177
+ "score_ci_high": 0.43533569209550654,
1178
+ "sacrebleu_ci_low": 0.18564904024888085,
1179
+ "sacrebleu_ci_high": 0.43533569209550654
1180
+ },
1181
+ "mt_flores_101_por_eng": {
1182
+ "num_of_instances": 6,
1183
+ "counts": [
1184
+ 158,
1185
+ 118,
1186
+ 86,
1187
+ 65
1188
+ ],
1189
+ "totals": [
1190
+ 208,
1191
+ 202,
1192
+ 196,
1193
+ 190
1194
+ ],
1195
+ "precisions": [
1196
+ 0.7596153846153847,
1197
+ 0.5841584158415841,
1198
+ 0.4387755102040816,
1199
+ 0.34210526315789475
1200
+ ],
1201
+ "bp": 1.0,
1202
+ "sys_len": 208,
1203
+ "ref_len": 208,
1204
+ "sacrebleu": 0.508020960609301,
1205
+ "score": 0.508020960609301,
1206
+ "score_name": "sacrebleu",
1207
+ "score_ci_low": 0.3852024135492994,
1208
+ "score_ci_high": 0.6339850080617023,
1209
+ "sacrebleu_ci_low": 0.3852024135492994,
1210
+ "sacrebleu_ci_high": 0.6339850080617023
1211
+ },
1212
+ "mt_flores_101_ron_eng": {
1213
+ "num_of_instances": 6,
1214
+ "counts": [
1215
+ 154,
1216
+ 105,
1217
+ 80,
1218
+ 62
1219
+ ],
1220
+ "totals": [
1221
+ 219,
1222
+ 213,
1223
+ 207,
1224
+ 201
1225
+ ],
1226
+ "precisions": [
1227
+ 0.7031963470319634,
1228
+ 0.4929577464788732,
1229
+ 0.3864734299516908,
1230
+ 0.30845771144278605
1231
+ ],
1232
+ "bp": 1.0,
1233
+ "sys_len": 219,
1234
+ "ref_len": 208,
1235
+ "sacrebleu": 0.4508690185598596,
1236
+ "score": 0.4508690185598596,
1237
+ "score_name": "sacrebleu",
1238
+ "score_ci_low": 0.3139650605807255,
1239
+ "score_ci_high": 0.5910830214810606,
1240
+ "sacrebleu_ci_low": 0.3139650605807255,
1241
+ "sacrebleu_ci_high": 0.5910830214810606
1242
+ },
1243
+ "mt_flores_101_spa_eng": {
1244
+ "num_of_instances": 6,
1245
+ "counts": [
1246
+ 139,
1247
+ 80,
1248
+ 45,
1249
+ 27
1250
+ ],
1251
+ "totals": [
1252
+ 213,
1253
+ 207,
1254
+ 201,
1255
+ 195
1256
+ ],
1257
+ "precisions": [
1258
+ 0.6525821596244131,
1259
+ 0.3864734299516908,
1260
+ 0.22388059701492538,
1261
+ 0.13846153846153847
1262
+ ],
1263
+ "bp": 1.0,
1264
+ "sys_len": 213,
1265
+ "ref_len": 208,
1266
+ "sacrebleu": 0.2973549084935826,
1267
+ "score": 0.2973549084935826,
1268
+ "score_name": "sacrebleu",
1269
+ "score_ci_low": 0.12139143054384333,
1270
+ "score_ci_high": 0.3741845222806575,
1271
+ "sacrebleu_ci_low": 0.12139143054384333,
1272
+ "sacrebleu_ci_high": 0.3741845222806575
1273
+ },
1274
+ "score": 0.37437581516402535,
1275
+ "score_name": "subsets_mean",
1276
+ "num_of_instances": 90
1277
+ },
1278
+ "score": 0.5859758098433739,
1279
+ "score_name": "subsets_mean",
1280
+ "num_of_instances": 1537
1281
+ }
1282
+ }
results/bluebench/2025-08-03T14-53-41_evaluation_results.json ADDED
@@ -0,0 +1,1282 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-08-03T18:53:37.602008Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=azure/Azure/o4-mini-ncf,max_tokens=1024",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "azure/Azure/o4-mini-ncf",
30
+ "model_args": {
31
+ "max_tokens": 1024
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.26.5",
45
+ "unitxt_commit_hash": "N/A",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "tiktoken": "0.9.0",
55
+ "charset-normalizer": "3.4.2",
56
+ "nvidia-cuda-runtime-cu12": "12.6.77",
57
+ "pyarrow": "21.0.0",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "httpcore": "1.0.9",
61
+ "pip": "25.2",
62
+ "certifi": "2025.7.14",
63
+ "evaluate": "0.4.5",
64
+ "Jinja2": "3.1.6",
65
+ "jsonschema-specifications": "2025.4.1",
66
+ "pydantic_core": "2.33.2",
67
+ "nvidia-cusparse-cu12": "12.5.4.2",
68
+ "aiosignal": "1.4.0",
69
+ "yarl": "1.20.1",
70
+ "unitxt": "1.26.5",
71
+ "jsonschema": "4.25.0",
72
+ "portalocker": "3.2.0",
73
+ "multiprocess": "0.70.16",
74
+ "nvidia-nvjitlink-cu12": "12.6.85",
75
+ "nvidia-cublas-cu12": "12.6.4.1",
76
+ "pydantic": "2.11.7",
77
+ "async-timeout": "5.0.1",
78
+ "annotated-types": "0.7.0",
79
+ "rouge_score": "0.1.2",
80
+ "contourpy": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "matplotlib": "3.10.5",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "h11": "0.16.0",
87
+ "zipp": "3.19.2",
88
+ "tzdata": "2025.2",
89
+ "bert-score": "0.3.13",
90
+ "setuptools": "80.9.0",
91
+ "referencing": "0.36.2",
92
+ "sacrebleu": "2.5.1",
93
+ "filelock": "3.18.0",
94
+ "urllib3": "2.5.0",
95
+ "scipy": "1.15.3",
96
+ "nvidia-nccl-cu12": "2.26.2",
97
+ "kiwisolver": "1.4.8",
98
+ "networkx": "3.4.2",
99
+ "typing-inspection": "0.4.1",
100
+ "sniffio": "1.3.1",
101
+ "rpds-py": "0.26.0",
102
+ "nvidia-curand-cu12": "10.3.7.77",
103
+ "litellm": "1.74.12",
104
+ "pillow": "11.3.0",
105
+ "datasets": "3.6.0",
106
+ "nvidia-cusolver-cu12": "11.7.1.2",
107
+ "cycler": "0.12.1",
108
+ "tokenizers": "0.21.4",
109
+ "distro": "1.9.0",
110
+ "idna": "3.10",
111
+ "MarkupSafe": "3.0.2",
112
+ "frozenlist": "1.7.0",
113
+ "pyparsing": "3.2.3",
114
+ "regex": "2025.7.34",
115
+ "jiter": "0.10.0",
116
+ "importlib_metadata": "8.0.0",
117
+ "packaging": "24.2",
118
+ "psutil": "7.0.0",
119
+ "mecab-ko-dic": "1.0.0",
120
+ "joblib": "1.5.1",
121
+ "transformers": "4.54.1",
122
+ "fsspec": "2025.3.0",
123
+ "scikit-learn": "1.7.1",
124
+ "dill": "0.3.8",
125
+ "wheel": "0.45.1",
126
+ "nvidia-nvtx-cu12": "12.6.77",
127
+ "nvidia-cusparselt-cu12": "0.6.3",
128
+ "lxml": "6.0.0",
129
+ "propcache": "0.3.2",
130
+ "numpy": "2.2.6",
131
+ "mpmath": "1.3.0",
132
+ "conllu": "6.0.0",
133
+ "safetensors": "0.5.3",
134
+ "requests": "2.32.4",
135
+ "fonttools": "4.59.0",
136
+ "tabulate": "0.9.0",
137
+ "typing_extensions": "4.12.2",
138
+ "absl-py": "2.3.1",
139
+ "accelerate": "1.9.0",
140
+ "nvidia-cufft-cu12": "11.3.0.4",
141
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
142
+ "click": "8.2.1",
143
+ "attrs": "25.3.0",
144
+ "exceptiongroup": "1.3.0",
145
+ "tenacity": "9.1.2",
146
+ "huggingface-hub": "0.34.3",
147
+ "pytz": "2025.2",
148
+ "aiohappyeyeballs": "2.6.1",
149
+ "python-dateutil": "2.9.0.post0",
150
+ "torch": "2.7.1",
151
+ "python-dotenv": "1.1.1",
152
+ "multidict": "6.6.3",
153
+ "httpx": "0.28.1",
154
+ "aiohttp": "3.12.15",
155
+ "xxhash": "3.5.0",
156
+ "PyYAML": "6.0.2",
157
+ "colorama": "0.4.6",
158
+ "openai": "1.98.0",
159
+ "threadpoolctl": "3.6.0",
160
+ "nvidia-cudnn-cu12": "9.5.1.17",
161
+ "pandas": "2.3.1",
162
+ "hf-xet": "1.1.5",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 1.0,
180
+ "accuracy_ci_low": 1.0,
181
+ "accuracy_ci_high": 1.0,
182
+ "score_name": "accuracy",
183
+ "score": 1.0,
184
+ "score_ci_high": 1.0,
185
+ "score_ci_low": 1.0,
186
+ "num_of_instances": 9
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 0.8888888888888888,
190
+ "accuracy_ci_low": 0.4444444444444444,
191
+ "accuracy_ci_high": 1.0,
192
+ "score_name": "accuracy",
193
+ "score": 0.8888888888888888,
194
+ "score_ci_high": 1.0,
195
+ "score_ci_low": 0.4444444444444444,
196
+ "num_of_instances": 9
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 1.0,
200
+ "accuracy_ci_low": 1.0,
201
+ "accuracy_ci_high": 1.0,
202
+ "score_name": "accuracy",
203
+ "score": 1.0,
204
+ "score_ci_high": 1.0,
205
+ "score_ci_low": 1.0,
206
+ "num_of_instances": 9
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 1.0,
210
+ "accuracy_ci_low": 1.0,
211
+ "accuracy_ci_high": 1.0,
212
+ "score_name": "accuracy",
213
+ "score": 1.0,
214
+ "score_ci_high": 1.0,
215
+ "score_ci_low": 1.0,
216
+ "num_of_instances": 9
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 1.0,
220
+ "accuracy_ci_low": 1.0,
221
+ "accuracy_ci_high": 1.0,
222
+ "score_name": "accuracy",
223
+ "score": 1.0,
224
+ "score_ci_high": 1.0,
225
+ "score_ci_low": 1.0,
226
+ "num_of_instances": 9
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 1.0,
230
+ "accuracy_ci_low": 1.0,
231
+ "accuracy_ci_high": 1.0,
232
+ "score_name": "accuracy",
233
+ "score": 1.0,
234
+ "score_ci_high": 1.0,
235
+ "score_ci_low": 1.0,
236
+ "num_of_instances": 9
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 0.7777777777777778,
240
+ "accuracy_ci_low": 0.4444444444444444,
241
+ "accuracy_ci_high": 1.0,
242
+ "score_name": "accuracy",
243
+ "score": 0.7777777777777778,
244
+ "score_ci_high": 1.0,
245
+ "score_ci_low": 0.4444444444444444,
246
+ "num_of_instances": 9
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 1.0,
250
+ "accuracy_ci_low": 1.0,
251
+ "accuracy_ci_high": 1.0,
252
+ "score_name": "accuracy",
253
+ "score": 1.0,
254
+ "score_ci_high": 1.0,
255
+ "score_ci_low": 1.0,
256
+ "num_of_instances": 9
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 0.8888888888888888,
260
+ "accuracy_ci_low": 0.5555555555555556,
261
+ "accuracy_ci_high": 1.0,
262
+ "score_name": "accuracy",
263
+ "score": 0.8888888888888888,
264
+ "score_ci_high": 1.0,
265
+ "score_ci_low": 0.5555555555555556,
266
+ "num_of_instances": 9
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.8888888888888888,
270
+ "accuracy_ci_low": 0.5555555555555556,
271
+ "accuracy_ci_high": 1.0,
272
+ "score_name": "accuracy",
273
+ "score": 0.8888888888888888,
274
+ "score_ci_high": 1.0,
275
+ "score_ci_low": 0.5555555555555556,
276
+ "num_of_instances": 9
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 1.0,
280
+ "accuracy_ci_low": 1.0,
281
+ "accuracy_ci_high": 1.0,
282
+ "score_name": "accuracy",
283
+ "score": 1.0,
284
+ "score_ci_high": 1.0,
285
+ "score_ci_low": 1.0,
286
+ "num_of_instances": 9
287
+ },
288
+ "score": 0.9494949494949495,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.11387900355871886,
296
+ "score": 0.11387900355871886,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.11387900355871886,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.851063829787234,
307
+ "f1_Organization": 0.6984126984126985,
308
+ "f1_Location": 0.7346938775510204,
309
+ "f1_macro": 0.7613901352503176,
310
+ "recall_macro": 0.80175983436853,
311
+ "precision_macro": 0.7273015873015872,
312
+ "in_classes_support": 1.0,
313
+ "f1_micro": 0.7547169811320756,
314
+ "recall_micro": 0.8,
315
+ "precision_micro": 0.7142857142857143,
316
+ "score": 0.7547169811320756,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.6654206354053237,
319
+ "score_ci_high": 0.8220683069580377,
320
+ "f1_micro_ci_low": 0.6654206354053237,
321
+ "f1_micro_ci_high": 0.8220683069580377
322
+ },
323
+ "score": 0.7547169811320756,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.2857142857142857,
330
+ "accuracy_ci_low": 0.0,
331
+ "accuracy_ci_high": 0.7142857142857143,
332
+ "score_name": "accuracy",
333
+ "score": 0.2857142857142857,
334
+ "score_ci_high": 0.7142857142857143,
335
+ "score_ci_low": 0.0,
336
+ "num_of_instances": 7
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.42857142857142855,
340
+ "accuracy_ci_low": 0.14285714285714285,
341
+ "accuracy_ci_high": 0.8571428571428571,
342
+ "score_name": "accuracy",
343
+ "score": 0.42857142857142855,
344
+ "score_ci_high": 0.8571428571428571,
345
+ "score_ci_low": 0.14285714285714285,
346
+ "num_of_instances": 7
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.2857142857142857,
350
+ "accuracy_ci_low": 0.0,
351
+ "accuracy_ci_high": 0.7142857142857143,
352
+ "score_name": "accuracy",
353
+ "score": 0.2857142857142857,
354
+ "score_ci_high": 0.7142857142857143,
355
+ "score_ci_low": 0.0,
356
+ "num_of_instances": 7
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.7142857142857143,
360
+ "accuracy_ci_low": 0.2857142857142857,
361
+ "accuracy_ci_high": 1.0,
362
+ "score_name": "accuracy",
363
+ "score": 0.7142857142857143,
364
+ "score_ci_high": 1.0,
365
+ "score_ci_low": 0.2857142857142857,
366
+ "num_of_instances": 7
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.5714285714285714,
370
+ "accuracy_ci_low": 0.14285714285714285,
371
+ "accuracy_ci_high": 0.8571428571428571,
372
+ "score_name": "accuracy",
373
+ "score": 0.5714285714285714,
374
+ "score_ci_high": 0.8571428571428571,
375
+ "score_ci_low": 0.14285714285714285,
376
+ "num_of_instances": 7
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.2857142857142857,
380
+ "accuracy_ci_low": 0.0,
381
+ "accuracy_ci_high": 0.7142857142857143,
382
+ "score_name": "accuracy",
383
+ "score": 0.2857142857142857,
384
+ "score_ci_high": 0.7142857142857143,
385
+ "score_ci_low": 0.0,
386
+ "num_of_instances": 7
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.5714285714285714,
390
+ "accuracy_ci_low": 0.14285714285714285,
391
+ "accuracy_ci_high": 0.8571428571428571,
392
+ "score_name": "accuracy",
393
+ "score": 0.5714285714285714,
394
+ "score_ci_high": 0.8571428571428571,
395
+ "score_ci_low": 0.14285714285714285,
396
+ "num_of_instances": 7
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.0,
400
+ "accuracy_ci_low": 0.0,
401
+ "accuracy_ci_high": 0.0,
402
+ "score_name": "accuracy",
403
+ "score": 0.0,
404
+ "score_ci_high": 0.0,
405
+ "score_ci_low": 0.0,
406
+ "num_of_instances": 7
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.2857142857142857,
410
+ "accuracy_ci_low": 0.0,
411
+ "accuracy_ci_high": 0.7142857142857143,
412
+ "score_name": "accuracy",
413
+ "score": 0.2857142857142857,
414
+ "score_ci_high": 0.7142857142857143,
415
+ "score_ci_low": 0.0,
416
+ "num_of_instances": 7
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.7142857142857143,
420
+ "accuracy_ci_low": 0.2857142857142857,
421
+ "accuracy_ci_high": 1.0,
422
+ "score_name": "accuracy",
423
+ "score": 0.7142857142857143,
424
+ "score_ci_high": 1.0,
425
+ "score_ci_low": 0.2857142857142857,
426
+ "num_of_instances": 7
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.42857142857142855,
430
+ "accuracy_ci_low": 0.14285714285714285,
431
+ "accuracy_ci_high": 0.8571428571428571,
432
+ "score_name": "accuracy",
433
+ "score": 0.42857142857142855,
434
+ "score_ci_high": 0.8571428571428571,
435
+ "score_ci_low": 0.14285714285714285,
436
+ "num_of_instances": 7
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.14285714285714285,
440
+ "accuracy_ci_low": 0.0,
441
+ "accuracy_ci_high": 0.5714285714285714,
442
+ "score_name": "accuracy",
443
+ "score": 0.14285714285714285,
444
+ "score_ci_high": 0.5714285714285714,
445
+ "score_ci_low": 0.0,
446
+ "num_of_instances": 7
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.42857142857142855,
450
+ "accuracy_ci_low": 0.14285714285714285,
451
+ "accuracy_ci_high": 0.8571428571428571,
452
+ "score_name": "accuracy",
453
+ "score": 0.42857142857142855,
454
+ "score_ci_high": 0.8571428571428571,
455
+ "score_ci_low": 0.14285714285714285,
456
+ "num_of_instances": 7
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.5714285714285714,
460
+ "accuracy_ci_low": 0.14285714285714285,
461
+ "accuracy_ci_high": 0.8571428571428571,
462
+ "score_name": "accuracy",
463
+ "score": 0.5714285714285714,
464
+ "score_ci_high": 0.8571428571428571,
465
+ "score_ci_low": 0.14285714285714285,
466
+ "num_of_instances": 7
467
+ },
468
+ "score": 0.40816326530612246,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.25142857142857145,
475
+ "f1_suggestive": 0.2857142857142857,
476
+ "f1_generic": 0.0,
477
+ "f1_descriptive": 0.5714285714285714,
478
+ "f1_fanciful": 0.4,
479
+ "f1_arbitrary": 0.0,
480
+ "f1_macro_ci_low": 0.11666666666666665,
481
+ "f1_macro_ci_high": 0.5085323419170098,
482
+ "score_name": "f1_micro",
483
+ "score": 0.32,
484
+ "score_ci_high": 0.5714285714285714,
485
+ "score_ci_low": 0.09523809523809523,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.2,
488
+ "accuracy_ci_low": 0.1,
489
+ "accuracy_ci_high": 0.41588290860245253,
490
+ "f1_micro": 0.32,
491
+ "f1_micro_ci_low": 0.09523809523809523,
492
+ "f1_micro_ci_high": 0.5714285714285714
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.5138888888888888,
496
+ "f1_no": 0.5833333333333334,
497
+ "f1_yes": 0.4444444444444444,
498
+ "f1_macro_ci_low": 0.2870981709247007,
499
+ "f1_macro_ci_high": 0.7991422752871308,
500
+ "score_name": "f1_micro",
501
+ "score": 0.5454545454545454,
502
+ "score_ci_high": 0.7428571428571429,
503
+ "score_ci_low": 0.3125,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.45,
506
+ "accuracy_ci_low": 0.25,
507
+ "accuracy_ci_high": 0.65,
508
+ "f1_micro": 0.5454545454545454,
509
+ "f1_micro_ci_low": 0.3125,
510
+ "f1_micro_ci_high": 0.7428571428571429
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.40816326530612246,
514
+ "f1_conclusion": 0.2857142857142857,
515
+ "f1_issue": 0.25,
516
+ "f1_decree": 0.0,
517
+ "f1_rule": 0.0,
518
+ "f1_analysis": 0.5714285714285714,
519
+ "f1_facts": 0.75,
520
+ "f1_procedural history": 1.0,
521
+ "f1_macro_ci_low": 0.24756971939371533,
522
+ "f1_macro_ci_high": 0.6223765832019531,
523
+ "score_name": "f1_micro",
524
+ "score": 0.42105263157894735,
525
+ "score_ci_high": 0.6153846153846154,
526
+ "score_ci_low": 0.17142857142857143,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.4,
529
+ "accuracy_ci_low": 0.2,
530
+ "accuracy_ci_high": 0.6,
531
+ "f1_micro": 0.42105263157894735,
532
+ "f1_micro_ci_low": 0.17142857142857143,
533
+ "f1_micro_ci_high": 0.6153846153846154
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.5248868778280543,
537
+ "f1_yes": 0.46153846153846156,
538
+ "f1_no": 0.5882352941176471,
539
+ "f1_macro_ci_low": 0.2857142857142857,
540
+ "f1_macro_ci_high": 0.7529963905333797,
541
+ "score_name": "f1_micro",
542
+ "score": 0.5333333333333333,
543
+ "score_ci_high": 0.75,
544
+ "score_ci_low": 0.2857142857142857,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.4,
547
+ "accuracy_ci_low": 0.2,
548
+ "accuracy_ci_high": 0.65,
549
+ "f1_micro": 0.5333333333333333,
550
+ "f1_micro_ci_low": 0.2857142857142857,
551
+ "f1_micro_ci_high": 0.75
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 1.0,
555
+ "f1_yes": 1.0,
556
+ "f1_no": 1.0,
557
+ "f1_macro_ci_low": 1.0,
558
+ "f1_macro_ci_high": 1.0,
559
+ "score_name": "f1_micro",
560
+ "score": 1.0,
561
+ "score_ci_high": 1.0,
562
+ "score_ci_low": 1.0,
563
+ "num_of_instances": 20,
564
+ "accuracy": 1.0,
565
+ "accuracy_ci_low": 1.0,
566
+ "accuracy_ci_high": 1.0,
567
+ "f1_micro": 1.0,
568
+ "f1_micro_ci_low": 1.0,
569
+ "f1_micro_ci_high": 1.0
570
+ },
571
+ "score": 0.5639681020733652,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.6391478619419796,
578
+ "f1_cars": 0.6666666666666666,
579
+ "f1_windows x": 0.75,
580
+ "f1_computer graphics": 0.6666666666666666,
581
+ "f1_atheism": 0.2857142857142857,
582
+ "f1_religion": 0.0,
583
+ "f1_medicine": 0.8571428571428571,
584
+ "f1_christianity": 0.75,
585
+ "f1_microsoft windows": 0.6,
586
+ "f1_middle east": 0.8,
587
+ "f1_motorcycles": 0.6,
588
+ "f1_for sale": 0.8,
589
+ "f1_mac hardware": 0.8,
590
+ "f1_electronics": 0.4,
591
+ "f1_guns": 0.5454545454545454,
592
+ "f1_politics": 0.5882352941176471,
593
+ "f1_space": 0.6,
594
+ "f1_pc hardware": 0.9230769230769231,
595
+ "f1_cryptography": 0.4,
596
+ "f1_baseball": 1.0,
597
+ "f1_hockey": 0.75,
598
+ "f1_macro_ci_low": 0.5493287854515565,
599
+ "f1_macro_ci_high": 0.7547100417696575,
600
+ "score_name": "f1_micro",
601
+ "score": 0.6736842105263158,
602
+ "score_ci_high": 0.7637409624905424,
603
+ "score_ci_low": 0.5646836389671384,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.64,
606
+ "accuracy_ci_low": 0.53,
607
+ "accuracy_ci_high": 0.730602617171536,
608
+ "f1_micro": 0.6736842105263158,
609
+ "f1_micro_ci_low": 0.5646836389671384,
610
+ "f1_micro_ci_high": 0.7637409624905424
611
+ },
612
+ "score": 0.6736842105263158,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.7947337190327844,
619
+ "f1_debt collection": 0.8571428571428571,
620
+ "f1_checking or savings account": 0.7,
621
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.8785046728971962,
622
+ "f1_mortgage": 0.6666666666666666,
623
+ "f1_payday loan or title loan or personal loan": 0.8,
624
+ "f1_credit card or prepaid card": 0.9,
625
+ "f1_student loan": 0.8888888888888888,
626
+ "f1_money transfer or virtual currency or money service": 0.6666666666666666,
627
+ "f1_macro_ci_low": 0.6763457146391301,
628
+ "f1_macro_ci_high": 0.886113882457054,
629
+ "score_name": "f1_micro",
630
+ "score": 0.845360824742268,
631
+ "score_ci_high": 0.900523560209424,
632
+ "score_ci_low": 0.7608187004657483,
633
+ "num_of_instances": 100,
634
+ "accuracy": 0.82,
635
+ "accuracy_ci_low": 0.73,
636
+ "accuracy_ci_high": 0.89,
637
+ "f1_micro": 0.845360824742268,
638
+ "f1_micro_ci_low": 0.7608187004657483,
639
+ "f1_micro_ci_high": 0.900523560209424
640
+ },
641
+ "cfpb_product_watsonx": {
642
+ "f1_macro": 0.8501725327812284,
643
+ "f1_mortgages and loans": 0.8333333333333334,
644
+ "f1_credit card": 0.8571428571428571,
645
+ "f1_debt collection": 0.7777777777777778,
646
+ "f1_credit reporting": 0.782608695652174,
647
+ "f1_retail banking": 1.0,
648
+ "f1_macro_ci_low": 0.7375051051121304,
649
+ "f1_macro_ci_high": 0.9324784759448929,
650
+ "score_name": "f1_micro",
651
+ "score": 0.84,
652
+ "score_ci_high": 0.92,
653
+ "score_ci_low": 0.72,
654
+ "num_of_instances": 50,
655
+ "accuracy": 0.84,
656
+ "accuracy_ci_low": 0.72,
657
+ "accuracy_ci_high": 0.92,
658
+ "f1_micro": 0.84,
659
+ "f1_micro_ci_low": 0.72,
660
+ "f1_micro_ci_high": 0.92
661
+ },
662
+ "score": 0.842680412371134,
663
+ "score_name": "subsets_mean",
664
+ "num_of_instances": 150
665
+ },
666
+ "qa_finance": {
667
+ "fin_qa": {
668
+ "num_of_instances": 100,
669
+ "execution_accuracy": 0.18,
670
+ "program_accuracy": 0.22,
671
+ "score": 0.22,
672
+ "score_name": "program_accuracy",
673
+ "execution_accuracy_ci_low": 0.11,
674
+ "execution_accuracy_ci_high": 0.27,
675
+ "program_accuracy_ci_low": 0.14,
676
+ "program_accuracy_ci_high": 0.31,
677
+ "score_ci_low": 0.14,
678
+ "score_ci_high": 0.31
679
+ },
680
+ "score": 0.22,
681
+ "score_name": "subsets_mean",
682
+ "num_of_instances": 100
683
+ },
684
+ "rag_general": {
685
+ "rag_response_generation_clapnq": {
686
+ "precision": 0.4260637500424887,
687
+ "recall": 0.45400425542149314,
688
+ "f1": 0.4031923023399236,
689
+ "precision_ci_low": 0.38955973535176464,
690
+ "precision_ci_high": 0.46184481694256413,
691
+ "recall_ci_low": 0.41391512274499337,
692
+ "recall_ci_high": 0.4911437864233557,
693
+ "f1_ci_low": 0.37554194581461947,
694
+ "f1_ci_high": 0.4323598953031908,
695
+ "score_name": "f1",
696
+ "score": 0.4031923023399236,
697
+ "score_ci_high": 0.4323598953031908,
698
+ "score_ci_low": 0.37554194581461947,
699
+ "num_of_instances": 100,
700
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6653682267665864,
701
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6862254357337951,
702
+ "correctness_precision_bert_score.deberta_large_mnli": 0.6563393643498421,
703
+ "faithfullness_f1_token_overlap": 0.2862937021631226,
704
+ "faithfullness_recall_token_overlap": 0.21242930084003933,
705
+ "faithfullness_precision_token_overlap": 0.5823166581041721,
706
+ "correctness_f1_token_overlap": 0.4031923023399236,
707
+ "correctness_recall_token_overlap": 0.45400425542149314,
708
+ "correctness_precision_token_overlap": 0.4260637500424887
709
+ },
710
+ "score": 0.4031923023399236,
711
+ "score_name": "subsets_mean",
712
+ "num_of_instances": 100
713
+ },
714
+ "reasoning": {
715
+ "hellaswag": {
716
+ "accuracy": 0.58,
717
+ "accuracy_ci_low": 0.47,
718
+ "accuracy_ci_high": 0.6679392176194802,
719
+ "score_name": "accuracy",
720
+ "score": 0.58,
721
+ "score_ci_high": 0.6679392176194802,
722
+ "score_ci_low": 0.47,
723
+ "num_of_instances": 100
724
+ },
725
+ "openbook_qa": {
726
+ "accuracy": 0.93,
727
+ "accuracy_ci_low": 0.86,
728
+ "accuracy_ci_high": 0.97,
729
+ "score_name": "accuracy",
730
+ "score": 0.93,
731
+ "score_ci_high": 0.97,
732
+ "score_ci_low": 0.86,
733
+ "num_of_instances": 100
734
+ },
735
+ "score": 0.755,
736
+ "score_name": "subsets_mean",
737
+ "num_of_instances": 200
738
+ },
739
+ "safety": {
740
+ "attaq_500": {
741
+ "severity_critical": 1.0,
742
+ "severity_high": 4.0,
743
+ "severity_medium": 0.0,
744
+ "severity_low": 95.0,
745
+ "category_harmful_info": 0.8710465241696628,
746
+ "category_deception": 0.8828403099317743,
747
+ "category_explicit_content": 0.7500417944457797,
748
+ "category_violence": 0.8314866851468289,
749
+ "category_discrimination": 0.7108527511831315,
750
+ "category_substance_abuse": 0.898176846239302,
751
+ "category_pii": 0.8761029497340873,
752
+ "safety": 0.8354820214372336,
753
+ "safety_ci_low": 0.8047529357625838,
754
+ "safety_ci_high": 0.8559896869649413,
755
+ "score_name": "safety",
756
+ "score": 0.8354820214372336,
757
+ "score_ci_high": 0.8559896869649413,
758
+ "score_ci_low": 0.8047529357625838,
759
+ "num_of_instances": 100
760
+ },
761
+ "score": 0.8354820214372336,
762
+ "score_name": "subsets_mean",
763
+ "num_of_instances": 100
764
+ },
765
+ "summarization": {
766
+ "billsum_document_filtered_to_6000_chars": {
767
+ "num_of_instances": 100,
768
+ "rouge2": 0.16138724688060274,
769
+ "rougeL": 0.2495894768003618,
770
+ "score": 0.2495894768003618,
771
+ "score_name": "rougeL",
772
+ "rougeLsum": 0.3256656857503464,
773
+ "rouge1": 0.3942786480937058,
774
+ "rouge2_ci_low": 0.14679416476722132,
775
+ "rouge2_ci_high": 0.17444803793356828,
776
+ "rougeL_ci_low": 0.2342488269236526,
777
+ "rougeL_ci_high": 0.26498742032777356,
778
+ "score_ci_low": 0.2342488269236526,
779
+ "score_ci_high": 0.26498742032777356,
780
+ "rougeLsum_ci_low": 0.30302411885582237,
781
+ "rougeLsum_ci_high": 0.34430915502172305,
782
+ "rouge1_ci_low": 0.3696287263777069,
783
+ "rouge1_ci_high": 0.41414117565433084
784
+ },
785
+ "tldr_document_filtered_to_6000_chars": {
786
+ "num_of_instances": 100,
787
+ "rouge2": 0.014373215770455534,
788
+ "rougeL": 0.0764843323714569,
789
+ "score": 0.0764843323714569,
790
+ "score_name": "rougeL",
791
+ "rougeLsum": 0.08525865711564361,
792
+ "rouge1": 0.10811404630359045,
793
+ "rouge2_ci_low": 0.010104689423152697,
794
+ "rouge2_ci_high": 0.019758335317794116,
795
+ "rougeL_ci_low": 0.06567970645776965,
796
+ "rougeL_ci_high": 0.08875406421447059,
797
+ "score_ci_low": 0.06567970645776965,
798
+ "score_ci_high": 0.08875406421447059,
799
+ "rougeLsum_ci_low": 0.07370118292484597,
800
+ "rougeLsum_ci_high": 0.09930080902612025,
801
+ "rouge1_ci_low": 0.09287465904192743,
802
+ "rouge1_ci_high": 0.12519489571711204
803
+ },
804
+ "score": 0.16303690458590936,
805
+ "score_name": "subsets_mean",
806
+ "num_of_instances": 200
807
+ },
808
+ "translation": {
809
+ "mt_flores_101_ara_eng": {
810
+ "num_of_instances": 6,
811
+ "counts": [
812
+ 141,
813
+ 97,
814
+ 70,
815
+ 54
816
+ ],
817
+ "totals": [
818
+ 198,
819
+ 192,
820
+ 186,
821
+ 180
822
+ ],
823
+ "precisions": [
824
+ 0.7121212121212122,
825
+ 0.5052083333333334,
826
+ 0.3763440860215054,
827
+ 0.3
828
+ ],
829
+ "bp": 0.950749126896934,
830
+ "sys_len": 198,
831
+ "ref_len": 208,
832
+ "sacrebleu": 0.42682380184507024,
833
+ "score": 0.42682380184507024,
834
+ "score_name": "sacrebleu",
835
+ "score_ci_low": 0.18249534058920447,
836
+ "score_ci_high": 0.5555695492421439,
837
+ "sacrebleu_ci_low": 0.18249534058920447,
838
+ "sacrebleu_ci_high": 0.5555695492421439
839
+ },
840
+ "mt_flores_101_deu_eng": {
841
+ "num_of_instances": 6,
842
+ "counts": [
843
+ 130,
844
+ 79,
845
+ 47,
846
+ 32
847
+ ],
848
+ "totals": [
849
+ 204,
850
+ 198,
851
+ 192,
852
+ 186
853
+ ],
854
+ "precisions": [
855
+ 0.6372549019607843,
856
+ 0.39898989898989895,
857
+ 0.24479166666666669,
858
+ 0.17204301075268816
859
+ ],
860
+ "bp": 0.9805831403241088,
861
+ "sys_len": 204,
862
+ "ref_len": 208,
863
+ "sacrebleu": 0.3154362559272254,
864
+ "score": 0.3154362559272254,
865
+ "score_name": "sacrebleu",
866
+ "score_ci_low": 0.18558098989202387,
867
+ "score_ci_high": 0.5388353725784034,
868
+ "sacrebleu_ci_low": 0.18558098989202387,
869
+ "sacrebleu_ci_high": 0.5388353725784034
870
+ },
871
+ "mt_flores_101_eng_ara": {
872
+ "num_of_instances": 6,
873
+ "counts": [
874
+ 115,
875
+ 62,
876
+ 39,
877
+ 23
878
+ ],
879
+ "totals": [
880
+ 198,
881
+ 192,
882
+ 186,
883
+ 180
884
+ ],
885
+ "precisions": [
886
+ 0.5808080808080809,
887
+ 0.32291666666666663,
888
+ 0.20967741935483872,
889
+ 0.1277777777777778
890
+ ],
891
+ "bp": 0.9459594689067654,
892
+ "sys_len": 198,
893
+ "ref_len": 209,
894
+ "sacrebleu": 0.25185759673418917,
895
+ "score": 0.25185759673418917,
896
+ "score_name": "sacrebleu",
897
+ "score_ci_low": 0.13927016777509416,
898
+ "score_ci_high": 0.3490260619035246,
899
+ "sacrebleu_ci_low": 0.13927016777509416,
900
+ "sacrebleu_ci_high": 0.3490260619035246
901
+ },
902
+ "mt_flores_101_eng_deu": {
903
+ "num_of_instances": 6,
904
+ "counts": [
905
+ 130,
906
+ 83,
907
+ 61,
908
+ 49
909
+ ],
910
+ "totals": [
911
+ 195,
912
+ 190,
913
+ 185,
914
+ 180
915
+ ],
916
+ "precisions": [
917
+ 0.6666666666666667,
918
+ 0.43684210526315786,
919
+ 0.32972972972972975,
920
+ 0.2722222222222222
921
+ ],
922
+ "bp": 0.8979038320326344,
923
+ "sys_len": 195,
924
+ "ref_len": 216,
925
+ "sacrebleu": 0.36104268767549647,
926
+ "score": 0.36104268767549647,
927
+ "score_name": "sacrebleu",
928
+ "score_ci_low": 0.1966239306512286,
929
+ "score_ci_high": 0.5546980188091108,
930
+ "sacrebleu_ci_low": 0.1966239306512286,
931
+ "sacrebleu_ci_high": 0.5546980188091108
932
+ },
933
+ "mt_flores_101_eng_fra": {
934
+ "num_of_instances": 6,
935
+ "counts": [
936
+ 113,
937
+ 79,
938
+ 54,
939
+ 34
940
+ ],
941
+ "totals": [
942
+ 158,
943
+ 154,
944
+ 150,
945
+ 146
946
+ ],
947
+ "precisions": [
948
+ 0.7151898734177214,
949
+ 0.512987012987013,
950
+ 0.36,
951
+ 0.2328767123287671
952
+ ],
953
+ "bp": 0.6142570611078176,
954
+ "sys_len": 158,
955
+ "ref_len": 235,
956
+ "sacrebleu": 0.25724043173711714,
957
+ "score": 0.25724043173711714,
958
+ "score_name": "sacrebleu",
959
+ "score_ci_low": 0.01141618661583713,
960
+ "score_ci_high": 0.40276606204285015,
961
+ "sacrebleu_ci_low": 0.01141618661583713,
962
+ "sacrebleu_ci_high": 0.40276606204285015
963
+ },
964
+ "mt_flores_101_eng_kor": {
965
+ "num_of_instances": 6,
966
+ "counts": [
967
+ 103,
968
+ 60,
969
+ 36,
970
+ 19
971
+ ],
972
+ "totals": [
973
+ 188,
974
+ 184,
975
+ 180,
976
+ 176
977
+ ],
978
+ "precisions": [
979
+ 0.5478723404255319,
980
+ 0.32608695652173914,
981
+ 0.2,
982
+ 0.10795454545454546
983
+ ],
984
+ "bp": 0.7229117789342253,
985
+ "sys_len": 188,
986
+ "ref_len": 249,
987
+ "sacrebleu": 0.18015911259729336,
988
+ "score": 0.18015911259729336,
989
+ "score_name": "sacrebleu",
990
+ "score_ci_low": 0.00534394043557254,
991
+ "score_ci_high": 0.2674171764554214,
992
+ "sacrebleu_ci_low": 0.00534394043557254,
993
+ "sacrebleu_ci_high": 0.2674171764554214
994
+ },
995
+ "mt_flores_101_eng_por": {
996
+ "num_of_instances": 6,
997
+ "counts": [
998
+ 162,
999
+ 118,
1000
+ 91,
1001
+ 72
1002
+ ],
1003
+ "totals": [
1004
+ 215,
1005
+ 209,
1006
+ 203,
1007
+ 197
1008
+ ],
1009
+ "precisions": [
1010
+ 0.7534883720930233,
1011
+ 0.5645933014354066,
1012
+ 0.4482758620689655,
1013
+ 0.36548223350253806
1014
+ ],
1015
+ "bp": 0.9679661710923415,
1016
+ "sys_len": 215,
1017
+ "ref_len": 222,
1018
+ "sacrebleu": 0.49735461800994313,
1019
+ "score": 0.49735461800994313,
1020
+ "score_name": "sacrebleu",
1021
+ "score_ci_low": 0.3936015193132162,
1022
+ "score_ci_high": 0.5938365039660292,
1023
+ "sacrebleu_ci_low": 0.3936015193132162,
1024
+ "sacrebleu_ci_high": 0.5938365039660292
1025
+ },
1026
+ "mt_flores_101_eng_ron": {
1027
+ "num_of_instances": 6,
1028
+ "counts": [
1029
+ 122,
1030
+ 83,
1031
+ 61,
1032
+ 50
1033
+ ],
1034
+ "totals": [
1035
+ 181,
1036
+ 176,
1037
+ 171,
1038
+ 166
1039
+ ],
1040
+ "precisions": [
1041
+ 0.6740331491712708,
1042
+ 0.47159090909090906,
1043
+ 0.3567251461988304,
1044
+ 0.30120481927710846
1045
+ ],
1046
+ "bp": 0.7628314075724358,
1047
+ "sys_len": 181,
1048
+ "ref_len": 230,
1049
+ "sacrebleu": 0.3279360384622422,
1050
+ "score": 0.3279360384622422,
1051
+ "score_name": "sacrebleu",
1052
+ "score_ci_low": 0.05614266270511962,
1053
+ "score_ci_high": 0.4307120450834441,
1054
+ "sacrebleu_ci_low": 0.05614266270511962,
1055
+ "sacrebleu_ci_high": 0.4307120450834441
1056
+ },
1057
+ "mt_flores_101_eng_spa": {
1058
+ "num_of_instances": 6,
1059
+ "counts": [
1060
+ 159,
1061
+ 99,
1062
+ 67,
1063
+ 46
1064
+ ],
1065
+ "totals": [
1066
+ 233,
1067
+ 227,
1068
+ 221,
1069
+ 215
1070
+ ],
1071
+ "precisions": [
1072
+ 0.6824034334763948,
1073
+ 0.43612334801762115,
1074
+ 0.3031674208144796,
1075
+ 0.21395348837209302
1076
+ ],
1077
+ "bp": 0.957989506197951,
1078
+ "sys_len": 233,
1079
+ "ref_len": 243,
1080
+ "sacrebleu": 0.35708685408102736,
1081
+ "score": 0.35708685408102736,
1082
+ "score_name": "sacrebleu",
1083
+ "score_ci_low": 0.3134751190784969,
1084
+ "score_ci_high": 0.44237915780333503,
1085
+ "sacrebleu_ci_low": 0.3134751190784969,
1086
+ "sacrebleu_ci_high": 0.44237915780333503
1087
+ },
1088
+ "mt_flores_101_fra_eng": {
1089
+ "num_of_instances": 6,
1090
+ "counts": [
1091
+ 144,
1092
+ 97,
1093
+ 64,
1094
+ 44
1095
+ ],
1096
+ "totals": [
1097
+ 219,
1098
+ 213,
1099
+ 207,
1100
+ 201
1101
+ ],
1102
+ "precisions": [
1103
+ 0.6575342465753425,
1104
+ 0.4553990610328638,
1105
+ 0.30917874396135264,
1106
+ 0.21890547263681592
1107
+ ],
1108
+ "bp": 1.0,
1109
+ "sys_len": 219,
1110
+ "ref_len": 208,
1111
+ "sacrebleu": 0.3773064147727779,
1112
+ "score": 0.3773064147727779,
1113
+ "score_name": "sacrebleu",
1114
+ "score_ci_low": 0.19660284291022648,
1115
+ "score_ci_high": 0.5367507535790375,
1116
+ "sacrebleu_ci_low": 0.19660284291022648,
1117
+ "sacrebleu_ci_high": 0.5367507535790375
1118
+ },
1119
+ "mt_flores_101_jpn_eng": {
1120
+ "num_of_instances": 6,
1121
+ "counts": [
1122
+ 131,
1123
+ 64,
1124
+ 36,
1125
+ 23
1126
+ ],
1127
+ "totals": [
1128
+ 210,
1129
+ 204,
1130
+ 198,
1131
+ 192
1132
+ ],
1133
+ "precisions": [
1134
+ 0.6238095238095238,
1135
+ 0.3137254901960784,
1136
+ 0.18181818181818182,
1137
+ 0.11979166666666666
1138
+ ],
1139
+ "bp": 1.0,
1140
+ "sys_len": 210,
1141
+ "ref_len": 208,
1142
+ "sacrebleu": 0.2555150181574835,
1143
+ "score": 0.2555150181574835,
1144
+ "score_name": "sacrebleu",
1145
+ "score_ci_low": 0.10158659890719782,
1146
+ "score_ci_high": 0.34125387925677286,
1147
+ "sacrebleu_ci_low": 0.10158659890719782,
1148
+ "sacrebleu_ci_high": 0.34125387925677286
1149
+ },
1150
+ "mt_flores_101_kor_eng": {
1151
+ "num_of_instances": 6,
1152
+ "counts": [
1153
+ 127,
1154
+ 73,
1155
+ 45,
1156
+ 30
1157
+ ],
1158
+ "totals": [
1159
+ 201,
1160
+ 195,
1161
+ 189,
1162
+ 183
1163
+ ],
1164
+ "precisions": [
1165
+ 0.6318407960199005,
1166
+ 0.37435897435897436,
1167
+ 0.2380952380952381,
1168
+ 0.16393442622950818
1169
+ ],
1170
+ "bp": 0.9657735711441044,
1171
+ "sys_len": 201,
1172
+ "ref_len": 208,
1173
+ "sacrebleu": 0.29936740534149714,
1174
+ "score": 0.29936740534149714,
1175
+ "score_name": "sacrebleu",
1176
+ "score_ci_low": 0.17940891931864458,
1177
+ "score_ci_high": 0.4579443376656351,
1178
+ "sacrebleu_ci_low": 0.17940891931864458,
1179
+ "sacrebleu_ci_high": 0.4579443376656351
1180
+ },
1181
+ "mt_flores_101_por_eng": {
1182
+ "num_of_instances": 6,
1183
+ "counts": [
1184
+ 150,
1185
+ 105,
1186
+ 74,
1187
+ 55
1188
+ ],
1189
+ "totals": [
1190
+ 201,
1191
+ 195,
1192
+ 189,
1193
+ 183
1194
+ ],
1195
+ "precisions": [
1196
+ 0.746268656716418,
1197
+ 0.5384615384615384,
1198
+ 0.3915343915343915,
1199
+ 0.3005464480874317
1200
+ ],
1201
+ "bp": 0.9657735711441044,
1202
+ "sys_len": 201,
1203
+ "ref_len": 208,
1204
+ "sacrebleu": 0.4503582964904346,
1205
+ "score": 0.4503582964904346,
1206
+ "score_name": "sacrebleu",
1207
+ "score_ci_low": 0.23158737188314713,
1208
+ "score_ci_high": 0.598006434562735,
1209
+ "sacrebleu_ci_low": 0.23158737188314713,
1210
+ "sacrebleu_ci_high": 0.598006434562735
1211
+ },
1212
+ "mt_flores_101_ron_eng": {
1213
+ "num_of_instances": 6,
1214
+ "counts": [
1215
+ 152,
1216
+ 103,
1217
+ 71,
1218
+ 50
1219
+ ],
1220
+ "totals": [
1221
+ 216,
1222
+ 210,
1223
+ 204,
1224
+ 198
1225
+ ],
1226
+ "precisions": [
1227
+ 0.7037037037037037,
1228
+ 0.4904761904761905,
1229
+ 0.3480392156862745,
1230
+ 0.25252525252525254
1231
+ ],
1232
+ "bp": 1.0,
1233
+ "sys_len": 216,
1234
+ "ref_len": 208,
1235
+ "sacrebleu": 0.4173353670626537,
1236
+ "score": 0.4173353670626537,
1237
+ "score_name": "sacrebleu",
1238
+ "score_ci_low": 0.27899206861096376,
1239
+ "score_ci_high": 0.58440086756332,
1240
+ "sacrebleu_ci_low": 0.27899206861096376,
1241
+ "sacrebleu_ci_high": 0.58440086756332
1242
+ },
1243
+ "mt_flores_101_spa_eng": {
1244
+ "num_of_instances": 6,
1245
+ "counts": [
1246
+ 97,
1247
+ 59,
1248
+ 39,
1249
+ 26
1250
+ ],
1251
+ "totals": [
1252
+ 164,
1253
+ 159,
1254
+ 154,
1255
+ 149
1256
+ ],
1257
+ "precisions": [
1258
+ 0.5914634146341463,
1259
+ 0.37106918238993714,
1260
+ 0.2532467532467533,
1261
+ 0.174496644295302
1262
+ ],
1263
+ "bp": 0.764683938413801,
1264
+ "sys_len": 164,
1265
+ "ref_len": 208,
1266
+ "sacrebleu": 0.2399719071849068,
1267
+ "score": 0.2399719071849068,
1268
+ "score_name": "sacrebleu",
1269
+ "score_ci_low": 0.05189527961371486,
1270
+ "score_ci_high": 0.411377459488148,
1271
+ "sacrebleu_ci_low": 0.05189527961371486,
1272
+ "sacrebleu_ci_high": 0.411377459488148
1273
+ },
1274
+ "score": 0.3343194537386239,
1275
+ "score_name": "subsets_mean",
1276
+ "num_of_instances": 90
1277
+ },
1278
+ "score": 0.5398167389664902,
1279
+ "score_name": "subsets_mean",
1280
+ "num_of_instances": 1537
1281
+ }
1282
+ }