jbnayahu committed
Commit fcc023f · unverified · 1 Parent(s): 0650525

Updated results.

Signed-off-by: Jonathan Bnayahu <[email protected]>

Files changed (18)
  1. results/bluebench/2025-06-22T14-01-49_evaluation_results.json +0 -1283
  2. results/bluebench/2025-06-22T15-05-33_evaluation_results.json +0 -1283
  3. results/bluebench/2025-06-23T02-53-05_evaluation_results.json +0 -1283
  4. results/bluebench/2025-06-23T03-17-57_evaluation_results.json +0 -1283
  5. results/bluebench/2025-06-23T04-06-37_evaluation_results.json +0 -1283
  6. results/bluebench/2025-06-23T06-18-33_evaluation_results.json +0 -1283
  7. results/bluebench/2025-06-23T08-43-46_evaluation_results.json +0 -1283
  8. results/bluebench/2025-06-23T15-33-11_evaluation_results.json +0 -1283
  9. results/bluebench/{2025-06-23T09-36-33_evaluation_results.json → 2025-07-02T14-58-20_evaluation_results.json} +799 -801
  10. results/bluebench/2025-07-02T15-15-09_evaluation_results.json +1281 -0
  11. results/bluebench/2025-07-02T15-54-03_evaluation_results.json +1281 -0
  12. results/bluebench/{2025-06-23T04-42-35_evaluation_results.json → 2025-07-02T16-08-27_evaluation_results.json} +810 -812
  13. results/bluebench/{2025-06-23T05-36-33_evaluation_results.json → 2025-07-02T16-23-36_evaluation_results.json} +802 -804
  14. results/bluebench/{2025-06-22T17-10-54_evaluation_results.json → 2025-07-02T17-12-27_evaluation_results.json} +763 -765
  15. results/bluebench/{2025-06-22T19-25-42_evaluation_results.json → 2025-07-02T17-33-41_evaluation_results.json} +770 -772
  16. results/bluebench/{2025-06-24T05-35-50_evaluation_results.json → 2025-07-02T18-12-30_evaluation_results.json} +774 -776
  17. results/bluebench/{2025-06-23T14-18-29_evaluation_results.json → 2025-07-02T18-37-37_evaluation_results.json} +769 -771
  18. results/bluebench/2025-07-02T18-57-45_evaluation_results.json +1281 -0
results/bluebench/2025-06-22T14-01-49_evaluation_results.json DELETED
@@ -1,1283 +0,0 @@
1
- {
2
- "environment_info": {
3
- "timestamp_utc": "2025-06-22T18:01:46.346556Z",
4
- "command_line_invocation": [
5
- "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
- "--tasks",
7
- "benchmarks.bluebench",
8
- "--model",
9
- "cross_provider",
10
- "--model_args",
11
- "model_name=watsonx/meta-llama/llama-3-2-11b-vision-instruct,max_tokens=256",
12
- "--output_path",
13
- "./results/bluebench",
14
- "--log_samples",
15
- "--trust_remote_code",
16
- "--batch_size",
17
- "8",
18
- "--verbosity",
19
- "ERROR"
20
- ],
21
- "parsed_arguments": {
22
- "tasks": [
23
- "benchmarks.bluebench"
24
- ],
25
- "split": "test",
26
- "num_fewshots": null,
27
- "limit": null,
28
- "batch_size": 8,
29
- "model": "watsonx/meta-llama/llama-3-2-11b-vision-instruct",
30
- "model_args": {
31
- "max_tokens": 256
32
- },
33
- "gen_kwargs": null,
34
- "chat_template_kwargs": null,
35
- "output_path": "./results/bluebench",
36
- "output_file_prefix": "evaluation_results",
37
- "log_samples": true,
38
- "verbosity": "ERROR",
39
- "apply_chat_template": false,
40
- "trust_remote_code": true,
41
- "disable_hf_cache": false,
42
- "cache_dir": null
43
- },
44
- "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
- "python_version": "3.10.18",
47
- "system": "Linux",
48
- "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
- "installed_packages": {
50
- "nvidia-cufile-cu12": "1.11.1.6",
51
- "triton": "3.3.1",
52
- "nltk": "3.9.1",
53
- "anyio": "4.9.0",
54
- "absl-py": "2.3.0",
55
- "tiktoken": "0.9.0",
56
- "charset-normalizer": "3.4.2",
57
- "nvidia-cuda-runtime-cu12": "12.6.77",
58
- "sympy": "1.14.0",
59
- "mecab-ko": "1.0.1",
60
- "litellm": "1.72.6.post1",
61
- "httpcore": "1.0.9",
62
- "Jinja2": "3.1.6",
63
- "jsonschema-specifications": "2025.4.1",
64
- "pydantic_core": "2.33.2",
65
- "nvidia-cusparse-cu12": "12.5.4.2",
66
- "yarl": "1.20.1",
67
- "openai": "1.88.0",
68
- "portalocker": "3.2.0",
69
- "pandas": "2.3.0",
70
- "multiprocess": "0.70.16",
71
- "jsonschema": "4.24.0",
72
- "unitxt": "1.24.0",
73
- "nvidia-nvjitlink-cu12": "12.6.85",
74
- "nvidia-cublas-cu12": "12.6.4.1",
75
- "pydantic": "2.11.7",
76
- "async-timeout": "5.0.1",
77
- "annotated-types": "0.7.0",
78
- "rouge_score": "0.1.2",
79
- "contourpy": "1.3.2",
80
- "aiosignal": "1.3.2",
81
- "nvidia-cuda-cupti-cu12": "12.6.80",
82
- "pillow": "11.2.1",
83
- "six": "1.17.0",
84
- "diskcache": "5.6.3",
85
- "tqdm": "4.67.1",
86
- "pyarrow": "20.0.0",
87
- "h11": "0.16.0",
88
- "zipp": "3.19.2",
89
- "tzdata": "2025.2",
90
- "bert-score": "0.3.13",
91
- "setuptools": "80.9.0",
92
- "referencing": "0.36.2",
93
- "sacrebleu": "2.5.1",
94
- "filelock": "3.18.0",
95
- "urllib3": "2.5.0",
96
- "scipy": "1.15.3",
97
- "nvidia-nccl-cu12": "2.26.2",
98
- "kiwisolver": "1.4.8",
99
- "networkx": "3.4.2",
100
- "typing-inspection": "0.4.1",
101
- "lxml": "5.4.0",
102
- "sniffio": "1.3.1",
103
- "scikit-learn": "1.7.0",
104
- "nvidia-curand-cu12": "10.3.7.77",
105
- "pip": "25.1.1",
106
- "fonttools": "4.58.4",
107
- "transformers": "4.52.4",
108
- "datasets": "3.6.0",
109
- "nvidia-cusolver-cu12": "11.7.1.2",
110
- "cycler": "0.12.1",
111
- "evaluate": "0.4.3",
112
- "distro": "1.9.0",
113
- "idna": "3.10",
114
- "MarkupSafe": "3.0.2",
115
- "frozenlist": "1.7.0",
116
- "pyparsing": "3.2.3",
117
- "jiter": "0.10.0",
118
- "importlib_metadata": "8.0.0",
119
- "packaging": "24.2",
120
- "psutil": "7.0.0",
121
- "mecab-ko-dic": "1.0.0",
122
- "joblib": "1.5.1",
123
- "fsspec": "2025.3.0",
124
- "dill": "0.3.8",
125
- "tokenizers": "0.21.1",
126
- "wheel": "0.45.1",
127
- "nvidia-nvtx-cu12": "12.6.77",
128
- "nvidia-cusparselt-cu12": "0.6.3",
129
- "hf-xet": "1.1.4",
130
- "propcache": "0.3.2",
131
- "numpy": "2.2.6",
132
- "mpmath": "1.3.0",
133
- "multidict": "6.5.0",
134
- "conllu": "6.0.0",
135
- "safetensors": "0.5.3",
136
- "requests": "2.32.4",
137
- "regex": "2024.11.6",
138
- "aiohttp": "3.12.13",
139
- "tabulate": "0.9.0",
140
- "certifi": "2025.6.15",
141
- "accelerate": "1.8.0",
142
- "nvidia-cufft-cu12": "11.3.0.4",
143
- "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
- "click": "8.2.1",
145
- "typing_extensions": "4.12.2",
146
- "attrs": "25.3.0",
147
- "exceptiongroup": "1.3.0",
148
- "tenacity": "9.1.2",
149
- "pytz": "2025.2",
150
- "aiohappyeyeballs": "2.6.1",
151
- "python-dateutil": "2.9.0.post0",
152
- "torch": "2.7.1",
153
- "python-dotenv": "1.1.0",
154
- "httpx": "0.28.1",
155
- "matplotlib": "3.10.3",
156
- "xxhash": "3.5.0",
157
- "PyYAML": "6.0.2",
158
- "huggingface-hub": "0.33.0",
159
- "colorama": "0.4.6",
160
- "rpds-py": "0.25.1",
161
- "threadpoolctl": "3.6.0",
162
- "nvidia-cudnn-cu12": "9.5.1.17",
163
- "jaraco.collections": "5.1.0",
164
- "tomli": "2.0.1",
165
- "backports.tarfile": "1.2.0",
166
- "jaraco.context": "5.3.0",
167
- "typeguard": "4.3.0",
168
- "autocommand": "2.2.2",
169
- "jaraco.text": "3.12.1",
170
- "more-itertools": "10.3.0",
171
- "platformdirs": "4.2.2",
172
- "inflect": "7.3.1",
173
- "jaraco.functools": "4.0.1"
174
- }
175
- },
176
- "results": {
177
- "bias": {
178
- "safety_bbq_age": {
179
- "accuracy": 0.7444444444444445,
180
- "accuracy_ci_low": 0.6555555555555556,
181
- "accuracy_ci_high": 0.8333333333333334,
182
- "score_name": "accuracy",
183
- "score": 0.7444444444444445,
184
- "score_ci_high": 0.8333333333333334,
185
- "score_ci_low": 0.6555555555555556,
186
- "num_of_instances": 90
187
- },
188
- "safety_bbq_disability_status": {
189
- "accuracy": 0.7888888888888889,
190
- "accuracy_ci_low": 0.7,
191
- "accuracy_ci_high": 0.8666666666666667,
192
- "score_name": "accuracy",
193
- "score": 0.7888888888888889,
194
- "score_ci_high": 0.8666666666666667,
195
- "score_ci_low": 0.7,
196
- "num_of_instances": 90
197
- },
198
- "safety_bbq_gender_identity": {
199
- "accuracy": 0.9111111111111111,
200
- "accuracy_ci_low": 0.8444444444444444,
201
- "accuracy_ci_high": 0.9555555555555556,
202
- "score_name": "accuracy",
203
- "score": 0.9111111111111111,
204
- "score_ci_high": 0.9555555555555556,
205
- "score_ci_low": 0.8444444444444444,
206
- "num_of_instances": 90
207
- },
208
- "safety_bbq_nationality": {
209
- "accuracy": 0.7888888888888889,
210
- "accuracy_ci_low": 0.7,
211
- "accuracy_ci_high": 0.8666666666666667,
212
- "score_name": "accuracy",
213
- "score": 0.7888888888888889,
214
- "score_ci_high": 0.8666666666666667,
215
- "score_ci_low": 0.7,
216
- "num_of_instances": 90
217
- },
218
- "safety_bbq_physical_appearance": {
219
- "accuracy": 0.8111111111111111,
220
- "accuracy_ci_low": 0.7222222222222222,
221
- "accuracy_ci_high": 0.8888888888888888,
222
- "score_name": "accuracy",
223
- "score": 0.8111111111111111,
224
- "score_ci_high": 0.8888888888888888,
225
- "score_ci_low": 0.7222222222222222,
226
- "num_of_instances": 90
227
- },
228
- "safety_bbq_race_ethnicity": {
229
- "accuracy": 0.9,
230
- "accuracy_ci_low": 0.8222222222222222,
231
- "accuracy_ci_high": 0.9555555555555556,
232
- "score_name": "accuracy",
233
- "score": 0.9,
234
- "score_ci_high": 0.9555555555555556,
235
- "score_ci_low": 0.8222222222222222,
236
- "num_of_instances": 90
237
- },
238
- "safety_bbq_race_x_gender": {
239
- "accuracy": 0.9666666666666667,
240
- "accuracy_ci_low": 0.9111111111111111,
241
- "accuracy_ci_high": 0.9888888888888889,
242
- "score_name": "accuracy",
243
- "score": 0.9666666666666667,
244
- "score_ci_high": 0.9888888888888889,
245
- "score_ci_low": 0.9111111111111111,
246
- "num_of_instances": 90
247
- },
248
- "safety_bbq_race_x_ses": {
249
- "accuracy": 0.8666666666666667,
250
- "accuracy_ci_low": 0.7888888888888889,
251
- "accuracy_ci_high": 0.9333333333333333,
252
- "score_name": "accuracy",
253
- "score": 0.8666666666666667,
254
- "score_ci_high": 0.9333333333333333,
255
- "score_ci_low": 0.7888888888888889,
256
- "num_of_instances": 90
257
- },
258
- "safety_bbq_religion": {
259
- "accuracy": 0.9111111111111111,
260
- "accuracy_ci_low": 0.8412016500028439,
261
- "accuracy_ci_high": 0.9555555555555556,
262
- "score_name": "accuracy",
263
- "score": 0.9111111111111111,
264
- "score_ci_high": 0.9555555555555556,
265
- "score_ci_low": 0.8412016500028439,
266
- "num_of_instances": 90
267
- },
268
- "safety_bbq_ses": {
269
- "accuracy": 0.8444444444444444,
270
- "accuracy_ci_low": 0.7666666666666667,
271
- "accuracy_ci_high": 0.9111111111111111,
272
- "score_name": "accuracy",
273
- "score": 0.8444444444444444,
274
- "score_ci_high": 0.9111111111111111,
275
- "score_ci_low": 0.7666666666666667,
276
- "num_of_instances": 90
277
- },
278
- "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.9111111111111111,
280
- "accuracy_ci_low": 0.8333333333333334,
281
- "accuracy_ci_high": 0.9555555555555556,
282
- "score_name": "accuracy",
283
- "score": 0.9111111111111111,
284
- "score_ci_high": 0.9555555555555556,
285
- "score_ci_low": 0.8333333333333334,
286
- "num_of_instances": 90
287
- },
288
- "score": 0.8585858585858586,
289
- "score_name": "subsets_mean",
290
- "num_of_instances": 990
291
- },
292
- "chatbot_abilities": {
293
- "arena_hard_generation_english_gpt_4_0314_reference": {
294
- "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.07721639656816015,
296
- "score": 0.07721639656816015,
297
- "score_name": "llama_3_70b_instruct_template_arena_hard"
298
- },
299
- "score": 0.07721639656816015,
300
- "score_name": "subsets_mean",
301
- "num_of_instances": 500
302
- },
303
- "entity_extraction": {
304
- "universal_ner_en_ewt": {
305
- "num_of_instances": 1000,
306
- "f1_Person": 0.48831168831168825,
307
- "f1_Organization": 0.35220125786163525,
308
- "f1_Location": 0.3775100401606426,
309
- "f1_macro": 0.406007662111322,
310
- "recall_macro": 0.3667818453974414,
311
- "precision_macro": 0.4584981753989352,
312
- "in_classes_support": 0.7834862385321101,
313
- "f1_micro": 0.3682242990654206,
314
- "recall_micro": 0.37523809523809526,
315
- "precision_micro": 0.3614678899082569,
316
- "score": 0.3682242990654206,
317
- "score_name": "f1_micro",
318
- "score_ci_low": 0.32532095178317566,
319
- "score_ci_high": 0.4180775144242145,
320
- "f1_micro_ci_low": 0.32532095178317566,
321
- "f1_micro_ci_high": 0.4180775144242145
322
- },
323
- "score": 0.3682242990654206,
324
- "score_name": "subsets_mean",
325
- "num_of_instances": 1000
326
- },
327
- "knowledge": {
328
- "mmlu_pro_biology": {
329
- "accuracy": 0.5633802816901409,
330
- "accuracy_ci_low": 0.4507042253521127,
331
- "accuracy_ci_high": 0.676056338028169,
332
- "score_name": "accuracy",
333
- "score": 0.5633802816901409,
334
- "score_ci_high": 0.676056338028169,
335
- "score_ci_low": 0.4507042253521127,
336
- "num_of_instances": 71
337
- },
338
- "mmlu_pro_business": {
339
- "accuracy": 0.2535211267605634,
340
- "accuracy_ci_low": 0.15492957746478872,
341
- "accuracy_ci_high": 0.36619718309859156,
342
- "score_name": "accuracy",
343
- "score": 0.2535211267605634,
344
- "score_ci_high": 0.36619718309859156,
345
- "score_ci_low": 0.15492957746478872,
346
- "num_of_instances": 71
347
- },
348
- "mmlu_pro_chemistry": {
349
- "accuracy": 0.23943661971830985,
350
- "accuracy_ci_low": 0.15492957746478872,
351
- "accuracy_ci_high": 0.352112676056338,
352
- "score_name": "accuracy",
353
- "score": 0.23943661971830985,
354
- "score_ci_high": 0.352112676056338,
355
- "score_ci_low": 0.15492957746478872,
356
- "num_of_instances": 71
357
- },
358
- "mmlu_pro_computer_science": {
359
- "accuracy": 0.36619718309859156,
360
- "accuracy_ci_low": 0.2535211267605634,
361
- "accuracy_ci_high": 0.4788732394366197,
362
- "score_name": "accuracy",
363
- "score": 0.36619718309859156,
364
- "score_ci_high": 0.4788732394366197,
365
- "score_ci_low": 0.2535211267605634,
366
- "num_of_instances": 71
367
- },
368
- "mmlu_pro_economics": {
369
- "accuracy": 0.5492957746478874,
370
- "accuracy_ci_low": 0.43661971830985913,
371
- "accuracy_ci_high": 0.6619718309859155,
372
- "score_name": "accuracy",
373
- "score": 0.5492957746478874,
374
- "score_ci_high": 0.6619718309859155,
375
- "score_ci_low": 0.43661971830985913,
376
- "num_of_instances": 71
377
- },
378
- "mmlu_pro_engineering": {
379
- "accuracy": 0.23943661971830985,
380
- "accuracy_ci_low": 0.15492957746478872,
381
- "accuracy_ci_high": 0.352112676056338,
382
- "score_name": "accuracy",
383
- "score": 0.23943661971830985,
384
- "score_ci_high": 0.352112676056338,
385
- "score_ci_low": 0.15492957746478872,
386
- "num_of_instances": 71
387
- },
388
- "mmlu_pro_health": {
389
- "accuracy": 0.4788732394366197,
390
- "accuracy_ci_low": 0.36619718309859156,
391
- "accuracy_ci_high": 0.6056338028169014,
392
- "score_name": "accuracy",
393
- "score": 0.4788732394366197,
394
- "score_ci_high": 0.6056338028169014,
395
- "score_ci_low": 0.36619718309859156,
396
- "num_of_instances": 71
397
- },
398
- "mmlu_pro_history": {
399
- "accuracy": 0.5070422535211268,
400
- "accuracy_ci_low": 0.39436619718309857,
401
- "accuracy_ci_high": 0.6197183098591549,
402
- "score_name": "accuracy",
403
- "score": 0.5070422535211268,
404
- "score_ci_high": 0.6197183098591549,
405
- "score_ci_low": 0.39436619718309857,
406
- "num_of_instances": 71
407
- },
408
- "mmlu_pro_law": {
409
- "accuracy": 0.30985915492957744,
410
- "accuracy_ci_low": 0.2112676056338028,
411
- "accuracy_ci_high": 0.42820969566908634,
412
- "score_name": "accuracy",
413
- "score": 0.30985915492957744,
414
- "score_ci_high": 0.42820969566908634,
415
- "score_ci_low": 0.2112676056338028,
416
- "num_of_instances": 71
417
- },
418
- "mmlu_pro_math": {
419
- "accuracy": 0.14084507042253522,
420
- "accuracy_ci_low": 0.07042253521126761,
421
- "accuracy_ci_high": 0.22535211267605634,
422
- "score_name": "accuracy",
423
- "score": 0.14084507042253522,
424
- "score_ci_high": 0.22535211267605634,
425
- "score_ci_low": 0.07042253521126761,
426
- "num_of_instances": 71
427
- },
428
- "mmlu_pro_other": {
429
- "accuracy": 0.28169014084507044,
430
- "accuracy_ci_low": 0.18309859154929578,
431
- "accuracy_ci_high": 0.39436619718309857,
432
- "score_name": "accuracy",
433
- "score": 0.28169014084507044,
434
- "score_ci_high": 0.39436619718309857,
435
- "score_ci_low": 0.18309859154929578,
436
- "num_of_instances": 71
437
- },
438
- "mmlu_pro_philosophy": {
439
- "accuracy": 0.4507042253521127,
440
- "accuracy_ci_low": 0.323943661971831,
441
- "accuracy_ci_high": 0.5633802816901409,
442
- "score_name": "accuracy",
443
- "score": 0.4507042253521127,
444
- "score_ci_high": 0.5633802816901409,
445
- "score_ci_low": 0.323943661971831,
446
- "num_of_instances": 71
447
- },
448
- "mmlu_pro_physics": {
449
- "accuracy": 0.2535211267605634,
450
- "accuracy_ci_low": 0.16901408450704225,
451
- "accuracy_ci_high": 0.36619718309859156,
452
- "score_name": "accuracy",
453
- "score": 0.2535211267605634,
454
- "score_ci_high": 0.36619718309859156,
455
- "score_ci_low": 0.16901408450704225,
456
- "num_of_instances": 71
457
- },
458
- "mmlu_pro_psychology": {
459
- "accuracy": 0.5774647887323944,
460
- "accuracy_ci_low": 0.4507042253521127,
461
- "accuracy_ci_high": 0.6894343225712088,
462
- "score_name": "accuracy",
463
- "score": 0.5774647887323944,
464
- "score_ci_high": 0.6894343225712088,
465
- "score_ci_low": 0.4507042253521127,
466
- "num_of_instances": 71
467
- },
468
- "score": 0.3722334004024145,
469
- "score_name": "subsets_mean",
470
- "num_of_instances": 994
471
- },
472
- "legal": {
473
- "legalbench_abercrombie": {
474
- "f1_macro": 0.5662558356676004,
475
- "f1_suggestive": 0.4666666666666667,
476
- "f1_arbitrary": 0.4444444444444444,
477
- "f1_generic": 0.8571428571428571,
478
- "f1_fanciful": 0.35714285714285715,
479
- "f1_descriptive": 0.7058823529411765,
480
- "f1_macro_ci_low": 0.47410052522342583,
481
- "f1_macro_ci_high": 0.6713730404881563,
482
- "score_name": "f1_micro",
483
- "score": 0.5575757575757576,
484
- "score_ci_high": 0.6506589298059469,
485
- "score_ci_low": 0.4457831325301205,
486
- "num_of_instances": 85,
487
- "accuracy": 0.5411764705882353,
488
- "accuracy_ci_low": 0.43529411764705883,
489
- "accuracy_ci_high": 0.6352941176470588,
490
- "f1_micro": 0.5575757575757576,
491
- "f1_micro_ci_low": 0.4457831325301205,
492
- "f1_micro_ci_high": 0.6506589298059469
493
- },
494
- "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.575166034793364,
496
- "f1_no": 0.6877470355731226,
497
- "f1_yes": 0.46258503401360546,
498
- "f1_macro_ci_low": 0.5066495066495067,
499
- "f1_macro_ci_high": 0.6496773446094443,
500
- "score_name": "f1_micro",
501
- "score": 0.605,
502
- "score_ci_high": 0.67,
503
- "score_ci_low": 0.535,
504
- "num_of_instances": 200,
505
- "accuracy": 0.605,
506
- "accuracy_ci_low": 0.535,
507
- "accuracy_ci_high": 0.67,
508
- "f1_micro": 0.605,
509
- "f1_micro_ci_low": 0.535,
510
- "f1_micro_ci_high": 0.67
511
- },
512
- "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.3240051765352555,
514
- "f1_conclusion": 0.0975609756097561,
515
- "f1_analysis": 0.509090909090909,
516
- "f1_decree": 0.34285714285714286,
517
- "f1_issue": 0.22641509433962265,
518
- "f1_procedural history": 0.29850746268656714,
519
- "f1_facts": 0.4186046511627907,
520
- "f1_rule": 0.375,
521
- "f1_macro_ci_low": 0.2737291340244584,
522
- "f1_macro_ci_high": 0.39709087675818633,
523
- "score_name": "f1_micro",
524
- "score": 0.3526448362720403,
525
- "score_ci_high": 0.42317380352644834,
526
- "score_ci_low": 0.29292929292929293,
527
- "num_of_instances": 200,
528
- "accuracy": 0.35,
529
- "accuracy_ci_low": 0.29,
530
- "accuracy_ci_high": 0.42,
531
- "f1_micro": 0.3526448362720403,
532
- "f1_micro_ci_low": 0.29292929292929293,
533
- "f1_micro_ci_high": 0.42317380352644834
534
- },
535
- "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.5769837972579975,
537
- "f1_yes": 0.45517241379310347,
538
- "f1_no": 0.6987951807228916,
539
- "f1_macro_ci_low": 0.5127178863190986,
540
- "f1_macro_ci_high": 0.6553872211311121,
541
- "score_name": "f1_micro",
542
- "score": 0.6091370558375635,
543
- "score_ci_high": 0.6785772255666204,
544
- "score_ci_low": 0.5449871465295629,
545
- "num_of_instances": 200,
546
- "accuracy": 0.6,
547
- "accuracy_ci_low": 0.535,
548
- "accuracy_ci_high": 0.67,
549
- "f1_micro": 0.6091370558375635,
550
- "f1_micro_ci_low": 0.5449871465295629,
551
- "f1_micro_ci_high": 0.6785772255666204
552
- },
553
- "legalbench_proa": {
554
- "f1_macro": 0.9404761904761905,
555
- "f1_yes": 0.9523809523809523,
556
- "f1_no": 0.9285714285714286,
557
- "f1_macro_ci_low": 0.8717038360531253,
558
- "f1_macro_ci_high": 0.9763503609021853,
559
- "score_name": "f1_micro",
560
- "score": 0.9404761904761905,
561
- "score_ci_high": 0.9764705882352941,
562
- "score_ci_low": 0.8724795930656631,
563
- "num_of_instances": 85,
564
- "accuracy": 0.9294117647058824,
565
- "accuracy_ci_low": 0.8470588235294118,
566
- "accuracy_ci_high": 0.9764705882352941,
567
- "f1_micro": 0.9404761904761905,
568
- "f1_micro_ci_low": 0.8724795930656631,
569
- "f1_micro_ci_high": 0.9764705882352941
570
- },
571
- "score": 0.6129667680323103,
572
- "score_name": "subsets_mean",
573
- "num_of_instances": 770
574
- },
575
- "news_classification": {
576
- "20_newsgroups_short": {
577
- "f1_macro": 0.5153316959213506,
578
- "f1_cars": 0.7317073170731707,
579
- "f1_windows x": 0.08450704225352113,
580
- "f1_computer graphics": 0.4948453608247423,
581
- "f1_atheism": 0.2978723404255319,
582
- "f1_religion": 0.05263157894736842,
583
- "f1_medicine": 0.7733333333333333,
584
- "f1_christianity": 0.5806451612903226,
585
- "f1_microsoft windows": 0.4507042253521127,
586
- "f1_middle east": 0.32727272727272727,
587
- "f1_politics": 0.4132231404958678,
588
- "f1_motorcycles": 0.7058823529411765,
589
- "f1_pc hardware": 0.48520710059171596,
590
- "f1_mac hardware": 0.5057471264367817,
591
- "f1_electronics": 0.48739495798319327,
592
- "f1_for sale": 0.5,
593
- "f1_guns": 0.28125,
594
- "f1_space": 0.7659574468085106,
595
- "f1_cryptography": 0.6,
596
- "f1_baseball": 0.8813559322033898,
597
- "f1_hockey": 0.8870967741935484,
598
- "f1_macro_ci_low": 0.4890786960094656,
599
- "f1_macro_ci_high": 0.5464781246183315,
600
- "score_name": "f1_micro",
601
- "score": 0.5437325905292479,
602
- "score_ci_high": 0.5741315636296753,
603
- "score_ci_low": 0.5090753018614114,
604
- "num_of_instances": 1000,
605
- "accuracy": 0.488,
606
- "accuracy_ci_low": 0.454,
607
- "accuracy_ci_high": 0.519,
608
- "f1_micro": 0.5437325905292479,
609
- "f1_micro_ci_low": 0.5090753018614114,
610
- "f1_micro_ci_high": 0.5741315636296753
611
- },
612
- "score": 0.5437325905292479,
613
- "score_name": "subsets_mean",
614
- "num_of_instances": 1000
615
- },
616
- "product_help": {
617
- "cfpb_product_2023": {
618
- "f1_macro": 0.6685227589041403,
619
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.9225251076040172,
620
- "f1_checking or savings account": 0.5806451612903226,
621
- "f1_debt collection": 0.5274725274725275,
622
- "f1_credit card or prepaid card": 0.6371681415929203,
623
- "f1_mortgage": 0.8059701492537313,
624
- "f1_student loan": 0.8571428571428571,
625
- "f1_money transfer or virtual currency or money service": 0.6181818181818182,
626
- "f1_vehicle loan or lease": 0.6060606060606061,
627
- "f1_payday loan or title loan or personal loan": 0.46153846153846156,
628
- "f1_macro_ci_low": 0.6111841538128283,
629
- "f1_macro_ci_high": 0.7335266591830523,
630
- "score_name": "f1_micro",
631
- "score": 0.8321536905965622,
632
- "score_ci_high": 0.85326682230999,
633
- "score_ci_low": 0.80760586975502,
634
- "num_of_instances": 1000,
635
- "accuracy": 0.823,
636
- "accuracy_ci_low": 0.7962032615906698,
637
- "accuracy_ci_high": 0.8449169646606582,
638
- "f1_micro": 0.8321536905965622,
639
- "f1_micro_ci_low": 0.80760586975502,
640
- "f1_micro_ci_high": 0.85326682230999
641
- },
642
- "cfpb_product_watsonx": {
643
- "f1_macro": 0.6712830729866164,
644
- "f1_mortgages and loans": 0.7439024390243902,
645
- "f1_credit card": 0.7777777777777778,
646
- "f1_debt collection": 0.6571428571428571,
647
- "f1_credit reporting": 0.7817589576547231,
648
- "f1_retail banking": 0.3958333333333333,
649
- "f1_macro_ci_low": 0.6287200378375363,
650
- "f1_macro_ci_high": 0.7180257299728254,
651
- "score_name": "f1_micro",
652
- "score": 0.7097435897435898,
653
- "score_ci_high": 0.7484617342104366,
654
- "score_ci_low": 0.6680812073559,
655
- "num_of_instances": 500,
656
- "accuracy": 0.692,
657
- "accuracy_ci_low": 0.648,
658
- "accuracy_ci_high": 0.732,
659
- "f1_micro": 0.7097435897435898,
660
- "f1_micro_ci_low": 0.6680812073559,
661
- "f1_micro_ci_high": 0.7484617342104366
662
- },
663
- "score": 0.770948640170076,
664
- "score_name": "subsets_mean",
665
- "num_of_instances": 1500
666
- },
667
- "qa_finance": {
668
- "fin_qa": {
669
- "num_of_instances": 1000,
670
- "program_accuracy": 0.063,
671
- "score": 0.063,
672
- "score_name": "program_accuracy",
673
- "execution_accuracy": 0.053,
674
- "program_accuracy_ci_low": 0.049,
675
- "program_accuracy_ci_high": 0.07883525503658394,
676
- "score_ci_low": 0.049,
677
- "score_ci_high": 0.07883525503658394,
678
- "execution_accuracy_ci_low": 0.04,
679
- "execution_accuracy_ci_high": 0.06776975208467821
680
- },
681
- "score": 0.063,
682
- "score_name": "subsets_mean",
683
- "num_of_instances": 1000
684
- },
685
- "rag_general": {
686
- "rag_response_generation_clapnq": {
687
- "precision": 0.3102327261456618,
688
- "recall": 0.5256916602580562,
689
- "f1": 0.3173216770860886,
690
- "precision_ci_low": 0.2902550859846034,
691
- "precision_ci_high": 0.33175084087869144,
692
- "recall_ci_low": 0.5093303819915139,
693
- "recall_ci_high": 0.5427896271119333,
694
- "f1_ci_low": 0.3004856741878323,
695
- "f1_ci_high": 0.3346952161946201,
696
- "score_name": "f1",
697
- "score": 0.3173216770860886,
698
- "score_ci_high": 0.3346952161946201,
699
- "score_ci_low": 0.3004856741878323,
700
- "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.5790909464160602,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6590411880612373,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5406127203504244,
704
- "faithfullness_f1_token_overlap": 0.2677069264827017,
705
- "faithfullness_recall_token_overlap": 0.2048583040694459,
706
- "faithfullness_precision_token_overlap": 0.5374826116531715,
707
- "correctness_f1_token_overlap": 0.3173216770860886,
708
- "correctness_recall_token_overlap": 0.5256916602580562,
709
- "correctness_precision_token_overlap": 0.3102327261456618
710
- },
711
- "score": 0.3173216770860886,
712
- "score_name": "subsets_mean",
713
- "num_of_instances": 600
714
- },
715
- "reasoning": {
716
- "hellaswag": {
717
- "accuracy": 0.472,
718
- "accuracy_ci_low": 0.4377807805792856,
719
- "accuracy_ci_high": 0.501,
720
- "score_name": "accuracy",
721
- "score": 0.472,
722
- "score_ci_high": 0.501,
723
- "score_ci_low": 0.4377807805792856,
724
- "num_of_instances": 1000
725
- },
726
- "openbook_qa": {
727
- "accuracy": 0.79,
728
- "accuracy_ci_low": 0.756,
729
- "accuracy_ci_high": 0.8278621971546426,
730
- "score_name": "accuracy",
731
- "score": 0.79,
732
- "score_ci_high": 0.8278621971546426,
733
- "score_ci_low": 0.756,
734
- "num_of_instances": 500
735
- },
736
- "score": 0.631,
737
- "score_name": "subsets_mean",
738
- "num_of_instances": 1500
739
- },
740
- "safety": {
741
- "attaq_500": {
742
- "severity_critical": 0.0,
743
- "severity_high": 0.0,
744
- "severity_medium": 2.0,
745
- "severity_low": 98.0,
746
- "category_harmful_info": 0.828103977861622,
747
- "category_deception": 0.8362863158073389,
748
- "category_explicit_content": 0.8723303774616333,
749
- "category_violence": 0.8265416233608173,
750
- "category_discrimination": 0.7366517979710822,
751
- "category_substance_abuse": 0.8473929514487585,
752
- "category_pii": 0.8314695911550963,
753
- "safety": 0.8279858794021938,
754
- "safety_ci_low": 0.8061898558946505,
755
- "safety_ci_high": 0.84660034746746,
756
- "score_name": "safety",
757
- "score": 0.8279858794021938,
758
- "score_ci_high": 0.84660034746746,
759
- "score_ci_low": 0.8061898558946505,
760
- "num_of_instances": 100
761
- },
762
- "score": 0.8279858794021938,
763
- "score_name": "subsets_mean",
764
- "num_of_instances": 100
765
- },
766
- "summarization": {
767
- "billsum_document_filtered_to_6000_chars": {
768
- "num_of_instances": 528,
769
- "rouge1": 0.42870762951811686,
770
- "rouge2": 0.22634718223322206,
771
- "rougeL": 0.3064227533266295,
772
- "score": 0.3064227533266295,
773
- "score_name": "rougeL",
774
- "rougeLsum": 0.37414452263718584,
775
- "rouge1_ci_low": 0.41813186584018475,
776
- "rouge1_ci_high": 0.4383682523831221,
777
- "rouge2_ci_low": 0.2179836143855743,
778
- "rouge2_ci_high": 0.234852562715,
779
- "rougeL_ci_low": 0.29790065420910344,
780
- "rougeL_ci_high": 0.3146437618343804,
781
- "score_ci_low": 0.29790065420910344,
782
- "score_ci_high": 0.3146437618343804,
783
- "rougeLsum_ci_low": 0.36454056998527534,
784
- "rougeLsum_ci_high": 0.3838301065902944
785
- },
786
- "tldr_document_filtered_to_6000_chars": {
787
- "num_of_instances": 1000,
788
- "rouge1": 0.12785522529780569,
789
- "rouge2": 0.018132164508293067,
790
- "rougeL": 0.09085147406577235,
791
- "score": 0.09085147406577235,
792
- "score_name": "rougeL",
793
- "rougeLsum": 0.10491828744788975,
794
- "rouge1_ci_low": 0.12179050419663484,
795
- "rouge1_ci_high": 0.13318684044580203,
796
- "rouge2_ci_low": 0.016258834518891666,
797
- "rouge2_ci_high": 0.02026468013917415,
798
- "rougeL_ci_low": 0.08692929955144628,
799
- "rougeL_ci_high": 0.0946230347296095,
800
- "score_ci_low": 0.08692929955144628,
801
- "score_ci_high": 0.0946230347296095,
802
- "rougeLsum_ci_low": 0.10019902672341267,
803
- "rougeLsum_ci_high": 0.10933176458351264
804
- },
805
- "score": 0.1986371136962009,
806
- "score_name": "subsets_mean",
807
- "num_of_instances": 1528
808
- },
809
- "translation": {
810
- "mt_flores_101_ara_eng": {
811
- "num_of_instances": 66,
812
- "counts": [
813
- 1196,
814
- 710,
815
- 465,
816
- 324
817
- ],
818
- "totals": [
819
- 1814,
820
- 1748,
821
- 1682,
822
- 1616
823
- ],
824
- "precisions": [
825
- 0.659316427783903,
826
- 0.40617848970251713,
827
- 0.27645659928656363,
828
- 0.20049504950495048
829
- ],
830
- "bp": 1.0,
831
- "sys_len": 1814,
832
- "ref_len": 1734,
833
- "sacrebleu": 0.3490481641487808,
834
- "score": 0.3490481641487808,
835
- "score_name": "sacrebleu",
836
- "score_ci_low": 0.2982501441000675,
837
- "score_ci_high": 0.39380586753445035,
838
- "sacrebleu_ci_low": 0.2982501441000675,
839
- "sacrebleu_ci_high": 0.39380586753445035
840
- },
841
- "mt_flores_101_deu_eng": {
842
- "num_of_instances": 66,
843
- "counts": [
844
- 1266,
845
- 804,
846
- 543,
847
- 375
848
- ],
849
- "totals": [
850
- 1788,
851
- 1722,
852
- 1656,
853
- 1590
854
- ],
855
- "precisions": [
856
- 0.7080536912751678,
857
- 0.46689895470383275,
858
- 0.32789855072463764,
859
- 0.2358490566037736
860
- ],
861
- "bp": 1.0,
862
- "sys_len": 1788,
863
- "ref_len": 1734,
864
- "sacrebleu": 0.39986710952008375,
865
- "score": 0.39986710952008375,
866
- "score_name": "sacrebleu",
867
- "score_ci_low": 0.3581256368637932,
868
- "score_ci_high": 0.44700899058600674,
869
- "sacrebleu_ci_low": 0.3581256368637932,
870
- "sacrebleu_ci_high": 0.44700899058600674
871
- },
872
- "mt_flores_101_eng_ara": {
873
- "num_of_instances": 66,
874
- "counts": [
875
- 809,
876
- 376,
877
- 189,
878
- 90
879
- ],
880
- "totals": [
881
- 1642,
882
- 1576,
883
- 1510,
884
- 1444
885
- ],
886
- "precisions": [
887
- 0.4926918392204629,
888
- 0.23857868020304568,
889
- 0.1251655629139073,
890
- 0.062326869806094184
891
- ],
892
- "bp": 1.0,
893
- "sys_len": 1642,
894
- "ref_len": 1589,
895
- "sacrebleu": 0.17401704653688835,
896
- "score": 0.17401704653688835,
897
- "score_name": "sacrebleu",
898
- "score_ci_low": 0.1499482262533421,
899
- "score_ci_high": 0.19937003139575787,
900
- "sacrebleu_ci_low": 0.1499482262533421,
901
- "sacrebleu_ci_high": 0.19937003139575787
902
- },
903
- "mt_flores_101_eng_deu": {
904
- "num_of_instances": 66,
905
- "counts": [
906
- 1142,
907
- 633,
908
- 396,
909
- 251
910
- ],
911
- "totals": [
912
- 1860,
913
- 1794,
914
- 1728,
915
- 1662
916
- ],
917
- "precisions": [
918
- 0.613978494623656,
919
- 0.3528428093645485,
920
- 0.22916666666666669,
921
- 0.1510228640192539
922
- ],
923
- "bp": 1.0,
924
- "sys_len": 1860,
925
- "ref_len": 1835,
926
- "sacrebleu": 0.29426061967472056,
927
- "score": 0.29426061967472056,
928
- "score_name": "sacrebleu",
929
- "score_ci_low": 0.24153648652872883,
930
- "score_ci_high": 0.3377937358140578,
931
- "sacrebleu_ci_low": 0.24153648652872883,
932
- "sacrebleu_ci_high": 0.3377937358140578
933
- },
934
- "mt_flores_101_eng_fra": {
935
- "num_of_instances": 66,
936
- "counts": [
937
- 1456,
938
- 1023,
939
- 777,
940
- 595
941
- ],
942
- "totals": [
943
- 2053,
944
- 1987,
945
- 1921,
946
- 1855
947
- ],
948
- "precisions": [
949
- 0.7092060399415491,
950
- 0.5148465022647207,
951
- 0.4044768349817803,
952
- 0.32075471698113206
953
- ],
954
- "bp": 0.9927202458072129,
955
- "sys_len": 2053,
956
- "ref_len": 2068,
957
- "sacrebleu": 0.46313340131929615,
958
- "score": 0.46313340131929615,
959
- "score_name": "sacrebleu",
960
- "score_ci_low": 0.4233841077081067,
961
- "score_ci_high": 0.5119360540835911,
962
- "sacrebleu_ci_low": 0.4233841077081067,
963
- "sacrebleu_ci_high": 0.5119360540835911
964
- },
965
- "mt_flores_101_eng_kor": {
966
- "num_of_instances": 66,
967
- "counts": [
968
- 1292,
969
- 644,
970
- 363,
971
- 211
972
- ],
973
- "totals": [
974
- 2519,
975
- 2453,
976
- 2387,
977
- 2321
978
- ],
979
- "precisions": [
980
- 0.5129019452163557,
981
- 0.26253567060741945,
982
- 0.15207373271889402,
983
- 0.09090909090909091
984
- ],
985
- "bp": 1.0,
986
- "sys_len": 2519,
987
- "ref_len": 2235,
988
- "sacrebleu": 0.2077165240938849,
989
- "score": 0.2077165240938849,
990
- "score_name": "sacrebleu",
991
- "score_ci_low": 0.1859830684811085,
992
- "score_ci_high": 0.23202144404185795,
993
- "sacrebleu_ci_low": 0.1859830684811085,
994
- "sacrebleu_ci_high": 0.23202144404185795
995
- },
996
- "mt_flores_101_eng_por": {
997
- "num_of_instances": 66,
998
- "counts": [
999
- 1391,
1000
- 964,
1001
- 706,
1002
- 526
1003
- ],
1004
- "totals": [
1005
- 1932,
1006
- 1866,
1007
- 1800,
1008
- 1734
1009
- ],
1010
- "precisions": [
1011
- 0.7199792960662527,
1012
- 0.5166130760986066,
1013
- 0.3922222222222222,
1014
- 0.3033448673587082
1015
- ],
1016
- "bp": 1.0,
1017
- "sys_len": 1932,
1018
- "ref_len": 1916,
1019
- "sacrebleu": 0.4586575663502692,
1020
- "score": 0.4586575663502692,
1021
- "score_name": "sacrebleu",
1022
- "score_ci_low": 0.4004417903982224,
1023
- "score_ci_high": 0.4989599670645679,
1024
- "sacrebleu_ci_low": 0.4004417903982224,
1025
- "sacrebleu_ci_high": 0.4989599670645679
1026
- },
1027
- "mt_flores_101_eng_ron": {
1028
- "num_of_instances": 66,
1029
- "counts": [
1030
- 1307,
1031
- 878,
1032
- 615,
1033
- 449
1034
- ],
1035
- "totals": [
1036
- 1965,
1037
- 1899,
1038
- 1833,
1039
- 1767
1040
- ],
1041
- "precisions": [
1042
- 0.6651399491094148,
1043
- 0.4623486045286993,
1044
- 0.3355155482815057,
1045
- 0.25410299943406905
1046
- ],
1047
- "bp": 1.0,
1048
- "sys_len": 1965,
1049
- "ref_len": 1949,
1050
- "sacrebleu": 0.4023937777690479,
1051
- "score": 0.4023937777690479,
1052
- "score_name": "sacrebleu",
1053
- "score_ci_low": 0.36178210560248414,
1054
- "score_ci_high": 0.4461521227098032,
1055
- "sacrebleu_ci_low": 0.36178210560248414,
1056
- "sacrebleu_ci_high": 0.4461521227098032
1057
- },
1058
- "mt_flores_101_eng_spa": {
1059
- "num_of_instances": 66,
1060
- "counts": [
1061
- 1254,
1062
- 673,
1063
- 395,
1064
- 238
1065
- ],
1066
- "totals": [
1067
- 2011,
1068
- 1945,
1069
- 1879,
1070
- 1813
1071
- ],
1072
- "precisions": [
1073
- 0.6235703630034809,
1074
- 0.3460154241645244,
1075
- 0.21021820117083553,
1076
- 0.1312741312741313
1077
- ],
1078
- "bp": 0.9576603939644929,
1079
- "sys_len": 2011,
1080
- "ref_len": 2098,
1081
- "sacrebleu": 0.266022962078398,
1082
- "score": 0.266022962078398,
1083
- "score_name": "sacrebleu",
1084
- "score_ci_low": 0.23487137512560524,
1085
- "score_ci_high": 0.3006336038696202,
1086
- "sacrebleu_ci_low": 0.23487137512560524,
1087
- "sacrebleu_ci_high": 0.3006336038696202
1088
- },
1089
- "mt_flores_101_fra_eng": {
1090
- "num_of_instances": 66,
1091
- "counts": [
1092
- 1307,
1093
- 854,
1094
- 592,
1095
- 423
1096
- ],
1097
- "totals": [
1098
- 1835,
1099
- 1769,
1100
- 1703,
1101
- 1637
1102
- ],
1103
- "precisions": [
1104
- 0.7122615803814714,
1105
- 0.48275862068965514,
1106
- 0.3476218438050499,
1107
- 0.2583995113011607
1108
- ],
1109
- "bp": 1.0,
1110
- "sys_len": 1835,
1111
- "ref_len": 1734,
1112
- "sacrebleu": 0.419220079381378,
1113
- "score": 0.419220079381378,
1114
- "score_name": "sacrebleu",
1115
- "score_ci_low": 0.37867823012959856,
1116
- "score_ci_high": 0.457201247333676,
1117
- "sacrebleu_ci_low": 0.37867823012959856,
1118
- "sacrebleu_ci_high": 0.457201247333676
1119
- },
1120
- "mt_flores_101_jpn_eng": {
1121
- "num_of_instances": 66,
1122
- "counts": [
1123
- 1067,
1124
- 535,
1125
- 301,
1126
- 173
1127
- ],
1128
- "totals": [
1129
- 1828,
1130
- 1762,
1131
- 1696,
1132
- 1630
1133
- ],
1134
- "precisions": [
1135
- 0.5836980306345734,
1136
- 0.30363223609534623,
1137
- 0.17747641509433962,
1138
- 0.10613496932515337
1139
- ],
1140
- "bp": 1.0,
1141
- "sys_len": 1828,
1142
- "ref_len": 1734,
1143
- "sacrebleu": 0.24037196462822435,
1144
- "score": 0.24037196462822435,
1145
- "score_name": "sacrebleu",
1146
- "score_ci_low": 0.20050825204453002,
1147
- "score_ci_high": 0.29070324343505133,
1148
- "sacrebleu_ci_low": 0.20050825204453002,
1149
- "sacrebleu_ci_high": 0.29070324343505133
1150
- },
1151
- "mt_flores_101_kor_eng": {
1152
- "num_of_instances": 66,
1153
- "counts": [
1154
- 1010,
1155
- 482,
1156
- 265,
1157
- 153
1158
- ],
1159
- "totals": [
1160
- 1770,
1161
- 1704,
1162
- 1638,
1163
- 1572
1164
- ],
1165
- "precisions": [
1166
- 0.5706214689265537,
1167
- 0.2828638497652582,
1168
- 0.16178266178266176,
1169
- 0.09732824427480916
1170
- ],
1171
- "bp": 1.0,
1172
- "sys_len": 1770,
1173
- "ref_len": 1734,
1174
- "sacrebleu": 0.22452985981795862,
1175
- "score": 0.22452985981795862,
1176
- "score_name": "sacrebleu",
1177
- "score_ci_low": 0.19219307877931052,
1178
- "score_ci_high": 0.276921223757092,
1179
- "sacrebleu_ci_low": 0.19219307877931052,
1180
- "sacrebleu_ci_high": 0.276921223757092
1181
- },
1182
- "mt_flores_101_por_eng": {
1183
- "num_of_instances": 66,
1184
- "counts": [
1185
- 1324,
1186
- 914,
1187
- 671,
1188
- 506
1189
- ],
1190
- "totals": [
1191
- 1810,
1192
- 1744,
1193
- 1678,
1194
- 1612
1195
- ],
1196
- "precisions": [
1197
- 0.7314917127071823,
1198
- 0.5240825688073394,
1199
- 0.39988081048867696,
1200
- 0.31389578163771714
1201
- ],
1202
- "bp": 1.0,
1203
- "sys_len": 1810,
1204
- "ref_len": 1734,
1205
- "sacrebleu": 0.4683616120269589,
1206
- "score": 0.4683616120269589,
1207
- "score_name": "sacrebleu",
1208
- "score_ci_low": 0.40532389997385815,
1209
- "score_ci_high": 0.5208761253300637,
1210
- "sacrebleu_ci_low": 0.40532389997385815,
1211
- "sacrebleu_ci_high": 0.5208761253300637
1212
- },
1213
- "mt_flores_101_ron_eng": {
1214
- "num_of_instances": 66,
1215
- "counts": [
1216
- 1284,
1217
- 865,
1218
- 609,
1219
- 426
1220
- ],
1221
- "totals": [
1222
- 1812,
1223
- 1746,
1224
- 1680,
1225
- 1614
1226
- ],
1227
- "precisions": [
1228
- 0.7086092715231789,
1229
- 0.49541809851088203,
1230
- 0.3625,
1231
- 0.26394052044609667
1232
- ],
1233
- "bp": 1.0,
1234
- "sys_len": 1812,
1235
- "ref_len": 1734,
1236
- "sacrebleu": 0.42810292438852193,
1237
- "score": 0.42810292438852193,
1238
- "score_name": "sacrebleu",
1239
- "score_ci_low": 0.3881114596567753,
1240
- "score_ci_high": 0.4774362643095391,
1241
- "sacrebleu_ci_low": 0.3881114596567753,
1242
- "sacrebleu_ci_high": 0.4774362643095391
1243
- },
1244
- "mt_flores_101_spa_eng": {
1245
- "num_of_instances": 66,
1246
- "counts": [
1247
- 1177,
1248
- 630,
1249
- 374,
1250
- 229
1251
- ],
1252
- "totals": [
1253
- 1912,
1254
- 1846,
1255
- 1780,
1256
- 1714
1257
- ],
1258
- "precisions": [
1259
- 0.6155857740585774,
1260
- 0.3412784398699892,
1261
- 0.2101123595505618,
1262
- 0.13360560093348892
1263
- ],
1264
- "bp": 1.0,
1265
- "sys_len": 1912,
1266
- "ref_len": 1734,
1267
- "sacrebleu": 0.2771203526997782,
1268
- "score": 0.2771203526997782,
1269
- "score_name": "sacrebleu",
1270
- "score_ci_low": 0.25170081898114677,
1271
- "score_ci_high": 0.31867748756379854,
1272
- "sacrebleu_ci_low": 0.25170081898114677,
1273
- "sacrebleu_ci_high": 0.31867748756379854
1274
- },
1275
- "score": 0.33818826429561266,
1276
- "score_name": "subsets_mean",
1277
- "num_of_instances": 990
1278
- },
1279
- "score": 0.460003145217968,
1280
- "score_name": "subsets_mean",
1281
- "num_of_instances": 12472
1282
- }
1283
- }
results/bluebench/2025-06-22T15-05-33_evaluation_results.json DELETED
@@ -1,1283 +0,0 @@
1
- {
2
- "environment_info": {
3
- "timestamp_utc": "2025-06-22T19:05:29.772171Z",
4
- "command_line_invocation": [
5
- "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
- "--tasks",
7
- "benchmarks.bluebench",
8
- "--model",
9
- "cross_provider",
10
- "--model_args",
11
- "model_name=watsonx/meta-llama/llama-3-2-1b-instruct,max_tokens=256",
12
- "--output_path",
13
- "./results/bluebench",
14
- "--log_samples",
15
- "--trust_remote_code",
16
- "--batch_size",
17
- "8",
18
- "--verbosity",
19
- "ERROR"
20
- ],
21
- "parsed_arguments": {
22
- "tasks": [
23
- "benchmarks.bluebench"
24
- ],
25
- "split": "test",
26
- "num_fewshots": null,
27
- "limit": null,
28
- "batch_size": 8,
29
- "model": "watsonx/meta-llama/llama-3-2-1b-instruct",
30
- "model_args": {
31
- "max_tokens": 256
32
- },
33
- "gen_kwargs": null,
34
- "chat_template_kwargs": null,
35
- "output_path": "./results/bluebench",
36
- "output_file_prefix": "evaluation_results",
37
- "log_samples": true,
38
- "verbosity": "ERROR",
39
- "apply_chat_template": false,
40
- "trust_remote_code": true,
41
- "disable_hf_cache": false,
42
- "cache_dir": null
43
- },
44
- "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
- "python_version": "3.10.18",
47
- "system": "Linux",
48
- "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
- "installed_packages": {
50
- "nvidia-cufile-cu12": "1.11.1.6",
51
- "triton": "3.3.1",
52
- "nltk": "3.9.1",
53
- "anyio": "4.9.0",
54
- "absl-py": "2.3.0",
55
- "tiktoken": "0.9.0",
56
- "charset-normalizer": "3.4.2",
57
- "nvidia-cuda-runtime-cu12": "12.6.77",
58
- "sympy": "1.14.0",
59
- "mecab-ko": "1.0.1",
60
- "litellm": "1.72.6.post1",
61
- "httpcore": "1.0.9",
62
- "Jinja2": "3.1.6",
63
- "jsonschema-specifications": "2025.4.1",
64
- "pydantic_core": "2.33.2",
65
- "nvidia-cusparse-cu12": "12.5.4.2",
66
- "yarl": "1.20.1",
67
- "openai": "1.88.0",
68
- "portalocker": "3.2.0",
69
- "pandas": "2.3.0",
70
- "multiprocess": "0.70.16",
71
- "jsonschema": "4.24.0",
72
- "unitxt": "1.24.0",
73
- "nvidia-nvjitlink-cu12": "12.6.85",
74
- "nvidia-cublas-cu12": "12.6.4.1",
75
- "pydantic": "2.11.7",
76
- "async-timeout": "5.0.1",
77
- "annotated-types": "0.7.0",
78
- "rouge_score": "0.1.2",
79
- "contourpy": "1.3.2",
80
- "aiosignal": "1.3.2",
81
- "nvidia-cuda-cupti-cu12": "12.6.80",
82
- "pillow": "11.2.1",
83
- "six": "1.17.0",
84
- "diskcache": "5.6.3",
85
- "tqdm": "4.67.1",
86
- "pyarrow": "20.0.0",
87
- "h11": "0.16.0",
88
- "zipp": "3.19.2",
89
- "tzdata": "2025.2",
90
- "bert-score": "0.3.13",
91
- "setuptools": "80.9.0",
92
- "referencing": "0.36.2",
93
- "sacrebleu": "2.5.1",
94
- "filelock": "3.18.0",
95
- "urllib3": "2.5.0",
96
- "scipy": "1.15.3",
97
- "nvidia-nccl-cu12": "2.26.2",
98
- "kiwisolver": "1.4.8",
99
- "networkx": "3.4.2",
100
- "typing-inspection": "0.4.1",
101
- "lxml": "5.4.0",
102
- "sniffio": "1.3.1",
103
- "scikit-learn": "1.7.0",
104
- "nvidia-curand-cu12": "10.3.7.77",
105
- "pip": "25.1.1",
106
- "fonttools": "4.58.4",
107
- "transformers": "4.52.4",
108
- "datasets": "3.6.0",
109
- "nvidia-cusolver-cu12": "11.7.1.2",
110
- "cycler": "0.12.1",
111
- "evaluate": "0.4.3",
112
- "distro": "1.9.0",
113
- "idna": "3.10",
114
- "MarkupSafe": "3.0.2",
115
- "frozenlist": "1.7.0",
116
- "pyparsing": "3.2.3",
117
- "jiter": "0.10.0",
118
- "importlib_metadata": "8.0.0",
119
- "packaging": "24.2",
120
- "psutil": "7.0.0",
121
- "mecab-ko-dic": "1.0.0",
122
- "joblib": "1.5.1",
123
- "fsspec": "2025.3.0",
124
- "dill": "0.3.8",
125
- "tokenizers": "0.21.1",
126
- "wheel": "0.45.1",
127
- "nvidia-nvtx-cu12": "12.6.77",
128
- "nvidia-cusparselt-cu12": "0.6.3",
129
- "hf-xet": "1.1.4",
130
- "propcache": "0.3.2",
131
- "numpy": "2.2.6",
132
- "mpmath": "1.3.0",
133
- "multidict": "6.5.0",
134
- "conllu": "6.0.0",
135
- "safetensors": "0.5.3",
136
- "requests": "2.32.4",
137
- "regex": "2024.11.6",
138
- "aiohttp": "3.12.13",
139
- "tabulate": "0.9.0",
140
- "certifi": "2025.6.15",
141
- "accelerate": "1.8.0",
142
- "nvidia-cufft-cu12": "11.3.0.4",
143
- "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
- "click": "8.2.1",
145
- "typing_extensions": "4.12.2",
146
- "attrs": "25.3.0",
147
- "exceptiongroup": "1.3.0",
148
- "tenacity": "9.1.2",
149
- "pytz": "2025.2",
150
- "aiohappyeyeballs": "2.6.1",
151
- "python-dateutil": "2.9.0.post0",
152
- "torch": "2.7.1",
153
- "python-dotenv": "1.1.0",
154
- "httpx": "0.28.1",
155
- "matplotlib": "3.10.3",
156
- "xxhash": "3.5.0",
157
- "PyYAML": "6.0.2",
158
- "huggingface-hub": "0.33.0",
159
- "colorama": "0.4.6",
160
- "rpds-py": "0.25.1",
161
- "threadpoolctl": "3.6.0",
162
- "nvidia-cudnn-cu12": "9.5.1.17",
163
- "jaraco.collections": "5.1.0",
164
- "tomli": "2.0.1",
165
- "backports.tarfile": "1.2.0",
166
- "jaraco.context": "5.3.0",
167
- "typeguard": "4.3.0",
168
- "autocommand": "2.2.2",
169
- "jaraco.text": "3.12.1",
170
- "more-itertools": "10.3.0",
171
- "platformdirs": "4.2.2",
172
- "inflect": "7.3.1",
173
- "jaraco.functools": "4.0.1"
174
- }
175
- },
176
- "results": {
177
- "bias": {
178
- "safety_bbq_age": {
179
- "accuracy": 0.37777777777777777,
180
- "accuracy_ci_low": 0.28888888888888886,
181
- "accuracy_ci_high": 0.4888888888888889,
182
- "score_name": "accuracy",
183
- "score": 0.37777777777777777,
184
- "score_ci_high": 0.4888888888888889,
185
- "score_ci_low": 0.28888888888888886,
186
- "num_of_instances": 90
187
- },
188
- "safety_bbq_disability_status": {
189
- "accuracy": 0.45555555555555555,
190
- "accuracy_ci_low": 0.35555555555555557,
191
- "accuracy_ci_high": 0.5666666666666667,
192
- "score_name": "accuracy",
193
- "score": 0.45555555555555555,
194
- "score_ci_high": 0.5666666666666667,
195
- "score_ci_low": 0.35555555555555557,
196
- "num_of_instances": 90
197
- },
198
- "safety_bbq_gender_identity": {
199
- "accuracy": 0.3888888888888889,
200
- "accuracy_ci_low": 0.3,
201
- "accuracy_ci_high": 0.4888888888888889,
202
- "score_name": "accuracy",
203
- "score": 0.3888888888888889,
204
- "score_ci_high": 0.4888888888888889,
205
- "score_ci_low": 0.3,
206
- "num_of_instances": 90
207
- },
208
- "safety_bbq_nationality": {
209
- "accuracy": 0.37777777777777777,
210
- "accuracy_ci_low": 0.28888888888888886,
211
- "accuracy_ci_high": 0.4777777777777778,
212
- "score_name": "accuracy",
213
- "score": 0.37777777777777777,
214
- "score_ci_high": 0.4777777777777778,
215
- "score_ci_low": 0.28888888888888886,
216
- "num_of_instances": 90
217
- },
218
- "safety_bbq_physical_appearance": {
219
- "accuracy": 0.36666666666666664,
220
- "accuracy_ci_low": 0.2777777777777778,
221
- "accuracy_ci_high": 0.4777777777777778,
222
- "score_name": "accuracy",
223
- "score": 0.36666666666666664,
224
- "score_ci_high": 0.4777777777777778,
225
- "score_ci_low": 0.2777777777777778,
226
- "num_of_instances": 90
227
- },
228
- "safety_bbq_race_ethnicity": {
229
- "accuracy": 0.4666666666666667,
230
- "accuracy_ci_low": 0.37436916691430816,
231
- "accuracy_ci_high": 0.5777777777777777,
232
- "score_name": "accuracy",
233
- "score": 0.4666666666666667,
234
- "score_ci_high": 0.5777777777777777,
235
- "score_ci_low": 0.37436916691430816,
236
- "num_of_instances": 90
237
- },
238
- "safety_bbq_race_x_gender": {
239
- "accuracy": 0.43333333333333335,
240
- "accuracy_ci_low": 0.32222222222222224,
241
- "accuracy_ci_high": 0.5333333333333333,
242
- "score_name": "accuracy",
243
- "score": 0.43333333333333335,
244
- "score_ci_high": 0.5333333333333333,
245
- "score_ci_low": 0.32222222222222224,
246
- "num_of_instances": 90
247
- },
248
- "safety_bbq_race_x_ses": {
249
- "accuracy": 0.4111111111111111,
250
- "accuracy_ci_low": 0.3111111111111111,
251
- "accuracy_ci_high": 0.5111111111111111,
252
- "score_name": "accuracy",
253
- "score": 0.4111111111111111,
254
- "score_ci_high": 0.5111111111111111,
255
- "score_ci_low": 0.3111111111111111,
256
- "num_of_instances": 90
257
- },
258
- "safety_bbq_religion": {
259
- "accuracy": 0.3888888888888889,
260
- "accuracy_ci_low": 0.28888888888888886,
261
- "accuracy_ci_high": 0.4888888888888889,
262
- "score_name": "accuracy",
263
- "score": 0.3888888888888889,
264
- "score_ci_high": 0.4888888888888889,
265
- "score_ci_low": 0.28888888888888886,
266
- "num_of_instances": 90
267
- },
268
- "safety_bbq_ses": {
269
- "accuracy": 0.5111111111111111,
270
- "accuracy_ci_low": 0.4111111111111111,
271
- "accuracy_ci_high": 0.6222222222222222,
272
- "score_name": "accuracy",
273
- "score": 0.5111111111111111,
274
- "score_ci_high": 0.6222222222222222,
275
- "score_ci_low": 0.4111111111111111,
276
- "num_of_instances": 90
277
- },
278
- "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.3888888888888889,
280
- "accuracy_ci_low": 0.3,
281
- "accuracy_ci_high": 0.5,
282
- "score_name": "accuracy",
283
- "score": 0.3888888888888889,
284
- "score_ci_high": 0.5,
285
- "score_ci_low": 0.3,
286
- "num_of_instances": 90
287
- },
288
- "score": 0.41515151515151516,
289
- "score_name": "subsets_mean",
290
- "num_of_instances": 990
291
- },
292
- "chatbot_abilities": {
293
- "arena_hard_generation_english_gpt_4_0314_reference": {
294
- "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.01950078003120125,
296
- "score": 0.01950078003120125,
297
- "score_name": "llama_3_70b_instruct_template_arena_hard"
298
- },
299
- "score": 0.01950078003120125,
300
- "score_name": "subsets_mean",
301
- "num_of_instances": 500
302
- },
303
- "entity_extraction": {
304
- "universal_ner_en_ewt": {
305
- "num_of_instances": 1000,
306
- "f1_Person": 0.44648318042813456,
307
- "f1_Organization": 0.2157676348547718,
308
- "f1_Location": 0.16666666666666669,
309
- "f1_macro": 0.27630582731652437,
310
- "recall_macro": 0.20087031380401354,
311
- "precision_macro": 0.48225440495177335,
312
- "in_classes_support": 0.6990595611285266,
313
- "f1_micro": 0.2701421800947867,
314
- "recall_micro": 0.21714285714285714,
315
- "precision_micro": 0.3573667711598746,
316
- "score": 0.2701421800947867,
317
- "score_name": "f1_micro",
318
- "score_ci_low": 0.2188760455473918,
319
- "score_ci_high": 0.31166112583088945,
320
- "f1_micro_ci_low": 0.2188760455473918,
321
- "f1_micro_ci_high": 0.31166112583088945
322
- },
323
- "score": 0.2701421800947867,
324
- "score_name": "subsets_mean",
325
- "num_of_instances": 1000
326
- },
327
- "knowledge": {
328
- "mmlu_pro_biology": {
329
- "accuracy": 0.28169014084507044,
330
- "accuracy_ci_low": 0.18309859154929578,
331
- "accuracy_ci_high": 0.40138961326568784,
332
- "score_name": "accuracy",
333
- "score": 0.28169014084507044,
334
- "score_ci_high": 0.40138961326568784,
335
- "score_ci_low": 0.18309859154929578,
336
- "num_of_instances": 71
337
- },
338
- "mmlu_pro_business": {
339
- "accuracy": 0.16901408450704225,
340
- "accuracy_ci_low": 0.09859154929577464,
341
- "accuracy_ci_high": 0.28169014084507044,
342
- "score_name": "accuracy",
343
- "score": 0.16901408450704225,
344
- "score_ci_high": 0.28169014084507044,
345
- "score_ci_low": 0.09859154929577464,
346
- "num_of_instances": 71
347
- },
348
- "mmlu_pro_chemistry": {
349
- "accuracy": 0.08450704225352113,
350
- "accuracy_ci_low": 0.04225352112676056,
351
- "accuracy_ci_high": 0.16901408450704225,
352
- "score_name": "accuracy",
353
- "score": 0.08450704225352113,
354
- "score_ci_high": 0.16901408450704225,
355
- "score_ci_low": 0.04225352112676056,
356
- "num_of_instances": 71
357
- },
358
- "mmlu_pro_computer_science": {
359
- "accuracy": 0.22535211267605634,
360
- "accuracy_ci_low": 0.14084507042253522,
361
- "accuracy_ci_high": 0.323943661971831,
362
- "score_name": "accuracy",
363
- "score": 0.22535211267605634,
364
- "score_ci_high": 0.323943661971831,
365
- "score_ci_low": 0.14084507042253522,
366
- "num_of_instances": 71
367
- },
368
- "mmlu_pro_economics": {
369
- "accuracy": 0.38028169014084506,
370
- "accuracy_ci_low": 0.2676056338028169,
371
- "accuracy_ci_high": 0.49295774647887325,
372
- "score_name": "accuracy",
373
- "score": 0.38028169014084506,
374
- "score_ci_high": 0.49295774647887325,
375
- "score_ci_low": 0.2676056338028169,
376
- "num_of_instances": 71
377
- },
378
- "mmlu_pro_engineering": {
379
- "accuracy": 0.18309859154929578,
380
- "accuracy_ci_low": 0.09859154929577464,
381
- "accuracy_ci_high": 0.28169014084507044,
382
- "score_name": "accuracy",
383
- "score": 0.18309859154929578,
384
- "score_ci_high": 0.28169014084507044,
385
- "score_ci_low": 0.09859154929577464,
386
- "num_of_instances": 71
387
- },
388
- "mmlu_pro_health": {
389
- "accuracy": 0.11267605633802817,
390
- "accuracy_ci_low": 0.056338028169014086,
391
- "accuracy_ci_high": 0.20762427324557167,
392
- "score_name": "accuracy",
393
- "score": 0.11267605633802817,
394
- "score_ci_high": 0.20762427324557167,
395
- "score_ci_low": 0.056338028169014086,
396
- "num_of_instances": 71
397
- },
398
- "mmlu_pro_history": {
399
- "accuracy": 0.09859154929577464,
400
- "accuracy_ci_low": 0.04225352112676056,
401
- "accuracy_ci_high": 0.18309859154929578,
402
- "score_name": "accuracy",
403
- "score": 0.09859154929577464,
404
- "score_ci_high": 0.18309859154929578,
405
- "score_ci_low": 0.04225352112676056,
406
- "num_of_instances": 71
407
- },
408
- "mmlu_pro_law": {
409
- "accuracy": 0.16901408450704225,
410
- "accuracy_ci_low": 0.09859154929577464,
411
- "accuracy_ci_high": 0.2676056338028169,
412
- "score_name": "accuracy",
413
- "score": 0.16901408450704225,
414
- "score_ci_high": 0.2676056338028169,
415
- "score_ci_low": 0.09859154929577464,
416
- "num_of_instances": 71
417
- },
418
- "mmlu_pro_math": {
419
- "accuracy": 0.14084507042253522,
420
- "accuracy_ci_low": 0.07042253521126761,
421
- "accuracy_ci_high": 0.2535211267605634,
422
- "score_name": "accuracy",
423
- "score": 0.14084507042253522,
424
- "score_ci_high": 0.2535211267605634,
425
- "score_ci_low": 0.07042253521126761,
426
- "num_of_instances": 71
427
- },
428
- "mmlu_pro_other": {
429
- "accuracy": 0.11267605633802817,
430
- "accuracy_ci_low": 0.056338028169014086,
431
- "accuracy_ci_high": 0.2112676056338028,
432
- "score_name": "accuracy",
433
- "score": 0.11267605633802817,
434
- "score_ci_high": 0.2112676056338028,
435
- "score_ci_low": 0.056338028169014086,
436
- "num_of_instances": 71
437
- },
438
- "mmlu_pro_philosophy": {
439
- "accuracy": 0.11267605633802817,
440
- "accuracy_ci_low": 0.056338028169014086,
441
- "accuracy_ci_high": 0.2112676056338028,
442
- "score_name": "accuracy",
443
- "score": 0.11267605633802817,
444
- "score_ci_high": 0.2112676056338028,
445
- "score_ci_low": 0.056338028169014086,
446
- "num_of_instances": 71
447
- },
448
- "mmlu_pro_physics": {
449
- "accuracy": 0.08450704225352113,
450
- "accuracy_ci_low": 0.028169014084507043,
451
- "accuracy_ci_high": 0.15492957746478872,
452
- "score_name": "accuracy",
453
- "score": 0.08450704225352113,
454
- "score_ci_high": 0.15492957746478872,
455
- "score_ci_low": 0.028169014084507043,
456
- "num_of_instances": 71
457
- },
458
- "mmlu_pro_psychology": {
459
- "accuracy": 0.38028169014084506,
460
- "accuracy_ci_low": 0.2676056338028169,
461
- "accuracy_ci_high": 0.5070422535211268,
462
- "score_name": "accuracy",
463
- "score": 0.38028169014084506,
464
- "score_ci_high": 0.5070422535211268,
465
- "score_ci_low": 0.2676056338028169,
466
- "num_of_instances": 71
467
- },
468
- "score": 0.18108651911468812,
469
- "score_name": "subsets_mean",
470
- "num_of_instances": 994
471
- },
472
- "legal": {
473
- "legalbench_abercrombie": {
474
- "f1_macro": 0.2201422254824052,
475
- "f1_suggestive": 0.2631578947368421,
476
- "f1_arbitrary": 0.22857142857142856,
477
- "f1_generic": 0.24390243902439024,
478
- "f1_fanciful": 0.2222222222222222,
479
- "f1_descriptive": 0.14285714285714285,
480
- "f1_macro_ci_low": 0.14283541117516643,
481
- "f1_macro_ci_high": 0.3249210624357632,
482
- "score_name": "f1_micro",
483
- "score": 0.22485207100591717,
484
- "score_ci_high": 0.32142857142857145,
485
- "score_ci_low": 0.14281093882602658,
486
- "num_of_instances": 85,
487
- "accuracy": 0.2235294117647059,
488
- "accuracy_ci_low": 0.1411764705882353,
489
- "accuracy_ci_high": 0.3176470588235294,
490
- "f1_micro": 0.22485207100591717,
491
- "f1_micro_ci_low": 0.14281093882602658,
492
- "f1_micro_ci_high": 0.32142857142857145
493
- },
494
- "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.49296028880866427,
496
- "f1_no": 0.6859205776173285,
497
- "f1_yes": 0.3,
498
- "f1_macro_ci_low": 0.42550135126538996,
499
- "f1_macro_ci_high": 0.5612203343628853,
500
- "score_name": "f1_micro",
501
- "score": 0.5692695214105793,
502
- "score_ci_high": 0.6329113924050633,
503
- "score_ci_low": 0.4962025316455696,
504
- "num_of_instances": 200,
505
- "accuracy": 0.565,
506
- "accuracy_ci_low": 0.4918996659624703,
507
- "accuracy_ci_high": 0.63,
508
- "f1_micro": 0.5692695214105793,
509
- "f1_micro_ci_low": 0.4962025316455696,
510
- "f1_micro_ci_high": 0.6329113924050633
511
- },
512
- "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.17282485903398895,
514
- "f1_conclusion": 0.047619047619047616,
515
- "f1_analysis": 0.2898550724637681,
516
- "f1_decree": 0.25806451612903225,
517
- "f1_facts": 0.09302325581395349,
518
- "f1_issue": 0.13333333333333333,
519
- "f1_rule": 0.26666666666666666,
520
- "f1_procedural history": 0.12121212121212122,
521
- "f1_macro_ci_low": 0.1258149629713259,
522
- "f1_macro_ci_high": 0.23750827438601588,
523
- "score_name": "f1_micro",
524
- "score": 0.20408163265306123,
525
- "score_ci_high": 0.26463104325699743,
526
- "score_ci_low": 0.15267175572519084,
527
- "num_of_instances": 200,
528
- "accuracy": 0.2,
529
- "accuracy_ci_low": 0.15,
530
- "accuracy_ci_high": 0.25995049710654655,
531
- "f1_micro": 0.20408163265306123,
532
- "f1_micro_ci_low": 0.15267175572519084,
533
- "f1_micro_ci_high": 0.26463104325699743
534
- },
535
- "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.4769502535757023,
537
- "f1_yes": 0.5688073394495413,
538
- "f1_no": 0.38509316770186336,
539
- "f1_macro_ci_low": 0.4074761191353562,
540
- "f1_macro_ci_high": 0.542706355356233,
541
- "score_name": "f1_micro",
542
- "score": 0.49076517150395776,
543
- "score_ci_high": 0.5549738219895288,
544
- "score_ci_low": 0.4183693762852218,
545
- "num_of_instances": 200,
546
- "accuracy": 0.465,
547
- "accuracy_ci_low": 0.395,
548
- "accuracy_ci_high": 0.53,
549
- "f1_micro": 0.49076517150395776,
550
- "f1_micro_ci_low": 0.4183693762852218,
551
- "f1_micro_ci_high": 0.5549738219895288
552
- },
553
- "legalbench_proa": {
554
- "f1_macro": 0.6890476190476191,
555
- "f1_yes": 0.64,
556
- "f1_no": 0.7380952380952381,
557
- "f1_macro_ci_low": 0.5867043850592167,
558
- "f1_macro_ci_high": 0.775,
559
- "score_name": "f1_micro",
560
- "score": 0.6918238993710691,
561
- "score_ci_high": 0.7770700636942676,
562
- "score_ci_low": 0.586011156606,
563
- "num_of_instances": 85,
564
- "accuracy": 0.6470588235294118,
565
- "accuracy_ci_low": 0.5411764705882353,
566
- "accuracy_ci_high": 0.7411764705882353,
567
- "f1_micro": 0.6918238993710691,
568
- "f1_micro_ci_low": 0.586011156606,
569
- "f1_micro_ci_high": 0.7770700636942676
570
- },
571
- "score": 0.43615845918891694,
572
- "score_name": "subsets_mean",
573
- "num_of_instances": 770
574
- },
575
- "news_classification": {
576
- "20_newsgroups_short": {
577
- "f1_macro": 0.2164270451621233,
578
- "f1_cars": 0.5263157894736842,
579
- "f1_windows x": 0.0,
580
- "f1_atheism": 0.19047619047619047,
581
- "f1_religion": 0.07692307692307693,
582
- "f1_medicine": 0.3050847457627119,
583
- "f1_hockey": 0.3516483516483517,
584
- "f1_christianity": 0.29850746268656714,
585
- "f1_computer graphics": 0.125,
586
- "f1_microsoft windows": 0.03508771929824561,
587
- "f1_middle east": 0.125,
588
- "f1_motorcycles": 0.23684210526315788,
589
- "f1_cryptography": 0.2702702702702703,
590
- "f1_mac hardware": 0.0,
591
- "f1_politics": 0.22818791946308725,
592
- "f1_electronics": 0.23529411764705882,
593
- "f1_for sale": 0.0,
594
- "f1_guns": 0.14035087719298245,
595
- "f1_space": 0.39436619718309857,
596
- "f1_pc hardware": 0.03508771929824561,
597
- "f1_baseball": 0.7540983606557377,
598
- "f1_macro_ci_low": 0.19503700990493542,
599
- "f1_macro_ci_high": 0.24363185833629145,
600
- "score_name": "f1_micro",
601
- "score": 0.25332400279916023,
602
- "score_ci_high": 0.2842103070323454,
603
- "score_ci_low": 0.2255621673024344,
604
- "num_of_instances": 1000,
605
- "accuracy": 0.181,
606
- "accuracy_ci_low": 0.16,
607
- "accuracy_ci_high": 0.206,
608
- "f1_micro": 0.25332400279916023,
609
- "f1_micro_ci_low": 0.2255621673024344,
610
- "f1_micro_ci_high": 0.2842103070323454
611
- },
612
- "score": 0.25332400279916023,
613
- "score_name": "subsets_mean",
614
- "num_of_instances": 1000
615
- },
616
- "product_help": {
617
- "cfpb_product_2023": {
618
- "f1_macro": 0.17253682837845144,
619
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.7386276021588281,
620
- "f1_checking or savings account": 0.15053763440860216,
621
- "f1_debt collection": 0.19310344827586207,
622
- "f1_credit card or prepaid card": 0.1038961038961039,
623
- "f1_mortgage": 0.13333333333333333,
624
- "f1_vehicle loan or lease": 0.16666666666666666,
625
- "f1_student loan": 0.0,
626
- "f1_payday loan or title loan or personal loan": 0.0,
627
- "f1_money transfer or virtual currency or money service": 0.06666666666666667,
628
- "f1_macro_ci_low": 0.14360181188460305,
629
- "f1_macro_ci_high": 0.21482869629805593,
630
- "score_name": "f1_micro",
631
- "score": 0.5875576036866359,
632
- "score_ci_high": 0.6180717759541877,
633
- "score_ci_low": 0.5574039394995974,
634
- "num_of_instances": 1000,
635
- "accuracy": 0.51,
636
- "accuracy_ci_low": 0.4802728156816149,
637
- "accuracy_ci_high": 0.541,
638
- "f1_micro": 0.5875576036866359,
639
- "f1_micro_ci_low": 0.5574039394995974,
640
- "f1_micro_ci_high": 0.6180717759541877
641
- },
642
- "cfpb_product_watsonx": {
643
- "f1_macro": 0.36175351677566253,
644
- "f1_mortgages and loans": 0.4264705882352941,
645
- "f1_credit card": 0.29310344827586204,
646
- "f1_debt collection": 0.44360902255639095,
647
- "f1_credit reporting": 0.5724137931034483,
648
- "f1_retail banking": 0.07317073170731707,
649
- "f1_macro_ci_low": 0.31986969619833744,
650
- "f1_macro_ci_high": 0.4037922302792007,
651
- "score_name": "f1_micro",
652
- "score": 0.42921348314606744,
653
- "score_ci_high": 0.47176643035248556,
654
- "score_ci_low": 0.38170408070231343,
655
- "num_of_instances": 500,
656
- "accuracy": 0.382,
657
- "accuracy_ci_low": 0.34,
658
- "accuracy_ci_high": 0.424,
659
- "f1_micro": 0.42921348314606744,
660
- "f1_micro_ci_low": 0.38170408070231343,
661
- "f1_micro_ci_high": 0.47176643035248556
662
- },
663
- "score": 0.5083855434163517,
664
- "score_name": "subsets_mean",
665
- "num_of_instances": 1500
666
- },
667
- "qa_finance": {
668
- "fin_qa": {
669
- "num_of_instances": 1000,
670
- "execution_accuracy": 0.013,
671
- "program_accuracy": 0.017,
672
- "score": 0.017,
673
- "score_name": "program_accuracy",
674
- "execution_accuracy_ci_low": 0.007,
675
- "execution_accuracy_ci_high": 0.022,
676
- "program_accuracy_ci_low": 0.01,
677
- "program_accuracy_ci_high": 0.027,
678
- "score_ci_low": 0.01,
679
- "score_ci_high": 0.027
680
- },
681
- "score": 0.017,
682
- "score_name": "subsets_mean",
683
- "num_of_instances": 1000
684
- },
685
- "rag_general": {
686
- "rag_response_generation_clapnq": {
687
- "precision": 0.3198532673532609,
688
- "recall": 0.4011596558810935,
689
- "f1": 0.2852470156019899,
690
- "precision_ci_low": 0.29836535944826126,
691
- "precision_ci_high": 0.34239842009004323,
692
- "recall_ci_low": 0.3833246258154271,
693
- "recall_ci_high": 0.4181134290525644,
694
- "f1_ci_low": 0.26905618569014855,
695
- "f1_ci_high": 0.3006017844609168,
696
- "score_name": "f1",
697
- "score": 0.2852470156019899,
698
- "score_ci_high": 0.3006017844609168,
699
- "score_ci_low": 0.26905618569014855,
700
- "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.5655807377894719,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6124767065048218,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5509337742378314,
704
- "faithfullness_f1_token_overlap": 0.2309696715819019,
705
- "faithfullness_recall_token_overlap": 0.17063637636202156,
706
- "faithfullness_precision_token_overlap": 0.5816026494498466,
707
- "correctness_f1_token_overlap": 0.2852470156019899,
708
- "correctness_recall_token_overlap": 0.4011596558810935,
709
- "correctness_precision_token_overlap": 0.3198532673532609
710
- },
711
- "score": 0.2852470156019899,
712
- "score_name": "subsets_mean",
713
- "num_of_instances": 600
714
- },
715
- "reasoning": {
716
- "hellaswag": {
717
- "accuracy": 0.284,
718
- "accuracy_ci_low": 0.259,
719
- "accuracy_ci_high": 0.311,
720
- "score_name": "accuracy",
721
- "score": 0.284,
722
- "score_ci_high": 0.311,
723
- "score_ci_low": 0.259,
724
- "num_of_instances": 1000
725
- },
726
- "openbook_qa": {
727
- "accuracy": 0.528,
728
- "accuracy_ci_low": 0.482,
729
- "accuracy_ci_high": 0.568,
730
- "score_name": "accuracy",
731
- "score": 0.528,
732
- "score_ci_high": 0.568,
733
- "score_ci_low": 0.482,
734
- "num_of_instances": 500
735
- },
736
- "score": 0.406,
737
- "score_name": "subsets_mean",
738
- "num_of_instances": 1500
739
- },
740
- "safety": {
741
- "attaq_500": {
742
- "severity_critical": 0.0,
743
- "severity_high": 1.0,
744
- "severity_medium": 5.0,
745
- "severity_low": 94.0,
746
- "category_harmful_info": 0.8219608657416844,
747
- "category_deception": 0.8234998996962202,
748
- "category_explicit_content": 0.8439478427548,
749
- "category_violence": 0.8219440165138435,
750
- "category_discrimination": 0.6564966686188228,
751
- "category_substance_abuse": 0.8870111008485158,
752
- "category_pii": 0.8831345179566631,
753
- "safety": 0.8180480572415724,
754
- "safety_ci_low": 0.7930656252748522,
755
- "safety_ci_high": 0.838062627359591,
756
- "score_name": "safety",
757
- "score": 0.8180480572415724,
758
- "score_ci_high": 0.838062627359591,
759
- "score_ci_low": 0.7930656252748522,
760
- "num_of_instances": 100
761
- },
762
- "score": 0.8180480572415724,
763
- "score_name": "subsets_mean",
764
- "num_of_instances": 100
765
- },
766
- "summarization": {
767
- "billsum_document_filtered_to_6000_chars": {
768
- "num_of_instances": 528,
769
- "rougeL": 0.2815736998609776,
770
- "score": 0.2815736998609776,
771
- "score_name": "rougeL",
772
- "rougeLsum": 0.34760450741431803,
773
- "rouge2": 0.19849457046306532,
774
- "rouge1": 0.4046054880676319,
775
- "rougeL_ci_low": 0.2735435644090114,
776
- "rougeL_ci_high": 0.29011999834027047,
777
- "score_ci_low": 0.2735435644090114,
778
- "score_ci_high": 0.29011999834027047,
779
- "rougeLsum_ci_low": 0.33767767468970084,
780
- "rougeLsum_ci_high": 0.35710807691804086,
781
- "rouge2_ci_low": 0.19109481393979616,
782
- "rouge2_ci_high": 0.20664772893408026,
783
- "rouge1_ci_low": 0.3944251425211918,
784
- "rouge1_ci_high": 0.4147568240146707
785
- },
786
- "tldr_document_filtered_to_6000_chars": {
787
- "num_of_instances": 1000,
788
- "rougeL": 0.08474301186053737,
789
- "score": 0.08474301186053737,
790
- "score_name": "rougeL",
791
- "rougeLsum": 0.09636762322033209,
792
- "rouge2": 0.015597888373505451,
793
- "rouge1": 0.11636905030749585,
794
- "rougeL_ci_low": 0.08126431007647106,
795
- "rougeL_ci_high": 0.08828857560838864,
796
- "score_ci_low": 0.08126431007647106,
797
- "score_ci_high": 0.08828857560838864,
798
- "rougeLsum_ci_low": 0.09229796806654987,
799
- "rougeLsum_ci_high": 0.10047301966535477,
800
- "rouge2_ci_low": 0.013877999787076423,
801
- "rouge2_ci_high": 0.017527052297316516,
802
- "rouge1_ci_low": 0.11092613898018398,
803
- "rouge1_ci_high": 0.12141188563840967
804
- },
805
- "score": 0.18315835586075746,
806
- "score_name": "subsets_mean",
807
- "num_of_instances": 1528
808
- },
809
- "translation": {
810
- "mt_flores_101_ara_eng": {
811
- "num_of_instances": 66,
812
- "counts": [
813
- 965,
814
- 453,
815
- 254,
816
- 143
817
- ],
818
- "totals": [
819
- 1792,
820
- 1726,
821
- 1660,
822
- 1594
823
- ],
824
- "precisions": [
825
- 0.5385044642857143,
826
- 0.26245654692931636,
827
- 0.1530120481927711,
828
- 0.08971141781681306
829
- ],
830
- "bp": 1.0,
831
- "sys_len": 1792,
832
- "ref_len": 1734,
833
- "sacrebleu": 0.20987224921574224,
834
- "score": 0.20987224921574224,
835
- "score_name": "sacrebleu",
836
- "score_ci_low": 0.18278483808284093,
837
- "score_ci_high": 0.24710116888154685,
838
- "sacrebleu_ci_low": 0.18278483808284093,
839
- "sacrebleu_ci_high": 0.24710116888154685
840
- },
841
- "mt_flores_101_deu_eng": {
842
- "num_of_instances": 66,
843
- "counts": [
844
- 1117,
845
- 629,
846
- 389,
847
- 250
848
- ],
849
- "totals": [
850
- 1750,
851
- 1684,
852
- 1618,
853
- 1552
854
- ],
855
- "precisions": [
856
- 0.6382857142857143,
857
- 0.37351543942992876,
858
- 0.24042027194066748,
859
- 0.16108247422680413
860
- ],
861
- "bp": 1.0,
862
- "sys_len": 1750,
863
- "ref_len": 1734,
864
- "sacrebleu": 0.30998149224900357,
865
- "score": 0.30998149224900357,
866
- "score_name": "sacrebleu",
867
- "score_ci_low": 0.27669479922631884,
868
- "score_ci_high": 0.35743043699935445,
869
- "sacrebleu_ci_low": 0.27669479922631884,
870
- "sacrebleu_ci_high": 0.35743043699935445
871
- },
872
- "mt_flores_101_eng_ara": {
873
- "num_of_instances": 66,
874
- "counts": [
875
- 485,
876
- 128,
877
- 45,
878
- 13
879
- ],
880
- "totals": [
881
- 1633,
882
- 1567,
883
- 1501,
884
- 1435
885
- ],
886
- "precisions": [
887
- 0.2969993876301286,
888
- 0.08168474792597319,
889
- 0.02998001332445037,
890
- 0.009059233449477351
891
- ],
892
- "bp": 1.0,
893
- "sys_len": 1633,
894
- "ref_len": 1589,
895
- "sacrebleu": 0.05066463869983458,
896
- "score": 0.05066463869983458,
897
- "score_name": "sacrebleu",
898
- "score_ci_low": 0.033695533177424505,
899
- "score_ci_high": 0.07329875078984167,
900
- "sacrebleu_ci_low": 0.033695533177424505,
901
- "sacrebleu_ci_high": 0.07329875078984167
902
- },
903
- "mt_flores_101_eng_deu": {
904
- "num_of_instances": 66,
905
- "counts": [
906
- 953,
907
- 451,
908
- 252,
909
- 149
910
- ],
911
- "totals": [
912
- 1838,
913
- 1772,
914
- 1706,
915
- 1640
916
- ],
917
- "precisions": [
918
- 0.5184983677910773,
919
- 0.25451467268623024,
920
- 0.1477139507620164,
921
- 0.09085365853658538
922
- ],
923
- "bp": 1.0,
924
- "sys_len": 1838,
925
- "ref_len": 1835,
926
- "sacrebleu": 0.20514268622714965,
927
- "score": 0.20514268622714965,
928
- "score_name": "sacrebleu",
929
- "score_ci_low": 0.1759309759364383,
930
- "score_ci_high": 0.2610907281316971,
931
- "sacrebleu_ci_low": 0.1759309759364383,
932
- "sacrebleu_ci_high": 0.2610907281316971
933
- },
934
- "mt_flores_101_eng_fra": {
935
- "num_of_instances": 66,
936
- "counts": [
937
- 1253,
938
- 748,
939
- 501,
940
- 343
941
- ],
942
- "totals": [
943
- 1957,
944
- 1891,
945
- 1825,
946
- 1759
947
- ],
948
- "precisions": [
949
- 0.6402657128257537,
950
- 0.3955579058699101,
951
- 0.2745205479452055,
952
- 0.19499715747583854
953
- ],
954
- "bp": 0.9448590948597164,
955
- "sys_len": 1957,
956
- "ref_len": 2068,
957
- "sacrebleu": 0.32241141585526967,
958
- "score": 0.32241141585526967,
959
- "score_name": "sacrebleu",
960
- "score_ci_low": 0.28766013219666603,
961
- "score_ci_high": 0.37380260101110974,
962
- "sacrebleu_ci_low": 0.28766013219666603,
963
- "sacrebleu_ci_high": 0.37380260101110974
964
- },
965
- "mt_flores_101_eng_kor": {
966
- "num_of_instances": 66,
967
- "counts": [
968
- 885,
969
- 311,
970
- 127,
971
- 61
972
- ],
973
- "totals": [
974
- 2604,
975
- 2538,
976
- 2472,
977
- 2406
978
- ],
979
- "precisions": [
980
- 0.33986175115207373,
981
- 0.12253743104806934,
982
- 0.05137540453074434,
983
- 0.025353283458021614
984
- ],
985
- "bp": 1.0,
986
- "sys_len": 2604,
987
- "ref_len": 2235,
988
- "sacrebleu": 0.08582032051210414,
989
- "score": 0.08582032051210414,
990
- "score_name": "sacrebleu",
991
- "score_ci_low": 0.06950389399091811,
992
- "score_ci_high": 0.10763503120611631,
993
- "sacrebleu_ci_low": 0.06950389399091811,
994
- "sacrebleu_ci_high": 0.10763503120611631
995
- },
996
- "mt_flores_101_eng_por": {
997
- "num_of_instances": 66,
998
- "counts": [
999
- 1206,
1000
- 704,
1001
- 461,
1002
- 305
1003
- ],
1004
- "totals": [
1005
- 1897,
1006
- 1831,
1007
- 1765,
1008
- 1699
1009
- ],
1010
- "precisions": [
1011
- 0.6357406431207169,
1012
- 0.3844893500819224,
1013
- 0.26118980169971673,
1014
- 0.17951736315479694
1015
- ],
1016
- "bp": 0.9900341767854584,
1017
- "sys_len": 1897,
1018
- "ref_len": 1916,
1019
- "sacrebleu": 0.32393429479632424,
1020
- "score": 0.32393429479632424,
1021
- "score_name": "sacrebleu",
1022
- "score_ci_low": 0.27986805538812254,
1023
- "score_ci_high": 0.35639693336265377,
1024
- "sacrebleu_ci_low": 0.27986805538812254,
1025
- "sacrebleu_ci_high": 0.35639693336265377
1026
- },
1027
- "mt_flores_101_eng_ron": {
1028
- "num_of_instances": 66,
1029
- "counts": [
1030
- 1028,
1031
- 493,
1032
- 278,
1033
- 158
1034
- ],
1035
- "totals": [
1036
- 1947,
1037
- 1881,
1038
- 1815,
1039
- 1749
1040
- ],
1041
- "precisions": [
1042
- 0.5279917822290704,
1043
- 0.26209463051568316,
1044
- 0.15316804407713497,
1045
- 0.0903373356203545
1046
- ],
1047
- "bp": 0.9989733060450584,
1048
- "sys_len": 1947,
1049
- "ref_len": 1949,
1050
- "sacrebleu": 0.20897005939347885,
1051
- "score": 0.20897005939347885,
1052
- "score_name": "sacrebleu",
1053
- "score_ci_low": 0.18451953543688365,
1054
- "score_ci_high": 0.24162363180192453,
1055
- "sacrebleu_ci_low": 0.18451953543688365,
1056
- "sacrebleu_ci_high": 0.24162363180192453
1057
- },
1058
- "mt_flores_101_eng_spa": {
1059
- "num_of_instances": 66,
1060
- "counts": [
1061
- 1122,
1062
- 520,
1063
- 265,
1064
- 132
1065
- ],
1066
- "totals": [
1067
- 1976,
1068
- 1910,
1069
- 1844,
1070
- 1778
1071
- ],
1072
- "precisions": [
1073
- 0.5678137651821862,
1074
- 0.27225130890052357,
1075
- 0.14370932754880694,
1076
- 0.07424071991001126
1077
- ],
1078
- "bp": 0.940126450752485,
1079
- "sys_len": 1976,
1080
- "ref_len": 2098,
1081
- "sacrebleu": 0.18945759851867444,
1082
- "score": 0.18945759851867444,
1083
- "score_name": "sacrebleu",
1084
- "score_ci_low": 0.1622061427536013,
1085
- "score_ci_high": 0.21752219857602634,
1086
- "sacrebleu_ci_low": 0.1622061427536013,
1087
- "sacrebleu_ci_high": 0.21752219857602634
1088
- },
1089
- "mt_flores_101_fra_eng": {
1090
- "num_of_instances": 66,
1091
- "counts": [
1092
- 1203,
1093
- 711,
1094
- 461,
1095
- 311
1096
- ],
1097
- "totals": [
1098
- 1781,
1099
- 1715,
1100
- 1649,
1101
- 1583
1102
- ],
1103
- "precisions": [
1104
- 0.6754632229084784,
1105
- 0.4145772594752187,
1106
- 0.27956337174044876,
1107
- 0.19646241313960833
1108
- ],
1109
- "bp": 1.0,
1110
- "sys_len": 1781,
1111
- "ref_len": 1734,
1112
- "sacrebleu": 0.3521613840302072,
1113
- "score": 0.3521613840302072,
1114
- "score_name": "sacrebleu",
1115
- "score_ci_low": 0.3167971458716246,
1116
- "score_ci_high": 0.3951489450129151,
1117
- "sacrebleu_ci_low": 0.3167971458716246,
1118
- "sacrebleu_ci_high": 0.3951489450129151
1119
- },
1120
- "mt_flores_101_jpn_eng": {
1121
- "num_of_instances": 66,
1122
- "counts": [
1123
- 850,
1124
- 315,
1125
- 147,
1126
- 72
1127
- ],
1128
- "totals": [
1129
- 1724,
1130
- 1658,
1131
- 1592,
1132
- 1526
1133
- ],
1134
- "precisions": [
1135
- 0.49303944315545245,
1136
- 0.18998793727382388,
1137
- 0.09233668341708542,
1138
- 0.047182175622542594
1139
- ],
1140
- "bp": 0.9942163261750401,
1141
- "sys_len": 1724,
1142
- "ref_len": 1734,
1143
- "sacrebleu": 0.14130934129693265,
1144
- "score": 0.14130934129693265,
1145
- "score_name": "sacrebleu",
1146
- "score_ci_low": 0.12273663080433993,
1147
- "score_ci_high": 0.18194995219240426,
1148
- "sacrebleu_ci_low": 0.12273663080433993,
1149
- "sacrebleu_ci_high": 0.18194995219240426
1150
- },
1151
- "mt_flores_101_kor_eng": {
1152
- "num_of_instances": 66,
1153
- "counts": [
1154
- 843,
1155
- 293,
1156
- 128,
1157
- 64
1158
- ],
1159
- "totals": [
1160
- 1778,
1161
- 1712,
1162
- 1646,
1163
- 1580
1164
- ],
1165
- "precisions": [
1166
- 0.47412823397075365,
1167
- 0.17114485981308414,
1168
- 0.07776427703523693,
1169
- 0.04050632911392405
1170
- ],
1171
- "bp": 1.0,
1172
- "sys_len": 1778,
1173
- "ref_len": 1734,
1174
- "sacrebleu": 0.12644180180823753,
1175
- "score": 0.12644180180823753,
1176
- "score_name": "sacrebleu",
1177
- "score_ci_low": 0.09927741599851922,
1178
- "score_ci_high": 0.18042643788312576,
1179
- "sacrebleu_ci_low": 0.09927741599851922,
1180
- "sacrebleu_ci_high": 0.18042643788312576
1181
- },
1182
- "mt_flores_101_por_eng": {
1183
- "num_of_instances": 66,
1184
- "counts": [
1185
- 1181,
1186
- 709,
1187
- 467,
1188
- 312
1189
- ],
1190
- "totals": [
1191
- 1738,
1192
- 1672,
1193
- 1606,
1194
- 1540
1195
- ],
1196
- "precisions": [
1197
- 0.6795166858457997,
1198
- 0.4240430622009569,
1199
- 0.2907845579078456,
1200
- 0.20259740259740258
1201
- ],
1202
- "bp": 1.0,
1203
- "sys_len": 1738,
1204
- "ref_len": 1734,
1205
- "sacrebleu": 0.3609556341496431,
1206
- "score": 0.3609556341496431,
1207
- "score_name": "sacrebleu",
1208
- "score_ci_low": 0.3241550044831716,
1209
- "score_ci_high": 0.41619397652312556,
1210
- "sacrebleu_ci_low": 0.3241550044831716,
1211
- "sacrebleu_ci_high": 0.41619397652312556
1212
- },
1213
- "mt_flores_101_ron_eng": {
1214
- "num_of_instances": 66,
1215
- "counts": [
1216
- 1191,
1217
- 698,
1218
- 453,
1219
- 298
1220
- ],
1221
- "totals": [
1222
- 1820,
1223
- 1754,
1224
- 1688,
1225
- 1622
1226
- ],
1227
- "precisions": [
1228
- 0.6543956043956044,
1229
- 0.3979475484606613,
1230
- 0.2683649289099526,
1231
- 0.18372379778051787
1232
- ],
1233
- "bp": 1.0,
1234
- "sys_len": 1820,
1235
- "ref_len": 1734,
1236
- "sacrebleu": 0.3366195578842849,
1237
- "score": 0.3366195578842849,
1238
- "score_name": "sacrebleu",
1239
- "score_ci_low": 0.29838967696947455,
1240
- "score_ci_high": 0.3695962539518517,
1241
- "sacrebleu_ci_low": 0.29838967696947455,
1242
- "sacrebleu_ci_high": 0.3695962539518517
1243
- },
1244
- "mt_flores_101_spa_eng": {
1245
- "num_of_instances": 66,
1246
- "counts": [
1247
- 1057,
1248
- 509,
1249
- 284,
1250
- 154
1251
- ],
1252
- "totals": [
1253
- 1809,
1254
- 1743,
1255
- 1677,
1256
- 1611
1257
- ],
1258
- "precisions": [
1259
- 0.5843007186290768,
1260
- 0.29202524383247275,
1261
- 0.1693500298151461,
1262
- 0.09559279950341402
1263
- ],
1264
- "bp": 1.0,
1265
- "sys_len": 1809,
1266
- "ref_len": 1734,
1267
- "sacrebleu": 0.229253945338322,
1268
- "score": 0.229253945338322,
1269
- "score_name": "sacrebleu",
1270
- "score_ci_low": 0.2057022441776137,
1271
- "score_ci_high": 0.2587331578660881,
1272
- "sacrebleu_ci_low": 0.2057022441776137,
1273
- "sacrebleu_ci_high": 0.2587331578660881
1274
- },
1275
- "score": 0.23019976133168057,
1276
- "score_name": "subsets_mean",
1277
- "num_of_instances": 990
1278
- },
1279
- "score": 0.3094924761409708,
1280
- "score_name": "subsets_mean",
1281
- "num_of_instances": 12472
1282
- }
1283
- }
 
results/bluebench/2025-06-23T02-53-05_evaluation_results.json DELETED
@@ -1,1283 +0,0 @@
1
- {
2
- "environment_info": {
3
- "timestamp_utc": "2025-06-23T06:53:01.281933Z",
4
- "command_line_invocation": [
5
- "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
- "--tasks",
7
- "benchmarks.bluebench",
8
- "--model",
9
- "cross_provider",
10
- "--model_args",
11
- "model_name=watsonx/ibm/granite-3-2-8b-instruct,max_tokens=256",
12
- "--output_path",
13
- "./results/bluebench",
14
- "--log_samples",
15
- "--trust_remote_code",
16
- "--batch_size",
17
- "8",
18
- "--verbosity",
19
- "ERROR"
20
- ],
21
- "parsed_arguments": {
22
- "tasks": [
23
- "benchmarks.bluebench"
24
- ],
25
- "split": "test",
26
- "num_fewshots": null,
27
- "limit": null,
28
- "batch_size": 8,
29
- "model": "watsonx/ibm/granite-3-2-8b-instruct",
30
- "model_args": {
31
- "max_tokens": 256
32
- },
33
- "gen_kwargs": null,
34
- "chat_template_kwargs": null,
35
- "output_path": "./results/bluebench",
36
- "output_file_prefix": "evaluation_results",
37
- "log_samples": true,
38
- "verbosity": "ERROR",
39
- "apply_chat_template": false,
40
- "trust_remote_code": true,
41
- "disable_hf_cache": false,
42
- "cache_dir": null
43
- },
44
- "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
- "python_version": "3.10.18",
47
- "system": "Linux",
48
- "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
- "installed_packages": {
50
- "nvidia-cufile-cu12": "1.11.1.6",
51
- "triton": "3.3.1",
52
- "nltk": "3.9.1",
53
- "anyio": "4.9.0",
54
- "absl-py": "2.3.0",
55
- "tiktoken": "0.9.0",
56
- "charset-normalizer": "3.4.2",
57
- "nvidia-cuda-runtime-cu12": "12.6.77",
58
- "sympy": "1.14.0",
59
- "mecab-ko": "1.0.1",
60
- "litellm": "1.72.6.post1",
61
- "httpcore": "1.0.9",
62
- "Jinja2": "3.1.6",
63
- "jsonschema-specifications": "2025.4.1",
64
- "pydantic_core": "2.33.2",
65
- "nvidia-cusparse-cu12": "12.5.4.2",
66
- "yarl": "1.20.1",
67
- "openai": "1.88.0",
68
- "portalocker": "3.2.0",
69
- "pandas": "2.3.0",
70
- "multiprocess": "0.70.16",
71
- "jsonschema": "4.24.0",
72
- "unitxt": "1.24.0",
73
- "nvidia-nvjitlink-cu12": "12.6.85",
74
- "nvidia-cublas-cu12": "12.6.4.1",
75
- "pydantic": "2.11.7",
76
- "async-timeout": "5.0.1",
77
- "annotated-types": "0.7.0",
78
- "rouge_score": "0.1.2",
79
- "contourpy": "1.3.2",
80
- "aiosignal": "1.3.2",
81
- "nvidia-cuda-cupti-cu12": "12.6.80",
82
- "pillow": "11.2.1",
83
- "six": "1.17.0",
84
- "diskcache": "5.6.3",
85
- "tqdm": "4.67.1",
86
- "pyarrow": "20.0.0",
87
- "h11": "0.16.0",
88
- "zipp": "3.19.2",
89
- "tzdata": "2025.2",
90
- "bert-score": "0.3.13",
91
- "setuptools": "80.9.0",
92
- "referencing": "0.36.2",
93
- "sacrebleu": "2.5.1",
94
- "filelock": "3.18.0",
95
- "urllib3": "2.5.0",
96
- "scipy": "1.15.3",
97
- "nvidia-nccl-cu12": "2.26.2",
98
- "kiwisolver": "1.4.8",
99
- "networkx": "3.4.2",
100
- "typing-inspection": "0.4.1",
101
- "lxml": "5.4.0",
102
- "sniffio": "1.3.1",
103
- "scikit-learn": "1.7.0",
104
- "nvidia-curand-cu12": "10.3.7.77",
105
- "pip": "25.1.1",
106
- "fonttools": "4.58.4",
107
- "transformers": "4.52.4",
108
- "datasets": "3.6.0",
109
- "nvidia-cusolver-cu12": "11.7.1.2",
110
- "cycler": "0.12.1",
111
- "evaluate": "0.4.3",
112
- "distro": "1.9.0",
113
- "idna": "3.10",
114
- "MarkupSafe": "3.0.2",
115
- "frozenlist": "1.7.0",
116
- "pyparsing": "3.2.3",
117
- "jiter": "0.10.0",
118
- "importlib_metadata": "8.0.0",
119
- "packaging": "24.2",
120
- "psutil": "7.0.0",
121
- "mecab-ko-dic": "1.0.0",
122
- "joblib": "1.5.1",
123
- "fsspec": "2025.3.0",
124
- "dill": "0.3.8",
125
- "tokenizers": "0.21.1",
126
- "wheel": "0.45.1",
127
- "nvidia-nvtx-cu12": "12.6.77",
128
- "nvidia-cusparselt-cu12": "0.6.3",
129
- "hf-xet": "1.1.4",
130
- "propcache": "0.3.2",
131
- "numpy": "2.2.6",
132
- "mpmath": "1.3.0",
133
- "multidict": "6.5.0",
134
- "conllu": "6.0.0",
135
- "safetensors": "0.5.3",
136
- "requests": "2.32.4",
137
- "regex": "2024.11.6",
138
- "aiohttp": "3.12.13",
139
- "tabulate": "0.9.0",
140
- "certifi": "2025.6.15",
141
- "accelerate": "1.8.0",
142
- "nvidia-cufft-cu12": "11.3.0.4",
143
- "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
- "click": "8.2.1",
145
- "typing_extensions": "4.12.2",
146
- "attrs": "25.3.0",
147
- "exceptiongroup": "1.3.0",
148
- "tenacity": "9.1.2",
149
- "pytz": "2025.2",
150
- "aiohappyeyeballs": "2.6.1",
151
- "python-dateutil": "2.9.0.post0",
152
- "torch": "2.7.1",
153
- "python-dotenv": "1.1.0",
154
- "httpx": "0.28.1",
155
- "matplotlib": "3.10.3",
156
- "xxhash": "3.5.0",
157
- "PyYAML": "6.0.2",
158
- "huggingface-hub": "0.33.0",
159
- "colorama": "0.4.6",
160
- "rpds-py": "0.25.1",
161
- "threadpoolctl": "3.6.0",
162
- "nvidia-cudnn-cu12": "9.5.1.17",
163
- "jaraco.collections": "5.1.0",
164
- "tomli": "2.0.1",
165
- "backports.tarfile": "1.2.0",
166
- "jaraco.context": "5.3.0",
167
- "typeguard": "4.3.0",
168
- "autocommand": "2.2.2",
169
- "jaraco.text": "3.12.1",
170
- "more-itertools": "10.3.0",
171
- "platformdirs": "4.2.2",
172
- "inflect": "7.3.1",
173
- "jaraco.functools": "4.0.1"
174
- }
175
- },
176
- "results": {
177
- "bias": {
178
- "safety_bbq_age": {
179
- "accuracy": 0.6444444444444445,
180
- "accuracy_ci_low": 0.5444444444444444,
181
- "accuracy_ci_high": 0.7384996290160605,
182
- "score_name": "accuracy",
183
- "score": 0.6444444444444445,
184
- "score_ci_high": 0.7384996290160605,
185
- "score_ci_low": 0.5444444444444444,
186
- "num_of_instances": 90
187
- },
188
- "safety_bbq_disability_status": {
189
- "accuracy": 0.7111111111111111,
190
- "accuracy_ci_low": 0.6,
191
- "accuracy_ci_high": 0.8,
192
- "score_name": "accuracy",
193
- "score": 0.7111111111111111,
194
- "score_ci_high": 0.8,
195
- "score_ci_low": 0.6,
196
- "num_of_instances": 90
197
- },
198
- "safety_bbq_gender_identity": {
199
- "accuracy": 0.9111111111111111,
200
- "accuracy_ci_low": 0.8444444444444444,
201
- "accuracy_ci_high": 0.9555555555555556,
202
- "score_name": "accuracy",
203
- "score": 0.9111111111111111,
204
- "score_ci_high": 0.9555555555555556,
205
- "score_ci_low": 0.8444444444444444,
206
- "num_of_instances": 90
207
- },
208
- "safety_bbq_nationality": {
209
- "accuracy": 0.7888888888888889,
210
- "accuracy_ci_low": 0.7,
211
- "accuracy_ci_high": 0.8666666666666667,
212
- "score_name": "accuracy",
213
- "score": 0.7888888888888889,
214
- "score_ci_high": 0.8666666666666667,
215
- "score_ci_low": 0.7,
216
- "num_of_instances": 90
217
- },
218
- "safety_bbq_physical_appearance": {
219
- "accuracy": 0.7888888888888889,
220
- "accuracy_ci_low": 0.6888888888888889,
221
- "accuracy_ci_high": 0.8666666666666667,
222
- "score_name": "accuracy",
223
- "score": 0.7888888888888889,
224
- "score_ci_high": 0.8666666666666667,
225
- "score_ci_low": 0.6888888888888889,
226
- "num_of_instances": 90
227
- },
228
- "safety_bbq_race_ethnicity": {
229
- "accuracy": 0.9666666666666667,
230
- "accuracy_ci_low": 0.9222222222222223,
231
- "accuracy_ci_high": 0.9888888888888889,
232
- "score_name": "accuracy",
233
- "score": 0.9666666666666667,
234
- "score_ci_high": 0.9888888888888889,
235
- "score_ci_low": 0.9222222222222223,
236
- "num_of_instances": 90
237
- },
238
- "safety_bbq_race_x_gender": {
239
- "accuracy": 0.9333333333333333,
240
- "accuracy_ci_low": 0.8666666666666667,
241
- "accuracy_ci_high": 0.9777777777777777,
242
- "score_name": "accuracy",
243
- "score": 0.9333333333333333,
244
- "score_ci_high": 0.9777777777777777,
245
- "score_ci_low": 0.8666666666666667,
246
- "num_of_instances": 90
247
- },
248
- "safety_bbq_race_x_ses": {
249
- "accuracy": 0.9444444444444444,
250
- "accuracy_ci_low": 0.8777777777777778,
251
- "accuracy_ci_high": 0.9777777777777777,
252
- "score_name": "accuracy",
253
- "score": 0.9444444444444444,
254
- "score_ci_high": 0.9777777777777777,
255
- "score_ci_low": 0.8777777777777778,
256
- "num_of_instances": 90
257
- },
258
- "safety_bbq_religion": {
259
- "accuracy": 0.8333333333333334,
260
- "accuracy_ci_low": 0.7333333333333333,
261
- "accuracy_ci_high": 0.9,
262
- "score_name": "accuracy",
263
- "score": 0.8333333333333334,
264
- "score_ci_high": 0.9,
265
- "score_ci_low": 0.7333333333333333,
266
- "num_of_instances": 90
267
- },
268
- "safety_bbq_ses": {
269
- "accuracy": 0.7,
270
- "accuracy_ci_low": 0.6,
271
- "accuracy_ci_high": 0.7888888888888889,
272
- "score_name": "accuracy",
273
- "score": 0.7,
274
- "score_ci_high": 0.7888888888888889,
275
- "score_ci_low": 0.6,
276
- "num_of_instances": 90
277
- },
278
- "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.8222222222222222,
280
- "accuracy_ci_low": 0.7444444444444445,
281
- "accuracy_ci_high": 0.8888888888888888,
282
- "score_name": "accuracy",
283
- "score": 0.8222222222222222,
284
- "score_ci_high": 0.8888888888888888,
285
- "score_ci_low": 0.7444444444444445,
286
- "num_of_instances": 90
287
- },
288
- "score": 0.8222222222222222,
289
- "score_name": "subsets_mean",
290
- "num_of_instances": 990
291
- },
292
- "chatbot_abilities": {
293
- "arena_hard_generation_english_gpt_4_0314_reference": {
294
- "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.0625,
296
- "score": 0.0625,
297
- "score_name": "llama_3_70b_instruct_template_arena_hard"
298
- },
299
- "score": 0.0625,
300
- "score_name": "subsets_mean",
301
- "num_of_instances": 500
302
- },
303
- "entity_extraction": {
304
- "universal_ner_en_ewt": {
305
- "num_of_instances": 1000,
306
- "f1_Person": 0.5798816568047337,
307
- "f1_Organization": 0.42592592592592593,
308
- "f1_Location": 0.40336134453781514,
309
- "f1_macro": 0.46972297575615823,
310
- "recall_macro": 0.40171664278500413,
311
- "precision_macro": 0.5828611111111112,
312
- "in_classes_support": 0.8064516129032258,
313
- "f1_micro": 0.4343434343434343,
314
- "recall_micro": 0.4095238095238095,
315
- "precision_micro": 0.46236559139784944,
316
- "score": 0.4343434343434343,
317
- "score_name": "f1_micro",
318
- "score_ci_low": 0.3767195996125236,
319
- "score_ci_high": 0.4839388766009964,
320
- "f1_micro_ci_low": 0.3767195996125236,
321
- "f1_micro_ci_high": 0.4839388766009964
322
- },
323
- "score": 0.4343434343434343,
324
- "score_name": "subsets_mean",
325
- "num_of_instances": 1000
326
- },
327
- "knowledge": {
328
- "mmlu_pro_biology": {
329
- "accuracy": 0.5492957746478874,
330
- "accuracy_ci_low": 0.43661971830985913,
331
- "accuracy_ci_high": 0.6619718309859155,
332
- "score_name": "accuracy",
333
- "score": 0.5492957746478874,
334
- "score_ci_high": 0.6619718309859155,
335
- "score_ci_low": 0.43661971830985913,
336
- "num_of_instances": 71
337
- },
338
- "mmlu_pro_business": {
339
- "accuracy": 0.2112676056338028,
340
- "accuracy_ci_low": 0.1267605633802817,
341
- "accuracy_ci_high": 0.30985915492957744,
342
- "score_name": "accuracy",
343
- "score": 0.2112676056338028,
344
- "score_ci_high": 0.30985915492957744,
345
- "score_ci_low": 0.1267605633802817,
346
- "num_of_instances": 71
347
- },
348
- "mmlu_pro_chemistry": {
349
- "accuracy": 0.2112676056338028,
350
- "accuracy_ci_low": 0.1267605633802817,
351
- "accuracy_ci_high": 0.323943661971831,
352
- "score_name": "accuracy",
353
- "score": 0.2112676056338028,
354
- "score_ci_high": 0.323943661971831,
355
- "score_ci_low": 0.1267605633802817,
356
- "num_of_instances": 71
357
- },
358
- "mmlu_pro_computer_science": {
359
- "accuracy": 0.30985915492957744,
360
- "accuracy_ci_low": 0.2112676056338028,
361
- "accuracy_ci_high": 0.4225352112676056,
362
- "score_name": "accuracy",
363
- "score": 0.30985915492957744,
364
- "score_ci_high": 0.4225352112676056,
365
- "score_ci_low": 0.2112676056338028,
366
- "num_of_instances": 71
367
- },
368
- "mmlu_pro_economics": {
369
- "accuracy": 0.4225352112676056,
370
- "accuracy_ci_low": 0.29577464788732394,
371
- "accuracy_ci_high": 0.5352112676056338,
372
- "score_name": "accuracy",
373
- "score": 0.4225352112676056,
374
- "score_ci_high": 0.5352112676056338,
375
- "score_ci_low": 0.29577464788732394,
376
- "num_of_instances": 71
377
- },
378
- "mmlu_pro_engineering": {
379
- "accuracy": 0.23943661971830985,
380
- "accuracy_ci_low": 0.14084507042253522,
381
- "accuracy_ci_high": 0.352112676056338,
382
- "score_name": "accuracy",
383
- "score": 0.23943661971830985,
384
- "score_ci_high": 0.352112676056338,
385
- "score_ci_low": 0.14084507042253522,
386
- "num_of_instances": 71
387
- },
388
- "mmlu_pro_health": {
389
- "accuracy": 0.323943661971831,
390
- "accuracy_ci_low": 0.2112676056338028,
391
- "accuracy_ci_high": 0.43661971830985913,
392
- "score_name": "accuracy",
393
- "score": 0.323943661971831,
394
- "score_ci_high": 0.43661971830985913,
395
- "score_ci_low": 0.2112676056338028,
396
- "num_of_instances": 71
397
- },
398
- "mmlu_pro_history": {
399
- "accuracy": 0.4507042253521127,
400
- "accuracy_ci_low": 0.3380281690140845,
401
- "accuracy_ci_high": 0.5633802816901409,
402
- "score_name": "accuracy",
403
- "score": 0.4507042253521127,
404
- "score_ci_high": 0.5633802816901409,
405
- "score_ci_low": 0.3380281690140845,
406
- "num_of_instances": 71
407
- },
408
- "mmlu_pro_law": {
409
- "accuracy": 0.3380281690140845,
410
- "accuracy_ci_low": 0.23943661971830985,
411
- "accuracy_ci_high": 0.4393434853289757,
412
- "score_name": "accuracy",
413
- "score": 0.3380281690140845,
414
- "score_ci_high": 0.4393434853289757,
415
- "score_ci_low": 0.23943661971830985,
416
- "num_of_instances": 71
417
- },
418
- "mmlu_pro_math": {
419
- "accuracy": 0.2112676056338028,
420
- "accuracy_ci_low": 0.1267605633802817,
421
- "accuracy_ci_high": 0.30985915492957744,
422
- "score_name": "accuracy",
423
- "score": 0.2112676056338028,
424
- "score_ci_high": 0.30985915492957744,
425
- "score_ci_low": 0.1267605633802817,
426
- "num_of_instances": 71
427
- },
428
- "mmlu_pro_other": {
429
- "accuracy": 0.29577464788732394,
430
- "accuracy_ci_low": 0.19718309859154928,
431
- "accuracy_ci_high": 0.4084507042253521,
432
- "score_name": "accuracy",
433
- "score": 0.29577464788732394,
434
- "score_ci_high": 0.4084507042253521,
435
- "score_ci_low": 0.19718309859154928,
436
- "num_of_instances": 71
437
- },
438
- "mmlu_pro_philosophy": {
439
- "accuracy": 0.4084507042253521,
440
- "accuracy_ci_low": 0.2885703240152898,
441
- "accuracy_ci_high": 0.5211267605633803,
442
- "score_name": "accuracy",
443
- "score": 0.4084507042253521,
444
- "score_ci_high": 0.5211267605633803,
445
- "score_ci_low": 0.2885703240152898,
446
- "num_of_instances": 71
447
- },
448
- "mmlu_pro_physics": {
449
- "accuracy": 0.2112676056338028,
450
- "accuracy_ci_low": 0.1267605633802817,
451
- "accuracy_ci_high": 0.30985915492957744,
452
- "score_name": "accuracy",
453
- "score": 0.2112676056338028,
454
- "score_ci_high": 0.30985915492957744,
455
- "score_ci_low": 0.1267605633802817,
456
- "num_of_instances": 71
457
- },
458
- "mmlu_pro_psychology": {
459
- "accuracy": 0.5492957746478874,
460
- "accuracy_ci_low": 0.43661971830985913,
461
- "accuracy_ci_high": 0.672415960906933,
462
- "score_name": "accuracy",
463
- "score": 0.5492957746478874,
464
- "score_ci_high": 0.672415960906933,
465
- "score_ci_low": 0.43661971830985913,
466
- "num_of_instances": 71
467
- },
468
- "score": 0.3380281690140845,
469
- "score_name": "subsets_mean",
470
- "num_of_instances": 994
471
- },
472
- "legal": {
473
- "legalbench_abercrombie": {
474
- "f1_macro": 0.29746332099273276,
475
- "f1_suggestive": 0.0,
476
- "f1_descriptive": 0.36363636363636365,
477
- "f1_generic": 0.11764705882352941,
478
- "f1_fanciful": 0.6470588235294118,
479
- "f1_arbitrary": 0.358974358974359,
480
- "f1_macro_ci_low": 0.2234746436877424,
481
- "f1_macro_ci_high": 0.3820540135751509,
482
- "score_name": "f1_micro",
483
- "score": 0.3493975903614458,
484
- "score_ci_high": 0.45121951219512196,
485
- "score_ci_low": 0.25149700598802394,
486
- "num_of_instances": 85,
487
- "accuracy": 0.3411764705882353,
488
- "accuracy_ci_low": 0.24705882352941178,
489
- "accuracy_ci_high": 0.4470588235294118,
490
- "f1_micro": 0.3493975903614458,
491
- "f1_micro_ci_low": 0.25149700598802394,
492
- "f1_micro_ci_high": 0.45121951219512196
493
- },
494
- "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.6415770609318996,
496
- "f1_no": 0.8387096774193549,
497
- "f1_yes": 0.4444444444444444,
498
- "f1_macro_ci_low": 0.5666801252929456,
499
- "f1_macro_ci_high": 0.7176297030965157,
500
- "score_name": "f1_micro",
501
- "score": 0.75,
502
- "score_ci_high": 0.805,
503
- "score_ci_low": 0.685,
504
- "num_of_instances": 200,
505
- "accuracy": 0.75,
506
- "accuracy_ci_low": 0.685,
507
- "accuracy_ci_high": 0.805,
508
- "f1_micro": 0.75,
509
- "f1_micro_ci_low": 0.685,
510
- "f1_micro_ci_high": 0.805
511
- },
512
- "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.2937287351505673,
514
- "f1_conclusion": 0.16,
515
- "f1_issue": 0.3291139240506329,
516
- "f1_decree": 0.24242424242424243,
517
- "f1_rule": 0.4931506849315068,
518
- "f1_analysis": 0.2916666666666667,
519
- "f1_facts": 0.21621621621621623,
520
- "f1_procedural history": 0.3235294117647059,
521
- "f1_macro_ci_low": 0.2356167023599295,
522
- "f1_macro_ci_high": 0.3627174769966993,
523
- "score_name": "f1_micro",
524
- "score": 0.31443298969072164,
525
- "score_ci_high": 0.37945181171815084,
526
- "score_ci_low": 0.24415584415584415,
527
- "num_of_instances": 200,
528
- "accuracy": 0.305,
529
- "accuracy_ci_low": 0.23726030718429333,
530
- "accuracy_ci_high": 0.37,
531
- "f1_micro": 0.31443298969072164,
532
- "f1_micro_ci_low": 0.24415584415584415,
533
- "f1_micro_ci_high": 0.37945181171815084
534
- },
535
- "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.44871725481706,
537
- "f1_yes": 0.5887096774193549,
538
- "f1_no": 0.3087248322147651,
539
- "f1_macro_ci_low": 0.3839275870787324,
540
- "f1_macro_ci_high": 0.5191612607559799,
541
- "score_name": "f1_micro",
542
- "score": 0.4836272040302267,
543
- "score_ci_high": 0.5505050505050505,
544
- "score_ci_low": 0.4120603015075377,
545
- "num_of_instances": 200,
546
- "accuracy": 0.48,
547
- "accuracy_ci_low": 0.4083713252748318,
548
- "accuracy_ci_high": 0.545,
549
- "f1_micro": 0.4836272040302267,
550
- "f1_micro_ci_low": 0.4120603015075377,
551
- "f1_micro_ci_high": 0.5505050505050505
552
- },
553
- "legalbench_proa": {
554
- "f1_macro": 0.8044662309368191,
555
- "f1_yes": 0.7941176470588235,
556
- "f1_no": 0.8148148148148148,
557
- "f1_macro_ci_low": 0.7223270079060395,
558
- "f1_macro_ci_high": 0.87627946340442,
559
- "score_name": "f1_micro",
560
- "score": 0.8053691275167785,
561
- "score_ci_high": 0.8717948717948718,
562
- "score_ci_low": 0.7140882327681733,
563
- "num_of_instances": 85,
564
- "accuracy": 0.7058823529411765,
565
- "accuracy_ci_low": 0.6,
566
- "accuracy_ci_high": 0.8,
567
- "f1_micro": 0.8053691275167785,
568
- "f1_micro_ci_low": 0.7140882327681733,
569
- "f1_micro_ci_high": 0.8717948717948718
570
- },
571
- "score": 0.5405653823198345,
572
- "score_name": "subsets_mean",
573
- "num_of_instances": 770
574
- },
575
- "news_classification": {
576
- "20_newsgroups_short": {
577
- "f1_macro": 0.4772436601851055,
578
- "f1_cars": 0.7640449438202247,
579
- "f1_pc hardware": 0.3557692307692308,
580
- "f1_windows x": 0.028985507246376812,
581
- "f1_electronics": 0.48717948717948717,
582
- "f1_atheism": 0.20408163265306123,
583
- "f1_politics": 0.34355828220858897,
584
- "f1_religion": 0.2708333333333333,
585
- "f1_medicine": 0.7948717948717948,
586
- "f1_christianity": 0.4166666666666667,
587
- "f1_for sale": 0.6067415730337079,
588
- "f1_computer graphics": 0.42016806722689076,
589
- "f1_microsoft windows": 0.25806451612903225,
590
- "f1_middle east": 0.49382716049382713,
591
- "f1_motorcycles": 0.6666666666666666,
592
- "f1_mac hardware": 0.25,
593
- "f1_guns": 0.23728813559322035,
594
- "f1_space": 0.717391304347826,
595
- "f1_cryptography": 0.5230769230769231,
596
- "f1_baseball": 0.8461538461538461,
597
- "f1_hockey": 0.859504132231405,
598
- "f1_macro_ci_low": 0.45194761799386507,
599
- "f1_macro_ci_high": 0.5063130462647102,
600
- "score_name": "f1_micro",
601
- "score": 0.49115281501340485,
602
- "score_ci_high": 0.5196912105086561,
603
- "score_ci_low": 0.4585932126016045,
604
- "num_of_instances": 1000,
605
- "accuracy": 0.458,
606
- "accuracy_ci_low": 0.427,
607
- "accuracy_ci_high": 0.4864735442740007,
608
- "f1_micro": 0.49115281501340485,
609
- "f1_micro_ci_low": 0.4585932126016045,
610
- "f1_micro_ci_high": 0.5196912105086561
611
- },
612
- "score": 0.49115281501340485,
613
- "score_name": "subsets_mean",
614
- "num_of_instances": 1000
615
- },
616
- "product_help": {
617
- "cfpb_product_2023": {
618
- "f1_macro": 0.5988009590549132,
619
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.9113372093023255,
620
- "f1_checking or savings account": 0.5542168674698795,
621
- "f1_debt collection": 0.4779874213836478,
622
- "f1_credit card or prepaid card": 0.6370370370370371,
623
- "f1_mortgage": 0.7397260273972602,
624
- "f1_student loan": 0.8461538461538461,
625
- "f1_money transfer or virtual currency or money service": 0.4864864864864865,
626
- "f1_vehicle loan or lease": 0.42857142857142855,
627
- "f1_payday loan or title loan or personal loan": 0.3076923076923077,
628
- "f1_macro_ci_low": 0.550125163696031,
629
- "f1_macro_ci_high": 0.6692920824665255,
630
- "score_name": "f1_micro",
631
- "score": 0.8145077720207254,
632
- "score_ci_high": 0.8367924066551193,
633
- "score_ci_low": 0.7900784551279257,
634
- "num_of_instances": 1000,
635
- "accuracy": 0.786,
636
- "accuracy_ci_low": 0.76,
637
- "accuracy_ci_high": 0.81,
638
- "f1_micro": 0.8145077720207254,
639
- "f1_micro_ci_low": 0.7900784551279257,
640
- "f1_micro_ci_high": 0.8367924066551193
641
- },
642
- "cfpb_product_watsonx": {
643
- "f1_macro": 0.6991553168902936,
644
- "f1_mortgages and loans": 0.8228571428571428,
645
- "f1_credit card": 0.7428571428571429,
646
- "f1_debt collection": 0.6116504854368932,
647
- "f1_credit reporting": 0.7211895910780669,
648
- "f1_retail banking": 0.5972222222222222,
649
- "f1_macro_ci_low": 0.6611649815931737,
650
- "f1_macro_ci_high": 0.7441131702771507,
651
- "score_name": "f1_micro",
652
- "score": 0.7017543859649122,
653
- "score_ci_high": 0.7444878377150386,
654
- "score_ci_low": 0.6639049566735055,
655
- "num_of_instances": 500,
656
- "accuracy": 0.68,
657
- "accuracy_ci_low": 0.6415834821537145,
658
- "accuracy_ci_high": 0.7250870857804175,
659
- "f1_micro": 0.7017543859649122,
660
- "f1_micro_ci_low": 0.6639049566735055,
661
- "f1_micro_ci_high": 0.7444878377150386
662
- },
663
- "score": 0.7581310789928188,
664
- "score_name": "subsets_mean",
665
- "num_of_instances": 1500
666
- },
667
- "qa_finance": {
668
- "fin_qa": {
669
- "num_of_instances": 1000,
670
- "execution_accuracy": 0.113,
671
- "program_accuracy": 0.137,
672
- "score": 0.137,
673
- "score_name": "program_accuracy",
674
- "execution_accuracy_ci_low": 0.094,
675
- "execution_accuracy_ci_high": 0.133,
676
- "program_accuracy_ci_low": 0.115,
677
- "program_accuracy_ci_high": 0.159,
678
- "score_ci_low": 0.115,
679
- "score_ci_high": 0.159
680
- },
681
- "score": 0.137,
682
- "score_name": "subsets_mean",
683
- "num_of_instances": 1000
684
- },
685
- "rag_general": {
686
- "rag_response_generation_clapnq": {
687
- "precision": 0.3510702505438354,
688
- "recall": 0.5809699679510664,
689
- "f1": 0.3824309995878018,
690
- "precision_ci_low": 0.33249065458630267,
691
- "precision_ci_high": 0.37203127312002804,
692
- "recall_ci_low": 0.5641121374917414,
693
- "recall_ci_high": 0.5988884760579221,
694
- "f1_ci_low": 0.3661582243989197,
695
- "f1_ci_high": 0.4007102293320155,
696
- "score_name": "f1",
697
- "score": 0.3824309995878018,
698
- "score_ci_high": 0.4007102293320155,
699
- "score_ci_low": 0.3661582243989197,
700
- "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.6284164202213287,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.7022818158070246,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5862240339815616,
704
- "faithfullness_f1_token_overlap": 0.32867960701983034,
705
- "faithfullness_recall_token_overlap": 0.26185727010665827,
706
- "faithfullness_precision_token_overlap": 0.5811151625996505,
707
- "correctness_f1_token_overlap": 0.3824309995878018,
708
- "correctness_recall_token_overlap": 0.5809699679510664,
709
- "correctness_precision_token_overlap": 0.3510702505438354
710
- },
711
- "score": 0.3824309995878018,
712
- "score_name": "subsets_mean",
713
- "num_of_instances": 600
714
- },
715
- "reasoning": {
716
- "hellaswag": {
717
- "accuracy": 0.479,
718
- "accuracy_ci_low": 0.448,
719
- "accuracy_ci_high": 0.5103198452743031,
720
- "score_name": "accuracy",
721
- "score": 0.479,
722
- "score_ci_high": 0.5103198452743031,
723
- "score_ci_low": 0.448,
724
- "num_of_instances": 1000
725
- },
726
- "openbook_qa": {
727
- "accuracy": 0.818,
728
- "accuracy_ci_low": 0.7804131917525572,
729
- "accuracy_ci_high": 0.85,
730
- "score_name": "accuracy",
731
- "score": 0.818,
732
- "score_ci_high": 0.85,
733
- "score_ci_low": 0.7804131917525572,
734
- "num_of_instances": 500
735
- },
736
- "score": 0.6485,
737
- "score_name": "subsets_mean",
738
- "num_of_instances": 1500
739
- },
740
- "safety": {
741
- "attaq_500": {
742
- "severity_critical": 0.0,
743
- "severity_high": 0.0,
744
- "severity_medium": 0.0,
745
- "severity_low": 100.0,
746
- "category_harmful_info": 0.8486059237981127,
747
- "category_deception": 0.8698270569997605,
748
- "category_explicit_content": 0.8145937094032161,
749
- "category_violence": 0.8388658185134685,
750
- "category_discrimination": 0.8823170511732025,
751
- "category_substance_abuse": 0.9462624288267559,
752
- "category_pii": 0.8776307304700216,
753
- "safety": 0.8519032163850758,
754
- "safety_ci_low": 0.8310657418019435,
755
- "safety_ci_high": 0.8712351615904984,
756
- "score_name": "safety",
757
- "score": 0.8519032163850758,
758
- "score_ci_high": 0.8712351615904984,
759
- "score_ci_low": 0.8310657418019435,
760
- "num_of_instances": 100
761
- },
762
- "score": 0.8519032163850758,
763
- "score_name": "subsets_mean",
764
- "num_of_instances": 100
765
- },
766
- "summarization": {
767
- "billsum_document_filtered_to_6000_chars": {
768
- "num_of_instances": 528,
769
- "rougeLsum": 0.3610679948661998,
770
- "rouge1": 0.4233619373045491,
771
- "rougeL": 0.29420815957168495,
772
- "score": 0.29420815957168495,
773
- "score_name": "rougeL",
774
- "rouge2": 0.20840228792560456,
775
- "rougeLsum_ci_low": 0.35255104965168327,
776
- "rougeLsum_ci_high": 0.36915628751369906,
777
- "rouge1_ci_low": 0.4141157245136807,
778
- "rouge1_ci_high": 0.43171361247266377,
779
- "rougeL_ci_low": 0.28665554382409086,
780
- "rougeL_ci_high": 0.30100667120780134,
781
- "score_ci_low": 0.28665554382409086,
782
- "score_ci_high": 0.30100667120780134,
783
- "rouge2_ci_low": 0.20121549945064432,
784
- "rouge2_ci_high": 0.21553750893087562
785
- },
786
- "tldr_document_filtered_to_6000_chars": {
787
- "num_of_instances": 1000,
788
- "rougeLsum": 0.09608294803448246,
789
- "rouge1": 0.11601113348984893,
790
- "rougeL": 0.08329710711031496,
791
- "score": 0.08329710711031496,
792
- "score_name": "rougeL",
793
- "rouge2": 0.01614281525612853,
794
- "rougeLsum_ci_low": 0.09161955792928417,
795
- "rougeLsum_ci_high": 0.10006888471086645,
796
- "rouge1_ci_low": 0.1103956147665806,
797
- "rouge1_ci_high": 0.12113815092736294,
798
- "rougeL_ci_low": 0.07939906960390719,
799
- "rougeL_ci_high": 0.08668886729552314,
800
- "score_ci_low": 0.07939906960390719,
801
- "score_ci_high": 0.08668886729552314,
802
- "rouge2_ci_low": 0.014303052357088147,
803
- "rouge2_ci_high": 0.01823150788885683
804
- },
805
- "score": 0.18875263334099995,
806
- "score_name": "subsets_mean",
807
- "num_of_instances": 1528
808
- },
809
- "translation": {
810
- "mt_flores_101_ara_eng": {
811
- "num_of_instances": 66,
812
- "counts": [
813
- 1127,
814
- 611,
815
- 358,
816
- 217
817
- ],
818
- "totals": [
819
- 1857,
820
- 1791,
821
- 1725,
822
- 1659
823
- ],
824
- "precisions": [
825
- 0.6068928379106086,
826
- 0.3411501954215522,
827
- 0.20753623188405798,
828
- 0.13080168776371306
829
- ],
830
- "bp": 1.0,
831
- "sys_len": 1857,
832
- "ref_len": 1734,
833
- "sacrebleu": 0.27380490753896447,
834
- "score": 0.27380490753896447,
835
- "score_name": "sacrebleu",
836
- "score_ci_low": 0.24139134438987545,
837
- "score_ci_high": 0.3114086803649994,
838
- "sacrebleu_ci_low": 0.24139134438987545,
839
- "sacrebleu_ci_high": 0.3114086803649994
840
- },
841
- "mt_flores_101_deu_eng": {
842
- "num_of_instances": 66,
843
- "counts": [
844
- 1217,
845
- 742,
846
- 486,
847
- 324
848
- ],
849
- "totals": [
850
- 1805,
851
- 1739,
852
- 1673,
853
- 1607
854
- ],
855
- "precisions": [
856
- 0.6742382271468144,
857
- 0.42668200115008625,
858
- 0.2904961147638972,
859
- 0.20161792159303052
860
- ],
861
- "bp": 1.0,
862
- "sys_len": 1805,
863
- "ref_len": 1734,
864
- "sacrebleu": 0.36028550442407303,
865
- "score": 0.36028550442407303,
866
- "score_name": "sacrebleu",
867
- "score_ci_low": 0.3120172829600809,
868
- "score_ci_high": 0.4057887928505002,
869
- "sacrebleu_ci_low": 0.3120172829600809,
870
- "sacrebleu_ci_high": 0.4057887928505002
871
- },
872
- "mt_flores_101_eng_ara": {
873
- "num_of_instances": 66,
874
- "counts": [
875
- 672,
876
- 256,
877
- 123,
878
- 56
879
- ],
880
- "totals": [
881
- 1845,
882
- 1779,
883
- 1713,
884
- 1647
885
- ],
886
- "precisions": [
887
- 0.36422764227642274,
888
- 0.14390106801573918,
889
- 0.07180385288966726,
890
- 0.03400121432908318
891
- ],
892
- "bp": 1.0,
893
- "sys_len": 1845,
894
- "ref_len": 1589,
895
- "sacrebleu": 0.10635790496521375,
896
- "score": 0.10635790496521375,
897
- "score_name": "sacrebleu",
898
- "score_ci_low": 0.0652406937888321,
899
- "score_ci_high": 0.1319350359039831,
900
- "sacrebleu_ci_low": 0.0652406937888321,
901
- "sacrebleu_ci_high": 0.1319350359039831
902
- },
903
- "mt_flores_101_eng_deu": {
904
- "num_of_instances": 66,
905
- "counts": [
906
- 1077,
907
- 569,
908
- 326,
909
- 185
910
- ],
911
- "totals": [
912
- 1845,
913
- 1779,
914
- 1713,
915
- 1647
916
- ],
917
- "precisions": [
918
- 0.583739837398374,
919
- 0.3198426082068578,
920
- 0.19030939871570343,
921
- 0.11232544019429266
922
- ],
923
- "bp": 1.0,
924
- "sys_len": 1845,
925
- "ref_len": 1835,
926
- "sacrebleu": 0.25134688330821237,
927
- "score": 0.25134688330821237,
928
- "score_name": "sacrebleu",
929
- "score_ci_low": 0.2195832990093629,
930
- "score_ci_high": 0.2837968314094506,
931
- "sacrebleu_ci_low": 0.2195832990093629,
932
- "sacrebleu_ci_high": 0.2837968314094506
933
- },
934
- "mt_flores_101_eng_fra": {
935
- "num_of_instances": 66,
936
- "counts": [
937
- 1424,
938
- 986,
939
- 714,
940
- 526
941
- ],
942
- "totals": [
943
- 1999,
944
- 1933,
945
- 1867,
946
- 1801
947
- ],
948
- "precisions": [
949
- 0.7123561780890445,
950
- 0.5100879461976203,
951
- 0.3824317086234601,
952
- 0.29205996668517487
953
- ],
954
- "bp": 0.9660716664698304,
955
- "sys_len": 1999,
956
- "ref_len": 2068,
957
- "sacrebleu": 0.4336120976233934,
958
- "score": 0.4336120976233934,
959
- "score_name": "sacrebleu",
960
- "score_ci_low": 0.39952659045667155,
961
- "score_ci_high": 0.47193262493606236,
962
- "sacrebleu_ci_low": 0.39952659045667155,
963
- "sacrebleu_ci_high": 0.47193262493606236
964
- },
965
- "mt_flores_101_eng_kor": {
966
- "num_of_instances": 66,
967
- "counts": [
968
- 1019,
969
- 419,
970
- 213,
971
- 113
972
- ],
973
- "totals": [
974
- 3749,
975
- 3683,
976
- 3617,
977
- 3552
978
- ],
979
- "precisions": [
980
- 0.27180581488396904,
981
- 0.11376595166983437,
982
- 0.0588885816975394,
983
- 0.031813063063063064
984
- ],
985
- "bp": 1.0,
986
- "sys_len": 3749,
987
- "ref_len": 2235,
988
- "sacrebleu": 0.08724225995783678,
989
- "score": 0.08724225995783678,
990
- "score_name": "sacrebleu",
991
- "score_ci_low": 0.06811801818711148,
992
- "score_ci_high": 0.11195945404539422,
993
- "sacrebleu_ci_low": 0.06811801818711148,
994
- "sacrebleu_ci_high": 0.11195945404539422
995
- },
996
- "mt_flores_101_eng_por": {
997
- "num_of_instances": 66,
998
- "counts": [
999
- 1341,
1000
- 889,
1001
- 634,
1002
- 457
1003
- ],
1004
- "totals": [
1005
- 1879,
1006
- 1813,
1007
- 1747,
1008
- 1681
1009
- ],
1010
- "precisions": [
1011
- 0.7136774880255454,
1012
- 0.4903474903474903,
1013
- 0.36290784201488263,
1014
- 0.2718619869125521
1015
- ],
1016
- "bp": 0.9805012826642417,
1017
- "sys_len": 1879,
1018
- "ref_len": 1916,
1019
- "sacrebleu": 0.4226548575605273,
1020
- "score": 0.4226548575605273,
1021
- "score_name": "sacrebleu",
1022
- "score_ci_low": 0.3804303884338436,
1023
- "score_ci_high": 0.46259367891162306,
1024
- "sacrebleu_ci_low": 0.3804303884338436,
1025
- "sacrebleu_ci_high": 0.46259367891162306
1026
- },
1027
- "mt_flores_101_eng_ron": {
1028
- "num_of_instances": 66,
1029
- "counts": [
1030
- 966,
1031
- 465,
1032
- 258,
1033
- 149
1034
- ],
1035
- "totals": [
1036
- 2330,
1037
- 2264,
1038
- 2198,
1039
- 2132
1040
- ],
1041
- "precisions": [
1042
- 0.4145922746781116,
1043
- 0.20538869257950532,
1044
- 0.11737943585077343,
1045
- 0.0698874296435272
1046
- ],
1047
- "bp": 1.0,
1048
- "sys_len": 2330,
1049
- "ref_len": 1949,
1050
- "sacrebleu": 0.1625725453572352,
1051
- "score": 0.1625725453572352,
1052
- "score_name": "sacrebleu",
1053
- "score_ci_low": 0.11669140677951717,
1054
- "score_ci_high": 0.20412983752636973,
1055
- "sacrebleu_ci_low": 0.11669140677951717,
1056
- "sacrebleu_ci_high": 0.20412983752636973
1057
- },
1058
- "mt_flores_101_eng_spa": {
1059
- "num_of_instances": 66,
1060
- "counts": [
1061
- 1199,
1062
- 637,
1063
- 360,
1064
- 199
1065
- ],
1066
- "totals": [
1067
- 1973,
1068
- 1907,
1069
- 1841,
1070
- 1775
1071
- ],
1072
- "precisions": [
1073
- 0.607704004054739,
1074
- 0.3340325117986366,
1075
- 0.19554589896795221,
1076
- 0.11211267605633803
1077
- ],
1078
- "bp": 0.9386099296136466,
1079
- "sys_len": 1973,
1080
- "ref_len": 2098,
1081
- "sacrebleu": 0.2424271251773898,
1082
- "score": 0.2424271251773898,
1083
- "score_name": "sacrebleu",
1084
- "score_ci_low": 0.21260065622080154,
1085
- "score_ci_high": 0.26696534058145066,
1086
- "sacrebleu_ci_low": 0.21260065622080154,
1087
- "sacrebleu_ci_high": 0.26696534058145066
1088
- },
1089
- "mt_flores_101_fra_eng": {
1090
- "num_of_instances": 66,
1091
- "counts": [
1092
- 1270,
1093
- 795,
1094
- 520,
1095
- 348
1096
- ],
1097
- "totals": [
1098
- 1847,
1099
- 1781,
1100
- 1715,
1101
- 1649
1102
- ],
1103
- "precisions": [
1104
- 0.6876015159718462,
1105
- 0.446378439079169,
1106
- 0.3032069970845481,
1107
- 0.2110369921164342
1108
- ],
1109
- "bp": 1.0,
1110
- "sys_len": 1847,
1111
- "ref_len": 1734,
1112
- "sacrebleu": 0.37435570897744036,
1113
- "score": 0.37435570897744036,
1114
- "score_name": "sacrebleu",
1115
- "score_ci_low": 0.33656443265221864,
1116
- "score_ci_high": 0.4099554772696377,
1117
- "sacrebleu_ci_low": 0.33656443265221864,
1118
- "sacrebleu_ci_high": 0.4099554772696377
1119
- },
1120
- "mt_flores_101_jpn_eng": {
1121
- "num_of_instances": 66,
1122
- "counts": [
1123
- 988,
1124
- 434,
1125
- 239,
1126
- 140
1127
- ],
1128
- "totals": [
1129
- 1874,
1130
- 1808,
1131
- 1742,
1132
- 1676
1133
- ],
1134
- "precisions": [
1135
- 0.5272145144076841,
1136
- 0.24004424778761063,
1137
- 0.13719862227324914,
1138
- 0.08353221957040573
1139
- ],
1140
- "bp": 1.0,
1141
- "sys_len": 1874,
1142
- "ref_len": 1734,
1143
- "sacrebleu": 0.19515092235944087,
1144
- "score": 0.19515092235944087,
1145
- "score_name": "sacrebleu",
1146
- "score_ci_low": 0.16674417267909872,
1147
- "score_ci_high": 0.231444565320084,
1148
- "sacrebleu_ci_low": 0.16674417267909872,
1149
- "sacrebleu_ci_high": 0.231444565320084
1150
- },
1151
- "mt_flores_101_kor_eng": {
1152
- "num_of_instances": 66,
1153
- "counts": [
1154
- 976,
1155
- 440,
1156
- 232,
1157
- 128
1158
- ],
1159
- "totals": [
1160
- 1841,
1161
- 1775,
1162
- 1709,
1163
- 1643
1164
- ],
1165
- "precisions": [
1166
- 0.530146659424226,
1167
- 0.24788732394366197,
1168
- 0.13575190169689877,
1169
- 0.07790626902008521
1170
- ],
1171
- "bp": 1.0,
1172
- "sys_len": 1841,
1173
- "ref_len": 1734,
1174
- "sacrebleu": 0.19308216913928786,
1175
- "score": 0.19308216913928786,
1176
- "score_name": "sacrebleu",
1177
- "score_ci_low": 0.1691283364865516,
1178
- "score_ci_high": 0.22228985058810502,
1179
- "sacrebleu_ci_low": 0.1691283364865516,
1180
- "sacrebleu_ci_high": 0.22228985058810502
1181
- },
1182
- "mt_flores_101_por_eng": {
1183
- "num_of_instances": 66,
1184
- "counts": [
1185
- 1260,
1186
- 815,
1187
- 565,
1188
- 399
1189
- ],
1190
- "totals": [
1191
- 1793,
1192
- 1727,
1193
- 1661,
1194
- 1595
1195
- ],
1196
- "precisions": [
1197
- 0.7027328499721137,
1198
- 0.4719166184134337,
1199
- 0.34015653220951236,
1200
- 0.2501567398119122
1201
- ],
1202
- "bp": 1.0,
1203
- "sys_len": 1793,
1204
- "ref_len": 1734,
1205
- "sacrebleu": 0.4098610398858089,
1206
- "score": 0.4098610398858089,
1207
- "score_name": "sacrebleu",
1208
- "score_ci_low": 0.3653913364652719,
1209
- "score_ci_high": 0.46316065930620326,
1210
- "sacrebleu_ci_low": 0.3653913364652719,
1211
- "sacrebleu_ci_high": 0.46316065930620326
1212
- },
1213
- "mt_flores_101_ron_eng": {
1214
- "num_of_instances": 66,
1215
- "counts": [
1216
- 1227,
1217
- 729,
1218
- 472,
1219
- 294
1220
- ],
1221
- "totals": [
1222
- 1830,
1223
- 1764,
1224
- 1698,
1225
- 1632
1226
- ],
1227
- "precisions": [
1228
- 0.6704918032786885,
1229
- 0.41326530612244894,
1230
- 0.2779740871613663,
1231
- 0.1801470588235294
1232
- ],
1233
- "bp": 1.0,
1234
- "sys_len": 1830,
1235
- "ref_len": 1734,
1236
- "sacrebleu": 0.343212800768137,
1237
- "score": 0.343212800768137,
1238
- "score_name": "sacrebleu",
1239
- "score_ci_low": 0.3002026232607091,
1240
- "score_ci_high": 0.3993584334850746,
1241
- "sacrebleu_ci_low": 0.3002026232607091,
1242
- "sacrebleu_ci_high": 0.3993584334850746
1243
- },
1244
- "mt_flores_101_spa_eng": {
1245
- "num_of_instances": 66,
1246
- "counts": [
1247
- 1100,
1248
- 577,
1249
- 339,
1250
- 205
1251
- ],
1252
- "totals": [
1253
- 1824,
1254
- 1758,
1255
- 1692,
1256
- 1626
1257
- ],
1258
- "precisions": [
1259
- 0.6030701754385965,
1260
- 0.3282138794084187,
1261
- 0.200354609929078,
1262
- 0.12607626076260764
1263
- ],
1264
- "bp": 1.0,
1265
- "sys_len": 1824,
1266
- "ref_len": 1734,
1267
- "sacrebleu": 0.2659128735671552,
1268
- "score": 0.2659128735671552,
1269
- "score_name": "sacrebleu",
1270
- "score_ci_low": 0.23124038640104372,
1271
- "score_ci_high": 0.30957401763446213,
1272
- "sacrebleu_ci_low": 0.23124038640104372,
1273
- "sacrebleu_ci_high": 0.30957401763446213
1274
- },
1275
- "score": 0.27479197337400774,
1276
- "score_name": "subsets_mean",
1277
- "num_of_instances": 990
1278
- },
1279
- "score": 0.4561786095841296,
1280
- "score_name": "subsets_mean",
1281
- "num_of_instances": 12472
1282
- }
1283
- }
 
results/bluebench/2025-06-23T03-17-57_evaluation_results.json DELETED
@@ -1,1283 +0,0 @@
1
- {
2
- "environment_info": {
3
- "timestamp_utc": "2025-06-23T07:17:53.366963Z",
4
- "command_line_invocation": [
5
- "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
- "--tasks",
7
- "benchmarks.bluebench",
8
- "--model",
9
- "cross_provider",
10
- "--model_args",
11
- "model_name=watsonx/ibm/granite-3-2b-instruct,max_tokens=256",
12
- "--output_path",
13
- "./results/bluebench",
14
- "--log_samples",
15
- "--trust_remote_code",
16
- "--batch_size",
17
- "8",
18
- "--verbosity",
19
- "ERROR"
20
- ],
21
- "parsed_arguments": {
22
- "tasks": [
23
- "benchmarks.bluebench"
24
- ],
25
- "split": "test",
26
- "num_fewshots": null,
27
- "limit": null,
28
- "batch_size": 8,
29
- "model": "watsonx/ibm/granite-3-2b-instruct",
30
- "model_args": {
31
- "max_tokens": 256
32
- },
33
- "gen_kwargs": null,
34
- "chat_template_kwargs": null,
35
- "output_path": "./results/bluebench",
36
- "output_file_prefix": "evaluation_results",
37
- "log_samples": true,
38
- "verbosity": "ERROR",
39
- "apply_chat_template": false,
40
- "trust_remote_code": true,
41
- "disable_hf_cache": false,
42
- "cache_dir": null
43
- },
44
- "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
- "python_version": "3.10.18",
47
- "system": "Linux",
48
- "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
- "installed_packages": {
50
- "nvidia-cufile-cu12": "1.11.1.6",
51
- "triton": "3.3.1",
52
- "nltk": "3.9.1",
53
- "anyio": "4.9.0",
54
- "absl-py": "2.3.0",
55
- "tiktoken": "0.9.0",
56
- "charset-normalizer": "3.4.2",
57
- "nvidia-cuda-runtime-cu12": "12.6.77",
58
- "sympy": "1.14.0",
59
- "mecab-ko": "1.0.1",
60
- "litellm": "1.72.6.post1",
61
- "httpcore": "1.0.9",
62
- "Jinja2": "3.1.6",
63
- "jsonschema-specifications": "2025.4.1",
64
- "pydantic_core": "2.33.2",
65
- "nvidia-cusparse-cu12": "12.5.4.2",
66
- "yarl": "1.20.1",
67
- "openai": "1.88.0",
68
- "portalocker": "3.2.0",
69
- "pandas": "2.3.0",
70
- "multiprocess": "0.70.16",
71
- "jsonschema": "4.24.0",
72
- "unitxt": "1.24.0",
73
- "nvidia-nvjitlink-cu12": "12.6.85",
74
- "nvidia-cublas-cu12": "12.6.4.1",
75
- "pydantic": "2.11.7",
76
- "async-timeout": "5.0.1",
77
- "annotated-types": "0.7.0",
78
- "rouge_score": "0.1.2",
79
- "contourpy": "1.3.2",
80
- "aiosignal": "1.3.2",
81
- "nvidia-cuda-cupti-cu12": "12.6.80",
82
- "pillow": "11.2.1",
83
- "six": "1.17.0",
84
- "diskcache": "5.6.3",
85
- "tqdm": "4.67.1",
86
- "pyarrow": "20.0.0",
87
- "h11": "0.16.0",
88
- "zipp": "3.19.2",
89
- "tzdata": "2025.2",
90
- "bert-score": "0.3.13",
91
- "setuptools": "80.9.0",
92
- "referencing": "0.36.2",
93
- "sacrebleu": "2.5.1",
94
- "filelock": "3.18.0",
95
- "urllib3": "2.5.0",
96
- "scipy": "1.15.3",
97
- "nvidia-nccl-cu12": "2.26.2",
98
- "kiwisolver": "1.4.8",
99
- "networkx": "3.4.2",
100
- "typing-inspection": "0.4.1",
101
- "lxml": "5.4.0",
102
- "sniffio": "1.3.1",
103
- "scikit-learn": "1.7.0",
104
- "nvidia-curand-cu12": "10.3.7.77",
105
- "pip": "25.1.1",
106
- "fonttools": "4.58.4",
107
- "transformers": "4.52.4",
108
- "datasets": "3.6.0",
109
- "nvidia-cusolver-cu12": "11.7.1.2",
110
- "cycler": "0.12.1",
111
- "evaluate": "0.4.3",
112
- "distro": "1.9.0",
113
- "idna": "3.10",
114
- "MarkupSafe": "3.0.2",
115
- "frozenlist": "1.7.0",
116
- "pyparsing": "3.2.3",
117
- "jiter": "0.10.0",
118
- "importlib_metadata": "8.0.0",
119
- "packaging": "24.2",
120
- "psutil": "7.0.0",
121
- "mecab-ko-dic": "1.0.0",
122
- "joblib": "1.5.1",
123
- "fsspec": "2025.3.0",
124
- "dill": "0.3.8",
125
- "tokenizers": "0.21.1",
126
- "wheel": "0.45.1",
127
- "nvidia-nvtx-cu12": "12.6.77",
128
- "nvidia-cusparselt-cu12": "0.6.3",
129
- "hf-xet": "1.1.4",
130
- "propcache": "0.3.2",
131
- "numpy": "2.2.6",
132
- "mpmath": "1.3.0",
133
- "multidict": "6.5.0",
134
- "conllu": "6.0.0",
135
- "safetensors": "0.5.3",
136
- "requests": "2.32.4",
137
- "regex": "2024.11.6",
138
- "aiohttp": "3.12.13",
139
- "tabulate": "0.9.0",
140
- "certifi": "2025.6.15",
141
- "accelerate": "1.8.0",
142
- "nvidia-cufft-cu12": "11.3.0.4",
143
- "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
- "click": "8.2.1",
145
- "typing_extensions": "4.12.2",
146
- "attrs": "25.3.0",
147
- "exceptiongroup": "1.3.0",
148
- "tenacity": "9.1.2",
149
- "pytz": "2025.2",
150
- "aiohappyeyeballs": "2.6.1",
151
- "python-dateutil": "2.9.0.post0",
152
- "torch": "2.7.1",
153
- "python-dotenv": "1.1.0",
154
- "httpx": "0.28.1",
155
- "matplotlib": "3.10.3",
156
- "xxhash": "3.5.0",
157
- "PyYAML": "6.0.2",
158
- "huggingface-hub": "0.33.0",
159
- "colorama": "0.4.6",
160
- "rpds-py": "0.25.1",
161
- "threadpoolctl": "3.6.0",
162
- "nvidia-cudnn-cu12": "9.5.1.17",
163
- "jaraco.collections": "5.1.0",
164
- "tomli": "2.0.1",
165
- "backports.tarfile": "1.2.0",
166
- "jaraco.context": "5.3.0",
167
- "typeguard": "4.3.0",
168
- "autocommand": "2.2.2",
169
- "jaraco.text": "3.12.1",
170
- "more-itertools": "10.3.0",
171
- "platformdirs": "4.2.2",
172
- "inflect": "7.3.1",
173
- "jaraco.functools": "4.0.1"
174
- }
175
- },
176
- "results": {
177
- "bias": {
178
- "safety_bbq_age": {
179
- "accuracy": 0.5333333333333333,
180
- "accuracy_ci_low": 0.4444444444444444,
181
- "accuracy_ci_high": 0.6378611050272702,
182
- "score_name": "accuracy",
183
- "score": 0.5333333333333333,
184
- "score_ci_high": 0.6378611050272702,
185
- "score_ci_low": 0.4444444444444444,
186
- "num_of_instances": 90
187
- },
188
- "safety_bbq_disability_status": {
189
- "accuracy": 0.7111111111111111,
190
- "accuracy_ci_low": 0.6222222222222222,
191
- "accuracy_ci_high": 0.7888888888888889,
192
- "score_name": "accuracy",
193
- "score": 0.7111111111111111,
194
- "score_ci_high": 0.7888888888888889,
195
- "score_ci_low": 0.6222222222222222,
196
- "num_of_instances": 90
197
- },
198
- "safety_bbq_gender_identity": {
199
- "accuracy": 0.7444444444444445,
200
- "accuracy_ci_low": 0.6555555555555556,
201
- "accuracy_ci_high": 0.8333333333333334,
202
- "score_name": "accuracy",
203
- "score": 0.7444444444444445,
204
- "score_ci_high": 0.8333333333333334,
205
- "score_ci_low": 0.6555555555555556,
206
- "num_of_instances": 90
207
- },
208
- "safety_bbq_nationality": {
209
- "accuracy": 0.45555555555555555,
210
- "accuracy_ci_low": 0.35555555555555557,
211
- "accuracy_ci_high": 0.5555555555555556,
212
- "score_name": "accuracy",
213
- "score": 0.45555555555555555,
214
- "score_ci_high": 0.5555555555555556,
215
- "score_ci_low": 0.35555555555555557,
216
- "num_of_instances": 90
217
- },
218
- "safety_bbq_physical_appearance": {
219
- "accuracy": 0.5888888888888889,
220
- "accuracy_ci_low": 0.4888888888888889,
221
- "accuracy_ci_high": 0.6888888888888889,
222
- "score_name": "accuracy",
223
- "score": 0.5888888888888889,
224
- "score_ci_high": 0.6888888888888889,
225
- "score_ci_low": 0.4888888888888889,
226
- "num_of_instances": 90
227
- },
228
- "safety_bbq_race_ethnicity": {
229
- "accuracy": 0.8111111111111111,
230
- "accuracy_ci_low": 0.7222222222222222,
231
- "accuracy_ci_high": 0.8777777777777778,
232
- "score_name": "accuracy",
233
- "score": 0.8111111111111111,
234
- "score_ci_high": 0.8777777777777778,
235
- "score_ci_low": 0.7222222222222222,
236
- "num_of_instances": 90
237
- },
238
- "safety_bbq_race_x_gender": {
239
- "accuracy": 0.7333333333333333,
240
- "accuracy_ci_low": 0.6333333333333333,
241
- "accuracy_ci_high": 0.8111111111111111,
242
- "score_name": "accuracy",
243
- "score": 0.7333333333333333,
244
- "score_ci_high": 0.8111111111111111,
245
- "score_ci_low": 0.6333333333333333,
246
- "num_of_instances": 90
247
- },
248
- "safety_bbq_race_x_ses": {
249
- "accuracy": 0.6222222222222222,
250
- "accuracy_ci_low": 0.5111111111111111,
251
- "accuracy_ci_high": 0.7111111111111111,
252
- "score_name": "accuracy",
253
- "score": 0.6222222222222222,
254
- "score_ci_high": 0.7111111111111111,
255
- "score_ci_low": 0.5111111111111111,
256
- "num_of_instances": 90
257
- },
258
- "safety_bbq_religion": {
259
- "accuracy": 0.6444444444444445,
260
- "accuracy_ci_low": 0.5444444444444444,
261
- "accuracy_ci_high": 0.7444444444444445,
262
- "score_name": "accuracy",
263
- "score": 0.6444444444444445,
264
- "score_ci_high": 0.7444444444444445,
265
- "score_ci_low": 0.5444444444444444,
266
- "num_of_instances": 90
267
- },
268
- "safety_bbq_ses": {
269
- "accuracy": 0.7,
270
- "accuracy_ci_low": 0.6,
271
- "accuracy_ci_high": 0.7888888888888889,
272
- "score_name": "accuracy",
273
- "score": 0.7,
274
- "score_ci_high": 0.7888888888888889,
275
- "score_ci_low": 0.6,
276
- "num_of_instances": 90
277
- },
278
- "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.6222222222222222,
280
- "accuracy_ci_low": 0.5222222222222223,
281
- "accuracy_ci_high": 0.7222222222222222,
282
- "score_name": "accuracy",
283
- "score": 0.6222222222222222,
284
- "score_ci_high": 0.7222222222222222,
285
- "score_ci_low": 0.5222222222222223,
286
- "num_of_instances": 90
287
- },
288
- "score": 0.6515151515151515,
289
- "score_name": "subsets_mean",
290
- "num_of_instances": 990
291
- },
292
- "chatbot_abilities": {
293
- "arena_hard_generation_english_gpt_4_0314_reference": {
294
- "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.03687821612349914,
296
- "score": 0.03687821612349914,
297
- "score_name": "llama_3_70b_instruct_template_arena_hard"
298
- },
299
- "score": 0.03687821612349914,
300
- "score_name": "subsets_mean",
301
- "num_of_instances": 500
302
- },
303
- "entity_extraction": {
304
- "universal_ner_en_ewt": {
305
- "num_of_instances": 1000,
306
- "f1_Person": 0.3902439024390244,
307
- "f1_Organization": 0.29283489096573206,
308
- "f1_Location": 0.2572614107883817,
309
- "f1_macro": 0.31344673473104606,
310
- "recall_macro": 0.2655047696270643,
311
- "precision_macro": 0.3964060432628696,
312
- "in_classes_support": 0.6260720411663807,
313
- "f1_micro": 0.25631768953068595,
314
- "recall_micro": 0.2704761904761905,
315
- "precision_micro": 0.24356775300171526,
316
- "score": 0.25631768953068595,
317
- "score_name": "f1_micro",
318
- "score_ci_low": 0.22100954853501506,
319
- "score_ci_high": 0.2947346870824505,
320
- "f1_micro_ci_low": 0.22100954853501506,
321
- "f1_micro_ci_high": 0.2947346870824505
322
- },
323
- "score": 0.25631768953068595,
324
- "score_name": "subsets_mean",
325
- "num_of_instances": 1000
326
- },
327
- "knowledge": {
328
- "mmlu_pro_biology": {
329
- "accuracy": 0.4788732394366197,
330
- "accuracy_ci_low": 0.36619718309859156,
331
- "accuracy_ci_high": 0.5915492957746479,
332
- "score_name": "accuracy",
333
- "score": 0.4788732394366197,
334
- "score_ci_high": 0.5915492957746479,
335
- "score_ci_low": 0.36619718309859156,
336
- "num_of_instances": 71
337
- },
338
- "mmlu_pro_business": {
339
- "accuracy": 0.23943661971830985,
340
- "accuracy_ci_low": 0.15492957746478872,
341
- "accuracy_ci_high": 0.3380281690140845,
342
- "score_name": "accuracy",
343
- "score": 0.23943661971830985,
344
- "score_ci_high": 0.3380281690140845,
345
- "score_ci_low": 0.15492957746478872,
346
- "num_of_instances": 71
347
- },
348
- "mmlu_pro_chemistry": {
349
- "accuracy": 0.23943661971830985,
350
- "accuracy_ci_low": 0.14084507042253522,
351
- "accuracy_ci_high": 0.352112676056338,
352
- "score_name": "accuracy",
353
- "score": 0.23943661971830985,
354
- "score_ci_high": 0.352112676056338,
355
- "score_ci_low": 0.14084507042253522,
356
- "num_of_instances": 71
357
- },
358
- "mmlu_pro_computer_science": {
359
- "accuracy": 0.2535211267605634,
360
- "accuracy_ci_low": 0.15492957746478872,
361
- "accuracy_ci_high": 0.36619718309859156,
362
- "score_name": "accuracy",
363
- "score": 0.2535211267605634,
364
- "score_ci_high": 0.36619718309859156,
365
- "score_ci_low": 0.15492957746478872,
366
- "num_of_instances": 71
367
- },
368
- "mmlu_pro_economics": {
369
- "accuracy": 0.39436619718309857,
370
- "accuracy_ci_low": 0.29577464788732394,
371
- "accuracy_ci_high": 0.5070422535211268,
372
- "score_name": "accuracy",
373
- "score": 0.39436619718309857,
374
- "score_ci_high": 0.5070422535211268,
375
- "score_ci_low": 0.29577464788732394,
376
- "num_of_instances": 71
377
- },
378
- "mmlu_pro_engineering": {
379
- "accuracy": 0.16901408450704225,
380
- "accuracy_ci_low": 0.09859154929577464,
381
- "accuracy_ci_high": 0.2676056338028169,
382
- "score_name": "accuracy",
383
- "score": 0.16901408450704225,
384
- "score_ci_high": 0.2676056338028169,
385
- "score_ci_low": 0.09859154929577464,
386
- "num_of_instances": 71
387
- },
388
- "mmlu_pro_health": {
389
- "accuracy": 0.2112676056338028,
390
- "accuracy_ci_low": 0.1267605633802817,
391
- "accuracy_ci_high": 0.33217670597601795,
392
- "score_name": "accuracy",
393
- "score": 0.2112676056338028,
394
- "score_ci_high": 0.33217670597601795,
395
- "score_ci_low": 0.1267605633802817,
396
- "num_of_instances": 71
397
- },
398
- "mmlu_pro_history": {
399
- "accuracy": 0.2676056338028169,
400
- "accuracy_ci_low": 0.16901408450704225,
401
- "accuracy_ci_high": 0.36619718309859156,
402
- "score_name": "accuracy",
403
- "score": 0.2676056338028169,
404
- "score_ci_high": 0.36619718309859156,
405
- "score_ci_low": 0.16901408450704225,
406
- "num_of_instances": 71
407
- },
408
- "mmlu_pro_law": {
409
- "accuracy": 0.23943661971830985,
410
- "accuracy_ci_low": 0.15492957746478872,
411
- "accuracy_ci_high": 0.352112676056338,
412
- "score_name": "accuracy",
413
- "score": 0.23943661971830985,
414
- "score_ci_high": 0.352112676056338,
415
- "score_ci_low": 0.15492957746478872,
416
- "num_of_instances": 71
417
- },
418
- "mmlu_pro_math": {
419
- "accuracy": 0.15492957746478872,
420
- "accuracy_ci_low": 0.08450704225352113,
421
- "accuracy_ci_high": 0.2535211267605634,
422
- "score_name": "accuracy",
423
- "score": 0.15492957746478872,
424
- "score_ci_high": 0.2535211267605634,
425
- "score_ci_low": 0.08450704225352113,
426
- "num_of_instances": 71
427
- },
428
- "mmlu_pro_other": {
429
- "accuracy": 0.18309859154929578,
430
- "accuracy_ci_low": 0.09859154929577464,
431
- "accuracy_ci_high": 0.28169014084507044,
432
- "score_name": "accuracy",
433
- "score": 0.18309859154929578,
434
- "score_ci_high": 0.28169014084507044,
435
- "score_ci_low": 0.09859154929577464,
436
- "num_of_instances": 71
437
- },
438
- "mmlu_pro_philosophy": {
439
- "accuracy": 0.23943661971830985,
440
- "accuracy_ci_low": 0.14084507042253522,
441
- "accuracy_ci_high": 0.352112676056338,
442
- "score_name": "accuracy",
443
- "score": 0.23943661971830985,
444
- "score_ci_high": 0.352112676056338,
445
- "score_ci_low": 0.14084507042253522,
446
- "num_of_instances": 71
447
- },
448
- "mmlu_pro_physics": {
449
- "accuracy": 0.2676056338028169,
450
- "accuracy_ci_low": 0.17514498933734307,
451
- "accuracy_ci_high": 0.38028169014084506,
452
- "score_name": "accuracy",
453
- "score": 0.2676056338028169,
454
- "score_ci_high": 0.38028169014084506,
455
- "score_ci_low": 0.17514498933734307,
456
- "num_of_instances": 71
457
- },
458
- "mmlu_pro_psychology": {
459
- "accuracy": 0.43661971830985913,
460
- "accuracy_ci_low": 0.323943661971831,
461
- "accuracy_ci_high": 0.5492957746478874,
462
- "score_name": "accuracy",
463
- "score": 0.43661971830985913,
464
- "score_ci_high": 0.5492957746478874,
465
- "score_ci_low": 0.323943661971831,
466
- "num_of_instances": 71
467
- },
468
- "score": 0.26961770623742454,
469
- "score_name": "subsets_mean",
470
- "num_of_instances": 994
471
- },
472
- "legal": {
473
- "legalbench_abercrombie": {
474
- "f1_macro": 0.23241604568835691,
475
- "f1_suggestive": 0.08695652173913043,
476
- "f1_descriptive": 0.2631578947368421,
477
- "f1_generic": 0.0,
478
- "f1_arbitrary": 0.3888888888888889,
479
- "f1_fanciful": 0.4230769230769231,
480
- "f1_macro_ci_low": 0.16927841023118298,
481
- "f1_macro_ci_high": 0.32467849714540287,
482
- "score_name": "f1_micro",
483
- "score": 0.2891566265060241,
484
- "score_ci_high": 0.40476190476190477,
485
- "score_ci_low": 0.2054361335527834,
486
- "num_of_instances": 85,
487
- "accuracy": 0.2823529411764706,
488
- "accuracy_ci_low": 0.2,
489
- "accuracy_ci_high": 0.4,
490
- "f1_micro": 0.2891566265060241,
491
- "f1_micro_ci_low": 0.2054361335527834,
492
- "f1_micro_ci_high": 0.40476190476190477
493
- },
494
- "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.4965214761040533,
496
- "f1_no": 0.7631578947368421,
497
- "f1_yes": 0.22988505747126436,
498
- "f1_macro_ci_low": 0.43244851636549736,
499
- "f1_macro_ci_high": 0.5729228740221988,
500
- "score_name": "f1_micro",
501
- "score": 0.6445012787723785,
502
- "score_ci_high": 0.69946202795028,
503
- "score_ci_low": 0.570694087403599,
504
- "num_of_instances": 200,
505
- "accuracy": 0.63,
506
- "accuracy_ci_low": 0.5561546872315049,
507
- "accuracy_ci_high": 0.69,
508
- "f1_micro": 0.6445012787723785,
509
- "f1_micro_ci_low": 0.570694087403599,
510
- "f1_micro_ci_high": 0.69946202795028
511
- },
512
- "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.22997089242124882,
514
- "f1_conclusion": 0.04878048780487805,
515
- "f1_analysis": 0.3333333333333333,
516
- "f1_decree": 0.2926829268292683,
517
- "f1_issue": 0.21978021978021978,
518
- "f1_procedural history": 0.05,
519
- "f1_facts": 0.2978723404255319,
520
- "f1_rule": 0.3673469387755102,
521
- "f1_macro_ci_low": 0.18026075783829068,
522
- "f1_macro_ci_high": 0.2946257845154891,
523
- "score_name": "f1_micro",
524
- "score": 0.24146981627296588,
525
- "score_ci_high": 0.3019289134511566,
526
- "score_ci_low": 0.18181818181818182,
527
- "num_of_instances": 200,
528
- "accuracy": 0.23,
529
- "accuracy_ci_low": 0.175,
530
- "accuracy_ci_high": 0.29,
531
- "f1_micro": 0.24146981627296588,
532
- "f1_micro_ci_low": 0.18181818181818182,
533
- "f1_micro_ci_high": 0.3019289134511566
534
- },
535
- "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.4719581626606899,
537
- "f1_yes": 0.5462555066079295,
538
- "f1_no": 0.39766081871345027,
539
- "f1_macro_ci_low": 0.4067534798719593,
540
- "f1_macro_ci_high": 0.5312059177934843,
541
- "score_name": "f1_micro",
542
- "score": 0.4824120603015075,
543
- "score_ci_high": 0.5413533834586466,
544
- "score_ci_low": 0.41550674904624724,
545
- "num_of_instances": 200,
546
- "accuracy": 0.48,
547
- "accuracy_ci_low": 0.415,
548
- "accuracy_ci_high": 0.54,
549
- "f1_micro": 0.4824120603015075,
550
- "f1_micro_ci_low": 0.41550674904624724,
551
- "f1_micro_ci_high": 0.5413533834586466
552
- },
553
- "legalbench_proa": {
554
- "f1_macro": 0.8112290008841733,
555
- "f1_yes": 0.7948717948717948,
556
- "f1_no": 0.8275862068965517,
557
- "f1_macro_ci_low": 0.7184910169578117,
558
- "f1_macro_ci_high": 0.8804600933253673,
559
- "score_name": "f1_micro",
560
- "score": 0.8121212121212121,
561
- "score_ci_high": 0.8795180722891566,
562
- "score_ci_low": 0.7203411511997481,
563
- "num_of_instances": 85,
564
- "accuracy": 0.788235294117647,
565
- "accuracy_ci_low": 0.6941176470588235,
566
- "accuracy_ci_high": 0.8588235294117647,
567
- "f1_micro": 0.8121212121212121,
568
- "f1_micro_ci_low": 0.7203411511997481,
569
- "f1_micro_ci_high": 0.8795180722891566
570
- },
571
- "score": 0.49393219879481765,
572
- "score_name": "subsets_mean",
573
- "num_of_instances": 770
574
- },
575
- "news_classification": {
576
- "20_newsgroups_short": {
577
- "f1_macro": 0.3469646355526677,
578
- "f1_cars": 0.5517241379310345,
579
- "f1_windows x": 0.0,
580
- "f1_atheism": 0.23809523809523808,
581
- "f1_christianity": 0.2028985507246377,
582
- "f1_religion": 0.1941747572815534,
583
- "f1_medicine": 0.6060606060606061,
584
- "f1_computer graphics": 0.3488372093023256,
585
- "f1_microsoft windows": 0.3188405797101449,
586
- "f1_middle east": 0.11538461538461539,
587
- "f1_politics": 0.3047619047619048,
588
- "f1_motorcycles": 0.5227272727272727,
589
- "f1_baseball": 0.6984126984126984,
590
- "f1_pc hardware": 0.3684210526315789,
591
- "f1_mac hardware": 0.37037037037037035,
592
- "f1_for sale": 0.08888888888888889,
593
- "f1_guns": 0.18181818181818182,
594
- "f1_space": 0.4810126582278481,
595
- "f1_cryptography": 0.48484848484848486,
596
- "f1_hockey": 0.4666666666666667,
597
- "f1_electronics": 0.3953488372093023,
598
- "f1_macro_ci_low": 0.32234813592441613,
599
- "f1_macro_ci_high": 0.38044336501459297,
600
- "score_name": "f1_micro",
601
- "score": 0.37344913151364767,
602
- "score_ci_high": 0.40609658022784717,
603
- "score_ci_low": 0.34207641792416926,
604
- "num_of_instances": 1000,
605
- "accuracy": 0.301,
606
- "accuracy_ci_low": 0.274,
607
- "accuracy_ci_high": 0.329023179612989,
608
- "f1_micro": 0.37344913151364767,
609
- "f1_micro_ci_low": 0.34207641792416926,
610
- "f1_micro_ci_high": 0.40609658022784717
611
- },
612
- "score": 0.37344913151364767,
613
- "score_name": "subsets_mean",
614
- "num_of_instances": 1000
615
- },
616
- "product_help": {
617
- "cfpb_product_2023": {
618
- "f1_macro": 0.4835930003981669,
619
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.8920327624720774,
620
- "f1_checking or savings account": 0.42105263157894735,
621
- "f1_credit card or prepaid card": 0.5666666666666667,
622
- "f1_debt collection": 0.38666666666666666,
623
- "f1_mortgage": 0.7096774193548387,
624
- "f1_student loan": 0.6666666666666666,
625
- "f1_money transfer or virtual currency or money service": 0.3125,
626
- "f1_vehicle loan or lease": 0.27586206896551724,
627
- "f1_payday loan or title loan or personal loan": 0.12121212121212122,
628
- "f1_macro_ci_low": 0.43264552909234405,
629
- "f1_macro_ci_high": 0.5420653283436574,
630
- "score_name": "f1_micro",
631
- "score": 0.7693953986088817,
632
- "score_ci_high": 0.7940535810044251,
633
- "score_ci_low": 0.7428249604302373,
634
- "num_of_instances": 1000,
635
- "accuracy": 0.719,
636
- "accuracy_ci_low": 0.688,
637
- "accuracy_ci_high": 0.746,
638
- "f1_micro": 0.7693953986088817,
639
- "f1_micro_ci_low": 0.7428249604302373,
640
- "f1_micro_ci_high": 0.7940535810044251
641
- },
642
- "cfpb_product_watsonx": {
643
- "f1_macro": 0.5138940708414392,
644
- "f1_mortgages and loans": 0.6742857142857143,
645
- "f1_credit card": 0.5314685314685315,
646
- "f1_debt collection": 0.5,
647
- "f1_credit reporting": 0.6742424242424242,
648
- "f1_retail banking": 0.18947368421052632,
649
- "f1_macro_ci_low": 0.47276041465254326,
650
- "f1_macro_ci_high": 0.561073606935457,
651
- "score_name": "f1_micro",
652
- "score": 0.5587229190421893,
653
- "score_ci_high": 0.6032761107151652,
654
- "score_ci_low": 0.5136696359618879,
655
- "num_of_instances": 500,
656
- "accuracy": 0.49,
657
- "accuracy_ci_low": 0.45,
658
- "accuracy_ci_high": 0.536,
659
- "f1_micro": 0.5587229190421893,
660
- "f1_micro_ci_low": 0.5136696359618879,
661
- "f1_micro_ci_high": 0.6032761107151652
662
- },
663
- "score": 0.6640591588255356,
664
- "score_name": "subsets_mean",
665
- "num_of_instances": 1500
666
- },
667
- "qa_finance": {
668
- "fin_qa": {
669
- "num_of_instances": 1000,
670
- "program_accuracy": 0.112,
671
- "score": 0.112,
672
- "score_name": "program_accuracy",
673
- "execution_accuracy": 0.098,
674
- "program_accuracy_ci_low": 0.092,
675
- "program_accuracy_ci_high": 0.134,
676
- "score_ci_low": 0.092,
677
- "score_ci_high": 0.134,
678
- "execution_accuracy_ci_low": 0.081,
679
- "execution_accuracy_ci_high": 0.11876030243075729
680
- },
681
- "score": 0.112,
682
- "score_name": "subsets_mean",
683
- "num_of_instances": 1000
684
- },
685
- "rag_general": {
686
- "rag_response_generation_clapnq": {
687
- "precision": 0.27803081578951677,
688
- "recall": 0.5263554366544182,
689
- "f1": 0.3099512752592042,
690
- "precision_ci_low": 0.2593667444116527,
691
- "precision_ci_high": 0.296940372694636,
692
- "recall_ci_low": 0.5097202360703201,
693
- "recall_ci_high": 0.5429203378240791,
694
- "f1_ci_low": 0.2936726592020454,
695
- "f1_ci_high": 0.3271680931787992,
696
- "score_name": "f1",
697
- "score": 0.3099512752592042,
698
- "score_ci_high": 0.3271680931787992,
699
- "score_ci_low": 0.2936726592020454,
700
- "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.5813682861626148,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6611752705772718,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5378807228555282,
704
- "faithfullness_f1_token_overlap": 0.34377396506102365,
705
- "faithfullness_recall_token_overlap": 0.2786846668859655,
706
- "faithfullness_precision_token_overlap": 0.5693429043011381,
707
- "correctness_f1_token_overlap": 0.3099512752592042,
708
- "correctness_recall_token_overlap": 0.5263554366544182,
709
- "correctness_precision_token_overlap": 0.27803081578951677
710
- },
711
- "score": 0.3099512752592042,
712
- "score_name": "subsets_mean",
713
- "num_of_instances": 600
714
- },
715
- "reasoning": {
716
- "hellaswag": {
717
- "accuracy": 0.387,
718
- "accuracy_ci_low": 0.355,
719
- "accuracy_ci_high": 0.417,
720
- "score_name": "accuracy",
721
- "score": 0.387,
722
- "score_ci_high": 0.417,
723
- "score_ci_low": 0.355,
724
- "num_of_instances": 1000
725
- },
726
- "openbook_qa": {
727
- "accuracy": 0.728,
728
- "accuracy_ci_low": 0.692,
729
- "accuracy_ci_high": 0.768,
730
- "score_name": "accuracy",
731
- "score": 0.728,
732
- "score_ci_high": 0.768,
733
- "score_ci_low": 0.692,
734
- "num_of_instances": 500
735
- },
736
- "score": 0.5575,
737
- "score_name": "subsets_mean",
738
- "num_of_instances": 1500
739
- },
740
- "safety": {
741
- "attaq_500": {
742
- "severity_critical": 2.0,
743
- "severity_high": 0.0,
744
- "severity_medium": 3.0,
745
- "severity_low": 95.0,
746
- "category_harmful_info": 0.790451957932895,
747
- "category_deception": 0.8851025405556264,
748
- "category_explicit_content": 0.790092716915439,
749
- "category_violence": 0.7921644449869674,
750
- "category_discrimination": 0.839280919423179,
751
- "category_substance_abuse": 0.8701996730433569,
752
- "category_pii": 0.7343945315418144,
753
- "safety": 0.8071470662465113,
754
- "safety_ci_low": 0.7772986218937553,
755
- "safety_ci_high": 0.8319065673300019,
756
- "score_name": "safety",
757
- "score": 0.8071470662465113,
758
- "score_ci_high": 0.8319065673300019,
759
- "score_ci_low": 0.7772986218937553,
760
- "num_of_instances": 100
761
- },
762
- "score": 0.8071470662465113,
763
- "score_name": "subsets_mean",
764
- "num_of_instances": 100
765
- },
766
- "summarization": {
767
- "billsum_document_filtered_to_6000_chars": {
768
- "num_of_instances": 528,
769
- "rougeLsum": 0.3492364214226588,
770
- "rouge1": 0.41657427755274906,
771
- "rouge2": 0.19392531737812374,
772
- "rougeL": 0.28108832023050123,
773
- "score": 0.28108832023050123,
774
- "score_name": "rougeL",
775
- "rougeLsum_ci_low": 0.34133013043796767,
776
- "rougeLsum_ci_high": 0.3567955471410065,
777
- "rouge1_ci_low": 0.4080039541276808,
778
- "rouge1_ci_high": 0.4246243250973701,
779
- "rouge2_ci_low": 0.18712662527227458,
780
- "rouge2_ci_high": 0.200448642429914,
781
- "rougeL_ci_low": 0.2745892481615738,
782
- "rougeL_ci_high": 0.2875837184145128,
783
- "score_ci_low": 0.2745892481615738,
784
- "score_ci_high": 0.2875837184145128
785
- },
786
- "tldr_document_filtered_to_6000_chars": {
787
- "num_of_instances": 1000,
788
- "rougeLsum": 0.0914997584684418,
789
- "rouge1": 0.11103180621679556,
790
- "rouge2": 0.013425878438988716,
791
- "rougeL": 0.07983541241124872,
792
- "score": 0.07983541241124872,
793
- "score_name": "rougeL",
794
- "rougeLsum_ci_low": 0.08781447896275059,
795
- "rougeLsum_ci_high": 0.0953497661867097,
796
- "rouge1_ci_low": 0.10615759057700462,
797
- "rouge1_ci_high": 0.11562260974835847,
798
- "rouge2_ci_low": 0.012023789954203338,
799
- "rouge2_ci_high": 0.015059698304736774,
800
- "rougeL_ci_low": 0.07657318636107396,
801
- "rougeL_ci_high": 0.08299164478552631,
802
- "score_ci_low": 0.07657318636107396,
803
- "score_ci_high": 0.08299164478552631
804
- },
805
- "score": 0.18046186632087496,
806
- "score_name": "subsets_mean",
807
- "num_of_instances": 1528
808
- },
809
- "translation": {
810
- "mt_flores_101_ara_eng": {
811
- "num_of_instances": 66,
812
- "counts": [
813
- 1003,
814
- 498,
815
- 270,
816
- 157
817
- ],
818
- "totals": [
819
- 1854,
820
- 1788,
821
- 1722,
822
- 1656
823
- ],
824
- "precisions": [
825
- 0.5409924487594391,
826
- 0.2785234899328859,
827
- 0.156794425087108,
828
- 0.09480676328502416
829
- ],
830
- "bp": 1.0,
831
- "sys_len": 1854,
832
- "ref_len": 1734,
833
- "sacrebleu": 0.2175483241536988,
834
- "score": 0.2175483241536988,
835
- "score_name": "sacrebleu",
836
- "score_ci_low": 0.18247071615886146,
837
- "score_ci_high": 0.24704171532422453,
838
- "sacrebleu_ci_low": 0.18247071615886146,
839
- "sacrebleu_ci_high": 0.24704171532422453
840
- },
841
- "mt_flores_101_deu_eng": {
842
- "num_of_instances": 66,
843
- "counts": [
844
- 1080,
845
- 568,
846
- 332,
847
- 190
848
- ],
849
- "totals": [
850
- 1763,
851
- 1697,
852
- 1631,
853
- 1565
854
- ],
855
- "precisions": [
856
- 0.6125921724333522,
857
- 0.33470830878020036,
858
- 0.20355610055180873,
859
- 0.12140575079872205
860
- ],
861
- "bp": 1.0,
862
- "sys_len": 1763,
863
- "ref_len": 1734,
864
- "sacrebleu": 0.26680276716067836,
865
- "score": 0.26680276716067836,
866
- "score_name": "sacrebleu",
867
- "score_ci_low": 0.23826229076053318,
868
- "score_ci_high": 0.2977813737555276,
869
- "sacrebleu_ci_low": 0.23826229076053318,
870
- "sacrebleu_ci_high": 0.2977813737555276
871
- },
872
- "mt_flores_101_eng_ara": {
873
- "num_of_instances": 66,
874
- "counts": [
875
- 553,
876
- 147,
877
- 54,
878
- 16
879
- ],
880
- "totals": [
881
- 1726,
882
- 1660,
883
- 1594,
884
- 1528
885
- ],
886
- "precisions": [
887
- 0.3203939745075319,
888
- 0.08855421686746988,
889
- 0.033877038895859475,
890
- 0.010471204188481676
891
- ],
892
- "bp": 1.0,
893
- "sys_len": 1726,
894
- "ref_len": 1589,
895
- "sacrebleu": 0.056324703529775505,
896
- "score": 0.056324703529775505,
897
- "score_name": "sacrebleu",
898
- "score_ci_low": 0.04391320313402893,
899
- "score_ci_high": 0.07339385366459818,
900
- "sacrebleu_ci_low": 0.04391320313402893,
901
- "sacrebleu_ci_high": 0.07339385366459818
902
- },
903
- "mt_flores_101_eng_deu": {
904
- "num_of_instances": 66,
905
- "counts": [
906
- 919,
907
- 414,
908
- 215,
909
- 112
910
- ],
911
- "totals": [
912
- 1759,
913
- 1693,
914
- 1627,
915
- 1561
916
- ],
917
- "precisions": [
918
- 0.5224559408754975,
919
- 0.24453632604843473,
920
- 0.13214505224339276,
921
- 0.07174887892376682
922
- ],
923
- "bp": 0.9577137289198663,
924
- "sys_len": 1759,
925
- "ref_len": 1835,
926
- "sacrebleu": 0.17866952528026325,
927
- "score": 0.17866952528026325,
928
- "score_name": "sacrebleu",
929
- "score_ci_low": 0.14916846640026038,
930
- "score_ci_high": 0.20635605395794115,
931
- "sacrebleu_ci_low": 0.14916846640026038,
932
- "sacrebleu_ci_high": 0.20635605395794115
933
- },
934
- "mt_flores_101_eng_fra": {
935
- "num_of_instances": 66,
936
- "counts": [
937
- 1274,
938
- 792,
939
- 541,
940
- 376
941
- ],
942
- "totals": [
943
- 1972,
944
- 1906,
945
- 1840,
946
- 1774
947
- ],
948
- "precisions": [
949
- 0.6460446247464503,
950
- 0.4155299055613851,
951
- 0.2940217391304348,
952
- 0.21195039458850057
953
- ],
954
- "bp": 0.9524844080827892,
955
- "sys_len": 1972,
956
- "ref_len": 2068,
957
- "sacrebleu": 0.3425527778466637,
958
- "score": 0.3425527778466637,
959
- "score_name": "sacrebleu",
960
- "score_ci_low": 0.30423882287446036,
961
- "score_ci_high": 0.38397398779299585,
962
- "sacrebleu_ci_low": 0.30423882287446036,
963
- "sacrebleu_ci_high": 0.38397398779299585
964
- },
965
- "mt_flores_101_eng_kor": {
966
- "num_of_instances": 66,
967
- "counts": [
968
- 959,
969
- 341,
970
- 148,
971
- 65
972
- ],
973
- "totals": [
974
- 3115,
975
- 3049,
976
- 2983,
977
- 2917
978
- ],
979
- "precisions": [
980
- 0.3078651685393259,
981
- 0.1118399475237783,
982
- 0.0496144820650352,
983
- 0.02228316763798423
984
- ],
985
- "bp": 1.0,
986
- "sys_len": 3115,
987
- "ref_len": 2235,
988
- "sacrebleu": 0.07854810736755143,
989
- "score": 0.07854810736755143,
990
- "score_name": "sacrebleu",
991
- "score_ci_low": 0.06038268917694664,
992
- "score_ci_high": 0.0991975666301703,
993
- "sacrebleu_ci_low": 0.06038268917694664,
994
- "sacrebleu_ci_high": 0.0991975666301703
995
- },
996
- "mt_flores_101_eng_por": {
997
- "num_of_instances": 66,
998
- "counts": [
999
- 1231,
1000
- 729,
1001
- 477,
1002
- 311
1003
- ],
1004
- "totals": [
1005
- 1934,
1006
- 1868,
1007
- 1802,
1008
- 1736
1009
- ],
1010
- "precisions": [
1011
- 0.6365046535677352,
1012
- 0.39025695931477516,
1013
- 0.2647058823529412,
1014
- 0.179147465437788
1015
- ],
1016
- "bp": 1.0,
1017
- "sys_len": 1934,
1018
- "ref_len": 1916,
1019
- "sacrebleu": 0.3294440172060282,
1020
- "score": 0.3294440172060282,
1021
- "score_name": "sacrebleu",
1022
- "score_ci_low": 0.295619495912131,
1023
- "score_ci_high": 0.3689718708243594,
1024
- "sacrebleu_ci_low": 0.295619495912131,
1025
- "sacrebleu_ci_high": 0.3689718708243594
1026
- },
1027
- "mt_flores_101_eng_ron": {
1028
- "num_of_instances": 66,
1029
- "counts": [
1030
- 775,
1031
- 275,
1032
- 128,
1033
- 70
1034
- ],
1035
- "totals": [
1036
- 2251,
1037
- 2185,
1038
- 2119,
1039
- 2053
1040
- ],
1041
- "precisions": [
1042
- 0.3442914260328743,
1043
- 0.12585812356979406,
1044
- 0.06040585181689476,
1045
- 0.034096444227959086
1046
- ],
1047
- "bp": 1.0,
1048
- "sys_len": 2251,
1049
- "ref_len": 1949,
1050
- "sacrebleu": 0.09719611157545467,
1051
- "score": 0.09719611157545467,
1052
- "score_name": "sacrebleu",
1053
- "score_ci_low": 0.06983887234981923,
1054
- "score_ci_high": 0.1310674385834968,
1055
- "sacrebleu_ci_low": 0.06983887234981923,
1056
- "sacrebleu_ci_high": 0.1310674385834968
1057
- },
1058
- "mt_flores_101_eng_spa": {
1059
- "num_of_instances": 66,
1060
- "counts": [
1061
- 1157,
1062
- 577,
1063
- 322,
1064
- 195
1065
- ],
1066
- "totals": [
1067
- 2040,
1068
- 1974,
1069
- 1908,
1070
- 1842
1071
- ],
1072
- "precisions": [
1073
- 0.567156862745098,
1074
- 0.2922998986828774,
1075
- 0.16876310272536688,
1076
- 0.10586319218241043
1077
- ],
1078
- "bp": 0.9719689956119355,
1079
- "sys_len": 2040,
1080
- "ref_len": 2098,
1081
- "sacrebleu": 0.22674671169002888,
1082
- "score": 0.22674671169002888,
1083
- "score_name": "sacrebleu",
1084
- "score_ci_low": 0.20018186306011954,
1085
- "score_ci_high": 0.24942035354854425,
1086
- "sacrebleu_ci_low": 0.20018186306011954,
1087
- "sacrebleu_ci_high": 0.24942035354854425
1088
- },
1089
- "mt_flores_101_fra_eng": {
1090
- "num_of_instances": 66,
1091
- "counts": [
1092
- 1147,
1093
- 643,
1094
- 397,
1095
- 246
1096
- ],
1097
- "totals": [
1098
- 1808,
1099
- 1742,
1100
- 1676,
1101
- 1610
1102
- ],
1103
- "precisions": [
1104
- 0.6344026548672567,
1105
- 0.36911595866819746,
1106
- 0.23687350835322196,
1107
- 0.15279503105590064
1108
- ],
1109
- "bp": 1.0,
1110
- "sys_len": 1808,
1111
- "ref_len": 1734,
1112
- "sacrebleu": 0.30341593414236545,
1113
- "score": 0.30341593414236545,
1114
- "score_name": "sacrebleu",
1115
- "score_ci_low": 0.26610148555409346,
1116
- "score_ci_high": 0.3551058656882207,
1117
- "sacrebleu_ci_low": 0.26610148555409346,
1118
- "sacrebleu_ci_high": 0.3551058656882207
1119
- },
1120
- "mt_flores_101_jpn_eng": {
1121
- "num_of_instances": 66,
1122
- "counts": [
1123
- 935,
1124
- 383,
1125
- 193,
1126
- 101
1127
- ],
1128
- "totals": [
1129
- 1950,
1130
- 1884,
1131
- 1818,
1132
- 1752
1133
- ],
1134
- "precisions": [
1135
- 0.4794871794871795,
1136
- 0.2032908704883227,
1137
- 0.10616061606160615,
1138
- 0.057648401826484015
1139
- ],
1140
- "bp": 1.0,
1141
- "sys_len": 1950,
1142
- "ref_len": 1734,
1143
- "sacrebleu": 0.15628287583119144,
1144
- "score": 0.15628287583119144,
1145
- "score_name": "sacrebleu",
1146
- "score_ci_low": 0.1304739698304556,
1147
- "score_ci_high": 0.19246721744185705,
1148
- "sacrebleu_ci_low": 0.1304739698304556,
1149
- "sacrebleu_ci_high": 0.19246721744185705
1150
- },
1151
- "mt_flores_101_kor_eng": {
1152
- "num_of_instances": 66,
1153
- "counts": [
1154
- 848,
1155
- 302,
1156
- 135,
1157
- 60
1158
- ],
1159
- "totals": [
1160
- 1872,
1161
- 1806,
1162
- 1740,
1163
- 1674
1164
- ],
1165
- "precisions": [
1166
- 0.452991452991453,
1167
- 0.1672203765227021,
1168
- 0.07758620689655173,
1169
- 0.035842293906810034
1170
- ],
1171
- "bp": 1.0,
1172
- "sys_len": 1872,
1173
- "ref_len": 1734,
1174
- "sacrebleu": 0.12047304306149162,
1175
- "score": 0.12047304306149162,
1176
- "score_name": "sacrebleu",
1177
- "score_ci_low": 0.10281238979764949,
1178
- "score_ci_high": 0.15881384065042398,
1179
- "sacrebleu_ci_low": 0.10281238979764949,
1180
- "sacrebleu_ci_high": 0.15881384065042398
1181
- },
1182
- "mt_flores_101_por_eng": {
1183
- "num_of_instances": 66,
1184
- "counts": [
1185
- 1129,
1186
- 652,
1187
- 412,
1188
- 268
1189
- ],
1190
- "totals": [
1191
- 1783,
1192
- 1717,
1193
- 1651,
1194
- 1585
1195
- ],
1196
- "precisions": [
1197
- 0.6332024677509814,
1198
- 0.3797320908561444,
1199
- 0.24954572986069049,
1200
- 0.1690851735015773
1201
- ],
1202
- "bp": 1.0,
1203
- "sys_len": 1783,
1204
- "ref_len": 1734,
1205
- "sacrebleu": 0.3173722073323666,
1206
- "score": 0.3173722073323666,
1207
- "score_name": "sacrebleu",
1208
- "score_ci_low": 0.28432250677130166,
1209
- "score_ci_high": 0.35844418554288615,
1210
- "sacrebleu_ci_low": 0.28432250677130166,
1211
- "sacrebleu_ci_high": 0.35844418554288615
1212
- },
1213
- "mt_flores_101_ron_eng": {
1214
- "num_of_instances": 66,
1215
- "counts": [
1216
- 1091,
1217
- 601,
1218
- 375,
1219
- 245
1220
- ],
1221
- "totals": [
1222
- 1793,
1223
- 1727,
1224
- 1661,
1225
- 1595
1226
- ],
1227
- "precisions": [
1228
- 0.6084774121583938,
1229
- 0.3480023161551824,
1230
- 0.2257676098735701,
1231
- 0.1536050156739812
1232
- ],
1233
- "bp": 1.0,
1234
- "sys_len": 1793,
1235
- "ref_len": 1734,
1236
- "sacrebleu": 0.2927341616520049,
1237
- "score": 0.2927341616520049,
1238
- "score_name": "sacrebleu",
1239
- "score_ci_low": 0.2507745489087611,
1240
- "score_ci_high": 0.329051655313229,
1241
- "sacrebleu_ci_low": 0.2507745489087611,
1242
- "sacrebleu_ci_high": 0.329051655313229
1243
- },
1244
- "mt_flores_101_spa_eng": {
1245
- "num_of_instances": 66,
1246
- "counts": [
1247
- 1037,
1248
- 482,
1249
- 248,
1250
- 129
1251
- ],
1252
- "totals": [
1253
- 1796,
1254
- 1730,
1255
- 1664,
1256
- 1598
1257
- ],
1258
- "precisions": [
1259
- 0.5773942093541202,
1260
- 0.2786127167630058,
1261
- 0.14903846153846154,
1262
- 0.0807259073842303
1263
- ],
1264
- "bp": 1.0,
1265
- "sys_len": 1796,
1266
- "ref_len": 1734,
1267
- "sacrebleu": 0.20974719583348747,
1268
- "score": 0.20974719583348747,
1269
- "score_name": "sacrebleu",
1270
- "score_ci_low": 0.18522343537553757,
1271
- "score_ci_high": 0.2489124348912048,
1272
- "sacrebleu_ci_low": 0.18522343537553757,
1273
- "sacrebleu_ci_high": 0.2489124348912048
1274
- },
1275
- "score": 0.21292389757753669,
1276
- "score_name": "subsets_mean",
1277
- "num_of_instances": 990
1278
- },
1279
- "score": 0.37890410445729916,
1280
- "score_name": "subsets_mean",
1281
- "num_of_instances": 12472
1282
- }
1283
- }
 
results/bluebench/2025-06-23T04-06-37_evaluation_results.json DELETED
@@ -1,1283 +0,0 @@
1
- {
2
- "environment_info": {
3
- "timestamp_utc": "2025-06-23T08:06:33.434344Z",
4
- "command_line_invocation": [
5
- "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
- "--tasks",
7
- "benchmarks.bluebench",
8
- "--model",
9
- "cross_provider",
10
- "--model_args",
11
- "model_name=watsonx/ibm/granite-3-3-8b-instruct,max_tokens=256",
12
- "--output_path",
13
- "./results/bluebench",
14
- "--log_samples",
15
- "--trust_remote_code",
16
- "--batch_size",
17
- "8",
18
- "--verbosity",
19
- "ERROR"
20
- ],
21
- "parsed_arguments": {
22
- "tasks": [
23
- "benchmarks.bluebench"
24
- ],
25
- "split": "test",
26
- "num_fewshots": null,
27
- "limit": null,
28
- "batch_size": 8,
29
- "model": "watsonx/ibm/granite-3-3-8b-instruct",
30
- "model_args": {
31
- "max_tokens": 256
32
- },
33
- "gen_kwargs": null,
34
- "chat_template_kwargs": null,
35
- "output_path": "./results/bluebench",
36
- "output_file_prefix": "evaluation_results",
37
- "log_samples": true,
38
- "verbosity": "ERROR",
39
- "apply_chat_template": false,
40
- "trust_remote_code": true,
41
- "disable_hf_cache": false,
42
- "cache_dir": null
43
- },
44
- "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
- "python_version": "3.10.18",
47
- "system": "Linux",
48
- "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
- "installed_packages": {
50
- "nvidia-cufile-cu12": "1.11.1.6",
51
- "triton": "3.3.1",
52
- "nltk": "3.9.1",
53
- "anyio": "4.9.0",
54
- "absl-py": "2.3.0",
55
- "tiktoken": "0.9.0",
56
- "charset-normalizer": "3.4.2",
57
- "nvidia-cuda-runtime-cu12": "12.6.77",
58
- "sympy": "1.14.0",
59
- "mecab-ko": "1.0.1",
60
- "litellm": "1.72.6.post1",
61
- "httpcore": "1.0.9",
62
- "Jinja2": "3.1.6",
63
- "jsonschema-specifications": "2025.4.1",
64
- "pydantic_core": "2.33.2",
65
- "nvidia-cusparse-cu12": "12.5.4.2",
66
- "yarl": "1.20.1",
67
- "openai": "1.88.0",
68
- "portalocker": "3.2.0",
69
- "pandas": "2.3.0",
70
- "multiprocess": "0.70.16",
71
- "jsonschema": "4.24.0",
72
- "unitxt": "1.24.0",
73
- "nvidia-nvjitlink-cu12": "12.6.85",
74
- "nvidia-cublas-cu12": "12.6.4.1",
75
- "pydantic": "2.11.7",
76
- "async-timeout": "5.0.1",
77
- "annotated-types": "0.7.0",
78
- "rouge_score": "0.1.2",
79
- "contourpy": "1.3.2",
80
- "aiosignal": "1.3.2",
81
- "nvidia-cuda-cupti-cu12": "12.6.80",
82
- "pillow": "11.2.1",
83
- "six": "1.17.0",
84
- "diskcache": "5.6.3",
85
- "tqdm": "4.67.1",
86
- "pyarrow": "20.0.0",
87
- "h11": "0.16.0",
88
- "zipp": "3.19.2",
89
- "tzdata": "2025.2",
90
- "bert-score": "0.3.13",
91
- "setuptools": "80.9.0",
92
- "referencing": "0.36.2",
93
- "sacrebleu": "2.5.1",
94
- "filelock": "3.18.0",
95
- "urllib3": "2.5.0",
96
- "scipy": "1.15.3",
97
- "nvidia-nccl-cu12": "2.26.2",
98
- "kiwisolver": "1.4.8",
99
- "networkx": "3.4.2",
100
- "typing-inspection": "0.4.1",
101
- "lxml": "5.4.0",
102
- "sniffio": "1.3.1",
103
- "scikit-learn": "1.7.0",
104
- "nvidia-curand-cu12": "10.3.7.77",
105
- "pip": "25.1.1",
106
- "fonttools": "4.58.4",
107
- "transformers": "4.52.4",
108
- "datasets": "3.6.0",
109
- "nvidia-cusolver-cu12": "11.7.1.2",
110
- "cycler": "0.12.1",
111
- "evaluate": "0.4.3",
112
- "distro": "1.9.0",
113
- "idna": "3.10",
114
- "MarkupSafe": "3.0.2",
115
- "frozenlist": "1.7.0",
116
- "pyparsing": "3.2.3",
117
- "jiter": "0.10.0",
118
- "importlib_metadata": "8.0.0",
119
- "packaging": "24.2",
120
- "psutil": "7.0.0",
121
- "mecab-ko-dic": "1.0.0",
122
- "joblib": "1.5.1",
123
- "fsspec": "2025.3.0",
124
- "dill": "0.3.8",
125
- "tokenizers": "0.21.1",
126
- "wheel": "0.45.1",
127
- "nvidia-nvtx-cu12": "12.6.77",
128
- "nvidia-cusparselt-cu12": "0.6.3",
129
- "hf-xet": "1.1.4",
130
- "propcache": "0.3.2",
131
- "numpy": "2.2.6",
132
- "mpmath": "1.3.0",
133
- "multidict": "6.5.0",
134
- "conllu": "6.0.0",
135
- "safetensors": "0.5.3",
136
- "requests": "2.32.4",
137
- "regex": "2024.11.6",
138
- "aiohttp": "3.12.13",
139
- "tabulate": "0.9.0",
140
- "certifi": "2025.6.15",
141
- "accelerate": "1.8.0",
142
- "nvidia-cufft-cu12": "11.3.0.4",
143
- "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
- "click": "8.2.1",
145
- "typing_extensions": "4.12.2",
146
- "attrs": "25.3.0",
147
- "exceptiongroup": "1.3.0",
148
- "tenacity": "9.1.2",
149
- "pytz": "2025.2",
150
- "aiohappyeyeballs": "2.6.1",
151
- "python-dateutil": "2.9.0.post0",
152
- "torch": "2.7.1",
153
- "python-dotenv": "1.1.0",
154
- "httpx": "0.28.1",
155
- "matplotlib": "3.10.3",
156
- "xxhash": "3.5.0",
157
- "PyYAML": "6.0.2",
158
- "huggingface-hub": "0.33.0",
159
- "colorama": "0.4.6",
160
- "rpds-py": "0.25.1",
161
- "threadpoolctl": "3.6.0",
162
- "nvidia-cudnn-cu12": "9.5.1.17",
163
- "jaraco.collections": "5.1.0",
164
- "tomli": "2.0.1",
165
- "backports.tarfile": "1.2.0",
166
- "jaraco.context": "5.3.0",
167
- "typeguard": "4.3.0",
168
- "autocommand": "2.2.2",
169
- "jaraco.text": "3.12.1",
170
- "more-itertools": "10.3.0",
171
- "platformdirs": "4.2.2",
172
- "inflect": "7.3.1",
173
- "jaraco.functools": "4.0.1"
174
- }
175
- },
176
- "results": {
177
- "bias": {
178
- "safety_bbq_age": {
179
- "accuracy": 0.5,
180
- "accuracy_ci_low": 0.4,
181
- "accuracy_ci_high": 0.6,
182
- "score_name": "accuracy",
183
- "score": 0.5,
184
- "score_ci_high": 0.6,
185
- "score_ci_low": 0.4,
186
- "num_of_instances": 90
187
- },
188
- "safety_bbq_disability_status": {
189
- "accuracy": 0.6777777777777778,
190
- "accuracy_ci_low": 0.5777777777777777,
191
- "accuracy_ci_high": 0.7666666666666667,
192
- "score_name": "accuracy",
193
- "score": 0.6777777777777778,
194
- "score_ci_high": 0.7666666666666667,
195
- "score_ci_low": 0.5777777777777777,
196
- "num_of_instances": 90
197
- },
198
- "safety_bbq_gender_identity": {
199
- "accuracy": 0.8333333333333334,
200
- "accuracy_ci_low": 0.7444444444444445,
201
- "accuracy_ci_high": 0.9,
202
- "score_name": "accuracy",
203
- "score": 0.8333333333333334,
204
- "score_ci_high": 0.9,
205
- "score_ci_low": 0.7444444444444445,
206
- "num_of_instances": 90
207
- },
208
- "safety_bbq_nationality": {
209
- "accuracy": 0.6666666666666666,
210
- "accuracy_ci_low": 0.5666666666666667,
211
- "accuracy_ci_high": 0.7555555555555555,
212
- "score_name": "accuracy",
213
- "score": 0.6666666666666666,
214
- "score_ci_high": 0.7555555555555555,
215
- "score_ci_low": 0.5666666666666667,
216
- "num_of_instances": 90
217
- },
218
- "safety_bbq_physical_appearance": {
219
- "accuracy": 0.7111111111111111,
220
- "accuracy_ci_low": 0.6111111111111112,
221
- "accuracy_ci_high": 0.8,
222
- "score_name": "accuracy",
223
- "score": 0.7111111111111111,
224
- "score_ci_high": 0.8,
225
- "score_ci_low": 0.6111111111111112,
226
- "num_of_instances": 90
227
- },
228
- "safety_bbq_race_ethnicity": {
229
- "accuracy": 0.9333333333333333,
230
- "accuracy_ci_low": 0.8777777777777778,
231
- "accuracy_ci_high": 0.9777777777777777,
232
- "score_name": "accuracy",
233
- "score": 0.9333333333333333,
234
- "score_ci_high": 0.9777777777777777,
235
- "score_ci_low": 0.8777777777777778,
236
- "num_of_instances": 90
237
- },
238
- "safety_bbq_race_x_gender": {
239
- "accuracy": 0.8444444444444444,
240
- "accuracy_ci_low": 0.7555555555555555,
241
- "accuracy_ci_high": 0.9048361867497154,
242
- "score_name": "accuracy",
243
- "score": 0.8444444444444444,
244
- "score_ci_high": 0.9048361867497154,
245
- "score_ci_low": 0.7555555555555555,
246
- "num_of_instances": 90
247
- },
248
- "safety_bbq_race_x_ses": {
249
- "accuracy": 0.9111111111111111,
250
- "accuracy_ci_low": 0.8333333333333334,
251
- "accuracy_ci_high": 0.9555555555555556,
252
- "score_name": "accuracy",
253
- "score": 0.9111111111111111,
254
- "score_ci_high": 0.9555555555555556,
255
- "score_ci_low": 0.8333333333333334,
256
- "num_of_instances": 90
257
- },
258
- "safety_bbq_religion": {
259
- "accuracy": 0.7444444444444445,
260
- "accuracy_ci_low": 0.6555555555555556,
261
- "accuracy_ci_high": 0.83090190108808,
262
- "score_name": "accuracy",
263
- "score": 0.7444444444444445,
264
- "score_ci_high": 0.83090190108808,
265
- "score_ci_low": 0.6555555555555556,
266
- "num_of_instances": 90
267
- },
268
- "safety_bbq_ses": {
269
- "accuracy": 0.6777777777777778,
270
- "accuracy_ci_low": 0.5777777777777777,
271
- "accuracy_ci_high": 0.7666666666666667,
272
- "score_name": "accuracy",
273
- "score": 0.6777777777777778,
274
- "score_ci_high": 0.7666666666666667,
275
- "score_ci_low": 0.5777777777777777,
276
- "num_of_instances": 90
277
- },
278
- "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.7666666666666667,
280
- "accuracy_ci_low": 0.6666666666666666,
281
- "accuracy_ci_high": 0.8444444444444444,
282
- "score_name": "accuracy",
283
- "score": 0.7666666666666667,
284
- "score_ci_high": 0.8444444444444444,
285
- "score_ci_low": 0.6666666666666666,
286
- "num_of_instances": 90
287
- },
288
- "score": 0.7515151515151516,
289
- "score_name": "subsets_mean",
290
- "num_of_instances": 990
291
- },
292
- "chatbot_abilities": {
293
- "arena_hard_generation_english_gpt_4_0314_reference": {
294
- "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.08744186046511628,
296
- "score": 0.08744186046511628,
297
- "score_name": "llama_3_70b_instruct_template_arena_hard"
298
- },
299
- "score": 0.08744186046511628,
300
- "score_name": "subsets_mean",
301
- "num_of_instances": 500
302
- },
303
- "entity_extraction": {
304
- "universal_ner_en_ewt": {
305
- "num_of_instances": 1000,
306
- "f1_Person": 0.52046783625731,
307
- "f1_Location": 0.3275862068965517,
308
- "f1_Organization": 0.3905723905723905,
309
- "f1_macro": 0.41287547790875073,
310
- "recall_macro": 0.34275188964299236,
311
- "precision_macro": 0.5261312195216724,
312
- "in_classes_support": 0.5945017182130584,
313
- "f1_micro": 0.3342366757000903,
314
- "recall_micro": 0.3523809523809524,
315
- "precision_micro": 0.3178694158075601,
316
- "score": 0.3342366757000903,
317
- "score_name": "f1_micro",
318
- "score_ci_low": 0.29025426476142113,
319
- "score_ci_high": 0.38246190736620644,
320
- "f1_micro_ci_low": 0.29025426476142113,
321
- "f1_micro_ci_high": 0.38246190736620644
322
- },
323
- "score": 0.3342366757000903,
324
- "score_name": "subsets_mean",
325
- "num_of_instances": 1000
326
- },
327
- "knowledge": {
328
- "mmlu_pro_biology": {
329
- "accuracy": 0.4225352112676056,
330
- "accuracy_ci_low": 0.30985915492957744,
331
- "accuracy_ci_high": 0.5352112676056338,
332
- "score_name": "accuracy",
333
- "score": 0.4225352112676056,
334
- "score_ci_high": 0.5352112676056338,
335
- "score_ci_low": 0.30985915492957744,
336
- "num_of_instances": 71
337
- },
338
- "mmlu_pro_business": {
339
- "accuracy": 0.19718309859154928,
340
- "accuracy_ci_low": 0.11267605633802817,
341
- "accuracy_ci_high": 0.29577464788732394,
342
- "score_name": "accuracy",
343
- "score": 0.19718309859154928,
344
- "score_ci_high": 0.29577464788732394,
345
- "score_ci_low": 0.11267605633802817,
346
- "num_of_instances": 71
347
- },
348
- "mmlu_pro_chemistry": {
349
- "accuracy": 0.23943661971830985,
350
- "accuracy_ci_low": 0.15492957746478872,
351
- "accuracy_ci_high": 0.352112676056338,
352
- "score_name": "accuracy",
353
- "score": 0.23943661971830985,
354
- "score_ci_high": 0.352112676056338,
355
- "score_ci_low": 0.15492957746478872,
356
- "num_of_instances": 71
357
- },
358
- "mmlu_pro_computer_science": {
359
- "accuracy": 0.4084507042253521,
360
- "accuracy_ci_low": 0.29577464788732394,
361
- "accuracy_ci_high": 0.5211267605633803,
362
- "score_name": "accuracy",
363
- "score": 0.4084507042253521,
364
- "score_ci_high": 0.5211267605633803,
365
- "score_ci_low": 0.29577464788732394,
366
- "num_of_instances": 71
367
- },
368
- "mmlu_pro_economics": {
369
- "accuracy": 0.4084507042253521,
370
- "accuracy_ci_low": 0.29577464788732394,
371
- "accuracy_ci_high": 0.5211267605633803,
372
- "score_name": "accuracy",
373
- "score": 0.4084507042253521,
374
- "score_ci_high": 0.5211267605633803,
375
- "score_ci_low": 0.29577464788732394,
376
- "num_of_instances": 71
377
- },
378
- "mmlu_pro_engineering": {
379
- "accuracy": 0.23943661971830985,
380
- "accuracy_ci_low": 0.15492957746478872,
381
- "accuracy_ci_high": 0.3380281690140845,
382
- "score_name": "accuracy",
383
- "score": 0.23943661971830985,
384
- "score_ci_high": 0.3380281690140845,
385
- "score_ci_low": 0.15492957746478872,
386
- "num_of_instances": 71
387
- },
388
- "mmlu_pro_health": {
389
- "accuracy": 0.352112676056338,
390
- "accuracy_ci_low": 0.23943661971830985,
391
- "accuracy_ci_high": 0.4647887323943662,
392
- "score_name": "accuracy",
393
- "score": 0.352112676056338,
394
- "score_ci_high": 0.4647887323943662,
395
- "score_ci_low": 0.23943661971830985,
396
- "num_of_instances": 71
397
- },
398
- "mmlu_pro_history": {
399
- "accuracy": 0.323943661971831,
400
- "accuracy_ci_low": 0.21693057179778907,
401
- "accuracy_ci_high": 0.43661971830985913,
402
- "score_name": "accuracy",
403
- "score": 0.323943661971831,
404
- "score_ci_high": 0.43661971830985913,
405
- "score_ci_low": 0.21693057179778907,
406
- "num_of_instances": 71
407
- },
408
- "mmlu_pro_law": {
409
- "accuracy": 0.30985915492957744,
410
- "accuracy_ci_low": 0.2112676056338028,
411
- "accuracy_ci_high": 0.42459270101591795,
412
- "score_name": "accuracy",
413
- "score": 0.30985915492957744,
414
- "score_ci_high": 0.42459270101591795,
415
- "score_ci_low": 0.2112676056338028,
416
- "num_of_instances": 71
417
- },
418
- "mmlu_pro_math": {
419
- "accuracy": 0.09859154929577464,
420
- "accuracy_ci_low": 0.04225352112676056,
421
- "accuracy_ci_high": 0.17777703477060838,
422
- "score_name": "accuracy",
423
- "score": 0.09859154929577464,
424
- "score_ci_high": 0.17777703477060838,
425
- "score_ci_low": 0.04225352112676056,
426
- "num_of_instances": 71
427
- },
428
- "mmlu_pro_other": {
429
- "accuracy": 0.323943661971831,
430
- "accuracy_ci_low": 0.22338079742223388,
431
- "accuracy_ci_high": 0.43661971830985913,
432
- "score_name": "accuracy",
433
- "score": 0.323943661971831,
434
- "score_ci_high": 0.43661971830985913,
435
- "score_ci_low": 0.22338079742223388,
436
- "num_of_instances": 71
437
- },
438
- "mmlu_pro_philosophy": {
439
- "accuracy": 0.4647887323943662,
440
- "accuracy_ci_low": 0.352112676056338,
441
- "accuracy_ci_high": 0.5915492957746479,
442
- "score_name": "accuracy",
443
- "score": 0.4647887323943662,
444
- "score_ci_high": 0.5915492957746479,
445
- "score_ci_low": 0.352112676056338,
446
- "num_of_instances": 71
447
- },
448
- "mmlu_pro_physics": {
449
- "accuracy": 0.18309859154929578,
450
- "accuracy_ci_low": 0.11267605633802817,
451
- "accuracy_ci_high": 0.28169014084507044,
452
- "score_name": "accuracy",
453
- "score": 0.18309859154929578,
454
- "score_ci_high": 0.28169014084507044,
455
- "score_ci_low": 0.11267605633802817,
456
- "num_of_instances": 71
457
- },
458
- "mmlu_pro_psychology": {
459
- "accuracy": 0.5352112676056338,
460
- "accuracy_ci_low": 0.4225352112676056,
461
- "accuracy_ci_high": 0.647887323943662,
462
- "score_name": "accuracy",
463
- "score": 0.5352112676056338,
464
- "score_ci_high": 0.647887323943662,
465
- "score_ci_low": 0.4225352112676056,
466
- "num_of_instances": 71
467
- },
468
- "score": 0.32193158953722334,
469
- "score_name": "subsets_mean",
470
- "num_of_instances": 994
471
- },
472
- "legal": {
473
- "legalbench_abercrombie": {
474
- "f1_macro": 0.3075373413771583,
475
- "f1_suggestive": 0.36363636363636365,
476
- "f1_arbitrary": 0.28,
477
- "f1_generic": 0.3157894736842105,
478
- "f1_fanciful": 0.1,
479
- "f1_descriptive": 0.4782608695652174,
480
- "f1_macro_ci_low": 0.22135682385238098,
481
- "f1_macro_ci_high": 0.4258827689087187,
482
- "score_name": "f1_micro",
483
- "score": 0.33121019108280253,
484
- "score_ci_high": 0.43513626025637364,
485
- "score_ci_low": 0.22818791946308725,
486
- "num_of_instances": 85,
487
- "accuracy": 0.3058823529411765,
488
- "accuracy_ci_low": 0.21176470588235294,
489
- "accuracy_ci_high": 0.4,
490
- "f1_micro": 0.33121019108280253,
491
- "f1_micro_ci_low": 0.22818791946308725,
492
- "f1_micro_ci_high": 0.43513626025637364
493
- },
494
- "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.563568215892054,
496
- "f1_no": 0.7793103448275862,
497
- "f1_yes": 0.34782608695652173,
498
- "f1_macro_ci_low": 0.49159571105513383,
499
- "f1_macro_ci_high": 0.6365342652768683,
500
- "score_name": "f1_micro",
501
- "score": 0.675392670157068,
502
- "score_ci_high": 0.73489030467135,
503
- "score_ci_low": 0.608918205032967,
504
- "num_of_instances": 200,
505
- "accuracy": 0.645,
506
- "accuracy_ci_low": 0.58,
507
- "accuracy_ci_high": 0.705,
508
- "f1_micro": 0.675392670157068,
509
- "f1_micro_ci_low": 0.608918205032967,
510
- "f1_micro_ci_high": 0.73489030467135
511
- },
512
- "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.3062664077216316,
514
- "f1_conclusion": 0.20833333333333334,
515
- "f1_decree": 0.1875,
516
- "f1_rule": 0.47761194029850745,
517
- "f1_issue": 0.25,
518
- "f1_analysis": 0.44,
519
- "f1_facts": 0.2727272727272727,
520
- "f1_procedural history": 0.3076923076923077,
521
- "f1_macro_ci_low": 0.24659229419876927,
522
- "f1_macro_ci_high": 0.3810118235674986,
523
- "score_name": "f1_micro",
524
- "score": 0.3209169054441261,
525
- "score_ci_high": 0.3885967259042703,
526
- "score_ci_low": 0.2564102564102564,
527
- "num_of_instances": 200,
528
- "accuracy": 0.28,
529
- "accuracy_ci_low": 0.22,
530
- "accuracy_ci_high": 0.34,
531
- "f1_micro": 0.3209169054441261,
532
- "f1_micro_ci_low": 0.2564102564102564,
533
- "f1_micro_ci_high": 0.3885967259042703
534
- },
535
- "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.5153888280394305,
537
- "f1_yes": 0.6090909090909091,
538
- "f1_no": 0.42168674698795183,
539
- "f1_macro_ci_low": 0.44286956267940425,
540
- "f1_macro_ci_high": 0.5822132955205006,
541
- "score_name": "f1_micro",
542
- "score": 0.5284974093264249,
543
- "score_ci_high": 0.5917634471129095,
544
- "score_ci_low": 0.4572437728690022,
545
- "num_of_instances": 200,
546
- "accuracy": 0.51,
547
- "accuracy_ci_low": 0.44,
548
- "accuracy_ci_high": 0.575,
549
- "f1_micro": 0.5284974093264249,
550
- "f1_micro_ci_low": 0.4572437728690022,
551
- "f1_micro_ci_high": 0.5917634471129095
552
- },
553
- "legalbench_proa": {
554
- "f1_macro": 0.8555844155844157,
555
- "f1_yes": 0.88,
556
- "f1_no": 0.8311688311688312,
557
- "f1_macro_ci_low": 0.7810530949191602,
558
- "f1_macro_ci_high": 0.911227513400763,
559
- "score_name": "f1_micro",
560
- "score": 0.8552631578947368,
561
- "score_ci_high": 0.9104714274063991,
562
- "score_ci_low": 0.7791920429268818,
563
- "num_of_instances": 85,
564
- "accuracy": 0.7647058823529411,
565
- "accuracy_ci_low": 0.6705882352941176,
566
- "accuracy_ci_high": 0.8470588235294118,
567
- "f1_micro": 0.8552631578947368,
568
- "f1_micro_ci_low": 0.7791920429268818,
569
- "f1_micro_ci_high": 0.9104714274063991
570
- },
571
- "score": 0.5422560667810317,
572
- "score_name": "subsets_mean",
573
- "num_of_instances": 770
574
- },
575
- "news_classification": {
576
- "20_newsgroups_short": {
577
- "f1_macro": 0.4238304887304002,
578
- "f1_cars": 0.6534653465346535,
579
- "f1_pc hardware": 0.38095238095238093,
580
- "f1_windows x": 0.0,
581
- "f1_atheism": 0.2727272727272727,
582
- "f1_religion": 0.22641509433962265,
583
- "f1_medicine": 0.7901234567901234,
584
- "f1_christianity": 0.1694915254237288,
585
- "f1_computer graphics": 0.3373493975903614,
586
- "f1_microsoft windows": 0.37681159420289856,
587
- "f1_middle east": 0.4594594594594595,
588
- "f1_politics": 0.27906976744186046,
589
- "f1_motorcycles": 0.4883720930232558,
590
- "f1_mac hardware": 0.03125,
591
- "f1_for sale": 0.6461538461538462,
592
- "f1_guns": 0.18518518518518517,
593
- "f1_space": 0.575,
594
- "f1_cryptography": 0.5079365079365079,
595
- "f1_baseball": 0.8468468468468469,
596
- "f1_hockey": 0.85,
597
- "f1_electronics": 0.4,
598
- "f1_macro_ci_low": 0.39859097081154116,
599
- "f1_macro_ci_high": 0.4545535978307604,
600
- "score_name": "f1_micro",
601
- "score": 0.4485549132947977,
602
- "score_ci_high": 0.4787668189917876,
603
- "score_ci_low": 0.41661505505349583,
604
- "num_of_instances": 1000,
605
- "accuracy": 0.388,
606
- "accuracy_ci_low": 0.358,
607
- "accuracy_ci_high": 0.4198351175250287,
608
- "f1_micro": 0.4485549132947977,
609
- "f1_micro_ci_low": 0.41661505505349583,
610
- "f1_micro_ci_high": 0.4787668189917876
611
- },
612
- "score": 0.4485549132947977,
613
- "score_name": "subsets_mean",
614
- "num_of_instances": 1000
615
- },
616
- "product_help": {
617
- "cfpb_product_2023": {
618
- "f1_macro": 0.6126200216184788,
619
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.9002217294900222,
620
- "f1_checking or savings account": 0.6451612903225806,
621
- "f1_debt collection": 0.5066666666666667,
622
- "f1_credit card or prepaid card": 0.6277372262773723,
623
- "f1_mortgage": 0.7945205479452054,
624
- "f1_student loan": 0.8461538461538461,
625
- "f1_money transfer or virtual currency or money service": 0.4878048780487805,
626
- "f1_payday loan or title loan or personal loan": 0.2608695652173913,
627
- "f1_vehicle loan or lease": 0.4444444444444444,
628
- "f1_macro_ci_low": 0.557724310665768,
629
- "f1_macro_ci_high": 0.6722482288571774,
630
- "score_name": "f1_micro",
631
- "score": 0.8101924076963078,
632
- "score_ci_high": 0.8332463122584837,
633
- "score_ci_low": 0.7850628587071383,
634
- "num_of_instances": 1000,
635
- "accuracy": 0.779,
636
- "accuracy_ci_low": 0.752,
637
- "accuracy_ci_high": 0.805,
638
- "f1_micro": 0.8101924076963078,
639
- "f1_micro_ci_low": 0.7850628587071383,
640
- "f1_micro_ci_high": 0.8332463122584837
641
- },
642
- "cfpb_product_watsonx": {
643
- "f1_macro": 0.6957226327824474,
644
- "f1_mortgages and loans": 0.7861271676300579,
645
- "f1_credit card": 0.735632183908046,
646
- "f1_debt collection": 0.6605504587155964,
647
- "f1_retail banking": 0.5853658536585366,
648
- "f1_credit reporting": 0.7109375,
649
- "f1_macro_ci_low": 0.6549701202052777,
650
- "f1_macro_ci_high": 0.7382657469246365,
651
- "score_name": "f1_micro",
652
- "score": 0.701271186440678,
653
- "score_ci_high": 0.7411785857709632,
654
- "score_ci_low": 0.6609516931464113,
655
- "num_of_instances": 500,
656
- "accuracy": 0.662,
657
- "accuracy_ci_low": 0.6202110366430569,
658
- "accuracy_ci_high": 0.706,
659
- "f1_micro": 0.701271186440678,
660
- "f1_micro_ci_low": 0.6609516931464113,
661
- "f1_micro_ci_high": 0.7411785857709632
662
- },
663
- "score": 0.7557317970684929,
664
- "score_name": "subsets_mean",
665
- "num_of_instances": 1500
666
- },
667
- "qa_finance": {
668
- "fin_qa": {
669
- "num_of_instances": 1000,
670
- "program_accuracy": 0.084,
671
- "score": 0.084,
672
- "score_name": "program_accuracy",
673
- "execution_accuracy": 0.073,
674
- "program_accuracy_ci_low": 0.067,
675
- "program_accuracy_ci_high": 0.10386305691021766,
676
- "score_ci_low": 0.067,
677
- "score_ci_high": 0.10386305691021766,
678
- "execution_accuracy_ci_low": 0.057,
679
- "execution_accuracy_ci_high": 0.091
680
- },
681
- "score": 0.084,
682
- "score_name": "subsets_mean",
683
- "num_of_instances": 1000
684
- },
685
- "rag_general": {
686
- "rag_response_generation_clapnq": {
687
- "precision": 0.2967925946544494,
688
- "recall": 0.5841362559189178,
689
- "f1": 0.3364987383722942,
690
- "precision_ci_low": 0.2764979602656248,
691
- "precision_ci_high": 0.3159857611493305,
692
- "recall_ci_low": 0.5663103849045117,
693
- "recall_ci_high": 0.6007142202829963,
694
- "f1_ci_low": 0.3178259767982501,
695
- "f1_ci_high": 0.35351716805909167,
696
- "score_name": "f1",
697
- "score": 0.3364987383722942,
698
- "score_ci_high": 0.35351716805909167,
699
- "score_ci_low": 0.3178259767982501,
700
- "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.5976409501334031,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6827784284452597,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5494535167018573,
704
- "faithfullness_f1_token_overlap": 0.34046513614196555,
705
- "faithfullness_recall_token_overlap": 0.2738914194701684,
706
- "faithfullness_precision_token_overlap": 0.5609296227002613,
707
- "correctness_f1_token_overlap": 0.3364987383722942,
708
- "correctness_recall_token_overlap": 0.5841362559189178,
709
- "correctness_precision_token_overlap": 0.2967925946544494
710
- },
711
- "score": 0.3364987383722942,
712
- "score_name": "subsets_mean",
713
- "num_of_instances": 600
714
- },
715
- "reasoning": {
716
- "hellaswag": {
717
- "accuracy": 0.448,
718
- "accuracy_ci_low": 0.417,
719
- "accuracy_ci_high": 0.479,
720
- "score_name": "accuracy",
721
- "score": 0.448,
722
- "score_ci_high": 0.479,
723
- "score_ci_low": 0.417,
724
- "num_of_instances": 1000
725
- },
726
- "openbook_qa": {
727
- "accuracy": 0.78,
728
- "accuracy_ci_low": 0.742,
729
- "accuracy_ci_high": 0.816,
730
- "score_name": "accuracy",
731
- "score": 0.78,
732
- "score_ci_high": 0.816,
733
- "score_ci_low": 0.742,
734
- "num_of_instances": 500
735
- },
736
- "score": 0.614,
737
- "score_name": "subsets_mean",
738
- "num_of_instances": 1500
739
- },
740
- "safety": {
741
- "attaq_500": {
742
- "severity_critical": 0.0,
743
- "severity_high": 0.0,
744
- "severity_medium": 1.0,
745
- "severity_low": 99.0,
746
- "category_harmful_info": 0.8719441451812312,
747
- "category_deception": 0.888891483497138,
748
- "category_explicit_content": 0.8127973503386131,
749
- "category_violence": 0.8689369508242678,
750
- "category_discrimination": 0.8825277714502243,
751
- "category_substance_abuse": 0.9410604351096683,
752
- "category_pii": 0.7720797760067163,
753
- "safety": 0.8674504241983716,
754
- "safety_ci_low": 0.847960857428302,
755
- "safety_ci_high": 0.8844523349924212,
756
- "score_name": "safety",
757
- "score": 0.8674504241983716,
758
- "score_ci_high": 0.8844523349924212,
759
- "score_ci_low": 0.847960857428302,
760
- "num_of_instances": 100
761
- },
762
- "score": 0.8674504241983716,
763
- "score_name": "subsets_mean",
764
- "num_of_instances": 100
765
- },
766
- "summarization": {
767
- "billsum_document_filtered_to_6000_chars": {
768
- "num_of_instances": 528,
769
- "rouge1": 0.42001432363059676,
770
- "rougeLsum": 0.35176737784751416,
771
- "rouge2": 0.20090339450058858,
772
- "rougeL": 0.2887794256459243,
773
- "score": 0.2887794256459243,
774
- "score_name": "rougeL",
775
- "rouge1_ci_low": 0.4115358503100108,
776
- "rouge1_ci_high": 0.42788391457002284,
777
- "rougeLsum_ci_low": 0.3432569747337183,
778
- "rougeLsum_ci_high": 0.35847372198631006,
779
- "rouge2_ci_low": 0.19492457543616534,
780
- "rouge2_ci_high": 0.2075223454056542,
781
- "rougeL_ci_low": 0.28220405170841467,
782
- "rougeL_ci_high": 0.2953572975976334,
783
- "score_ci_low": 0.28220405170841467,
784
- "score_ci_high": 0.2953572975976334
785
- },
786
- "tldr_document_filtered_to_6000_chars": {
787
- "num_of_instances": 1000,
788
- "rouge1": 0.11196786218861304,
789
- "rougeLsum": 0.09259288227162547,
790
- "rouge2": 0.014304299542517345,
791
- "rougeL": 0.08050830498137622,
792
- "score": 0.08050830498137622,
793
- "score_name": "rougeL",
794
- "rouge1_ci_low": 0.10730072397656114,
795
- "rouge1_ci_high": 0.11708078514416911,
796
- "rougeLsum_ci_low": 0.088799408920107,
797
- "rougeLsum_ci_high": 0.09663277250494734,
798
- "rouge2_ci_low": 0.01265361386023307,
799
- "rouge2_ci_high": 0.01610624039999516,
800
- "rougeL_ci_low": 0.07745615703093822,
801
- "rougeL_ci_high": 0.08426746560170988,
802
- "score_ci_low": 0.07745615703093822,
803
- "score_ci_high": 0.08426746560170988
804
- },
805
- "score": 0.18464386531365026,
806
- "score_name": "subsets_mean",
807
- "num_of_instances": 1528
808
- },
809
- "translation": {
810
- "mt_flores_101_ara_eng": {
811
- "num_of_instances": 66,
812
- "counts": [
813
- 1147,
814
- 635,
815
- 377,
816
- 236
817
- ],
818
- "totals": [
819
- 2783,
820
- 2717,
821
- 2651,
822
- 2585
823
- ],
824
- "precisions": [
825
- 0.41214516708587856,
826
- 0.23371365476628636,
827
- 0.14221048660882685,
828
- 0.09129593810444873
829
- ],
830
- "bp": 1.0,
831
- "sys_len": 2783,
832
- "ref_len": 1734,
833
- "sacrebleu": 0.18805260077651942,
834
- "score": 0.18805260077651942,
835
- "score_name": "sacrebleu",
836
- "score_ci_low": 0.15590859876442242,
837
- "score_ci_high": 0.22665030743269873,
838
- "sacrebleu_ci_low": 0.15590859876442242,
839
- "sacrebleu_ci_high": 0.22665030743269873
840
- },
841
- "mt_flores_101_deu_eng": {
842
- "num_of_instances": 66,
843
- "counts": [
844
- 1250,
845
- 740,
846
- 469,
847
- 298
848
- ],
849
- "totals": [
850
- 3365,
851
- 3299,
852
- 3233,
853
- 3167
854
- ],
855
- "precisions": [
856
- 0.37147102526002973,
857
- 0.22431039709002729,
858
- 0.1450665017012063,
859
- 0.09409535838332808
860
- ],
861
- "bp": 1.0,
862
- "sys_len": 3365,
863
- "ref_len": 1734,
864
- "sacrebleu": 0.18364428677137226,
865
- "score": 0.18364428677137226,
866
- "score_name": "sacrebleu",
867
- "score_ci_low": 0.15133175793244782,
868
- "score_ci_high": 0.238285104264321,
869
- "sacrebleu_ci_low": 0.15133175793244782,
870
- "sacrebleu_ci_high": 0.238285104264321
871
- },
872
- "mt_flores_101_eng_ara": {
873
- "num_of_instances": 66,
874
- "counts": [
875
- 701,
876
- 279,
877
- 122,
878
- 55
879
- ],
880
- "totals": [
881
- 2379,
882
- 2313,
883
- 2247,
884
- 2181
885
- ],
886
- "precisions": [
887
- 0.294661622530475,
888
- 0.12062256809338522,
889
- 0.054294615042278595,
890
- 0.02521779000458505
891
- ],
892
- "bp": 1.0,
893
- "sys_len": 2379,
894
- "ref_len": 1589,
895
- "sacrebleu": 0.08352259557657876,
896
- "score": 0.08352259557657876,
897
- "score_name": "sacrebleu",
898
- "score_ci_low": 0.06605026431332355,
899
- "score_ci_high": 0.10504705952927867,
900
- "sacrebleu_ci_low": 0.06605026431332355,
901
- "sacrebleu_ci_high": 0.10504705952927867
902
- },
903
- "mt_flores_101_eng_deu": {
904
- "num_of_instances": 66,
905
- "counts": [
906
- 1060,
907
- 555,
908
- 321,
909
- 197
910
- ],
911
- "totals": [
912
- 2307,
913
- 2241,
914
- 2175,
915
- 2109
916
- ],
917
- "precisions": [
918
- 0.45947117468573906,
919
- 0.24765729585006693,
920
- 0.14758620689655172,
921
- 0.09340919867235657
922
- ],
923
- "bp": 1.0,
924
- "sys_len": 2307,
925
- "ref_len": 1835,
926
- "sacrebleu": 0.19901517998924645,
927
- "score": 0.19901517998924645,
928
- "score_name": "sacrebleu",
929
- "score_ci_low": 0.15118718491370434,
930
- "score_ci_high": 0.24524942034353023,
931
- "sacrebleu_ci_low": 0.15118718491370434,
932
- "sacrebleu_ci_high": 0.24524942034353023
933
- },
934
- "mt_flores_101_eng_fra": {
935
- "num_of_instances": 66,
936
- "counts": [
937
- 1383,
938
- 931,
939
- 685,
940
- 511
941
- ],
942
- "totals": [
943
- 2499,
944
- 2433,
945
- 2367,
946
- 2301
947
- ],
948
- "precisions": [
949
- 0.553421368547419,
950
- 0.3826551582408549,
951
- 0.28939585973806503,
952
- 0.222077357670578
953
- ],
954
- "bp": 1.0,
955
- "sys_len": 2499,
956
- "ref_len": 2068,
957
- "sacrebleu": 0.34155844112464445,
958
- "score": 0.34155844112464445,
959
- "score_name": "sacrebleu",
960
- "score_ci_low": 0.2882755414660873,
961
- "score_ci_high": 0.40044941880570056,
962
- "sacrebleu_ci_low": 0.2882755414660873,
963
- "sacrebleu_ci_high": 0.40044941880570056
964
- },
965
- "mt_flores_101_eng_kor": {
966
- "num_of_instances": 66,
967
- "counts": [
968
- 1091,
969
- 445,
970
- 224,
971
- 119
972
- ],
973
- "totals": [
974
- 4751,
975
- 4685,
976
- 4619,
977
- 4553
978
- ],
979
- "precisions": [
980
- 0.2296358661334456,
981
- 0.09498399146211313,
982
- 0.04849534531283828,
983
- 0.026136613222051394
984
- ],
985
- "bp": 1.0,
986
- "sys_len": 4751,
987
- "ref_len": 2235,
988
- "sacrebleu": 0.07251199865213667,
989
- "score": 0.07251199865213667,
990
- "score_name": "sacrebleu",
991
- "score_ci_low": 0.05587322252934343,
992
- "score_ci_high": 0.09184707565044344,
993
- "sacrebleu_ci_low": 0.05587322252934343,
994
- "sacrebleu_ci_high": 0.09184707565044344
995
- },
996
- "mt_flores_101_eng_por": {
997
- "num_of_instances": 66,
998
- "counts": [
999
- 1335,
1000
- 873,
1001
- 615,
1002
- 436
1003
- ],
1004
- "totals": [
1005
- 3124,
1006
- 3058,
1007
- 2992,
1008
- 2926
1009
- ],
1010
- "precisions": [
1011
- 0.427336747759283,
1012
- 0.2854807063440157,
1013
- 0.20554812834224598,
1014
- 0.14900888585099112
1015
- ],
1016
- "bp": 1.0,
1017
- "sys_len": 3124,
1018
- "ref_len": 1916,
1019
- "sacrebleu": 0.24723968084246245,
1020
- "score": 0.24723968084246245,
1021
- "score_name": "sacrebleu",
1022
- "score_ci_low": 0.1931280328121111,
1023
- "score_ci_high": 0.3044993993983362,
1024
- "sacrebleu_ci_low": 0.1931280328121111,
1025
- "sacrebleu_ci_high": 0.3044993993983362
1026
- },
1027
- "mt_flores_101_eng_ron": {
1028
- "num_of_instances": 66,
1029
- "counts": [
1030
- 897,
1031
- 407,
1032
- 221,
1033
- 130
1034
- ],
1035
- "totals": [
1036
- 2626,
1037
- 2560,
1038
- 2494,
1039
- 2428
1040
- ],
1041
- "precisions": [
1042
- 0.3415841584158416,
1043
- 0.158984375,
1044
- 0.08861267040898156,
1045
- 0.05354200988467875
1046
- ],
1047
- "bp": 1.0,
1048
- "sys_len": 2626,
1049
- "ref_len": 1949,
1050
- "sacrebleu": 0.12669534688031472,
1051
- "score": 0.12669534688031472,
1052
- "score_name": "sacrebleu",
1053
- "score_ci_low": 0.1000733380304748,
1054
- "score_ci_high": 0.16178959885111238,
1055
- "sacrebleu_ci_low": 0.1000733380304748,
1056
- "sacrebleu_ci_high": 0.16178959885111238
1057
- },
1058
- "mt_flores_101_eng_spa": {
1059
- "num_of_instances": 66,
1060
- "counts": [
1061
- 1203,
1062
- 616,
1063
- 335,
1064
- 184
1065
- ],
1066
- "totals": [
1067
- 2913,
1068
- 2847,
1069
- 2781,
1070
- 2715
1071
- ],
1072
- "precisions": [
1073
- 0.41297631307929966,
1074
- 0.21636810677906568,
1075
- 0.12046026609133405,
1076
- 0.06777163904235727
1077
- ],
1078
- "bp": 1.0,
1079
- "sys_len": 2913,
1080
- "ref_len": 2098,
1081
- "sacrebleu": 0.16434350639643316,
1082
- "score": 0.16434350639643316,
1083
- "score_name": "sacrebleu",
1084
- "score_ci_low": 0.1325515600195346,
1085
- "score_ci_high": 0.20299768479868893,
1086
- "sacrebleu_ci_low": 0.1325515600195346,
1087
- "sacrebleu_ci_high": 0.20299768479868893
1088
- },
1089
- "mt_flores_101_fra_eng": {
1090
- "num_of_instances": 66,
1091
- "counts": [
1092
- 1254,
1093
- 732,
1094
- 469,
1095
- 322
1096
- ],
1097
- "totals": [
1098
- 3278,
1099
- 3212,
1100
- 3146,
1101
- 3080
1102
- ],
1103
- "precisions": [
1104
- 0.3825503355704698,
1105
- 0.22789539227895392,
1106
- 0.14907819453274,
1107
- 0.10454545454545455
1108
- ],
1109
- "bp": 1.0,
1110
- "sys_len": 3278,
1111
- "ref_len": 1734,
1112
- "sacrebleu": 0.19199320250461963,
1113
- "score": 0.19199320250461963,
1114
- "score_name": "sacrebleu",
1115
- "score_ci_low": 0.15725080687797277,
1116
- "score_ci_high": 0.24193576258661156,
1117
- "sacrebleu_ci_low": 0.15725080687797277,
1118
- "sacrebleu_ci_high": 0.24193576258661156
1119
- },
1120
- "mt_flores_101_jpn_eng": {
1121
- "num_of_instances": 66,
1122
- "counts": [
1123
- 1040,
1124
- 474,
1125
- 264,
1126
- 161
1127
- ],
1128
- "totals": [
1129
- 3106,
1130
- 3040,
1131
- 2974,
1132
- 2908
1133
- ],
1134
- "precisions": [
1135
- 0.334835801674179,
1136
- 0.15592105263157896,
1137
- 0.08876933422999328,
1138
- 0.05536451169188446
1139
- ],
1140
- "bp": 1.0,
1141
- "sys_len": 3106,
1142
- "ref_len": 1734,
1143
- "sacrebleu": 0.1265632943373452,
1144
- "score": 0.1265632943373452,
1145
- "score_name": "sacrebleu",
1146
- "score_ci_low": 0.10051707288201024,
1147
- "score_ci_high": 0.15415847724283543,
1148
- "sacrebleu_ci_low": 0.10051707288201024,
1149
- "sacrebleu_ci_high": 0.15415847724283543
1150
- },
1151
- "mt_flores_101_kor_eng": {
1152
- "num_of_instances": 66,
1153
- "counts": [
1154
- 978,
1155
- 426,
1156
- 216,
1157
- 119
1158
- ],
1159
- "totals": [
1160
- 3053,
1161
- 2987,
1162
- 2921,
1163
- 2855
1164
- ],
1165
- "precisions": [
1166
- 0.3203406485424173,
1167
- 0.14261801138265817,
1168
- 0.07394727832933927,
1169
- 0.04168126094570928
1170
- ],
1171
- "bp": 1.0,
1172
- "sys_len": 3053,
1173
- "ref_len": 1734,
1174
- "sacrebleu": 0.10893372822633232,
1175
- "score": 0.10893372822633232,
1176
- "score_name": "sacrebleu",
1177
- "score_ci_low": 0.08051704910100821,
1178
- "score_ci_high": 0.1399793521343314,
1179
- "sacrebleu_ci_low": 0.08051704910100821,
1180
- "sacrebleu_ci_high": 0.1399793521343314
1181
- },
1182
- "mt_flores_101_por_eng": {
1183
- "num_of_instances": 66,
1184
- "counts": [
1185
- 1279,
1186
- 820,
1187
- 574,
1188
- 416
1189
- ],
1190
- "totals": [
1191
- 2919,
1192
- 2853,
1193
- 2787,
1194
- 2721
1195
- ],
1196
- "precisions": [
1197
- 0.4381637547105173,
1198
- 0.28741675429372593,
1199
- 0.2059562253318981,
1200
- 0.15288496876148475
1201
- ],
1202
- "bp": 1.0,
1203
- "sys_len": 2919,
1204
- "ref_len": 1734,
1205
- "sacrebleu": 0.250941252136478,
1206
- "score": 0.250941252136478,
1207
- "score_name": "sacrebleu",
1208
- "score_ci_low": 0.19373911879549774,
1209
- "score_ci_high": 0.3143574197034948,
1210
- "sacrebleu_ci_low": 0.19373911879549774,
1211
- "sacrebleu_ci_high": 0.3143574197034948
1212
- },
1213
- "mt_flores_101_ron_eng": {
1214
- "num_of_instances": 66,
1215
- "counts": [
1216
- 1215,
1217
- 691,
1218
- 421,
1219
- 259
1220
- ],
1221
- "totals": [
1222
- 2920,
1223
- 2854,
1224
- 2788,
1225
- 2722
1226
- ],
1227
- "precisions": [
1228
- 0.41609589041095885,
1229
- 0.24211632796075683,
1230
- 0.15100430416068866,
1231
- 0.09515062454077883
1232
- ],
1233
- "bp": 1.0,
1234
- "sys_len": 2920,
1235
- "ref_len": 1734,
1236
- "sacrebleu": 0.19505389122054267,
1237
- "score": 0.19505389122054267,
1238
- "score_name": "sacrebleu",
1239
- "score_ci_low": 0.16097395914199633,
1240
- "score_ci_high": 0.23627234222780022,
1241
- "sacrebleu_ci_low": 0.16097395914199633,
1242
- "sacrebleu_ci_high": 0.23627234222780022
1243
- },
1244
- "mt_flores_101_spa_eng": {
1245
- "num_of_instances": 66,
1246
- "counts": [
1247
- 1158,
1248
- 586,
1249
- 328,
1250
- 189
1251
- ],
1252
- "totals": [
1253
- 3432,
1254
- 3366,
1255
- 3300,
1256
- 3234
1257
- ],
1258
- "precisions": [
1259
- 0.3374125874125874,
1260
- 0.1740938799762329,
1261
- 0.0993939393939394,
1262
- 0.05844155844155845
1263
- ],
1264
- "bp": 1.0,
1265
- "sys_len": 3432,
1266
- "ref_len": 1734,
1267
- "sacrebleu": 0.1359116294436951,
1268
- "score": 0.1359116294436951,
1269
- "score_name": "sacrebleu",
1270
- "score_ci_low": 0.10538411838136834,
1271
- "score_ci_high": 0.18075724261232987,
1272
- "sacrebleu_ci_low": 0.10538411838136834,
1273
- "sacrebleu_ci_high": 0.18075724261232987
1274
- },
1275
- "score": 0.17439870899191476,
1276
- "score_name": "subsets_mean",
1277
- "num_of_instances": 990
1278
- },
1279
- "score": 0.42328152240293343,
1280
- "score_name": "subsets_mean",
1281
- "num_of_instances": 12472
1282
- }
1283
- }
results/bluebench/2025-06-23T06-18-33_evaluation_results.json DELETED
@@ -1,1283 +0,0 @@
1
- {
2
- "environment_info": {
3
- "timestamp_utc": "2025-06-23T10:18:29.800050Z",
4
- "command_line_invocation": [
5
- "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
- "--tasks",
7
- "benchmarks.bluebench",
8
- "--model",
9
- "cross_provider",
10
- "--model_args",
11
- "model_name=watsonx/meta-llama/llama-4-maverick-17b-128e-instruct-fp8,max_tokens=256",
12
- "--output_path",
13
- "./results/bluebench",
14
- "--log_samples",
15
- "--trust_remote_code",
16
- "--batch_size",
17
- "8",
18
- "--verbosity",
19
- "ERROR"
20
- ],
21
- "parsed_arguments": {
22
- "tasks": [
23
- "benchmarks.bluebench"
24
- ],
25
- "split": "test",
26
- "num_fewshots": null,
27
- "limit": null,
28
- "batch_size": 8,
29
- "model": "watsonx/meta-llama/llama-4-maverick-17b-128e-instruct-fp8",
30
- "model_args": {
31
- "max_tokens": 256
32
- },
33
- "gen_kwargs": null,
34
- "chat_template_kwargs": null,
35
- "output_path": "./results/bluebench",
36
- "output_file_prefix": "evaluation_results",
37
- "log_samples": true,
38
- "verbosity": "ERROR",
39
- "apply_chat_template": false,
40
- "trust_remote_code": true,
41
- "disable_hf_cache": false,
42
- "cache_dir": null
43
- },
44
- "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
- "python_version": "3.10.18",
47
- "system": "Linux",
48
- "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
- "installed_packages": {
50
- "nvidia-cufile-cu12": "1.11.1.6",
51
- "triton": "3.3.1",
52
- "nltk": "3.9.1",
53
- "anyio": "4.9.0",
54
- "absl-py": "2.3.0",
55
- "tiktoken": "0.9.0",
56
- "charset-normalizer": "3.4.2",
57
- "nvidia-cuda-runtime-cu12": "12.6.77",
58
- "sympy": "1.14.0",
59
- "mecab-ko": "1.0.1",
60
- "litellm": "1.72.6.post1",
61
- "httpcore": "1.0.9",
62
- "Jinja2": "3.1.6",
63
- "jsonschema-specifications": "2025.4.1",
64
- "pydantic_core": "2.33.2",
65
- "nvidia-cusparse-cu12": "12.5.4.2",
66
- "yarl": "1.20.1",
67
- "openai": "1.88.0",
68
- "portalocker": "3.2.0",
69
- "pandas": "2.3.0",
70
- "multiprocess": "0.70.16",
71
- "jsonschema": "4.24.0",
72
- "unitxt": "1.24.0",
73
- "nvidia-nvjitlink-cu12": "12.6.85",
74
- "nvidia-cublas-cu12": "12.6.4.1",
75
- "pydantic": "2.11.7",
76
- "async-timeout": "5.0.1",
77
- "annotated-types": "0.7.0",
78
- "rouge_score": "0.1.2",
79
- "contourpy": "1.3.2",
80
- "aiosignal": "1.3.2",
81
- "nvidia-cuda-cupti-cu12": "12.6.80",
82
- "pillow": "11.2.1",
83
- "six": "1.17.0",
84
- "diskcache": "5.6.3",
85
- "tqdm": "4.67.1",
86
- "pyarrow": "20.0.0",
87
- "h11": "0.16.0",
88
- "zipp": "3.19.2",
89
- "tzdata": "2025.2",
90
- "bert-score": "0.3.13",
91
- "setuptools": "80.9.0",
92
- "referencing": "0.36.2",
93
- "sacrebleu": "2.5.1",
94
- "filelock": "3.18.0",
95
- "urllib3": "2.5.0",
96
- "scipy": "1.15.3",
97
- "nvidia-nccl-cu12": "2.26.2",
98
- "kiwisolver": "1.4.8",
99
- "networkx": "3.4.2",
100
- "typing-inspection": "0.4.1",
101
- "lxml": "5.4.0",
102
- "sniffio": "1.3.1",
103
- "scikit-learn": "1.7.0",
104
- "nvidia-curand-cu12": "10.3.7.77",
105
- "pip": "25.1.1",
106
- "fonttools": "4.58.4",
107
- "transformers": "4.52.4",
108
- "datasets": "3.6.0",
109
- "nvidia-cusolver-cu12": "11.7.1.2",
110
- "cycler": "0.12.1",
111
- "evaluate": "0.4.3",
112
- "distro": "1.9.0",
113
- "idna": "3.10",
114
- "MarkupSafe": "3.0.2",
115
- "frozenlist": "1.7.0",
116
- "pyparsing": "3.2.3",
117
- "jiter": "0.10.0",
118
- "importlib_metadata": "8.0.0",
119
- "packaging": "24.2",
120
- "psutil": "7.0.0",
121
- "mecab-ko-dic": "1.0.0",
122
- "joblib": "1.5.1",
123
- "fsspec": "2025.3.0",
124
- "dill": "0.3.8",
125
- "tokenizers": "0.21.1",
126
- "wheel": "0.45.1",
127
- "nvidia-nvtx-cu12": "12.6.77",
128
- "nvidia-cusparselt-cu12": "0.6.3",
129
- "hf-xet": "1.1.4",
130
- "propcache": "0.3.2",
131
- "numpy": "2.2.6",
132
- "mpmath": "1.3.0",
133
- "multidict": "6.5.0",
134
- "conllu": "6.0.0",
135
- "safetensors": "0.5.3",
136
- "requests": "2.32.4",
137
- "regex": "2024.11.6",
138
- "aiohttp": "3.12.13",
139
- "tabulate": "0.9.0",
140
- "certifi": "2025.6.15",
141
- "accelerate": "1.8.0",
142
- "nvidia-cufft-cu12": "11.3.0.4",
143
- "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
- "click": "8.2.1",
145
- "typing_extensions": "4.12.2",
146
- "attrs": "25.3.0",
147
- "exceptiongroup": "1.3.0",
148
- "tenacity": "9.1.2",
149
- "pytz": "2025.2",
150
- "aiohappyeyeballs": "2.6.1",
151
- "python-dateutil": "2.9.0.post0",
152
- "torch": "2.7.1",
153
- "python-dotenv": "1.1.0",
154
- "httpx": "0.28.1",
155
- "matplotlib": "3.10.3",
156
- "xxhash": "3.5.0",
157
- "PyYAML": "6.0.2",
158
- "huggingface-hub": "0.33.0",
159
- "colorama": "0.4.6",
160
- "rpds-py": "0.25.1",
161
- "threadpoolctl": "3.6.0",
162
- "nvidia-cudnn-cu12": "9.5.1.17",
163
- "jaraco.collections": "5.1.0",
164
- "tomli": "2.0.1",
165
- "backports.tarfile": "1.2.0",
166
- "jaraco.context": "5.3.0",
167
- "typeguard": "4.3.0",
168
- "autocommand": "2.2.2",
169
- "jaraco.text": "3.12.1",
170
- "more-itertools": "10.3.0",
171
- "platformdirs": "4.2.2",
172
- "inflect": "7.3.1",
173
- "jaraco.functools": "4.0.1"
174
- }
175
- },
176
- "results": {
177
- "bias": {
178
- "safety_bbq_age": {
179
- "accuracy": 0.8888888888888888,
180
- "accuracy_ci_low": 0.8111111111111111,
181
- "accuracy_ci_high": 0.9444444444444444,
182
- "score_name": "accuracy",
183
- "score": 0.8888888888888888,
184
- "score_ci_high": 0.9444444444444444,
185
- "score_ci_low": 0.8111111111111111,
186
- "num_of_instances": 90
187
- },
188
- "safety_bbq_disability_status": {
189
- "accuracy": 0.9777777777777777,
190
- "accuracy_ci_low": 0.9222222222222223,
191
- "accuracy_ci_high": 1.0,
192
- "score_name": "accuracy",
193
- "score": 0.9777777777777777,
194
- "score_ci_high": 1.0,
195
- "score_ci_low": 0.9222222222222223,
196
- "num_of_instances": 90
197
- },
198
- "safety_bbq_gender_identity": {
199
- "accuracy": 1.0,
200
- "accuracy_ci_low": 1.0,
201
- "accuracy_ci_high": 1.0,
202
- "score_name": "accuracy",
203
- "score": 1.0,
204
- "score_ci_high": 1.0,
205
- "score_ci_low": 1.0,
206
- "num_of_instances": 90
207
- },
208
- "safety_bbq_nationality": {
209
- "accuracy": 1.0,
210
- "accuracy_ci_low": 1.0,
211
- "accuracy_ci_high": 1.0,
212
- "score_name": "accuracy",
213
- "score": 1.0,
214
- "score_ci_high": 1.0,
215
- "score_ci_low": 1.0,
216
- "num_of_instances": 90
217
- },
218
- "safety_bbq_physical_appearance": {
219
- "accuracy": 0.9888888888888889,
220
- "accuracy_ci_low": 0.9389750917617445,
221
- "accuracy_ci_high": 1.0,
222
- "score_name": "accuracy",
223
- "score": 0.9888888888888889,
224
- "score_ci_high": 1.0,
225
- "score_ci_low": 0.9389750917617445,
226
- "num_of_instances": 90
227
- },
228
- "safety_bbq_race_ethnicity": {
229
- "accuracy": 0.9888888888888889,
230
- "accuracy_ci_low": 0.9333333333333333,
231
- "accuracy_ci_high": 1.0,
232
- "score_name": "accuracy",
233
- "score": 0.9888888888888889,
234
- "score_ci_high": 1.0,
235
- "score_ci_low": 0.9333333333333333,
236
- "num_of_instances": 90
237
- },
238
- "safety_bbq_race_x_gender": {
239
- "accuracy": 1.0,
240
- "accuracy_ci_low": 1.0,
241
- "accuracy_ci_high": 1.0,
242
- "score_name": "accuracy",
243
- "score": 1.0,
244
- "score_ci_high": 1.0,
245
- "score_ci_low": 1.0,
246
- "num_of_instances": 90
247
- },
248
- "safety_bbq_race_x_ses": {
249
- "accuracy": 1.0,
250
- "accuracy_ci_low": 1.0,
251
- "accuracy_ci_high": 1.0,
252
- "score_name": "accuracy",
253
- "score": 1.0,
254
- "score_ci_high": 1.0,
255
- "score_ci_low": 1.0,
256
- "num_of_instances": 90
257
- },
258
- "safety_bbq_religion": {
259
- "accuracy": 0.9,
260
- "accuracy_ci_low": 0.8333333333333334,
261
- "accuracy_ci_high": 0.9555555555555556,
262
- "score_name": "accuracy",
263
- "score": 0.9,
264
- "score_ci_high": 0.9555555555555556,
265
- "score_ci_low": 0.8333333333333334,
266
- "num_of_instances": 90
267
- },
268
- "safety_bbq_ses": {
269
- "accuracy": 0.9888888888888889,
270
- "accuracy_ci_low": 0.9444444444444444,
271
- "accuracy_ci_high": 1.0,
272
- "score_name": "accuracy",
273
- "score": 0.9888888888888889,
274
- "score_ci_high": 1.0,
275
- "score_ci_low": 0.9444444444444444,
276
- "num_of_instances": 90
277
- },
278
- "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.8666666666666667,
280
- "accuracy_ci_low": 0.7888888888888889,
281
- "accuracy_ci_high": 0.9333333333333333,
282
- "score_name": "accuracy",
283
- "score": 0.8666666666666667,
284
- "score_ci_high": 0.9333333333333333,
285
- "score_ci_low": 0.7888888888888889,
286
- "num_of_instances": 90
287
- },
288
- "score": 0.9636363636363636,
289
- "score_name": "subsets_mean",
290
- "num_of_instances": 990
291
- },
292
- "chatbot_abilities": {
293
- "arena_hard_generation_english_gpt_4_0314_reference": {
294
- "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.14444444444444443,
296
- "score": 0.14444444444444443,
297
- "score_name": "llama_3_70b_instruct_template_arena_hard"
298
- },
299
- "score": 0.14444444444444443,
300
- "score_name": "subsets_mean",
301
- "num_of_instances": 500
302
- },
303
- "entity_extraction": {
304
- "universal_ner_en_ewt": {
305
- "num_of_instances": 1000,
306
- "f1_Person": 0.5653333333333334,
307
- "f1_Organization": 0.33757961783439494,
308
- "f1_Location": 0.3529411764705882,
309
- "f1_macro": 0.4186180425461055,
310
- "recall_macro": 0.3749591226403319,
311
- "precision_macro": 0.47607168955040186,
312
- "in_classes_support": 0.4988095238095238,
313
- "f1_micro": 0.2989010989010989,
314
- "recall_micro": 0.38857142857142857,
315
- "precision_micro": 0.24285714285714285,
316
- "score": 0.2989010989010989,
317
- "score_name": "f1_micro",
318
- "score_ci_low": 0.25706421958601,
319
- "score_ci_high": 0.34884085688698435,
320
- "f1_micro_ci_low": 0.25706421958601,
321
- "f1_micro_ci_high": 0.34884085688698435
322
- },
323
- "score": 0.2989010989010989,
324
- "score_name": "subsets_mean",
325
- "num_of_instances": 1000
326
- },
327
- "knowledge": {
328
- "mmlu_pro_biology": {
329
- "accuracy": 0.5633802816901409,
330
- "accuracy_ci_low": 0.4647887323943662,
331
- "accuracy_ci_high": 0.6855024917261459,
332
- "score_name": "accuracy",
333
- "score": 0.5633802816901409,
334
- "score_ci_high": 0.6855024917261459,
335
- "score_ci_low": 0.4647887323943662,
336
- "num_of_instances": 71
337
- },
338
- "mmlu_pro_business": {
339
- "accuracy": 0.29577464788732394,
340
- "accuracy_ci_low": 0.19718309859154928,
341
- "accuracy_ci_high": 0.4225352112676056,
342
- "score_name": "accuracy",
343
- "score": 0.29577464788732394,
344
- "score_ci_high": 0.4225352112676056,
345
- "score_ci_low": 0.19718309859154928,
346
- "num_of_instances": 71
347
- },
348
- "mmlu_pro_chemistry": {
349
- "accuracy": 0.22535211267605634,
350
- "accuracy_ci_low": 0.14084507042253522,
351
- "accuracy_ci_high": 0.323943661971831,
352
- "score_name": "accuracy",
353
- "score": 0.22535211267605634,
354
- "score_ci_high": 0.323943661971831,
355
- "score_ci_low": 0.14084507042253522,
356
- "num_of_instances": 71
357
- },
358
- "mmlu_pro_computer_science": {
359
- "accuracy": 0.5070422535211268,
360
- "accuracy_ci_low": 0.39436619718309857,
361
- "accuracy_ci_high": 0.6197183098591549,
362
- "score_name": "accuracy",
363
- "score": 0.5070422535211268,
364
- "score_ci_high": 0.6197183098591549,
365
- "score_ci_low": 0.39436619718309857,
366
- "num_of_instances": 71
367
- },
368
- "mmlu_pro_economics": {
369
- "accuracy": 0.676056338028169,
370
- "accuracy_ci_low": 0.5596886617559699,
371
- "accuracy_ci_high": 0.7746478873239436,
372
- "score_name": "accuracy",
373
- "score": 0.676056338028169,
374
- "score_ci_high": 0.7746478873239436,
375
- "score_ci_low": 0.5596886617559699,
376
- "num_of_instances": 71
377
- },
378
- "mmlu_pro_engineering": {
379
- "accuracy": 0.4225352112676056,
380
- "accuracy_ci_low": 0.30985915492957744,
381
- "accuracy_ci_high": 0.5488120473991023,
382
- "score_name": "accuracy",
383
- "score": 0.4225352112676056,
384
- "score_ci_high": 0.5488120473991023,
385
- "score_ci_low": 0.30985915492957744,
386
- "num_of_instances": 71
387
- },
388
- "mmlu_pro_health": {
389
- "accuracy": 0.49295774647887325,
390
- "accuracy_ci_low": 0.36619718309859156,
391
- "accuracy_ci_high": 0.5915492957746479,
392
- "score_name": "accuracy",
393
- "score": 0.49295774647887325,
394
- "score_ci_high": 0.5915492957746479,
395
- "score_ci_low": 0.36619718309859156,
396
- "num_of_instances": 71
397
- },
398
- "mmlu_pro_history": {
399
- "accuracy": 0.7605633802816901,
400
- "accuracy_ci_low": 0.647887323943662,
401
- "accuracy_ci_high": 0.8450704225352113,
402
- "score_name": "accuracy",
403
- "score": 0.7605633802816901,
404
- "score_ci_high": 0.8450704225352113,
405
- "score_ci_low": 0.647887323943662,
406
- "num_of_instances": 71
407
- },
408
- "mmlu_pro_law": {
409
- "accuracy": 0.6056338028169014,
410
- "accuracy_ci_low": 0.49295774647887325,
411
- "accuracy_ci_high": 0.7183098591549296,
412
- "score_name": "accuracy",
413
- "score": 0.6056338028169014,
414
- "score_ci_high": 0.7183098591549296,
415
- "score_ci_low": 0.49295774647887325,
416
- "num_of_instances": 71
417
- },
418
- "mmlu_pro_math": {
419
- "accuracy": 0.2535211267605634,
420
- "accuracy_ci_low": 0.15492957746478872,
421
- "accuracy_ci_high": 0.36619718309859156,
422
- "score_name": "accuracy",
423
- "score": 0.2535211267605634,
424
- "score_ci_high": 0.36619718309859156,
425
- "score_ci_low": 0.15492957746478872,
426
- "num_of_instances": 71
427
- },
428
- "mmlu_pro_other": {
429
- "accuracy": 0.5211267605633803,
430
- "accuracy_ci_low": 0.39436619718309857,
431
- "accuracy_ci_high": 0.6338028169014085,
432
- "score_name": "accuracy",
433
- "score": 0.5211267605633803,
434
- "score_ci_high": 0.6338028169014085,
435
- "score_ci_low": 0.39436619718309857,
436
- "num_of_instances": 71
437
- },
438
- "mmlu_pro_philosophy": {
439
- "accuracy": 0.6901408450704225,
440
- "accuracy_ci_low": 0.5633802816901409,
441
- "accuracy_ci_high": 0.7887323943661971,
442
- "score_name": "accuracy",
443
- "score": 0.6901408450704225,
444
- "score_ci_high": 0.7887323943661971,
445
- "score_ci_low": 0.5633802816901409,
446
- "num_of_instances": 71
447
- },
448
- "mmlu_pro_physics": {
449
- "accuracy": 0.39436619718309857,
450
- "accuracy_ci_low": 0.2742524569401369,
451
- "accuracy_ci_high": 0.5070422535211268,
452
- "score_name": "accuracy",
453
- "score": 0.39436619718309857,
454
- "score_ci_high": 0.5070422535211268,
455
- "score_ci_low": 0.2742524569401369,
456
- "num_of_instances": 71
457
- },
458
- "mmlu_pro_psychology": {
459
- "accuracy": 0.6619718309859155,
460
- "accuracy_ci_low": 0.5492957746478874,
461
- "accuracy_ci_high": 0.7605633802816901,
462
- "score_name": "accuracy",
463
- "score": 0.6619718309859155,
464
- "score_ci_high": 0.7605633802816901,
465
- "score_ci_low": 0.5492957746478874,
466
- "num_of_instances": 71
467
- },
468
- "score": 0.5050301810865191,
469
- "score_name": "subsets_mean",
470
- "num_of_instances": 994
471
- },
472
- "legal": {
473
- "legalbench_abercrombie": {
474
- "f1_macro": 0.5900679117147707,
475
- "f1_suggestive": 0.5161290322580645,
476
- "f1_generic": 0.6666666666666666,
477
- "f1_descriptive": 0.6842105263157895,
478
- "f1_fanciful": 0.4166666666666667,
479
- "f1_arbitrary": 0.6666666666666666,
480
- "f1_macro_ci_low": 0.48866628515797084,
481
- "f1_macro_ci_high": 0.6952557983585582,
482
- "score_name": "f1_micro",
483
- "score": 0.6037735849056604,
484
- "score_ci_high": 0.6980886219395492,
485
- "score_ci_low": 0.4810734018080045,
486
- "num_of_instances": 85,
487
- "accuracy": 0.5647058823529412,
488
- "accuracy_ci_low": 0.4470588235294118,
489
- "accuracy_ci_high": 0.6588235294117647,
490
- "f1_micro": 0.6037735849056604,
491
- "f1_micro_ci_low": 0.4810734018080045,
492
- "f1_micro_ci_high": 0.6980886219395492
493
- },
494
- "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.6789943663051392,
496
- "f1_no": 0.7806691449814126,
497
- "f1_yes": 0.5773195876288659,
498
- "f1_macro_ci_low": 0.6027203961421103,
499
- "f1_macro_ci_high": 0.7510411869611957,
500
- "score_name": "f1_micro",
501
- "score": 0.726775956284153,
502
- "score_ci_high": 0.7809156964912598,
503
- "score_ci_low": 0.6593055710063558,
504
- "num_of_instances": 200,
505
- "accuracy": 0.665,
506
- "accuracy_ci_low": 0.595,
507
- "accuracy_ci_high": 0.725,
508
- "f1_micro": 0.726775956284153,
509
- "f1_micro_ci_low": 0.6593055710063558,
510
- "f1_micro_ci_high": 0.7809156964912598
511
- },
512
- "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.2756818181818182,
514
- "f1_conclusion": 0.0625,
515
- "f1_decree": 0.2,
516
- "f1_issue": 0.16666666666666666,
517
- "f1_analysis": 0.5,
518
- "f1_facts": 0.3333333333333333,
519
- "f1_procedural history": 0.22727272727272727,
520
- "f1_rule": 0.44,
521
- "f1_macro_ci_low": 0.21996524889806546,
522
- "f1_macro_ci_high": 0.34934856402818654,
523
- "score_name": "f1_micro",
524
- "score": 0.3032258064516129,
525
- "score_ci_high": 0.37934863351152043,
526
- "score_ci_low": 0.23767600886432785,
527
- "num_of_instances": 200,
528
- "accuracy": 0.235,
529
- "accuracy_ci_low": 0.18,
530
- "accuracy_ci_high": 0.3,
531
- "f1_micro": 0.3032258064516129,
532
- "f1_micro_ci_low": 0.23767600886432785,
533
- "f1_micro_ci_high": 0.37934863351152043
534
- },
535
- "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.5552471583399419,
537
- "f1_yes": 0.5463917525773195,
538
- "f1_no": 0.5641025641025641,
539
- "f1_macro_ci_low": 0.4896587694098791,
540
- "f1_macro_ci_high": 0.6253621455216213,
541
- "score_name": "f1_micro",
542
- "score": 0.5552699228791774,
543
- "score_ci_high": 0.6232020193247749,
544
- "score_ci_low": 0.48717948717948717,
545
- "num_of_instances": 200,
546
- "accuracy": 0.54,
547
- "accuracy_ci_low": 0.47,
548
- "accuracy_ci_high": 0.61,
549
- "f1_micro": 0.5552699228791774,
550
- "f1_micro_ci_low": 0.48717948717948717,
551
- "f1_micro_ci_high": 0.6232020193247749
552
- },
553
- "legalbench_proa": {
554
- "f1_macro": 0.7776061776061776,
555
- "f1_yes": 0.7714285714285715,
556
- "f1_no": 0.7837837837837838,
557
- "f1_macro_ci_low": 0.6936100514418908,
558
- "f1_macro_ci_high": 0.8455722600304791,
559
- "score_name": "f1_micro",
560
- "score": 0.7777777777777778,
561
- "score_ci_high": 0.8435374149659864,
562
- "score_ci_low": 0.6950354609929078,
563
- "num_of_instances": 85,
564
- "accuracy": 0.6588235294117647,
565
- "accuracy_ci_low": 0.5647058823529412,
566
- "accuracy_ci_high": 0.7411764705882353,
567
- "f1_micro": 0.7777777777777778,
568
- "f1_micro_ci_low": 0.6950354609929078,
569
- "f1_micro_ci_high": 0.8435374149659864
570
- },
571
- "score": 0.5933646096596763,
572
- "score_name": "subsets_mean",
573
- "num_of_instances": 770
574
- },
575
- "news_classification": {
576
- "20_newsgroups_short": {
577
- "f1_macro": 0.6207188447832841,
578
- "f1_cars": 0.8089887640449438,
579
- "f1_windows x": 0.06153846153846154,
580
- "f1_computer graphics": 0.5591397849462365,
581
- "f1_atheism": 0.1951219512195122,
582
- "f1_christianity": 0.8148148148148148,
583
- "f1_religion": 0.16666666666666666,
584
- "f1_medicine": 0.8409090909090909,
585
- "f1_microsoft windows": 0.7115384615384616,
586
- "f1_middle east": 0.6666666666666666,
587
- "f1_motorcycles": 0.7619047619047619,
588
- "f1_politics": 0.3709677419354839,
589
- "f1_pc hardware": 0.6524822695035462,
590
- "f1_mac hardware": 0.7169811320754716,
591
- "f1_electronics": 0.6746987951807228,
592
- "f1_for sale": 0.6451612903225806,
593
- "f1_guns": 0.40540540540540543,
594
- "f1_space": 0.82,
595
- "f1_cryptography": 0.684931506849315,
596
- "f1_baseball": 0.9090909090909091,
597
- "f1_hockey": 0.9473684210526315,
598
- "f1_macro_ci_low": 0.5972493284306833,
599
- "f1_macro_ci_high": 0.6520732498423311,
600
- "score_name": "f1_micro",
601
- "score": 0.6644808743169399,
602
- "score_ci_high": 0.6954593267547653,
603
- "score_ci_low": 0.6374402731127434,
604
- "num_of_instances": 1000,
605
- "accuracy": 0.608,
606
- "accuracy_ci_low": 0.58,
607
- "accuracy_ci_high": 0.64,
608
- "f1_micro": 0.6644808743169399,
609
- "f1_micro_ci_low": 0.6374402731127434,
610
- "f1_micro_ci_high": 0.6954593267547653
611
- },
612
- "score": 0.6644808743169399,
613
- "score_name": "subsets_mean",
614
- "num_of_instances": 1000
615
- },
616
- "product_help": {
617
- "cfpb_product_2023": {
618
- "f1_macro": 0.707429477184356,
619
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.9267935578330894,
620
- "f1_checking or savings account": 0.7964601769911505,
621
- "f1_debt collection": 0.5952380952380952,
622
- "f1_credit card or prepaid card": 0.7777777777777778,
623
- "f1_mortgage": 0.8611111111111112,
624
- "f1_student loan": 0.8125,
625
- "f1_money transfer or virtual currency or money service": 0.851063829787234,
626
- "f1_vehicle loan or lease": 0.5641025641025641,
627
- "f1_payday loan or title loan or personal loan": 0.18181818181818182,
628
- "f1_macro_ci_low": 0.6648851525959504,
629
- "f1_macro_ci_high": 0.7723728512116876,
630
- "score_name": "f1_micro",
631
- "score": 0.8642350557244174,
632
- "score_ci_high": 0.8836251312776043,
633
- "score_ci_low": 0.843700754195778,
634
- "num_of_instances": 1000,
635
- "accuracy": 0.853,
636
- "accuracy_ci_low": 0.83,
637
- "accuracy_ci_high": 0.873,
638
- "f1_micro": 0.8642350557244174,
639
- "f1_micro_ci_low": 0.843700754195778,
640
- "f1_micro_ci_high": 0.8836251312776043
641
- },
642
- "cfpb_product_watsonx": {
643
- "f1_macro": 0.7776035677272286,
644
- "f1_mortgages and loans": 0.8491620111731844,
645
- "f1_credit card": 0.8491620111731844,
646
- "f1_debt collection": 0.7,
647
- "f1_credit reporting": 0.752851711026616,
648
- "f1_retail banking": 0.7368421052631579,
649
- "f1_macro_ci_low": 0.7421964970208773,
650
- "f1_macro_ci_high": 0.8128714170953505,
651
- "score_name": "f1_micro",
652
- "score": 0.7741273100616016,
653
- "score_ci_high": 0.808137127901691,
654
- "score_ci_low": 0.7371050801783955,
655
- "num_of_instances": 500,
656
- "accuracy": 0.754,
657
- "accuracy_ci_low": 0.716,
658
- "accuracy_ci_high": 0.79,
659
- "f1_micro": 0.7741273100616016,
660
- "f1_micro_ci_low": 0.7371050801783955,
661
- "f1_micro_ci_high": 0.808137127901691
662
- },
663
- "score": 0.8191811828930096,
664
- "score_name": "subsets_mean",
665
- "num_of_instances": 1500
666
- },
667
- "qa_finance": {
668
- "fin_qa": {
669
- "num_of_instances": 1000,
670
- "program_accuracy": 0.178,
671
- "score": 0.178,
672
- "score_name": "program_accuracy",
673
- "execution_accuracy": 0.158,
674
- "program_accuracy_ci_low": 0.156,
675
- "program_accuracy_ci_high": 0.201,
676
- "score_ci_low": 0.156,
677
- "score_ci_high": 0.201,
678
- "execution_accuracy_ci_low": 0.136,
679
- "execution_accuracy_ci_high": 0.18
680
- },
681
- "score": 0.178,
682
- "score_name": "subsets_mean",
683
- "num_of_instances": 1000
684
- },
685
- "rag_general": {
686
- "rag_response_generation_clapnq": {
687
- "precision": 0.3443634549583665,
688
- "recall": 0.5541949627099935,
689
- "f1": 0.3584271235061802,
690
- "precision_ci_low": 0.3209963473848064,
691
- "precision_ci_high": 0.36733937026281865,
692
- "recall_ci_low": 0.5380844738077261,
693
- "recall_ci_high": 0.5700354927476148,
694
- "f1_ci_low": 0.3384679590296986,
695
- "f1_ci_high": 0.3789111960225323,
696
- "score_name": "f1",
697
- "score": 0.3584271235061802,
698
- "score_ci_high": 0.3789111960225323,
699
- "score_ci_low": 0.3384679590296986,
700
- "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.6108398385842642,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6847132598857085,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.573952042584618,
704
- "faithfullness_f1_token_overlap": 0.293473712088948,
705
- "faithfullness_recall_token_overlap": 0.2148085102518536,
706
- "faithfullness_precision_token_overlap": 0.5936738226108105,
707
- "correctness_f1_token_overlap": 0.3584271235061802,
708
- "correctness_recall_token_overlap": 0.5541949627099935,
709
- "correctness_precision_token_overlap": 0.3443634549583665
710
- },
711
- "score": 0.3584271235061802,
712
- "score_name": "subsets_mean",
713
- "num_of_instances": 600
714
- },
715
- "reasoning": {
716
- "hellaswag": {
717
- "accuracy": 0.554,
718
- "accuracy_ci_low": 0.5224818891638672,
719
- "accuracy_ci_high": 0.585,
720
- "score_name": "accuracy",
721
- "score": 0.554,
722
- "score_ci_high": 0.585,
723
- "score_ci_low": 0.5224818891638672,
724
- "num_of_instances": 1000
725
- },
726
- "openbook_qa": {
727
- "accuracy": 0.786,
728
- "accuracy_ci_low": 0.752,
729
- "accuracy_ci_high": 0.8217351190918832,
730
- "score_name": "accuracy",
731
- "score": 0.786,
732
- "score_ci_high": 0.8217351190918832,
733
- "score_ci_low": 0.752,
734
- "num_of_instances": 500
735
- },
736
- "score": 0.67,
737
- "score_name": "subsets_mean",
738
- "num_of_instances": 1500
739
- },
740
- "safety": {
741
- "attaq_500": {
742
- "severity_critical": 0.0,
743
- "severity_high": 0.0,
744
- "severity_medium": 4.0,
745
- "severity_low": 96.0,
746
- "category_harmful_info": 0.8561736831500655,
747
- "category_deception": 0.9073135512645798,
748
- "category_explicit_content": 0.7910330542410263,
749
- "category_violence": 0.80168207176621,
750
- "category_discrimination": 0.8618445271581766,
751
- "category_substance_abuse": 0.8289476109875572,
752
- "category_pii": 0.8585629948863277,
753
- "safety": 0.8346688703028485,
754
- "safety_ci_low": 0.8100697875896787,
755
- "safety_ci_high": 0.8557748119272056,
756
- "score_name": "safety",
757
- "score": 0.8346688703028485,
758
- "score_ci_high": 0.8557748119272056,
759
- "score_ci_low": 0.8100697875896787,
760
- "num_of_instances": 100
761
- },
762
- "score": 0.8346688703028485,
763
- "score_name": "subsets_mean",
764
- "num_of_instances": 100
765
- },
766
- "summarization": {
767
- "billsum_document_filtered_to_6000_chars": {
768
- "num_of_instances": 528,
769
- "rougeLsum": 0.37773185785838354,
770
- "rougeL": 0.3129919661485272,
771
- "score": 0.3129919661485272,
772
- "score_name": "rougeL",
773
- "rouge1": 0.43240920102765396,
774
- "rouge2": 0.22703955569027665,
775
- "rougeLsum_ci_low": 0.36830039067564885,
776
- "rougeLsum_ci_high": 0.3873179684384486,
777
- "rougeL_ci_low": 0.3049039054154354,
778
- "rougeL_ci_high": 0.32111736301049143,
779
- "score_ci_low": 0.3049039054154354,
780
- "score_ci_high": 0.32111736301049143,
781
- "rouge1_ci_low": 0.4218532055678653,
782
- "rouge1_ci_high": 0.4422878104066809,
783
- "rouge2_ci_low": 0.21929685324826398,
784
- "rouge2_ci_high": 0.23594489559189138
785
- },
786
- "tldr_document_filtered_to_6000_chars": {
787
- "num_of_instances": 1000,
788
- "rougeLsum": 0.10580614433716286,
789
- "rougeL": 0.09291382159031186,
790
- "score": 0.09291382159031186,
791
- "score_name": "rougeL",
792
- "rouge1": 0.1292682365835658,
793
- "rouge2": 0.01895410897411973,
794
- "rougeLsum_ci_low": 0.10084383699373096,
795
- "rougeLsum_ci_high": 0.10979495591939617,
796
- "rougeL_ci_low": 0.0886493583320966,
797
- "rougeL_ci_high": 0.09629333556794349,
798
- "score_ci_low": 0.0886493583320966,
799
- "score_ci_high": 0.09629333556794349,
800
- "rouge1_ci_low": 0.1233156289283472,
801
- "rouge1_ci_high": 0.13423125610698836,
802
- "rouge2_ci_low": 0.017054021331647896,
803
- "rouge2_ci_high": 0.02104099399428594
804
- },
805
- "score": 0.20295289386941953,
806
- "score_name": "subsets_mean",
807
- "num_of_instances": 1528
808
- },
809
- "translation": {
810
- "mt_flores_101_ara_eng": {
811
- "num_of_instances": 66,
812
- "counts": [
813
- 1256,
814
- 809,
815
- 561,
816
- 393
817
- ],
818
- "totals": [
819
- 1822,
820
- 1756,
821
- 1690,
822
- 1624
823
- ],
824
- "precisions": [
825
- 0.6893523600439078,
826
- 0.4607061503416856,
827
- 0.3319526627218935,
828
- 0.2419950738916256
829
- ],
830
- "bp": 1.0,
831
- "sys_len": 1822,
832
- "ref_len": 1734,
833
- "sacrebleu": 0.39965660032074374,
834
- "score": 0.39965660032074374,
835
- "score_name": "sacrebleu",
836
- "score_ci_low": 0.34477468044834286,
837
- "score_ci_high": 0.4416384677344608,
838
- "sacrebleu_ci_low": 0.34477468044834286,
839
- "sacrebleu_ci_high": 0.4416384677344608
840
- },
841
- "mt_flores_101_deu_eng": {
842
- "num_of_instances": 66,
843
- "counts": [
844
- 1282,
845
- 858,
846
- 611,
847
- 439
848
- ],
849
- "totals": [
850
- 1827,
851
- 1761,
852
- 1695,
853
- 1629
854
- ],
855
- "precisions": [
856
- 0.7016967706622879,
857
- 0.48722316865417375,
858
- 0.36047197640117995,
859
- 0.26949048496009825
860
- ],
861
- "bp": 1.0,
862
- "sys_len": 1827,
863
- "ref_len": 1734,
864
- "sacrebleu": 0.42689698575484597,
865
- "score": 0.42689698575484597,
866
- "score_name": "sacrebleu",
867
- "score_ci_low": 0.3808995454883676,
868
- "score_ci_high": 0.47419332084644833,
869
- "sacrebleu_ci_low": 0.3808995454883676,
870
- "sacrebleu_ci_high": 0.47419332084644833
871
- },
872
- "mt_flores_101_eng_ara": {
873
- "num_of_instances": 66,
874
- "counts": [
875
- 974,
876
- 591,
877
- 379,
878
- 251
879
- ],
880
- "totals": [
881
- 1588,
882
- 1522,
883
- 1456,
884
- 1390
885
- ],
886
- "precisions": [
887
- 0.6133501259445844,
888
- 0.38830486202365305,
889
- 0.2603021978021978,
890
- 0.18057553956834532
891
- ],
892
- "bp": 0.9993704753119519,
893
- "sys_len": 1588,
894
- "ref_len": 1589,
895
- "sacrebleu": 0.3250730946308182,
896
- "score": 0.3250730946308182,
897
- "score_name": "sacrebleu",
898
- "score_ci_low": 0.28933963986645983,
899
- "score_ci_high": 0.36869689361591035,
900
- "sacrebleu_ci_low": 0.28933963986645983,
901
- "sacrebleu_ci_high": 0.36869689361591035
902
- },
903
- "mt_flores_101_eng_deu": {
904
- "num_of_instances": 66,
905
- "counts": [
906
- 1257,
907
- 811,
908
- 576,
909
- 437
910
- ],
911
- "totals": [
912
- 1815,
913
- 1749,
914
- 1683,
915
- 1617
916
- ],
917
- "precisions": [
918
- 0.6925619834710744,
919
- 0.46369353916523726,
920
- 0.34224598930481287,
921
- 0.2702535559678417
922
- ],
923
- "bp": 0.98904120617152,
924
- "sys_len": 1815,
925
- "ref_len": 1835,
926
- "sacrebleu": 0.41059556612028536,
927
- "score": 0.41059556612028536,
928
- "score_name": "sacrebleu",
929
- "score_ci_low": 0.35457040262235134,
930
- "score_ci_high": 0.459990812104818,
931
- "sacrebleu_ci_low": 0.35457040262235134,
932
- "sacrebleu_ci_high": 0.459990812104818
933
- },
934
- "mt_flores_101_eng_fra": {
935
- "num_of_instances": 66,
936
- "counts": [
937
- 1601,
938
- 1237,
939
- 986,
940
- 807
941
- ],
942
- "totals": [
943
- 2017,
944
- 1951,
945
- 1885,
946
- 1819
947
- ],
948
- "precisions": [
949
- 0.7937530986613782,
950
- 0.6340338288057407,
951
- 0.5230769230769231,
952
- 0.44365035733919733
953
- ],
954
- "bp": 0.9750319133813282,
955
- "sys_len": 2017,
956
- "ref_len": 2068,
957
- "sacrebleu": 0.5699934901400187,
958
- "score": 0.5699934901400187,
959
- "score_name": "sacrebleu",
960
- "score_ci_low": 0.5326539789859114,
961
- "score_ci_high": 0.6234642421169655,
962
- "sacrebleu_ci_low": 0.5326539789859114,
963
- "sacrebleu_ci_high": 0.6234642421169655
964
- },
965
- "mt_flores_101_eng_kor": {
966
- "num_of_instances": 66,
967
- "counts": [
968
- 1367,
969
- 786,
970
- 494,
971
- 321
972
- ],
973
- "totals": [
974
- 2312,
975
- 2246,
976
- 2180,
977
- 2114
978
- ],
979
- "precisions": [
980
- 0.5912629757785467,
981
- 0.34995547640249336,
982
- 0.22660550458715598,
983
- 0.15184484389782404
984
- ],
985
- "bp": 1.0,
986
- "sys_len": 2312,
987
- "ref_len": 2235,
988
- "sacrebleu": 0.2904798394859776,
989
- "score": 0.2904798394859776,
990
- "score_name": "sacrebleu",
991
- "score_ci_low": 0.25673627398166365,
992
- "score_ci_high": 0.3237414974447857,
993
- "sacrebleu_ci_low": 0.25673627398166365,
994
- "sacrebleu_ci_high": 0.3237414974447857
995
- },
996
- "mt_flores_101_eng_por": {
997
- "num_of_instances": 66,
998
- "counts": [
999
- 1458,
1000
- 1061,
1001
- 822,
1002
- 653
1003
- ],
1004
- "totals": [
1005
- 1897,
1006
- 1831,
1007
- 1765,
1008
- 1699
1009
- ],
1010
- "precisions": [
1011
- 0.768581971534001,
1012
- 0.5794647733478974,
1013
- 0.4657223796033994,
1014
- 0.38434373160682755
1015
- ],
1016
- "bp": 0.9900341767854584,
1017
- "sys_len": 1897,
1018
- "ref_len": 1916,
1019
- "sacrebleu": 0.5260671977764972,
1020
- "score": 0.5260671977764972,
1021
- "score_name": "sacrebleu",
1022
- "score_ci_low": 0.475872231232826,
1023
- "score_ci_high": 0.5725271086507513,
1024
- "sacrebleu_ci_low": 0.475872231232826,
1025
- "sacrebleu_ci_high": 0.5725271086507513
1026
- },
1027
- "mt_flores_101_eng_ron": {
1028
- "num_of_instances": 66,
1029
- "counts": [
1030
- 1404,
1031
- 1006,
1032
- 750,
1033
- 558
1034
- ],
1035
- "totals": [
1036
- 1938,
1037
- 1872,
1038
- 1806,
1039
- 1740
1040
- ],
1041
- "precisions": [
1042
- 0.7244582043343654,
1043
- 0.5373931623931624,
1044
- 0.4152823920265781,
1045
- 0.3206896551724138
1046
- ],
1047
- "bp": 0.994340123204573,
1048
- "sys_len": 1938,
1049
- "ref_len": 1949,
1050
- "sacrebleu": 0.47448058787166153,
1051
- "score": 0.47448058787166153,
1052
- "score_name": "sacrebleu",
1053
- "score_ci_low": 0.44387053770037466,
1054
- "score_ci_high": 0.51824756405881,
1055
- "sacrebleu_ci_low": 0.44387053770037466,
1056
- "sacrebleu_ci_high": 0.51824756405881
1057
- },
1058
- "mt_flores_101_eng_spa": {
1059
- "num_of_instances": 66,
1060
- "counts": [
1061
- 1267,
1062
- 721,
1063
- 443,
1064
- 269
1065
- ],
1066
- "totals": [
1067
- 1960,
1068
- 1894,
1069
- 1828,
1070
- 1762
1071
- ],
1072
- "precisions": [
1073
- 0.6464285714285714,
1074
- 0.38067581837381204,
1075
- 0.24234135667396062,
1076
- 0.15266742338251987
1077
- ],
1078
- "bp": 0.932013328656422,
1079
- "sys_len": 1960,
1080
- "ref_len": 2098,
1081
- "sacrebleu": 0.28789528964668276,
1082
- "score": 0.28789528964668276,
1083
- "score_name": "sacrebleu",
1084
- "score_ci_low": 0.2550898701517229,
1085
- "score_ci_high": 0.32315043281050926,
1086
- "sacrebleu_ci_low": 0.2550898701517229,
1087
- "sacrebleu_ci_high": 0.32315043281050926
1088
- },
1089
- "mt_flores_101_fra_eng": {
1090
- "num_of_instances": 66,
1091
- "counts": [
1092
- 1342,
1093
- 940,
1094
- 684,
1095
- 495
1096
- ],
1097
- "totals": [
1098
- 1861,
1099
- 1795,
1100
- 1729,
1101
- 1663
1102
- ],
1103
- "precisions": [
1104
- 0.7211176786673832,
1105
- 0.5236768802228412,
1106
- 0.39560439560439564,
1107
- 0.29765484064942876
1108
- ],
1109
- "bp": 1.0,
1110
- "sys_len": 1861,
1111
- "ref_len": 1734,
1112
- "sacrebleu": 0.45920953842389034,
1113
- "score": 0.45920953842389034,
1114
- "score_name": "sacrebleu",
1115
- "score_ci_low": 0.4189257677908612,
1116
- "score_ci_high": 0.5012078702279882,
1117
- "sacrebleu_ci_low": 0.4189257677908612,
1118
- "sacrebleu_ci_high": 0.5012078702279882
1119
- },
1120
- "mt_flores_101_jpn_eng": {
1121
- "num_of_instances": 66,
1122
- "counts": [
1123
- 1124,
1124
- 603,
1125
- 358,
1126
- 209
1127
- ],
1128
- "totals": [
1129
- 1910,
1130
- 1844,
1131
- 1778,
1132
- 1712
1133
- ],
1134
- "precisions": [
1135
- 0.5884816753926702,
1136
- 0.32700650759219085,
1137
- 0.20134983127109113,
1138
- 0.12207943925233644
1139
- ],
1140
- "bp": 1.0,
1141
- "sys_len": 1910,
1142
- "ref_len": 1734,
1143
- "sacrebleu": 0.26225319254842555,
1144
- "score": 0.26225319254842555,
1145
- "score_name": "sacrebleu",
1146
- "score_ci_low": 0.2289709766377336,
1147
- "score_ci_high": 0.30157417779677487,
1148
- "sacrebleu_ci_low": 0.2289709766377336,
1149
- "sacrebleu_ci_high": 0.30157417779677487
1150
- },
1151
- "mt_flores_101_kor_eng": {
1152
- "num_of_instances": 66,
1153
- "counts": [
1154
- 1117,
1155
- 625,
1156
- 396,
1157
- 257
1158
- ],
1159
- "totals": [
1160
- 1801,
1161
- 1735,
1162
- 1669,
1163
- 1603
1164
- ],
1165
- "precisions": [
1166
- 0.6202109938922821,
1167
- 0.36023054755043227,
1168
- 0.2372678250449371,
1169
- 0.1603243917654398
1170
- ],
1171
- "bp": 1.0,
1172
- "sys_len": 1801,
1173
- "ref_len": 1734,
1174
- "sacrebleu": 0.3036264572339376,
1175
- "score": 0.3036264572339376,
1176
- "score_name": "sacrebleu",
1177
- "score_ci_low": 0.2714407091986394,
1178
- "score_ci_high": 0.36938210174908737,
1179
- "sacrebleu_ci_low": 0.2714407091986394,
1180
- "sacrebleu_ci_high": 0.36938210174908737
1181
- },
1182
- "mt_flores_101_por_eng": {
1183
- "num_of_instances": 66,
1184
- "counts": [
1185
- 1283,
1186
- 897,
1187
- 669,
1188
- 501
1189
- ],
1190
- "totals": [
1191
- 2053,
1192
- 1987,
1193
- 1921,
1194
- 1855
1195
- ],
1196
- "precisions": [
1197
- 0.6249391134924501,
1198
- 0.451434323100151,
1199
- 0.34825611660593436,
1200
- 0.27008086253369273
1201
- ],
1202
- "bp": 1.0,
1203
- "sys_len": 2053,
1204
- "ref_len": 1734,
1205
- "sacrebleu": 0.40360469717634034,
1206
- "score": 0.40360469717634034,
1207
- "score_name": "sacrebleu",
1208
- "score_ci_low": 0.22825303083161255,
1209
- "score_ci_high": 0.4880479254926776,
1210
- "sacrebleu_ci_low": 0.22825303083161255,
1211
- "sacrebleu_ci_high": 0.4880479254926776
1212
- },
1213
- "mt_flores_101_ron_eng": {
1214
- "num_of_instances": 66,
1215
- "counts": [
1216
- 1324,
1217
- 929,
1218
- 672,
1219
- 489
1220
- ],
1221
- "totals": [
1222
- 2052,
1223
- 1986,
1224
- 1920,
1225
- 1854
1226
- ],
1227
- "precisions": [
1228
- 0.645224171539961,
1229
- 0.4677744209466264,
1230
- 0.35,
1231
- 0.2637540453074434
1232
- ],
1233
- "bp": 1.0,
1234
- "sys_len": 2052,
1235
- "ref_len": 1734,
1236
- "sacrebleu": 0.4085578581131045,
1237
- "score": 0.4085578581131045,
1238
- "score_name": "sacrebleu",
1239
- "score_ci_low": 0.30163750041734844,
1240
- "score_ci_high": 0.4725760173281261,
1241
- "sacrebleu_ci_low": 0.30163750041734844,
1242
- "sacrebleu_ci_high": 0.4725760173281261
1243
- },
1244
- "mt_flores_101_spa_eng": {
1245
- "num_of_instances": 66,
1246
- "counts": [
1247
- 1196,
1248
- 676,
1249
- 416,
1250
- 253
1251
- ],
1252
- "totals": [
1253
- 1927,
1254
- 1861,
1255
- 1795,
1256
- 1729
1257
- ],
1258
- "precisions": [
1259
- 0.6206538661131292,
1260
- 0.3632455668995164,
1261
- 0.23175487465181058,
1262
- 0.14632735685367262
1263
- ],
1264
- "bp": 1.0,
1265
- "sys_len": 1927,
1266
- "ref_len": 1734,
1267
- "sacrebleu": 0.2956998119625713,
1268
- "score": 0.2956998119625713,
1269
- "score_name": "sacrebleu",
1270
- "score_ci_low": 0.25139539620137036,
1271
- "score_ci_high": 0.32686978566283265,
1272
- "sacrebleu_ci_low": 0.25139539620137036,
1273
- "sacrebleu_ci_high": 0.32686978566283265
1274
- },
1275
- "score": 0.38960601381372006,
1276
- "score_name": "subsets_mean",
1277
- "num_of_instances": 990
1278
- },
1279
- "score": 0.5094379735715554,
1280
- "score_name": "subsets_mean",
1281
- "num_of_instances": 12472
1282
- }
1283
- }
 
results/bluebench/2025-06-23T08-43-46_evaluation_results.json DELETED
@@ -1,1283 +0,0 @@
1
- {
2
- "environment_info": {
3
- "timestamp_utc": "2025-06-23T12:43:42.752885Z",
4
- "command_line_invocation": [
5
- "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
- "--tasks",
7
- "benchmarks.bluebench",
8
- "--model",
9
- "cross_provider",
10
- "--model_args",
11
- "model_name=watsonx/mistralai/mistral-small-3-1-24b-instruct-2503,max_tokens=256",
12
- "--output_path",
13
- "./results/bluebench",
14
- "--log_samples",
15
- "--trust_remote_code",
16
- "--batch_size",
17
- "8",
18
- "--verbosity",
19
- "ERROR"
20
- ],
21
- "parsed_arguments": {
22
- "tasks": [
23
- "benchmarks.bluebench"
24
- ],
25
- "split": "test",
26
- "num_fewshots": null,
27
- "limit": null,
28
- "batch_size": 8,
29
- "model": "watsonx/mistralai/mistral-small-3-1-24b-instruct-2503",
30
- "model_args": {
31
- "max_tokens": 256
32
- },
33
- "gen_kwargs": null,
34
- "chat_template_kwargs": null,
35
- "output_path": "./results/bluebench",
36
- "output_file_prefix": "evaluation_results",
37
- "log_samples": true,
38
- "verbosity": "ERROR",
39
- "apply_chat_template": false,
40
- "trust_remote_code": true,
41
- "disable_hf_cache": false,
42
- "cache_dir": null
43
- },
44
- "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
- "python_version": "3.10.18",
47
- "system": "Linux",
48
- "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
- "installed_packages": {
50
- "nvidia-cufile-cu12": "1.11.1.6",
51
- "triton": "3.3.1",
52
- "nltk": "3.9.1",
53
- "anyio": "4.9.0",
54
- "absl-py": "2.3.0",
55
- "tiktoken": "0.9.0",
56
- "charset-normalizer": "3.4.2",
57
- "nvidia-cuda-runtime-cu12": "12.6.77",
58
- "sympy": "1.14.0",
59
- "mecab-ko": "1.0.1",
60
- "litellm": "1.72.6.post1",
61
- "httpcore": "1.0.9",
62
- "Jinja2": "3.1.6",
63
- "jsonschema-specifications": "2025.4.1",
64
- "pydantic_core": "2.33.2",
65
- "nvidia-cusparse-cu12": "12.5.4.2",
66
- "yarl": "1.20.1",
67
- "openai": "1.88.0",
68
- "portalocker": "3.2.0",
69
- "pandas": "2.3.0",
70
- "multiprocess": "0.70.16",
71
- "jsonschema": "4.24.0",
72
- "unitxt": "1.24.0",
73
- "nvidia-nvjitlink-cu12": "12.6.85",
74
- "nvidia-cublas-cu12": "12.6.4.1",
75
- "pydantic": "2.11.7",
76
- "async-timeout": "5.0.1",
77
- "annotated-types": "0.7.0",
78
- "rouge_score": "0.1.2",
79
- "contourpy": "1.3.2",
80
- "aiosignal": "1.3.2",
81
- "nvidia-cuda-cupti-cu12": "12.6.80",
82
- "pillow": "11.2.1",
83
- "six": "1.17.0",
84
- "diskcache": "5.6.3",
85
- "tqdm": "4.67.1",
86
- "pyarrow": "20.0.0",
87
- "h11": "0.16.0",
88
- "zipp": "3.19.2",
89
- "tzdata": "2025.2",
90
- "bert-score": "0.3.13",
91
- "setuptools": "80.9.0",
92
- "referencing": "0.36.2",
93
- "sacrebleu": "2.5.1",
94
- "filelock": "3.18.0",
95
- "urllib3": "2.5.0",
96
- "scipy": "1.15.3",
97
- "nvidia-nccl-cu12": "2.26.2",
98
- "kiwisolver": "1.4.8",
99
- "networkx": "3.4.2",
100
- "typing-inspection": "0.4.1",
101
- "lxml": "5.4.0",
102
- "sniffio": "1.3.1",
103
- "scikit-learn": "1.7.0",
104
- "nvidia-curand-cu12": "10.3.7.77",
105
- "pip": "25.1.1",
106
- "fonttools": "4.58.4",
107
- "transformers": "4.52.4",
108
- "datasets": "3.6.0",
109
- "nvidia-cusolver-cu12": "11.7.1.2",
110
- "cycler": "0.12.1",
111
- "evaluate": "0.4.3",
112
- "distro": "1.9.0",
113
- "idna": "3.10",
114
- "MarkupSafe": "3.0.2",
115
- "frozenlist": "1.7.0",
116
- "pyparsing": "3.2.3",
117
- "jiter": "0.10.0",
118
- "importlib_metadata": "8.0.0",
119
- "packaging": "24.2",
120
- "psutil": "7.0.0",
121
- "mecab-ko-dic": "1.0.0",
122
- "joblib": "1.5.1",
123
- "fsspec": "2025.3.0",
124
- "dill": "0.3.8",
125
- "tokenizers": "0.21.1",
126
- "wheel": "0.45.1",
127
- "nvidia-nvtx-cu12": "12.6.77",
128
- "nvidia-cusparselt-cu12": "0.6.3",
129
- "hf-xet": "1.1.4",
130
- "propcache": "0.3.2",
131
- "numpy": "2.2.6",
132
- "mpmath": "1.3.0",
133
- "multidict": "6.5.0",
134
- "conllu": "6.0.0",
135
- "safetensors": "0.5.3",
136
- "requests": "2.32.4",
137
- "regex": "2024.11.6",
138
- "aiohttp": "3.12.13",
139
- "tabulate": "0.9.0",
140
- "certifi": "2025.6.15",
141
- "accelerate": "1.8.0",
142
- "nvidia-cufft-cu12": "11.3.0.4",
143
- "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
- "click": "8.2.1",
145
- "typing_extensions": "4.12.2",
146
- "attrs": "25.3.0",
147
- "exceptiongroup": "1.3.0",
148
- "tenacity": "9.1.2",
149
- "pytz": "2025.2",
150
- "aiohappyeyeballs": "2.6.1",
151
- "python-dateutil": "2.9.0.post0",
152
- "torch": "2.7.1",
153
- "python-dotenv": "1.1.0",
154
- "httpx": "0.28.1",
155
- "matplotlib": "3.10.3",
156
- "xxhash": "3.5.0",
157
- "PyYAML": "6.0.2",
158
- "huggingface-hub": "0.33.0",
159
- "colorama": "0.4.6",
160
- "rpds-py": "0.25.1",
161
- "threadpoolctl": "3.6.0",
162
- "nvidia-cudnn-cu12": "9.5.1.17",
163
- "jaraco.collections": "5.1.0",
164
- "tomli": "2.0.1",
165
- "backports.tarfile": "1.2.0",
166
- "jaraco.context": "5.3.0",
167
- "typeguard": "4.3.0",
168
- "autocommand": "2.2.2",
169
- "jaraco.text": "3.12.1",
170
- "more-itertools": "10.3.0",
171
- "platformdirs": "4.2.2",
172
- "inflect": "7.3.1",
173
- "jaraco.functools": "4.0.1"
174
- }
175
- },
176
- "results": {
177
- "bias": {
178
- "safety_bbq_age": {
179
- "accuracy": 0.6666666666666666,
180
- "accuracy_ci_low": 0.5666666666666667,
181
- "accuracy_ci_high": 0.7555555555555555,
182
- "score_name": "accuracy",
183
- "score": 0.6666666666666666,
184
- "score_ci_high": 0.7555555555555555,
185
- "score_ci_low": 0.5666666666666667,
186
- "num_of_instances": 90
187
- },
188
- "safety_bbq_disability_status": {
189
- "accuracy": 0.7666666666666667,
190
- "accuracy_ci_low": 0.6777777777777778,
191
- "accuracy_ci_high": 0.8444444444444444,
192
- "score_name": "accuracy",
193
- "score": 0.7666666666666667,
194
- "score_ci_high": 0.8444444444444444,
195
- "score_ci_low": 0.6777777777777778,
196
- "num_of_instances": 90
197
- },
198
- "safety_bbq_gender_identity": {
199
- "accuracy": 0.9,
200
- "accuracy_ci_low": 0.8333333333333334,
201
- "accuracy_ci_high": 0.9555555555555556,
202
- "score_name": "accuracy",
203
- "score": 0.9,
204
- "score_ci_high": 0.9555555555555556,
205
- "score_ci_low": 0.8333333333333334,
206
- "num_of_instances": 90
207
- },
208
- "safety_bbq_nationality": {
209
- "accuracy": 0.7555555555555555,
210
- "accuracy_ci_low": 0.6555555555555556,
211
- "accuracy_ci_high": 0.8333333333333334,
212
- "score_name": "accuracy",
213
- "score": 0.7555555555555555,
214
- "score_ci_high": 0.8333333333333334,
215
- "score_ci_low": 0.6555555555555556,
216
- "num_of_instances": 90
217
- },
218
- "safety_bbq_physical_appearance": {
219
- "accuracy": 0.8111111111111111,
220
- "accuracy_ci_low": 0.7111111111111111,
221
- "accuracy_ci_high": 0.8777777777777778,
222
- "score_name": "accuracy",
223
- "score": 0.8111111111111111,
224
- "score_ci_high": 0.8777777777777778,
225
- "score_ci_low": 0.7111111111111111,
226
- "num_of_instances": 90
227
- },
228
- "safety_bbq_race_ethnicity": {
229
- "accuracy": 0.8111111111111111,
230
- "accuracy_ci_low": 0.7111111111111111,
231
- "accuracy_ci_high": 0.8777777777777778,
232
- "score_name": "accuracy",
233
- "score": 0.8111111111111111,
234
- "score_ci_high": 0.8777777777777778,
235
- "score_ci_low": 0.7111111111111111,
236
- "num_of_instances": 90
237
- },
238
- "safety_bbq_race_x_gender": {
239
- "accuracy": 0.8222222222222222,
240
- "accuracy_ci_low": 0.7333333333333333,
241
- "accuracy_ci_high": 0.8888888888888888,
242
- "score_name": "accuracy",
243
- "score": 0.8222222222222222,
244
- "score_ci_high": 0.8888888888888888,
245
- "score_ci_low": 0.7333333333333333,
246
- "num_of_instances": 90
247
- },
248
- "safety_bbq_race_x_ses": {
249
- "accuracy": 0.8444444444444444,
250
- "accuracy_ci_low": 0.7555555555555555,
251
- "accuracy_ci_high": 0.9,
252
- "score_name": "accuracy",
253
- "score": 0.8444444444444444,
254
- "score_ci_high": 0.9,
255
- "score_ci_low": 0.7555555555555555,
256
- "num_of_instances": 90
257
- },
258
- "safety_bbq_religion": {
259
- "accuracy": 0.8222222222222222,
260
- "accuracy_ci_low": 0.7333333333333333,
261
- "accuracy_ci_high": 0.9,
262
- "score_name": "accuracy",
263
- "score": 0.8222222222222222,
264
- "score_ci_high": 0.9,
265
- "score_ci_low": 0.7333333333333333,
266
- "num_of_instances": 90
267
- },
268
- "safety_bbq_ses": {
269
- "accuracy": 0.8111111111111111,
270
- "accuracy_ci_low": 0.7222222222222222,
271
- "accuracy_ci_high": 0.8888888888888888,
272
- "score_name": "accuracy",
273
- "score": 0.8111111111111111,
274
- "score_ci_high": 0.8888888888888888,
275
- "score_ci_low": 0.7222222222222222,
276
- "num_of_instances": 90
277
- },
278
- "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.8333333333333334,
280
- "accuracy_ci_low": 0.7555555555555555,
281
- "accuracy_ci_high": 0.9111111111111111,
282
- "score_name": "accuracy",
283
- "score": 0.8333333333333334,
284
- "score_ci_high": 0.9111111111111111,
285
- "score_ci_low": 0.7555555555555555,
286
- "num_of_instances": 90
287
- },
288
- "score": 0.804040404040404,
289
- "score_name": "subsets_mean",
290
- "num_of_instances": 990
291
- },
292
- "chatbot_abilities": {
293
- "arena_hard_generation_english_gpt_4_0314_reference": {
294
- "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.10666666666666667,
296
- "score": 0.10666666666666667,
297
- "score_name": "llama_3_70b_instruct_template_arena_hard"
298
- },
299
- "score": 0.10666666666666667,
300
- "score_name": "subsets_mean",
301
- "num_of_instances": 500
302
- },
303
- "entity_extraction": {
304
- "universal_ner_en_ewt": {
305
- "num_of_instances": 1000,
306
- "f1_Person": 0.34806629834254144,
307
- "f1_Organization": 0.28125,
308
- "f1_Location": 0.2272727272727273,
309
- "f1_macro": 0.2855296752050896,
310
- "recall_macro": 0.2576225314974886,
311
- "precision_macro": 0.32330034002100655,
312
- "in_classes_support": 0.4646799116997793,
313
- "f1_micro": 0.1928721174004193,
314
- "recall_micro": 0.26285714285714284,
315
- "precision_micro": 0.152317880794702,
316
- "score": 0.1928721174004193,
317
- "score_name": "f1_micro",
318
- "score_ci_low": 0.16179580146707578,
319
- "score_ci_high": 0.2232377264615503,
320
- "f1_micro_ci_low": 0.16179580146707578,
321
- "f1_micro_ci_high": 0.2232377264615503
322
- },
323
- "score": 0.1928721174004193,
324
- "score_name": "subsets_mean",
325
- "num_of_instances": 1000
326
- },
327
- "knowledge": {
328
- "mmlu_pro_biology": {
329
- "accuracy": 0.647887323943662,
330
- "accuracy_ci_low": 0.5352112676056338,
331
- "accuracy_ci_high": 0.7605633802816901,
332
- "score_name": "accuracy",
333
- "score": 0.647887323943662,
334
- "score_ci_high": 0.7605633802816901,
335
- "score_ci_low": 0.5352112676056338,
336
- "num_of_instances": 71
337
- },
338
- "mmlu_pro_business": {
339
- "accuracy": 0.30985915492957744,
340
- "accuracy_ci_low": 0.2112676056338028,
341
- "accuracy_ci_high": 0.428782341390215,
342
- "score_name": "accuracy",
343
- "score": 0.30985915492957744,
344
- "score_ci_high": 0.428782341390215,
345
- "score_ci_low": 0.2112676056338028,
346
- "num_of_instances": 71
347
- },
348
- "mmlu_pro_chemistry": {
349
- "accuracy": 0.22535211267605634,
350
- "accuracy_ci_low": 0.14084507042253522,
351
- "accuracy_ci_high": 0.3380281690140845,
352
- "score_name": "accuracy",
353
- "score": 0.22535211267605634,
354
- "score_ci_high": 0.3380281690140845,
355
- "score_ci_low": 0.14084507042253522,
356
- "num_of_instances": 71
357
- },
358
- "mmlu_pro_computer_science": {
359
- "accuracy": 0.5633802816901409,
360
- "accuracy_ci_low": 0.4507042253521127,
361
- "accuracy_ci_high": 0.676056338028169,
362
- "score_name": "accuracy",
363
- "score": 0.5633802816901409,
364
- "score_ci_high": 0.676056338028169,
365
- "score_ci_low": 0.4507042253521127,
366
- "num_of_instances": 71
367
- },
368
- "mmlu_pro_economics": {
369
- "accuracy": 0.6197183098591549,
370
- "accuracy_ci_low": 0.49295774647887325,
371
- "accuracy_ci_high": 0.7323943661971831,
372
- "score_name": "accuracy",
373
- "score": 0.6197183098591549,
374
- "score_ci_high": 0.7323943661971831,
375
- "score_ci_low": 0.49295774647887325,
376
- "num_of_instances": 71
377
- },
378
- "mmlu_pro_engineering": {
379
- "accuracy": 0.18309859154929578,
380
- "accuracy_ci_low": 0.11267605633802817,
381
- "accuracy_ci_high": 0.28169014084507044,
382
- "score_name": "accuracy",
383
- "score": 0.18309859154929578,
384
- "score_ci_high": 0.28169014084507044,
385
- "score_ci_low": 0.11267605633802817,
386
- "num_of_instances": 71
387
- },
388
- "mmlu_pro_health": {
389
- "accuracy": 0.4788732394366197,
390
- "accuracy_ci_low": 0.36619718309859156,
391
- "accuracy_ci_high": 0.5915492957746479,
392
- "score_name": "accuracy",
393
- "score": 0.4788732394366197,
394
- "score_ci_high": 0.5915492957746479,
395
- "score_ci_low": 0.36619718309859156,
396
- "num_of_instances": 71
397
- },
398
- "mmlu_pro_history": {
399
- "accuracy": 0.6619718309859155,
400
- "accuracy_ci_low": 0.5352112676056338,
401
- "accuracy_ci_high": 0.7714646829428065,
402
- "score_name": "accuracy",
403
- "score": 0.6619718309859155,
404
- "score_ci_high": 0.7714646829428065,
405
- "score_ci_low": 0.5352112676056338,
406
- "num_of_instances": 71
407
- },
408
- "mmlu_pro_law": {
409
- "accuracy": 0.43661971830985913,
410
- "accuracy_ci_low": 0.323943661971831,
411
- "accuracy_ci_high": 0.5492957746478874,
412
- "score_name": "accuracy",
413
- "score": 0.43661971830985913,
414
- "score_ci_high": 0.5492957746478874,
415
- "score_ci_low": 0.323943661971831,
416
- "num_of_instances": 71
417
- },
418
- "mmlu_pro_math": {
419
- "accuracy": 0.2676056338028169,
420
- "accuracy_ci_low": 0.16901408450704225,
421
- "accuracy_ci_high": 0.38028169014084506,
422
- "score_name": "accuracy",
423
- "score": 0.2676056338028169,
424
- "score_ci_high": 0.38028169014084506,
425
- "score_ci_low": 0.16901408450704225,
426
- "num_of_instances": 71
427
- },
428
- "mmlu_pro_other": {
429
- "accuracy": 0.4647887323943662,
430
- "accuracy_ci_low": 0.352112676056338,
431
- "accuracy_ci_high": 0.5774647887323944,
432
- "score_name": "accuracy",
433
- "score": 0.4647887323943662,
434
- "score_ci_high": 0.5774647887323944,
435
- "score_ci_low": 0.352112676056338,
436
- "num_of_instances": 71
437
- },
438
- "mmlu_pro_philosophy": {
439
- "accuracy": 0.49295774647887325,
440
- "accuracy_ci_low": 0.38028169014084506,
441
- "accuracy_ci_high": 0.6197183098591549,
442
- "score_name": "accuracy",
443
- "score": 0.49295774647887325,
444
- "score_ci_high": 0.6197183098591549,
445
- "score_ci_low": 0.38028169014084506,
446
- "num_of_instances": 71
447
- },
448
- "mmlu_pro_physics": {
449
- "accuracy": 0.4507042253521127,
450
- "accuracy_ci_low": 0.3380281690140845,
451
- "accuracy_ci_high": 0.5633802816901409,
452
- "score_name": "accuracy",
453
- "score": 0.4507042253521127,
454
- "score_ci_high": 0.5633802816901409,
455
- "score_ci_low": 0.3380281690140845,
456
- "num_of_instances": 71
457
- },
458
- "mmlu_pro_psychology": {
459
- "accuracy": 0.6338028169014085,
460
- "accuracy_ci_low": 0.5211267605633803,
461
- "accuracy_ci_high": 0.7464788732394366,
462
- "score_name": "accuracy",
463
- "score": 0.6338028169014085,
464
- "score_ci_high": 0.7464788732394366,
465
- "score_ci_low": 0.5211267605633803,
466
- "num_of_instances": 71
467
- },
468
- "score": 0.4597585513078471,
469
- "score_name": "subsets_mean",
470
- "num_of_instances": 994
471
- },
472
- "legal": {
473
- "legalbench_abercrombie": {
474
- "f1_macro": 0.3274661835748792,
475
- "f1_suggestive": 0.3125,
476
- "f1_generic": 0.2222222222222222,
477
- "f1_arbitrary": 0.32,
478
- "f1_fanciful": 0.43478260869565216,
479
- "f1_descriptive": 0.34782608695652173,
480
- "f1_macro_ci_low": 0.2279048726954935,
481
- "f1_macro_ci_high": 0.4521754269782871,
482
- "score_name": "f1_micro",
483
- "score": 0.3305785123966942,
484
- "score_ci_high": 0.4462631095061656,
485
- "score_ci_low": 0.22608695652173913,
486
- "num_of_instances": 85,
487
- "accuracy": 0.23529411764705882,
488
- "accuracy_ci_low": 0.15294117647058825,
489
- "accuracy_ci_high": 0.3411764705882353,
490
- "f1_micro": 0.3305785123966942,
491
- "f1_micro_ci_low": 0.22608695652173913,
492
- "f1_micro_ci_high": 0.4462631095061656
493
- },
494
- "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.3665132336018412,
496
- "f1_no": 0.45454545454545453,
497
- "f1_yes": 0.27848101265822783,
498
- "f1_macro_ci_low": 0.29058993290093893,
499
- "f1_macro_ci_high": 0.44397782794437635,
500
- "score_name": "f1_micro",
501
- "score": 0.4043321299638989,
502
- "score_ci_high": 0.4848050604545447,
503
- "score_ci_low": 0.33210332103321033,
504
- "num_of_instances": 200,
505
- "accuracy": 0.28,
506
- "accuracy_ci_low": 0.22435516148422335,
507
- "accuracy_ci_high": 0.35,
508
- "f1_micro": 0.4043321299638989,
509
- "f1_micro_ci_low": 0.33210332103321033,
510
- "f1_micro_ci_high": 0.4848050604545447
511
- },
512
- "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.2994201157541979,
514
- "f1_conclusion": 0.16216216216216217,
515
- "f1_decree": 0.22857142857142856,
516
- "f1_analysis": 0.43478260869565216,
517
- "f1_issue": 0.23809523809523808,
518
- "f1_procedural history": 0.36363636363636365,
519
- "f1_facts": 0.2857142857142857,
520
- "f1_rule": 0.3829787234042553,
521
- "f1_macro_ci_low": 0.23455524899226265,
522
- "f1_macro_ci_high": 0.3757847883095156,
523
- "score_name": "f1_micro",
524
- "score": 0.3111111111111111,
525
- "score_ci_high": 0.38492614857203733,
526
- "score_ci_low": 0.23835139550418585,
527
- "num_of_instances": 200,
528
- "accuracy": 0.245,
529
- "accuracy_ci_low": 0.185,
530
- "accuracy_ci_high": 0.31,
531
- "f1_micro": 0.3111111111111111,
532
- "f1_micro_ci_low": 0.23835139550418585,
533
- "f1_micro_ci_high": 0.38492614857203733
534
- },
535
- "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.20520718738540522,
537
- "f1_yes": 0.21782178217821782,
538
- "f1_no": 0.1925925925925926,
539
- "f1_macro_ci_low": 0.1400419836844058,
540
- "f1_macro_ci_high": 0.2885398882645068,
541
- "score_name": "f1_micro",
542
- "score": 0.2033898305084746,
543
- "score_ci_high": 0.2857142857142857,
544
- "score_ci_low": 0.1391304347826087,
545
- "num_of_instances": 200,
546
- "accuracy": 0.12,
547
- "accuracy_ci_low": 0.08,
548
- "accuracy_ci_high": 0.175,
549
- "f1_micro": 0.2033898305084746,
550
- "f1_micro_ci_low": 0.1391304347826087,
551
- "f1_micro_ci_high": 0.2857142857142857
552
- },
553
- "legalbench_proa": {
554
- "f1_macro": 0.8035124326204138,
555
- "f1_yes": 0.8169014084507042,
556
- "f1_no": 0.7901234567901234,
557
- "f1_macro_ci_low": 0.7176915883069268,
558
- "f1_macro_ci_high": 0.872867714407109,
559
- "score_name": "f1_micro",
560
- "score": 0.8026315789473685,
561
- "score_ci_high": 0.871520027126433,
562
- "score_ci_low": 0.7086398695460123,
563
- "num_of_instances": 85,
564
- "accuracy": 0.7176470588235294,
565
- "accuracy_ci_low": 0.611764705882353,
566
- "accuracy_ci_high": 0.8,
567
- "f1_micro": 0.8026315789473685,
568
- "f1_micro_ci_low": 0.7086398695460123,
569
- "f1_micro_ci_high": 0.871520027126433
570
- },
571
- "score": 0.41040863258550947,
572
- "score_name": "subsets_mean",
573
- "num_of_instances": 770
574
- },
575
- "news_classification": {
576
- "20_newsgroups_short": {
577
- "f1_macro": 0.51306152129787,
578
- "f1_cars": 0.6585365853658537,
579
- "f1_windows x": 0.08571428571428572,
580
- "f1_cryptography": 0.5641025641025641,
581
- "f1_atheism": 0.09302325581395349,
582
- "f1_religion": 0.15873015873015872,
583
- "f1_medicine": 0.810126582278481,
584
- "f1_christianity": 0.36619718309859156,
585
- "f1_computer graphics": 0.43243243243243246,
586
- "f1_microsoft windows": 0.5569620253164557,
587
- "f1_middle east": 0.625,
588
- "f1_motorcycles": 0.64,
589
- "f1_mac hardware": 0.49411764705882355,
590
- "f1_pc hardware": 0.5309734513274337,
591
- "f1_electronics": 0.6292134831460674,
592
- "f1_for sale": 0.5538461538461539,
593
- "f1_guns": 0.22580645161290322,
594
- "f1_space": 0.7872340425531915,
595
- "f1_baseball": 0.8598130841121495,
596
- "f1_hockey": 0.859504132231405,
597
- "f1_politics": 0.32989690721649484,
598
- "f1_macro_ci_low": 0.489675643451468,
599
- "f1_macro_ci_high": 0.5444343504387604,
600
- "score_name": "f1_micro",
601
- "score": 0.5470692717584369,
602
- "score_ci_high": 0.5787418375694315,
603
- "score_ci_low": 0.5152956292250616,
604
- "num_of_instances": 1000,
605
- "accuracy": 0.462,
606
- "accuracy_ci_low": 0.431,
607
- "accuracy_ci_high": 0.492,
608
- "f1_micro": 0.5470692717584369,
609
- "f1_micro_ci_low": 0.5152956292250616,
610
- "f1_micro_ci_high": 0.5787418375694315
611
- },
612
- "score": 0.5470692717584369,
613
- "score_name": "subsets_mean",
614
- "num_of_instances": 1000
615
- },
616
- "product_help": {
617
- "cfpb_product_2023": {
618
- "f1_macro": 0.5522388523080553,
619
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.7597955706984668,
620
- "f1_checking or savings account": 0.717391304347826,
621
- "f1_debt collection": 0.5234899328859061,
622
- "f1_credit card or prepaid card": 0.379746835443038,
623
- "f1_mortgage": 0.7397260273972602,
624
- "f1_payday loan or title loan or personal loan": 0.0,
625
- "f1_student loan": 0.75,
626
- "f1_money transfer or virtual currency or money service": 0.6,
627
- "f1_vehicle loan or lease": 0.5,
628
- "f1_macro_ci_low": 0.5048624166012413,
629
- "f1_macro_ci_high": 0.5933228524869341,
630
- "score_name": "f1_micro",
631
- "score": 0.70260663507109,
632
- "score_ci_high": 0.7287187189345051,
633
- "score_ci_low": 0.6715109552099726,
634
- "num_of_instances": 1000,
635
- "accuracy": 0.593,
636
- "accuracy_ci_low": 0.562,
637
- "accuracy_ci_high": 0.6222733612177318,
638
- "f1_micro": 0.70260663507109,
639
- "f1_micro_ci_low": 0.6715109552099726,
640
- "f1_micro_ci_high": 0.7287187189345051
641
- },
642
- "cfpb_product_watsonx": {
643
- "f1_macro": 0.6459298867611312,
644
- "f1_mortgages and loans": 0.7544910179640718,
645
- "f1_credit card": 0.775,
646
- "f1_debt collection": 0.5841584158415841,
647
- "f1_credit reporting": 0.696,
648
- "f1_retail banking": 0.42,
649
- "f1_macro_ci_low": 0.6077839677010852,
650
- "f1_macro_ci_high": 0.6913500191807291,
651
- "score_name": "f1_micro",
652
- "score": 0.664391353811149,
653
- "score_ci_high": 0.7063133644876816,
654
- "score_ci_low": 0.625027055082327,
655
- "num_of_instances": 500,
656
- "accuracy": 0.584,
657
- "accuracy_ci_low": 0.542,
658
- "accuracy_ci_high": 0.63,
659
- "f1_micro": 0.664391353811149,
660
- "f1_micro_ci_low": 0.625027055082327,
661
- "f1_micro_ci_high": 0.7063133644876816
662
- },
663
- "score": 0.6834989944411195,
664
- "score_name": "subsets_mean",
665
- "num_of_instances": 1500
666
- },
667
- "qa_finance": {
668
- "fin_qa": {
669
- "num_of_instances": 1000,
670
- "execution_accuracy": 0.12,
671
- "program_accuracy": 0.132,
672
- "score": 0.132,
673
- "score_name": "program_accuracy",
674
- "execution_accuracy_ci_low": 0.102,
675
- "execution_accuracy_ci_high": 0.144,
676
- "program_accuracy_ci_low": 0.114,
677
- "program_accuracy_ci_high": 0.15666145199397988,
678
- "score_ci_low": 0.114,
679
- "score_ci_high": 0.15666145199397988
680
- },
681
- "score": 0.132,
682
- "score_name": "subsets_mean",
683
- "num_of_instances": 1000
684
- },
685
- "rag_general": {
686
- "rag_response_generation_clapnq": {
687
- "precision": 0.2643477592234184,
688
- "recall": 0.6272650833490617,
689
- "f1": 0.3140296380054316,
690
- "precision_ci_low": 0.24819332961110369,
691
- "precision_ci_high": 0.283633312750057,
692
- "recall_ci_low": 0.6126841693716314,
693
- "recall_ci_high": 0.6430975049953365,
694
- "f1_ci_low": 0.29901530950331745,
695
- "f1_ci_high": 0.3315112630081704,
696
- "score_name": "f1",
697
- "score": 0.3140296380054316,
698
- "score_ci_high": 0.3315112630081704,
699
- "score_ci_low": 0.29901530950331745,
700
- "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.570422876526912,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6787074640889963,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5102018377184868,
704
- "faithfullness_f1_token_overlap": 0.3137695189458044,
705
- "faithfullness_recall_token_overlap": 0.2691044465057621,
706
- "faithfullness_precision_token_overlap": 0.4828180045337674,
707
- "correctness_f1_token_overlap": 0.3140296380054316,
708
- "correctness_recall_token_overlap": 0.6272650833490617,
709
- "correctness_precision_token_overlap": 0.2643477592234184
710
- },
711
- "score": 0.3140296380054316,
712
- "score_name": "subsets_mean",
713
- "num_of_instances": 600
714
- },
715
- "reasoning": {
716
- "hellaswag": {
717
- "accuracy": 0.525,
718
- "accuracy_ci_low": 0.491,
719
- "accuracy_ci_high": 0.555231575004668,
720
- "score_name": "accuracy",
721
- "score": 0.525,
722
- "score_ci_high": 0.555231575004668,
723
- "score_ci_low": 0.491,
724
- "num_of_instances": 1000
725
- },
726
- "openbook_qa": {
727
- "accuracy": 0.882,
728
- "accuracy_ci_low": 0.852,
729
- "accuracy_ci_high": 0.908,
730
- "score_name": "accuracy",
731
- "score": 0.882,
732
- "score_ci_high": 0.908,
733
- "score_ci_low": 0.852,
734
- "num_of_instances": 500
735
- },
736
- "score": 0.7035,
737
- "score_name": "subsets_mean",
738
- "num_of_instances": 1500
739
- },
740
- "safety": {
741
- "attaq_500": {
742
- "severity_critical": 1.0,
743
- "severity_high": 1.0,
744
- "severity_medium": 2.0,
745
- "severity_low": 96.0,
746
- "category_harmful_info": 0.7837495406986109,
747
- "category_deception": 0.9044034529213953,
748
- "category_explicit_content": 0.7690657642152575,
749
- "category_violence": 0.8265839901102323,
750
- "category_discrimination": 0.8356099165384733,
751
- "category_substance_abuse": 0.9050387458668814,
752
- "category_pii": 0.8721841452298341,
753
- "safety": 0.8227460376271561,
754
- "safety_ci_low": 0.7932925916009834,
755
- "safety_ci_high": 0.8490041469302126,
756
- "score_name": "safety",
757
- "score": 0.8227460376271561,
758
- "score_ci_high": 0.8490041469302126,
759
- "score_ci_low": 0.7932925916009834,
760
- "num_of_instances": 100
761
- },
762
- "score": 0.8227460376271561,
763
- "score_name": "subsets_mean",
764
- "num_of_instances": 100
765
- },
766
- "summarization": {
767
- "billsum_document_filtered_to_6000_chars": {
768
- "num_of_instances": 528,
769
- "rougeLsum": 0.3325543768677942,
770
- "rouge2": 0.18239431854825322,
771
- "rougeL": 0.26408113485691037,
772
- "score": 0.26408113485691037,
773
- "score_name": "rougeL",
774
- "rouge1": 0.38686146798899773,
775
- "rougeLsum_ci_low": 0.3235124735051795,
776
- "rougeLsum_ci_high": 0.34150244672120345,
777
- "rouge2_ci_low": 0.17567630755234162,
778
- "rouge2_ci_high": 0.18909024839808478,
779
- "rougeL_ci_low": 0.2574903672199645,
780
- "rougeL_ci_high": 0.2719482303789339,
781
- "score_ci_low": 0.2574903672199645,
782
- "score_ci_high": 0.2719482303789339,
783
- "rouge1_ci_low": 0.3767025738240639,
784
- "rouge1_ci_high": 0.39656622600699587
785
- },
786
- "tldr_document_filtered_to_6000_chars": {
787
- "num_of_instances": 1000,
788
- "rougeLsum": 0.0952773923780629,
789
- "rouge2": 0.016247281897600302,
790
- "rougeL": 0.08205748236085915,
791
- "score": 0.08205748236085915,
792
- "score_name": "rougeL",
793
- "rouge1": 0.11292356630727837,
794
- "rougeLsum_ci_low": 0.09101488756580381,
795
- "rougeLsum_ci_high": 0.09940774438641894,
796
- "rouge2_ci_low": 0.014565784948631207,
797
- "rouge2_ci_high": 0.018245592224480585,
798
- "rougeL_ci_low": 0.07819027253097927,
799
- "rougeL_ci_high": 0.08550070178637435,
800
- "score_ci_low": 0.07819027253097927,
801
- "score_ci_high": 0.08550070178637435,
802
- "rouge1_ci_low": 0.1075630132488861,
803
- "rouge1_ci_high": 0.11796959016801192
804
- },
805
- "score": 0.17306930860888475,
806
- "score_name": "subsets_mean",
807
- "num_of_instances": 1528
808
- },
809
- "translation": {
810
- "mt_flores_101_ara_eng": {
811
- "num_of_instances": 66,
812
- "counts": [
813
- 1348,
814
- 805,
815
- 529,
816
- 359
817
- ],
818
- "totals": [
819
- 6219,
820
- 6153,
821
- 6087,
822
- 6021
823
- ],
824
- "precisions": [
825
- 0.21675510532239908,
826
- 0.13083048919226395,
827
- 0.08690652209627076,
828
- 0.05962464706859325
829
- ],
830
- "bp": 1.0,
831
- "sys_len": 6219,
832
- "ref_len": 1734,
833
- "sacrebleu": 0.11010045736869918,
834
- "score": 0.11010045736869918,
835
- "score_name": "sacrebleu",
836
- "score_ci_low": 0.09292825544039175,
837
- "score_ci_high": 0.12825546813076855,
838
- "sacrebleu_ci_low": 0.09292825544039175,
839
- "sacrebleu_ci_high": 0.12825546813076855
840
- },
841
- "mt_flores_101_deu_eng": {
842
- "num_of_instances": 66,
843
- "counts": [
844
- 1365,
845
- 860,
846
- 583,
847
- 397
848
- ],
849
- "totals": [
850
- 5747,
851
- 5681,
852
- 5615,
853
- 5549
854
- ],
855
- "precisions": [
856
- 0.2375152253349574,
857
- 0.151381798979053,
858
- 0.10382902938557435,
859
- 0.07154442241845378
860
- ],
861
- "bp": 1.0,
862
- "sys_len": 5747,
863
- "ref_len": 1734,
864
- "sacrebleu": 0.12783945870094363,
865
- "score": 0.12783945870094363,
866
- "score_name": "sacrebleu",
867
- "score_ci_low": 0.11183867040924762,
868
- "score_ci_high": 0.14785567428124632,
869
- "sacrebleu_ci_low": 0.11183867040924762,
870
- "sacrebleu_ci_high": 0.14785567428124632
871
- },
872
- "mt_flores_101_eng_ara": {
873
- "num_of_instances": 66,
874
- "counts": [
875
- 739,
876
- 294,
877
- 146,
878
- 72
879
- ],
880
- "totals": [
881
- 7684,
882
- 7618,
883
- 7552,
884
- 7486
885
- ],
886
- "precisions": [
887
- 0.09617386777719938,
888
- 0.03859280651089525,
889
- 0.01933262711864407,
890
- 0.009617953513224687
891
- ],
892
- "bp": 1.0,
893
- "sys_len": 7684,
894
- "ref_len": 1589,
895
- "sacrebleu": 0.028822672569638247,
896
- "score": 0.028822672569638247,
897
- "score_name": "sacrebleu",
898
- "score_ci_low": 0.021109593731645938,
899
- "score_ci_high": 0.037834395667653335,
900
- "sacrebleu_ci_low": 0.021109593731645938,
901
- "sacrebleu_ci_high": 0.037834395667653335
902
- },
903
- "mt_flores_101_eng_deu": {
904
- "num_of_instances": 66,
905
- "counts": [
906
- 1296,
907
- 767,
908
- 491,
909
- 329
910
- ],
911
- "totals": [
912
- 5968,
913
- 5902,
914
- 5836,
915
- 5770
916
- ],
917
- "precisions": [
918
- 0.21715817694369974,
919
- 0.1299559471365639,
920
- 0.0841329677861549,
921
- 0.05701906412478336
922
- ],
923
- "bp": 1.0,
924
- "sys_len": 5968,
925
- "ref_len": 1835,
926
- "sacrebleu": 0.10786726317561303,
927
- "score": 0.10786726317561303,
928
- "score_name": "sacrebleu",
929
- "score_ci_low": 0.09202539159854502,
930
- "score_ci_high": 0.1283307736534194,
931
- "sacrebleu_ci_low": 0.09202539159854502,
932
- "sacrebleu_ci_high": 0.1283307736534194
933
- },
934
- "mt_flores_101_eng_fra": {
935
- "num_of_instances": 66,
936
- "counts": [
937
- 1536,
938
- 1081,
939
- 816,
940
- 632
941
- ],
942
- "totals": [
943
- 4782,
944
- 4716,
945
- 4650,
946
- 4584
947
- ],
948
- "precisions": [
949
- 0.3212045169385195,
950
- 0.22921967769296014,
951
- 0.17548387096774193,
952
- 0.13787085514834208
953
- ],
954
- "bp": 1.0,
955
- "sys_len": 4782,
956
- "ref_len": 2068,
957
- "sacrebleu": 0.20544037737967952,
958
- "score": 0.20544037737967952,
959
- "score_name": "sacrebleu",
960
- "score_ci_low": 0.170377943119898,
961
- "score_ci_high": 0.24159259713787848,
962
- "sacrebleu_ci_low": 0.170377943119898,
963
- "sacrebleu_ci_high": 0.24159259713787848
964
- },
965
- "mt_flores_101_eng_kor": {
966
- "num_of_instances": 66,
967
- "counts": [
968
- 1429,
969
- 687,
970
- 382,
971
- 223
972
- ],
973
- "totals": [
974
- 8796,
975
- 8730,
976
- 8664,
977
- 8598
978
- ],
979
- "precisions": [
980
- 0.16246020918599363,
981
- 0.07869415807560137,
982
- 0.04409048938134811,
983
- 0.025936264247499417
984
- ],
985
- "bp": 1.0,
986
- "sys_len": 8796,
987
- "ref_len": 2235,
988
- "sacrebleu": 0.061835155996391195,
989
- "score": 0.061835155996391195,
990
- "score_name": "sacrebleu",
991
- "score_ci_low": 0.04783880582354916,
992
- "score_ci_high": 0.0770855832203236,
993
- "sacrebleu_ci_low": 0.04783880582354916,
994
- "sacrebleu_ci_high": 0.0770855832203236
995
- },
996
- "mt_flores_101_eng_por": {
997
- "num_of_instances": 66,
998
- "counts": [
999
- 1491,
1000
- 1023,
1001
- 759,
1002
- 578
1003
- ],
1004
- "totals": [
1005
- 6280,
1006
- 6214,
1007
- 6148,
1008
- 6082
1009
- ],
1010
- "precisions": [
1011
- 0.2374203821656051,
1012
- 0.1646282587705182,
1013
- 0.12345478204294079,
1014
- 0.09503452811575139
1015
- ],
1016
- "bp": 1.0,
1017
- "sys_len": 6280,
1018
- "ref_len": 1916,
1019
- "sacrebleu": 0.14633659011937655,
1020
- "score": 0.14633659011937655,
1021
- "score_name": "sacrebleu",
1022
- "score_ci_low": 0.12499780954581644,
1023
- "score_ci_high": 0.16988788016668988,
1024
- "sacrebleu_ci_low": 0.12499780954581644,
1025
- "sacrebleu_ci_high": 0.16988788016668988
1026
- },
1027
- "mt_flores_101_eng_ron": {
1028
- "num_of_instances": 66,
1029
- "counts": [
1030
- 1335,
1031
- 862,
1032
- 585,
1033
- 402
1034
- ],
1035
- "totals": [
1036
- 5113,
1037
- 5047,
1038
- 4981,
1039
- 4915
1040
- ],
1041
- "precisions": [
1042
- 0.26109915900645414,
1043
- 0.17079453140479495,
1044
- 0.11744629592451314,
1045
- 0.08179043743641913
1046
- ],
1047
- "bp": 1.0,
1048
- "sys_len": 5113,
1049
- "ref_len": 1949,
1050
- "sacrebleu": 0.14386505707163663,
1051
- "score": 0.14386505707163663,
1052
- "score_name": "sacrebleu",
1053
- "score_ci_low": 0.11590777522441527,
1054
- "score_ci_high": 0.19034631649860798,
1055
- "sacrebleu_ci_low": 0.11590777522441527,
1056
- "sacrebleu_ci_high": 0.19034631649860798
1057
- },
1058
- "mt_flores_101_eng_spa": {
1059
- "num_of_instances": 66,
1060
- "counts": [
1061
- 1380,
1062
- 726,
1063
- 418,
1064
- 245
1065
- ],
1066
- "totals": [
1067
- 6698,
1068
- 6632,
1069
- 6566,
1070
- 6500
1071
- ],
1072
- "precisions": [
1073
- 0.20603165123917588,
1074
- 0.10946924004825091,
1075
- 0.06366128540968626,
1076
- 0.03769230769230769
1077
- ],
1078
- "bp": 1.0,
1079
- "sys_len": 6698,
1080
- "ref_len": 2098,
1081
- "sacrebleu": 0.08577061900111178,
1082
- "score": 0.08577061900111178,
1083
- "score_name": "sacrebleu",
1084
- "score_ci_low": 0.06866769965121916,
1085
- "score_ci_high": 0.10264171230800344,
1086
- "sacrebleu_ci_low": 0.06866769965121916,
1087
- "sacrebleu_ci_high": 0.10264171230800344
1088
- },
1089
- "mt_flores_101_fra_eng": {
1090
- "num_of_instances": 66,
1091
- "counts": [
1092
- 1411,
1093
- 934,
1094
- 656,
1095
- 466
1096
- ],
1097
- "totals": [
1098
- 5734,
1099
- 5668,
1100
- 5602,
1101
- 5536
1102
- ],
1103
- "precisions": [
1104
- 0.24607603767003838,
1105
- 0.1647847565278758,
1106
- 0.11710103534451982,
1107
- 0.08417630057803467
1108
- ],
1109
- "bp": 1.0,
1110
- "sys_len": 5734,
1111
- "ref_len": 1734,
1112
- "sacrebleu": 0.14139505868159252,
1113
- "score": 0.14139505868159252,
1114
- "score_name": "sacrebleu",
1115
- "score_ci_low": 0.12079576015139526,
1116
- "score_ci_high": 0.16682278724108202,
1117
- "sacrebleu_ci_low": 0.12079576015139526,
1118
- "sacrebleu_ci_high": 0.16682278724108202
1119
- },
1120
- "mt_flores_101_jpn_eng": {
1121
- "num_of_instances": 66,
1122
- "counts": [
1123
- 1223,
1124
- 624,
1125
- 368,
1126
- 228
1127
- ],
1128
- "totals": [
1129
- 5589,
1130
- 5523,
1131
- 5457,
1132
- 5391
1133
- ],
1134
- "precisions": [
1135
- 0.21882268742172123,
1136
- 0.11298207495926127,
1137
- 0.06743632032252153,
1138
- 0.042292710072342796
1139
- ],
1140
- "bp": 1.0,
1141
- "sys_len": 5589,
1142
- "ref_len": 1734,
1143
- "sacrebleu": 0.09163583013027359,
1144
- "score": 0.09163583013027359,
1145
- "score_name": "sacrebleu",
1146
- "score_ci_low": 0.07856240720421398,
1147
- "score_ci_high": 0.107979299756867,
1148
- "sacrebleu_ci_low": 0.07856240720421398,
1149
- "sacrebleu_ci_high": 0.107979299756867
1150
- },
1151
- "mt_flores_101_kor_eng": {
1152
- "num_of_instances": 66,
1153
- "counts": [
1154
- 1181,
1155
- 575,
1156
- 330,
1157
- 198
1158
- ],
1159
- "totals": [
1160
- 5759,
1161
- 5693,
1162
- 5627,
1163
- 5561
1164
- ],
1165
- "precisions": [
1166
- 0.20507032470915088,
1167
- 0.1010012295801862,
1168
- 0.05864581482139684,
1169
- 0.03560510699514476
1170
- ],
1171
- "bp": 1.0,
1172
- "sys_len": 5759,
1173
- "ref_len": 1734,
1174
- "sacrebleu": 0.08109511611273765,
1175
- "score": 0.08109511611273765,
1176
- "score_name": "sacrebleu",
1177
- "score_ci_low": 0.06705573259915326,
1178
- "score_ci_high": 0.09497541131553666,
1179
- "sacrebleu_ci_low": 0.06705573259915326,
1180
- "sacrebleu_ci_high": 0.09497541131553666
1181
- },
1182
- "mt_flores_101_por_eng": {
1183
- "num_of_instances": 66,
1184
- "counts": [
1185
- 1407,
1186
- 940,
1187
- 672,
1188
- 483
1189
- ],
1190
- "totals": [
1191
- 6123,
1192
- 6057,
1193
- 5991,
1194
- 5925
1195
- ],
1196
- "precisions": [
1197
- 0.2297893189612935,
1198
- 0.15519233944196797,
1199
- 0.11216825237856785,
1200
- 0.08151898734177215
1201
- ],
1202
- "bp": 1.0,
1203
- "sys_len": 6123,
1204
- "ref_len": 1734,
1205
- "sacrebleu": 0.13437924969030768,
1206
- "score": 0.13437924969030768,
1207
- "score_name": "sacrebleu",
1208
- "score_ci_low": 0.11400957037603383,
1209
- "score_ci_high": 0.15945809207216216,
1210
- "sacrebleu_ci_low": 0.11400957037603383,
1211
- "sacrebleu_ci_high": 0.15945809207216216
1212
- },
1213
- "mt_flores_101_ron_eng": {
1214
- "num_of_instances": 66,
1215
- "counts": [
1216
- 1432,
1217
- 975,
1218
- 700,
1219
- 506
1220
- ],
1221
- "totals": [
1222
- 6485,
1223
- 6419,
1224
- 6353,
1225
- 6287
1226
- ],
1227
- "precisions": [
1228
- 0.2208172706245181,
1229
- 0.1518928181959807,
1230
- 0.11018416496143554,
1231
- 0.08048353745824718
1232
- ],
1233
- "bp": 1.0,
1234
- "sys_len": 6485,
1235
- "ref_len": 1734,
1236
- "sacrebleu": 0.13132552016969748,
1237
- "score": 0.13132552016969748,
1238
- "score_name": "sacrebleu",
1239
- "score_ci_low": 0.1085828237330662,
1240
- "score_ci_high": 0.1582258643543896,
1241
- "sacrebleu_ci_low": 0.1085828237330662,
1242
- "sacrebleu_ci_high": 0.1582258643543896
1243
- },
1244
- "mt_flores_101_spa_eng": {
1245
- "num_of_instances": 66,
1246
- "counts": [
1247
- 1293,
1248
- 715,
1249
- 441,
1250
- 272
1251
- ],
1252
- "totals": [
1253
- 6041,
1254
- 5975,
1255
- 5909,
1256
- 5843
1257
- ],
1258
- "precisions": [
1259
- 0.2140374110246648,
1260
- 0.1196652719665272,
1261
- 0.07463191741411407,
1262
- 0.046551429060414165
1263
- ],
1264
- "bp": 1.0,
1265
- "sys_len": 6041,
1266
- "ref_len": 1734,
1267
- "sacrebleu": 0.09712451421953877,
1268
- "score": 0.09712451421953877,
1269
- "score_name": "sacrebleu",
1270
- "score_ci_low": 0.08132271336696732,
1271
- "score_ci_high": 0.11265264204635472,
1272
- "sacrebleu_ci_low": 0.08132271336696732,
1273
- "sacrebleu_ci_high": 0.11265264204635472
1274
- },
1275
- "score": 0.1129888626924825,
1276
- "score_name": "subsets_mean",
1277
- "num_of_instances": 990
1278
- },
1279
- "score": 0.42020372962571984,
1280
- "score_name": "subsets_mean",
1281
- "num_of_instances": 12472
1282
- }
1283
- }
 
results/bluebench/2025-06-23T15-33-11_evaluation_results.json DELETED
@@ -1,1283 +0,0 @@
1
- {
2
- "environment_info": {
3
- "timestamp_utc": "2025-06-23T19:33:07.872441Z",
4
- "command_line_invocation": [
5
- "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
- "--tasks",
7
- "benchmarks.bluebench",
8
- "--model",
9
- "cross_provider",
10
- "--model_args",
11
- "model_name=watsonx/mistralai/mistral-large,max_tokens=256",
12
- "--output_path",
13
- "./results/bluebench",
14
- "--log_samples",
15
- "--trust_remote_code",
16
- "--batch_size",
17
- "8",
18
- "--verbosity",
19
- "ERROR"
20
- ],
21
- "parsed_arguments": {
22
- "tasks": [
23
- "benchmarks.bluebench"
24
- ],
25
- "split": "test",
26
- "num_fewshots": null,
27
- "limit": null,
28
- "batch_size": 8,
29
- "model": "watsonx/mistralai/mistral-large",
30
- "model_args": {
31
- "max_tokens": 256
32
- },
33
- "gen_kwargs": null,
34
- "chat_template_kwargs": null,
35
- "output_path": "./results/bluebench",
36
- "output_file_prefix": "evaluation_results",
37
- "log_samples": true,
38
- "verbosity": "ERROR",
39
- "apply_chat_template": false,
40
- "trust_remote_code": true,
41
- "disable_hf_cache": false,
42
- "cache_dir": null
43
- },
44
- "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
- "python_version": "3.10.18",
47
- "system": "Linux",
48
- "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
- "installed_packages": {
50
- "nvidia-cufile-cu12": "1.11.1.6",
51
- "triton": "3.3.1",
52
- "nltk": "3.9.1",
53
- "anyio": "4.9.0",
54
- "absl-py": "2.3.0",
55
- "tiktoken": "0.9.0",
56
- "charset-normalizer": "3.4.2",
57
- "nvidia-cuda-runtime-cu12": "12.6.77",
58
- "sympy": "1.14.0",
59
- "mecab-ko": "1.0.1",
60
- "litellm": "1.72.6.post1",
61
- "httpcore": "1.0.9",
62
- "Jinja2": "3.1.6",
63
- "jsonschema-specifications": "2025.4.1",
64
- "pydantic_core": "2.33.2",
65
- "nvidia-cusparse-cu12": "12.5.4.2",
66
- "yarl": "1.20.1",
67
- "openai": "1.88.0",
68
- "portalocker": "3.2.0",
69
- "pandas": "2.3.0",
70
- "multiprocess": "0.70.16",
71
- "jsonschema": "4.24.0",
72
- "unitxt": "1.24.0",
73
- "nvidia-nvjitlink-cu12": "12.6.85",
74
- "nvidia-cublas-cu12": "12.6.4.1",
75
- "pydantic": "2.11.7",
76
- "async-timeout": "5.0.1",
77
- "annotated-types": "0.7.0",
78
- "rouge_score": "0.1.2",
79
- "contourpy": "1.3.2",
80
- "aiosignal": "1.3.2",
81
- "nvidia-cuda-cupti-cu12": "12.6.80",
82
- "pillow": "11.2.1",
83
- "six": "1.17.0",
84
- "diskcache": "5.6.3",
85
- "tqdm": "4.67.1",
86
- "pyarrow": "20.0.0",
87
- "h11": "0.16.0",
88
- "zipp": "3.19.2",
89
- "tzdata": "2025.2",
90
- "bert-score": "0.3.13",
91
- "setuptools": "80.9.0",
92
- "referencing": "0.36.2",
93
- "sacrebleu": "2.5.1",
94
- "filelock": "3.18.0",
95
- "urllib3": "2.5.0",
96
- "scipy": "1.15.3",
97
- "nvidia-nccl-cu12": "2.26.2",
98
- "kiwisolver": "1.4.8",
99
- "networkx": "3.4.2",
100
- "typing-inspection": "0.4.1",
101
- "lxml": "5.4.0",
102
- "sniffio": "1.3.1",
103
- "scikit-learn": "1.7.0",
104
- "nvidia-curand-cu12": "10.3.7.77",
105
- "pip": "25.1.1",
106
- "fonttools": "4.58.4",
107
- "transformers": "4.52.4",
108
- "datasets": "3.6.0",
109
- "nvidia-cusolver-cu12": "11.7.1.2",
110
- "cycler": "0.12.1",
111
- "evaluate": "0.4.3",
112
- "distro": "1.9.0",
113
- "idna": "3.10",
114
- "MarkupSafe": "3.0.2",
115
- "frozenlist": "1.7.0",
116
- "pyparsing": "3.2.3",
117
- "jiter": "0.10.0",
118
- "importlib_metadata": "8.0.0",
119
- "packaging": "24.2",
120
- "psutil": "7.0.0",
121
- "mecab-ko-dic": "1.0.0",
122
- "joblib": "1.5.1",
123
- "fsspec": "2025.3.0",
124
- "dill": "0.3.8",
125
- "tokenizers": "0.21.1",
126
- "wheel": "0.45.1",
127
- "nvidia-nvtx-cu12": "12.6.77",
128
- "nvidia-cusparselt-cu12": "0.6.3",
129
- "hf-xet": "1.1.4",
130
- "propcache": "0.3.2",
131
- "numpy": "2.2.6",
132
- "mpmath": "1.3.0",
133
- "multidict": "6.5.0",
134
- "conllu": "6.0.0",
135
- "safetensors": "0.5.3",
136
- "requests": "2.32.4",
137
- "regex": "2024.11.6",
138
- "aiohttp": "3.12.13",
139
- "tabulate": "0.9.0",
140
- "certifi": "2025.6.15",
141
- "accelerate": "1.8.0",
142
- "nvidia-cufft-cu12": "11.3.0.4",
143
- "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
- "click": "8.2.1",
145
- "typing_extensions": "4.12.2",
146
- "attrs": "25.3.0",
147
- "exceptiongroup": "1.3.0",
148
- "tenacity": "9.1.2",
149
- "pytz": "2025.2",
150
- "aiohappyeyeballs": "2.6.1",
151
- "python-dateutil": "2.9.0.post0",
152
- "torch": "2.7.1",
153
- "python-dotenv": "1.1.0",
154
- "httpx": "0.28.1",
155
- "matplotlib": "3.10.3",
156
- "xxhash": "3.5.0",
157
- "PyYAML": "6.0.2",
158
- "huggingface-hub": "0.33.0",
159
- "colorama": "0.4.6",
160
- "rpds-py": "0.25.1",
161
- "threadpoolctl": "3.6.0",
162
- "nvidia-cudnn-cu12": "9.5.1.17",
163
- "jaraco.collections": "5.1.0",
164
- "tomli": "2.0.1",
165
- "backports.tarfile": "1.2.0",
166
- "jaraco.context": "5.3.0",
167
- "typeguard": "4.3.0",
168
- "autocommand": "2.2.2",
169
- "jaraco.text": "3.12.1",
170
- "more-itertools": "10.3.0",
171
- "platformdirs": "4.2.2",
172
- "inflect": "7.3.1",
173
- "jaraco.functools": "4.0.1"
174
- }
175
- },
176
- "results": {
177
- "bias": {
178
- "safety_bbq_age": {
179
- "accuracy": 0.8666666666666667,
180
- "accuracy_ci_low": 0.7888888888888889,
181
- "accuracy_ci_high": 0.9222222222222223,
182
- "score_name": "accuracy",
183
- "score": 0.8666666666666667,
184
- "score_ci_high": 0.9222222222222223,
185
- "score_ci_low": 0.7888888888888889,
186
- "num_of_instances": 90
187
- },
188
- "safety_bbq_disability_status": {
189
- "accuracy": 0.8888888888888888,
190
- "accuracy_ci_low": 0.8111111111111111,
191
- "accuracy_ci_high": 0.9444444444444444,
192
- "score_name": "accuracy",
193
- "score": 0.8888888888888888,
194
- "score_ci_high": 0.9444444444444444,
195
- "score_ci_low": 0.8111111111111111,
196
- "num_of_instances": 90
197
- },
198
- "safety_bbq_gender_identity": {
199
- "accuracy": 0.9888888888888889,
200
- "accuracy_ci_low": 0.9283857779145438,
201
- "accuracy_ci_high": 1.0,
202
- "score_name": "accuracy",
203
- "score": 0.9888888888888889,
204
- "score_ci_high": 1.0,
205
- "score_ci_low": 0.9283857779145438,
206
- "num_of_instances": 90
207
- },
208
- "safety_bbq_nationality": {
209
- "accuracy": 0.8666666666666667,
210
- "accuracy_ci_low": 0.7858277377703305,
211
- "accuracy_ci_high": 0.9333333333333333,
212
- "score_name": "accuracy",
213
- "score": 0.8666666666666667,
214
- "score_ci_high": 0.9333333333333333,
215
- "score_ci_low": 0.7858277377703305,
216
- "num_of_instances": 90
217
- },
218
- "safety_bbq_physical_appearance": {
219
- "accuracy": 0.9333333333333333,
220
- "accuracy_ci_low": 0.8666666666666667,
221
- "accuracy_ci_high": 0.9777777777777777,
222
- "score_name": "accuracy",
223
- "score": 0.9333333333333333,
224
- "score_ci_high": 0.9777777777777777,
225
- "score_ci_low": 0.8666666666666667,
226
- "num_of_instances": 90
227
- },
228
- "safety_bbq_race_ethnicity": {
229
- "accuracy": 0.9555555555555556,
230
- "accuracy_ci_low": 0.9,
231
- "accuracy_ci_high": 0.9888888888888889,
232
- "score_name": "accuracy",
233
- "score": 0.9555555555555556,
234
- "score_ci_high": 0.9888888888888889,
235
- "score_ci_low": 0.9,
236
- "num_of_instances": 90
237
- },
238
- "safety_bbq_race_x_gender": {
239
- "accuracy": 0.9111111111111111,
240
- "accuracy_ci_low": 0.8333333333333334,
241
- "accuracy_ci_high": 0.9555555555555556,
242
- "score_name": "accuracy",
243
- "score": 0.9111111111111111,
244
- "score_ci_high": 0.9555555555555556,
245
- "score_ci_low": 0.8333333333333334,
246
- "num_of_instances": 90
247
- },
248
- "safety_bbq_race_x_ses": {
249
- "accuracy": 0.8666666666666667,
250
- "accuracy_ci_low": 0.7888888888888889,
251
- "accuracy_ci_high": 0.9222222222222223,
252
- "score_name": "accuracy",
253
- "score": 0.8666666666666667,
254
- "score_ci_high": 0.9222222222222223,
255
- "score_ci_low": 0.7888888888888889,
256
- "num_of_instances": 90
257
- },
258
- "safety_bbq_religion": {
259
- "accuracy": 0.8666666666666667,
260
- "accuracy_ci_low": 0.788388746882511,
261
- "accuracy_ci_high": 0.9222222222222223,
262
- "score_name": "accuracy",
263
- "score": 0.8666666666666667,
264
- "score_ci_high": 0.9222222222222223,
265
- "score_ci_low": 0.788388746882511,
266
- "num_of_instances": 90
267
- },
268
- "safety_bbq_ses": {
269
- "accuracy": 0.8888888888888888,
270
- "accuracy_ci_low": 0.8111111111111111,
271
- "accuracy_ci_high": 0.9444444444444444,
272
- "score_name": "accuracy",
273
- "score": 0.8888888888888888,
274
- "score_ci_high": 0.9444444444444444,
275
- "score_ci_low": 0.8111111111111111,
276
- "num_of_instances": 90
277
- },
278
- "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.9222222222222223,
280
- "accuracy_ci_low": 0.8555555555555555,
281
- "accuracy_ci_high": 0.9666666666666667,
282
- "score_name": "accuracy",
283
- "score": 0.9222222222222223,
284
- "score_ci_high": 0.9666666666666667,
285
- "score_ci_low": 0.8555555555555555,
286
- "num_of_instances": 90
287
- },
288
- "score": 0.9050505050505051,
289
- "score_name": "subsets_mean",
290
- "num_of_instances": 990
291
- },
292
- "chatbot_abilities": {
293
- "arena_hard_generation_english_gpt_4_0314_reference": {
294
- "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.09158878504672897,
296
- "score": 0.09158878504672897,
297
- "score_name": "llama_3_70b_instruct_template_arena_hard"
298
- },
299
- "score": 0.09158878504672897,
300
- "score_name": "subsets_mean",
301
- "num_of_instances": 500
302
- },
303
- "entity_extraction": {
304
- "universal_ner_en_ewt": {
305
- "num_of_instances": 1000,
306
- "f1_Person": 0.16666666666666666,
307
- "f1_Organization": 0.03252032520325203,
308
- "f1_Location": 0.06666666666666667,
309
- "f1_macro": 0.08861788617886178,
310
- "recall_macro": 0.061454532512588166,
311
- "precision_macro": 0.1605612378704432,
312
- "in_classes_support": 0.2943548387096774,
313
- "f1_micro": 0.055161544523246654,
314
- "recall_micro": 0.06666666666666667,
315
- "precision_micro": 0.04704301075268817,
316
- "score": 0.055161544523246654,
317
- "score_name": "f1_micro",
318
- "score_ci_low": 0.04056157487156359,
319
- "score_ci_high": 0.07432669896850874,
320
- "f1_micro_ci_low": 0.04056157487156359,
321
- "f1_micro_ci_high": 0.07432669896850874
322
- },
323
- "score": 0.055161544523246654,
324
- "score_name": "subsets_mean",
325
- "num_of_instances": 1000
326
- },
327
- "knowledge": {
328
- "mmlu_pro_biology": {
329
- "accuracy": 0.7323943661971831,
330
- "accuracy_ci_low": 0.6197183098591549,
331
- "accuracy_ci_high": 0.8309859154929577,
332
- "score_name": "accuracy",
333
- "score": 0.7323943661971831,
334
- "score_ci_high": 0.8309859154929577,
335
- "score_ci_low": 0.6197183098591549,
336
- "num_of_instances": 71
337
- },
338
- "mmlu_pro_business": {
339
- "accuracy": 0.39436619718309857,
340
- "accuracy_ci_low": 0.28169014084507044,
341
- "accuracy_ci_high": 0.5070422535211268,
342
- "score_name": "accuracy",
343
- "score": 0.39436619718309857,
344
- "score_ci_high": 0.5070422535211268,
345
- "score_ci_low": 0.28169014084507044,
346
- "num_of_instances": 71
347
- },
348
- "mmlu_pro_chemistry": {
349
- "accuracy": 0.23943661971830985,
350
- "accuracy_ci_low": 0.14084507042253522,
351
- "accuracy_ci_high": 0.3380281690140845,
352
- "score_name": "accuracy",
353
- "score": 0.23943661971830985,
354
- "score_ci_high": 0.3380281690140845,
355
- "score_ci_low": 0.14084507042253522,
356
- "num_of_instances": 71
357
- },
358
- "mmlu_pro_computer_science": {
359
- "accuracy": 0.647887323943662,
360
- "accuracy_ci_low": 0.5352112676056338,
361
- "accuracy_ci_high": 0.7464788732394366,
362
- "score_name": "accuracy",
363
- "score": 0.647887323943662,
364
- "score_ci_high": 0.7464788732394366,
365
- "score_ci_low": 0.5352112676056338,
366
- "num_of_instances": 71
367
- },
368
- "mmlu_pro_economics": {
369
- "accuracy": 0.7323943661971831,
370
- "accuracy_ci_low": 0.6056338028169014,
371
- "accuracy_ci_high": 0.8309859154929577,
372
- "score_name": "accuracy",
373
- "score": 0.7323943661971831,
374
- "score_ci_high": 0.8309859154929577,
375
- "score_ci_low": 0.6056338028169014,
376
- "num_of_instances": 71
377
- },
378
- "mmlu_pro_engineering": {
379
- "accuracy": 0.2676056338028169,
380
- "accuracy_ci_low": 0.16901408450704225,
381
- "accuracy_ci_high": 0.38028169014084506,
382
- "score_name": "accuracy",
383
- "score": 0.2676056338028169,
384
- "score_ci_high": 0.38028169014084506,
385
- "score_ci_low": 0.16901408450704225,
386
- "num_of_instances": 71
387
- },
388
- "mmlu_pro_health": {
389
- "accuracy": 0.6056338028169014,
390
- "accuracy_ci_low": 0.4788732394366197,
391
- "accuracy_ci_high": 0.704225352112676,
392
- "score_name": "accuracy",
393
- "score": 0.6056338028169014,
394
- "score_ci_high": 0.704225352112676,
395
- "score_ci_low": 0.4788732394366197,
396
- "num_of_instances": 71
397
- },
398
- "mmlu_pro_history": {
399
- "accuracy": 0.647887323943662,
400
- "accuracy_ci_low": 0.5211267605633803,
401
- "accuracy_ci_high": 0.7464788732394366,
402
- "score_name": "accuracy",
403
- "score": 0.647887323943662,
404
- "score_ci_high": 0.7464788732394366,
405
- "score_ci_low": 0.5211267605633803,
406
- "num_of_instances": 71
407
- },
408
- "mmlu_pro_law": {
409
- "accuracy": 0.5211267605633803,
410
- "accuracy_ci_low": 0.40913735882879854,
411
- "accuracy_ci_high": 0.6338028169014085,
412
- "score_name": "accuracy",
413
- "score": 0.5211267605633803,
414
- "score_ci_high": 0.6338028169014085,
415
- "score_ci_low": 0.40913735882879854,
416
- "num_of_instances": 71
417
- },
418
- "mmlu_pro_math": {
419
- "accuracy": 0.4225352112676056,
420
- "accuracy_ci_low": 0.30985915492957744,
421
- "accuracy_ci_high": 0.5275288557194965,
422
- "score_name": "accuracy",
423
- "score": 0.4225352112676056,
424
- "score_ci_high": 0.5275288557194965,
425
- "score_ci_low": 0.30985915492957744,
426
- "num_of_instances": 71
427
- },
428
- "mmlu_pro_other": {
429
- "accuracy": 0.647887323943662,
430
- "accuracy_ci_low": 0.5211267605633803,
431
- "accuracy_ci_high": 0.7464788732394366,
432
- "score_name": "accuracy",
433
- "score": 0.647887323943662,
434
- "score_ci_high": 0.7464788732394366,
435
- "score_ci_low": 0.5211267605633803,
436
- "num_of_instances": 71
437
- },
438
- "mmlu_pro_philosophy": {
439
- "accuracy": 0.6619718309859155,
440
- "accuracy_ci_low": 0.5492957746478874,
441
- "accuracy_ci_high": 0.7746478873239436,
442
- "score_name": "accuracy",
443
- "score": 0.6619718309859155,
444
- "score_ci_high": 0.7746478873239436,
445
- "score_ci_low": 0.5492957746478874,
446
- "num_of_instances": 71
447
- },
448
- "mmlu_pro_physics": {
449
- "accuracy": 0.39436619718309857,
450
- "accuracy_ci_low": 0.28169014084507044,
451
- "accuracy_ci_high": 0.5070422535211268,
452
- "score_name": "accuracy",
453
- "score": 0.39436619718309857,
454
- "score_ci_high": 0.5070422535211268,
455
- "score_ci_low": 0.28169014084507044,
456
- "num_of_instances": 71
457
- },
458
- "mmlu_pro_psychology": {
459
- "accuracy": 0.8028169014084507,
460
- "accuracy_ci_low": 0.704225352112676,
461
- "accuracy_ci_high": 0.8873239436619719,
462
- "score_name": "accuracy",
463
- "score": 0.8028169014084507,
464
- "score_ci_high": 0.8873239436619719,
465
- "score_ci_low": 0.704225352112676,
466
- "num_of_instances": 71
467
- },
468
- "score": 0.5513078470824949,
469
- "score_name": "subsets_mean",
470
- "num_of_instances": 994
471
- },
472
- "legal": {
473
- "legalbench_abercrombie": {
474
- "f1_macro": 0.30082491488530444,
475
- "f1_suggestive": 0.2926829268292683,
476
- "f1_generic": 0.3157894736842105,
477
- "f1_fanciful": 0.2,
478
- "f1_descriptive": 0.2608695652173913,
479
- "f1_arbitrary": 0.43478260869565216,
480
- "f1_macro_ci_low": 0.21165741181075054,
481
- "f1_macro_ci_high": 0.4315719879768282,
482
- "score_name": "f1_micro",
483
- "score": 0.30158730158730157,
484
- "score_ci_high": 0.421875,
485
- "score_ci_low": 0.2033898305084746,
486
- "num_of_instances": 85,
487
- "accuracy": 0.2235294117647059,
488
- "accuracy_ci_low": 0.15294117647058825,
489
- "accuracy_ci_high": 0.32721667655979375,
490
- "f1_micro": 0.30158730158730157,
491
- "f1_micro_ci_low": 0.2033898305084746,
492
- "f1_micro_ci_high": 0.421875
493
- },
494
- "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.2087664168882443,
496
- "f1_no": 0.38578680203045684,
497
- "f1_yes": 0.031746031746031744,
498
- "f1_macro_ci_low": 0.16304347826086957,
499
- "f1_macro_ci_high": 0.27128054977534694,
500
- "score_name": "f1_micro",
501
- "score": 0.3,
502
- "score_ci_high": 0.37342833232881084,
503
- "score_ci_low": 0.22950819672131148,
504
- "num_of_instances": 200,
505
- "accuracy": 0.195,
506
- "accuracy_ci_low": 0.145,
507
- "accuracy_ci_high": 0.2511829758893259,
508
- "f1_micro": 0.3,
509
- "f1_micro_ci_low": 0.22950819672131148,
510
- "f1_micro_ci_high": 0.37342833232881084
511
- },
512
- "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.1326239897668469,
514
- "f1_conclusion": 0.0,
515
- "f1_decree": 0.14814814814814814,
516
- "f1_issue": 0.05714285714285714,
517
- "f1_analysis": 0.15,
518
- "f1_facts": 0.06666666666666667,
519
- "f1_procedural history": 0.25,
520
- "f1_rule": 0.2564102564102564,
521
- "f1_macro_ci_low": 0.0850100965627728,
522
- "f1_macro_ci_high": 0.1996422423627835,
523
- "score_name": "f1_micro",
524
- "score": 0.14345991561181434,
525
- "score_ci_high": 0.2175732217573222,
526
- "score_ci_low": 0.08928571428571429,
527
- "num_of_instances": 200,
528
- "accuracy": 0.085,
529
- "accuracy_ci_low": 0.055,
530
- "accuracy_ci_high": 0.135,
531
- "f1_micro": 0.14345991561181434,
532
- "f1_micro_ci_low": 0.08928571428571429,
533
- "f1_micro_ci_high": 0.2175732217573222
534
- },
535
- "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.27590718171176754,
537
- "f1_yes": 0.3106796116504854,
538
- "f1_no": 0.24113475177304963,
539
- "f1_macro_ci_low": 0.2059942090622298,
540
- "f1_macro_ci_high": 0.35594480291914055,
541
- "score_name": "f1_micro",
542
- "score": 0.27049180327868855,
543
- "score_ci_high": 0.3511987633583538,
544
- "score_ci_low": 0.20259771606756347,
545
- "num_of_instances": 200,
546
- "accuracy": 0.165,
547
- "accuracy_ci_low": 0.12,
548
- "accuracy_ci_high": 0.225,
549
- "f1_micro": 0.27049180327868855,
550
- "f1_micro_ci_low": 0.20259771606756347,
551
- "f1_micro_ci_high": 0.3511987633583538
552
- },
553
- "legalbench_proa": {
554
- "f1_macro": 0.7697368421052632,
555
- "f1_yes": 0.75,
556
- "f1_no": 0.7894736842105263,
557
- "f1_macro_ci_low": 0.6817007087256215,
558
- "f1_macro_ci_high": 0.8427704260296438,
559
- "score_name": "f1_micro",
560
- "score": 0.7714285714285715,
561
- "score_ci_high": 0.8435374149659864,
562
- "score_ci_low": 0.6821705426356589,
563
- "num_of_instances": 85,
564
- "accuracy": 0.6352941176470588,
565
- "accuracy_ci_low": 0.5176470588235295,
566
- "accuracy_ci_high": 0.7294117647058823,
567
- "f1_micro": 0.7714285714285715,
568
- "f1_micro_ci_low": 0.6821705426356589,
569
- "f1_micro_ci_high": 0.8435374149659864
570
- },
571
- "score": 0.3573935183812752,
572
- "score_name": "subsets_mean",
573
- "num_of_instances": 770
574
- },
575
- "news_classification": {
576
- "20_newsgroups_short": {
577
- "f1_macro": 0.31185856604704804,
578
- "f1_cars": 0.5294117647058824,
579
- "f1_windows x": 0.0,
580
- "f1_computer graphics": 0.21875,
581
- "f1_atheism": 0.0,
582
- "f1_religion": 0.1935483870967742,
583
- "f1_medicine": 0.4444444444444444,
584
- "f1_christianity": 0.07272727272727272,
585
- "f1_microsoft windows": 0.13793103448275862,
586
- "f1_middle east": 0.2692307692307692,
587
- "f1_motorcycles": 0.41975308641975306,
588
- "f1_pc hardware": 0.47191011235955055,
589
- "f1_mac hardware": 0.4943820224719101,
590
- "f1_for sale": 0.2608695652173913,
591
- "f1_guns": 0.0784313725490196,
592
- "f1_space": 0.4594594594594595,
593
- "f1_cryptography": 0.3389830508474576,
594
- "f1_baseball": 0.46153846153846156,
595
- "f1_hockey": 0.5918367346938775,
596
- "f1_politics": 0.37142857142857144,
597
- "f1_electronics": 0.4225352112676056,
598
- "f1_macro_ci_low": 0.2856078509025084,
599
- "f1_macro_ci_high": 0.3426020462142722,
600
- "score_name": "f1_micro",
601
- "score": 0.3433734939759036,
602
- "score_ci_high": 0.3755990938286412,
603
- "score_ci_low": 0.3096235116477192,
604
- "num_of_instances": 1000,
605
- "accuracy": 0.228,
606
- "accuracy_ci_low": 0.202,
607
- "accuracy_ci_high": 0.253,
608
- "f1_micro": 0.3433734939759036,
609
- "f1_micro_ci_low": 0.3096235116477192,
610
- "f1_micro_ci_high": 0.3755990938286412
611
- },
612
- "score": 0.3433734939759036,
613
- "score_name": "subsets_mean",
614
- "num_of_instances": 1000
615
- },
616
- "product_help": {
617
- "cfpb_product_2023": {
618
- "f1_macro": 0.6519421467580517,
619
- "f1_student loan": 0.75,
620
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.7592592592592593,
621
- "f1_debt collection": 0.5138888888888888,
622
- "f1_checking or savings account": 0.7073170731707317,
623
- "f1_mortgage": 0.7931034482758621,
624
- "f1_payday loan or title loan or personal loan": 0.4444444444444444,
625
- "f1_credit card or prepaid card": 0.6727272727272727,
626
- "f1_money transfer or virtual currency or money service": 0.6341463414634146,
627
- "f1_vehicle loan or lease": 0.5925925925925926,
628
- "f1_macro_ci_low": 0.5871374382389457,
629
- "f1_macro_ci_high": 0.7213495817777442,
630
- "score_name": "f1_micro",
631
- "score": 0.7237076648841355,
632
- "score_ci_high": 0.7514570299103845,
633
- "score_ci_low": 0.6955584945084361,
634
- "num_of_instances": 1000,
635
- "accuracy": 0.609,
636
- "accuracy_ci_low": 0.577,
637
- "accuracy_ci_high": 0.641,
638
- "f1_micro": 0.7237076648841355,
639
- "f1_micro_ci_low": 0.6955584945084361,
640
- "f1_micro_ci_high": 0.7514570299103845
641
- },
642
- "cfpb_product_watsonx": {
643
- "f1_macro": 0.6498825672130026,
644
- "f1_mortgages and loans": 0.6711409395973155,
645
- "f1_credit card": 0.6853146853146853,
646
- "f1_debt collection": 0.56,
647
- "f1_credit reporting": 0.7279151943462897,
648
- "f1_retail banking": 0.6050420168067226,
649
- "f1_macro_ci_low": 0.6099650997428573,
650
- "f1_macro_ci_high": 0.6950683737354554,
651
- "score_name": "f1_micro",
652
- "score": 0.6605293440736478,
653
- "score_ci_high": 0.7005417538024762,
654
- "score_ci_low": 0.6186622377558174,
655
- "num_of_instances": 500,
656
- "accuracy": 0.574,
657
- "accuracy_ci_low": 0.534,
658
- "accuracy_ci_high": 0.618,
659
- "f1_micro": 0.6605293440736478,
660
- "f1_micro_ci_low": 0.6186622377558174,
661
- "f1_micro_ci_high": 0.7005417538024762
662
- },
663
- "score": 0.6921185044788917,
664
- "score_name": "subsets_mean",
665
- "num_of_instances": 1500
666
- },
667
- "qa_finance": {
668
- "fin_qa": {
669
- "num_of_instances": 1000,
670
- "execution_accuracy": 0.113,
671
- "program_accuracy": 0.114,
672
- "score": 0.114,
673
- "score_name": "program_accuracy",
674
- "execution_accuracy_ci_low": 0.095,
675
- "execution_accuracy_ci_high": 0.135,
676
- "program_accuracy_ci_low": 0.096,
677
- "program_accuracy_ci_high": 0.135,
678
- "score_ci_low": 0.096,
679
- "score_ci_high": 0.135
680
- },
681
- "score": 0.114,
682
- "score_name": "subsets_mean",
683
- "num_of_instances": 1000
684
- },
685
- "rag_general": {
686
- "rag_response_generation_clapnq": {
687
- "precision": 0.3193234609846695,
688
- "recall": 0.6156352660927287,
689
- "f1": 0.3579933019872818,
690
- "precision_ci_low": 0.2995158081508201,
691
- "precision_ci_high": 0.34026768128986357,
692
- "recall_ci_low": 0.599899034156362,
693
- "recall_ci_high": 0.6313695528855681,
694
- "f1_ci_low": 0.34026079062391323,
695
- "f1_ci_high": 0.37577002560346623,
696
- "score_name": "f1",
697
- "score": 0.3579933019872818,
698
- "score_ci_high": 0.37577002560346623,
699
- "score_ci_low": 0.34026079062391323,
700
- "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.6041086231172085,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6908169340590636,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5559258947273096,
704
- "faithfullness_f1_token_overlap": 0.3085026884811217,
705
- "faithfullness_recall_token_overlap": 0.24647156332743023,
706
- "faithfullness_precision_token_overlap": 0.536789234125785,
707
- "correctness_f1_token_overlap": 0.3579933019872818,
708
- "correctness_recall_token_overlap": 0.6156352660927287,
709
- "correctness_precision_token_overlap": 0.3193234609846695
710
- },
711
- "score": 0.3579933019872818,
712
- "score_name": "subsets_mean",
713
- "num_of_instances": 600
714
- },
715
- "reasoning": {
716
- "hellaswag": {
717
- "accuracy": 0.57,
718
- "accuracy_ci_low": 0.537,
719
- "accuracy_ci_high": 0.599,
720
- "score_name": "accuracy",
721
- "score": 0.57,
722
- "score_ci_high": 0.599,
723
- "score_ci_low": 0.537,
724
- "num_of_instances": 1000
725
- },
726
- "openbook_qa": {
727
- "accuracy": 0.924,
728
- "accuracy_ci_low": 0.8978500821223476,
729
- "accuracy_ci_high": 0.944,
730
- "score_name": "accuracy",
731
- "score": 0.924,
732
- "score_ci_high": 0.944,
733
- "score_ci_low": 0.8978500821223476,
734
- "num_of_instances": 500
735
- },
736
- "score": 0.747,
737
- "score_name": "subsets_mean",
738
- "num_of_instances": 1500
739
- },
740
- "safety": {
741
- "attaq_500": {
742
- "severity_critical": 3.0,
743
- "severity_high": 1.0,
744
- "severity_medium": 3.0,
745
- "severity_low": 93.0,
746
- "category_harmful_info": 0.7688285382612356,
747
- "category_deception": 0.8504795281033058,
748
- "category_explicit_content": 0.8499560091230604,
749
- "category_violence": 0.8008088633891136,
750
- "category_discrimination": 0.7961234071898081,
751
- "category_substance_abuse": 0.8387069291538662,
752
- "category_pii": 0.8280106396586807,
753
- "safety": 0.8051073061449763,
754
- "safety_ci_low": 0.7662176976956901,
755
- "safety_ci_high": 0.8348793957598559,
756
- "score_name": "safety",
757
- "score": 0.8051073061449763,
758
- "score_ci_high": 0.8348793957598559,
759
- "score_ci_low": 0.7662176976956901,
760
- "num_of_instances": 100
761
- },
762
- "score": 0.8051073061449763,
763
- "score_name": "subsets_mean",
764
- "num_of_instances": 100
765
- },
766
- "summarization": {
767
- "billsum_document_filtered_to_6000_chars": {
768
- "num_of_instances": 528,
769
- "rouge1": 0.4079437392329112,
770
- "rougeLsum": 0.352934731899625,
771
- "rougeL": 0.2881192661001498,
772
- "score": 0.2881192661001498,
773
- "score_name": "rougeL",
774
- "rouge2": 0.20309268400418845,
775
- "rouge1_ci_low": 0.39815436123221776,
776
- "rouge1_ci_high": 0.41769481108434164,
777
- "rougeLsum_ci_low": 0.3428414762936942,
778
- "rougeLsum_ci_high": 0.36147234302505843,
779
- "rougeL_ci_low": 0.2810508281662768,
780
- "rougeL_ci_high": 0.2958753092187963,
781
- "score_ci_low": 0.2810508281662768,
782
- "score_ci_high": 0.2958753092187963,
783
- "rouge2_ci_low": 0.19601529597372655,
784
- "rouge2_ci_high": 0.21114250528228737
785
- },
786
- "tldr_document_filtered_to_6000_chars": {
787
- "num_of_instances": 1000,
788
- "rouge1": 0.12777796269592456,
789
- "rougeLsum": 0.10603826262531557,
790
- "rougeL": 0.09179328413499686,
791
- "score": 0.09179328413499686,
792
- "score_name": "rougeL",
793
- "rouge2": 0.018851084572764187,
794
- "rouge1_ci_low": 0.12168881499661288,
795
- "rouge1_ci_high": 0.1332671550509936,
796
- "rougeLsum_ci_low": 0.10116850286523955,
797
- "rougeLsum_ci_high": 0.1106709838674053,
798
- "rougeL_ci_low": 0.08725116318521317,
799
- "rougeL_ci_high": 0.09575800533068254,
800
- "score_ci_low": 0.08725116318521317,
801
- "score_ci_high": 0.09575800533068254,
802
- "rouge2_ci_low": 0.01662623690565808,
803
- "rouge2_ci_high": 0.020972697895929062
804
- },
805
- "score": 0.18995627511757335,
806
- "score_name": "subsets_mean",
807
- "num_of_instances": 1528
808
- },
809
- "translation": {
810
- "mt_flores_101_ara_eng": {
811
- "num_of_instances": 66,
812
- "counts": [
813
- 1289,
814
- 858,
815
- 605,
816
- 439
817
- ],
818
- "totals": [
819
- 1947,
820
- 1881,
821
- 1815,
822
- 1749
823
- ],
824
- "precisions": [
825
- 0.6620441705187469,
826
- 0.456140350877193,
827
- 0.33333333333333337,
828
- 0.2510005717552887
829
- ],
830
- "bp": 1.0,
831
- "sys_len": 1947,
832
- "ref_len": 1734,
833
- "sacrebleu": 0.39868943613707586,
834
- "score": 0.39868943613707586,
835
- "score_name": "sacrebleu",
836
- "score_ci_low": 0.34684429082723056,
837
- "score_ci_high": 0.4568678775074209,
838
- "sacrebleu_ci_low": 0.34684429082723056,
839
- "sacrebleu_ci_high": 0.4568678775074209
840
- },
841
- "mt_flores_101_deu_eng": {
842
- "num_of_instances": 66,
843
- "counts": [
844
- 1323,
845
- 887,
846
- 629,
847
- 444
848
- ],
849
- "totals": [
850
- 2483,
851
- 2417,
852
- 2351,
853
- 2285
854
- ],
855
- "precisions": [
856
- 0.5328231977446637,
857
- 0.36698386429458,
858
- 0.26754572522330927,
859
- 0.19431072210065647
860
- ],
861
- "bp": 1.0,
862
- "sys_len": 2483,
863
- "ref_len": 1734,
864
- "sacrebleu": 0.3175274085826544,
865
- "score": 0.3175274085826544,
866
- "score_name": "sacrebleu",
867
- "score_ci_low": 0.271260768895482,
868
- "score_ci_high": 0.365445678313604,
869
- "sacrebleu_ci_low": 0.271260768895482,
870
- "sacrebleu_ci_high": 0.365445678313604
871
- },
872
- "mt_flores_101_eng_ara": {
873
- "num_of_instances": 66,
874
- "counts": [
875
- 940,
876
- 521,
877
- 315,
878
- 191
879
- ],
880
- "totals": [
881
- 1688,
882
- 1622,
883
- 1556,
884
- 1490
885
- ],
886
- "precisions": [
887
- 0.556872037914692,
888
- 0.3212083847102343,
889
- 0.20244215938303342,
890
- 0.12818791946308725
891
- ],
892
- "bp": 1.0,
893
- "sys_len": 1688,
894
- "ref_len": 1589,
895
- "sacrebleu": 0.2610192792824636,
896
- "score": 0.2610192792824636,
897
- "score_name": "sacrebleu",
898
- "score_ci_low": 0.20817526367502337,
899
- "score_ci_high": 0.29958193407273404,
900
- "sacrebleu_ci_low": 0.20817526367502337,
901
- "sacrebleu_ci_high": 0.29958193407273404
902
- },
903
- "mt_flores_101_eng_deu": {
904
- "num_of_instances": 66,
905
- "counts": [
906
- 1254,
907
- 784,
908
- 532,
909
- 376
910
- ],
911
- "totals": [
912
- 1815,
913
- 1749,
914
- 1683,
915
- 1617
916
- ],
917
- "precisions": [
918
- 0.6909090909090909,
919
- 0.4482561463693539,
920
- 0.31610219845513965,
921
- 0.23252937538651824
922
- ],
923
- "bp": 0.98904120617152,
924
- "sys_len": 1815,
925
- "ref_len": 1835,
926
- "sacrebleu": 0.38417359468716306,
927
- "score": 0.38417359468716306,
928
- "score_name": "sacrebleu",
929
- "score_ci_low": 0.32789485731980855,
930
- "score_ci_high": 0.4187305224203214,
931
- "sacrebleu_ci_low": 0.32789485731980855,
932
- "sacrebleu_ci_high": 0.4187305224203214
933
- },
934
- "mt_flores_101_eng_fra": {
935
- "num_of_instances": 66,
936
- "counts": [
937
- 1572,
938
- 1194,
939
- 949,
940
- 766
941
- ],
942
- "totals": [
943
- 2097,
944
- 2031,
945
- 1965,
946
- 1899
947
- ],
948
- "precisions": [
949
- 0.7496423462088698,
950
- 0.5878877400295421,
951
- 0.48295165394402034,
952
- 0.4033701948393892
953
- ],
954
- "bp": 1.0,
955
- "sys_len": 2097,
956
- "ref_len": 2068,
957
- "sacrebleu": 0.5413012055320727,
958
- "score": 0.5413012055320727,
959
- "score_name": "sacrebleu",
960
- "score_ci_low": 0.5011833767453445,
961
- "score_ci_high": 0.591093351022506,
962
- "sacrebleu_ci_low": 0.5011833767453445,
963
- "sacrebleu_ci_high": 0.591093351022506
964
- },
965
- "mt_flores_101_eng_kor": {
966
- "num_of_instances": 66,
967
- "counts": [
968
- 1382,
969
- 762,
970
- 450,
971
- 277
972
- ],
973
- "totals": [
974
- 2304,
975
- 2238,
976
- 2172,
977
- 2106
978
- ],
979
- "precisions": [
980
- 0.5998263888888888,
981
- 0.34048257372654156,
982
- 0.20718232044198895,
983
- 0.1315289648622982
984
- ],
985
- "bp": 1.0,
986
- "sys_len": 2304,
987
- "ref_len": 2235,
988
- "sacrebleu": 0.27313266242858875,
989
- "score": 0.27313266242858875,
990
- "score_name": "sacrebleu",
991
- "score_ci_low": 0.24101006670532885,
992
- "score_ci_high": 0.2985709120047681,
993
- "sacrebleu_ci_low": 0.24101006670532885,
994
- "sacrebleu_ci_high": 0.2985709120047681
995
- },
996
- "mt_flores_101_eng_por": {
997
- "num_of_instances": 66,
998
- "counts": [
999
- 1454,
1000
- 1049,
1001
- 810,
1002
- 633
1003
- ],
1004
- "totals": [
1005
- 2019,
1006
- 1953,
1007
- 1887,
1008
- 1821
1009
- ],
1010
- "precisions": [
1011
- 0.7201584943041109,
1012
- 0.5371223758320532,
1013
- 0.4292527821939586,
1014
- 0.3476112026359144
1015
- ],
1016
- "bp": 1.0,
1017
- "sys_len": 2019,
1018
- "ref_len": 1916,
1019
- "sacrebleu": 0.49014779569163686,
1020
- "score": 0.49014779569163686,
1021
- "score_name": "sacrebleu",
1022
- "score_ci_low": 0.447345907278528,
1023
- "score_ci_high": 0.5368115765817915,
1024
- "sacrebleu_ci_low": 0.447345907278528,
1025
- "sacrebleu_ci_high": 0.5368115765817915
1026
- },
1027
- "mt_flores_101_eng_ron": {
1028
- "num_of_instances": 66,
1029
- "counts": [
1030
- 1390,
1031
- 967,
1032
- 688,
1033
- 489
1034
- ],
1035
- "totals": [
1036
- 1962,
1037
- 1896,
1038
- 1830,
1039
- 1764
1040
- ],
1041
- "precisions": [
1042
- 0.7084607543323139,
1043
- 0.5100210970464135,
1044
- 0.37595628415300547,
1045
- 0.27721088435374147
1046
- ],
1047
- "bp": 1.0,
1048
- "sys_len": 1962,
1049
- "ref_len": 1949,
1050
- "sacrebleu": 0.4405172214713006,
1051
- "score": 0.4405172214713006,
1052
- "score_name": "sacrebleu",
1053
- "score_ci_low": 0.40615604089781865,
1054
- "score_ci_high": 0.516381897740174,
1055
- "sacrebleu_ci_low": 0.40615604089781865,
1056
- "sacrebleu_ci_high": 0.516381897740174
1057
- },
1058
- "mt_flores_101_eng_spa": {
1059
- "num_of_instances": 66,
1060
- "counts": [
1061
- 1287,
1062
- 732,
1063
- 440,
1064
- 265
1065
- ],
1066
- "totals": [
1067
- 2008,
1068
- 1942,
1069
- 1876,
1070
- 1810
1071
- ],
1072
- "precisions": [
1073
- 0.6409362549800797,
1074
- 0.3769309989701339,
1075
- 0.2345415778251599,
1076
- 0.1464088397790055
1077
- ],
1078
- "bp": 0.956168891168866,
1079
- "sys_len": 2008,
1080
- "ref_len": 2098,
1081
- "sacrebleu": 0.28856959420154726,
1082
- "score": 0.28856959420154726,
1083
- "score_name": "sacrebleu",
1084
- "score_ci_low": 0.2645022427610819,
1085
- "score_ci_high": 0.3240217870629309,
1086
- "sacrebleu_ci_low": 0.2645022427610819,
1087
- "sacrebleu_ci_high": 0.3240217870629309
1088
- },
1089
- "mt_flores_101_fra_eng": {
1090
- "num_of_instances": 66,
1091
- "counts": [
1092
- 1361,
1093
- 951,
1094
- 694,
1095
- 510
1096
- ],
1097
- "totals": [
1098
- 2543,
1099
- 2477,
1100
- 2411,
1101
- 2345
1102
- ],
1103
- "precisions": [
1104
- 0.5351946519858435,
1105
- 0.3839321760193783,
1106
- 0.2878473662380755,
1107
- 0.21748400852878466
1108
- ],
1109
- "bp": 1.0,
1110
- "sys_len": 2543,
1111
- "ref_len": 1734,
1112
- "sacrebleu": 0.33677431862395624,
1113
- "score": 0.33677431862395624,
1114
- "score_name": "sacrebleu",
1115
- "score_ci_low": 0.2922714206661574,
1116
- "score_ci_high": 0.38804905185639504,
1117
- "sacrebleu_ci_low": 0.2922714206661574,
1118
- "sacrebleu_ci_high": 0.38804905185639504
1119
- },
1120
- "mt_flores_101_jpn_eng": {
1121
- "num_of_instances": 66,
1122
- "counts": [
1123
- 1134,
1124
- 627,
1125
- 390,
1126
- 248
1127
- ],
1128
- "totals": [
1129
- 1999,
1130
- 1933,
1131
- 1867,
1132
- 1801
1133
- ],
1134
- "precisions": [
1135
- 0.5672836418209105,
1136
- 0.32436627004655977,
1137
- 0.2088912694161757,
1138
- 0.13770127706829538
1139
- ],
1140
- "bp": 1.0,
1141
- "sys_len": 1999,
1142
- "ref_len": 1734,
1143
- "sacrebleu": 0.2697264590787591,
1144
- "score": 0.2697264590787591,
1145
- "score_name": "sacrebleu",
1146
- "score_ci_low": 0.23374708119412382,
1147
- "score_ci_high": 0.32852575808051926,
1148
- "sacrebleu_ci_low": 0.23374708119412382,
1149
- "sacrebleu_ci_high": 0.32852575808051926
1150
- },
1151
- "mt_flores_101_kor_eng": {
1152
- "num_of_instances": 66,
1153
- "counts": [
1154
- 1102,
1155
- 602,
1156
- 369,
1157
- 241
1158
- ],
1159
- "totals": [
1160
- 1925,
1161
- 1859,
1162
- 1793,
1163
- 1727
1164
- ],
1165
- "precisions": [
1166
- 0.5724675324675325,
1167
- 0.32383001613770845,
1168
- 0.20580033463469047,
1169
- 0.13954834973943256
1170
- ],
1171
- "bp": 1.0,
1172
- "sys_len": 1925,
1173
- "ref_len": 1734,
1174
- "sacrebleu": 0.27012183165344417,
1175
- "score": 0.27012183165344417,
1176
- "score_name": "sacrebleu",
1177
- "score_ci_low": 0.23833827580048095,
1178
- "score_ci_high": 0.320793499898261,
1179
- "sacrebleu_ci_low": 0.23833827580048095,
1180
- "sacrebleu_ci_high": 0.320793499898261
1181
- },
1182
- "mt_flores_101_por_eng": {
1183
- "num_of_instances": 66,
1184
- "counts": [
1185
- 1392,
1186
- 1000,
1187
- 752,
1188
- 574
1189
- ],
1190
- "totals": [
1191
- 2564,
1192
- 2498,
1193
- 2432,
1194
- 2366
1195
- ],
1196
- "precisions": [
1197
- 0.5429017160686428,
1198
- 0.400320256204964,
1199
- 0.3092105263157895,
1200
- 0.242603550295858
1201
- ],
1202
- "bp": 1.0,
1203
- "sys_len": 2564,
1204
- "ref_len": 1734,
1205
- "sacrebleu": 0.35733047002149876,
1206
- "score": 0.35733047002149876,
1207
- "score_name": "sacrebleu",
1208
- "score_ci_low": 0.2995890270688701,
1209
- "score_ci_high": 0.40598076558127455,
1210
- "sacrebleu_ci_low": 0.2995890270688701,
1211
- "sacrebleu_ci_high": 0.40598076558127455
1212
- },
1213
- "mt_flores_101_ron_eng": {
1214
- "num_of_instances": 66,
1215
- "counts": [
1216
- 1376,
1217
- 983,
1218
- 722,
1219
- 538
1220
- ],
1221
- "totals": [
1222
- 2343,
1223
- 2277,
1224
- 2211,
1225
- 2145
1226
- ],
1227
- "precisions": [
1228
- 0.5872812633376013,
1229
- 0.43170838823012736,
1230
- 0.32654907281772955,
1231
- 0.25081585081585084
1232
- ],
1233
- "bp": 1.0,
1234
- "sys_len": 2343,
1235
- "ref_len": 1734,
1236
- "sacrebleu": 0.3796077031006635,
1237
- "score": 0.3796077031006635,
1238
- "score_name": "sacrebleu",
1239
- "score_ci_low": 0.3432297777298846,
1240
- "score_ci_high": 0.4214166231989559,
1241
- "sacrebleu_ci_low": 0.3432297777298846,
1242
- "sacrebleu_ci_high": 0.4214166231989559
1243
- },
1244
- "mt_flores_101_spa_eng": {
1245
- "num_of_instances": 66,
1246
- "counts": [
1247
- 1238,
1248
- 705,
1249
- 452,
1250
- 287
1251
- ],
1252
- "totals": [
1253
- 2444,
1254
- 2378,
1255
- 2312,
1256
- 2246
1257
- ],
1258
- "precisions": [
1259
- 0.5065466448445172,
1260
- 0.2964676198486123,
1261
- 0.19550173010380623,
1262
- 0.12778272484416742
1263
- ],
1264
- "bp": 1.0,
1265
- "sys_len": 2444,
1266
- "ref_len": 1734,
1267
- "sacrebleu": 0.24748840965516117,
1268
- "score": 0.24748840965516117,
1269
- "score_name": "sacrebleu",
1270
- "score_ci_low": 0.21183776036339763,
1271
- "score_ci_high": 0.2806697118543708,
1272
- "sacrebleu_ci_low": 0.21183776036339763,
1273
- "sacrebleu_ci_high": 0.2806697118543708
1274
- },
1275
- "score": 0.3504084926765324,
1276
- "score_name": "subsets_mean",
1277
- "num_of_instances": 990
1278
- },
1279
- "score": 0.4277276595742623,
1280
- "score_name": "subsets_mean",
1281
- "num_of_instances": 12472
1282
- }
1283
- }
 
results/bluebench/{2025-06-23T09-36-33_evaluation_results.json → 2025-07-02T14-58-20_evaluation_results.json} RENAMED
@@ -1,6 +1,6 @@
1
  {
2
  "environment_info": {
3
- "timestamp_utc": "2025-06-23T13:36:29.058411Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
@@ -8,7 +8,7 @@
8
  "--model",
9
  "cross_provider",
10
  "--model_args",
11
- "model_name=watsonx/mistralai/pixtral-12b,max_tokens=256",
12
  "--output_path",
13
  "./results/bluebench",
14
  "--log_samples",
@@ -26,9 +26,9 @@
26
  "num_fewshots": null,
27
  "limit": null,
28
  "batch_size": 8,
29
- "model": "watsonx/mistralai/pixtral-12b",
30
  "model_args": {
31
- "max_tokens": 256
32
  },
33
  "gen_kwargs": null,
34
  "chat_template_kwargs": null,
@@ -41,8 +41,8 @@
41
  "disable_hf_cache": false,
42
  "cache_dir": null
43
  },
44
- "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
@@ -51,25 +51,25 @@
51
  "triton": "3.3.1",
52
  "nltk": "3.9.1",
53
  "anyio": "4.9.0",
 
54
  "absl-py": "2.3.0",
55
  "tiktoken": "0.9.0",
56
  "charset-normalizer": "3.4.2",
57
  "nvidia-cuda-runtime-cu12": "12.6.77",
58
  "sympy": "1.14.0",
59
  "mecab-ko": "1.0.1",
60
- "litellm": "1.72.6.post1",
61
  "httpcore": "1.0.9",
 
62
  "Jinja2": "3.1.6",
63
  "jsonschema-specifications": "2025.4.1",
64
  "pydantic_core": "2.33.2",
65
  "nvidia-cusparse-cu12": "12.5.4.2",
 
66
  "yarl": "1.20.1",
67
- "openai": "1.88.0",
68
  "portalocker": "3.2.0",
69
  "pandas": "2.3.0",
70
  "multiprocess": "0.70.16",
71
  "jsonschema": "4.24.0",
72
- "unitxt": "1.24.0",
73
  "nvidia-nvjitlink-cu12": "12.6.85",
74
  "nvidia-cublas-cu12": "12.6.4.1",
75
  "pydantic": "2.11.7",
@@ -79,7 +79,7 @@
79
  "contourpy": "1.3.2",
80
  "aiosignal": "1.3.2",
81
  "nvidia-cuda-cupti-cu12": "12.6.80",
82
- "pillow": "11.2.1",
83
  "six": "1.17.0",
84
  "diskcache": "5.6.3",
85
  "tqdm": "4.67.1",
@@ -98,17 +98,16 @@
98
  "kiwisolver": "1.4.8",
99
  "networkx": "3.4.2",
100
  "typing-inspection": "0.4.1",
101
- "lxml": "5.4.0",
102
  "sniffio": "1.3.1",
103
  "scikit-learn": "1.7.0",
 
104
  "nvidia-curand-cu12": "10.3.7.77",
105
  "pip": "25.1.1",
 
106
  "fonttools": "4.58.4",
107
- "transformers": "4.52.4",
108
  "datasets": "3.6.0",
109
  "nvidia-cusolver-cu12": "11.7.1.2",
110
  "cycler": "0.12.1",
111
- "evaluate": "0.4.3",
112
  "distro": "1.9.0",
113
  "idna": "3.10",
114
  "MarkupSafe": "3.0.2",
@@ -122,44 +121,45 @@
122
  "joblib": "1.5.1",
123
  "fsspec": "2025.3.0",
124
  "dill": "0.3.8",
125
- "tokenizers": "0.21.1",
126
  "wheel": "0.45.1",
127
  "nvidia-nvtx-cu12": "12.6.77",
128
  "nvidia-cusparselt-cu12": "0.6.3",
129
- "hf-xet": "1.1.4",
130
  "propcache": "0.3.2",
131
  "numpy": "2.2.6",
132
  "mpmath": "1.3.0",
133
- "multidict": "6.5.0",
134
  "conllu": "6.0.0",
 
135
  "safetensors": "0.5.3",
136
  "requests": "2.32.4",
137
  "regex": "2024.11.6",
138
  "aiohttp": "3.12.13",
139
  "tabulate": "0.9.0",
 
140
  "certifi": "2025.6.15",
141
- "accelerate": "1.8.0",
142
  "nvidia-cufft-cu12": "11.3.0.4",
143
  "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
  "click": "8.2.1",
145
  "typing_extensions": "4.12.2",
146
  "attrs": "25.3.0",
147
  "exceptiongroup": "1.3.0",
 
148
  "tenacity": "9.1.2",
149
  "pytz": "2025.2",
150
  "aiohappyeyeballs": "2.6.1",
151
  "python-dateutil": "2.9.0.post0",
152
  "torch": "2.7.1",
153
- "python-dotenv": "1.1.0",
 
154
  "httpx": "0.28.1",
155
  "matplotlib": "3.10.3",
156
  "xxhash": "3.5.0",
157
  "PyYAML": "6.0.2",
158
- "huggingface-hub": "0.33.0",
159
  "colorama": "0.4.6",
160
- "rpds-py": "0.25.1",
161
  "threadpoolctl": "3.6.0",
162
  "nvidia-cudnn-cu12": "9.5.1.17",
 
163
  "jaraco.collections": "5.1.0",
164
  "tomli": "2.0.1",
165
  "backports.tarfile": "1.2.0",
@@ -176,1108 +176,1106 @@
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
- "accuracy": 0.5222222222222223,
180
- "accuracy_ci_low": 0.4222222222222222,
181
- "accuracy_ci_high": 0.6333333333333333,
182
  "score_name": "accuracy",
183
- "score": 0.5222222222222223,
184
- "score_ci_high": 0.6333333333333333,
185
- "score_ci_low": 0.4222222222222222,
186
- "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
  "accuracy": 0.5555555555555556,
190
- "accuracy_ci_low": 0.44726747907364484,
191
- "accuracy_ci_high": 0.6555555555555556,
192
  "score_name": "accuracy",
193
  "score": 0.5555555555555556,
194
- "score_ci_high": 0.6555555555555556,
195
- "score_ci_low": 0.44726747907364484,
196
- "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
199
- "accuracy": 0.8666666666666667,
200
- "accuracy_ci_low": 0.7781253622132644,
201
- "accuracy_ci_high": 0.9333333333333333,
202
  "score_name": "accuracy",
203
- "score": 0.8666666666666667,
204
- "score_ci_high": 0.9333333333333333,
205
- "score_ci_low": 0.7781253622132644,
206
- "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
- "accuracy": 0.5777777777777777,
210
- "accuracy_ci_low": 0.457520776596763,
211
- "accuracy_ci_high": 0.6777777777777778,
212
  "score_name": "accuracy",
213
- "score": 0.5777777777777777,
214
- "score_ci_high": 0.6777777777777778,
215
- "score_ci_low": 0.457520776596763,
216
- "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
- "accuracy": 0.5888888888888889,
220
- "accuracy_ci_low": 0.4888888888888889,
221
- "accuracy_ci_high": 0.6804301831819051,
222
  "score_name": "accuracy",
223
- "score": 0.5888888888888889,
224
- "score_ci_high": 0.6804301831819051,
225
- "score_ci_low": 0.4888888888888889,
226
- "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
- "accuracy": 0.9666666666666667,
230
- "accuracy_ci_low": 0.9111111111111111,
231
- "accuracy_ci_high": 0.9888888888888889,
232
  "score_name": "accuracy",
233
- "score": 0.9666666666666667,
234
- "score_ci_high": 0.9888888888888889,
235
- "score_ci_low": 0.9111111111111111,
236
- "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
239
- "accuracy": 0.8555555555555555,
240
- "accuracy_ci_low": 0.7725017589399771,
241
- "accuracy_ci_high": 0.9222222222222223,
242
  "score_name": "accuracy",
243
- "score": 0.8555555555555555,
244
- "score_ci_high": 0.9222222222222223,
245
- "score_ci_low": 0.7725017589399771,
246
- "num_of_instances": 90
247
  },
248
  "safety_bbq_race_x_ses": {
249
  "accuracy": 0.6666666666666666,
250
- "accuracy_ci_low": 0.5666666666666667,
251
- "accuracy_ci_high": 0.7555555555555555,
252
  "score_name": "accuracy",
253
  "score": 0.6666666666666666,
254
- "score_ci_high": 0.7555555555555555,
255
- "score_ci_low": 0.5666666666666667,
256
- "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
- "accuracy": 0.6444444444444445,
260
- "accuracy_ci_low": 0.5444444444444444,
261
- "accuracy_ci_high": 0.7444444444444445,
262
  "score_name": "accuracy",
263
- "score": 0.6444444444444445,
264
- "score_ci_high": 0.7444444444444445,
265
- "score_ci_low": 0.5444444444444444,
266
- "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
- "accuracy": 0.6444444444444445,
270
- "accuracy_ci_low": 0.5444444444444444,
271
- "accuracy_ci_high": 0.7444444444444445,
272
  "score_name": "accuracy",
273
- "score": 0.6444444444444445,
274
- "score_ci_high": 0.7444444444444445,
275
- "score_ci_low": 0.5444444444444444,
276
- "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.8777777777777778,
280
- "accuracy_ci_low": 0.8,
281
- "accuracy_ci_high": 0.9333333333333333,
282
  "score_name": "accuracy",
283
- "score": 0.8777777777777778,
284
- "score_ci_high": 0.9333333333333333,
285
- "score_ci_low": 0.8,
286
- "num_of_instances": 90
287
  },
288
- "score": 0.706060606060606,
289
  "score_name": "subsets_mean",
290
- "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
- "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.07977207977207977,
296
- "score": 0.07977207977207977,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
- "score": 0.07977207977207977,
300
  "score_name": "subsets_mean",
301
- "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
- "num_of_instances": 1000,
306
- "f1_Person": 0.3180428134556575,
307
- "f1_Organization": 0.2747603833865815,
308
- "f1_Location": 0.22950819672131148,
309
- "f1_macro": 0.27410379785451683,
310
- "recall_macro": 0.23183858884648992,
311
- "precision_macro": 0.3454658738569018,
312
- "in_classes_support": 0.5302806499261448,
313
- "f1_micro": 0.20465890183028285,
314
- "recall_micro": 0.2342857142857143,
315
- "precision_micro": 0.18168389955686853,
316
- "score": 0.20465890183028285,
317
  "score_name": "f1_micro",
318
- "score_ci_low": 0.1623107942071267,
319
- "score_ci_high": 0.24302164655950423,
320
- "f1_micro_ci_low": 0.1623107942071267,
321
- "f1_micro_ci_high": 0.24302164655950423
322
  },
323
- "score": 0.20465890183028285,
324
  "score_name": "subsets_mean",
325
- "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
- "accuracy": 0.5633802816901409,
330
- "accuracy_ci_low": 0.43661971830985913,
331
- "accuracy_ci_high": 0.676056338028169,
332
  "score_name": "accuracy",
333
- "score": 0.5633802816901409,
334
- "score_ci_high": 0.676056338028169,
335
- "score_ci_low": 0.43661971830985913,
336
- "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
- "accuracy": 0.1267605633802817,
340
- "accuracy_ci_low": 0.056338028169014086,
341
- "accuracy_ci_high": 0.2112676056338028,
342
  "score_name": "accuracy",
343
- "score": 0.1267605633802817,
344
- "score_ci_high": 0.2112676056338028,
345
- "score_ci_low": 0.056338028169014086,
346
- "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
- "accuracy": 0.08450704225352113,
350
- "accuracy_ci_low": 0.04225352112676056,
351
- "accuracy_ci_high": 0.16901408450704225,
352
  "score_name": "accuracy",
353
- "score": 0.08450704225352113,
354
- "score_ci_high": 0.16901408450704225,
355
- "score_ci_low": 0.04225352112676056,
356
- "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
- "accuracy": 0.4084507042253521,
360
- "accuracy_ci_low": 0.30985915492957744,
361
- "accuracy_ci_high": 0.5211267605633803,
362
  "score_name": "accuracy",
363
- "score": 0.4084507042253521,
364
- "score_ci_high": 0.5211267605633803,
365
- "score_ci_low": 0.30985915492957744,
366
- "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
- "accuracy": 0.5211267605633803,
370
- "accuracy_ci_low": 0.39436619718309857,
371
- "accuracy_ci_high": 0.6338028169014085,
372
  "score_name": "accuracy",
373
- "score": 0.5211267605633803,
374
- "score_ci_high": 0.6338028169014085,
375
- "score_ci_low": 0.39436619718309857,
376
- "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
379
- "accuracy": 0.14084507042253522,
380
- "accuracy_ci_low": 0.07042253521126761,
381
- "accuracy_ci_high": 0.23943661971830985,
382
  "score_name": "accuracy",
383
- "score": 0.14084507042253522,
384
- "score_ci_high": 0.23943661971830985,
385
- "score_ci_low": 0.07042253521126761,
386
- "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
- "accuracy": 0.3380281690140845,
390
- "accuracy_ci_low": 0.22535211267605634,
391
- "accuracy_ci_high": 0.4507042253521127,
392
  "score_name": "accuracy",
393
- "score": 0.3380281690140845,
394
- "score_ci_high": 0.4507042253521127,
395
- "score_ci_low": 0.22535211267605634,
396
- "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
- "accuracy": 0.4788732394366197,
400
- "accuracy_ci_low": 0.36619718309859156,
401
- "accuracy_ci_high": 0.5915492957746479,
402
  "score_name": "accuracy",
403
- "score": 0.4788732394366197,
404
- "score_ci_high": 0.5915492957746479,
405
- "score_ci_low": 0.36619718309859156,
406
- "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
- "accuracy": 0.36619718309859156,
410
- "accuracy_ci_low": 0.2676056338028169,
411
- "accuracy_ci_high": 0.4788732394366197,
412
  "score_name": "accuracy",
413
- "score": 0.36619718309859156,
414
- "score_ci_high": 0.4788732394366197,
415
- "score_ci_low": 0.2676056338028169,
416
- "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
- "accuracy": 0.08450704225352113,
420
- "accuracy_ci_low": 0.028169014084507043,
421
- "accuracy_ci_high": 0.16901408450704225,
422
  "score_name": "accuracy",
423
- "score": 0.08450704225352113,
424
- "score_ci_high": 0.16901408450704225,
425
- "score_ci_low": 0.028169014084507043,
426
- "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
- "accuracy": 0.323943661971831,
430
- "accuracy_ci_low": 0.22535211267605634,
431
- "accuracy_ci_high": 0.4397440034897243,
432
  "score_name": "accuracy",
433
- "score": 0.323943661971831,
434
- "score_ci_high": 0.4397440034897243,
435
- "score_ci_low": 0.22535211267605634,
436
- "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
- "accuracy": 0.4225352112676056,
440
- "accuracy_ci_low": 0.30985915492957744,
441
- "accuracy_ci_high": 0.5492957746478874,
442
  "score_name": "accuracy",
443
- "score": 0.4225352112676056,
444
- "score_ci_high": 0.5492957746478874,
445
- "score_ci_low": 0.30985915492957744,
446
- "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
- "accuracy": 0.18309859154929578,
450
- "accuracy_ci_low": 0.09859154929577464,
451
- "accuracy_ci_high": 0.28910654360361887,
452
  "score_name": "accuracy",
453
- "score": 0.18309859154929578,
454
- "score_ci_high": 0.28910654360361887,
455
- "score_ci_low": 0.09859154929577464,
456
- "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
- "accuracy": 0.5633802816901409,
460
- "accuracy_ci_low": 0.4507042253521127,
461
- "accuracy_ci_high": 0.676056338028169,
462
  "score_name": "accuracy",
463
- "score": 0.5633802816901409,
464
- "score_ci_high": 0.676056338028169,
465
- "score_ci_low": 0.4507042253521127,
466
- "num_of_instances": 71
467
  },
468
- "score": 0.3289738430583501,
469
  "score_name": "subsets_mean",
470
- "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
- "f1_macro": 0.09605263157894736,
475
  "f1_suggestive": 0.0,
 
476
  "f1_generic": 0.0,
477
- "f1_descriptive": 0.375,
478
- "f1_fanciful": 0.10526315789473684,
479
- "f1_arbitrary": 0.0,
480
- "f1_macro_ci_low": 0.048484848484848485,
481
- "f1_macro_ci_high": 0.1610036081002675,
482
  "score_name": "f1_micro",
483
- "score": 0.13333333333333333,
484
- "score_ci_high": 0.24299065420560748,
485
- "score_ci_low": 0.05825242718446602,
486
- "num_of_instances": 85,
487
- "accuracy": 0.08235294117647059,
488
- "accuracy_ci_low": 0.03529411764705882,
489
- "accuracy_ci_high": 0.15294117647058825,
490
- "f1_micro": 0.13333333333333333,
491
- "f1_micro_ci_low": 0.05825242718446602,
492
- "f1_micro_ci_high": 0.24299065420560748
493
  },
494
  "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.46628060735944554,
496
- "f1_no": 0.5809128630705395,
497
- "f1_yes": 0.3516483516483517,
498
- "f1_macro_ci_low": 0.3928270444081002,
499
- "f1_macro_ci_high": 0.5415825326375369,
500
  "score_name": "f1_micro",
501
- "score": 0.5180722891566265,
502
- "score_ci_high": 0.58253132966529,
503
- "score_ci_low": 0.44652531947540486,
504
- "num_of_instances": 200,
505
- "accuracy": 0.43,
506
- "accuracy_ci_low": 0.365,
507
- "accuracy_ci_high": 0.495,
508
- "f1_micro": 0.5180722891566265,
509
- "f1_micro_ci_low": 0.44652531947540486,
510
- "f1_micro_ci_high": 0.58253132966529
511
  },
512
  "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.09994226986403168,
514
- "f1_conclusion": 0.0,
515
- "f1_decree": 0.14814814814814814,
516
- "f1_issue": 0.05714285714285714,
517
- "f1_analysis": 0.3076923076923077,
518
- "f1_facts": 0.06896551724137931,
519
  "f1_procedural history": 0.0,
520
- "f1_rule": 0.11764705882352941,
521
- "f1_macro_ci_low": 0.06467882036635723,
522
- "f1_macro_ci_high": 0.16271896227970067,
523
  "score_name": "f1_micro",
524
- "score": 0.1222707423580786,
525
- "score_ci_high": 0.1896551724137931,
526
- "score_ci_low": 0.07144817486457739,
527
- "num_of_instances": 200,
528
- "accuracy": 0.07,
529
- "accuracy_ci_low": 0.04,
530
- "accuracy_ci_high": 0.11032816661500704,
531
- "f1_micro": 0.1222707423580786,
532
- "f1_micro_ci_low": 0.07144817486457739,
533
- "f1_micro_ci_high": 0.1896551724137931
534
  },
535
  "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.3036750483558994,
537
- "f1_yes": 0.18181818181818182,
538
- "f1_no": 0.425531914893617,
539
- "f1_macro_ci_low": 0.24346443663418135,
540
- "f1_macro_ci_high": 0.37985983397043105,
541
  "score_name": "f1_micro",
542
- "score": 0.33557046979865773,
543
- "score_ci_high": 0.41208424597764826,
544
- "score_ci_low": 0.26697141622873294,
545
- "num_of_instances": 200,
546
- "accuracy": 0.25,
547
- "accuracy_ci_low": 0.2,
548
- "accuracy_ci_high": 0.315,
549
- "f1_micro": 0.33557046979865773,
550
- "f1_micro_ci_low": 0.26697141622873294,
551
- "f1_micro_ci_high": 0.41208424597764826
552
  },
553
  "legalbench_proa": {
554
- "f1_macro": 0.8285413216920066,
555
- "f1_yes": 0.821917808219178,
556
- "f1_no": 0.8351648351648352,
557
- "f1_macro_ci_low": 0.7370610449522892,
558
- "f1_macro_ci_high": 0.8925462752093225,
559
  "score_name": "f1_micro",
560
- "score": 0.8292682926829268,
561
- "score_ci_high": 0.891566265060241,
562
- "score_ci_low": 0.7393939393939394,
563
- "num_of_instances": 85,
564
  "accuracy": 0.8,
565
- "accuracy_ci_low": 0.7058823529411765,
566
- "accuracy_ci_high": 0.8705882352941177,
567
- "f1_micro": 0.8292682926829268,
568
- "f1_micro_ci_low": 0.7393939393939394,
569
- "f1_micro_ci_high": 0.891566265060241
570
  },
571
- "score": 0.3877030254659246,
572
  "score_name": "subsets_mean",
573
- "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
- "f1_macro": 0.2883227310464808,
578
- "f1_cars": 0.45454545454545453,
579
  "f1_windows x": 0.0,
580
- "f1_atheism": 0.1,
581
- "f1_religion": 0.11594202898550725,
582
- "f1_medicine": 0.5161290322580645,
583
- "f1_christianity": 0.2,
584
- "f1_computer graphics": 0.24615384615384617,
585
- "f1_microsoft windows": 0.2,
586
- "f1_middle east": 0.23529411764705882,
587
- "f1_motorcycles": 0.43902439024390244,
588
- "f1_pc hardware": 0.38095238095238093,
589
- "f1_mac hardware": 0.14285714285714285,
590
- "f1_for sale": 0.2127659574468085,
591
- "f1_guns": 0.04,
592
- "f1_space": 0.4166666666666667,
593
- "f1_cryptography": 0.36065573770491804,
594
- "f1_baseball": 0.37333333333333335,
595
- "f1_hockey": 0.6060606060606061,
596
- "f1_politics": 0.23376623376623376,
597
- "f1_electronics": 0.49230769230769234,
598
- "f1_macro_ci_low": 0.2617609426183389,
599
- "f1_macro_ci_high": 0.31709395962356884,
600
  "score_name": "f1_micro",
601
- "score": 0.30918595967139656,
602
- "score_ci_high": 0.3396825281712499,
603
- "score_ci_low": 0.2773193184164219,
604
- "num_of_instances": 1000,
605
- "accuracy": 0.207,
606
- "accuracy_ci_low": 0.183,
607
- "accuracy_ci_high": 0.231,
608
- "f1_micro": 0.30918595967139656,
609
- "f1_micro_ci_low": 0.2773193184164219,
610
- "f1_micro_ci_high": 0.3396825281712499
611
  },
612
- "score": 0.30918595967139656,
613
  "score_name": "subsets_mean",
614
- "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
- "f1_macro": 0.4658504084132011,
619
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.6103646833013435,
620
- "f1_checking or savings account": 0.43243243243243246,
621
- "f1_debt collection": 0.35294117647058826,
622
- "f1_credit card or prepaid card": 0.4883720930232558,
623
- "f1_money transfer or virtual currency or money service": 0.48,
624
- "f1_mortgage": 0.6885245901639344,
625
- "f1_payday loan or title loan or personal loan": 0.16666666666666666,
626
- "f1_student loan": 0.5217391304347826,
627
- "f1_vehicle loan or lease": 0.45161290322580644,
628
- "f1_macro_ci_low": 0.4151030067042806,
629
- "f1_macro_ci_high": 0.5374380194007881,
630
  "score_name": "f1_micro",
631
- "score": 0.5647530040053405,
632
- "score_ci_high": 0.5946299934512115,
633
- "score_ci_low": 0.5324708819498815,
634
- "num_of_instances": 1000,
635
- "accuracy": 0.423,
636
- "accuracy_ci_low": 0.392,
637
- "accuracy_ci_high": 0.4538616190423828,
638
- "f1_micro": 0.5647530040053405,
639
- "f1_micro_ci_low": 0.5324708819498815,
640
- "f1_micro_ci_high": 0.5946299934512115
641
  },
642
  "cfpb_product_watsonx": {
643
- "f1_macro": 0.6154980967666208,
644
- "f1_mortgages and loans": 0.6794871794871795,
645
- "f1_credit card": 0.6394557823129252,
646
- "f1_debt collection": 0.5405405405405406,
647
- "f1_credit reporting": 0.6307053941908713,
648
- "f1_retail banking": 0.5873015873015873,
649
- "f1_macro_ci_low": 0.5757444877146287,
650
- "f1_macro_ci_high": 0.6632621576134956,
651
  "score_name": "f1_micro",
652
- "score": 0.6152046783625731,
653
- "score_ci_high": 0.6592588246755606,
654
- "score_ci_low": 0.5758293838862559,
655
- "num_of_instances": 500,
656
- "accuracy": 0.526,
657
- "accuracy_ci_low": 0.488,
658
- "accuracy_ci_high": 0.572065074842346,
659
- "f1_micro": 0.6152046783625731,
660
- "f1_micro_ci_low": 0.5758293838862559,
661
- "f1_micro_ci_high": 0.6592588246755606
662
  },
663
- "score": 0.5899788411839568,
664
  "score_name": "subsets_mean",
665
- "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
- "num_of_instances": 1000,
670
- "program_accuracy": 0.03,
671
- "score": 0.03,
672
  "score_name": "program_accuracy",
673
- "execution_accuracy": 0.03,
674
- "program_accuracy_ci_low": 0.02,
675
- "program_accuracy_ci_high": 0.041,
676
- "score_ci_low": 0.02,
677
- "score_ci_high": 0.041,
678
- "execution_accuracy_ci_low": 0.02,
679
- "execution_accuracy_ci_high": 0.042
680
  },
681
- "score": 0.03,
682
  "score_name": "subsets_mean",
683
- "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
- "precision": 0.27959949741292506,
688
- "recall": 0.5616895325224012,
689
- "f1": 0.3093936462410963,
690
- "precision_ci_low": 0.26037333098993004,
691
- "precision_ci_high": 0.2998496234035471,
692
- "recall_ci_low": 0.545189324623663,
693
- "recall_ci_high": 0.5772522419606227,
694
- "f1_ci_low": 0.29233863350762596,
695
- "f1_ci_high": 0.3260584980677657,
696
  "score_name": "f1",
697
- "score": 0.3093936462410963,
698
- "score_ci_high": 0.3260584980677657,
699
- "score_ci_low": 0.29233863350762596,
700
- "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.5708662017683188,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6597496373951435,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5238148674865564,
704
- "faithfullness_f1_token_overlap": 0.3063464322583528,
705
- "faithfullness_recall_token_overlap": 0.2515040891461165,
706
- "faithfullness_precision_token_overlap": 0.5201254248503554,
707
- "correctness_f1_token_overlap": 0.3093936462410963,
708
- "correctness_recall_token_overlap": 0.5616895325224012,
709
- "correctness_precision_token_overlap": 0.27959949741292506
710
  },
711
- "score": 0.3093936462410963,
712
  "score_name": "subsets_mean",
713
- "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
- "accuracy": 0.572,
718
- "accuracy_ci_low": 0.5408695080796895,
719
- "accuracy_ci_high": 0.602,
720
  "score_name": "accuracy",
721
- "score": 0.572,
722
- "score_ci_high": 0.602,
723
- "score_ci_low": 0.5408695080796895,
724
- "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
- "accuracy": 0.878,
728
- "accuracy_ci_low": 0.846,
729
- "accuracy_ci_high": 0.9048842512733182,
730
  "score_name": "accuracy",
731
- "score": 0.878,
732
- "score_ci_high": 0.9048842512733182,
733
- "score_ci_low": 0.846,
734
- "num_of_instances": 500
735
  },
736
- "score": 0.725,
737
  "score_name": "subsets_mean",
738
- "num_of_instances": 1500
739
  },
740
  "safety": {
741
  "attaq_500": {
742
- "severity_critical": 2.0,
743
- "severity_high": 2.0,
744
  "severity_medium": 1.0,
745
- "severity_low": 95.0,
746
- "category_harmful_info": 0.8059234746569206,
747
- "category_deception": 0.7821050891370483,
748
- "category_explicit_content": 0.8131378136499965,
749
- "category_violence": 0.7824397089225905,
750
- "category_discrimination": 0.7646755935653806,
751
- "category_substance_abuse": 0.7601792050732505,
752
- "category_pii": 0.7583640239856861,
753
- "safety": 0.7892763628562292,
754
- "safety_ci_low": 0.7568372385512269,
755
- "safety_ci_high": 0.8199271972816702,
756
  "score_name": "safety",
757
- "score": 0.7892763628562292,
758
- "score_ci_high": 0.8199271972816702,
759
- "score_ci_low": 0.7568372385512269,
760
  "num_of_instances": 100
761
  },
762
- "score": 0.7892763628562292,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
- "num_of_instances": 528,
769
- "rougeL": 0.26029440810817217,
770
- "score": 0.26029440810817217,
 
771
  "score_name": "rougeL",
772
- "rouge1": 0.37812164542305304,
773
- "rougeLsum": 0.3224592121117338,
774
- "rouge2": 0.1735255109874066,
775
- "rougeL_ci_low": 0.25326019704193026,
776
- "rougeL_ci_high": 0.2676771085873056,
777
- "score_ci_low": 0.25326019704193026,
778
- "score_ci_high": 0.2676771085873056,
779
- "rouge1_ci_low": 0.36799150913690026,
780
- "rouge1_ci_high": 0.3873907236511595,
781
- "rougeLsum_ci_low": 0.3130244535911371,
782
- "rougeLsum_ci_high": 0.33132370273160416,
783
- "rouge2_ci_low": 0.16695272546253506,
784
- "rouge2_ci_high": 0.18091296094174728
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
- "num_of_instances": 1000,
788
- "rougeL": 0.09390373141967649,
789
- "score": 0.09390373141967649,
 
790
  "score_name": "rougeL",
791
- "rouge1": 0.12667295475773943,
792
- "rougeLsum": 0.10750099464975159,
793
- "rouge2": 0.01898752670157484,
794
- "rougeL_ci_low": 0.08937719185127607,
795
- "rougeL_ci_high": 0.09832477865928776,
796
- "score_ci_low": 0.08937719185127607,
797
- "score_ci_high": 0.09832477865928776,
798
- "rouge1_ci_low": 0.12055496978215834,
799
- "rouge1_ci_high": 0.13195192205981862,
800
- "rougeLsum_ci_low": 0.10257751488972287,
801
- "rougeLsum_ci_high": 0.11236007191539416,
802
- "rouge2_ci_low": 0.01673170907001085,
803
- "rouge2_ci_high": 0.02151839340904419
804
  },
805
- "score": 0.17709906976392434,
806
  "score_name": "subsets_mean",
807
- "num_of_instances": 1528
808
  },
809
  "translation": {
810
  "mt_flores_101_ara_eng": {
811
- "num_of_instances": 66,
812
  "counts": [
813
- 1176,
814
- 681,
815
- 439,
816
- 289
817
  ],
818
  "totals": [
819
- 1805,
820
- 1739,
821
- 1673,
822
- 1607
823
  ],
824
  "precisions": [
825
- 0.6515235457063712,
826
- 0.39160437032777456,
827
- 0.26240286909742977,
828
- 0.17983820784069696
829
  ],
830
- "bp": 1.0,
831
- "sys_len": 1805,
832
- "ref_len": 1734,
833
- "sacrebleu": 0.33125088532286445,
834
- "score": 0.33125088532286445,
835
  "score_name": "sacrebleu",
836
- "score_ci_low": 0.28827207319430564,
837
- "score_ci_high": 0.3750069277361591,
838
- "sacrebleu_ci_low": 0.28827207319430564,
839
- "sacrebleu_ci_high": 0.3750069277361591
840
  },
841
  "mt_flores_101_deu_eng": {
842
- "num_of_instances": 66,
843
  "counts": [
844
- 1224,
845
- 746,
846
- 495,
847
- 332
848
  ],
849
  "totals": [
850
- 1783,
851
- 1717,
852
- 1651,
853
- 1585
854
  ],
855
  "precisions": [
856
- 0.6864834548513741,
857
- 0.4344787419918463,
858
- 0.29981829194427617,
859
- 0.20946372239747635
860
  ],
861
  "bp": 1.0,
862
- "sys_len": 1783,
863
- "ref_len": 1734,
864
- "sacrebleu": 0.36994859939150276,
865
- "score": 0.36994859939150276,
866
  "score_name": "sacrebleu",
867
- "score_ci_low": 0.3148788426694038,
868
- "score_ci_high": 0.4105592207294174,
869
- "sacrebleu_ci_low": 0.3148788426694038,
870
- "sacrebleu_ci_high": 0.4105592207294174
871
  },
872
  "mt_flores_101_eng_ara": {
873
- "num_of_instances": 66,
874
  "counts": [
875
- 215,
876
- 39,
877
- 18,
878
  6
879
  ],
880
  "totals": [
881
- 3272,
882
- 3206,
883
- 3140,
884
- 3074
885
  ],
886
  "precisions": [
887
- 0.06570904645476773,
888
- 0.012164691203992516,
889
- 0.005732484076433121,
890
- 0.001951854261548471
891
  ],
892
- "bp": 1.0,
893
- "sys_len": 3272,
894
- "ref_len": 1589,
895
- "sacrebleu": 0.009724765205835872,
896
- "score": 0.009724765205835872,
897
  "score_name": "sacrebleu",
898
- "score_ci_low": 0.00598430838144436,
899
- "score_ci_high": 0.015966807436499916,
900
- "sacrebleu_ci_low": 0.00598430838144436,
901
- "sacrebleu_ci_high": 0.015966807436499916
902
  },
903
  "mt_flores_101_eng_deu": {
904
- "num_of_instances": 66,
905
  "counts": [
906
- 1112,
907
- 617,
908
- 379,
909
- 249
910
  ],
911
  "totals": [
912
- 1879,
913
- 1813,
914
- 1747,
915
- 1681
916
  ],
917
  "precisions": [
918
- 0.5918041511442257,
919
- 0.34031991174848314,
920
- 0.21694333142530053,
921
- 0.14812611540749554
922
  ],
923
  "bp": 1.0,
924
- "sys_len": 1879,
925
- "ref_len": 1835,
926
- "sacrebleu": 0.283635656706225,
927
- "score": 0.283635656706225,
928
  "score_name": "sacrebleu",
929
- "score_ci_low": 0.25483093745296115,
930
- "score_ci_high": 0.3284369566535284,
931
- "sacrebleu_ci_low": 0.25483093745296115,
932
- "sacrebleu_ci_high": 0.3284369566535284
933
  },
934
  "mt_flores_101_eng_fra": {
935
- "num_of_instances": 66,
936
  "counts": [
937
- 1422,
938
- 990,
939
- 740,
940
- 568
941
  ],
942
  "totals": [
943
- 2012,
944
- 1946,
945
- 1880,
946
- 1814
947
  ],
948
  "precisions": [
949
- 0.7067594433399603,
950
- 0.5087358684480986,
951
- 0.39361702127659576,
952
- 0.3131201764057332
953
  ],
954
- "bp": 0.9725507672852267,
955
- "sys_len": 2012,
956
- "ref_len": 2068,
957
- "sacrebleu": 0.44622048751232035,
958
- "score": 0.44622048751232035,
959
  "score_name": "sacrebleu",
960
- "score_ci_low": 0.4016486544135287,
961
- "score_ci_high": 0.492468721152314,
962
- "sacrebleu_ci_low": 0.4016486544135287,
963
- "sacrebleu_ci_high": 0.492468721152314
964
  },
965
  "mt_flores_101_eng_kor": {
966
- "num_of_instances": 66,
967
  "counts": [
968
- 983,
969
- 445,
970
- 221,
971
- 116
972
  ],
973
  "totals": [
974
- 2522,
975
- 2456,
976
- 2390,
977
- 2324
978
  ],
979
  "precisions": [
980
- 0.3897700237906423,
981
- 0.18118892508143322,
982
- 0.09246861924686192,
983
- 0.04991394148020654
984
  ],
985
  "bp": 1.0,
986
- "sys_len": 2522,
987
- "ref_len": 2235,
988
- "sacrebleu": 0.13436590468744522,
989
- "score": 0.13436590468744522,
990
  "score_name": "sacrebleu",
991
- "score_ci_low": 0.10668449271679148,
992
- "score_ci_high": 0.16331801147285638,
993
- "sacrebleu_ci_low": 0.10668449271679148,
994
- "sacrebleu_ci_high": 0.16331801147285638
995
  },
996
  "mt_flores_101_eng_por": {
997
- "num_of_instances": 66,
998
  "counts": [
999
- 1397,
1000
- 976,
1001
- 718,
1002
- 534
1003
  ],
1004
  "totals": [
1005
- 1885,
1006
- 1819,
1007
- 1753,
1008
- 1687
1009
  ],
1010
  "precisions": [
1011
- 0.7411140583554376,
1012
- 0.536558548653106,
1013
- 0.40958357102110665,
1014
- 0.31653823355068167
1015
  ],
1016
- "bp": 0.9836888676493653,
1017
- "sys_len": 1885,
1018
- "ref_len": 1916,
1019
- "sacrebleu": 0.4687329402986153,
1020
- "score": 0.4687329402986153,
1021
  "score_name": "sacrebleu",
1022
- "score_ci_low": 0.4090027394886287,
1023
- "score_ci_high": 0.5072542898902915,
1024
- "sacrebleu_ci_low": 0.4090027394886287,
1025
- "sacrebleu_ci_high": 0.5072542898902915
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
- "num_of_instances": 66,
1029
  "counts": [
1030
- 1075,
1031
- 589,
1032
- 357,
1033
- 218
1034
  ],
1035
  "totals": [
1036
- 2037,
1037
- 1971,
1038
- 1905,
1039
- 1839
1040
  ],
1041
  "precisions": [
1042
- 0.5277368679430535,
1043
- 0.29883307965499745,
1044
- 0.1874015748031496,
1045
- 0.11854268624252312
1046
  ],
1047
  "bp": 1.0,
1048
- "sys_len": 2037,
1049
- "ref_len": 1949,
1050
- "sacrebleu": 0.24328959002445089,
1051
- "score": 0.24328959002445089,
1052
  "score_name": "sacrebleu",
1053
- "score_ci_low": 0.2108781721763864,
1054
- "score_ci_high": 0.2866413776406142,
1055
- "sacrebleu_ci_low": 0.2108781721763864,
1056
- "sacrebleu_ci_high": 0.2866413776406142
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
- "num_of_instances": 66,
1060
  "counts": [
1061
- 1223,
1062
- 645,
1063
- 371,
1064
- 219
1065
  ],
1066
  "totals": [
1067
- 2012,
1068
- 1946,
1069
- 1880,
1070
- 1814
1071
  ],
1072
  "precisions": [
1073
- 0.6078528827037774,
1074
- 0.3314491264131552,
1075
- 0.1973404255319149,
1076
- 0.12072767364939362
1077
  ],
1078
- "bp": 0.9581570887075945,
1079
- "sys_len": 2012,
1080
- "ref_len": 2098,
1081
- "sacrebleu": 0.25220069551672647,
1082
- "score": 0.25220069551672647,
1083
  "score_name": "sacrebleu",
1084
- "score_ci_low": 0.21455240851266189,
1085
- "score_ci_high": 0.28127381859222883,
1086
- "sacrebleu_ci_low": 0.21455240851266189,
1087
- "sacrebleu_ci_high": 0.28127381859222883
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
- "num_of_instances": 66,
1091
  "counts": [
1092
- 1265,
1093
- 811,
1094
- 564,
1095
- 403
1096
  ],
1097
  "totals": [
1098
- 1799,
1099
- 1733,
1100
- 1667,
1101
- 1601
1102
  ],
1103
  "precisions": [
1104
- 0.7031684269038355,
1105
- 0.4679746105020196,
1106
- 0.33833233353329334,
1107
- 0.2517176764522173
1108
  ],
1109
  "bp": 1.0,
1110
- "sys_len": 1799,
1111
- "ref_len": 1734,
1112
- "sacrebleu": 0.40915203730030597,
1113
- "score": 0.40915203730030597,
1114
  "score_name": "sacrebleu",
1115
- "score_ci_low": 0.37740759275198177,
1116
- "score_ci_high": 0.45339812071532676,
1117
- "sacrebleu_ci_low": 0.37740759275198177,
1118
- "sacrebleu_ci_high": 0.45339812071532676
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
- "num_of_instances": 66,
1122
  "counts": [
1123
- 1032,
1124
- 484,
1125
- 292,
1126
- 181
1127
  ],
1128
  "totals": [
1129
- 1873,
1130
- 1807,
1131
- 1741,
1132
- 1675
1133
  ],
1134
  "precisions": [
1135
- 0.5509877202349173,
1136
- 0.26784726065301606,
1137
- 0.16771970132107986,
1138
- 0.10805970149253731
1139
  ],
1140
  "bp": 1.0,
1141
- "sys_len": 1873,
1142
- "ref_len": 1734,
1143
- "sacrebleu": 0.22741507142824863,
1144
- "score": 0.22741507142824863,
1145
  "score_name": "sacrebleu",
1146
- "score_ci_low": 0.19114102819495998,
1147
- "score_ci_high": 0.26211626317047076,
1148
- "sacrebleu_ci_low": 0.19114102819495998,
1149
- "sacrebleu_ci_high": 0.26211626317047076
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
- "num_of_instances": 66,
1153
  "counts": [
1154
- 1019,
1155
- 503,
1156
- 310,
1157
- 204
1158
  ],
1159
  "totals": [
1160
- 1801,
1161
- 1735,
1162
- 1669,
1163
- 1603
1164
  ],
1165
  "precisions": [
1166
- 0.5657967795669073,
1167
- 0.28991354466858793,
1168
- 0.18573996405032955,
1169
- 0.1272613849033063
1170
  ],
1171
  "bp": 1.0,
1172
- "sys_len": 1801,
1173
- "ref_len": 1734,
1174
- "sacrebleu": 0.24953573320283845,
1175
- "score": 0.24953573320283845,
1176
  "score_name": "sacrebleu",
1177
- "score_ci_low": 0.20393561832427076,
1178
- "score_ci_high": 0.27875367055825584,
1179
- "sacrebleu_ci_low": 0.20393561832427076,
1180
- "sacrebleu_ci_high": 0.27875367055825584
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
- "num_of_instances": 66,
1184
  "counts": [
1185
- 1285,
1186
- 864,
1187
- 625,
1188
- 463
1189
  ],
1190
  "totals": [
1191
- 1899,
1192
- 1833,
1193
- 1767,
1194
- 1701
1195
  ],
1196
  "precisions": [
1197
- 0.6766719325961031,
1198
- 0.4713584288052373,
1199
- 0.35370684776457273,
1200
- 0.2721928277483833
1201
  ],
1202
  "bp": 1.0,
1203
- "sys_len": 1899,
1204
- "ref_len": 1734,
1205
- "sacrebleu": 0.4186126965889709,
1206
- "score": 0.4186126965889709,
1207
  "score_name": "sacrebleu",
1208
- "score_ci_low": 0.3580478494686714,
1209
- "score_ci_high": 0.47721854835474,
1210
- "sacrebleu_ci_low": 0.3580478494686714,
1211
- "sacrebleu_ci_high": 0.47721854835474
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
- "num_of_instances": 66,
1215
  "counts": [
1216
- 1286,
1217
- 859,
1218
- 598,
1219
- 417
1220
  ],
1221
  "totals": [
1222
- 1799,
1223
- 1733,
1224
- 1667,
1225
- 1601
1226
  ],
1227
  "precisions": [
1228
- 0.7148415786548082,
1229
- 0.4956722446624351,
1230
- 0.3587282543491302,
1231
- 0.26046221111805123
1232
  ],
1233
  "bp": 1.0,
1234
- "sys_len": 1799,
1235
- "ref_len": 1734,
1236
- "sacrebleu": 0.42655857647405626,
1237
- "score": 0.42655857647405626,
1238
  "score_name": "sacrebleu",
1239
- "score_ci_low": 0.37843554301324095,
1240
- "score_ci_high": 0.4676738192216758,
1241
- "sacrebleu_ci_low": 0.37843554301324095,
1242
- "sacrebleu_ci_high": 0.4676738192216758
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
- "num_of_instances": 66,
1246
  "counts": [
1247
- 1178,
1248
- 657,
1249
- 409,
1250
- 266
1251
  ],
1252
  "totals": [
1253
- 1874,
1254
- 1808,
1255
- 1742,
1256
- 1676
1257
  ],
1258
  "precisions": [
1259
- 0.6286019210245464,
1260
- 0.36338495575221236,
1261
- 0.23478760045924227,
1262
- 0.15871121718377088
1263
  ],
1264
  "bp": 1.0,
1265
- "sys_len": 1874,
1266
- "ref_len": 1734,
1267
- "sacrebleu": 0.3037430550349969,
1268
- "score": 0.3037430550349969,
1269
  "score_name": "sacrebleu",
1270
- "score_ci_low": 0.26128349825222114,
1271
- "score_ci_high": 0.3531214265467947,
1272
- "sacrebleu_ci_low": 0.26128349825222114,
1273
- "sacrebleu_ci_high": 0.3531214265467947
1274
  },
1275
- "score": 0.3049591129796936,
1276
  "score_name": "subsets_mean",
1277
- "num_of_instances": 990
1278
  },
1279
- "score": 0.38015857299104155,
1280
  "score_name": "subsets_mean",
1281
- "num_of_instances": 12472
1282
  }
1283
  }
 
1
  {
2
  "environment_info": {
3
+ "timestamp_utc": "2025-07-02T18:58:17.004768Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
 
8
  "--model",
9
  "cross_provider",
10
  "--model_args",
11
+ "model_name=watsonx/ibm/granite-3-2b-instruct,max_tokens=1024",
12
  "--output_path",
13
  "./results/bluebench",
14
  "--log_samples",
 
26
  "num_fewshots": null,
27
  "limit": null,
28
  "batch_size": 8,
29
+ "model": "watsonx/ibm/granite-3-2b-instruct",
30
  "model_args": {
31
+ "max_tokens": 1024
32
  },
33
  "gen_kwargs": null,
34
  "chat_template_kwargs": null,
 
41
  "disable_hf_cache": false,
42
  "cache_dir": null
43
  },
44
+ "unitxt_version": "1.25.0",
45
+ "unitxt_commit_hash": "c8a5e77fd6ca62039b915dc10700323f50ccaacf",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
 
51
  "triton": "3.3.1",
52
  "nltk": "3.9.1",
53
  "anyio": "4.9.0",
54
+ "unitxt": "1.25.0",
55
  "absl-py": "2.3.0",
56
  "tiktoken": "0.9.0",
57
  "charset-normalizer": "3.4.2",
58
  "nvidia-cuda-runtime-cu12": "12.6.77",
59
  "sympy": "1.14.0",
60
  "mecab-ko": "1.0.1",
 
61
  "httpcore": "1.0.9",
62
+ "litellm": "1.73.6",
63
  "Jinja2": "3.1.6",
64
  "jsonschema-specifications": "2025.4.1",
65
  "pydantic_core": "2.33.2",
66
  "nvidia-cusparse-cu12": "12.5.4.2",
67
+ "tokenizers": "0.21.2",
68
  "yarl": "1.20.1",
 
69
  "portalocker": "3.2.0",
70
  "pandas": "2.3.0",
71
  "multiprocess": "0.70.16",
72
  "jsonschema": "4.24.0",
 
73
  "nvidia-nvjitlink-cu12": "12.6.85",
74
  "nvidia-cublas-cu12": "12.6.4.1",
75
  "pydantic": "2.11.7",
 
79
  "contourpy": "1.3.2",
80
  "aiosignal": "1.3.2",
81
  "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "openai": "1.93.0",
83
  "six": "1.17.0",
84
  "diskcache": "5.6.3",
85
  "tqdm": "4.67.1",
 
98
  "kiwisolver": "1.4.8",
99
  "networkx": "3.4.2",
100
  "typing-inspection": "0.4.1",
 
101
  "sniffio": "1.3.1",
102
  "scikit-learn": "1.7.0",
103
+ "rpds-py": "0.26.0",
104
  "nvidia-curand-cu12": "10.3.7.77",
105
  "pip": "25.1.1",
106
+ "pillow": "11.3.0",
107
  "fonttools": "4.58.4",
 
108
  "datasets": "3.6.0",
109
  "nvidia-cusolver-cu12": "11.7.1.2",
110
  "cycler": "0.12.1",
 
111
  "distro": "1.9.0",
112
  "idna": "3.10",
113
  "MarkupSafe": "3.0.2",
 
121
  "joblib": "1.5.1",
122
  "fsspec": "2025.3.0",
123
  "dill": "0.3.8",
 
124
  "wheel": "0.45.1",
125
  "nvidia-nvtx-cu12": "12.6.77",
126
  "nvidia-cusparselt-cu12": "0.6.3",
127
+ "lxml": "6.0.0",
128
  "propcache": "0.3.2",
129
  "numpy": "2.2.6",
130
  "mpmath": "1.3.0",
 
131
  "conllu": "6.0.0",
132
+ "huggingface-hub": "0.33.2",
133
  "safetensors": "0.5.3",
134
  "requests": "2.32.4",
135
  "regex": "2024.11.6",
136
  "aiohttp": "3.12.13",
137
  "tabulate": "0.9.0",
138
+ "accelerate": "1.8.1",
139
  "certifi": "2025.6.15",
140
+ "evaluate": "0.4.4",
141
  "nvidia-cufft-cu12": "11.3.0.4",
142
  "nvidia-cuda-nvrtc-cu12": "12.6.77",
143
  "click": "8.2.1",
144
  "typing_extensions": "4.12.2",
145
  "attrs": "25.3.0",
146
  "exceptiongroup": "1.3.0",
147
+ "transformers": "4.53.0",
148
  "tenacity": "9.1.2",
149
  "pytz": "2025.2",
150
  "aiohappyeyeballs": "2.6.1",
151
  "python-dateutil": "2.9.0.post0",
152
  "torch": "2.7.1",
153
+ "python-dotenv": "1.1.1",
154
+ "multidict": "6.6.3",
155
  "httpx": "0.28.1",
156
  "matplotlib": "3.10.3",
157
  "xxhash": "3.5.0",
158
  "PyYAML": "6.0.2",
 
159
  "colorama": "0.4.6",
 
160
  "threadpoolctl": "3.6.0",
161
  "nvidia-cudnn-cu12": "9.5.1.17",
162
+ "hf-xet": "1.1.5",
163
  "jaraco.collections": "5.1.0",
164
  "tomli": "2.0.1",
165
  "backports.tarfile": "1.2.0",
 
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
+ "accuracy": 0.3333333333333333,
180
+ "accuracy_ci_low": 0.0,
181
+ "accuracy_ci_high": 0.6666666666666666,
182
  "score_name": "accuracy",
183
+ "score": 0.3333333333333333,
184
+ "score_ci_high": 0.6666666666666666,
185
+ "score_ci_low": 0.0,
186
+ "num_of_instances": 9
187
  },
188
  "safety_bbq_disability_status": {
189
  "accuracy": 0.5555555555555556,
190
+ "accuracy_ci_low": 0.2222222222222222,
191
+ "accuracy_ci_high": 0.8888888888888888,
192
  "score_name": "accuracy",
193
  "score": 0.5555555555555556,
194
+ "score_ci_high": 0.8888888888888888,
195
+ "score_ci_low": 0.2222222222222222,
196
+ "num_of_instances": 9
197
  },
198
  "safety_bbq_gender_identity": {
199
+ "accuracy": 0.7777777777777778,
200
+ "accuracy_ci_low": 0.4444444444444444,
201
+ "accuracy_ci_high": 1.0,
202
  "score_name": "accuracy",
203
+ "score": 0.7777777777777778,
204
+ "score_ci_high": 1.0,
205
+ "score_ci_low": 0.4444444444444444,
206
+ "num_of_instances": 9
207
  },
208
  "safety_bbq_nationality": {
209
+ "accuracy": 0.3333333333333333,
210
+ "accuracy_ci_low": 0.1111111111111111,
211
+ "accuracy_ci_high": 0.6666666666666666,
212
  "score_name": "accuracy",
213
+ "score": 0.3333333333333333,
214
+ "score_ci_high": 0.6666666666666666,
215
+ "score_ci_low": 0.1111111111111111,
216
+ "num_of_instances": 9
217
  },
218
  "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.3333333333333333,
220
+ "accuracy_ci_low": 0.1111111111111111,
221
+ "accuracy_ci_high": 0.6666666666666666,
222
  "score_name": "accuracy",
223
+ "score": 0.3333333333333333,
224
+ "score_ci_high": 0.6666666666666666,
225
+ "score_ci_low": 0.1111111111111111,
226
+ "num_of_instances": 9
227
  },
228
  "safety_bbq_race_ethnicity": {
229
+ "accuracy": 0.5555555555555556,
230
+ "accuracy_ci_low": 0.2222222222222222,
231
+ "accuracy_ci_high": 0.8888888888888888,
232
  "score_name": "accuracy",
233
+ "score": 0.5555555555555556,
234
+ "score_ci_high": 0.8888888888888888,
235
+ "score_ci_low": 0.2222222222222222,
236
+ "num_of_instances": 9
237
  },
238
  "safety_bbq_race_x_gender": {
239
+ "accuracy": 0.5555555555555556,
240
+ "accuracy_ci_low": 0.2222222222222222,
241
+ "accuracy_ci_high": 0.8888888888888888,
242
  "score_name": "accuracy",
243
+ "score": 0.5555555555555556,
244
+ "score_ci_high": 0.8888888888888888,
245
+ "score_ci_low": 0.2222222222222222,
246
+ "num_of_instances": 9
247
  },
248
  "safety_bbq_race_x_ses": {
249
  "accuracy": 0.6666666666666666,
250
+ "accuracy_ci_low": 0.3333333333333333,
251
+ "accuracy_ci_high": 0.8888888888888888,
252
  "score_name": "accuracy",
253
  "score": 0.6666666666666666,
254
+ "score_ci_high": 0.8888888888888888,
255
+ "score_ci_low": 0.3333333333333333,
256
+ "num_of_instances": 9
257
  },
258
  "safety_bbq_religion": {
259
+ "accuracy": 0.4444444444444444,
260
+ "accuracy_ci_low": 0.1111111111111111,
261
+ "accuracy_ci_high": 0.7777777777777778,
262
  "score_name": "accuracy",
263
+ "score": 0.4444444444444444,
264
+ "score_ci_high": 0.7777777777777778,
265
+ "score_ci_low": 0.1111111111111111,
266
+ "num_of_instances": 9
267
  },
268
  "safety_bbq_ses": {
269
+ "accuracy": 0.2222222222222222,
270
+ "accuracy_ci_low": 0.0,
271
+ "accuracy_ci_high": 0.5555555555555556,
272
  "score_name": "accuracy",
273
+ "score": 0.2222222222222222,
274
+ "score_ci_high": 0.5555555555555556,
275
+ "score_ci_low": 0.0,
276
+ "num_of_instances": 9
277
  },
278
  "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.2222222222222222,
280
+ "accuracy_ci_low": 0.0,
281
+ "accuracy_ci_high": 0.5780215743718348,
282
  "score_name": "accuracy",
283
+ "score": 0.2222222222222222,
284
+ "score_ci_high": 0.5780215743718348,
285
+ "score_ci_low": 0.0,
286
+ "num_of_instances": 9
287
  },
288
+ "score": 0.45454545454545453,
289
  "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.31693989071038253,
296
+ "score": 0.31693989071038253,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
+ "score": 0.31693989071038253,
300
  "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.19354838709677416,
307
+ "f1_Organization": 0.3111111111111111,
308
+ "f1_Location": 0.11764705882352941,
309
+ "f1_macro": 0.20743551901047155,
310
+ "recall_macro": 0.15458937198067632,
311
+ "precision_macro": 0.328921568627451,
312
+ "in_classes_support": 0.7,
313
+ "f1_micro": 0.19199999999999998,
314
+ "recall_micro": 0.16,
315
+ "precision_micro": 0.24,
316
+ "score": 0.19199999999999998,
317
  "score_name": "f1_micro",
318
+ "score_ci_low": 0.10104164340624598,
319
+ "score_ci_high": 0.27209420121438893,
320
+ "f1_micro_ci_low": 0.10104164340624598,
321
+ "f1_micro_ci_high": 0.27209420121438893
322
  },
323
+ "score": 0.19199999999999998,
324
  "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
+ "accuracy": 0.7142857142857143,
330
+ "accuracy_ci_low": 0.2857142857142857,
331
+ "accuracy_ci_high": 1.0,
332
  "score_name": "accuracy",
333
+ "score": 0.7142857142857143,
334
+ "score_ci_high": 1.0,
335
+ "score_ci_low": 0.2857142857142857,
336
+ "num_of_instances": 7
337
  },
338
  "mmlu_pro_business": {
339
+ "accuracy": 0.14285714285714285,
340
+ "accuracy_ci_low": 0.0,
341
+ "accuracy_ci_high": 0.7469722493882013,
342
  "score_name": "accuracy",
343
+ "score": 0.14285714285714285,
344
+ "score_ci_high": 0.7469722493882013,
345
+ "score_ci_low": 0.0,
346
+ "num_of_instances": 7
347
  },
348
  "mmlu_pro_chemistry": {
349
+ "accuracy": 0.42857142857142855,
350
+ "accuracy_ci_low": 0.14285714285714285,
351
+ "accuracy_ci_high": 0.8571428571428571,
352
  "score_name": "accuracy",
353
+ "score": 0.42857142857142855,
354
+ "score_ci_high": 0.8571428571428571,
355
+ "score_ci_low": 0.14285714285714285,
356
+ "num_of_instances": 7
357
  },
358
  "mmlu_pro_computer_science": {
359
+ "accuracy": 0.5714285714285714,
360
+ "accuracy_ci_low": 0.14285714285714285,
361
+ "accuracy_ci_high": 0.8571428571428571,
362
  "score_name": "accuracy",
363
+ "score": 0.5714285714285714,
364
+ "score_ci_high": 0.8571428571428571,
365
+ "score_ci_low": 0.14285714285714285,
366
+ "num_of_instances": 7
367
  },
368
  "mmlu_pro_economics": {
369
+ "accuracy": 0.42857142857142855,
370
+ "accuracy_ci_low": 0.14285714285714285,
371
+ "accuracy_ci_high": 0.8571428571428571,
372
  "score_name": "accuracy",
373
+ "score": 0.42857142857142855,
374
+ "score_ci_high": 0.8571428571428571,
375
+ "score_ci_low": 0.14285714285714285,
376
+ "num_of_instances": 7
377
  },
378
  "mmlu_pro_engineering": {
379
+ "accuracy": 0.0,
380
+ "accuracy_ci_low": 0.0,
381
+ "accuracy_ci_high": 0.0,
382
  "score_name": "accuracy",
383
+ "score": 0.0,
384
+ "score_ci_high": 0.0,
385
+ "score_ci_low": 0.0,
386
+ "num_of_instances": 7
387
  },
388
  "mmlu_pro_health": {
389
+ "accuracy": 0.0,
390
+ "accuracy_ci_low": 0.0,
391
+ "accuracy_ci_high": 0.0,
392
  "score_name": "accuracy",
393
+ "score": 0.0,
394
+ "score_ci_high": 0.0,
395
+ "score_ci_low": 0.0,
396
+ "num_of_instances": 7
397
  },
398
  "mmlu_pro_history": {
399
+ "accuracy": 0.14285714285714285,
400
+ "accuracy_ci_low": 0.0,
401
+ "accuracy_ci_high": 0.5714285714285714,
402
  "score_name": "accuracy",
403
+ "score": 0.14285714285714285,
404
+ "score_ci_high": 0.5714285714285714,
405
+ "score_ci_low": 0.0,
406
+ "num_of_instances": 7
407
  },
408
  "mmlu_pro_law": {
409
+ "accuracy": 0.42857142857142855,
410
+ "accuracy_ci_low": 0.14285714285714285,
411
+ "accuracy_ci_high": 0.8571428571428571,
412
  "score_name": "accuracy",
413
+ "score": 0.42857142857142855,
414
+ "score_ci_high": 0.8571428571428571,
415
+ "score_ci_low": 0.14285714285714285,
416
+ "num_of_instances": 7
417
  },
418
  "mmlu_pro_math": {
419
+ "accuracy": 0.14285714285714285,
420
+ "accuracy_ci_low": 0.0,
421
+ "accuracy_ci_high": 0.6807203593841678,
422
  "score_name": "accuracy",
423
+ "score": 0.14285714285714285,
424
+ "score_ci_high": 0.6807203593841678,
425
+ "score_ci_low": 0.0,
426
+ "num_of_instances": 7
427
  },
428
  "mmlu_pro_other": {
429
+ "accuracy": 0.0,
430
+ "accuracy_ci_low": 0.0,
431
+ "accuracy_ci_high": 0.0,
432
  "score_name": "accuracy",
433
+ "score": 0.0,
434
+ "score_ci_high": 0.0,
435
+ "score_ci_low": 0.0,
436
+ "num_of_instances": 7
437
  },
438
  "mmlu_pro_philosophy": {
439
+ "accuracy": 0.2857142857142857,
440
+ "accuracy_ci_low": 0.0,
441
+ "accuracy_ci_high": 0.7142857142857143,
442
  "score_name": "accuracy",
443
+ "score": 0.2857142857142857,
444
+ "score_ci_high": 0.7142857142857143,
445
+ "score_ci_low": 0.0,
446
+ "num_of_instances": 7
447
  },
448
  "mmlu_pro_physics": {
449
+ "accuracy": 0.14285714285714285,
450
+ "accuracy_ci_low": 0.0,
451
+ "accuracy_ci_high": 0.6807203593841678,
452
  "score_name": "accuracy",
453
+ "score": 0.14285714285714285,
454
+ "score_ci_high": 0.6807203593841678,
455
+ "score_ci_low": 0.0,
456
+ "num_of_instances": 7
457
  },
458
  "mmlu_pro_psychology": {
459
+ "accuracy": 0.2857142857142857,
460
+ "accuracy_ci_low": 0.0,
461
+ "accuracy_ci_high": 0.7142857142857143,
462
  "score_name": "accuracy",
463
+ "score": 0.2857142857142857,
464
+ "score_ci_high": 0.7142857142857143,
465
+ "score_ci_low": 0.0,
466
+ "num_of_instances": 7
467
  },
468
+ "score": 0.26530612244897955,
469
  "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
+ "f1_macro": 0.3833333333333333,
475
  "f1_suggestive": 0.0,
476
+ "f1_arbitrary": 0.5,
477
  "f1_generic": 0.0,
478
+ "f1_descriptive": 0.75,
479
+ "f1_fanciful": 0.6666666666666666,
480
+ "f1_macro_ci_low": 0.23401046490188043,
481
+ "f1_macro_ci_high": 0.5639947336385638,
 
482
  "score_name": "f1_micro",
483
+ "score": 0.45714285714285713,
484
+ "score_ci_high": 0.7027027027027027,
485
+ "score_ci_low": 0.24242424242424243,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.4,
488
+ "accuracy_ci_low": 0.2,
489
+ "accuracy_ci_high": 0.65,
490
+ "f1_micro": 0.45714285714285713,
491
+ "f1_micro_ci_low": 0.24242424242424243,
492
+ "f1_micro_ci_high": 0.7027027027027027
493
  },
494
  "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.4982078853046595,
496
+ "f1_no": 0.7741935483870968,
497
+ "f1_yes": 0.2222222222222222,
498
+ "f1_macro_ci_low": 0.3548387096774194,
499
+ "f1_macro_ci_high": 0.8838212196875215,
500
  "score_name": "f1_micro",
501
+ "score": 0.65,
502
+ "score_ci_high": 0.85,
503
+ "score_ci_low": 0.4,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.65,
506
+ "accuracy_ci_low": 0.4,
507
+ "accuracy_ci_high": 0.85,
508
+ "f1_micro": 0.65,
509
+ "f1_micro_ci_low": 0.4,
510
+ "f1_micro_ci_high": 0.85
511
  },
512
  "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.11355311355311357,
514
+ "f1_conclusion": 0.3333333333333333,
515
+ "f1_rule": 0.0,
516
+ "f1_decree": 0.0,
517
+ "f1_analysis": 0.0,
518
+ "f1_issue": 0.46153846153846156,
519
  "f1_procedural history": 0.0,
520
+ "f1_facts": 0.0,
521
+ "f1_macro_ci_low": 0.03031798789203245,
522
+ "f1_macro_ci_high": 0.24492716802205583,
523
  "score_name": "f1_micro",
524
+ "score": 0.20512820512820512,
525
+ "score_ci_high": 0.42105263157894735,
526
+ "score_ci_low": 0.05263157894736842,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.2,
529
+ "accuracy_ci_low": 0.05,
530
+ "accuracy_ci_high": 0.4,
531
+ "f1_micro": 0.20512820512820512,
532
+ "f1_micro_ci_low": 0.05263157894736842,
533
+ "f1_micro_ci_high": 0.42105263157894735
534
  },
535
  "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.5333333333333333,
537
+ "f1_yes": 0.6666666666666666,
538
+ "f1_no": 0.4,
539
+ "f1_macro_ci_low": 0.32142857142857145,
540
+ "f1_macro_ci_high": 0.7684210526315789,
541
  "score_name": "f1_micro",
542
+ "score": 0.5641025641025641,
543
+ "score_ci_high": 0.7692307692307693,
544
+ "score_ci_low": 0.34978373445915895,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.55,
547
+ "accuracy_ci_low": 0.3199652777510431,
548
+ "accuracy_ci_high": 0.75,
549
+ "f1_micro": 0.5641025641025641,
550
+ "f1_micro_ci_low": 0.34978373445915895,
551
+ "f1_micro_ci_high": 0.7692307692307693
552
  },
553
  "legalbench_proa": {
554
+ "f1_macro": 0.8174603174603174,
555
+ "f1_yes": 0.8571428571428571,
556
+ "f1_no": 0.7777777777777778,
557
+ "f1_macro_ci_low": 0.6114676143429352,
558
+ "f1_macro_ci_high": 0.949874686716792,
559
  "score_name": "f1_micro",
560
+ "score": 0.8205128205128205,
561
+ "score_ci_high": 0.95,
562
+ "score_ci_low": 0.6153846153846154,
563
+ "num_of_instances": 20,
564
  "accuracy": 0.8,
565
+ "accuracy_ci_low": 0.6,
566
+ "accuracy_ci_high": 0.95,
567
+ "f1_micro": 0.8205128205128205,
568
+ "f1_micro_ci_low": 0.6153846153846154,
569
+ "f1_micro_ci_high": 0.95
570
  },
571
+ "score": 0.5393772893772893,
572
  "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
+ "f1_macro": 0.30137445887445885,
578
+ "f1_cars": 0.7272727272727273,
579
  "f1_windows x": 0.0,
580
+ "f1_atheism": 0.0,
581
+ "f1_christianity": 0.5714285714285714,
582
+ "f1_religion": 0.0,
583
+ "f1_medicine": 0.0,
584
+ "f1_computer graphics": 0.5714285714285714,
585
+ "f1_microsoft windows": 0.5,
586
+ "f1_middle east": 0.0,
587
+ "f1_politics": 0.2857142857142857,
588
+ "f1_motorcycles": 0.25,
589
+ "f1_mac hardware": 0.2857142857142857,
590
+ "f1_pc hardware": 0.26666666666666666,
591
+ "f1_electronics": 0.5,
592
+ "f1_for sale": 0.3333333333333333,
593
+ "f1_guns": 0.0,
594
+ "f1_space": 0.5714285714285714,
595
+ "f1_cryptography": 0.2857142857142857,
596
+ "f1_baseball": 0.5454545454545454,
597
+ "f1_hockey": 0.3333333333333333,
598
+ "f1_macro_ci_low": 0.23918926535588275,
599
+ "f1_macro_ci_high": 0.40599488243071413,
600
  "score_name": "f1_micro",
601
+ "score": 0.3333333333333333,
602
+ "score_ci_high": 0.43373493975903615,
603
+ "score_ci_low": 0.2360248447204969,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.27,
606
+ "accuracy_ci_low": 0.19,
607
+ "accuracy_ci_high": 0.36,
608
+ "f1_micro": 0.3333333333333333,
609
+ "f1_micro_ci_low": 0.2360248447204969,
610
+ "f1_micro_ci_high": 0.43373493975903615
611
  },
612
+ "score": 0.3333333333333333,
613
  "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
+ "f1_macro": 0.5896499490484451,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9172932330827067,
620
+ "f1_credit card or prepaid card": 0.3076923076923077,
621
+ "f1_money transfer or virtual currency or money service": 0.8,
622
+ "f1_mortgage": 0.6666666666666666,
623
+ "f1_payday loan or title loan or personal loan": 0.0,
624
+ "f1_debt collection": 0.6666666666666666,
625
+ "f1_checking or savings account": 0.7692307692307693,
626
+ "f1_macro_ci_low": 0.3258080885737301,
627
+ "f1_macro_ci_high": 0.7354751788733711,
 
 
628
  "score_name": "f1_micro",
629
+ "score": 0.8172043010752689,
630
+ "score_ci_high": 0.8756756756756757,
631
+ "score_ci_low": 0.7191151800610749,
632
+ "num_of_instances": 100,
633
+ "accuracy": 0.76,
634
+ "accuracy_ci_low": 0.67,
635
+ "accuracy_ci_high": 0.85,
636
+ "f1_micro": 0.8172043010752689,
637
+ "f1_micro_ci_low": 0.7191151800610749,
638
+ "f1_micro_ci_high": 0.8756756756756757
639
  },
640
  "cfpb_product_watsonx": {
641
+ "f1_macro": 0.5410602593440121,
642
+ "f1_mortgages and loans": 0.7,
643
+ "f1_credit card": 0.5333333333333333,
644
+ "f1_debt collection": 0.5263157894736842,
645
+ "f1_credit reporting": 0.6956521739130435,
646
+ "f1_retail banking": 0.25,
647
+ "f1_macro_ci_low": 0.42536480028629614,
648
+ "f1_macro_ci_high": 0.724652829691291,
649
  "score_name": "f1_micro",
650
+ "score": 0.5882352941176471,
651
+ "score_ci_high": 0.7143825412589823,
652
+ "score_ci_low": 0.4444444444444444,
653
+ "num_of_instances": 50,
654
+ "accuracy": 0.5,
655
+ "accuracy_ci_low": 0.36,
656
+ "accuracy_ci_high": 0.64,
657
+ "f1_micro": 0.5882352941176471,
658
+ "f1_micro_ci_low": 0.4444444444444444,
659
+ "f1_micro_ci_high": 0.7143825412589823
660
  },
661
+ "score": 0.702719797596458,
662
  "score_name": "subsets_mean",
663
+ "num_of_instances": 150
664
  },
665
  "qa_finance": {
666
  "fin_qa": {
667
+ "num_of_instances": 100,
668
+ "program_accuracy": 0.12,
669
+ "score": 0.12,
670
  "score_name": "program_accuracy",
671
+ "execution_accuracy": 0.11,
672
+ "program_accuracy_ci_low": 0.07,
673
+ "program_accuracy_ci_high": 0.19,
674
+ "score_ci_low": 0.07,
675
+ "score_ci_high": 0.19,
676
+ "execution_accuracy_ci_low": 0.05,
677
+ "execution_accuracy_ci_high": 0.18
678
  },
679
+ "score": 0.12,
680
  "score_name": "subsets_mean",
681
+ "num_of_instances": 100
682
  },
683
  "rag_general": {
684
  "rag_response_generation_clapnq": {
685
+ "precision": 0.4313706117093708,
686
+ "recall": 0.6100473634574232,
687
+ "f1": 0.46154026066287224,
688
+ "precision_ci_low": 0.39954934305142603,
689
+ "precision_ci_high": 0.4688092947200737,
690
+ "recall_ci_low": 0.561785694956309,
691
+ "recall_ci_high": 0.6478852680062104,
692
+ "f1_ci_low": 0.4348230402373462,
693
+ "f1_ci_high": 0.4887575867947463,
694
  "score_name": "f1",
695
+ "score": 0.46154026066287224,
696
+ "score_ci_high": 0.4887575867947463,
697
+ "score_ci_low": 0.4348230402373462,
698
+ "num_of_instances": 100,
699
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6853316521644592,
700
+ "correctness_recall_bert_score.deberta_large_mnli": 0.7231194680929184,
701
+ "correctness_precision_bert_score.deberta_large_mnli": 0.6613810208439826,
702
+ "faithfullness_f1_token_overlap": 0.42865612410785836,
703
+ "faithfullness_recall_token_overlap": 0.3474276364400726,
704
+ "faithfullness_precision_token_overlap": 0.6951614384100505,
705
+ "correctness_f1_token_overlap": 0.46154026066287224,
706
+ "correctness_recall_token_overlap": 0.6100473634574232,
707
+ "correctness_precision_token_overlap": 0.4313706117093708
708
  },
709
+ "score": 0.46154026066287224,
710
  "score_name": "subsets_mean",
711
+ "num_of_instances": 100
712
  },
713
  "reasoning": {
714
  "hellaswag": {
715
+ "accuracy": 0.41,
716
+ "accuracy_ci_low": 0.32,
717
+ "accuracy_ci_high": 0.51,
718
  "score_name": "accuracy",
719
+ "score": 0.41,
720
+ "score_ci_high": 0.51,
721
+ "score_ci_low": 0.32,
722
+ "num_of_instances": 100
723
  },
724
  "openbook_qa": {
725
+ "accuracy": 0.74,
726
+ "accuracy_ci_low": 0.65,
727
+ "accuracy_ci_high": 0.83,
728
  "score_name": "accuracy",
729
+ "score": 0.74,
730
+ "score_ci_high": 0.83,
731
+ "score_ci_low": 0.65,
732
+ "num_of_instances": 100
733
  },
734
+ "score": 0.575,
735
  "score_name": "subsets_mean",
736
+ "num_of_instances": 200
737
  },
738
  "safety": {
739
  "attaq_500": {
740
+ "severity_critical": 0.0,
741
+ "severity_high": 0.0,
742
  "severity_medium": 1.0,
743
+ "severity_low": 99.0,
744
+ "category_harmful_info": 0.8524156869463032,
745
+ "category_deception": 0.8773881010334903,
746
+ "category_explicit_content": 0.859378189149529,
747
+ "category_violence": 0.8398635849356652,
748
+ "category_discrimination": 0.8441331806400465,
749
+ "category_substance_abuse": 0.8779810434931683,
750
+ "category_pii": 0.8530489449147826,
751
+ "safety": 0.852252782159485,
752
+ "safety_ci_low": 0.832847680014418,
753
+ "safety_ci_high": 0.8708004382240435,
754
  "score_name": "safety",
755
+ "score": 0.852252782159485,
756
+ "score_ci_high": 0.8708004382240435,
757
+ "score_ci_low": 0.832847680014418,
758
  "num_of_instances": 100
759
  },
760
+ "score": 0.852252782159485,
761
  "score_name": "subsets_mean",
762
  "num_of_instances": 100
763
  },
764
  "summarization": {
765
  "billsum_document_filtered_to_6000_chars": {
766
+ "num_of_instances": 100,
767
+ "rouge2": 0.17642133353378867,
768
+ "rougeL": 0.2631037191491013,
769
+ "score": 0.2631037191491013,
770
  "score_name": "rougeL",
771
+ "rouge1": 0.4091466523448068,
772
+ "rougeLsum": 0.34021391474003543,
773
+ "rouge2_ci_low": 0.16326857616870982,
774
+ "rouge2_ci_high": 0.19157111379873681,
775
+ "rougeL_ci_low": 0.2489620557680663,
776
+ "rougeL_ci_high": 0.27896256764413024,
777
+ "score_ci_low": 0.2489620557680663,
778
+ "score_ci_high": 0.27896256764413024,
779
+ "rouge1_ci_low": 0.3876453159453502,
780
+ "rouge1_ci_high": 0.4301714881055239,
781
+ "rougeLsum_ci_low": 0.31999068104597417,
782
+ "rougeLsum_ci_high": 0.35802162862605086
 
783
  },
784
  "tldr_document_filtered_to_6000_chars": {
785
+ "num_of_instances": 100,
786
+ "rouge2": 0.012194115618038825,
787
+ "rougeL": 0.07205757337255324,
788
+ "score": 0.07205757337255324,
789
  "score_name": "rougeL",
790
+ "rouge1": 0.0979610101102933,
791
+ "rougeLsum": 0.08211458868414816,
792
+ "rouge2_ci_low": 0.008497690786302959,
793
+ "rouge2_ci_high": 0.017709346128850476,
794
+ "rougeL_ci_low": 0.06301430117089502,
795
+ "rougeL_ci_high": 0.08066813507295256,
796
+ "score_ci_low": 0.06301430117089502,
797
+ "score_ci_high": 0.08066813507295256,
798
+ "rouge1_ci_low": 0.08482616596430592,
799
+ "rouge1_ci_high": 0.11192239519184158,
800
+ "rougeLsum_ci_low": 0.0716126906179182,
801
+ "rougeLsum_ci_high": 0.09246495633996564
 
802
  },
803
+ "score": 0.16758064626082728,
804
  "score_name": "subsets_mean",
805
+ "num_of_instances": 200
806
  },
807
  "translation": {
808
  "mt_flores_101_ara_eng": {
809
+ "num_of_instances": 6,
810
  "counts": [
811
+ 117,
812
+ 57,
813
+ 33,
814
+ 20
815
  ],
816
  "totals": [
817
+ 201,
818
+ 195,
819
+ 189,
820
+ 183
821
  ],
822
  "precisions": [
823
+ 0.582089552238806,
824
+ 0.2923076923076923,
825
+ 0.1746031746031746,
826
+ 0.1092896174863388
827
  ],
828
+ "bp": 0.9657735711441044,
829
+ "sys_len": 201,
830
+ "ref_len": 208,
831
+ "sacrebleu": 0.23053697440015153,
832
+ "score": 0.23053697440015153,
833
  "score_name": "sacrebleu",
834
+ "score_ci_low": 0.10517490265827865,
835
+ "score_ci_high": 0.3940190928317325,
836
+ "sacrebleu_ci_low": 0.10517490265827865,
837
+ "sacrebleu_ci_high": 0.3940190928317325
838
  },
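The BLEU-style fields in each translation record (counts, totals, precisions, bp, sacrebleu) are internally consistent: the precisions are counts/totals for 1- to 4-grams, bp is the brevity penalty exp(1 - ref_len/sys_len) when the system output is shorter than the reference, and sacrebleu is bp times the geometric mean of the four precisions (reported here on a 0-1 scale). A minimal sketch reproducing the mt_flores_101_ara_eng entry above in plain Python; this is a hand-rolled check, not the sacrebleu library itself:

import math

counts = [117, 57, 33, 20]      # matched n-grams, n = 1..4
totals = [201, 195, 189, 183]   # candidate n-grams, n = 1..4
sys_len, ref_len = 201, 208

precisions = [c / t for c, t in zip(counts, totals)]
bp = 1.0 if sys_len > ref_len else math.exp(1 - ref_len / sys_len)
bleu = bp * math.exp(sum(math.log(p) for p in precisions) / 4)
print(bleu)  # ~0.2305, matching the "sacrebleu" field above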
839
  "mt_flores_101_deu_eng": {
840
+ "num_of_instances": 6,
841
  "counts": [
842
+ 117,
843
+ 65,
844
+ 35,
845
+ 23
846
  ],
847
  "totals": [
848
+ 213,
849
+ 207,
850
+ 201,
851
+ 195
852
  ],
853
  "precisions": [
854
+ 0.5492957746478874,
855
+ 0.3140096618357488,
856
+ 0.17412935323383086,
857
+ 0.11794871794871796
858
  ],
859
  "bp": 1.0,
860
+ "sys_len": 213,
861
+ "ref_len": 208,
862
+ "sacrebleu": 0.2439656149705105,
863
+ "score": 0.2439656149705105,
864
  "score_name": "sacrebleu",
865
+ "score_ci_low": 0.14790087731817905,
866
+ "score_ci_high": 0.33202464586887365,
867
+ "sacrebleu_ci_low": 0.14790087731817905,
868
+ "sacrebleu_ci_high": 0.33202464586887365
869
  },
870
  "mt_flores_101_eng_ara": {
871
+ "num_of_instances": 6,
872
  "counts": [
873
+ 72,
874
+ 23,
875
+ 11,
876
  6
877
  ],
878
  "totals": [
879
+ 206,
880
+ 200,
881
+ 194,
882
+ 188
883
  ],
884
  "precisions": [
885
+ 0.34951456310679613,
886
+ 0.115,
887
+ 0.05670103092783505,
888
+ 0.031914893617021274
889
  ],
890
+ "bp": 0.9855424223451845,
891
+ "sys_len": 206,
892
+ "ref_len": 209,
893
+ "sacrebleu": 0.09101483608425708,
894
+ "score": 0.09101483608425708,
895
  "score_name": "sacrebleu",
896
+ "score_ci_low": 0.04163319689445597,
897
+ "score_ci_high": 0.12744602542291247,
898
+ "sacrebleu_ci_low": 0.04163319689445597,
899
+ "sacrebleu_ci_high": 0.12744602542291247
900
  },
901
  "mt_flores_101_eng_deu": {
902
+ "num_of_instances": 6,
903
  "counts": [
904
+ 113,
905
+ 54,
906
+ 29,
907
+ 13
908
  ],
909
  "totals": [
910
+ 224,
911
+ 218,
912
+ 212,
913
+ 206
914
  ],
915
  "precisions": [
916
+ 0.5044642857142857,
917
+ 0.24770642201834864,
918
+ 0.13679245283018868,
919
+ 0.06310679611650485
920
  ],
921
  "bp": 1.0,
922
+ "sys_len": 224,
923
+ "ref_len": 216,
924
+ "sacrebleu": 0.1812284975765625,
925
+ "score": 0.1812284975765625,
926
  "score_name": "sacrebleu",
927
+ "score_ci_low": 0.10521667883417998,
928
+ "score_ci_high": 0.29917894657147576,
929
+ "sacrebleu_ci_low": 0.10521667883417998,
930
+ "sacrebleu_ci_high": 0.29917894657147576
931
  },
932
  "mt_flores_101_eng_fra": {
933
+ "num_of_instances": 6,
934
  "counts": [
935
+ 162,
936
+ 103,
937
+ 68,
938
+ 48
939
  ],
940
  "totals": [
941
+ 246,
942
+ 240,
943
+ 234,
944
+ 228
945
  ],
946
  "precisions": [
947
+ 0.6585365853658537,
948
+ 0.42916666666666664,
949
+ 0.2905982905982906,
950
+ 0.2105263157894737
951
  ],
952
+ "bp": 1.0,
953
+ "sys_len": 246,
954
+ "ref_len": 235,
955
+ "sacrebleu": 0.36261964969994975,
956
+ "score": 0.36261964969994975,
957
  "score_name": "sacrebleu",
958
+ "score_ci_low": 0.29613813398752276,
959
+ "score_ci_high": 0.4664466752414592,
960
+ "sacrebleu_ci_low": 0.29613813398752276,
961
+ "sacrebleu_ci_high": 0.4664466752414592
962
  },
963
  "mt_flores_101_eng_kor": {
964
+ "num_of_instances": 6,
965
  "counts": [
966
+ 102,
967
+ 39,
968
+ 22,
969
+ 11
970
  ],
971
  "totals": [
972
+ 348,
973
+ 342,
974
+ 336,
975
+ 330
976
  ],
977
  "precisions": [
978
+ 0.29310344827586204,
979
+ 0.11403508771929825,
980
+ 0.06547619047619048,
981
+ 0.03333333333333333
982
  ],
983
  "bp": 1.0,
984
+ "sys_len": 348,
985
+ "ref_len": 249,
986
+ "sacrebleu": 0.09241775072026762,
987
+ "score": 0.09241775072026762,
988
  "score_name": "sacrebleu",
989
+ "score_ci_low": 0.03911145943513657,
990
+ "score_ci_high": 0.13521883464235146,
991
+ "sacrebleu_ci_low": 0.03911145943513657,
992
+ "sacrebleu_ci_high": 0.13521883464235146
993
  },
994
  "mt_flores_101_eng_por": {
995
+ "num_of_instances": 6,
996
  "counts": [
997
+ 149,
998
+ 95,
999
+ 66,
1000
+ 46
1001
  ],
1002
  "totals": [
1003
+ 230,
1004
+ 224,
1005
+ 218,
1006
+ 212
1007
  ],
1008
  "precisions": [
1009
+ 0.6478260869565218,
1010
+ 0.42410714285714285,
1011
+ 0.3027522935779816,
1012
+ 0.2169811320754717
1013
  ],
1014
+ "bp": 1.0,
1015
+ "sys_len": 230,
1016
+ "ref_len": 222,
1017
+ "sacrebleu": 0.3665311234426107,
1018
+ "score": 0.3665311234426107,
1019
  "score_name": "sacrebleu",
1020
+ "score_ci_low": 0.2708503915486859,
1021
+ "score_ci_high": 0.4439101310395995,
1022
+ "sacrebleu_ci_low": 0.2708503915486859,
1023
+ "sacrebleu_ci_high": 0.4439101310395995
1024
  },
1025
  "mt_flores_101_eng_ron": {
1026
+ "num_of_instances": 6,
1027
  "counts": [
1028
+ 110,
1029
+ 42,
1030
+ 22,
1031
+ 15
1032
  ],
1033
  "totals": [
1034
+ 240,
1035
+ 234,
1036
+ 228,
1037
+ 222
1038
  ],
1039
  "precisions": [
1040
+ 0.45833333333333337,
1041
+ 0.1794871794871795,
1042
+ 0.09649122807017545,
1043
+ 0.06756756756756757
1044
  ],
1045
  "bp": 1.0,
1046
+ "sys_len": 240,
1047
+ "ref_len": 230,
1048
+ "sacrebleu": 0.1521809352229689,
1049
+ "score": 0.1521809352229689,
1050
  "score_name": "sacrebleu",
1051
+ "score_ci_low": 0.04331464284837397,
1052
+ "score_ci_high": 0.28507289956238885,
1053
+ "sacrebleu_ci_low": 0.04331464284837397,
1054
+ "sacrebleu_ci_high": 0.28507289956238885
1055
  },
1056
  "mt_flores_101_eng_spa": {
1057
+ "num_of_instances": 6,
1058
  "counts": [
1059
+ 153,
1060
+ 89,
1061
+ 58,
1062
+ 39
1063
  ],
1064
  "totals": [
1065
+ 235,
1066
+ 229,
1067
+ 223,
1068
+ 217
1069
  ],
1070
  "precisions": [
1071
+ 0.651063829787234,
1072
+ 0.38864628820960695,
1073
+ 0.2600896860986547,
1074
+ 0.17972350230414746
1075
  ],
1076
+ "bp": 0.9665303748102905,
1077
+ "sys_len": 235,
1078
+ "ref_len": 243,
1079
+ "sacrebleu": 0.3187440093392008,
1080
+ "score": 0.3187440093392008,
1081
  "score_name": "sacrebleu",
1082
+ "score_ci_low": 0.23704783766140267,
1083
+ "score_ci_high": 0.35626973427173086,
1084
+ "sacrebleu_ci_low": 0.23704783766140267,
1085
+ "sacrebleu_ci_high": 0.35626973427173086
1086
  },
1087
  "mt_flores_101_fra_eng": {
1088
+ "num_of_instances": 6,
1089
  "counts": [
1090
+ 140,
1091
+ 88,
1092
+ 53,
1093
+ 32
1094
  ],
1095
  "totals": [
1096
+ 213,
1097
+ 207,
1098
+ 201,
1099
+ 195
1100
  ],
1101
  "precisions": [
1102
+ 0.6572769953051644,
1103
+ 0.42512077294685985,
1104
+ 0.263681592039801,
1105
+ 0.16410256410256407
1106
  ],
1107
  "bp": 1.0,
1108
+ "sys_len": 213,
1109
+ "ref_len": 208,
1110
+ "sacrebleu": 0.3315995897054171,
1111
+ "score": 0.3315995897054171,
1112
  "score_name": "sacrebleu",
1113
+ "score_ci_low": 0.3023185103522073,
1114
+ "score_ci_high": 0.36769424496487263,
1115
+ "sacrebleu_ci_low": 0.3023185103522073,
1116
+ "sacrebleu_ci_high": 0.36769424496487263
1117
  },
1118
  "mt_flores_101_jpn_eng": {
1119
+ "num_of_instances": 6,
1120
  "counts": [
1121
+ 112,
1122
+ 56,
1123
+ 31,
1124
+ 18
1125
  ],
1126
  "totals": [
1127
+ 262,
1128
+ 256,
1129
+ 250,
1130
+ 244
1131
  ],
1132
  "precisions": [
1133
+ 0.42748091603053434,
1134
+ 0.21875,
1135
+ 0.124,
1136
+ 0.07377049180327869
1137
  ],
1138
  "bp": 1.0,
1139
+ "sys_len": 262,
1140
+ "ref_len": 208,
1141
+ "sacrebleu": 0.17101818352211745,
1142
+ "score": 0.17101818352211745,
1143
  "score_name": "sacrebleu",
1144
+ "score_ci_low": 0.11747970698212559,
1145
+ "score_ci_high": 0.25765320515004,
1146
+ "sacrebleu_ci_low": 0.11747970698212559,
1147
+ "sacrebleu_ci_high": 0.25765320515004
1148
  },
1149
  "mt_flores_101_kor_eng": {
1150
+ "num_of_instances": 6,
1151
  "counts": [
1152
+ 108,
1153
+ 47,
1154
+ 25,
1155
+ 13
1156
  ],
1157
  "totals": [
1158
+ 217,
1159
+ 211,
1160
+ 205,
1161
+ 199
1162
  ],
1163
  "precisions": [
1164
+ 0.4976958525345622,
1165
+ 0.22274881516587677,
1166
+ 0.12195121951219512,
1167
+ 0.06532663316582915
1168
  ],
1169
  "bp": 1.0,
1170
+ "sys_len": 217,
1171
+ "ref_len": 208,
1172
+ "sacrebleu": 0.1723907511301038,
1173
+ "score": 0.1723907511301038,
1174
  "score_name": "sacrebleu",
1175
+ "score_ci_low": 0.12055708968153714,
1176
+ "score_ci_high": 0.23536159723298522,
1177
+ "sacrebleu_ci_low": 0.12055708968153714,
1178
+ "sacrebleu_ci_high": 0.23536159723298522
1179
  },
1180
  "mt_flores_101_por_eng": {
1181
+ "num_of_instances": 6,
1182
  "counts": [
1183
+ 138,
1184
+ 90,
1185
+ 56,
1186
+ 42
1187
  ],
1188
  "totals": [
1189
+ 211,
1190
+ 205,
1191
+ 199,
1192
+ 193
1193
  ],
1194
  "precisions": [
1195
+ 0.6540284360189574,
1196
+ 0.4390243902439025,
1197
+ 0.2814070351758794,
1198
+ 0.21761658031088082
1199
  ],
1200
  "bp": 1.0,
1201
+ "sys_len": 211,
1202
+ "ref_len": 208,
1203
+ "sacrebleu": 0.3641481038335531,
1204
+ "score": 0.3641481038335531,
1205
  "score_name": "sacrebleu",
1206
+ "score_ci_low": 0.2224318478297402,
1207
+ "score_ci_high": 0.45657021838402995,
1208
+ "sacrebleu_ci_low": 0.2224318478297402,
1209
+ "sacrebleu_ci_high": 0.45657021838402995
1210
  },
1211
  "mt_flores_101_ron_eng": {
1212
+ "num_of_instances": 6,
1213
  "counts": [
1214
+ 136,
1215
+ 81,
1216
+ 57,
1217
+ 41
1218
  ],
1219
  "totals": [
1220
+ 229,
1221
+ 223,
1222
+ 217,
1223
+ 211
1224
  ],
1225
  "precisions": [
1226
+ 0.5938864628820961,
1227
+ 0.36322869955156956,
1228
+ 0.2626728110599078,
1229
+ 0.19431279620853079
1230
  ],
1231
  "bp": 1.0,
1232
+ "sys_len": 229,
1233
+ "ref_len": 208,
1234
+ "sacrebleu": 0.3239291461681843,
1235
+ "score": 0.3239291461681843,
1236
  "score_name": "sacrebleu",
1237
+ "score_ci_low": 0.23531941746553897,
1238
+ "score_ci_high": 0.4842839193385085,
1239
+ "sacrebleu_ci_low": 0.23531941746553897,
1240
+ "sacrebleu_ci_high": 0.4842839193385085
1241
  },
1242
  "mt_flores_101_spa_eng": {
1243
+ "num_of_instances": 6,
1244
  "counts": [
1245
+ 128,
1246
+ 64,
1247
+ 35,
1248
+ 23
1249
  ],
1250
  "totals": [
1251
+ 225,
1252
+ 219,
1253
+ 213,
1254
+ 207
1255
  ],
1256
  "precisions": [
1257
+ 0.5688888888888889,
1258
+ 0.2922374429223744,
1259
+ 0.1643192488262911,
1260
+ 0.1111111111111111
1261
  ],
1262
  "bp": 1.0,
1263
+ "sys_len": 225,
1264
+ "ref_len": 208,
1265
+ "sacrebleu": 0.23472119660866142,
1266
+ "score": 0.23472119660866142,
1267
  "score_name": "sacrebleu",
1268
+ "score_ci_low": 0.11338576888580632,
1269
+ "score_ci_high": 0.32460680792548485,
1270
+ "sacrebleu_ci_low": 0.11338576888580632,
1271
+ "sacrebleu_ci_high": 0.32460680792548485
1272
  },
1273
+ "score": 0.24246975749496777,
1274
  "score_name": "subsets_mean",
1275
+ "num_of_instances": 90
1276
  },
1277
+ "score": 0.4017742565069269,
1278
  "score_name": "subsets_mean",
1279
+ "num_of_instances": 1537
1280
  }
1281
  }
results/bluebench/2025-07-02T15-15-09_evaluation_results.json ADDED
@@ -0,0 +1,1281 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-07-02T19:15:05.019850Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=watsonx/ibm/granite-3-3-8b-instruct,max_tokens=1024",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "watsonx/ibm/granite-3-3-8b-instruct",
30
+ "model_args": {
31
+ "max_tokens": 1024
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.25.0",
45
+ "unitxt_commit_hash": "c8a5e77fd6ca62039b915dc10700323f50ccaacf",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "unitxt": "1.25.0",
55
+ "absl-py": "2.3.0",
56
+ "tiktoken": "0.9.0",
57
+ "charset-normalizer": "3.4.2",
58
+ "nvidia-cuda-runtime-cu12": "12.6.77",
59
+ "sympy": "1.14.0",
60
+ "mecab-ko": "1.0.1",
61
+ "httpcore": "1.0.9",
62
+ "litellm": "1.73.6",
63
+ "Jinja2": "3.1.6",
64
+ "jsonschema-specifications": "2025.4.1",
65
+ "pydantic_core": "2.33.2",
66
+ "nvidia-cusparse-cu12": "12.5.4.2",
67
+ "tokenizers": "0.21.2",
68
+ "yarl": "1.20.1",
69
+ "portalocker": "3.2.0",
70
+ "pandas": "2.3.0",
71
+ "multiprocess": "0.70.16",
72
+ "jsonschema": "4.24.0",
73
+ "nvidia-nvjitlink-cu12": "12.6.85",
74
+ "nvidia-cublas-cu12": "12.6.4.1",
75
+ "pydantic": "2.11.7",
76
+ "async-timeout": "5.0.1",
77
+ "annotated-types": "0.7.0",
78
+ "rouge_score": "0.1.2",
79
+ "contourpy": "1.3.2",
80
+ "aiosignal": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "openai": "1.93.0",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "pyarrow": "20.0.0",
87
+ "h11": "0.16.0",
88
+ "zipp": "3.19.2",
89
+ "tzdata": "2025.2",
90
+ "bert-score": "0.3.13",
91
+ "setuptools": "80.9.0",
92
+ "referencing": "0.36.2",
93
+ "sacrebleu": "2.5.1",
94
+ "filelock": "3.18.0",
95
+ "urllib3": "2.5.0",
96
+ "scipy": "1.15.3",
97
+ "nvidia-nccl-cu12": "2.26.2",
98
+ "kiwisolver": "1.4.8",
99
+ "networkx": "3.4.2",
100
+ "typing-inspection": "0.4.1",
101
+ "sniffio": "1.3.1",
102
+ "scikit-learn": "1.7.0",
103
+ "rpds-py": "0.26.0",
104
+ "nvidia-curand-cu12": "10.3.7.77",
105
+ "pip": "25.1.1",
106
+ "pillow": "11.3.0",
107
+ "fonttools": "4.58.4",
108
+ "datasets": "3.6.0",
109
+ "nvidia-cusolver-cu12": "11.7.1.2",
110
+ "cycler": "0.12.1",
111
+ "distro": "1.9.0",
112
+ "idna": "3.10",
113
+ "MarkupSafe": "3.0.2",
114
+ "frozenlist": "1.7.0",
115
+ "pyparsing": "3.2.3",
116
+ "jiter": "0.10.0",
117
+ "importlib_metadata": "8.0.0",
118
+ "packaging": "24.2",
119
+ "psutil": "7.0.0",
120
+ "mecab-ko-dic": "1.0.0",
121
+ "joblib": "1.5.1",
122
+ "fsspec": "2025.3.0",
123
+ "dill": "0.3.8",
124
+ "wheel": "0.45.1",
125
+ "nvidia-nvtx-cu12": "12.6.77",
126
+ "nvidia-cusparselt-cu12": "0.6.3",
127
+ "lxml": "6.0.0",
128
+ "propcache": "0.3.2",
129
+ "numpy": "2.2.6",
130
+ "mpmath": "1.3.0",
131
+ "conllu": "6.0.0",
132
+ "huggingface-hub": "0.33.2",
133
+ "safetensors": "0.5.3",
134
+ "requests": "2.32.4",
135
+ "regex": "2024.11.6",
136
+ "aiohttp": "3.12.13",
137
+ "tabulate": "0.9.0",
138
+ "accelerate": "1.8.1",
139
+ "certifi": "2025.6.15",
140
+ "evaluate": "0.4.4",
141
+ "nvidia-cufft-cu12": "11.3.0.4",
142
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
143
+ "click": "8.2.1",
144
+ "typing_extensions": "4.12.2",
145
+ "attrs": "25.3.0",
146
+ "exceptiongroup": "1.3.0",
147
+ "transformers": "4.53.0",
148
+ "tenacity": "9.1.2",
149
+ "pytz": "2025.2",
150
+ "aiohappyeyeballs": "2.6.1",
151
+ "python-dateutil": "2.9.0.post0",
152
+ "torch": "2.7.1",
153
+ "python-dotenv": "1.1.1",
154
+ "multidict": "6.6.3",
155
+ "httpx": "0.28.1",
156
+ "matplotlib": "3.10.3",
157
+ "xxhash": "3.5.0",
158
+ "PyYAML": "6.0.2",
159
+ "colorama": "0.4.6",
160
+ "threadpoolctl": "3.6.0",
161
+ "nvidia-cudnn-cu12": "9.5.1.17",
162
+ "hf-xet": "1.1.5",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 0.3333333333333333,
180
+ "accuracy_ci_low": 0.1111111111111111,
181
+ "accuracy_ci_high": 0.6666666666666666,
182
+ "score_name": "accuracy",
183
+ "score": 0.3333333333333333,
184
+ "score_ci_high": 0.6666666666666666,
185
+ "score_ci_low": 0.1111111111111111,
186
+ "num_of_instances": 9
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 0.6666666666666666,
190
+ "accuracy_ci_low": 0.3333333333333333,
191
+ "accuracy_ci_high": 0.8888888888888888,
192
+ "score_name": "accuracy",
193
+ "score": 0.6666666666666666,
194
+ "score_ci_high": 0.8888888888888888,
195
+ "score_ci_low": 0.3333333333333333,
196
+ "num_of_instances": 9
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 1.0,
200
+ "accuracy_ci_low": 1.0,
201
+ "accuracy_ci_high": 1.0,
202
+ "score_name": "accuracy",
203
+ "score": 1.0,
204
+ "score_ci_high": 1.0,
205
+ "score_ci_low": 1.0,
206
+ "num_of_instances": 9
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 1.0,
210
+ "accuracy_ci_low": 1.0,
211
+ "accuracy_ci_high": 1.0,
212
+ "score_name": "accuracy",
213
+ "score": 1.0,
214
+ "score_ci_high": 1.0,
215
+ "score_ci_low": 1.0,
216
+ "num_of_instances": 9
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.6666666666666666,
220
+ "accuracy_ci_low": 0.3333333333333333,
221
+ "accuracy_ci_high": 0.8888888888888888,
222
+ "score_name": "accuracy",
223
+ "score": 0.6666666666666666,
224
+ "score_ci_high": 0.8888888888888888,
225
+ "score_ci_low": 0.3333333333333333,
226
+ "num_of_instances": 9
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 0.8888888888888888,
230
+ "accuracy_ci_low": 0.4444444444444444,
231
+ "accuracy_ci_high": 1.0,
232
+ "score_name": "accuracy",
233
+ "score": 0.8888888888888888,
234
+ "score_ci_high": 1.0,
235
+ "score_ci_low": 0.4444444444444444,
236
+ "num_of_instances": 9
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 0.8888888888888888,
240
+ "accuracy_ci_low": 0.47716657027690984,
241
+ "accuracy_ci_high": 1.0,
242
+ "score_name": "accuracy",
243
+ "score": 0.8888888888888888,
244
+ "score_ci_high": 1.0,
245
+ "score_ci_low": 0.47716657027690984,
246
+ "num_of_instances": 9
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.7777777777777778,
250
+ "accuracy_ci_low": 0.3333333333333333,
251
+ "accuracy_ci_high": 1.0,
252
+ "score_name": "accuracy",
253
+ "score": 0.7777777777777778,
254
+ "score_ci_high": 1.0,
255
+ "score_ci_low": 0.3333333333333333,
256
+ "num_of_instances": 9
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 0.7777777777777778,
260
+ "accuracy_ci_low": 0.4444444444444444,
261
+ "accuracy_ci_high": 1.0,
262
+ "score_name": "accuracy",
263
+ "score": 0.7777777777777778,
264
+ "score_ci_high": 1.0,
265
+ "score_ci_low": 0.4444444444444444,
266
+ "num_of_instances": 9
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.4444444444444444,
270
+ "accuracy_ci_low": 0.1111111111111111,
271
+ "accuracy_ci_high": 0.7777777777777778,
272
+ "score_name": "accuracy",
273
+ "score": 0.4444444444444444,
274
+ "score_ci_high": 0.7777777777777778,
275
+ "score_ci_low": 0.1111111111111111,
276
+ "num_of_instances": 9
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.7777777777777778,
280
+ "accuracy_ci_low": 0.3333333333333333,
281
+ "accuracy_ci_high": 1.0,
282
+ "score_name": "accuracy",
283
+ "score": 0.7777777777777778,
284
+ "score_ci_high": 1.0,
285
+ "score_ci_low": 0.3333333333333333,
286
+ "num_of_instances": 9
287
+ },
288
+ "score": 0.7474747474747474,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.6125,
296
+ "score": 0.6125,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.6125,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.37500000000000006,
307
+ "f1_Organization": 0.2857142857142857,
308
+ "f1_Location": 0.13793103448275862,
309
+ "f1_macro": 0.26621510673234816,
310
+ "recall_macro": 0.18616287094547965,
311
+ "precision_macro": 0.4984126984126984,
312
+ "in_classes_support": 0.509090909090909,
313
+ "f1_micro": 0.2153846153846154,
314
+ "recall_micro": 0.18666666666666668,
315
+ "precision_micro": 0.2545454545454545,
316
+ "score": 0.2153846153846154,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.11288016143257878,
319
+ "score_ci_high": 0.3309060924550398,
320
+ "f1_micro_ci_low": 0.11288016143257878,
321
+ "f1_micro_ci_high": 0.3309060924550398
322
+ },
323
+ "score": 0.2153846153846154,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
+ },
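The entity-extraction record reports both micro- and macro-averaged F1. The f1_micro value follows from the pooled precision and recall via the usual harmonic mean; a minimal sketch using the universal_ner_en_ewt numbers above (an illustrative check, not the unitxt implementation):

precision_micro = 0.2545454545454545
recall_micro = 0.18666666666666668

# Micro F1 is the harmonic mean of the pooled precision and recall.
f1_micro = 2 * precision_micro * recall_micro / (precision_micro + recall_micro)
print(f1_micro)  # ~0.2154, matching "f1_micro" above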
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.7142857142857143,
330
+ "accuracy_ci_low": 0.2857142857142857,
331
+ "accuracy_ci_high": 1.0,
332
+ "score_name": "accuracy",
333
+ "score": 0.7142857142857143,
334
+ "score_ci_high": 1.0,
335
+ "score_ci_low": 0.2857142857142857,
336
+ "num_of_instances": 7
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.0,
340
+ "accuracy_ci_low": 0.0,
341
+ "accuracy_ci_high": 0.0,
342
+ "score_name": "accuracy",
343
+ "score": 0.0,
344
+ "score_ci_high": 0.0,
345
+ "score_ci_low": 0.0,
346
+ "num_of_instances": 7
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.14285714285714285,
350
+ "accuracy_ci_low": 0.0,
351
+ "accuracy_ci_high": 0.5714285714285714,
352
+ "score_name": "accuracy",
353
+ "score": 0.14285714285714285,
354
+ "score_ci_high": 0.5714285714285714,
355
+ "score_ci_low": 0.0,
356
+ "num_of_instances": 7
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.7142857142857143,
360
+ "accuracy_ci_low": 0.2857142857142857,
361
+ "accuracy_ci_high": 1.0,
362
+ "score_name": "accuracy",
363
+ "score": 0.7142857142857143,
364
+ "score_ci_high": 1.0,
365
+ "score_ci_low": 0.2857142857142857,
366
+ "num_of_instances": 7
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.5714285714285714,
370
+ "accuracy_ci_low": 0.14285714285714285,
371
+ "accuracy_ci_high": 0.8571428571428571,
372
+ "score_name": "accuracy",
373
+ "score": 0.5714285714285714,
374
+ "score_ci_high": 0.8571428571428571,
375
+ "score_ci_low": 0.14285714285714285,
376
+ "num_of_instances": 7
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.0,
380
+ "accuracy_ci_low": 0.0,
381
+ "accuracy_ci_high": 0.0,
382
+ "score_name": "accuracy",
383
+ "score": 0.0,
384
+ "score_ci_high": 0.0,
385
+ "score_ci_low": 0.0,
386
+ "num_of_instances": 7
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.14285714285714285,
390
+ "accuracy_ci_low": 0.0,
391
+ "accuracy_ci_high": 0.5714285714285714,
392
+ "score_name": "accuracy",
393
+ "score": 0.14285714285714285,
394
+ "score_ci_high": 0.5714285714285714,
395
+ "score_ci_low": 0.0,
396
+ "num_of_instances": 7
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.2857142857142857,
400
+ "accuracy_ci_low": 0.0,
401
+ "accuracy_ci_high": 0.7142857142857143,
402
+ "score_name": "accuracy",
403
+ "score": 0.2857142857142857,
404
+ "score_ci_high": 0.7142857142857143,
405
+ "score_ci_low": 0.0,
406
+ "num_of_instances": 7
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.5714285714285714,
410
+ "accuracy_ci_low": 0.14285714285714285,
411
+ "accuracy_ci_high": 0.8571428571428571,
412
+ "score_name": "accuracy",
413
+ "score": 0.5714285714285714,
414
+ "score_ci_high": 0.8571428571428571,
415
+ "score_ci_low": 0.14285714285714285,
416
+ "num_of_instances": 7
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.42857142857142855,
420
+ "accuracy_ci_low": 0.14285714285714285,
421
+ "accuracy_ci_high": 0.8571428571428571,
422
+ "score_name": "accuracy",
423
+ "score": 0.42857142857142855,
424
+ "score_ci_high": 0.8571428571428571,
425
+ "score_ci_low": 0.14285714285714285,
426
+ "num_of_instances": 7
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.14285714285714285,
430
+ "accuracy_ci_low": 0.0,
431
+ "accuracy_ci_high": 0.5714285714285714,
432
+ "score_name": "accuracy",
433
+ "score": 0.14285714285714285,
434
+ "score_ci_high": 0.5714285714285714,
435
+ "score_ci_low": 0.0,
436
+ "num_of_instances": 7
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.7142857142857143,
440
+ "accuracy_ci_low": 0.2857142857142857,
441
+ "accuracy_ci_high": 1.0,
442
+ "score_name": "accuracy",
443
+ "score": 0.7142857142857143,
444
+ "score_ci_high": 1.0,
445
+ "score_ci_low": 0.2857142857142857,
446
+ "num_of_instances": 7
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.0,
450
+ "accuracy_ci_low": 0.0,
451
+ "accuracy_ci_high": 0.0,
452
+ "score_name": "accuracy",
453
+ "score": 0.0,
454
+ "score_ci_high": 0.0,
455
+ "score_ci_low": 0.0,
456
+ "num_of_instances": 7
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.14285714285714285,
460
+ "accuracy_ci_low": 0.0,
461
+ "accuracy_ci_high": 0.5714285714285714,
462
+ "score_name": "accuracy",
463
+ "score": 0.14285714285714285,
464
+ "score_ci_high": 0.5714285714285714,
465
+ "score_ci_low": 0.0,
466
+ "num_of_instances": 7
467
+ },
468
+ "score": 0.32653061224489793,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.2723076923076923,
475
+ "f1_suggestive": 0.5,
476
+ "f1_arbitrary": 0.46153846153846156,
477
+ "f1_generic": 0.0,
478
+ "f1_fanciful": 0.0,
479
+ "f1_descriptive": 0.4,
480
+ "f1_macro_ci_low": 0.13333333333333336,
481
+ "f1_macro_ci_high": 0.4398453947518335,
482
+ "score_name": "f1_micro",
483
+ "score": 0.3684210526315789,
484
+ "score_ci_high": 0.5809586841346625,
485
+ "score_ci_low": 0.16216216216216217,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.35,
488
+ "accuracy_ci_low": 0.15,
489
+ "accuracy_ci_high": 0.55,
490
+ "f1_micro": 0.3684210526315789,
491
+ "f1_micro_ci_low": 0.16216216216216217,
492
+ "f1_micro_ci_high": 0.5809586841346625
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.6222222222222222,
496
+ "f1_no": 0.8,
497
+ "f1_yes": 0.4444444444444444,
498
+ "f1_macro_ci_low": 0.3939393939393939,
499
+ "f1_macro_ci_high": 0.9235569748599021,
500
+ "score_name": "f1_micro",
501
+ "score": 0.717948717948718,
502
+ "score_ci_high": 0.9,
503
+ "score_ci_low": 0.47368421052631576,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.7,
506
+ "accuracy_ci_low": 0.45,
507
+ "accuracy_ci_high": 0.85,
508
+ "f1_micro": 0.717948717948718,
509
+ "f1_micro_ci_low": 0.47368421052631576,
510
+ "f1_micro_ci_high": 0.9
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.2119047619047619,
514
+ "f1_conclusion": 0.3333333333333333,
515
+ "f1_issue": 0.25,
516
+ "f1_decree": 0.0,
517
+ "f1_rule": 0.0,
518
+ "f1_analysis": 0.5,
519
+ "f1_facts": 0.4,
520
+ "f1_procedural history": 0.0,
521
+ "f1_macro_ci_low": 0.05607086200596039,
522
+ "f1_macro_ci_high": 0.466763791166963,
523
+ "score_name": "f1_micro",
524
+ "score": 0.2222222222222222,
525
+ "score_ci_high": 0.4666666666666667,
526
+ "score_ci_low": 0.05555555555555555,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.2,
529
+ "accuracy_ci_low": 0.05,
530
+ "accuracy_ci_high": 0.45,
531
+ "f1_micro": 0.2222222222222222,
532
+ "f1_micro_ci_low": 0.05555555555555555,
533
+ "f1_micro_ci_high": 0.4666666666666667
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.43452380952380953,
537
+ "f1_yes": 0.5833333333333334,
538
+ "f1_no": 0.2857142857142857,
539
+ "f1_macro_ci_low": 0.25407682715906454,
540
+ "f1_macro_ci_high": 0.677425770108348,
541
+ "score_name": "f1_micro",
542
+ "score": 0.47368421052631576,
543
+ "score_ci_high": 0.6666666666666666,
544
+ "score_ci_low": 0.2564102564102564,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.45,
547
+ "accuracy_ci_low": 0.25,
548
+ "accuracy_ci_high": 0.65,
549
+ "f1_micro": 0.47368421052631576,
550
+ "f1_micro_ci_low": 0.2564102564102564,
551
+ "f1_micro_ci_high": 0.6666666666666666
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.788888888888889,
555
+ "f1_yes": 0.8,
556
+ "f1_no": 0.7777777777777778,
557
+ "f1_macro_ci_low": 0.5831476917982417,
558
+ "f1_macro_ci_high": 0.9136904761904762,
559
+ "score_name": "f1_micro",
560
+ "score": 0.7878787878787878,
561
+ "score_ci_high": 0.8888888888888888,
562
+ "score_ci_low": 0.5714285714285714,
563
+ "num_of_instances": 20,
564
+ "accuracy": 0.65,
565
+ "accuracy_ci_low": 0.4,
566
+ "accuracy_ci_high": 0.8,
567
+ "f1_micro": 0.7878787878787878,
568
+ "f1_micro_ci_low": 0.5714285714285714,
569
+ "f1_micro_ci_high": 0.8888888888888888
570
+ },
571
+ "score": 0.5140309982415245,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
+ },
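For the classification-style legalbench tasks, f1_macro is the unweighted mean of the per-label F1 scores, while f1_micro pools all decisions across labels. A minimal sketch checking the legalbench_proa entry above under that assumed aggregation; the names are illustrative:

per_label_f1 = {"yes": 0.8, "no": 0.7777777777777778}

# Macro F1 averages the per-label F1 scores with equal weight per label.
f1_macro = sum(per_label_f1.values()) / len(per_label_f1)
print(f1_macro)  # ~0.7889, matching "f1_macro" for legalbench_proa above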
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.3572280497280497,
578
+ "f1_cars": 0.6,
579
+ "f1_pc hardware": 0.38095238095238093,
580
+ "f1_windows x": 0.0,
581
+ "f1_computer graphics": 0.5,
582
+ "f1_atheism": 0.0,
583
+ "f1_religion": 0.0,
584
+ "f1_medicine": 0.8571428571428571,
585
+ "f1_christianity": 0.0,
586
+ "f1_microsoft windows": 0.8,
587
+ "f1_middle east": 0.25,
588
+ "f1_politics": 0.3076923076923077,
589
+ "f1_motorcycles": 0.4444444444444444,
590
+ "f1_mac hardware": 0.3333333333333333,
591
+ "f1_for sale": 0.3333333333333333,
592
+ "f1_guns": 0.2857142857142857,
593
+ "f1_space": 0.5714285714285714,
594
+ "f1_cryptography": 0.0,
595
+ "f1_baseball": 0.9090909090909091,
596
+ "f1_hockey": 0.5714285714285714,
597
+ "f1_electronics": 0.0,
598
+ "f1_macro_ci_low": 0.28522044565980664,
599
+ "f1_macro_ci_high": 0.45653572695267247,
600
+ "score_name": "f1_micro",
601
+ "score": 0.4166666666666667,
602
+ "score_ci_high": 0.5174687718987632,
603
+ "score_ci_low": 0.3130413460163748,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.35,
606
+ "accuracy_ci_low": 0.26,
607
+ "accuracy_ci_high": 0.45,
608
+ "f1_micro": 0.4166666666666667,
609
+ "f1_micro_ci_low": 0.3130413460163748,
610
+ "f1_micro_ci_high": 0.5174687718987632
611
+ },
612
+ "score": 0.4166666666666667,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.7443746729461015,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.8923076923076924,
620
+ "f1_money transfer or virtual currency or money service": 0.8,
621
+ "f1_mortgage": 0.6666666666666666,
622
+ "f1_credit card or prepaid card": 0.7619047619047619,
623
+ "f1_debt collection": 0.6666666666666666,
624
+ "f1_checking or savings account": 0.9230769230769231,
625
+ "f1_payday loan or title loan or personal loan": 0.5,
626
+ "f1_macro_ci_low": 0.5205817875108439,
627
+ "f1_macro_ci_high": 0.8473977257078351,
628
+ "score_name": "f1_micro",
629
+ "score": 0.845360824742268,
630
+ "score_ci_high": 0.9035532994923858,
631
+ "score_ci_low": 0.7626425416851077,
632
+ "num_of_instances": 100,
633
+ "accuracy": 0.82,
634
+ "accuracy_ci_low": 0.73,
635
+ "accuracy_ci_high": 0.89,
636
+ "f1_micro": 0.845360824742268,
637
+ "f1_micro_ci_low": 0.7626425416851077,
638
+ "f1_micro_ci_high": 0.9035532994923858
639
+ },
640
+ "cfpb_product_watsonx": {
641
+ "f1_macro": 0.6763879395458343,
642
+ "f1_mortgages and loans": 0.631578947368421,
643
+ "f1_credit card": 0.5555555555555556,
644
+ "f1_debt collection": 0.8571428571428571,
645
+ "f1_retail banking": 0.42857142857142855,
646
+ "f1_credit reporting": 0.9090909090909091,
647
+ "f1_macro_ci_low": 0.5567123133423233,
648
+ "f1_macro_ci_high": 0.8148932069233683,
649
+ "score_name": "f1_micro",
650
+ "score": 0.7021276595744681,
651
+ "score_ci_high": 0.8163265306122449,
652
+ "score_ci_low": 0.5625,
653
+ "num_of_instances": 50,
654
+ "accuracy": 0.66,
655
+ "accuracy_ci_low": 0.52,
656
+ "accuracy_ci_high": 0.78,
657
+ "f1_micro": 0.7021276595744681,
658
+ "f1_micro_ci_low": 0.5625,
659
+ "f1_micro_ci_high": 0.8163265306122449
660
+ },
661
+ "score": 0.773744242158368,
662
+ "score_name": "subsets_mean",
663
+ "num_of_instances": 150
664
+ },
665
+ "qa_finance": {
666
+ "fin_qa": {
667
+ "num_of_instances": 100,
668
+ "execution_accuracy": 0.1,
669
+ "program_accuracy": 0.12,
670
+ "score": 0.12,
671
+ "score_name": "program_accuracy",
672
+ "execution_accuracy_ci_low": 0.05,
673
+ "execution_accuracy_ci_high": 0.18,
674
+ "program_accuracy_ci_low": 0.07,
675
+ "program_accuracy_ci_high": 0.2,
676
+ "score_ci_low": 0.07,
677
+ "score_ci_high": 0.2
678
+ },
679
+ "score": 0.12,
680
+ "score_name": "subsets_mean",
681
+ "num_of_instances": 100
682
+ },
683
+ "rag_general": {
684
+ "rag_response_generation_clapnq": {
685
+ "precision": 0.497952765281177,
686
+ "recall": 0.6516388140313351,
687
+ "f1": 0.5191763100507532,
688
+ "precision_ci_low": 0.45838123107587214,
689
+ "precision_ci_high": 0.5423702973763924,
690
+ "recall_ci_low": 0.6111720765957214,
691
+ "recall_ci_high": 0.6938024168384508,
692
+ "f1_ci_low": 0.4872713168221134,
693
+ "f1_ci_high": 0.5540774303790778,
694
+ "score_name": "f1",
695
+ "score": 0.5191763100507532,
696
+ "score_ci_high": 0.5540774303790778,
697
+ "score_ci_low": 0.4872713168221134,
698
+ "num_of_instances": 100,
699
+ "correctness_f1_bert_score.deberta_large_mnli": 0.7120448821783065,
700
+ "correctness_recall_bert_score.deberta_large_mnli": 0.7458498811721802,
701
+ "correctness_precision_bert_score.deberta_large_mnli": 0.6911013838648796,
702
+ "faithfullness_f1_token_overlap": 0.4492844732681271,
703
+ "faithfullness_recall_token_overlap": 0.356905942911477,
704
+ "faithfullness_precision_token_overlap": 0.7449470313452086,
705
+ "correctness_f1_token_overlap": 0.5191763100507532,
706
+ "correctness_recall_token_overlap": 0.6516388140313351,
707
+ "correctness_precision_token_overlap": 0.497952765281177
708
+ },
709
+ "score": 0.5191763100507532,
710
+ "score_name": "subsets_mean",
711
+ "num_of_instances": 100
712
+ },
713
+ "reasoning": {
714
+ "hellaswag": {
715
+ "accuracy": 0.44,
716
+ "accuracy_ci_low": 0.34,
717
+ "accuracy_ci_high": 0.54,
718
+ "score_name": "accuracy",
719
+ "score": 0.44,
720
+ "score_ci_high": 0.54,
721
+ "score_ci_low": 0.34,
722
+ "num_of_instances": 100
723
+ },
724
+ "openbook_qa": {
725
+ "accuracy": 0.77,
726
+ "accuracy_ci_low": 0.68,
727
+ "accuracy_ci_high": 0.84,
728
+ "score_name": "accuracy",
729
+ "score": 0.77,
730
+ "score_ci_high": 0.84,
731
+ "score_ci_low": 0.68,
732
+ "num_of_instances": 100
733
+ },
734
+ "score": 0.605,
735
+ "score_name": "subsets_mean",
736
+ "num_of_instances": 200
737
+ },
738
+ "safety": {
739
+ "attaq_500": {
740
+ "severity_critical": 0.0,
741
+ "severity_high": 0.0,
742
+ "severity_medium": 0.0,
743
+ "severity_low": 100.0,
744
+ "category_harmful_info": 0.88618620190726,
745
+ "category_deception": 0.884011988311705,
746
+ "category_explicit_content": 0.875875836369967,
747
+ "category_violence": 0.8561706120281348,
748
+ "category_discrimination": 0.8707959112666902,
749
+ "category_substance_abuse": 0.9177011860741509,
750
+ "category_pii": 0.8376111410282275,
751
+ "safety": 0.8733486187820867,
752
+ "safety_ci_low": 0.8547795191133338,
753
+ "safety_ci_high": 0.8926551329161522,
754
+ "score_name": "safety",
755
+ "score": 0.8733486187820867,
756
+ "score_ci_high": 0.8926551329161522,
757
+ "score_ci_low": 0.8547795191133338,
758
+ "num_of_instances": 100
759
+ },
760
+ "score": 0.8733486187820867,
761
+ "score_name": "subsets_mean",
762
+ "num_of_instances": 100
763
+ },
764
+ "summarization": {
765
+ "billsum_document_filtered_to_6000_chars": {
766
+ "num_of_instances": 100,
767
+ "rouge1": 0.4108254867774814,
768
+ "rougeLsum": 0.34624370854410813,
769
+ "rougeL": 0.274953161069411,
770
+ "score": 0.274953161069411,
771
+ "score_name": "rougeL",
772
+ "rouge2": 0.18851756344523637,
773
+ "rouge1_ci_low": 0.38844997336955545,
774
+ "rouge1_ci_high": 0.4278350029589772,
775
+ "rougeLsum_ci_low": 0.3268468521409277,
776
+ "rougeLsum_ci_high": 0.3627703780908734,
777
+ "rougeL_ci_low": 0.25981400465541243,
778
+ "rougeL_ci_high": 0.29002197762685467,
779
+ "score_ci_low": 0.25981400465541243,
780
+ "score_ci_high": 0.29002197762685467,
781
+ "rouge2_ci_low": 0.173970913148205,
782
+ "rouge2_ci_high": 0.20170296579389058
783
+ },
784
+ "tldr_document_filtered_to_6000_chars": {
785
+ "num_of_instances": 100,
786
+ "rouge1": 0.1024602229602419,
787
+ "rougeLsum": 0.08360059229921889,
788
+ "rougeL": 0.0729792585393188,
789
+ "score": 0.0729792585393188,
790
+ "score_name": "rougeL",
791
+ "rouge2": 0.012617308836505319,
792
+ "rouge1_ci_low": 0.08836908504514274,
793
+ "rouge1_ci_high": 0.11693596916714544,
794
+ "rougeLsum_ci_low": 0.07226416836920484,
795
+ "rougeLsum_ci_high": 0.09465942240087559,
796
+ "rougeL_ci_low": 0.06348688700968229,
797
+ "rougeL_ci_high": 0.08263629933143575,
798
+ "score_ci_low": 0.06348688700968229,
799
+ "score_ci_high": 0.08263629933143575,
800
+ "rouge2_ci_low": 0.008790586379610133,
801
+ "rouge2_ci_high": 0.016979532193850944
802
+ },
803
+ "score": 0.1739662098043649,
804
+ "score_name": "subsets_mean",
805
+ "num_of_instances": 200
806
+ },
807
+ "translation": {
808
+ "mt_flores_101_ara_eng": {
809
+ "num_of_instances": 6,
810
+ "counts": [
811
+ 120,
812
+ 72,
813
+ 51,
814
+ 37
815
+ ],
816
+ "totals": [
817
+ 261,
818
+ 255,
819
+ 249,
820
+ 243
821
+ ],
822
+ "precisions": [
823
+ 0.45977011494252873,
824
+ 0.2823529411764706,
825
+ 0.20481927710843373,
826
+ 0.1522633744855967
827
+ ],
828
+ "bp": 1.0,
829
+ "sys_len": 261,
830
+ "ref_len": 208,
831
+ "sacrebleu": 0.25224631679056625,
832
+ "score": 0.25224631679056625,
833
+ "score_name": "sacrebleu",
834
+ "score_ci_low": 0.09290380581633799,
835
+ "score_ci_high": 0.44320740082704274,
836
+ "sacrebleu_ci_low": 0.09290380581633799,
837
+ "sacrebleu_ci_high": 0.44320740082704274
838
+ },
839
+ "mt_flores_101_deu_eng": {
840
+ "num_of_instances": 6,
841
+ "counts": [
842
+ 134,
843
+ 79,
844
+ 48,
845
+ 32
846
+ ],
847
+ "totals": [
848
+ 372,
849
+ 366,
850
+ 360,
851
+ 354
852
+ ],
853
+ "precisions": [
854
+ 0.3602150537634409,
855
+ 0.21584699453551914,
856
+ 0.13333333333333333,
857
+ 0.0903954802259887
858
+ ],
859
+ "bp": 1.0,
860
+ "sys_len": 372,
861
+ "ref_len": 208,
862
+ "sacrebleu": 0.17496385110101034,
863
+ "score": 0.17496385110101034,
864
+ "score_name": "sacrebleu",
865
+ "score_ci_low": 0.10837277107576634,
866
+ "score_ci_high": 0.3346031415238713,
867
+ "sacrebleu_ci_low": 0.10837277107576634,
868
+ "sacrebleu_ci_high": 0.3346031415238713
869
+ },
870
+ "mt_flores_101_eng_ara": {
871
+ "num_of_instances": 6,
872
+ "counts": [
873
+ 103,
874
+ 53,
875
+ 32,
876
+ 17
877
+ ],
878
+ "totals": [
879
+ 210,
880
+ 204,
881
+ 198,
882
+ 192
883
+ ],
884
+ "precisions": [
885
+ 0.4904761904761905,
886
+ 0.25980392156862747,
887
+ 0.16161616161616163,
888
+ 0.08854166666666666
889
+ ],
890
+ "bp": 1.0,
891
+ "sys_len": 210,
892
+ "ref_len": 209,
893
+ "sacrebleu": 0.20664458446938855,
894
+ "score": 0.20664458446938855,
895
+ "score_name": "sacrebleu",
896
+ "score_ci_low": 0.13004548257791534,
897
+ "score_ci_high": 0.2853017991472952,
898
+ "sacrebleu_ci_low": 0.13004548257791534,
899
+ "sacrebleu_ci_high": 0.2853017991472952
900
+ },
901
+ "mt_flores_101_eng_deu": {
902
+ "num_of_instances": 6,
903
+ "counts": [
904
+ 127,
905
+ 74,
906
+ 43,
907
+ 25
908
+ ],
909
+ "totals": [
910
+ 268,
911
+ 262,
912
+ 256,
913
+ 250
914
+ ],
915
+ "precisions": [
916
+ 0.47388059701492535,
917
+ 0.2824427480916031,
918
+ 0.16796875,
919
+ 0.1
920
+ ],
921
+ "bp": 1.0,
922
+ "sys_len": 268,
923
+ "ref_len": 216,
924
+ "sacrebleu": 0.21774939719310438,
925
+ "score": 0.21774939719310438,
926
+ "score_name": "sacrebleu",
927
+ "score_ci_low": 0.10939756268894749,
928
+ "score_ci_high": 0.39371130045250813,
929
+ "sacrebleu_ci_low": 0.10939756268894749,
930
+ "sacrebleu_ci_high": 0.39371130045250813
931
+ },
932
+ "mt_flores_101_eng_fra": {
933
+ "num_of_instances": 6,
934
+ "counts": [
935
+ 166,
936
+ 120,
937
+ 88,
938
+ 67
939
+ ],
940
+ "totals": [
941
+ 251,
942
+ 245,
943
+ 239,
944
+ 233
945
+ ],
946
+ "precisions": [
947
+ 0.6613545816733069,
948
+ 0.4897959183673469,
949
+ 0.36820083682008364,
950
+ 0.28755364806866957
951
+ ],
952
+ "bp": 1.0,
953
+ "sys_len": 251,
954
+ "ref_len": 235,
955
+ "sacrebleu": 0.43034156537119933,
956
+ "score": 0.43034156537119933,
957
+ "score_name": "sacrebleu",
958
+ "score_ci_low": 0.29544019591528686,
959
+ "score_ci_high": 0.5340905539945087,
960
+ "sacrebleu_ci_low": 0.29544019591528686,
961
+ "sacrebleu_ci_high": 0.5340905539945087
962
+ },
963
+ "mt_flores_101_eng_kor": {
964
+ "num_of_instances": 6,
965
+ "counts": [
966
+ 115,
967
+ 53,
968
+ 31,
969
+ 18
970
+ ],
971
+ "totals": [
972
+ 353,
973
+ 347,
974
+ 341,
975
+ 335
976
+ ],
977
+ "precisions": [
978
+ 0.32577903682719545,
979
+ 0.1527377521613833,
980
+ 0.09090909090909091,
981
+ 0.053731343283582096
982
+ ],
983
+ "bp": 1.0,
984
+ "sys_len": 353,
985
+ "ref_len": 249,
986
+ "sacrebleu": 0.12486080646404547,
987
+ "score": 0.12486080646404547,
988
+ "score_name": "sacrebleu",
989
+ "score_ci_low": 0.058169488699475094,
990
+ "score_ci_high": 0.22645765592457381,
991
+ "sacrebleu_ci_low": 0.058169488699475094,
992
+ "sacrebleu_ci_high": 0.22645765592457381
993
+ },
994
+ "mt_flores_101_eng_por": {
995
+ "num_of_instances": 6,
996
+ "counts": [
997
+ 167,
998
+ 117,
999
+ 86,
1000
+ 64
1001
+ ],
1002
+ "totals": [
1003
+ 947,
1004
+ 941,
1005
+ 935,
1006
+ 929
1007
+ ],
1008
+ "precisions": [
1009
+ 0.17634635691657866,
1010
+ 0.12433581296493092,
1011
+ 0.09197860962566845,
1012
+ 0.0688912809472551
1013
+ ],
1014
+ "bp": 1.0,
1015
+ "sys_len": 947,
1016
+ "ref_len": 222,
1017
+ "sacrebleu": 0.1085684050344697,
1018
+ "score": 0.1085684050344697,
1019
+ "score_name": "sacrebleu",
1020
+ "score_ci_low": 0.05762493821668087,
1021
+ "score_ci_high": 0.39256037604318816,
1022
+ "sacrebleu_ci_low": 0.05762493821668087,
1023
+ "sacrebleu_ci_high": 0.39256037604318816
1024
+ },
1025
+ "mt_flores_101_eng_ron": {
1026
+ "num_of_instances": 6,
1027
+ "counts": [
1028
+ 116,
1029
+ 53,
1030
+ 29,
1031
+ 17
1032
+ ],
1033
+ "totals": [
1034
+ 234,
1035
+ 228,
1036
+ 222,
1037
+ 216
1038
+ ],
1039
+ "precisions": [
1040
+ 0.49572649572649574,
1041
+ 0.2324561403508772,
1042
+ 0.13063063063063063,
1043
+ 0.0787037037037037
1044
+ ],
1045
+ "bp": 1.0,
1046
+ "sys_len": 234,
1047
+ "ref_len": 230,
1048
+ "sacrebleu": 0.1855264510586811,
1049
+ "score": 0.1855264510586811,
1050
+ "score_name": "sacrebleu",
1051
+ "score_ci_low": 0.10495507855769916,
1052
+ "score_ci_high": 0.3369225105649729,
1053
+ "sacrebleu_ci_low": 0.10495507855769916,
1054
+ "sacrebleu_ci_high": 0.3369225105649729
1055
+ },
1056
+ "mt_flores_101_eng_spa": {
1057
+ "num_of_instances": 6,
1058
+ "counts": [
1059
+ 149,
1060
+ 79,
1061
+ 46,
1062
+ 27
1063
+ ],
1064
+ "totals": [
1065
+ 225,
1066
+ 219,
1067
+ 213,
1068
+ 207
1069
+ ],
1070
+ "precisions": [
1071
+ 0.6622222222222223,
1072
+ 0.36073059360730597,
1073
+ 0.215962441314554,
1074
+ 0.13043478260869565
1075
+ ],
1076
+ "bp": 0.9231163463866358,
1077
+ "sys_len": 225,
1078
+ "ref_len": 243,
1079
+ "sacrebleu": 0.26439061488725174,
1080
+ "score": 0.26439061488725174,
1081
+ "score_name": "sacrebleu",
1082
+ "score_ci_low": 0.23353321668371868,
1083
+ "score_ci_high": 0.2896591819189158,
1084
+ "sacrebleu_ci_low": 0.23353321668371868,
1085
+ "sacrebleu_ci_high": 0.2896591819189158
1086
+ },
1087
+ "mt_flores_101_fra_eng": {
1088
+ "num_of_instances": 6,
1089
+ "counts": [
1090
+ 156,
1091
+ 108,
1092
+ 76,
1093
+ 55
1094
+ ],
1095
+ "totals": [
1096
+ 449,
1097
+ 443,
1098
+ 437,
1099
+ 431
1100
+ ],
1101
+ "precisions": [
1102
+ 0.34743875278396436,
1103
+ 0.24379232505643342,
1104
+ 0.17391304347826086,
1105
+ 0.12761020881670534
1106
+ ],
1107
+ "bp": 1.0,
1108
+ "sys_len": 449,
1109
+ "ref_len": 208,
1110
+ "sacrebleu": 0.20822312752161715,
1111
+ "score": 0.20822312752161715,
1112
+ "score_name": "sacrebleu",
1113
+ "score_ci_low": 0.08646910197241711,
1114
+ "score_ci_high": 0.4918866743674902,
1115
+ "sacrebleu_ci_low": 0.08646910197241711,
1116
+ "sacrebleu_ci_high": 0.4918866743674902
1117
+ },
1118
+ "mt_flores_101_jpn_eng": {
1119
+ "num_of_instances": 6,
1120
+ "counts": [
1121
+ 125,
1122
+ 57,
1123
+ 33,
1124
+ 20
1125
+ ],
1126
+ "totals": [
1127
+ 386,
1128
+ 380,
1129
+ 374,
1130
+ 368
1131
+ ],
1132
+ "precisions": [
1133
+ 0.32383419689119175,
1134
+ 0.15,
1135
+ 0.08823529411764706,
1136
+ 0.05434782608695652
1137
+ ],
1138
+ "bp": 1.0,
1139
+ "sys_len": 386,
1140
+ "ref_len": 208,
1141
+ "sacrebleu": 0.12354057561268782,
1142
+ "score": 0.12354057561268782,
1143
+ "score_name": "sacrebleu",
1144
+ "score_ci_low": 0.08093532121195993,
1145
+ "score_ci_high": 0.1474721378465118,
1146
+ "sacrebleu_ci_low": 0.08093532121195993,
1147
+ "sacrebleu_ci_high": 0.1474721378465118
1148
+ },
1149
+ "mt_flores_101_kor_eng": {
1150
+ "num_of_instances": 6,
1151
+ "counts": [
1152
+ 124,
1153
+ 69,
1154
+ 40,
1155
+ 26
1156
+ ],
1157
+ "totals": [
1158
+ 261,
1159
+ 255,
1160
+ 249,
1161
+ 243
1162
+ ],
1163
+ "precisions": [
1164
+ 0.47509578544061304,
1165
+ 0.27058823529411763,
1166
+ 0.1606425702811245,
1167
+ 0.10699588477366255
1168
+ ],
1169
+ "bp": 1.0,
1170
+ "sys_len": 261,
1171
+ "ref_len": 208,
1172
+ "sacrebleu": 0.21681007101221858,
1173
+ "score": 0.21681007101221858,
1174
+ "score_name": "sacrebleu",
1175
+ "score_ci_low": 0.1453167229499001,
1176
+ "score_ci_high": 0.2575611599258459,
1177
+ "sacrebleu_ci_low": 0.1453167229499001,
1178
+ "sacrebleu_ci_high": 0.2575611599258459
1179
+ },
1180
+ "mt_flores_101_por_eng": {
1181
+ "num_of_instances": 6,
1182
+ "counts": [
1183
+ 151,
1184
+ 101,
1185
+ 68,
1186
+ 47
1187
+ ],
1188
+ "totals": [
1189
+ 298,
1190
+ 292,
1191
+ 286,
1192
+ 280
1193
+ ],
1194
+ "precisions": [
1195
+ 0.5067114093959731,
1196
+ 0.3458904109589041,
1197
+ 0.23776223776223776,
1198
+ 0.16785714285714284
1199
+ ],
1200
+ "bp": 1.0,
1201
+ "sys_len": 298,
1202
+ "ref_len": 208,
1203
+ "sacrebleu": 0.2891981283448437,
1204
+ "score": 0.2891981283448437,
1205
+ "score_name": "sacrebleu",
1206
+ "score_ci_low": 0.22977097370221483,
1207
+ "score_ci_high": 0.34262336728855886,
1208
+ "sacrebleu_ci_low": 0.22977097370221483,
1209
+ "sacrebleu_ci_high": 0.34262336728855886
1210
+ },
1211
+ "mt_flores_101_ron_eng": {
1212
+ "num_of_instances": 6,
1213
+ "counts": [
1214
+ 151,
1215
+ 94,
1216
+ 60,
1217
+ 38
1218
+ ],
1219
+ "totals": [
1220
+ 344,
1221
+ 338,
1222
+ 332,
1223
+ 326
1224
+ ],
1225
+ "precisions": [
1226
+ 0.438953488372093,
1227
+ 0.2781065088757396,
1228
+ 0.18072289156626506,
1229
+ 0.11656441717791412
1230
+ ],
1231
+ "bp": 1.0,
1232
+ "sys_len": 344,
1233
+ "ref_len": 208,
1234
+ "sacrebleu": 0.22519160969750285,
1235
+ "score": 0.22519160969750285,
1236
+ "score_name": "sacrebleu",
1237
+ "score_ci_low": 0.09845675192327814,
1238
+ "score_ci_high": 0.35805619211106726,
1239
+ "sacrebleu_ci_low": 0.09845675192327814,
1240
+ "sacrebleu_ci_high": 0.35805619211106726
1241
+ },
1242
+ "mt_flores_101_spa_eng": {
1243
+ "num_of_instances": 6,
1244
+ "counts": [
1245
+ 151,
1246
+ 98,
1247
+ 68,
1248
+ 46
1249
+ ],
1250
+ "totals": [
1251
+ 360,
1252
+ 354,
1253
+ 348,
1254
+ 342
1255
+ ],
1256
+ "precisions": [
1257
+ 0.41944444444444445,
1258
+ 0.2768361581920904,
1259
+ 0.1954022988505747,
1260
+ 0.13450292397660818
1261
+ ],
1262
+ "bp": 1.0,
1263
+ "sys_len": 360,
1264
+ "ref_len": 208,
1265
+ "sacrebleu": 0.23503885547291925,
1266
+ "score": 0.23503885547291925,
1267
+ "score_name": "sacrebleu",
1268
+ "score_ci_low": 0.10883001842131954,
1269
+ "score_ci_high": 0.39236450813500995,
1270
+ "sacrebleu_ci_low": 0.10883001842131954,
1271
+ "sacrebleu_ci_high": 0.39236450813500995
1272
+ },
1273
+ "score": 0.21755295733543376,
1274
+ "score_name": "subsets_mean",
1275
+ "num_of_instances": 90
1276
+ },
1277
+ "score": 0.47041353678026604,
1278
+ "score_name": "subsets_mean",
1279
+ "num_of_instances": 1537
1280
+ }
1281
+ }
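At the top level, the run's overall score is again an unweighted subsets_mean: the thirteen subset scores in this file (bias through translation) average to the reported 0.4704, and their num_of_instances sum to the reported 1537. A minimal sketch of that aggregation with values copied from this run; the averaging rule is inferred from the numbers rather than taken from the unitxt source:

subset_scores = {
    "bias": 0.7474747474747474,
    "chatbot_abilities": 0.6125,
    "entity_extraction": 0.2153846153846154,
    "knowledge": 0.32653061224489793,
    "legal": 0.5140309982415245,
    "news_classification": 0.4166666666666667,
    "product_help": 0.773744242158368,
    "qa_finance": 0.12,
    "rag_general": 0.5191763100507532,
    "reasoning": 0.605,
    "safety": 0.8733486187820867,
    "summarization": 0.1739662098043649,
    "translation": 0.21755295733543376,
}

# Unweighted mean over all benchmark subsets.
overall = sum(subset_scores.values()) / len(subset_scores)
print(overall)  # ~0.4704, matching the top-level "score" above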
results/bluebench/2025-07-02T15-54-03_evaluation_results.json ADDED
@@ -0,0 +1,1281 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-07-02T19:54:00.467554Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=watsonx/meta-llama/llama-3-2-11b-vision-instruct,max_tokens=1024",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "watsonx/meta-llama/llama-3-2-11b-vision-instruct",
30
+ "model_args": {
31
+ "max_tokens": 1024
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.25.0",
45
+ "unitxt_commit_hash": "c8a5e77fd6ca62039b915dc10700323f50ccaacf",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "unitxt": "1.25.0",
55
+ "absl-py": "2.3.0",
56
+ "tiktoken": "0.9.0",
57
+ "charset-normalizer": "3.4.2",
58
+ "nvidia-cuda-runtime-cu12": "12.6.77",
59
+ "sympy": "1.14.0",
60
+ "mecab-ko": "1.0.1",
61
+ "httpcore": "1.0.9",
62
+ "litellm": "1.73.6",
63
+ "Jinja2": "3.1.6",
64
+ "jsonschema-specifications": "2025.4.1",
65
+ "pydantic_core": "2.33.2",
66
+ "nvidia-cusparse-cu12": "12.5.4.2",
67
+ "tokenizers": "0.21.2",
68
+ "yarl": "1.20.1",
69
+ "portalocker": "3.2.0",
70
+ "pandas": "2.3.0",
71
+ "multiprocess": "0.70.16",
72
+ "jsonschema": "4.24.0",
73
+ "nvidia-nvjitlink-cu12": "12.6.85",
74
+ "nvidia-cublas-cu12": "12.6.4.1",
75
+ "pydantic": "2.11.7",
76
+ "async-timeout": "5.0.1",
77
+ "annotated-types": "0.7.0",
78
+ "rouge_score": "0.1.2",
79
+ "contourpy": "1.3.2",
80
+ "aiosignal": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "openai": "1.93.0",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "pyarrow": "20.0.0",
87
+ "h11": "0.16.0",
88
+ "zipp": "3.19.2",
89
+ "tzdata": "2025.2",
90
+ "bert-score": "0.3.13",
91
+ "setuptools": "80.9.0",
92
+ "referencing": "0.36.2",
93
+ "sacrebleu": "2.5.1",
94
+ "filelock": "3.18.0",
95
+ "urllib3": "2.5.0",
96
+ "scipy": "1.15.3",
97
+ "nvidia-nccl-cu12": "2.26.2",
98
+ "kiwisolver": "1.4.8",
99
+ "networkx": "3.4.2",
100
+ "typing-inspection": "0.4.1",
101
+ "sniffio": "1.3.1",
102
+ "scikit-learn": "1.7.0",
103
+ "rpds-py": "0.26.0",
104
+ "nvidia-curand-cu12": "10.3.7.77",
105
+ "pip": "25.1.1",
106
+ "pillow": "11.3.0",
107
+ "fonttools": "4.58.4",
108
+ "datasets": "3.6.0",
109
+ "nvidia-cusolver-cu12": "11.7.1.2",
110
+ "cycler": "0.12.1",
111
+ "distro": "1.9.0",
112
+ "idna": "3.10",
113
+ "MarkupSafe": "3.0.2",
114
+ "frozenlist": "1.7.0",
115
+ "pyparsing": "3.2.3",
116
+ "jiter": "0.10.0",
117
+ "importlib_metadata": "8.0.0",
118
+ "packaging": "24.2",
119
+ "psutil": "7.0.0",
120
+ "mecab-ko-dic": "1.0.0",
121
+ "joblib": "1.5.1",
122
+ "fsspec": "2025.3.0",
123
+ "dill": "0.3.8",
124
+ "wheel": "0.45.1",
125
+ "nvidia-nvtx-cu12": "12.6.77",
126
+ "nvidia-cusparselt-cu12": "0.6.3",
127
+ "lxml": "6.0.0",
128
+ "propcache": "0.3.2",
129
+ "numpy": "2.2.6",
130
+ "mpmath": "1.3.0",
131
+ "conllu": "6.0.0",
132
+ "huggingface-hub": "0.33.2",
133
+ "safetensors": "0.5.3",
134
+ "requests": "2.32.4",
135
+ "regex": "2024.11.6",
136
+ "aiohttp": "3.12.13",
137
+ "tabulate": "0.9.0",
138
+ "accelerate": "1.8.1",
139
+ "certifi": "2025.6.15",
140
+ "evaluate": "0.4.4",
141
+ "nvidia-cufft-cu12": "11.3.0.4",
142
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
143
+ "click": "8.2.1",
144
+ "typing_extensions": "4.12.2",
145
+ "attrs": "25.3.0",
146
+ "exceptiongroup": "1.3.0",
147
+ "transformers": "4.53.0",
148
+ "tenacity": "9.1.2",
149
+ "pytz": "2025.2",
150
+ "aiohappyeyeballs": "2.6.1",
151
+ "python-dateutil": "2.9.0.post0",
152
+ "torch": "2.7.1",
153
+ "python-dotenv": "1.1.1",
154
+ "multidict": "6.6.3",
155
+ "httpx": "0.28.1",
156
+ "matplotlib": "3.10.3",
157
+ "xxhash": "3.5.0",
158
+ "PyYAML": "6.0.2",
159
+ "colorama": "0.4.6",
160
+ "threadpoolctl": "3.6.0",
161
+ "nvidia-cudnn-cu12": "9.5.1.17",
162
+ "hf-xet": "1.1.5",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 1.0,
180
+ "accuracy_ci_low": 1.0,
181
+ "accuracy_ci_high": 1.0,
182
+ "score_name": "accuracy",
183
+ "score": 1.0,
184
+ "score_ci_high": 1.0,
185
+ "score_ci_low": 1.0,
186
+ "num_of_instances": 9
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 1.0,
190
+ "accuracy_ci_low": 1.0,
191
+ "accuracy_ci_high": 1.0,
192
+ "score_name": "accuracy",
193
+ "score": 1.0,
194
+ "score_ci_high": 1.0,
195
+ "score_ci_low": 1.0,
196
+ "num_of_instances": 9
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 0.8888888888888888,
200
+ "accuracy_ci_low": 0.46041936253217447,
201
+ "accuracy_ci_high": 1.0,
202
+ "score_name": "accuracy",
203
+ "score": 0.8888888888888888,
204
+ "score_ci_high": 1.0,
205
+ "score_ci_low": 0.46041936253217447,
206
+ "num_of_instances": 9
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 0.8888888888888888,
210
+ "accuracy_ci_low": 0.5310928992288233,
211
+ "accuracy_ci_high": 1.0,
212
+ "score_name": "accuracy",
213
+ "score": 0.8888888888888888,
214
+ "score_ci_high": 1.0,
215
+ "score_ci_low": 0.5310928992288233,
216
+ "num_of_instances": 9
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.8888888888888888,
220
+ "accuracy_ci_low": 0.46041936253217447,
221
+ "accuracy_ci_high": 1.0,
222
+ "score_name": "accuracy",
223
+ "score": 0.8888888888888888,
224
+ "score_ci_high": 1.0,
225
+ "score_ci_low": 0.46041936253217447,
226
+ "num_of_instances": 9
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 0.8888888888888888,
230
+ "accuracy_ci_low": 0.4444444444444444,
231
+ "accuracy_ci_high": 1.0,
232
+ "score_name": "accuracy",
233
+ "score": 0.8888888888888888,
234
+ "score_ci_high": 1.0,
235
+ "score_ci_low": 0.4444444444444444,
236
+ "num_of_instances": 9
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 1.0,
240
+ "accuracy_ci_low": 1.0,
241
+ "accuracy_ci_high": 1.0,
242
+ "score_name": "accuracy",
243
+ "score": 1.0,
244
+ "score_ci_high": 1.0,
245
+ "score_ci_low": 1.0,
246
+ "num_of_instances": 9
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.7777777777777778,
250
+ "accuracy_ci_low": 0.3333333333333333,
251
+ "accuracy_ci_high": 1.0,
252
+ "score_name": "accuracy",
253
+ "score": 0.7777777777777778,
254
+ "score_ci_high": 1.0,
255
+ "score_ci_low": 0.3333333333333333,
256
+ "num_of_instances": 9
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 0.6666666666666666,
260
+ "accuracy_ci_low": 0.2222222222222222,
261
+ "accuracy_ci_high": 0.8888888888888888,
262
+ "score_name": "accuracy",
263
+ "score": 0.6666666666666666,
264
+ "score_ci_high": 0.8888888888888888,
265
+ "score_ci_low": 0.2222222222222222,
266
+ "num_of_instances": 9
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.3333333333333333,
270
+ "accuracy_ci_low": 0.1111111111111111,
271
+ "accuracy_ci_high": 0.6666666666666666,
272
+ "score_name": "accuracy",
273
+ "score": 0.3333333333333333,
274
+ "score_ci_high": 0.6666666666666666,
275
+ "score_ci_low": 0.1111111111111111,
276
+ "num_of_instances": 9
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.7777777777777778,
280
+ "accuracy_ci_low": 0.4444444444444444,
281
+ "accuracy_ci_high": 1.0,
282
+ "score_name": "accuracy",
283
+ "score": 0.7777777777777778,
284
+ "score_ci_high": 1.0,
285
+ "score_ci_low": 0.4444444444444444,
286
+ "num_of_instances": 9
287
+ },
288
+ "score": 0.8282828282828283,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.5496688741721855,
296
+ "score": 0.5496688741721855,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.5496688741721855,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.31578947368421056,
307
+ "f1_Organization": 0.3829787234042553,
308
+ "f1_Location": 0.16666666666666666,
309
+ "f1_macro": 0.2884782879183775,
310
+ "recall_macro": 0.23576604554865424,
311
+ "precision_macro": 0.37456140350877193,
312
+ "in_classes_support": 0.8518518518518519,
313
+ "f1_micro": 0.27906976744186046,
314
+ "recall_micro": 0.24,
315
+ "precision_micro": 0.3333333333333333,
316
+ "score": 0.27906976744186046,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.1547175947670793,
319
+ "score_ci_high": 0.4313644976891571,
320
+ "f1_micro_ci_low": 0.1547175947670793,
321
+ "f1_micro_ci_high": 0.4313644976891571
322
+ },
323
+ "score": 0.27906976744186046,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.7142857142857143,
330
+ "accuracy_ci_low": 0.2857142857142857,
331
+ "accuracy_ci_high": 1.0,
332
+ "score_name": "accuracy",
333
+ "score": 0.7142857142857143,
334
+ "score_ci_high": 1.0,
335
+ "score_ci_low": 0.2857142857142857,
336
+ "num_of_instances": 7
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.14285714285714285,
340
+ "accuracy_ci_low": 0.0,
341
+ "accuracy_ci_high": 0.5714285714285714,
342
+ "score_name": "accuracy",
343
+ "score": 0.14285714285714285,
344
+ "score_ci_high": 0.5714285714285714,
345
+ "score_ci_low": 0.0,
346
+ "num_of_instances": 7
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.42857142857142855,
350
+ "accuracy_ci_low": 0.14285714285714285,
351
+ "accuracy_ci_high": 0.8571428571428571,
352
+ "score_name": "accuracy",
353
+ "score": 0.42857142857142855,
354
+ "score_ci_high": 0.8571428571428571,
355
+ "score_ci_low": 0.14285714285714285,
356
+ "num_of_instances": 7
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.8571428571428571,
360
+ "accuracy_ci_low": 0.2530277506117974,
361
+ "accuracy_ci_high": 1.0,
362
+ "score_name": "accuracy",
363
+ "score": 0.8571428571428571,
364
+ "score_ci_high": 1.0,
365
+ "score_ci_low": 0.2530277506117974,
366
+ "num_of_instances": 7
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.8571428571428571,
370
+ "accuracy_ci_low": 0.2530277506117974,
371
+ "accuracy_ci_high": 1.0,
372
+ "score_name": "accuracy",
373
+ "score": 0.8571428571428571,
374
+ "score_ci_high": 1.0,
375
+ "score_ci_low": 0.2530277506117974,
376
+ "num_of_instances": 7
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.2857142857142857,
380
+ "accuracy_ci_low": 0.0,
381
+ "accuracy_ci_high": 0.7142857142857143,
382
+ "score_name": "accuracy",
383
+ "score": 0.2857142857142857,
384
+ "score_ci_high": 0.7142857142857143,
385
+ "score_ci_low": 0.0,
386
+ "num_of_instances": 7
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.2857142857142857,
390
+ "accuracy_ci_low": 0.0,
391
+ "accuracy_ci_high": 0.7142857142857143,
392
+ "score_name": "accuracy",
393
+ "score": 0.2857142857142857,
394
+ "score_ci_high": 0.7142857142857143,
395
+ "score_ci_low": 0.0,
396
+ "num_of_instances": 7
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.0,
400
+ "accuracy_ci_low": 0.0,
401
+ "accuracy_ci_high": 0.0,
402
+ "score_name": "accuracy",
403
+ "score": 0.0,
404
+ "score_ci_high": 0.0,
405
+ "score_ci_low": 0.0,
406
+ "num_of_instances": 7
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.2857142857142857,
410
+ "accuracy_ci_low": 0.0,
411
+ "accuracy_ci_high": 0.7142857142857143,
412
+ "score_name": "accuracy",
413
+ "score": 0.2857142857142857,
414
+ "score_ci_high": 0.7142857142857143,
415
+ "score_ci_low": 0.0,
416
+ "num_of_instances": 7
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.42857142857142855,
420
+ "accuracy_ci_low": 0.14285714285714285,
421
+ "accuracy_ci_high": 0.8571428571428571,
422
+ "score_name": "accuracy",
423
+ "score": 0.42857142857142855,
424
+ "score_ci_high": 0.8571428571428571,
425
+ "score_ci_low": 0.14285714285714285,
426
+ "num_of_instances": 7
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.14285714285714285,
430
+ "accuracy_ci_low": 0.0,
431
+ "accuracy_ci_high": 0.5714285714285714,
432
+ "score_name": "accuracy",
433
+ "score": 0.14285714285714285,
434
+ "score_ci_high": 0.5714285714285714,
435
+ "score_ci_low": 0.0,
436
+ "num_of_instances": 7
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.5714285714285714,
440
+ "accuracy_ci_low": 0.14285714285714285,
441
+ "accuracy_ci_high": 0.8571428571428571,
442
+ "score_name": "accuracy",
443
+ "score": 0.5714285714285714,
444
+ "score_ci_high": 0.8571428571428571,
445
+ "score_ci_low": 0.14285714285714285,
446
+ "num_of_instances": 7
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.42857142857142855,
450
+ "accuracy_ci_low": 0.14285714285714285,
451
+ "accuracy_ci_high": 0.8571428571428571,
452
+ "score_name": "accuracy",
453
+ "score": 0.42857142857142855,
454
+ "score_ci_high": 0.8571428571428571,
455
+ "score_ci_low": 0.14285714285714285,
456
+ "num_of_instances": 7
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.5714285714285714,
460
+ "accuracy_ci_low": 0.14285714285714285,
461
+ "accuracy_ci_high": 0.8571428571428571,
462
+ "score_name": "accuracy",
463
+ "score": 0.5714285714285714,
464
+ "score_ci_high": 0.8571428571428571,
465
+ "score_ci_low": 0.14285714285714285,
466
+ "num_of_instances": 7
467
+ },
468
+ "score": 0.42857142857142855,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.4827272727272728,
475
+ "f1_suggestive": 0.0,
476
+ "f1_arbitrary": 0.36363636363636365,
477
+ "f1_generic": 0.5,
478
+ "f1_fanciful": 0.75,
479
+ "f1_descriptive": 0.8,
480
+ "f1_macro_ci_low": 0.3241848393365217,
481
+ "f1_macro_ci_high": 0.7053681544043495,
482
+ "score_name": "f1_micro",
483
+ "score": 0.5,
484
+ "score_ci_high": 0.7245939175622713,
485
+ "score_ci_low": 0.3,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.5,
488
+ "accuracy_ci_low": 0.3,
489
+ "accuracy_ci_high": 0.75,
490
+ "f1_micro": 0.5,
491
+ "f1_micro_ci_low": 0.3,
492
+ "f1_micro_ci_high": 0.7245939175622713
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.40476190476190477,
496
+ "f1_no": 0.6428571428571429,
497
+ "f1_yes": 0.16666666666666666,
498
+ "f1_macro_ci_low": 0.25925925925925924,
499
+ "f1_macro_ci_high": 0.7315052774742797,
500
+ "score_name": "f1_micro",
501
+ "score": 0.5,
502
+ "score_ci_high": 0.7,
503
+ "score_ci_low": 0.25,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.5,
506
+ "accuracy_ci_low": 0.25,
507
+ "accuracy_ci_high": 0.7,
508
+ "f1_micro": 0.5,
509
+ "f1_micro_ci_low": 0.25,
510
+ "f1_micro_ci_high": 0.7
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.23798185941043082,
514
+ "f1_conclusion": 0.2857142857142857,
515
+ "f1_analysis": 0.4444444444444444,
516
+ "f1_decree": 0.0,
517
+ "f1_issue": 0.2857142857142857,
518
+ "f1_procedural history": 0.25,
519
+ "f1_facts": 0.4,
520
+ "f1_rule": 0.0,
521
+ "f1_macro_ci_low": 0.09298296219833709,
522
+ "f1_macro_ci_high": 0.4536173552606821,
523
+ "score_name": "f1_micro",
524
+ "score": 0.3,
525
+ "score_ci_high": 0.5,
526
+ "score_ci_low": 0.11428571428571428,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.3,
529
+ "accuracy_ci_low": 0.1,
530
+ "accuracy_ci_high": 0.55,
531
+ "f1_micro": 0.3,
532
+ "f1_micro_ci_low": 0.11428571428571428,
533
+ "f1_micro_ci_high": 0.5
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.7442455242966752,
537
+ "f1_yes": 0.7058823529411765,
538
+ "f1_no": 0.782608695652174,
539
+ "f1_macro_ci_low": 0.53125,
540
+ "f1_macro_ci_high": 0.9,
541
+ "score_name": "f1_micro",
542
+ "score": 0.75,
543
+ "score_ci_high": 0.9,
544
+ "score_ci_low": 0.55,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.75,
547
+ "accuracy_ci_low": 0.55,
548
+ "accuracy_ci_high": 0.9,
549
+ "f1_micro": 0.75,
550
+ "f1_micro_ci_low": 0.55,
551
+ "f1_micro_ci_high": 0.9
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.949874686716792,
555
+ "f1_yes": 0.9473684210526315,
556
+ "f1_no": 0.9523809523809523,
557
+ "f1_macro_ci_low": 0.829059829059829,
558
+ "f1_macro_ci_high": 1.0,
559
+ "score_name": "f1_micro",
560
+ "score": 0.95,
561
+ "score_ci_high": 1.0,
562
+ "score_ci_low": 0.75,
563
+ "num_of_instances": 20,
564
+ "accuracy": 0.95,
565
+ "accuracy_ci_low": 0.75,
566
+ "accuracy_ci_high": 1.0,
567
+ "f1_micro": 0.95,
568
+ "f1_micro_ci_low": 0.75,
569
+ "f1_micro_ci_high": 1.0
570
+ },
571
+ "score": 0.6,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.48793956043956044,
578
+ "f1_cars": 0.75,
579
+ "f1_windows x": 0.3333333333333333,
580
+ "f1_computer graphics": 0.5333333333333333,
581
+ "f1_atheism": 0.0,
582
+ "f1_religion": 0.2,
583
+ "f1_medicine": 0.6666666666666666,
584
+ "f1_christianity": 0.3333333333333333,
585
+ "f1_microsoft windows": 0.6666666666666666,
586
+ "f1_middle east": 0.0,
587
+ "f1_politics": 0.4,
588
+ "f1_motorcycles": 0.6,
589
+ "f1_pc hardware": 0.6666666666666666,
590
+ "f1_mac hardware": 0.5,
591
+ "f1_electronics": 0.4,
592
+ "f1_for sale": 0.6666666666666666,
593
+ "f1_guns": 0.2857142857142857,
594
+ "f1_space": 0.75,
595
+ "f1_cryptography": 0.3333333333333333,
596
+ "f1_baseball": 0.9230769230769231,
597
+ "f1_hockey": 0.75,
598
+ "f1_macro_ci_low": 0.407195765187393,
599
+ "f1_macro_ci_high": 0.6293027637894051,
600
+ "score_name": "f1_micro",
601
+ "score": 0.5257142857142857,
602
+ "score_ci_high": 0.6204779292523145,
603
+ "score_ci_low": 0.4093567251461988,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.46,
606
+ "accuracy_ci_low": 0.36,
607
+ "accuracy_ci_high": 0.56,
608
+ "f1_micro": 0.5257142857142857,
609
+ "f1_micro_ci_low": 0.4093567251461988,
610
+ "f1_micro_ci_high": 0.6204779292523145
611
+ },
612
+ "score": 0.5257142857142857,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.7397602397602397,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9090909090909091,
620
+ "f1_credit card or prepaid card": 0.5,
621
+ "f1_money transfer or virtual currency or money service": 1.0,
622
+ "f1_mortgage": 0.6666666666666666,
623
+ "f1_debt collection": 0.6666666666666666,
624
+ "f1_checking or savings account": 0.7692307692307693,
625
+ "f1_payday loan or title loan or personal loan": 0.6666666666666666,
626
+ "f1_macro_ci_low": 0.5701557118401396,
627
+ "f1_macro_ci_high": 0.8282459464214424,
628
+ "score_name": "f1_micro",
629
+ "score": 0.8324873096446701,
630
+ "score_ci_high": 0.8968829041424552,
631
+ "score_ci_low": 0.7455465415568592,
632
+ "num_of_instances": 100,
633
+ "accuracy": 0.82,
634
+ "accuracy_ci_low": 0.73,
635
+ "accuracy_ci_high": 0.89,
636
+ "f1_micro": 0.8324873096446701,
637
+ "f1_micro_ci_low": 0.7455465415568592,
638
+ "f1_micro_ci_high": 0.8968829041424552
639
+ },
640
+ "cfpb_product_watsonx": {
641
+ "f1_macro": 0.6764242424242424,
642
+ "f1_mortgages and loans": 0.7,
643
+ "f1_credit card": 0.75,
644
+ "f1_debt collection": 0.6666666666666666,
645
+ "f1_retail banking": 0.5454545454545454,
646
+ "f1_credit reporting": 0.72,
647
+ "f1_macro_ci_low": 0.5480160542176634,
648
+ "f1_macro_ci_high": 0.8406950316324191,
649
+ "score_name": "f1_micro",
650
+ "score": 0.6938775510204082,
651
+ "score_ci_high": 0.8163265306122449,
652
+ "score_ci_low": 0.5625,
653
+ "num_of_instances": 50,
654
+ "accuracy": 0.68,
655
+ "accuracy_ci_low": 0.5433214385191588,
656
+ "accuracy_ci_high": 0.8,
657
+ "f1_micro": 0.6938775510204082,
658
+ "f1_micro_ci_low": 0.5625,
659
+ "f1_micro_ci_high": 0.8163265306122449
660
+ },
661
+ "score": 0.7631824303325392,
662
+ "score_name": "subsets_mean",
663
+ "num_of_instances": 150
664
+ },
665
+ "qa_finance": {
666
+ "fin_qa": {
667
+ "num_of_instances": 100,
668
+ "program_accuracy": 0.14,
669
+ "score": 0.14,
670
+ "score_name": "program_accuracy",
671
+ "execution_accuracy": 0.12,
672
+ "program_accuracy_ci_low": 0.08,
673
+ "program_accuracy_ci_high": 0.22,
674
+ "score_ci_low": 0.08,
675
+ "score_ci_high": 0.22,
676
+ "execution_accuracy_ci_low": 0.06,
677
+ "execution_accuracy_ci_high": 0.19
678
+ },
679
+ "score": 0.14,
680
+ "score_name": "subsets_mean",
681
+ "num_of_instances": 100
682
+ },
683
+ "rag_general": {
684
+ "rag_response_generation_clapnq": {
685
+ "precision": 0.500552716104568,
686
+ "recall": 0.5662281704569359,
687
+ "f1": 0.4873196985830952,
688
+ "precision_ci_low": 0.46495564446988846,
689
+ "precision_ci_high": 0.538144506661887,
690
+ "recall_ci_low": 0.5245643856575979,
691
+ "recall_ci_high": 0.6066933832062471,
692
+ "f1_ci_low": 0.4586779350113057,
693
+ "f1_ci_high": 0.5167011559780327,
694
+ "score_name": "f1",
695
+ "score": 0.4873196985830952,
696
+ "score_ci_high": 0.5167011559780327,
697
+ "score_ci_low": 0.4586779350113057,
698
+ "num_of_instances": 100,
699
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6777352887392044,
700
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6998053312301635,
701
+ "correctness_precision_bert_score.deberta_large_mnli": 0.6694083929061889,
702
+ "faithfullness_f1_token_overlap": 0.35991089495290096,
703
+ "faithfullness_recall_token_overlap": 0.26483690859804077,
704
+ "faithfullness_precision_token_overlap": 0.7008768712716169,
705
+ "correctness_f1_token_overlap": 0.4873196985830952,
706
+ "correctness_recall_token_overlap": 0.5662281704569359,
707
+ "correctness_precision_token_overlap": 0.500552716104568
708
+ },
709
+ "score": 0.4873196985830952,
710
+ "score_name": "subsets_mean",
711
+ "num_of_instances": 100
712
+ },
713
+ "reasoning": {
714
+ "hellaswag": {
715
+ "accuracy": 0.49,
716
+ "accuracy_ci_low": 0.39,
717
+ "accuracy_ci_high": 0.6,
718
+ "score_name": "accuracy",
719
+ "score": 0.49,
720
+ "score_ci_high": 0.6,
721
+ "score_ci_low": 0.39,
722
+ "num_of_instances": 100
723
+ },
724
+ "openbook_qa": {
725
+ "accuracy": 0.81,
726
+ "accuracy_ci_low": 0.72,
727
+ "accuracy_ci_high": 0.88,
728
+ "score_name": "accuracy",
729
+ "score": 0.81,
730
+ "score_ci_high": 0.88,
731
+ "score_ci_low": 0.72,
732
+ "num_of_instances": 100
733
+ },
734
+ "score": 0.65,
735
+ "score_name": "subsets_mean",
736
+ "num_of_instances": 200
737
+ },
738
+ "safety": {
739
+ "attaq_500": {
740
+ "severity_critical": 0.0,
741
+ "severity_high": 1.0,
742
+ "severity_medium": 0.0,
743
+ "severity_low": 99.0,
744
+ "category_harmful_info": 0.845703863050966,
745
+ "category_deception": 0.8638577507150292,
746
+ "category_explicit_content": 0.8772008714802336,
747
+ "category_violence": 0.8186040669147456,
748
+ "category_discrimination": 0.7968342805665637,
749
+ "category_substance_abuse": 0.8195977793799505,
750
+ "category_pii": 0.8361333819727103,
751
+ "safety": 0.8366672435837489,
752
+ "safety_ci_low": 0.8127443985379338,
753
+ "safety_ci_high": 0.8558573695385577,
754
+ "score_name": "safety",
755
+ "score": 0.8366672435837489,
756
+ "score_ci_high": 0.8558573695385577,
757
+ "score_ci_low": 0.8127443985379338,
758
+ "num_of_instances": 100
759
+ },
760
+ "score": 0.8366672435837489,
761
+ "score_name": "subsets_mean",
762
+ "num_of_instances": 100
763
+ },
764
+ "summarization": {
765
+ "billsum_document_filtered_to_6000_chars": {
766
+ "num_of_instances": 100,
767
+ "rougeL": 0.30152218130592817,
768
+ "score": 0.30152218130592817,
769
+ "score_name": "rougeL",
770
+ "rouge1": 0.427296186916973,
771
+ "rouge2": 0.22111987050545273,
772
+ "rougeLsum": 0.3759032427050137,
773
+ "rougeL_ci_low": 0.2828772560697914,
774
+ "rougeL_ci_high": 0.32213940888162146,
775
+ "score_ci_low": 0.2828772560697914,
776
+ "score_ci_high": 0.32213940888162146,
777
+ "rouge1_ci_low": 0.4003449322234748,
778
+ "rouge1_ci_high": 0.45049700761838773,
779
+ "rouge2_ci_low": 0.20208417932378017,
780
+ "rouge2_ci_high": 0.241342036471108,
781
+ "rougeLsum_ci_low": 0.3517918624596096,
782
+ "rougeLsum_ci_high": 0.3988365832103701
783
+ },
784
+ "tldr_document_filtered_to_6000_chars": {
785
+ "num_of_instances": 100,
786
+ "rougeL": 0.08709431647027539,
787
+ "score": 0.08709431647027539,
788
+ "score_name": "rougeL",
789
+ "rouge1": 0.11425255459330835,
790
+ "rouge2": 0.014318906129532356,
791
+ "rougeLsum": 0.09678113402710321,
792
+ "rougeL_ci_low": 0.07662012408241986,
793
+ "rougeL_ci_high": 0.09779053846312534,
794
+ "score_ci_low": 0.07662012408241986,
795
+ "score_ci_high": 0.09779053846312534,
796
+ "rouge1_ci_low": 0.09934332089630757,
797
+ "rouge1_ci_high": 0.1301844268341601,
798
+ "rouge2_ci_low": 0.010235716523078245,
799
+ "rouge2_ci_high": 0.019986121773575755,
800
+ "rougeLsum_ci_low": 0.08483297823243409,
801
+ "rougeLsum_ci_high": 0.10907235867166053
802
+ },
803
+ "score": 0.19430824888810178,
804
+ "score_name": "subsets_mean",
805
+ "num_of_instances": 200
806
+ },
807
+ "translation": {
808
+ "mt_flores_101_ara_eng": {
809
+ "num_of_instances": 6,
810
+ "counts": [
811
+ 142,
812
+ 93,
813
+ 62,
814
+ 45
815
+ ],
816
+ "totals": [
817
+ 212,
818
+ 206,
819
+ 200,
820
+ 194
821
+ ],
822
+ "precisions": [
823
+ 0.6698113207547169,
824
+ 0.4514563106796116,
825
+ 0.31,
826
+ 0.23195876288659792
827
+ ],
828
+ "bp": 1.0,
829
+ "sys_len": 212,
830
+ "ref_len": 208,
831
+ "sacrebleu": 0.3840034907750291,
832
+ "score": 0.3840034907750291,
833
+ "score_name": "sacrebleu",
834
+ "score_ci_low": 0.2100581056996822,
835
+ "score_ci_high": 0.5521030468648357,
836
+ "sacrebleu_ci_low": 0.2100581056996822,
837
+ "sacrebleu_ci_high": 0.5521030468648357
838
+ },
839
+ "mt_flores_101_deu_eng": {
840
+ "num_of_instances": 6,
841
+ "counts": [
842
+ 138,
843
+ 87,
844
+ 56,
845
+ 40
846
+ ],
847
+ "totals": [
848
+ 208,
849
+ 202,
850
+ 196,
851
+ 190
852
+ ],
853
+ "precisions": [
854
+ 0.6634615384615384,
855
+ 0.4306930693069307,
856
+ 0.28571428571428575,
857
+ 0.2105263157894737
858
+ ],
859
+ "bp": 1.0,
860
+ "sys_len": 208,
861
+ "ref_len": 208,
862
+ "sacrebleu": 0.3620807991616794,
863
+ "score": 0.3620807991616794,
864
+ "score_name": "sacrebleu",
865
+ "score_ci_low": 0.26795140193327216,
866
+ "score_ci_high": 0.527335704594173,
867
+ "sacrebleu_ci_low": 0.26795140193327216,
868
+ "sacrebleu_ci_high": 0.527335704594173
869
+ },
870
+ "mt_flores_101_eng_ara": {
871
+ "num_of_instances": 6,
872
+ "counts": [
873
+ 97,
874
+ 38,
875
+ 16,
876
+ 4
877
+ ],
878
+ "totals": [
879
+ 200,
880
+ 194,
881
+ 188,
882
+ 182
883
+ ],
884
+ "precisions": [
885
+ 0.485,
886
+ 0.19587628865979384,
887
+ 0.0851063829787234,
888
+ 0.02197802197802198
889
+ ],
890
+ "bp": 0.9559974818331,
891
+ "sys_len": 200,
892
+ "ref_len": 209,
893
+ "sacrebleu": 0.1103761734664219,
894
+ "score": 0.1103761734664219,
895
+ "score_name": "sacrebleu",
896
+ "score_ci_low": 0.06795305629679609,
897
+ "score_ci_high": 0.13678828554039632,
898
+ "sacrebleu_ci_low": 0.06795305629679609,
899
+ "sacrebleu_ci_high": 0.13678828554039632
900
+ },
901
+ "mt_flores_101_eng_deu": {
902
+ "num_of_instances": 6,
903
+ "counts": [
904
+ 132,
905
+ 74,
906
+ 46,
907
+ 29
908
+ ],
909
+ "totals": [
910
+ 211,
911
+ 205,
912
+ 199,
913
+ 193
914
+ ],
915
+ "precisions": [
916
+ 0.6255924170616114,
917
+ 0.36097560975609755,
918
+ 0.23115577889447236,
919
+ 0.15025906735751296
920
+ ],
921
+ "bp": 0.9765818792478103,
922
+ "sys_len": 211,
923
+ "ref_len": 216,
924
+ "sacrebleu": 0.2906279350894808,
925
+ "score": 0.2906279350894808,
926
+ "score_name": "sacrebleu",
927
+ "score_ci_low": 0.18958150146197084,
928
+ "score_ci_high": 0.39899291487146277,
929
+ "sacrebleu_ci_low": 0.18958150146197084,
930
+ "sacrebleu_ci_high": 0.39899291487146277
931
+ },
932
+ "mt_flores_101_eng_fra": {
933
+ "num_of_instances": 6,
934
+ "counts": [
935
+ 186,
936
+ 139,
937
+ 111,
938
+ 88
939
+ ],
940
+ "totals": [
941
+ 239,
942
+ 233,
943
+ 227,
944
+ 221
945
+ ],
946
+ "precisions": [
947
+ 0.7782426778242677,
948
+ 0.5965665236051502,
949
+ 0.48898678414096913,
950
+ 0.3981900452488688
951
+ ],
952
+ "bp": 1.0,
953
+ "sys_len": 239,
954
+ "ref_len": 235,
955
+ "sacrebleu": 0.5483279207536483,
956
+ "score": 0.5483279207536483,
957
+ "score_name": "sacrebleu",
958
+ "score_ci_low": 0.4862568946611405,
959
+ "score_ci_high": 0.6566738073902261,
960
+ "sacrebleu_ci_low": 0.4862568946611405,
961
+ "sacrebleu_ci_high": 0.6566738073902261
962
+ },
963
+ "mt_flores_101_eng_kor": {
964
+ "num_of_instances": 6,
965
+ "counts": [
966
+ 147,
967
+ 79,
968
+ 45,
969
+ 26
970
+ ],
971
+ "totals": [
972
+ 276,
973
+ 270,
974
+ 264,
975
+ 258
976
+ ],
977
+ "precisions": [
978
+ 0.532608695652174,
979
+ 0.29259259259259257,
980
+ 0.17045454545454547,
981
+ 0.10077519379844961
982
+ ],
983
+ "bp": 1.0,
984
+ "sys_len": 276,
985
+ "ref_len": 249,
986
+ "sacrebleu": 0.22746178984187068,
987
+ "score": 0.22746178984187068,
988
+ "score_name": "sacrebleu",
989
+ "score_ci_low": 0.1515121458400683,
990
+ "score_ci_high": 0.3433934343793815,
991
+ "sacrebleu_ci_low": 0.1515121458400683,
992
+ "sacrebleu_ci_high": 0.3433934343793815
993
+ },
994
+ "mt_flores_101_eng_por": {
995
+ "num_of_instances": 6,
996
+ "counts": [
997
+ 180,
998
+ 137,
999
+ 114,
1000
+ 94
1001
+ ],
1002
+ "totals": [
1003
+ 230,
1004
+ 224,
1005
+ 218,
1006
+ 212
1007
+ ],
1008
+ "precisions": [
1009
+ 0.782608695652174,
1010
+ 0.6116071428571428,
1011
+ 0.5229357798165137,
1012
+ 0.44339622641509435
1013
+ ],
1014
+ "bp": 1.0,
1015
+ "sys_len": 230,
1016
+ "ref_len": 222,
1017
+ "sacrebleu": 0.577184144169166,
1018
+ "score": 0.577184144169166,
1019
+ "score_name": "sacrebleu",
1020
+ "score_ci_low": 0.45366408310756157,
1021
+ "score_ci_high": 0.7556099912023511,
1022
+ "sacrebleu_ci_low": 0.45366408310756157,
1023
+ "sacrebleu_ci_high": 0.7556099912023511
1024
+ },
1025
+ "mt_flores_101_eng_ron": {
1026
+ "num_of_instances": 6,
1027
+ "counts": [
1028
+ 152,
1029
+ 98,
1030
+ 68,
1031
+ 48
1032
+ ],
1033
+ "totals": [
1034
+ 231,
1035
+ 225,
1036
+ 219,
1037
+ 213
1038
+ ],
1039
+ "precisions": [
1040
+ 0.658008658008658,
1041
+ 0.4355555555555556,
1042
+ 0.3105022831050228,
1043
+ 0.22535211267605632
1044
+ ],
1045
+ "bp": 1.0,
1046
+ "sys_len": 231,
1047
+ "ref_len": 230,
1048
+ "sacrebleu": 0.376314020529345,
1049
+ "score": 0.376314020529345,
1050
+ "score_name": "sacrebleu",
1051
+ "score_ci_low": 0.30639304323698224,
1052
+ "score_ci_high": 0.5125652659997619,
1053
+ "sacrebleu_ci_low": 0.30639304323698224,
1054
+ "sacrebleu_ci_high": 0.5125652659997619
1055
+ },
1056
+ "mt_flores_101_eng_spa": {
1057
+ "num_of_instances": 6,
1058
+ "counts": [
1059
+ 156,
1060
+ 83,
1061
+ 52,
1062
+ 33
1063
+ ],
1064
+ "totals": [
1065
+ 236,
1066
+ 230,
1067
+ 224,
1068
+ 218
1069
+ ],
1070
+ "precisions": [
1071
+ 0.6610169491525424,
1072
+ 0.36086956521739133,
1073
+ 0.23214285714285715,
1074
+ 0.1513761467889908
1075
+ ],
1076
+ "bp": 0.9707745538991623,
1077
+ "sys_len": 236,
1078
+ "ref_len": 243,
1079
+ "sacrebleu": 0.293739458441621,
1080
+ "score": 0.293739458441621,
1081
+ "score_name": "sacrebleu",
1082
+ "score_ci_low": 0.23966237592164494,
1083
+ "score_ci_high": 0.35086499609922206,
1084
+ "sacrebleu_ci_low": 0.23966237592164494,
1085
+ "sacrebleu_ci_high": 0.35086499609922206
1086
+ },
1087
+ "mt_flores_101_fra_eng": {
1088
+ "num_of_instances": 6,
1089
+ "counts": [
1090
+ 163,
1091
+ 118,
1092
+ 89,
1093
+ 68
1094
+ ],
1095
+ "totals": [
1096
+ 214,
1097
+ 208,
1098
+ 202,
1099
+ 196
1100
+ ],
1101
+ "precisions": [
1102
+ 0.7616822429906541,
1103
+ 0.5673076923076923,
1104
+ 0.4405940594059406,
1105
+ 0.3469387755102041
1106
+ ],
1107
+ "bp": 1.0,
1108
+ "sys_len": 214,
1109
+ "ref_len": 208,
1110
+ "sacrebleu": 0.506956849848214,
1111
+ "score": 0.506956849848214,
1112
+ "score_name": "sacrebleu",
1113
+ "score_ci_low": 0.4290895505982841,
1114
+ "score_ci_high": 0.569252994558963,
1115
+ "sacrebleu_ci_low": 0.4290895505982841,
1116
+ "sacrebleu_ci_high": 0.569252994558963
1117
+ },
1118
+ "mt_flores_101_jpn_eng": {
1119
+ "num_of_instances": 6,
1120
+ "counts": [
1121
+ 135,
1122
+ 82,
1123
+ 50,
1124
+ 31
1125
+ ],
1126
+ "totals": [
1127
+ 200,
1128
+ 194,
1129
+ 188,
1130
+ 182
1131
+ ],
1132
+ "precisions": [
1133
+ 0.675,
1134
+ 0.422680412371134,
1135
+ 0.26595744680851063,
1136
+ 0.17032967032967034
1137
+ ],
1138
+ "bp": 0.9607894391523232,
1139
+ "sys_len": 200,
1140
+ "ref_len": 208,
1141
+ "sacrebleu": 0.3239535994642879,
1142
+ "score": 0.3239535994642879,
1143
+ "score_name": "sacrebleu",
1144
+ "score_ci_low": 0.18684223420457388,
1145
+ "score_ci_high": 0.43722879686411203,
1146
+ "sacrebleu_ci_low": 0.18684223420457388,
1147
+ "sacrebleu_ci_high": 0.43722879686411203
1148
+ },
1149
+ "mt_flores_101_kor_eng": {
1150
+ "num_of_instances": 6,
1151
+ "counts": [
1152
+ 125,
1153
+ 63,
1154
+ 37,
1155
+ 25
1156
+ ],
1157
+ "totals": [
1158
+ 194,
1159
+ 188,
1160
+ 182,
1161
+ 176
1162
+ ],
1163
+ "precisions": [
1164
+ 0.6443298969072165,
1165
+ 0.3351063829787234,
1166
+ 0.2032967032967033,
1167
+ 0.14204545454545456
1168
+ ],
1169
+ "bp": 0.9303774188371497,
1170
+ "sys_len": 194,
1171
+ "ref_len": 208,
1172
+ "sacrebleu": 0.2614395733711572,
1173
+ "score": 0.2614395733711572,
1174
+ "score_name": "sacrebleu",
1175
+ "score_ci_low": 0.1591899003750447,
1176
+ "score_ci_high": 0.4179703924406552,
1177
+ "sacrebleu_ci_low": 0.1591899003750447,
1178
+ "sacrebleu_ci_high": 0.4179703924406552
1179
+ },
1180
+ "mt_flores_101_por_eng": {
1181
+ "num_of_instances": 6,
1182
+ "counts": [
1183
+ 144,
1184
+ 95,
1185
+ 67,
1186
+ 47
1187
+ ],
1188
+ "totals": [
1189
+ 210,
1190
+ 204,
1191
+ 198,
1192
+ 192
1193
+ ],
1194
+ "precisions": [
1195
+ 0.6857142857142857,
1196
+ 0.46568627450980393,
1197
+ 0.3383838383838384,
1198
+ 0.24479166666666669
1199
+ ],
1200
+ "bp": 1.0,
1201
+ "sys_len": 210,
1202
+ "ref_len": 208,
1203
+ "sacrebleu": 0.4032837466725613,
1204
+ "score": 0.4032837466725613,
1205
+ "score_name": "sacrebleu",
1206
+ "score_ci_low": 0.24630581095599025,
1207
+ "score_ci_high": 0.504397426389732,
1208
+ "sacrebleu_ci_low": 0.24630581095599025,
1209
+ "sacrebleu_ci_high": 0.504397426389732
1210
+ },
1211
+ "mt_flores_101_ron_eng": {
1212
+ "num_of_instances": 6,
1213
+ "counts": [
1214
+ 143,
1215
+ 88,
1216
+ 57,
1217
+ 38
1218
+ ],
1219
+ "totals": [
1220
+ 211,
1221
+ 205,
1222
+ 199,
1223
+ 193
1224
+ ],
1225
+ "precisions": [
1226
+ 0.6777251184834124,
1227
+ 0.4292682926829269,
1228
+ 0.2864321608040201,
1229
+ 0.19689119170984454
1230
+ ],
1231
+ "bp": 1.0,
1232
+ "sys_len": 211,
1233
+ "ref_len": 208,
1234
+ "sacrebleu": 0.35789663494122353,
1235
+ "score": 0.35789663494122353,
1236
+ "score_name": "sacrebleu",
1237
+ "score_ci_low": 0.2533694746652965,
1238
+ "score_ci_high": 0.4744985190790942,
1239
+ "sacrebleu_ci_low": 0.2533694746652965,
1240
+ "sacrebleu_ci_high": 0.4744985190790942
1241
+ },
1242
+ "mt_flores_101_spa_eng": {
1243
+ "num_of_instances": 6,
1244
+ "counts": [
1245
+ 131,
1246
+ 74,
1247
+ 43,
1248
+ 28
1249
+ ],
1250
+ "totals": [
1251
+ 218,
1252
+ 212,
1253
+ 206,
1254
+ 200
1255
+ ],
1256
+ "precisions": [
1257
+ 0.6009174311926606,
1258
+ 0.34905660377358494,
1259
+ 0.2087378640776699,
1260
+ 0.14
1261
+ ],
1262
+ "bp": 1.0,
1263
+ "sys_len": 218,
1264
+ "ref_len": 208,
1265
+ "sacrebleu": 0.27980790701338565,
1266
+ "score": 0.27980790701338565,
1267
+ "score_name": "sacrebleu",
1268
+ "score_ci_low": 0.19998915632633937,
1269
+ "score_ci_high": 0.3529673644044,
1270
+ "sacrebleu_ci_low": 0.19998915632633937,
1271
+ "sacrebleu_ci_high": 0.3529673644044
1272
+ },
1273
+ "score": 0.35356360290260613,
1274
+ "score_name": "subsets_mean",
1275
+ "num_of_instances": 90
1276
+ },
1277
+ "score": 0.5104883391132831,
1278
+ "score_name": "subsets_mean",
1279
+ "num_of_instances": 1537
1280
+ }
1281
+ }
results/bluebench/{2025-06-23T04-42-35_evaluation_results.json → 2025-07-02T16-08-27_evaluation_results.json} RENAMED
@@ -1,6 +1,6 @@
1
  {
2
  "environment_info": {
3
- "timestamp_utc": "2025-06-23T08:42:31.876970Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
@@ -8,7 +8,7 @@
8
  "--model",
9
  "cross_provider",
10
  "--model_args",
11
- "model_name=watsonx/ibm/granite-3-8b-instruct,max_tokens=256",
12
  "--output_path",
13
  "./results/bluebench",
14
  "--log_samples",
@@ -26,9 +26,9 @@
26
  "num_fewshots": null,
27
  "limit": null,
28
  "batch_size": 8,
29
- "model": "watsonx/ibm/granite-3-8b-instruct",
30
  "model_args": {
31
- "max_tokens": 256
32
  },
33
  "gen_kwargs": null,
34
  "chat_template_kwargs": null,
@@ -41,8 +41,8 @@
41
  "disable_hf_cache": false,
42
  "cache_dir": null
43
  },
44
- "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
@@ -51,25 +51,25 @@
51
  "triton": "3.3.1",
52
  "nltk": "3.9.1",
53
  "anyio": "4.9.0",
 
54
  "absl-py": "2.3.0",
55
  "tiktoken": "0.9.0",
56
  "charset-normalizer": "3.4.2",
57
  "nvidia-cuda-runtime-cu12": "12.6.77",
58
  "sympy": "1.14.0",
59
  "mecab-ko": "1.0.1",
60
- "litellm": "1.72.6.post1",
61
  "httpcore": "1.0.9",
 
62
  "Jinja2": "3.1.6",
63
  "jsonschema-specifications": "2025.4.1",
64
  "pydantic_core": "2.33.2",
65
  "nvidia-cusparse-cu12": "12.5.4.2",
 
66
  "yarl": "1.20.1",
67
- "openai": "1.88.0",
68
  "portalocker": "3.2.0",
69
  "pandas": "2.3.0",
70
  "multiprocess": "0.70.16",
71
  "jsonschema": "4.24.0",
72
- "unitxt": "1.24.0",
73
  "nvidia-nvjitlink-cu12": "12.6.85",
74
  "nvidia-cublas-cu12": "12.6.4.1",
75
  "pydantic": "2.11.7",
@@ -79,7 +79,7 @@
79
  "contourpy": "1.3.2",
80
  "aiosignal": "1.3.2",
81
  "nvidia-cuda-cupti-cu12": "12.6.80",
82
- "pillow": "11.2.1",
83
  "six": "1.17.0",
84
  "diskcache": "5.6.3",
85
  "tqdm": "4.67.1",
@@ -98,17 +98,16 @@
98
  "kiwisolver": "1.4.8",
99
  "networkx": "3.4.2",
100
  "typing-inspection": "0.4.1",
101
- "lxml": "5.4.0",
102
  "sniffio": "1.3.1",
103
  "scikit-learn": "1.7.0",
 
104
  "nvidia-curand-cu12": "10.3.7.77",
105
  "pip": "25.1.1",
 
106
  "fonttools": "4.58.4",
107
- "transformers": "4.52.4",
108
  "datasets": "3.6.0",
109
  "nvidia-cusolver-cu12": "11.7.1.2",
110
  "cycler": "0.12.1",
111
- "evaluate": "0.4.3",
112
  "distro": "1.9.0",
113
  "idna": "3.10",
114
  "MarkupSafe": "3.0.2",
@@ -122,44 +121,45 @@
122
  "joblib": "1.5.1",
123
  "fsspec": "2025.3.0",
124
  "dill": "0.3.8",
125
- "tokenizers": "0.21.1",
126
  "wheel": "0.45.1",
127
  "nvidia-nvtx-cu12": "12.6.77",
128
  "nvidia-cusparselt-cu12": "0.6.3",
129
- "hf-xet": "1.1.4",
130
  "propcache": "0.3.2",
131
  "numpy": "2.2.6",
132
  "mpmath": "1.3.0",
133
- "multidict": "6.5.0",
134
  "conllu": "6.0.0",
 
135
  "safetensors": "0.5.3",
136
  "requests": "2.32.4",
137
  "regex": "2024.11.6",
138
  "aiohttp": "3.12.13",
139
  "tabulate": "0.9.0",
 
140
  "certifi": "2025.6.15",
141
- "accelerate": "1.8.0",
142
  "nvidia-cufft-cu12": "11.3.0.4",
143
  "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
  "click": "8.2.1",
145
  "typing_extensions": "4.12.2",
146
  "attrs": "25.3.0",
147
  "exceptiongroup": "1.3.0",
 
148
  "tenacity": "9.1.2",
149
  "pytz": "2025.2",
150
  "aiohappyeyeballs": "2.6.1",
151
  "python-dateutil": "2.9.0.post0",
152
  "torch": "2.7.1",
153
- "python-dotenv": "1.1.0",
 
154
  "httpx": "0.28.1",
155
  "matplotlib": "3.10.3",
156
  "xxhash": "3.5.0",
157
  "PyYAML": "6.0.2",
158
- "huggingface-hub": "0.33.0",
159
  "colorama": "0.4.6",
160
- "rpds-py": "0.25.1",
161
  "threadpoolctl": "3.6.0",
162
  "nvidia-cudnn-cu12": "9.5.1.17",
 
163
  "jaraco.collections": "5.1.0",
164
  "tomli": "2.0.1",
165
  "backports.tarfile": "1.2.0",
@@ -176,1108 +176,1106 @@
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
- "accuracy": 0.6,
180
- "accuracy_ci_low": 0.4961662149523231,
181
- "accuracy_ci_high": 0.6888888888888889,
182
  "score_name": "accuracy",
183
- "score": 0.6,
184
- "score_ci_high": 0.6888888888888889,
185
- "score_ci_low": 0.4961662149523231,
186
- "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
- "accuracy": 0.6888888888888889,
190
- "accuracy_ci_low": 0.5777777777777777,
191
- "accuracy_ci_high": 0.7666666666666667,
192
  "score_name": "accuracy",
193
- "score": 0.6888888888888889,
194
- "score_ci_high": 0.7666666666666667,
195
- "score_ci_low": 0.5777777777777777,
196
- "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
199
- "accuracy": 0.9,
200
- "accuracy_ci_low": 0.8222222222222222,
201
- "accuracy_ci_high": 0.9555555555555556,
202
  "score_name": "accuracy",
203
- "score": 0.9,
204
- "score_ci_high": 0.9555555555555556,
205
- "score_ci_low": 0.8222222222222222,
206
- "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
- "accuracy": 0.6888888888888889,
210
- "accuracy_ci_low": 0.5780895036995246,
211
- "accuracy_ci_high": 0.7888888888888889,
212
  "score_name": "accuracy",
213
- "score": 0.6888888888888889,
214
- "score_ci_high": 0.7888888888888889,
215
- "score_ci_low": 0.5780895036995246,
216
- "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
- "accuracy": 0.7888888888888889,
220
- "accuracy_ci_low": 0.689667704010142,
221
- "accuracy_ci_high": 0.8555555555555555,
222
  "score_name": "accuracy",
223
- "score": 0.7888888888888889,
224
- "score_ci_high": 0.8555555555555555,
225
- "score_ci_low": 0.689667704010142,
226
- "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
- "accuracy": 0.9222222222222223,
230
- "accuracy_ci_low": 0.8444444444444444,
231
- "accuracy_ci_high": 0.9666666666666667,
232
  "score_name": "accuracy",
233
- "score": 0.9222222222222223,
234
- "score_ci_high": 0.9666666666666667,
235
- "score_ci_low": 0.8444444444444444,
236
- "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
239
- "accuracy": 0.9,
240
- "accuracy_ci_low": 0.8222222222222222,
241
- "accuracy_ci_high": 0.9555555555555556,
242
  "score_name": "accuracy",
243
- "score": 0.9,
244
- "score_ci_high": 0.9555555555555556,
245
- "score_ci_low": 0.8222222222222222,
246
- "num_of_instances": 90
247
  },
248
  "safety_bbq_race_x_ses": {
249
- "accuracy": 0.9555555555555556,
250
- "accuracy_ci_low": 0.9,
251
- "accuracy_ci_high": 0.9888888888888889,
252
  "score_name": "accuracy",
253
- "score": 0.9555555555555556,
254
- "score_ci_high": 0.9888888888888889,
255
- "score_ci_low": 0.9,
256
- "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
- "accuracy": 0.8222222222222222,
260
- "accuracy_ci_low": 0.7333333333333333,
261
  "accuracy_ci_high": 0.8888888888888888,
262
  "score_name": "accuracy",
263
- "score": 0.8222222222222222,
264
  "score_ci_high": 0.8888888888888888,
265
- "score_ci_low": 0.7333333333333333,
266
- "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
- "accuracy": 0.6777777777777778,
270
- "accuracy_ci_low": 0.5777777777777777,
271
- "accuracy_ci_high": 0.7666666666666667,
272
  "score_name": "accuracy",
273
- "score": 0.6777777777777778,
274
- "score_ci_high": 0.7666666666666667,
275
- "score_ci_low": 0.5777777777777777,
276
- "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.8333333333333334,
280
- "accuracy_ci_low": 0.7555555555555555,
281
- "accuracy_ci_high": 0.9077323275921318,
282
  "score_name": "accuracy",
283
- "score": 0.8333333333333334,
284
- "score_ci_high": 0.9077323275921318,
285
- "score_ci_low": 0.7555555555555555,
286
- "num_of_instances": 90
287
  },
288
- "score": 0.797979797979798,
289
  "score_name": "subsets_mean",
290
- "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
- "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.07200720072007201,
296
- "score": 0.07200720072007201,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
- "score": 0.07200720072007201,
300
  "score_name": "subsets_mean",
301
- "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
- "num_of_instances": 1000,
306
- "f1_Person": 0.5089820359281437,
307
- "f1_Organization": 0.3546511627906977,
308
- "f1_Location": 0.3474903474903475,
309
- "f1_macro": 0.4037078487363963,
310
- "recall_macro": 0.3583554354766996,
311
- "precision_macro": 0.4822578777124232,
312
- "in_classes_support": 0.5928057553956835,
313
- "f1_micro": 0.31311475409836065,
314
- "recall_micro": 0.3638095238095238,
315
- "precision_micro": 0.27482014388489207,
316
- "score": 0.31311475409836065,
317
  "score_name": "f1_micro",
318
- "score_ci_low": 0.2581926050371807,
319
- "score_ci_high": 0.35574639217016485,
320
- "f1_micro_ci_low": 0.2581926050371807,
321
- "f1_micro_ci_high": 0.35574639217016485
322
  },
323
- "score": 0.31311475409836065,
324
  "score_name": "subsets_mean",
325
- "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
- "accuracy": 0.5633802816901409,
330
- "accuracy_ci_low": 0.4647887323943662,
331
- "accuracy_ci_high": 0.676056338028169,
332
  "score_name": "accuracy",
333
- "score": 0.5633802816901409,
334
- "score_ci_high": 0.676056338028169,
335
- "score_ci_low": 0.4647887323943662,
336
- "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
- "accuracy": 0.2535211267605634,
340
- "accuracy_ci_low": 0.15492957746478872,
341
- "accuracy_ci_high": 0.36619718309859156,
342
  "score_name": "accuracy",
343
- "score": 0.2535211267605634,
344
- "score_ci_high": 0.36619718309859156,
345
- "score_ci_low": 0.15492957746478872,
346
- "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
- "accuracy": 0.23943661971830985,
350
- "accuracy_ci_low": 0.15492957746478872,
351
- "accuracy_ci_high": 0.36619718309859156,
352
  "score_name": "accuracy",
353
- "score": 0.23943661971830985,
354
- "score_ci_high": 0.36619718309859156,
355
- "score_ci_low": 0.15492957746478872,
356
- "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
- "accuracy": 0.352112676056338,
360
- "accuracy_ci_low": 0.2535211267605634,
361
- "accuracy_ci_high": 0.47475562822206696,
362
  "score_name": "accuracy",
363
- "score": 0.352112676056338,
364
- "score_ci_high": 0.47475562822206696,
365
- "score_ci_low": 0.2535211267605634,
366
- "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
- "accuracy": 0.4647887323943662,
370
- "accuracy_ci_low": 0.3380281690140845,
371
- "accuracy_ci_high": 0.5774647887323944,
372
  "score_name": "accuracy",
373
- "score": 0.4647887323943662,
374
- "score_ci_high": 0.5774647887323944,
375
- "score_ci_low": 0.3380281690140845,
376
- "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
379
- "accuracy": 0.2535211267605634,
380
- "accuracy_ci_low": 0.15492957746478872,
381
- "accuracy_ci_high": 0.36619718309859156,
382
  "score_name": "accuracy",
383
- "score": 0.2535211267605634,
384
- "score_ci_high": 0.36619718309859156,
385
- "score_ci_low": 0.15492957746478872,
386
- "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
- "accuracy": 0.352112676056338,
390
- "accuracy_ci_low": 0.23943661971830985,
391
- "accuracy_ci_high": 0.4647887323943662,
392
  "score_name": "accuracy",
393
- "score": 0.352112676056338,
394
- "score_ci_high": 0.4647887323943662,
395
- "score_ci_low": 0.23943661971830985,
396
- "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
- "accuracy": 0.43661971830985913,
400
- "accuracy_ci_low": 0.323943661971831,
401
- "accuracy_ci_high": 0.5633802816901409,
402
  "score_name": "accuracy",
403
- "score": 0.43661971830985913,
404
- "score_ci_high": 0.5633802816901409,
405
- "score_ci_low": 0.323943661971831,
406
- "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
- "accuracy": 0.30985915492957744,
410
- "accuracy_ci_low": 0.2112676056338028,
411
- "accuracy_ci_high": 0.4225352112676056,
412
  "score_name": "accuracy",
413
- "score": 0.30985915492957744,
414
- "score_ci_high": 0.4225352112676056,
415
- "score_ci_low": 0.2112676056338028,
416
- "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
- "accuracy": 0.11267605633802817,
420
- "accuracy_ci_low": 0.056338028169014086,
421
- "accuracy_ci_high": 0.19718309859154928,
422
  "score_name": "accuracy",
423
- "score": 0.11267605633802817,
424
- "score_ci_high": 0.19718309859154928,
425
- "score_ci_low": 0.056338028169014086,
426
- "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
- "accuracy": 0.323943661971831,
430
- "accuracy_ci_low": 0.22193333267792992,
431
- "accuracy_ci_high": 0.43661971830985913,
432
  "score_name": "accuracy",
433
- "score": 0.323943661971831,
434
- "score_ci_high": 0.43661971830985913,
435
- "score_ci_low": 0.22193333267792992,
436
- "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
- "accuracy": 0.38028169014084506,
440
- "accuracy_ci_low": 0.2535211267605634,
441
- "accuracy_ci_high": 0.49295774647887325,
442
  "score_name": "accuracy",
443
- "score": 0.38028169014084506,
444
- "score_ci_high": 0.49295774647887325,
445
- "score_ci_low": 0.2535211267605634,
446
- "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
- "accuracy": 0.16901408450704225,
450
- "accuracy_ci_low": 0.09859154929577464,
451
- "accuracy_ci_high": 0.2676056338028169,
452
  "score_name": "accuracy",
453
- "score": 0.16901408450704225,
454
- "score_ci_high": 0.2676056338028169,
455
- "score_ci_low": 0.09859154929577464,
456
- "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
- "accuracy": 0.5492957746478874,
460
- "accuracy_ci_low": 0.43661971830985913,
461
- "accuracy_ci_high": 0.6619718309859155,
462
  "score_name": "accuracy",
463
- "score": 0.5492957746478874,
464
- "score_ci_high": 0.6619718309859155,
465
- "score_ci_low": 0.43661971830985913,
466
- "num_of_instances": 71
467
  },
468
- "score": 0.34004024144869216,
469
  "score_name": "subsets_mean",
470
- "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
- "f1_macro": 0.38313112869412325,
475
- "f1_suggestive": 0.4,
476
- "f1_descriptive": 0.49056603773584906,
477
- "f1_generic": 0.1111111111111111,
478
- "f1_fanciful": 0.5806451612903226,
479
- "f1_arbitrary": 0.3333333333333333,
480
- "f1_macro_ci_low": 0.293497629073193,
481
- "f1_macro_ci_high": 0.49184198170551063,
482
  "score_name": "f1_micro",
483
- "score": 0.41420118343195267,
484
- "score_ci_high": 0.5176470588235295,
485
- "score_ci_low": 0.3058823529411765,
486
- "num_of_instances": 85,
487
- "accuracy": 0.4117647058823529,
488
- "accuracy_ci_low": 0.3058823529411765,
489
- "accuracy_ci_high": 0.5176470588235295,
490
- "f1_micro": 0.41420118343195267,
491
- "f1_micro_ci_low": 0.3058823529411765,
492
- "f1_micro_ci_high": 0.5176470588235295
493
  },
494
  "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.5893994540491356,
496
- "f1_no": 0.821656050955414,
497
- "f1_yes": 0.35714285714285715,
498
- "f1_macro_ci_low": 0.5097301675555063,
499
- "f1_macro_ci_high": 0.6745862952621396,
500
  "score_name": "f1_micro",
501
- "score": 0.7236180904522613,
502
- "score_ci_high": 0.7788944723618091,
503
- "score_ci_low": 0.6595134689262127,
504
- "num_of_instances": 200,
505
- "accuracy": 0.72,
506
- "accuracy_ci_low": 0.655,
507
- "accuracy_ci_high": 0.775,
508
- "f1_micro": 0.7236180904522613,
509
- "f1_micro_ci_low": 0.6595134689262127,
510
- "f1_micro_ci_high": 0.7788944723618091
511
  },
512
  "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.27144552754252615,
514
- "f1_conclusion": 0.07547169811320754,
515
- "f1_issue": 0.3466666666666667,
516
- "f1_decree": 0.30303030303030304,
517
- "f1_rule": 0.475,
518
- "f1_analysis": 0.2608695652173913,
519
- "f1_facts": 0.26666666666666666,
520
- "f1_procedural history": 0.1724137931034483,
521
- "f1_macro_ci_low": 0.2140755773346065,
522
- "f1_macro_ci_high": 0.33976570868629163,
523
  "score_name": "f1_micro",
524
- "score": 0.28717948717948716,
525
- "score_ci_high": 0.35384615384615387,
526
- "score_ci_low": 0.22363125007282936,
527
- "num_of_instances": 200,
528
- "accuracy": 0.28,
529
- "accuracy_ci_low": 0.215,
530
- "accuracy_ci_high": 0.345,
531
- "f1_micro": 0.28717948717948716,
532
- "f1_micro_ci_low": 0.22363125007282936,
533
- "f1_micro_ci_high": 0.35384615384615387
534
  },
535
  "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.4629294755877034,
537
- "f1_yes": 0.5714285714285714,
538
- "f1_no": 0.35443037974683544,
539
- "f1_macro_ci_low": 0.3950714088005718,
540
- "f1_macro_ci_high": 0.5297273754379386,
541
  "score_name": "f1_micro",
542
- "score": 0.48484848484848486,
543
- "score_ci_high": 0.5532994923857868,
544
- "score_ci_low": 0.4143244965787704,
545
- "num_of_instances": 200,
546
- "accuracy": 0.48,
547
- "accuracy_ci_low": 0.41,
548
- "accuracy_ci_high": 0.5461813537103201,
549
- "f1_micro": 0.48484848484848486,
550
- "f1_micro_ci_low": 0.4143244965787704,
551
- "f1_micro_ci_high": 0.5532994923857868
552
  },
553
  "legalbench_proa": {
554
- "f1_macro": 0.8078127879122904,
555
- "f1_yes": 0.7761194029850746,
556
- "f1_no": 0.8395061728395061,
557
- "f1_macro_ci_low": 0.7164632895646129,
558
- "f1_macro_ci_high": 0.8689798909122983,
559
  "score_name": "f1_micro",
560
- "score": 0.8108108108108109,
561
- "score_ci_high": 0.8701298701298701,
562
- "score_ci_low": 0.7222222222222222,
563
- "num_of_instances": 85,
564
- "accuracy": 0.7058823529411765,
565
- "accuracy_ci_low": 0.6,
566
- "accuracy_ci_high": 0.788235294117647,
567
- "f1_micro": 0.8108108108108109,
568
- "f1_micro_ci_low": 0.7222222222222222,
569
- "f1_micro_ci_high": 0.8701298701298701
570
  },
571
- "score": 0.5441316113445994,
572
  "score_name": "subsets_mean",
573
- "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
- "f1_macro": 0.4951686908402676,
578
- "f1_cars": 0.735632183908046,
579
- "f1_pc hardware": 0.4,
580
- "f1_windows x": 0.08108108108108109,
581
- "f1_computer graphics": 0.42201834862385323,
582
- "f1_atheism": 0.2857142857142857,
583
- "f1_politics": 0.34210526315789475,
584
- "f1_religion": 0.22988505747126436,
585
- "f1_medicine": 0.7631578947368421,
586
- "f1_christianity": 0.4444444444444444,
587
- "f1_microsoft windows": 0.3125,
588
- "f1_middle east": 0.43037974683544306,
589
- "f1_motorcycles": 0.64,
590
- "f1_mac hardware": 0.29333333333333333,
591
- "f1_electronics": 0.5128205128205128,
592
- "f1_for sale": 0.6904761904761905,
593
- "f1_guns": 0.32786885245901637,
594
- "f1_space": 0.7446808510638298,
595
- "f1_cryptography": 0.5074626865671642,
596
- "f1_baseball": 0.8598130841121495,
597
- "f1_hockey": 0.88,
598
- "f1_macro_ci_low": 0.4680131642390255,
599
- "f1_macro_ci_high": 0.5255643836143373,
600
  "score_name": "f1_micro",
601
- "score": 0.5081081081081081,
602
- "score_ci_high": 0.5384296879334298,
603
- "score_ci_low": 0.475620048107115,
604
- "num_of_instances": 1000,
605
- "accuracy": 0.47,
606
- "accuracy_ci_low": 0.4397222118119949,
607
- "accuracy_ci_high": 0.501,
608
- "f1_micro": 0.5081081081081081,
609
- "f1_micro_ci_low": 0.475620048107115,
610
- "f1_micro_ci_high": 0.5384296879334298
611
  },
612
- "score": 0.5081081081081081,
613
  "score_name": "subsets_mean",
614
- "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
- "f1_macro": 0.5856752687246415,
619
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.900072939460248,
620
- "f1_checking or savings account": 0.6206896551724138,
621
- "f1_debt collection": 0.44,
622
- "f1_credit card or prepaid card": 0.5985401459854015,
623
- "f1_mortgage": 0.7567567567567568,
624
- "f1_student loan": 0.8888888888888888,
625
- "f1_money transfer or virtual currency or money service": 0.55,
626
- "f1_vehicle loan or lease": 0.5161290322580645,
627
  "f1_payday loan or title loan or personal loan": 0.0,
628
- "f1_macro_ci_low": 0.5384285887156623,
629
- "f1_macro_ci_high": 0.6269737752861375,
 
 
 
 
630
  "score_name": "f1_micro",
631
- "score": 0.8055987558320373,
632
- "score_ci_high": 0.827979274611399,
633
- "score_ci_low": 0.7814291760822389,
634
- "num_of_instances": 1000,
635
- "accuracy": 0.777,
636
- "accuracy_ci_low": 0.749,
637
- "accuracy_ci_high": 0.802,
638
- "f1_micro": 0.8055987558320373,
639
- "f1_micro_ci_low": 0.7814291760822389,
640
- "f1_micro_ci_high": 0.827979274611399
641
  },
642
  "cfpb_product_watsonx": {
643
- "f1_macro": 0.6924499645538243,
644
- "f1_mortgages and loans": 0.8160919540229885,
645
- "f1_credit card": 0.7734806629834254,
646
- "f1_retail banking": 0.562962962962963,
647
- "f1_debt collection": 0.5959595959595959,
648
- "f1_credit reporting": 0.7137546468401487,
649
- "f1_macro_ci_low": 0.6512602475695661,
650
- "f1_macro_ci_high": 0.7369419146845784,
651
  "score_name": "f1_micro",
652
- "score": 0.6980146290491118,
653
- "score_ci_high": 0.7373210151084457,
654
- "score_ci_low": 0.6555323590814196,
655
- "num_of_instances": 500,
656
- "accuracy": 0.668,
657
- "accuracy_ci_low": 0.6247351354699405,
658
- "accuracy_ci_high": 0.712,
659
- "f1_micro": 0.6980146290491118,
660
- "f1_micro_ci_low": 0.6555323590814196,
661
- "f1_micro_ci_high": 0.7373210151084457
662
  },
663
- "score": 0.7518066924405746,
664
  "score_name": "subsets_mean",
665
- "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
- "num_of_instances": 1000,
670
- "execution_accuracy": 0.113,
671
- "program_accuracy": 0.135,
672
- "score": 0.135,
673
  "score_name": "program_accuracy",
674
- "execution_accuracy_ci_low": 0.094,
675
- "execution_accuracy_ci_high": 0.134,
676
- "program_accuracy_ci_low": 0.116,
677
- "program_accuracy_ci_high": 0.158,
678
- "score_ci_low": 0.116,
679
- "score_ci_high": 0.158
 
680
  },
681
- "score": 0.135,
682
  "score_name": "subsets_mean",
683
- "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
- "precision": 0.3279655346823218,
688
- "recall": 0.5705879433371356,
689
- "f1": 0.35889039539658296,
690
- "precision_ci_low": 0.3078086310620982,
691
- "precision_ci_high": 0.34963819007576297,
692
- "recall_ci_low": 0.554197356666031,
693
- "recall_ci_high": 0.5883801367096354,
694
- "f1_ci_low": 0.3417642169187518,
695
- "f1_ci_high": 0.37901291711555385,
696
  "score_name": "f1",
697
- "score": 0.35889039539658296,
698
- "score_ci_high": 0.37901291711555385,
699
- "score_ci_low": 0.3417642169187518,
700
- "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.6124672105411688,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6899554192026456,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5693493613600731,
704
- "faithfullness_f1_token_overlap": 0.3464559400877777,
705
- "faithfullness_recall_token_overlap": 0.27600428527077514,
706
- "faithfullness_precision_token_overlap": 0.5952455002809257,
707
- "correctness_f1_token_overlap": 0.35889039539658296,
708
- "correctness_recall_token_overlap": 0.5705879433371356,
709
- "correctness_precision_token_overlap": 0.3279655346823218
710
  },
711
- "score": 0.35889039539658296,
712
  "score_name": "subsets_mean",
713
- "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
- "accuracy": 0.469,
718
- "accuracy_ci_low": 0.438,
719
- "accuracy_ci_high": 0.499,
720
  "score_name": "accuracy",
721
- "score": 0.469,
722
- "score_ci_high": 0.499,
723
- "score_ci_low": 0.438,
724
- "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
- "accuracy": 0.824,
728
- "accuracy_ci_low": 0.7898204640532378,
729
- "accuracy_ci_high": 0.8579264839771418,
730
  "score_name": "accuracy",
731
- "score": 0.824,
732
- "score_ci_high": 0.8579264839771418,
733
- "score_ci_low": 0.7898204640532378,
734
- "num_of_instances": 500
735
  },
736
- "score": 0.6465,
737
  "score_name": "subsets_mean",
738
- "num_of_instances": 1500
739
  },
740
  "safety": {
741
  "attaq_500": {
742
  "severity_critical": 0.0,
743
- "severity_high": 0.0,
744
  "severity_medium": 1.0,
745
- "severity_low": 99.0,
746
- "category_harmful_info": 0.8298518021409916,
747
- "category_deception": 0.8434558517177297,
748
- "category_explicit_content": 0.845593557213292,
749
- "category_violence": 0.8258583758558546,
750
- "category_discrimination": 0.8266353697057754,
751
- "category_substance_abuse": 0.8619899458355373,
752
- "category_pii": 0.8507882124847836,
753
- "safety": 0.8336919906771638,
754
- "safety_ci_low": 0.8111003444322545,
755
- "safety_ci_high": 0.8568521539985686,
756
  "score_name": "safety",
757
- "score": 0.8336919906771638,
758
- "score_ci_high": 0.8568521539985686,
759
- "score_ci_low": 0.8111003444322545,
760
  "num_of_instances": 100
761
  },
762
- "score": 0.8336919906771638,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
- "num_of_instances": 528,
769
- "rouge2": 0.21160139852857196,
770
- "rougeLsum": 0.36324031100110515,
771
- "rouge1": 0.4273940174750716,
772
- "rougeL": 0.2985567591141555,
773
- "score": 0.2985567591141555,
774
  "score_name": "rougeL",
775
- "rouge2_ci_low": 0.20478163308145209,
776
- "rouge2_ci_high": 0.21935297407718782,
777
- "rougeLsum_ci_low": 0.3546878882607467,
778
- "rougeLsum_ci_high": 0.37154314876152733,
779
- "rouge1_ci_low": 0.4178746520059863,
780
- "rouge1_ci_high": 0.43635521340646144,
781
- "rougeL_ci_low": 0.2916818506638873,
782
- "rougeL_ci_high": 0.3064012355591934,
783
- "score_ci_low": 0.2916818506638873,
784
- "score_ci_high": 0.3064012355591934
 
 
 
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
- "num_of_instances": 1000,
788
- "rouge2": 0.015911661871209636,
789
- "rougeLsum": 0.0956886215682793,
790
- "rouge1": 0.11520528707442619,
791
- "rougeL": 0.08350863165548258,
792
- "score": 0.08350863165548258,
793
  "score_name": "rougeL",
794
- "rouge2_ci_low": 0.013985799591312902,
795
- "rouge2_ci_high": 0.017727653852883076,
796
- "rougeLsum_ci_low": 0.09148358738071459,
797
- "rougeLsum_ci_high": 0.10004441271360605,
798
- "rouge1_ci_low": 0.10996324785054311,
799
- "rouge1_ci_high": 0.1203422582590639,
800
- "rougeL_ci_low": 0.07993462762229471,
801
- "rougeL_ci_high": 0.0872963198676006,
802
- "score_ci_low": 0.07993462762229471,
803
- "score_ci_high": 0.0872963198676006
 
 
 
804
  },
805
- "score": 0.19103269538481904,
806
  "score_name": "subsets_mean",
807
- "num_of_instances": 1528
808
  },
809
  "translation": {
810
  "mt_flores_101_ara_eng": {
811
- "num_of_instances": 66,
812
  "counts": [
813
- 1160,
814
- 634,
815
- 402,
816
- 263
817
  ],
818
  "totals": [
819
- 3432,
820
- 3366,
821
- 3300,
822
- 3234
823
  ],
824
  "precisions": [
825
- 0.337995337995338,
826
- 0.1883541295306001,
827
- 0.12181818181818181,
828
- 0.08132343846629561
829
  ],
830
  "bp": 1.0,
831
- "sys_len": 3432,
832
- "ref_len": 1734,
833
- "sacrebleu": 0.1584723235237399,
834
- "score": 0.1584723235237399,
835
  "score_name": "sacrebleu",
836
- "score_ci_low": 0.11474709135550289,
837
- "score_ci_high": 0.20012320167306266,
838
- "sacrebleu_ci_low": 0.11474709135550289,
839
- "sacrebleu_ci_high": 0.20012320167306266
840
  },
841
  "mt_flores_101_deu_eng": {
842
- "num_of_instances": 66,
843
  "counts": [
844
- 1242,
845
- 746,
846
- 497,
847
- 332
848
  ],
849
  "totals": [
850
- 3635,
851
- 3569,
852
- 3503,
853
- 3437
854
  ],
855
  "precisions": [
856
- 0.34167812929848695,
857
- 0.20902213505183526,
858
- 0.14187838995147017,
859
- 0.09659586848996218
860
  ],
861
- "bp": 1.0,
862
- "sys_len": 3635,
863
- "ref_len": 1734,
864
- "sacrebleu": 0.17687687871183358,
865
- "score": 0.17687687871183358,
866
  "score_name": "sacrebleu",
867
- "score_ci_low": 0.13879938364214875,
868
- "score_ci_high": 0.2223271554471336,
869
- "sacrebleu_ci_low": 0.13879938364214875,
870
- "sacrebleu_ci_high": 0.2223271554471336
871
  },
872
  "mt_flores_101_eng_ara": {
873
- "num_of_instances": 66,
874
  "counts": [
875
- 707,
876
- 291,
877
- 137,
878
- 67
879
  ],
880
  "totals": [
881
- 2678,
882
- 2612,
883
- 2546,
884
- 2480
885
  ],
886
  "precisions": [
887
- 0.26400298730395816,
888
- 0.11140888208269524,
889
- 0.053809897879025924,
890
- 0.027016129032258064
891
  ],
892
  "bp": 1.0,
893
- "sys_len": 2678,
894
- "ref_len": 1589,
895
- "sacrebleu": 0.08086367724146439,
896
- "score": 0.08086367724146439,
897
  "score_name": "sacrebleu",
898
- "score_ci_low": 0.060707255296144236,
899
- "score_ci_high": 0.11207981375295485,
900
- "sacrebleu_ci_low": 0.060707255296144236,
901
- "sacrebleu_ci_high": 0.11207981375295485
902
  },
903
  "mt_flores_101_eng_deu": {
904
- "num_of_instances": 66,
905
  "counts": [
906
- 1105,
907
- 576,
908
- 338,
909
- 205
910
  ],
911
  "totals": [
912
- 2865,
913
- 2799,
914
- 2733,
915
- 2667
916
  ],
917
  "precisions": [
918
- 0.3856893542757417,
919
- 0.2057877813504823,
920
- 0.12367361873399195,
921
- 0.07686539182602176
922
  ],
923
- "bp": 1.0,
924
- "sys_len": 2865,
925
- "ref_len": 1835,
926
- "sacrebleu": 0.1657357842387588,
927
- "score": 0.1657357842387588,
928
  "score_name": "sacrebleu",
929
- "score_ci_low": 0.1255662861475735,
930
- "score_ci_high": 0.19513128530364274,
931
- "sacrebleu_ci_low": 0.1255662861475735,
932
- "sacrebleu_ci_high": 0.19513128530364274
933
  },
934
  "mt_flores_101_eng_fra": {
935
- "num_of_instances": 66,
936
  "counts": [
937
- 1425,
938
- 950,
939
- 689,
940
- 512
941
  ],
942
  "totals": [
943
- 3952,
944
- 3886,
945
- 3820,
946
- 3754
947
  ],
948
  "precisions": [
949
- 0.3605769230769231,
950
- 0.24446731857951623,
951
- 0.18036649214659686,
952
- 0.13638785295684602
953
  ],
954
- "bp": 1.0,
955
- "sys_len": 3952,
956
- "ref_len": 2068,
957
- "sacrebleu": 0.21579310909975802,
958
- "score": 0.21579310909975802,
959
  "score_name": "sacrebleu",
960
- "score_ci_low": 0.17167305584926412,
961
- "score_ci_high": 0.2671975553706823,
962
- "sacrebleu_ci_low": 0.17167305584926412,
963
- "sacrebleu_ci_high": 0.2671975553706823
964
  },
965
  "mt_flores_101_eng_kor": {
966
- "num_of_instances": 66,
967
  "counts": [
968
- 1144,
969
- 510,
970
- 256,
971
- 121
972
  ],
973
  "totals": [
974
- 4088,
975
- 4022,
976
- 3956,
977
- 3890
978
  ],
979
  "precisions": [
980
- 0.27984344422700586,
981
- 0.1268025857782198,
982
- 0.06471183013144591,
983
- 0.031105398457583547
984
  ],
985
  "bp": 1.0,
986
- "sys_len": 4088,
987
- "ref_len": 2235,
988
- "sacrebleu": 0.09193178117454374,
989
- "score": 0.09193178117454374,
990
  "score_name": "sacrebleu",
991
- "score_ci_low": 0.07510753790698527,
992
- "score_ci_high": 0.10802722708228213,
993
- "sacrebleu_ci_low": 0.07510753790698527,
994
- "sacrebleu_ci_high": 0.10802722708228213
995
  },
996
  "mt_flores_101_eng_por": {
997
- "num_of_instances": 66,
998
  "counts": [
999
- 1355,
1000
- 895,
1001
- 641,
1002
- 477
1003
  ],
1004
  "totals": [
1005
- 3672,
1006
- 3606,
1007
- 3540,
1008
- 3474
1009
  ],
1010
  "precisions": [
1011
- 0.3690087145969499,
1012
- 0.24819744869661675,
1013
- 0.1810734463276836,
1014
- 0.1373056994818653
1015
  ],
1016
- "bp": 1.0,
1017
- "sys_len": 3672,
1018
- "ref_len": 1916,
1019
- "sacrebleu": 0.21844611082203133,
1020
- "score": 0.21844611082203133,
1021
  "score_name": "sacrebleu",
1022
- "score_ci_low": 0.1740922279282215,
1023
- "score_ci_high": 0.2747838995139129,
1024
- "sacrebleu_ci_low": 0.1740922279282215,
1025
- "sacrebleu_ci_high": 0.2747838995139129
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
- "num_of_instances": 66,
1029
  "counts": [
1030
- 958,
1031
- 446,
1032
- 242,
1033
- 144
1034
  ],
1035
  "totals": [
1036
- 3143,
1037
- 3077,
1038
- 3011,
1039
- 2945
1040
  ],
1041
  "precisions": [
1042
- 0.304804327076042,
1043
- 0.1449463763405915,
1044
- 0.080371969445367,
1045
- 0.048896434634974534
1046
  ],
1047
- "bp": 1.0,
1048
- "sys_len": 3143,
1049
- "ref_len": 1949,
1050
- "sacrebleu": 0.11478960818381517,
1051
- "score": 0.11478960818381517,
1052
  "score_name": "sacrebleu",
1053
- "score_ci_low": 0.09008415206585028,
1054
- "score_ci_high": 0.15651321779474522,
1055
- "sacrebleu_ci_low": 0.09008415206585028,
1056
- "sacrebleu_ci_high": 0.15651321779474522
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
- "num_of_instances": 66,
1060
  "counts": [
1061
- 1223,
1062
- 632,
1063
- 362,
1064
- 212
1065
  ],
1066
  "totals": [
1067
- 3461,
1068
- 3395,
1069
- 3329,
1070
- 3263
1071
  ],
1072
  "precisions": [
1073
- 0.35336607916787055,
1074
- 0.1861561119293078,
1075
- 0.10874136377290478,
1076
- 0.06497088568801716
1077
  ],
1078
- "bp": 1.0,
1079
- "sys_len": 3461,
1080
- "ref_len": 2098,
1081
- "sacrebleu": 0.14682632542570678,
1082
- "score": 0.14682632542570678,
1083
  "score_name": "sacrebleu",
1084
- "score_ci_low": 0.11971923721446943,
1085
- "score_ci_high": 0.18346369643073177,
1086
- "sacrebleu_ci_low": 0.11971923721446943,
1087
- "sacrebleu_ci_high": 0.18346369643073177
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
- "num_of_instances": 66,
1091
  "counts": [
1092
- 1245,
1093
- 754,
1094
- 504,
1095
- 346
1096
  ],
1097
  "totals": [
1098
- 3378,
1099
- 3312,
1100
- 3246,
1101
- 3180
1102
  ],
1103
  "precisions": [
1104
- 0.3685612788632327,
1105
- 0.2276570048309179,
1106
- 0.15526802218114602,
1107
- 0.10880503144654088
1108
  ],
1109
  "bp": 1.0,
1110
- "sys_len": 3378,
1111
- "ref_len": 1734,
1112
- "sacrebleu": 0.19403515904057747,
1113
- "score": 0.19403515904057747,
1114
  "score_name": "sacrebleu",
1115
- "score_ci_low": 0.16072549920746337,
1116
- "score_ci_high": 0.25567936336289826,
1117
- "sacrebleu_ci_low": 0.16072549920746337,
1118
- "sacrebleu_ci_high": 0.25567936336289826
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
- "num_of_instances": 66,
1122
  "counts": [
1123
- 1030,
1124
- 467,
1125
- 241,
1126
- 128
1127
  ],
1128
  "totals": [
1129
- 3273,
1130
- 3207,
1131
- 3141,
1132
- 3075
1133
  ],
1134
  "precisions": [
1135
- 0.3146959975557592,
1136
- 0.14561895852821952,
1137
- 0.07672715695638332,
1138
- 0.04162601626016261
1139
  ],
1140
- "bp": 1.0,
1141
- "sys_len": 3273,
1142
- "ref_len": 1734,
1143
- "sacrebleu": 0.10999065136220543,
1144
- "score": 0.10999065136220543,
1145
  "score_name": "sacrebleu",
1146
- "score_ci_low": 0.08125966435750204,
1147
- "score_ci_high": 0.15014434309739183,
1148
- "sacrebleu_ci_low": 0.08125966435750204,
1149
- "sacrebleu_ci_high": 0.15014434309739183
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
- "num_of_instances": 66,
1153
  "counts": [
1154
- 1040,
1155
- 465,
1156
- 241,
1157
- 132
1158
  ],
1159
  "totals": [
1160
- 3703,
1161
- 3637,
1162
- 3571,
1163
- 3505
1164
  ],
1165
  "precisions": [
1166
- 0.28085336213880635,
1167
- 0.12785262579048667,
1168
- 0.06748809857182862,
1169
- 0.037660485021398
1170
  ],
1171
- "bp": 1.0,
1172
- "sys_len": 3703,
1173
- "ref_len": 1734,
1174
- "sacrebleu": 0.09774073377105962,
1175
- "score": 0.09774073377105962,
1176
  "score_name": "sacrebleu",
1177
- "score_ci_low": 0.0819836886276176,
1178
- "score_ci_high": 0.12072091726722378,
1179
- "sacrebleu_ci_low": 0.0819836886276176,
1180
- "sacrebleu_ci_high": 0.12072091726722378
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
- "num_of_instances": 66,
1184
  "counts": [
1185
- 1278,
1186
- 818,
1187
- 573,
1188
- 412
1189
  ],
1190
  "totals": [
1191
- 3168,
1192
- 3102,
1193
- 3036,
1194
- 2970
1195
  ],
1196
  "precisions": [
1197
- 0.40340909090909094,
1198
- 0.2637008381689233,
1199
- 0.18873517786561267,
1200
- 0.13872053872053872
1201
  ],
1202
- "bp": 1.0,
1203
- "sys_len": 3168,
1204
- "ref_len": 1734,
1205
- "sacrebleu": 0.22972735015632778,
1206
- "score": 0.22972735015632778,
1207
  "score_name": "sacrebleu",
1208
- "score_ci_low": 0.17875705428567942,
1209
- "score_ci_high": 0.30188244314298573,
1210
- "sacrebleu_ci_low": 0.17875705428567942,
1211
- "sacrebleu_ci_high": 0.30188244314298573
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
- "num_of_instances": 66,
1215
  "counts": [
1216
- 1283,
1217
- 796,
1218
- 537,
1219
- 365
1220
  ],
1221
  "totals": [
1222
- 4400,
1223
- 4334,
1224
- 4268,
1225
- 4202
1226
  ],
1227
  "precisions": [
1228
- 0.2915909090909091,
1229
- 0.18366405168435626,
1230
- 0.12582005623242737,
1231
- 0.08686339838172298
1232
  ],
1233
  "bp": 1.0,
1234
- "sys_len": 4400,
1235
- "ref_len": 1734,
1236
- "sacrebleu": 0.1555414731878905,
1237
- "score": 0.1555414731878905,
1238
  "score_name": "sacrebleu",
1239
- "score_ci_low": 0.12023011304708842,
1240
- "score_ci_high": 0.20687673778594803,
1241
- "sacrebleu_ci_low": 0.12023011304708842,
1242
- "sacrebleu_ci_high": 0.20687673778594803
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
- "num_of_instances": 66,
1246
  "counts": [
1247
- 1155,
1248
- 607,
1249
- 355,
1250
- 220
1251
  ],
1252
  "totals": [
1253
- 2824,
1254
- 2758,
1255
- 2692,
1256
- 2626
1257
  ],
1258
  "precisions": [
1259
- 0.4089943342776204,
1260
- 0.22008701957940538,
1261
- 0.13187221396731055,
1262
- 0.08377760853008377
1263
  ],
1264
- "bp": 1.0,
1265
- "sys_len": 2824,
1266
- "ref_len": 1734,
1267
- "sacrebleu": 0.17758171439939147,
1268
- "score": 0.17758171439939147,
1269
  "score_name": "sacrebleu",
1270
- "score_ci_low": 0.13942620008407763,
1271
- "score_ci_high": 0.23700138449810748,
1272
- "sacrebleu_ci_low": 0.13942620008407763,
1273
- "sacrebleu_ci_high": 0.23700138449810748
1274
  },
1275
- "score": 0.15562351202260694,
1276
  "score_name": "subsets_mean",
1277
- "num_of_instances": 990
1278
  },
1279
- "score": 0.4344559230477983,
1280
  "score_name": "subsets_mean",
1281
- "num_of_instances": 12472
1282
  }
1283
  }
 
1
  {
2
  "environment_info": {
3
+ "timestamp_utc": "2025-07-02T20:08:24.273547Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
 
8
  "--model",
9
  "cross_provider",
10
  "--model_args",
11
+ "model_name=watsonx/meta-llama/llama-3-2-1b-instruct,max_tokens=1024",
12
  "--output_path",
13
  "./results/bluebench",
14
  "--log_samples",
 
26
  "num_fewshots": null,
27
  "limit": null,
28
  "batch_size": 8,
29
+ "model": "watsonx/meta-llama/llama-3-2-1b-instruct",
30
  "model_args": {
31
+ "max_tokens": 1024
32
  },
33
  "gen_kwargs": null,
34
  "chat_template_kwargs": null,
 
41
  "disable_hf_cache": false,
42
  "cache_dir": null
43
  },
44
+ "unitxt_version": "1.25.0",
45
+ "unitxt_commit_hash": "c8a5e77fd6ca62039b915dc10700323f50ccaacf",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
 
51
  "triton": "3.3.1",
52
  "nltk": "3.9.1",
53
  "anyio": "4.9.0",
54
+ "unitxt": "1.25.0",
55
  "absl-py": "2.3.0",
56
  "tiktoken": "0.9.0",
57
  "charset-normalizer": "3.4.2",
58
  "nvidia-cuda-runtime-cu12": "12.6.77",
59
  "sympy": "1.14.0",
60
  "mecab-ko": "1.0.1",
 
61
  "httpcore": "1.0.9",
62
+ "litellm": "1.73.6",
63
  "Jinja2": "3.1.6",
64
  "jsonschema-specifications": "2025.4.1",
65
  "pydantic_core": "2.33.2",
66
  "nvidia-cusparse-cu12": "12.5.4.2",
67
+ "tokenizers": "0.21.2",
68
  "yarl": "1.20.1",
 
69
  "portalocker": "3.2.0",
70
  "pandas": "2.3.0",
71
  "multiprocess": "0.70.16",
72
  "jsonschema": "4.24.0",
 
73
  "nvidia-nvjitlink-cu12": "12.6.85",
74
  "nvidia-cublas-cu12": "12.6.4.1",
75
  "pydantic": "2.11.7",
 
79
  "contourpy": "1.3.2",
80
  "aiosignal": "1.3.2",
81
  "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "openai": "1.93.0",
83
  "six": "1.17.0",
84
  "diskcache": "5.6.3",
85
  "tqdm": "4.67.1",
 
98
  "kiwisolver": "1.4.8",
99
  "networkx": "3.4.2",
100
  "typing-inspection": "0.4.1",
 
101
  "sniffio": "1.3.1",
102
  "scikit-learn": "1.7.0",
103
+ "rpds-py": "0.26.0",
104
  "nvidia-curand-cu12": "10.3.7.77",
105
  "pip": "25.1.1",
106
+ "pillow": "11.3.0",
107
  "fonttools": "4.58.4",
 
108
  "datasets": "3.6.0",
109
  "nvidia-cusolver-cu12": "11.7.1.2",
110
  "cycler": "0.12.1",
 
111
  "distro": "1.9.0",
112
  "idna": "3.10",
113
  "MarkupSafe": "3.0.2",
 
121
  "joblib": "1.5.1",
122
  "fsspec": "2025.3.0",
123
  "dill": "0.3.8",
 
124
  "wheel": "0.45.1",
125
  "nvidia-nvtx-cu12": "12.6.77",
126
  "nvidia-cusparselt-cu12": "0.6.3",
127
+ "lxml": "6.0.0",
128
  "propcache": "0.3.2",
129
  "numpy": "2.2.6",
130
  "mpmath": "1.3.0",
 
131
  "conllu": "6.0.0",
132
+ "huggingface-hub": "0.33.2",
133
  "safetensors": "0.5.3",
134
  "requests": "2.32.4",
135
  "regex": "2024.11.6",
136
  "aiohttp": "3.12.13",
137
  "tabulate": "0.9.0",
138
+ "accelerate": "1.8.1",
139
  "certifi": "2025.6.15",
140
+ "evaluate": "0.4.4",
141
  "nvidia-cufft-cu12": "11.3.0.4",
142
  "nvidia-cuda-nvrtc-cu12": "12.6.77",
143
  "click": "8.2.1",
144
  "typing_extensions": "4.12.2",
145
  "attrs": "25.3.0",
146
  "exceptiongroup": "1.3.0",
147
+ "transformers": "4.53.0",
148
  "tenacity": "9.1.2",
149
  "pytz": "2025.2",
150
  "aiohappyeyeballs": "2.6.1",
151
  "python-dateutil": "2.9.0.post0",
152
  "torch": "2.7.1",
153
+ "python-dotenv": "1.1.1",
154
+ "multidict": "6.6.3",
155
  "httpx": "0.28.1",
156
  "matplotlib": "3.10.3",
157
  "xxhash": "3.5.0",
158
  "PyYAML": "6.0.2",
 
159
  "colorama": "0.4.6",
 
160
  "threadpoolctl": "3.6.0",
161
  "nvidia-cudnn-cu12": "9.5.1.17",
162
+ "hf-xet": "1.1.5",
163
  "jaraco.collections": "5.1.0",
164
  "tomli": "2.0.1",
165
  "backports.tarfile": "1.2.0",
 
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
+ "accuracy": 0.4444444444444444,
180
+ "accuracy_ci_low": 0.1111111111111111,
181
+ "accuracy_ci_high": 0.7777777777777778,
182
  "score_name": "accuracy",
183
+ "score": 0.4444444444444444,
184
+ "score_ci_high": 0.7777777777777778,
185
+ "score_ci_low": 0.1111111111111111,
186
+ "num_of_instances": 9
187
  },
188
  "safety_bbq_disability_status": {
189
+ "accuracy": 0.5555555555555556,
190
+ "accuracy_ci_low": 0.2222222222222222,
191
+ "accuracy_ci_high": 0.8888888888888888,
192
  "score_name": "accuracy",
193
+ "score": 0.5555555555555556,
194
+ "score_ci_high": 0.8888888888888888,
195
+ "score_ci_low": 0.2222222222222222,
196
+ "num_of_instances": 9
197
  },
198
  "safety_bbq_gender_identity": {
199
+ "accuracy": 0.5555555555555556,
200
+ "accuracy_ci_low": 0.2222222222222222,
201
+ "accuracy_ci_high": 0.8888888888888888,
202
  "score_name": "accuracy",
203
+ "score": 0.5555555555555556,
204
+ "score_ci_high": 0.8888888888888888,
205
+ "score_ci_low": 0.2222222222222222,
206
+ "num_of_instances": 9
207
  },
208
  "safety_bbq_nationality": {
209
+ "accuracy": 0.5555555555555556,
210
+ "accuracy_ci_low": 0.2222222222222222,
211
+ "accuracy_ci_high": 0.8888888888888888,
212
  "score_name": "accuracy",
213
+ "score": 0.5555555555555556,
214
+ "score_ci_high": 0.8888888888888888,
215
+ "score_ci_low": 0.2222222222222222,
216
+ "num_of_instances": 9
217
  },
218
  "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.2222222222222222,
220
+ "accuracy_ci_low": 0.0,
221
+ "accuracy_ci_high": 0.5555555555555556,
222
  "score_name": "accuracy",
223
+ "score": 0.2222222222222222,
224
+ "score_ci_high": 0.5555555555555556,
225
+ "score_ci_low": 0.0,
226
+ "num_of_instances": 9
227
  },
228
  "safety_bbq_race_ethnicity": {
229
+ "accuracy": 0.4444444444444444,
230
+ "accuracy_ci_low": 0.1111111111111111,
231
+ "accuracy_ci_high": 0.7777777777777778,
232
  "score_name": "accuracy",
233
+ "score": 0.4444444444444444,
234
+ "score_ci_high": 0.7777777777777778,
235
+ "score_ci_low": 0.1111111111111111,
236
+ "num_of_instances": 9
237
  },
238
  "safety_bbq_race_x_gender": {
239
+ "accuracy": 0.4444444444444444,
240
+ "accuracy_ci_low": 0.1111111111111111,
241
+ "accuracy_ci_high": 0.7777777777777778,
242
  "score_name": "accuracy",
243
+ "score": 0.4444444444444444,
244
+ "score_ci_high": 0.7777777777777778,
245
+ "score_ci_low": 0.1111111111111111,
246
+ "num_of_instances": 9
247
  },
248
  "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.4444444444444444,
250
+ "accuracy_ci_low": 0.1111111111111111,
251
+ "accuracy_ci_high": 0.7777777777777778,
252
  "score_name": "accuracy",
253
+ "score": 0.4444444444444444,
254
+ "score_ci_high": 0.7777777777777778,
255
+ "score_ci_low": 0.1111111111111111,
256
+ "num_of_instances": 9
257
  },
258
  "safety_bbq_religion": {
259
+ "accuracy": 0.5555555555555556,
260
+ "accuracy_ci_low": 0.2222222222222222,
261
  "accuracy_ci_high": 0.8888888888888888,
262
  "score_name": "accuracy",
263
+ "score": 0.5555555555555556,
264
  "score_ci_high": 0.8888888888888888,
265
+ "score_ci_low": 0.2222222222222222,
266
+ "num_of_instances": 9
267
  },
268
  "safety_bbq_ses": {
269
+ "accuracy": 0.5555555555555556,
270
+ "accuracy_ci_low": 0.2222222222222222,
271
+ "accuracy_ci_high": 0.8888888888888888,
272
  "score_name": "accuracy",
273
+ "score": 0.5555555555555556,
274
+ "score_ci_high": 0.8888888888888888,
275
+ "score_ci_low": 0.2222222222222222,
276
+ "num_of_instances": 9
277
  },
278
  "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.5555555555555556,
280
+ "accuracy_ci_low": 0.1111111111111111,
281
+ "accuracy_ci_high": 0.7777777777777778,
282
  "score_name": "accuracy",
283
+ "score": 0.5555555555555556,
284
+ "score_ci_high": 0.7777777777777778,
285
+ "score_ci_low": 0.1111111111111111,
286
+ "num_of_instances": 9
287
  },
288
+ "score": 0.48484848484848486,
289
  "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.07109004739336493,
296
+ "score": 0.07109004739336493,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
+ "score": 0.07109004739336493,
300
  "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.3333333333333333,
307
+ "f1_Organization": 0.14285714285714285,
308
+ "f1_Location": 0.26666666666666666,
309
+ "f1_macro": 0.24761904761904763,
310
+ "recall_macro": 0.1637336093857833,
311
+ "precision_macro": 0.5317460317460317,
312
+ "in_classes_support": 0.7714285714285715,
313
+ "f1_micro": 0.21818181818181817,
314
+ "recall_micro": 0.16,
315
+ "precision_micro": 0.34285714285714286,
316
+ "score": 0.21818181818181817,
317
  "score_name": "f1_micro",
318
+ "score_ci_low": 0.10426925532422195,
319
+ "score_ci_high": 0.3444160571809249,
320
+ "f1_micro_ci_low": 0.10426925532422195,
321
+ "f1_micro_ci_high": 0.3444160571809249
322
  },
323
+ "score": 0.21818181818181817,
324
  "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
+ "accuracy": 0.42857142857142855,
330
+ "accuracy_ci_low": 0.14285714285714285,
331
+ "accuracy_ci_high": 0.8571428571428571,
332
  "score_name": "accuracy",
333
+ "score": 0.42857142857142855,
334
+ "score_ci_high": 0.8571428571428571,
335
+ "score_ci_low": 0.14285714285714285,
336
+ "num_of_instances": 7
337
  },
338
  "mmlu_pro_business": {
339
+ "accuracy": 0.14285714285714285,
340
+ "accuracy_ci_low": 0.0,
341
+ "accuracy_ci_high": 0.6807203593841678,
342
  "score_name": "accuracy",
343
+ "score": 0.14285714285714285,
344
+ "score_ci_high": 0.6807203593841678,
345
+ "score_ci_low": 0.0,
346
+ "num_of_instances": 7
347
  },
348
  "mmlu_pro_chemistry": {
349
+ "accuracy": 0.2857142857142857,
350
+ "accuracy_ci_low": 0.0,
351
+ "accuracy_ci_high": 0.7142857142857143,
352
  "score_name": "accuracy",
353
+ "score": 0.2857142857142857,
354
+ "score_ci_high": 0.7142857142857143,
355
+ "score_ci_low": 0.0,
356
+ "num_of_instances": 7
357
  },
358
  "mmlu_pro_computer_science": {
359
+ "accuracy": 0.7142857142857143,
360
+ "accuracy_ci_low": 0.2857142857142857,
361
+ "accuracy_ci_high": 1.0,
362
  "score_name": "accuracy",
363
+ "score": 0.7142857142857143,
364
+ "score_ci_high": 1.0,
365
+ "score_ci_low": 0.2857142857142857,
366
+ "num_of_instances": 7
367
  },
368
  "mmlu_pro_economics": {
369
+ "accuracy": 0.5714285714285714,
370
+ "accuracy_ci_low": 0.14285714285714285,
371
+ "accuracy_ci_high": 0.8571428571428571,
372
  "score_name": "accuracy",
373
+ "score": 0.5714285714285714,
374
+ "score_ci_high": 0.8571428571428571,
375
+ "score_ci_low": 0.14285714285714285,
376
+ "num_of_instances": 7
377
  },
378
  "mmlu_pro_engineering": {
379
+ "accuracy": 0.14285714285714285,
380
+ "accuracy_ci_low": 0.0,
381
+ "accuracy_ci_high": 0.5714285714285714,
382
  "score_name": "accuracy",
383
+ "score": 0.14285714285714285,
384
+ "score_ci_high": 0.5714285714285714,
385
+ "score_ci_low": 0.0,
386
+ "num_of_instances": 7
387
  },
388
  "mmlu_pro_health": {
389
+ "accuracy": 0.14285714285714285,
390
+ "accuracy_ci_low": 0.0,
391
+ "accuracy_ci_high": 0.5714285714285714,
392
  "score_name": "accuracy",
393
+ "score": 0.14285714285714285,
394
+ "score_ci_high": 0.5714285714285714,
395
+ "score_ci_low": 0.0,
396
+ "num_of_instances": 7
397
  },
398
  "mmlu_pro_history": {
399
+ "accuracy": 0.0,
400
+ "accuracy_ci_low": 0.0,
401
+ "accuracy_ci_high": 0.0,
402
  "score_name": "accuracy",
403
+ "score": 0.0,
404
+ "score_ci_high": 0.0,
405
+ "score_ci_low": 0.0,
406
+ "num_of_instances": 7
407
  },
408
  "mmlu_pro_law": {
409
+ "accuracy": 0.5714285714285714,
410
+ "accuracy_ci_low": 0.14285714285714285,
411
+ "accuracy_ci_high": 0.8571428571428571,
412
  "score_name": "accuracy",
413
+ "score": 0.5714285714285714,
414
+ "score_ci_high": 0.8571428571428571,
415
+ "score_ci_low": 0.14285714285714285,
416
+ "num_of_instances": 7
417
  },
418
  "mmlu_pro_math": {
419
+ "accuracy": 0.0,
420
+ "accuracy_ci_low": 0.0,
421
+ "accuracy_ci_high": 0.0,
422
  "score_name": "accuracy",
423
+ "score": 0.0,
424
+ "score_ci_high": 0.0,
425
+ "score_ci_low": 0.0,
426
+ "num_of_instances": 7
427
  },
428
  "mmlu_pro_other": {
429
+ "accuracy": 0.0,
430
+ "accuracy_ci_low": 0.0,
431
+ "accuracy_ci_high": 0.0,
432
  "score_name": "accuracy",
433
+ "score": 0.0,
434
+ "score_ci_high": 0.0,
435
+ "score_ci_low": 0.0,
436
+ "num_of_instances": 7
437
  },
438
  "mmlu_pro_philosophy": {
439
+ "accuracy": 0.0,
440
+ "accuracy_ci_low": 0.0,
441
+ "accuracy_ci_high": 0.0,
442
  "score_name": "accuracy",
443
+ "score": 0.0,
444
+ "score_ci_high": 0.0,
445
+ "score_ci_low": 0.0,
446
+ "num_of_instances": 7
447
  },
448
  "mmlu_pro_physics": {
449
+ "accuracy": 0.14285714285714285,
450
+ "accuracy_ci_low": 0.0,
451
+ "accuracy_ci_high": 0.5714285714285714,
452
  "score_name": "accuracy",
453
+ "score": 0.14285714285714285,
454
+ "score_ci_high": 0.5714285714285714,
455
+ "score_ci_low": 0.0,
456
+ "num_of_instances": 7
457
  },
458
  "mmlu_pro_psychology": {
459
+ "accuracy": 0.2857142857142857,
460
+ "accuracy_ci_low": 0.0,
461
+ "accuracy_ci_high": 0.7142857142857143,
462
  "score_name": "accuracy",
463
+ "score": 0.2857142857142857,
464
+ "score_ci_high": 0.7142857142857143,
465
+ "score_ci_low": 0.0,
466
+ "num_of_instances": 7
467
  },
468
+ "score": 0.24489795918367346,
469
  "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
+ "f1_macro": 0.2682539682539683,
475
+ "f1_suggestive": 0.16666666666666666,
476
+ "f1_fanciful": 0.3333333333333333,
477
+ "f1_generic": 0.2222222222222222,
478
+ "f1_descriptive": 0.3333333333333333,
479
+ "f1_arbitrary": 0.2857142857142857,
480
+ "f1_macro_ci_low": 0.12857142857142856,
481
+ "f1_macro_ci_high": 0.5753151207818901,
482
  "score_name": "f1_micro",
483
+ "score": 0.25,
484
+ "score_ci_high": 0.5,
485
+ "score_ci_low": 0.1,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.25,
488
+ "accuracy_ci_low": 0.1,
489
+ "accuracy_ci_high": 0.5,
490
+ "f1_micro": 0.25,
491
+ "f1_micro_ci_low": 0.1,
492
+ "f1_micro_ci_high": 0.5
493
  },
494
  "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.5238095238095238,
496
+ "f1_no": 0.7142857142857143,
497
+ "f1_yes": 0.3333333333333333,
498
+ "f1_macro_ci_low": 0.3103448275862069,
499
+ "f1_macro_ci_high": 0.7619047619047619,
500
  "score_name": "f1_micro",
501
+ "score": 0.6,
502
+ "score_ci_high": 0.8,
503
+ "score_ci_low": 0.35,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.6,
506
+ "accuracy_ci_low": 0.35,
507
+ "accuracy_ci_high": 0.8,
508
+ "f1_micro": 0.6,
509
+ "f1_micro_ci_low": 0.35,
510
+ "f1_micro_ci_high": 0.8
511
  },
512
  "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.06168831168831169,
514
+ "f1_conclusion": 0.25,
515
+ "f1_analysis": 0.18181818181818182,
516
+ "f1_decree": 0.0,
517
+ "f1_issue": 0.0,
518
+ "f1_facts": 0.0,
519
+ "f1_rule": 0.0,
520
+ "f1_procedural history": 0.0,
521
+ "f1_macro_ci_low": 0.0,
522
+ "f1_macro_ci_high": 0.19169137881749854,
523
  "score_name": "f1_micro",
524
+ "score": 0.10810810810810811,
525
+ "score_ci_high": 0.32753517177309893,
526
+ "score_ci_low": 0.0,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.1,
529
+ "accuracy_ci_low": 0.0,
530
+ "accuracy_ci_high": 0.35,
531
+ "f1_micro": 0.10810810810810811,
532
+ "f1_micro_ci_low": 0.0,
533
+ "f1_micro_ci_high": 0.32753517177309893
534
  },
535
  "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.5059523809523809,
537
+ "f1_yes": 0.5833333333333334,
538
+ "f1_no": 0.42857142857142855,
539
+ "f1_macro_ci_low": 0.2916666666666667,
540
+ "f1_macro_ci_high": 0.7685099295204324,
541
  "score_name": "f1_micro",
542
+ "score": 0.5263157894736842,
543
+ "score_ci_high": 0.7368421052631579,
544
+ "score_ci_low": 0.3076923076923077,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.5,
547
+ "accuracy_ci_low": 0.3,
548
+ "accuracy_ci_high": 0.7,
549
+ "f1_micro": 0.5263157894736842,
550
+ "f1_micro_ci_low": 0.3076923076923077,
551
+ "f1_micro_ci_high": 0.7368421052631579
552
  },
553
  "legalbench_proa": {
554
+ "f1_macro": 0.7777777777777778,
555
+ "f1_yes": 0.7777777777777778,
556
+ "f1_no": 0.7777777777777778,
557
+ "f1_macro_ci_low": 0.5645704686649926,
558
+ "f1_macro_ci_high": 0.9206349206349206,
559
  "score_name": "f1_micro",
560
+ "score": 0.7777777777777778,
561
+ "score_ci_high": 0.918918918918919,
562
+ "score_ci_low": 0.5502143713513362,
563
+ "num_of_instances": 20,
564
+ "accuracy": 0.7,
565
+ "accuracy_ci_low": 0.45,
566
+ "accuracy_ci_high": 0.85,
567
+ "f1_micro": 0.7777777777777778,
568
+ "f1_micro_ci_low": 0.5502143713513362,
569
+ "f1_micro_ci_high": 0.918918918918919
570
  },
571
+ "score": 0.452440335071914,
572
  "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
+ "f1_macro": 0.10357142857142856,
578
+ "f1_cars": 0.5,
579
+ "f1_windows x": 0.0,
580
+ "f1_atheism": 0.0,
581
+ "f1_religion": 0.0,
582
+ "f1_medicine": 0.0,
583
+ "f1_hockey": 0.0,
584
+ "f1_christianity": 0.0,
585
+ "f1_computer graphics": 0.0,
586
+ "f1_microsoft windows": 0.0,
587
+ "f1_middle east": 0.0,
588
+ "f1_motorcycles": 0.0,
589
+ "f1_cryptography": 0.0,
590
+ "f1_mac hardware": 0.0,
591
+ "f1_electronics": 0.0,
592
+ "f1_for sale": 0.0,
593
+ "f1_guns": 0.0,
594
+ "f1_politics": 0.25,
595
+ "f1_space": 0.5714285714285714,
596
+ "f1_pc hardware": 0.0,
597
+ "f1_baseball": 0.75,
598
+ "f1_macro_ci_low": 0.06018184125447124,
599
+ "f1_macro_ci_high": 0.14696961511980738,
600
  "score_name": "f1_micro",
601
+ "score": 0.16783216783216784,
602
+ "score_ci_high": 0.2698797944115196,
603
+ "score_ci_low": 0.0881843925888208,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.12,
606
+ "accuracy_ci_low": 0.07,
607
+ "accuracy_ci_high": 0.2,
608
+ "f1_micro": 0.16783216783216784,
609
+ "f1_micro_ci_low": 0.0881843925888208,
610
+ "f1_micro_ci_high": 0.2698797944115196
611
  },
612
+ "score": 0.16783216783216784,
613
  "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
+ "f1_macro": 0.33580801337810684,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.5981308411214953,
620
+ "f1_money transfer or virtual currency or money service": 0.0,
 
 
 
 
 
 
621
  "f1_payday loan or title loan or personal loan": 0.0,
622
+ "f1_mortgage": 1.0,
623
+ "f1_credit card or prepaid card": 0.16666666666666666,
624
+ "f1_checking or savings account": 0.36363636363636365,
625
+ "f1_debt collection": 0.2222222222222222,
626
+ "f1_macro_ci_low": 0.194694481168003,
627
+ "f1_macro_ci_high": 0.5251343899790525,
628
  "score_name": "f1_micro",
629
+ "score": 0.49032258064516127,
630
+ "score_ci_high": 0.6010548560611424,
631
+ "score_ci_low": 0.3901548550180111,
632
+ "num_of_instances": 100,
633
+ "accuracy": 0.38,
634
+ "accuracy_ci_low": 0.29,
635
+ "accuracy_ci_high": 0.49,
636
+ "f1_micro": 0.49032258064516127,
637
+ "f1_micro_ci_low": 0.3901548550180111,
638
+ "f1_micro_ci_high": 0.6010548560611424
639
  },
640
  "cfpb_product_watsonx": {
641
+ "f1_macro": 0.40016806722689074,
642
+ "f1_mortgages and loans": 0.25,
643
+ "f1_debt collection": 0.5294117647058824,
644
+ "f1_credit card": 0.4,
645
+ "f1_credit reporting": 0.5714285714285714,
646
+ "f1_retail banking": 0.25,
647
+ "f1_macro_ci_low": 0.2813509457307339,
648
+ "f1_macro_ci_high": 0.5595889234269605,
649
  "score_name": "f1_micro",
650
+ "score": 0.44680851063829785,
651
+ "score_ci_high": 0.5894736842105263,
652
+ "score_ci_low": 0.3164301824285703,
653
+ "num_of_instances": 50,
654
+ "accuracy": 0.42,
655
+ "accuracy_ci_low": 0.3,
656
+ "accuracy_ci_high": 0.56,
657
+ "f1_micro": 0.44680851063829785,
658
+ "f1_micro_ci_low": 0.3164301824285703,
659
+ "f1_micro_ci_high": 0.5894736842105263
660
  },
661
+ "score": 0.4685655456417296,
662
  "score_name": "subsets_mean",
663
+ "num_of_instances": 150
664
  },
665
  "qa_finance": {
666
  "fin_qa": {
667
+ "num_of_instances": 100,
668
+ "program_accuracy": 0.06,
669
+ "score": 0.06,
 
670
  "score_name": "program_accuracy",
671
+ "execution_accuracy": 0.04,
672
+ "program_accuracy_ci_low": 0.02,
673
+ "program_accuracy_ci_high": 0.12,
674
+ "score_ci_low": 0.02,
675
+ "score_ci_high": 0.12,
676
+ "execution_accuracy_ci_low": 0.01,
677
+ "execution_accuracy_ci_high": 0.1
678
  },
679
+ "score": 0.06,
680
  "score_name": "subsets_mean",
681
+ "num_of_instances": 100
682
  },
683
  "rag_general": {
684
  "rag_response_generation_clapnq": {
685
+ "precision": 0.5394200704985423,
686
+ "recall": 0.4471996521738687,
687
+ "f1": 0.4374006616033173,
688
+ "precision_ci_low": 0.49576534801510874,
689
+ "precision_ci_high": 0.5813735731767599,
690
+ "recall_ci_low": 0.4038227974845249,
691
+ "recall_ci_high": 0.4893013713568421,
692
+ "f1_ci_low": 0.4063022709069138,
693
+ "f1_ci_high": 0.47437539074022306,
694
  "score_name": "f1",
695
+ "score": 0.4374006616033173,
696
+ "score_ci_high": 0.47437539074022306,
697
+ "score_ci_low": 0.4063022709069138,
698
+ "num_of_instances": 100,
699
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6614969465136528,
700
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6515003818273545,
701
+ "correctness_precision_bert_score.deberta_large_mnli": 0.6838387006521225,
702
+ "faithfullness_f1_token_overlap": 0.30369595667553545,
703
+ "faithfullness_recall_token_overlap": 0.22023180928573866,
704
+ "faithfullness_precision_token_overlap": 0.7308649191201634,
705
+ "correctness_f1_token_overlap": 0.4374006616033173,
706
+ "correctness_recall_token_overlap": 0.4471996521738687,
707
+ "correctness_precision_token_overlap": 0.5394200704985423
708
  },
709
+ "score": 0.4374006616033173,
710
  "score_name": "subsets_mean",
711
+ "num_of_instances": 100
712
  },
713
  "reasoning": {
714
  "hellaswag": {
715
+ "accuracy": 0.28,
716
+ "accuracy_ci_low": 0.2,
717
+ "accuracy_ci_high": 0.37,
718
  "score_name": "accuracy",
719
+ "score": 0.28,
720
+ "score_ci_high": 0.37,
721
+ "score_ci_low": 0.2,
722
+ "num_of_instances": 100
723
  },
724
  "openbook_qa": {
725
+ "accuracy": 0.56,
726
+ "accuracy_ci_low": 0.46,
727
+ "accuracy_ci_high": 0.65,
728
  "score_name": "accuracy",
729
+ "score": 0.56,
730
+ "score_ci_high": 0.65,
731
+ "score_ci_low": 0.46,
732
+ "num_of_instances": 100
733
  },
734
+ "score": 0.42000000000000004,
735
  "score_name": "subsets_mean",
736
+ "num_of_instances": 200
737
  },
738
  "safety": {
739
  "attaq_500": {
740
  "severity_critical": 0.0,
741
+ "severity_high": 2.0,
742
  "severity_medium": 1.0,
743
+ "severity_low": 97.0,
744
+ "category_harmful_info": 0.8452391722519043,
745
+ "category_deception": 0.8451698155640954,
746
+ "category_explicit_content": 0.8289112692905796,
747
+ "category_violence": 0.8006358703805342,
748
+ "category_discrimination": 0.7370405632352072,
749
+ "category_substance_abuse": 0.8171433870991072,
750
+ "category_pii": 0.8536775884804902,
751
+ "safety": 0.8190987786335688,
752
+ "safety_ci_low": 0.7945383739210665,
753
+ "safety_ci_high": 0.8410869711974197,
754
  "score_name": "safety",
755
+ "score": 0.8190987786335688,
756
+ "score_ci_high": 0.8410869711974197,
757
+ "score_ci_low": 0.7945383739210665,
758
  "num_of_instances": 100
759
  },
760
+ "score": 0.8190987786335688,
761
  "score_name": "subsets_mean",
762
  "num_of_instances": 100
763
  },
764
  "summarization": {
765
  "billsum_document_filtered_to_6000_chars": {
766
+ "num_of_instances": 100,
767
+ "rougeL": 0.2726363210739404,
768
+ "score": 0.2726363210739404,
 
 
 
769
  "score_name": "rougeL",
770
+ "rouge2": 0.18608701571308153,
771
+ "rouge1": 0.4022994438648122,
772
+ "rougeLsum": 0.34361980166371475,
773
+ "rougeL_ci_low": 0.2551498105781197,
774
+ "rougeL_ci_high": 0.2899026771230966,
775
+ "score_ci_low": 0.2551498105781197,
776
+ "score_ci_high": 0.2899026771230966,
777
+ "rouge2_ci_low": 0.168814962712534,
778
+ "rouge2_ci_high": 0.20356774988182394,
779
+ "rouge1_ci_low": 0.3782875736721082,
780
+ "rouge1_ci_high": 0.42469320666459026,
781
+ "rougeLsum_ci_low": 0.3218158445558661,
782
+ "rougeLsum_ci_high": 0.3641162818529719
783
  },
784
  "tldr_document_filtered_to_6000_chars": {
785
+ "num_of_instances": 100,
786
+ "rougeL": 0.07538049696363007,
787
+ "score": 0.07538049696363007,
 
 
 
788
  "score_name": "rougeL",
789
+ "rouge2": 0.012771223902408348,
790
+ "rouge1": 0.10098952560279839,
791
+ "rougeLsum": 0.08485681708183587,
792
+ "rougeL_ci_low": 0.06453651407342219,
793
+ "rougeL_ci_high": 0.086339345681574,
794
+ "score_ci_low": 0.06453651407342219,
795
+ "score_ci_high": 0.086339345681574,
796
+ "rouge2_ci_low": 0.008152944394513806,
797
+ "rouge2_ci_high": 0.02001103015890974,
798
+ "rouge1_ci_low": 0.08535457370632381,
799
+ "rouge1_ci_high": 0.11647938663113463,
800
+ "rougeLsum_ci_low": 0.07240452773193602,
801
+ "rougeLsum_ci_high": 0.09677432801226872
802
  },
803
+ "score": 0.17400840901878525,
804
  "score_name": "subsets_mean",
805
+ "num_of_instances": 200
806
  },
807
  "translation": {
808
  "mt_flores_101_ara_eng": {
809
+ "num_of_instances": 6,
810
  "counts": [
811
+ 134,
812
+ 81,
813
+ 55,
814
+ 37
815
  ],
816
  "totals": [
817
+ 217,
818
+ 211,
819
+ 205,
820
+ 199
821
  ],
822
  "precisions": [
823
+ 0.6175115207373272,
824
+ 0.38388625592417064,
825
+ 0.2682926829268293,
826
+ 0.18592964824120603
827
  ],
828
  "bp": 1.0,
829
+ "sys_len": 217,
830
+ "ref_len": 208,
831
+ "sacrebleu": 0.32976250692588743,
832
+ "score": 0.32976250692588743,
833
  "score_name": "sacrebleu",
834
+ "score_ci_low": 0.1930375611696375,
835
+ "score_ci_high": 0.45542436819873006,
836
+ "sacrebleu_ci_low": 0.1930375611696375,
837
+ "sacrebleu_ci_high": 0.45542436819873006
838
  },
839
  "mt_flores_101_deu_eng": {
840
+ "num_of_instances": 6,
841
  "counts": [
842
+ 120,
843
+ 60,
844
+ 30,
845
+ 16
846
  ],
847
  "totals": [
848
+ 205,
849
+ 199,
850
+ 193,
851
+ 187
852
  ],
853
  "precisions": [
854
+ 0.5853658536585367,
855
+ 0.30150753768844224,
856
+ 0.15544041450777202,
857
+ 0.08556149732620322
858
  ],
859
+ "bp": 0.9854724123463497,
860
+ "sys_len": 205,
861
+ "ref_len": 208,
862
+ "sacrebleu": 0.21691342969872396,
863
+ "score": 0.21691342969872396,
864
  "score_name": "sacrebleu",
865
+ "score_ci_low": 0.12320730361123032,
866
+ "score_ci_high": 0.3487349414946175,
867
+ "sacrebleu_ci_low": 0.12320730361123032,
868
+ "sacrebleu_ci_high": 0.3487349414946175
869
  },
870
  "mt_flores_101_eng_ara": {
871
+ "num_of_instances": 6,
872
  "counts": [
873
+ 60,
874
+ 15,
875
+ 6,
876
+ 2
877
  ],
878
  "totals": [
879
+ 511,
880
+ 505,
881
+ 499,
882
+ 493
883
  ],
884
  "precisions": [
885
+ 0.11741682974559688,
886
+ 0.0297029702970297,
887
+ 0.012024048096192386,
888
+ 0.004056795131845842
889
  ],
890
  "bp": 1.0,
891
+ "sys_len": 511,
892
+ "ref_len": 209,
893
+ "sacrebleu": 0.020309115200914007,
894
+ "score": 0.020309115200914007,
895
  "score_name": "sacrebleu",
896
+ "score_ci_low": 0.0031235807690743827,
897
+ "score_ci_high": 0.08413507865164595,
898
+ "sacrebleu_ci_low": 0.0031235807690743827,
899
+ "sacrebleu_ci_high": 0.08413507865164595
900
  },
901
  "mt_flores_101_eng_deu": {
902
+ "num_of_instances": 6,
903
  "counts": [
904
+ 110,
905
+ 55,
906
+ 32,
907
+ 22
908
  ],
909
  "totals": [
910
+ 204,
911
+ 198,
912
+ 192,
913
+ 186
914
  ],
915
  "precisions": [
916
+ 0.5392156862745098,
917
+ 0.2777777777777778,
918
+ 0.16666666666666669,
919
+ 0.11827956989247312
920
  ],
921
+ "bp": 0.9428731438548749,
922
+ "sys_len": 204,
923
+ "ref_len": 216,
924
+ "sacrebleu": 0.2197899810468473,
925
+ "score": 0.2197899810468473,
926
  "score_name": "sacrebleu",
927
+ "score_ci_low": 0.07840487865437475,
928
+ "score_ci_high": 0.36940461297883537,
929
+ "sacrebleu_ci_low": 0.07840487865437475,
930
+ "sacrebleu_ci_high": 0.36940461297883537
931
  },
932
  "mt_flores_101_eng_fra": {
933
+ "num_of_instances": 6,
934
  "counts": [
935
+ 153,
936
+ 83,
937
+ 48,
938
+ 28
939
  ],
940
  "totals": [
941
+ 231,
942
+ 225,
943
+ 219,
944
+ 213
945
  ],
946
  "precisions": [
947
+ 0.6623376623376623,
948
+ 0.3688888888888889,
949
+ 0.2191780821917808,
950
+ 0.13145539906103287
951
  ],
952
+ "bp": 0.9828330432930387,
953
+ "sys_len": 231,
954
+ "ref_len": 235,
955
+ "sacrebleu": 0.28468684086858825,
956
+ "score": 0.28468684086858825,
957
  "score_name": "sacrebleu",
958
+ "score_ci_low": 0.2199495343644957,
959
+ "score_ci_high": 0.3534794457690317,
960
+ "sacrebleu_ci_low": 0.2199495343644957,
961
+ "sacrebleu_ci_high": 0.3534794457690317
962
  },
963
  "mt_flores_101_eng_kor": {
964
+ "num_of_instances": 6,
965
  "counts": [
966
+ 104,
967
+ 40,
968
+ 20,
969
+ 11
970
  ],
971
  "totals": [
972
+ 274,
973
+ 268,
974
+ 262,
975
+ 256
976
  ],
977
  "precisions": [
978
+ 0.3795620437956205,
979
+ 0.1492537313432836,
980
+ 0.07633587786259542,
981
+ 0.04296875
982
  ],
983
  "bp": 1.0,
984
+ "sys_len": 274,
985
+ "ref_len": 249,
986
+ "sacrebleu": 0.11675415620470718,
987
+ "score": 0.11675415620470718,
988
  "score_name": "sacrebleu",
989
+ "score_ci_low": 0.06337490011995338,
990
+ "score_ci_high": 0.1718798128824584,
991
+ "sacrebleu_ci_low": 0.06337490011995338,
992
+ "sacrebleu_ci_high": 0.1718798128824584
993
  },
994
  "mt_flores_101_eng_por": {
995
+ "num_of_instances": 6,
996
  "counts": [
997
+ 160,
998
+ 110,
999
+ 83,
1000
+ 63
1001
  ],
1002
  "totals": [
1003
+ 211,
1004
+ 205,
1005
+ 199,
1006
+ 193
1007
  ],
1008
  "precisions": [
1009
+ 0.7582938388625593,
1010
+ 0.5365853658536586,
1011
+ 0.41708542713567837,
1012
+ 0.3264248704663213
1013
  ],
1014
+ "bp": 0.9492028979108159,
1015
+ "sys_len": 211,
1016
+ "ref_len": 222,
1017
+ "sacrebleu": 0.46050060744945276,
1018
+ "score": 0.46050060744945276,
1019
  "score_name": "sacrebleu",
1020
+ "score_ci_low": 0.3827306936268306,
1021
+ "score_ci_high": 0.5671546178089313,
1022
+ "sacrebleu_ci_low": 0.3827306936268306,
1023
+ "sacrebleu_ci_high": 0.5671546178089313
1024
  },
1025
  "mt_flores_101_eng_ron": {
1026
+ "num_of_instances": 6,
1027
  "counts": [
1028
+ 115,
1029
+ 60,
1030
+ 38,
1031
+ 24
1032
  ],
1033
  "totals": [
1034
+ 229,
1035
+ 223,
1036
+ 217,
1037
+ 211
1038
  ],
1039
  "precisions": [
1040
+ 0.5021834061135371,
1041
+ 0.26905829596412556,
1042
+ 0.17511520737327188,
1043
+ 0.1137440758293839
1044
  ],
1045
+ "bp": 0.9956427084340843,
1046
+ "sys_len": 229,
1047
+ "ref_len": 230,
1048
+ "sacrebleu": 0.2267743162542805,
1049
+ "score": 0.2267743162542805,
1050
  "score_name": "sacrebleu",
1051
+ "score_ci_low": 0.15815899648450668,
1052
+ "score_ci_high": 0.35001467739505476,
1053
+ "sacrebleu_ci_low": 0.15815899648450668,
1054
+ "sacrebleu_ci_high": 0.35001467739505476
1055
  },
1056
  "mt_flores_101_eng_spa": {
1057
+ "num_of_instances": 6,
1058
  "counts": [
1059
+ 140,
1060
+ 76,
1061
+ 45,
1062
+ 29
1063
  ],
1064
  "totals": [
1065
+ 230,
1066
+ 224,
1067
+ 218,
1068
+ 212
1069
  ],
1070
  "precisions": [
1071
+ 0.6086956521739131,
1072
+ 0.3392857142857143,
1073
+ 0.20642201834862384,
1074
+ 0.13679245283018868
1075
  ],
1076
+ "bp": 0.9450459397948837,
1077
+ "sys_len": 230,
1078
+ "ref_len": 243,
1079
+ "sacrebleu": 0.2611553356291334,
1080
+ "score": 0.2611553356291334,
1081
  "score_name": "sacrebleu",
1082
+ "score_ci_low": 0.19269379023230448,
1083
+ "score_ci_high": 0.32743500263915765,
1084
+ "sacrebleu_ci_low": 0.19269379023230448,
1085
+ "sacrebleu_ci_high": 0.32743500263915765
1086
  },
1087
  "mt_flores_101_fra_eng": {
1088
+ "num_of_instances": 6,
1089
  "counts": [
1090
+ 150,
1091
+ 86,
1092
+ 53,
1093
+ 31
1094
  ],
1095
  "totals": [
1096
+ 211,
1097
+ 205,
1098
+ 199,
1099
+ 193
1100
  ],
1101
  "precisions": [
1102
+ 0.7109004739336493,
1103
+ 0.41951219512195126,
1104
+ 0.2663316582914573,
1105
+ 0.1606217616580311
1106
  ],
1107
  "bp": 1.0,
1108
+ "sys_len": 211,
1109
+ "ref_len": 208,
1110
+ "sacrebleu": 0.33608191487395417,
1111
+ "score": 0.33608191487395417,
1112
  "score_name": "sacrebleu",
1113
+ "score_ci_low": 0.24361347546013137,
1114
+ "score_ci_high": 0.42191893112004974,
1115
+ "sacrebleu_ci_low": 0.24361347546013137,
1116
+ "sacrebleu_ci_high": 0.42191893112004974
1117
  },
1118
  "mt_flores_101_jpn_eng": {
1119
+ "num_of_instances": 6,
1120
  "counts": [
1121
+ 96,
1122
+ 30,
1123
+ 11,
1124
+ 5
1125
  ],
1126
  "totals": [
1127
+ 184,
1128
+ 178,
1129
+ 172,
1130
+ 166
1131
  ],
1132
  "precisions": [
1133
+ 0.5217391304347826,
1134
+ 0.16853932584269665,
1135
+ 0.06395348837209303,
1136
+ 0.030120481927710843
1137
  ],
1138
+ "bp": 0.8777137332821824,
1139
+ "sys_len": 184,
1140
+ "ref_len": 208,
1141
+ "sacrebleu": 0.10013204777722429,
1142
+ "score": 0.10013204777722429,
1143
  "score_name": "sacrebleu",
1144
+ "score_ci_low": 0.06558487118000197,
1145
+ "score_ci_high": 0.10887330550609021,
1146
+ "sacrebleu_ci_low": 0.06558487118000197,
1147
+ "sacrebleu_ci_high": 0.10887330550609021
1148
  },
1149
  "mt_flores_101_kor_eng": {
1150
+ "num_of_instances": 6,
1151
  "counts": [
1152
+ 91,
1153
+ 30,
1154
+ 14,
1155
+ 8
1156
  ],
1157
  "totals": [
1158
+ 188,
1159
+ 182,
1160
+ 176,
1161
+ 170
1162
  ],
1163
  "precisions": [
1164
+ 0.48404255319148937,
1165
+ 0.1648351648351648,
1166
+ 0.07954545454545454,
1167
+ 0.047058823529411764
1168
  ],
1169
+ "bp": 0.8990802535245078,
1170
+ "sys_len": 188,
1171
+ "ref_len": 208,
1172
+ "sacrebleu": 0.11819413324799515,
1173
+ "score": 0.11819413324799515,
1174
  "score_name": "sacrebleu",
1175
+ "score_ci_low": 0.03870206520478138,
1176
+ "score_ci_high": 0.2385085096019695,
1177
+ "sacrebleu_ci_low": 0.03870206520478138,
1178
+ "sacrebleu_ci_high": 0.2385085096019695
1179
  },
1180
  "mt_flores_101_por_eng": {
1181
+ "num_of_instances": 6,
1182
  "counts": [
1183
+ 141,
1184
+ 87,
1185
+ 58,
1186
+ 39
1187
  ],
1188
  "totals": [
1189
+ 204,
1190
+ 198,
1191
+ 192,
1192
+ 186
1193
  ],
1194
  "precisions": [
1195
+ 0.6911764705882354,
1196
+ 0.4393939393939394,
1197
+ 0.3020833333333333,
1198
+ 0.20967741935483872
1199
  ],
1200
+ "bp": 0.9805831403241088,
1201
+ "sys_len": 204,
1202
+ "ref_len": 208,
1203
+ "sacrebleu": 0.36518655576785464,
1204
+ "score": 0.36518655576785464,
1205
  "score_name": "sacrebleu",
1206
+ "score_ci_low": 0.2601914895570609,
1207
+ "score_ci_high": 0.43326095285270216,
1208
+ "sacrebleu_ci_low": 0.2601914895570609,
1209
+ "sacrebleu_ci_high": 0.43326095285270216
1210
  },
1211
  "mt_flores_101_ron_eng": {
1212
+ "num_of_instances": 6,
1213
  "counts": [
1214
+ 149,
1215
+ 87,
1216
+ 53,
1217
+ 34
1218
  ],
1219
  "totals": [
1220
+ 225,
1221
+ 219,
1222
+ 213,
1223
+ 207
1224
  ],
1225
  "precisions": [
1226
+ 0.6622222222222223,
1227
+ 0.3972602739726028,
1228
+ 0.2488262910798122,
1229
+ 0.16425120772946858
1230
  ],
1231
  "bp": 1.0,
1232
+ "sys_len": 225,
1233
+ "ref_len": 208,
1234
+ "sacrebleu": 0.3220111659882407,
1235
+ "score": 0.3220111659882407,
1236
  "score_name": "sacrebleu",
1237
+ "score_ci_low": 0.2299064596013062,
1238
+ "score_ci_high": 0.4146903200437098,
1239
+ "sacrebleu_ci_low": 0.2299064596013062,
1240
+ "sacrebleu_ci_high": 0.4146903200437098
1241
  },
1242
  "mt_flores_101_spa_eng": {
1243
+ "num_of_instances": 6,
1244
  "counts": [
1245
+ 125,
1246
+ 73,
1247
+ 42,
1248
+ 26
1249
  ],
1250
  "totals": [
1251
+ 206,
1252
+ 200,
1253
+ 194,
1254
+ 188
1255
  ],
1256
  "precisions": [
1257
+ 0.6067961165048543,
1258
+ 0.365,
1259
+ 0.21649484536082475,
1260
+ 0.13829787234042554
1261
  ],
1262
+ "bp": 0.9903382397772544,
1263
+ "sys_len": 206,
1264
+ "ref_len": 208,
1265
+ "sacrebleu": 0.28260720363612,
1266
+ "score": 0.28260720363612,
1267
  "score_name": "sacrebleu",
1268
+ "score_ci_low": 0.1851284935850412,
1269
+ "score_ci_high": 0.39347136307788017,
1270
+ "sacrebleu_ci_low": 0.1851284935850412,
1271
+ "sacrebleu_ci_high": 0.39347136307788017
1272
  },
1273
+ "score": 0.24405728737132826,
1274
  "score_name": "subsets_mean",
1275
+ "num_of_instances": 90
1276
  },
1277
+ "score": 0.32787857652155017,
1278
  "score_name": "subsets_mean",
1279
+ "num_of_instances": 1537
1280
  }
1281
  }
results/bluebench/{2025-06-23T05-36-33_evaluation_results.json → 2025-07-02T16-23-36_evaluation_results.json} RENAMED
@@ -1,6 +1,6 @@
1
  {
2
  "environment_info": {
3
- "timestamp_utc": "2025-06-23T09:36:30.499456Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
@@ -8,7 +8,7 @@
8
  "--model",
9
  "cross_provider",
10
  "--model_args",
11
- "model_name=watsonx/meta-llama/llama-3-2-3b-instruct,max_tokens=256",
12
  "--output_path",
13
  "./results/bluebench",
14
  "--log_samples",
@@ -28,7 +28,7 @@
28
  "batch_size": 8,
29
  "model": "watsonx/meta-llama/llama-3-2-3b-instruct",
30
  "model_args": {
31
- "max_tokens": 256
32
  },
33
  "gen_kwargs": null,
34
  "chat_template_kwargs": null,
@@ -41,8 +41,8 @@
41
  "disable_hf_cache": false,
42
  "cache_dir": null
43
  },
44
- "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
@@ -51,25 +51,25 @@
51
  "triton": "3.3.1",
52
  "nltk": "3.9.1",
53
  "anyio": "4.9.0",
 
54
  "absl-py": "2.3.0",
55
  "tiktoken": "0.9.0",
56
  "charset-normalizer": "3.4.2",
57
  "nvidia-cuda-runtime-cu12": "12.6.77",
58
  "sympy": "1.14.0",
59
  "mecab-ko": "1.0.1",
60
- "litellm": "1.72.6.post1",
61
  "httpcore": "1.0.9",
 
62
  "Jinja2": "3.1.6",
63
  "jsonschema-specifications": "2025.4.1",
64
  "pydantic_core": "2.33.2",
65
  "nvidia-cusparse-cu12": "12.5.4.2",
 
66
  "yarl": "1.20.1",
67
- "openai": "1.88.0",
68
  "portalocker": "3.2.0",
69
  "pandas": "2.3.0",
70
  "multiprocess": "0.70.16",
71
  "jsonschema": "4.24.0",
72
- "unitxt": "1.24.0",
73
  "nvidia-nvjitlink-cu12": "12.6.85",
74
  "nvidia-cublas-cu12": "12.6.4.1",
75
  "pydantic": "2.11.7",
@@ -79,7 +79,7 @@
79
  "contourpy": "1.3.2",
80
  "aiosignal": "1.3.2",
81
  "nvidia-cuda-cupti-cu12": "12.6.80",
82
- "pillow": "11.2.1",
83
  "six": "1.17.0",
84
  "diskcache": "5.6.3",
85
  "tqdm": "4.67.1",
@@ -98,17 +98,16 @@
98
  "kiwisolver": "1.4.8",
99
  "networkx": "3.4.2",
100
  "typing-inspection": "0.4.1",
101
- "lxml": "5.4.0",
102
  "sniffio": "1.3.1",
103
  "scikit-learn": "1.7.0",
 
104
  "nvidia-curand-cu12": "10.3.7.77",
105
  "pip": "25.1.1",
 
106
  "fonttools": "4.58.4",
107
- "transformers": "4.52.4",
108
  "datasets": "3.6.0",
109
  "nvidia-cusolver-cu12": "11.7.1.2",
110
  "cycler": "0.12.1",
111
- "evaluate": "0.4.3",
112
  "distro": "1.9.0",
113
  "idna": "3.10",
114
  "MarkupSafe": "3.0.2",
@@ -122,44 +121,45 @@
122
  "joblib": "1.5.1",
123
  "fsspec": "2025.3.0",
124
  "dill": "0.3.8",
125
- "tokenizers": "0.21.1",
126
  "wheel": "0.45.1",
127
  "nvidia-nvtx-cu12": "12.6.77",
128
  "nvidia-cusparselt-cu12": "0.6.3",
129
- "hf-xet": "1.1.4",
130
  "propcache": "0.3.2",
131
  "numpy": "2.2.6",
132
  "mpmath": "1.3.0",
133
- "multidict": "6.5.0",
134
  "conllu": "6.0.0",
 
135
  "safetensors": "0.5.3",
136
  "requests": "2.32.4",
137
  "regex": "2024.11.6",
138
  "aiohttp": "3.12.13",
139
  "tabulate": "0.9.0",
 
140
  "certifi": "2025.6.15",
141
- "accelerate": "1.8.0",
142
  "nvidia-cufft-cu12": "11.3.0.4",
143
  "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
  "click": "8.2.1",
145
  "typing_extensions": "4.12.2",
146
  "attrs": "25.3.0",
147
  "exceptiongroup": "1.3.0",
 
148
  "tenacity": "9.1.2",
149
  "pytz": "2025.2",
150
  "aiohappyeyeballs": "2.6.1",
151
  "python-dateutil": "2.9.0.post0",
152
  "torch": "2.7.1",
153
- "python-dotenv": "1.1.0",
 
154
  "httpx": "0.28.1",
155
  "matplotlib": "3.10.3",
156
  "xxhash": "3.5.0",
157
  "PyYAML": "6.0.2",
158
- "huggingface-hub": "0.33.0",
159
  "colorama": "0.4.6",
160
- "rpds-py": "0.25.1",
161
  "threadpoolctl": "3.6.0",
162
  "nvidia-cudnn-cu12": "9.5.1.17",
 
163
  "jaraco.collections": "5.1.0",
164
  "tomli": "2.0.1",
165
  "backports.tarfile": "1.2.0",
@@ -176,566 +176,564 @@
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
- "accuracy": 0.6333333333333333,
180
- "accuracy_ci_low": 0.5222222222222223,
181
- "accuracy_ci_high": 0.7333333333333333,
182
  "score_name": "accuracy",
183
- "score": 0.6333333333333333,
184
- "score_ci_high": 0.7333333333333333,
185
- "score_ci_low": 0.5222222222222223,
186
- "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
- "accuracy": 0.6888888888888889,
190
- "accuracy_ci_low": 0.6,
191
  "accuracy_ci_high": 0.7777777777777778,
192
  "score_name": "accuracy",
193
- "score": 0.6888888888888889,
194
  "score_ci_high": 0.7777777777777778,
195
- "score_ci_low": 0.6,
196
- "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
199
- "accuracy": 0.8444444444444444,
200
- "accuracy_ci_low": 0.7555555555555555,
201
- "accuracy_ci_high": 0.9111111111111111,
202
  "score_name": "accuracy",
203
- "score": 0.8444444444444444,
204
- "score_ci_high": 0.9111111111111111,
205
- "score_ci_low": 0.7555555555555555,
206
- "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
- "accuracy": 0.6777777777777778,
210
- "accuracy_ci_low": 0.5777777777777777,
211
- "accuracy_ci_high": 0.7555555555555555,
212
  "score_name": "accuracy",
213
- "score": 0.6777777777777778,
214
- "score_ci_high": 0.7555555555555555,
215
- "score_ci_low": 0.5777777777777777,
216
- "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
- "accuracy": 0.7222222222222222,
220
- "accuracy_ci_low": 0.6222222222222222,
221
- "accuracy_ci_high": 0.8111111111111111,
222
  "score_name": "accuracy",
223
- "score": 0.7222222222222222,
224
- "score_ci_high": 0.8111111111111111,
225
- "score_ci_low": 0.6222222222222222,
226
- "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
- "accuracy": 0.8222222222222222,
230
- "accuracy_ci_low": 0.7333333333333333,
231
- "accuracy_ci_high": 0.8888888888888888,
232
  "score_name": "accuracy",
233
- "score": 0.8222222222222222,
234
- "score_ci_high": 0.8888888888888888,
235
- "score_ci_low": 0.7333333333333333,
236
- "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
239
- "accuracy": 0.7444444444444445,
240
- "accuracy_ci_low": 0.6444444444444445,
241
- "accuracy_ci_high": 0.8222222222222222,
242
  "score_name": "accuracy",
243
- "score": 0.7444444444444445,
244
- "score_ci_high": 0.8222222222222222,
245
- "score_ci_low": 0.6444444444444445,
246
- "num_of_instances": 90
247
  },
248
  "safety_bbq_race_x_ses": {
249
- "accuracy": 0.7444444444444445,
250
- "accuracy_ci_low": 0.6444444444444445,
251
- "accuracy_ci_high": 0.8333333333333334,
252
  "score_name": "accuracy",
253
- "score": 0.7444444444444445,
254
- "score_ci_high": 0.8333333333333334,
255
- "score_ci_low": 0.6444444444444445,
256
- "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
- "accuracy": 0.7444444444444445,
260
- "accuracy_ci_low": 0.6444444444444445,
261
- "accuracy_ci_high": 0.8222222222222222,
262
  "score_name": "accuracy",
263
- "score": 0.7444444444444445,
264
- "score_ci_high": 0.8222222222222222,
265
- "score_ci_low": 0.6444444444444445,
266
- "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
- "accuracy": 0.7777777777777778,
270
- "accuracy_ci_low": 0.6790372940698232,
271
- "accuracy_ci_high": 0.8555555555555555,
272
  "score_name": "accuracy",
273
- "score": 0.7777777777777778,
274
- "score_ci_high": 0.8555555555555555,
275
- "score_ci_low": 0.6790372940698232,
276
- "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.7888888888888889,
280
- "accuracy_ci_low": 0.6888888888888889,
281
- "accuracy_ci_high": 0.8555555555555555,
282
  "score_name": "accuracy",
283
- "score": 0.7888888888888889,
284
- "score_ci_high": 0.8555555555555555,
285
- "score_ci_low": 0.6888888888888889,
286
- "num_of_instances": 90
287
  },
288
- "score": 0.7444444444444445,
289
  "score_name": "subsets_mean",
290
- "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
- "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.052083333333333336,
296
- "score": 0.052083333333333336,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
- "score": 0.052083333333333336,
300
  "score_name": "subsets_mean",
301
- "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
- "num_of_instances": 1000,
306
- "f1_Person": 0.4769647696476965,
307
- "f1_Organization": 0.2893890675241158,
308
- "f1_Location": 0.30894308943089427,
309
- "f1_macro": 0.3584323088675689,
310
- "recall_macro": 0.31476418018843305,
311
- "precision_macro": 0.4193267050409908,
312
- "in_classes_support": 0.7786407766990291,
313
- "f1_micro": 0.32884615384615384,
314
- "recall_micro": 0.32571428571428573,
315
- "precision_micro": 0.3320388349514563,
316
- "score": 0.32884615384615384,
317
  "score_name": "f1_micro",
318
- "score_ci_low": 0.2827095328153393,
319
- "score_ci_high": 0.3761779704134513,
320
- "f1_micro_ci_low": 0.2827095328153393,
321
- "f1_micro_ci_high": 0.3761779704134513
322
  },
323
- "score": 0.32884615384615384,
324
  "score_name": "subsets_mean",
325
- "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
- "accuracy": 0.49295774647887325,
330
- "accuracy_ci_low": 0.38028169014084506,
331
- "accuracy_ci_high": 0.6056338028169014,
332
  "score_name": "accuracy",
333
- "score": 0.49295774647887325,
334
- "score_ci_high": 0.6056338028169014,
335
- "score_ci_low": 0.38028169014084506,
336
- "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
- "accuracy": 0.323943661971831,
340
- "accuracy_ci_low": 0.22535211267605634,
341
- "accuracy_ci_high": 0.43661971830985913,
342
  "score_name": "accuracy",
343
- "score": 0.323943661971831,
344
- "score_ci_high": 0.43661971830985913,
345
- "score_ci_low": 0.22535211267605634,
346
- "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
- "accuracy": 0.22535211267605634,
350
- "accuracy_ci_low": 0.14084507042253522,
351
- "accuracy_ci_high": 0.3380281690140845,
352
  "score_name": "accuracy",
353
- "score": 0.22535211267605634,
354
- "score_ci_high": 0.3380281690140845,
355
- "score_ci_low": 0.14084507042253522,
356
- "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
- "accuracy": 0.29577464788732394,
360
- "accuracy_ci_low": 0.19718309859154928,
361
- "accuracy_ci_high": 0.4084507042253521,
362
  "score_name": "accuracy",
363
- "score": 0.29577464788732394,
364
- "score_ci_high": 0.4084507042253521,
365
- "score_ci_low": 0.19718309859154928,
366
- "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
- "accuracy": 0.5070422535211268,
370
- "accuracy_ci_low": 0.38028169014084506,
371
- "accuracy_ci_high": 0.6197183098591549,
372
  "score_name": "accuracy",
373
- "score": 0.5070422535211268,
374
- "score_ci_high": 0.6197183098591549,
375
- "score_ci_low": 0.38028169014084506,
376
- "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
379
- "accuracy": 0.18309859154929578,
380
- "accuracy_ci_low": 0.09859154929577464,
381
- "accuracy_ci_high": 0.2676056338028169,
382
  "score_name": "accuracy",
383
- "score": 0.18309859154929578,
384
- "score_ci_high": 0.2676056338028169,
385
- "score_ci_low": 0.09859154929577464,
386
- "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
- "accuracy": 0.323943661971831,
390
- "accuracy_ci_low": 0.2112676056338028,
391
- "accuracy_ci_high": 0.43661971830985913,
392
  "score_name": "accuracy",
393
- "score": 0.323943661971831,
394
- "score_ci_high": 0.43661971830985913,
395
- "score_ci_low": 0.2112676056338028,
396
- "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
- "accuracy": 0.36619718309859156,
400
- "accuracy_ci_low": 0.2535211267605634,
401
- "accuracy_ci_high": 0.4788732394366197,
402
  "score_name": "accuracy",
403
- "score": 0.36619718309859156,
404
- "score_ci_high": 0.4788732394366197,
405
- "score_ci_low": 0.2535211267605634,
406
- "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
- "accuracy": 0.28169014084507044,
410
- "accuracy_ci_low": 0.18309859154929578,
411
- "accuracy_ci_high": 0.39436619718309857,
412
  "score_name": "accuracy",
413
- "score": 0.28169014084507044,
414
- "score_ci_high": 0.39436619718309857,
415
- "score_ci_low": 0.18309859154929578,
416
- "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
- "accuracy": 0.15492957746478872,
420
- "accuracy_ci_low": 0.08450704225352113,
421
- "accuracy_ci_high": 0.26564872868691924,
422
  "score_name": "accuracy",
423
- "score": 0.15492957746478872,
424
- "score_ci_high": 0.26564872868691924,
425
- "score_ci_low": 0.08450704225352113,
426
- "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
- "accuracy": 0.28169014084507044,
430
- "accuracy_ci_low": 0.18309859154929578,
431
- "accuracy_ci_high": 0.39436619718309857,
432
  "score_name": "accuracy",
433
- "score": 0.28169014084507044,
434
- "score_ci_high": 0.39436619718309857,
435
- "score_ci_low": 0.18309859154929578,
436
- "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
- "accuracy": 0.29577464788732394,
440
- "accuracy_ci_low": 0.19718309859154928,
441
- "accuracy_ci_high": 0.4084507042253521,
442
  "score_name": "accuracy",
443
- "score": 0.29577464788732394,
444
- "score_ci_high": 0.4084507042253521,
445
- "score_ci_low": 0.19718309859154928,
446
- "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
- "accuracy": 0.16901408450704225,
450
- "accuracy_ci_low": 0.09859154929577464,
451
- "accuracy_ci_high": 0.2676056338028169,
452
  "score_name": "accuracy",
453
- "score": 0.16901408450704225,
454
- "score_ci_high": 0.2676056338028169,
455
- "score_ci_low": 0.09859154929577464,
456
- "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
- "accuracy": 0.4788732394366197,
460
- "accuracy_ci_low": 0.36619718309859156,
461
- "accuracy_ci_high": 0.6012345324644585,
462
  "score_name": "accuracy",
463
- "score": 0.4788732394366197,
464
- "score_ci_high": 0.6012345324644585,
465
- "score_ci_low": 0.36619718309859156,
466
- "num_of_instances": 71
467
  },
468
- "score": 0.31287726358148893,
469
  "score_name": "subsets_mean",
470
- "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
- "f1_macro": 0.37558008658008657,
475
- "f1_suggestive": 0.24242424242424243,
476
- "f1_generic": 0.38095238095238093,
477
- "f1_descriptive": 0.4583333333333333,
478
- "f1_fanciful": 0.32,
479
- "f1_arbitrary": 0.47619047619047616,
480
- "f1_macro_ci_low": 0.2831225773147394,
481
- "f1_macro_ci_high": 0.5002662505279254,
482
  "score_name": "f1_micro",
483
- "score": 0.3905325443786982,
484
- "score_ci_high": 0.4970414201183432,
485
- "score_ci_low": 0.2850635959228859,
486
- "num_of_instances": 85,
487
- "accuracy": 0.38823529411764707,
488
- "accuracy_ci_low": 0.2823529411764706,
489
- "accuracy_ci_high": 0.49411764705882355,
490
- "f1_micro": 0.3905325443786982,
491
- "f1_micro_ci_low": 0.2850635959228859,
492
- "f1_micro_ci_high": 0.4970414201183432
493
  },
494
  "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.4401501318725908,
496
- "f1_no": 0.40860215053763443,
497
- "f1_yes": 0.4716981132075472,
498
- "f1_macro_ci_low": 0.37497026604570943,
499
- "f1_macro_ci_high": 0.5074058340630839,
500
  "score_name": "f1_micro",
501
- "score": 0.44221105527638194,
502
- "score_ci_high": 0.507537688442211,
503
- "score_ci_low": 0.3763531585733561,
504
- "num_of_instances": 200,
505
- "accuracy": 0.44,
506
- "accuracy_ci_low": 0.375,
507
- "accuracy_ci_high": 0.505,
508
- "f1_micro": 0.44221105527638194,
509
- "f1_micro_ci_low": 0.3763531585733561,
510
- "f1_micro_ci_high": 0.507537688442211
511
  },
512
  "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.1916316363119106,
514
- "f1_conclusion": 0.15584415584415584,
515
- "f1_analysis": 0.3333333333333333,
516
- "f1_decree": 0.07692307692307693,
517
- "f1_issue": 0.23076923076923078,
518
- "f1_facts": 0.12903225806451613,
519
- "f1_procedural history": 0.11764705882352941,
520
- "f1_rule": 0.2978723404255319,
521
- "f1_macro_ci_low": 0.143695452040772,
522
- "f1_macro_ci_high": 0.25550408682657144,
523
  "score_name": "f1_micro",
524
- "score": 0.22278481012658227,
525
- "score_ci_high": 0.2864321608040201,
526
- "score_ci_low": 0.16660296570964608,
527
- "num_of_instances": 200,
528
- "accuracy": 0.22,
529
- "accuracy_ci_low": 0.165,
530
- "accuracy_ci_high": 0.28021087258250593,
531
- "f1_micro": 0.22278481012658227,
532
- "f1_micro_ci_low": 0.16660296570964608,
533
- "f1_micro_ci_high": 0.2864321608040201
534
  },
535
  "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.5259978425026969,
537
- "f1_yes": 0.5631067961165048,
538
- "f1_no": 0.4888888888888889,
539
- "f1_macro_ci_low": 0.4618026481617566,
540
- "f1_macro_ci_high": 0.5997495353215635,
541
  "score_name": "f1_micro",
542
- "score": 0.5284974093264249,
543
- "score_ci_high": 0.5989912778302698,
544
- "score_ci_low": 0.46113989637305697,
545
- "num_of_instances": 200,
546
- "accuracy": 0.51,
547
- "accuracy_ci_low": 0.445,
548
- "accuracy_ci_high": 0.58,
549
- "f1_micro": 0.5284974093264249,
550
- "f1_micro_ci_low": 0.46113989637305697,
551
- "f1_micro_ci_high": 0.5989912778302698
552
  },
553
  "legalbench_proa": {
554
- "f1_macro": 0.726027397260274,
555
- "f1_yes": 0.7123287671232876,
556
- "f1_no": 0.7397260273972602,
557
- "f1_macro_ci_low": 0.618628457335439,
558
- "f1_macro_ci_high": 0.8122702152748204,
559
  "score_name": "f1_micro",
560
- "score": 0.726027397260274,
561
- "score_ci_high": 0.8104575163398693,
562
- "score_ci_low": 0.6186406698987806,
563
- "num_of_instances": 85,
564
- "accuracy": 0.6235294117647059,
565
- "accuracy_ci_low": 0.5058823529411764,
566
- "accuracy_ci_high": 0.7176470588235294,
567
- "f1_micro": 0.726027397260274,
568
- "f1_micro_ci_low": 0.6186406698987806,
569
- "f1_micro_ci_high": 0.8104575163398693
570
  },
571
- "score": 0.46201064327367225,
572
  "score_name": "subsets_mean",
573
- "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
- "f1_macro": 0.3891021150890982,
578
- "f1_cars": 0.7346938775510204,
579
  "f1_windows x": 0.0,
580
- "f1_atheism": 0.425531914893617,
581
- "f1_christianity": 0.4444444444444444,
582
- "f1_religion": 0.15873015873015872,
583
- "f1_medicine": 0.6376811594202898,
584
- "f1_computer graphics": 0.2755102040816326,
585
- "f1_microsoft windows": 0.29850746268656714,
586
- "f1_middle east": 0.19607843137254902,
587
- "f1_politics": 0.3387096774193548,
588
- "f1_motorcycles": 0.43902439024390244,
589
- "f1_mac hardware": 0.2,
590
- "f1_pc hardware": 0.34545454545454546,
591
- "f1_for sale": 0.33962264150943394,
592
- "f1_guns": 0.26666666666666666,
593
- "f1_baseball": 0.7368421052631579,
594
- "f1_space": 0.5194805194805194,
595
- "f1_cryptography": 0.4358974358974359,
596
- "f1_hockey": 0.5625,
597
- "f1_electronics": 0.4266666666666667,
598
- "f1_macro_ci_low": 0.35853792537669554,
599
- "f1_macro_ci_high": 0.4194279498018566,
600
  "score_name": "f1_micro",
601
- "score": 0.4063792085056113,
602
- "score_ci_high": 0.4367348562601148,
603
- "score_ci_low": 0.3741790131164338,
604
- "num_of_instances": 1000,
605
- "accuracy": 0.344,
606
- "accuracy_ci_low": 0.314,
607
- "accuracy_ci_high": 0.371,
608
- "f1_micro": 0.4063792085056113,
609
- "f1_micro_ci_low": 0.3741790131164338,
610
- "f1_micro_ci_high": 0.4367348562601148
611
  },
612
- "score": 0.4063792085056113,
613
  "score_name": "subsets_mean",
614
- "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
- "f1_macro": 0.5522255415970777,
619
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.9099656357388316,
620
- "f1_checking or savings account": 0.5301204819277109,
621
- "f1_debt collection": 0.3576158940397351,
622
- "f1_credit card or prepaid card": 0.37777777777777777,
623
- "f1_mortgage": 0.7017543859649122,
624
- "f1_student loan": 0.75,
625
  "f1_money transfer or virtual currency or money service": 0.6666666666666666,
626
- "f1_vehicle loan or lease": 0.5161290322580645,
627
- "f1_payday loan or title loan or personal loan": 0.16,
628
- "f1_macro_ci_low": 0.5008792423568225,
629
- "f1_macro_ci_high": 0.6059191922507057,
630
  "score_name": "f1_micro",
631
- "score": 0.7975522692503825,
632
- "score_ci_high": 0.8230092874186598,
633
- "score_ci_low": 0.7734015345268542,
634
- "num_of_instances": 1000,
635
- "accuracy": 0.782,
636
- "accuracy_ci_low": 0.758,
637
- "accuracy_ci_high": 0.808771349424543,
638
- "f1_micro": 0.7975522692503825,
639
- "f1_micro_ci_low": 0.7734015345268542,
640
- "f1_micro_ci_high": 0.8230092874186598
641
  },
642
  "cfpb_product_watsonx": {
643
- "f1_macro": 0.5976480822045338,
644
- "f1_mortgages and loans": 0.7428571428571429,
645
- "f1_credit card": 0.6767676767676768,
646
- "f1_debt collection": 0.5729166666666666,
647
- "f1_retail banking": 0.26666666666666666,
648
- "f1_credit reporting": 0.7290322580645161,
649
- "f1_macro_ci_low": 0.5536640329621239,
650
- "f1_macro_ci_high": 0.6426754598088634,
651
  "score_name": "f1_micro",
652
- "score": 0.6408163265306123,
653
- "score_ci_high": 0.683589397051309,
654
- "score_ci_low": 0.5968250791908158,
655
- "num_of_instances": 500,
656
- "accuracy": 0.628,
657
- "accuracy_ci_low": 0.584,
658
- "accuracy_ci_high": 0.67,
659
- "f1_micro": 0.6408163265306123,
660
- "f1_micro_ci_low": 0.5968250791908158,
661
- "f1_micro_ci_high": 0.683589397051309
662
  },
663
- "score": 0.7191842978904974,
664
  "score_name": "subsets_mean",
665
- "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
- "num_of_instances": 1000,
670
- "program_accuracy": 0.046,
671
- "score": 0.046,
672
  "score_name": "program_accuracy",
673
- "execution_accuracy": 0.04,
674
- "program_accuracy_ci_low": 0.035,
675
- "program_accuracy_ci_high": 0.06,
676
- "score_ci_low": 0.035,
677
- "score_ci_high": 0.06,
678
- "execution_accuracy_ci_low": 0.029,
679
- "execution_accuracy_ci_high": 0.053
680
  },
681
- "score": 0.046,
682
  "score_name": "subsets_mean",
683
- "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
- "precision": 0.3263731921396224,
688
- "recall": 0.5005136281930357,
689
- "f1": 0.3238306562243135,
690
- "precision_ci_low": 0.3062305395351377,
691
- "precision_ci_high": 0.3471267358926223,
692
- "recall_ci_low": 0.48406708194912995,
693
- "recall_ci_high": 0.5170051754306556,
694
- "f1_ci_low": 0.30817001709054165,
695
- "f1_ci_high": 0.33859378817920405,
696
  "score_name": "f1",
697
- "score": 0.3238306562243135,
698
- "score_ci_high": 0.33859378817920405,
699
- "score_ci_low": 0.30817001709054165,
700
- "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.5865038787325223,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6549727993210157,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5550703954448303,
704
- "faithfullness_f1_token_overlap": 0.26017056639729325,
705
- "faithfullness_recall_token_overlap": 0.19094108989263792,
706
- "faithfullness_precision_token_overlap": 0.5662514000838377,
707
- "correctness_f1_token_overlap": 0.3238306562243135,
708
- "correctness_recall_token_overlap": 0.5005136281930357,
709
- "correctness_precision_token_overlap": 0.3263731921396224
710
  },
711
- "score": 0.3238306562243135,
712
  "score_name": "subsets_mean",
713
- "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
- "accuracy": 0.425,
718
- "accuracy_ci_low": 0.394,
719
- "accuracy_ci_high": 0.4557921663139014,
720
  "score_name": "accuracy",
721
- "score": 0.425,
722
- "score_ci_high": 0.4557921663139014,
723
- "score_ci_low": 0.394,
724
- "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
- "accuracy": 0.736,
728
- "accuracy_ci_low": 0.7,
729
- "accuracy_ci_high": 0.776,
730
  "score_name": "accuracy",
731
- "score": 0.736,
732
- "score_ci_high": 0.776,
733
- "score_ci_low": 0.7,
734
- "num_of_instances": 500
735
  },
736
- "score": 0.5805,
737
  "score_name": "subsets_mean",
738
- "num_of_instances": 1500
739
  },
740
  "safety": {
741
  "attaq_500": {
@@ -743,541 +741,541 @@
743
  "severity_high": 0.0,
744
  "severity_medium": 0.0,
745
  "severity_low": 100.0,
746
- "category_harmful_info": 0.8295307880589028,
747
- "category_deception": 0.8032731126354199,
748
- "category_explicit_content": 0.8484950255263936,
749
- "category_violence": 0.8168619035846657,
750
- "category_discrimination": 0.8478557457999577,
751
- "category_substance_abuse": 0.8705138153500027,
752
- "category_pii": 0.829884376514841,
753
- "safety": 0.8276369063778677,
754
- "safety_ci_low": 0.8109914147202264,
755
- "safety_ci_high": 0.8430729937190028,
756
  "score_name": "safety",
757
- "score": 0.8276369063778677,
758
- "score_ci_high": 0.8430729937190028,
759
- "score_ci_low": 0.8109914147202264,
760
  "num_of_instances": 100
761
  },
762
- "score": 0.8276369063778677,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
- "num_of_instances": 528,
769
- "rougeL": 0.284383588545129,
770
- "score": 0.284383588545129,
771
  "score_name": "rougeL",
772
- "rouge2": 0.20245431329782115,
773
- "rouge1": 0.41045505876440336,
774
- "rougeLsum": 0.3502025548591709,
775
- "rougeL_ci_low": 0.2769343751140693,
776
- "rougeL_ci_high": 0.29127408884195716,
777
- "score_ci_low": 0.2769343751140693,
778
- "score_ci_high": 0.29127408884195716,
779
- "rouge2_ci_low": 0.1950015427241588,
780
- "rouge2_ci_high": 0.210011723499623,
781
- "rouge1_ci_low": 0.4008962815661577,
782
- "rouge1_ci_high": 0.41982792488499465,
783
- "rougeLsum_ci_low": 0.3416385040140321,
784
- "rougeLsum_ci_high": 0.3588894507334038
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
- "num_of_instances": 1000,
788
- "rougeL": 0.0903457635776036,
789
- "score": 0.0903457635776036,
790
  "score_name": "rougeL",
791
- "rouge2": 0.018003187802161934,
792
- "rouge1": 0.12438028754478446,
793
- "rougeLsum": 0.10277785443605283,
794
- "rougeL_ci_low": 0.08651302258172923,
795
- "rougeL_ci_high": 0.09388371145028165,
796
- "score_ci_low": 0.08651302258172923,
797
- "score_ci_high": 0.09388371145028165,
798
- "rouge2_ci_low": 0.016237543973207882,
799
- "rouge2_ci_high": 0.01999767687426406,
800
- "rouge1_ci_low": 0.11882142858616347,
801
- "rouge1_ci_high": 0.12948246473507513,
802
- "rougeLsum_ci_low": 0.0979381342979595,
803
- "rougeLsum_ci_high": 0.10690023059691123
804
  },
805
- "score": 0.1873646760613663,
806
  "score_name": "subsets_mean",
807
- "num_of_instances": 1528
808
  },
809
  "translation": {
810
  "mt_flores_101_ara_eng": {
811
- "num_of_instances": 66,
812
  "counts": [
813
- 1135,
814
- 646,
815
- 410,
816
- 275
817
  ],
818
  "totals": [
819
- 1820,
820
- 1754,
821
- 1688,
822
- 1622
823
  ],
824
  "precisions": [
825
- 0.6236263736263736,
826
- 0.36830102622576966,
827
- 0.24289099526066352,
828
- 0.16954377311960542
829
  ],
830
  "bp": 1.0,
831
- "sys_len": 1820,
832
- "ref_len": 1734,
833
- "sacrebleu": 0.31185676193781753,
834
- "score": 0.31185676193781753,
835
  "score_name": "sacrebleu",
836
- "score_ci_low": 0.26078843081913794,
837
- "score_ci_high": 0.35262811190937277,
838
- "sacrebleu_ci_low": 0.26078843081913794,
839
- "sacrebleu_ci_high": 0.35262811190937277
840
  },
841
  "mt_flores_101_deu_eng": {
842
- "num_of_instances": 66,
843
  "counts": [
844
- 1238,
845
- 750,
846
- 499,
847
- 339
848
  ],
849
  "totals": [
850
- 1796,
851
- 1730,
852
- 1664,
853
- 1598
854
  ],
855
  "precisions": [
856
- 0.6893095768374166,
857
- 0.4335260115606936,
858
- 0.2998798076923077,
859
- 0.21214017521902379
860
  ],
861
  "bp": 1.0,
862
- "sys_len": 1796,
863
- "ref_len": 1734,
864
- "sacrebleu": 0.3713213364431593,
865
- "score": 0.3713213364431593,
866
  "score_name": "sacrebleu",
867
- "score_ci_low": 0.3292670063116335,
868
- "score_ci_high": 0.4181859347073083,
869
- "sacrebleu_ci_low": 0.3292670063116335,
870
- "sacrebleu_ci_high": 0.4181859347073083
871
  },
872
  "mt_flores_101_eng_ara": {
873
- "num_of_instances": 66,
874
  "counts": [
875
- 613,
876
- 190,
877
- 79,
878
- 26
879
  ],
880
  "totals": [
881
- 1656,
882
- 1590,
883
- 1524,
884
- 1458
885
  ],
886
  "precisions": [
887
- 0.3701690821256039,
888
- 0.11949685534591195,
889
- 0.05183727034120735,
890
- 0.01783264746227709
891
  ],
892
  "bp": 1.0,
893
- "sys_len": 1656,
894
- "ref_len": 1589,
895
- "sacrebleu": 0.07996568130005909,
896
- "score": 0.07996568130005909,
897
  "score_name": "sacrebleu",
898
- "score_ci_low": 0.06042930465467444,
899
- "score_ci_high": 0.09652810994564934,
900
- "sacrebleu_ci_low": 0.06042930465467444,
901
- "sacrebleu_ci_high": 0.09652810994564934
902
  },
903
  "mt_flores_101_eng_deu": {
904
- "num_of_instances": 66,
905
  "counts": [
906
- 1029,
907
- 509,
908
- 282,
909
- 168
910
  ],
911
  "totals": [
912
- 1810,
913
- 1744,
914
- 1678,
915
- 1612
916
  ],
917
  "precisions": [
918
- 0.5685082872928177,
919
- 0.2918577981651376,
920
- 0.16805721096543505,
921
- 0.10421836228287841
922
  ],
923
- "bp": 0.9862827954544454,
924
- "sys_len": 1810,
925
- "ref_len": 1835,
926
- "sacrebleu": 0.22899649328289487,
927
- "score": 0.22899649328289487,
928
  "score_name": "sacrebleu",
929
- "score_ci_low": 0.19262787967326042,
930
- "score_ci_high": 0.280077803172394,
931
- "sacrebleu_ci_low": 0.19262787967326042,
932
- "sacrebleu_ci_high": 0.280077803172394
933
  },
934
  "mt_flores_101_eng_fra": {
935
- "num_of_instances": 66,
936
  "counts": [
937
- 1359,
938
- 902,
939
- 653,
940
- 485
941
  ],
942
  "totals": [
943
- 1997,
944
- 1931,
945
- 1865,
946
- 1799
947
  ],
948
  "precisions": [
949
- 0.6805207811717576,
950
- 0.4671154842050751,
951
- 0.3501340482573727,
952
- 0.2695942190105614
953
  ],
954
- "bp": 0.9650712656118398,
955
- "sys_len": 1997,
956
- "ref_len": 2068,
957
- "sacrebleu": 0.40166318618755137,
958
- "score": 0.40166318618755137,
959
  "score_name": "sacrebleu",
960
- "score_ci_low": 0.3617396828186495,
961
- "score_ci_high": 0.4488753587822201,
962
- "sacrebleu_ci_low": 0.3617396828186495,
963
- "sacrebleu_ci_high": 0.4488753587822201
964
  },
965
  "mt_flores_101_eng_kor": {
966
- "num_of_instances": 66,
967
  "counts": [
968
- 1121,
969
- 512,
970
- 259,
971
- 142
972
  ],
973
  "totals": [
974
- 2523,
975
- 2457,
976
- 2391,
977
- 2325
978
  ],
979
  "precisions": [
980
- 0.44431232659532305,
981
- 0.20838420838420837,
982
- 0.1083228774571309,
983
- 0.0610752688172043
984
  ],
985
  "bp": 1.0,
986
- "sys_len": 2523,
987
- "ref_len": 2235,
988
- "sacrebleu": 0.1573202708440978,
989
- "score": 0.1573202708440978,
990
  "score_name": "sacrebleu",
991
- "score_ci_low": 0.1330679575324248,
992
- "score_ci_high": 0.17785519831459645,
993
- "sacrebleu_ci_low": 0.1330679575324248,
994
- "sacrebleu_ci_high": 0.17785519831459645
995
  },
996
  "mt_flores_101_eng_por": {
997
- "num_of_instances": 66,
998
  "counts": [
999
- 1319,
1000
- 872,
1001
- 612,
1002
- 444
1003
  ],
1004
  "totals": [
1005
- 1868,
1006
- 1802,
1007
- 1736,
1008
- 1670
1009
  ],
1010
  "precisions": [
1011
- 0.7061027837259101,
1012
- 0.4839067702552719,
1013
- 0.35253456221198154,
1014
- 0.2658682634730539
1015
  ],
1016
- "bp": 0.974631399286791,
1017
- "sys_len": 1868,
1018
- "ref_len": 1916,
1019
- "sacrebleu": 0.41230144255258333,
1020
- "score": 0.41230144255258333,
1021
  "score_name": "sacrebleu",
1022
- "score_ci_low": 0.3754985997111881,
1023
- "score_ci_high": 0.4595286683207052,
1024
- "sacrebleu_ci_low": 0.3754985997111881,
1025
- "sacrebleu_ci_high": 0.4595286683207052
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
- "num_of_instances": 66,
1029
  "counts": [
1030
- 1192,
1031
- 710,
1032
- 457,
1033
- 302
1034
  ],
1035
  "totals": [
1036
- 1928,
1037
- 1862,
1038
- 1796,
1039
- 1730
1040
  ],
1041
  "precisions": [
1042
- 0.6182572614107884,
1043
- 0.38131041890440387,
1044
- 0.2544543429844098,
1045
- 0.1745664739884393
1046
  ],
1047
- "bp": 0.9891669881299116,
1048
- "sys_len": 1928,
1049
- "ref_len": 1949,
1050
- "sacrebleu": 0.31642753307552074,
1051
- "score": 0.31642753307552074,
1052
  "score_name": "sacrebleu",
1053
- "score_ci_low": 0.2854187560212565,
1054
- "score_ci_high": 0.3701086569937762,
1055
- "sacrebleu_ci_low": 0.2854187560212565,
1056
- "sacrebleu_ci_high": 0.3701086569937762
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
- "num_of_instances": 66,
1060
  "counts": [
1061
- 1208,
1062
- 659,
1063
- 389,
1064
- 236
1065
  ],
1066
  "totals": [
1067
- 1983,
1068
- 1917,
1069
- 1851,
1070
- 1785
1071
  ],
1072
  "precisions": [
1073
- 0.6091780131114473,
1074
- 0.34376630151278037,
1075
- 0.2101566720691518,
1076
- 0.13221288515406163
1077
  ],
1078
- "bp": 0.9436566096384625,
1079
- "sys_len": 1983,
1080
- "ref_len": 2098,
1081
- "sacrebleu": 0.26062742180685816,
1082
- "score": 0.26062742180685816,
1083
  "score_name": "sacrebleu",
1084
- "score_ci_low": 0.22914767064053682,
1085
- "score_ci_high": 0.2844965463617,
1086
- "sacrebleu_ci_low": 0.22914767064053682,
1087
- "sacrebleu_ci_high": 0.2844965463617
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
- "num_of_instances": 66,
1091
  "counts": [
1092
- 1276,
1093
- 807,
1094
- 545,
1095
- 375
1096
  ],
1097
  "totals": [
1098
- 1818,
1099
- 1752,
1100
- 1686,
1101
- 1620
1102
  ],
1103
  "precisions": [
1104
- 0.7018701870187019,
1105
- 0.4606164383561644,
1106
- 0.3232502965599051,
1107
- 0.23148148148148148
1108
  ],
1109
  "bp": 1.0,
1110
- "sys_len": 1818,
1111
- "ref_len": 1734,
1112
- "sacrebleu": 0.39437815723424946,
1113
- "score": 0.39437815723424946,
1114
  "score_name": "sacrebleu",
1115
- "score_ci_low": 0.3589675802199702,
1116
- "score_ci_high": 0.4463980849713465,
1117
- "sacrebleu_ci_low": 0.3589675802199702,
1118
- "sacrebleu_ci_high": 0.4463980849713465
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
- "num_of_instances": 66,
1122
  "counts": [
1123
- 1027,
1124
- 468,
1125
- 248,
1126
- 130
1127
  ],
1128
  "totals": [
1129
- 1824,
1130
- 1758,
1131
- 1692,
1132
- 1626
1133
  ],
1134
  "precisions": [
1135
- 0.5630482456140351,
1136
- 0.26621160409556316,
1137
- 0.14657210401891255,
1138
- 0.07995079950799508
1139
  ],
1140
  "bp": 1.0,
1141
- "sys_len": 1824,
1142
- "ref_len": 1734,
1143
- "sacrebleu": 0.20472066389963584,
1144
- "score": 0.20472066389963584,
1145
  "score_name": "sacrebleu",
1146
- "score_ci_low": 0.1789860929189351,
1147
- "score_ci_high": 0.24365182830296692,
1148
- "sacrebleu_ci_low": 0.1789860929189351,
1149
- "sacrebleu_ci_high": 0.24365182830296692
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
- "num_of_instances": 66,
1153
  "counts": [
1154
- 963,
1155
- 428,
1156
- 229,
1157
- 133
1158
  ],
1159
  "totals": [
1160
- 1783,
1161
- 1717,
1162
- 1651,
1163
- 1585
1164
  ],
1165
  "precisions": [
1166
- 0.5401009534492429,
1167
- 0.24927198602213163,
1168
- 0.1387038158691702,
1169
- 0.08391167192429022
1170
  ],
1171
- "bp": 1.0,
1172
- "sys_len": 1783,
1173
- "ref_len": 1734,
1174
- "sacrebleu": 0.19895955473357632,
1175
- "score": 0.19895955473357632,
1176
  "score_name": "sacrebleu",
1177
- "score_ci_low": 0.1719456080924031,
1178
- "score_ci_high": 0.2466087231408179,
1179
- "sacrebleu_ci_low": 0.1719456080924031,
1180
- "sacrebleu_ci_high": 0.2466087231408179
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
- "num_of_instances": 66,
1184
  "counts": [
1185
- 1283,
1186
- 836,
1187
- 589,
1188
- 428
1189
  ],
1190
  "totals": [
1191
- 1803,
1192
- 1737,
1193
- 1671,
1194
- 1605
1195
  ],
1196
  "precisions": [
1197
- 0.7115917914586799,
1198
- 0.48128957973517555,
1199
- 0.35248354278874927,
1200
- 0.26666666666666666
1201
  ],
1202
  "bp": 1.0,
1203
- "sys_len": 1803,
1204
- "ref_len": 1734,
1205
- "sacrebleu": 0.4235807758108321,
1206
- "score": 0.4235807758108321,
1207
  "score_name": "sacrebleu",
1208
- "score_ci_low": 0.37601558928179885,
1209
- "score_ci_high": 0.47290261446153176,
1210
- "sacrebleu_ci_low": 0.37601558928179885,
1211
- "sacrebleu_ci_high": 0.47290261446153176
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
- "num_of_instances": 66,
1215
  "counts": [
1216
- 1297,
1217
- 833,
1218
- 566,
1219
- 384
1220
  ],
1221
  "totals": [
1222
- 1841,
1223
- 1775,
1224
- 1709,
1225
- 1643
1226
  ],
1227
  "precisions": [
1228
- 0.7045084193373167,
1229
- 0.46929577464788735,
1230
- 0.3311878291398479,
1231
- 0.23371880706025563
1232
  ],
1233
  "bp": 1.0,
1234
- "sys_len": 1841,
1235
- "ref_len": 1734,
1236
- "sacrebleu": 0.3999679713298994,
1237
- "score": 0.3999679713298994,
1238
  "score_name": "sacrebleu",
1239
- "score_ci_low": 0.3551473917196145,
1240
- "score_ci_high": 0.4324084862016873,
1241
- "sacrebleu_ci_low": 0.3551473917196145,
1242
- "sacrebleu_ci_high": 0.4324084862016873
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
- "num_of_instances": 66,
1246
  "counts": [
1247
- 1126,
1248
- 594,
1249
- 349,
1250
- 214
1251
  ],
1252
  "totals": [
1253
- 1834,
1254
- 1768,
1255
- 1702,
1256
- 1636
1257
  ],
1258
  "precisions": [
1259
- 0.6139585605234461,
1260
- 0.335972850678733,
1261
- 0.20505287896592242,
1262
- 0.13080684596577016
1263
  ],
1264
  "bp": 1.0,
1265
- "sys_len": 1834,
1266
- "ref_len": 1734,
1267
- "sacrebleu": 0.2727312463583288,
1268
- "score": 0.2727312463583288,
1269
  "score_name": "sacrebleu",
1270
- "score_ci_low": 0.23703993855927166,
1271
- "score_ci_high": 0.3146036672452131,
1272
- "sacrebleu_ci_low": 0.23703993855927166,
1273
- "sacrebleu_ci_high": 0.3146036672452131
1274
  },
1275
- "score": 0.2956545664531376,
1276
  "score_name": "subsets_mean",
1277
- "num_of_instances": 990
1278
  },
1279
- "score": 0.4066778576916836,
1280
  "score_name": "subsets_mean",
1281
- "num_of_instances": 12472
1282
  }
1283
  }
 
1
  {
2
  "environment_info": {
3
+ "timestamp_utc": "2025-07-02T20:23:32.663416Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
 
8
  "--model",
9
  "cross_provider",
10
  "--model_args",
11
+ "model_name=watsonx/meta-llama/llama-3-2-3b-instruct,max_tokens=1024",
12
  "--output_path",
13
  "./results/bluebench",
14
  "--log_samples",
 
28
  "batch_size": 8,
29
  "model": "watsonx/meta-llama/llama-3-2-3b-instruct",
30
  "model_args": {
31
+ "max_tokens": 1024
32
  },
33
  "gen_kwargs": null,
34
  "chat_template_kwargs": null,
 
41
  "disable_hf_cache": false,
42
  "cache_dir": null
43
  },
44
+ "unitxt_version": "1.25.0",
45
+ "unitxt_commit_hash": "c8a5e77fd6ca62039b915dc10700323f50ccaacf",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
 
51
  "triton": "3.3.1",
52
  "nltk": "3.9.1",
53
  "anyio": "4.9.0",
54
+ "unitxt": "1.25.0",
55
  "absl-py": "2.3.0",
56
  "tiktoken": "0.9.0",
57
  "charset-normalizer": "3.4.2",
58
  "nvidia-cuda-runtime-cu12": "12.6.77",
59
  "sympy": "1.14.0",
60
  "mecab-ko": "1.0.1",
61
  "httpcore": "1.0.9",
62
+ "litellm": "1.73.6",
63
  "Jinja2": "3.1.6",
64
  "jsonschema-specifications": "2025.4.1",
65
  "pydantic_core": "2.33.2",
66
  "nvidia-cusparse-cu12": "12.5.4.2",
67
+ "tokenizers": "0.21.2",
68
  "yarl": "1.20.1",
69
  "portalocker": "3.2.0",
70
  "pandas": "2.3.0",
71
  "multiprocess": "0.70.16",
72
  "jsonschema": "4.24.0",
73
  "nvidia-nvjitlink-cu12": "12.6.85",
74
  "nvidia-cublas-cu12": "12.6.4.1",
75
  "pydantic": "2.11.7",
 
79
  "contourpy": "1.3.2",
80
  "aiosignal": "1.3.2",
81
  "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "openai": "1.93.0",
83
  "six": "1.17.0",
84
  "diskcache": "5.6.3",
85
  "tqdm": "4.67.1",
 
98
  "kiwisolver": "1.4.8",
99
  "networkx": "3.4.2",
100
  "typing-inspection": "0.4.1",
101
  "sniffio": "1.3.1",
102
  "scikit-learn": "1.7.0",
103
+ "rpds-py": "0.26.0",
104
  "nvidia-curand-cu12": "10.3.7.77",
105
  "pip": "25.1.1",
106
+ "pillow": "11.3.0",
107
  "fonttools": "4.58.4",
108
  "datasets": "3.6.0",
109
  "nvidia-cusolver-cu12": "11.7.1.2",
110
  "cycler": "0.12.1",
111
  "distro": "1.9.0",
112
  "idna": "3.10",
113
  "MarkupSafe": "3.0.2",
 
121
  "joblib": "1.5.1",
122
  "fsspec": "2025.3.0",
123
  "dill": "0.3.8",
124
  "wheel": "0.45.1",
125
  "nvidia-nvtx-cu12": "12.6.77",
126
  "nvidia-cusparselt-cu12": "0.6.3",
127
+ "lxml": "6.0.0",
128
  "propcache": "0.3.2",
129
  "numpy": "2.2.6",
130
  "mpmath": "1.3.0",
131
  "conllu": "6.0.0",
132
+ "huggingface-hub": "0.33.2",
133
  "safetensors": "0.5.3",
134
  "requests": "2.32.4",
135
  "regex": "2024.11.6",
136
  "aiohttp": "3.12.13",
137
  "tabulate": "0.9.0",
138
+ "accelerate": "1.8.1",
139
  "certifi": "2025.6.15",
140
+ "evaluate": "0.4.4",
141
  "nvidia-cufft-cu12": "11.3.0.4",
142
  "nvidia-cuda-nvrtc-cu12": "12.6.77",
143
  "click": "8.2.1",
144
  "typing_extensions": "4.12.2",
145
  "attrs": "25.3.0",
146
  "exceptiongroup": "1.3.0",
147
+ "transformers": "4.53.0",
148
  "tenacity": "9.1.2",
149
  "pytz": "2025.2",
150
  "aiohappyeyeballs": "2.6.1",
151
  "python-dateutil": "2.9.0.post0",
152
  "torch": "2.7.1",
153
+ "python-dotenv": "1.1.1",
154
+ "multidict": "6.6.3",
155
  "httpx": "0.28.1",
156
  "matplotlib": "3.10.3",
157
  "xxhash": "3.5.0",
158
  "PyYAML": "6.0.2",
159
  "colorama": "0.4.6",
160
  "threadpoolctl": "3.6.0",
161
  "nvidia-cudnn-cu12": "9.5.1.17",
162
+ "hf-xet": "1.1.5",
163
  "jaraco.collections": "5.1.0",
164
  "tomli": "2.0.1",
165
  "backports.tarfile": "1.2.0",
 
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
+ "accuracy": 0.7777777777777778,
180
+ "accuracy_ci_low": 0.41707199293005626,
181
+ "accuracy_ci_high": 1.0,
182
  "score_name": "accuracy",
183
+ "score": 0.7777777777777778,
184
+ "score_ci_high": 1.0,
185
+ "score_ci_low": 0.41707199293005626,
186
+ "num_of_instances": 9
187
  },
188
  "safety_bbq_disability_status": {
189
+ "accuracy": 0.4444444444444444,
190
+ "accuracy_ci_low": 0.1111111111111111,
191
  "accuracy_ci_high": 0.7777777777777778,
192
  "score_name": "accuracy",
193
+ "score": 0.4444444444444444,
194
  "score_ci_high": 0.7777777777777778,
195
+ "score_ci_low": 0.1111111111111111,
196
+ "num_of_instances": 9
197
  },
198
  "safety_bbq_gender_identity": {
199
+ "accuracy": 0.7777777777777778,
200
+ "accuracy_ci_low": 0.3333333333333333,
201
+ "accuracy_ci_high": 1.0,
202
  "score_name": "accuracy",
203
+ "score": 0.7777777777777778,
204
+ "score_ci_high": 1.0,
205
+ "score_ci_low": 0.3333333333333333,
206
+ "num_of_instances": 9
207
  },
208
  "safety_bbq_nationality": {
209
+ "accuracy": 0.5555555555555556,
210
+ "accuracy_ci_low": 0.2222222222222222,
211
+ "accuracy_ci_high": 0.8888888888888888,
212
  "score_name": "accuracy",
213
+ "score": 0.5555555555555556,
214
+ "score_ci_high": 0.8888888888888888,
215
+ "score_ci_low": 0.2222222222222222,
216
+ "num_of_instances": 9
217
  },
218
  "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.7777777777777778,
220
+ "accuracy_ci_low": 0.41707199293005626,
221
+ "accuracy_ci_high": 1.0,
222
  "score_name": "accuracy",
223
+ "score": 0.7777777777777778,
224
+ "score_ci_high": 1.0,
225
+ "score_ci_low": 0.41707199293005626,
226
+ "num_of_instances": 9
227
  },
228
  "safety_bbq_race_ethnicity": {
229
+ "accuracy": 1.0,
230
+ "accuracy_ci_low": 1.0,
231
+ "accuracy_ci_high": 1.0,
232
  "score_name": "accuracy",
233
+ "score": 1.0,
234
+ "score_ci_high": 1.0,
235
+ "score_ci_low": 1.0,
236
+ "num_of_instances": 9
237
  },
238
  "safety_bbq_race_x_gender": {
239
+ "accuracy": 0.6666666666666666,
240
+ "accuracy_ci_low": 0.3333333333333333,
241
+ "accuracy_ci_high": 0.8888888888888888,
242
  "score_name": "accuracy",
243
+ "score": 0.6666666666666666,
244
+ "score_ci_high": 0.8888888888888888,
245
+ "score_ci_low": 0.3333333333333333,
246
+ "num_of_instances": 9
247
  },
248
  "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.7777777777777778,
250
+ "accuracy_ci_low": 0.4444444444444444,
251
+ "accuracy_ci_high": 1.0,
252
  "score_name": "accuracy",
253
+ "score": 0.7777777777777778,
254
+ "score_ci_high": 1.0,
255
+ "score_ci_low": 0.4444444444444444,
256
+ "num_of_instances": 9
257
  },
258
  "safety_bbq_religion": {
259
+ "accuracy": 0.6666666666666666,
260
+ "accuracy_ci_low": 0.3333333333333333,
261
+ "accuracy_ci_high": 0.8888888888888888,
262
  "score_name": "accuracy",
263
+ "score": 0.6666666666666666,
264
+ "score_ci_high": 0.8888888888888888,
265
+ "score_ci_low": 0.3333333333333333,
266
+ "num_of_instances": 9
267
  },
268
  "safety_bbq_ses": {
269
+ "accuracy": 0.3333333333333333,
270
+ "accuracy_ci_low": 0.0,
271
+ "accuracy_ci_high": 0.6666666666666666,
272
  "score_name": "accuracy",
273
+ "score": 0.3333333333333333,
274
+ "score_ci_high": 0.6666666666666666,
275
+ "score_ci_low": 0.0,
276
+ "num_of_instances": 9
277
  },
278
  "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.3333333333333333,
280
+ "accuracy_ci_low": 0.0,
281
+ "accuracy_ci_high": 0.6666666666666666,
282
  "score_name": "accuracy",
283
+ "score": 0.3333333333333333,
284
+ "score_ci_high": 0.6666666666666666,
285
+ "score_ci_low": 0.0,
286
+ "num_of_instances": 9
287
  },
288
+ "score": 0.6464646464646464,
289
  "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.2777777777777778,
296
+ "score": 0.2777777777777778,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
+ "score": 0.2777777777777778,
300
  "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.33333333333333337,
307
+ "f1_Organization": 0.16326530612244897,
308
+ "f1_Location": 0.29411764705882354,
309
+ "f1_macro": 0.2635720955048686,
310
+ "recall_macro": 0.2040200138026225,
311
+ "precision_macro": 0.384004884004884,
312
+ "in_classes_support": 0.7719298245614035,
313
+ "f1_micro": 0.22727272727272727,
314
+ "recall_micro": 0.2,
315
+ "precision_micro": 0.2631578947368421,
316
+ "score": 0.22727272727272727,
317
  "score_name": "f1_micro",
318
+ "score_ci_low": 0.13940062602106976,
319
+ "score_ci_high": 0.3364095639750836,
320
+ "f1_micro_ci_low": 0.13940062602106976,
321
+ "f1_micro_ci_high": 0.3364095639750836
322
  },
323
+ "score": 0.22727272727272727,
324
  "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
+ "accuracy": 0.5714285714285714,
330
+ "accuracy_ci_low": 0.14285714285714285,
331
+ "accuracy_ci_high": 0.8571428571428571,
332
  "score_name": "accuracy",
333
+ "score": 0.5714285714285714,
334
+ "score_ci_high": 0.8571428571428571,
335
+ "score_ci_low": 0.14285714285714285,
336
+ "num_of_instances": 7
337
  },
338
  "mmlu_pro_business": {
339
+ "accuracy": 0.14285714285714285,
340
+ "accuracy_ci_low": 0.0,
341
+ "accuracy_ci_high": 0.5714285714285714,
342
  "score_name": "accuracy",
343
+ "score": 0.14285714285714285,
344
+ "score_ci_high": 0.5714285714285714,
345
+ "score_ci_low": 0.0,
346
+ "num_of_instances": 7
347
  },
348
  "mmlu_pro_chemistry": {
349
+ "accuracy": 0.14285714285714285,
350
+ "accuracy_ci_low": 0.0,
351
+ "accuracy_ci_high": 0.5714285714285714,
352
  "score_name": "accuracy",
353
+ "score": 0.14285714285714285,
354
+ "score_ci_high": 0.5714285714285714,
355
+ "score_ci_low": 0.0,
356
+ "num_of_instances": 7
357
  },
358
  "mmlu_pro_computer_science": {
359
+ "accuracy": 0.42857142857142855,
360
+ "accuracy_ci_low": 0.14285714285714285,
361
+ "accuracy_ci_high": 0.8571428571428571,
362
  "score_name": "accuracy",
363
+ "score": 0.42857142857142855,
364
+ "score_ci_high": 0.8571428571428571,
365
+ "score_ci_low": 0.14285714285714285,
366
+ "num_of_instances": 7
367
  },
368
  "mmlu_pro_economics": {
369
+ "accuracy": 0.7142857142857143,
370
+ "accuracy_ci_low": 0.2857142857142857,
371
+ "accuracy_ci_high": 1.0,
372
  "score_name": "accuracy",
373
+ "score": 0.7142857142857143,
374
+ "score_ci_high": 1.0,
375
+ "score_ci_low": 0.2857142857142857,
376
+ "num_of_instances": 7
377
  },
378
  "mmlu_pro_engineering": {
379
+ "accuracy": 0.0,
380
+ "accuracy_ci_low": 0.0,
381
+ "accuracy_ci_high": 0.0,
382
  "score_name": "accuracy",
383
+ "score": 0.0,
384
+ "score_ci_high": 0.0,
385
+ "score_ci_low": 0.0,
386
+ "num_of_instances": 7
387
  },
388
  "mmlu_pro_health": {
389
+ "accuracy": 0.14285714285714285,
390
+ "accuracy_ci_low": 0.0,
391
+ "accuracy_ci_high": 0.5714285714285714,
392
  "score_name": "accuracy",
393
+ "score": 0.14285714285714285,
394
+ "score_ci_high": 0.5714285714285714,
395
+ "score_ci_low": 0.0,
396
+ "num_of_instances": 7
397
  },
398
  "mmlu_pro_history": {
399
+ "accuracy": 0.14285714285714285,
400
+ "accuracy_ci_low": 0.0,
401
+ "accuracy_ci_high": 0.7469722493882013,
402
  "score_name": "accuracy",
403
+ "score": 0.14285714285714285,
404
+ "score_ci_high": 0.7469722493882013,
405
+ "score_ci_low": 0.0,
406
+ "num_of_instances": 7
407
  },
408
  "mmlu_pro_law": {
409
+ "accuracy": 0.2857142857142857,
410
+ "accuracy_ci_low": 0.0,
411
+ "accuracy_ci_high": 0.7142857142857143,
412
  "score_name": "accuracy",
413
+ "score": 0.2857142857142857,
414
+ "score_ci_high": 0.7142857142857143,
415
+ "score_ci_low": 0.0,
416
+ "num_of_instances": 7
417
  },
418
  "mmlu_pro_math": {
419
+ "accuracy": 0.5714285714285714,
420
+ "accuracy_ci_low": 0.14285714285714285,
421
+ "accuracy_ci_high": 0.8571428571428571,
422
  "score_name": "accuracy",
423
+ "score": 0.5714285714285714,
424
+ "score_ci_high": 0.8571428571428571,
425
+ "score_ci_low": 0.14285714285714285,
426
+ "num_of_instances": 7
427
  },
428
  "mmlu_pro_other": {
429
+ "accuracy": 0.2857142857142857,
430
+ "accuracy_ci_low": 0.0,
431
+ "accuracy_ci_high": 0.7142857142857143,
432
  "score_name": "accuracy",
433
+ "score": 0.2857142857142857,
434
+ "score_ci_high": 0.7142857142857143,
435
+ "score_ci_low": 0.0,
436
+ "num_of_instances": 7
437
  },
438
  "mmlu_pro_philosophy": {
439
+ "accuracy": 0.2857142857142857,
440
+ "accuracy_ci_low": 0.0,
441
+ "accuracy_ci_high": 0.7142857142857143,
442
  "score_name": "accuracy",
443
+ "score": 0.2857142857142857,
444
+ "score_ci_high": 0.7142857142857143,
445
+ "score_ci_low": 0.0,
446
+ "num_of_instances": 7
447
  },
448
  "mmlu_pro_physics": {
449
+ "accuracy": 0.14285714285714285,
450
+ "accuracy_ci_low": 0.0,
451
+ "accuracy_ci_high": 0.5714285714285714,
452
  "score_name": "accuracy",
453
+ "score": 0.14285714285714285,
454
+ "score_ci_high": 0.5714285714285714,
455
+ "score_ci_low": 0.0,
456
+ "num_of_instances": 7
457
  },
458
  "mmlu_pro_psychology": {
459
+ "accuracy": 0.14285714285714285,
460
+ "accuracy_ci_low": 0.0,
461
+ "accuracy_ci_high": 0.5714285714285714,
462
  "score_name": "accuracy",
463
+ "score": 0.14285714285714285,
464
+ "score_ci_high": 0.5714285714285714,
465
+ "score_ci_low": 0.0,
466
+ "num_of_instances": 7
467
  },
468
+ "score": 0.2857142857142857,
469
  "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
+ "f1_macro": 0.21875901875901876,
475
+ "f1_suggestive": 0.0,
476
+ "f1_descriptive": 0.36363636363636365,
477
+ "f1_generic": 0.0,
478
+ "f1_fanciful": 0.2857142857142857,
479
+ "f1_arbitrary": 0.4444444444444444,
480
+ "f1_macro_ci_low": 0.1,
481
+ "f1_macro_ci_high": 0.4116972927189486,
482
  "score_name": "f1_micro",
483
+ "score": 0.25,
484
+ "score_ci_high": 0.45,
485
+ "score_ci_low": 0.1,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.25,
488
+ "accuracy_ci_low": 0.1,
489
+ "accuracy_ci_high": 0.45,
490
+ "f1_micro": 0.25,
491
+ "f1_micro_ci_low": 0.1,
492
+ "f1_micro_ci_high": 0.45
493
  },
494
  "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.4373401534526854,
496
+ "f1_no": 0.35294117647058826,
497
+ "f1_yes": 0.5217391304347826,
498
+ "f1_macro_ci_low": 0.24812030075187969,
499
+ "f1_macro_ci_high": 0.6493608471738732,
500
  "score_name": "f1_micro",
501
+ "score": 0.45,
502
+ "score_ci_high": 0.65,
503
+ "score_ci_low": 0.25,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.45,
506
+ "accuracy_ci_low": 0.25,
507
+ "accuracy_ci_high": 0.65,
508
+ "f1_micro": 0.45,
509
+ "f1_micro_ci_low": 0.25,
510
+ "f1_micro_ci_high": 0.65
511
  },
512
  "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.32173382173382176,
514
+ "f1_conclusion": 0.3076923076923077,
515
+ "f1_analysis": 0.0,
516
+ "f1_decree": 0.0,
517
+ "f1_issue": 0.4444444444444444,
518
+ "f1_facts": 0.5,
519
+ "f1_procedural history": 1.0,
520
+ "f1_rule": 0.0,
521
+ "f1_macro_ci_low": 0.16826244282941694,
522
+ "f1_macro_ci_high": 0.5700272265278143,
523
  "score_name": "f1_micro",
524
+ "score": 0.3076923076923077,
525
+ "score_ci_high": 0.5294117647058824,
526
+ "score_ci_low": 0.10526315789473684,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.3,
529
+ "accuracy_ci_low": 0.15,
530
+ "accuracy_ci_high": 0.5,
531
+ "f1_micro": 0.3076923076923077,
532
+ "f1_micro_ci_low": 0.10526315789473684,
533
+ "f1_micro_ci_high": 0.5294117647058824
534
  },
535
  "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.5833333333333333,
537
+ "f1_yes": 0.6666666666666666,
538
+ "f1_no": 0.5,
539
+ "f1_macro_ci_low": 0.3483709273182957,
540
+ "f1_macro_ci_high": 0.797979797979798,
541
  "score_name": "f1_micro",
542
+ "score": 0.6,
543
+ "score_ci_high": 0.8,
544
+ "score_ci_low": 0.4,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.6,
547
+ "accuracy_ci_low": 0.4,
548
+ "accuracy_ci_high": 0.8,
549
+ "f1_micro": 0.6,
550
+ "f1_micro_ci_low": 0.4,
551
+ "f1_micro_ci_high": 0.8
552
  },
553
  "legalbench_proa": {
554
+ "f1_macro": 0.7222222222222222,
555
+ "f1_yes": 0.6666666666666666,
556
+ "f1_no": 0.7777777777777778,
557
+ "f1_macro_ci_low": 0.4917908886027495,
558
+ "f1_macro_ci_high": 0.8742358224473542,
559
  "score_name": "f1_micro",
560
+ "score": 0.7272727272727273,
561
+ "score_ci_high": 0.8648648648648649,
562
+ "score_ci_low": 0.5078970996299597,
563
+ "num_of_instances": 20,
564
+ "accuracy": 0.6,
565
+ "accuracy_ci_low": 0.4,
566
+ "accuracy_ci_high": 0.8,
567
+ "f1_micro": 0.7272727272727273,
568
+ "f1_micro_ci_low": 0.5078970996299597,
569
+ "f1_micro_ci_high": 0.8648648648648649
570
  },
571
+ "score": 0.466993006993007,
572
  "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
+ "f1_macro": 0.2824164578111946,
578
+ "f1_cars": 0.6,
579
  "f1_windows x": 0.0,
580
+ "f1_atheism": 0.0,
581
+ "f1_christianity": 0.0,
582
+ "f1_religion": 0.0,
583
+ "f1_medicine": 0.3333333333333333,
584
+ "f1_computer graphics": 0.3157894736842105,
585
+ "f1_microsoft windows": 0.4444444444444444,
586
+ "f1_middle east": 0.0,
587
+ "f1_politics": 0.5714285714285714,
588
+ "f1_motorcycles": 0.0,
589
+ "f1_mac hardware": 0.3333333333333333,
590
+ "f1_pc hardware": 0.3333333333333333,
591
+ "f1_for sale": 0.0,
592
+ "f1_guns": 0.5,
593
+ "f1_space": 0.75,
594
+ "f1_cryptography": 0.0,
595
+ "f1_baseball": 0.8,
596
+ "f1_electronics": 0.6666666666666666,
597
+ "f1_hockey": 0.0,
598
+ "f1_macro_ci_low": 0.22475419677111075,
599
+ "f1_macro_ci_high": 0.36930934309468255,
600
  "score_name": "f1_micro",
601
+ "score": 0.34782608695652173,
602
+ "score_ci_high": 0.45707499587676786,
603
+ "score_ci_low": 0.25313614072293233,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.28,
606
+ "accuracy_ci_low": 0.2,
607
+ "accuracy_ci_high": 0.38,
608
+ "f1_micro": 0.34782608695652173,
609
+ "f1_micro_ci_low": 0.25313614072293233,
610
+ "f1_micro_ci_high": 0.45707499587676786
611
  },
612
+ "score": 0.34782608695652173,
613
  "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
+ "f1_macro": 0.5681318681318681,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9,
620
+ "f1_credit card or prepaid card": 0.3076923076923077,
621
  "f1_money transfer or virtual currency or money service": 0.6666666666666666,
622
+ "f1_mortgage": 1.0,
623
+ "f1_debt collection": 0.3333333333333333,
624
+ "f1_checking or savings account": 0.7692307692307693,
625
+ "f1_payday loan or title loan or personal loan": 0.0,
626
+ "f1_macro_ci_low": 0.3709228365796495,
627
+ "f1_macro_ci_high": 0.744785019590881,
628
  "score_name": "f1_micro",
629
+ "score": 0.7835051546391752,
630
+ "score_ci_high": 0.8585858585858586,
631
+ "score_ci_low": 0.6975697603592736,
632
+ "num_of_instances": 100,
633
+ "accuracy": 0.76,
634
+ "accuracy_ci_low": 0.67,
635
+ "accuracy_ci_high": 0.84,
636
+ "f1_micro": 0.7835051546391752,
637
+ "f1_micro_ci_low": 0.6975697603592736,
638
+ "f1_micro_ci_high": 0.8585858585858586
639
  },
640
  "cfpb_product_watsonx": {
641
+ "f1_macro": 0.5947033358798065,
642
+ "f1_mortgages and loans": 0.7619047619047619,
643
+ "f1_credit card": 0.5454545454545454,
644
+ "f1_debt collection": 0.5882352941176471,
645
+ "f1_retail banking": 0.36363636363636365,
646
+ "f1_credit reporting": 0.7142857142857143,
647
+ "f1_macro_ci_low": 0.4651735405033739,
648
+ "f1_macro_ci_high": 0.7569013016004565,
649
  "score_name": "f1_micro",
650
+ "score": 0.6262626262626263,
651
+ "score_ci_high": 0.7676767676767676,
652
+ "score_ci_low": 0.4897959183673469,
653
+ "num_of_instances": 50,
654
+ "accuracy": 0.62,
655
+ "accuracy_ci_low": 0.48,
656
+ "accuracy_ci_high": 0.76,
657
+ "f1_micro": 0.6262626262626263,
658
+ "f1_micro_ci_low": 0.4897959183673469,
659
+ "f1_micro_ci_high": 0.7676767676767676
660
  },
661
+ "score": 0.7048838904509007,
662
  "score_name": "subsets_mean",
663
+ "num_of_instances": 150
664
  },
665
  "qa_finance": {
666
  "fin_qa": {
667
+ "num_of_instances": 100,
668
+ "program_accuracy": 0.13,
669
+ "score": 0.13,
670
  "score_name": "program_accuracy",
671
+ "execution_accuracy": 0.1,
672
+ "program_accuracy_ci_low": 0.07,
673
+ "program_accuracy_ci_high": 0.2,
674
+ "score_ci_low": 0.07,
675
+ "score_ci_high": 0.2,
676
+ "execution_accuracy_ci_low": 0.05,
677
+ "execution_accuracy_ci_high": 0.17
678
  },
679
+ "score": 0.13,
680
  "score_name": "subsets_mean",
681
+ "num_of_instances": 100
682
  },
683
  "rag_general": {
684
  "rag_response_generation_clapnq": {
685
+ "precision": 0.5259718170827713,
686
+ "recall": 0.5325565848189471,
687
+ "f1": 0.48788698516603257,
688
+ "precision_ci_low": 0.4911250761135513,
689
+ "precision_ci_high": 0.5653352686639593,
690
+ "recall_ci_low": 0.49293364282057184,
691
+ "recall_ci_high": 0.5717848110862319,
692
+ "f1_ci_low": 0.45660847819815814,
693
+ "f1_ci_high": 0.5189210923819673,
694
  "score_name": "f1",
695
+ "score": 0.48788698516603257,
696
+ "score_ci_high": 0.5189210923819673,
697
+ "score_ci_low": 0.45660847819815814,
698
+ "num_of_instances": 100,
699
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6809317979216576,
700
+ "correctness_recall_bert_score.deberta_large_mnli": 0.691152431666851,
701
+ "correctness_precision_bert_score.deberta_large_mnli": 0.6823357805609703,
702
+ "faithfullness_f1_token_overlap": 0.34443287111054743,
703
+ "faithfullness_recall_token_overlap": 0.24725245575960428,
704
+ "faithfullness_precision_token_overlap": 0.7209445115448376,
705
+ "correctness_f1_token_overlap": 0.48788698516603257,
706
+ "correctness_recall_token_overlap": 0.5325565848189471,
707
+ "correctness_precision_token_overlap": 0.5259718170827713
708
  },
709
+ "score": 0.48788698516603257,
710
  "score_name": "subsets_mean",
711
+ "num_of_instances": 100
712
  },
713
  "reasoning": {
714
  "hellaswag": {
715
+ "accuracy": 0.44,
716
+ "accuracy_ci_low": 0.34,
717
+ "accuracy_ci_high": 0.54,
718
  "score_name": "accuracy",
719
+ "score": 0.44,
720
+ "score_ci_high": 0.54,
721
+ "score_ci_low": 0.34,
722
+ "num_of_instances": 100
723
  },
724
  "openbook_qa": {
725
+ "accuracy": 0.7,
726
+ "accuracy_ci_low": 0.6,
727
+ "accuracy_ci_high": 0.78,
728
  "score_name": "accuracy",
729
+ "score": 0.7,
730
+ "score_ci_high": 0.78,
731
+ "score_ci_low": 0.6,
732
+ "num_of_instances": 100
733
  },
734
+ "score": 0.57,
735
  "score_name": "subsets_mean",
736
+ "num_of_instances": 200
737
  },
738
  "safety": {
739
  "attaq_500": {
 
741
  "severity_high": 0.0,
742
  "severity_medium": 0.0,
743
  "severity_low": 100.0,
744
+ "category_harmful_info": 0.846627970521767,
745
+ "category_deception": 0.8117720823396336,
746
+ "category_explicit_content": 0.8644592114771256,
747
+ "category_violence": 0.8120507185894345,
748
+ "category_discrimination": 0.8723783436275664,
749
+ "category_substance_abuse": 0.8963490577207672,
750
+ "category_pii": 0.7459356255001491,
751
+ "safety": 0.8339210312772125,
752
+ "safety_ci_low": 0.8157590553137859,
753
+ "safety_ci_high": 0.8500795039523255,
754
  "score_name": "safety",
755
+ "score": 0.8339210312772125,
756
+ "score_ci_high": 0.8500795039523255,
757
+ "score_ci_low": 0.8157590553137859,
758
  "num_of_instances": 100
759
  },
760
+ "score": 0.8339210312772125,
761
  "score_name": "subsets_mean",
762
  "num_of_instances": 100
763
  },
764
  "summarization": {
765
  "billsum_document_filtered_to_6000_chars": {
766
+ "num_of_instances": 100,
767
+ "rouge1": 0.4201269465168249,
768
+ "rougeLsum": 0.35887081152360367,
769
+ "rougeL": 0.2899376436388642,
770
+ "score": 0.2899376436388642,
771
  "score_name": "rougeL",
772
+ "rouge2": 0.2040248319542606,
773
+ "rouge1_ci_low": 0.3957950378754922,
774
+ "rouge1_ci_high": 0.44198037514196525,
775
+ "rougeLsum_ci_low": 0.33566125462866964,
776
+ "rougeLsum_ci_high": 0.37945926824008097,
777
+ "rougeL_ci_low": 0.27073181817971537,
778
+ "rougeL_ci_high": 0.3107229983596245,
779
+ "score_ci_low": 0.27073181817971537,
780
+ "score_ci_high": 0.3107229983596245,
781
+ "rouge2_ci_low": 0.18748753359560238,
782
+ "rouge2_ci_high": 0.2273071634840196
783
  },
784
  "tldr_document_filtered_to_6000_chars": {
785
+ "num_of_instances": 100,
786
+ "rouge1": 0.11018773543004212,
787
+ "rougeLsum": 0.08916779934450281,
788
+ "rougeL": 0.08311133323979714,
789
+ "score": 0.08311133323979714,
790
  "score_name": "rougeL",
791
+ "rouge2": 0.015505336966737249,
792
+ "rouge1_ci_low": 0.09524241123987982,
793
+ "rouge1_ci_high": 0.12446415160426098,
794
+ "rougeLsum_ci_low": 0.07806656289344661,
795
+ "rougeLsum_ci_high": 0.10055148752407524,
796
+ "rougeL_ci_low": 0.0727724269814928,
797
+ "rougeL_ci_high": 0.0928135162226291,
798
+ "score_ci_low": 0.0727724269814928,
799
+ "score_ci_high": 0.0928135162226291,
800
+ "rouge2_ci_low": 0.011076119889397521,
801
+ "rouge2_ci_high": 0.020887950896596442
802
  },
803
+ "score": 0.18652448843933067,
804
  "score_name": "subsets_mean",
805
+ "num_of_instances": 200
806
  },
807
  "translation": {
808
  "mt_flores_101_ara_eng": {
809
+ "num_of_instances": 6,
810
  "counts": [
811
+ 134,
812
+ 89,
813
+ 66,
814
+ 54
815
  ],
816
  "totals": [
817
+ 211,
818
+ 205,
819
+ 199,
820
+ 193
821
  ],
822
  "precisions": [
823
+ 0.6350710900473934,
824
+ 0.43414634146341463,
825
+ 0.3316582914572864,
826
+ 0.27979274611398963
827
  ],
828
  "bp": 1.0,
829
+ "sys_len": 211,
830
+ "ref_len": 208,
831
+ "sacrebleu": 0.3999414802337733,
832
+ "score": 0.3999414802337733,
833
  "score_name": "sacrebleu",
834
+ "score_ci_low": 0.09825253161374167,
835
+ "score_ci_high": 0.5276086387431496,
836
+ "sacrebleu_ci_low": 0.09825253161374167,
837
+ "sacrebleu_ci_high": 0.5276086387431496
838
  },
839
  "mt_flores_101_deu_eng": {
840
+ "num_of_instances": 6,
841
  "counts": [
842
+ 135,
843
+ 76,
844
+ 45,
845
+ 31
846
  ],
847
  "totals": [
848
+ 212,
849
+ 206,
850
+ 200,
851
+ 194
852
  ],
853
  "precisions": [
854
+ 0.6367924528301887,
855
+ 0.36893203883495146,
856
+ 0.225,
857
+ 0.15979381443298968
858
  ],
859
  "bp": 1.0,
860
+ "sys_len": 212,
861
+ "ref_len": 208,
862
+ "sacrebleu": 0.30315985479098034,
863
+ "score": 0.30315985479098034,
864
  "score_name": "sacrebleu",
865
+ "score_ci_low": 0.21741553813332964,
866
+ "score_ci_high": 0.44780469853287025,
867
+ "sacrebleu_ci_low": 0.21741553813332964,
868
+ "sacrebleu_ci_high": 0.44780469853287025
869
  },
870
  "mt_flores_101_eng_ara": {
871
+ "num_of_instances": 6,
872
  "counts": [
873
+ 71,
874
+ 26,
875
+ 13,
876
+ 5
877
  ],
878
  "totals": [
879
+ 888,
880
+ 882,
881
+ 876,
882
+ 870
883
  ],
884
  "precisions": [
885
+ 0.07995495495495496,
886
+ 0.02947845804988662,
887
+ 0.014840182648401827,
888
+ 0.005747126436781609
889
  ],
890
  "bp": 1.0,
891
+ "sys_len": 888,
892
+ "ref_len": 209,
893
+ "sacrebleu": 0.02117434748734448,
894
+ "score": 0.02117434748734448,
895
  "score_name": "sacrebleu",
896
+ "score_ci_low": 0.0034363647304289997,
897
+ "score_ci_high": 0.12617952190760182,
898
+ "sacrebleu_ci_low": 0.0034363647304289997,
899
+ "sacrebleu_ci_high": 0.12617952190760182
900
  },
901
  "mt_flores_101_eng_deu": {
902
+ "num_of_instances": 6,
903
  "counts": [
904
+ 120,
905
+ 63,
906
+ 34,
907
+ 20
908
  ],
909
  "totals": [
910
+ 221,
911
+ 215,
912
+ 209,
913
+ 203
914
  ],
915
  "precisions": [
916
+ 0.5429864253393665,
917
+ 0.2930232558139535,
918
+ 0.1626794258373206,
919
+ 0.09852216748768473
920
  ],
921
+ "bp": 1.0,
922
+ "sys_len": 221,
923
+ "ref_len": 216,
924
+ "sacrebleu": 0.22471880288808055,
925
+ "score": 0.22471880288808055,
926
  "score_name": "sacrebleu",
927
+ "score_ci_low": 0.10763204349992618,
928
+ "score_ci_high": 0.366138035767947,
929
+ "sacrebleu_ci_low": 0.10763204349992618,
930
+ "sacrebleu_ci_high": 0.366138035767947
931
  },
932
  "mt_flores_101_eng_fra": {
933
+ "num_of_instances": 6,
934
  "counts": [
935
+ 167,
936
+ 112,
937
+ 82,
938
+ 61
939
  ],
940
  "totals": [
941
+ 236,
942
+ 230,
943
+ 224,
944
+ 218
945
  ],
946
  "precisions": [
947
+ 0.7076271186440678,
948
+ 0.48695652173913045,
949
+ 0.36607142857142855,
950
+ 0.2798165137614679
951
  ],
952
+ "bp": 1.0,
953
+ "sys_len": 236,
954
+ "ref_len": 235,
955
+ "sacrebleu": 0.43344446111073914,
956
+ "score": 0.43344446111073914,
957
  "score_name": "sacrebleu",
958
+ "score_ci_low": 0.3499428856979402,
959
+ "score_ci_high": 0.5041148126175684,
960
+ "sacrebleu_ci_low": 0.3499428856979402,
961
+ "sacrebleu_ci_high": 0.5041148126175684
962
  },
963
  "mt_flores_101_eng_kor": {
964
+ "num_of_instances": 6,
965
  "counts": [
966
+ 125,
967
+ 55,
968
+ 26,
969
+ 12
970
  ],
971
  "totals": [
972
+ 294,
973
+ 288,
974
+ 282,
975
+ 276
976
  ],
977
  "precisions": [
978
+ 0.4251700680272109,
979
+ 0.1909722222222222,
980
+ 0.0921985815602837,
981
+ 0.043478260869565216
982
  ],
983
  "bp": 1.0,
984
+ "sys_len": 294,
985
+ "ref_len": 249,
986
+ "sacrebleu": 0.13431741406488118,
987
+ "score": 0.13431741406488118,
988
  "score_name": "sacrebleu",
989
+ "score_ci_low": 0.08622044326635767,
990
+ "score_ci_high": 0.18006459720682508,
991
+ "sacrebleu_ci_low": 0.08622044326635767,
992
+ "sacrebleu_ci_high": 0.18006459720682508
993
  },
994
  "mt_flores_101_eng_por": {
995
+ "num_of_instances": 6,
996
  "counts": [
997
+ 162,
998
+ 115,
999
+ 85,
1000
+ 66
1001
  ],
1002
  "totals": [
1003
+ 217,
1004
+ 211,
1005
+ 205,
1006
+ 199
1007
  ],
1008
  "precisions": [
1009
+ 0.7465437788018433,
1010
+ 0.5450236966824644,
1011
+ 0.4146341463414634,
1012
+ 0.3316582914572864
1013
  ],
1014
+ "bp": 0.977221952990032,
1015
+ "sys_len": 217,
1016
+ "ref_len": 222,
1017
+ "sacrebleu": 0.47528035317523853,
1018
+ "score": 0.47528035317523853,
1019
  "score_name": "sacrebleu",
1020
+ "score_ci_low": 0.3904344196224883,
1021
+ "score_ci_high": 0.6010867967504717,
1022
+ "sacrebleu_ci_low": 0.3904344196224883,
1023
+ "sacrebleu_ci_high": 0.6010867967504717
1024
  },
1025
  "mt_flores_101_eng_ron": {
1026
+ "num_of_instances": 6,
1027
  "counts": [
1028
+ 141,
1029
+ 87,
1030
+ 63,
1031
+ 46
1032
  ],
1033
  "totals": [
1034
+ 225,
1035
+ 219,
1036
+ 213,
1037
+ 207
1038
  ],
1039
  "precisions": [
1040
+ 0.6266666666666666,
1041
+ 0.3972602739726028,
1042
+ 0.29577464788732394,
1043
+ 0.2222222222222222
1044
  ],
1045
+ "bp": 0.9780228724846006,
1046
+ "sys_len": 225,
1047
+ "ref_len": 230,
1048
+ "sacrebleu": 0.34979536672149464,
1049
+ "score": 0.34979536672149464,
1050
  "score_name": "sacrebleu",
1051
+ "score_ci_low": 0.27052307609565535,
1052
+ "score_ci_high": 0.46625375184661805,
1053
+ "sacrebleu_ci_low": 0.27052307609565535,
1054
+ "sacrebleu_ci_high": 0.46625375184661805
1055
  },
1056
  "mt_flores_101_eng_spa": {
1057
+ "num_of_instances": 6,
1058
  "counts": [
1059
+ 151,
1060
+ 87,
1061
+ 51,
1062
+ 31
1063
  ],
1064
  "totals": [
1065
+ 229,
1066
+ 223,
1067
+ 217,
1068
+ 211
1069
  ],
1070
  "precisions": [
1071
+ 0.6593886462882097,
1072
+ 0.3901345291479821,
1073
+ 0.2350230414746544,
1074
+ 0.14691943127962084
1075
  ],
1076
+ "bp": 0.9406958880448453,
1077
+ "sys_len": 229,
1078
+ "ref_len": 243,
1079
+ "sacrebleu": 0.2887923132416329,
1080
+ "score": 0.2887923132416329,
1081
  "score_name": "sacrebleu",
1082
+ "score_ci_low": 0.23190126496152094,
1083
+ "score_ci_high": 0.35200649860482036,
1084
+ "sacrebleu_ci_low": 0.23190126496152094,
1085
+ "sacrebleu_ci_high": 0.35200649860482036
1086
  },
1087
  "mt_flores_101_fra_eng": {
1088
+ "num_of_instances": 6,
1089
  "counts": [
1090
+ 157,
1091
+ 105,
1092
+ 75,
1093
+ 53
1094
  ],
1095
  "totals": [
1096
+ 214,
1097
+ 208,
1098
+ 202,
1099
+ 196
1100
  ],
1101
  "precisions": [
1102
+ 0.733644859813084,
1103
+ 0.5048076923076923,
1104
+ 0.3712871287128713,
1105
+ 0.27040816326530615
1106
  ],
1107
  "bp": 1.0,
1108
+ "sys_len": 214,
1109
+ "ref_len": 208,
1110
+ "sacrebleu": 0.43912219013856996,
1111
+ "score": 0.43912219013856996,
1112
  "score_name": "sacrebleu",
1113
+ "score_ci_low": 0.35256330907180095,
1114
+ "score_ci_high": 0.5295701336610964,
1115
+ "sacrebleu_ci_low": 0.35256330907180095,
1116
+ "sacrebleu_ci_high": 0.5295701336610964
1117
  },
1118
  "mt_flores_101_jpn_eng": {
1119
+ "num_of_instances": 6,
1120
  "counts": [
1121
+ 120,
1122
+ 51,
1123
+ 28,
1124
+ 18
1125
  ],
1126
  "totals": [
1127
+ 218,
1128
+ 212,
1129
+ 206,
1130
+ 200
1131
  ],
1132
  "precisions": [
1133
+ 0.5504587155963303,
1134
+ 0.24056603773584906,
1135
+ 0.13592233009708737,
1136
+ 0.09
1137
  ],
1138
  "bp": 1.0,
1139
+ "sys_len": 218,
1140
+ "ref_len": 208,
1141
+ "sacrebleu": 0.20061947843312603,
1142
+ "score": 0.20061947843312603,
1143
  "score_name": "sacrebleu",
1144
+ "score_ci_low": 0.07445839277243944,
1145
+ "score_ci_high": 0.35371176032415624,
1146
+ "sacrebleu_ci_low": 0.07445839277243944,
1147
+ "sacrebleu_ci_high": 0.35371176032415624
1148
  },
1149
  "mt_flores_101_kor_eng": {
1150
+ "num_of_instances": 6,
1151
  "counts": [
1152
+ 111,
1153
+ 48,
1154
+ 25,
1155
+ 13
1156
  ],
1157
  "totals": [
1158
+ 193,
1159
+ 187,
1160
+ 181,
1161
+ 175
1162
  ],
1163
  "precisions": [
1164
+ 0.5751295336787564,
1165
+ 0.2566844919786096,
1166
+ 0.13812154696132597,
1167
+ 0.07428571428571429
1168
  ],
1169
+ "bp": 0.9252232610888251,
1170
+ "sys_len": 193,
1171
+ "ref_len": 208,
1172
+ "sacrebleu": 0.18252802404200877,
1173
+ "score": 0.18252802404200877,
1174
  "score_name": "sacrebleu",
1175
+ "score_ci_low": 0.11632952815932861,
1176
+ "score_ci_high": 0.2243103062098973,
1177
+ "sacrebleu_ci_low": 0.11632952815932861,
1178
+ "sacrebleu_ci_high": 0.2243103062098973
1179
  },
1180
  "mt_flores_101_por_eng": {
1181
+ "num_of_instances": 6,
1182
  "counts": [
1183
+ 155,
1184
+ 99,
1185
+ 69,
1186
+ 52
1187
  ],
1188
  "totals": [
1189
+ 211,
1190
+ 205,
1191
+ 199,
1192
+ 193
1193
  ],
1194
  "precisions": [
1195
+ 0.7345971563981043,
1196
+ 0.48292682926829267,
1197
+ 0.34673366834170855,
1198
+ 0.2694300518134715
1199
  ],
1200
  "bp": 1.0,
1201
+ "sys_len": 211,
1202
+ "ref_len": 208,
1203
+ "sacrebleu": 0.42667103009537916,
1204
+ "score": 0.42667103009537916,
1205
  "score_name": "sacrebleu",
1206
+ "score_ci_low": 0.23618053251632215,
1207
+ "score_ci_high": 0.5738399910229026,
1208
+ "sacrebleu_ci_low": 0.23618053251632215,
1209
+ "sacrebleu_ci_high": 0.5738399910229026
1210
  },
1211
  "mt_flores_101_ron_eng": {
1212
+ "num_of_instances": 6,
1213
  "counts": [
1214
+ 152,
1215
+ 97,
1216
+ 63,
1217
+ 46
1218
  ],
1219
  "totals": [
1220
+ 224,
1221
+ 218,
1222
+ 212,
1223
+ 206
1224
  ],
1225
  "precisions": [
1226
+ 0.6785714285714286,
1227
+ 0.444954128440367,
1228
+ 0.2971698113207547,
1229
+ 0.2233009708737864
1230
  ],
1231
  "bp": 1.0,
1232
+ "sys_len": 224,
1233
+ "ref_len": 208,
1234
+ "sacrebleu": 0.37622835832744195,
1235
+ "score": 0.37622835832744195,
1236
  "score_name": "sacrebleu",
1237
+ "score_ci_low": 0.18968604641672795,
1238
+ "score_ci_high": 0.517042761677566,
1239
+ "sacrebleu_ci_low": 0.18968604641672795,
1240
+ "sacrebleu_ci_high": 0.517042761677566
1241
  },
1242
  "mt_flores_101_spa_eng": {
1243
+ "num_of_instances": 6,
1244
  "counts": [
1245
+ 139,
1246
+ 85,
1247
+ 49,
1248
+ 33
1249
  ],
1250
  "totals": [
1251
+ 219,
1252
+ 213,
1253
+ 207,
1254
+ 201
1255
  ],
1256
  "precisions": [
1257
+ 0.634703196347032,
1258
+ 0.39906103286384975,
1259
+ 0.23671497584541062,
1260
+ 0.16417910447761194
1261
  ],
1262
  "bp": 1.0,
1263
+ "sys_len": 219,
1264
+ "ref_len": 208,
1265
+ "sacrebleu": 0.31498393643128203,
1266
+ "score": 0.31498393643128203,
1267
  "score_name": "sacrebleu",
1268
+ "score_ci_low": 0.22866378052841901,
1269
+ "score_ci_high": 0.3789594362496447,
1270
+ "sacrebleu_ci_low": 0.22866378052841901,
1271
+ "sacrebleu_ci_high": 0.3789594362496447
1272
  },
1273
+ "score": 0.3047184940787982,
1274
  "score_name": "subsets_mean",
1275
+ "num_of_instances": 90
1276
  },
1277
+ "score": 0.42076795543009543,
1278
  "score_name": "subsets_mean",
1279
+ "num_of_instances": 1537
1280
  }
1281
  }
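
Note on the translation entries above: each reported "sacrebleu" value can be reproduced from the logged "precisions" and brevity penalty "bp". The snippet below is a minimal sketch assuming the standard corpus-BLEU formula; the helper name is illustrative and not part of unitxt or sacrebleu.

import math

def bleu_from_components(precisions, bp):
    # Geometric mean of the four n-gram precisions, scaled by the brevity penalty.
    # bp itself equals exp(1 - ref_len / sys_len) when sys_len < ref_len, else 1.0.
    return bp * math.exp(sum(math.log(p) for p in precisions) / len(precisions))

# Example: the mt_flores_101_ara_eng entry above
precisions = [0.6350710900473934, 0.43414634146341463,
              0.3316582914572864, 0.27979274611398963]
print(bleu_from_components(precisions, bp=1.0))  # ~0.39994, matching the reported "sacrebleu"
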
results/bluebench/{2025-06-22T17-10-54_evaluation_results.json → 2025-07-02T17-12-27_evaluation_results.json} RENAMED
@@ -1,6 +1,6 @@
1
  {
2
  "environment_info": {
3
- "timestamp_utc": "2025-06-22T21:10:50.634203Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
@@ -8,7 +8,7 @@
8
  "--model",
9
  "cross_provider",
10
  "--model_args",
11
- "model_name=watsonx/meta-llama/llama-3-2-90b-vision-instruct,max_tokens=256",
12
  "--output_path",
13
  "./results/bluebench",
14
  "--log_samples",
@@ -26,9 +26,9 @@
26
  "num_fewshots": null,
27
  "limit": null,
28
  "batch_size": 8,
29
- "model": "watsonx/meta-llama/llama-3-2-90b-vision-instruct",
30
  "model_args": {
31
- "max_tokens": 256
32
  },
33
  "gen_kwargs": null,
34
  "chat_template_kwargs": null,
@@ -41,8 +41,8 @@
41
  "disable_hf_cache": false,
42
  "cache_dir": null
43
  },
44
- "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
@@ -51,25 +51,25 @@
51
  "triton": "3.3.1",
52
  "nltk": "3.9.1",
53
  "anyio": "4.9.0",
 
54
  "absl-py": "2.3.0",
55
  "tiktoken": "0.9.0",
56
  "charset-normalizer": "3.4.2",
57
  "nvidia-cuda-runtime-cu12": "12.6.77",
58
  "sympy": "1.14.0",
59
  "mecab-ko": "1.0.1",
60
- "litellm": "1.72.6.post1",
61
  "httpcore": "1.0.9",
 
62
  "Jinja2": "3.1.6",
63
  "jsonschema-specifications": "2025.4.1",
64
  "pydantic_core": "2.33.2",
65
  "nvidia-cusparse-cu12": "12.5.4.2",
 
66
  "yarl": "1.20.1",
67
- "openai": "1.88.0",
68
  "portalocker": "3.2.0",
69
  "pandas": "2.3.0",
70
  "multiprocess": "0.70.16",
71
  "jsonschema": "4.24.0",
72
- "unitxt": "1.24.0",
73
  "nvidia-nvjitlink-cu12": "12.6.85",
74
  "nvidia-cublas-cu12": "12.6.4.1",
75
  "pydantic": "2.11.7",
@@ -79,7 +79,7 @@
79
  "contourpy": "1.3.2",
80
  "aiosignal": "1.3.2",
81
  "nvidia-cuda-cupti-cu12": "12.6.80",
82
- "pillow": "11.2.1",
83
  "six": "1.17.0",
84
  "diskcache": "5.6.3",
85
  "tqdm": "4.67.1",
@@ -98,17 +98,16 @@
98
  "kiwisolver": "1.4.8",
99
  "networkx": "3.4.2",
100
  "typing-inspection": "0.4.1",
101
- "lxml": "5.4.0",
102
  "sniffio": "1.3.1",
103
  "scikit-learn": "1.7.0",
 
104
  "nvidia-curand-cu12": "10.3.7.77",
105
  "pip": "25.1.1",
 
106
  "fonttools": "4.58.4",
107
- "transformers": "4.52.4",
108
  "datasets": "3.6.0",
109
  "nvidia-cusolver-cu12": "11.7.1.2",
110
  "cycler": "0.12.1",
111
- "evaluate": "0.4.3",
112
  "distro": "1.9.0",
113
  "idna": "3.10",
114
  "MarkupSafe": "3.0.2",
@@ -122,44 +121,45 @@
122
  "joblib": "1.5.1",
123
  "fsspec": "2025.3.0",
124
  "dill": "0.3.8",
125
- "tokenizers": "0.21.1",
126
  "wheel": "0.45.1",
127
  "nvidia-nvtx-cu12": "12.6.77",
128
  "nvidia-cusparselt-cu12": "0.6.3",
129
- "hf-xet": "1.1.4",
130
  "propcache": "0.3.2",
131
  "numpy": "2.2.6",
132
  "mpmath": "1.3.0",
133
- "multidict": "6.5.0",
134
  "conllu": "6.0.0",
 
135
  "safetensors": "0.5.3",
136
  "requests": "2.32.4",
137
  "regex": "2024.11.6",
138
  "aiohttp": "3.12.13",
139
  "tabulate": "0.9.0",
 
140
  "certifi": "2025.6.15",
141
- "accelerate": "1.8.0",
142
  "nvidia-cufft-cu12": "11.3.0.4",
143
  "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
  "click": "8.2.1",
145
  "typing_extensions": "4.12.2",
146
  "attrs": "25.3.0",
147
  "exceptiongroup": "1.3.0",
 
148
  "tenacity": "9.1.2",
149
  "pytz": "2025.2",
150
  "aiohappyeyeballs": "2.6.1",
151
  "python-dateutil": "2.9.0.post0",
152
  "torch": "2.7.1",
153
- "python-dotenv": "1.1.0",
 
154
  "httpx": "0.28.1",
155
  "matplotlib": "3.10.3",
156
  "xxhash": "3.5.0",
157
  "PyYAML": "6.0.2",
158
- "huggingface-hub": "0.33.0",
159
  "colorama": "0.4.6",
160
- "rpds-py": "0.25.1",
161
  "threadpoolctl": "3.6.0",
162
  "nvidia-cudnn-cu12": "9.5.1.17",
 
163
  "jaraco.collections": "5.1.0",
164
  "tomli": "2.0.1",
165
  "backports.tarfile": "1.2.0",
@@ -176,14 +176,14 @@
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
- "accuracy": 0.8222222222222222,
180
- "accuracy_ci_low": 0.7333333333333333,
181
- "accuracy_ci_high": 0.8888888888888888,
182
  "score_name": "accuracy",
183
- "score": 0.8222222222222222,
184
- "score_ci_high": 0.8888888888888888,
185
- "score_ci_low": 0.7333333333333333,
186
- "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
  "accuracy": 1.0,
@@ -193,17 +193,17 @@
193
  "score": 1.0,
194
  "score_ci_high": 1.0,
195
  "score_ci_low": 1.0,
196
- "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
199
- "accuracy": 0.9888888888888889,
200
- "accuracy_ci_low": 0.9366915726689814,
201
  "accuracy_ci_high": 1.0,
202
  "score_name": "accuracy",
203
- "score": 0.9888888888888889,
204
  "score_ci_high": 1.0,
205
- "score_ci_low": 0.9366915726689814,
206
- "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
  "accuracy": 1.0,
@@ -213,7 +213,7 @@
213
  "score": 1.0,
214
  "score_ci_high": 1.0,
215
  "score_ci_low": 1.0,
216
- "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
  "accuracy": 1.0,
@@ -223,7 +223,7 @@
223
  "score": 1.0,
224
  "score_ci_high": 1.0,
225
  "score_ci_low": 1.0,
226
- "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
  "accuracy": 1.0,
@@ -233,7 +233,7 @@
233
  "score": 1.0,
234
  "score_ci_high": 1.0,
235
  "score_ci_low": 1.0,
236
- "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
239
  "accuracy": 1.0,
@@ -243,17 +243,17 @@
243
  "score": 1.0,
244
  "score_ci_high": 1.0,
245
  "score_ci_low": 1.0,
246
- "num_of_instances": 90
247
  },
248
  "safety_bbq_race_x_ses": {
249
- "accuracy": 1.0,
250
- "accuracy_ci_low": 1.0,
251
  "accuracy_ci_high": 1.0,
252
  "score_name": "accuracy",
253
- "score": 1.0,
254
  "score_ci_high": 1.0,
255
- "score_ci_low": 1.0,
256
- "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
  "accuracy": 1.0,
@@ -263,479 +263,477 @@
263
  "score": 1.0,
264
  "score_ci_high": 1.0,
265
  "score_ci_low": 1.0,
266
- "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
- "accuracy": 0.9777777777777777,
270
- "accuracy_ci_low": 0.9222222222222223,
271
  "accuracy_ci_high": 1.0,
272
  "score_name": "accuracy",
273
- "score": 0.9777777777777777,
274
  "score_ci_high": 1.0,
275
- "score_ci_low": 0.9222222222222223,
276
- "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.9,
280
- "accuracy_ci_low": 0.8222222222222222,
281
- "accuracy_ci_high": 0.9555555555555556,
282
  "score_name": "accuracy",
283
- "score": 0.9,
284
- "score_ci_high": 0.9555555555555556,
285
- "score_ci_low": 0.8222222222222222,
286
- "num_of_instances": 90
287
  },
288
- "score": 0.9717171717171718,
289
  "score_name": "subsets_mean",
290
- "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
- "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.15036803364879076,
296
- "score": 0.15036803364879076,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
- "score": 0.15036803364879076,
300
  "score_name": "subsets_mean",
301
- "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
- "num_of_instances": 1000,
306
- "f1_Person": 0.6063829787234042,
307
- "f1_Organization": 0.3867069486404834,
308
- "f1_Location": 0.43678160919540227,
309
- "f1_macro": 0.4766238455197633,
310
- "recall_macro": 0.43686343505993114,
311
- "precision_macro": 0.5290149382542261,
312
- "in_classes_support": 0.8519230769230769,
313
- "f1_micro": 0.44976076555023925,
314
- "recall_micro": 0.44761904761904764,
315
- "precision_micro": 0.4519230769230769,
316
- "score": 0.44976076555023925,
317
  "score_name": "f1_micro",
318
- "score_ci_low": 0.40206449128985794,
319
- "score_ci_high": 0.5019430325767736,
320
- "f1_micro_ci_low": 0.40206449128985794,
321
- "f1_micro_ci_high": 0.5019430325767736
322
  },
323
- "score": 0.44976076555023925,
324
  "score_name": "subsets_mean",
325
- "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
- "accuracy": 0.6901408450704225,
330
- "accuracy_ci_low": 0.5774647887323944,
331
- "accuracy_ci_high": 0.7887323943661971,
332
  "score_name": "accuracy",
333
- "score": 0.6901408450704225,
334
- "score_ci_high": 0.7887323943661971,
335
- "score_ci_low": 0.5774647887323944,
336
- "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
- "accuracy": 0.39436619718309857,
340
- "accuracy_ci_low": 0.26949490209003363,
341
- "accuracy_ci_high": 0.5070422535211268,
342
  "score_name": "accuracy",
343
- "score": 0.39436619718309857,
344
- "score_ci_high": 0.5070422535211268,
345
- "score_ci_low": 0.26949490209003363,
346
- "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
- "accuracy": 0.2676056338028169,
350
- "accuracy_ci_low": 0.16901408450704225,
351
- "accuracy_ci_high": 0.38028169014084506,
352
  "score_name": "accuracy",
353
- "score": 0.2676056338028169,
354
- "score_ci_high": 0.38028169014084506,
355
- "score_ci_low": 0.16901408450704225,
356
- "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
- "accuracy": 0.5633802816901409,
360
- "accuracy_ci_low": 0.4507042253521127,
361
- "accuracy_ci_high": 0.676056338028169,
362
  "score_name": "accuracy",
363
- "score": 0.5633802816901409,
364
- "score_ci_high": 0.676056338028169,
365
- "score_ci_low": 0.4507042253521127,
366
- "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
- "accuracy": 0.6901408450704225,
370
- "accuracy_ci_low": 0.5774647887323944,
371
- "accuracy_ci_high": 0.7887323943661971,
372
  "score_name": "accuracy",
373
- "score": 0.6901408450704225,
374
- "score_ci_high": 0.7887323943661971,
375
- "score_ci_low": 0.5774647887323944,
376
- "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
379
- "accuracy": 0.38028169014084506,
380
- "accuracy_ci_low": 0.2676056338028169,
381
- "accuracy_ci_high": 0.49295774647887325,
382
  "score_name": "accuracy",
383
- "score": 0.38028169014084506,
384
- "score_ci_high": 0.49295774647887325,
385
- "score_ci_low": 0.2676056338028169,
386
- "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
- "accuracy": 0.5633802816901409,
390
- "accuracy_ci_low": 0.4507042253521127,
391
- "accuracy_ci_high": 0.676056338028169,
392
  "score_name": "accuracy",
393
- "score": 0.5633802816901409,
394
- "score_ci_high": 0.676056338028169,
395
- "score_ci_low": 0.4507042253521127,
396
- "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
- "accuracy": 0.6619718309859155,
400
- "accuracy_ci_low": 0.5492957746478874,
401
- "accuracy_ci_high": 0.7605633802816901,
402
  "score_name": "accuracy",
403
- "score": 0.6619718309859155,
404
- "score_ci_high": 0.7605633802816901,
405
- "score_ci_low": 0.5492957746478874,
406
- "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
- "accuracy": 0.5070422535211268,
410
- "accuracy_ci_low": 0.39436619718309857,
411
- "accuracy_ci_high": 0.6197183098591549,
412
  "score_name": "accuracy",
413
- "score": 0.5070422535211268,
414
- "score_ci_high": 0.6197183098591549,
415
- "score_ci_low": 0.39436619718309857,
416
- "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
- "accuracy": 0.36619718309859156,
420
- "accuracy_ci_low": 0.2535211267605634,
421
- "accuracy_ci_high": 0.4788732394366197,
422
  "score_name": "accuracy",
423
- "score": 0.36619718309859156,
424
- "score_ci_high": 0.4788732394366197,
425
- "score_ci_low": 0.2535211267605634,
426
- "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
- "accuracy": 0.5211267605633803,
430
- "accuracy_ci_low": 0.4084507042253521,
431
- "accuracy_ci_high": 0.6338028169014085,
432
  "score_name": "accuracy",
433
- "score": 0.5211267605633803,
434
- "score_ci_high": 0.6338028169014085,
435
- "score_ci_low": 0.4084507042253521,
436
- "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
- "accuracy": 0.647887323943662,
440
- "accuracy_ci_low": 0.5211267605633803,
441
- "accuracy_ci_high": 0.7605633802816901,
442
  "score_name": "accuracy",
443
- "score": 0.647887323943662,
444
- "score_ci_high": 0.7605633802816901,
445
- "score_ci_low": 0.5211267605633803,
446
- "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
- "accuracy": 0.43661971830985913,
450
- "accuracy_ci_low": 0.323943661971831,
451
- "accuracy_ci_high": 0.5492957746478874,
452
  "score_name": "accuracy",
453
- "score": 0.43661971830985913,
454
- "score_ci_high": 0.5492957746478874,
455
- "score_ci_low": 0.323943661971831,
456
- "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
- "accuracy": 0.6901408450704225,
460
- "accuracy_ci_low": 0.5774647887323944,
461
- "accuracy_ci_high": 0.7887323943661971,
462
  "score_name": "accuracy",
463
- "score": 0.6901408450704225,
464
- "score_ci_high": 0.7887323943661971,
465
- "score_ci_low": 0.5774647887323944,
466
- "num_of_instances": 71
467
  },
468
- "score": 0.5271629778672032,
469
  "score_name": "subsets_mean",
470
- "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
- "f1_macro": 0.7225597780588705,
475
- "f1_suggestive": 0.6896551724137931,
476
- "f1_generic": 0.9333333333333333,
477
- "f1_fanciful": 0.5185185185185185,
478
- "f1_descriptive": 0.7894736842105263,
479
- "f1_arbitrary": 0.6818181818181818,
480
- "f1_macro_ci_low": 0.6323470231492377,
481
- "f1_macro_ci_high": 0.8166804118889143,
482
  "score_name": "f1_micro",
483
- "score": 0.7261904761904762,
484
- "score_ci_high": 0.8165680473372781,
485
- "score_ci_low": 0.6278443317985081,
486
- "num_of_instances": 85,
487
- "accuracy": 0.7176470588235294,
488
- "accuracy_ci_low": 0.6235294117647059,
489
- "accuracy_ci_high": 0.8,
490
- "f1_micro": 0.7261904761904762,
491
- "f1_micro_ci_low": 0.6278443317985081,
492
- "f1_micro_ci_high": 0.8165680473372781
493
  },
494
  "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.6249597423510467,
496
- "f1_no": 0.6869565217391305,
497
- "f1_yes": 0.562962962962963,
498
- "f1_macro_ci_low": 0.5550554427556457,
499
- "f1_macro_ci_high": 0.695881941412769,
500
  "score_name": "f1_micro",
501
- "score": 0.6410958904109589,
502
- "score_ci_high": 0.7049180327868853,
503
- "score_ci_low": 0.5737704918032787,
504
- "num_of_instances": 200,
505
- "accuracy": 0.585,
506
- "accuracy_ci_low": 0.52,
507
- "accuracy_ci_high": 0.65,
508
- "f1_micro": 0.6410958904109589,
509
- "f1_micro_ci_low": 0.5737704918032787,
510
- "f1_micro_ci_high": 0.7049180327868853
511
  },
512
  "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.3038558663558663,
514
- "f1_conclusion": 0.1111111111111111,
515
- "f1_decree": 0.24242424242424243,
516
- "f1_issue": 0.2916666666666667,
517
- "f1_analysis": 0.5625,
518
- "f1_facts": 0.12121212121212122,
519
- "f1_procedural history": 0.375,
520
- "f1_rule": 0.4230769230769231,
521
- "f1_macro_ci_low": 0.24917773569698498,
522
- "f1_macro_ci_high": 0.3785030935767383,
523
  "score_name": "f1_micro",
524
- "score": 0.3393939393939394,
525
- "score_ci_high": 0.4145430992532546,
526
- "score_ci_low": 0.2731916089829871,
527
- "num_of_instances": 200,
528
- "accuracy": 0.28,
529
- "accuracy_ci_low": 0.22,
530
- "accuracy_ci_high": 0.3484825462990022,
531
- "f1_micro": 0.3393939393939394,
532
- "f1_micro_ci_low": 0.2731916089829871,
533
- "f1_micro_ci_high": 0.4145430992532546
534
  },
535
  "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.5240648011782032,
537
- "f1_yes": 0.49142857142857144,
538
- "f1_no": 0.5567010309278351,
539
- "f1_macro_ci_low": 0.45400963495793933,
540
- "f1_macro_ci_high": 0.5921495411901395,
541
  "score_name": "f1_micro",
542
- "score": 0.5257452574525745,
543
- "score_ci_high": 0.5909095637067483,
544
- "score_ci_low": 0.4547945205479452,
545
- "num_of_instances": 200,
546
- "accuracy": 0.485,
547
- "accuracy_ci_low": 0.415,
548
- "accuracy_ci_high": 0.55,
549
- "f1_micro": 0.5257452574525745,
550
- "f1_micro_ci_low": 0.4547945205479452,
551
- "f1_micro_ci_high": 0.5909095637067483
552
  },
553
  "legalbench_proa": {
554
- "f1_macro": 0.7668918918918919,
555
- "f1_yes": 0.75,
556
- "f1_no": 0.7837837837837838,
557
- "f1_macro_ci_low": 0.679529165397271,
558
- "f1_macro_ci_high": 0.8388811527947668,
559
  "score_name": "f1_micro",
560
- "score": 0.7671232876712328,
561
- "score_ci_high": 0.8378378378378378,
562
- "score_ci_low": 0.6808510638297872,
563
- "num_of_instances": 85,
564
- "accuracy": 0.6588235294117647,
565
- "accuracy_ci_low": 0.5647058823529412,
566
- "accuracy_ci_high": 0.7529411764705882,
567
- "f1_micro": 0.7671232876712328,
568
- "f1_micro_ci_low": 0.6808510638297872,
569
- "f1_micro_ci_high": 0.8378378378378378
570
  },
571
- "score": 0.5999097702238364,
572
  "score_name": "subsets_mean",
573
- "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
- "f1_macro": 0.6635217038391774,
578
- "f1_cars": 0.9019607843137255,
579
- "f1_windows x": 0.19444444444444445,
580
- "f1_computer graphics": 0.4496124031007752,
581
- "f1_atheism": 0.5614035087719298,
582
- "f1_christianity": 0.8113207547169812,
583
- "f1_religion": 0.3103448275862069,
584
- "f1_medicine": 0.8275862068965517,
585
- "f1_for sale": 0.6923076923076923,
586
- "f1_microsoft windows": 0.6818181818181818,
587
- "f1_middle east": 0.684931506849315,
588
- "f1_motorcycles": 0.7962962962962963,
589
- "f1_pc hardware": 0.6474820143884892,
590
- "f1_mac hardware": 0.7307692307692307,
591
- "f1_guns": 0.4594594594594595,
592
- "f1_space": 0.8440366972477065,
593
- "f1_cryptography": 0.7105263157894737,
594
- "f1_baseball": 0.9491525423728814,
595
- "f1_hockey": 0.9701492537313433,
596
- "f1_politics": 0.38016528925619836,
597
  "f1_electronics": 0.6666666666666666,
598
- "f1_macro_ci_low": 0.6398385377906187,
599
- "f1_macro_ci_high": 0.6921196013936116,
 
 
 
 
 
 
 
600
  "score_name": "f1_micro",
601
- "score": 0.6843198338525441,
602
- "score_ci_high": 0.7121991620876709,
603
- "score_ci_low": 0.6566124058286812,
604
- "num_of_instances": 1000,
605
- "accuracy": 0.659,
606
- "accuracy_ci_low": 0.629,
607
- "accuracy_ci_high": 0.688,
608
- "f1_micro": 0.6843198338525441,
609
- "f1_micro_ci_low": 0.6566124058286812,
610
- "f1_micro_ci_high": 0.7121991620876709
611
  },
612
- "score": 0.6843198338525441,
613
  "score_name": "subsets_mean",
614
- "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
- "f1_macro": 0.7097642052328873,
619
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.9338959212376934,
620
- "f1_checking or savings account": 0.8495575221238938,
621
- "f1_debt collection": 0.5492957746478874,
622
- "f1_credit card or prepaid card": 0.5060240963855421,
623
- "f1_mortgage": 0.8115942028985508,
624
- "f1_payday loan or title loan or personal loan": 0.47058823529411764,
625
- "f1_student loan": 0.896551724137931,
626
- "f1_money transfer or virtual currency or money service": 0.8148148148148148,
627
- "f1_vehicle loan or lease": 0.5555555555555556,
628
- "f1_macro_ci_low": 0.6547879605458498,
629
- "f1_macro_ci_high": 0.7665337654404797,
630
  "score_name": "f1_micro",
631
- "score": 0.8641221374045801,
632
- "score_ci_high": 0.8833607904776744,
633
- "score_ci_low": 0.8421586938502544,
634
- "num_of_instances": 1000,
635
- "accuracy": 0.849,
636
- "accuracy_ci_low": 0.826,
637
- "accuracy_ci_high": 0.869,
638
- "f1_micro": 0.8641221374045801,
639
- "f1_micro_ci_low": 0.8421586938502544,
640
- "f1_micro_ci_high": 0.8833607904776744
641
  },
642
  "cfpb_product_watsonx": {
643
- "f1_macro": 0.762019616454399,
644
- "f1_mortgages and loans": 0.8181818181818182,
645
- "f1_credit card": 0.8,
646
- "f1_debt collection": 0.6859903381642513,
647
- "f1_credit reporting": 0.78,
648
- "f1_retail banking": 0.725925925925926,
649
- "f1_macro_ci_low": 0.7228549596893655,
650
- "f1_macro_ci_high": 0.7974306633005968,
651
  "score_name": "f1_micro",
652
- "score": 0.7633434038267876,
653
- "score_ci_high": 0.7971877449640327,
654
- "score_ci_low": 0.725195552217836,
655
- "num_of_instances": 500,
656
- "accuracy": 0.758,
657
- "accuracy_ci_low": 0.718,
658
- "accuracy_ci_high": 0.794,
659
- "f1_micro": 0.7633434038267876,
660
- "f1_micro_ci_low": 0.725195552217836,
661
- "f1_micro_ci_high": 0.7971877449640327
662
  },
663
- "score": 0.8137327706156838,
664
  "score_name": "subsets_mean",
665
- "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
- "num_of_instances": 1000,
670
- "program_accuracy": 0.215,
671
- "score": 0.215,
672
  "score_name": "program_accuracy",
673
- "execution_accuracy": 0.197,
674
- "program_accuracy_ci_low": 0.19,
675
- "program_accuracy_ci_high": 0.241,
676
- "score_ci_low": 0.19,
677
- "score_ci_high": 0.241,
678
- "execution_accuracy_ci_low": 0.175,
679
- "execution_accuracy_ci_high": 0.2231767765112022
680
  },
681
- "score": 0.215,
682
  "score_name": "subsets_mean",
683
- "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
- "precision": 0.3315078335561397,
688
- "recall": 0.5240469619010103,
689
- "f1": 0.34432296142118446,
690
- "precision_ci_low": 0.3111263193000635,
691
- "precision_ci_high": 0.35216605940257417,
692
- "recall_ci_low": 0.5095542204077604,
693
- "recall_ci_high": 0.5396498579199621,
694
- "f1_ci_low": 0.3272930525098391,
695
- "f1_ci_high": 0.3622790296659677,
696
  "score_name": "f1",
697
- "score": 0.34432296142118446,
698
- "score_ci_high": 0.3622790296659677,
699
- "score_ci_low": 0.3272930525098391,
700
- "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.5984612627824147,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6712562903761864,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5609361777206262,
704
- "faithfullness_f1_token_overlap": 0.2786719270117274,
705
- "faithfullness_recall_token_overlap": 0.20806249969631424,
706
- "faithfullness_precision_token_overlap": 0.5649102031372216,
707
- "correctness_f1_token_overlap": 0.34432296142118446,
708
- "correctness_recall_token_overlap": 0.5240469619010103,
709
- "correctness_precision_token_overlap": 0.3315078335561397
710
  },
711
- "score": 0.34432296142118446,
712
  "score_name": "subsets_mean",
713
- "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
- "accuracy": 0.556,
718
- "accuracy_ci_low": 0.524,
719
- "accuracy_ci_high": 0.5869866405591143,
720
  "score_name": "accuracy",
721
- "score": 0.556,
722
- "score_ci_high": 0.5869866405591143,
723
- "score_ci_low": 0.524,
724
- "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
- "accuracy": 0.94,
728
- "accuracy_ci_low": 0.916,
729
- "accuracy_ci_high": 0.956,
730
  "score_name": "accuracy",
731
- "score": 0.94,
732
- "score_ci_high": 0.956,
733
- "score_ci_low": 0.916,
734
- "num_of_instances": 500
735
  },
736
- "score": 0.748,
737
  "score_name": "subsets_mean",
738
- "num_of_instances": 1500
739
  },
740
  "safety": {
741
  "attaq_500": {
@@ -743,541 +741,541 @@
743
  "severity_high": 0.0,
744
  "severity_medium": 1.0,
745
  "severity_low": 99.0,
746
- "category_harmful_info": 0.8564385112286323,
747
- "category_deception": 0.8711490425648111,
748
- "category_explicit_content": 0.8348137058994987,
749
- "category_violence": 0.8371000632880226,
750
- "category_discrimination": 0.7546705811742752,
751
- "category_substance_abuse": 0.8373568359348509,
752
- "category_pii": 0.8237267407002272,
753
- "safety": 0.8398502923258477,
754
- "safety_ci_low": 0.8202995500858341,
755
- "safety_ci_high": 0.8593436721902035,
756
  "score_name": "safety",
757
- "score": 0.8398502923258477,
758
- "score_ci_high": 0.8593436721902035,
759
- "score_ci_low": 0.8202995500858341,
760
  "num_of_instances": 100
761
  },
762
- "score": 0.8398502923258477,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
- "num_of_instances": 528,
769
- "rouge2": 0.2234078484606255,
770
- "rouge1": 0.4320936503911059,
771
- "rougeL": 0.30661978583917676,
772
- "score": 0.30661978583917676,
773
  "score_name": "rougeL",
774
- "rougeLsum": 0.3725727472081357,
775
- "rouge2_ci_low": 0.21606279145137167,
776
- "rouge2_ci_high": 0.2311572725809842,
777
- "rouge1_ci_low": 0.42200418152790053,
778
- "rouge1_ci_high": 0.44139989552409375,
779
- "rougeL_ci_low": 0.2992390814144599,
780
- "rougeL_ci_high": 0.3143226721498939,
781
- "score_ci_low": 0.2992390814144599,
782
- "score_ci_high": 0.3143226721498939,
783
- "rougeLsum_ci_low": 0.3633508576032972,
784
- "rougeLsum_ci_high": 0.3810899218803269
 
 
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
- "num_of_instances": 1000,
788
- "rouge2": 0.020660811704056488,
789
- "rouge1": 0.13221889196965417,
790
- "rougeL": 0.09485458949436118,
791
- "score": 0.09485458949436118,
792
  "score_name": "rougeL",
793
- "rougeLsum": 0.10832578587514186,
794
- "rouge2_ci_low": 0.01868448614645504,
795
- "rouge2_ci_high": 0.022956487198784636,
796
- "rouge1_ci_low": 0.126104991201355,
797
- "rouge1_ci_high": 0.1377085399711476,
798
- "rougeL_ci_low": 0.09086436325621726,
799
- "rougeL_ci_high": 0.09876262383117224,
800
- "score_ci_low": 0.09086436325621726,
801
- "score_ci_high": 0.09876262383117224,
802
- "rougeLsum_ci_low": 0.10373535762969843,
803
- "rougeLsum_ci_high": 0.11251759270443917
 
 
804
  },
805
- "score": 0.20073718766676896,
806
  "score_name": "subsets_mean",
807
- "num_of_instances": 1528
808
  },
809
  "translation": {
810
  "mt_flores_101_ara_eng": {
811
- "num_of_instances": 66,
812
  "counts": [
813
- 1306,
814
- 883,
815
- 627,
816
- 449
817
  ],
818
  "totals": [
819
- 1786,
820
- 1720,
821
- 1654,
822
- 1588
823
  ],
824
  "precisions": [
825
- 0.7312430011198208,
826
- 0.5133720930232558,
827
- 0.37908101571946795,
828
- 0.28274559193954657
829
  ],
830
  "bp": 1.0,
831
- "sys_len": 1786,
832
- "ref_len": 1734,
833
- "sacrebleu": 0.44787360079689753,
834
- "score": 0.44787360079689753,
835
  "score_name": "sacrebleu",
836
- "score_ci_low": 0.4036377627510155,
837
- "score_ci_high": 0.4918827063832084,
838
- "sacrebleu_ci_low": 0.4036377627510155,
839
- "sacrebleu_ci_high": 0.4918827063832084
840
  },
841
  "mt_flores_101_deu_eng": {
842
- "num_of_instances": 66,
843
  "counts": [
844
- 1315,
845
- 856,
846
- 590,
847
- 421
848
  ],
849
  "totals": [
850
- 1806,
851
- 1740,
852
- 1674,
853
- 1608
854
  ],
855
  "precisions": [
856
- 0.7281284606866002,
857
- 0.49195402298850577,
858
- 0.3524492234169654,
859
- 0.26181592039800994
860
  ],
861
  "bp": 1.0,
862
- "sys_len": 1806,
863
- "ref_len": 1734,
864
- "sacrebleu": 0.42638928564964085,
865
- "score": 0.42638928564964085,
866
  "score_name": "sacrebleu",
867
- "score_ci_low": 0.39041673082772005,
868
- "score_ci_high": 0.4759249097236273,
869
- "sacrebleu_ci_low": 0.39041673082772005,
870
- "sacrebleu_ci_high": 0.4759249097236273
871
  },
872
  "mt_flores_101_eng_ara": {
873
- "num_of_instances": 66,
874
  "counts": [
875
- 935,
876
- 516,
877
- 311,
878
- 192
879
  ],
880
  "totals": [
881
- 1626,
882
- 1560,
883
- 1494,
884
- 1428
885
  ],
886
  "precisions": [
887
- 0.5750307503075031,
888
- 0.3307692307692308,
889
- 0.20816599732262384,
890
- 0.13445378151260504
891
  ],
892
- "bp": 1.0,
893
- "sys_len": 1626,
894
- "ref_len": 1589,
895
- "sacrebleu": 0.27011564955900186,
896
- "score": 0.27011564955900186,
897
  "score_name": "sacrebleu",
898
- "score_ci_low": 0.2357102474208666,
899
- "score_ci_high": 0.3104170869325649,
900
- "sacrebleu_ci_low": 0.2357102474208666,
901
- "sacrebleu_ci_high": 0.3104170869325649
902
  },
903
  "mt_flores_101_eng_deu": {
904
- "num_of_instances": 66,
905
  "counts": [
906
- 1239,
907
- 749,
908
- 489,
909
- 333
910
  ],
911
  "totals": [
912
- 1835,
913
- 1769,
914
- 1703,
915
- 1637
916
  ],
917
  "precisions": [
918
- 0.6752043596730245,
919
- 0.4234030525720746,
920
- 0.28714034057545507,
921
- 0.2034208918753818
922
  ],
923
  "bp": 1.0,
924
- "sys_len": 1835,
925
- "ref_len": 1835,
926
- "sacrebleu": 0.35947587289557503,
927
- "score": 0.35947587289557503,
928
  "score_name": "sacrebleu",
929
- "score_ci_low": 0.32388996948420856,
930
- "score_ci_high": 0.40304188977063987,
931
- "sacrebleu_ci_low": 0.32388996948420856,
932
- "sacrebleu_ci_high": 0.40304188977063987
933
  },
934
  "mt_flores_101_eng_fra": {
935
- "num_of_instances": 66,
936
  "counts": [
937
- 1522,
938
- 1124,
939
- 872,
940
- 690
941
  ],
942
  "totals": [
943
- 2039,
944
- 1973,
945
- 1907,
946
- 1841
947
  ],
948
  "precisions": [
949
- 0.7464443354585582,
950
- 0.5696908261530664,
951
- 0.4572627163083377,
952
- 0.37479630635524175
953
  ],
954
- "bp": 0.985878006034285,
955
- "sys_len": 2039,
956
- "ref_len": 2068,
957
- "sacrebleu": 0.5122389690371388,
958
- "score": 0.5122389690371388,
959
  "score_name": "sacrebleu",
960
- "score_ci_low": 0.4712480498361402,
961
- "score_ci_high": 0.5723927923031075,
962
- "sacrebleu_ci_low": 0.4712480498361402,
963
- "sacrebleu_ci_high": 0.5723927923031075
964
  },
965
  "mt_flores_101_eng_kor": {
966
- "num_of_instances": 66,
967
  "counts": [
968
- 1381,
969
- 741,
970
- 442,
971
- 270
972
  ],
973
  "totals": [
974
- 2380,
975
- 2314,
976
- 2248,
977
- 2182
978
  ],
979
  "precisions": [
980
- 0.5802521008403362,
981
- 0.3202247191011236,
982
- 0.19661921708185054,
983
- 0.12373968835930339
984
  ],
985
  "bp": 1.0,
986
- "sys_len": 2380,
987
- "ref_len": 2235,
988
- "sacrebleu": 0.2592994758065073,
989
- "score": 0.2592994758065073,
990
  "score_name": "sacrebleu",
991
- "score_ci_low": 0.2239202825861796,
992
- "score_ci_high": 0.28514030316442807,
993
- "sacrebleu_ci_low": 0.2239202825861796,
994
- "sacrebleu_ci_high": 0.28514030316442807
995
  },
996
  "mt_flores_101_eng_por": {
997
- "num_of_instances": 66,
998
  "counts": [
999
- 1450,
1000
- 1041,
1001
- 790,
1002
- 605
1003
  ],
1004
  "totals": [
1005
- 1904,
1006
- 1838,
1007
- 1772,
1008
- 1706
1009
  ],
1010
  "precisions": [
1011
- 0.7615546218487396,
1012
- 0.5663764961915125,
1013
- 0.44582392776523705,
1014
- 0.35463071512309496
1015
  ],
1016
- "bp": 0.9937172982182376,
1017
- "sys_len": 1904,
1018
- "ref_len": 1916,
1019
- "sacrebleu": 0.5078077801713752,
1020
- "score": 0.5078077801713752,
1021
  "score_name": "sacrebleu",
1022
- "score_ci_low": 0.4521493141425684,
1023
- "score_ci_high": 0.5472131500369735,
1024
- "sacrebleu_ci_low": 0.4521493141425684,
1025
- "sacrebleu_ci_high": 0.5472131500369735
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
- "num_of_instances": 66,
1029
  "counts": [
1030
- 1407,
1031
- 988,
1032
- 717,
1033
- 522
1034
  ],
1035
  "totals": [
1036
- 1981,
1037
- 1915,
1038
- 1849,
1039
- 1783
1040
  ],
1041
  "precisions": [
1042
- 0.7102473498233215,
1043
- 0.5159268929503916,
1044
- 0.3877771768523526,
1045
- 0.2927650028042625
1046
  ],
1047
  "bp": 1.0,
1048
- "sys_len": 1981,
1049
- "ref_len": 1949,
1050
- "sacrebleu": 0.4516216968085713,
1051
- "score": 0.4516216968085713,
1052
  "score_name": "sacrebleu",
1053
- "score_ci_low": 0.4139287472062087,
1054
- "score_ci_high": 0.49223929725777865,
1055
- "sacrebleu_ci_low": 0.4139287472062087,
1056
- "sacrebleu_ci_high": 0.49223929725777865
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
- "num_of_instances": 66,
1060
  "counts": [
1061
- 1307,
1062
- 777,
1063
- 489,
1064
- 316
1065
  ],
1066
  "totals": [
1067
- 2014,
1068
- 1948,
1069
- 1882,
1070
- 1816
1071
  ],
1072
  "precisions": [
1073
- 0.6489572989076464,
1074
- 0.398870636550308,
1075
- 0.2598299681190223,
1076
- 0.17400881057268724
1077
  ],
1078
- "bp": 0.9591497695217011,
1079
- "sys_len": 2014,
1080
- "ref_len": 2098,
1081
- "sacrebleu": 0.3154740151881343,
1082
- "score": 0.3154740151881343,
1083
  "score_name": "sacrebleu",
1084
- "score_ci_low": 0.288220347209022,
1085
- "score_ci_high": 0.3556750708776675,
1086
- "sacrebleu_ci_low": 0.288220347209022,
1087
- "sacrebleu_ci_high": 0.3556750708776675
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
- "num_of_instances": 66,
1091
  "counts": [
1092
- 1350,
1093
- 939,
1094
- 681,
1095
- 500
1096
  ],
1097
  "totals": [
1098
- 1836,
1099
- 1770,
1100
- 1704,
1101
- 1638
1102
  ],
1103
  "precisions": [
1104
- 0.7352941176470589,
1105
- 0.5305084745762713,
1106
- 0.3996478873239437,
1107
- 0.3052503052503053
1108
  ],
1109
  "bp": 1.0,
1110
- "sys_len": 1836,
1111
- "ref_len": 1734,
1112
- "sacrebleu": 0.46705901757494195,
1113
- "score": 0.46705901757494195,
1114
  "score_name": "sacrebleu",
1115
- "score_ci_low": 0.41922690444245675,
1116
- "score_ci_high": 0.504377689163203,
1117
- "sacrebleu_ci_low": 0.41922690444245675,
1118
- "sacrebleu_ci_high": 0.504377689163203
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
- "num_of_instances": 66,
1122
  "counts": [
1123
- 1114,
1124
- 590,
1125
- 369,
1126
- 236
1127
  ],
1128
  "totals": [
1129
- 1784,
1130
- 1718,
1131
- 1652,
1132
- 1586
1133
  ],
1134
  "precisions": [
1135
- 0.6244394618834082,
1136
- 0.34342258440046564,
1137
- 0.22336561743341407,
1138
- 0.14880201765447668
1139
  ],
1140
  "bp": 1.0,
1141
- "sys_len": 1784,
1142
- "ref_len": 1734,
1143
- "sacrebleu": 0.2905601720675106,
1144
- "score": 0.2905601720675106,
1145
  "score_name": "sacrebleu",
1146
- "score_ci_low": 0.2583164259733059,
1147
- "score_ci_high": 0.32976603144676014,
1148
- "sacrebleu_ci_low": 0.2583164259733059,
1149
- "sacrebleu_ci_high": 0.32976603144676014
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
- "num_of_instances": 66,
1153
  "counts": [
1154
- 1147,
1155
- 633,
1156
- 385,
1157
- 239
1158
  ],
1159
  "totals": [
1160
- 1773,
1161
- 1707,
1162
- 1641,
1163
- 1575
1164
  ],
1165
  "precisions": [
1166
- 0.64692611393119,
1167
- 0.37082601054481545,
1168
- 0.23461304082876297,
1169
- 0.15174603174603174
1170
  ],
1171
- "bp": 1.0,
1172
- "sys_len": 1773,
1173
- "ref_len": 1734,
1174
- "sacrebleu": 0.3040000049079303,
1175
- "score": 0.3040000049079303,
1176
  "score_name": "sacrebleu",
1177
- "score_ci_low": 0.27388865177998245,
1178
- "score_ci_high": 0.3607554043507509,
1179
- "sacrebleu_ci_low": 0.27388865177998245,
1180
- "sacrebleu_ci_high": 0.3607554043507509
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
- "num_of_instances": 66,
1184
  "counts": [
1185
- 1367,
1186
- 976,
1187
- 726,
1188
- 541
1189
  ],
1190
  "totals": [
1191
- 1814,
1192
- 1748,
1193
- 1682,
1194
- 1616
1195
  ],
1196
  "precisions": [
1197
- 0.7535832414553473,
1198
- 0.5583524027459954,
1199
- 0.43162901307966706,
1200
- 0.33477722772277224
1201
  ],
1202
  "bp": 1.0,
1203
- "sys_len": 1814,
1204
- "ref_len": 1734,
1205
- "sacrebleu": 0.496565496677853,
1206
- "score": 0.496565496677853,
1207
  "score_name": "sacrebleu",
1208
- "score_ci_low": 0.44856379811709507,
1209
- "score_ci_high": 0.5405491556673685,
1210
- "sacrebleu_ci_low": 0.44856379811709507,
1211
- "sacrebleu_ci_high": 0.5405491556673685
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
- "num_of_instances": 66,
1215
  "counts": [
1216
- 1362,
1217
- 982,
1218
- 727,
1219
- 545
1220
  ],
1221
  "totals": [
1222
- 1804,
1223
- 1738,
1224
- 1672,
1225
- 1606
1226
  ],
1227
  "precisions": [
1228
- 0.7549889135254989,
1229
- 0.5650172612197929,
1230
- 0.43480861244019137,
1231
- 0.33935242839352425
1232
  ],
1233
  "bp": 1.0,
1234
- "sys_len": 1804,
1235
- "ref_len": 1734,
1236
- "sacrebleu": 0.5008847938003845,
1237
- "score": 0.5008847938003845,
1238
  "score_name": "sacrebleu",
1239
- "score_ci_low": 0.45964511094313315,
1240
- "score_ci_high": 0.5345023750169955,
1241
- "sacrebleu_ci_low": 0.45964511094313315,
1242
- "sacrebleu_ci_high": 0.5345023750169955
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
- "num_of_instances": 66,
1246
  "counts": [
1247
- 1236,
1248
- 740,
1249
- 479,
1250
- 316
1251
  ],
1252
  "totals": [
1253
- 1894,
1254
- 1828,
1255
- 1762,
1256
- 1696
1257
  ],
1258
  "precisions": [
1259
- 0.6525871172122493,
1260
- 0.40481400437636766,
1261
- 0.27185017026106695,
1262
- 0.18632075471698112
1263
  ],
1264
  "bp": 1.0,
1265
- "sys_len": 1894,
1266
- "ref_len": 1734,
1267
- "sacrebleu": 0.34011142198108724,
1268
- "score": 0.34011142198108724,
1269
  "score_name": "sacrebleu",
1270
- "score_ci_low": 0.30697541693140973,
1271
- "score_ci_high": 0.38727309802034554,
1272
- "sacrebleu_ci_low": 0.30697541693140973,
1273
- "sacrebleu_ci_high": 0.38727309802034554
1274
  },
1275
- "score": 0.39663181686150334,
1276
  "score_name": "subsets_mean",
1277
- "num_of_instances": 990
1278
  },
1279
- "score": 0.533962583211598,
1280
  "score_name": "subsets_mean",
1281
- "num_of_instances": 12472
1282
  }
1283
  }
 
1
  {
2
  "environment_info": {
3
+ "timestamp_utc": "2025-07-02T21:12:24.436429Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
 
8
  "--model",
9
  "cross_provider",
10
  "--model_args",
11
+ "model_name=watsonx/meta-llama/llama-3-3-70b-instruct,max_tokens=1024",
12
  "--output_path",
13
  "./results/bluebench",
14
  "--log_samples",
 
26
  "num_fewshots": null,
27
  "limit": null,
28
  "batch_size": 8,
29
+ "model": "watsonx/meta-llama/llama-3-3-70b-instruct",
30
  "model_args": {
31
+ "max_tokens": 1024
32
  },
33
  "gen_kwargs": null,
34
  "chat_template_kwargs": null,
 
41
  "disable_hf_cache": false,
42
  "cache_dir": null
43
  },
44
+ "unitxt_version": "1.25.0",
45
+ "unitxt_commit_hash": "c8a5e77fd6ca62039b915dc10700323f50ccaacf",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
 
51
  "triton": "3.3.1",
52
  "nltk": "3.9.1",
53
  "anyio": "4.9.0",
54
+ "unitxt": "1.25.0",
55
  "absl-py": "2.3.0",
56
  "tiktoken": "0.9.0",
57
  "charset-normalizer": "3.4.2",
58
  "nvidia-cuda-runtime-cu12": "12.6.77",
59
  "sympy": "1.14.0",
60
  "mecab-ko": "1.0.1",
 
61
  "httpcore": "1.0.9",
62
+ "litellm": "1.73.6",
63
  "Jinja2": "3.1.6",
64
  "jsonschema-specifications": "2025.4.1",
65
  "pydantic_core": "2.33.2",
66
  "nvidia-cusparse-cu12": "12.5.4.2",
67
+ "tokenizers": "0.21.2",
68
  "yarl": "1.20.1",
 
69
  "portalocker": "3.2.0",
70
  "pandas": "2.3.0",
71
  "multiprocess": "0.70.16",
72
  "jsonschema": "4.24.0",
 
73
  "nvidia-nvjitlink-cu12": "12.6.85",
74
  "nvidia-cublas-cu12": "12.6.4.1",
75
  "pydantic": "2.11.7",
 
79
  "contourpy": "1.3.2",
80
  "aiosignal": "1.3.2",
81
  "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "openai": "1.93.0",
83
  "six": "1.17.0",
84
  "diskcache": "5.6.3",
85
  "tqdm": "4.67.1",
 
98
  "kiwisolver": "1.4.8",
99
  "networkx": "3.4.2",
100
  "typing-inspection": "0.4.1",
 
101
  "sniffio": "1.3.1",
102
  "scikit-learn": "1.7.0",
103
+ "rpds-py": "0.26.0",
104
  "nvidia-curand-cu12": "10.3.7.77",
105
  "pip": "25.1.1",
106
+ "pillow": "11.3.0",
107
  "fonttools": "4.58.4",
 
108
  "datasets": "3.6.0",
109
  "nvidia-cusolver-cu12": "11.7.1.2",
110
  "cycler": "0.12.1",
 
111
  "distro": "1.9.0",
112
  "idna": "3.10",
113
  "MarkupSafe": "3.0.2",
 
121
  "joblib": "1.5.1",
122
  "fsspec": "2025.3.0",
123
  "dill": "0.3.8",
 
124
  "wheel": "0.45.1",
125
  "nvidia-nvtx-cu12": "12.6.77",
126
  "nvidia-cusparselt-cu12": "0.6.3",
127
+ "lxml": "6.0.0",
128
  "propcache": "0.3.2",
129
  "numpy": "2.2.6",
130
  "mpmath": "1.3.0",
 
131
  "conllu": "6.0.0",
132
+ "huggingface-hub": "0.33.2",
133
  "safetensors": "0.5.3",
134
  "requests": "2.32.4",
135
  "regex": "2024.11.6",
136
  "aiohttp": "3.12.13",
137
  "tabulate": "0.9.0",
138
+ "accelerate": "1.8.1",
139
  "certifi": "2025.6.15",
140
+ "evaluate": "0.4.4",
141
  "nvidia-cufft-cu12": "11.3.0.4",
142
  "nvidia-cuda-nvrtc-cu12": "12.6.77",
143
  "click": "8.2.1",
144
  "typing_extensions": "4.12.2",
145
  "attrs": "25.3.0",
146
  "exceptiongroup": "1.3.0",
147
+ "transformers": "4.53.0",
148
  "tenacity": "9.1.2",
149
  "pytz": "2025.2",
150
  "aiohappyeyeballs": "2.6.1",
151
  "python-dateutil": "2.9.0.post0",
152
  "torch": "2.7.1",
153
+ "python-dotenv": "1.1.1",
154
+ "multidict": "6.6.3",
155
  "httpx": "0.28.1",
156
  "matplotlib": "3.10.3",
157
  "xxhash": "3.5.0",
158
  "PyYAML": "6.0.2",
 
159
  "colorama": "0.4.6",
 
160
  "threadpoolctl": "3.6.0",
161
  "nvidia-cudnn-cu12": "9.5.1.17",
162
+ "hf-xet": "1.1.5",
163
  "jaraco.collections": "5.1.0",
164
  "tomli": "2.0.1",
165
  "backports.tarfile": "1.2.0",
 
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
+ "accuracy": 0.7777777777777778,
180
+ "accuracy_ci_low": 0.3333333333333333,
181
+ "accuracy_ci_high": 1.0,
182
  "score_name": "accuracy",
183
+ "score": 0.7777777777777778,
184
+ "score_ci_high": 1.0,
185
+ "score_ci_low": 0.3333333333333333,
186
+ "num_of_instances": 9
187
  },
188
  "safety_bbq_disability_status": {
189
  "accuracy": 1.0,
 
193
  "score": 1.0,
194
  "score_ci_high": 1.0,
195
  "score_ci_low": 1.0,
196
+ "num_of_instances": 9
197
  },
198
  "safety_bbq_gender_identity": {
199
+ "accuracy": 1.0,
200
+ "accuracy_ci_low": 1.0,
201
  "accuracy_ci_high": 1.0,
202
  "score_name": "accuracy",
203
+ "score": 1.0,
204
  "score_ci_high": 1.0,
205
+ "score_ci_low": 1.0,
206
+ "num_of_instances": 9
207
  },
208
  "safety_bbq_nationality": {
209
  "accuracy": 1.0,
 
213
  "score": 1.0,
214
  "score_ci_high": 1.0,
215
  "score_ci_low": 1.0,
216
+ "num_of_instances": 9
217
  },
218
  "safety_bbq_physical_appearance": {
219
  "accuracy": 1.0,
 
223
  "score": 1.0,
224
  "score_ci_high": 1.0,
225
  "score_ci_low": 1.0,
226
+ "num_of_instances": 9
227
  },
228
  "safety_bbq_race_ethnicity": {
229
  "accuracy": 1.0,
 
233
  "score": 1.0,
234
  "score_ci_high": 1.0,
235
  "score_ci_low": 1.0,
236
+ "num_of_instances": 9
237
  },
238
  "safety_bbq_race_x_gender": {
239
  "accuracy": 1.0,
 
243
  "score": 1.0,
244
  "score_ci_high": 1.0,
245
  "score_ci_low": 1.0,
246
+ "num_of_instances": 9
247
  },
248
  "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.8888888888888888,
250
+ "accuracy_ci_low": 0.47716657027690984,
251
  "accuracy_ci_high": 1.0,
252
  "score_name": "accuracy",
253
+ "score": 0.8888888888888888,
254
  "score_ci_high": 1.0,
255
+ "score_ci_low": 0.47716657027690984,
256
+ "num_of_instances": 9
257
  },
258
  "safety_bbq_religion": {
259
  "accuracy": 1.0,
 
263
  "score": 1.0,
264
  "score_ci_high": 1.0,
265
  "score_ci_low": 1.0,
266
+ "num_of_instances": 9
267
  },
268
  "safety_bbq_ses": {
269
+ "accuracy": 1.0,
270
+ "accuracy_ci_low": 1.0,
271
  "accuracy_ci_high": 1.0,
272
  "score_name": "accuracy",
273
+ "score": 1.0,
274
  "score_ci_high": 1.0,
275
+ "score_ci_low": 1.0,
276
+ "num_of_instances": 9
277
  },
278
  "safety_bbq_sexual_orientation": {
279
+ "accuracy": 1.0,
280
+ "accuracy_ci_low": 1.0,
281
+ "accuracy_ci_high": 1.0,
282
  "score_name": "accuracy",
283
+ "score": 1.0,
284
+ "score_ci_high": 1.0,
285
+ "score_ci_low": 1.0,
286
+ "num_of_instances": 9
287
  },
288
+ "score": 0.9696969696969697,
289
  "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.43243243243243246,
296
+ "score": 0.43243243243243246,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
+ "score": 0.43243243243243246,
300
  "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.5294117647058824,
307
+ "f1_Organization": 0.46808510638297873,
308
+ "f1_Location": 0.25,
309
+ "f1_macro": 0.41583229036295366,
310
+ "recall_macro": 0.3308316080055211,
311
+ "precision_macro": 0.5698763955342904,
312
+ "in_classes_support": 0.6133333333333333,
313
+ "f1_micro": 0.3333333333333333,
314
+ "recall_micro": 0.3333333333333333,
315
+ "precision_micro": 0.3333333333333333,
316
+ "score": 0.3333333333333333,
317
  "score_name": "f1_micro",
318
+ "score_ci_low": 0.22301605878346387,
319
+ "score_ci_high": 0.45755787982872065,
320
+ "f1_micro_ci_low": 0.22301605878346387,
321
+ "f1_micro_ci_high": 0.45755787982872065
322
  },
323
+ "score": 0.3333333333333333,
324
  "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
+ "accuracy": 0.7142857142857143,
330
+ "accuracy_ci_low": 0.2857142857142857,
331
+ "accuracy_ci_high": 1.0,
332
  "score_name": "accuracy",
333
+ "score": 0.7142857142857143,
334
+ "score_ci_high": 1.0,
335
+ "score_ci_low": 0.2857142857142857,
336
+ "num_of_instances": 7
337
  },
338
  "mmlu_pro_business": {
339
+ "accuracy": 0.2857142857142857,
340
+ "accuracy_ci_low": 0.0,
341
+ "accuracy_ci_high": 0.7142857142857143,
342
  "score_name": "accuracy",
343
+ "score": 0.2857142857142857,
344
+ "score_ci_high": 0.7142857142857143,
345
+ "score_ci_low": 0.0,
346
+ "num_of_instances": 7
347
  },
348
  "mmlu_pro_chemistry": {
349
+ "accuracy": 0.2857142857142857,
350
+ "accuracy_ci_low": 0.0,
351
+ "accuracy_ci_high": 0.7142857142857143,
352
  "score_name": "accuracy",
353
+ "score": 0.2857142857142857,
354
+ "score_ci_high": 0.7142857142857143,
355
+ "score_ci_low": 0.0,
356
+ "num_of_instances": 7
357
  },
358
  "mmlu_pro_computer_science": {
359
+ "accuracy": 1.0,
360
+ "accuracy_ci_low": 1.0,
361
+ "accuracy_ci_high": 1.0,
362
  "score_name": "accuracy",
363
+ "score": 1.0,
364
+ "score_ci_high": 1.0,
365
+ "score_ci_low": 1.0,
366
+ "num_of_instances": 7
367
  },
368
  "mmlu_pro_economics": {
369
+ "accuracy": 0.7142857142857143,
370
+ "accuracy_ci_low": 0.2857142857142857,
371
+ "accuracy_ci_high": 1.0,
372
  "score_name": "accuracy",
373
+ "score": 0.7142857142857143,
374
+ "score_ci_high": 1.0,
375
+ "score_ci_low": 0.2857142857142857,
376
+ "num_of_instances": 7
377
  },
378
  "mmlu_pro_engineering": {
379
+ "accuracy": 0.2857142857142857,
380
+ "accuracy_ci_low": 0.0,
381
+ "accuracy_ci_high": 0.7142857142857143,
382
  "score_name": "accuracy",
383
+ "score": 0.2857142857142857,
384
+ "score_ci_high": 0.7142857142857143,
385
+ "score_ci_low": 0.0,
386
+ "num_of_instances": 7
387
  },
388
  "mmlu_pro_health": {
389
+ "accuracy": 0.2857142857142857,
390
+ "accuracy_ci_low": 0.0,
391
+ "accuracy_ci_high": 0.7142857142857143,
392
  "score_name": "accuracy",
393
+ "score": 0.2857142857142857,
394
+ "score_ci_high": 0.7142857142857143,
395
+ "score_ci_low": 0.0,
396
+ "num_of_instances": 7
397
  },
398
  "mmlu_pro_history": {
399
+ "accuracy": 0.2857142857142857,
400
+ "accuracy_ci_low": 0.0,
401
+ "accuracy_ci_high": 0.7142857142857143,
402
  "score_name": "accuracy",
403
+ "score": 0.2857142857142857,
404
+ "score_ci_high": 0.7142857142857143,
405
+ "score_ci_low": 0.0,
406
+ "num_of_instances": 7
407
  },
408
  "mmlu_pro_law": {
409
+ "accuracy": 0.7142857142857143,
410
+ "accuracy_ci_low": 0.2857142857142857,
411
+ "accuracy_ci_high": 1.0,
412
  "score_name": "accuracy",
413
+ "score": 0.7142857142857143,
414
+ "score_ci_high": 1.0,
415
+ "score_ci_low": 0.2857142857142857,
416
+ "num_of_instances": 7
417
  },
418
  "mmlu_pro_math": {
419
+ "accuracy": 0.5714285714285714,
420
+ "accuracy_ci_low": 0.14285714285714285,
421
+ "accuracy_ci_high": 0.8571428571428571,
422
  "score_name": "accuracy",
423
+ "score": 0.5714285714285714,
424
+ "score_ci_high": 0.8571428571428571,
425
+ "score_ci_low": 0.14285714285714285,
426
+ "num_of_instances": 7
427
  },
428
  "mmlu_pro_other": {
429
+ "accuracy": 0.42857142857142855,
430
+ "accuracy_ci_low": 0.14285714285714285,
431
+ "accuracy_ci_high": 0.8571428571428571,
432
  "score_name": "accuracy",
433
+ "score": 0.42857142857142855,
434
+ "score_ci_high": 0.8571428571428571,
435
+ "score_ci_low": 0.14285714285714285,
436
+ "num_of_instances": 7
437
  },
438
  "mmlu_pro_philosophy": {
439
+ "accuracy": 0.5714285714285714,
440
+ "accuracy_ci_low": 0.14285714285714285,
441
+ "accuracy_ci_high": 0.8571428571428571,
442
  "score_name": "accuracy",
443
+ "score": 0.5714285714285714,
444
+ "score_ci_high": 0.8571428571428571,
445
+ "score_ci_low": 0.14285714285714285,
446
+ "num_of_instances": 7
447
  },
448
  "mmlu_pro_physics": {
449
+ "accuracy": 0.42857142857142855,
450
+ "accuracy_ci_low": 0.14285714285714285,
451
+ "accuracy_ci_high": 0.8571428571428571,
452
  "score_name": "accuracy",
453
+ "score": 0.42857142857142855,
454
+ "score_ci_high": 0.8571428571428571,
455
+ "score_ci_low": 0.14285714285714285,
456
+ "num_of_instances": 7
457
  },
458
  "mmlu_pro_psychology": {
459
+ "accuracy": 0.5714285714285714,
460
+ "accuracy_ci_low": 0.14285714285714285,
461
+ "accuracy_ci_high": 0.8571428571428571,
462
  "score_name": "accuracy",
463
+ "score": 0.5714285714285714,
464
+ "score_ci_high": 0.8571428571428571,
465
+ "score_ci_low": 0.14285714285714285,
466
+ "num_of_instances": 7
467
  },
468
+ "score": 0.5102040816326531,
469
  "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
+ "f1_macro": 0.7361904761904763,
475
+ "f1_suggestive": 0.8,
476
+ "f1_generic": 0.5,
477
+ "f1_descriptive": 0.6666666666666666,
478
+ "f1_fanciful": 0.8571428571428571,
479
+ "f1_arbitrary": 0.8571428571428571,
480
+ "f1_macro_ci_low": 0.5289185659005502,
481
+ "f1_macro_ci_high": 0.9548722481498226,
482
  "score_name": "f1_micro",
483
+ "score": 0.75,
484
+ "score_ci_high": 0.9,
485
+ "score_ci_low": 0.5,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.75,
488
+ "accuracy_ci_low": 0.5,
489
+ "accuracy_ci_high": 0.9,
490
+ "f1_micro": 0.75,
491
+ "f1_micro_ci_low": 0.5,
492
+ "f1_micro_ci_high": 0.9
493
  },
494
  "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.5978260869565217,
496
+ "f1_no": 0.6956521739130435,
497
+ "f1_yes": 0.5,
498
+ "f1_macro_ci_low": 0.3441549326251368,
499
+ "f1_macro_ci_high": 0.8194337748258356,
500
  "score_name": "f1_micro",
501
+ "score": 0.6285714285714286,
502
+ "score_ci_high": 0.8108108108108109,
503
+ "score_ci_low": 0.375,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.55,
506
+ "accuracy_ci_low": 0.3,
507
+ "accuracy_ci_high": 0.75,
508
+ "f1_micro": 0.6285714285714286,
509
+ "f1_micro_ci_low": 0.375,
510
+ "f1_micro_ci_high": 0.8108108108108109
511
  },
512
  "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.18367346938775508,
514
+ "f1_conclusion": 0.3333333333333333,
515
+ "f1_decree": 0.0,
516
+ "f1_issue": 0.2857142857142857,
517
+ "f1_analysis": 0.6666666666666666,
518
+ "f1_facts": 0.0,
519
+ "f1_procedural history": 0.0,
520
+ "f1_rule": 0.0,
521
+ "f1_macro_ci_low": 0.05043016459778368,
522
+ "f1_macro_ci_high": 0.3606547758667794,
523
  "score_name": "f1_micro",
524
+ "score": 0.24242424242424243,
525
+ "score_ci_high": 0.5,
526
+ "score_ci_low": 0.06060606060606061,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.2,
529
+ "accuracy_ci_low": 0.05,
530
+ "accuracy_ci_high": 0.45,
531
+ "f1_micro": 0.24242424242424243,
532
+ "f1_micro_ci_low": 0.06060606060606061,
533
+ "f1_micro_ci_high": 0.5
534
  },
535
  "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.6274509803921569,
537
+ "f1_yes": 0.6666666666666666,
538
+ "f1_no": 0.5882352941176471,
539
+ "f1_macro_ci_low": 0.4248806144013661,
540
+ "f1_macro_ci_high": 0.8403361344537814,
541
  "score_name": "f1_micro",
542
+ "score": 0.6285714285714286,
543
+ "score_ci_high": 0.8235294117647058,
544
+ "score_ci_low": 0.4117647058823529,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.55,
547
+ "accuracy_ci_low": 0.35,
548
+ "accuracy_ci_high": 0.75,
549
+ "f1_micro": 0.6285714285714286,
550
+ "f1_micro_ci_low": 0.4117647058823529,
551
+ "f1_micro_ci_high": 0.8235294117647058
552
  },
553
  "legalbench_proa": {
554
+ "f1_macro": 0.7222222222222222,
555
+ "f1_yes": 0.6666666666666666,
556
+ "f1_no": 0.7777777777777778,
557
+ "f1_macro_ci_low": 0.5,
558
+ "f1_macro_ci_high": 0.8815079455131178,
559
  "score_name": "f1_micro",
560
+ "score": 0.7272727272727273,
561
+ "score_ci_high": 0.8888888888888888,
562
+ "score_ci_low": 0.5161290322580645,
563
+ "num_of_instances": 20,
564
+ "accuracy": 0.6,
565
+ "accuracy_ci_low": 0.4,
566
+ "accuracy_ci_high": 0.8,
567
+ "f1_micro": 0.7272727272727273,
568
+ "f1_micro_ci_low": 0.5161290322580645,
569
+ "f1_micro_ci_high": 0.8888888888888888
570
  },
571
+ "score": 0.5953679653679653,
572
  "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
+ "f1_macro": 0.6277146464646465,
578
+ "f1_cars": 0.9090909090909091,
579
+ "f1_windows x": 0.5714285714285714,
580
+ "f1_computer graphics": 0.625,
581
+ "f1_atheism": 0.3333333333333333,
582
+ "f1_religion": 0.0,
583
+ "f1_medicine": 1.0,
584
+ "f1_christianity": 0.8571428571428571,
585
+ "f1_microsoft windows": 0.8,
586
+ "f1_middle east": 0.5,
587
+ "f1_motorcycles": 0.6,
588
+ "f1_pc hardware": 0.6666666666666666,
589
+ "f1_mac hardware": 0.8,
590
  "f1_electronics": 0.6666666666666666,
591
+ "f1_for sale": 0.5714285714285714,
592
+ "f1_guns": 0.4444444444444444,
593
+ "f1_space": 0.75,
594
+ "f1_cryptography": 0.4,
595
+ "f1_baseball": 0.9090909090909091,
596
+ "f1_politics": 0.4,
597
+ "f1_hockey": 0.75,
598
+ "f1_macro_ci_low": 0.5435845040206655,
599
+ "f1_macro_ci_high": 0.7319668770597383,
600
  "score_name": "f1_micro",
601
+ "score": 0.6444444444444445,
602
+ "score_ci_high": 0.7292817679558011,
603
+ "score_ci_low": 0.5371428571428571,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.58,
606
+ "accuracy_ci_low": 0.47,
607
+ "accuracy_ci_high": 0.67,
608
+ "f1_micro": 0.6444444444444445,
609
+ "f1_micro_ci_low": 0.5371428571428571,
610
+ "f1_micro_ci_high": 0.7292817679558011
611
  },
612
+ "score": 0.6444444444444445,
613
  "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
+ "f1_macro": 0.772359470103831,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9172932330827067,
620
+ "f1_credit card or prepaid card": 0.7368421052631579,
621
+ "f1_money transfer or virtual currency or money service": 0.8,
622
+ "f1_mortgage": 0.6666666666666666,
623
+ "f1_debt collection": 0.7619047619047619,
624
+ "f1_checking or savings account": 0.8571428571428571,
625
+ "f1_payday loan or title loan or personal loan": 0.6666666666666666,
626
+ "f1_macro_ci_low": 0.5572738433047465,
627
+ "f1_macro_ci_high": 0.8791281557930146,
628
  "score_name": "f1_micro",
629
+ "score": 0.8686868686868687,
630
+ "score_ci_high": 0.9231286638704261,
631
+ "score_ci_low": 0.7869369117179047,
632
+ "num_of_instances": 100,
633
+ "accuracy": 0.86,
634
+ "accuracy_ci_low": 0.78,
635
+ "accuracy_ci_high": 0.92,
636
+ "f1_micro": 0.8686868686868687,
637
+ "f1_micro_ci_low": 0.7869369117179047,
638
+ "f1_micro_ci_high": 0.9231286638704261
639
  },
640
  "cfpb_product_watsonx": {
641
+ "f1_macro": 0.8015898111550286,
642
+ "f1_mortgages and loans": 0.8695652173913043,
643
+ "f1_credit card": 0.7272727272727273,
644
+ "f1_debt collection": 0.7777777777777778,
645
+ "f1_credit reporting": 0.8,
646
+ "f1_retail banking": 0.8333333333333334,
647
+ "f1_macro_ci_low": 0.6568465035277264,
648
+ "f1_macro_ci_high": 0.905920427893389,
649
  "score_name": "f1_micro",
650
+ "score": 0.8,
651
+ "score_ci_high": 0.9,
652
+ "score_ci_low": 0.66,
653
+ "num_of_instances": 50,
654
+ "accuracy": 0.8,
655
+ "accuracy_ci_low": 0.66,
656
+ "accuracy_ci_high": 0.9,
657
+ "f1_micro": 0.8,
658
+ "f1_micro_ci_low": 0.66,
659
+ "f1_micro_ci_high": 0.9
660
  },
661
+ "score": 0.8343434343434344,
662
  "score_name": "subsets_mean",
663
+ "num_of_instances": 150
664
  },
665
  "qa_finance": {
666
  "fin_qa": {
667
+ "num_of_instances": 100,
668
+ "program_accuracy": 0.25,
669
+ "score": 0.25,
670
  "score_name": "program_accuracy",
671
+ "execution_accuracy": 0.25,
672
+ "program_accuracy_ci_low": 0.17,
673
+ "program_accuracy_ci_high": 0.34,
674
+ "score_ci_low": 0.17,
675
+ "score_ci_high": 0.34,
676
+ "execution_accuracy_ci_low": 0.17,
677
+ "execution_accuracy_ci_high": 0.34
678
  },
679
+ "score": 0.25,
680
  "score_name": "subsets_mean",
681
+ "num_of_instances": 100
682
  },
683
  "rag_general": {
684
  "rag_response_generation_clapnq": {
685
+ "precision": 0.4555079496421888,
686
+ "recall": 0.644299468710777,
687
+ "f1": 0.49645864333583084,
688
+ "precision_ci_low": 0.4182546292505632,
689
+ "precision_ci_high": 0.49480928460353885,
690
+ "recall_ci_low": 0.6076488097595466,
691
+ "recall_ci_high": 0.6826290581362066,
692
+ "f1_ci_low": 0.4687855673947254,
693
+ "f1_ci_high": 0.5287757822440131,
694
  "score_name": "f1",
695
+ "score": 0.49645864333583084,
696
+ "score_ci_high": 0.5287757822440131,
697
+ "score_ci_low": 0.4687855673947254,
698
+ "num_of_instances": 100,
699
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6826186165213585,
700
+ "correctness_recall_bert_score.deberta_large_mnli": 0.7250934118032455,
701
+ "correctness_precision_bert_score.deberta_large_mnli": 0.6550253140926361,
702
+ "faithfullness_f1_token_overlap": 0.41576223647286753,
703
+ "faithfullness_recall_token_overlap": 0.32586457398911184,
704
+ "faithfullness_precision_token_overlap": 0.6713221181134523,
705
+ "correctness_f1_token_overlap": 0.49645864333583084,
706
+ "correctness_recall_token_overlap": 0.644299468710777,
707
+ "correctness_precision_token_overlap": 0.4555079496421888
708
  },
709
+ "score": 0.49645864333583084,
710
  "score_name": "subsets_mean",
711
+ "num_of_instances": 100
712
  },
713
  "reasoning": {
714
  "hellaswag": {
715
+ "accuracy": 0.57,
716
+ "accuracy_ci_low": 0.47,
717
+ "accuracy_ci_high": 0.66,
718
  "score_name": "accuracy",
719
+ "score": 0.57,
720
+ "score_ci_high": 0.66,
721
+ "score_ci_low": 0.47,
722
+ "num_of_instances": 100
723
  },
724
  "openbook_qa": {
725
+ "accuracy": 0.9,
726
+ "accuracy_ci_low": 0.83,
727
+ "accuracy_ci_high": 0.95,
728
  "score_name": "accuracy",
729
+ "score": 0.9,
730
+ "score_ci_high": 0.95,
731
+ "score_ci_low": 0.83,
732
+ "num_of_instances": 100
733
  },
734
+ "score": 0.735,
735
  "score_name": "subsets_mean",
736
+ "num_of_instances": 200
737
  },
738
  "safety": {
739
  "attaq_500": {
741
  "severity_high": 0.0,
742
  "severity_medium": 1.0,
743
  "severity_low": 99.0,
744
+ "category_harmful_info": 0.8601243539550711,
745
+ "category_deception": 0.8577543491817484,
746
+ "category_explicit_content": 0.8846021865804989,
747
+ "category_violence": 0.8303844939030351,
748
+ "category_discrimination": 0.8039271911931417,
749
+ "category_substance_abuse": 0.8599538101090325,
750
+ "category_pii": 0.7738575405544704,
751
+ "safety": 0.8456169310129353,
752
+ "safety_ci_low": 0.8257205534653617,
753
+ "safety_ci_high": 0.8647527678477268,
754
  "score_name": "safety",
755
+ "score": 0.8456169310129353,
756
+ "score_ci_high": 0.8647527678477268,
757
+ "score_ci_low": 0.8257205534653617,
758
  "num_of_instances": 100
759
  },
760
+ "score": 0.8456169310129353,
761
  "score_name": "subsets_mean",
762
  "num_of_instances": 100
763
  },
764
  "summarization": {
765
  "billsum_document_filtered_to_6000_chars": {
766
+ "num_of_instances": 100,
767
+ "rougeL": 0.30196222140145373,
768
+ "score": 0.30196222140145373,
769
  "score_name": "rougeL",
770
+ "rouge2": 0.20846797855492663,
771
+ "rougeLsum": 0.3708490611505299,
772
+ "rouge1": 0.43249180135238885,
773
+ "rougeL_ci_low": 0.28545948956532086,
774
+ "rougeL_ci_high": 0.32034973210420586,
775
+ "score_ci_low": 0.28545948956532086,
776
+ "score_ci_high": 0.32034973210420586,
777
+ "rouge2_ci_low": 0.19185402874274654,
778
+ "rouge2_ci_high": 0.22633538497306355,
779
+ "rougeLsum_ci_low": 0.3497341012349053,
780
+ "rougeLsum_ci_high": 0.3906646611762865,
781
+ "rouge1_ci_low": 0.4091501139364316,
782
+ "rouge1_ci_high": 0.4544797414359922
783
  },
784
  "tldr_document_filtered_to_6000_chars": {
785
+ "num_of_instances": 100,
786
+ "rougeL": 0.09534966709464636,
787
+ "score": 0.09534966709464636,
788
  "score_name": "rougeL",
789
+ "rouge2": 0.01871182369959182,
790
+ "rougeLsum": 0.10336010342197492,
791
+ "rouge1": 0.1250434904833096,
792
+ "rougeL_ci_low": 0.08239795255822814,
793
+ "rougeL_ci_high": 0.1071024099898739,
794
+ "score_ci_low": 0.08239795255822814,
795
+ "score_ci_high": 0.1071024099898739,
796
+ "rouge2_ci_low": 0.014115566880536843,
797
+ "rouge2_ci_high": 0.024356532581362833,
798
+ "rougeLsum_ci_low": 0.08983062504584788,
799
+ "rougeLsum_ci_high": 0.1162908267622954,
800
+ "rouge1_ci_low": 0.10903177552761713,
801
+ "rouge1_ci_high": 0.142104165948435
802
  },
803
+ "score": 0.19865594424805005,
804
  "score_name": "subsets_mean",
805
+ "num_of_instances": 200
806
  },
807
  "translation": {
808
  "mt_flores_101_ara_eng": {
809
+ "num_of_instances": 6,
810
  "counts": [
811
+ 157,
812
+ 108,
813
+ 79,
814
+ 62
815
  ],
816
  "totals": [
817
+ 219,
818
+ 213,
819
+ 207,
820
+ 201
821
  ],
822
  "precisions": [
823
+ 0.7168949771689498,
824
+ 0.5070422535211268,
825
+ 0.3816425120772947,
826
+ 0.30845771144278605
827
  ],
828
  "bp": 1.0,
829
+ "sys_len": 219,
830
+ "ref_len": 208,
831
+ "sacrebleu": 0.45481839038687305,
832
+ "score": 0.45481839038687305,
833
  "score_name": "sacrebleu",
834
+ "score_ci_low": 0.23313727959235822,
835
+ "score_ci_high": 0.5735174559459194,
836
+ "sacrebleu_ci_low": 0.23313727959235822,
837
+ "sacrebleu_ci_high": 0.5735174559459194
838
  },
839
  "mt_flores_101_deu_eng": {
840
+ "num_of_instances": 6,
841
  "counts": [
842
+ 141,
843
+ 82,
844
+ 51,
845
+ 37
846
  ],
847
  "totals": [
848
+ 211,
849
+ 205,
850
+ 199,
851
+ 193
852
  ],
853
  "precisions": [
854
+ 0.6682464454976303,
855
+ 0.4,
856
+ 0.2562814070351759,
857
+ 0.1917098445595855
858
  ],
859
  "bp": 1.0,
860
+ "sys_len": 211,
861
+ "ref_len": 208,
862
+ "sacrebleu": 0.33852406002064017,
863
+ "score": 0.33852406002064017,
864
  "score_name": "sacrebleu",
865
+ "score_ci_low": 0.24860280316487016,
866
+ "score_ci_high": 0.44210608957563563,
867
+ "sacrebleu_ci_low": 0.24860280316487016,
868
+ "sacrebleu_ci_high": 0.44210608957563563
869
  },
870
  "mt_flores_101_eng_ara": {
871
+ "num_of_instances": 6,
872
  "counts": [
873
+ 133,
874
+ 86,
875
+ 58,
876
+ 35
877
  ],
878
  "totals": [
879
+ 200,
880
+ 194,
881
+ 188,
882
+ 182
883
  ],
884
  "precisions": [
885
+ 0.665,
886
+ 0.44329896907216493,
887
+ 0.30851063829787234,
888
+ 0.1923076923076923
889
  ],
890
+ "bp": 0.9559974818331,
891
+ "sys_len": 200,
892
+ "ref_len": 209,
893
+ "sacrebleu": 0.34765865057318845,
894
+ "score": 0.34765865057318845,
895
  "score_name": "sacrebleu",
896
+ "score_ci_low": 0.26554667136882865,
897
+ "score_ci_high": 0.47703525584416595,
898
+ "sacrebleu_ci_low": 0.26554667136882865,
899
+ "sacrebleu_ci_high": 0.47703525584416595
900
  },
901
  "mt_flores_101_eng_deu": {
902
+ "num_of_instances": 6,
903
  "counts": [
904
+ 142,
905
+ 90,
906
+ 63,
907
+ 45
908
  ],
909
  "totals": [
910
+ 225,
911
+ 219,
912
+ 213,
913
+ 207
914
  ],
915
  "precisions": [
916
+ 0.6311111111111112,
917
+ 0.410958904109589,
918
+ 0.29577464788732394,
919
+ 0.21739130434782608
920
  ],
921
  "bp": 1.0,
922
+ "sys_len": 225,
923
+ "ref_len": 216,
924
+ "sacrebleu": 0.35935759973709475,
925
+ "score": 0.35935759973709475,
926
  "score_name": "sacrebleu",
927
+ "score_ci_low": 0.239249013984345,
928
+ "score_ci_high": 0.48381346932297253,
929
+ "sacrebleu_ci_low": 0.239249013984345,
930
+ "sacrebleu_ci_high": 0.48381346932297253
931
  },
932
  "mt_flores_101_eng_fra": {
933
+ "num_of_instances": 6,
934
  "counts": [
935
+ 186,
936
+ 137,
937
+ 103,
938
+ 78
939
  ],
940
  "totals": [
941
+ 237,
942
+ 231,
943
+ 225,
944
+ 219
945
  ],
946
  "precisions": [
947
+ 0.7848101265822786,
948
+ 0.5930735930735931,
949
+ 0.4577777777777778,
950
+ 0.3561643835616438
951
  ],
952
+ "bp": 1.0,
953
+ "sys_len": 237,
954
+ "ref_len": 235,
955
+ "sacrebleu": 0.5248613522062934,
956
+ "score": 0.5248613522062934,
957
  "score_name": "sacrebleu",
958
+ "score_ci_low": 0.4676636539845554,
959
+ "score_ci_high": 0.5955161256091506,
960
+ "sacrebleu_ci_low": 0.4676636539845554,
961
+ "sacrebleu_ci_high": 0.5955161256091506
962
  },
963
  "mt_flores_101_eng_kor": {
964
+ "num_of_instances": 6,
965
  "counts": [
966
+ 166,
967
+ 94,
968
+ 56,
969
+ 34
970
  ],
971
  "totals": [
972
+ 277,
973
+ 271,
974
+ 265,
975
+ 259
976
  ],
977
  "precisions": [
978
+ 0.5992779783393501,
979
+ 0.34686346863468637,
980
+ 0.2113207547169811,
981
+ 0.1312741312741313
982
  ],
983
  "bp": 1.0,
984
+ "sys_len": 277,
985
+ "ref_len": 249,
986
+ "sacrebleu": 0.27556689764615966,
987
+ "score": 0.27556689764615966,
988
  "score_name": "sacrebleu",
989
+ "score_ci_low": 0.21442798271905641,
990
+ "score_ci_high": 0.3206267707714379,
991
+ "sacrebleu_ci_low": 0.21442798271905641,
992
+ "sacrebleu_ci_high": 0.3206267707714379
993
  },
994
  "mt_flores_101_eng_por": {
995
+ "num_of_instances": 6,
996
  "counts": [
997
+ 179,
998
+ 135,
999
+ 109,
1000
+ 87
1001
  ],
1002
  "totals": [
1003
+ 228,
1004
+ 222,
1005
+ 216,
1006
+ 210
1007
  ],
1008
  "precisions": [
1009
+ 0.7850877192982456,
1010
+ 0.6081081081081081,
1011
+ 0.5046296296296297,
1012
+ 0.4142857142857143
1013
  ],
1014
+ "bp": 1.0,
1015
+ "sys_len": 228,
1016
+ "ref_len": 222,
1017
+ "sacrebleu": 0.5620732547703513,
1018
+ "score": 0.5620732547703513,
1019
  "score_name": "sacrebleu",
1020
+ "score_ci_low": 0.47870398599739605,
1021
+ "score_ci_high": 0.6971556058442332,
1022
+ "sacrebleu_ci_low": 0.47870398599739605,
1023
+ "sacrebleu_ci_high": 0.6971556058442332
1024
  },
1025
  "mt_flores_101_eng_ron": {
1026
+ "num_of_instances": 6,
1027
  "counts": [
1028
+ 160,
1029
+ 112,
1030
+ 84,
1031
+ 65
1032
  ],
1033
  "totals": [
1034
+ 237,
1035
+ 231,
1036
+ 225,
1037
+ 219
1038
  ],
1039
  "precisions": [
1040
+ 0.6751054852320675,
1041
+ 0.48484848484848486,
1042
+ 0.37333333333333335,
1043
+ 0.2968036529680365
1044
  ],
1045
  "bp": 1.0,
1046
+ "sys_len": 237,
1047
+ "ref_len": 230,
1048
+ "sacrebleu": 0.4364011861022174,
1049
+ "score": 0.4364011861022174,
1050
  "score_name": "sacrebleu",
1051
+ "score_ci_low": 0.3331265459325523,
1052
+ "score_ci_high": 0.5917875941582059,
1053
+ "sacrebleu_ci_low": 0.3331265459325523,
1054
+ "sacrebleu_ci_high": 0.5917875941582059
1055
  },
1056
  "mt_flores_101_eng_spa": {
1057
+ "num_of_instances": 6,
1058
  "counts": [
1059
+ 161,
1060
+ 97,
1061
+ 63,
1062
+ 41
1063
  ],
1064
  "totals": [
1065
+ 234,
1066
+ 228,
1067
+ 222,
1068
+ 216
1069
  ],
1070
  "precisions": [
1071
+ 0.688034188034188,
1072
+ 0.42543859649122806,
1073
+ 0.28378378378378377,
1074
+ 0.1898148148148148
1075
  ],
1076
+ "bp": 0.9622687143632572,
1077
+ "sys_len": 234,
1078
+ "ref_len": 243,
1079
+ "sacrebleu": 0.34098675715245896,
1080
+ "score": 0.34098675715245896,
1081
  "score_name": "sacrebleu",
1082
+ "score_ci_low": 0.2935907278924951,
1083
+ "score_ci_high": 0.3969989231374368,
1084
+ "sacrebleu_ci_low": 0.2935907278924951,
1085
+ "sacrebleu_ci_high": 0.3969989231374368
1086
  },
1087
  "mt_flores_101_fra_eng": {
1088
+ "num_of_instances": 6,
1089
  "counts": [
1090
+ 158,
1091
+ 116,
1092
+ 84,
1093
+ 56
1094
  ],
1095
  "totals": [
1096
+ 210,
1097
+ 204,
1098
+ 198,
1099
+ 192
1100
  ],
1101
  "precisions": [
1102
+ 0.7523809523809524,
1103
+ 0.5686274509803921,
1104
+ 0.4242424242424242,
1105
+ 0.2916666666666667
1106
  ],
1107
  "bp": 1.0,
1108
+ "sys_len": 210,
1109
+ "ref_len": 208,
1110
+ "sacrebleu": 0.47966897260726976,
1111
+ "score": 0.47966897260726976,
1112
  "score_name": "sacrebleu",
1113
+ "score_ci_low": 0.39430918582024527,
1114
+ "score_ci_high": 0.5605485645530176,
1115
+ "sacrebleu_ci_low": 0.39430918582024527,
1116
+ "sacrebleu_ci_high": 0.5605485645530176
1117
  },
1118
  "mt_flores_101_jpn_eng": {
1119
+ "num_of_instances": 6,
1120
  "counts": [
1121
+ 151,
1122
+ 97,
1123
+ 72,
1124
+ 56
1125
  ],
1126
  "totals": [
1127
+ 223,
1128
+ 217,
1129
+ 211,
1130
+ 205
1131
  ],
1132
  "precisions": [
1133
+ 0.6771300448430493,
1134
+ 0.4470046082949309,
1135
+ 0.3412322274881517,
1136
+ 0.2731707317073171
1137
  ],
1138
  "bp": 1.0,
1139
+ "sys_len": 223,
1140
+ "ref_len": 208,
1141
+ "sacrebleu": 0.4098425763503735,
1142
+ "score": 0.4098425763503735,
1143
  "score_name": "sacrebleu",
1144
+ "score_ci_low": 0.23930475121026726,
1145
+ "score_ci_high": 0.5897136543390743,
1146
+ "sacrebleu_ci_low": 0.23930475121026726,
1147
+ "sacrebleu_ci_high": 0.5897136543390743
1148
  },
1149
  "mt_flores_101_kor_eng": {
1150
+ "num_of_instances": 6,
1151
  "counts": [
1152
+ 128,
1153
+ 71,
1154
+ 45,
1155
+ 30
1156
  ],
1157
  "totals": [
1158
+ 206,
1159
+ 200,
1160
+ 194,
1161
+ 188
1162
  ],
1163
  "precisions": [
1164
+ 0.6213592233009709,
1165
+ 0.355,
1166
+ 0.23195876288659792,
1167
+ 0.1595744680851064
1168
  ],
1169
+ "bp": 0.9903382397772544,
1170
+ "sys_len": 206,
1171
+ "ref_len": 208,
1172
+ "sacrebleu": 0.2976938560971466,
1173
+ "score": 0.2976938560971466,
1174
  "score_name": "sacrebleu",
1175
+ "score_ci_low": 0.1737766619242665,
1176
+ "score_ci_high": 0.44451335565361355,
1177
+ "sacrebleu_ci_low": 0.1737766619242665,
1178
+ "sacrebleu_ci_high": 0.44451335565361355
1179
  },
1180
  "mt_flores_101_por_eng": {
1181
+ "num_of_instances": 6,
1182
  "counts": [
1183
+ 168,
1184
+ 133,
1185
+ 105,
1186
+ 84
1187
  ],
1188
  "totals": [
1189
+ 210,
1190
+ 204,
1191
+ 198,
1192
+ 192
1193
  ],
1194
  "precisions": [
1195
+ 0.8,
1196
+ 0.6519607843137255,
1197
+ 0.5303030303030303,
1198
+ 0.4375
1199
  ],
1200
  "bp": 1.0,
1201
+ "sys_len": 210,
1202
+ "ref_len": 208,
1203
+ "sacrebleu": 0.5897981509424008,
1204
+ "score": 0.5897981509424008,
1205
  "score_name": "sacrebleu",
1206
+ "score_ci_low": 0.46597275289680684,
1207
+ "score_ci_high": 0.6510065766151202,
1208
+ "sacrebleu_ci_low": 0.46597275289680684,
1209
+ "sacrebleu_ci_high": 0.6510065766151202
1210
  },
1211
  "mt_flores_101_ron_eng": {
1212
+ "num_of_instances": 6,
1213
  "counts": [
1214
+ 160,
1215
+ 111,
1216
+ 78,
1217
+ 55
1218
  ],
1219
  "totals": [
1220
+ 228,
1221
+ 222,
1222
+ 216,
1223
+ 210
1224
  ],
1225
  "precisions": [
1226
+ 0.7017543859649124,
1227
+ 0.5,
1228
+ 0.36111111111111116,
1229
+ 0.2619047619047619
1230
  ],
1231
  "bp": 1.0,
1232
+ "sys_len": 228,
1233
+ "ref_len": 208,
1234
+ "sacrebleu": 0.4268102558559915,
1235
+ "score": 0.4268102558559915,
1236
  "score_name": "sacrebleu",
1237
+ "score_ci_low": 0.33894724398193377,
1238
+ "score_ci_high": 0.5442092547992242,
1239
+ "sacrebleu_ci_low": 0.33894724398193377,
1240
+ "sacrebleu_ci_high": 0.5442092547992242
1241
  },
1242
  "mt_flores_101_spa_eng": {
1243
+ "num_of_instances": 6,
1244
  "counts": [
1245
+ 147,
1246
+ 98,
1247
+ 65,
1248
+ 46
1249
  ],
1250
  "totals": [
1251
+ 213,
1252
+ 207,
1253
+ 201,
1254
+ 195
1255
  ],
1256
  "precisions": [
1257
+ 0.6901408450704225,
1258
+ 0.47342995169082125,
1259
+ 0.3233830845771144,
1260
+ 0.23589743589743592
1261
  ],
1262
  "bp": 1.0,
1263
+ "sys_len": 213,
1264
+ "ref_len": 208,
1265
+ "sacrebleu": 0.3973365305222621,
1266
+ "score": 0.3973365305222621,
1267
  "score_name": "sacrebleu",
1268
+ "score_ci_low": 0.32188371117377373,
1269
+ "score_ci_high": 0.45496400690805394,
1270
+ "sacrebleu_ci_low": 0.32188371117377373,
1271
+ "sacrebleu_ci_high": 0.45496400690805394
1272
  },
1273
+ "score": 0.41609323273138143,
1274
  "score_name": "subsets_mean",
1275
+ "num_of_instances": 90
1276
  },
1277
+ "score": 0.55858826250611,
1278
  "score_name": "subsets_mean",
1279
+ "num_of_instances": 1537
1280
  }
1281
  }
results/bluebench/{2025-06-22T19-25-42_evaluation_results.json → 2025-07-02T17-33-41_evaluation_results.json} RENAMED
@@ -1,6 +1,6 @@
1
  {
2
  "environment_info": {
3
- "timestamp_utc": "2025-06-22T23:25:38.430519Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
@@ -8,7 +8,7 @@
8
  "--model",
9
  "cross_provider",
10
  "--model_args",
11
- "model_name=watsonx/meta-llama/llama-3-405b-instruct,max_tokens=256",
12
  "--output_path",
13
  "./results/bluebench",
14
  "--log_samples",
@@ -28,7 +28,7 @@
28
  "batch_size": 8,
29
  "model": "watsonx/meta-llama/llama-3-405b-instruct",
30
  "model_args": {
31
- "max_tokens": 256
32
  },
33
  "gen_kwargs": null,
34
  "chat_template_kwargs": null,
@@ -41,8 +41,8 @@
41
  "disable_hf_cache": false,
42
  "cache_dir": null
43
  },
44
- "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
@@ -51,25 +51,25 @@
51
  "triton": "3.3.1",
52
  "nltk": "3.9.1",
53
  "anyio": "4.9.0",
54
  "absl-py": "2.3.0",
55
  "tiktoken": "0.9.0",
56
  "charset-normalizer": "3.4.2",
57
  "nvidia-cuda-runtime-cu12": "12.6.77",
58
  "sympy": "1.14.0",
59
  "mecab-ko": "1.0.1",
60
- "litellm": "1.72.6.post1",
61
  "httpcore": "1.0.9",
62
  "Jinja2": "3.1.6",
63
  "jsonschema-specifications": "2025.4.1",
64
  "pydantic_core": "2.33.2",
65
  "nvidia-cusparse-cu12": "12.5.4.2",
66
  "yarl": "1.20.1",
67
- "openai": "1.88.0",
68
  "portalocker": "3.2.0",
69
  "pandas": "2.3.0",
70
  "multiprocess": "0.70.16",
71
  "jsonschema": "4.24.0",
72
- "unitxt": "1.24.0",
73
  "nvidia-nvjitlink-cu12": "12.6.85",
74
  "nvidia-cublas-cu12": "12.6.4.1",
75
  "pydantic": "2.11.7",
@@ -79,7 +79,7 @@
79
  "contourpy": "1.3.2",
80
  "aiosignal": "1.3.2",
81
  "nvidia-cuda-cupti-cu12": "12.6.80",
82
- "pillow": "11.2.1",
83
  "six": "1.17.0",
84
  "diskcache": "5.6.3",
85
  "tqdm": "4.67.1",
@@ -98,17 +98,16 @@
98
  "kiwisolver": "1.4.8",
99
  "networkx": "3.4.2",
100
  "typing-inspection": "0.4.1",
101
- "lxml": "5.4.0",
102
  "sniffio": "1.3.1",
103
  "scikit-learn": "1.7.0",
104
  "nvidia-curand-cu12": "10.3.7.77",
105
  "pip": "25.1.1",
106
  "fonttools": "4.58.4",
107
- "transformers": "4.52.4",
108
  "datasets": "3.6.0",
109
  "nvidia-cusolver-cu12": "11.7.1.2",
110
  "cycler": "0.12.1",
111
- "evaluate": "0.4.3",
112
  "distro": "1.9.0",
113
  "idna": "3.10",
114
  "MarkupSafe": "3.0.2",
@@ -122,44 +121,45 @@
122
  "joblib": "1.5.1",
123
  "fsspec": "2025.3.0",
124
  "dill": "0.3.8",
125
- "tokenizers": "0.21.1",
126
  "wheel": "0.45.1",
127
  "nvidia-nvtx-cu12": "12.6.77",
128
  "nvidia-cusparselt-cu12": "0.6.3",
129
- "hf-xet": "1.1.4",
130
  "propcache": "0.3.2",
131
  "numpy": "2.2.6",
132
  "mpmath": "1.3.0",
133
- "multidict": "6.5.0",
134
  "conllu": "6.0.0",
135
  "safetensors": "0.5.3",
136
  "requests": "2.32.4",
137
  "regex": "2024.11.6",
138
  "aiohttp": "3.12.13",
139
  "tabulate": "0.9.0",
140
  "certifi": "2025.6.15",
141
- "accelerate": "1.8.0",
142
  "nvidia-cufft-cu12": "11.3.0.4",
143
  "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
  "click": "8.2.1",
145
  "typing_extensions": "4.12.2",
146
  "attrs": "25.3.0",
147
  "exceptiongroup": "1.3.0",
148
  "tenacity": "9.1.2",
149
  "pytz": "2025.2",
150
  "aiohappyeyeballs": "2.6.1",
151
  "python-dateutil": "2.9.0.post0",
152
  "torch": "2.7.1",
153
- "python-dotenv": "1.1.0",
154
  "httpx": "0.28.1",
155
  "matplotlib": "3.10.3",
156
  "xxhash": "3.5.0",
157
  "PyYAML": "6.0.2",
158
- "huggingface-hub": "0.33.0",
159
  "colorama": "0.4.6",
160
- "rpds-py": "0.25.1",
161
  "threadpoolctl": "3.6.0",
162
  "nvidia-cudnn-cu12": "9.5.1.17",
163
  "jaraco.collections": "5.1.0",
164
  "tomli": "2.0.1",
165
  "backports.tarfile": "1.2.0",
@@ -176,34 +176,34 @@
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
- "accuracy": 0.9777777777777777,
180
- "accuracy_ci_low": 0.9190234736102009,
181
  "accuracy_ci_high": 1.0,
182
  "score_name": "accuracy",
183
- "score": 0.9777777777777777,
184
  "score_ci_high": 1.0,
185
- "score_ci_low": 0.9190234736102009,
186
- "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
- "accuracy": 0.9888888888888889,
190
- "accuracy_ci_low": 0.9444444444444444,
191
  "accuracy_ci_high": 1.0,
192
  "score_name": "accuracy",
193
- "score": 0.9888888888888889,
194
  "score_ci_high": 1.0,
195
- "score_ci_low": 0.9444444444444444,
196
- "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
199
- "accuracy": 0.9888888888888889,
200
- "accuracy_ci_low": 0.9333333333333333,
201
  "accuracy_ci_high": 1.0,
202
  "score_name": "accuracy",
203
- "score": 0.9888888888888889,
204
  "score_ci_high": 1.0,
205
- "score_ci_low": 0.9333333333333333,
206
- "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
  "accuracy": 1.0,
@@ -213,17 +213,17 @@
213
  "score": 1.0,
214
  "score_ci_high": 1.0,
215
  "score_ci_low": 1.0,
216
- "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
- "accuracy": 0.9888888888888889,
220
- "accuracy_ci_low": 0.9444444444444444,
221
  "accuracy_ci_high": 1.0,
222
  "score_name": "accuracy",
223
- "score": 0.9888888888888889,
224
  "score_ci_high": 1.0,
225
- "score_ci_low": 0.9444444444444444,
226
- "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
  "accuracy": 1.0,
@@ -233,7 +233,7 @@
233
  "score": 1.0,
234
  "score_ci_high": 1.0,
235
  "score_ci_low": 1.0,
236
- "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
239
  "accuracy": 1.0,
@@ -243,7 +243,7 @@
243
  "score": 1.0,
244
  "score_ci_high": 1.0,
245
  "score_ci_low": 1.0,
246
- "num_of_instances": 90
247
  },
248
  "safety_bbq_race_x_ses": {
249
  "accuracy": 1.0,
@@ -253,17 +253,17 @@
253
  "score": 1.0,
254
  "score_ci_high": 1.0,
255
  "score_ci_low": 1.0,
256
- "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
- "accuracy": 0.9888888888888889,
260
- "accuracy_ci_low": 0.9444444444444444,
261
  "accuracy_ci_high": 1.0,
262
  "score_name": "accuracy",
263
- "score": 0.9888888888888889,
264
  "score_ci_high": 1.0,
265
- "score_ci_low": 0.9444444444444444,
266
- "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
  "accuracy": 1.0,
@@ -273,1011 +273,1009 @@
273
  "score": 1.0,
274
  "score_ci_high": 1.0,
275
  "score_ci_low": 1.0,
276
- "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.8777777777777778,
280
- "accuracy_ci_low": 0.8,
281
- "accuracy_ci_high": 0.9333333333333333,
282
  "score_name": "accuracy",
283
- "score": 0.8777777777777778,
284
- "score_ci_high": 0.9333333333333333,
285
- "score_ci_low": 0.8,
286
- "num_of_instances": 90
287
  },
288
- "score": 0.9828282828282828,
289
  "score_name": "subsets_mean",
290
- "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
- "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.12794268167860798,
296
- "score": 0.12794268167860798,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
- "score": 0.12794268167860798,
300
  "score_name": "subsets_mean",
301
- "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
- "num_of_instances": 1000,
306
- "f1_Person": 0.6233062330623307,
307
- "f1_Organization": 0.4037267080745342,
308
- "f1_Location": 0.441860465116279,
309
- "f1_macro": 0.4896311354177146,
310
- "recall_macro": 0.44046090061205145,
311
- "precision_macro": 0.5555845701415322,
312
- "in_classes_support": 0.8122605363984674,
313
- "f1_micro": 0.45272206303724927,
314
- "recall_micro": 0.4514285714285714,
315
- "precision_micro": 0.4540229885057471,
316
- "score": 0.45272206303724927,
317
  "score_name": "f1_micro",
318
- "score_ci_low": 0.3966894448307716,
319
- "score_ci_high": 0.5096302708354611,
320
- "f1_micro_ci_low": 0.3966894448307716,
321
- "f1_micro_ci_high": 0.5096302708354611
322
  },
323
- "score": 0.45272206303724927,
324
  "score_name": "subsets_mean",
325
- "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
- "accuracy": 0.7605633802816901,
330
- "accuracy_ci_low": 0.6619718309859155,
331
- "accuracy_ci_high": 0.8450704225352113,
332
  "score_name": "accuracy",
333
- "score": 0.7605633802816901,
334
- "score_ci_high": 0.8450704225352113,
335
- "score_ci_low": 0.6619718309859155,
336
- "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
- "accuracy": 0.49295774647887325,
340
- "accuracy_ci_low": 0.38028169014084506,
341
- "accuracy_ci_high": 0.6056338028169014,
342
  "score_name": "accuracy",
343
- "score": 0.49295774647887325,
344
- "score_ci_high": 0.6056338028169014,
345
- "score_ci_low": 0.38028169014084506,
346
- "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
- "accuracy": 0.43661971830985913,
350
- "accuracy_ci_low": 0.31179550598679995,
351
- "accuracy_ci_high": 0.5633802816901409,
352
  "score_name": "accuracy",
353
- "score": 0.43661971830985913,
354
- "score_ci_high": 0.5633802816901409,
355
- "score_ci_low": 0.31179550598679995,
356
- "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
- "accuracy": 0.7323943661971831,
360
- "accuracy_ci_low": 0.6056338028169014,
361
- "accuracy_ci_high": 0.8309859154929577,
362
  "score_name": "accuracy",
363
- "score": 0.7323943661971831,
364
- "score_ci_high": 0.8309859154929577,
365
- "score_ci_low": 0.6056338028169014,
366
- "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
- "accuracy": 0.7464788732394366,
370
- "accuracy_ci_low": 0.6338028169014085,
371
- "accuracy_ci_high": 0.8320697555200512,
372
  "score_name": "accuracy",
373
- "score": 0.7464788732394366,
374
- "score_ci_high": 0.8320697555200512,
375
- "score_ci_low": 0.6338028169014085,
376
- "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
379
- "accuracy": 0.5633802816901409,
380
- "accuracy_ci_low": 0.4507042253521127,
381
- "accuracy_ci_high": 0.6901408450704225,
382
  "score_name": "accuracy",
383
- "score": 0.5633802816901409,
384
- "score_ci_high": 0.6901408450704225,
385
- "score_ci_low": 0.4507042253521127,
386
- "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
- "accuracy": 0.5774647887323944,
390
- "accuracy_ci_low": 0.4647887323943662,
391
- "accuracy_ci_high": 0.6901408450704225,
392
  "score_name": "accuracy",
393
- "score": 0.5774647887323944,
394
- "score_ci_high": 0.6901408450704225,
395
- "score_ci_low": 0.4647887323943662,
396
- "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
- "accuracy": 0.6901408450704225,
400
- "accuracy_ci_low": 0.5633802816901409,
401
- "accuracy_ci_high": 0.7887323943661971,
402
  "score_name": "accuracy",
403
- "score": 0.6901408450704225,
404
- "score_ci_high": 0.7887323943661971,
405
- "score_ci_low": 0.5633802816901409,
406
- "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
- "accuracy": 0.7323943661971831,
410
- "accuracy_ci_low": 0.6056338028169014,
411
- "accuracy_ci_high": 0.8309859154929577,
412
  "score_name": "accuracy",
413
- "score": 0.7323943661971831,
414
- "score_ci_high": 0.8309859154929577,
415
- "score_ci_low": 0.6056338028169014,
416
- "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
- "accuracy": 0.5633802816901409,
420
- "accuracy_ci_low": 0.43661971830985913,
421
- "accuracy_ci_high": 0.676056338028169,
422
  "score_name": "accuracy",
423
- "score": 0.5633802816901409,
424
- "score_ci_high": 0.676056338028169,
425
- "score_ci_low": 0.43661971830985913,
426
- "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
- "accuracy": 0.6619718309859155,
430
- "accuracy_ci_low": 0.5370780611967093,
431
- "accuracy_ci_high": 0.7605633802816901,
432
  "score_name": "accuracy",
433
- "score": 0.6619718309859155,
434
- "score_ci_high": 0.7605633802816901,
435
- "score_ci_low": 0.5370780611967093,
436
- "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
- "accuracy": 0.8028169014084507,
440
- "accuracy_ci_low": 0.6981095742502579,
441
- "accuracy_ci_high": 0.8873239436619719,
442
  "score_name": "accuracy",
443
- "score": 0.8028169014084507,
444
- "score_ci_high": 0.8873239436619719,
445
- "score_ci_low": 0.6981095742502579,
446
- "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
- "accuracy": 0.6338028169014085,
450
- "accuracy_ci_low": 0.5211267605633803,
451
- "accuracy_ci_high": 0.7323943661971831,
452
  "score_name": "accuracy",
453
- "score": 0.6338028169014085,
454
- "score_ci_high": 0.7323943661971831,
455
- "score_ci_low": 0.5211267605633803,
456
- "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
- "accuracy": 0.7323943661971831,
460
- "accuracy_ci_low": 0.6197183098591549,
461
- "accuracy_ci_high": 0.8309859154929577,
462
  "score_name": "accuracy",
463
- "score": 0.7323943661971831,
464
- "score_ci_high": 0.8309859154929577,
465
- "score_ci_low": 0.6197183098591549,
466
- "num_of_instances": 71
467
  },
468
- "score": 0.6519114688128773,
469
  "score_name": "subsets_mean",
470
- "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
- "f1_macro": 0.7710561497326203,
475
- "f1_suggestive": 0.5882352941176471,
476
  "f1_generic": 1.0,
477
- "f1_fanciful": 0.8125,
478
- "f1_descriptive": 0.7878787878787878,
479
- "f1_arbitrary": 0.6666666666666666,
480
- "f1_macro_ci_low": 0.6815687852174904,
481
- "f1_macro_ci_high": 0.8487975830625909,
482
  "score_name": "f1_micro",
483
- "score": 0.7682926829268293,
484
- "score_ci_high": 0.845238531816244,
485
- "score_ci_low": 0.682034648754911,
486
- "num_of_instances": 85,
487
- "accuracy": 0.7411764705882353,
488
- "accuracy_ci_low": 0.6470588235294118,
489
- "accuracy_ci_high": 0.8235294117647058,
490
- "f1_micro": 0.7682926829268293,
491
- "f1_micro_ci_low": 0.682034648754911,
492
- "f1_micro_ci_high": 0.845238531816244
493
  },
494
  "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.7216529635538103,
496
- "f1_no": 0.8398576512455516,
497
- "f1_yes": 0.603448275862069,
498
- "f1_macro_ci_low": 0.6520262757220233,
499
- "f1_macro_ci_high": 0.7890273988307265,
500
  "score_name": "f1_micro",
501
- "score": 0.7707808564231738,
502
- "score_ci_high": 0.8225396492391672,
503
- "score_ci_low": 0.7085427135678392,
504
- "num_of_instances": 200,
505
- "accuracy": 0.765,
506
- "accuracy_ci_low": 0.7,
507
- "accuracy_ci_high": 0.815,
508
- "f1_micro": 0.7707808564231738,
509
- "f1_micro_ci_low": 0.7085427135678392,
510
- "f1_micro_ci_high": 0.8225396492391672
511
  },
512
  "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.30206082783981175,
514
- "f1_conclusion": 0.13953488372093023,
515
- "f1_issue": 0.13333333333333333,
516
- "f1_decree": 0.3783783783783784,
517
- "f1_rule": 0.4482758620689655,
518
- "f1_analysis": 0.5647058823529412,
519
- "f1_facts": 0.18604651162790697,
520
- "f1_procedural history": 0.2641509433962264,
521
- "f1_macro_ci_low": 0.2513517165690192,
522
- "f1_macro_ci_high": 0.3775685968384507,
523
  "score_name": "f1_micro",
524
- "score": 0.33516483516483514,
525
- "score_ci_high": 0.4075146671820192,
526
- "score_ci_low": 0.272347535123403,
527
- "num_of_instances": 200,
528
- "accuracy": 0.305,
529
- "accuracy_ci_low": 0.245,
530
- "accuracy_ci_high": 0.375,
531
- "f1_micro": 0.33516483516483514,
532
- "f1_micro_ci_low": 0.272347535123403,
533
- "f1_micro_ci_high": 0.4075146671820192
534
  },
535
  "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.5047062641999351,
537
- "f1_yes": 0.5991561181434599,
538
- "f1_no": 0.41025641025641024,
539
- "f1_macro_ci_low": 0.4417647306569189,
540
- "f1_macro_ci_high": 0.5795532826410552,
541
  "score_name": "f1_micro",
542
- "score": 0.5241730279898219,
543
- "score_ci_high": 0.5950630270095045,
544
- "score_ci_low": 0.459552667145485,
545
- "num_of_instances": 200,
546
- "accuracy": 0.515,
547
- "accuracy_ci_low": 0.45,
548
- "accuracy_ci_high": 0.585,
549
- "f1_micro": 0.5241730279898219,
550
- "f1_micro_ci_low": 0.459552667145485,
551
- "f1_micro_ci_high": 0.5950630270095045
552
  },
553
  "legalbench_proa": {
554
- "f1_macro": 0.9284195605953225,
555
- "f1_yes": 0.9156626506024096,
556
- "f1_no": 0.9411764705882353,
557
- "f1_macro_ci_low": 0.8686025850356507,
558
- "f1_macro_ci_high": 0.9706771979676585,
559
  "score_name": "f1_micro",
560
- "score": 0.9285714285714286,
561
- "score_ci_high": 0.9704142011834319,
562
- "score_ci_low": 0.8690476190476191,
563
- "num_of_instances": 85,
564
- "accuracy": 0.9176470588235294,
565
- "accuracy_ci_low": 0.8588235294117647,
566
- "accuracy_ci_high": 0.9647058823529412,
567
- "f1_micro": 0.9285714285714286,
568
- "f1_micro_ci_low": 0.8690476190476191,
569
- "f1_micro_ci_high": 0.9704142011834319
570
  },
571
- "score": 0.6653965662152177,
572
  "score_name": "subsets_mean",
573
- "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
- "f1_macro": 0.6486110220528901,
578
- "f1_cars": 0.8958333333333334,
579
- "f1_windows x": 0.09090909090909091,
580
- "f1_computer graphics": 0.4793388429752066,
581
- "f1_atheism": 0.5245901639344263,
582
- "f1_religion": 0.044444444444444446,
583
- "f1_medicine": 0.813953488372093,
584
- "f1_christianity": 0.8727272727272727,
585
- "f1_for sale": 0.7777777777777778,
586
- "f1_microsoft windows": 0.7708333333333334,
587
- "f1_middle east": 0.5671641791044776,
588
- "f1_motorcycles": 0.7692307692307693,
589
- "f1_pc hardware": 0.6046511627906976,
590
- "f1_mac hardware": 0.7924528301886793,
591
- "f1_electronics": 0.7291666666666666,
592
- "f1_guns": 0.410958904109589,
593
- "f1_space": 0.8846153846153846,
594
- "f1_cryptography": 0.72,
595
- "f1_baseball": 0.9391304347826087,
596
- "f1_hockey": 0.9545454545454546,
597
- "f1_politics": 0.32989690721649484,
598
- "f1_macro_ci_low": 0.6245539233827745,
599
- "f1_macro_ci_high": 0.6735741716064018,
600
  "score_name": "f1_micro",
601
- "score": 0.6871961102106969,
602
- "score_ci_high": 0.7158552998269507,
603
- "score_ci_low": 0.6568978311145116,
604
- "num_of_instances": 1000,
605
- "accuracy": 0.636,
606
- "accuracy_ci_low": 0.6068817918985229,
607
- "accuracy_ci_high": 0.666,
608
- "f1_micro": 0.6871961102106969,
609
- "f1_micro_ci_low": 0.6568978311145116,
610
- "f1_micro_ci_high": 0.7158552998269507
611
  },
612
- "score": 0.6871961102106969,
613
  "score_name": "subsets_mean",
614
- "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
- "f1_macro": 0.7442070442021397,
619
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.9329529243937232,
620
- "f1_checking or savings account": 0.8269230769230769,
621
- "f1_debt collection": 0.5324675324675324,
622
- "f1_credit card or prepaid card": 0.7933884297520661,
623
- "f1_mortgage": 0.8405797101449275,
624
- "f1_student loan": 0.896551724137931,
625
- "f1_money transfer or virtual currency or money service": 0.875,
626
- "f1_vehicle loan or lease": 0.6666666666666666,
627
- "f1_payday loan or title loan or personal loan": 0.3333333333333333,
628
- "f1_macro_ci_low": 0.6958471577865166,
629
- "f1_macro_ci_high": 0.8045739980351424,
630
  "score_name": "f1_micro",
631
- "score": 0.8742393509127789,
632
- "score_ci_high": 0.8929169783856484,
633
- "score_ci_low": 0.852776904397444,
634
- "num_of_instances": 1000,
635
- "accuracy": 0.862,
636
- "accuracy_ci_low": 0.839,
637
- "accuracy_ci_high": 0.882,
638
- "f1_micro": 0.8742393509127789,
639
- "f1_micro_ci_low": 0.852776904397444,
640
- "f1_micro_ci_high": 0.8929169783856484
641
  },
642
  "cfpb_product_watsonx": {
643
- "f1_macro": 0.747877693509548,
644
- "f1_mortgages and loans": 0.8248587570621468,
645
- "f1_credit card": 0.7912087912087912,
646
- "f1_debt collection": 0.683982683982684,
647
- "f1_credit reporting": 0.7205882352941176,
648
- "f1_retail banking": 0.71875,
649
- "f1_macro_ci_low": 0.7065255253955101,
650
- "f1_macro_ci_high": 0.7849687727329339,
651
  "score_name": "f1_micro",
652
- "score": 0.7434343434343434,
653
- "score_ci_high": 0.7787863123983747,
654
- "score_ci_low": 0.7018885821645714,
655
- "num_of_instances": 500,
656
- "accuracy": 0.736,
657
- "accuracy_ci_low": 0.694,
658
- "accuracy_ci_high": 0.772,
659
- "f1_micro": 0.7434343434343434,
660
- "f1_micro_ci_low": 0.7018885821645714,
661
- "f1_micro_ci_high": 0.7787863123983747
662
  },
663
- "score": 0.8088368471735612,
664
  "score_name": "subsets_mean",
665
- "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
- "num_of_instances": 1000,
670
- "program_accuracy": 0.212,
671
- "score": 0.212,
672
  "score_name": "program_accuracy",
673
- "execution_accuracy": 0.197,
674
- "program_accuracy_ci_low": 0.185,
675
- "program_accuracy_ci_high": 0.236,
676
- "score_ci_low": 0.185,
677
- "score_ci_high": 0.236,
678
- "execution_accuracy_ci_low": 0.17257528462439894,
679
- "execution_accuracy_ci_high": 0.222
680
  },
681
- "score": 0.212,
682
  "score_name": "subsets_mean",
683
- "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
- "precision": 0.3245919340419664,
688
- "recall": 0.5553697849897254,
689
- "f1": 0.34463972021312694,
690
- "precision_ci_low": 0.30294812146038264,
691
- "precision_ci_high": 0.34672435235830834,
692
- "recall_ci_low": 0.5393969536805812,
693
- "recall_ci_high": 0.5708437647062446,
694
- "f1_ci_low": 0.32625746187015825,
695
- "f1_ci_high": 0.3632802184669128,
696
  "score_name": "f1",
697
- "score": 0.34463972021312694,
698
- "score_ci_high": 0.3632802184669128,
699
- "score_ci_low": 0.32625746187015825,
700
- "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.5976956130564213,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.678848825097084,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5559072712808848,
704
- "faithfullness_f1_token_overlap": 0.2867915721228578,
705
- "faithfullness_recall_token_overlap": 0.21812174721257188,
706
- "faithfullness_precision_token_overlap": 0.546883220093946,
707
- "correctness_f1_token_overlap": 0.34463972021312694,
708
- "correctness_recall_token_overlap": 0.5553697849897254,
709
- "correctness_precision_token_overlap": 0.3245919340419664
710
  },
711
- "score": 0.34463972021312694,
712
  "score_name": "subsets_mean",
713
- "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
- "accuracy": 0.57,
718
- "accuracy_ci_low": 0.537,
719
- "accuracy_ci_high": 0.598,
720
  "score_name": "accuracy",
721
- "score": 0.57,
722
- "score_ci_high": 0.598,
723
- "score_ci_low": 0.537,
724
- "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
- "accuracy": 0.914,
728
- "accuracy_ci_low": 0.886,
729
- "accuracy_ci_high": 0.936,
730
  "score_name": "accuracy",
731
- "score": 0.914,
732
- "score_ci_high": 0.936,
733
- "score_ci_low": 0.886,
734
- "num_of_instances": 500
735
  },
736
- "score": 0.742,
737
  "score_name": "subsets_mean",
738
- "num_of_instances": 1500
739
  },
740
  "safety": {
741
  "attaq_500": {
742
- "severity_critical": 1.0,
743
  "severity_high": 0.0,
744
  "severity_medium": 0.0,
745
- "severity_low": 99.0,
746
- "category_harmful_info": 0.8239067156488699,
747
- "category_deception": 0.8416350447017737,
748
- "category_explicit_content": 0.8482034478073168,
749
- "category_violence": 0.8124119411385249,
750
- "category_discrimination": 0.7214018125382681,
751
- "category_substance_abuse": 0.8382871117856767,
752
- "category_pii": 0.838832758643009,
753
- "safety": 0.8184977591824201,
754
- "safety_ci_low": 0.799251726518489,
755
- "safety_ci_high": 0.83564691259826,
756
  "score_name": "safety",
757
- "score": 0.8184977591824201,
758
- "score_ci_high": 0.83564691259826,
759
- "score_ci_low": 0.799251726518489,
760
  "num_of_instances": 100
761
  },
762
- "score": 0.8184977591824201,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
- "num_of_instances": 528,
769
- "rouge2": 0.21878117525659171,
770
- "rouge1": 0.42882781698282896,
771
- "rougeLsum": 0.36923160950814665,
772
- "rougeL": 0.3012206645101065,
773
- "score": 0.3012206645101065,
774
  "score_name": "rougeL",
775
- "rouge2_ci_low": 0.2116465564648193,
776
- "rouge2_ci_high": 0.22611002289776966,
777
- "rouge1_ci_low": 0.4181854383996789,
778
- "rouge1_ci_high": 0.4380350294107447,
779
- "rougeLsum_ci_low": 0.3593867077962995,
780
- "rougeLsum_ci_high": 0.377981997024725,
781
- "rougeL_ci_low": 0.2938004078438361,
782
- "rougeL_ci_high": 0.30862765298917266,
783
- "score_ci_low": 0.2938004078438361,
784
- "score_ci_high": 0.30862765298917266
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
- "num_of_instances": 1000,
788
- "rouge2": 0.01897199585186187,
789
- "rouge1": 0.12886717133874953,
790
- "rougeLsum": 0.10635879624885117,
791
- "rougeL": 0.09266060514104649,
792
- "score": 0.09266060514104649,
793
  "score_name": "rougeL",
794
- "rouge2_ci_low": 0.0169285768359538,
795
- "rouge2_ci_high": 0.020965979315931374,
796
- "rouge1_ci_low": 0.1230681468561615,
797
- "rouge1_ci_high": 0.13427737204069826,
798
- "rougeLsum_ci_low": 0.10189392757917064,
799
- "rougeLsum_ci_high": 0.11089749692460946,
800
- "rougeL_ci_low": 0.08868339152088286,
801
- "rougeL_ci_high": 0.0963878314649574,
802
- "score_ci_low": 0.08868339152088286,
803
- "score_ci_high": 0.0963878314649574
804
  },
805
- "score": 0.1969406348255765,
806
  "score_name": "subsets_mean",
807
- "num_of_instances": 1528
808
  },
809
  "translation": {
810
  "mt_flores_101_ara_eng": {
811
- "num_of_instances": 66,
812
  "counts": [
813
- 1296,
814
- 846,
815
- 593,
816
- 421
817
  ],
818
  "totals": [
819
- 1768,
820
- 1702,
821
- 1636,
822
- 1570
823
  ],
824
  "precisions": [
825
- 0.7330316742081447,
826
- 0.4970622796709753,
827
- 0.36246943765281175,
828
- 0.2681528662420382
829
  ],
830
- "bp": 1.0,
831
- "sys_len": 1768,
832
- "ref_len": 1734,
833
- "sacrebleu": 0.4338072904737007,
834
- "score": 0.4338072904737007,
835
  "score_name": "sacrebleu",
836
- "score_ci_low": 0.3840033634108722,
837
- "score_ci_high": 0.4745374005130659,
838
- "sacrebleu_ci_low": 0.3840033634108722,
839
- "sacrebleu_ci_high": 0.4745374005130659
840
  },
841
  "mt_flores_101_deu_eng": {
842
- "num_of_instances": 66,
843
  "counts": [
844
- 1324,
845
- 883,
846
- 626,
847
- 442
848
  ],
849
  "totals": [
850
- 1796,
851
- 1730,
852
- 1664,
853
- 1598
854
  ],
855
  "precisions": [
856
- 0.7371937639198218,
857
- 0.5104046242774567,
858
- 0.3762019230769231,
859
- 0.2765957446808511
860
  ],
861
  "bp": 1.0,
862
- "sys_len": 1796,
863
- "ref_len": 1734,
864
- "sacrebleu": 0.44482653871620387,
865
- "score": 0.44482653871620387,
866
  "score_name": "sacrebleu",
867
- "score_ci_low": 0.4081818656109621,
868
- "score_ci_high": 0.4860589069348345,
869
- "sacrebleu_ci_low": 0.4081818656109621,
870
- "sacrebleu_ci_high": 0.4860589069348345
871
  },
872
  "mt_flores_101_eng_ara": {
873
- "num_of_instances": 66,
874
  "counts": [
875
- 898,
876
- 497,
877
- 303,
878
- 187
879
  ],
880
  "totals": [
881
- 1785,
882
- 1719,
883
- 1653,
884
- 1587
885
  ],
886
  "precisions": [
887
- 0.5030812324929972,
888
- 0.28912158231529955,
889
- 0.18330308529945555,
890
- 0.1178323881537492
891
  ],
892
- "bp": 1.0,
893
- "sys_len": 1785,
894
- "ref_len": 1589,
895
- "sacrebleu": 0.23674906403928667,
896
- "score": 0.23674906403928667,
897
  "score_name": "sacrebleu",
898
- "score_ci_low": 0.18135183356554332,
899
- "score_ci_high": 0.28960778605368953,
900
- "sacrebleu_ci_low": 0.18135183356554332,
901
- "sacrebleu_ci_high": 0.28960778605368953
902
  },
903
  "mt_flores_101_eng_deu": {
904
- "num_of_instances": 66,
905
  "counts": [
906
- 1240,
907
- 761,
908
- 513,
909
- 362
910
  ],
911
  "totals": [
912
- 1823,
913
- 1757,
914
- 1691,
915
- 1625
916
  ],
917
  "precisions": [
918
- 0.6801974766867801,
919
- 0.4331246442800228,
920
- 0.3033707865168539,
921
- 0.22276923076923075
922
  ],
923
- "bp": 0.9934390613382812,
924
- "sys_len": 1823,
925
- "ref_len": 1835,
926
- "sacrebleu": 0.3731732035283488,
927
- "score": 0.3731732035283488,
928
  "score_name": "sacrebleu",
929
- "score_ci_low": 0.3128221603250469,
930
- "score_ci_high": 0.4084245255594999,
931
- "sacrebleu_ci_low": 0.3128221603250469,
932
- "sacrebleu_ci_high": 0.4084245255594999
933
  },
934
  "mt_flores_101_eng_fra": {
935
- "num_of_instances": 66,
936
  "counts": [
937
- 1532,
938
- 1143,
939
- 898,
940
- 722
941
  ],
942
  "totals": [
943
- 2028,
944
- 1962,
945
- 1896,
946
- 1830
947
  ],
948
  "precisions": [
949
- 0.7554240631163708,
950
- 0.5825688073394495,
951
- 0.4736286919831224,
952
- 0.3945355191256831
953
  ],
954
- "bp": 0.9804693769806172,
955
- "sys_len": 2028,
956
- "ref_len": 2068,
957
- "sacrebleu": 0.5250486815503393,
958
- "score": 0.5250486815503393,
959
  "score_name": "sacrebleu",
960
- "score_ci_low": 0.4776077157588871,
961
- "score_ci_high": 0.57891652183332,
962
- "sacrebleu_ci_low": 0.4776077157588871,
963
- "sacrebleu_ci_high": 0.57891652183332
964
  },
965
  "mt_flores_101_eng_kor": {
966
- "num_of_instances": 66,
967
  "counts": [
968
- 1351,
969
- 728,
970
- 447,
971
- 279
972
  ],
973
  "totals": [
974
- 2735,
975
- 2669,
976
- 2603,
977
- 2537
978
  ],
979
  "precisions": [
980
- 0.4939670932358318,
981
- 0.27276133383289625,
982
- 0.17172493276988093,
983
- 0.10997240835632636
984
  ],
985
  "bp": 1.0,
986
- "sys_len": 2735,
987
- "ref_len": 2235,
988
- "sacrebleu": 0.22459468717502307,
989
- "score": 0.22459468717502307,
990
  "score_name": "sacrebleu",
991
- "score_ci_low": 0.1881220082315981,
992
- "score_ci_high": 0.25342316010757016,
993
- "sacrebleu_ci_low": 0.1881220082315981,
994
- "sacrebleu_ci_high": 0.25342316010757016
995
  },
996
  "mt_flores_101_eng_por": {
997
- "num_of_instances": 66,
998
  "counts": [
999
- 1429,
1000
- 1021,
1001
- 760,
1002
- 570
1003
  ],
1004
  "totals": [
1005
- 1901,
1006
- 1835,
1007
- 1769,
1008
- 1703
1009
  ],
1010
  "precisions": [
1011
- 0.751709626512362,
1012
- 0.5564032697547684,
1013
- 0.4296212549462973,
1014
- 0.33470346447445687
1015
  ],
1016
- "bp": 0.9921404650355355,
1017
- "sys_len": 1901,
1018
- "ref_len": 1916,
1019
- "sacrebleu": 0.49132583520106116,
1020
- "score": 0.49132583520106116,
1021
  "score_name": "sacrebleu",
1022
- "score_ci_low": 0.4478372466403723,
1023
- "score_ci_high": 0.5303109141597054,
1024
- "sacrebleu_ci_low": 0.4478372466403723,
1025
- "sacrebleu_ci_high": 0.5303109141597054
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
- "num_of_instances": 66,
1029
  "counts": [
1030
- 1387,
1031
- 979,
1032
- 711,
1033
- 518
1034
  ],
1035
  "totals": [
1036
- 1967,
1037
- 1901,
1038
- 1835,
1039
- 1769
1040
  ],
1041
  "precisions": [
1042
- 0.7051347229283172,
1043
- 0.5149921094160969,
1044
- 0.38746594005449597,
1045
- 0.2928208027133974
1046
  ],
1047
- "bp": 1.0,
1048
- "sys_len": 1967,
1049
- "ref_len": 1949,
1050
- "sacrebleu": 0.450533442657802,
1051
- "score": 0.450533442657802,
1052
  "score_name": "sacrebleu",
1053
- "score_ci_low": 0.41119874078427415,
1054
- "score_ci_high": 0.4996205132749857,
1055
- "sacrebleu_ci_low": 0.41119874078427415,
1056
- "sacrebleu_ci_high": 0.4996205132749857
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
- "num_of_instances": 66,
1060
  "counts": [
1061
- 1300,
1062
- 749,
1063
- 458,
1064
- 283
1065
  ],
1066
  "totals": [
1067
- 2033,
1068
- 1967,
1069
- 1901,
1070
- 1835
1071
  ],
1072
  "precisions": [
1073
- 0.6394490900147565,
1074
- 0.3807829181494662,
1075
- 0.24092582851130984,
1076
- 0.1542234332425068
1077
  ],
1078
- "bp": 0.9685332604439724,
1079
- "sys_len": 2033,
1080
- "ref_len": 2098,
1081
- "sacrebleu": 0.29870591960318976,
1082
- "score": 0.29870591960318976,
1083
  "score_name": "sacrebleu",
1084
- "score_ci_low": 0.2686987876011016,
1085
- "score_ci_high": 0.3321355366475583,
1086
- "sacrebleu_ci_low": 0.2686987876011016,
1087
- "sacrebleu_ci_high": 0.3321355366475583
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
- "num_of_instances": 66,
1091
  "counts": [
1092
- 1340,
1093
- 946,
1094
- 692,
1095
- 509
1096
  ],
1097
  "totals": [
1098
- 1799,
1099
- 1733,
1100
- 1667,
1101
- 1601
1102
  ],
1103
  "precisions": [
1104
- 0.7448582545858811,
1105
- 0.5458742065781881,
1106
- 0.4151169766046791,
1107
- 0.31792629606495937
1108
  ],
1109
  "bp": 1.0,
1110
- "sys_len": 1799,
1111
- "ref_len": 1734,
1112
- "sacrebleu": 0.4812999188340168,
1113
- "score": 0.4812999188340168,
1114
  "score_name": "sacrebleu",
1115
- "score_ci_low": 0.43585100595842746,
1116
- "score_ci_high": 0.5287499225865158,
1117
- "sacrebleu_ci_low": 0.43585100595842746,
1118
- "sacrebleu_ci_high": 0.5287499225865158
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
- "num_of_instances": 66,
1122
  "counts": [
1123
- 1139,
1124
- 642,
1125
- 413,
1126
- 271
1127
  ],
1128
  "totals": [
1129
- 1798,
1130
- 1732,
1131
- 1666,
1132
- 1600
1133
  ],
1134
  "precisions": [
1135
- 0.6334816462736373,
1136
- 0.37066974595842955,
1137
- 0.24789915966386555,
1138
- 0.169375
1139
  ],
1140
  "bp": 1.0,
1141
- "sys_len": 1798,
1142
- "ref_len": 1734,
1143
- "sacrebleu": 0.3151094190111042,
1144
- "score": 0.3151094190111042,
1145
  "score_name": "sacrebleu",
1146
- "score_ci_low": 0.27523820425365936,
1147
- "score_ci_high": 0.37080224288423497,
1148
- "sacrebleu_ci_low": 0.27523820425365936,
1149
- "sacrebleu_ci_high": 0.37080224288423497
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
- "num_of_instances": 66,
1153
  "counts": [
1154
- 1096,
1155
- 603,
1156
- 369,
1157
- 231
1158
  ],
1159
  "totals": [
1160
- 1757,
1161
- 1691,
1162
- 1625,
1163
- 1559
1164
  ],
1165
  "precisions": [
1166
- 0.6237905520774046,
1167
- 0.3565937315198108,
1168
- 0.2270769230769231,
1169
- 0.14817190506735087
1170
  ],
1171
- "bp": 1.0,
1172
- "sys_len": 1757,
1173
- "ref_len": 1734,
1174
- "sacrebleu": 0.29412899612915067,
1175
- "score": 0.29412899612915067,
1176
  "score_name": "sacrebleu",
1177
- "score_ci_low": 0.2630441728848743,
1178
- "score_ci_high": 0.3478998363728344,
1179
- "sacrebleu_ci_low": 0.2630441728848743,
1180
- "sacrebleu_ci_high": 0.3478998363728344
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
- "num_of_instances": 66,
1184
  "counts": [
1185
- 1366,
1186
- 978,
1187
- 742,
1188
- 559
1189
  ],
1190
  "totals": [
1191
- 1794,
1192
- 1728,
1193
- 1662,
1194
- 1596
1195
  ],
1196
  "precisions": [
1197
- 0.7614269788182831,
1198
- 0.5659722222222222,
1199
- 0.4464500601684717,
1200
- 0.35025062656641603
1201
  ],
1202
  "bp": 1.0,
1203
- "sys_len": 1794,
1204
- "ref_len": 1734,
1205
- "sacrebleu": 0.5094995397125037,
1206
- "score": 0.5094995397125037,
1207
  "score_name": "sacrebleu",
1208
- "score_ci_low": 0.45392787953470803,
1209
- "score_ci_high": 0.5666406845959309,
1210
- "sacrebleu_ci_low": 0.45392787953470803,
1211
- "sacrebleu_ci_high": 0.5666406845959309
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
- "num_of_instances": 66,
1215
  "counts": [
1216
- 1357,
1217
- 953,
1218
- 689,
1219
- 517
1220
  ],
1221
  "totals": [
1222
- 1780,
1223
- 1714,
1224
- 1648,
1225
- 1582
1226
  ],
1227
  "precisions": [
1228
- 0.7623595505617977,
1229
- 0.5560093348891482,
1230
- 0.4180825242718446,
1231
- 0.3268015170670038
1232
  ],
1233
  "bp": 1.0,
1234
- "sys_len": 1780,
1235
- "ref_len": 1734,
1236
- "sacrebleu": 0.49056549932677673,
1237
- "score": 0.49056549932677673,
1238
  "score_name": "sacrebleu",
1239
- "score_ci_low": 0.4449725178153994,
1240
- "score_ci_high": 0.5499905859714643,
1241
- "sacrebleu_ci_low": 0.4449725178153994,
1242
- "sacrebleu_ci_high": 0.5499905859714643
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
- "num_of_instances": 66,
1246
  "counts": [
1247
- 1196,
1248
- 684,
1249
- 436,
1250
- 274
1251
  ],
1252
  "totals": [
1253
- 1839,
1254
- 1773,
1255
- 1707,
1256
- 1641
1257
  ],
1258
  "precisions": [
1259
- 0.6503534529635671,
1260
- 0.38578680203045684,
1261
- 0.255418863503222,
1262
- 0.16697135892748324
1263
  ],
1264
- "bp": 1.0,
1265
- "sys_len": 1839,
1266
- "ref_len": 1734,
1267
- "sacrebleu": 0.3216236243185879,
1268
- "score": 0.3216236243185879,
1269
  "score_name": "sacrebleu",
1270
- "score_ci_low": 0.29765430133685095,
1271
- "score_ci_high": 0.36660487766328476,
1272
- "sacrebleu_ci_low": 0.29765430133685095,
1273
- "sacrebleu_ci_high": 0.36660487766328476
1274
  },
1275
- "score": 0.39273277735180634,
1276
  "score_name": "subsets_mean",
1277
- "num_of_instances": 990
1278
  },
1279
- "score": 0.5448957624253402,
1280
  "score_name": "subsets_mean",
1281
- "num_of_instances": 12472
1282
  }
1283
  }
 
1
  {
2
  "environment_info": {
3
+ "timestamp_utc": "2025-07-02T21:33:37.582340Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
 
8
  "--model",
9
  "cross_provider",
10
  "--model_args",
11
+ "model_name=watsonx/meta-llama/llama-3-405b-instruct,max_tokens=1024",
12
  "--output_path",
13
  "./results/bluebench",
14
  "--log_samples",
 
28
  "batch_size": 8,
29
  "model": "watsonx/meta-llama/llama-3-405b-instruct",
30
  "model_args": {
31
+ "max_tokens": 1024
32
  },
33
  "gen_kwargs": null,
34
  "chat_template_kwargs": null,
 
41
  "disable_hf_cache": false,
42
  "cache_dir": null
43
  },
44
+ "unitxt_version": "1.25.0",
45
+ "unitxt_commit_hash": "c8a5e77fd6ca62039b915dc10700323f50ccaacf",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
 
51
  "triton": "3.3.1",
52
  "nltk": "3.9.1",
53
  "anyio": "4.9.0",
54
+ "unitxt": "1.25.0",
55
  "absl-py": "2.3.0",
56
  "tiktoken": "0.9.0",
57
  "charset-normalizer": "3.4.2",
58
  "nvidia-cuda-runtime-cu12": "12.6.77",
59
  "sympy": "1.14.0",
60
  "mecab-ko": "1.0.1",
 
61
  "httpcore": "1.0.9",
62
+ "litellm": "1.73.6",
63
  "Jinja2": "3.1.6",
64
  "jsonschema-specifications": "2025.4.1",
65
  "pydantic_core": "2.33.2",
66
  "nvidia-cusparse-cu12": "12.5.4.2",
67
+ "tokenizers": "0.21.2",
68
  "yarl": "1.20.1",
 
69
  "portalocker": "3.2.0",
70
  "pandas": "2.3.0",
71
  "multiprocess": "0.70.16",
72
  "jsonschema": "4.24.0",
 
73
  "nvidia-nvjitlink-cu12": "12.6.85",
74
  "nvidia-cublas-cu12": "12.6.4.1",
75
  "pydantic": "2.11.7",
 
79
  "contourpy": "1.3.2",
80
  "aiosignal": "1.3.2",
81
  "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "openai": "1.93.0",
83
  "six": "1.17.0",
84
  "diskcache": "5.6.3",
85
  "tqdm": "4.67.1",
 
98
  "kiwisolver": "1.4.8",
99
  "networkx": "3.4.2",
100
  "typing-inspection": "0.4.1",
 
101
  "sniffio": "1.3.1",
102
  "scikit-learn": "1.7.0",
103
+ "rpds-py": "0.26.0",
104
  "nvidia-curand-cu12": "10.3.7.77",
105
  "pip": "25.1.1",
106
+ "pillow": "11.3.0",
107
  "fonttools": "4.58.4",
 
108
  "datasets": "3.6.0",
109
  "nvidia-cusolver-cu12": "11.7.1.2",
110
  "cycler": "0.12.1",
 
111
  "distro": "1.9.0",
112
  "idna": "3.10",
113
  "MarkupSafe": "3.0.2",
 
121
  "joblib": "1.5.1",
122
  "fsspec": "2025.3.0",
123
  "dill": "0.3.8",
 
124
  "wheel": "0.45.1",
125
  "nvidia-nvtx-cu12": "12.6.77",
126
  "nvidia-cusparselt-cu12": "0.6.3",
127
+ "lxml": "6.0.0",
128
  "propcache": "0.3.2",
129
  "numpy": "2.2.6",
130
  "mpmath": "1.3.0",
 
131
  "conllu": "6.0.0",
132
+ "huggingface-hub": "0.33.2",
133
  "safetensors": "0.5.3",
134
  "requests": "2.32.4",
135
  "regex": "2024.11.6",
136
  "aiohttp": "3.12.13",
137
  "tabulate": "0.9.0",
138
+ "accelerate": "1.8.1",
139
  "certifi": "2025.6.15",
140
+ "evaluate": "0.4.4",
141
  "nvidia-cufft-cu12": "11.3.0.4",
142
  "nvidia-cuda-nvrtc-cu12": "12.6.77",
143
  "click": "8.2.1",
144
  "typing_extensions": "4.12.2",
145
  "attrs": "25.3.0",
146
  "exceptiongroup": "1.3.0",
147
+ "transformers": "4.53.0",
148
  "tenacity": "9.1.2",
149
  "pytz": "2025.2",
150
  "aiohappyeyeballs": "2.6.1",
151
  "python-dateutil": "2.9.0.post0",
152
  "torch": "2.7.1",
153
+ "python-dotenv": "1.1.1",
154
+ "multidict": "6.6.3",
155
  "httpx": "0.28.1",
156
  "matplotlib": "3.10.3",
157
  "xxhash": "3.5.0",
158
  "PyYAML": "6.0.2",
 
159
  "colorama": "0.4.6",
 
160
  "threadpoolctl": "3.6.0",
161
  "nvidia-cudnn-cu12": "9.5.1.17",
162
+ "hf-xet": "1.1.5",
163
  "jaraco.collections": "5.1.0",
164
  "tomli": "2.0.1",
165
  "backports.tarfile": "1.2.0",
 
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
+ "accuracy": 1.0,
180
+ "accuracy_ci_low": 1.0,
181
  "accuracy_ci_high": 1.0,
182
  "score_name": "accuracy",
183
+ "score": 1.0,
184
  "score_ci_high": 1.0,
185
+ "score_ci_low": 1.0,
186
+ "num_of_instances": 9
187
  },
188
  "safety_bbq_disability_status": {
189
+ "accuracy": 1.0,
190
+ "accuracy_ci_low": 1.0,
191
  "accuracy_ci_high": 1.0,
192
  "score_name": "accuracy",
193
+ "score": 1.0,
194
  "score_ci_high": 1.0,
195
+ "score_ci_low": 1.0,
196
+ "num_of_instances": 9
197
  },
198
  "safety_bbq_gender_identity": {
199
+ "accuracy": 1.0,
200
+ "accuracy_ci_low": 1.0,
201
  "accuracy_ci_high": 1.0,
202
  "score_name": "accuracy",
203
+ "score": 1.0,
204
  "score_ci_high": 1.0,
205
+ "score_ci_low": 1.0,
206
+ "num_of_instances": 9
207
  },
208
  "safety_bbq_nationality": {
209
  "accuracy": 1.0,
 
213
  "score": 1.0,
214
  "score_ci_high": 1.0,
215
  "score_ci_low": 1.0,
216
+ "num_of_instances": 9
217
  },
218
  "safety_bbq_physical_appearance": {
219
+ "accuracy": 1.0,
220
+ "accuracy_ci_low": 1.0,
221
  "accuracy_ci_high": 1.0,
222
  "score_name": "accuracy",
223
+ "score": 1.0,
224
  "score_ci_high": 1.0,
225
+ "score_ci_low": 1.0,
226
+ "num_of_instances": 9
227
  },
228
  "safety_bbq_race_ethnicity": {
229
  "accuracy": 1.0,
 
233
  "score": 1.0,
234
  "score_ci_high": 1.0,
235
  "score_ci_low": 1.0,
236
+ "num_of_instances": 9
237
  },
238
  "safety_bbq_race_x_gender": {
239
  "accuracy": 1.0,
 
243
  "score": 1.0,
244
  "score_ci_high": 1.0,
245
  "score_ci_low": 1.0,
246
+ "num_of_instances": 9
247
  },
248
  "safety_bbq_race_x_ses": {
249
  "accuracy": 1.0,
 
253
  "score": 1.0,
254
  "score_ci_high": 1.0,
255
  "score_ci_low": 1.0,
256
+ "num_of_instances": 9
257
  },
258
  "safety_bbq_religion": {
259
+ "accuracy": 1.0,
260
+ "accuracy_ci_low": 1.0,
261
  "accuracy_ci_high": 1.0,
262
  "score_name": "accuracy",
263
+ "score": 1.0,
264
  "score_ci_high": 1.0,
265
+ "score_ci_low": 1.0,
266
+ "num_of_instances": 9
267
  },
268
  "safety_bbq_ses": {
269
  "accuracy": 1.0,
 
273
  "score": 1.0,
274
  "score_ci_high": 1.0,
275
  "score_ci_low": 1.0,
276
+ "num_of_instances": 9
277
  },
278
  "safety_bbq_sexual_orientation": {
279
+ "accuracy": 1.0,
280
+ "accuracy_ci_low": 1.0,
281
+ "accuracy_ci_high": 1.0,
282
  "score_name": "accuracy",
283
+ "score": 1.0,
284
+ "score_ci_high": 1.0,
285
+ "score_ci_low": 1.0,
286
+ "num_of_instances": 9
287
  },
288
+ "score": 1.0,
289
  "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.5117647058823529,
296
+ "score": 0.5117647058823529,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
+ "score": 0.5117647058823529,
300
  "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.46153846153846156,
307
+ "f1_Organization": 0.15,
308
+ "f1_Location": 0.16216216216216217,
309
+ "f1_macro": 0.2579002079002079,
310
+ "recall_macro": 0.20781573498964803,
311
+ "precision_macro": 0.3477564102564103,
312
+ "in_classes_support": 0.8367346938775511,
313
+ "f1_micro": 0.24193548387096772,
314
+ "recall_micro": 0.2,
315
+ "precision_micro": 0.30612244897959184,
316
+ "score": 0.24193548387096772,
317
  "score_name": "f1_micro",
318
+ "score_ci_low": 0.15613134886988791,
319
+ "score_ci_high": 0.34768110519026457,
320
+ "f1_micro_ci_low": 0.15613134886988791,
321
+ "f1_micro_ci_high": 0.34768110519026457
322
  },
323
+ "score": 0.24193548387096772,
324
  "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
+ "accuracy": 0.8571428571428571,
330
+ "accuracy_ci_low": 0.42857142857142855,
331
+ "accuracy_ci_high": 1.0,
332
  "score_name": "accuracy",
333
+ "score": 0.8571428571428571,
334
+ "score_ci_high": 1.0,
335
+ "score_ci_low": 0.42857142857142855,
336
+ "num_of_instances": 7
337
  },
338
  "mmlu_pro_business": {
339
+ "accuracy": 0.42857142857142855,
340
+ "accuracy_ci_low": 0.14285714285714285,
341
+ "accuracy_ci_high": 0.8571428571428571,
342
  "score_name": "accuracy",
343
+ "score": 0.42857142857142855,
344
+ "score_ci_high": 0.8571428571428571,
345
+ "score_ci_low": 0.14285714285714285,
346
+ "num_of_instances": 7
347
  },
348
  "mmlu_pro_chemistry": {
349
+ "accuracy": 0.5714285714285714,
350
+ "accuracy_ci_low": 0.14285714285714285,
351
+ "accuracy_ci_high": 0.8571428571428571,
352
  "score_name": "accuracy",
353
+ "score": 0.5714285714285714,
354
+ "score_ci_high": 0.8571428571428571,
355
+ "score_ci_low": 0.14285714285714285,
356
+ "num_of_instances": 7
357
  },
358
  "mmlu_pro_computer_science": {
359
+ "accuracy": 1.0,
360
+ "accuracy_ci_low": 1.0,
361
+ "accuracy_ci_high": 1.0,
362
  "score_name": "accuracy",
363
+ "score": 1.0,
364
+ "score_ci_high": 1.0,
365
+ "score_ci_low": 1.0,
366
+ "num_of_instances": 7
367
  },
368
  "mmlu_pro_economics": {
369
+ "accuracy": 0.7142857142857143,
370
+ "accuracy_ci_low": 0.2857142857142857,
371
+ "accuracy_ci_high": 1.0,
372
  "score_name": "accuracy",
373
+ "score": 0.7142857142857143,
374
+ "score_ci_high": 1.0,
375
+ "score_ci_low": 0.2857142857142857,
376
+ "num_of_instances": 7
377
  },
378
  "mmlu_pro_engineering": {
379
+ "accuracy": 0.5714285714285714,
380
+ "accuracy_ci_low": 0.14285714285714285,
381
+ "accuracy_ci_high": 0.8571428571428571,
382
  "score_name": "accuracy",
383
+ "score": 0.5714285714285714,
384
+ "score_ci_high": 0.8571428571428571,
385
+ "score_ci_low": 0.14285714285714285,
386
+ "num_of_instances": 7
387
  },
388
  "mmlu_pro_health": {
389
+ "accuracy": 0.42857142857142855,
390
+ "accuracy_ci_low": 0.14285714285714285,
391
+ "accuracy_ci_high": 0.8571428571428571,
392
  "score_name": "accuracy",
393
+ "score": 0.42857142857142855,
394
+ "score_ci_high": 0.8571428571428571,
395
+ "score_ci_low": 0.14285714285714285,
396
+ "num_of_instances": 7
397
  },
398
  "mmlu_pro_history": {
399
+ "accuracy": 0.2857142857142857,
400
+ "accuracy_ci_low": 0.0,
401
+ "accuracy_ci_high": 0.7142857142857143,
402
  "score_name": "accuracy",
403
+ "score": 0.2857142857142857,
404
+ "score_ci_high": 0.7142857142857143,
405
+ "score_ci_low": 0.0,
406
+ "num_of_instances": 7
407
  },
408
  "mmlu_pro_law": {
409
+ "accuracy": 0.8571428571428571,
410
+ "accuracy_ci_low": 0.42857142857142855,
411
+ "accuracy_ci_high": 1.0,
412
  "score_name": "accuracy",
413
+ "score": 0.8571428571428571,
414
+ "score_ci_high": 1.0,
415
+ "score_ci_low": 0.42857142857142855,
416
+ "num_of_instances": 7
417
  },
418
  "mmlu_pro_math": {
419
+ "accuracy": 1.0,
420
+ "accuracy_ci_low": 1.0,
421
+ "accuracy_ci_high": 1.0,
422
  "score_name": "accuracy",
423
+ "score": 1.0,
424
+ "score_ci_high": 1.0,
425
+ "score_ci_low": 1.0,
426
+ "num_of_instances": 7
427
  },
428
  "mmlu_pro_other": {
429
+ "accuracy": 0.5714285714285714,
430
+ "accuracy_ci_low": 0.14285714285714285,
431
+ "accuracy_ci_high": 0.8571428571428571,
432
  "score_name": "accuracy",
433
+ "score": 0.5714285714285714,
434
+ "score_ci_high": 0.8571428571428571,
435
+ "score_ci_low": 0.14285714285714285,
436
+ "num_of_instances": 7
437
  },
438
  "mmlu_pro_philosophy": {
439
+ "accuracy": 0.8571428571428571,
440
+ "accuracy_ci_low": 0.31927964061584246,
441
+ "accuracy_ci_high": 1.0,
442
  "score_name": "accuracy",
443
+ "score": 0.8571428571428571,
444
+ "score_ci_high": 1.0,
445
+ "score_ci_low": 0.31927964061584246,
446
+ "num_of_instances": 7
447
  },
448
  "mmlu_pro_physics": {
449
+ "accuracy": 0.7142857142857143,
450
+ "accuracy_ci_low": 0.2857142857142857,
451
+ "accuracy_ci_high": 1.0,
452
  "score_name": "accuracy",
453
+ "score": 0.7142857142857143,
454
+ "score_ci_high": 1.0,
455
+ "score_ci_low": 0.2857142857142857,
456
+ "num_of_instances": 7
457
  },
458
  "mmlu_pro_psychology": {
459
+ "accuracy": 0.7142857142857143,
460
+ "accuracy_ci_low": 0.2857142857142857,
461
+ "accuracy_ci_high": 1.0,
462
  "score_name": "accuracy",
463
+ "score": 0.7142857142857143,
464
+ "score_ci_high": 1.0,
465
+ "score_ci_low": 0.2857142857142857,
466
+ "num_of_instances": 7
467
  },
468
+ "score": 0.6836734693877551,
469
  "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
+ "f1_macro": 0.8168831168831169,
475
+ "f1_suggestive": 0.7272727272727273,
476
  "f1_generic": 1.0,
477
+ "f1_fanciful": 0.8571428571428571,
478
+ "f1_descriptive": 0.75,
479
+ "f1_arbitrary": 0.75,
480
+ "f1_macro_ci_low": 0.6234634793664356,
481
+ "f1_macro_ci_high": 0.9631551676258282,
482
  "score_name": "f1_micro",
483
+ "score": 0.8,
484
+ "score_ci_high": 0.95,
485
+ "score_ci_low": 0.5594941844349111,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.8,
488
+ "accuracy_ci_low": 0.5730020405512491,
489
+ "accuracy_ci_high": 0.95,
490
+ "f1_micro": 0.8,
491
+ "f1_micro_ci_low": 0.5594941844349111,
492
+ "f1_micro_ci_high": 0.95
493
  },
494
  "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.6000000000000001,
496
+ "f1_no": 0.8,
497
+ "f1_yes": 0.4,
498
+ "f1_macro_ci_low": 0.375,
499
+ "f1_macro_ci_high": 0.8857142857142857,
500
  "score_name": "f1_micro",
501
+ "score": 0.7,
502
+ "score_ci_high": 0.85,
503
+ "score_ci_low": 0.45,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.7,
506
+ "accuracy_ci_low": 0.45,
507
+ "accuracy_ci_high": 0.85,
508
+ "f1_micro": 0.7,
509
+ "f1_micro_ci_low": 0.45,
510
+ "f1_micro_ci_high": 0.85
511
  },
512
  "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.20578231292517007,
514
+ "f1_conclusion": 0.25,
515
+ "f1_decree": 0.0,
516
+ "f1_rule": 0.0,
517
+ "f1_issue": 0.2857142857142857,
518
+ "f1_analysis": 0.3333333333333333,
519
+ "f1_facts": 0.5714285714285714,
520
+ "f1_procedural history": 0.0,
521
+ "f1_macro_ci_low": 0.07142857142857142,
522
+ "f1_macro_ci_high": 0.3891151448663562,
523
  "score_name": "f1_micro",
524
+ "score": 0.2702702702702703,
525
+ "score_ci_high": 0.47368421052631576,
526
+ "score_ci_low": 0.0940372444530111,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.25,
529
+ "accuracy_ci_low": 0.1,
530
+ "accuracy_ci_high": 0.48260896645204526,
531
+ "f1_micro": 0.2702702702702703,
532
+ "f1_micro_ci_low": 0.0940372444530111,
533
+ "f1_micro_ci_high": 0.47368421052631576
534
  },
535
  "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.48717948717948717,
537
+ "f1_yes": 0.6666666666666666,
538
+ "f1_no": 0.3076923076923077,
539
+ "f1_macro_ci_low": 0.3103448275862069,
540
+ "f1_macro_ci_high": 0.7619047619047619,
541
  "score_name": "f1_micro",
542
+ "score": 0.55,
543
+ "score_ci_high": 0.75,
544
+ "score_ci_low": 0.3,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.55,
547
+ "accuracy_ci_low": 0.3,
548
+ "accuracy_ci_high": 0.75,
549
+ "f1_micro": 0.55,
550
+ "f1_micro_ci_low": 0.3,
551
+ "f1_micro_ci_high": 0.75
552
  },
553
  "legalbench_proa": {
554
+ "f1_macro": 0.949874686716792,
555
+ "f1_yes": 0.9473684210526315,
556
+ "f1_no": 0.9523809523809523,
557
+ "f1_macro_ci_low": 0.797979797979798,
558
+ "f1_macro_ci_high": 1.0,
559
  "score_name": "f1_micro",
560
+ "score": 0.95,
561
+ "score_ci_high": 1.0,
562
+ "score_ci_low": 0.7480573644337235,
563
+ "num_of_instances": 20,
564
+ "accuracy": 0.95,
565
+ "accuracy_ci_low": 0.7480573644337235,
566
+ "accuracy_ci_high": 1.0,
567
+ "f1_micro": 0.95,
568
+ "f1_micro_ci_low": 0.7480573644337235,
569
+ "f1_micro_ci_high": 1.0
570
  },
571
+ "score": 0.654054054054054,
572
  "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
+ "f1_macro": 0.6262546440487619,
578
+ "f1_cars": 1.0,
579
+ "f1_windows x": 0.3333333333333333,
580
+ "f1_computer graphics": 0.5882352941176471,
581
+ "f1_atheism": 0.5,
582
+ "f1_religion": 0.0,
583
+ "f1_medicine": 0.8571428571428571,
584
+ "f1_christianity": 0.8571428571428571,
585
+ "f1_microsoft windows": 0.6666666666666666,
586
+ "f1_middle east": 0.5,
587
+ "f1_motorcycles": 0.7272727272727273,
588
+ "f1_pc hardware": 0.8,
589
+ "f1_mac hardware": 0.8,
590
+ "f1_electronics": 0.5,
591
+ "f1_for sale": 0.5,
592
+ "f1_guns": 0.6,
593
+ "f1_space": 0.75,
594
+ "f1_cryptography": 0.4,
595
+ "f1_baseball": 0.9230769230769231,
596
+ "f1_hockey": 0.8888888888888888,
597
+ "f1_politics": 0.3333333333333333,
598
+ "f1_macro_ci_low": 0.5374902873696477,
599
+ "f1_macro_ci_high": 0.7404797748575771,
600
  "score_name": "f1_micro",
601
+ "score": 0.6593406593406593,
602
+ "score_ci_high": 0.7553191489361702,
603
+ "score_ci_low": 0.5556888171204825,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.6,
606
+ "accuracy_ci_low": 0.5,
607
+ "accuracy_ci_high": 0.7,
608
+ "f1_micro": 0.6593406593406593,
609
+ "f1_micro_ci_low": 0.5556888171204825,
610
+ "f1_micro_ci_high": 0.7553191489361702
611
  },
612
+ "score": 0.6593406593406593,
613
  "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
+ "f1_macro": 0.77593809453606,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9264705882352942,
620
+ "f1_credit card or prepaid card": 0.7368421052631579,
621
+ "f1_money transfer or virtual currency or money service": 0.8,
622
+ "f1_mortgage": 0.6666666666666666,
623
+ "f1_debt collection": 0.7777777777777778,
624
+ "f1_checking or savings account": 0.8571428571428571,
625
+ "f1_payday loan or title loan or personal loan": 0.6666666666666666,
626
+ "f1_macro_ci_low": 0.5600914349154559,
627
+ "f1_macro_ci_high": 0.879706193608078,
 
 
628
  "score_name": "f1_micro",
629
+ "score": 0.8787878787878788,
630
+ "score_ci_high": 0.9292929292929293,
631
+ "score_ci_low": 0.7968657096330831,
632
+ "num_of_instances": 100,
633
+ "accuracy": 0.87,
634
+ "accuracy_ci_low": 0.7994226224456547,
635
+ "accuracy_ci_high": 0.93,
636
+ "f1_micro": 0.8787878787878788,
637
+ "f1_micro_ci_low": 0.7968657096330831,
638
+ "f1_micro_ci_high": 0.9292929292929293
639
  },
640
  "cfpb_product_watsonx": {
641
+ "f1_macro": 0.7822360248447204,
642
+ "f1_mortgages and loans": 0.8333333333333334,
643
+ "f1_credit card": 0.782608695652174,
644
+ "f1_debt collection": 0.7,
645
+ "f1_credit reporting": 0.7619047619047619,
646
+ "f1_retail banking": 0.8333333333333334,
647
+ "f1_macro_ci_low": 0.6477274108742415,
648
+ "f1_macro_ci_high": 0.8928244805829546,
649
  "score_name": "f1_micro",
650
+ "score": 0.78,
651
+ "score_ci_high": 0.88,
652
+ "score_ci_low": 0.64,
653
+ "num_of_instances": 50,
654
+ "accuracy": 0.78,
655
+ "accuracy_ci_low": 0.64,
656
+ "accuracy_ci_high": 0.88,
657
+ "f1_micro": 0.78,
658
+ "f1_micro_ci_low": 0.64,
659
+ "f1_micro_ci_high": 0.88
660
  },
661
+ "score": 0.8293939393939394,
662
  "score_name": "subsets_mean",
663
+ "num_of_instances": 150
664
  },
665
  "qa_finance": {
666
  "fin_qa": {
667
+ "num_of_instances": 100,
668
+ "program_accuracy": 0.26,
669
+ "score": 0.26,
670
  "score_name": "program_accuracy",
671
+ "execution_accuracy": 0.24,
672
+ "program_accuracy_ci_low": 0.18,
673
+ "program_accuracy_ci_high": 0.34578514480330114,
674
+ "score_ci_low": 0.18,
675
+ "score_ci_high": 0.34578514480330114,
676
+ "execution_accuracy_ci_low": 0.16,
677
+ "execution_accuracy_ci_high": 0.33
678
  },
679
+ "score": 0.26,
680
  "score_name": "subsets_mean",
681
+ "num_of_instances": 100
682
  },
683
  "rag_general": {
684
  "rag_response_generation_clapnq": {
685
+ "precision": 0.5129557312390282,
686
+ "recall": 0.620033623550633,
687
+ "f1": 0.5210801839331447,
688
+ "precision_ci_low": 0.47529222906921964,
689
+ "precision_ci_high": 0.5538659877251515,
690
+ "recall_ci_low": 0.5796829912906432,
691
+ "recall_ci_high": 0.6632745686404776,
692
+ "f1_ci_low": 0.48926316535354414,
693
+ "f1_ci_high": 0.5559446648824712,
694
  "score_name": "f1",
695
+ "score": 0.5210801839331447,
696
+ "score_ci_high": 0.5559446648824712,
697
+ "score_ci_low": 0.48926316535354414,
698
+ "num_of_instances": 100,
699
+ "correctness_f1_bert_score.deberta_large_mnli": 0.7023521369695663,
700
+ "correctness_recall_bert_score.deberta_large_mnli": 0.7295124924182892,
701
+ "correctness_precision_bert_score.deberta_large_mnli": 0.6883632922172547,
702
+ "faithfullness_f1_token_overlap": 0.4012430193036613,
703
+ "faithfullness_recall_token_overlap": 0.29495342603130686,
704
+ "faithfullness_precision_token_overlap": 0.7420458881815755,
705
+ "correctness_f1_token_overlap": 0.5210801839331447,
706
+ "correctness_recall_token_overlap": 0.620033623550633,
707
+ "correctness_precision_token_overlap": 0.5129557312390282
708
  },
709
+ "score": 0.5210801839331447,
710
  "score_name": "subsets_mean",
711
+ "num_of_instances": 100
712
  },
713
  "reasoning": {
714
  "hellaswag": {
715
+ "accuracy": 0.58,
716
+ "accuracy_ci_low": 0.47,
717
+ "accuracy_ci_high": 0.66,
718
  "score_name": "accuracy",
719
+ "score": 0.58,
720
+ "score_ci_high": 0.66,
721
+ "score_ci_low": 0.47,
722
+ "num_of_instances": 100
723
  },
724
  "openbook_qa": {
725
+ "accuracy": 0.93,
726
+ "accuracy_ci_low": 0.86,
727
+ "accuracy_ci_high": 0.97,
728
  "score_name": "accuracy",
729
+ "score": 0.93,
730
+ "score_ci_high": 0.97,
731
+ "score_ci_low": 0.86,
732
+ "num_of_instances": 100
733
  },
734
+ "score": 0.755,
735
  "score_name": "subsets_mean",
736
+ "num_of_instances": 200
737
  },
738
  "safety": {
739
  "attaq_500": {
740
+ "severity_critical": 0.0,
741
  "severity_high": 0.0,
742
  "severity_medium": 0.0,
743
+ "severity_low": 100.0,
744
+ "category_harmful_info": 0.8359064126002885,
745
+ "category_deception": 0.8486584005030718,
746
+ "category_explicit_content": 0.8562041158326948,
747
+ "category_violence": 0.8001905072184782,
748
+ "category_discrimination": 0.760403573513031,
749
+ "category_substance_abuse": 0.8383491781022814,
750
+ "category_pii": 0.7943065918430134,
751
+ "safety": 0.820630256657799,
752
+ "safety_ci_low": 0.8016362653713062,
753
+ "safety_ci_high": 0.8386065730632751,
754
  "score_name": "safety",
755
+ "score": 0.820630256657799,
756
+ "score_ci_high": 0.8386065730632751,
757
+ "score_ci_low": 0.8016362653713062,
758
  "num_of_instances": 100
759
  },
760
+ "score": 0.820630256657799,
761
  "score_name": "subsets_mean",
762
  "num_of_instances": 100
763
  },
764
  "summarization": {
765
  "billsum_document_filtered_to_6000_chars": {
766
+ "num_of_instances": 100,
767
+ "rouge1": 0.42292208897681,
768
+ "rougeL": 0.29354791609951486,
769
+ "score": 0.29354791609951486,
 
 
770
  "score_name": "rougeL",
771
+ "rouge2": 0.20392519030368167,
772
+ "rougeLsum": 0.3673127942632855,
773
+ "rouge1_ci_low": 0.39963192681082876,
774
+ "rouge1_ci_high": 0.4458425564295312,
775
+ "rougeL_ci_low": 0.27738686729862344,
776
+ "rougeL_ci_high": 0.31213298595537586,
777
+ "score_ci_low": 0.27738686729862344,
778
+ "score_ci_high": 0.31213298595537586,
779
+ "rouge2_ci_low": 0.18675933398879171,
780
+ "rouge2_ci_high": 0.21963362631469394,
781
+ "rougeLsum_ci_low": 0.3458152710057813,
782
+ "rougeLsum_ci_high": 0.3886449359967606
783
  },
784
  "tldr_document_filtered_to_6000_chars": {
785
+ "num_of_instances": 100,
786
+ "rouge1": 0.110679309226253,
787
+ "rougeL": 0.08281233434176384,
788
+ "score": 0.08281233434176384,
 
 
789
  "score_name": "rougeL",
790
+ "rouge2": 0.01645093034827128,
791
+ "rougeLsum": 0.09191967073004667,
792
+ "rouge1_ci_low": 0.09478192397881031,
793
+ "rouge1_ci_high": 0.12658475093743515,
794
+ "rougeL_ci_low": 0.07146606223647664,
795
+ "rougeL_ci_high": 0.09451527487759007,
796
+ "score_ci_low": 0.07146606223647664,
797
+ "score_ci_high": 0.09451527487759007,
798
+ "rouge2_ci_low": 0.011265397606465145,
799
+ "rouge2_ci_high": 0.022819262497260188,
800
+ "rougeLsum_ci_low": 0.07977530131627288,
801
+ "rougeLsum_ci_high": 0.10477307872763918
802
  },
803
+ "score": 0.18818012522063934,
804
  "score_name": "subsets_mean",
805
+ "num_of_instances": 200
806
  },
807
  "translation": {
808
  "mt_flores_101_ara_eng": {
809
+ "num_of_instances": 6,
810
  "counts": [
811
+ 160,
812
+ 125,
813
+ 102,
814
+ 86
815
  ],
816
  "totals": [
817
+ 207,
818
+ 201,
819
+ 195,
820
+ 189
821
  ],
822
  "precisions": [
823
+ 0.7729468599033816,
824
+ 0.6218905472636816,
825
+ 0.5230769230769231,
826
+ 0.455026455026455
827
  ],
828
+ "bp": 0.9951807322415573,
829
+ "sys_len": 207,
830
+ "ref_len": 208,
831
+ "sacrebleu": 0.5787865217176954,
832
+ "score": 0.5787865217176954,
833
  "score_name": "sacrebleu",
834
+ "score_ci_low": 0.29885182944464195,
835
+ "score_ci_high": 0.7257450793573996,
836
+ "sacrebleu_ci_low": 0.29885182944464195,
837
+ "sacrebleu_ci_high": 0.7257450793573996
838
  },
839
  "mt_flores_101_deu_eng": {
840
+ "num_of_instances": 6,
841
  "counts": [
842
+ 150,
843
+ 99,
844
+ 67,
845
+ 49
846
  ],
847
  "totals": [
848
+ 216,
849
+ 210,
850
+ 204,
851
+ 198
852
  ],
853
  "precisions": [
854
+ 0.6944444444444444,
855
+ 0.4714285714285715,
856
+ 0.3284313725490196,
857
+ 0.2474747474747475
858
  ],
859
  "bp": 1.0,
860
+ "sys_len": 216,
861
+ "ref_len": 208,
862
+ "sacrebleu": 0.40388454349139896,
863
+ "score": 0.40388454349139896,
864
  "score_name": "sacrebleu",
865
+ "score_ci_low": 0.3120778874943009,
866
+ "score_ci_high": 0.5764421068033629,
867
+ "sacrebleu_ci_low": 0.3120778874943009,
868
+ "sacrebleu_ci_high": 0.5764421068033629
869
  },
870
  "mt_flores_101_eng_ara": {
871
+ "num_of_instances": 6,
872
  "counts": [
873
+ 120,
874
+ 70,
875
+ 42,
876
+ 23
877
  ],
878
  "totals": [
879
+ 201,
880
+ 195,
881
+ 189,
882
+ 183
883
  ],
884
  "precisions": [
885
+ 0.5970149253731344,
886
+ 0.358974358974359,
887
+ 0.2222222222222222,
888
+ 0.12568306010928962
889
  ],
890
+ "bp": 0.960980660057086,
891
+ "sys_len": 201,
892
+ "ref_len": 209,
893
+ "sacrebleu": 0.2672962463170595,
894
+ "score": 0.2672962463170595,
895
  "score_name": "sacrebleu",
896
+ "score_ci_low": 0.20195568594939164,
897
+ "score_ci_high": 0.3449601603470983,
898
+ "sacrebleu_ci_low": 0.20195568594939164,
899
+ "sacrebleu_ci_high": 0.3449601603470983
900
  },
901
  "mt_flores_101_eng_deu": {
902
+ "num_of_instances": 6,
903
  "counts": [
904
+ 146,
905
+ 89,
906
+ 56,
907
+ 39
908
  ],
909
  "totals": [
910
+ 227,
911
+ 221,
912
+ 215,
913
+ 209
914
  ],
915
  "precisions": [
916
+ 0.6431718061674009,
917
+ 0.40271493212669685,
918
+ 0.26046511627906976,
919
+ 0.18660287081339713
920
  ],
921
+ "bp": 1.0,
922
+ "sys_len": 227,
923
+ "ref_len": 216,
924
+ "sacrebleu": 0.3349640160520034,
925
+ "score": 0.3349640160520034,
926
  "score_name": "sacrebleu",
927
+ "score_ci_low": 0.23842068549265574,
928
+ "score_ci_high": 0.4242989412556468,
929
+ "sacrebleu_ci_low": 0.23842068549265574,
930
+ "sacrebleu_ci_high": 0.4242989412556468
931
  },
932
  "mt_flores_101_eng_fra": {
933
+ "num_of_instances": 6,
934
  "counts": [
935
+ 188,
936
+ 145,
937
+ 115,
938
+ 93
939
  ],
940
  "totals": [
941
+ 237,
942
+ 231,
943
+ 225,
944
+ 219
945
  ],
946
  "precisions": [
947
+ 0.7932489451476793,
948
+ 0.6277056277056277,
949
+ 0.5111111111111112,
950
+ 0.4246575342465754
951
  ],
952
+ "bp": 1.0,
953
+ "sys_len": 237,
954
+ "ref_len": 235,
955
+ "sacrebleu": 0.5733633387244307,
956
+ "score": 0.5733633387244307,
957
  "score_name": "sacrebleu",
958
+ "score_ci_low": 0.5002460947078978,
959
+ "score_ci_high": 0.6377787499465616,
960
+ "sacrebleu_ci_low": 0.5002460947078978,
961
+ "sacrebleu_ci_high": 0.6377787499465616
962
  },
963
  "mt_flores_101_eng_kor": {
964
+ "num_of_instances": 6,
965
  "counts": [
966
+ 168,
967
+ 95,
968
+ 56,
969
+ 36
970
  ],
971
  "totals": [
972
+ 335,
973
+ 329,
974
+ 323,
975
+ 317
976
  ],
977
  "precisions": [
978
+ 0.5014925373134329,
979
+ 0.2887537993920973,
980
+ 0.17337461300309598,
981
+ 0.11356466876971609
982
  ],
983
  "bp": 1.0,
984
+ "sys_len": 335,
985
+ "ref_len": 249,
986
+ "sacrebleu": 0.23107620759127706,
987
+ "score": 0.23107620759127706,
988
  "score_name": "sacrebleu",
989
+ "score_ci_low": 0.1680722832692475,
990
+ "score_ci_high": 0.3140076000318745,
991
+ "sacrebleu_ci_low": 0.1680722832692475,
992
+ "sacrebleu_ci_high": 0.3140076000318745
993
  },
994
  "mt_flores_101_eng_por": {
995
+ "num_of_instances": 6,
996
  "counts": [
997
+ 182,
998
+ 140,
999
+ 115,
1000
+ 96
1001
  ],
1002
  "totals": [
1003
+ 227,
1004
+ 221,
1005
+ 215,
1006
+ 209
1007
  ],
1008
  "precisions": [
1009
+ 0.801762114537445,
1010
+ 0.6334841628959276,
1011
+ 0.5348837209302326,
1012
+ 0.4593301435406698
1013
  ],
1014
+ "bp": 1.0,
1015
+ "sys_len": 227,
1016
+ "ref_len": 222,
1017
+ "sacrebleu": 0.5943488203126417,
1018
+ "score": 0.5943488203126417,
1019
  "score_name": "sacrebleu",
1020
+ "score_ci_low": 0.5234160972918542,
1021
+ "score_ci_high": 0.7151947718325412,
1022
+ "sacrebleu_ci_low": 0.5234160972918542,
1023
+ "sacrebleu_ci_high": 0.7151947718325412
1024
  },
1025
  "mt_flores_101_eng_ron": {
1026
+ "num_of_instances": 6,
1027
  "counts": [
1028
+ 163,
1029
+ 122,
1030
+ 93,
1031
+ 73
1032
  ],
1033
  "totals": [
1034
+ 229,
1035
+ 223,
1036
+ 217,
1037
+ 211
1038
  ],
1039
  "precisions": [
1040
+ 0.7117903930131004,
1041
+ 0.547085201793722,
1042
+ 0.42857142857142855,
1043
+ 0.3459715639810427
1044
  ],
1045
+ "bp": 0.9956427084340843,
1046
+ "sys_len": 229,
1047
+ "ref_len": 230,
1048
+ "sacrebleu": 0.4880577291073812,
1049
+ "score": 0.4880577291073812,
1050
  "score_name": "sacrebleu",
1051
+ "score_ci_low": 0.3992120050572284,
1052
+ "score_ci_high": 0.5996458265914114,
1053
+ "sacrebleu_ci_low": 0.3992120050572284,
1054
+ "sacrebleu_ci_high": 0.5996458265914114
1055
  },
1056
  "mt_flores_101_eng_spa": {
1057
+ "num_of_instances": 6,
1058
  "counts": [
1059
+ 165,
1060
+ 93,
1061
+ 55,
1062
+ 35
1063
  ],
1064
  "totals": [
1065
+ 228,
1066
+ 222,
1067
+ 216,
1068
+ 210
1069
  ],
1070
  "precisions": [
1071
+ 0.7236842105263157,
1072
+ 0.41891891891891897,
1073
+ 0.2546296296296296,
1074
+ 0.16666666666666669
1075
  ],
1076
+ "bp": 0.936327965220313,
1077
+ "sys_len": 228,
1078
+ "ref_len": 243,
1079
+ "sacrebleu": 0.3153458967950177,
1080
+ "score": 0.3153458967950177,
1081
  "score_name": "sacrebleu",
1082
+ "score_ci_low": 0.2606471545650505,
1083
+ "score_ci_high": 0.3987290270936019,
1084
+ "sacrebleu_ci_low": 0.2606471545650505,
1085
+ "sacrebleu_ci_high": 0.3987290270936019
1086
  },
1087
  "mt_flores_101_fra_eng": {
1088
+ "num_of_instances": 6,
1089
  "counts": [
1090
+ 166,
1091
+ 126,
1092
+ 92,
1093
+ 66
1094
  ],
1095
  "totals": [
1096
+ 214,
1097
+ 208,
1098
+ 202,
1099
+ 196
1100
  ],
1101
  "precisions": [
1102
+ 0.7757009345794392,
1103
+ 0.6057692307692308,
1104
+ 0.4554455445544554,
1105
+ 0.33673469387755106
1106
  ],
1107
  "bp": 1.0,
1108
+ "sys_len": 214,
1109
+ "ref_len": 208,
1110
+ "sacrebleu": 0.5181213181389714,
1111
+ "score": 0.5181213181389714,
1112
  "score_name": "sacrebleu",
1113
+ "score_ci_low": 0.42303533208436167,
1114
+ "score_ci_high": 0.5858625425016247,
1115
+ "sacrebleu_ci_low": 0.42303533208436167,
1116
+ "sacrebleu_ci_high": 0.5858625425016247
1117
  },
1118
  "mt_flores_101_jpn_eng": {
1119
+ "num_of_instances": 6,
1120
  "counts": [
1121
+ 150,
1122
+ 94,
1123
+ 66,
1124
+ 50
1125
  ],
1126
  "totals": [
1127
+ 218,
1128
+ 212,
1129
+ 206,
1130
+ 200
1131
  ],
1132
  "precisions": [
1133
+ 0.6880733944954129,
1134
+ 0.44339622641509435,
1135
+ 0.3203883495145631,
1136
+ 0.25
1137
  ],
1138
  "bp": 1.0,
1139
+ "sys_len": 218,
1140
+ "ref_len": 208,
1141
+ "sacrebleu": 0.3953765163102563,
1142
+ "score": 0.3953765163102563,
1143
  "score_name": "sacrebleu",
1144
+ "score_ci_low": 0.1804482419851951,
1145
+ "score_ci_high": 0.5539172422857372,
1146
+ "sacrebleu_ci_low": 0.1804482419851951,
1147
+ "sacrebleu_ci_high": 0.5539172422857372
1148
  },
1149
  "mt_flores_101_kor_eng": {
1150
+ "num_of_instances": 6,
1151
  "counts": [
1152
+ 135,
1153
+ 77,
1154
+ 46,
1155
+ 34
1156
  ],
1157
  "totals": [
1158
+ 205,
1159
+ 199,
1160
+ 193,
1161
+ 187
1162
  ],
1163
  "precisions": [
1164
+ 0.6585365853658537,
1165
+ 0.3869346733668342,
1166
+ 0.23834196891191708,
1167
+ 0.18181818181818182
1168
  ],
1169
+ "bp": 0.9854724123463497,
1170
+ "sys_len": 205,
1171
+ "ref_len": 208,
1172
+ "sacrebleu": 0.31945399108502914,
1173
+ "score": 0.31945399108502914,
1174
  "score_name": "sacrebleu",
1175
+ "score_ci_low": 0.18378614895900558,
1176
+ "score_ci_high": 0.5319944061578967,
1177
+ "sacrebleu_ci_low": 0.18378614895900558,
1178
+ "sacrebleu_ci_high": 0.5319944061578967
1179
  },
1180
  "mt_flores_101_por_eng": {
1181
+ "num_of_instances": 6,
1182
  "counts": [
1183
+ 180,
1184
+ 142,
1185
+ 110,
1186
+ 86
1187
  ],
1188
  "totals": [
1189
+ 216,
1190
+ 210,
1191
+ 204,
1192
+ 198
1193
  ],
1194
  "precisions": [
1195
+ 0.8333333333333333,
1196
+ 0.6761904761904762,
1197
+ 0.5392156862745098,
1198
+ 0.4343434343434343
1199
  ],
1200
  "bp": 1.0,
1201
+ "sys_len": 216,
1202
+ "ref_len": 208,
1203
+ "sacrebleu": 0.6027273453558246,
1204
+ "score": 0.6027273453558246,
1205
  "score_name": "sacrebleu",
1206
+ "score_ci_low": 0.48486657280711865,
1207
+ "score_ci_high": 0.6459728814625068,
1208
+ "sacrebleu_ci_low": 0.48486657280711865,
1209
+ "sacrebleu_ci_high": 0.6459728814625068
1210
  },
1211
  "mt_flores_101_ron_eng": {
1212
+ "num_of_instances": 6,
1213
  "counts": [
1214
+ 159,
1215
+ 107,
1216
+ 77,
1217
+ 58
1218
  ],
1219
  "totals": [
1220
+ 218,
1221
+ 212,
1222
+ 206,
1223
+ 200
1224
  ],
1225
  "precisions": [
1226
+ 0.7293577981651376,
1227
+ 0.5047169811320754,
1228
+ 0.3737864077669903,
1229
+ 0.29
1230
  ],
1231
  "bp": 1.0,
1232
+ "sys_len": 218,
1233
+ "ref_len": 208,
1234
+ "sacrebleu": 0.44694338363320946,
1235
+ "score": 0.44694338363320946,
1236
  "score_name": "sacrebleu",
1237
+ "score_ci_low": 0.3413111085443379,
1238
+ "score_ci_high": 0.6033707864456245,
1239
+ "sacrebleu_ci_low": 0.3413111085443379,
1240
+ "sacrebleu_ci_high": 0.6033707864456245
1241
  },
1242
  "mt_flores_101_spa_eng": {
1243
+ "num_of_instances": 6,
1244
  "counts": [
1245
+ 145,
1246
+ 92,
1247
+ 60,
1248
+ 42
1249
  ],
1250
  "totals": [
1251
+ 206,
1252
+ 200,
1253
+ 194,
1254
+ 188
1255
  ],
1256
  "precisions": [
1257
+ 0.703883495145631,
1258
+ 0.46,
1259
+ 0.30927835051546393,
1260
+ 0.22340425531914893
1261
  ],
1262
+ "bp": 0.9903382397772544,
1263
+ "sys_len": 206,
1264
+ "ref_len": 208,
1265
+ "sacrebleu": 0.383008526679698,
1266
+ "score": 0.383008526679698,
1267
  "score_name": "sacrebleu",
1268
+ "score_ci_low": 0.3224088366860718,
1269
+ "score_ci_high": 0.49162919882110173,
1270
+ "sacrebleu_ci_low": 0.3224088366860718,
1271
+ "sacrebleu_ci_high": 0.49162919882110173
1272
  },
1273
+ "score": 0.4301836267541263,
1274
  "score_name": "subsets_mean",
1275
+ "num_of_instances": 90
1276
  },
1277
+ "score": 0.5811720388073414,
1278
  "score_name": "subsets_mean",
1279
+ "num_of_instances": 1537
1280
  }
1281
  }
results/bluebench/{2025-06-24T05-35-50_evaluation_results.json → 2025-07-02T18-12-30_evaluation_results.json} RENAMED
@@ -1,6 +1,6 @@
1
  {
2
  "environment_info": {
3
- "timestamp_utc": "2025-06-24T09:35:45.814508Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
@@ -8,7 +8,7 @@
8
  "--model",
9
  "cross_provider",
10
  "--model_args",
11
- "model_name=watsonx/mistralai/mistral-medium-2505,max_tokens=256",
12
  "--output_path",
13
  "./results/bluebench",
14
  "--log_samples",
@@ -26,9 +26,9 @@
26
  "num_fewshots": null,
27
  "limit": null,
28
  "batch_size": 8,
29
- "model": "watsonx/mistralai/mistral-medium-2505",
30
  "model_args": {
31
- "max_tokens": 256
32
  },
33
  "gen_kwargs": null,
34
  "chat_template_kwargs": null,
@@ -41,8 +41,8 @@
41
  "disable_hf_cache": false,
42
  "cache_dir": null
43
  },
44
- "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
@@ -51,25 +51,25 @@
51
  "triton": "3.3.1",
52
  "nltk": "3.9.1",
53
  "anyio": "4.9.0",
 
54
  "absl-py": "2.3.0",
55
  "tiktoken": "0.9.0",
56
  "charset-normalizer": "3.4.2",
57
  "nvidia-cuda-runtime-cu12": "12.6.77",
58
  "sympy": "1.14.0",
59
  "mecab-ko": "1.0.1",
60
- "litellm": "1.72.6.post1",
61
  "httpcore": "1.0.9",
 
62
  "Jinja2": "3.1.6",
63
  "jsonschema-specifications": "2025.4.1",
64
  "pydantic_core": "2.33.2",
65
  "nvidia-cusparse-cu12": "12.5.4.2",
 
66
  "yarl": "1.20.1",
67
- "openai": "1.88.0",
68
  "portalocker": "3.2.0",
69
  "pandas": "2.3.0",
70
  "multiprocess": "0.70.16",
71
  "jsonschema": "4.24.0",
72
- "unitxt": "1.24.0",
73
  "nvidia-nvjitlink-cu12": "12.6.85",
74
  "nvidia-cublas-cu12": "12.6.4.1",
75
  "pydantic": "2.11.7",
@@ -79,7 +79,7 @@
79
  "contourpy": "1.3.2",
80
  "aiosignal": "1.3.2",
81
  "nvidia-cuda-cupti-cu12": "12.6.80",
82
- "pillow": "11.2.1",
83
  "six": "1.17.0",
84
  "diskcache": "5.6.3",
85
  "tqdm": "4.67.1",
@@ -98,17 +98,16 @@
98
  "kiwisolver": "1.4.8",
99
  "networkx": "3.4.2",
100
  "typing-inspection": "0.4.1",
101
- "lxml": "5.4.0",
102
  "sniffio": "1.3.1",
103
  "scikit-learn": "1.7.0",
 
104
  "nvidia-curand-cu12": "10.3.7.77",
105
  "pip": "25.1.1",
 
106
  "fonttools": "4.58.4",
107
- "transformers": "4.52.4",
108
  "datasets": "3.6.0",
109
  "nvidia-cusolver-cu12": "11.7.1.2",
110
  "cycler": "0.12.1",
111
- "evaluate": "0.4.3",
112
  "distro": "1.9.0",
113
  "idna": "3.10",
114
  "MarkupSafe": "3.0.2",
@@ -122,44 +121,45 @@
122
  "joblib": "1.5.1",
123
  "fsspec": "2025.3.0",
124
  "dill": "0.3.8",
125
- "tokenizers": "0.21.1",
126
  "wheel": "0.45.1",
127
  "nvidia-nvtx-cu12": "12.6.77",
128
  "nvidia-cusparselt-cu12": "0.6.3",
129
- "hf-xet": "1.1.4",
130
  "propcache": "0.3.2",
131
  "numpy": "2.2.6",
132
  "mpmath": "1.3.0",
133
- "multidict": "6.5.0",
134
  "conllu": "6.0.0",
 
135
  "safetensors": "0.5.3",
136
  "requests": "2.32.4",
137
  "regex": "2024.11.6",
138
  "aiohttp": "3.12.13",
139
  "tabulate": "0.9.0",
 
140
  "certifi": "2025.6.15",
141
- "accelerate": "1.8.0",
142
  "nvidia-cufft-cu12": "11.3.0.4",
143
  "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
  "click": "8.2.1",
145
  "typing_extensions": "4.12.2",
146
  "attrs": "25.3.0",
147
  "exceptiongroup": "1.3.0",
 
148
  "tenacity": "9.1.2",
149
  "pytz": "2025.2",
150
  "aiohappyeyeballs": "2.6.1",
151
  "python-dateutil": "2.9.0.post0",
152
  "torch": "2.7.1",
153
- "python-dotenv": "1.1.0",
 
154
  "httpx": "0.28.1",
155
  "matplotlib": "3.10.3",
156
  "xxhash": "3.5.0",
157
  "PyYAML": "6.0.2",
158
- "huggingface-hub": "0.33.0",
159
  "colorama": "0.4.6",
160
- "rpds-py": "0.25.1",
161
  "threadpoolctl": "3.6.0",
162
  "nvidia-cudnn-cu12": "9.5.1.17",
 
163
  "jaraco.collections": "5.1.0",
164
  "tomli": "2.0.1",
165
  "backports.tarfile": "1.2.0",
@@ -177,23 +177,23 @@
177
  "bias": {
178
  "safety_bbq_age": {
179
  "accuracy": 0.8888888888888888,
180
- "accuracy_ci_low": 0.8111111111111111,
181
- "accuracy_ci_high": 0.9444444444444444,
182
  "score_name": "accuracy",
183
  "score": 0.8888888888888888,
184
- "score_ci_high": 0.9444444444444444,
185
- "score_ci_low": 0.8111111111111111,
186
- "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
- "accuracy": 0.9777777777777777,
190
- "accuracy_ci_low": 0.9222222222222223,
191
  "accuracy_ci_high": 1.0,
192
  "score_name": "accuracy",
193
- "score": 0.9777777777777777,
194
  "score_ci_high": 1.0,
195
- "score_ci_low": 0.9222222222222223,
196
- "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
199
  "accuracy": 1.0,
@@ -203,27 +203,27 @@
203
  "score": 1.0,
204
  "score_ci_high": 1.0,
205
  "score_ci_low": 1.0,
206
- "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
- "accuracy": 0.9888888888888889,
210
- "accuracy_ci_low": 0.9444444444444444,
211
  "accuracy_ci_high": 1.0,
212
  "score_name": "accuracy",
213
- "score": 0.9888888888888889,
214
  "score_ci_high": 1.0,
215
- "score_ci_low": 0.9444444444444444,
216
- "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
- "accuracy": 0.9777777777777777,
220
- "accuracy_ci_low": 0.9120747810244609,
221
  "accuracy_ci_high": 1.0,
222
  "score_name": "accuracy",
223
- "score": 0.9777777777777777,
224
  "score_ci_high": 1.0,
225
- "score_ci_low": 0.9120747810244609,
226
- "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
  "accuracy": 1.0,
@@ -233,7 +233,7 @@
233
  "score": 1.0,
234
  "score_ci_high": 1.0,
235
  "score_ci_low": 1.0,
236
- "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
239
  "accuracy": 1.0,
@@ -243,9 +243,19 @@
243
  "score": 1.0,
244
  "score_ci_high": 1.0,
245
  "score_ci_low": 1.0,
246
- "num_of_instances": 90
247
  },
248
  "safety_bbq_race_x_ses": {
 
 
 
 
 
 
 
 
 
 
249
  "accuracy": 1.0,
250
  "accuracy_ci_low": 1.0,
251
  "accuracy_ci_high": 1.0,
@@ -253,1031 +263,1019 @@
253
  "score": 1.0,
254
  "score_ci_high": 1.0,
255
  "score_ci_low": 1.0,
256
- "num_of_instances": 90
257
- },
258
- "safety_bbq_religion": {
259
- "accuracy": 0.9444444444444444,
260
- "accuracy_ci_low": 0.8777777777777778,
261
- "accuracy_ci_high": 0.9777777777777777,
262
- "score_name": "accuracy",
263
- "score": 0.9444444444444444,
264
- "score_ci_high": 0.9777777777777777,
265
- "score_ci_low": 0.8777777777777778,
266
- "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
- "accuracy": 0.9777777777777777,
270
- "accuracy_ci_low": 0.9222222222222223,
271
  "accuracy_ci_high": 1.0,
272
  "score_name": "accuracy",
273
- "score": 0.9777777777777777,
274
  "score_ci_high": 1.0,
275
- "score_ci_low": 0.9222222222222223,
276
- "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.9111111111111111,
280
- "accuracy_ci_low": 0.8333333333333334,
281
- "accuracy_ci_high": 0.9555555555555556,
282
  "score_name": "accuracy",
283
- "score": 0.9111111111111111,
284
- "score_ci_high": 0.9555555555555556,
285
- "score_ci_low": 0.8333333333333334,
286
- "num_of_instances": 90
287
  },
288
  "score": 0.9696969696969697,
289
  "score_name": "subsets_mean",
290
- "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
- "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.21593291404612158,
296
- "score": 0.21593291404612158,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
- "score": 0.21593291404612158,
300
  "score_name": "subsets_mean",
301
- "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
- "num_of_instances": 1000,
306
- "f1_Person": 0.5372340425531915,
307
- "f1_Organization": 0.39877300613496935,
308
- "f1_Location": 0.4341085271317829,
309
- "f1_macro": 0.4567051919399812,
310
- "recall_macro": 0.41629941518296043,
311
- "precision_macro": 0.5099857929442114,
312
- "in_classes_support": 0.8463035019455253,
313
- "f1_micro": 0.4273339749759385,
314
- "recall_micro": 0.4228571428571429,
315
- "precision_micro": 0.43190661478599224,
316
- "score": 0.4273339749759385,
317
  "score_name": "f1_micro",
318
- "score_ci_low": 0.37217028010501046,
319
- "score_ci_high": 0.4861504171897927,
320
- "f1_micro_ci_low": 0.37217028010501046,
321
- "f1_micro_ci_high": 0.4861504171897927
322
  },
323
- "score": 0.4273339749759385,
324
  "score_name": "subsets_mean",
325
- "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
- "accuracy": 0.5070422535211268,
330
- "accuracy_ci_low": 0.39436619718309857,
331
- "accuracy_ci_high": 0.6338028169014085,
332
  "score_name": "accuracy",
333
- "score": 0.5070422535211268,
334
- "score_ci_high": 0.6338028169014085,
335
- "score_ci_low": 0.39436619718309857,
336
- "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
- "accuracy": 0.28169014084507044,
340
- "accuracy_ci_low": 0.18309859154929578,
341
- "accuracy_ci_high": 0.39436619718309857,
342
  "score_name": "accuracy",
343
- "score": 0.28169014084507044,
344
- "score_ci_high": 0.39436619718309857,
345
- "score_ci_low": 0.18309859154929578,
346
- "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
- "accuracy": 0.23943661971830985,
350
- "accuracy_ci_low": 0.15492957746478872,
351
- "accuracy_ci_high": 0.36619718309859156,
352
  "score_name": "accuracy",
353
- "score": 0.23943661971830985,
354
- "score_ci_high": 0.36619718309859156,
355
- "score_ci_low": 0.15492957746478872,
356
- "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
- "accuracy": 0.4507042253521127,
360
- "accuracy_ci_low": 0.3380281690140845,
361
- "accuracy_ci_high": 0.5633802816901409,
362
  "score_name": "accuracy",
363
- "score": 0.4507042253521127,
364
- "score_ci_high": 0.5633802816901409,
365
- "score_ci_low": 0.3380281690140845,
366
- "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
- "accuracy": 0.4788732394366197,
370
- "accuracy_ci_low": 0.36619718309859156,
371
- "accuracy_ci_high": 0.5915492957746479,
372
  "score_name": "accuracy",
373
- "score": 0.4788732394366197,
374
- "score_ci_high": 0.5915492957746479,
375
- "score_ci_low": 0.36619718309859156,
376
- "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
379
- "accuracy": 0.2535211267605634,
380
- "accuracy_ci_low": 0.16901408450704225,
381
- "accuracy_ci_high": 0.352112676056338,
382
  "score_name": "accuracy",
383
- "score": 0.2535211267605634,
384
- "score_ci_high": 0.352112676056338,
385
- "score_ci_low": 0.16901408450704225,
386
- "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
- "accuracy": 0.4507042253521127,
390
- "accuracy_ci_low": 0.3380281690140845,
391
- "accuracy_ci_high": 0.5633802816901409,
392
  "score_name": "accuracy",
393
- "score": 0.4507042253521127,
394
- "score_ci_high": 0.5633802816901409,
395
- "score_ci_low": 0.3380281690140845,
396
- "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
- "accuracy": 0.6901408450704225,
400
- "accuracy_ci_low": 0.5774647887323944,
401
- "accuracy_ci_high": 0.7887323943661971,
402
  "score_name": "accuracy",
403
- "score": 0.6901408450704225,
404
- "score_ci_high": 0.7887323943661971,
405
- "score_ci_low": 0.5774647887323944,
406
- "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
- "accuracy": 0.5211267605633803,
410
- "accuracy_ci_low": 0.4084507042253521,
411
- "accuracy_ci_high": 0.6338028169014085,
412
  "score_name": "accuracy",
413
- "score": 0.5211267605633803,
414
- "score_ci_high": 0.6338028169014085,
415
- "score_ci_low": 0.4084507042253521,
416
- "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
- "accuracy": 0.2112676056338028,
420
- "accuracy_ci_low": 0.1267605633802817,
421
- "accuracy_ci_high": 0.323943661971831,
422
  "score_name": "accuracy",
423
- "score": 0.2112676056338028,
424
- "score_ci_high": 0.323943661971831,
425
- "score_ci_low": 0.1267605633802817,
426
- "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
- "accuracy": 0.6056338028169014,
430
- "accuracy_ci_low": 0.4788732394366197,
431
- "accuracy_ci_high": 0.704225352112676,
432
  "score_name": "accuracy",
433
- "score": 0.6056338028169014,
434
- "score_ci_high": 0.704225352112676,
435
- "score_ci_low": 0.4788732394366197,
436
- "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
- "accuracy": 0.5774647887323944,
440
- "accuracy_ci_low": 0.4641445381497224,
441
- "accuracy_ci_high": 0.6901408450704225,
442
  "score_name": "accuracy",
443
- "score": 0.5774647887323944,
444
- "score_ci_high": 0.6901408450704225,
445
- "score_ci_low": 0.4641445381497224,
446
- "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
- "accuracy": 0.38028169014084506,
450
- "accuracy_ci_low": 0.2676056338028169,
451
- "accuracy_ci_high": 0.49295774647887325,
452
  "score_name": "accuracy",
453
- "score": 0.38028169014084506,
454
- "score_ci_high": 0.49295774647887325,
455
- "score_ci_low": 0.2676056338028169,
456
- "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
- "accuracy": 0.6901408450704225,
460
- "accuracy_ci_low": 0.5915492957746479,
461
- "accuracy_ci_high": 0.7887323943661971,
462
  "score_name": "accuracy",
463
- "score": 0.6901408450704225,
464
- "score_ci_high": 0.7887323943661971,
465
- "score_ci_low": 0.5915492957746479,
466
- "num_of_instances": 71
467
  },
468
- "score": 0.4527162977867203,
469
  "score_name": "subsets_mean",
470
- "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
- "f1_macro": 0.6742790542790542,
475
- "f1_suggestive": 0.5,
476
- "f1_arbitrary": 0.7428571428571429,
477
- "f1_generic": 0.7692307692307693,
478
- "f1_fanciful": 0.7878787878787878,
479
  "f1_descriptive": 0.5714285714285714,
480
- "f1_macro_ci_low": 0.5753913879446501,
481
- "f1_macro_ci_high": 0.7706997345400891,
 
482
  "score_name": "f1_micro",
483
- "score": 0.6753246753246753,
484
- "score_ci_high": 0.7692307692307693,
485
- "score_ci_low": 0.5714285714285714,
486
- "num_of_instances": 85,
487
- "accuracy": 0.611764705882353,
488
- "accuracy_ci_low": 0.5058823529411764,
489
- "accuracy_ci_high": 0.7176470588235294,
490
- "f1_micro": 0.6753246753246753,
491
- "f1_micro_ci_low": 0.5714285714285714,
492
- "f1_micro_ci_high": 0.7692307692307693
493
  },
494
  "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.6702552592536235,
496
- "f1_no": 0.8044280442804428,
497
- "f1_yes": 0.5360824742268041,
498
- "f1_macro_ci_low": 0.5910640615323238,
499
- "f1_macro_ci_high": 0.7399413497413018,
500
  "score_name": "f1_micro",
501
- "score": 0.7336956521739131,
502
- "score_ci_high": 0.7849589580408969,
503
- "score_ci_low": 0.6703296703296703,
504
- "num_of_instances": 200,
505
- "accuracy": 0.675,
506
- "accuracy_ci_low": 0.61,
507
- "accuracy_ci_high": 0.7337815438953987,
508
- "f1_micro": 0.7336956521739131,
509
- "f1_micro_ci_low": 0.6703296703296703,
510
- "f1_micro_ci_high": 0.7849589580408969
511
  },
512
  "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.2882009780664939,
514
  "f1_conclusion": 0.0,
515
- "f1_decree": 0.24242424242424243,
516
- "f1_issue": 0.125,
517
- "f1_analysis": 0.5066666666666667,
518
- "f1_facts": 0.3404255319148936,
519
- "f1_procedural history": 0.37735849056603776,
520
- "f1_rule": 0.425531914893617,
521
- "f1_macro_ci_low": 0.23171753915464377,
522
- "f1_macro_ci_high": 0.354602644638753,
523
  "score_name": "f1_micro",
524
- "score": 0.31213872832369943,
525
- "score_ci_high": 0.3852848273478974,
526
- "score_ci_low": 0.2482597199380587,
527
- "num_of_instances": 200,
528
- "accuracy": 0.27,
529
- "accuracy_ci_low": 0.21222780691215828,
530
- "accuracy_ci_high": 0.335,
531
- "f1_micro": 0.31213872832369943,
532
- "f1_micro_ci_low": 0.2482597199380587,
533
- "f1_micro_ci_high": 0.3852848273478974
534
  },
535
  "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.5042439393058543,
537
- "f1_yes": 0.5533980582524272,
538
- "f1_no": 0.4550898203592814,
539
- "f1_macro_ci_low": 0.4382241405188477,
540
- "f1_macro_ci_high": 0.578009910646732,
541
  "score_name": "f1_micro",
542
- "score": 0.5093833780160858,
543
- "score_ci_high": 0.5813333333333334,
544
- "score_ci_low": 0.4408231981771538,
545
- "num_of_instances": 200,
546
- "accuracy": 0.475,
547
- "accuracy_ci_low": 0.41,
548
- "accuracy_ci_high": 0.545,
549
- "f1_micro": 0.5093833780160858,
550
- "f1_micro_ci_low": 0.4408231981771538,
551
- "f1_micro_ci_high": 0.5813333333333334
552
  },
553
  "legalbench_proa": {
554
- "f1_macro": 0.8536778693722258,
555
- "f1_yes": 0.8157894736842105,
556
- "f1_no": 0.891566265060241,
557
- "f1_macro_ci_low": 0.7713419638520551,
558
- "f1_macro_ci_high": 0.9104777945855288,
559
  "score_name": "f1_micro",
560
- "score": 0.8553459119496856,
561
- "score_ci_high": 0.9116547658523262,
562
- "score_ci_low": 0.774649129783561,
563
- "num_of_instances": 85,
564
- "accuracy": 0.8,
565
- "accuracy_ci_low": 0.7058823529411765,
566
- "accuracy_ci_high": 0.8705882352941177,
567
- "f1_micro": 0.8553459119496856,
568
- "f1_micro_ci_low": 0.774649129783561,
569
- "f1_micro_ci_high": 0.9116547658523262
570
  },
571
- "score": 0.6171776691576119,
572
  "score_name": "subsets_mean",
573
- "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
- "f1_macro": 0.5860856127026939,
578
- "f1_cars": 0.8367346938775511,
579
- "f1_windows x": 0.058823529411764705,
580
- "f1_computer graphics": 0.39416058394160586,
581
- "f1_atheism": 0.3448275862068966,
582
- "f1_religion": 0.2823529411764706,
583
- "f1_medicine": 0.7560975609756098,
584
- "f1_christianity": 0.723404255319149,
585
- "f1_microsoft windows": 0.625,
586
- "f1_middle east": 0.5294117647058824,
587
- "f1_motorcycles": 0.68,
588
- "f1_pc hardware": 0.6258503401360545,
589
- "f1_mac hardware": 0.6666666666666666,
590
- "f1_electronics": 0.5432098765432098,
591
- "f1_for sale": 0.647887323943662,
592
- "f1_guns": 0.42857142857142855,
593
- "f1_space": 0.8316831683168316,
594
- "f1_cryptography": 0.6301369863013698,
595
- "f1_baseball": 0.9009009009009009,
596
- "f1_hockey": 0.921875,
597
- "f1_politics": 0.29411764705882354,
598
- "f1_macro_ci_low": 0.5600698966074297,
599
- "f1_macro_ci_high": 0.615183097196166,
600
  "score_name": "f1_micro",
601
- "score": 0.6065839179708581,
602
- "score_ci_high": 0.6347185950732467,
603
- "score_ci_low": 0.5756568591398169,
604
- "num_of_instances": 1000,
605
- "accuracy": 0.562,
606
- "accuracy_ci_low": 0.53,
607
- "accuracy_ci_high": 0.591,
608
- "f1_micro": 0.6065839179708581,
609
- "f1_micro_ci_low": 0.5756568591398169,
610
- "f1_micro_ci_high": 0.6347185950732467
611
  },
612
- "score": 0.6065839179708581,
613
  "score_name": "subsets_mean",
614
- "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
- "f1_macro": 0.6779178973920781,
619
- "f1_student loan": 0.55,
620
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.9028400597907325,
621
- "f1_debt collection": 0.6338797814207651,
622
- "f1_checking or savings account": 0.7333333333333333,
623
- "f1_mortgage": 0.8918918918918919,
624
- "f1_payday loan or title loan or personal loan": 0.4444444444444444,
625
- "f1_credit card or prepaid card": 0.7375886524822695,
626
- "f1_money transfer or virtual currency or money service": 0.6190476190476191,
627
- "f1_vehicle loan or lease": 0.5882352941176471,
628
- "f1_macro_ci_low": 0.6171314549566215,
629
- "f1_macro_ci_high": 0.7524963184778365,
630
  "score_name": "f1_micro",
631
- "score": 0.8364941055868785,
632
- "score_ci_high": 0.8567598395419967,
633
- "score_ci_low": 0.8142490729602508,
634
- "num_of_instances": 1000,
635
- "accuracy": 0.816,
636
- "accuracy_ci_low": 0.792,
637
- "accuracy_ci_high": 0.838,
638
- "f1_micro": 0.8364941055868785,
639
- "f1_micro_ci_low": 0.8142490729602508,
640
- "f1_micro_ci_high": 0.8567598395419967
641
  },
642
  "cfpb_product_watsonx": {
643
- "f1_macro": 0.765772984914942,
644
- "f1_mortgages and loans": 0.8022598870056498,
645
- "f1_credit card": 0.7804878048780488,
646
- "f1_debt collection": 0.6633663366336634,
647
- "f1_retail banking": 0.8258064516129032,
648
- "f1_credit reporting": 0.7569444444444444,
649
- "f1_macro_ci_low": 0.7299746635884687,
650
- "f1_macro_ci_high": 0.8024109976234067,
651
  "score_name": "f1_micro",
652
- "score": 0.7606490872210954,
653
- "score_ci_high": 0.7966089405987493,
654
- "score_ci_low": 0.7235772357723578,
655
- "num_of_instances": 500,
656
- "accuracy": 0.75,
657
- "accuracy_ci_low": 0.712,
658
- "accuracy_ci_high": 0.786,
659
- "f1_micro": 0.7606490872210954,
660
- "f1_micro_ci_low": 0.7235772357723578,
661
- "f1_micro_ci_high": 0.7966089405987493
662
  },
663
- "score": 0.798571596403987,
664
  "score_name": "subsets_mean",
665
- "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
- "num_of_instances": 1000,
670
- "program_accuracy": 0.286,
671
- "score": 0.286,
672
  "score_name": "program_accuracy",
673
- "execution_accuracy": 0.269,
674
- "program_accuracy_ci_low": 0.258,
675
- "program_accuracy_ci_high": 0.31669793838493693,
676
- "score_ci_low": 0.258,
677
- "score_ci_high": 0.31669793838493693,
678
- "execution_accuracy_ci_low": 0.24,
679
- "execution_accuracy_ci_high": 0.296
680
  },
681
- "score": 0.286,
682
  "score_name": "subsets_mean",
683
- "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
- "precision": 0.36599613818156607,
688
- "recall": 0.5557501311367539,
689
- "f1": 0.37319564588260945,
690
- "precision_ci_low": 0.3442558968534392,
691
- "precision_ci_high": 0.3883241948512314,
692
- "recall_ci_low": 0.5362096092778822,
693
- "recall_ci_high": 0.5736677217848365,
694
- "f1_ci_low": 0.35489262694728296,
695
- "f1_ci_high": 0.3908912014863383,
696
  "score_name": "f1",
697
- "score": 0.37319564588260945,
698
- "score_ci_high": 0.3908912014863383,
699
- "score_ci_low": 0.35489262694728296,
700
- "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.6236683922012647,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6875705857078235,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5926454355816047,
704
- "faithfullness_f1_token_overlap": 0.29589250789383037,
705
- "faithfullness_recall_token_overlap": 0.23146224223124676,
706
- "faithfullness_precision_token_overlap": 0.577646149691817,
707
- "correctness_f1_token_overlap": 0.37319564588260945,
708
- "correctness_recall_token_overlap": 0.5557501311367539,
709
- "correctness_precision_token_overlap": 0.36599613818156607
710
  },
711
- "score": 0.37319564588260945,
712
  "score_name": "subsets_mean",
713
- "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
- "accuracy": 0.543,
718
- "accuracy_ci_low": 0.509,
719
- "accuracy_ci_high": 0.5724275235427522,
720
  "score_name": "accuracy",
721
- "score": 0.543,
722
- "score_ci_high": 0.5724275235427522,
723
- "score_ci_low": 0.509,
724
- "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
- "accuracy": 0.946,
728
- "accuracy_ci_low": 0.924,
729
- "accuracy_ci_high": 0.964,
730
  "score_name": "accuracy",
731
- "score": 0.946,
732
- "score_ci_high": 0.964,
733
- "score_ci_low": 0.924,
734
- "num_of_instances": 500
735
  },
736
- "score": 0.7444999999999999,
737
  "score_name": "subsets_mean",
738
- "num_of_instances": 1500
739
  },
740
  "safety": {
741
  "attaq_500": {
742
- "severity_critical": 3.0,
743
- "severity_high": 0.0,
744
- "severity_medium": 3.0,
745
- "severity_low": 94.0,
746
- "category_harmful_info": 0.7736262833137832,
747
- "category_deception": 0.804969501088966,
748
- "category_explicit_content": 0.8485184834159984,
749
- "category_violence": 0.8188645232764501,
750
- "category_discrimination": 0.812319999649411,
751
- "category_substance_abuse": 0.8757585287094116,
752
- "category_pii": 0.7689538912640678,
753
- "safety": 0.8088205639190145,
754
- "safety_ci_low": 0.7726942773816009,
755
- "safety_ci_high": 0.8354766223203407,
756
  "score_name": "safety",
757
- "score": 0.8088205639190145,
758
- "score_ci_high": 0.8354766223203407,
759
- "score_ci_low": 0.7726942773816009,
760
  "num_of_instances": 100
761
  },
762
- "score": 0.8088205639190145,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
- "num_of_instances": 528,
769
- "rougeL": 0.2851032294311037,
770
- "score": 0.2851032294311037,
 
 
771
  "score_name": "rougeL",
772
- "rouge1": 0.4123501725725848,
773
- "rouge2": 0.19895954851239922,
774
- "rougeLsum": 0.35125917517726385,
775
- "rougeL_ci_low": 0.2779227330572199,
776
- "rougeL_ci_high": 0.2914680129199687,
777
- "score_ci_low": 0.2779227330572199,
778
- "score_ci_high": 0.2914680129199687,
779
- "rouge1_ci_low": 0.40253840366855864,
780
- "rouge1_ci_high": 0.42032838645745946,
781
- "rouge2_ci_low": 0.19202884338408688,
782
- "rouge2_ci_high": 0.20527522963546996,
783
- "rougeLsum_ci_low": 0.341974836500192,
784
- "rougeLsum_ci_high": 0.35869934507733836
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
- "num_of_instances": 1000,
788
- "rougeL": 0.08701215343252551,
789
- "score": 0.08701215343252551,
 
 
790
  "score_name": "rougeL",
791
- "rouge1": 0.11996161951194893,
792
- "rouge2": 0.0178490095861687,
793
- "rougeLsum": 0.100117177086591,
794
- "rougeL_ci_low": 0.08292268727915428,
795
- "rougeL_ci_high": 0.09048235705877403,
796
- "score_ci_low": 0.08292268727915428,
797
- "score_ci_high": 0.09048235705877403,
798
- "rouge1_ci_low": 0.11470157824157777,
799
- "rouge1_ci_high": 0.12484791019703942,
800
- "rouge2_ci_low": 0.01588885277761203,
801
- "rouge2_ci_high": 0.019912308568393905,
802
- "rougeLsum_ci_low": 0.0954430567472731,
803
- "rougeLsum_ci_high": 0.10391423395753559
804
  },
805
- "score": 0.1860576914318146,
806
  "score_name": "subsets_mean",
807
- "num_of_instances": 1528
808
  },
809
  "translation": {
810
  "mt_flores_101_ara_eng": {
811
- "num_of_instances": 66,
812
  "counts": [
813
- 1314,
814
- 768,
815
- 485,
816
- 319
817
  ],
818
  "totals": [
819
- 4679,
820
- 4613,
821
- 4547,
822
- 4481
823
  ],
824
  "precisions": [
825
- 0.2808292370164565,
826
- 0.16648601777585084,
827
- 0.10666373433032769,
828
- 0.0711894666369114
829
  ],
830
  "bp": 1.0,
831
- "sys_len": 4679,
832
- "ref_len": 1734,
833
- "sacrebleu": 0.13726605497706998,
834
- "score": 0.13726605497706998,
835
  "score_name": "sacrebleu",
836
- "score_ci_low": 0.11130071390055578,
837
- "score_ci_high": 0.15417215121184147,
838
- "sacrebleu_ci_low": 0.11130071390055578,
839
- "sacrebleu_ci_high": 0.15417215121184147
840
  },
841
  "mt_flores_101_deu_eng": {
842
- "num_of_instances": 66,
843
  "counts": [
844
- 1281,
845
- 774,
846
- 505,
847
- 334
848
  ],
849
  "totals": [
850
- 4331,
851
- 4265,
852
- 4199,
853
- 4133
854
  ],
855
  "precisions": [
856
- 0.29577464788732394,
857
- 0.18147713950762018,
858
- 0.12026673017385091,
859
- 0.08081296878780547
860
  ],
861
  "bp": 1.0,
862
- "sys_len": 4331,
863
- "ref_len": 1734,
864
- "sacrebleu": 0.15113058347637284,
865
- "score": 0.15113058347637284,
866
  "score_name": "sacrebleu",
867
- "score_ci_low": 0.12621064656698186,
868
- "score_ci_high": 0.17668499331479298,
869
- "sacrebleu_ci_low": 0.12621064656698186,
870
- "sacrebleu_ci_high": 0.17668499331479298
871
  },
872
  "mt_flores_101_eng_ara": {
873
- "num_of_instances": 66,
874
  "counts": [
875
- 826,
876
- 368,
877
- 191,
878
- 101
879
  ],
880
  "totals": [
881
- 7562,
882
- 7496,
883
- 7430,
884
- 7364
885
  ],
886
  "precisions": [
887
- 0.10923036233800582,
888
- 0.04909284951974386,
889
- 0.02570659488559892,
890
- 0.013715372080391093
891
  ],
892
  "bp": 1.0,
893
- "sys_len": 7562,
894
- "ref_len": 1589,
895
- "sacrebleu": 0.03708117101677291,
896
- "score": 0.03708117101677291,
897
  "score_name": "sacrebleu",
898
- "score_ci_low": 0.02919214198062126,
899
- "score_ci_high": 0.04738077615243459,
900
- "sacrebleu_ci_low": 0.02919214198062126,
901
- "sacrebleu_ci_high": 0.04738077615243459
902
  },
903
  "mt_flores_101_eng_deu": {
904
- "num_of_instances": 66,
905
  "counts": [
906
- 1265,
907
- 701,
908
- 451,
909
- 286
910
  ],
911
  "totals": [
912
- 6926,
913
- 6860,
914
- 6794,
915
- 6728
916
  ],
917
  "precisions": [
918
- 0.18264510539994222,
919
- 0.1021865889212828,
920
- 0.06638210185457756,
921
- 0.0425089179548157
922
  ],
923
  "bp": 1.0,
924
- "sys_len": 6926,
925
- "ref_len": 1835,
926
- "sacrebleu": 0.08518894650805668,
927
- "score": 0.08518894650805668,
928
  "score_name": "sacrebleu",
929
- "score_ci_low": 0.0687650483762349,
930
- "score_ci_high": 0.10261212511583902,
931
- "sacrebleu_ci_low": 0.0687650483762349,
932
- "sacrebleu_ci_high": 0.10261212511583902
933
  },
934
  "mt_flores_101_eng_fra": {
935
- "num_of_instances": 66,
936
  "counts": [
937
- 1521,
938
- 1010,
939
- 718,
940
- 531
941
  ],
942
  "totals": [
943
- 6656,
944
- 6590,
945
- 6524,
946
- 6458
947
  ],
948
  "precisions": [
949
- 0.228515625,
950
- 0.15326251896813353,
951
- 0.11005518087063151,
952
- 0.08222359863734903
953
  ],
954
  "bp": 1.0,
955
- "sys_len": 6656,
956
- "ref_len": 2068,
957
- "sacrebleu": 0.13342576700837747,
958
- "score": 0.13342576700837747,
959
  "score_name": "sacrebleu",
960
- "score_ci_low": 0.11254831150471772,
961
- "score_ci_high": 0.15689913123914057,
962
- "sacrebleu_ci_low": 0.11254831150471772,
963
- "sacrebleu_ci_high": 0.15689913123914057
964
  },
965
  "mt_flores_101_eng_kor": {
966
- "num_of_instances": 66,
967
  "counts": [
968
- 1418,
969
- 652,
970
- 346,
971
- 201
972
  ],
973
  "totals": [
974
- 9112,
975
- 9046,
976
- 8980,
977
- 8914
978
  ],
979
  "precisions": [
980
- 0.15561896400351186,
981
- 0.07207605571523326,
982
- 0.038530066815144766,
983
- 0.022548799641014135
984
  ],
985
  "bp": 1.0,
986
- "sys_len": 9112,
987
- "ref_len": 2235,
988
- "sacrebleu": 0.05587199521216053,
989
- "score": 0.05587199521216053,
990
  "score_name": "sacrebleu",
991
- "score_ci_low": 0.048601675175909714,
992
- "score_ci_high": 0.06817084289821868,
993
- "sacrebleu_ci_low": 0.048601675175909714,
994
- "sacrebleu_ci_high": 0.06817084289821868
995
  },
996
  "mt_flores_101_eng_por": {
997
- "num_of_instances": 66,
998
  "counts": [
999
- 1469,
1000
- 961,
1001
- 685,
1002
- 498
1003
  ],
1004
  "totals": [
1005
- 5975,
1006
- 5909,
1007
- 5843,
1008
- 5777
1009
  ],
1010
  "precisions": [
1011
- 0.24585774058577406,
1012
- 0.16263327128109661,
1013
- 0.11723429744994011,
1014
- 0.08620391206508568
1015
  ],
1016
  "bp": 1.0,
1017
- "sys_len": 5975,
1018
- "ref_len": 1916,
1019
- "sacrebleu": 0.141781228818647,
1020
- "score": 0.141781228818647,
1021
  "score_name": "sacrebleu",
1022
- "score_ci_low": 0.11599286268316018,
1023
- "score_ci_high": 0.16818798188781423,
1024
- "sacrebleu_ci_low": 0.11599286268316018,
1025
- "sacrebleu_ci_high": 0.16818798188781423
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
- "num_of_instances": 66,
1029
  "counts": [
1030
- 1355,
1031
- 840,
1032
- 572,
1033
- 378
1034
  ],
1035
  "totals": [
1036
- 5226,
1037
- 5160,
1038
- 5094,
1039
- 5028
1040
  ],
1041
  "precisions": [
1042
- 0.2592805204745503,
1043
- 0.16279069767441862,
1044
- 0.11228896741264233,
1045
- 0.07517899761336516
1046
  ],
1047
- "bp": 1.0,
1048
- "sys_len": 5226,
1049
- "ref_len": 1949,
1050
- "sacrebleu": 0.13739099644279126,
1051
- "score": 0.13739099644279126,
1052
  "score_name": "sacrebleu",
1053
- "score_ci_low": 0.11645720976845897,
1054
- "score_ci_high": 0.15881392834378366,
1055
- "sacrebleu_ci_low": 0.11645720976845897,
1056
- "sacrebleu_ci_high": 0.15881392834378366
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
- "num_of_instances": 66,
1060
  "counts": [
1061
- 1403,
1062
- 733,
1063
- 444,
1064
- 271
1065
  ],
1066
  "totals": [
1067
- 6635,
1068
- 6569,
1069
- 6503,
1070
- 6437
1071
  ],
1072
  "precisions": [
1073
- 0.21145440844009045,
1074
- 0.11158471609072919,
1075
- 0.06827618022451176,
1076
- 0.04210035730930557
1077
  ],
1078
- "bp": 1.0,
1079
- "sys_len": 6635,
1080
- "ref_len": 2098,
1081
- "sacrebleu": 0.09074947340353524,
1082
- "score": 0.09074947340353524,
1083
  "score_name": "sacrebleu",
1084
- "score_ci_low": 0.07752359705897967,
1085
- "score_ci_high": 0.10595879813792543,
1086
- "sacrebleu_ci_low": 0.07752359705897967,
1087
- "sacrebleu_ci_high": 0.10595879813792543
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
- "num_of_instances": 66,
1091
  "counts": [
1092
- 1306,
1093
- 794,
1094
- 517,
1095
- 350
1096
  ],
1097
  "totals": [
1098
- 4479,
1099
- 4413,
1100
- 4347,
1101
- 4281
1102
  ],
1103
  "precisions": [
1104
- 0.2915829426211208,
1105
- 0.17992295490595966,
1106
- 0.11893259719346676,
1107
- 0.0817565989254847
1108
  ],
1109
  "bp": 1.0,
1110
- "sys_len": 4479,
1111
- "ref_len": 1734,
1112
- "sacrebleu": 0.1502858535414232,
1113
- "score": 0.1502858535414232,
1114
  "score_name": "sacrebleu",
1115
- "score_ci_low": 0.12789422451472196,
1116
- "score_ci_high": 0.17007936865047835,
1117
- "sacrebleu_ci_low": 0.12789422451472196,
1118
- "sacrebleu_ci_high": 0.17007936865047835
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
- "num_of_instances": 66,
1122
  "counts": [
1123
- 1178,
1124
- 573,
1125
- 324,
1126
- 186
1127
  ],
1128
  "totals": [
1129
- 5118,
1130
- 5052,
1131
- 4986,
1132
- 4920
1133
  ],
1134
  "precisions": [
1135
- 0.23016803438843297,
1136
- 0.11342042755344417,
1137
- 0.06498194945848375,
1138
- 0.03780487804878049
1139
  ],
1140
- "bp": 1.0,
1141
- "sys_len": 5118,
1142
- "ref_len": 1734,
1143
- "sacrebleu": 0.08948890965648465,
1144
- "score": 0.08948890965648465,
1145
  "score_name": "sacrebleu",
1146
- "score_ci_low": 0.0760587558942395,
1147
- "score_ci_high": 0.10824221229872567,
1148
- "sacrebleu_ci_low": 0.0760587558942395,
1149
- "sacrebleu_ci_high": 0.10824221229872567
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
- "num_of_instances": 66,
1153
  "counts": [
1154
- 1164,
1155
- 556,
1156
- 307,
1157
- 182
1158
  ],
1159
  "totals": [
1160
- 5014,
1161
- 4948,
1162
- 4882,
1163
- 4816
1164
  ],
1165
  "precisions": [
1166
- 0.23214998005584364,
1167
- 0.11236863379143087,
1168
- 0.06288406390823434,
1169
- 0.0377906976744186
1170
  ],
1171
  "bp": 1.0,
1172
- "sys_len": 5014,
1173
- "ref_len": 1734,
1174
- "sacrebleu": 0.08873292857251397,
1175
- "score": 0.08873292857251397,
1176
  "score_name": "sacrebleu",
1177
- "score_ci_low": 0.07691044423007594,
1178
- "score_ci_high": 0.10654488317878491,
1179
- "sacrebleu_ci_low": 0.07691044423007594,
1180
- "sacrebleu_ci_high": 0.10654488317878491
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
- "num_of_instances": 66,
1184
  "counts": [
1185
- 1333,
1186
- 846,
1187
- 580,
1188
- 415
1189
  ],
1190
  "totals": [
1191
- 5066,
1192
- 5000,
1193
- 4934,
1194
- 4868
1195
  ],
1196
  "precisions": [
1197
- 0.2631267272009475,
1198
- 0.16920000000000002,
1199
- 0.11755168220510742,
1200
- 0.0852506162695152
1201
  ],
1202
  "bp": 1.0,
1203
- "sys_len": 5066,
1204
- "ref_len": 1734,
1205
- "sacrebleu": 0.14533590675842165,
1206
- "score": 0.14533590675842165,
1207
  "score_name": "sacrebleu",
1208
- "score_ci_low": 0.12391870492274426,
1209
- "score_ci_high": 0.16749053357807966,
1210
- "sacrebleu_ci_low": 0.12391870492274426,
1211
- "sacrebleu_ci_high": 0.16749053357807966
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
- "num_of_instances": 66,
1215
  "counts": [
1216
- 1355,
1217
- 846,
1218
- 557,
1219
- 374
1220
  ],
1221
  "totals": [
1222
- 5005,
1223
- 4939,
1224
- 4873,
1225
- 4807
1226
  ],
1227
  "precisions": [
1228
- 0.2707292707292707,
1229
- 0.17128973476412232,
1230
- 0.11430330391955675,
1231
- 0.07780320366132723
1232
  ],
1233
  "bp": 1.0,
1234
- "sys_len": 5005,
1235
- "ref_len": 1734,
1236
- "sacrebleu": 0.14250519722396413,
1237
- "score": 0.14250519722396413,
1238
  "score_name": "sacrebleu",
1239
- "score_ci_low": 0.12789330310428798,
1240
- "score_ci_high": 0.16557053049469844,
1241
- "sacrebleu_ci_low": 0.12789330310428798,
1242
- "sacrebleu_ci_high": 0.16557053049469844
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
- "num_of_instances": 66,
1246
  "counts": [
1247
- 1237,
1248
- 664,
1249
- 407,
1250
- 253
1251
  ],
1252
  "totals": [
1253
- 4616,
1254
- 4550,
1255
- 4484,
1256
- 4418
1257
  ],
1258
  "precisions": [
1259
- 0.26798093587521665,
1260
- 0.14593406593406594,
1261
- 0.09076717216770741,
1262
- 0.057265731100045264
1263
  ],
1264
  "bp": 1.0,
1265
- "sys_len": 4616,
1266
- "ref_len": 1734,
1267
- "sacrebleu": 0.11940459758039601,
1268
- "score": 0.11940459758039601,
1269
  "score_name": "sacrebleu",
1270
- "score_ci_low": 0.10463458061774036,
1271
- "score_ci_high": 0.13275807402507517,
1272
- "sacrebleu_ci_low": 0.10463458061774036,
1273
- "sacrebleu_ci_high": 0.13275807402507517
1274
  },
1275
- "score": 0.11370930734646584,
1276
  "score_name": "subsets_mean",
1277
- "num_of_instances": 990
1278
  },
1279
- "score": 0.5077151191244701,
1280
  "score_name": "subsets_mean",
1281
- "num_of_instances": 12472
1282
  }
1283
  }
 
1
  {
2
  "environment_info": {
3
+ "timestamp_utc": "2025-07-02T22:12:26.883897Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
 
8
  "--model",
9
  "cross_provider",
10
  "--model_args",
11
+ "model_name=watsonx/mistralai/mistral-large,max_tokens=1024",
12
  "--output_path",
13
  "./results/bluebench",
14
  "--log_samples",
 
26
  "num_fewshots": null,
27
  "limit": null,
28
  "batch_size": 8,
29
+ "model": "watsonx/mistralai/mistral-large",
30
  "model_args": {
31
+ "max_tokens": 1024
32
  },
33
  "gen_kwargs": null,
34
  "chat_template_kwargs": null,
 
41
  "disable_hf_cache": false,
42
  "cache_dir": null
43
  },
44
+ "unitxt_version": "1.25.0",
45
+ "unitxt_commit_hash": "c8a5e77fd6ca62039b915dc10700323f50ccaacf",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
 
51
  "triton": "3.3.1",
52
  "nltk": "3.9.1",
53
  "anyio": "4.9.0",
54
+ "unitxt": "1.25.0",
55
  "absl-py": "2.3.0",
56
  "tiktoken": "0.9.0",
57
  "charset-normalizer": "3.4.2",
58
  "nvidia-cuda-runtime-cu12": "12.6.77",
59
  "sympy": "1.14.0",
60
  "mecab-ko": "1.0.1",
 
61
  "httpcore": "1.0.9",
62
+ "litellm": "1.73.6",
63
  "Jinja2": "3.1.6",
64
  "jsonschema-specifications": "2025.4.1",
65
  "pydantic_core": "2.33.2",
66
  "nvidia-cusparse-cu12": "12.5.4.2",
67
+ "tokenizers": "0.21.2",
68
  "yarl": "1.20.1",
 
69
  "portalocker": "3.2.0",
70
  "pandas": "2.3.0",
71
  "multiprocess": "0.70.16",
72
  "jsonschema": "4.24.0",
 
73
  "nvidia-nvjitlink-cu12": "12.6.85",
74
  "nvidia-cublas-cu12": "12.6.4.1",
75
  "pydantic": "2.11.7",
 
79
  "contourpy": "1.3.2",
80
  "aiosignal": "1.3.2",
81
  "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "openai": "1.93.0",
83
  "six": "1.17.0",
84
  "diskcache": "5.6.3",
85
  "tqdm": "4.67.1",
 
98
  "kiwisolver": "1.4.8",
99
  "networkx": "3.4.2",
100
  "typing-inspection": "0.4.1",
 
101
  "sniffio": "1.3.1",
102
  "scikit-learn": "1.7.0",
103
+ "rpds-py": "0.26.0",
104
  "nvidia-curand-cu12": "10.3.7.77",
105
  "pip": "25.1.1",
106
+ "pillow": "11.3.0",
107
  "fonttools": "4.58.4",
 
108
  "datasets": "3.6.0",
109
  "nvidia-cusolver-cu12": "11.7.1.2",
110
  "cycler": "0.12.1",
 
111
  "distro": "1.9.0",
112
  "idna": "3.10",
113
  "MarkupSafe": "3.0.2",
 
121
  "joblib": "1.5.1",
122
  "fsspec": "2025.3.0",
123
  "dill": "0.3.8",
 
124
  "wheel": "0.45.1",
125
  "nvidia-nvtx-cu12": "12.6.77",
126
  "nvidia-cusparselt-cu12": "0.6.3",
127
+ "lxml": "6.0.0",
128
  "propcache": "0.3.2",
129
  "numpy": "2.2.6",
130
  "mpmath": "1.3.0",
 
131
  "conllu": "6.0.0",
132
+ "huggingface-hub": "0.33.2",
133
  "safetensors": "0.5.3",
134
  "requests": "2.32.4",
135
  "regex": "2024.11.6",
136
  "aiohttp": "3.12.13",
137
  "tabulate": "0.9.0",
138
+ "accelerate": "1.8.1",
139
  "certifi": "2025.6.15",
140
+ "evaluate": "0.4.4",
141
  "nvidia-cufft-cu12": "11.3.0.4",
142
  "nvidia-cuda-nvrtc-cu12": "12.6.77",
143
  "click": "8.2.1",
144
  "typing_extensions": "4.12.2",
145
  "attrs": "25.3.0",
146
  "exceptiongroup": "1.3.0",
147
+ "transformers": "4.53.0",
148
  "tenacity": "9.1.2",
149
  "pytz": "2025.2",
150
  "aiohappyeyeballs": "2.6.1",
151
  "python-dateutil": "2.9.0.post0",
152
  "torch": "2.7.1",
153
+ "python-dotenv": "1.1.1",
154
+ "multidict": "6.6.3",
155
  "httpx": "0.28.1",
156
  "matplotlib": "3.10.3",
157
  "xxhash": "3.5.0",
158
  "PyYAML": "6.0.2",
 
159
  "colorama": "0.4.6",
 
160
  "threadpoolctl": "3.6.0",
161
  "nvidia-cudnn-cu12": "9.5.1.17",
162
+ "hf-xet": "1.1.5",
163
  "jaraco.collections": "5.1.0",
164
  "tomli": "2.0.1",
165
  "backports.tarfile": "1.2.0",
 
177
  "bias": {
178
  "safety_bbq_age": {
179
  "accuracy": 0.8888888888888888,
180
+ "accuracy_ci_low": 0.4444444444444444,
181
+ "accuracy_ci_high": 1.0,
182
  "score_name": "accuracy",
183
  "score": 0.8888888888888888,
184
+ "score_ci_high": 1.0,
185
+ "score_ci_low": 0.4444444444444444,
186
+ "num_of_instances": 9
187
  },
188
  "safety_bbq_disability_status": {
189
+ "accuracy": 1.0,
190
+ "accuracy_ci_low": 1.0,
191
  "accuracy_ci_high": 1.0,
192
  "score_name": "accuracy",
193
+ "score": 1.0,
194
  "score_ci_high": 1.0,
195
+ "score_ci_low": 1.0,
196
+ "num_of_instances": 9
197
  },
198
  "safety_bbq_gender_identity": {
199
  "accuracy": 1.0,
 
203
  "score": 1.0,
204
  "score_ci_high": 1.0,
205
  "score_ci_low": 1.0,
206
+ "num_of_instances": 9
207
  },
208
  "safety_bbq_nationality": {
209
+ "accuracy": 1.0,
210
+ "accuracy_ci_low": 1.0,
211
  "accuracy_ci_high": 1.0,
212
  "score_name": "accuracy",
213
+ "score": 1.0,
214
  "score_ci_high": 1.0,
215
+ "score_ci_low": 1.0,
216
+ "num_of_instances": 9
217
  },
218
  "safety_bbq_physical_appearance": {
219
+ "accuracy": 1.0,
220
+ "accuracy_ci_low": 1.0,
221
  "accuracy_ci_high": 1.0,
222
  "score_name": "accuracy",
223
+ "score": 1.0,
224
  "score_ci_high": 1.0,
225
+ "score_ci_low": 1.0,
226
+ "num_of_instances": 9
227
  },
228
  "safety_bbq_race_ethnicity": {
229
  "accuracy": 1.0,
 
233
  "score": 1.0,
234
  "score_ci_high": 1.0,
235
  "score_ci_low": 1.0,
236
+ "num_of_instances": 9
237
  },
238
  "safety_bbq_race_x_gender": {
239
  "accuracy": 1.0,
 
243
  "score": 1.0,
244
  "score_ci_high": 1.0,
245
  "score_ci_low": 1.0,
246
+ "num_of_instances": 9
247
  },
248
  "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.8888888888888888,
250
+ "accuracy_ci_low": 0.47716657027690984,
251
+ "accuracy_ci_high": 1.0,
252
+ "score_name": "accuracy",
253
+ "score": 0.8888888888888888,
254
+ "score_ci_high": 1.0,
255
+ "score_ci_low": 0.47716657027690984,
256
+ "num_of_instances": 9
257
+ },
258
+ "safety_bbq_religion": {
259
  "accuracy": 1.0,
260
  "accuracy_ci_low": 1.0,
261
  "accuracy_ci_high": 1.0,
 
263
  "score": 1.0,
264
  "score_ci_high": 1.0,
265
  "score_ci_low": 1.0,
266
+ "num_of_instances": 9
 
 
 
 
 
 
 
 
 
 
267
  },
268
  "safety_bbq_ses": {
269
+ "accuracy": 0.8888888888888888,
270
+ "accuracy_ci_low": 0.5555555555555556,
271
  "accuracy_ci_high": 1.0,
272
  "score_name": "accuracy",
273
+ "score": 0.8888888888888888,
274
  "score_ci_high": 1.0,
275
+ "score_ci_low": 0.5555555555555556,
276
+ "num_of_instances": 9
277
  },
278
  "safety_bbq_sexual_orientation": {
279
+ "accuracy": 1.0,
280
+ "accuracy_ci_low": 1.0,
281
+ "accuracy_ci_high": 1.0,
282
  "score_name": "accuracy",
283
+ "score": 1.0,
284
+ "score_ci_high": 1.0,
285
+ "score_ci_low": 1.0,
286
+ "num_of_instances": 9
287
  },
288
  "score": 0.9696969696969697,
289
  "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.9156626506024096,
296
+ "score": 0.9156626506024096,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
+ "score": 0.9156626506024096,
300
  "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.1764705882352941,
307
+ "f1_Organization": 1.0,
308
+ "f1_Location": 0.06060606060606061,
309
+ "f1_macro": 0.41235888294711825,
310
+ "recall_macro": 0.057367149758454104,
311
+ "precision_macro": 0.12794612794612795,
312
+ "in_classes_support": 0.35443037974683544,
313
+ "f1_micro": 0.05194805194805195,
314
+ "recall_micro": 0.05333333333333334,
315
+ "precision_micro": 0.05063291139240506,
316
+ "score": 0.05194805194805195,
317
  "score_name": "f1_micro",
318
+ "score_ci_low": 0.012803676797310442,
319
+ "score_ci_high": 0.0936247158405427,
320
+ "f1_micro_ci_low": 0.012803676797310442,
321
+ "f1_micro_ci_high": 0.0936247158405427
322
  },
323
+ "score": 0.05194805194805195,
324
  "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
+ "accuracy": 0.7142857142857143,
330
+ "accuracy_ci_low": 0.2857142857142857,
331
+ "accuracy_ci_high": 1.0,
332
  "score_name": "accuracy",
333
+ "score": 0.7142857142857143,
334
+ "score_ci_high": 1.0,
335
+ "score_ci_low": 0.2857142857142857,
336
+ "num_of_instances": 7
337
  },
338
  "mmlu_pro_business": {
339
+ "accuracy": 0.5714285714285714,
340
+ "accuracy_ci_low": 0.14285714285714285,
341
+ "accuracy_ci_high": 0.8571428571428571,
342
  "score_name": "accuracy",
343
+ "score": 0.5714285714285714,
344
+ "score_ci_high": 0.8571428571428571,
345
+ "score_ci_low": 0.14285714285714285,
346
+ "num_of_instances": 7
347
  },
348
  "mmlu_pro_chemistry": {
349
+ "accuracy": 0.14285714285714285,
350
+ "accuracy_ci_low": 0.0,
351
+ "accuracy_ci_high": 0.5714285714285714,
352
  "score_name": "accuracy",
353
+ "score": 0.14285714285714285,
354
+ "score_ci_high": 0.5714285714285714,
355
+ "score_ci_low": 0.0,
356
+ "num_of_instances": 7
357
  },
358
  "mmlu_pro_computer_science": {
359
+ "accuracy": 1.0,
360
+ "accuracy_ci_low": 1.0,
361
+ "accuracy_ci_high": 1.0,
362
  "score_name": "accuracy",
363
+ "score": 1.0,
364
+ "score_ci_high": 1.0,
365
+ "score_ci_low": 1.0,
366
+ "num_of_instances": 7
367
  },
368
  "mmlu_pro_economics": {
369
+ "accuracy": 0.7142857142857143,
370
+ "accuracy_ci_low": 0.2857142857142857,
371
+ "accuracy_ci_high": 1.0,
372
  "score_name": "accuracy",
373
+ "score": 0.7142857142857143,
374
+ "score_ci_high": 1.0,
375
+ "score_ci_low": 0.2857142857142857,
376
+ "num_of_instances": 7
377
  },
378
  "mmlu_pro_engineering": {
379
+ "accuracy": 0.2857142857142857,
380
+ "accuracy_ci_low": 0.0,
381
+ "accuracy_ci_high": 0.7142857142857143,
382
  "score_name": "accuracy",
383
+ "score": 0.2857142857142857,
384
+ "score_ci_high": 0.7142857142857143,
385
+ "score_ci_low": 0.0,
386
+ "num_of_instances": 7
387
  },
388
  "mmlu_pro_health": {
389
+ "accuracy": 0.5714285714285714,
390
+ "accuracy_ci_low": 0.14285714285714285,
391
+ "accuracy_ci_high": 0.8571428571428571,
392
  "score_name": "accuracy",
393
+ "score": 0.5714285714285714,
394
+ "score_ci_high": 0.8571428571428571,
395
+ "score_ci_low": 0.14285714285714285,
396
+ "num_of_instances": 7
397
  },
398
  "mmlu_pro_history": {
399
+ "accuracy": 0.2857142857142857,
400
+ "accuracy_ci_low": 0.0,
401
+ "accuracy_ci_high": 0.7142857142857143,
402
  "score_name": "accuracy",
403
+ "score": 0.2857142857142857,
404
+ "score_ci_high": 0.7142857142857143,
405
+ "score_ci_low": 0.0,
406
+ "num_of_instances": 7
407
  },
408
  "mmlu_pro_law": {
409
+ "accuracy": 0.42857142857142855,
410
+ "accuracy_ci_low": 0.14285714285714285,
411
+ "accuracy_ci_high": 0.8571428571428571,
412
  "score_name": "accuracy",
413
+ "score": 0.42857142857142855,
414
+ "score_ci_high": 0.8571428571428571,
415
+ "score_ci_low": 0.14285714285714285,
416
+ "num_of_instances": 7
417
  },
418
  "mmlu_pro_math": {
419
+ "accuracy": 0.8571428571428571,
420
+ "accuracy_ci_low": 0.42857142857142855,
421
+ "accuracy_ci_high": 1.0,
422
  "score_name": "accuracy",
423
+ "score": 0.8571428571428571,
424
+ "score_ci_high": 1.0,
425
+ "score_ci_low": 0.42857142857142855,
426
+ "num_of_instances": 7
427
  },
428
  "mmlu_pro_other": {
429
+ "accuracy": 0.7142857142857143,
430
+ "accuracy_ci_low": 0.2857142857142857,
431
+ "accuracy_ci_high": 1.0,
432
  "score_name": "accuracy",
433
+ "score": 0.7142857142857143,
434
+ "score_ci_high": 1.0,
435
+ "score_ci_low": 0.2857142857142857,
436
+ "num_of_instances": 7
437
  },
438
  "mmlu_pro_philosophy": {
439
+ "accuracy": 0.7142857142857143,
440
+ "accuracy_ci_low": 0.2857142857142857,
441
+ "accuracy_ci_high": 1.0,
442
  "score_name": "accuracy",
443
+ "score": 0.7142857142857143,
444
+ "score_ci_high": 1.0,
445
+ "score_ci_low": 0.2857142857142857,
446
+ "num_of_instances": 7
447
  },
448
  "mmlu_pro_physics": {
449
+ "accuracy": 0.42857142857142855,
450
+ "accuracy_ci_low": 0.14285714285714285,
451
+ "accuracy_ci_high": 0.8571428571428571,
452
  "score_name": "accuracy",
453
+ "score": 0.42857142857142855,
454
+ "score_ci_high": 0.8571428571428571,
455
+ "score_ci_low": 0.14285714285714285,
456
+ "num_of_instances": 7
457
  },
458
  "mmlu_pro_psychology": {
459
+ "accuracy": 0.7142857142857143,
460
+ "accuracy_ci_low": 0.2857142857142857,
461
+ "accuracy_ci_high": 1.0,
462
  "score_name": "accuracy",
463
+ "score": 0.7142857142857143,
464
+ "score_ci_high": 1.0,
465
+ "score_ci_low": 0.2857142857142857,
466
+ "num_of_instances": 7
467
  },
468
+ "score": 0.5816326530612245,
469
  "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
+ "f1_macro": 0.28095238095238095,
475
+ "f1_suggestive": 0.3333333333333333,
476
+ "f1_generic": 0.0,
477
+ "f1_fanciful": 0.0,
 
478
  "f1_descriptive": 0.5714285714285714,
479
+ "f1_arbitrary": 0.5,
480
+ "f1_macro_ci_low": 0.13333333333333333,
481
+ "f1_macro_ci_high": 0.5745612934916534,
482
  "score_name": "f1_micro",
483
+ "score": 0.3333333333333333,
484
+ "score_ci_high": 0.5951377864663647,
485
+ "score_ci_low": 0.125,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.25,
488
+ "accuracy_ci_low": 0.1,
489
+ "accuracy_ci_high": 0.5,
490
+ "f1_micro": 0.3333333333333333,
491
+ "f1_micro_ci_low": 0.125,
492
+ "f1_micro_ci_high": 0.5951377864663647
493
  },
494
  "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.2631578947368421,
496
+ "f1_no": 0.5263157894736842,
497
+ "f1_yes": 0.0,
498
+ "f1_macro_ci_low": 0.10466984036978784,
499
+ "f1_macro_ci_high": 0.391304347826087,
500
  "score_name": "f1_micro",
501
+ "score": 0.4,
502
+ "score_ci_high": 0.6666666666666666,
503
+ "score_ci_low": 0.18181818181818182,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.25,
506
+ "accuracy_ci_low": 0.1,
507
+ "accuracy_ci_high": 0.5,
508
+ "f1_micro": 0.4,
509
+ "f1_micro_ci_low": 0.18181818181818182,
510
+ "f1_micro_ci_high": 0.6666666666666666
511
  },
512
  "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.21428571428571427,
514
  "f1_conclusion": 0.0,
515
+ "f1_decree": 0.0,
516
+ "f1_issue": 0.3333333333333333,
517
+ "f1_analysis": 0.6666666666666666,
518
+ "f1_facts": 0.0,
519
+ "f1_procedural history": 0.5,
520
+ "f1_rule": 0.0,
521
+ "f1_macro_ci_low": 0.06666666666666667,
522
+ "f1_macro_ci_high": 0.4444444444444444,
523
  "score_name": "f1_micro",
524
+ "score": 0.2222222222222222,
525
+ "score_ci_high": 0.4666666666666667,
526
+ "score_ci_low": 0.0,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.15,
529
+ "accuracy_ci_low": 0.05,
530
+ "accuracy_ci_high": 0.35,
531
+ "f1_micro": 0.2222222222222222,
532
+ "f1_micro_ci_low": 0.0,
533
+ "f1_micro_ci_high": 0.4666666666666667
534
  },
535
  "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.39285714285714285,
537
+ "f1_yes": 0.5,
538
+ "f1_no": 0.2857142857142857,
539
+ "f1_macro_ci_low": 0.17424242424242425,
540
+ "f1_macro_ci_high": 0.6514486967849482,
541
  "score_name": "f1_micro",
542
+ "score": 0.38461538461538464,
543
+ "score_ci_high": 0.625,
544
+ "score_ci_low": 0.16666666666666666,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.25,
547
+ "accuracy_ci_low": 0.1,
548
+ "accuracy_ci_high": 0.5,
549
+ "f1_micro": 0.38461538461538464,
550
+ "f1_micro_ci_low": 0.16666666666666666,
551
+ "f1_micro_ci_high": 0.625
552
  },
553
  "legalbench_proa": {
554
+ "f1_macro": 0.728744939271255,
555
+ "f1_yes": 0.6153846153846154,
556
+ "f1_no": 0.8421052631578947,
557
+ "f1_macro_ci_low": 0.4957866010638887,
558
+ "f1_macro_ci_high": 0.875,
559
  "score_name": "f1_micro",
560
+ "score": 0.75,
561
+ "score_ci_high": 0.8571428571428571,
562
+ "score_ci_low": 0.5185185185185185,
563
+ "num_of_instances": 20,
564
+ "accuracy": 0.6,
565
+ "accuracy_ci_low": 0.35,
566
+ "accuracy_ci_high": 0.75,
567
+ "f1_micro": 0.75,
568
+ "f1_micro_ci_low": 0.5185185185185185,
569
+ "f1_micro_ci_high": 0.8571428571428571
570
  },
571
+ "score": 0.418034188034188,
572
  "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
+ "f1_macro": 0.2970707070707071,
578
+ "f1_cars": 0.3333333333333333,
579
+ "f1_motorcycles": 0.4,
580
+ "f1_windows x": 0.0,
581
+ "f1_atheism": 0.0,
582
+ "f1_religion": 0.2857142857142857,
583
+ "f1_medicine": 0.8571428571428571,
584
+ "f1_christianity": 0.4,
585
+ "f1_computer graphics": 0.6666666666666666,
586
+ "f1_microsoft windows": 0.0,
587
+ "f1_middle east": 0.2857142857142857,
588
+ "f1_pc hardware": 0.36363636363636365,
589
+ "f1_mac hardware": 0.2857142857142857,
590
+ "f1_for sale": 0.0,
591
+ "f1_guns": 0.0,
592
+ "f1_space": 0.3333333333333333,
593
+ "f1_cryptography": 0.0,
594
+ "f1_electronics": 0.6666666666666666,
595
+ "f1_baseball": 0.2857142857142857,
596
+ "f1_hockey": 0.3333333333333333,
597
+ "f1_politics": 0.4444444444444444,
598
+ "f1_macro_ci_low": 0.23770175460525397,
599
+ "f1_macro_ci_high": 0.42938825894209254,
600
  "score_name": "f1_micro",
601
+ "score": 0.3308270676691729,
602
+ "score_ci_high": 0.4412922379526243,
603
+ "score_ci_low": 0.23505462006400799,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.22,
606
+ "accuracy_ci_low": 0.15,
607
+ "accuracy_ci_high": 0.31,
608
+ "f1_micro": 0.3308270676691729,
609
+ "f1_micro_ci_low": 0.23505462006400799,
610
+ "f1_micro_ci_high": 0.4412922379526243
611
  },
612
+ "score": 0.3308270676691729,
613
  "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
+ "f1_macro": 0.5521822809958403,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.7627118644067796,
620
+ "f1_credit card or prepaid card": 0.5,
621
+ "f1_money transfer or virtual currency or money service": 0.5,
622
+ "f1_mortgage": 0.0,
623
+ "f1_debt collection": 0.6666666666666666,
624
+ "f1_checking or savings account": 0.7692307692307693,
625
+ "f1_payday loan or title loan or personal loan": 0.6666666666666666,
626
+ "f1_macro_ci_low": 0.3599760847493135,
627
+ "f1_macro_ci_high": 0.7238348271220593,
 
 
628
  "score_name": "f1_micro",
629
+ "score": 0.7134502923976608,
630
+ "score_ci_high": 0.7943113493262172,
631
+ "score_ci_low": 0.622594308461465,
632
+ "num_of_instances": 100,
633
+ "accuracy": 0.61,
634
+ "accuracy_ci_low": 0.52,
635
+ "accuracy_ci_high": 0.71,
636
+ "f1_micro": 0.7134502923976608,
637
+ "f1_micro_ci_low": 0.622594308461465,
638
+ "f1_micro_ci_high": 0.7943113493262172
639
  },
640
  "cfpb_product_watsonx": {
641
+ "f1_macro": 0.7025040319157967,
642
+ "f1_mortgages and loans": 0.5555555555555556,
643
+ "f1_credit card": 0.7619047619047619,
644
+ "f1_debt collection": 0.7058823529411765,
645
+ "f1_credit reporting": 0.7619047619047619,
646
+ "f1_retail banking": 0.7272727272727273,
647
+ "f1_macro_ci_low": 0.5612643049143822,
648
+ "f1_macro_ci_high": 0.8387759021003454,
649
  "score_name": "f1_micro",
650
+ "score": 0.7045454545454546,
651
+ "score_ci_high": 0.8222222222222222,
652
+ "score_ci_low": 0.5581395348837209,
653
+ "num_of_instances": 50,
654
+ "accuracy": 0.62,
655
+ "accuracy_ci_low": 0.48,
656
+ "accuracy_ci_high": 0.76,
657
+ "f1_micro": 0.7045454545454546,
658
+ "f1_micro_ci_low": 0.5581395348837209,
659
+ "f1_micro_ci_high": 0.8222222222222222
660
  },
661
+ "score": 0.7089978734715576,
662
  "score_name": "subsets_mean",
663
+ "num_of_instances": 150
664
  },
665
  "qa_finance": {
666
  "fin_qa": {
667
+ "num_of_instances": 100,
668
+ "program_accuracy": 0.22,
669
+ "score": 0.22,
670
  "score_name": "program_accuracy",
671
+ "execution_accuracy": 0.22,
672
+ "program_accuracy_ci_low": 0.14,
673
+ "program_accuracy_ci_high": 0.31,
674
+ "score_ci_low": 0.14,
675
+ "score_ci_high": 0.31,
676
+ "execution_accuracy_ci_low": 0.15,
677
+ "execution_accuracy_ci_high": 0.32
678
  },
679
+ "score": 0.22,
680
  "score_name": "subsets_mean",
681
+ "num_of_instances": 100
682
  },
683
  "rag_general": {
684
  "rag_response_generation_clapnq": {
685
+ "precision": 0.5085272156185048,
686
+ "recall": 0.66080916917758,
687
+ "f1": 0.5287389555044663,
688
+ "precision_ci_low": 0.47382989299216594,
689
+ "precision_ci_high": 0.5466238200685761,
690
+ "recall_ci_low": 0.6232438959613603,
691
+ "recall_ci_high": 0.7017592032617559,
692
+ "f1_ci_low": 0.5012059859293033,
693
+ "f1_ci_high": 0.5631021478879368,
694
  "score_name": "f1",
695
+ "score": 0.5287389555044663,
696
+ "score_ci_high": 0.5631021478879368,
697
+ "score_ci_low": 0.5012059859293033,
698
+ "num_of_instances": 100,
699
+ "correctness_f1_bert_score.deberta_large_mnli": 0.7071846815943718,
700
+ "correctness_recall_bert_score.deberta_large_mnli": 0.7436953192949295,
701
+ "correctness_precision_bert_score.deberta_large_mnli": 0.6864294826984405,
702
+ "faithfullness_f1_token_overlap": 0.43295540530097154,
703
+ "faithfullness_recall_token_overlap": 0.3454106244145275,
704
+ "faithfullness_precision_token_overlap": 0.7364704084394016,
705
+ "correctness_f1_token_overlap": 0.5287389555044663,
706
+ "correctness_recall_token_overlap": 0.66080916917758,
707
+ "correctness_precision_token_overlap": 0.5085272156185048
708
  },
709
+ "score": 0.5287389555044663,
710
  "score_name": "subsets_mean",
711
+ "num_of_instances": 100
712
  },
713
  "reasoning": {
714
  "hellaswag": {
715
+ "accuracy": 0.57,
716
+ "accuracy_ci_low": 0.47,
717
+ "accuracy_ci_high": 0.66,
718
  "score_name": "accuracy",
719
+ "score": 0.57,
720
+ "score_ci_high": 0.66,
721
+ "score_ci_low": 0.47,
722
+ "num_of_instances": 100
723
  },
724
  "openbook_qa": {
725
+ "accuracy": 0.94,
726
+ "accuracy_ci_low": 0.88,
727
+ "accuracy_ci_high": 0.97,
728
  "score_name": "accuracy",
729
+ "score": 0.94,
730
+ "score_ci_high": 0.97,
731
+ "score_ci_low": 0.88,
732
+ "num_of_instances": 100
733
  },
734
+ "score": 0.7549999999999999,
735
  "score_name": "subsets_mean",
736
+ "num_of_instances": 200
737
  },
738
  "safety": {
739
  "attaq_500": {
740
+ "severity_critical": 0.0,
741
+ "severity_high": 1.0,
742
+ "severity_medium": 1.0,
743
+ "severity_low": 98.0,
744
+ "category_harmful_info": 0.84784276975644,
745
+ "category_deception": 0.9021891548176005,
746
+ "category_explicit_content": 0.8979561521549418,
747
+ "category_violence": 0.8355905269228276,
748
+ "category_discrimination": 0.9000429752327148,
749
+ "category_substance_abuse": 0.9063073645035425,
750
+ "category_pii": 0.8448789252175225,
751
+ "safety": 0.8615333879697654,
752
+ "safety_ci_low": 0.83436617059804,
753
+ "safety_ci_high": 0.8854245019985251,
754
  "score_name": "safety",
755
+ "score": 0.8615333879697654,
756
+ "score_ci_high": 0.8854245019985251,
757
+ "score_ci_low": 0.83436617059804,
758
  "num_of_instances": 100
759
  },
760
+ "score": 0.8615333879697654,
761
  "score_name": "subsets_mean",
762
  "num_of_instances": 100
763
  },
764
  "summarization": {
765
  "billsum_document_filtered_to_6000_chars": {
766
+ "num_of_instances": 100,
767
+ "rouge1": 0.4040204017718249,
768
+ "rougeLsum": 0.35597114618020226,
769
+ "rougeL": 0.28083246354589236,
770
+ "score": 0.28083246354589236,
771
  "score_name": "rougeL",
772
+ "rouge2": 0.19424207857553685,
773
+ "rouge1_ci_low": 0.3804668414357713,
774
+ "rouge1_ci_high": 0.42440603302194135,
775
+ "rougeLsum_ci_low": 0.3330776446904628,
776
+ "rougeLsum_ci_high": 0.37423146520039924,
777
+ "rougeL_ci_low": 0.2631846690415355,
778
+ "rougeL_ci_high": 0.297661597023444,
779
+ "score_ci_low": 0.2631846690415355,
780
+ "score_ci_high": 0.297661597023444,
781
+ "rouge2_ci_low": 0.17875398333480141,
782
+ "rouge2_ci_high": 0.21045035388946662
 
 
783
  },
784
  "tldr_document_filtered_to_6000_chars": {
785
+ "num_of_instances": 100,
786
+ "rouge1": 0.11279720972972024,
787
+ "rougeLsum": 0.09286793656213359,
788
+ "rougeL": 0.08401612353653809,
789
+ "score": 0.08401612353653809,
790
  "score_name": "rougeL",
791
+ "rouge2": 0.015958528620851967,
792
+ "rouge1_ci_low": 0.09711088673606806,
793
+ "rouge1_ci_high": 0.129107538361123,
794
+ "rougeLsum_ci_low": 0.08064073960403449,
795
+ "rougeLsum_ci_high": 0.10547935967372425,
796
+ "rougeL_ci_low": 0.07274911157944615,
797
+ "rougeL_ci_high": 0.09426779388878173,
798
+ "score_ci_low": 0.07274911157944615,
799
+ "score_ci_high": 0.09426779388878173,
800
+ "rouge2_ci_low": 0.010760935565448007,
801
+ "rouge2_ci_high": 0.022872221463206353
 
 
802
  },
803
+ "score": 0.18242429354121523,
804
  "score_name": "subsets_mean",
805
+ "num_of_instances": 200
806
  },
807
  "translation": {
808
  "mt_flores_101_ara_eng": {
809
+ "num_of_instances": 6,
810
  "counts": [
811
+ 155,
812
+ 113,
813
+ 83,
814
+ 64
815
  ],
816
  "totals": [
817
+ 210,
818
+ 204,
819
+ 198,
820
+ 192
821
  ],
822
  "precisions": [
823
+ 0.7380952380952381,
824
+ 0.553921568627451,
825
+ 0.41919191919191917,
826
+ 0.33333333333333337
827
  ],
828
  "bp": 1.0,
829
+ "sys_len": 210,
830
+ "ref_len": 208,
831
+ "sacrebleu": 0.4888921699740167,
832
+ "score": 0.4888921699740167,
833
  "score_name": "sacrebleu",
834
+ "score_ci_low": 0.3798109415174125,
835
+ "score_ci_high": 0.6407268377282389,
836
+ "sacrebleu_ci_low": 0.3798109415174125,
837
+ "sacrebleu_ci_high": 0.6407268377282389
838
  },
839
  "mt_flores_101_deu_eng": {
840
+ "num_of_instances": 6,
841
  "counts": [
842
+ 135,
843
+ 74,
844
+ 41,
845
+ 28
846
  ],
847
  "totals": [
848
+ 214,
849
+ 208,
850
+ 202,
851
+ 196
852
  ],
853
  "precisions": [
854
+ 0.6308411214953271,
855
+ 0.3557692307692308,
856
+ 0.20297029702970296,
857
+ 0.14285714285714288
858
  ],
859
  "bp": 1.0,
860
+ "sys_len": 214,
861
+ "ref_len": 208,
862
+ "sacrebleu": 0.2840244364432591,
863
+ "score": 0.2840244364432591,
864
  "score_name": "sacrebleu",
865
+ "score_ci_low": 0.1611750157450563,
866
+ "score_ci_high": 0.38205893926378137,
867
+ "sacrebleu_ci_low": 0.1611750157450563,
868
+ "sacrebleu_ci_high": 0.38205893926378137
869
  },
870
  "mt_flores_101_eng_ara": {
871
+ "num_of_instances": 6,
872
  "counts": [
873
+ 126,
874
+ 75,
875
+ 48,
876
+ 28
877
  ],
878
  "totals": [
879
+ 283,
880
+ 277,
881
+ 271,
882
+ 265
883
  ],
884
  "precisions": [
885
+ 0.44522968197879853,
886
+ 0.27075812274368233,
887
+ 0.1771217712177122,
888
+ 0.10566037735849056
889
  ],
890
  "bp": 1.0,
891
+ "sys_len": 283,
892
+ "ref_len": 209,
893
+ "sacrebleu": 0.21794024107489876,
894
+ "score": 0.21794024107489876,
895
  "score_name": "sacrebleu",
896
+ "score_ci_low": 0.09217992990730309,
897
+ "score_ci_high": 0.3421677301380105,
898
+ "sacrebleu_ci_low": 0.09217992990730309,
899
+ "sacrebleu_ci_high": 0.3421677301380105
900
  },
901
  "mt_flores_101_eng_deu": {
902
+ "num_of_instances": 6,
903
  "counts": [
904
+ 151,
905
+ 96,
906
+ 63,
907
+ 44
908
  ],
909
  "totals": [
910
+ 225,
911
+ 219,
912
+ 213,
913
+ 207
914
  ],
915
  "precisions": [
916
+ 0.6711111111111111,
917
+ 0.4383561643835616,
918
+ 0.29577464788732394,
919
+ 0.21256038647342992
920
  ],
921
  "bp": 1.0,
922
+ "sys_len": 225,
923
+ "ref_len": 216,
924
+ "sacrebleu": 0.3687790201913159,
925
+ "score": 0.3687790201913159,
926
  "score_name": "sacrebleu",
927
+ "score_ci_low": 0.2414136580858641,
928
+ "score_ci_high": 0.4964998309794481,
929
+ "sacrebleu_ci_low": 0.2414136580858641,
930
+ "sacrebleu_ci_high": 0.4964998309794481
931
  },
932
  "mt_flores_101_eng_fra": {
933
+ "num_of_instances": 6,
934
  "counts": [
935
+ 182,
936
+ 140,
937
+ 108,
938
+ 86
939
  ],
940
  "totals": [
941
+ 238,
942
+ 232,
943
+ 226,
944
+ 220
945
  ],
946
  "precisions": [
947
+ 0.7647058823529411,
948
+ 0.603448275862069,
949
+ 0.4778761061946903,
950
+ 0.39090909090909093
951
  ],
952
  "bp": 1.0,
953
+ "sys_len": 238,
954
+ "ref_len": 235,
955
+ "sacrebleu": 0.5418528876013806,
956
+ "score": 0.5418528876013806,
957
  "score_name": "sacrebleu",
958
+ "score_ci_low": 0.39701762116131134,
959
+ "score_ci_high": 0.6650077786342699,
960
+ "sacrebleu_ci_low": 0.39701762116131134,
961
+ "sacrebleu_ci_high": 0.6650077786342699
962
  },
963
  "mt_flores_101_eng_kor": {
964
+ "num_of_instances": 6,
965
  "counts": [
966
+ 168,
967
+ 100,
968
+ 67,
969
+ 44
970
  ],
971
  "totals": [
972
+ 263,
973
+ 257,
974
+ 251,
975
+ 245
976
  ],
977
  "precisions": [
978
+ 0.6387832699619772,
979
+ 0.3891050583657587,
980
+ 0.26693227091633465,
981
+ 0.17959183673469387
982
  ],
983
  "bp": 1.0,
984
+ "sys_len": 263,
985
+ "ref_len": 249,
986
+ "sacrebleu": 0.33039009651517476,
987
+ "score": 0.33039009651517476,
988
  "score_name": "sacrebleu",
989
+ "score_ci_low": 0.27101151031230714,
990
+ "score_ci_high": 0.36483020674748473,
991
+ "sacrebleu_ci_low": 0.27101151031230714,
992
+ "sacrebleu_ci_high": 0.36483020674748473
993
  },
994
  "mt_flores_101_eng_por": {
995
+ "num_of_instances": 6,
996
  "counts": [
997
+ 176,
998
+ 130,
999
+ 104,
1000
+ 85
1001
  ],
1002
  "totals": [
1003
+ 224,
1004
+ 218,
1005
+ 212,
1006
+ 206
1007
  ],
1008
  "precisions": [
1009
+ 0.7857142857142857,
1010
+ 0.5963302752293578,
1011
+ 0.49056603773584906,
1012
+ 0.41262135922330095
1013
  ],
1014
  "bp": 1.0,
1015
+ "sys_len": 224,
1016
+ "ref_len": 222,
1017
+ "sacrebleu": 0.5549452941211316,
1018
+ "score": 0.5549452941211316,
1019
  "score_name": "sacrebleu",
1020
+ "score_ci_low": 0.5080385803050895,
1021
+ "score_ci_high": 0.6056385535160684,
1022
+ "sacrebleu_ci_low": 0.5080385803050895,
1023
+ "sacrebleu_ci_high": 0.6056385535160684
1024
  },
1025
  "mt_flores_101_eng_ron": {
1026
+ "num_of_instances": 6,
1027
  "counts": [
1028
+ 161,
1029
+ 112,
1030
+ 80,
1031
+ 61
1032
  ],
1033
  "totals": [
1034
+ 226,
1035
+ 220,
1036
+ 214,
1037
+ 208
1038
  ],
1039
  "precisions": [
1040
+ 0.7123893805309734,
1041
+ 0.509090909090909,
1042
+ 0.37383177570093457,
1043
+ 0.2932692307692308
1044
  ],
1045
+ "bp": 0.9824565942999044,
1046
+ "sys_len": 226,
1047
+ "ref_len": 230,
1048
+ "sacrebleu": 0.43870970830172057,
1049
+ "score": 0.43870970830172057,
1050
  "score_name": "sacrebleu",
1051
+ "score_ci_low": 0.33194220485772075,
1052
+ "score_ci_high": 0.58132339989516,
1053
+ "sacrebleu_ci_low": 0.33194220485772075,
1054
+ "sacrebleu_ci_high": 0.58132339989516
1055
  },
1056
  "mt_flores_101_eng_spa": {
1057
+ "num_of_instances": 6,
1058
  "counts": [
1059
+ 162,
1060
+ 96,
1061
+ 59,
1062
+ 38
1063
  ],
1064
  "totals": [
1065
+ 232,
1066
+ 226,
1067
+ 220,
1068
+ 214
1069
  ],
1070
  "precisions": [
1071
+ 0.6982758620689655,
1072
+ 0.4247787610619469,
1073
+ 0.2681818181818182,
1074
+ 0.17757009345794394
1075
  ],
1076
+ "bp": 0.9536926844755759,
1077
+ "sys_len": 232,
1078
+ "ref_len": 243,
1079
+ "sacrebleu": 0.3287802305742193,
1080
+ "score": 0.3287802305742193,
1081
  "score_name": "sacrebleu",
1082
+ "score_ci_low": 0.26331151782962714,
1083
+ "score_ci_high": 0.41514044654110027,
1084
+ "sacrebleu_ci_low": 0.26331151782962714,
1085
+ "sacrebleu_ci_high": 0.41514044654110027
1086
  },
1087
  "mt_flores_101_fra_eng": {
1088
+ "num_of_instances": 6,
1089
  "counts": [
1090
+ 160,
1091
+ 111,
1092
+ 76,
1093
+ 52
1094
  ],
1095
  "totals": [
1096
+ 235,
1097
+ 229,
1098
+ 223,
1099
+ 217
1100
  ],
1101
  "precisions": [
1102
+ 0.6808510638297872,
1103
+ 0.4847161572052402,
1104
+ 0.3408071748878924,
1105
+ 0.23963133640552997
1106
  ],
1107
  "bp": 1.0,
1108
+ "sys_len": 235,
1109
+ "ref_len": 208,
1110
+ "sacrebleu": 0.4051799896055395,
1111
+ "score": 0.4051799896055395,
1112
  "score_name": "sacrebleu",
1113
+ "score_ci_low": 0.30963965039484914,
1114
+ "score_ci_high": 0.48975983948164015,
1115
+ "sacrebleu_ci_low": 0.30963965039484914,
1116
+ "sacrebleu_ci_high": 0.48975983948164015
1117
  },
1118
  "mt_flores_101_jpn_eng": {
1119
+ "num_of_instances": 6,
1120
  "counts": [
1121
+ 130,
1122
+ 78,
1123
+ 51,
1124
+ 35
1125
  ],
1126
  "totals": [
1127
+ 201,
1128
+ 195,
1129
+ 189,
1130
+ 183
1131
  ],
1132
  "precisions": [
1133
+ 0.6467661691542288,
1134
+ 0.4,
1135
+ 0.2698412698412698,
1136
+ 0.1912568306010929
1137
  ],
1138
+ "bp": 0.9657735711441044,
1139
+ "sys_len": 201,
1140
+ "ref_len": 208,
1141
+ "sacrebleu": 0.32829061667550713,
1142
+ "score": 0.32829061667550713,
1143
  "score_name": "sacrebleu",
1144
+ "score_ci_low": 0.2181869562419046,
1145
+ "score_ci_high": 0.40880949111808457,
1146
+ "sacrebleu_ci_low": 0.2181869562419046,
1147
+ "sacrebleu_ci_high": 0.40880949111808457
1148
  },
1149
  "mt_flores_101_kor_eng": {
1150
+ "num_of_instances": 6,
1151
  "counts": [
1152
+ 138,
1153
+ 78,
1154
+ 51,
1155
+ 36
1156
  ],
1157
  "totals": [
1158
+ 235,
1159
+ 229,
1160
+ 223,
1161
+ 217
1162
  ],
1163
  "precisions": [
1164
+ 0.5872340425531914,
1165
+ 0.3406113537117904,
1166
+ 0.22869955156950675,
1167
+ 0.16589861751152074
1168
  ],
1169
  "bp": 1.0,
1170
+ "sys_len": 235,
1171
+ "ref_len": 208,
1172
+ "sacrebleu": 0.2951512359070574,
1173
+ "score": 0.2951512359070574,
1174
  "score_name": "sacrebleu",
1175
+ "score_ci_low": 0.1912990542612514,
1176
+ "score_ci_high": 0.4880380677303778,
1177
+ "sacrebleu_ci_low": 0.1912990542612514,
1178
+ "sacrebleu_ci_high": 0.4880380677303778
1179
  },
1180
  "mt_flores_101_por_eng": {
1181
+ "num_of_instances": 6,
1182
  "counts": [
1183
+ 170,
1184
+ 124,
1185
+ 91,
1186
+ 69
1187
  ],
1188
  "totals": [
1189
+ 220,
1190
+ 214,
1191
+ 208,
1192
+ 202
1193
  ],
1194
  "precisions": [
1195
+ 0.7727272727272727,
1196
+ 0.5794392523364487,
1197
+ 0.4375,
1198
+ 0.3415841584158416
1199
  ],
1200
  "bp": 1.0,
1201
+ "sys_len": 220,
1202
+ "ref_len": 208,
1203
+ "sacrebleu": 0.5086013197257839,
1204
+ "score": 0.5086013197257839,
1205
  "score_name": "sacrebleu",
1206
+ "score_ci_low": 0.3866475374245657,
1207
+ "score_ci_high": 0.627503873315733,
1208
+ "sacrebleu_ci_low": 0.3866475374245657,
1209
+ "sacrebleu_ci_high": 0.627503873315733
1210
  },
1211
  "mt_flores_101_ron_eng": {
1212
+ "num_of_instances": 6,
1213
  "counts": [
1214
+ 151,
1215
+ 102,
1216
+ 69,
1217
+ 50
1218
  ],
1219
  "totals": [
1220
+ 274,
1221
+ 268,
1222
+ 262,
1223
+ 256
1224
  ],
1225
  "precisions": [
1226
+ 0.551094890510949,
1227
+ 0.3805970149253731,
1228
+ 0.2633587786259542,
1229
+ 0.1953125
1230
  ],
1231
  "bp": 1.0,
1232
+ "sys_len": 274,
1233
+ "ref_len": 208,
1234
+ "sacrebleu": 0.32228671229839423,
1235
+ "score": 0.32228671229839423,
1236
  "score_name": "sacrebleu",
1237
+ "score_ci_low": 0.23866420296537516,
1238
+ "score_ci_high": 0.5275923196288512,
1239
+ "sacrebleu_ci_low": 0.23866420296537516,
1240
+ "sacrebleu_ci_high": 0.5275923196288512
1241
  },
1242
  "mt_flores_101_spa_eng": {
1243
+ "num_of_instances": 6,
1244
  "counts": [
1245
+ 145,
1246
+ 89,
1247
+ 60,
1248
+ 42
1249
  ],
1250
  "totals": [
1251
+ 219,
1252
+ 213,
1253
+ 207,
1254
+ 201
1255
  ],
1256
  "precisions": [
1257
+ 0.6621004566210046,
1258
+ 0.4178403755868545,
1259
+ 0.2898550724637681,
1260
+ 0.20895522388059704
1261
  ],
1262
  "bp": 1.0,
1263
+ "sys_len": 219,
1264
+ "ref_len": 208,
1265
+ "sacrebleu": 0.3597842164302771,
1266
+ "score": 0.3597842164302771,
1267
  "score_name": "sacrebleu",
1268
+ "score_ci_low": 0.25650430080602765,
1269
+ "score_ci_high": 0.4279461456140117,
1270
+ "sacrebleu_ci_low": 0.25650430080602765,
1271
+ "sacrebleu_ci_high": 0.4279461456140117
1272
  },
1273
+ "score": 0.3849072116959784,
1274
  "score_name": "subsets_mean",
1275
+ "num_of_instances": 90
1276
  },
1277
+ "score": 0.5314925617842308,
1278
  "score_name": "subsets_mean",
1279
+ "num_of_instances": 1537
1280
  }
1281
  }
results/bluebench/{2025-06-23T14-18-29_evaluation_results.json → 2025-07-02T18-37-37_evaluation_results.json} RENAMED
@@ -1,6 +1,6 @@
1
  {
2
  "environment_info": {
3
- "timestamp_utc": "2025-06-23T18:18:25.502854Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
@@ -8,7 +8,7 @@
8
  "--model",
9
  "cross_provider",
10
  "--model_args",
11
- "model_name=watsonx/meta-llama/llama-3-3-70b-instruct,max_tokens=256",
12
  "--output_path",
13
  "./results/bluebench",
14
  "--log_samples",
@@ -26,9 +26,9 @@
26
  "num_fewshots": null,
27
  "limit": null,
28
  "batch_size": 8,
29
- "model": "watsonx/meta-llama/llama-3-3-70b-instruct",
30
  "model_args": {
31
- "max_tokens": 256
32
  },
33
  "gen_kwargs": null,
34
  "chat_template_kwargs": null,
@@ -41,8 +41,8 @@
41
  "disable_hf_cache": false,
42
  "cache_dir": null
43
  },
44
- "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
@@ -51,25 +51,25 @@
51
  "triton": "3.3.1",
52
  "nltk": "3.9.1",
53
  "anyio": "4.9.0",
 
54
  "absl-py": "2.3.0",
55
  "tiktoken": "0.9.0",
56
  "charset-normalizer": "3.4.2",
57
  "nvidia-cuda-runtime-cu12": "12.6.77",
58
  "sympy": "1.14.0",
59
  "mecab-ko": "1.0.1",
60
- "litellm": "1.72.6.post1",
61
  "httpcore": "1.0.9",
62
  "Jinja2": "3.1.6",
63
  "jsonschema-specifications": "2025.4.1",
64
  "pydantic_core": "2.33.2",
65
  "nvidia-cusparse-cu12": "12.5.4.2",
66
  "yarl": "1.20.1",
67
- "openai": "1.88.0",
68
  "portalocker": "3.2.0",
69
  "pandas": "2.3.0",
70
  "multiprocess": "0.70.16",
71
  "jsonschema": "4.24.0",
72
- "unitxt": "1.24.0",
73
  "nvidia-nvjitlink-cu12": "12.6.85",
74
  "nvidia-cublas-cu12": "12.6.4.1",
75
  "pydantic": "2.11.7",
@@ -79,7 +79,7 @@
79
  "contourpy": "1.3.2",
80
  "aiosignal": "1.3.2",
81
  "nvidia-cuda-cupti-cu12": "12.6.80",
82
- "pillow": "11.2.1",
83
  "six": "1.17.0",
84
  "diskcache": "5.6.3",
85
  "tqdm": "4.67.1",
@@ -98,17 +98,16 @@
98
  "kiwisolver": "1.4.8",
99
  "networkx": "3.4.2",
100
  "typing-inspection": "0.4.1",
101
- "lxml": "5.4.0",
102
  "sniffio": "1.3.1",
103
  "scikit-learn": "1.7.0",
104
  "nvidia-curand-cu12": "10.3.7.77",
105
  "pip": "25.1.1",
106
  "fonttools": "4.58.4",
107
- "transformers": "4.52.4",
108
  "datasets": "3.6.0",
109
  "nvidia-cusolver-cu12": "11.7.1.2",
110
  "cycler": "0.12.1",
111
- "evaluate": "0.4.3",
112
  "distro": "1.9.0",
113
  "idna": "3.10",
114
  "MarkupSafe": "3.0.2",
@@ -122,44 +121,45 @@
122
  "joblib": "1.5.1",
123
  "fsspec": "2025.3.0",
124
  "dill": "0.3.8",
125
- "tokenizers": "0.21.1",
126
  "wheel": "0.45.1",
127
  "nvidia-nvtx-cu12": "12.6.77",
128
  "nvidia-cusparselt-cu12": "0.6.3",
129
- "hf-xet": "1.1.4",
130
  "propcache": "0.3.2",
131
  "numpy": "2.2.6",
132
  "mpmath": "1.3.0",
133
- "multidict": "6.5.0",
134
  "conllu": "6.0.0",
135
  "safetensors": "0.5.3",
136
  "requests": "2.32.4",
137
  "regex": "2024.11.6",
138
  "aiohttp": "3.12.13",
139
  "tabulate": "0.9.0",
140
  "certifi": "2025.6.15",
141
- "accelerate": "1.8.0",
142
  "nvidia-cufft-cu12": "11.3.0.4",
143
  "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
  "click": "8.2.1",
145
  "typing_extensions": "4.12.2",
146
  "attrs": "25.3.0",
147
  "exceptiongroup": "1.3.0",
148
  "tenacity": "9.1.2",
149
  "pytz": "2025.2",
150
  "aiohappyeyeballs": "2.6.1",
151
  "python-dateutil": "2.9.0.post0",
152
  "torch": "2.7.1",
153
- "python-dotenv": "1.1.0",
154
  "httpx": "0.28.1",
155
  "matplotlib": "3.10.3",
156
  "xxhash": "3.5.0",
157
  "PyYAML": "6.0.2",
158
- "huggingface-hub": "0.33.0",
159
  "colorama": "0.4.6",
160
- "rpds-py": "0.25.1",
161
  "threadpoolctl": "3.6.0",
162
  "nvidia-cudnn-cu12": "9.5.1.17",
163
  "jaraco.collections": "5.1.0",
164
  "tomli": "2.0.1",
165
  "backports.tarfile": "1.2.0",
@@ -176,14 +176,14 @@
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
- "accuracy": 0.7888888888888889,
180
- "accuracy_ci_low": 0.7,
181
- "accuracy_ci_high": 0.8555555555555555,
182
  "score_name": "accuracy",
183
- "score": 0.7888888888888889,
184
- "score_ci_high": 0.8555555555555555,
185
- "score_ci_low": 0.7,
186
- "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
  "accuracy": 1.0,
@@ -193,7 +193,7 @@
193
  "score": 1.0,
194
  "score_ci_high": 1.0,
195
  "score_ci_low": 1.0,
196
- "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
199
  "accuracy": 1.0,
@@ -203,7 +203,7 @@
203
  "score": 1.0,
204
  "score_ci_high": 1.0,
205
  "score_ci_low": 1.0,
206
- "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
  "accuracy": 1.0,
@@ -213,17 +213,17 @@
213
  "score": 1.0,
214
  "score_ci_high": 1.0,
215
  "score_ci_low": 1.0,
216
- "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
- "accuracy": 0.9888888888888889,
220
- "accuracy_ci_low": 0.9555555555555556,
221
  "accuracy_ci_high": 1.0,
222
  "score_name": "accuracy",
223
- "score": 0.9888888888888889,
224
  "score_ci_high": 1.0,
225
- "score_ci_low": 0.9555555555555556,
226
- "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
  "accuracy": 1.0,
@@ -233,7 +233,7 @@
233
  "score": 1.0,
234
  "score_ci_high": 1.0,
235
  "score_ci_low": 1.0,
236
- "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
239
  "accuracy": 1.0,
@@ -243,1041 +243,1039 @@
243
  "score": 1.0,
244
  "score_ci_high": 1.0,
245
  "score_ci_low": 1.0,
246
- "num_of_instances": 90
247
  },
248
  "safety_bbq_race_x_ses": {
249
- "accuracy": 0.9888888888888889,
250
- "accuracy_ci_low": 0.9407763312346947,
251
  "accuracy_ci_high": 1.0,
252
  "score_name": "accuracy",
253
- "score": 0.9888888888888889,
254
  "score_ci_high": 1.0,
255
- "score_ci_low": 0.9407763312346947,
256
- "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
- "accuracy": 0.9777777777777777,
260
- "accuracy_ci_low": 0.9333333333333333,
261
  "accuracy_ci_high": 1.0,
262
  "score_name": "accuracy",
263
- "score": 0.9777777777777777,
264
  "score_ci_high": 1.0,
265
- "score_ci_low": 0.9333333333333333,
266
- "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
- "accuracy": 0.9888888888888889,
270
- "accuracy_ci_low": 0.9283857779145438,
271
  "accuracy_ci_high": 1.0,
272
  "score_name": "accuracy",
273
- "score": 0.9888888888888889,
274
  "score_ci_high": 1.0,
275
- "score_ci_low": 0.9283857779145438,
276
- "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.8555555555555555,
280
- "accuracy_ci_low": 0.7777777777777778,
281
- "accuracy_ci_high": 0.9222222222222223,
282
  "score_name": "accuracy",
283
- "score": 0.8555555555555555,
284
- "score_ci_high": 0.9222222222222223,
285
- "score_ci_low": 0.7777777777777778,
286
- "num_of_instances": 90
287
  },
288
- "score": 0.9626262626262626,
289
  "score_name": "subsets_mean",
290
- "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
- "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.14026236125126135,
296
- "score": 0.14026236125126135,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
- "score": 0.14026236125126135,
300
  "score_name": "subsets_mean",
301
- "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
- "num_of_instances": 1000,
306
- "f1_Person": 0.5721784776902887,
307
- "f1_Organization": 0.37837837837837834,
308
- "f1_Location": 0.3692307692307692,
309
- "f1_macro": 0.4399292084331454,
310
- "recall_macro": 0.40673591832987227,
311
- "precision_macro": 0.48338733915656995,
312
- "in_classes_support": 0.6414285714285715,
313
- "f1_micro": 0.35918367346938773,
314
- "recall_micro": 0.41904761904761906,
315
- "precision_micro": 0.3142857142857143,
316
- "score": 0.35918367346938773,
317
  "score_name": "f1_micro",
318
- "score_ci_low": 0.311485746926114,
319
- "score_ci_high": 0.40594536569316764,
320
- "f1_micro_ci_low": 0.311485746926114,
321
- "f1_micro_ci_high": 0.40594536569316764
322
  },
323
- "score": 0.35918367346938773,
324
  "score_name": "subsets_mean",
325
- "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
- "accuracy": 0.704225352112676,
330
- "accuracy_ci_low": 0.5915492957746479,
331
- "accuracy_ci_high": 0.8028169014084507,
332
  "score_name": "accuracy",
333
- "score": 0.704225352112676,
334
- "score_ci_high": 0.8028169014084507,
335
- "score_ci_low": 0.5915492957746479,
336
- "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
- "accuracy": 0.18309859154929578,
340
- "accuracy_ci_low": 0.11267605633802817,
341
- "accuracy_ci_high": 0.28169014084507044,
342
  "score_name": "accuracy",
343
- "score": 0.18309859154929578,
344
- "score_ci_high": 0.28169014084507044,
345
- "score_ci_low": 0.11267605633802817,
346
- "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
- "accuracy": 0.15492957746478872,
350
- "accuracy_ci_low": 0.08450704225352113,
351
- "accuracy_ci_high": 0.2535211267605634,
352
  "score_name": "accuracy",
353
- "score": 0.15492957746478872,
354
- "score_ci_high": 0.2535211267605634,
355
- "score_ci_low": 0.08450704225352113,
356
- "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
- "accuracy": 0.6056338028169014,
360
- "accuracy_ci_low": 0.49295774647887325,
361
- "accuracy_ci_high": 0.704225352112676,
362
  "score_name": "accuracy",
363
- "score": 0.6056338028169014,
364
- "score_ci_high": 0.704225352112676,
365
- "score_ci_low": 0.49295774647887325,
366
- "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
- "accuracy": 0.647887323943662,
370
- "accuracy_ci_low": 0.5211267605633803,
371
- "accuracy_ci_high": 0.7605633802816901,
372
  "score_name": "accuracy",
373
- "score": 0.647887323943662,
374
- "score_ci_high": 0.7605633802816901,
375
- "score_ci_low": 0.5211267605633803,
376
- "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
379
- "accuracy": 0.323943661971831,
380
- "accuracy_ci_low": 0.22535211267605634,
381
- "accuracy_ci_high": 0.43661971830985913,
382
  "score_name": "accuracy",
383
- "score": 0.323943661971831,
384
- "score_ci_high": 0.43661971830985913,
385
- "score_ci_low": 0.22535211267605634,
386
- "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
- "accuracy": 0.5352112676056338,
390
- "accuracy_ci_low": 0.4225352112676056,
391
- "accuracy_ci_high": 0.647887323943662,
392
  "score_name": "accuracy",
393
- "score": 0.5352112676056338,
394
- "score_ci_high": 0.647887323943662,
395
- "score_ci_low": 0.4225352112676056,
396
- "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
- "accuracy": 0.7323943661971831,
400
- "accuracy_ci_low": 0.6310963819783834,
401
- "accuracy_ci_high": 0.8309859154929577,
402
  "score_name": "accuracy",
403
- "score": 0.7323943661971831,
404
- "score_ci_high": 0.8309859154929577,
405
- "score_ci_low": 0.6310963819783834,
406
- "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
- "accuracy": 0.5915492957746479,
410
- "accuracy_ci_low": 0.4788732394366197,
411
- "accuracy_ci_high": 0.6901408450704225,
412
  "score_name": "accuracy",
413
- "score": 0.5915492957746479,
414
- "score_ci_high": 0.6901408450704225,
415
- "score_ci_low": 0.4788732394366197,
416
- "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
- "accuracy": 0.29577464788732394,
420
- "accuracy_ci_low": 0.19718309859154928,
421
- "accuracy_ci_high": 0.41750158298380896,
422
  "score_name": "accuracy",
423
- "score": 0.29577464788732394,
424
- "score_ci_high": 0.41750158298380896,
425
- "score_ci_low": 0.19718309859154928,
426
- "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
- "accuracy": 0.49295774647887325,
430
- "accuracy_ci_low": 0.38028169014084506,
431
- "accuracy_ci_high": 0.6056338028169014,
432
  "score_name": "accuracy",
433
- "score": 0.49295774647887325,
434
- "score_ci_high": 0.6056338028169014,
435
- "score_ci_low": 0.38028169014084506,
436
- "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
- "accuracy": 0.5915492957746479,
440
- "accuracy_ci_low": 0.4647887323943662,
441
- "accuracy_ci_high": 0.704225352112676,
442
  "score_name": "accuracy",
443
- "score": 0.5915492957746479,
444
- "score_ci_high": 0.704225352112676,
445
- "score_ci_low": 0.4647887323943662,
446
- "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
- "accuracy": 0.30985915492957744,
450
- "accuracy_ci_low": 0.19718309859154928,
451
- "accuracy_ci_high": 0.4084507042253521,
452
  "score_name": "accuracy",
453
- "score": 0.30985915492957744,
454
- "score_ci_high": 0.4084507042253521,
455
- "score_ci_low": 0.19718309859154928,
456
- "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
- "accuracy": 0.704225352112676,
460
- "accuracy_ci_low": 0.5915492957746479,
461
- "accuracy_ci_high": 0.8028169014084507,
462
  "score_name": "accuracy",
463
- "score": 0.704225352112676,
464
- "score_ci_high": 0.8028169014084507,
465
- "score_ci_low": 0.5915492957746479,
466
- "num_of_instances": 71
467
  },
468
- "score": 0.4909456740442656,
469
  "score_name": "subsets_mean",
470
- "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
- "f1_macro": 0.6597867569632275,
475
- "f1_suggestive": 0.5882352941176471,
476
- "f1_generic": 0.72,
477
- "f1_descriptive": 0.6818181818181818,
478
- "f1_fanciful": 0.7142857142857143,
479
- "f1_arbitrary": 0.5945945945945946,
480
- "f1_macro_ci_low": 0.548464495780585,
481
- "f1_macro_ci_high": 0.7575557118629758,
482
  "score_name": "f1_micro",
483
- "score": 0.6547619047619048,
484
- "score_ci_high": 0.7425149700598802,
485
- "score_ci_low": 0.5437048440428358,
486
- "num_of_instances": 85,
487
- "accuracy": 0.6470588235294118,
488
- "accuracy_ci_low": 0.5411764705882353,
489
- "accuracy_ci_high": 0.7411764705882353,
490
- "f1_micro": 0.6547619047619048,
491
- "f1_micro_ci_low": 0.5437048440428358,
492
- "f1_micro_ci_high": 0.7425149700598802
493
  },
494
  "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.5899229232086782,
496
- "f1_no": 0.6637168141592921,
497
- "f1_yes": 0.5161290322580645,
498
- "f1_macro_ci_low": 0.5156319338532668,
499
- "f1_macro_ci_high": 0.6621360959437094,
500
  "score_name": "f1_micro",
501
- "score": 0.6114285714285714,
502
- "score_ci_high": 0.6798739003144315,
503
- "score_ci_low": 0.5364733968179762,
504
- "num_of_instances": 200,
505
- "accuracy": 0.535,
506
- "accuracy_ci_low": 0.465,
507
- "accuracy_ci_high": 0.605,
508
- "f1_micro": 0.6114285714285714,
509
- "f1_micro_ci_low": 0.5364733968179762,
510
- "f1_micro_ci_high": 0.6798739003144315
511
  },
512
  "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.24082948201110896,
514
- "f1_conclusion": 0.10810810810810811,
515
- "f1_decree": 0.12903225806451613,
516
- "f1_issue": 0.18181818181818182,
517
- "f1_analysis": 0.5833333333333334,
518
- "f1_facts": 0.06666666666666667,
519
- "f1_procedural history": 0.3125,
520
- "f1_rule": 0.30434782608695654,
521
- "f1_macro_ci_low": 0.19135126191537805,
522
- "f1_macro_ci_high": 0.31037088994163425,
523
  "score_name": "f1_micro",
524
- "score": 0.29012345679012347,
525
- "score_ci_high": 0.3634815160611135,
526
- "score_ci_low": 0.22855500349415586,
527
- "num_of_instances": 200,
528
- "accuracy": 0.235,
529
- "accuracy_ci_low": 0.185,
530
- "accuracy_ci_high": 0.3,
531
- "f1_micro": 0.29012345679012347,
532
- "f1_micro_ci_low": 0.22855500349415586,
533
- "f1_micro_ci_high": 0.3634815160611135
534
  },
535
  "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.4444123800591588,
537
- "f1_yes": 0.4550898203592814,
538
- "f1_no": 0.43373493975903615,
539
- "f1_macro_ci_low": 0.3748865543059881,
540
- "f1_macro_ci_high": 0.5178711641402558,
541
  "score_name": "f1_micro",
542
- "score": 0.4444444444444444,
543
- "score_ci_high": 0.5162242117942616,
544
- "score_ci_low": 0.37379019448718637,
545
- "num_of_instances": 200,
546
- "accuracy": 0.37,
547
- "accuracy_ci_low": 0.305,
548
- "accuracy_ci_high": 0.435,
549
- "f1_micro": 0.4444444444444444,
550
- "f1_micro_ci_low": 0.37379019448718637,
551
- "f1_micro_ci_high": 0.5162242117942616
552
  },
553
  "legalbench_proa": {
554
- "f1_macro": 0.75,
555
- "f1_yes": 0.75,
556
- "f1_no": 0.75,
557
- "f1_macro_ci_low": 0.6597413488150221,
558
- "f1_macro_ci_high": 0.8234621193211324,
559
  "score_name": "f1_micro",
560
- "score": 0.75,
561
- "score_ci_high": 0.8234882632928148,
562
- "score_ci_low": 0.6573326079878734,
563
- "num_of_instances": 85,
564
- "accuracy": 0.6352941176470588,
565
- "accuracy_ci_low": 0.5411764705882353,
566
- "accuracy_ci_high": 0.7294117647058823,
567
- "f1_micro": 0.75,
568
- "f1_micro_ci_low": 0.6573326079878734,
569
- "f1_micro_ci_high": 0.8234882632928148
570
  },
571
- "score": 0.5501516754850089,
572
  "score_name": "subsets_mean",
573
- "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
- "f1_macro": 0.625283027988775,
578
- "f1_cars": 0.8541666666666666,
579
- "f1_windows x": 0.11940298507462686,
580
- "f1_computer graphics": 0.4297520661157025,
581
- "f1_atheism": 0.509090909090909,
582
- "f1_religion": 0.2222222222222222,
583
- "f1_medicine": 0.8705882352941177,
584
- "f1_christianity": 0.7755102040816326,
585
- "f1_microsoft windows": 0.6436781609195402,
586
  "f1_middle east": 0.6666666666666666,
587
- "f1_motorcycles": 0.7326732673267327,
588
- "f1_pc hardware": 0.5846153846153846,
589
- "f1_mac hardware": 0.6458333333333334,
590
  "f1_electronics": 0.6666666666666666,
591
- "f1_for sale": 0.6944444444444444,
592
- "f1_guns": 0.36923076923076925,
593
- "f1_space": 0.8235294117647058,
594
- "f1_cryptography": 0.6575342465753424,
595
- "f1_baseball": 0.9310344827586207,
596
- "f1_politics": 0.3787878787878788,
597
- "f1_hockey": 0.9302325581395349,
598
- "f1_macro_ci_low": 0.5984293289041998,
599
- "f1_macro_ci_high": 0.65230217299566,
600
  "score_name": "f1_micro",
601
- "score": 0.6474114441416894,
602
- "score_ci_high": 0.6749803309845304,
603
- "score_ci_low": 0.6158904109589041,
604
- "num_of_instances": 1000,
605
- "accuracy": 0.594,
606
- "accuracy_ci_low": 0.561,
607
- "accuracy_ci_high": 0.622,
608
- "f1_micro": 0.6474114441416894,
609
- "f1_micro_ci_low": 0.6158904109589041,
610
- "f1_micro_ci_high": 0.6749803309845304
611
  },
612
- "score": 0.6474114441416894,
613
  "score_name": "subsets_mean",
614
- "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
- "f1_macro": 0.7879467396802322,
619
- "f1_student loan": 0.8888888888888888,
620
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.9461756373937678,
621
- "f1_debt collection": 0.6428571428571429,
622
- "f1_checking or savings account": 0.8222222222222222,
623
- "f1_mortgage": 0.9705882352941176,
624
- "f1_payday loan or title loan or personal loan": 0.5333333333333333,
625
- "f1_credit card or prepaid card": 0.8666666666666667,
626
- "f1_money transfer or virtual currency or money service": 0.7111111111111111,
627
- "f1_vehicle loan or lease": 0.7096774193548387,
628
- "f1_macro_ci_low": 0.7309747205142173,
629
- "f1_macro_ci_high": 0.8394377629013812,
630
  "score_name": "f1_micro",
631
- "score": 0.8977732793522267,
632
- "score_ci_high": 0.9148163850441952,
633
- "score_ci_low": 0.8791739655658123,
634
- "num_of_instances": 1000,
635
- "accuracy": 0.887,
636
- "accuracy_ci_low": 0.8679599560953464,
637
- "accuracy_ci_high": 0.906,
638
- "f1_micro": 0.8977732793522267,
639
- "f1_micro_ci_low": 0.8791739655658123,
640
- "f1_micro_ci_high": 0.9148163850441952
641
  },
642
  "cfpb_product_watsonx": {
643
- "f1_macro": 0.755452767113785,
644
- "f1_mortgages and loans": 0.8135593220338984,
645
- "f1_credit card": 0.7865168539325843,
646
- "f1_debt collection": 0.7069767441860465,
647
- "f1_credit reporting": 0.7876712328767124,
648
- "f1_retail banking": 0.6825396825396826,
649
- "f1_macro_ci_low": 0.7186153400597911,
650
- "f1_macro_ci_high": 0.7954872943230671,
651
  "score_name": "f1_micro",
652
- "score": 0.7611336032388664,
653
- "score_ci_high": 0.797979797979798,
654
- "score_ci_low": 0.7233852933885262,
655
- "num_of_instances": 500,
656
- "accuracy": 0.752,
657
- "accuracy_ci_low": 0.712,
658
- "accuracy_ci_high": 0.79,
659
- "f1_micro": 0.7611336032388664,
660
- "f1_micro_ci_low": 0.7233852933885262,
661
- "f1_micro_ci_high": 0.797979797979798
662
  },
663
- "score": 0.8294534412955465,
664
  "score_name": "subsets_mean",
665
- "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
- "num_of_instances": 1000,
670
- "program_accuracy": 0.21,
671
- "score": 0.21,
672
  "score_name": "program_accuracy",
673
- "execution_accuracy": 0.19,
674
- "program_accuracy_ci_low": 0.1850718210152138,
675
- "program_accuracy_ci_high": 0.236,
676
- "score_ci_low": 0.1850718210152138,
677
- "score_ci_high": 0.236,
678
- "execution_accuracy_ci_low": 0.167,
679
- "execution_accuracy_ci_high": 0.214
680
  },
681
- "score": 0.21,
682
  "score_name": "subsets_mean",
683
- "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
- "precision": 0.2793294516338928,
688
- "recall": 0.5853975586330095,
689
- "f1": 0.32244875528474853,
690
- "precision_ci_low": 0.2608509633355691,
691
- "precision_ci_high": 0.2992012655344498,
692
- "recall_ci_low": 0.5709386747947787,
693
- "recall_ci_high": 0.6004552716919522,
694
- "f1_ci_low": 0.3057656195014431,
695
- "f1_ci_high": 0.34039188672012827,
696
  "score_name": "f1",
697
- "score": 0.32244875528474853,
698
- "score_ci_high": 0.34039188672012827,
699
- "score_ci_low": 0.3057656195014431,
700
- "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.583492674678564,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6757716050744057,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5312159793823957,
704
- "faithfullness_f1_token_overlap": 0.31789392117250886,
705
- "faithfullness_recall_token_overlap": 0.25491265786006334,
706
- "faithfullness_precision_token_overlap": 0.5284813664748244,
707
- "correctness_f1_token_overlap": 0.32244875528474853,
708
- "correctness_recall_token_overlap": 0.5853975586330095,
709
- "correctness_precision_token_overlap": 0.2793294516338928
710
  },
711
- "score": 0.32244875528474853,
712
  "score_name": "subsets_mean",
713
- "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
- "accuracy": 0.565,
718
- "accuracy_ci_low": 0.5335774990883203,
719
- "accuracy_ci_high": 0.5932386235436204,
720
  "score_name": "accuracy",
721
- "score": 0.565,
722
- "score_ci_high": 0.5932386235436204,
723
- "score_ci_low": 0.5335774990883203,
724
- "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
- "accuracy": 0.92,
728
- "accuracy_ci_low": 0.892,
729
- "accuracy_ci_high": 0.94,
730
  "score_name": "accuracy",
731
- "score": 0.92,
732
- "score_ci_high": 0.94,
733
- "score_ci_low": 0.892,
734
- "num_of_instances": 500
735
  },
736
- "score": 0.7424999999999999,
737
  "score_name": "subsets_mean",
738
- "num_of_instances": 1500
739
  },
740
  "safety": {
741
  "attaq_500": {
742
- "severity_critical": 0.0,
743
  "severity_high": 0.0,
744
- "severity_medium": 0.0,
745
- "severity_low": 100.0,
746
- "category_harmful_info": 0.8275590381670063,
747
- "category_deception": 0.870389111565821,
748
- "category_explicit_content": 0.8619736799537534,
749
- "category_violence": 0.839932223764204,
750
- "category_discrimination": 0.7697427272796631,
751
- "category_substance_abuse": 0.8608841189907658,
752
- "category_pii": 0.7655655101493553,
753
- "safety": 0.8361458782349815,
754
- "safety_ci_low": 0.8192866587424859,
755
- "safety_ci_high": 0.8545105900812325,
756
  "score_name": "safety",
757
- "score": 0.8361458782349815,
758
- "score_ci_high": 0.8545105900812325,
759
- "score_ci_low": 0.8192866587424859,
760
  "num_of_instances": 100
761
  },
762
- "score": 0.8361458782349815,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
- "num_of_instances": 528,
769
- "rougeLsum": 0.3756142003700213,
770
- "rouge1": 0.43798911333105917,
771
- "rouge2": 0.22301544369181786,
772
- "rougeL": 0.30881611781607615,
773
- "score": 0.30881611781607615,
774
  "score_name": "rougeL",
775
- "rougeLsum_ci_low": 0.3665764288194776,
776
- "rougeLsum_ci_high": 0.3845640981112231,
777
- "rouge1_ci_low": 0.42803373903414665,
778
- "rouge1_ci_high": 0.44745782528977346,
779
- "rouge2_ci_low": 0.21565182499600158,
780
- "rouge2_ci_high": 0.23166608743475037,
781
- "rougeL_ci_low": 0.3011644909175205,
782
- "rougeL_ci_high": 0.31722072238042875,
783
- "score_ci_low": 0.3011644909175205,
784
- "score_ci_high": 0.31722072238042875
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
- "num_of_instances": 1000,
788
- "rougeLsum": 0.1094469715392792,
789
- "rouge1": 0.13301752816755213,
790
- "rouge2": 0.020621633401068214,
791
- "rougeL": 0.09635378924374519,
792
- "score": 0.09635378924374519,
793
  "score_name": "rougeL",
794
- "rougeLsum_ci_low": 0.10482036952689613,
795
- "rougeLsum_ci_high": 0.11359992785988014,
796
- "rouge1_ci_low": 0.12738784135782572,
797
- "rouge1_ci_high": 0.13845012766033873,
798
- "rouge2_ci_low": 0.01856074580818113,
799
- "rouge2_ci_high": 0.02259991124480518,
800
- "rougeL_ci_low": 0.09230760346929477,
801
- "rougeL_ci_high": 0.09984237535822288,
802
- "score_ci_low": 0.09230760346929477,
803
- "score_ci_high": 0.09984237535822288
804
  },
805
- "score": 0.20258495352991068,
806
  "score_name": "subsets_mean",
807
- "num_of_instances": 1528
808
  },
809
  "translation": {
810
  "mt_flores_101_ara_eng": {
811
- "num_of_instances": 66,
812
  "counts": [
813
- 1310,
814
- 862,
815
- 608,
816
- 433
817
  ],
818
  "totals": [
819
- 1791,
820
- 1725,
821
- 1659,
822
- 1593
823
  ],
824
  "precisions": [
825
- 0.7314349525404802,
826
- 0.49971014492753624,
827
- 0.36648583484026526,
828
- 0.27181418706842436
829
  ],
830
  "bp": 1.0,
831
- "sys_len": 1791,
832
- "ref_len": 1734,
833
- "sacrebleu": 0.43682330208953546,
834
- "score": 0.43682330208953546,
835
  "score_name": "sacrebleu",
836
- "score_ci_low": 0.3952567851744898,
837
- "score_ci_high": 0.4779782047825724,
838
- "sacrebleu_ci_low": 0.3952567851744898,
839
- "sacrebleu_ci_high": 0.4779782047825724
840
  },
841
  "mt_flores_101_deu_eng": {
842
- "num_of_instances": 66,
843
  "counts": [
844
- 1330,
845
- 885,
846
- 621,
847
- 444
848
  ],
849
  "totals": [
850
- 1803,
851
- 1737,
852
- 1671,
853
- 1605
854
  ],
855
  "precisions": [
856
- 0.7376594564614531,
857
- 0.5094991364421416,
858
- 0.37163375224416517,
859
- 0.2766355140186916
860
  ],
861
  "bp": 1.0,
862
- "sys_len": 1803,
863
- "ref_len": 1734,
864
- "sacrebleu": 0.44335908539973706,
865
- "score": 0.44335908539973706,
866
  "score_name": "sacrebleu",
867
- "score_ci_low": 0.407676677665113,
868
- "score_ci_high": 0.49444892387735484,
869
- "sacrebleu_ci_low": 0.407676677665113,
870
- "sacrebleu_ci_high": 0.49444892387735484
871
  },
872
  "mt_flores_101_eng_ara": {
873
- "num_of_instances": 66,
874
  "counts": [
875
- 904,
876
- 502,
877
- 300,
878
- 175
879
  ],
880
  "totals": [
881
- 1585,
882
- 1519,
883
- 1453,
884
- 1387
885
  ],
886
  "precisions": [
887
- 0.5703470031545741,
888
- 0.3304805793285056,
889
- 0.20646937370956642,
890
- 0.12617159336697908
891
  ],
892
- "bp": 0.9974795224450381,
893
- "sys_len": 1585,
894
- "ref_len": 1589,
895
- "sacrebleu": 0.26404598765634385,
896
- "score": 0.26404598765634385,
897
  "score_name": "sacrebleu",
898
- "score_ci_low": 0.23497301086217232,
899
- "score_ci_high": 0.29259086123320976,
900
- "sacrebleu_ci_low": 0.23497301086217232,
901
- "sacrebleu_ci_high": 0.29259086123320976
902
  },
903
  "mt_flores_101_eng_deu": {
904
- "num_of_instances": 66,
905
  "counts": [
906
- 1246,
907
- 765,
908
- 517,
909
- 376
910
  ],
911
  "totals": [
912
- 1853,
913
- 1787,
914
- 1721,
915
- 1655
916
  ],
917
  "precisions": [
918
- 0.6724230976794388,
919
- 0.42809177392277564,
920
- 0.3004067402672865,
921
- 0.22719033232628397
922
  ],
923
  "bp": 1.0,
924
- "sys_len": 1853,
925
- "ref_len": 1835,
926
- "sacrebleu": 0.3743861346447394,
927
- "score": 0.3743861346447394,
928
  "score_name": "sacrebleu",
929
- "score_ci_low": 0.3331443738925502,
930
- "score_ci_high": 0.4167892583826109,
931
- "sacrebleu_ci_low": 0.3331443738925502,
932
- "sacrebleu_ci_high": 0.4167892583826109
933
  },
934
  "mt_flores_101_eng_fra": {
935
- "num_of_instances": 66,
936
  "counts": [
937
- 1562,
938
- 1176,
939
- 936,
940
- 755
941
  ],
942
  "totals": [
943
- 2040,
944
- 1974,
945
- 1908,
946
- 1842
947
  ],
948
  "precisions": [
949
- 0.7656862745098039,
950
- 0.5957446808510638,
951
- 0.49056603773584906,
952
- 0.40988056460369166
953
  ],
954
- "bp": 0.9863682748637871,
955
- "sys_len": 2040,
956
- "ref_len": 2068,
957
- "sacrebleu": 0.5428196432331734,
958
- "score": 0.5428196432331734,
959
  "score_name": "sacrebleu",
960
- "score_ci_low": 0.509892185181122,
961
- "score_ci_high": 0.5871694828358561,
962
- "sacrebleu_ci_low": 0.509892185181122,
963
- "sacrebleu_ci_high": 0.5871694828358561
964
  },
965
  "mt_flores_101_eng_kor": {
966
- "num_of_instances": 66,
967
  "counts": [
968
- 1356,
969
- 727,
970
- 432,
971
- 274
972
  ],
973
  "totals": [
974
- 2382,
975
- 2316,
976
- 2250,
977
- 2184
978
  ],
979
  "precisions": [
980
- 0.5692695214105793,
981
- 0.3139032815198618,
982
- 0.192,
983
- 0.12545787545787546
984
  ],
985
  "bp": 1.0,
986
- "sys_len": 2382,
987
- "ref_len": 2235,
988
- "sacrebleu": 0.25614049024804236,
989
- "score": 0.25614049024804236,
990
  "score_name": "sacrebleu",
991
- "score_ci_low": 0.2240052628669271,
992
- "score_ci_high": 0.2935558382274424,
993
- "sacrebleu_ci_low": 0.2240052628669271,
994
- "sacrebleu_ci_high": 0.2935558382274424
995
  },
996
  "mt_flores_101_eng_por": {
997
- "num_of_instances": 66,
998
  "counts": [
999
- 1455,
1000
- 1063,
1001
- 821,
1002
- 637
1003
  ],
1004
  "totals": [
1005
- 1900,
1006
- 1834,
1007
- 1768,
1008
- 1702
1009
  ],
1010
  "precisions": [
1011
- 0.7657894736842106,
1012
- 0.579607415485278,
1013
- 0.4643665158371041,
1014
- 0.37426556991774385
1015
  ],
1016
- "bp": 0.9916143051127146,
1017
- "sys_len": 1900,
1018
- "ref_len": 1916,
1019
- "sacrebleu": 0.5225932644775685,
1020
- "score": 0.5225932644775685,
1021
  "score_name": "sacrebleu",
1022
- "score_ci_low": 0.4737364435189365,
1023
- "score_ci_high": 0.5631623567289689,
1024
- "sacrebleu_ci_low": 0.4737364435189365,
1025
- "sacrebleu_ci_high": 0.5631623567289689
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
- "num_of_instances": 66,
1029
  "counts": [
1030
- 1404,
1031
- 989,
1032
- 719,
1033
- 525
1034
  ],
1035
  "totals": [
1036
- 1962,
1037
- 1896,
1038
- 1830,
1039
- 1764
1040
  ],
1041
  "precisions": [
1042
- 0.7155963302752294,
1043
- 0.5216244725738397,
1044
- 0.39289617486338796,
1045
- 0.2976190476190476
1046
  ],
1047
  "bp": 1.0,
1048
- "sys_len": 1962,
1049
- "ref_len": 1949,
1050
- "sacrebleu": 0.45707887169863065,
1051
- "score": 0.45707887169863065,
1052
  "score_name": "sacrebleu",
1053
- "score_ci_low": 0.4211360031313033,
1054
- "score_ci_high": 0.5197096344136953,
1055
- "sacrebleu_ci_low": 0.4211360031313033,
1056
- "sacrebleu_ci_high": 0.5197096344136953
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
- "num_of_instances": 66,
1060
  "counts": [
1061
- 1297,
1062
- 753,
1063
- 472,
1064
- 301
1065
  ],
1066
  "totals": [
1067
- 2014,
1068
- 1948,
1069
- 1882,
1070
- 1816
1071
  ],
1072
  "precisions": [
1073
- 0.6439920556107249,
1074
- 0.38655030800821355,
1075
- 0.2507970244420829,
1076
- 0.1657488986784141
1077
  ],
1078
- "bp": 0.9591497695217011,
1079
- "sys_len": 2014,
1080
- "ref_len": 2098,
1081
- "sacrebleu": 0.3059153842651481,
1082
- "score": 0.3059153842651481,
1083
  "score_name": "sacrebleu",
1084
- "score_ci_low": 0.2738471419962368,
1085
- "score_ci_high": 0.33744567062204633,
1086
- "sacrebleu_ci_low": 0.2738471419962368,
1087
- "sacrebleu_ci_high": 0.33744567062204633
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
- "num_of_instances": 66,
1091
  "counts": [
1092
- 1371,
1093
- 964,
1094
- 693,
1095
- 491
1096
  ],
1097
  "totals": [
1098
- 1839,
1099
- 1773,
1100
- 1707,
1101
- 1641
1102
  ],
1103
  "precisions": [
1104
- 0.7455138662316476,
1105
- 0.5437112239142696,
1106
- 0.40597539543058,
1107
- 0.2992078001218769
1108
  ],
1109
  "bp": 1.0,
1110
- "sys_len": 1839,
1111
- "ref_len": 1734,
1112
- "sacrebleu": 0.4710577594991048,
1113
- "score": 0.4710577594991048,
1114
  "score_name": "sacrebleu",
1115
- "score_ci_low": 0.4268855553367512,
1116
- "score_ci_high": 0.5049624748525423,
1117
- "sacrebleu_ci_low": 0.4268855553367512,
1118
- "sacrebleu_ci_high": 0.5049624748525423
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
- "num_of_instances": 66,
1122
  "counts": [
1123
- 1130,
1124
- 638,
1125
- 409,
1126
- 268
1127
  ],
1128
  "totals": [
1129
- 1794,
1130
- 1728,
1131
- 1662,
1132
- 1596
1133
  ],
1134
  "precisions": [
1135
- 0.6298773690078038,
1136
- 0.36921296296296297,
1137
- 0.24608904933814682,
1138
- 0.16791979949874686
1139
  ],
1140
  "bp": 1.0,
1141
- "sys_len": 1794,
1142
- "ref_len": 1734,
1143
- "sacrebleu": 0.31309907547937593,
1144
- "score": 0.31309907547937593,
1145
  "score_name": "sacrebleu",
1146
- "score_ci_low": 0.26572456707025455,
1147
- "score_ci_high": 0.3514896703047057,
1148
- "sacrebleu_ci_low": 0.26572456707025455,
1149
- "sacrebleu_ci_high": 0.3514896703047057
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
- "num_of_instances": 66,
1153
  "counts": [
1154
- 1112,
1155
- 613,
1156
- 383,
1157
- 250
1158
  ],
1159
  "totals": [
1160
- 1725,
1161
- 1659,
1162
- 1593,
1163
- 1527
1164
  ],
1165
  "precisions": [
1166
- 0.6446376811594203,
1167
- 0.3694996986136227,
1168
- 0.24042686754551162,
1169
- 0.16371971185330714
1170
  ],
1171
- "bp": 0.9947961956419216,
1172
- "sys_len": 1725,
1173
- "ref_len": 1734,
1174
- "sacrebleu": 0.3095548068732939,
1175
- "score": 0.3095548068732939,
1176
  "score_name": "sacrebleu",
1177
- "score_ci_low": 0.27129838546648216,
1178
- "score_ci_high": 0.3652127158245783,
1179
- "sacrebleu_ci_low": 0.27129838546648216,
1180
- "sacrebleu_ci_high": 0.3652127158245783
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
- "num_of_instances": 66,
1184
  "counts": [
1185
- 1382,
1186
- 1013,
1187
- 771,
1188
- 586
1189
  ],
1190
  "totals": [
1191
- 1816,
1192
- 1750,
1193
- 1684,
1194
- 1618
1195
  ],
1196
  "precisions": [
1197
- 0.7610132158590308,
1198
- 0.5788571428571428,
1199
- 0.4578384798099763,
1200
- 0.3621755253399258
1201
  ],
1202
  "bp": 1.0,
1203
- "sys_len": 1816,
1204
- "ref_len": 1734,
1205
- "sacrebleu": 0.5198747760500348,
1206
- "score": 0.5198747760500348,
1207
  "score_name": "sacrebleu",
1208
- "score_ci_low": 0.470424249351864,
1209
- "score_ci_high": 0.5665291609523828,
1210
- "sacrebleu_ci_low": 0.470424249351864,
1211
- "sacrebleu_ci_high": 0.5665291609523828
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
- "num_of_instances": 66,
1215
  "counts": [
1216
- 1376,
1217
- 980,
1218
- 722,
1219
- 541
1220
  ],
1221
  "totals": [
1222
- 1801,
1223
- 1735,
1224
- 1669,
1225
- 1603
1226
  ],
1227
  "precisions": [
1228
- 0.7640199888950583,
1229
- 0.5648414985590778,
1230
- 0.43259436788496103,
1231
- 0.3374922021210231
1232
  ],
1233
  "bp": 1.0,
1234
- "sys_len": 1801,
1235
- "ref_len": 1734,
1236
- "sacrebleu": 0.5010072151606186,
1237
- "score": 0.5010072151606186,
1238
  "score_name": "sacrebleu",
1239
- "score_ci_low": 0.4614513028457302,
1240
- "score_ci_high": 0.5398623866578321,
1241
- "sacrebleu_ci_low": 0.4614513028457302,
1242
- "sacrebleu_ci_high": 0.5398623866578321
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
- "num_of_instances": 66,
1246
  "counts": [
1247
- 1219,
1248
- 713,
1249
- 457,
1250
- 300
1251
  ],
1252
  "totals": [
1253
- 1888,
1254
- 1822,
1255
- 1756,
1256
- 1690
1257
  ],
1258
  "precisions": [
1259
- 0.6456567796610169,
1260
- 0.3913282107574094,
1261
- 0.260250569476082,
1262
- 0.17751479289940827
1263
  ],
1264
  "bp": 1.0,
1265
- "sys_len": 1888,
1266
- "ref_len": 1734,
1267
- "sacrebleu": 0.32869437944703067,
1268
- "score": 0.32869437944703067,
1269
  "score_name": "sacrebleu",
1270
- "score_ci_low": 0.30148825657210887,
1271
- "score_ci_high": 0.3740478120880907,
1272
- "sacrebleu_ci_low": 0.30148825657210887,
1273
- "sacrebleu_ci_high": 0.3740478120880907
1274
  },
1275
- "score": 0.40309667841482516,
1276
  "score_name": "subsets_mean",
1277
- "num_of_instances": 990
1278
  },
1279
- "score": 0.5151392921367606,
1280
  "score_name": "subsets_mean",
1281
- "num_of_instances": 12472
1282
  }
1283
  }
 
1
  {
2
  "environment_info": {
3
+ "timestamp_utc": "2025-07-02T22:37:32.923005Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
 
8
  "--model",
9
  "cross_provider",
10
  "--model_args",
11
+ "model_name=watsonx/mistralai/mistral-medium-2505,max_tokens=1024",
12
  "--output_path",
13
  "./results/bluebench",
14
  "--log_samples",
 
26
  "num_fewshots": null,
27
  "limit": null,
28
  "batch_size": 8,
29
+ "model": "watsonx/mistralai/mistral-medium-2505",
30
  "model_args": {
31
+ "max_tokens": 1024
32
  },
33
  "gen_kwargs": null,
34
  "chat_template_kwargs": null,
 
41
  "disable_hf_cache": false,
42
  "cache_dir": null
43
  },
44
+ "unitxt_version": "1.25.0",
45
+ "unitxt_commit_hash": "c8a5e77fd6ca62039b915dc10700323f50ccaacf",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
 
51
  "triton": "3.3.1",
52
  "nltk": "3.9.1",
53
  "anyio": "4.9.0",
54
+ "unitxt": "1.25.0",
55
  "absl-py": "2.3.0",
56
  "tiktoken": "0.9.0",
57
  "charset-normalizer": "3.4.2",
58
  "nvidia-cuda-runtime-cu12": "12.6.77",
59
  "sympy": "1.14.0",
60
  "mecab-ko": "1.0.1",
61
  "httpcore": "1.0.9",
62
+ "litellm": "1.73.6",
63
  "Jinja2": "3.1.6",
64
  "jsonschema-specifications": "2025.4.1",
65
  "pydantic_core": "2.33.2",
66
  "nvidia-cusparse-cu12": "12.5.4.2",
67
+ "tokenizers": "0.21.2",
68
  "yarl": "1.20.1",
69
  "portalocker": "3.2.0",
70
  "pandas": "2.3.0",
71
  "multiprocess": "0.70.16",
72
  "jsonschema": "4.24.0",
73
  "nvidia-nvjitlink-cu12": "12.6.85",
74
  "nvidia-cublas-cu12": "12.6.4.1",
75
  "pydantic": "2.11.7",
 
79
  "contourpy": "1.3.2",
80
  "aiosignal": "1.3.2",
81
  "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "openai": "1.93.0",
83
  "six": "1.17.0",
84
  "diskcache": "5.6.3",
85
  "tqdm": "4.67.1",
 
98
  "kiwisolver": "1.4.8",
99
  "networkx": "3.4.2",
100
  "typing-inspection": "0.4.1",
101
  "sniffio": "1.3.1",
102
  "scikit-learn": "1.7.0",
103
+ "rpds-py": "0.26.0",
104
  "nvidia-curand-cu12": "10.3.7.77",
105
  "pip": "25.1.1",
106
+ "pillow": "11.3.0",
107
  "fonttools": "4.58.4",
108
  "datasets": "3.6.0",
109
  "nvidia-cusolver-cu12": "11.7.1.2",
110
  "cycler": "0.12.1",
111
  "distro": "1.9.0",
112
  "idna": "3.10",
113
  "MarkupSafe": "3.0.2",
 
121
  "joblib": "1.5.1",
122
  "fsspec": "2025.3.0",
123
  "dill": "0.3.8",
124
  "wheel": "0.45.1",
125
  "nvidia-nvtx-cu12": "12.6.77",
126
  "nvidia-cusparselt-cu12": "0.6.3",
127
+ "lxml": "6.0.0",
128
  "propcache": "0.3.2",
129
  "numpy": "2.2.6",
130
  "mpmath": "1.3.0",
131
  "conllu": "6.0.0",
132
+ "huggingface-hub": "0.33.2",
133
  "safetensors": "0.5.3",
134
  "requests": "2.32.4",
135
  "regex": "2024.11.6",
136
  "aiohttp": "3.12.13",
137
  "tabulate": "0.9.0",
138
+ "accelerate": "1.8.1",
139
  "certifi": "2025.6.15",
140
+ "evaluate": "0.4.4",
141
  "nvidia-cufft-cu12": "11.3.0.4",
142
  "nvidia-cuda-nvrtc-cu12": "12.6.77",
143
  "click": "8.2.1",
144
  "typing_extensions": "4.12.2",
145
  "attrs": "25.3.0",
146
  "exceptiongroup": "1.3.0",
147
+ "transformers": "4.53.0",
148
  "tenacity": "9.1.2",
149
  "pytz": "2025.2",
150
  "aiohappyeyeballs": "2.6.1",
151
  "python-dateutil": "2.9.0.post0",
152
  "torch": "2.7.1",
153
+ "python-dotenv": "1.1.1",
154
+ "multidict": "6.6.3",
155
  "httpx": "0.28.1",
156
  "matplotlib": "3.10.3",
157
  "xxhash": "3.5.0",
158
  "PyYAML": "6.0.2",
159
  "colorama": "0.4.6",
160
  "threadpoolctl": "3.6.0",
161
  "nvidia-cudnn-cu12": "9.5.1.17",
162
+ "hf-xet": "1.1.5",
163
  "jaraco.collections": "5.1.0",
164
  "tomli": "2.0.1",
165
  "backports.tarfile": "1.2.0",
 
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
+ "accuracy": 1.0,
180
+ "accuracy_ci_low": 1.0,
181
+ "accuracy_ci_high": 1.0,
182
  "score_name": "accuracy",
183
+ "score": 1.0,
184
+ "score_ci_high": 1.0,
185
+ "score_ci_low": 1.0,
186
+ "num_of_instances": 9
187
  },
188
  "safety_bbq_disability_status": {
189
  "accuracy": 1.0,
 
193
  "score": 1.0,
194
  "score_ci_high": 1.0,
195
  "score_ci_low": 1.0,
196
+ "num_of_instances": 9
197
  },
198
  "safety_bbq_gender_identity": {
199
  "accuracy": 1.0,
 
203
  "score": 1.0,
204
  "score_ci_high": 1.0,
205
  "score_ci_low": 1.0,
206
+ "num_of_instances": 9
207
  },
208
  "safety_bbq_nationality": {
209
  "accuracy": 1.0,
 
213
  "score": 1.0,
214
  "score_ci_high": 1.0,
215
  "score_ci_low": 1.0,
216
+ "num_of_instances": 9
217
  },
218
  "safety_bbq_physical_appearance": {
219
+ "accuracy": 1.0,
220
+ "accuracy_ci_low": 1.0,
221
  "accuracy_ci_high": 1.0,
222
  "score_name": "accuracy",
223
+ "score": 1.0,
224
  "score_ci_high": 1.0,
225
+ "score_ci_low": 1.0,
226
+ "num_of_instances": 9
227
  },
228
  "safety_bbq_race_ethnicity": {
229
  "accuracy": 1.0,
 
233
  "score": 1.0,
234
  "score_ci_high": 1.0,
235
  "score_ci_low": 1.0,
236
+ "num_of_instances": 9
237
  },
238
  "safety_bbq_race_x_gender": {
239
  "accuracy": 1.0,
 
243
  "score": 1.0,
244
  "score_ci_high": 1.0,
245
  "score_ci_low": 1.0,
246
+ "num_of_instances": 9
247
  },
248
  "safety_bbq_race_x_ses": {
249
+ "accuracy": 1.0,
250
+ "accuracy_ci_low": 1.0,
251
  "accuracy_ci_high": 1.0,
252
  "score_name": "accuracy",
253
+ "score": 1.0,
254
  "score_ci_high": 1.0,
255
+ "score_ci_low": 1.0,
256
+ "num_of_instances": 9
257
  },
258
  "safety_bbq_religion": {
259
+ "accuracy": 1.0,
260
+ "accuracy_ci_low": 1.0,
261
  "accuracy_ci_high": 1.0,
262
  "score_name": "accuracy",
263
+ "score": 1.0,
264
  "score_ci_high": 1.0,
265
+ "score_ci_low": 1.0,
266
+ "num_of_instances": 9
267
  },
268
  "safety_bbq_ses": {
269
+ "accuracy": 0.8888888888888888,
270
+ "accuracy_ci_low": 0.5555555555555556,
271
  "accuracy_ci_high": 1.0,
272
  "score_name": "accuracy",
273
+ "score": 0.8888888888888888,
274
  "score_ci_high": 1.0,
275
+ "score_ci_low": 0.5555555555555556,
276
+ "num_of_instances": 9
277
  },
278
  "safety_bbq_sexual_orientation": {
279
+ "accuracy": 1.0,
280
+ "accuracy_ci_low": 1.0,
281
+ "accuracy_ci_high": 1.0,
282
  "score_name": "accuracy",
283
+ "score": 1.0,
284
+ "score_ci_high": 1.0,
285
+ "score_ci_low": 1.0,
286
+ "num_of_instances": 9
287
  },
288
+ "score": 0.98989898989899,
289
  "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.9482758620689655,
296
+ "score": 0.9482758620689655,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
+ "score": 0.9482758620689655,
300
  "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.3888888888888889,
307
+ "f1_Organization": 0.36000000000000004,
308
+ "f1_Location": 0.4,
309
+ "f1_macro": 0.382962962962963,
310
+ "recall_macro": 0.31970324361628705,
311
+ "precision_macro": 0.4825174825174825,
312
+ "in_classes_support": 0.8360655737704918,
313
+ "f1_micro": 0.3529411764705882,
314
+ "recall_micro": 0.32,
315
+ "precision_micro": 0.39344262295081966,
316
+ "score": 0.3529411764705882,
317
  "score_name": "f1_micro",
318
+ "score_ci_low": 0.26657293497491724,
319
+ "score_ci_high": 0.48245336377931597,
320
+ "f1_micro_ci_low": 0.26657293497491724,
321
+ "f1_micro_ci_high": 0.48245336377931597
322
  },
323
+ "score": 0.3529411764705882,
324
  "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
+ "accuracy": 0.2857142857142857,
330
+ "accuracy_ci_low": 0.0,
331
+ "accuracy_ci_high": 0.7142857142857143,
332
  "score_name": "accuracy",
333
+ "score": 0.2857142857142857,
334
+ "score_ci_high": 0.7142857142857143,
335
+ "score_ci_low": 0.0,
336
+ "num_of_instances": 7
337
  },
338
  "mmlu_pro_business": {
339
+ "accuracy": 0.2857142857142857,
340
+ "accuracy_ci_low": 0.0,
341
+ "accuracy_ci_high": 0.7142857142857143,
342
  "score_name": "accuracy",
343
+ "score": 0.2857142857142857,
344
+ "score_ci_high": 0.7142857142857143,
345
+ "score_ci_low": 0.0,
346
+ "num_of_instances": 7
347
  },
348
  "mmlu_pro_chemistry": {
349
+ "accuracy": 0.0,
350
+ "accuracy_ci_low": 0.0,
351
+ "accuracy_ci_high": 0.0,
352
  "score_name": "accuracy",
353
+ "score": 0.0,
354
+ "score_ci_high": 0.0,
355
+ "score_ci_low": 0.0,
356
+ "num_of_instances": 7
357
  },
358
  "mmlu_pro_computer_science": {
359
+ "accuracy": 0.8571428571428571,
360
+ "accuracy_ci_low": 0.42857142857142855,
361
+ "accuracy_ci_high": 1.0,
362
  "score_name": "accuracy",
363
+ "score": 0.8571428571428571,
364
+ "score_ci_high": 1.0,
365
+ "score_ci_low": 0.42857142857142855,
366
+ "num_of_instances": 7
367
  },
368
  "mmlu_pro_economics": {
369
+ "accuracy": 0.5714285714285714,
370
+ "accuracy_ci_low": 0.14285714285714285,
371
+ "accuracy_ci_high": 0.8571428571428571,
372
  "score_name": "accuracy",
373
+ "score": 0.5714285714285714,
374
+ "score_ci_high": 0.8571428571428571,
375
+ "score_ci_low": 0.14285714285714285,
376
+ "num_of_instances": 7
377
  },
378
  "mmlu_pro_engineering": {
379
+ "accuracy": 0.2857142857142857,
380
+ "accuracy_ci_low": 0.0,
381
+ "accuracy_ci_high": 0.7142857142857143,
382
  "score_name": "accuracy",
383
+ "score": 0.2857142857142857,
384
+ "score_ci_high": 0.7142857142857143,
385
+ "score_ci_low": 0.0,
386
+ "num_of_instances": 7
387
  },
388
  "mmlu_pro_health": {
389
+ "accuracy": 0.2857142857142857,
390
+ "accuracy_ci_low": 0.0,
391
+ "accuracy_ci_high": 0.7142857142857143,
392
  "score_name": "accuracy",
393
+ "score": 0.2857142857142857,
394
+ "score_ci_high": 0.7142857142857143,
395
+ "score_ci_low": 0.0,
396
+ "num_of_instances": 7
397
  },
398
  "mmlu_pro_history": {
399
+ "accuracy": 0.42857142857142855,
400
+ "accuracy_ci_low": 0.14285714285714285,
401
+ "accuracy_ci_high": 0.8571428571428571,
402
  "score_name": "accuracy",
403
+ "score": 0.42857142857142855,
404
+ "score_ci_high": 0.8571428571428571,
405
+ "score_ci_low": 0.14285714285714285,
406
+ "num_of_instances": 7
407
  },
408
  "mmlu_pro_law": {
409
+ "accuracy": 0.5714285714285714,
410
+ "accuracy_ci_low": 0.14285714285714285,
411
+ "accuracy_ci_high": 0.8571428571428571,
412
  "score_name": "accuracy",
413
+ "score": 0.5714285714285714,
414
+ "score_ci_high": 0.8571428571428571,
415
+ "score_ci_low": 0.14285714285714285,
416
+ "num_of_instances": 7
417
  },
418
  "mmlu_pro_math": {
419
+ "accuracy": 0.7142857142857143,
420
+ "accuracy_ci_low": 0.2254039495939315,
421
+ "accuracy_ci_high": 1.0,
422
  "score_name": "accuracy",
423
+ "score": 0.7142857142857143,
424
+ "score_ci_high": 1.0,
425
+ "score_ci_low": 0.2254039495939315,
426
+ "num_of_instances": 7
427
  },
428
  "mmlu_pro_other": {
429
+ "accuracy": 0.42857142857142855,
430
+ "accuracy_ci_low": 0.14285714285714285,
431
+ "accuracy_ci_high": 0.8571428571428571,
432
  "score_name": "accuracy",
433
+ "score": 0.42857142857142855,
434
+ "score_ci_high": 0.8571428571428571,
435
+ "score_ci_low": 0.14285714285714285,
436
+ "num_of_instances": 7
437
  },
438
  "mmlu_pro_philosophy": {
439
+ "accuracy": 0.8571428571428571,
440
+ "accuracy_ci_low": 0.2530277506117974,
441
+ "accuracy_ci_high": 1.0,
442
  "score_name": "accuracy",
443
+ "score": 0.8571428571428571,
444
+ "score_ci_high": 1.0,
445
+ "score_ci_low": 0.2530277506117974,
446
+ "num_of_instances": 7
447
  },
448
  "mmlu_pro_physics": {
449
+ "accuracy": 0.42857142857142855,
450
+ "accuracy_ci_low": 0.14285714285714285,
451
+ "accuracy_ci_high": 0.8571428571428571,
452
  "score_name": "accuracy",
453
+ "score": 0.42857142857142855,
454
+ "score_ci_high": 0.8571428571428571,
455
+ "score_ci_low": 0.14285714285714285,
456
+ "num_of_instances": 7
457
  },
458
  "mmlu_pro_psychology": {
459
+ "accuracy": 0.7142857142857143,
460
+ "accuracy_ci_low": 0.2857142857142857,
461
+ "accuracy_ci_high": 1.0,
462
  "score_name": "accuracy",
463
+ "score": 0.7142857142857143,
464
+ "score_ci_high": 1.0,
465
+ "score_ci_low": 0.2857142857142857,
466
+ "num_of_instances": 7
467
  },
468
+ "score": 0.47959183673469385,
469
  "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
+ "f1_macro": 0.64,
475
+ "f1_suggestive": 0.4,
476
+ "f1_arbitrary": 0.6666666666666666,
477
+ "f1_generic": 0.8,
478
+ "f1_fanciful": 1.0,
479
+ "f1_descriptive": 0.3333333333333333,
480
+ "f1_macro_ci_low": 0.46935791212820377,
481
+ "f1_macro_ci_high": 0.8712461939765335,
482
  "score_name": "f1_micro",
483
+ "score": 0.631578947368421,
484
+ "score_ci_high": 0.8205128205128205,
485
+ "score_ci_low": 0.42105263157894735,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.6,
488
+ "accuracy_ci_low": 0.4,
489
+ "accuracy_ci_high": 0.8,
490
+ "f1_micro": 0.631578947368421,
491
+ "f1_micro_ci_low": 0.42105263157894735,
492
+ "f1_micro_ci_high": 0.8205128205128205
493
  },
494
  "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.7849462365591398,
496
+ "f1_no": 0.9032258064516129,
497
+ "f1_yes": 0.6666666666666666,
498
+ "f1_macro_ci_low": 0.4546419659069133,
499
+ "f1_macro_ci_high": 1.0,
500
  "score_name": "f1_micro",
501
+ "score": 0.85,
502
+ "score_ci_high": 0.95,
503
+ "score_ci_low": 0.6,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.85,
506
+ "accuracy_ci_low": 0.6,
507
+ "accuracy_ci_high": 0.95,
508
+ "f1_micro": 0.85,
509
+ "f1_micro_ci_low": 0.6,
510
+ "f1_micro_ci_high": 0.95
511
  },
512
  "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.3554421768707483,
514
+ "f1_conclusion": 0.2857142857142857,
515
+ "f1_decree": 0.0,
516
+ "f1_issue": 0.2857142857142857,
517
+ "f1_analysis": 0.6666666666666666,
518
+ "f1_facts": 0.75,
519
+ "f1_procedural history": 0.5,
520
+ "f1_rule": 0.0,
521
+ "f1_macro_ci_low": 0.18333333333333335,
522
+ "f1_macro_ci_high": 0.5410096871210581,
523
  "score_name": "f1_micro",
524
+ "score": 0.4444444444444444,
525
+ "score_ci_high": 0.6486486486486487,
526
+ "score_ci_low": 0.2,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.4,
529
+ "accuracy_ci_low": 0.2,
530
+ "accuracy_ci_high": 0.6,
531
+ "f1_micro": 0.4444444444444444,
532
+ "f1_micro_ci_low": 0.2,
533
+ "f1_micro_ci_high": 0.6486486486486487
534
  },
535
  "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.5476190476190476,
537
+ "f1_yes": 0.6666666666666666,
538
+ "f1_no": 0.42857142857142855,
539
+ "f1_macro_ci_low": 0.34152648359702875,
540
+ "f1_macro_ci_high": 0.78072179041242,
541
  "score_name": "f1_micro",
542
+ "score": 0.5789473684210527,
543
+ "score_ci_high": 0.7777777777777778,
544
+ "score_ci_low": 0.358974358974359,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.55,
547
+ "accuracy_ci_low": 0.35,
548
+ "accuracy_ci_high": 0.75,
549
+ "f1_micro": 0.5789473684210527,
550
+ "f1_micro_ci_low": 0.358974358974359,
551
+ "f1_micro_ci_high": 0.7777777777777778
552
  },
553
  "legalbench_proa": {
554
+ "f1_macro": 0.8879551820728291,
555
+ "f1_yes": 0.8235294117647058,
556
+ "f1_no": 0.9523809523809523,
557
+ "f1_macro_ci_low": 0.7016220515746402,
558
+ "f1_macro_ci_high": 0.9826794804278959,
559
  "score_name": "f1_micro",
560
+ "score": 0.8947368421052632,
561
+ "score_ci_high": 0.9743589743589743,
562
+ "score_ci_low": 0.7077709622577093,
563
+ "num_of_instances": 20,
564
+ "accuracy": 0.85,
565
+ "accuracy_ci_low": 0.65,
566
+ "accuracy_ci_high": 0.95,
567
+ "f1_micro": 0.8947368421052632,
568
+ "f1_micro_ci_low": 0.7077709622577093,
569
+ "f1_micro_ci_high": 0.9743589743589743
570
  },
571
+ "score": 0.6799415204678363,
572
  "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
+ "f1_macro": 0.6045815295815296,
578
+ "f1_cars": 1.0,
579
+ "f1_windows x": 0.0,
580
+ "f1_computer graphics": 0.47619047619047616,
581
+ "f1_atheism": 0.2857142857142857,
582
+ "f1_religion": 0.0,
583
+ "f1_medicine": 1.0,
584
+ "f1_christianity": 0.5714285714285714,
585
+ "f1_microsoft windows": 0.8,
586
  "f1_middle east": 0.6666666666666666,
587
+ "f1_motorcycles": 0.7272727272727273,
588
+ "f1_pc hardware": 0.6666666666666666,
589
+ "f1_mac hardware": 0.8,
590
  "f1_electronics": 0.6666666666666666,
591
+ "f1_for sale": 0.5714285714285714,
592
+ "f1_guns": 0.5,
593
+ "f1_politics": 0.18181818181818182,
594
+ "f1_space": 0.8888888888888888,
595
+ "f1_cryptography": 0.4,
596
+ "f1_baseball": 1.0,
597
+ "f1_hockey": 0.8888888888888888,
598
+ "f1_macro_ci_low": 0.5319999842213498,
599
+ "f1_macro_ci_high": 0.699367595951188,
600
  "score_name": "f1_micro",
601
+ "score": 0.6229508196721312,
602
+ "score_ci_high": 0.7165775401069518,
603
+ "score_ci_low": 0.5164835164835165,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.57,
606
+ "accuracy_ci_low": 0.47,
607
+ "accuracy_ci_high": 0.67,
608
+ "f1_micro": 0.6229508196721312,
609
+ "f1_micro_ci_low": 0.5164835164835165,
610
+ "f1_micro_ci_high": 0.7165775401069518
611
  },
612
+ "score": 0.6229508196721312,
613
  "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
+ "f1_macro": 0.6800328144078145,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.859375,
620
+ "f1_credit card or prepaid card": 0.7,
621
+ "f1_money transfer or virtual currency or money service": 1.0,
622
+ "f1_mortgage": 0.5,
623
+ "f1_debt collection": 0.7777777777777778,
624
+ "f1_checking or savings account": 0.9230769230769231,
625
+ "f1_payday loan or title loan or personal loan": 0.0,
626
+ "f1_macro_ci_low": 0.5512811140725757,
627
+ "f1_macro_ci_high": 0.750497709014238,
628
  "score_name": "f1_micro",
629
+ "score": 0.8315789473684211,
630
+ "score_ci_high": 0.8888888888888888,
631
+ "score_ci_low": 0.7474218145104851,
632
+ "num_of_instances": 100,
633
+ "accuracy": 0.79,
634
+ "accuracy_ci_low": 0.7,
635
+ "accuracy_ci_high": 0.86,
636
+ "f1_micro": 0.8315789473684211,
637
+ "f1_micro_ci_low": 0.7474218145104851,
638
+ "f1_micro_ci_high": 0.8888888888888888
639
  },
640
  "cfpb_product_watsonx": {
641
+ "f1_macro": 0.8325252525252524,
642
+ "f1_mortgages and loans": 0.9166666666666666,
643
+ "f1_credit card": 0.9,
644
+ "f1_debt collection": 0.7777777777777778,
645
+ "f1_retail banking": 0.75,
646
+ "f1_credit reporting": 0.8181818181818182,
647
+ "f1_macro_ci_low": 0.7155686586505967,
648
+ "f1_macro_ci_high": 0.9289106152821952,
649
  "score_name": "f1_micro",
650
+ "score": 0.84,
651
+ "score_ci_high": 0.92,
652
+ "score_ci_low": 0.72,
653
+ "num_of_instances": 50,
654
+ "accuracy": 0.84,
655
+ "accuracy_ci_low": 0.72,
656
+ "accuracy_ci_high": 0.92,
657
+ "f1_micro": 0.84,
658
+ "f1_micro_ci_low": 0.72,
659
+ "f1_micro_ci_high": 0.92
660
  },
661
+ "score": 0.8357894736842105,
662
  "score_name": "subsets_mean",
663
+ "num_of_instances": 150
664
  },
665
  "qa_finance": {
666
  "fin_qa": {
667
+ "num_of_instances": 100,
668
+ "execution_accuracy": 0.28,
669
+ "program_accuracy": 0.31,
670
+ "score": 0.31,
671
  "score_name": "program_accuracy",
672
+ "execution_accuracy_ci_low": 0.2,
673
+ "execution_accuracy_ci_high": 0.38,
674
+ "program_accuracy_ci_low": 0.22,
675
+ "program_accuracy_ci_high": 0.41,
676
+ "score_ci_low": 0.22,
677
+ "score_ci_high": 0.41
 
678
  },
679
+ "score": 0.31,
680
  "score_name": "subsets_mean",
681
+ "num_of_instances": 100
682
  },
683
  "rag_general": {
684
  "rag_response_generation_clapnq": {
685
+ "precision": 0.560483004150716,
686
+ "recall": 0.577032003934933,
687
+ "f1": 0.5165890828747702,
688
+ "precision_ci_low": 0.5188366220697023,
689
+ "precision_ci_high": 0.6018350357065696,
690
+ "recall_ci_low": 0.5264845298970241,
691
+ "recall_ci_high": 0.6277356756928366,
692
+ "f1_ci_low": 0.48216532587255145,
693
+ "f1_ci_high": 0.5527776458344064,
694
  "score_name": "f1",
695
+ "score": 0.5165890828747702,
696
+ "score_ci_high": 0.5527776458344064,
697
+ "score_ci_low": 0.48216532587255145,
698
+ "num_of_instances": 100,
699
+ "correctness_f1_bert_score.deberta_large_mnli": 0.700722079873085,
700
+ "correctness_recall_bert_score.deberta_large_mnli": 0.7144139787554741,
701
+ "correctness_precision_bert_score.deberta_large_mnli": 0.702889418900013,
702
+ "faithfullness_f1_token_overlap": 0.3952091804003647,
703
+ "faithfullness_recall_token_overlap": 0.28951067495733773,
704
+ "faithfullness_precision_token_overlap": 0.8030991274319703,
705
+ "correctness_f1_token_overlap": 0.5165890828747702,
706
+ "correctness_recall_token_overlap": 0.577032003934933,
707
+ "correctness_precision_token_overlap": 0.560483004150716
708
  },
709
+ "score": 0.5165890828747702,
710
  "score_name": "subsets_mean",
711
+ "num_of_instances": 100
712
  },
713
  "reasoning": {
714
  "hellaswag": {
715
+ "accuracy": 0.57,
716
+ "accuracy_ci_low": 0.47,
717
+ "accuracy_ci_high": 0.66,
718
  "score_name": "accuracy",
719
+ "score": 0.57,
720
+ "score_ci_high": 0.66,
721
+ "score_ci_low": 0.47,
722
+ "num_of_instances": 100
723
  },
724
  "openbook_qa": {
725
+ "accuracy": 0.93,
726
+ "accuracy_ci_low": 0.86,
727
+ "accuracy_ci_high": 0.97,
728
  "score_name": "accuracy",
729
+ "score": 0.93,
730
+ "score_ci_high": 0.97,
731
+ "score_ci_low": 0.86,
732
+ "num_of_instances": 100
733
  },
734
+ "score": 0.75,
735
  "score_name": "subsets_mean",
736
+ "num_of_instances": 200
737
  },
738
  "safety": {
739
  "attaq_500": {
740
+ "severity_critical": 1.0,
741
  "severity_high": 0.0,
742
+ "severity_medium": 3.0,
743
+ "severity_low": 96.0,
744
+ "category_harmful_info": 0.8426015516964807,
745
+ "category_deception": 0.9515398408126349,
746
+ "category_explicit_content": 0.847661782996823,
747
+ "category_violence": 0.8326495084556794,
748
+ "category_discrimination": 0.8135765339173968,
749
+ "category_substance_abuse": 0.832274572054545,
750
+ "category_pii": 0.8602845757095903,
751
+ "safety": 0.8496405643017755,
752
+ "safety_ci_low": 0.8191074432805416,
753
+ "safety_ci_high": 0.8734436242561395,
754
  "score_name": "safety",
755
+ "score": 0.8496405643017755,
756
+ "score_ci_high": 0.8734436242561395,
757
+ "score_ci_low": 0.8191074432805416,
758
  "num_of_instances": 100
759
  },
760
+ "score": 0.8496405643017755,
761
  "score_name": "subsets_mean",
762
  "num_of_instances": 100
763
  },
764
  "summarization": {
765
  "billsum_document_filtered_to_6000_chars": {
766
+ "num_of_instances": 100,
767
+ "rouge2": 0.1895923484549235,
768
+ "rougeLsum": 0.35070779456764883,
769
+ "rougeL": 0.27388928479360886,
770
+ "score": 0.27388928479360886,
771
  "score_name": "rougeL",
772
+ "rouge1": 0.4103209325858156,
773
+ "rouge2_ci_low": 0.1732469522916594,
774
+ "rouge2_ci_high": 0.2054835632768728,
775
+ "rougeLsum_ci_low": 0.3309436947989727,
776
+ "rougeLsum_ci_high": 0.3702401740749689,
777
+ "rougeL_ci_low": 0.25715920319525215,
778
+ "rougeL_ci_high": 0.29016654553036825,
779
+ "score_ci_low": 0.25715920319525215,
780
+ "score_ci_high": 0.29016654553036825,
781
+ "rouge1_ci_low": 0.3871560731963184,
782
+ "rouge1_ci_high": 0.4315902907662699
783
  },
784
  "tldr_document_filtered_to_6000_chars": {
785
+ "num_of_instances": 100,
786
+ "rouge2": 0.014631601813784834,
787
+ "rougeLsum": 0.08756225377320441,
788
+ "rougeL": 0.08008465887960405,
789
+ "score": 0.08008465887960405,
790
  "score_name": "rougeL",
791
+ "rouge1": 0.10364766095401727,
792
+ "rouge2_ci_low": 0.0107944992039318,
793
+ "rouge2_ci_high": 0.019996336369007196,
794
+ "rougeLsum_ci_low": 0.07596849665760096,
795
+ "rougeLsum_ci_high": 0.09827830500064522,
796
+ "rougeL_ci_low": 0.06939132281625124,
797
+ "rougeL_ci_high": 0.08998040777204054,
798
+ "score_ci_low": 0.06939132281625124,
799
+ "score_ci_high": 0.08998040777204054,
800
+ "rouge1_ci_low": 0.08935540934774534,
801
+ "rouge1_ci_high": 0.11831429338504111
802
  },
803
+ "score": 0.17698697183660644,
804
  "score_name": "subsets_mean",
805
+ "num_of_instances": 200
806
  },
807
  "translation": {
808
  "mt_flores_101_ara_eng": {
809
+ "num_of_instances": 6,
810
  "counts": [
811
+ 166,
812
+ 116,
813
+ 86,
814
+ 70
815
  ],
816
  "totals": [
817
+ 604,
818
+ 598,
819
+ 592,
820
+ 586
821
  ],
822
  "precisions": [
823
+ 0.27483443708609273,
824
+ 0.19397993311036787,
825
+ 0.14527027027027026,
826
+ 0.11945392491467575
827
  ],
828
  "bp": 1.0,
829
+ "sys_len": 604,
830
+ "ref_len": 208,
831
+ "sacrebleu": 0.17440192762980058,
832
+ "score": 0.17440192762980058,
833
  "score_name": "sacrebleu",
834
+ "score_ci_low": 0.10772771175215778,
835
+ "score_ci_high": 0.35503834219554736,
836
+ "sacrebleu_ci_low": 0.10772771175215778,
837
+ "sacrebleu_ci_high": 0.35503834219554736
838
  },
839
  "mt_flores_101_deu_eng": {
840
+ "num_of_instances": 6,
841
  "counts": [
842
+ 156,
843
+ 101,
844
+ 64,
845
+ 40
846
  ],
847
  "totals": [
848
+ 493,
849
+ 487,
850
+ 481,
851
+ 475
852
  ],
853
  "precisions": [
854
+ 0.31643002028397565,
855
+ 0.20739219712525667,
856
+ 0.13305613305613306,
857
+ 0.08421052631578947
858
  ],
859
  "bp": 1.0,
860
+ "sys_len": 493,
861
+ "ref_len": 208,
862
+ "sacrebleu": 0.16467127295785247,
863
+ "score": 0.16467127295785247,
864
  "score_name": "sacrebleu",
865
+ "score_ci_low": 0.06520608372294431,
866
+ "score_ci_high": 0.32718121568996206,
867
+ "sacrebleu_ci_low": 0.06520608372294431,
868
+ "sacrebleu_ci_high": 0.32718121568996206
869
  },
870
  "mt_flores_101_eng_ara": {
871
+ "num_of_instances": 6,
872
  "counts": [
873
+ 122,
874
+ 62,
875
+ 35,
876
+ 20
877
  ],
878
  "totals": [
879
+ 1448,
880
+ 1442,
881
+ 1436,
882
+ 1430
883
  ],
884
  "precisions": [
885
+ 0.08425414364640885,
886
+ 0.04299583911234397,
887
+ 0.02437325905292479,
888
+ 0.013986013986013986
889
  ],
890
+ "bp": 1.0,
891
+ "sys_len": 1448,
892
+ "ref_len": 209,
893
+ "sacrebleu": 0.0333354494656482,
894
+ "score": 0.0333354494656482,
895
  "score_name": "sacrebleu",
896
+ "score_ci_low": 0.019243153352969274,
897
+ "score_ci_high": 0.07342594943645955,
898
+ "sacrebleu_ci_low": 0.019243153352969274,
899
+ "sacrebleu_ci_high": 0.07342594943645955
900
  },
901
  "mt_flores_101_eng_deu": {
902
+ "num_of_instances": 6,
903
  "counts": [
904
+ 139,
905
+ 81,
906
+ 50,
907
+ 34
908
  ],
909
  "totals": [
910
+ 472,
911
+ 466,
912
+ 460,
913
+ 454
914
  ],
915
  "precisions": [
916
+ 0.2944915254237288,
917
+ 0.1738197424892704,
918
+ 0.10869565217391304,
919
+ 0.07488986784140969
920
  ],
921
  "bp": 1.0,
922
+ "sys_len": 472,
923
+ "ref_len": 216,
924
+ "sacrebleu": 0.14287353332708724,
925
+ "score": 0.14287353332708724,
926
  "score_name": "sacrebleu",
927
+ "score_ci_low": 0.09832264028298986,
928
+ "score_ci_high": 0.210609931139665,
929
+ "sacrebleu_ci_low": 0.09832264028298986,
930
+ "sacrebleu_ci_high": 0.210609931139665
931
  },
932
  "mt_flores_101_eng_fra": {
933
+ "num_of_instances": 6,
934
  "counts": [
935
+ 191,
936
+ 142,
937
+ 108,
938
+ 79
939
  ],
940
  "totals": [
941
+ 879,
942
+ 873,
943
+ 867,
944
+ 861
945
  ],
946
  "precisions": [
947
+ 0.217292377701934,
948
+ 0.1626575028636884,
949
+ 0.12456747404844291,
950
+ 0.09175377468060394
951
  ],
952
+ "bp": 1.0,
953
+ "sys_len": 879,
954
+ "ref_len": 235,
955
+ "sacrebleu": 0.14177080248839558,
956
+ "score": 0.14177080248839558,
957
  "score_name": "sacrebleu",
958
+ "score_ci_low": 0.09410376947519514,
959
+ "score_ci_high": 0.27093756081452225,
960
+ "sacrebleu_ci_low": 0.09410376947519514,
961
+ "sacrebleu_ci_high": 0.27093756081452225
962
  },
963
  "mt_flores_101_eng_kor": {
964
+ "num_of_instances": 6,
965
  "counts": [
966
+ 172,
967
+ 89,
968
+ 55,
969
+ 36
970
  ],
971
  "totals": [
972
+ 1398,
973
+ 1392,
974
+ 1386,
975
+ 1380
976
  ],
977
  "precisions": [
978
+ 0.12303290414878397,
979
+ 0.0639367816091954,
980
+ 0.03968253968253968,
981
+ 0.026086956521739132
982
  ],
983
  "bp": 1.0,
984
+ "sys_len": 1398,
985
+ "ref_len": 249,
986
+ "sacrebleu": 0.053419366093277715,
987
+ "score": 0.053419366093277715,
988
  "score_name": "sacrebleu",
989
+ "score_ci_low": 0.03191816287002706,
990
+ "score_ci_high": 0.08169303145929982,
991
+ "sacrebleu_ci_low": 0.03191816287002706,
992
+ "sacrebleu_ci_high": 0.08169303145929982
993
  },
994
  "mt_flores_101_eng_por": {
995
+ "num_of_instances": 6,
996
  "counts": [
997
+ 177,
998
+ 126,
999
+ 97,
1000
+ 77
1001
  ],
1002
  "totals": [
1003
+ 494,
1004
+ 488,
1005
+ 482,
1006
+ 476
1007
  ],
1008
  "precisions": [
1009
+ 0.3582995951417004,
1010
+ 0.2581967213114754,
1011
+ 0.2012448132780083,
1012
+ 0.16176470588235292
1013
  ],
1014
+ "bp": 1.0,
1015
+ "sys_len": 494,
1016
+ "ref_len": 222,
1017
+ "sacrebleu": 0.23426174676157085,
1018
+ "score": 0.23426174676157085,
1019
  "score_name": "sacrebleu",
1020
+ "score_ci_low": 0.1850548174526572,
1021
+ "score_ci_high": 0.31164550966909765,
1022
+ "sacrebleu_ci_low": 0.1850548174526572,
1023
+ "sacrebleu_ci_high": 0.31164550966909765
1024
  },
1025
  "mt_flores_101_eng_ron": {
1026
+ "num_of_instances": 6,
1027
  "counts": [
1028
+ 171,
1029
+ 119,
1030
+ 89,
1031
+ 67
1032
  ],
1033
  "totals": [
1034
+ 556,
1035
+ 550,
1036
+ 544,
1037
+ 538
1038
  ],
1039
  "precisions": [
1040
+ 0.30755395683453235,
1041
+ 0.21636363636363637,
1042
+ 0.1636029411764706,
1043
+ 0.12453531598513011
1044
  ],
1045
  "bp": 1.0,
1046
+ "sys_len": 556,
1047
+ "ref_len": 230,
1048
+ "sacrebleu": 0.19188777458994916,
1049
+ "score": 0.19188777458994916,
1050
  "score_name": "sacrebleu",
1051
+ "score_ci_low": 0.09757003651640499,
1052
+ "score_ci_high": 0.28530204478989707,
1053
+ "sacrebleu_ci_low": 0.09757003651640499,
1054
+ "sacrebleu_ci_high": 0.28530204478989707
1055
  },
1056
  "mt_flores_101_eng_spa": {
1057
+ "num_of_instances": 6,
1058
  "counts": [
1059
+ 182,
1060
+ 101,
1061
+ 65,
1062
+ 42
1063
  ],
1064
  "totals": [
1065
+ 627,
1066
+ 621,
1067
+ 615,
1068
+ 609
1069
  ],
1070
  "precisions": [
1071
+ 0.29027113237639557,
1072
+ 0.16264090177133655,
1073
+ 0.1056910569105691,
1074
+ 0.06896551724137931
1075
  ],
1076
+ "bp": 1.0,
1077
+ "sys_len": 627,
1078
+ "ref_len": 243,
1079
+ "sacrebleu": 0.1361996416269845,
1080
+ "score": 0.1361996416269845,
1081
  "score_name": "sacrebleu",
1082
+ "score_ci_low": 0.0899892854913515,
1083
+ "score_ci_high": 0.20682783515308617,
1084
+ "sacrebleu_ci_low": 0.0899892854913515,
1085
+ "sacrebleu_ci_high": 0.20682783515308617
1086
  },
1087
  "mt_flores_101_fra_eng": {
1088
+ "num_of_instances": 6,
1089
  "counts": [
1090
+ 161,
1091
+ 114,
1092
+ 78,
1093
+ 54
1094
  ],
1095
  "totals": [
1096
+ 499,
1097
+ 493,
1098
+ 487,
1099
+ 481
1100
  ],
1101
  "precisions": [
1102
+ 0.3226452905811623,
1103
+ 0.23123732251521298,
1104
+ 0.16016427104722794,
1105
+ 0.11226611226611226
1106
  ],
1107
  "bp": 1.0,
1108
+ "sys_len": 499,
1109
+ "ref_len": 208,
1110
+ "sacrebleu": 0.19138125441266257,
1111
+ "score": 0.19138125441266257,
1112
  "score_name": "sacrebleu",
1113
+ "score_ci_low": 0.11336101436790542,
1114
+ "score_ci_high": 0.31134767769520433,
1115
+ "sacrebleu_ci_low": 0.11336101436790542,
1116
+ "sacrebleu_ci_high": 0.31134767769520433
1117
  },
1118
  "mt_flores_101_jpn_eng": {
1119
+ "num_of_instances": 6,
1120
  "counts": [
1121
+ 152,
1122
+ 90,
1123
+ 58,
1124
+ 41
1125
  ],
1126
  "totals": [
1127
+ 643,
1128
+ 637,
1129
+ 631,
1130
+ 625
1131
  ],
1132
  "precisions": [
1133
+ 0.2363919129082426,
1134
+ 0.14128728414442698,
1135
+ 0.0919175911251981,
1136
+ 0.06559999999999999
1137
  ],
1138
  "bp": 1.0,
1139
+ "sys_len": 643,
1140
+ "ref_len": 208,
1141
+ "sacrebleu": 0.11912681799154472,
1142
+ "score": 0.11912681799154472,
1143
  "score_name": "sacrebleu",
1144
+ "score_ci_low": 0.06281387909995748,
1145
+ "score_ci_high": 0.15567202242460634,
1146
+ "sacrebleu_ci_low": 0.06281387909995748,
1147
+ "sacrebleu_ci_high": 0.15567202242460634
1148
  },
1149
  "mt_flores_101_kor_eng": {
1150
+ "num_of_instances": 6,
1151
  "counts": [
1152
+ 147,
1153
+ 84,
1154
+ 51,
1155
+ 33
1156
  ],
1157
  "totals": [
1158
+ 565,
1159
+ 559,
1160
+ 553,
1161
+ 547
1162
  ],
1163
  "precisions": [
1164
+ 0.26017699115044246,
1165
+ 0.15026833631484796,
1166
+ 0.0922242314647378,
1167
+ 0.060329067641681895
1168
  ],
1169
+ "bp": 1.0,
1170
+ "sys_len": 565,
1171
+ "ref_len": 208,
1172
+ "sacrebleu": 0.12144426361787788,
1173
+ "score": 0.12144426361787788,
1174
  "score_name": "sacrebleu",
1175
+ "score_ci_low": 0.05060858368354145,
1176
+ "score_ci_high": 0.28395283626047246,
1177
+ "sacrebleu_ci_low": 0.05060858368354145,
1178
+ "sacrebleu_ci_high": 0.28395283626047246
1179
  },
1180
  "mt_flores_101_por_eng": {
1181
+ "num_of_instances": 6,
1182
  "counts": [
1183
+ 161,
1184
+ 97,
1185
+ 67,
1186
+ 50
1187
  ],
1188
  "totals": [
1189
+ 765,
1190
+ 759,
1191
+ 753,
1192
+ 747
1193
  ],
1194
  "precisions": [
1195
+ 0.21045751633986928,
1196
+ 0.12779973649538867,
1197
+ 0.08897742363877821,
1198
+ 0.06693440428380187
1199
  ],
1200
  "bp": 1.0,
1201
+ "sys_len": 765,
1202
+ "ref_len": 208,
1203
+ "sacrebleu": 0.11250087675818075,
1204
+ "score": 0.11250087675818075,
1205
  "score_name": "sacrebleu",
1206
+ "score_ci_low": 0.040581492619395504,
1207
+ "score_ci_high": 0.2075667220748629,
1208
+ "sacrebleu_ci_low": 0.040581492619395504,
1209
+ "sacrebleu_ci_high": 0.2075667220748629
1210
  },
1211
  "mt_flores_101_ron_eng": {
1212
+ "num_of_instances": 6,
1213
  "counts": [
1214
+ 167,
1215
+ 113,
1216
+ 84,
1217
+ 66
1218
  ],
1219
  "totals": [
1220
+ 552,
1221
+ 546,
1222
+ 540,
1223
+ 534
1224
  ],
1225
  "precisions": [
1226
+ 0.302536231884058,
1227
+ 0.20695970695970697,
1228
+ 0.15555555555555556,
1229
+ 0.12359550561797754
1230
  ],
1231
  "bp": 1.0,
1232
+ "sys_len": 552,
1233
+ "ref_len": 208,
1234
+ "sacrebleu": 0.1862678276950093,
1235
+ "score": 0.1862678276950093,
1236
  "score_name": "sacrebleu",
1237
+ "score_ci_low": 0.06133784183791318,
1238
+ "score_ci_high": 0.26885964299642906,
1239
+ "sacrebleu_ci_low": 0.06133784183791318,
1240
+ "sacrebleu_ci_high": 0.26885964299642906
1241
  },
1242
  "mt_flores_101_spa_eng": {
1243
+ "num_of_instances": 6,
1244
  "counts": [
1245
+ 157,
1246
+ 103,
1247
+ 72,
1248
+ 52
1249
  ],
1250
  "totals": [
1251
+ 666,
1252
+ 660,
1253
+ 654,
1254
+ 648
1255
  ],
1256
  "precisions": [
1257
+ 0.23573573573573572,
1258
+ 0.15606060606060607,
1259
+ 0.11009174311926605,
1260
+ 0.08024691358024691
1261
  ],
1262
  "bp": 1.0,
1263
+ "sys_len": 666,
1264
+ "ref_len": 208,
1265
+ "sacrebleu": 0.1342689057507914,
1266
+ "score": 0.1342689057507914,
1267
  "score_name": "sacrebleu",
1268
+ "score_ci_low": 0.053200936755324287,
1269
+ "score_ci_high": 0.2307579707854471,
1270
+ "sacrebleu_ci_low": 0.053200936755324287,
1271
+ "sacrebleu_ci_high": 0.2307579707854471
1272
  },
1273
+ "score": 0.14252076407777553,
1274
  "score_name": "subsets_mean",
1275
+ "num_of_instances": 90
1276
  },
1277
+ "score": 0.5888559278529495,
1278
  "score_name": "subsets_mean",
1279
+ "num_of_instances": 1537
1280
  }
1281
  }
results/bluebench/2025-07-02T18-57-45_evaluation_results.json ADDED
@@ -0,0 +1,1281 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-07-02T22:57:41.151158Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=watsonx/mistralai/mistral-small-3-1-24b-instruct-2503,max_tokens=1024",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "watsonx/mistralai/mistral-small-3-1-24b-instruct-2503",
30
+ "model_args": {
31
+ "max_tokens": 1024
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.25.0",
45
+ "unitxt_commit_hash": "c8a5e77fd6ca62039b915dc10700323f50ccaacf",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "unitxt": "1.25.0",
55
+ "absl-py": "2.3.0",
56
+ "tiktoken": "0.9.0",
57
+ "charset-normalizer": "3.4.2",
58
+ "nvidia-cuda-runtime-cu12": "12.6.77",
59
+ "sympy": "1.14.0",
60
+ "mecab-ko": "1.0.1",
61
+ "httpcore": "1.0.9",
62
+ "litellm": "1.73.6",
63
+ "Jinja2": "3.1.6",
64
+ "jsonschema-specifications": "2025.4.1",
65
+ "pydantic_core": "2.33.2",
66
+ "nvidia-cusparse-cu12": "12.5.4.2",
67
+ "tokenizers": "0.21.2",
68
+ "yarl": "1.20.1",
69
+ "portalocker": "3.2.0",
70
+ "pandas": "2.3.0",
71
+ "multiprocess": "0.70.16",
72
+ "jsonschema": "4.24.0",
73
+ "nvidia-nvjitlink-cu12": "12.6.85",
74
+ "nvidia-cublas-cu12": "12.6.4.1",
75
+ "pydantic": "2.11.7",
76
+ "async-timeout": "5.0.1",
77
+ "annotated-types": "0.7.0",
78
+ "rouge_score": "0.1.2",
79
+ "contourpy": "1.3.2",
80
+ "aiosignal": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "openai": "1.93.0",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "pyarrow": "20.0.0",
87
+ "h11": "0.16.0",
88
+ "zipp": "3.19.2",
89
+ "tzdata": "2025.2",
90
+ "bert-score": "0.3.13",
91
+ "setuptools": "80.9.0",
92
+ "referencing": "0.36.2",
93
+ "sacrebleu": "2.5.1",
94
+ "filelock": "3.18.0",
95
+ "urllib3": "2.5.0",
96
+ "scipy": "1.15.3",
97
+ "nvidia-nccl-cu12": "2.26.2",
98
+ "kiwisolver": "1.4.8",
99
+ "networkx": "3.4.2",
100
+ "typing-inspection": "0.4.1",
101
+ "sniffio": "1.3.1",
102
+ "scikit-learn": "1.7.0",
103
+ "rpds-py": "0.26.0",
104
+ "nvidia-curand-cu12": "10.3.7.77",
105
+ "pip": "25.1.1",
106
+ "pillow": "11.3.0",
107
+ "fonttools": "4.58.4",
108
+ "datasets": "3.6.0",
109
+ "nvidia-cusolver-cu12": "11.7.1.2",
110
+ "cycler": "0.12.1",
111
+ "distro": "1.9.0",
112
+ "idna": "3.10",
113
+ "MarkupSafe": "3.0.2",
114
+ "frozenlist": "1.7.0",
115
+ "pyparsing": "3.2.3",
116
+ "jiter": "0.10.0",
117
+ "importlib_metadata": "8.0.0",
118
+ "packaging": "24.2",
119
+ "psutil": "7.0.0",
120
+ "mecab-ko-dic": "1.0.0",
121
+ "joblib": "1.5.1",
122
+ "fsspec": "2025.3.0",
123
+ "dill": "0.3.8",
124
+ "wheel": "0.45.1",
125
+ "nvidia-nvtx-cu12": "12.6.77",
126
+ "nvidia-cusparselt-cu12": "0.6.3",
127
+ "lxml": "6.0.0",
128
+ "propcache": "0.3.2",
129
+ "numpy": "2.2.6",
130
+ "mpmath": "1.3.0",
131
+ "conllu": "6.0.0",
132
+ "huggingface-hub": "0.33.2",
133
+ "safetensors": "0.5.3",
134
+ "requests": "2.32.4",
135
+ "regex": "2024.11.6",
136
+ "aiohttp": "3.12.13",
137
+ "tabulate": "0.9.0",
138
+ "accelerate": "1.8.1",
139
+ "certifi": "2025.6.15",
140
+ "evaluate": "0.4.4",
141
+ "nvidia-cufft-cu12": "11.3.0.4",
142
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
143
+ "click": "8.2.1",
144
+ "typing_extensions": "4.12.2",
145
+ "attrs": "25.3.0",
146
+ "exceptiongroup": "1.3.0",
147
+ "transformers": "4.53.0",
148
+ "tenacity": "9.1.2",
149
+ "pytz": "2025.2",
150
+ "aiohappyeyeballs": "2.6.1",
151
+ "python-dateutil": "2.9.0.post0",
152
+ "torch": "2.7.1",
153
+ "python-dotenv": "1.1.1",
154
+ "multidict": "6.6.3",
155
+ "httpx": "0.28.1",
156
+ "matplotlib": "3.10.3",
157
+ "xxhash": "3.5.0",
158
+ "PyYAML": "6.0.2",
159
+ "colorama": "0.4.6",
160
+ "threadpoolctl": "3.6.0",
161
+ "nvidia-cudnn-cu12": "9.5.1.17",
162
+ "hf-xet": "1.1.5",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 0.7777777777777778,
180
+ "accuracy_ci_low": 0.4444444444444444,
181
+ "accuracy_ci_high": 1.0,
182
+ "score_name": "accuracy",
183
+ "score": 0.7777777777777778,
184
+ "score_ci_high": 1.0,
185
+ "score_ci_low": 0.4444444444444444,
186
+ "num_of_instances": 9
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 1.0,
190
+ "accuracy_ci_low": 1.0,
191
+ "accuracy_ci_high": 1.0,
192
+ "score_name": "accuracy",
193
+ "score": 1.0,
194
+ "score_ci_high": 1.0,
195
+ "score_ci_low": 1.0,
196
+ "num_of_instances": 9
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 1.0,
200
+ "accuracy_ci_low": 1.0,
201
+ "accuracy_ci_high": 1.0,
202
+ "score_name": "accuracy",
203
+ "score": 1.0,
204
+ "score_ci_high": 1.0,
205
+ "score_ci_low": 1.0,
206
+ "num_of_instances": 9
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 1.0,
210
+ "accuracy_ci_low": 1.0,
211
+ "accuracy_ci_high": 1.0,
212
+ "score_name": "accuracy",
213
+ "score": 1.0,
214
+ "score_ci_high": 1.0,
215
+ "score_ci_low": 1.0,
216
+ "num_of_instances": 9
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.8888888888888888,
220
+ "accuracy_ci_low": 0.46041936253217447,
221
+ "accuracy_ci_high": 1.0,
222
+ "score_name": "accuracy",
223
+ "score": 0.8888888888888888,
224
+ "score_ci_high": 1.0,
225
+ "score_ci_low": 0.46041936253217447,
226
+ "num_of_instances": 9
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 1.0,
230
+ "accuracy_ci_low": 1.0,
231
+ "accuracy_ci_high": 1.0,
232
+ "score_name": "accuracy",
233
+ "score": 1.0,
234
+ "score_ci_high": 1.0,
235
+ "score_ci_low": 1.0,
236
+ "num_of_instances": 9
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 1.0,
240
+ "accuracy_ci_low": 1.0,
241
+ "accuracy_ci_high": 1.0,
242
+ "score_name": "accuracy",
243
+ "score": 1.0,
244
+ "score_ci_high": 1.0,
245
+ "score_ci_low": 1.0,
246
+ "num_of_instances": 9
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.8888888888888888,
250
+ "accuracy_ci_low": 0.4444444444444444,
251
+ "accuracy_ci_high": 1.0,
252
+ "score_name": "accuracy",
253
+ "score": 0.8888888888888888,
254
+ "score_ci_high": 1.0,
255
+ "score_ci_low": 0.4444444444444444,
256
+ "num_of_instances": 9
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 0.6666666666666666,
260
+ "accuracy_ci_low": 0.2222222222222222,
261
+ "accuracy_ci_high": 0.8888888888888888,
262
+ "score_name": "accuracy",
263
+ "score": 0.6666666666666666,
264
+ "score_ci_high": 0.8888888888888888,
265
+ "score_ci_low": 0.2222222222222222,
266
+ "num_of_instances": 9
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.7777777777777778,
270
+ "accuracy_ci_low": 0.4444444444444444,
271
+ "accuracy_ci_high": 1.0,
272
+ "score_name": "accuracy",
273
+ "score": 0.7777777777777778,
274
+ "score_ci_high": 1.0,
275
+ "score_ci_low": 0.4444444444444444,
276
+ "num_of_instances": 9
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.5555555555555556,
280
+ "accuracy_ci_low": 0.2222222222222222,
281
+ "accuracy_ci_high": 0.8888888888888888,
282
+ "score_name": "accuracy",
283
+ "score": 0.5555555555555556,
284
+ "score_ci_high": 0.8888888888888888,
285
+ "score_ci_low": 0.2222222222222222,
286
+ "num_of_instances": 9
287
+ },
288
+ "score": 0.8686868686868687,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.9282051282051282,
296
+ "score": 0.9282051282051282,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.9282051282051282,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.09302325581395349,
307
+ "f1_Organization": 0.18604651162790697,
308
+ "f1_Location": 0.10526315789473685,
309
+ "f1_macro": 0.1281109751121991,
310
+ "recall_macro": 0.1043823326432022,
311
+ "precision_macro": 0.16984126984126982,
312
+ "in_classes_support": 0.47572815533980584,
313
+ "f1_micro": 0.0898876404494382,
314
+ "recall_micro": 0.10666666666666667,
315
+ "precision_micro": 0.07766990291262135,
316
+ "score": 0.0898876404494382,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.03506269703731819,
319
+ "score_ci_high": 0.16407160488590744,
320
+ "f1_micro_ci_low": 0.03506269703731819,
321
+ "f1_micro_ci_high": 0.16407160488590744
322
+ },
323
+ "score": 0.0898876404494382,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.7142857142857143,
330
+ "accuracy_ci_low": 0.2857142857142857,
331
+ "accuracy_ci_high": 1.0,
332
+ "score_name": "accuracy",
333
+ "score": 0.7142857142857143,
334
+ "score_ci_high": 1.0,
335
+ "score_ci_low": 0.2857142857142857,
336
+ "num_of_instances": 7
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.42857142857142855,
340
+ "accuracy_ci_low": 0.14285714285714285,
341
+ "accuracy_ci_high": 0.8571428571428571,
342
+ "score_name": "accuracy",
343
+ "score": 0.42857142857142855,
344
+ "score_ci_high": 0.8571428571428571,
345
+ "score_ci_low": 0.14285714285714285,
346
+ "num_of_instances": 7
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.5714285714285714,
350
+ "accuracy_ci_low": 0.14285714285714285,
351
+ "accuracy_ci_high": 0.8571428571428571,
352
+ "score_name": "accuracy",
353
+ "score": 0.5714285714285714,
354
+ "score_ci_high": 0.8571428571428571,
355
+ "score_ci_low": 0.14285714285714285,
356
+ "num_of_instances": 7
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.8571428571428571,
360
+ "accuracy_ci_low": 0.2530277506117974,
361
+ "accuracy_ci_high": 1.0,
362
+ "score_name": "accuracy",
363
+ "score": 0.8571428571428571,
364
+ "score_ci_high": 1.0,
365
+ "score_ci_low": 0.2530277506117974,
366
+ "num_of_instances": 7
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.8571428571428571,
370
+ "accuracy_ci_low": 0.42857142857142855,
371
+ "accuracy_ci_high": 1.0,
372
+ "score_name": "accuracy",
373
+ "score": 0.8571428571428571,
374
+ "score_ci_high": 1.0,
375
+ "score_ci_low": 0.42857142857142855,
376
+ "num_of_instances": 7
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.42857142857142855,
380
+ "accuracy_ci_low": 0.14285714285714285,
381
+ "accuracy_ci_high": 0.8571428571428571,
382
+ "score_name": "accuracy",
383
+ "score": 0.42857142857142855,
384
+ "score_ci_high": 0.8571428571428571,
385
+ "score_ci_low": 0.14285714285714285,
386
+ "num_of_instances": 7
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.2857142857142857,
390
+ "accuracy_ci_low": 0.0,
391
+ "accuracy_ci_high": 0.7142857142857143,
392
+ "score_name": "accuracy",
393
+ "score": 0.2857142857142857,
394
+ "score_ci_high": 0.7142857142857143,
395
+ "score_ci_low": 0.0,
396
+ "num_of_instances": 7
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.2857142857142857,
400
+ "accuracy_ci_low": 0.0,
401
+ "accuracy_ci_high": 0.7142857142857143,
402
+ "score_name": "accuracy",
403
+ "score": 0.2857142857142857,
404
+ "score_ci_high": 0.7142857142857143,
405
+ "score_ci_low": 0.0,
406
+ "num_of_instances": 7
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.7142857142857143,
410
+ "accuracy_ci_low": 0.2857142857142857,
411
+ "accuracy_ci_high": 1.0,
412
+ "score_name": "accuracy",
413
+ "score": 0.7142857142857143,
414
+ "score_ci_high": 1.0,
415
+ "score_ci_low": 0.2857142857142857,
416
+ "num_of_instances": 7
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.42857142857142855,
420
+ "accuracy_ci_low": 0.14285714285714285,
421
+ "accuracy_ci_high": 0.8571428571428571,
422
+ "score_name": "accuracy",
423
+ "score": 0.42857142857142855,
424
+ "score_ci_high": 0.8571428571428571,
425
+ "score_ci_low": 0.14285714285714285,
426
+ "num_of_instances": 7
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.42857142857142855,
430
+ "accuracy_ci_low": 0.14285714285714285,
431
+ "accuracy_ci_high": 0.8571428571428571,
432
+ "score_name": "accuracy",
433
+ "score": 0.42857142857142855,
434
+ "score_ci_high": 0.8571428571428571,
435
+ "score_ci_low": 0.14285714285714285,
436
+ "num_of_instances": 7
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.7142857142857143,
440
+ "accuracy_ci_low": 0.2857142857142857,
441
+ "accuracy_ci_high": 1.0,
442
+ "score_name": "accuracy",
443
+ "score": 0.7142857142857143,
444
+ "score_ci_high": 1.0,
445
+ "score_ci_low": 0.2857142857142857,
446
+ "num_of_instances": 7
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.2857142857142857,
450
+ "accuracy_ci_low": 0.0,
451
+ "accuracy_ci_high": 0.7142857142857143,
452
+ "score_name": "accuracy",
453
+ "score": 0.2857142857142857,
454
+ "score_ci_high": 0.7142857142857143,
455
+ "score_ci_low": 0.0,
456
+ "num_of_instances": 7
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.5714285714285714,
460
+ "accuracy_ci_low": 0.14285714285714285,
461
+ "accuracy_ci_high": 0.8571428571428571,
462
+ "score_name": "accuracy",
463
+ "score": 0.5714285714285714,
464
+ "score_ci_high": 0.8571428571428571,
465
+ "score_ci_low": 0.14285714285714285,
466
+ "num_of_instances": 7
467
+ },
468
+ "score": 0.5408163265306122,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.2253968253968254,
475
+ "f1_suggestive": 0.2222222222222222,
476
+ "f1_generic": 0.0,
477
+ "f1_arbitrary": 0.0,
478
+ "f1_fanciful": 0.5714285714285714,
479
+ "f1_descriptive": 0.3333333333333333,
480
+ "f1_macro_ci_low": 0.08888888888888888,
481
+ "f1_macro_ci_high": 0.42790793835399343,
482
+ "score_name": "f1_micro",
483
+ "score": 0.27586206896551724,
484
+ "score_ci_high": 0.5161290322580645,
485
+ "score_ci_low": 0.07407407407407407,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.2,
488
+ "accuracy_ci_low": 0.05,
489
+ "accuracy_ci_high": 0.4,
490
+ "f1_micro": 0.27586206896551724,
491
+ "f1_micro_ci_low": 0.07407407407407407,
492
+ "f1_micro_ci_high": 0.5161290322580645
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.3,
496
+ "f1_no": 0.6,
497
+ "f1_yes": 0.0,
498
+ "f1_macro_ci_low": 0.16666666666666666,
499
+ "f1_macro_ci_high": 0.42162479005779085,
500
+ "score_name": "f1_micro",
501
+ "score": 0.41379310344827586,
502
+ "score_ci_high": 0.6666666666666666,
503
+ "score_ci_low": 0.18261281751455966,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.3,
506
+ "accuracy_ci_low": 0.15,
507
+ "accuracy_ci_high": 0.55,
508
+ "f1_micro": 0.41379310344827586,
509
+ "f1_micro_ci_low": 0.18261281751455966,
510
+ "f1_micro_ci_high": 0.6666666666666666
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.1598639455782313,
514
+ "f1_conclusion": 0.3333333333333333,
515
+ "f1_decree": 0.0,
516
+ "f1_issue": 0.2857142857142857,
517
+ "f1_analysis": 0.5,
518
+ "f1_facts": 0.0,
519
+ "f1_procedural history": 0.0,
520
+ "f1_rule": 0.0,
521
+ "f1_macro_ci_low": 0.04081632653061224,
522
+ "f1_macro_ci_high": 0.37352174836595636,
523
+ "score_name": "f1_micro",
524
+ "score": 0.1935483870967742,
525
+ "score_ci_high": 0.4375,
526
+ "score_ci_low": 0.0,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.15,
529
+ "accuracy_ci_low": 0.05,
530
+ "accuracy_ci_high": 0.38226336332787697,
531
+ "f1_micro": 0.1935483870967742,
532
+ "f1_micro_ci_low": 0.0,
533
+ "f1_micro_ci_high": 0.4375
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.26515151515151514,
537
+ "f1_yes": 0.36363636363636365,
538
+ "f1_no": 0.16666666666666666,
539
+ "f1_macro_ci_low": 0.08333333333333333,
540
+ "f1_macro_ci_high": 0.5685800196803005,
541
+ "score_name": "f1_micro",
542
+ "score": 0.2608695652173913,
543
+ "score_ci_high": 0.5185185185185185,
544
+ "score_ci_low": 0.09523809523809523,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.15,
547
+ "accuracy_ci_low": 0.05,
548
+ "accuracy_ci_high": 0.35,
549
+ "f1_micro": 0.2608695652173913,
550
+ "f1_micro_ci_low": 0.09523809523809523,
551
+ "f1_micro_ci_high": 0.5185185185185185
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.8585526315789473,
555
+ "f1_yes": 0.875,
556
+ "f1_no": 0.8421052631578947,
557
+ "f1_macro_ci_low": 0.7012987012987013,
558
+ "f1_macro_ci_high": 0.9583333333333333,
559
+ "score_name": "f1_micro",
560
+ "score": 0.8571428571428571,
561
+ "score_ci_high": 0.9473684210526315,
562
+ "score_ci_low": 0.7096774193548387,
563
+ "num_of_instances": 20,
564
+ "accuracy": 0.75,
565
+ "accuracy_ci_low": 0.55,
566
+ "accuracy_ci_high": 0.9,
567
+ "f1_micro": 0.8571428571428571,
568
+ "f1_micro_ci_low": 0.7096774193548387,
569
+ "f1_micro_ci_high": 0.9473684210526315
570
+ },
571
+ "score": 0.4002431963741631,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.5224420024420023,
578
+ "f1_cars": 0.5714285714285714,
579
+ "f1_windows x": 0.3333333333333333,
580
+ "f1_atheism": 0.3333333333333333,
581
+ "f1_religion": 0.0,
582
+ "f1_medicine": 0.8571428571428571,
583
+ "f1_christianity": 0.4,
584
+ "f1_for sale": 0.75,
585
+ "f1_computer graphics": 0.5714285714285714,
586
+ "f1_microsoft windows": 0.6666666666666666,
587
+ "f1_middle east": 0.6666666666666666,
588
+ "f1_motorcycles": 0.4444444444444444,
589
+ "f1_politics": 0.16666666666666666,
590
+ "f1_pc hardware": 0.46153846153846156,
591
+ "f1_mac hardware": 0.5714285714285714,
592
+ "f1_electronics": 0.6666666666666666,
593
+ "f1_guns": 0.0,
594
+ "f1_space": 0.75,
595
+ "f1_cryptography": 0.6666666666666666,
596
+ "f1_baseball": 1.0,
597
+ "f1_hockey": 0.5714285714285714,
598
+ "f1_macro_ci_low": 0.4458595168357445,
599
+ "f1_macro_ci_high": 0.6352148110673699,
600
+ "score_name": "f1_micro",
601
+ "score": 0.5341614906832298,
602
+ "score_ci_high": 0.6265060240963856,
603
+ "score_ci_low": 0.4258064516129032,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.43,
606
+ "accuracy_ci_low": 0.34,
607
+ "accuracy_ci_high": 0.5261187865398904,
608
+ "f1_micro": 0.5341614906832298,
609
+ "f1_micro_ci_low": 0.4258064516129032,
610
+ "f1_micro_ci_high": 0.6265060240963856
611
+ },
612
+ "score": 0.5341614906832298,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.5904961984793917,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.8166666666666667,
620
+ "f1_money transfer or virtual currency or money service": 0.8,
621
+ "f1_mortgage": 0.6666666666666666,
622
+ "f1_credit card or prepaid card": 0.42857142857142855,
623
+ "f1_debt collection": 0.5882352941176471,
624
+ "f1_checking or savings account": 0.8333333333333334,
625
+ "f1_payday loan or title loan or personal loan": 0.0,
626
+ "f1_macro_ci_low": 0.32798821333346817,
627
+ "f1_macro_ci_high": 0.7301811070875235,
628
+ "score_name": "f1_micro",
629
+ "score": 0.7514450867052023,
630
+ "score_ci_high": 0.8228571428571428,
631
+ "score_ci_low": 0.6547619047619048,
632
+ "num_of_instances": 100,
633
+ "accuracy": 0.65,
634
+ "accuracy_ci_low": 0.55,
635
+ "accuracy_ci_high": 0.74,
636
+ "f1_micro": 0.7514450867052023,
637
+ "f1_micro_ci_low": 0.6547619047619048,
638
+ "f1_micro_ci_high": 0.8228571428571428
639
+ },
640
+ "cfpb_product_watsonx": {
641
+ "f1_macro": 0.6436363636363636,
642
+ "f1_mortgages and loans": 0.631578947368421,
643
+ "f1_credit card": 0.7368421052631579,
644
+ "f1_debt collection": 0.631578947368421,
645
+ "f1_credit reporting": 0.8181818181818182,
646
+ "f1_retail banking": 0.4,
647
+ "f1_macro_ci_low": 0.5193485233650352,
648
+ "f1_macro_ci_high": 0.8024592453657043,
649
+ "score_name": "f1_micro",
650
+ "score": 0.6741573033707865,
651
+ "score_ci_high": 0.7956989247311828,
652
+ "score_ci_low": 0.5454545454545454,
653
+ "num_of_instances": 50,
654
+ "accuracy": 0.6,
655
+ "accuracy_ci_low": 0.46,
656
+ "accuracy_ci_high": 0.74,
657
+ "f1_micro": 0.6741573033707865,
658
+ "f1_micro_ci_low": 0.5454545454545454,
659
+ "f1_micro_ci_high": 0.7956989247311828
660
+ },
661
+ "score": 0.7128011950379944,
662
+ "score_name": "subsets_mean",
663
+ "num_of_instances": 150
664
+ },
665
+ "qa_finance": {
666
+ "fin_qa": {
667
+ "num_of_instances": 100,
668
+ "program_accuracy": 0.23,
669
+ "score": 0.23,
670
+ "score_name": "program_accuracy",
671
+ "execution_accuracy": 0.17,
672
+ "program_accuracy_ci_low": 0.16,
673
+ "program_accuracy_ci_high": 0.31,
674
+ "score_ci_low": 0.16,
675
+ "score_ci_high": 0.31,
676
+ "execution_accuracy_ci_low": 0.10726412987045486,
677
+ "execution_accuracy_ci_high": 0.25
678
+ },
679
+ "score": 0.23,
680
+ "score_name": "subsets_mean",
681
+ "num_of_instances": 100
682
+ },
683
+ "rag_general": {
684
+ "rag_response_generation_clapnq": {
685
+ "precision": 0.4070464668362249,
686
+ "recall": 0.6728751474815542,
687
+ "f1": 0.46911620375316454,
688
+ "precision_ci_low": 0.3758656592162316,
689
+ "precision_ci_high": 0.44412503731090347,
690
+ "recall_ci_low": 0.6295667888298659,
691
+ "recall_ci_high": 0.7118043187952615,
692
+ "f1_ci_low": 0.4397033356055822,
693
+ "f1_ci_high": 0.5012307000751813,
694
+ "score_name": "f1",
695
+ "score": 0.46911620375316454,
696
+ "score_ci_high": 0.5012307000751813,
697
+ "score_ci_low": 0.4397033356055822,
698
+ "num_of_instances": 100,
699
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6686390674114228,
700
+ "correctness_recall_bert_score.deberta_large_mnli": 0.7330650243163109,
701
+ "correctness_precision_bert_score.deberta_large_mnli": 0.6240223413705825,
702
+ "faithfullness_f1_token_overlap": 0.44725346405708793,
703
+ "faithfullness_recall_token_overlap": 0.3784000116261586,
704
+ "faithfullness_precision_token_overlap": 0.6466994891708346,
705
+ "correctness_f1_token_overlap": 0.46911620375316454,
706
+ "correctness_recall_token_overlap": 0.6728751474815542,
707
+ "correctness_precision_token_overlap": 0.4070464668362249
708
+ },
709
+ "score": 0.46911620375316454,
710
+ "score_name": "subsets_mean",
711
+ "num_of_instances": 100
712
+ },
713
+ "reasoning": {
714
+ "hellaswag": {
715
+ "accuracy": 0.52,
716
+ "accuracy_ci_low": 0.42,
717
+ "accuracy_ci_high": 0.61,
718
+ "score_name": "accuracy",
719
+ "score": 0.52,
720
+ "score_ci_high": 0.61,
721
+ "score_ci_low": 0.42,
722
+ "num_of_instances": 100
723
+ },
724
+ "openbook_qa": {
725
+ "accuracy": 0.88,
726
+ "accuracy_ci_low": 0.8,
727
+ "accuracy_ci_high": 0.93,
728
+ "score_name": "accuracy",
729
+ "score": 0.88,
730
+ "score_ci_high": 0.93,
731
+ "score_ci_low": 0.8,
732
+ "num_of_instances": 100
733
+ },
734
+ "score": 0.7,
735
+ "score_name": "subsets_mean",
736
+ "num_of_instances": 200
737
+ },
738
+ "safety": {
739
+ "attaq_500": {
740
+ "severity_critical": 1.0,
741
+ "severity_high": 0.0,
742
+ "severity_medium": 2.0,
743
+ "severity_low": 97.0,
744
+ "category_harmful_info": 0.8713100298884369,
745
+ "category_deception": 0.9708881667166046,
746
+ "category_explicit_content": 0.8611572321617242,
747
+ "category_violence": 0.8559759093892007,
748
+ "category_discrimination": 0.8542827545177368,
749
+ "category_substance_abuse": 0.9659315447012583,
750
+ "category_pii": 0.8576253459409431,
751
+ "safety": 0.8789085009611314,
752
+ "safety_ci_low": 0.8510939125773955,
753
+ "safety_ci_high": 0.9040313215809909,
754
+ "score_name": "safety",
755
+ "score": 0.8789085009611314,
756
+ "score_ci_high": 0.9040313215809909,
757
+ "score_ci_low": 0.8510939125773955,
758
+ "num_of_instances": 100
759
+ },
760
+ "score": 0.8789085009611314,
761
+ "score_name": "subsets_mean",
762
+ "num_of_instances": 100
763
+ },
764
+ "summarization": {
765
+ "billsum_document_filtered_to_6000_chars": {
766
+ "num_of_instances": 100,
767
+ "rouge1": 0.37963233527506424,
768
+ "rouge2": 0.1737891833688362,
769
+ "rougeL": 0.2547839600264118,
770
+ "score": 0.2547839600264118,
771
+ "score_name": "rougeL",
772
+ "rougeLsum": 0.32928850871882687,
773
+ "rouge1_ci_low": 0.35513435914583885,
774
+ "rouge1_ci_high": 0.4027425805835246,
775
+ "rouge2_ci_low": 0.1593353388094251,
776
+ "rouge2_ci_high": 0.1884910405186931,
777
+ "rougeL_ci_low": 0.2388214674426122,
778
+ "rougeL_ci_high": 0.2728243961502755,
779
+ "score_ci_low": 0.2388214674426122,
780
+ "score_ci_high": 0.2728243961502755,
781
+ "rougeLsum_ci_low": 0.3084255969270684,
782
+ "rougeLsum_ci_high": 0.3522907249025616
783
+ },
784
+ "tldr_document_filtered_to_6000_chars": {
785
+ "num_of_instances": 100,
786
+ "rouge1": 0.10487202890133546,
787
+ "rouge2": 0.013028774447904914,
788
+ "rougeL": 0.0797069960789985,
789
+ "score": 0.0797069960789985,
790
+ "score_name": "rougeL",
791
+ "rougeLsum": 0.09012253288577751,
792
+ "rouge1_ci_low": 0.09060686595737429,
793
+ "rouge1_ci_high": 0.11906111115440042,
794
+ "rouge2_ci_low": 0.009350116157266367,
795
+ "rouge2_ci_high": 0.017958490173366846,
796
+ "rougeL_ci_low": 0.06914446517046804,
797
+ "rougeL_ci_high": 0.09103355858816739,
798
+ "score_ci_low": 0.06914446517046804,
799
+ "score_ci_high": 0.09103355858816739,
800
+ "rougeLsum_ci_low": 0.07795554886138542,
801
+ "rougeLsum_ci_high": 0.10316611345723585
802
+ },
803
+ "score": 0.16724547805270515,
804
+ "score_name": "subsets_mean",
805
+ "num_of_instances": 200
806
+ },
807
+ "translation": {
808
+ "mt_flores_101_ara_eng": {
809
+ "num_of_instances": 6,
810
+ "counts": [
811
+ 157,
812
+ 97,
813
+ 64,
814
+ 44
815
+ ],
816
+ "totals": [
817
+ 726,
818
+ 720,
819
+ 714,
820
+ 708
821
+ ],
822
+ "precisions": [
823
+ 0.21625344352617082,
824
+ 0.13472222222222222,
825
+ 0.0896358543417367,
826
+ 0.062146892655367235
827
+ ],
828
+ "bp": 1.0,
829
+ "sys_len": 726,
830
+ "ref_len": 208,
831
+ "sacrebleu": 0.11286930806161073,
832
+ "score": 0.11286930806161073,
833
+ "score_name": "sacrebleu",
834
+ "score_ci_low": 0.06371770585849953,
835
+ "score_ci_high": 0.17368156785194083,
836
+ "sacrebleu_ci_low": 0.06371770585849953,
837
+ "sacrebleu_ci_high": 0.17368156785194083
838
+ },
839
+ "mt_flores_101_deu_eng": {
840
+ "num_of_instances": 6,
841
+ "counts": [
842
+ 152,
843
+ 87,
844
+ 53,
845
+ 34
846
+ ],
847
+ "totals": [
848
+ 454,
849
+ 448,
850
+ 442,
851
+ 436
852
+ ],
853
+ "precisions": [
854
+ 0.33480176211453744,
855
+ 0.19419642857142858,
856
+ 0.11990950226244344,
857
+ 0.0779816513761468
858
+ ],
859
+ "bp": 1.0,
860
+ "sys_len": 454,
861
+ "ref_len": 208,
862
+ "sacrebleu": 0.15702498132787765,
863
+ "score": 0.15702498132787765,
864
+ "score_name": "sacrebleu",
865
+ "score_ci_low": 0.08791826122563459,
866
+ "score_ci_high": 0.3670291599936349,
867
+ "sacrebleu_ci_low": 0.08791826122563459,
868
+ "sacrebleu_ci_high": 0.3670291599936349
869
+ },
870
+ "mt_flores_101_eng_ara": {
871
+ "num_of_instances": 6,
872
+ "counts": [
873
+ 97,
874
+ 36,
875
+ 22,
876
+ 11
877
+ ],
878
+ "totals": [
879
+ 1335,
880
+ 1329,
881
+ 1323,
882
+ 1317
883
+ ],
884
+ "precisions": [
885
+ 0.07265917602996255,
886
+ 0.02708803611738149,
887
+ 0.016628873771730914,
888
+ 0.008352315869400152
889
+ ],
890
+ "bp": 1.0,
891
+ "sys_len": 1335,
892
+ "ref_len": 209,
893
+ "sacrebleu": 0.022865696451061745,
894
+ "score": 0.022865696451061745,
895
+ "score_name": "sacrebleu",
896
+ "score_ci_low": 0.005513921748012761,
897
+ "score_ci_high": 0.05028251629742342,
898
+ "sacrebleu_ci_low": 0.005513921748012761,
899
+ "sacrebleu_ci_high": 0.05028251629742342
900
+ },
901
+ "mt_flores_101_eng_deu": {
902
+ "num_of_instances": 6,
903
+ "counts": [
904
+ 154,
905
+ 96,
906
+ 65,
907
+ 45
908
+ ],
909
+ "totals": [
910
+ 967,
911
+ 961,
912
+ 955,
913
+ 949
914
+ ],
915
+ "precisions": [
916
+ 0.1592554291623578,
917
+ 0.09989594172736732,
918
+ 0.06806282722513089,
919
+ 0.04741833508956796
920
+ ],
921
+ "bp": 1.0,
922
+ "sys_len": 967,
923
+ "ref_len": 216,
924
+ "sacrebleu": 0.08464953848194855,
925
+ "score": 0.08464953848194855,
926
+ "score_name": "sacrebleu",
927
+ "score_ci_low": 0.02484652992339508,
928
+ "score_ci_high": 0.19638579299357528,
929
+ "sacrebleu_ci_low": 0.02484652992339508,
930
+ "sacrebleu_ci_high": 0.19638579299357528
931
+ },
932
+ "mt_flores_101_eng_fra": {
933
+ "num_of_instances": 6,
934
+ "counts": [
935
+ 168,
936
+ 119,
937
+ 87,
938
+ 63
939
+ ],
940
+ "totals": [
941
+ 539,
942
+ 533,
943
+ 527,
944
+ 521
945
+ ],
946
+ "precisions": [
947
+ 0.3116883116883117,
948
+ 0.22326454033771106,
949
+ 0.16508538899430739,
950
+ 0.12092130518234166
951
+ ],
952
+ "bp": 1.0,
953
+ "sys_len": 539,
954
+ "ref_len": 235,
955
+ "sacrebleu": 0.1930580579438383,
956
+ "score": 0.1930580579438383,
957
+ "score_name": "sacrebleu",
958
+ "score_ci_low": 0.0614596168362495,
959
+ "score_ci_high": 0.4765150250484983,
960
+ "sacrebleu_ci_low": 0.0614596168362495,
961
+ "sacrebleu_ci_high": 0.4765150250484983
962
+ },
963
+ "mt_flores_101_eng_kor": {
964
+ "num_of_instances": 6,
965
+ "counts": [
966
+ 163,
967
+ 76,
968
+ 39,
969
+ 18
970
+ ],
971
+ "totals": [
972
+ 2294,
973
+ 2288,
974
+ 2282,
975
+ 2276
976
+ ],
977
+ "precisions": [
978
+ 0.07105492589363557,
979
+ 0.033216783216783216,
980
+ 0.017090271691498685,
981
+ 0.007908611599297012
982
+ ],
983
+ "bp": 1.0,
984
+ "sys_len": 2294,
985
+ "ref_len": 249,
986
+ "sacrebleu": 0.023765679955645602,
987
+ "score": 0.023765679955645602,
988
+ "score_name": "sacrebleu",
989
+ "score_ci_low": 0.011644110064387585,
990
+ "score_ci_high": 0.06223938279651946,
991
+ "sacrebleu_ci_low": 0.011644110064387585,
992
+ "sacrebleu_ci_high": 0.06223938279651946
993
+ },
994
+ "mt_flores_101_eng_por": {
995
+ "num_of_instances": 6,
996
+ "counts": [
997
+ 181,
998
+ 128,
999
+ 96,
1000
+ 71
1001
+ ],
1002
+ "totals": [
1003
+ 351,
1004
+ 345,
1005
+ 339,
1006
+ 333
1007
+ ],
1008
+ "precisions": [
1009
+ 0.5156695156695157,
1010
+ 0.3710144927536232,
1011
+ 0.2831858407079646,
1012
+ 0.2132132132132132
1013
+ ],
1014
+ "bp": 1.0,
1015
+ "sys_len": 351,
1016
+ "ref_len": 222,
1017
+ "sacrebleu": 0.32784004129166894,
1018
+ "score": 0.32784004129166894,
1019
+ "score_name": "sacrebleu",
1020
+ "score_ci_low": 0.19118515403282577,
1021
+ "score_ci_high": 0.4887966570358037,
1022
+ "sacrebleu_ci_low": 0.19118515403282577,
1023
+ "sacrebleu_ci_high": 0.4887966570358037
1024
+ },
1025
+ "mt_flores_101_eng_ron": {
1026
+ "num_of_instances": 6,
1027
+ "counts": [
1028
+ 154,
1029
+ 101,
1030
+ 71,
1031
+ 56
1032
+ ],
1033
+ "totals": [
1034
+ 410,
1035
+ 404,
1036
+ 398,
1037
+ 392
1038
+ ],
1039
+ "precisions": [
1040
+ 0.375609756097561,
1041
+ 0.25,
1042
+ 0.17839195979899497,
1043
+ 0.14285714285714288
1044
+ ],
1045
+ "bp": 1.0,
1046
+ "sys_len": 410,
1047
+ "ref_len": 230,
1048
+ "sacrebleu": 0.22117626881537003,
1049
+ "score": 0.22117626881537003,
1050
+ "score_name": "sacrebleu",
1051
+ "score_ci_low": 0.10954190080993098,
1052
+ "score_ci_high": 0.5047229729859533,
1053
+ "sacrebleu_ci_low": 0.10954190080993098,
1054
+ "sacrebleu_ci_high": 0.5047229729859533
1055
+ },
1056
+ "mt_flores_101_eng_spa": {
1057
+ "num_of_instances": 6,
1058
+ "counts": [
1059
+ 178,
1060
+ 98,
1061
+ 57,
1062
+ 36
1063
+ ],
1064
+ "totals": [
1065
+ 1389,
1066
+ 1383,
1067
+ 1377,
1068
+ 1371
1069
+ ],
1070
+ "precisions": [
1071
+ 0.12814974802015838,
1072
+ 0.07086044830079537,
1073
+ 0.04139433551198257,
1074
+ 0.0262582056892779
1075
+ ],
1076
+ "bp": 1.0,
1077
+ "sys_len": 1389,
1078
+ "ref_len": 243,
1079
+ "sacrebleu": 0.0560508113352147,
1080
+ "score": 0.0560508113352147,
1081
+ "score_name": "sacrebleu",
1082
+ "score_ci_low": 0.0293109799793049,
1083
+ "score_ci_high": 0.1393438803792045,
1084
+ "sacrebleu_ci_low": 0.0293109799793049,
1085
+ "sacrebleu_ci_high": 0.1393438803792045
1086
+ },
1087
+ "mt_flores_101_fra_eng": {
1088
+ "num_of_instances": 6,
1089
+ "counts": [
1090
+ 171,
1091
+ 126,
1092
+ 95,
1093
+ 73
1094
+ ],
1095
+ "totals": [
1096
+ 486,
1097
+ 480,
1098
+ 474,
1099
+ 468
1100
+ ],
1101
+ "precisions": [
1102
+ 0.35185185185185186,
1103
+ 0.2625,
1104
+ 0.20042194092827004,
1105
+ 0.15598290598290598
1106
+ ],
1107
+ "bp": 1.0,
1108
+ "sys_len": 486,
1109
+ "ref_len": 208,
1110
+ "sacrebleu": 0.23180769838512305,
1111
+ "score": 0.23180769838512305,
1112
+ "score_name": "sacrebleu",
1113
+ "score_ci_low": 0.16016000833550378,
1114
+ "score_ci_high": 0.404330589712818,
1115
+ "sacrebleu_ci_low": 0.16016000833550378,
1116
+ "sacrebleu_ci_high": 0.404330589712818
1117
+ },
1118
+ "mt_flores_101_jpn_eng": {
1119
+ "num_of_instances": 6,
1120
+ "counts": [
1121
+ 138,
1122
+ 78,
1123
+ 49,
1124
+ 30
1125
+ ],
1126
+ "totals": [
1127
+ 621,
1128
+ 615,
1129
+ 609,
1130
+ 603
1131
+ ],
1132
+ "precisions": [
1133
+ 0.2222222222222222,
1134
+ 0.12682926829268293,
1135
+ 0.08045977011494253,
1136
+ 0.04975124378109452
1137
+ ],
1138
+ "bp": 1.0,
1139
+ "sys_len": 621,
1140
+ "ref_len": 208,
1141
+ "sacrebleu": 0.10306172940693305,
1142
+ "score": 0.10306172940693305,
1143
+ "score_name": "sacrebleu",
1144
+ "score_ci_low": 0.06510271487070056,
1145
+ "score_ci_high": 0.19015459537477095,
1146
+ "sacrebleu_ci_low": 0.06510271487070056,
1147
+ "sacrebleu_ci_high": 0.19015459537477095
1148
+ },
1149
+ "mt_flores_101_kor_eng": {
1150
+ "num_of_instances": 6,
1151
+ "counts": [
1152
+ 152,
1153
+ 80,
1154
+ 48,
1155
+ 31
1156
+ ],
1157
+ "totals": [
1158
+ 1518,
1159
+ 1512,
1160
+ 1506,
1161
+ 1500
1162
+ ],
1163
+ "precisions": [
1164
+ 0.10013175230566534,
1165
+ 0.052910052910052914,
1166
+ 0.03187250996015936,
1167
+ 0.02066666666666667
1168
+ ],
1169
+ "bp": 1.0,
1170
+ "sys_len": 1518,
1171
+ "ref_len": 208,
1172
+ "sacrebleu": 0.043221434629392846,
1173
+ "score": 0.043221434629392846,
1174
+ "score_name": "sacrebleu",
1175
+ "score_ci_low": 0.006750941961214427,
1176
+ "score_ci_high": 0.10874266694657375,
1177
+ "sacrebleu_ci_low": 0.006750941961214427,
1178
+ "sacrebleu_ci_high": 0.10874266694657375
1179
+ },
1180
+ "mt_flores_101_por_eng": {
1181
+ "num_of_instances": 6,
1182
+ "counts": [
1183
+ 174,
1184
+ 130,
1185
+ 93,
1186
+ 65
1187
+ ],
1188
+ "totals": [
1189
+ 1133,
1190
+ 1127,
1191
+ 1121,
1192
+ 1115
1193
+ ],
1194
+ "precisions": [
1195
+ 0.15357458075904679,
1196
+ 0.11535048802129548,
1197
+ 0.08296164139161463,
1198
+ 0.05829596412556054
1199
+ ],
1200
+ "bp": 1.0,
1201
+ "sys_len": 1133,
1202
+ "ref_len": 208,
1203
+ "sacrebleu": 0.09620854572758679,
1204
+ "score": 0.09620854572758679,
1205
+ "score_name": "sacrebleu",
1206
+ "score_ci_low": 0.029454837864400726,
1207
+ "score_ci_high": 0.24062961983221273,
1208
+ "sacrebleu_ci_low": 0.029454837864400726,
1209
+ "sacrebleu_ci_high": 0.24062961983221273
1210
+ },
1211
+ "mt_flores_101_ron_eng": {
1212
+ "num_of_instances": 6,
1213
+ "counts": [
1214
+ 170,
1215
+ 129,
1216
+ 102,
1217
+ 82
1218
+ ],
1219
+ "totals": [
1220
+ 553,
1221
+ 547,
1222
+ 541,
1223
+ 535
1224
+ ],
1225
+ "precisions": [
1226
+ 0.30741410488245935,
1227
+ 0.23583180987202926,
1228
+ 0.18853974121996303,
1229
+ 0.15327102803738318
1230
+ ],
1231
+ "bp": 1.0,
1232
+ "sys_len": 553,
1233
+ "ref_len": 208,
1234
+ "sacrebleu": 0.21394260905112764,
1235
+ "score": 0.21394260905112764,
1236
+ "score_name": "sacrebleu",
1237
+ "score_ci_low": 0.14316760916156987,
1238
+ "score_ci_high": 0.496839373829495,
1239
+ "sacrebleu_ci_low": 0.14316760916156987,
1240
+ "sacrebleu_ci_high": 0.496839373829495
1241
+ },
1242
+ "mt_flores_101_spa_eng": {
1243
+ "num_of_instances": 6,
1244
+ "counts": [
1245
+ 156,
1246
+ 89,
1247
+ 56,
1248
+ 35
1249
+ ],
1250
+ "totals": [
1251
+ 775,
1252
+ 769,
1253
+ 763,
1254
+ 757
1255
+ ],
1256
+ "precisions": [
1257
+ 0.20129032258064516,
1258
+ 0.11573472041612483,
1259
+ 0.07339449541284404,
1260
+ 0.04623513870541611
1261
+ ],
1262
+ "bp": 1.0,
1263
+ "sys_len": 775,
1264
+ "ref_len": 208,
1265
+ "sacrebleu": 0.09429323900301856,
1266
+ "score": 0.09429323900301856,
1267
+ "score_name": "sacrebleu",
1268
+ "score_ci_low": 0.06850787533495807,
1269
+ "score_ci_high": 0.1617199662800406,
1270
+ "sacrebleu_ci_low": 0.06850787533495807,
1271
+ "sacrebleu_ci_high": 0.1617199662800406
1272
+ },
1273
+ "score": 0.13212237599116122,
1274
+ "score_name": "subsets_mean",
1275
+ "num_of_instances": 90
1276
+ },
1277
+ "score": 0.511707261901969,
1278
+ "score_name": "subsets_mean",
1279
+ "num_of_instances": 1537
1280
+ }
1281
+ }