jbnayahu committed
Commit 631583f · verified · 2 Parent(s): e94b8a3 0201c01

Merge branch 'jbnayahu/bluebench' into 'ibm-research/bluebench'

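For reference, both deleted result files record the same unitxt-evaluate invocation in their environment_info (only model_name differs between the two runs). Reconstructed as a single shell command from that field, the 90b run reads:

    unitxt-evaluate --tasks benchmarks.bluebench --model cross_provider --model_args "model_name=watsonx/meta-llama/llama-3-2-90b-vision-instruct,max_tokens=1024" --output_path ./results/bluebench --log_samples --trust_remote_code --batch_size 8 --verbosity ERROR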
results/bluebench/2025-07-03T10-34-07_evaluation_results.json DELETED
@@ -1,1281 +0,0 @@
- {
- "environment_info": {
- "timestamp_utc": "2025-07-03T14:34:02.551035Z",
- "command_line_invocation": [
- "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
- "--tasks",
- "benchmarks.bluebench",
- "--model",
- "cross_provider",
- "--model_args",
- "model_name=watsonx/meta-llama/llama-3-2-90b-vision-instruct,max_tokens=1024",
- "--output_path",
- "./results/bluebench",
- "--log_samples",
- "--trust_remote_code",
- "--batch_size",
- "8",
- "--verbosity",
- "ERROR"
- ],
- "parsed_arguments": {
- "tasks": [
- "benchmarks.bluebench"
- ],
- "split": "test",
- "num_fewshots": null,
- "limit": null,
- "batch_size": 8,
- "model": "watsonx/meta-llama/llama-3-2-90b-vision-instruct",
- "model_args": {
- "max_tokens": 1024
- },
- "gen_kwargs": null,
- "chat_template_kwargs": null,
- "output_path": "./results/bluebench",
- "output_file_prefix": "evaluation_results",
- "log_samples": true,
- "verbosity": "ERROR",
- "apply_chat_template": false,
- "trust_remote_code": true,
- "disable_hf_cache": false,
- "cache_dir": null
- },
- "unitxt_version": "1.25.0",
- "unitxt_commit_hash": "f087fa0d9c77a2dab916bae59414b093e5be4041",
- "python_version": "3.10.18",
- "system": "Linux",
- "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
- "installed_packages": {
- "nvidia-cufile-cu12": "1.11.1.6",
- "triton": "3.3.1",
- "nltk": "3.9.1",
- "anyio": "4.9.0",
- "unitxt": "1.25.0",
- "absl-py": "2.3.0",
- "tiktoken": "0.9.0",
- "charset-normalizer": "3.4.2",
- "nvidia-cuda-runtime-cu12": "12.6.77",
- "sympy": "1.14.0",
- "mecab-ko": "1.0.1",
- "httpcore": "1.0.9",
- "litellm": "1.73.6",
- "Jinja2": "3.1.6",
- "jsonschema-specifications": "2025.4.1",
- "pydantic_core": "2.33.2",
- "nvidia-cusparse-cu12": "12.5.4.2",
- "tokenizers": "0.21.2",
- "yarl": "1.20.1",
- "portalocker": "3.2.0",
- "pandas": "2.3.0",
- "multiprocess": "0.70.16",
- "jsonschema": "4.24.0",
- "nvidia-nvjitlink-cu12": "12.6.85",
- "nvidia-cublas-cu12": "12.6.4.1",
- "pydantic": "2.11.7",
- "async-timeout": "5.0.1",
- "annotated-types": "0.7.0",
- "rouge_score": "0.1.2",
- "contourpy": "1.3.2",
- "aiosignal": "1.3.2",
- "nvidia-cuda-cupti-cu12": "12.6.80",
- "openai": "1.93.0",
- "six": "1.17.0",
- "diskcache": "5.6.3",
- "tqdm": "4.67.1",
- "pyarrow": "20.0.0",
- "h11": "0.16.0",
- "zipp": "3.19.2",
- "tzdata": "2025.2",
- "bert-score": "0.3.13",
- "setuptools": "80.9.0",
- "referencing": "0.36.2",
- "sacrebleu": "2.5.1",
- "filelock": "3.18.0",
- "urllib3": "2.5.0",
- "scipy": "1.15.3",
- "nvidia-nccl-cu12": "2.26.2",
- "kiwisolver": "1.4.8",
- "networkx": "3.4.2",
- "typing-inspection": "0.4.1",
- "sniffio": "1.3.1",
- "scikit-learn": "1.7.0",
- "rpds-py": "0.26.0",
- "nvidia-curand-cu12": "10.3.7.77",
- "pip": "25.1.1",
- "pillow": "11.3.0",
- "fonttools": "4.58.4",
- "datasets": "3.6.0",
- "nvidia-cusolver-cu12": "11.7.1.2",
- "cycler": "0.12.1",
- "distro": "1.9.0",
- "idna": "3.10",
- "MarkupSafe": "3.0.2",
- "frozenlist": "1.7.0",
- "pyparsing": "3.2.3",
- "jiter": "0.10.0",
- "importlib_metadata": "8.0.0",
- "packaging": "24.2",
- "psutil": "7.0.0",
- "mecab-ko-dic": "1.0.0",
- "joblib": "1.5.1",
- "fsspec": "2025.3.0",
- "dill": "0.3.8",
- "wheel": "0.45.1",
- "nvidia-nvtx-cu12": "12.6.77",
- "nvidia-cusparselt-cu12": "0.6.3",
- "lxml": "6.0.0",
- "propcache": "0.3.2",
- "numpy": "2.2.6",
- "mpmath": "1.3.0",
- "conllu": "6.0.0",
- "huggingface-hub": "0.33.2",
- "safetensors": "0.5.3",
- "requests": "2.32.4",
- "regex": "2024.11.6",
- "aiohttp": "3.12.13",
- "tabulate": "0.9.0",
- "accelerate": "1.8.1",
- "certifi": "2025.6.15",
- "evaluate": "0.4.4",
- "nvidia-cufft-cu12": "11.3.0.4",
- "nvidia-cuda-nvrtc-cu12": "12.6.77",
- "click": "8.2.1",
- "typing_extensions": "4.12.2",
- "attrs": "25.3.0",
- "exceptiongroup": "1.3.0",
- "transformers": "4.53.0",
- "tenacity": "9.1.2",
- "pytz": "2025.2",
- "aiohappyeyeballs": "2.6.1",
- "python-dateutil": "2.9.0.post0",
- "torch": "2.7.1",
- "python-dotenv": "1.1.1",
- "multidict": "6.6.3",
- "httpx": "0.28.1",
- "matplotlib": "3.10.3",
- "xxhash": "3.5.0",
- "PyYAML": "6.0.2",
- "colorama": "0.4.6",
- "threadpoolctl": "3.6.0",
- "nvidia-cudnn-cu12": "9.5.1.17",
- "hf-xet": "1.1.5",
- "jaraco.collections": "5.1.0",
- "tomli": "2.0.1",
- "backports.tarfile": "1.2.0",
- "jaraco.context": "5.3.0",
- "typeguard": "4.3.0",
- "autocommand": "2.2.2",
- "jaraco.text": "3.12.1",
- "more-itertools": "10.3.0",
- "platformdirs": "4.2.2",
- "inflect": "7.3.1",
- "jaraco.functools": "4.0.1"
- }
- },
- "results": {
- "bias": {
- "safety_bbq_age": {
- "accuracy": 1.0,
- "accuracy_ci_low": 1.0,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 1.0,
- "score_ci_high": 1.0,
- "score_ci_low": 1.0,
- "num_of_instances": 9
- },
- "safety_bbq_disability_status": {
- "accuracy": 1.0,
- "accuracy_ci_low": 1.0,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 1.0,
- "score_ci_high": 1.0,
- "score_ci_low": 1.0,
- "num_of_instances": 9
- },
- "safety_bbq_gender_identity": {
- "accuracy": 1.0,
- "accuracy_ci_low": 1.0,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 1.0,
- "score_ci_high": 1.0,
- "score_ci_low": 1.0,
- "num_of_instances": 9
- },
- "safety_bbq_nationality": {
- "accuracy": 1.0,
- "accuracy_ci_low": 1.0,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 1.0,
- "score_ci_high": 1.0,
- "score_ci_low": 1.0,
- "num_of_instances": 9
- },
- "safety_bbq_physical_appearance": {
- "accuracy": 1.0,
- "accuracy_ci_low": 1.0,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 1.0,
- "score_ci_high": 1.0,
- "score_ci_low": 1.0,
- "num_of_instances": 9
- },
- "safety_bbq_race_ethnicity": {
- "accuracy": 1.0,
- "accuracy_ci_low": 1.0,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 1.0,
- "score_ci_high": 1.0,
- "score_ci_low": 1.0,
- "num_of_instances": 9
- },
- "safety_bbq_race_x_gender": {
- "accuracy": 1.0,
- "accuracy_ci_low": 1.0,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 1.0,
- "score_ci_high": 1.0,
- "score_ci_low": 1.0,
- "num_of_instances": 9
- },
- "safety_bbq_race_x_ses": {
- "accuracy": 1.0,
- "accuracy_ci_low": 1.0,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 1.0,
- "score_ci_high": 1.0,
- "score_ci_low": 1.0,
- "num_of_instances": 9
- },
- "safety_bbq_religion": {
- "accuracy": 1.0,
- "accuracy_ci_low": 1.0,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 1.0,
- "score_ci_high": 1.0,
- "score_ci_low": 1.0,
- "num_of_instances": 9
- },
- "safety_bbq_ses": {
- "accuracy": 1.0,
- "accuracy_ci_low": 1.0,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 1.0,
- "score_ci_high": 1.0,
- "score_ci_low": 1.0,
- "num_of_instances": 9
- },
- "safety_bbq_sexual_orientation": {
- "accuracy": 1.0,
- "accuracy_ci_low": 1.0,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 1.0,
- "score_ci_high": 1.0,
- "score_ci_low": 1.0,
- "num_of_instances": 9
- },
- "score": 1.0,
- "score_name": "subsets_mean",
- "num_of_instances": 99
- },
- "chatbot_abilities": {
- "arena_hard_generation_english_gpt_4_0314_reference": {
- "num_of_instances": 100,
- "llama_3_70b_instruct_template_arena_hard": 0.8711656441717791,
- "score": 0.8711656441717791,
- "score_name": "llama_3_70b_instruct_template_arena_hard"
- },
- "score": 0.8711656441717791,
- "score_name": "subsets_mean",
- "num_of_instances": 100
- },
- "entity_extraction": {
- "universal_ner_en_ewt": {
- "num_of_instances": 100,
- "f1_Person": 0.8260869565217391,
- "f1_Organization": 0.6551724137931035,
- "f1_Location": 0.7272727272727272,
- "f1_macro": 0.7361773658625234,
- "recall_macro": 0.7237750172532781,
- "precision_macro": 0.7531400966183576,
- "in_classes_support": 1.0,
- "f1_micro": 0.7297297297297296,
- "recall_micro": 0.72,
- "precision_micro": 0.7397260273972602,
- "score": 0.7297297297297296,
- "score_name": "f1_micro",
- "score_ci_low": 0.6619502313182618,
- "score_ci_high": 0.7835819840150043,
- "f1_micro_ci_low": 0.6619502313182618,
- "f1_micro_ci_high": 0.7835819840150043
- },
- "score": 0.7297297297297296,
- "score_name": "subsets_mean",
- "num_of_instances": 100
- },
- "knowledge": {
- "mmlu_pro_biology": {
- "accuracy": 0.7142857142857143,
- "accuracy_ci_low": 0.2857142857142857,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 0.7142857142857143,
- "score_ci_high": 1.0,
- "score_ci_low": 0.2857142857142857,
- "num_of_instances": 7
- },
- "mmlu_pro_business": {
- "accuracy": 0.42857142857142855,
- "accuracy_ci_low": 0.14285714285714285,
- "accuracy_ci_high": 0.8571428571428571,
- "score_name": "accuracy",
- "score": 0.42857142857142855,
- "score_ci_high": 0.8571428571428571,
- "score_ci_low": 0.14285714285714285,
- "num_of_instances": 7
- },
- "mmlu_pro_chemistry": {
- "accuracy": 0.42857142857142855,
- "accuracy_ci_low": 0.14285714285714285,
- "accuracy_ci_high": 0.8571428571428571,
- "score_name": "accuracy",
- "score": 0.42857142857142855,
- "score_ci_high": 0.8571428571428571,
- "score_ci_low": 0.14285714285714285,
- "num_of_instances": 7
- },
- "mmlu_pro_computer_science": {
- "accuracy": 1.0,
- "accuracy_ci_low": 1.0,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 1.0,
- "score_ci_high": 1.0,
- "score_ci_low": 1.0,
- "num_of_instances": 7
- },
- "mmlu_pro_economics": {
- "accuracy": 0.7142857142857143,
- "accuracy_ci_low": 0.2857142857142857,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 0.7142857142857143,
- "score_ci_high": 1.0,
- "score_ci_low": 0.2857142857142857,
- "num_of_instances": 7
- },
- "mmlu_pro_engineering": {
- "accuracy": 0.42857142857142855,
- "accuracy_ci_low": 0.14285714285714285,
- "accuracy_ci_high": 0.8571428571428571,
- "score_name": "accuracy",
- "score": 0.42857142857142855,
- "score_ci_high": 0.8571428571428571,
- "score_ci_low": 0.14285714285714285,
- "num_of_instances": 7
- },
- "mmlu_pro_health": {
- "accuracy": 0.42857142857142855,
- "accuracy_ci_low": 0.14285714285714285,
- "accuracy_ci_high": 0.8571428571428571,
- "score_name": "accuracy",
- "score": 0.42857142857142855,
- "score_ci_high": 0.8571428571428571,
- "score_ci_low": 0.14285714285714285,
- "num_of_instances": 7
- },
- "mmlu_pro_history": {
- "accuracy": 0.42857142857142855,
- "accuracy_ci_low": 0.14285714285714285,
- "accuracy_ci_high": 0.8571428571428571,
- "score_name": "accuracy",
- "score": 0.42857142857142855,
- "score_ci_high": 0.8571428571428571,
- "score_ci_low": 0.14285714285714285,
- "num_of_instances": 7
- },
- "mmlu_pro_law": {
- "accuracy": 0.5714285714285714,
- "accuracy_ci_low": 0.14285714285714285,
- "accuracy_ci_high": 0.8571428571428571,
- "score_name": "accuracy",
- "score": 0.5714285714285714,
- "score_ci_high": 0.8571428571428571,
- "score_ci_low": 0.14285714285714285,
- "num_of_instances": 7
- },
- "mmlu_pro_math": {
- "accuracy": 0.42857142857142855,
- "accuracy_ci_low": 0.14285714285714285,
- "accuracy_ci_high": 0.8571428571428571,
- "score_name": "accuracy",
- "score": 0.42857142857142855,
- "score_ci_high": 0.8571428571428571,
- "score_ci_low": 0.14285714285714285,
- "num_of_instances": 7
- },
- "mmlu_pro_other": {
- "accuracy": 0.5714285714285714,
- "accuracy_ci_low": 0.14285714285714285,
- "accuracy_ci_high": 0.8571428571428571,
- "score_name": "accuracy",
- "score": 0.5714285714285714,
- "score_ci_high": 0.8571428571428571,
- "score_ci_low": 0.14285714285714285,
- "num_of_instances": 7
- },
- "mmlu_pro_philosophy": {
- "accuracy": 0.7142857142857143,
- "accuracy_ci_low": 0.2857142857142857,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 0.7142857142857143,
- "score_ci_high": 1.0,
- "score_ci_low": 0.2857142857142857,
- "num_of_instances": 7
- },
- "mmlu_pro_physics": {
- "accuracy": 0.2857142857142857,
- "accuracy_ci_low": 0.0,
- "accuracy_ci_high": 0.7142857142857143,
- "score_name": "accuracy",
- "score": 0.2857142857142857,
- "score_ci_high": 0.7142857142857143,
- "score_ci_low": 0.0,
- "num_of_instances": 7
- },
- "mmlu_pro_psychology": {
- "accuracy": 0.5714285714285714,
- "accuracy_ci_low": 0.14285714285714285,
- "accuracy_ci_high": 0.8571428571428571,
- "score_name": "accuracy",
- "score": 0.5714285714285714,
- "score_ci_high": 0.8571428571428571,
- "score_ci_low": 0.14285714285714285,
- "num_of_instances": 7
- },
- "score": 0.5510204081632653,
- "score_name": "subsets_mean",
- "num_of_instances": 98
- },
- "legal": {
- "legalbench_abercrombie": {
- "f1_macro": 0.8147619047619047,
- "f1_suggestive": 0.6666666666666666,
- "f1_generic": 1.0,
- "f1_fanciful": 0.8571428571428571,
- "f1_descriptive": 0.8,
- "f1_arbitrary": 0.75,
- "f1_macro_ci_low": 0.6241071521113625,
- "f1_macro_ci_high": 0.9652136441488661,
- "score_name": "f1_micro",
- "score": 0.8,
- "score_ci_high": 0.95,
- "score_ci_low": 0.55,
- "num_of_instances": 20,
- "accuracy": 0.8,
- "accuracy_ci_low": 0.55,
- "accuracy_ci_high": 0.95,
- "f1_micro": 0.8,
- "f1_micro_ci_low": 0.55,
- "f1_micro_ci_high": 0.95
- },
- "legalbench_corporate_lobbying": {
- "f1_macro": 0.7386363636363636,
- "f1_no": 0.7272727272727273,
- "f1_yes": 0.75,
- "f1_macro_ci_low": 0.5080213903743316,
- "f1_macro_ci_high": 0.9157902232720109,
- "score_name": "f1_micro",
- "score": 0.7368421052631579,
- "score_ci_high": 0.8947368421052632,
- "score_ci_low": 0.5128205128205128,
- "num_of_instances": 20,
- "accuracy": 0.7,
- "accuracy_ci_low": 0.5,
- "accuracy_ci_high": 0.9,
- "f1_micro": 0.7368421052631579,
- "f1_micro_ci_low": 0.5128205128205128,
- "f1_micro_ci_high": 0.8947368421052632
- },
- "legalbench_function_of_decision_section": {
- "f1_macro": 0.17687074829931973,
- "f1_conclusion": 0.2857142857142857,
- "f1_decree": 0.0,
- "f1_issue": 0.2857142857142857,
- "f1_analysis": 0.6666666666666666,
- "f1_facts": 0.0,
- "f1_procedural history": 0.0,
- "f1_rule": 0.0,
- "f1_macro_ci_low": 0.037037037037037035,
- "f1_macro_ci_high": 0.3410139249890439,
- "score_name": "f1_micro",
- "score": 0.23529411764705882,
- "score_ci_high": 0.48484848484848486,
- "score_ci_low": 0.058823529411764705,
- "num_of_instances": 20,
- "accuracy": 0.2,
- "accuracy_ci_low": 0.05,
- "accuracy_ci_high": 0.45,
- "f1_micro": 0.23529411764705882,
- "f1_micro_ci_low": 0.058823529411764705,
- "f1_micro_ci_high": 0.48484848484848486
- },
- "legalbench_international_citizenship_questions": {
- "f1_macro": 0.6277777777777778,
- "f1_yes": 0.7,
- "f1_no": 0.5555555555555556,
- "f1_macro_ci_low": 0.4143115659353126,
- "f1_macro_ci_high": 0.849624060150376,
- "score_name": "f1_micro",
- "score": 0.631578947368421,
- "score_ci_high": 0.8421052631578947,
- "score_ci_low": 0.4069581788631691,
- "num_of_instances": 20,
- "accuracy": 0.6,
- "accuracy_ci_low": 0.4,
- "accuracy_ci_high": 0.8,
- "f1_micro": 0.631578947368421,
- "f1_micro_ci_low": 0.4069581788631691,
- "f1_micro_ci_high": 0.8421052631578947
- },
- "legalbench_proa": {
- "f1_macro": 0.743421052631579,
- "f1_yes": 0.75,
- "f1_no": 0.7368421052631579,
- "f1_macro_ci_low": 0.5133179285198034,
- "f1_macro_ci_high": 0.898989898989899,
- "score_name": "f1_micro",
- "score": 0.7428571428571429,
- "score_ci_high": 0.8888888888888888,
- "score_ci_low": 0.5,
- "num_of_instances": 20,
- "accuracy": 0.65,
- "accuracy_ci_low": 0.4,
- "accuracy_ci_high": 0.85,
- "f1_micro": 0.7428571428571429,
- "f1_micro_ci_low": 0.5,
- "f1_micro_ci_high": 0.8888888888888888
- },
- "score": 0.6293144626271561,
- "score_name": "subsets_mean",
- "num_of_instances": 100
- },
- "news_classification": {
- "20_newsgroups_short": {
- "f1_macro": 0.6527228980170158,
- "f1_cars": 1.0,
- "f1_windows x": 0.5714285714285714,
- "f1_computer graphics": 0.5882352941176471,
- "f1_atheism": 0.3333333333333333,
- "f1_christianity": 0.6666666666666666,
- "f1_religion": 0.25,
- "f1_medicine": 1.0,
- "f1_microsoft windows": 0.8,
- "f1_middle east": 0.5,
- "f1_motorcycles": 0.7272727272727273,
- "f1_pc hardware": 0.75,
- "f1_mac hardware": 0.8888888888888888,
- "f1_electronics": 0.5,
- "f1_for sale": 0.8888888888888888,
- "f1_guns": 0.4444444444444444,
- "f1_space": 0.6,
- "f1_cryptography": 0.3333333333333333,
- "f1_baseball": 0.9230769230769231,
- "f1_hockey": 0.8888888888888888,
- "f1_politics": 0.4,
- "f1_macro_ci_low": 0.5594466279416972,
- "f1_macro_ci_high": 0.7484812715694672,
- "score_name": "f1_micro",
- "score": 0.6701030927835051,
- "score_ci_high": 0.7525773195876289,
- "score_ci_low": 0.5628781799105581,
- "num_of_instances": 100,
- "accuracy": 0.65,
- "accuracy_ci_low": 0.54,
- "accuracy_ci_high": 0.74,
- "f1_micro": 0.6701030927835051,
- "f1_micro_ci_low": 0.5628781799105581,
- "f1_micro_ci_high": 0.7525773195876289
- },
- "score": 0.6701030927835051,
- "score_name": "subsets_mean",
- "num_of_instances": 100
- },
- "product_help": {
- "cfpb_product_2023": {
- "f1_macro": 0.7779168114934538,
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.9343065693430657,
- "f1_credit card or prepaid card": 0.3333333333333333,
- "f1_money transfer or virtual currency or money service": 0.8,
- "f1_mortgage": 0.6666666666666666,
- "f1_debt collection": 0.7777777777777778,
- "f1_checking or savings account": 0.9333333333333333,
- "f1_payday loan or title loan or personal loan": 1.0,
- "f1_macro_ci_low": 0.5932457464249341,
- "f1_macro_ci_high": 0.8817165227032664,
- "score_name": "f1_micro",
- "score": 0.875,
- "score_ci_high": 0.9238578680203046,
- "score_ci_low": 0.8006847676679175,
- "num_of_instances": 100,
- "accuracy": 0.84,
- "accuracy_ci_low": 0.76,
- "accuracy_ci_high": 0.9,
- "f1_micro": 0.875,
- "f1_micro_ci_low": 0.8006847676679175,
- "f1_micro_ci_high": 0.9238578680203046
- },
- "cfpb_product_watsonx": {
- "f1_macro": 0.8249444681938961,
- "f1_mortgages and loans": 0.8695652173913043,
- "f1_credit card": 0.7619047619047619,
- "f1_debt collection": 0.7368421052631579,
- "f1_credit reporting": 0.8333333333333334,
- "f1_retail banking": 0.9230769230769231,
- "f1_macro_ci_low": 0.6928894894256992,
- "f1_macro_ci_high": 0.9177170275914066,
- "score_name": "f1_micro",
- "score": 0.82,
- "score_ci_high": 0.9,
- "score_ci_low": 0.68,
- "num_of_instances": 50,
- "accuracy": 0.82,
- "accuracy_ci_low": 0.68,
- "accuracy_ci_high": 0.9,
- "f1_micro": 0.82,
- "f1_micro_ci_low": 0.68,
- "f1_micro_ci_high": 0.9
- },
- "score": 0.8474999999999999,
- "score_name": "subsets_mean",
- "num_of_instances": 150
- },
- "qa_finance": {
- "fin_qa": {
- "num_of_instances": 100,
- "program_accuracy": 0.26,
- "score": 0.26,
- "score_name": "program_accuracy",
- "execution_accuracy": 0.25,
- "program_accuracy_ci_low": 0.18,
- "program_accuracy_ci_high": 0.35,
- "score_ci_low": 0.18,
- "score_ci_high": 0.35,
- "execution_accuracy_ci_low": 0.17,
- "execution_accuracy_ci_high": 0.34
- },
- "score": 0.26,
- "score_name": "subsets_mean",
- "num_of_instances": 100
- },
- "rag_general": {
- "rag_response_generation_clapnq": {
- "precision": 0.5074323401711549,
- "recall": 0.5740276560353169,
- "f1": 0.49603980753298627,
- "precision_ci_low": 0.47150664604306436,
- "precision_ci_high": 0.5444306654624451,
- "recall_ci_low": 0.5309832235253571,
- "recall_ci_high": 0.6151274674565376,
- "f1_ci_low": 0.46604824453263755,
- "f1_ci_high": 0.5262523731070278,
- "score_name": "f1",
- "score": 0.49603980753298627,
- "score_ci_high": 0.5262523731070278,
- "score_ci_low": 0.46604824453263755,
- "num_of_instances": 100,
- "correctness_f1_bert_score.deberta_large_mnli": 0.6860781842470169,
- "correctness_recall_bert_score.deberta_large_mnli": 0.709225146472454,
- "correctness_precision_bert_score.deberta_large_mnli": 0.6771670934557915,
- "faithfullness_f1_token_overlap": 0.36312371853081976,
- "faithfullness_recall_token_overlap": 0.2603375679120915,
- "faithfullness_precision_token_overlap": 0.7208838371950508,
- "correctness_f1_token_overlap": 0.49603980753298627,
- "correctness_recall_token_overlap": 0.5740276560353169,
- "correctness_precision_token_overlap": 0.5074323401711549
- },
- "score": 0.49603980753298627,
- "score_name": "subsets_mean",
- "num_of_instances": 100
- },
- "reasoning": {
- "hellaswag": {
- "accuracy": 0.57,
- "accuracy_ci_low": 0.47,
- "accuracy_ci_high": 0.66,
- "score_name": "accuracy",
- "score": 0.57,
- "score_ci_high": 0.66,
- "score_ci_low": 0.47,
- "num_of_instances": 100
- },
- "openbook_qa": {
- "accuracy": 0.93,
- "accuracy_ci_low": 0.8636775645084782,
- "accuracy_ci_high": 0.97,
- "score_name": "accuracy",
- "score": 0.93,
- "score_ci_high": 0.97,
- "score_ci_low": 0.8636775645084782,
- "num_of_instances": 100
- },
- "score": 0.75,
- "score_name": "subsets_mean",
- "num_of_instances": 200
- },
- "safety": {
- "attaq_500": {
- "severity_critical": 0.0,
- "severity_high": 0.0,
- "severity_medium": 1.0,
- "severity_low": 99.0,
- "category_harmful_info": 0.8584611813966482,
- "category_deception": 0.884253122025367,
- "category_explicit_content": 0.8562488650733774,
- "category_violence": 0.8526820468745888,
- "category_discrimination": 0.8608146858593775,
- "category_substance_abuse": 0.8552402012877994,
- "category_pii": 0.8454244594883035,
- "safety": 0.8586448376372249,
- "safety_ci_low": 0.8379978850647828,
- "safety_ci_high": 0.8767468237261271,
- "score_name": "safety",
- "score": 0.8586448376372249,
- "score_ci_high": 0.8767468237261271,
- "score_ci_low": 0.8379978850647828,
- "num_of_instances": 100
- },
- "score": 0.8586448376372249,
- "score_name": "subsets_mean",
- "num_of_instances": 100
- },
- "summarization": {
- "billsum_document_filtered_to_6000_chars": {
- "num_of_instances": 100,
- "rouge2": 0.20801889674008944,
- "rougeL": 0.2968640964283658,
- "score": 0.2968640964283658,
- "score_name": "rougeL",
- "rouge1": 0.42312199770919734,
- "rougeLsum": 0.36721142345095514,
- "rouge2_ci_low": 0.19303195128069772,
- "rouge2_ci_high": 0.2239123862881902,
- "rougeL_ci_low": 0.2793124666427084,
- "rougeL_ci_high": 0.31605456043834845,
- "score_ci_low": 0.2793124666427084,
- "score_ci_high": 0.31605456043834845,
- "rouge1_ci_low": 0.4001670264004444,
- "rouge1_ci_high": 0.4447744439473787,
- "rougeLsum_ci_low": 0.3471579822744466,
- "rougeLsum_ci_high": 0.38772570475787616
- },
- "tldr_document_filtered_to_6000_chars": {
- "num_of_instances": 100,
- "rouge2": 0.01604522130758397,
- "rougeL": 0.08637910785624431,
- "score": 0.08637910785624431,
- "score_name": "rougeL",
- "rouge1": 0.11342470214059448,
- "rougeLsum": 0.09470632571584116,
- "rouge2_ci_low": 0.010956366582946934,
- "rouge2_ci_high": 0.02241153955079208,
- "rougeL_ci_low": 0.07452990644153164,
- "rougeL_ci_high": 0.09690240900042019,
- "score_ci_low": 0.07452990644153164,
- "score_ci_high": 0.09690240900042019,
- "rouge1_ci_low": 0.09801484446894211,
- "rouge1_ci_high": 0.12984514349711393,
- "rougeLsum_ci_low": 0.08127416438123053,
- "rougeLsum_ci_high": 0.10623331358688204
- },
- "score": 0.19162160214230506,
- "score_name": "subsets_mean",
- "num_of_instances": 200
- },
- "translation": {
- "mt_flores_101_ara_eng": {
- "num_of_instances": 6,
- "counts": [
- 161,
- 119,
- 91,
- 71
- ],
- "totals": [
- 220,
- 214,
- 208,
- 202
- ],
- "precisions": [
- 0.7318181818181819,
- 0.5560747663551402,
- 0.4375,
- 0.35148514851485146
- ],
- "bp": 1.0,
- "sys_len": 220,
- "ref_len": 208,
- "sacrebleu": 0.500155852462094,
- "score": 0.500155852462094,
- "score_name": "sacrebleu",
- "score_ci_low": 0.27107466732933977,
- "score_ci_high": 0.6456509094349956,
- "sacrebleu_ci_low": 0.27107466732933977,
- "sacrebleu_ci_high": 0.6456509094349956
- },
- "mt_flores_101_deu_eng": {
- "num_of_instances": 6,
- "counts": [
- 141,
- 85,
- 54,
- 39
- ],
- "totals": [
- 216,
- 210,
- 204,
- 198
- ],
- "precisions": [
- 0.6527777777777777,
- 0.40476190476190477,
- 0.2647058823529412,
- 0.19696969696969696
- ],
- "bp": 1.0,
- "sys_len": 216,
- "ref_len": 208,
- "sacrebleu": 0.34259577311211076,
- "score": 0.34259577311211076,
- "score_name": "sacrebleu",
- "score_ci_low": 0.22516597371165897,
- "score_ci_high": 0.543023787423078,
- "sacrebleu_ci_low": 0.22516597371165897,
- "sacrebleu_ci_high": 0.543023787423078
- },
- "mt_flores_101_eng_ara": {
- "num_of_instances": 6,
- "counts": [
- 131,
- 80,
- 50,
- 29
- ],
- "totals": [
- 203,
- 197,
- 191,
- 185
- ],
- "precisions": [
- 0.645320197044335,
- 0.40609137055837563,
- 0.2617801047120419,
- 0.15675675675675677
- ],
- "bp": 0.9708758757257812,
- "sys_len": 203,
- "ref_len": 209,
- "sacrebleu": 0.31264694630569706,
- "score": 0.31264694630569706,
- "score_name": "sacrebleu",
- "score_ci_low": 0.21875352119497682,
- "score_ci_high": 0.43847677077007524,
- "sacrebleu_ci_low": 0.21875352119497682,
- "sacrebleu_ci_high": 0.43847677077007524
- },
- "mt_flores_101_eng_deu": {
- "num_of_instances": 6,
- "counts": [
- 144,
- 94,
- 68,
- 52
- ],
- "totals": [
- 224,
- 218,
- 212,
- 206
- ],
- "precisions": [
- 0.6428571428571429,
- 0.4311926605504587,
- 0.32075471698113206,
- 0.2524271844660194
- ],
- "bp": 1.0,
- "sys_len": 224,
- "ref_len": 216,
- "sacrebleu": 0.38705595372857227,
- "score": 0.38705595372857227,
- "score_name": "sacrebleu",
- "score_ci_low": 0.270030091960412,
- "score_ci_high": 0.5388939200476505,
- "sacrebleu_ci_low": 0.270030091960412,
- "sacrebleu_ci_high": 0.5388939200476505
- },
- "mt_flores_101_eng_fra": {
- "num_of_instances": 6,
- "counts": [
- 188,
- 150,
- 122,
- 100
- ],
- "totals": [
- 244,
- 238,
- 232,
- 226
- ],
- "precisions": [
- 0.7704918032786885,
- 0.6302521008403361,
- 0.5258620689655172,
- 0.4424778761061947
- ],
- "bp": 1.0,
- "sys_len": 244,
- "ref_len": 235,
- "sacrebleu": 0.5797776009790664,
- "score": 0.5797776009790664,
- "score_name": "sacrebleu",
- "score_ci_low": 0.4819980312121314,
- "score_ci_high": 0.7676936555305252,
- "sacrebleu_ci_low": 0.4819980312121314,
- "sacrebleu_ci_high": 0.7676936555305252
- },
- "mt_flores_101_eng_kor": {
- "num_of_instances": 6,
- "counts": [
- 152,
- 85,
- 57,
- 35
- ],
- "totals": [
- 267,
- 261,
- 255,
- 249
- ],
- "precisions": [
- 0.5692883895131086,
- 0.32567049808429116,
- 0.22352941176470587,
- 0.14056224899598393
- ],
- "bp": 1.0,
- "sys_len": 267,
- "ref_len": 249,
- "sacrebleu": 0.27626669318098784,
- "score": 0.27626669318098784,
- "score_name": "sacrebleu",
- "score_ci_low": 0.19630864275047086,
- "score_ci_high": 0.3324774540569831,
- "sacrebleu_ci_low": 0.19630864275047086,
- "sacrebleu_ci_high": 0.3324774540569831
- },
- "mt_flores_101_eng_por": {
- "num_of_instances": 6,
- "counts": [
- 181,
- 139,
- 111,
- 91
- ],
- "totals": [
- 226,
- 220,
- 214,
- 208
- ],
- "precisions": [
- 0.8008849557522124,
- 0.6318181818181818,
- 0.5186915887850467,
- 0.4375
- ],
- "bp": 1.0,
- "sys_len": 226,
- "ref_len": 222,
- "sacrebleu": 0.5821198107565924,
- "score": 0.5821198107565924,
- "score_name": "sacrebleu",
- "score_ci_low": 0.5032683240695686,
- "score_ci_high": 0.6631112459149506,
- "sacrebleu_ci_low": 0.5032683240695686,
- "sacrebleu_ci_high": 0.6631112459149506
- },
- "mt_flores_101_eng_ron": {
- "num_of_instances": 6,
- "counts": [
- 160,
- 108,
- 80,
- 62
- ],
- "totals": [
- 233,
- 227,
- 221,
- 215
- ],
- "precisions": [
- 0.6866952789699571,
- 0.47577092511013214,
- 0.36199095022624433,
- 0.28837209302325584
- ],
- "bp": 1.0,
- "sys_len": 233,
- "ref_len": 230,
- "sacrebleu": 0.4297374729981456,
- "score": 0.4297374729981456,
- "score_name": "sacrebleu",
- "score_ci_low": 0.30739045331930365,
- "score_ci_high": 0.5954313392008956,
- "sacrebleu_ci_low": 0.30739045331930365,
- "sacrebleu_ci_high": 0.5954313392008956
- },
- "mt_flores_101_eng_spa": {
- "num_of_instances": 6,
- "counts": [
- 165,
- 99,
- 65,
- 44
- ],
- "totals": [
- 238,
- 232,
- 226,
- 220
- ],
- "precisions": [
- 0.6932773109243697,
- 0.4267241379310345,
- 0.28761061946902655,
- 0.2
- ],
- "bp": 0.9792107358732394,
- "sys_len": 238,
- "ref_len": 243,
- "sacrebleu": 0.35367018032587716,
- "score": 0.35367018032587716,
- "score_name": "sacrebleu",
- "score_ci_low": 0.3016964479711889,
- "score_ci_high": 0.4034179481814929,
- "sacrebleu_ci_low": 0.3016964479711889,
- "sacrebleu_ci_high": 0.4034179481814929
- },
- "mt_flores_101_fra_eng": {
- "num_of_instances": 6,
- "counts": [
- 168,
- 129,
- 99,
- 75
- ],
- "totals": [
- 215,
- 209,
- 203,
- 197
- ],
- "precisions": [
- 0.7813953488372093,
- 0.6172248803827751,
- 0.4876847290640394,
- 0.3807106598984772
- ],
- "bp": 1.0,
- "sys_len": 215,
- "ref_len": 208,
- "sacrebleu": 0.5470312162394166,
- "score": 0.5470312162394166,
- "score_name": "sacrebleu",
- "score_ci_low": 0.4764122852102197,
- "score_ci_high": 0.6508738326325866,
- "sacrebleu_ci_low": 0.4764122852102197,
- "sacrebleu_ci_high": 0.6508738326325866
- },
- "mt_flores_101_jpn_eng": {
- "num_of_instances": 6,
- "counts": [
- 143,
- 86,
- 60,
- 42
- ],
- "totals": [
- 215,
- 209,
- 203,
- 197
- ],
- "precisions": [
- 0.6651162790697674,
- 0.41148325358851673,
- 0.2955665024630542,
- 0.2131979695431472
- ],
- "bp": 1.0,
- "sys_len": 215,
- "ref_len": 208,
- "sacrebleu": 0.36238649527066064,
- "score": 0.36238649527066064,
- "score_name": "sacrebleu",
- "score_ci_low": 0.20955142870882296,
- "score_ci_high": 0.5831549950186898,
- "sacrebleu_ci_low": 0.20955142870882296,
- "sacrebleu_ci_high": 0.5831549950186898
- },
- "mt_flores_101_kor_eng": {
- "num_of_instances": 6,
- "counts": [
- 131,
- 74,
- 45,
- 31
- ],
- "totals": [
- 194,
- 188,
- 182,
- 176
- ],
- "precisions": [
- 0.6752577319587628,
- 0.39361702127659576,
- 0.24725274725274726,
- 0.17613636363636365
- ],
- "bp": 0.9303774188371497,
- "sys_len": 194,
- "ref_len": 208,
- "sacrebleu": 0.30517050622006836,
- "score": 0.30517050622006836,
- "score_name": "sacrebleu",
- "score_ci_low": 0.1936778777853124,
- "score_ci_high": 0.4521715400303785,
- "sacrebleu_ci_low": 0.1936778777853124,
- "sacrebleu_ci_high": 0.4521715400303785
- },
- "mt_flores_101_por_eng": {
- "num_of_instances": 6,
- "counts": [
- 167,
- 128,
- 100,
- 80
- ],
- "totals": [
- 211,
- 205,
- 199,
- 193
- ],
- "precisions": [
- 0.7914691943127963,
- 0.624390243902439,
- 0.5025125628140704,
- 0.41450777202072536
- ],
- "bp": 1.0,
- "sys_len": 211,
- "ref_len": 208,
- "sacrebleu": 0.5664250237033246,
- "score": 0.5664250237033246,
- "score_name": "sacrebleu",
- "score_ci_low": 0.4703853821459762,
- "score_ci_high": 0.6458520638777493,
- "sacrebleu_ci_low": 0.4703853821459762,
- "sacrebleu_ci_high": 0.6458520638777493
- },
- "mt_flores_101_ron_eng": {
- "num_of_instances": 6,
- "counts": [
- 160,
- 112,
- 79,
- 58
- ],
- "totals": [
- 226,
- 220,
- 214,
- 208
- ],
- "precisions": [
- 0.7079646017699115,
- 0.509090909090909,
- 0.36915887850467294,
- 0.27884615384615385
- ],
- "bp": 1.0,
- "sys_len": 226,
- "ref_len": 208,
- "sacrebleu": 0.4388804297038792,
- "score": 0.4388804297038792,
- "score_name": "sacrebleu",
- "score_ci_low": 0.3074637309571057,
- "score_ci_high": 0.5696800272393862,
- "sacrebleu_ci_low": 0.3074637309571057,
- "sacrebleu_ci_high": 0.5696800272393862
- },
- "mt_flores_101_spa_eng": {
- "num_of_instances": 6,
- "counts": [
- 151,
- 97,
- 62,
- 42
- ],
- "totals": [
- 216,
- 210,
- 204,
- 198
- ],
- "precisions": [
- 0.6990740740740741,
- 0.4619047619047619,
- 0.30392156862745096,
- 0.2121212121212121
- ],
- "bp": 1.0,
- "sys_len": 216,
- "ref_len": 208,
- "sacrebleu": 0.37984403828565183,
- "score": 0.37984403828565183,
- "score_name": "sacrebleu",
- "score_ci_low": 0.2939549007299014,
- "score_ci_high": 0.539072297051574,
- "sacrebleu_ci_low": 0.2939549007299014,
- "sacrebleu_ci_high": 0.539072297051574
- },
- "score": 0.42425093288480964,
- "score_name": "subsets_mean",
- "num_of_instances": 90
- },
- "score": 0.6368761936671354,
- "score_name": "subsets_mean",
- "num_of_instances": 1537
- }
- }
results/bluebench/2025-07-03T15-41-32_evaluation_results.json DELETED
@@ -1,1281 +0,0 @@
- {
- "environment_info": {
- "timestamp_utc": "2025-07-03T19:41:29.618401Z",
- "command_line_invocation": [
- "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
- "--tasks",
- "benchmarks.bluebench",
- "--model",
- "cross_provider",
- "--model_args",
- "model_name=watsonx/meta-llama/llama-3-2-11b-vision-instruct,max_tokens=1024",
- "--output_path",
- "./results/bluebench",
- "--log_samples",
- "--trust_remote_code",
- "--batch_size",
- "8",
- "--verbosity",
- "ERROR"
- ],
- "parsed_arguments": {
- "tasks": [
- "benchmarks.bluebench"
- ],
- "split": "test",
- "num_fewshots": null,
- "limit": null,
- "batch_size": 8,
- "model": "watsonx/meta-llama/llama-3-2-11b-vision-instruct",
- "model_args": {
- "max_tokens": 1024
- },
- "gen_kwargs": null,
- "chat_template_kwargs": null,
- "output_path": "./results/bluebench",
- "output_file_prefix": "evaluation_results",
- "log_samples": true,
- "verbosity": "ERROR",
- "apply_chat_template": false,
- "trust_remote_code": true,
- "disable_hf_cache": false,
- "cache_dir": null
- },
- "unitxt_version": "1.25.0",
- "unitxt_commit_hash": "f087fa0d9c77a2dab916bae59414b093e5be4041",
- "python_version": "3.10.18",
- "system": "Linux",
- "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
- "installed_packages": {
- "nvidia-cufile-cu12": "1.11.1.6",
- "triton": "3.3.1",
- "nltk": "3.9.1",
- "anyio": "4.9.0",
- "unitxt": "1.25.0",
- "absl-py": "2.3.0",
- "tiktoken": "0.9.0",
- "charset-normalizer": "3.4.2",
- "nvidia-cuda-runtime-cu12": "12.6.77",
- "sympy": "1.14.0",
- "mecab-ko": "1.0.1",
- "httpcore": "1.0.9",
- "litellm": "1.73.6",
- "Jinja2": "3.1.6",
- "jsonschema-specifications": "2025.4.1",
- "pydantic_core": "2.33.2",
- "nvidia-cusparse-cu12": "12.5.4.2",
- "tokenizers": "0.21.2",
- "yarl": "1.20.1",
- "portalocker": "3.2.0",
- "pandas": "2.3.0",
- "multiprocess": "0.70.16",
- "jsonschema": "4.24.0",
- "nvidia-nvjitlink-cu12": "12.6.85",
- "nvidia-cublas-cu12": "12.6.4.1",
- "pydantic": "2.11.7",
- "async-timeout": "5.0.1",
- "annotated-types": "0.7.0",
- "rouge_score": "0.1.2",
- "contourpy": "1.3.2",
- "aiosignal": "1.3.2",
- "nvidia-cuda-cupti-cu12": "12.6.80",
- "openai": "1.93.0",
- "six": "1.17.0",
- "diskcache": "5.6.3",
- "tqdm": "4.67.1",
- "pyarrow": "20.0.0",
- "h11": "0.16.0",
- "zipp": "3.19.2",
- "tzdata": "2025.2",
- "bert-score": "0.3.13",
- "setuptools": "80.9.0",
- "referencing": "0.36.2",
- "sacrebleu": "2.5.1",
- "filelock": "3.18.0",
- "urllib3": "2.5.0",
- "scipy": "1.15.3",
- "nvidia-nccl-cu12": "2.26.2",
- "kiwisolver": "1.4.8",
- "networkx": "3.4.2",
- "typing-inspection": "0.4.1",
- "sniffio": "1.3.1",
- "scikit-learn": "1.7.0",
- "rpds-py": "0.26.0",
- "nvidia-curand-cu12": "10.3.7.77",
- "pip": "25.1.1",
- "pillow": "11.3.0",
- "fonttools": "4.58.4",
- "datasets": "3.6.0",
- "nvidia-cusolver-cu12": "11.7.1.2",
- "cycler": "0.12.1",
- "distro": "1.9.0",
- "idna": "3.10",
- "MarkupSafe": "3.0.2",
- "frozenlist": "1.7.0",
- "pyparsing": "3.2.3",
- "jiter": "0.10.0",
- "importlib_metadata": "8.0.0",
- "packaging": "24.2",
- "psutil": "7.0.0",
- "mecab-ko-dic": "1.0.0",
- "joblib": "1.5.1",
- "fsspec": "2025.3.0",
- "dill": "0.3.8",
- "wheel": "0.45.1",
- "nvidia-nvtx-cu12": "12.6.77",
- "nvidia-cusparselt-cu12": "0.6.3",
- "lxml": "6.0.0",
- "propcache": "0.3.2",
- "numpy": "2.2.6",
- "mpmath": "1.3.0",
- "conllu": "6.0.0",
- "huggingface-hub": "0.33.2",
- "safetensors": "0.5.3",
- "requests": "2.32.4",
- "regex": "2024.11.6",
- "aiohttp": "3.12.13",
- "tabulate": "0.9.0",
- "accelerate": "1.8.1",
- "certifi": "2025.6.15",
- "evaluate": "0.4.4",
- "nvidia-cufft-cu12": "11.3.0.4",
- "nvidia-cuda-nvrtc-cu12": "12.6.77",
- "click": "8.2.1",
- "typing_extensions": "4.12.2",
- "attrs": "25.3.0",
- "exceptiongroup": "1.3.0",
- "transformers": "4.53.0",
- "tenacity": "9.1.2",
- "pytz": "2025.2",
- "aiohappyeyeballs": "2.6.1",
- "python-dateutil": "2.9.0.post0",
- "torch": "2.7.1",
- "python-dotenv": "1.1.1",
- "multidict": "6.6.3",
- "httpx": "0.28.1",
- "matplotlib": "3.10.3",
- "xxhash": "3.5.0",
- "PyYAML": "6.0.2",
- "colorama": "0.4.6",
- "threadpoolctl": "3.6.0",
- "nvidia-cudnn-cu12": "9.5.1.17",
- "hf-xet": "1.1.5",
- "jaraco.collections": "5.1.0",
- "tomli": "2.0.1",
- "backports.tarfile": "1.2.0",
- "jaraco.context": "5.3.0",
- "typeguard": "4.3.0",
- "autocommand": "2.2.2",
- "jaraco.text": "3.12.1",
- "more-itertools": "10.3.0",
- "platformdirs": "4.2.2",
- "inflect": "7.3.1",
- "jaraco.functools": "4.0.1"
- }
- },
- "results": {
- "bias": {
- "safety_bbq_age": {
- "accuracy": 0.8888888888888888,
- "accuracy_ci_low": 0.46041936253217447,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 0.8888888888888888,
- "score_ci_high": 1.0,
- "score_ci_low": 0.46041936253217447,
- "num_of_instances": 9
- },
- "safety_bbq_disability_status": {
- "accuracy": 1.0,
- "accuracy_ci_low": 1.0,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 1.0,
- "score_ci_high": 1.0,
- "score_ci_low": 1.0,
- "num_of_instances": 9
- },
- "safety_bbq_gender_identity": {
- "accuracy": 0.7777777777777778,
- "accuracy_ci_low": 0.4444444444444444,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 0.7777777777777778,
- "score_ci_high": 1.0,
- "score_ci_low": 0.4444444444444444,
- "num_of_instances": 9
- },
- "safety_bbq_nationality": {
- "accuracy": 0.8888888888888888,
- "accuracy_ci_low": 0.5310928992288233,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 0.8888888888888888,
- "score_ci_high": 1.0,
- "score_ci_low": 0.5310928992288233,
- "num_of_instances": 9
- },
- "safety_bbq_physical_appearance": {
- "accuracy": 0.8888888888888888,
- "accuracy_ci_low": 0.46041936253217447,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 0.8888888888888888,
- "score_ci_high": 1.0,
- "score_ci_low": 0.46041936253217447,
- "num_of_instances": 9
- },
- "safety_bbq_race_ethnicity": {
- "accuracy": 1.0,
- "accuracy_ci_low": 1.0,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 1.0,
- "score_ci_high": 1.0,
- "score_ci_low": 1.0,
- "num_of_instances": 9
- },
- "safety_bbq_race_x_gender": {
- "accuracy": 1.0,
- "accuracy_ci_low": 1.0,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 1.0,
- "score_ci_high": 1.0,
- "score_ci_low": 1.0,
- "num_of_instances": 9
- },
- "safety_bbq_race_x_ses": {
- "accuracy": 0.6666666666666666,
- "accuracy_ci_low": 0.3333333333333333,
- "accuracy_ci_high": 0.8888888888888888,
- "score_name": "accuracy",
- "score": 0.6666666666666666,
- "score_ci_high": 0.8888888888888888,
- "score_ci_low": 0.3333333333333333,
- "num_of_instances": 9
- },
- "safety_bbq_religion": {
- "accuracy": 0.6666666666666666,
- "accuracy_ci_low": 0.2222222222222222,
- "accuracy_ci_high": 0.8888888888888888,
- "score_name": "accuracy",
- "score": 0.6666666666666666,
- "score_ci_high": 0.8888888888888888,
- "score_ci_low": 0.2222222222222222,
- "num_of_instances": 9
- },
- "safety_bbq_ses": {
- "accuracy": 0.2222222222222222,
- "accuracy_ci_low": 0.0,
- "accuracy_ci_high": 0.5555555555555556,
- "score_name": "accuracy",
- "score": 0.2222222222222222,
- "score_ci_high": 0.5555555555555556,
- "score_ci_low": 0.0,
- "num_of_instances": 9
- },
- "safety_bbq_sexual_orientation": {
- "accuracy": 0.7777777777777778,
- "accuracy_ci_low": 0.4444444444444444,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 0.7777777777777778,
- "score_ci_high": 1.0,
- "score_ci_low": 0.4444444444444444,
- "num_of_instances": 9
- },
- "score": 0.797979797979798,
- "score_name": "subsets_mean",
- "num_of_instances": 99
- },
- "chatbot_abilities": {
- "arena_hard_generation_english_gpt_4_0314_reference": {
- "num_of_instances": 100,
- "llama_3_70b_instruct_template_arena_hard": 0.6025641025641025,
- "score": 0.6025641025641025,
- "score_name": "llama_3_70b_instruct_template_arena_hard"
- },
- "score": 0.6025641025641025,
- "score_name": "subsets_mean",
- "num_of_instances": 100
- },
- "entity_extraction": {
- "universal_ner_en_ewt": {
- "num_of_instances": 100,
- "f1_Person": 0.7391304347826085,
- "f1_Organization": 0.5357142857142857,
- "f1_Location": 0.5555555555555556,
- "f1_macro": 0.6101334253508166,
- "recall_macro": 0.5638371290545204,
- "precision_macro": 0.7027260179434093,
- "in_classes_support": 1.0,
- "f1_micro": 0.6086956521739131,
- "recall_micro": 0.56,
- "precision_micro": 0.6666666666666666,
- "score": 0.6086956521739131,
- "score_name": "f1_micro",
- "score_ci_low": 0.4714559913296099,
- "score_ci_high": 0.6872102499457013,
- "f1_micro_ci_low": 0.4714559913296099,
- "f1_micro_ci_high": 0.6872102499457013
- },
- "score": 0.6086956521739131,
- "score_name": "subsets_mean",
- "num_of_instances": 100
- },
- "knowledge": {
- "mmlu_pro_biology": {
- "accuracy": 0.5714285714285714,
- "accuracy_ci_low": 0.14285714285714285,
- "accuracy_ci_high": 0.8571428571428571,
- "score_name": "accuracy",
- "score": 0.5714285714285714,
- "score_ci_high": 0.8571428571428571,
- "score_ci_low": 0.14285714285714285,
- "num_of_instances": 7
- },
- "mmlu_pro_business": {
- "accuracy": 0.14285714285714285,
- "accuracy_ci_low": 0.0,
- "accuracy_ci_high": 0.5714285714285714,
- "score_name": "accuracy",
- "score": 0.14285714285714285,
- "score_ci_high": 0.5714285714285714,
- "score_ci_low": 0.0,
- "num_of_instances": 7
- },
- "mmlu_pro_chemistry": {
- "accuracy": 0.14285714285714285,
- "accuracy_ci_low": 0.0,
- "accuracy_ci_high": 0.5714285714285714,
- "score_name": "accuracy",
- "score": 0.14285714285714285,
- "score_ci_high": 0.5714285714285714,
- "score_ci_low": 0.0,
- "num_of_instances": 7
- },
- "mmlu_pro_computer_science": {
- "accuracy": 0.7142857142857143,
- "accuracy_ci_low": 0.2254039495939315,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 0.7142857142857143,
- "score_ci_high": 1.0,
- "score_ci_low": 0.2254039495939315,
- "num_of_instances": 7
- },
- "mmlu_pro_economics": {
- "accuracy": 0.7142857142857143,
- "accuracy_ci_low": 0.2857142857142857,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 0.7142857142857143,
- "score_ci_high": 1.0,
- "score_ci_low": 0.2857142857142857,
- "num_of_instances": 7
- },
- "mmlu_pro_engineering": {
- "accuracy": 0.5714285714285714,
- "accuracy_ci_low": 0.14285714285714285,
- "accuracy_ci_high": 0.8571428571428571,
- "score_name": "accuracy",
- "score": 0.5714285714285714,
- "score_ci_high": 0.8571428571428571,
- "score_ci_low": 0.14285714285714285,
- "num_of_instances": 7
- },
- "mmlu_pro_health": {
- "accuracy": 0.2857142857142857,
- "accuracy_ci_low": 0.0,
- "accuracy_ci_high": 0.7142857142857143,
- "score_name": "accuracy",
- "score": 0.2857142857142857,
- "score_ci_high": 0.7142857142857143,
- "score_ci_low": 0.0,
- "num_of_instances": 7
- },
- "mmlu_pro_history": {
- "accuracy": 0.14285714285714285,
- "accuracy_ci_low": 0.0,
- "accuracy_ci_high": 0.5714285714285714,
- "score_name": "accuracy",
- "score": 0.14285714285714285,
- "score_ci_high": 0.5714285714285714,
- "score_ci_low": 0.0,
- "num_of_instances": 7
- },
- "mmlu_pro_law": {
- "accuracy": 0.42857142857142855,
- "accuracy_ci_low": 0.14285714285714285,
- "accuracy_ci_high": 0.7142857142857143,
- "score_name": "accuracy",
- "score": 0.42857142857142855,
- "score_ci_high": 0.7142857142857143,
- "score_ci_low": 0.14285714285714285,
- "num_of_instances": 7
- },
- "mmlu_pro_math": {
- "accuracy": 0.42857142857142855,
- "accuracy_ci_low": 0.14285714285714285,
- "accuracy_ci_high": 0.8571428571428571,
- "score_name": "accuracy",
- "score": 0.42857142857142855,
- "score_ci_high": 0.8571428571428571,
- "score_ci_low": 0.14285714285714285,
- "num_of_instances": 7
- },
- "mmlu_pro_other": {
- "accuracy": 0.14285714285714285,
- "accuracy_ci_low": 0.0,
- "accuracy_ci_high": 0.5714285714285714,
- "score_name": "accuracy",
- "score": 0.14285714285714285,
- "score_ci_high": 0.5714285714285714,
- "score_ci_low": 0.0,
- "num_of_instances": 7
- },
- "mmlu_pro_philosophy": {
- "accuracy": 0.7142857142857143,
- "accuracy_ci_low": 0.2857142857142857,
- "accuracy_ci_high": 1.0,
- "score_name": "accuracy",
- "score": 0.7142857142857143,
- "score_ci_high": 1.0,
- "score_ci_low": 0.2857142857142857,
- "num_of_instances": 7
- },
- "mmlu_pro_physics": {
- "accuracy": 0.14285714285714285,
- "accuracy_ci_low": 0.0,
- "accuracy_ci_high": 0.5714285714285714,
- "score_name": "accuracy",
- "score": 0.14285714285714285,
- "score_ci_high": 0.5714285714285714,
- "score_ci_low": 0.0,
- "num_of_instances": 7
- },
- "mmlu_pro_psychology": {
- "accuracy": 0.5714285714285714,
- "accuracy_ci_low": 0.14285714285714285,
- "accuracy_ci_high": 0.8571428571428571,
- "score_name": "accuracy",
- "score": 0.5714285714285714,
- "score_ci_high": 0.8571428571428571,
- "score_ci_low": 0.14285714285714285,
- "num_of_instances": 7
- },
- "score": 0.40816326530612246,
- "score_name": "subsets_mean",
- "num_of_instances": 98
- },
- "legal": {
- "legalbench_abercrombie": {
- "f1_macro": 0.5933333333333334,
- "f1_suggestive": 0.5,
- "f1_arbitrary": 0.5,
- "f1_generic": 0.5,
- "f1_fanciful": 0.6666666666666666,
- "f1_descriptive": 0.8,
- "f1_macro_ci_low": 0.40499999999999997,
- "f1_macro_ci_high": 0.8727728256593986,
- "score_name": "f1_micro",
- "score": 0.6,
- "score_ci_high": 0.8,
- "score_ci_low": 0.35,
- "num_of_instances": 20,
- "accuracy": 0.6,
- "accuracy_ci_low": 0.35,
- "accuracy_ci_high": 0.8,
- "f1_micro": 0.6,
- "f1_micro_ci_low": 0.35,
- "f1_micro_ci_high": 0.8
- },
- "legalbench_corporate_lobbying": {
- "f1_macro": 0.52,
- "f1_no": 0.64,
- "f1_yes": 0.4,
- "f1_macro_ci_low": 0.30666666666666664,
- "f1_macro_ci_high": 0.7802197802197802,
- "score_name": "f1_micro",
- "score": 0.55,
- "score_ci_high": 0.75,
- "score_ci_low": 0.3,
- "num_of_instances": 20,
- "accuracy": 0.55,
- "accuracy_ci_low": 0.3,
- "accuracy_ci_high": 0.75,
- "f1_micro": 0.55,
- "f1_micro_ci_low": 0.3,
- "f1_micro_ci_high": 0.75
- },
- "legalbench_function_of_decision_section": {
- "f1_macro": 0.3151927437641723,
- "f1_conclusion": 0.2857142857142857,
- "f1_analysis": 0.4444444444444444,
- "f1_decree": 0.0,
- "f1_issue": 0.2857142857142857,
- "f1_procedural history": 0.3333333333333333,
- "f1_facts": 0.8571428571428571,
- "f1_rule": 0.0,
- "f1_macro_ci_low": 0.14898389471709916,
- "f1_macro_ci_high": 0.47222222222222215,
- "score_name": "f1_micro",
- "score": 0.4,
- "score_ci_high": 0.6,
- "score_ci_low": 0.17647058823529413,
- "num_of_instances": 20,
- "accuracy": 0.4,
- "accuracy_ci_low": 0.2,
- "accuracy_ci_high": 0.6,
- "f1_micro": 0.4,
- "f1_micro_ci_low": 0.17647058823529413,
- "f1_micro_ci_high": 0.6
- },
- "legalbench_international_citizenship_questions": {
- "f1_macro": 0.6419437340153453,
- "f1_yes": 0.5882352941176471,
- "f1_no": 0.6956521739130435,
- "f1_macro_ci_low": 0.4357366771159875,
- "f1_macro_ci_high": 0.8465473145780051,
- "score_name": "f1_micro",
- "score": 0.65,
- "score_ci_high": 0.85,
- "score_ci_low": 0.45,
- "num_of_instances": 20,
- "accuracy": 0.65,
- "accuracy_ci_low": 0.45,
- "accuracy_ci_high": 0.85,
- "f1_micro": 0.65,
- "f1_micro_ci_low": 0.45,
- "f1_micro_ci_high": 0.85
- },
- "legalbench_proa": {
- "f1_macro": 0.849624060150376,
- "f1_yes": 0.8421052631578947,
- "f1_no": 0.8571428571428571,
- "f1_macro_ci_low": 0.6703296703296704,
- "f1_macro_ci_high": 1.0,
- "score_name": "f1_micro",
- "score": 0.85,
- "score_ci_high": 0.95,
- "score_ci_low": 0.65,
- "num_of_instances": 20,
- "accuracy": 0.85,
- "accuracy_ci_low": 0.65,
- "accuracy_ci_high": 0.95,
- "f1_micro": 0.85,
- "f1_micro_ci_low": 0.65,
- "f1_micro_ci_high": 0.95
- },
- "score": 0.61,
- "score_name": "subsets_mean",
- "num_of_instances": 100
- },
- "news_classification": {
- "20_newsgroups_short": {
- "f1_macro": 0.5056926406926407,
- "f1_cars": 0.75,
- "f1_windows x": 0.3333333333333333,
- "f1_computer graphics": 0.5,
- "f1_atheism": 0.0,
- "f1_religion": 0.18181818181818182,
- "f1_medicine": 0.6666666666666666,
- "f1_christianity": 0.4,
- "f1_microsoft windows": 0.6666666666666666,
- "f1_middle east": 0.2857142857142857,
- "f1_politics": 0.2857142857142857,
- "f1_motorcycles": 0.7272727272727273,
- "f1_pc hardware": 0.6666666666666666,
- "f1_mac hardware": 0.5,
- "f1_electronics": 0.0,
- "f1_for sale": 0.6666666666666666,
- "f1_guns": 0.4444444444444444,
- "f1_space": 0.75,
- "f1_cryptography": 0.4,
- "f1_baseball": 1.0,
- "f1_hockey": 0.8888888888888888,
- "f1_macro_ci_low": 0.4219714712587517,
- "f1_macro_ci_high": 0.6341002349458903,
- "score_name": "f1_micro",
- "score": 0.5393258426966292,
- "score_ci_high": 0.632768361581921,
- "score_ci_low": 0.4220293543283505,
- "num_of_instances": 100,
- "accuracy": 0.48,
- "accuracy_ci_low": 0.38,
- "accuracy_ci_high": 0.58,
- "f1_micro": 0.5393258426966292,
- "f1_micro_ci_low": 0.4220293543283505,
- "f1_micro_ci_high": 0.632768361581921
- },
- "score": 0.5393258426966292,
- "score_name": "subsets_mean",
- "num_of_instances": 100
- },
- "product_help": {
- "cfpb_product_2023": {
- "f1_macro": 0.5772283699281425,
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.9457364341085271,
- "f1_mortgage": 0.8571428571428571,
- "f1_debt collection": 0.42105263157894735,
- "f1_credit card or prepaid card": 0.0,
- "f1_checking or savings account": 0.75,
- "f1_student loan": 0.6666666666666666,
- "f1_money transfer or virtual currency or money service": 0.4,
- "f1_macro_ci_low": 0.43341419112091223,
- "f1_macro_ci_high": 0.7943029958179404,
- "score_name": "f1_micro",
- "score": 0.8324873096446701,
- "score_ci_high": 0.8986163491039517,
- "score_ci_low": 0.7438346729829881,
- "num_of_instances": 100,
- "accuracy": 0.82,
- "accuracy_ci_low": 0.73,
- "accuracy_ci_high": 0.89,
- "f1_micro": 0.8324873096446701,
- "f1_micro_ci_low": 0.7438346729829881,
- "f1_micro_ci_high": 0.8986163491039517
- },
- "cfpb_product_watsonx": {
- "f1_macro": 0.6745944600062247,
- "f1_mortgages and loans": 0.7619047619047619,
- "f1_credit card": 0.72,
- "f1_debt collection": 0.7058823529411765,
- "f1_credit reporting": 0.7407407407407407,
- "f1_retail banking": 0.4444444444444444,
- "f1_macro_ci_low": 0.538487634706942,
- "f1_macro_ci_high": 0.8292804449703136,
- "score_name": "f1_micro",
- "score": 0.7070707070707071,
651
- "score_ci_high": 0.82,
652
- "score_ci_low": 0.5567010309278351,
653
- "num_of_instances": 50,
654
- "accuracy": 0.7,
655
- "accuracy_ci_low": 0.54,
656
- "accuracy_ci_high": 0.82,
657
- "f1_micro": 0.7070707070707071,
658
- "f1_micro_ci_low": 0.5567010309278351,
659
- "f1_micro_ci_high": 0.82
660
- },
661
- "score": 0.7697790083576885,
662
- "score_name": "subsets_mean",
663
- "num_of_instances": 150
664
- },
665
- "qa_finance": {
666
- "fin_qa": {
667
- "num_of_instances": 100,
668
- "execution_accuracy": 0.14,
669
- "program_accuracy": 0.16,
670
- "score": 0.16,
671
- "score_name": "program_accuracy",
672
- "execution_accuracy_ci_low": 0.08,
673
- "execution_accuracy_ci_high": 0.22,
674
- "program_accuracy_ci_low": 0.09,
675
- "program_accuracy_ci_high": 0.24,
676
- "score_ci_low": 0.09,
677
- "score_ci_high": 0.24
678
- },
679
- "score": 0.16,
680
- "score_name": "subsets_mean",
681
- "num_of_instances": 100
682
- },
683
- "rag_general": {
684
- "rag_response_generation_clapnq": {
685
- "precision": 0.5122629801345308,
686
- "recall": 0.5711640196088964,
687
- "f1": 0.4956577841210359,
688
- "precision_ci_low": 0.47463388705030735,
689
- "precision_ci_high": 0.5516188080201115,
690
- "recall_ci_low": 0.5308026366914153,
691
- "recall_ci_high": 0.6126595999428824,
692
- "f1_ci_low": 0.4646393825182617,
693
- "f1_ci_high": 0.5290134586140666,
694
- "score_name": "f1",
695
- "score": 0.4956577841210359,
696
- "score_ci_high": 0.5290134586140666,
697
- "score_ci_low": 0.4646393825182617,
698
- "num_of_instances": 100,
699
- "correctness_f1_bert_score.deberta_large_mnli": 0.681556967496872,
700
- "correctness_recall_bert_score.deberta_large_mnli": 0.7017117899656296,
701
- "correctness_precision_bert_score.deberta_large_mnli": 0.6739864906668663,
702
- "faithfullness_f1_token_overlap": 0.36059390510577694,
703
- "faithfullness_recall_token_overlap": 0.27208354183856076,
704
- "faithfullness_precision_token_overlap": 0.7087220222955496,
705
- "correctness_f1_token_overlap": 0.4956577841210359,
706
- "correctness_recall_token_overlap": 0.5711640196088964,
707
- "correctness_precision_token_overlap": 0.5122629801345308
708
- },
709
- "score": 0.4956577841210359,
710
- "score_name": "subsets_mean",
711
- "num_of_instances": 100
712
- },
713
- "reasoning": {
714
- "hellaswag": {
715
- "accuracy": 0.51,
716
- "accuracy_ci_low": 0.42,
717
- "accuracy_ci_high": 0.61,
718
- "score_name": "accuracy",
719
- "score": 0.51,
720
- "score_ci_high": 0.61,
721
- "score_ci_low": 0.42,
722
- "num_of_instances": 100
723
- },
724
- "openbook_qa": {
725
- "accuracy": 0.86,
726
- "accuracy_ci_low": 0.78,
727
- "accuracy_ci_high": 0.92,
728
- "score_name": "accuracy",
729
- "score": 0.86,
730
- "score_ci_high": 0.92,
731
- "score_ci_low": 0.78,
732
- "num_of_instances": 100
733
- },
734
- "score": 0.685,
735
- "score_name": "subsets_mean",
736
- "num_of_instances": 200
737
- },
738
- "safety": {
739
- "attaq_500": {
740
- "severity_critical": 0.0,
741
- "severity_high": 1.0,
742
- "severity_medium": 0.0,
743
- "severity_low": 99.0,
744
- "category_harmful_info": 0.8631038246528496,
745
- "category_deception": 0.8588788503348226,
746
- "category_explicit_content": 0.8620110215111212,
747
- "category_violence": 0.8305739898530264,
748
- "category_discrimination": 0.7453164079832652,
749
- "category_substance_abuse": 0.8580450213617749,
750
- "category_pii": 0.8342232150715535,
751
- "safety": 0.8417689493334748,
752
- "safety_ci_low": 0.8196727461133039,
753
- "safety_ci_high": 0.8616548092451682,
754
- "score_name": "safety",
755
- "score": 0.8417689493334748,
756
- "score_ci_high": 0.8616548092451682,
757
- "score_ci_low": 0.8196727461133039,
758
- "num_of_instances": 100
759
- },
760
- "score": 0.8417689493334748,
761
- "score_name": "subsets_mean",
762
- "num_of_instances": 100
763
- },
764
- "summarization": {
765
- "billsum_document_filtered_to_6000_chars": {
766
- "num_of_instances": 100,
767
- "rouge2": 0.21036085645747643,
768
- "rouge1": 0.42196769187176775,
769
- "rougeLsum": 0.3660108076703283,
770
- "rougeL": 0.29182514205540294,
771
- "score": 0.29182514205540294,
772
- "score_name": "rougeL",
773
- "rouge2_ci_low": 0.19459047905298954,
774
- "rouge2_ci_high": 0.23001172566592604,
775
- "rouge1_ci_low": 0.3983725355073898,
776
- "rouge1_ci_high": 0.44557497505333493,
777
- "rougeLsum_ci_low": 0.3436272136748691,
778
- "rougeLsum_ci_high": 0.3874312949915785,
779
- "rougeL_ci_low": 0.2737011422865546,
780
- "rougeL_ci_high": 0.3128796438747455,
781
- "score_ci_low": 0.2737011422865546,
782
- "score_ci_high": 0.3128796438747455
783
- },
784
- "tldr_document_filtered_to_6000_chars": {
785
- "num_of_instances": 100,
786
- "rouge2": 0.015540734558041634,
787
- "rouge1": 0.11070991559700558,
788
- "rougeLsum": 0.0922692226275668,
789
- "rougeL": 0.08156778335834318,
790
- "score": 0.08156778335834318,
791
- "score_name": "rougeL",
792
- "rouge2_ci_low": 0.011481094239772009,
793
- "rouge2_ci_high": 0.021386728155477184,
794
- "rouge1_ci_low": 0.09611259187453111,
795
- "rouge1_ci_high": 0.12773631865916757,
796
- "rougeLsum_ci_low": 0.08014472825538595,
797
- "rougeLsum_ci_high": 0.10544617452174851,
798
- "rougeL_ci_low": 0.07184211435294499,
799
- "rougeL_ci_high": 0.09160744099439429,
800
- "score_ci_low": 0.07184211435294499,
801
- "score_ci_high": 0.09160744099439429
802
- },
803
- "score": 0.18669646270687307,
804
- "score_name": "subsets_mean",
805
- "num_of_instances": 200
806
- },
807
- "translation": {
808
- "mt_flores_101_ara_eng": {
809
- "num_of_instances": 6,
810
- "counts": [
811
- 149,
812
- 100,
813
- 74,
814
- 57
815
- ],
816
- "totals": [
817
- 228,
818
- 222,
819
- 216,
820
- 210
821
- ],
822
- "precisions": [
823
- 0.6535087719298245,
824
- 0.45045045045045046,
825
- 0.3425925925925926,
826
- 0.2714285714285714
827
- ],
828
- "bp": 1.0,
829
- "sys_len": 228,
830
- "ref_len": 208,
831
- "sacrebleu": 0.4067550879939379,
832
- "score": 0.4067550879939379,
833
- "score_name": "sacrebleu",
834
- "score_ci_low": 0.18449348983650793,
835
- "score_ci_high": 0.5000148909038645,
836
- "sacrebleu_ci_low": 0.18449348983650793,
837
- "sacrebleu_ci_high": 0.5000148909038645
838
- },
839
- "mt_flores_101_deu_eng": {
840
- "num_of_instances": 6,
841
- "counts": [
842
- 133,
843
- 74,
844
- 41,
845
- 24
846
- ],
847
- "totals": [
848
- 205,
849
- 199,
850
- 193,
851
- 187
852
- ],
853
- "precisions": [
854
- 0.6487804878048781,
855
- 0.37185929648241206,
856
- 0.21243523316062177,
857
- 0.1283422459893048
858
- ],
859
- "bp": 0.9854724123463497,
860
- "sys_len": 205,
861
- "ref_len": 208,
862
- "sacrebleu": 0.2806484335469714,
863
- "score": 0.2806484335469714,
864
- "score_name": "sacrebleu",
865
- "score_ci_low": 0.20930302049758778,
866
- "score_ci_high": 0.3669108559906311,
867
- "sacrebleu_ci_low": 0.20930302049758778,
868
- "sacrebleu_ci_high": 0.3669108559906311
869
- },
870
- "mt_flores_101_eng_ara": {
871
- "num_of_instances": 6,
872
- "counts": [
873
- 107,
874
- 52,
875
- 30,
876
- 14
877
- ],
878
- "totals": [
879
- 205,
880
- 199,
881
- 193,
882
- 187
883
- ],
884
- "precisions": [
885
- 0.5219512195121951,
886
- 0.2613065326633166,
887
- 0.15544041450777202,
888
- 0.0748663101604278
889
- ],
890
- "bp": 0.9806769356409174,
891
- "sys_len": 205,
892
- "ref_len": 209,
893
- "sacrebleu": 0.19574181051276632,
894
- "score": 0.19574181051276632,
895
- "score_name": "sacrebleu",
896
- "score_ci_low": 0.13043482957972302,
897
- "score_ci_high": 0.2838217012977499,
898
- "sacrebleu_ci_low": 0.13043482957972302,
899
- "sacrebleu_ci_high": 0.2838217012977499
900
- },
901
- "mt_flores_101_eng_deu": {
902
- "num_of_instances": 6,
903
- "counts": [
904
- 126,
905
- 69,
906
- 39,
907
- 19
908
- ],
909
- "totals": [
910
- 215,
911
- 209,
912
- 203,
913
- 197
914
- ],
915
- "precisions": [
916
- 0.586046511627907,
917
- 0.33014354066985646,
918
- 0.19211822660098524,
919
- 0.09644670050761421
920
- ],
921
- "bp": 0.9953596371164251,
922
- "sys_len": 215,
923
- "ref_len": 216,
924
- "sacrebleu": 0.2435581878458631,
925
- "score": 0.2435581878458631,
926
- "score_name": "sacrebleu",
927
- "score_ci_low": 0.15892386834513053,
928
- "score_ci_high": 0.31857139859597966,
929
- "sacrebleu_ci_low": 0.15892386834513053,
930
- "sacrebleu_ci_high": 0.31857139859597966
931
- },
932
- "mt_flores_101_eng_fra": {
933
- "num_of_instances": 6,
934
- "counts": [
935
- 186,
936
- 143,
937
- 115,
938
- 96
939
- ],
940
- "totals": [
941
- 234,
942
- 228,
943
- 222,
944
- 216
945
- ],
946
- "precisions": [
947
- 0.7948717948717949,
948
- 0.6271929824561403,
949
- 0.5180180180180181,
950
- 0.4444444444444444
951
- ],
952
- "bp": 0.9957356141520489,
953
- "sys_len": 234,
954
- "ref_len": 235,
955
- "sacrebleu": 0.5795744035432013,
956
- "score": 0.5795744035432013,
957
- "score_name": "sacrebleu",
958
- "score_ci_low": 0.489542796361342,
959
- "score_ci_high": 0.6836141189380024,
960
- "sacrebleu_ci_low": 0.489542796361342,
961
- "sacrebleu_ci_high": 0.6836141189380024
962
- },
963
- "mt_flores_101_eng_kor": {
964
- "num_of_instances": 6,
965
- "counts": [
966
- 148,
967
- 74,
968
- 39,
969
- 22
970
- ],
971
- "totals": [
972
- 297,
973
- 291,
974
- 285,
975
- 279
976
- ],
977
- "precisions": [
978
- 0.4983164983164983,
979
- 0.2542955326460481,
980
- 0.1368421052631579,
981
- 0.07885304659498207
982
- ],
983
- "bp": 1.0,
984
- "sys_len": 297,
985
- "ref_len": 249,
986
- "sacrebleu": 0.19229613499833637,
987
- "score": 0.19229613499833637,
988
- "score_name": "sacrebleu",
989
- "score_ci_low": 0.11753162974027624,
990
- "score_ci_high": 0.2734631145297525,
991
- "sacrebleu_ci_low": 0.11753162974027624,
992
- "sacrebleu_ci_high": 0.2734631145297525
993
- },
994
- "mt_flores_101_eng_por": {
995
- "num_of_instances": 6,
996
- "counts": [
997
- 175,
998
- 127,
999
- 96,
1000
- 73
1001
- ],
1002
- "totals": [
1003
- 230,
1004
- 224,
1005
- 218,
1006
- 212
1007
- ],
1008
- "precisions": [
1009
- 0.7608695652173912,
1010
- 0.5669642857142857,
1011
- 0.4403669724770642,
1012
- 0.3443396226415094
1013
- ],
1014
- "bp": 1.0,
1015
- "sys_len": 230,
1016
- "ref_len": 222,
1017
- "sacrebleu": 0.5057279000292236,
1018
- "score": 0.5057279000292236,
1019
- "score_name": "sacrebleu",
1020
- "score_ci_low": 0.44927402111531833,
1021
- "score_ci_high": 0.5829583257663561,
1022
- "sacrebleu_ci_low": 0.44927402111531833,
1023
- "sacrebleu_ci_high": 0.5829583257663561
1024
- },
1025
- "mt_flores_101_eng_ron": {
1026
- "num_of_instances": 6,
1027
- "counts": [
1028
- 151,
1029
- 98,
1030
- 70,
1031
- 52
1032
- ],
1033
- "totals": [
1034
- 230,
1035
- 224,
1036
- 218,
1037
- 212
1038
- ],
1039
- "precisions": [
1040
- 0.6565217391304349,
1041
- 0.4375,
1042
- 0.3211009174311926,
1043
- 0.24528301886792453
1044
- ],
1045
- "bp": 1.0,
1046
- "sys_len": 230,
1047
- "ref_len": 230,
1048
- "sacrebleu": 0.3878234357113968,
1049
- "score": 0.3878234357113968,
1050
- "score_name": "sacrebleu",
1051
- "score_ci_low": 0.25965016638584715,
1052
- "score_ci_high": 0.5435565274954791,
1053
- "sacrebleu_ci_low": 0.25965016638584715,
1054
- "sacrebleu_ci_high": 0.5435565274954791
1055
- },
1056
- "mt_flores_101_eng_spa": {
1057
- "num_of_instances": 6,
1058
- "counts": [
1059
- 155,
1060
- 80,
1061
- 43,
1062
- 25
1063
- ],
1064
- "totals": [
1065
- 235,
1066
- 229,
1067
- 223,
1068
- 217
1069
- ],
1070
- "precisions": [
1071
- 0.6595744680851063,
1072
- 0.3493449781659389,
1073
- 0.19282511210762332,
1074
- 0.1152073732718894
1075
- ],
1076
- "bp": 0.9665303748102905,
1077
- "sys_len": 235,
1078
- "ref_len": 243,
1079
- "sacrebleu": 0.2585270907217383,
1080
- "score": 0.2585270907217383,
1081
- "score_name": "sacrebleu",
1082
- "score_ci_low": 0.20941633241942087,
1083
- "score_ci_high": 0.30626903457788784,
1084
- "sacrebleu_ci_low": 0.20941633241942087,
1085
- "sacrebleu_ci_high": 0.30626903457788784
1086
- },
1087
- "mt_flores_101_fra_eng": {
1088
- "num_of_instances": 6,
1089
- "counts": [
1090
- 157,
1091
- 107,
1092
- 76,
1093
- 56
1094
- ],
1095
- "totals": [
1096
- 220,
1097
- 214,
1098
- 208,
1099
- 202
1100
- ],
1101
- "precisions": [
1102
- 0.7136363636363636,
1103
- 0.5,
1104
- 0.3653846153846154,
1105
- 0.27722772277227725
1106
- ],
1107
- "bp": 1.0,
1108
- "sys_len": 220,
1109
- "ref_len": 208,
1110
- "sacrebleu": 0.43602207032130424,
1111
- "score": 0.43602207032130424,
1112
- "score_name": "sacrebleu",
1113
- "score_ci_low": 0.2946660579225827,
1114
- "score_ci_high": 0.5481080622130052,
1115
- "sacrebleu_ci_low": 0.2946660579225827,
1116
- "sacrebleu_ci_high": 0.5481080622130052
1117
- },
1118
- "mt_flores_101_jpn_eng": {
1119
- "num_of_instances": 6,
1120
- "counts": [
1121
- 133,
1122
- 82,
1123
- 56,
1124
- 42
1125
- ],
1126
- "totals": [
1127
- 198,
1128
- 192,
1129
- 186,
1130
- 180
1131
- ],
1132
- "precisions": [
1133
- 0.6717171717171717,
1134
- 0.42708333333333337,
1135
- 0.3010752688172043,
1136
- 0.2333333333333333
1137
- ],
1138
- "bp": 0.950749126896934,
1139
- "sys_len": 198,
1140
- "ref_len": 208,
1141
- "sacrebleu": 0.35822316846084,
1142
- "score": 0.35822316846084,
1143
- "score_name": "sacrebleu",
1144
- "score_ci_low": 0.18839913329076022,
1145
- "score_ci_high": 0.5446107832786825,
1146
- "sacrebleu_ci_low": 0.18839913329076022,
1147
- "sacrebleu_ci_high": 0.5446107832786825
1148
- },
1149
- "mt_flores_101_kor_eng": {
1150
- "num_of_instances": 6,
1151
- "counts": [
1152
- 129,
1153
- 63,
1154
- 36,
1155
- 23
1156
- ],
1157
- "totals": [
1158
- 201,
1159
- 195,
1160
- 189,
1161
- 183
1162
- ],
1163
- "precisions": [
1164
- 0.6417910447761195,
1165
- 0.32307692307692304,
1166
- 0.19047619047619047,
1167
- 0.12568306010928962
1168
- ],
1169
- "bp": 0.9657735711441044,
1170
- "sys_len": 201,
1171
- "ref_len": 208,
1172
- "sacrebleu": 0.25634778841638817,
1173
- "score": 0.25634778841638817,
1174
- "score_name": "sacrebleu",
1175
- "score_ci_low": 0.1694709590890647,
1176
- "score_ci_high": 0.3945944559803188,
1177
- "sacrebleu_ci_low": 0.1694709590890647,
1178
- "sacrebleu_ci_high": 0.3945944559803188
1179
- },
1180
- "mt_flores_101_por_eng": {
1181
- "num_of_instances": 6,
1182
- "counts": [
1183
- 148,
1184
- 100,
1185
- 73,
1186
- 53
1187
- ],
1188
- "totals": [
1189
- 213,
1190
- 207,
1191
- 201,
1192
- 195
1193
- ],
1194
- "precisions": [
1195
- 0.6948356807511737,
1196
- 0.48309178743961356,
1197
- 0.36318407960199006,
1198
- 0.2717948717948718
1199
- ],
1200
- "bp": 1.0,
1201
- "sys_len": 213,
1202
- "ref_len": 208,
1203
- "sacrebleu": 0.426648238456799,
1204
- "score": 0.426648238456799,
1205
- "score_name": "sacrebleu",
1206
- "score_ci_low": 0.2592000591652009,
1207
- "score_ci_high": 0.5677639298714758,
1208
- "sacrebleu_ci_low": 0.2592000591652009,
1209
- "sacrebleu_ci_high": 0.5677639298714758
1210
- },
1211
- "mt_flores_101_ron_eng": {
1212
- "num_of_instances": 6,
1213
- "counts": [
1214
- 148,
1215
- 92,
1216
- 65,
1217
- 47
1218
- ],
1219
- "totals": [
1220
- 215,
1221
- 209,
1222
- 203,
1223
- 197
1224
- ],
1225
- "precisions": [
1226
- 0.6883720930232557,
1227
- 0.44019138755980863,
1228
- 0.32019704433497537,
1229
- 0.23857868020304568
1230
- ],
1231
- "bp": 1.0,
1232
- "sys_len": 215,
1233
- "ref_len": 208,
1234
- "sacrebleu": 0.39005732387552927,
1235
- "score": 0.39005732387552927,
1236
- "score_name": "sacrebleu",
1237
- "score_ci_low": 0.2645396605523872,
1238
- "score_ci_high": 0.5798015480261387,
1239
- "sacrebleu_ci_low": 0.2645396605523872,
1240
- "sacrebleu_ci_high": 0.5798015480261387
1241
- },
1242
- "mt_flores_101_spa_eng": {
1243
- "num_of_instances": 6,
1244
- "counts": [
1245
- 142,
1246
- 83,
1247
- 50,
1248
- 36
1249
- ],
1250
- "totals": [
1251
- 228,
1252
- 222,
1253
- 216,
1254
- 210
1255
- ],
1256
- "precisions": [
1257
- 0.6228070175438597,
1258
- 0.37387387387387383,
1259
- 0.23148148148148148,
1260
- 0.17142857142857143
1261
- ],
1262
- "bp": 1.0,
1263
- "sys_len": 228,
1264
- "ref_len": 208,
1265
- "sacrebleu": 0.3100412781680407,
1266
- "score": 0.3100412781680407,
1267
- "score_name": "sacrebleu",
1268
- "score_ci_low": 0.22358091489071585,
1269
- "score_ci_high": 0.4112037006871551,
1270
- "sacrebleu_ci_low": 0.22358091489071585,
1271
- "sacrebleu_ci_high": 0.4112037006871551
1272
- },
1273
- "score": 0.34853282350682246,
1274
- "score_name": "subsets_mean",
1275
- "num_of_instances": 90
1276
- },
1277
- "score": 0.54262797605742,
1278
- "score_name": "subsets_mean",
1279
- "num_of_instances": 1537
1280
- }
1281
- }
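
For anyone inspecting these deleted results: each leaf task carries its own "score", each subset block (legal, translation, and so on) aggregates its tasks under score_name "subsets_mean", and the final object reports an overall score of 0.54262797605742 under the same name, which suggests a plain unweighted mean over all subset scores. A minimal sketch of that re-aggregation follows; it assumes the deleted file has been restored locally as evaluation_results.json and that the subset block sits under a "results" key (the key name and file name are illustrative assumptions, not confirmed by this diff).

import json

# Hypothetical re-aggregation of the deleted results file. The local file
# name and the "results" key are assumptions made for illustration only.
with open("evaluation_results.json") as f:
    results = json.load(f)["results"]

# Subset entries are nested objects; the sibling scalar keys ("score",
# "score_name", "num_of_instances") hold the stored aggregate, so keep
# only the dict-valued entries when collecting per-subset scores.
subset_scores = {name: sub["score"] for name, sub in results.items()
                 if isinstance(sub, dict)}
recomputed = sum(subset_scores.values()) / len(subset_scores)

# If "subsets_mean" is the unweighted mean its name suggests, this should
# reproduce the stored top-level score (~0.5426) up to float rounding.
print(recomputed)

The same pattern recurs one level down: applying the identical mean over, say, the fifteen mt_flores_101_* task scores inside "translation" should reproduce that subset's stored 0.34853282350682246.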