jbnayahu committed on
Commit 84652c6 (unverified) · 1 parent: f4fad76

More GPT results


Signed-off-by: Jonathan Bnayahu <[email protected]>
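As a reading aid for the result files added below: each JSON carries an "environment_info" block (invocation, parsed arguments, installed packages) and a "results" block whose subsets (bias, knowledge, legal, translation, and so on) each report a score, a score_name, and num_of_instances, alongside an overall subsets_mean score. The following is a minimal Python sketch, not part of unitxt itself, for loading one of these files and printing the per-subset scores; the path and key names are taken from the diff below.

import json

# Path taken from this commit; adjust to whichever results file you want to inspect.
path = "results/bluebench/2025-08-03T09-40-01_evaluation_results.json"
with open(path) as f:
    report = json.load(f)

model = report["environment_info"]["parsed_arguments"]["model"]
results = report["results"]
print(f"model: {model}")
print(f"overall {results['score_name']}: {results['score']:.4f}")

# Subset entries are dicts; the aggregate score / score_name / num_of_instances
# keys sit alongside them in "results" and are skipped here.
for name, entry in results.items():
    if isinstance(entry, dict):
        print(f"  {name}: {entry['score']:.4f} ({entry['num_of_instances']} instances)")

The same sketch applies to the o3-mini run added in the second file, since both share this layout.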

results/bluebench/2025-08-03T09-40-01_evaluation_results.json ADDED
@@ -0,0 +1,1282 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-08-03T13:39:57.204417Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=azure/Azure/gpt-4o-ncf,max_tokens=1024",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "azure/Azure/gpt-4o-ncf",
30
+ "model_args": {
31
+ "max_tokens": 1024
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.26.5",
45
+ "unitxt_commit_hash": "N/A",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "tiktoken": "0.9.0",
55
+ "charset-normalizer": "3.4.2",
56
+ "nvidia-cuda-runtime-cu12": "12.6.77",
57
+ "pyarrow": "21.0.0",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "httpcore": "1.0.9",
61
+ "pip": "25.2",
62
+ "certifi": "2025.7.14",
63
+ "evaluate": "0.4.5",
64
+ "Jinja2": "3.1.6",
65
+ "jsonschema-specifications": "2025.4.1",
66
+ "pydantic_core": "2.33.2",
67
+ "nvidia-cusparse-cu12": "12.5.4.2",
68
+ "aiosignal": "1.4.0",
69
+ "yarl": "1.20.1",
70
+ "unitxt": "1.26.5",
71
+ "jsonschema": "4.25.0",
72
+ "portalocker": "3.2.0",
73
+ "multiprocess": "0.70.16",
74
+ "nvidia-nvjitlink-cu12": "12.6.85",
75
+ "nvidia-cublas-cu12": "12.6.4.1",
76
+ "pydantic": "2.11.7",
77
+ "async-timeout": "5.0.1",
78
+ "annotated-types": "0.7.0",
79
+ "rouge_score": "0.1.2",
80
+ "contourpy": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "matplotlib": "3.10.5",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "h11": "0.16.0",
87
+ "zipp": "3.19.2",
88
+ "tzdata": "2025.2",
89
+ "bert-score": "0.3.13",
90
+ "setuptools": "80.9.0",
91
+ "referencing": "0.36.2",
92
+ "sacrebleu": "2.5.1",
93
+ "filelock": "3.18.0",
94
+ "urllib3": "2.5.0",
95
+ "scipy": "1.15.3",
96
+ "nvidia-nccl-cu12": "2.26.2",
97
+ "kiwisolver": "1.4.8",
98
+ "networkx": "3.4.2",
99
+ "typing-inspection": "0.4.1",
100
+ "sniffio": "1.3.1",
101
+ "rpds-py": "0.26.0",
102
+ "nvidia-curand-cu12": "10.3.7.77",
103
+ "litellm": "1.74.12",
104
+ "pillow": "11.3.0",
105
+ "datasets": "3.6.0",
106
+ "nvidia-cusolver-cu12": "11.7.1.2",
107
+ "cycler": "0.12.1",
108
+ "tokenizers": "0.21.4",
109
+ "distro": "1.9.0",
110
+ "idna": "3.10",
111
+ "MarkupSafe": "3.0.2",
112
+ "frozenlist": "1.7.0",
113
+ "pyparsing": "3.2.3",
114
+ "regex": "2025.7.34",
115
+ "jiter": "0.10.0",
116
+ "importlib_metadata": "8.0.0",
117
+ "packaging": "24.2",
118
+ "psutil": "7.0.0",
119
+ "mecab-ko-dic": "1.0.0",
120
+ "joblib": "1.5.1",
121
+ "transformers": "4.54.1",
122
+ "fsspec": "2025.3.0",
123
+ "scikit-learn": "1.7.1",
124
+ "dill": "0.3.8",
125
+ "wheel": "0.45.1",
126
+ "nvidia-nvtx-cu12": "12.6.77",
127
+ "nvidia-cusparselt-cu12": "0.6.3",
128
+ "lxml": "6.0.0",
129
+ "propcache": "0.3.2",
130
+ "numpy": "2.2.6",
131
+ "mpmath": "1.3.0",
132
+ "conllu": "6.0.0",
133
+ "safetensors": "0.5.3",
134
+ "requests": "2.32.4",
135
+ "fonttools": "4.59.0",
136
+ "tabulate": "0.9.0",
137
+ "typing_extensions": "4.12.2",
138
+ "absl-py": "2.3.1",
139
+ "accelerate": "1.9.0",
140
+ "nvidia-cufft-cu12": "11.3.0.4",
141
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
142
+ "click": "8.2.1",
143
+ "attrs": "25.3.0",
144
+ "exceptiongroup": "1.3.0",
145
+ "tenacity": "9.1.2",
146
+ "huggingface-hub": "0.34.3",
147
+ "pytz": "2025.2",
148
+ "aiohappyeyeballs": "2.6.1",
149
+ "python-dateutil": "2.9.0.post0",
150
+ "torch": "2.7.1",
151
+ "python-dotenv": "1.1.1",
152
+ "multidict": "6.6.3",
153
+ "httpx": "0.28.1",
154
+ "aiohttp": "3.12.15",
155
+ "xxhash": "3.5.0",
156
+ "PyYAML": "6.0.2",
157
+ "colorama": "0.4.6",
158
+ "openai": "1.98.0",
159
+ "threadpoolctl": "3.6.0",
160
+ "nvidia-cudnn-cu12": "9.5.1.17",
161
+ "pandas": "2.3.1",
162
+ "hf-xet": "1.1.5",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 0.8888888888888888,
180
+ "accuracy_ci_low": 0.4444444444444444,
181
+ "accuracy_ci_high": 1.0,
182
+ "score_name": "accuracy",
183
+ "score": 0.8888888888888888,
184
+ "score_ci_high": 1.0,
185
+ "score_ci_low": 0.4444444444444444,
186
+ "num_of_instances": 9
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 1.0,
190
+ "accuracy_ci_low": 1.0,
191
+ "accuracy_ci_high": 1.0,
192
+ "score_name": "accuracy",
193
+ "score": 1.0,
194
+ "score_ci_high": 1.0,
195
+ "score_ci_low": 1.0,
196
+ "num_of_instances": 9
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 1.0,
200
+ "accuracy_ci_low": 1.0,
201
+ "accuracy_ci_high": 1.0,
202
+ "score_name": "accuracy",
203
+ "score": 1.0,
204
+ "score_ci_high": 1.0,
205
+ "score_ci_low": 1.0,
206
+ "num_of_instances": 9
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 1.0,
210
+ "accuracy_ci_low": 1.0,
211
+ "accuracy_ci_high": 1.0,
212
+ "score_name": "accuracy",
213
+ "score": 1.0,
214
+ "score_ci_high": 1.0,
215
+ "score_ci_low": 1.0,
216
+ "num_of_instances": 9
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 1.0,
220
+ "accuracy_ci_low": 1.0,
221
+ "accuracy_ci_high": 1.0,
222
+ "score_name": "accuracy",
223
+ "score": 1.0,
224
+ "score_ci_high": 1.0,
225
+ "score_ci_low": 1.0,
226
+ "num_of_instances": 9
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 1.0,
230
+ "accuracy_ci_low": 1.0,
231
+ "accuracy_ci_high": 1.0,
232
+ "score_name": "accuracy",
233
+ "score": 1.0,
234
+ "score_ci_high": 1.0,
235
+ "score_ci_low": 1.0,
236
+ "num_of_instances": 9
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 1.0,
240
+ "accuracy_ci_low": 1.0,
241
+ "accuracy_ci_high": 1.0,
242
+ "score_name": "accuracy",
243
+ "score": 1.0,
244
+ "score_ci_high": 1.0,
245
+ "score_ci_low": 1.0,
246
+ "num_of_instances": 9
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 1.0,
250
+ "accuracy_ci_low": 1.0,
251
+ "accuracy_ci_high": 1.0,
252
+ "score_name": "accuracy",
253
+ "score": 1.0,
254
+ "score_ci_high": 1.0,
255
+ "score_ci_low": 1.0,
256
+ "num_of_instances": 9
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 1.0,
260
+ "accuracy_ci_low": 1.0,
261
+ "accuracy_ci_high": 1.0,
262
+ "score_name": "accuracy",
263
+ "score": 1.0,
264
+ "score_ci_high": 1.0,
265
+ "score_ci_low": 1.0,
266
+ "num_of_instances": 9
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.5555555555555556,
270
+ "accuracy_ci_low": 0.2222222222222222,
271
+ "accuracy_ci_high": 0.8888888888888888,
272
+ "score_name": "accuracy",
273
+ "score": 0.5555555555555556,
274
+ "score_ci_high": 0.8888888888888888,
275
+ "score_ci_low": 0.2222222222222222,
276
+ "num_of_instances": 9
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.8888888888888888,
280
+ "accuracy_ci_low": 0.5555555555555556,
281
+ "accuracy_ci_high": 1.0,
282
+ "score_name": "accuracy",
283
+ "score": 0.8888888888888888,
284
+ "score_ci_high": 1.0,
285
+ "score_ci_low": 0.5555555555555556,
286
+ "num_of_instances": 9
287
+ },
288
+ "score": 0.9393939393939393,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.9414414414414415,
296
+ "score": 0.9414414414414415,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.9414414414414415,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.7999999999999999,
307
+ "f1_Organization": 0.7096774193548386,
308
+ "f1_Location": 0.7058823529411765,
309
+ "f1_macro": 0.7385199240986716,
310
+ "recall_macro": 0.80175983436853,
311
+ "precision_macro": 0.6848220769789397,
312
+ "in_classes_support": 1.0,
313
+ "f1_micro": 0.7361963190184048,
314
+ "recall_micro": 0.8,
315
+ "precision_micro": 0.6818181818181818,
316
+ "score": 0.7361963190184048,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.6504174815433912,
319
+ "score_ci_high": 0.782581269719831,
320
+ "f1_micro_ci_low": 0.6504174815433912,
321
+ "f1_micro_ci_high": 0.782581269719831
322
+ },
323
+ "score": 0.7361963190184048,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.7142857142857143,
330
+ "accuracy_ci_low": 0.2857142857142857,
331
+ "accuracy_ci_high": 1.0,
332
+ "score_name": "accuracy",
333
+ "score": 0.7142857142857143,
334
+ "score_ci_high": 1.0,
335
+ "score_ci_low": 0.2857142857142857,
336
+ "num_of_instances": 7
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.14285714285714285,
340
+ "accuracy_ci_low": 0.0,
341
+ "accuracy_ci_high": 0.5714285714285714,
342
+ "score_name": "accuracy",
343
+ "score": 0.14285714285714285,
344
+ "score_ci_high": 0.5714285714285714,
345
+ "score_ci_low": 0.0,
346
+ "num_of_instances": 7
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.0,
350
+ "accuracy_ci_low": 0.0,
351
+ "accuracy_ci_high": 0.0,
352
+ "score_name": "accuracy",
353
+ "score": 0.0,
354
+ "score_ci_high": 0.0,
355
+ "score_ci_low": 0.0,
356
+ "num_of_instances": 7
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.7142857142857143,
360
+ "accuracy_ci_low": 0.2857142857142857,
361
+ "accuracy_ci_high": 1.0,
362
+ "score_name": "accuracy",
363
+ "score": 0.7142857142857143,
364
+ "score_ci_high": 1.0,
365
+ "score_ci_low": 0.2857142857142857,
366
+ "num_of_instances": 7
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.7142857142857143,
370
+ "accuracy_ci_low": 0.2857142857142857,
371
+ "accuracy_ci_high": 1.0,
372
+ "score_name": "accuracy",
373
+ "score": 0.7142857142857143,
374
+ "score_ci_high": 1.0,
375
+ "score_ci_low": 0.2857142857142857,
376
+ "num_of_instances": 7
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.14285714285714285,
380
+ "accuracy_ci_low": 0.0,
381
+ "accuracy_ci_high": 0.5714285714285714,
382
+ "score_name": "accuracy",
383
+ "score": 0.14285714285714285,
384
+ "score_ci_high": 0.5714285714285714,
385
+ "score_ci_low": 0.0,
386
+ "num_of_instances": 7
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.5714285714285714,
390
+ "accuracy_ci_low": 0.14285714285714285,
391
+ "accuracy_ci_high": 0.8571428571428571,
392
+ "score_name": "accuracy",
393
+ "score": 0.5714285714285714,
394
+ "score_ci_high": 0.8571428571428571,
395
+ "score_ci_low": 0.14285714285714285,
396
+ "num_of_instances": 7
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.42857142857142855,
400
+ "accuracy_ci_low": 0.14285714285714285,
401
+ "accuracy_ci_high": 0.8571428571428571,
402
+ "score_name": "accuracy",
403
+ "score": 0.42857142857142855,
404
+ "score_ci_high": 0.8571428571428571,
405
+ "score_ci_low": 0.14285714285714285,
406
+ "num_of_instances": 7
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.8571428571428571,
410
+ "accuracy_ci_low": 0.42857142857142855,
411
+ "accuracy_ci_high": 1.0,
412
+ "score_name": "accuracy",
413
+ "score": 0.8571428571428571,
414
+ "score_ci_high": 1.0,
415
+ "score_ci_low": 0.42857142857142855,
416
+ "num_of_instances": 7
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.2857142857142857,
420
+ "accuracy_ci_low": 0.0,
421
+ "accuracy_ci_high": 0.7142857142857143,
422
+ "score_name": "accuracy",
423
+ "score": 0.2857142857142857,
424
+ "score_ci_high": 0.7142857142857143,
425
+ "score_ci_low": 0.0,
426
+ "num_of_instances": 7
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.5714285714285714,
430
+ "accuracy_ci_low": 0.14285714285714285,
431
+ "accuracy_ci_high": 0.8571428571428571,
432
+ "score_name": "accuracy",
433
+ "score": 0.5714285714285714,
434
+ "score_ci_high": 0.8571428571428571,
435
+ "score_ci_low": 0.14285714285714285,
436
+ "num_of_instances": 7
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.5714285714285714,
440
+ "accuracy_ci_low": 0.14285714285714285,
441
+ "accuracy_ci_high": 0.8571428571428571,
442
+ "score_name": "accuracy",
443
+ "score": 0.5714285714285714,
444
+ "score_ci_high": 0.8571428571428571,
445
+ "score_ci_low": 0.14285714285714285,
446
+ "num_of_instances": 7
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.0,
450
+ "accuracy_ci_low": 0.0,
451
+ "accuracy_ci_high": 0.0,
452
+ "score_name": "accuracy",
453
+ "score": 0.0,
454
+ "score_ci_high": 0.0,
455
+ "score_ci_low": 0.0,
456
+ "num_of_instances": 7
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.7142857142857143,
460
+ "accuracy_ci_low": 0.2857142857142857,
461
+ "accuracy_ci_high": 1.0,
462
+ "score_name": "accuracy",
463
+ "score": 0.7142857142857143,
464
+ "score_ci_high": 1.0,
465
+ "score_ci_low": 0.2857142857142857,
466
+ "num_of_instances": 7
467
+ },
468
+ "score": 0.45918367346938777,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.5199999999999999,
475
+ "f1_suggestive": 0.2,
476
+ "f1_generic": 0.8,
477
+ "f1_fanciful": 0.4,
478
+ "f1_descriptive": 0.4,
479
+ "f1_arbitrary": 0.8,
480
+ "f1_macro_ci_low": 0.3363636363636363,
481
+ "f1_macro_ci_high": 0.766060606060606,
482
+ "score_name": "f1_micro",
483
+ "score": 0.4666666666666667,
484
+ "score_ci_high": 0.6857142857142857,
485
+ "score_ci_low": 0.23076923076923078,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.35,
488
+ "accuracy_ci_low": 0.15,
489
+ "accuracy_ci_high": 0.6,
490
+ "f1_micro": 0.4666666666666667,
491
+ "f1_micro_ci_low": 0.23076923076923078,
492
+ "f1_micro_ci_high": 0.6857142857142857
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.5793103448275863,
496
+ "f1_no": 0.7586206896551724,
497
+ "f1_yes": 0.4,
498
+ "f1_macro_ci_low": 0.375,
499
+ "f1_macro_ci_high": 0.868365507202327,
500
+ "score_name": "f1_micro",
501
+ "score": 0.6666666666666666,
502
+ "score_ci_high": 0.8717948717948718,
503
+ "score_ci_low": 0.45355819395422325,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.65,
506
+ "accuracy_ci_low": 0.45,
507
+ "accuracy_ci_high": 0.85,
508
+ "f1_micro": 0.6666666666666666,
509
+ "f1_micro_ci_low": 0.45355819395422325,
510
+ "f1_micro_ci_high": 0.8717948717948718
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.26643990929705214,
514
+ "f1_conclusion": 0.2222222222222222,
515
+ "f1_decree": 0.0,
516
+ "f1_issue": 0.2857142857142857,
517
+ "f1_analysis": 0.5,
518
+ "f1_facts": 0.8571428571428571,
519
+ "f1_procedural history": 0.0,
520
+ "f1_rule": 0.0,
521
+ "f1_macro_ci_low": 0.1142857142857143,
522
+ "f1_macro_ci_high": 0.45052752383482225,
523
+ "score_name": "f1_micro",
524
+ "score": 0.35294117647058826,
525
+ "score_ci_high": 0.5714285714285714,
526
+ "score_ci_low": 0.12903225806451613,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.3,
529
+ "accuracy_ci_low": 0.1,
530
+ "accuracy_ci_high": 0.5,
531
+ "f1_micro": 0.35294117647058826,
532
+ "f1_micro_ci_low": 0.12903225806451613,
533
+ "f1_micro_ci_high": 0.5714285714285714
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.696969696969697,
537
+ "f1_yes": 0.7272727272727273,
538
+ "f1_no": 0.6666666666666666,
539
+ "f1_macro_ci_low": 0.4949494949494949,
540
+ "f1_macro_ci_high": 0.898989898989899,
541
+ "score_name": "f1_micro",
542
+ "score": 0.7,
543
+ "score_ci_high": 0.9,
544
+ "score_ci_low": 0.5,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.7,
547
+ "accuracy_ci_low": 0.5,
548
+ "accuracy_ci_high": 0.9,
549
+ "f1_micro": 0.7,
550
+ "f1_micro_ci_low": 0.5,
551
+ "f1_micro_ci_high": 0.9
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.8585526315789473,
555
+ "f1_yes": 0.875,
556
+ "f1_no": 0.8421052631578947,
557
+ "f1_macro_ci_low": 0.6847205623637095,
558
+ "f1_macro_ci_high": 0.9545454545454546,
559
+ "score_name": "f1_micro",
560
+ "score": 0.8571428571428571,
561
+ "score_ci_high": 0.9473684210526315,
562
+ "score_ci_low": 0.6706944990883059,
563
+ "num_of_instances": 20,
564
+ "accuracy": 0.75,
565
+ "accuracy_ci_low": 0.50468235519016,
566
+ "accuracy_ci_high": 0.9,
567
+ "f1_micro": 0.8571428571428571,
568
+ "f1_micro_ci_low": 0.6706944990883059,
569
+ "f1_micro_ci_high": 0.9473684210526315
570
+ },
571
+ "score": 0.6086834733893557,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.6508639236580414,
578
+ "f1_cars": 0.8333333333333334,
579
+ "f1_windows x": 0.3333333333333333,
580
+ "f1_computer graphics": 0.5555555555555556,
581
+ "f1_atheism": 0.2857142857142857,
582
+ "f1_religion": 0.0,
583
+ "f1_medicine": 0.8571428571428571,
584
+ "f1_christianity": 0.8571428571428571,
585
+ "f1_microsoft windows": 0.8333333333333334,
586
+ "f1_middle east": 0.8333333333333334,
587
+ "f1_motorcycles": 0.7272727272727273,
588
+ "f1_pc hardware": 0.7692307692307693,
589
+ "f1_mac hardware": 1.0,
590
+ "f1_electronics": 0.3333333333333333,
591
+ "f1_for sale": 0.75,
592
+ "f1_guns": 0.6666666666666666,
593
+ "f1_space": 0.5714285714285714,
594
+ "f1_cryptography": 0.3333333333333333,
595
+ "f1_baseball": 1.0,
596
+ "f1_politics": 0.5882352941176471,
597
+ "f1_hockey": 0.8888888888888888,
598
+ "f1_macro_ci_low": 0.5563237523685153,
599
+ "f1_macro_ci_high": 0.7401830936884861,
600
+ "score_name": "f1_micro",
601
+ "score": 0.6907216494845361,
602
+ "score_ci_high": 0.7645361476644139,
603
+ "score_ci_low": 0.5804059043570197,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.67,
606
+ "accuracy_ci_low": 0.56,
607
+ "accuracy_ci_high": 0.75,
608
+ "f1_micro": 0.6907216494845361,
609
+ "f1_micro_ci_low": 0.5804059043570197,
610
+ "f1_micro_ci_high": 0.7645361476644139
611
+ },
612
+ "score": 0.6907216494845361,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.7410023219814241,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9411764705882353,
620
+ "f1_debt collection": 0.7368421052631579,
621
+ "f1_payday loan or title loan or personal loan": 0.0,
622
+ "f1_student loan": 0.8333333333333334,
623
+ "f1_credit card or prepaid card": 0.75,
624
+ "f1_checking or savings account": 1.0,
625
+ "f1_mortgage": 0.6666666666666666,
626
+ "f1_money transfer or virtual currency or money service": 1.0,
627
+ "f1_macro_ci_low": 0.548026288422304,
628
+ "f1_macro_ci_high": 0.8446388968731338,
629
+ "score_name": "f1_micro",
630
+ "score": 0.898989898989899,
631
+ "score_ci_high": 0.9447236180904522,
632
+ "score_ci_low": 0.8203180754561684,
633
+ "num_of_instances": 100,
634
+ "accuracy": 0.89,
635
+ "accuracy_ci_low": 0.81,
636
+ "accuracy_ci_high": 0.94,
637
+ "f1_micro": 0.898989898989899,
638
+ "f1_micro_ci_low": 0.8203180754561684,
639
+ "f1_micro_ci_high": 0.9447236180904522
640
+ },
641
+ "cfpb_product_watsonx": {
642
+ "f1_macro": 0.8260549519130755,
643
+ "f1_mortgages and loans": 0.8695652173913043,
644
+ "f1_credit card": 0.8181818181818182,
645
+ "f1_debt collection": 0.7368421052631579,
646
+ "f1_credit reporting": 0.782608695652174,
647
+ "f1_retail banking": 0.9230769230769231,
648
+ "f1_macro_ci_low": 0.7025385602172309,
649
+ "f1_macro_ci_high": 0.9116966675085018,
650
+ "score_name": "f1_micro",
651
+ "score": 0.82,
652
+ "score_ci_high": 0.9,
653
+ "score_ci_low": 0.7,
654
+ "num_of_instances": 50,
655
+ "accuracy": 0.82,
656
+ "accuracy_ci_low": 0.7,
657
+ "accuracy_ci_high": 0.9,
658
+ "f1_micro": 0.82,
659
+ "f1_micro_ci_low": 0.7,
660
+ "f1_micro_ci_high": 0.9
661
+ },
662
+ "score": 0.8594949494949495,
663
+ "score_name": "subsets_mean",
664
+ "num_of_instances": 150
665
+ },
666
+ "qa_finance": {
667
+ "fin_qa": {
668
+ "num_of_instances": 100,
669
+ "execution_accuracy": 0.36,
670
+ "program_accuracy": 0.37,
671
+ "score": 0.37,
672
+ "score_name": "program_accuracy",
673
+ "execution_accuracy_ci_low": 0.27,
674
+ "execution_accuracy_ci_high": 0.47,
675
+ "program_accuracy_ci_low": 0.2811354521803329,
676
+ "program_accuracy_ci_high": 0.47,
677
+ "score_ci_low": 0.2811354521803329,
678
+ "score_ci_high": 0.47
679
+ },
680
+ "score": 0.37,
681
+ "score_name": "subsets_mean",
682
+ "num_of_instances": 100
683
+ },
684
+ "rag_general": {
685
+ "rag_response_generation_clapnq": {
686
+ "precision": 0.4468480048160072,
687
+ "recall": 0.6454220314570971,
688
+ "f1": 0.4890230179217559,
689
+ "precision_ci_low": 0.4151579792947102,
690
+ "precision_ci_high": 0.4845603023826521,
691
+ "recall_ci_low": 0.6061330597384731,
692
+ "recall_ci_high": 0.6765859073642304,
693
+ "f1_ci_low": 0.46005008735451697,
694
+ "f1_ci_high": 0.5174377148355835,
695
+ "score_name": "f1",
696
+ "score": 0.4890230179217559,
697
+ "score_ci_high": 0.5174377148355835,
698
+ "score_ci_low": 0.46005008735451697,
699
+ "num_of_instances": 100,
700
+ "correctness_f1_bert_score.deberta_large_mnli": 0.691885244846344,
701
+ "correctness_recall_bert_score.deberta_large_mnli": 0.7364352762699127,
702
+ "correctness_precision_bert_score.deberta_large_mnli": 0.6613857835531235,
703
+ "faithfullness_f1_token_overlap": 0.43039423317750836,
704
+ "faithfullness_recall_token_overlap": 0.35370116631248527,
705
+ "faithfullness_precision_token_overlap": 0.6790419566943843,
706
+ "correctness_f1_token_overlap": 0.4890230179217559,
707
+ "correctness_recall_token_overlap": 0.6454220314570971,
708
+ "correctness_precision_token_overlap": 0.4468480048160072
709
+ },
710
+ "score": 0.4890230179217559,
711
+ "score_name": "subsets_mean",
712
+ "num_of_instances": 100
713
+ },
714
+ "reasoning": {
715
+ "hellaswag": {
716
+ "accuracy": 0.62,
717
+ "accuracy_ci_low": 0.51,
718
+ "accuracy_ci_high": 0.7,
719
+ "score_name": "accuracy",
720
+ "score": 0.62,
721
+ "score_ci_high": 0.7,
722
+ "score_ci_low": 0.51,
723
+ "num_of_instances": 100
724
+ },
725
+ "openbook_qa": {
726
+ "accuracy": 0.94,
727
+ "accuracy_ci_low": 0.89,
728
+ "accuracy_ci_high": 0.98,
729
+ "score_name": "accuracy",
730
+ "score": 0.94,
731
+ "score_ci_high": 0.98,
732
+ "score_ci_low": 0.89,
733
+ "num_of_instances": 100
734
+ },
735
+ "score": 0.78,
736
+ "score_name": "subsets_mean",
737
+ "num_of_instances": 200
738
+ },
739
+ "safety": {
740
+ "attaq_500": {
741
+ "severity_critical": 0.0,
742
+ "severity_high": 0.0,
743
+ "severity_medium": 0.0,
744
+ "severity_low": 100.0,
745
+ "category_harmful_info": 0.8862442294401782,
746
+ "category_deception": 0.9089467666215367,
747
+ "category_explicit_content": 0.8660169005243467,
748
+ "category_violence": 0.8813284475474604,
749
+ "category_discrimination": 0.7921433370737802,
750
+ "category_substance_abuse": 0.892341727639238,
751
+ "category_pii": 0.8837605582343208,
752
+ "safety": 0.8784392809950642,
753
+ "safety_ci_low": 0.8622394069744699,
754
+ "safety_ci_high": 0.8922010585173956,
755
+ "score_name": "safety",
756
+ "score": 0.8784392809950642,
757
+ "score_ci_high": 0.8922010585173956,
758
+ "score_ci_low": 0.8622394069744699,
759
+ "num_of_instances": 100
760
+ },
761
+ "score": 0.8784392809950642,
762
+ "score_name": "subsets_mean",
763
+ "num_of_instances": 100
764
+ },
765
+ "summarization": {
766
+ "billsum_document_filtered_to_6000_chars": {
767
+ "num_of_instances": 100,
768
+ "rougeLsum": 0.3334800194661833,
769
+ "rouge2": 0.1746055052600959,
770
+ "rouge1": 0.3914063192107438,
771
+ "rougeL": 0.2607874086538398,
772
+ "score": 0.2607874086538398,
773
+ "score_name": "rougeL",
774
+ "rougeLsum_ci_low": 0.31372911043552615,
775
+ "rougeLsum_ci_high": 0.3521588230191023,
776
+ "rouge2_ci_low": 0.16154343447204916,
777
+ "rouge2_ci_high": 0.18686015868835737,
778
+ "rouge1_ci_low": 0.36979472683613324,
779
+ "rouge1_ci_high": 0.41112417139895086,
780
+ "rougeL_ci_low": 0.24640816277583857,
781
+ "rougeL_ci_high": 0.2769615579591105,
782
+ "score_ci_low": 0.24640816277583857,
783
+ "score_ci_high": 0.2769615579591105
784
+ },
785
+ "tldr_document_filtered_to_6000_chars": {
786
+ "num_of_instances": 100,
787
+ "rougeLsum": 0.08980268920854961,
788
+ "rouge2": 0.012765072464340065,
789
+ "rouge1": 0.10813972613064696,
790
+ "rougeL": 0.08042515106945874,
791
+ "score": 0.08042515106945874,
792
+ "score_name": "rougeL",
793
+ "rougeLsum_ci_low": 0.07759889049057624,
794
+ "rougeLsum_ci_high": 0.1032183831189881,
795
+ "rouge2_ci_low": 0.008976164023617238,
796
+ "rouge2_ci_high": 0.018103937102878734,
797
+ "rouge1_ci_low": 0.09248155246218394,
798
+ "rouge1_ci_high": 0.1250184504642972,
799
+ "rougeL_ci_low": 0.06959906992080947,
800
+ "rougeL_ci_high": 0.09223760457164616,
801
+ "score_ci_low": 0.06959906992080947,
802
+ "score_ci_high": 0.09223760457164616
803
+ },
804
+ "score": 0.17060627986164928,
805
+ "score_name": "subsets_mean",
806
+ "num_of_instances": 200
807
+ },
808
+ "translation": {
809
+ "mt_flores_101_ara_eng": {
810
+ "num_of_instances": 6,
811
+ "counts": [
812
+ 156,
813
+ 105,
814
+ 71,
815
+ 50
816
+ ],
817
+ "totals": [
818
+ 226,
819
+ 220,
820
+ 214,
821
+ 208
822
+ ],
823
+ "precisions": [
824
+ 0.6902654867256638,
825
+ 0.4772727272727273,
826
+ 0.3317757009345794,
827
+ 0.2403846153846154
828
+ ],
829
+ "bp": 1.0,
830
+ "sys_len": 226,
831
+ "ref_len": 208,
832
+ "sacrebleu": 0.4026090246453075,
833
+ "score": 0.4026090246453075,
834
+ "score_name": "sacrebleu",
835
+ "score_ci_low": 0.24025628806486451,
836
+ "score_ci_high": 0.5204212192812463,
837
+ "sacrebleu_ci_low": 0.24025628806486451,
838
+ "sacrebleu_ci_high": 0.5204212192812463
839
+ },
840
+ "mt_flores_101_deu_eng": {
841
+ "num_of_instances": 6,
842
+ "counts": [
843
+ 143,
844
+ 93,
845
+ 63,
846
+ 45
847
+ ],
848
+ "totals": [
849
+ 219,
850
+ 213,
851
+ 207,
852
+ 201
853
+ ],
854
+ "precisions": [
855
+ 0.6529680365296804,
856
+ 0.43661971830985913,
857
+ 0.30434782608695654,
858
+ 0.22388059701492538
859
+ ],
860
+ "bp": 1.0,
861
+ "sys_len": 219,
862
+ "ref_len": 208,
863
+ "sacrebleu": 0.3733322279107499,
864
+ "score": 0.3733322279107499,
865
+ "score_name": "sacrebleu",
866
+ "score_ci_low": 0.26773105555308707,
867
+ "score_ci_high": 0.5628718725153604,
868
+ "sacrebleu_ci_low": 0.26773105555308707,
869
+ "sacrebleu_ci_high": 0.5628718725153604
870
+ },
871
+ "mt_flores_101_eng_ara": {
872
+ "num_of_instances": 6,
873
+ "counts": [
874
+ 118,
875
+ 70,
876
+ 45,
877
+ 27
878
+ ],
879
+ "totals": [
880
+ 211,
881
+ 205,
882
+ 199,
883
+ 193
884
+ ],
885
+ "precisions": [
886
+ 0.5592417061611374,
887
+ 0.34146341463414637,
888
+ 0.22613065326633167,
889
+ 0.13989637305699482
890
+ ],
891
+ "bp": 1.0,
892
+ "sys_len": 211,
893
+ "ref_len": 209,
894
+ "sacrebleu": 0.27879013737554453,
895
+ "score": 0.27879013737554453,
896
+ "score_name": "sacrebleu",
897
+ "score_ci_low": 0.242285190601197,
898
+ "score_ci_high": 0.3370610364230129,
899
+ "sacrebleu_ci_low": 0.242285190601197,
900
+ "sacrebleu_ci_high": 0.3370610364230129
901
+ },
902
+ "mt_flores_101_eng_deu": {
903
+ "num_of_instances": 6,
904
+ "counts": [
905
+ 148,
906
+ 92,
907
+ 62,
908
+ 46
909
+ ],
910
+ "totals": [
911
+ 217,
912
+ 211,
913
+ 205,
914
+ 199
915
+ ],
916
+ "precisions": [
917
+ 0.6820276497695852,
918
+ 0.43601895734597157,
919
+ 0.3024390243902439,
920
+ 0.23115577889447236
921
+ ],
922
+ "bp": 1.0,
923
+ "sys_len": 217,
924
+ "ref_len": 216,
925
+ "sacrebleu": 0.3797191362653853,
926
+ "score": 0.3797191362653853,
927
+ "score_name": "sacrebleu",
928
+ "score_ci_low": 0.24991245830969483,
929
+ "score_ci_high": 0.537890541133251,
930
+ "sacrebleu_ci_low": 0.24991245830969483,
931
+ "sacrebleu_ci_high": 0.537890541133251
932
+ },
933
+ "mt_flores_101_eng_fra": {
934
+ "num_of_instances": 6,
935
+ "counts": [
936
+ 190,
937
+ 150,
938
+ 119,
939
+ 95
940
+ ],
941
+ "totals": [
942
+ 244,
943
+ 238,
944
+ 232,
945
+ 226
946
+ ],
947
+ "precisions": [
948
+ 0.7786885245901639,
949
+ 0.6302521008403361,
950
+ 0.5129310344827587,
951
+ 0.42035398230088494
952
+ ],
953
+ "bp": 1.0,
954
+ "sys_len": 244,
955
+ "ref_len": 235,
956
+ "sacrebleu": 0.5703455465960385,
957
+ "score": 0.5703455465960385,
958
+ "score_name": "sacrebleu",
959
+ "score_ci_low": 0.4931990672262396,
960
+ "score_ci_high": 0.6753511173840523,
961
+ "sacrebleu_ci_low": 0.4931990672262396,
962
+ "sacrebleu_ci_high": 0.6753511173840523
963
+ },
964
+ "mt_flores_101_eng_kor": {
965
+ "num_of_instances": 6,
966
+ "counts": [
967
+ 162,
968
+ 92,
969
+ 63,
970
+ 43
971
+ ],
972
+ "totals": [
973
+ 273,
974
+ 267,
975
+ 261,
976
+ 255
977
+ ],
978
+ "precisions": [
979
+ 0.5934065934065934,
980
+ 0.3445692883895131,
981
+ 0.24137931034482757,
982
+ 0.16862745098039217
983
+ ],
984
+ "bp": 1.0,
985
+ "sys_len": 273,
986
+ "ref_len": 249,
987
+ "sacrebleu": 0.3020398964371346,
988
+ "score": 0.3020398964371346,
989
+ "score_name": "sacrebleu",
990
+ "score_ci_low": 0.23699249066795575,
991
+ "score_ci_high": 0.36104670705975184,
992
+ "sacrebleu_ci_low": 0.23699249066795575,
993
+ "sacrebleu_ci_high": 0.36104670705975184
994
+ },
995
+ "mt_flores_101_eng_por": {
996
+ "num_of_instances": 6,
997
+ "counts": [
998
+ 181,
999
+ 136,
1000
+ 110,
1001
+ 89
1002
+ ],
1003
+ "totals": [
1004
+ 228,
1005
+ 222,
1006
+ 216,
1007
+ 210
1008
+ ],
1009
+ "precisions": [
1010
+ 0.7938596491228069,
1011
+ 0.6126126126126126,
1012
+ 0.5092592592592592,
1013
+ 0.4238095238095238
1014
+ ],
1015
+ "bp": 1.0,
1016
+ "sys_len": 228,
1017
+ "ref_len": 222,
1018
+ "sacrebleu": 0.5691933623646763,
1019
+ "score": 0.5691933623646763,
1020
+ "score_name": "sacrebleu",
1021
+ "score_ci_low": 0.5074329158262365,
1022
+ "score_ci_high": 0.6529930317459868,
1023
+ "sacrebleu_ci_low": 0.5074329158262365,
1024
+ "sacrebleu_ci_high": 0.6529930317459868
1025
+ },
1026
+ "mt_flores_101_eng_ron": {
1027
+ "num_of_instances": 6,
1028
+ "counts": [
1029
+ 165,
1030
+ 125,
1031
+ 97,
1032
+ 82
1033
+ ],
1034
+ "totals": [
1035
+ 230,
1036
+ 224,
1037
+ 218,
1038
+ 212
1039
+ ],
1040
+ "precisions": [
1041
+ 0.717391304347826,
1042
+ 0.5580357142857143,
1043
+ 0.444954128440367,
1044
+ 0.38679245283018865
1045
+ ],
1046
+ "bp": 1.0,
1047
+ "sys_len": 230,
1048
+ "ref_len": 230,
1049
+ "sacrebleu": 0.5123335940057543,
1050
+ "score": 0.5123335940057543,
1051
+ "score_name": "sacrebleu",
1052
+ "score_ci_low": 0.3840244184926614,
1053
+ "score_ci_high": 0.650779271851941,
1054
+ "sacrebleu_ci_low": 0.3840244184926614,
1055
+ "sacrebleu_ci_high": 0.650779271851941
1056
+ },
1057
+ "mt_flores_101_eng_spa": {
1058
+ "num_of_instances": 6,
1059
+ "counts": [
1060
+ 170,
1061
+ 109,
1062
+ 73,
1063
+ 47
1064
+ ],
1065
+ "totals": [
1066
+ 236,
1067
+ 230,
1068
+ 224,
1069
+ 218
1070
+ ],
1071
+ "precisions": [
1072
+ 0.7203389830508474,
1073
+ 0.47391304347826085,
1074
+ 0.32589285714285715,
1075
+ 0.21559633027522934
1076
+ ],
1077
+ "bp": 0.9707745538991623,
1078
+ "sys_len": 236,
1079
+ "ref_len": 243,
1080
+ "sacrebleu": 0.3820377957320921,
1081
+ "score": 0.3820377957320921,
1082
+ "score_name": "sacrebleu",
1083
+ "score_ci_low": 0.3085199315474183,
1084
+ "score_ci_high": 0.48773289703817563,
1085
+ "sacrebleu_ci_low": 0.3085199315474183,
1086
+ "sacrebleu_ci_high": 0.48773289703817563
1087
+ },
1088
+ "mt_flores_101_fra_eng": {
1089
+ "num_of_instances": 6,
1090
+ "counts": [
1091
+ 158,
1092
+ 111,
1093
+ 79,
1094
+ 59
1095
+ ],
1096
+ "totals": [
1097
+ 225,
1098
+ 219,
1099
+ 213,
1100
+ 207
1101
+ ],
1102
+ "precisions": [
1103
+ 0.7022222222222223,
1104
+ 0.5068493150684932,
1105
+ 0.37089201877934275,
1106
+ 0.28502415458937197
1107
+ ],
1108
+ "bp": 1.0,
1109
+ "sys_len": 225,
1110
+ "ref_len": 208,
1111
+ "sacrebleu": 0.44042366511701625,
1112
+ "score": 0.44042366511701625,
1113
+ "score_name": "sacrebleu",
1114
+ "score_ci_low": 0.341980662697259,
1115
+ "score_ci_high": 0.5496900259922087,
1116
+ "sacrebleu_ci_low": 0.341980662697259,
1117
+ "sacrebleu_ci_high": 0.5496900259922087
1118
+ },
1119
+ "mt_flores_101_jpn_eng": {
1120
+ "num_of_instances": 6,
1121
+ "counts": [
1122
+ 150,
1123
+ 90,
1124
+ 58,
1125
+ 39
1126
+ ],
1127
+ "totals": [
1128
+ 236,
1129
+ 230,
1130
+ 224,
1131
+ 218
1132
+ ],
1133
+ "precisions": [
1134
+ 0.635593220338983,
1135
+ 0.391304347826087,
1136
+ 0.2589285714285714,
1137
+ 0.17889908256880735
1138
+ ],
1139
+ "bp": 1.0,
1140
+ "sys_len": 236,
1141
+ "ref_len": 208,
1142
+ "sacrebleu": 0.32762007434781915,
1143
+ "score": 0.32762007434781915,
1144
+ "score_name": "sacrebleu",
1145
+ "score_ci_low": 0.20337120135607298,
1146
+ "score_ci_high": 0.4231259198402102,
1147
+ "sacrebleu_ci_low": 0.20337120135607298,
1148
+ "sacrebleu_ci_high": 0.4231259198402102
1149
+ },
1150
+ "mt_flores_101_kor_eng": {
1151
+ "num_of_instances": 6,
1152
+ "counts": [
1153
+ 135,
1154
+ 76,
1155
+ 45,
1156
+ 29
1157
+ ],
1158
+ "totals": [
1159
+ 226,
1160
+ 220,
1161
+ 214,
1162
+ 208
1163
+ ],
1164
+ "precisions": [
1165
+ 0.5973451327433628,
1166
+ 0.34545454545454546,
1167
+ 0.2102803738317757,
1168
+ 0.13942307692307693
1169
+ ],
1170
+ "bp": 1.0,
1171
+ "sys_len": 226,
1172
+ "ref_len": 208,
1173
+ "sacrebleu": 0.2788928697008729,
1174
+ "score": 0.2788928697008729,
1175
+ "score_name": "sacrebleu",
1176
+ "score_ci_low": 0.18358949678749556,
1177
+ "score_ci_high": 0.4387198667737409,
1178
+ "sacrebleu_ci_low": 0.18358949678749556,
1179
+ "sacrebleu_ci_high": 0.4387198667737409
1180
+ },
1181
+ "mt_flores_101_por_eng": {
1182
+ "num_of_instances": 6,
1183
+ "counts": [
1184
+ 168,
1185
+ 124,
1186
+ 93,
1187
+ 73
1188
+ ],
1189
+ "totals": [
1190
+ 221,
1191
+ 215,
1192
+ 209,
1193
+ 203
1194
+ ],
1195
+ "precisions": [
1196
+ 0.7601809954751131,
1197
+ 0.5767441860465117,
1198
+ 0.4449760765550239,
1199
+ 0.35960591133004927
1200
+ ],
1201
+ "bp": 1.0,
1202
+ "sys_len": 221,
1203
+ "ref_len": 208,
1204
+ "sacrebleu": 0.5146546836832124,
1205
+ "score": 0.5146546836832124,
1206
+ "score_name": "sacrebleu",
1207
+ "score_ci_low": 0.378423475348725,
1208
+ "score_ci_high": 0.6374865872050556,
1209
+ "sacrebleu_ci_low": 0.378423475348725,
1210
+ "sacrebleu_ci_high": 0.6374865872050556
1211
+ },
1212
+ "mt_flores_101_ron_eng": {
1213
+ "num_of_instances": 6,
1214
+ "counts": [
1215
+ 166,
1216
+ 121,
1217
+ 91,
1218
+ 72
1219
+ ],
1220
+ "totals": [
1221
+ 227,
1222
+ 221,
1223
+ 215,
1224
+ 209
1225
+ ],
1226
+ "precisions": [
1227
+ 0.7312775330396475,
1228
+ 0.5475113122171945,
1229
+ 0.4232558139534883,
1230
+ 0.34449760765550236
1231
+ ],
1232
+ "bp": 1.0,
1233
+ "sys_len": 227,
1234
+ "ref_len": 208,
1235
+ "sacrebleu": 0.49154820843517344,
1236
+ "score": 0.49154820843517344,
1237
+ "score_name": "sacrebleu",
1238
+ "score_ci_low": 0.4117068924616905,
1239
+ "score_ci_high": 0.6461238952977133,
1240
+ "sacrebleu_ci_low": 0.4117068924616905,
1241
+ "sacrebleu_ci_high": 0.6461238952977133
1242
+ },
1243
+ "mt_flores_101_spa_eng": {
1244
+ "num_of_instances": 6,
1245
+ "counts": [
1246
+ 150,
1247
+ 96,
1248
+ 65,
1249
+ 48
1250
+ ],
1251
+ "totals": [
1252
+ 219,
1253
+ 213,
1254
+ 207,
1255
+ 201
1256
+ ],
1257
+ "precisions": [
1258
+ 0.684931506849315,
1259
+ 0.45070422535211263,
1260
+ 0.3140096618357488,
1261
+ 0.23880597014925375
1262
+ ],
1263
+ "bp": 1.0,
1264
+ "sys_len": 219,
1265
+ "ref_len": 208,
1266
+ "sacrebleu": 0.3900602917326207,
1267
+ "score": 0.3900602917326207,
1268
+ "score_name": "sacrebleu",
1269
+ "score_ci_low": 0.3529954080391477,
1270
+ "score_ci_high": 0.44601468942834244,
1271
+ "sacrebleu_ci_low": 0.3529954080391477,
1272
+ "sacrebleu_ci_high": 0.44601468942834244
1273
+ },
1274
+ "score": 0.41424003428995987,
1275
+ "score_name": "subsets_mean",
1276
+ "num_of_instances": 90
1277
+ },
1278
+ "score": 0.6413403122123419,
1279
+ "score_name": "subsets_mean",
1280
+ "num_of_instances": 1537
1281
+ }
1282
+ }
results/bluebench/2025-08-03T12-21-28_evaluation_results.json ADDED
@@ -0,0 +1,1282 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-08-03T16:21:24.530955Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=azure/Azure/o3-mini-ncf,max_tokens=1024",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "azure/Azure/o3-mini-ncf",
30
+ "model_args": {
31
+ "max_tokens": 1024
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.26.5",
45
+ "unitxt_commit_hash": "N/A",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "tiktoken": "0.9.0",
55
+ "charset-normalizer": "3.4.2",
56
+ "nvidia-cuda-runtime-cu12": "12.6.77",
57
+ "pyarrow": "21.0.0",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "httpcore": "1.0.9",
61
+ "pip": "25.2",
62
+ "certifi": "2025.7.14",
63
+ "evaluate": "0.4.5",
64
+ "Jinja2": "3.1.6",
65
+ "jsonschema-specifications": "2025.4.1",
66
+ "pydantic_core": "2.33.2",
67
+ "nvidia-cusparse-cu12": "12.5.4.2",
68
+ "aiosignal": "1.4.0",
69
+ "yarl": "1.20.1",
70
+ "unitxt": "1.26.5",
71
+ "jsonschema": "4.25.0",
72
+ "portalocker": "3.2.0",
73
+ "multiprocess": "0.70.16",
74
+ "nvidia-nvjitlink-cu12": "12.6.85",
75
+ "nvidia-cublas-cu12": "12.6.4.1",
76
+ "pydantic": "2.11.7",
77
+ "async-timeout": "5.0.1",
78
+ "annotated-types": "0.7.0",
79
+ "rouge_score": "0.1.2",
80
+ "contourpy": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "matplotlib": "3.10.5",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "h11": "0.16.0",
87
+ "zipp": "3.19.2",
88
+ "tzdata": "2025.2",
89
+ "bert-score": "0.3.13",
90
+ "setuptools": "80.9.0",
91
+ "referencing": "0.36.2",
92
+ "sacrebleu": "2.5.1",
93
+ "filelock": "3.18.0",
94
+ "urllib3": "2.5.0",
95
+ "scipy": "1.15.3",
96
+ "nvidia-nccl-cu12": "2.26.2",
97
+ "kiwisolver": "1.4.8",
98
+ "networkx": "3.4.2",
99
+ "typing-inspection": "0.4.1",
100
+ "sniffio": "1.3.1",
101
+ "rpds-py": "0.26.0",
102
+ "nvidia-curand-cu12": "10.3.7.77",
103
+ "litellm": "1.74.12",
104
+ "pillow": "11.3.0",
105
+ "datasets": "3.6.0",
106
+ "nvidia-cusolver-cu12": "11.7.1.2",
107
+ "cycler": "0.12.1",
108
+ "tokenizers": "0.21.4",
109
+ "distro": "1.9.0",
110
+ "idna": "3.10",
111
+ "MarkupSafe": "3.0.2",
112
+ "frozenlist": "1.7.0",
113
+ "pyparsing": "3.2.3",
114
+ "regex": "2025.7.34",
115
+ "jiter": "0.10.0",
116
+ "importlib_metadata": "8.0.0",
117
+ "packaging": "24.2",
118
+ "psutil": "7.0.0",
119
+ "mecab-ko-dic": "1.0.0",
120
+ "joblib": "1.5.1",
121
+ "transformers": "4.54.1",
122
+ "fsspec": "2025.3.0",
123
+ "scikit-learn": "1.7.1",
124
+ "dill": "0.3.8",
125
+ "wheel": "0.45.1",
126
+ "nvidia-nvtx-cu12": "12.6.77",
127
+ "nvidia-cusparselt-cu12": "0.6.3",
128
+ "lxml": "6.0.0",
129
+ "propcache": "0.3.2",
130
+ "numpy": "2.2.6",
131
+ "mpmath": "1.3.0",
132
+ "conllu": "6.0.0",
133
+ "safetensors": "0.5.3",
134
+ "requests": "2.32.4",
135
+ "fonttools": "4.59.0",
136
+ "tabulate": "0.9.0",
137
+ "typing_extensions": "4.12.2",
138
+ "absl-py": "2.3.1",
139
+ "accelerate": "1.9.0",
140
+ "nvidia-cufft-cu12": "11.3.0.4",
141
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
142
+ "click": "8.2.1",
143
+ "attrs": "25.3.0",
144
+ "exceptiongroup": "1.3.0",
145
+ "tenacity": "9.1.2",
146
+ "huggingface-hub": "0.34.3",
147
+ "pytz": "2025.2",
148
+ "aiohappyeyeballs": "2.6.1",
149
+ "python-dateutil": "2.9.0.post0",
150
+ "torch": "2.7.1",
151
+ "python-dotenv": "1.1.1",
152
+ "multidict": "6.6.3",
153
+ "httpx": "0.28.1",
154
+ "aiohttp": "3.12.15",
155
+ "xxhash": "3.5.0",
156
+ "PyYAML": "6.0.2",
157
+ "colorama": "0.4.6",
158
+ "openai": "1.98.0",
159
+ "threadpoolctl": "3.6.0",
160
+ "nvidia-cudnn-cu12": "9.5.1.17",
161
+ "pandas": "2.3.1",
162
+ "hf-xet": "1.1.5",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 0.8888888888888888,
180
+ "accuracy_ci_low": 0.5310928992288233,
181
+ "accuracy_ci_high": 1.0,
182
+ "score_name": "accuracy",
183
+ "score": 0.8888888888888888,
184
+ "score_ci_high": 1.0,
185
+ "score_ci_low": 0.5310928992288233,
186
+ "num_of_instances": 9
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 1.0,
190
+ "accuracy_ci_low": 1.0,
191
+ "accuracy_ci_high": 1.0,
192
+ "score_name": "accuracy",
193
+ "score": 1.0,
194
+ "score_ci_high": 1.0,
195
+ "score_ci_low": 1.0,
196
+ "num_of_instances": 9
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 0.8888888888888888,
200
+ "accuracy_ci_low": 0.46041936253217447,
201
+ "accuracy_ci_high": 1.0,
202
+ "score_name": "accuracy",
203
+ "score": 0.8888888888888888,
204
+ "score_ci_high": 1.0,
205
+ "score_ci_low": 0.46041936253217447,
206
+ "num_of_instances": 9
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 1.0,
210
+ "accuracy_ci_low": 1.0,
211
+ "accuracy_ci_high": 1.0,
212
+ "score_name": "accuracy",
213
+ "score": 1.0,
214
+ "score_ci_high": 1.0,
215
+ "score_ci_low": 1.0,
216
+ "num_of_instances": 9
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 1.0,
220
+ "accuracy_ci_low": 1.0,
221
+ "accuracy_ci_high": 1.0,
222
+ "score_name": "accuracy",
223
+ "score": 1.0,
224
+ "score_ci_high": 1.0,
225
+ "score_ci_low": 1.0,
226
+ "num_of_instances": 9
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 1.0,
230
+ "accuracy_ci_low": 1.0,
231
+ "accuracy_ci_high": 1.0,
232
+ "score_name": "accuracy",
233
+ "score": 1.0,
234
+ "score_ci_high": 1.0,
235
+ "score_ci_low": 1.0,
236
+ "num_of_instances": 9
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 1.0,
240
+ "accuracy_ci_low": 1.0,
241
+ "accuracy_ci_high": 1.0,
242
+ "score_name": "accuracy",
243
+ "score": 1.0,
244
+ "score_ci_high": 1.0,
245
+ "score_ci_low": 1.0,
246
+ "num_of_instances": 9
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 1.0,
250
+ "accuracy_ci_low": 1.0,
251
+ "accuracy_ci_high": 1.0,
252
+ "score_name": "accuracy",
253
+ "score": 1.0,
254
+ "score_ci_high": 1.0,
255
+ "score_ci_low": 1.0,
256
+ "num_of_instances": 9
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 1.0,
260
+ "accuracy_ci_low": 1.0,
261
+ "accuracy_ci_high": 1.0,
262
+ "score_name": "accuracy",
263
+ "score": 1.0,
264
+ "score_ci_high": 1.0,
265
+ "score_ci_low": 1.0,
266
+ "num_of_instances": 9
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 1.0,
270
+ "accuracy_ci_low": 1.0,
271
+ "accuracy_ci_high": 1.0,
272
+ "score_name": "accuracy",
273
+ "score": 1.0,
274
+ "score_ci_high": 1.0,
275
+ "score_ci_low": 1.0,
276
+ "num_of_instances": 9
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 1.0,
280
+ "accuracy_ci_low": 1.0,
281
+ "accuracy_ci_high": 1.0,
282
+ "score_name": "accuracy",
283
+ "score": 1.0,
284
+ "score_ci_high": 1.0,
285
+ "score_ci_low": 1.0,
286
+ "num_of_instances": 9
287
+ },
288
+ "score": 0.9797979797979798,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.3711340206185567,
296
+ "score": 0.3711340206185567,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.3711340206185567,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.8181818181818182,
307
+ "f1_Organization": 0.6428571428571429,
308
+ "f1_Location": 0.75,
309
+ "f1_macro": 0.737012987012987,
310
+ "recall_macro": 0.683488612836439,
311
+ "precision_macro": 0.8125,
312
+ "in_classes_support": 1.0,
313
+ "f1_micro": 0.7285714285714285,
314
+ "recall_micro": 0.68,
315
+ "precision_micro": 0.7846153846153846,
316
+ "score": 0.7285714285714285,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.5888115359844259,
319
+ "score_ci_high": 0.8223830090806191,
320
+ "f1_micro_ci_low": 0.5888115359844259,
321
+ "f1_micro_ci_high": 0.8223830090806191
322
+ },
323
+ "score": 0.7285714285714285,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.7142857142857143,
330
+ "accuracy_ci_low": 0.2857142857142857,
331
+ "accuracy_ci_high": 1.0,
332
+ "score_name": "accuracy",
333
+ "score": 0.7142857142857143,
334
+ "score_ci_high": 1.0,
335
+ "score_ci_low": 0.2857142857142857,
336
+ "num_of_instances": 7
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.2857142857142857,
340
+ "accuracy_ci_low": 0.0,
341
+ "accuracy_ci_high": 0.7142857142857143,
342
+ "score_name": "accuracy",
343
+ "score": 0.2857142857142857,
344
+ "score_ci_high": 0.7142857142857143,
345
+ "score_ci_low": 0.0,
346
+ "num_of_instances": 7
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.0,
350
+ "accuracy_ci_low": 0.0,
351
+ "accuracy_ci_high": 0.0,
352
+ "score_name": "accuracy",
353
+ "score": 0.0,
354
+ "score_ci_high": 0.0,
355
+ "score_ci_low": 0.0,
356
+ "num_of_instances": 7
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 1.0,
360
+ "accuracy_ci_low": 1.0,
361
+ "accuracy_ci_high": 1.0,
362
+ "score_name": "accuracy",
363
+ "score": 1.0,
364
+ "score_ci_high": 1.0,
365
+ "score_ci_low": 1.0,
366
+ "num_of_instances": 7
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.7142857142857143,
370
+ "accuracy_ci_low": 0.2857142857142857,
371
+ "accuracy_ci_high": 1.0,
372
+ "score_name": "accuracy",
373
+ "score": 0.7142857142857143,
374
+ "score_ci_high": 1.0,
375
+ "score_ci_low": 0.2857142857142857,
376
+ "num_of_instances": 7
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.42857142857142855,
380
+ "accuracy_ci_low": 0.14285714285714285,
381
+ "accuracy_ci_high": 0.8571428571428571,
382
+ "score_name": "accuracy",
383
+ "score": 0.42857142857142855,
384
+ "score_ci_high": 0.8571428571428571,
385
+ "score_ci_low": 0.14285714285714285,
386
+ "num_of_instances": 7
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.2857142857142857,
390
+ "accuracy_ci_low": 0.0,
391
+ "accuracy_ci_high": 0.7142857142857143,
392
+ "score_name": "accuracy",
393
+ "score": 0.2857142857142857,
394
+ "score_ci_high": 0.7142857142857143,
395
+ "score_ci_low": 0.0,
396
+ "num_of_instances": 7
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.14285714285714285,
400
+ "accuracy_ci_low": 0.0,
401
+ "accuracy_ci_high": 0.6807203593841678,
402
+ "score_name": "accuracy",
403
+ "score": 0.14285714285714285,
404
+ "score_ci_high": 0.6807203593841678,
405
+ "score_ci_low": 0.0,
406
+ "num_of_instances": 7
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.7142857142857143,
410
+ "accuracy_ci_low": 0.2857142857142857,
411
+ "accuracy_ci_high": 1.0,
412
+ "score_name": "accuracy",
413
+ "score": 0.7142857142857143,
414
+ "score_ci_high": 1.0,
415
+ "score_ci_low": 0.2857142857142857,
416
+ "num_of_instances": 7
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 1.0,
420
+ "accuracy_ci_low": 1.0,
421
+ "accuracy_ci_high": 1.0,
422
+ "score_name": "accuracy",
423
+ "score": 1.0,
424
+ "score_ci_high": 1.0,
425
+ "score_ci_low": 1.0,
426
+ "num_of_instances": 7
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.14285714285714285,
430
+ "accuracy_ci_low": 0.0,
431
+ "accuracy_ci_high": 0.5714285714285714,
432
+ "score_name": "accuracy",
433
+ "score": 0.14285714285714285,
434
+ "score_ci_high": 0.5714285714285714,
435
+ "score_ci_low": 0.0,
436
+ "num_of_instances": 7
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.5714285714285714,
440
+ "accuracy_ci_low": 0.14285714285714285,
441
+ "accuracy_ci_high": 0.8571428571428571,
442
+ "score_name": "accuracy",
443
+ "score": 0.5714285714285714,
444
+ "score_ci_high": 0.8571428571428571,
445
+ "score_ci_low": 0.14285714285714285,
446
+ "num_of_instances": 7
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.2857142857142857,
450
+ "accuracy_ci_low": 0.0,
451
+ "accuracy_ci_high": 0.7142857142857143,
452
+ "score_name": "accuracy",
453
+ "score": 0.2857142857142857,
454
+ "score_ci_high": 0.7142857142857143,
455
+ "score_ci_low": 0.0,
456
+ "num_of_instances": 7
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.7142857142857143,
460
+ "accuracy_ci_low": 0.2857142857142857,
461
+ "accuracy_ci_high": 1.0,
462
+ "score_name": "accuracy",
463
+ "score": 0.7142857142857143,
464
+ "score_ci_high": 1.0,
465
+ "score_ci_low": 0.2857142857142857,
466
+ "num_of_instances": 7
467
+ },
468
+ "score": 0.5,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.0,
475
+ "f1_suggestive": 0.0,
476
+ "f1_generic": 0.0,
477
+ "f1_fanciful": 0.0,
478
+ "f1_descriptive": 0.0,
479
+ "f1_arbitrary": 0.0,
480
+ "f1_macro_ci_low": 0.0,
481
+ "f1_macro_ci_high": 0.0,
482
+ "score_name": "f1_micro",
483
+ "score": 0.0,
484
+ "score_ci_high": 0.0,
485
+ "score_ci_low": 0.0,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.0,
488
+ "accuracy_ci_low": 0.0,
489
+ "accuracy_ci_high": 0.0,
490
+ "f1_micro": 0.0,
491
+ "f1_micro_ci_low": 0.0,
492
+ "f1_micro_ci_high": 0.0
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.47619047619047616,
496
+ "f1_no": 0.6666666666666666,
497
+ "f1_yes": 0.2857142857142857,
498
+ "f1_macro_ci_low": 0.2857142857142857,
499
+ "f1_macro_ci_high": 0.8813336459688916,
500
+ "score_name": "f1_micro",
501
+ "score": 0.5806451612903226,
502
+ "score_ci_high": 0.7878787878787878,
503
+ "score_ci_low": 0.3333333333333333,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.45,
506
+ "accuracy_ci_low": 0.25,
507
+ "accuracy_ci_high": 0.7,
508
+ "f1_micro": 0.5806451612903226,
509
+ "f1_micro_ci_low": 0.3333333333333333,
510
+ "f1_micro_ci_high": 0.7878787878787878
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.20272108843537415,
514
+ "f1_conclusion": 0.0,
515
+ "f1_decree": 0.0,
516
+ "f1_issue": 0.2857142857142857,
517
+ "f1_analysis": 0.8,
518
+ "f1_facts": 0.3333333333333333,
519
+ "f1_procedural history": 0.0,
520
+ "f1_rule": 0.0,
521
+ "f1_macro_ci_low": 0.05,
522
+ "f1_macro_ci_high": 0.3627437138685071,
523
+ "score_name": "f1_micro",
524
+ "score": 0.27586206896551724,
525
+ "score_ci_high": 0.5223537291196929,
526
+ "score_ci_low": 0.07650831228694685,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.2,
529
+ "accuracy_ci_low": 0.05,
530
+ "accuracy_ci_high": 0.40138012181413957,
531
+ "f1_micro": 0.27586206896551724,
532
+ "f1_micro_ci_low": 0.07650831228694685,
533
+ "f1_micro_ci_high": 0.5223537291196929
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.3666666666666667,
537
+ "f1_yes": 0.3333333333333333,
538
+ "f1_no": 0.4,
539
+ "f1_macro_ci_low": 0.16690480249191558,
540
+ "f1_macro_ci_high": 0.6427398020907653,
541
+ "score_name": "f1_micro",
542
+ "score": 0.37037037037037035,
543
+ "score_ci_high": 0.625,
544
+ "score_ci_low": 0.16,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.25,
547
+ "accuracy_ci_low": 0.1,
548
+ "accuracy_ci_high": 0.5,
549
+ "f1_micro": 0.37037037037037035,
550
+ "f1_micro_ci_low": 0.16,
551
+ "f1_micro_ci_high": 0.625
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.660633484162896,
555
+ "f1_yes": 0.6153846153846154,
556
+ "f1_no": 0.7058823529411765,
557
+ "f1_macro_ci_low": 0.40467435105346683,
558
+ "f1_macro_ci_high": 0.8261376660890378,
559
+ "score_name": "f1_micro",
560
+ "score": 0.6666666666666666,
561
+ "score_ci_high": 0.8235294117647058,
562
+ "score_ci_low": 0.46153846153846156,
563
+ "num_of_instances": 20,
564
+ "accuracy": 0.5,
565
+ "accuracy_ci_low": 0.3,
566
+ "accuracy_ci_high": 0.7,
567
+ "f1_micro": 0.6666666666666666,
568
+ "f1_micro_ci_low": 0.46153846153846156,
569
+ "f1_micro_ci_high": 0.8235294117647058
570
+ },
571
+ "score": 0.37870885345857536,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.38195526695526694,
578
+ "f1_cars": 0.5714285714285714,
579
+ "f1_windows x": 0.5714285714285714,
580
+ "f1_computer graphics": 0.5454545454545454,
581
+ "f1_atheism": 0.0,
582
+ "f1_christianity": 0.2857142857142857,
583
+ "f1_religion": 0.0,
584
+ "f1_medicine": 0.6666666666666666,
585
+ "f1_microsoft windows": 0.5,
586
+ "f1_middle east": 0.2857142857142857,
587
+ "f1_motorcycles": 0.25,
588
+ "f1_for sale": 0.5714285714285714,
589
+ "f1_pc hardware": 0.2222222222222222,
590
+ "f1_mac hardware": 0.8,
591
+ "f1_guns": 0.2857142857142857,
592
+ "f1_politics": 0.5,
593
+ "f1_space": 0.75,
594
+ "f1_cryptography": 0.0,
595
+ "f1_baseball": 0.5,
596
+ "f1_hockey": 0.3333333333333333,
597
+ "f1_electronics": 0.0,
598
+ "f1_macro_ci_low": 0.3045532013795943,
599
+ "f1_macro_ci_high": 0.48572662880615863,
600
+ "score_name": "f1_micro",
601
+ "score": 0.4305555555555556,
602
+ "score_ci_high": 0.5281645512463683,
603
+ "score_ci_low": 0.31327005378302275,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.31,
606
+ "accuracy_ci_low": 0.22,
607
+ "accuracy_ci_high": 0.4,
608
+ "f1_micro": 0.4305555555555556,
609
+ "f1_micro_ci_low": 0.31327005378302275,
610
+ "f1_micro_ci_high": 0.5281645512463683
611
+ },
612
+ "score": 0.4305555555555556,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.8944327731092437,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9411764705882353,
620
+ "f1_debt collection": 0.6666666666666666,
621
+ "f1_payday loan or title loan or personal loan": 1.0,
622
+ "f1_credit card or prepaid card": 0.8571428571428571,
623
+ "f1_student loan": 0.8333333333333334,
624
+ "f1_checking or savings account": 1.0,
625
+ "f1_mortgage": 0.8571428571428571,
626
+ "f1_money transfer or virtual currency or money service": 1.0,
627
+ "f1_macro_ci_low": 0.806614365464373,
628
+ "f1_macro_ci_high": 0.9657999668768317,
629
+ "score_name": "f1_micro",
630
+ "score": 0.9090909090909091,
631
+ "score_ci_high": 0.9547738693467337,
632
+ "score_ci_low": 0.8379092938797416,
633
+ "num_of_instances": 100,
634
+ "accuracy": 0.9,
635
+ "accuracy_ci_low": 0.83,
636
+ "accuracy_ci_high": 0.95,
637
+ "f1_micro": 0.9090909090909091,
638
+ "f1_micro_ci_low": 0.8379092938797416,
639
+ "f1_micro_ci_high": 0.9547738693467337
640
+ },
641
+ "cfpb_product_watsonx": {
642
+ "f1_macro": 0.8162680456798105,
643
+ "f1_mortgages and loans": 0.7619047619047619,
644
+ "f1_credit card": 0.8571428571428571,
645
+ "f1_debt collection": 0.7058823529411765,
646
+ "f1_credit reporting": 0.8333333333333334,
647
+ "f1_retail banking": 0.9230769230769231,
648
+ "f1_macro_ci_low": 0.7016062262967031,
649
+ "f1_macro_ci_high": 0.9223789564935859,
650
+ "score_name": "f1_micro",
651
+ "score": 0.8125,
652
+ "score_ci_high": 0.9072164948453608,
653
+ "score_ci_low": 0.6956521739130435,
654
+ "num_of_instances": 50,
655
+ "accuracy": 0.78,
656
+ "accuracy_ci_low": 0.66,
657
+ "accuracy_ci_high": 0.88,
658
+ "f1_micro": 0.8125,
659
+ "f1_micro_ci_low": 0.6956521739130435,
660
+ "f1_micro_ci_high": 0.9072164948453608
661
+ },
662
+ "score": 0.8607954545454546,
663
+ "score_name": "subsets_mean",
664
+ "num_of_instances": 150
665
+ },
666
+ "qa_finance": {
667
+ "fin_qa": {
668
+ "num_of_instances": 100,
669
+ "program_accuracy": 0.23,
670
+ "score": 0.23,
671
+ "score_name": "program_accuracy",
672
+ "execution_accuracy": 0.2,
673
+ "program_accuracy_ci_low": 0.16,
674
+ "program_accuracy_ci_high": 0.32,
675
+ "score_ci_low": 0.16,
676
+ "score_ci_high": 0.32,
677
+ "execution_accuracy_ci_low": 0.13,
678
+ "execution_accuracy_ci_high": 0.29
679
+ },
680
+ "score": 0.23,
681
+ "score_name": "subsets_mean",
682
+ "num_of_instances": 100
683
+ },
684
+ "rag_general": {
685
+ "rag_response_generation_clapnq": {
686
+ "precision": 0.4390234149184736,
687
+ "recall": 0.5262161167968094,
688
+ "f1": 0.4405810035203588,
689
+ "precision_ci_low": 0.40326834159222774,
690
+ "precision_ci_high": 0.47447671580848416,
691
+ "recall_ci_low": 0.4871121402499137,
692
+ "recall_ci_high": 0.5624733915163893,
693
+ "f1_ci_low": 0.40997461940491314,
694
+ "f1_ci_high": 0.469781968797013,
695
+ "score_name": "f1",
696
+ "score": 0.4405810035203588,
697
+ "score_ci_high": 0.469781968797013,
698
+ "score_ci_low": 0.40997461940491314,
699
+ "num_of_instances": 100,
700
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6758063852787017,
701
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6950149410963058,
702
+ "correctness_precision_bert_score.deberta_large_mnli": 0.6671330830454827,
703
+ "faithfullness_f1_token_overlap": 0.3422892513241904,
704
+ "faithfullness_recall_token_overlap": 0.25990022331997953,
705
+ "faithfullness_precision_token_overlap": 0.6239090003430506,
706
+ "correctness_f1_token_overlap": 0.4405810035203588,
707
+ "correctness_recall_token_overlap": 0.5262161167968094,
708
+ "correctness_precision_token_overlap": 0.4390234149184736
709
+ },
710
+ "score": 0.4405810035203588,
711
+ "score_name": "subsets_mean",
712
+ "num_of_instances": 100
713
+ },
714
+ "reasoning": {
715
+ "hellaswag": {
716
+ "accuracy": 0.53,
717
+ "accuracy_ci_low": 0.43,
718
+ "accuracy_ci_high": 0.62,
719
+ "score_name": "accuracy",
720
+ "score": 0.53,
721
+ "score_ci_high": 0.62,
722
+ "score_ci_low": 0.43,
723
+ "num_of_instances": 100
724
+ },
725
+ "openbook_qa": {
726
+ "accuracy": 0.93,
727
+ "accuracy_ci_low": 0.86,
728
+ "accuracy_ci_high": 0.97,
729
+ "score_name": "accuracy",
730
+ "score": 0.93,
731
+ "score_ci_high": 0.97,
732
+ "score_ci_low": 0.86,
733
+ "num_of_instances": 100
734
+ },
735
+ "score": 0.73,
736
+ "score_name": "subsets_mean",
737
+ "num_of_instances": 200
738
+ },
739
+ "safety": {
740
+ "attaq_500": {
741
+ "severity_critical": 4.0,
742
+ "severity_high": 4.0,
743
+ "severity_medium": 0.0,
744
+ "severity_low": 92.0,
745
+ "category_harmful_info": 0.7918323914683054,
746
+ "category_deception": 0.8392233636343118,
747
+ "category_explicit_content": 0.7774655876135586,
748
+ "category_violence": 0.8427019776953828,
749
+ "category_discrimination": 0.6616657735809447,
750
+ "category_substance_abuse": 0.775272503743569,
751
+ "category_pii": 0.883132066163752,
752
+ "safety": 0.8060687377645323,
753
+ "safety_ci_low": 0.7640155436260707,
754
+ "safety_ci_high": 0.8341465976524678,
755
+ "score_name": "safety",
756
+ "score": 0.8060687377645323,
757
+ "score_ci_high": 0.8341465976524678,
758
+ "score_ci_low": 0.7640155436260707,
759
+ "num_of_instances": 100
760
+ },
761
+ "score": 0.8060687377645323,
762
+ "score_name": "subsets_mean",
763
+ "num_of_instances": 100
764
+ },
765
+ "summarization": {
766
+ "billsum_document_filtered_to_6000_chars": {
767
+ "num_of_instances": 100,
768
+ "rouge2": 0.15402435010536653,
769
+ "rougeLsum": 0.31968597390653825,
770
+ "rouge1": 0.3880965291674629,
771
+ "rougeL": 0.2524394530234589,
772
+ "score": 0.2524394530234589,
773
+ "score_name": "rougeL",
774
+ "rouge2_ci_low": 0.14130666076526238,
775
+ "rouge2_ci_high": 0.16860889298392745,
776
+ "rougeLsum_ci_low": 0.3016069005629268,
777
+ "rougeLsum_ci_high": 0.33979038219304886,
778
+ "rouge1_ci_low": 0.3688926041108984,
779
+ "rouge1_ci_high": 0.40886117544130285,
780
+ "rougeL_ci_low": 0.23942067032602865,
781
+ "rougeL_ci_high": 0.2682386978258959,
782
+ "score_ci_low": 0.23942067032602865,
783
+ "score_ci_high": 0.2682386978258959
784
+ },
785
+ "tldr_document_filtered_to_6000_chars": {
786
+ "num_of_instances": 100,
787
+ "rouge2": 0.01358931671878944,
788
+ "rougeLsum": 0.08805558214582762,
789
+ "rouge1": 0.11011699084176203,
790
+ "rougeL": 0.08082291263637024,
791
+ "score": 0.08082291263637024,
792
+ "score_name": "rougeL",
793
+ "rouge2_ci_low": 0.009687205104610215,
794
+ "rouge2_ci_high": 0.018802588392340995,
795
+ "rougeLsum_ci_low": 0.07817975698587638,
796
+ "rougeLsum_ci_high": 0.09961952144610982,
797
+ "rouge1_ci_low": 0.096196695824933,
798
+ "rouge1_ci_high": 0.12633882653954673,
799
+ "rougeL_ci_low": 0.07153795581984226,
800
+ "rougeL_ci_high": 0.09127045045673031,
801
+ "score_ci_low": 0.07153795581984226,
802
+ "score_ci_high": 0.09127045045673031
803
+ },
804
+ "score": 0.16663118282991457,
805
+ "score_name": "subsets_mean",
806
+ "num_of_instances": 200
807
+ },
808
+ "translation": {
809
+ "mt_flores_101_ara_eng": {
810
+ "num_of_instances": 6,
811
+ "counts": [
812
+ 137,
813
+ 83,
814
+ 56,
815
+ 41
816
+ ],
817
+ "totals": [
818
+ 211,
819
+ 205,
820
+ 199,
821
+ 193
822
+ ],
823
+ "precisions": [
824
+ 0.6492890995260663,
825
+ 0.4048780487804878,
826
+ 0.2814070351758794,
827
+ 0.21243523316062177
828
+ ],
829
+ "bp": 1.0,
830
+ "sys_len": 211,
831
+ "ref_len": 208,
832
+ "sacrebleu": 0.3540633387626259,
833
+ "score": 0.3540633387626259,
834
+ "score_name": "sacrebleu",
835
+ "score_ci_low": 0.188995320312599,
836
+ "score_ci_high": 0.4781846360393117,
837
+ "sacrebleu_ci_low": 0.188995320312599,
838
+ "sacrebleu_ci_high": 0.4781846360393117
839
+ },
840
+ "mt_flores_101_deu_eng": {
841
+ "num_of_instances": 6,
842
+ "counts": [
843
+ 132,
844
+ 74,
845
+ 41,
846
+ 23
847
+ ],
848
+ "totals": [
849
+ 216,
850
+ 210,
851
+ 204,
852
+ 198
853
+ ],
854
+ "precisions": [
855
+ 0.6111111111111112,
856
+ 0.3523809523809524,
857
+ 0.20098039215686275,
858
+ 0.11616161616161616
859
+ ],
860
+ "bp": 1.0,
861
+ "sys_len": 216,
862
+ "ref_len": 208,
863
+ "sacrebleu": 0.2662791948025941,
864
+ "score": 0.2662791948025941,
865
+ "score_name": "sacrebleu",
866
+ "score_ci_low": 0.17615641560459036,
867
+ "score_ci_high": 0.37265924807285117,
868
+ "sacrebleu_ci_low": 0.17615641560459036,
869
+ "sacrebleu_ci_high": 0.37265924807285117
870
+ },
871
+ "mt_flores_101_eng_ara": {
872
+ "num_of_instances": 6,
873
+ "counts": [
874
+ 128,
875
+ 76,
876
+ 43,
877
+ 24
878
+ ],
879
+ "totals": [
880
+ 201,
881
+ 195,
882
+ 189,
883
+ 183
884
+ ],
885
+ "precisions": [
886
+ 0.6368159203980099,
887
+ 0.3897435897435897,
888
+ 0.2275132275132275,
889
+ 0.13114754098360656
890
+ ],
891
+ "bp": 0.960980660057086,
892
+ "sys_len": 201,
893
+ "ref_len": 209,
894
+ "sacrebleu": 0.28190616374550787,
895
+ "score": 0.28190616374550787,
896
+ "score_name": "sacrebleu",
897
+ "score_ci_low": 0.19630579725874525,
898
+ "score_ci_high": 0.3918951959792066,
899
+ "sacrebleu_ci_low": 0.19630579725874525,
900
+ "sacrebleu_ci_high": 0.3918951959792066
901
+ },
902
+ "mt_flores_101_eng_deu": {
903
+ "num_of_instances": 6,
904
+ "counts": [
905
+ 139,
906
+ 89,
907
+ 60,
908
+ 46
909
+ ],
910
+ "totals": [
911
+ 222,
912
+ 216,
913
+ 210,
914
+ 204
915
+ ],
916
+ "precisions": [
917
+ 0.6261261261261262,
918
+ 0.41203703703703703,
919
+ 0.28571428571428575,
920
+ 0.22549019607843138
921
+ ],
922
+ "bp": 1.0,
923
+ "sys_len": 222,
924
+ "ref_len": 216,
925
+ "sacrebleu": 0.3590578493818958,
926
+ "score": 0.3590578493818958,
927
+ "score_name": "sacrebleu",
928
+ "score_ci_low": 0.22672780625357206,
929
+ "score_ci_high": 0.48655355879263695,
930
+ "sacrebleu_ci_low": 0.22672780625357206,
931
+ "sacrebleu_ci_high": 0.48655355879263695
932
+ },
933
+ "mt_flores_101_eng_fra": {
934
+ "num_of_instances": 6,
935
+ "counts": [
936
+ 188,
937
+ 147,
938
+ 115,
939
+ 90
940
+ ],
941
+ "totals": [
942
+ 231,
943
+ 225,
944
+ 219,
945
+ 213
946
+ ],
947
+ "precisions": [
948
+ 0.8138528138528138,
949
+ 0.6533333333333333,
950
+ 0.5251141552511416,
951
+ 0.4225352112676056
952
+ ],
953
+ "bp": 0.9828330432930387,
954
+ "sys_len": 231,
955
+ "ref_len": 235,
956
+ "sacrebleu": 0.5760087471924777,
957
+ "score": 0.5760087471924777,
958
+ "score_name": "sacrebleu",
959
+ "score_ci_low": 0.4220699958012471,
960
+ "score_ci_high": 0.733033345465493,
961
+ "sacrebleu_ci_low": 0.4220699958012471,
962
+ "sacrebleu_ci_high": 0.733033345465493
963
+ },
964
+ "mt_flores_101_eng_kor": {
965
+ "num_of_instances": 6,
966
+ "counts": [
967
+ 159,
968
+ 85,
969
+ 54,
970
+ 35
971
+ ],
972
+ "totals": [
973
+ 277,
974
+ 271,
975
+ 265,
976
+ 259
977
+ ],
978
+ "precisions": [
979
+ 0.5740072202166064,
980
+ 0.31365313653136534,
981
+ 0.2037735849056604,
982
+ 0.13513513513513514
983
+ ],
984
+ "bp": 1.0,
985
+ "sys_len": 277,
986
+ "ref_len": 249,
987
+ "sacrebleu": 0.26535103691316425,
988
+ "score": 0.26535103691316425,
989
+ "score_name": "sacrebleu",
990
+ "score_ci_low": 0.2050827299500949,
991
+ "score_ci_high": 0.3242346639521402,
992
+ "sacrebleu_ci_low": 0.2050827299500949,
993
+ "sacrebleu_ci_high": 0.3242346639521402
994
+ },
995
+ "mt_flores_101_eng_por": {
996
+ "num_of_instances": 6,
997
+ "counts": [
998
+ 175,
999
+ 131,
1000
+ 104,
1001
+ 81
1002
+ ],
1003
+ "totals": [
1004
+ 232,
1005
+ 226,
1006
+ 220,
1007
+ 214
1008
+ ],
1009
+ "precisions": [
1010
+ 0.7543103448275862,
1011
+ 0.5796460176991151,
1012
+ 0.4727272727272727,
1013
+ 0.37850467289719625
1014
+ ],
1015
+ "bp": 1.0,
1016
+ "sys_len": 232,
1017
+ "ref_len": 222,
1018
+ "sacrebleu": 0.5288697242857515,
1019
+ "score": 0.5288697242857515,
1020
+ "score_name": "sacrebleu",
1021
+ "score_ci_low": 0.47846923363851745,
1022
+ "score_ci_high": 0.6139001136679906,
1023
+ "sacrebleu_ci_low": 0.47846923363851745,
1024
+ "sacrebleu_ci_high": 0.6139001136679906
1025
+ },
1026
+ "mt_flores_101_eng_ron": {
1027
+ "num_of_instances": 6,
1028
+ "counts": [
1029
+ 164,
1030
+ 113,
1031
+ 83,
1032
+ 62
1033
+ ],
1034
+ "totals": [
1035
+ 228,
1036
+ 222,
1037
+ 216,
1038
+ 210
1039
+ ],
1040
+ "precisions": [
1041
+ 0.7192982456140351,
1042
+ 0.509009009009009,
1043
+ 0.38425925925925924,
1044
+ 0.29523809523809524
1045
+ ],
1046
+ "bp": 0.9912664313028773,
1047
+ "sys_len": 228,
1048
+ "ref_len": 230,
1049
+ "sacrebleu": 0.44750531811271016,
1050
+ "score": 0.44750531811271016,
1051
+ "score_name": "sacrebleu",
1052
+ "score_ci_low": 0.32949704407129193,
1053
+ "score_ci_high": 0.5998604762359077,
1054
+ "sacrebleu_ci_low": 0.32949704407129193,
1055
+ "sacrebleu_ci_high": 0.5998604762359077
1056
+ },
1057
+ "mt_flores_101_eng_spa": {
1058
+ "num_of_instances": 6,
1059
+ "counts": [
1060
+ 157,
1061
+ 97,
1062
+ 65,
1063
+ 43
1064
+ ],
1065
+ "totals": [
1066
+ 226,
1067
+ 220,
1068
+ 214,
1069
+ 208
1070
+ ],
1071
+ "precisions": [
1072
+ 0.6946902654867256,
1073
+ 0.4409090909090909,
1074
+ 0.3037383177570094,
1075
+ 0.20673076923076925
1076
+ ],
1077
+ "bp": 0.9275382560481537,
1078
+ "sys_len": 226,
1079
+ "ref_len": 243,
1080
+ "sacrebleu": 0.34541649552517106,
1081
+ "score": 0.34541649552517106,
1082
+ "score_name": "sacrebleu",
1083
+ "score_ci_low": 0.2833834978695395,
1084
+ "score_ci_high": 0.43980007583789943,
1085
+ "sacrebleu_ci_low": 0.2833834978695395,
1086
+ "sacrebleu_ci_high": 0.43980007583789943
1087
+ },
1088
+ "mt_flores_101_fra_eng": {
1089
+ "num_of_instances": 6,
1090
+ "counts": [
1091
+ 156,
1092
+ 104,
1093
+ 71,
1094
+ 49
1095
+ ],
1096
+ "totals": [
1097
+ 219,
1098
+ 213,
1099
+ 207,
1100
+ 201
1101
+ ],
1102
+ "precisions": [
1103
+ 0.7123287671232876,
1104
+ 0.48826291079812206,
1105
+ 0.3429951690821256,
1106
+ 0.24378109452736318
1107
+ ],
1108
+ "bp": 1.0,
1109
+ "sys_len": 219,
1110
+ "ref_len": 208,
1111
+ "sacrebleu": 0.4129576932882607,
1112
+ "score": 0.4129576932882607,
1113
+ "score_name": "sacrebleu",
1114
+ "score_ci_low": 0.29705055420750553,
1115
+ "score_ci_high": 0.5590420438259449,
1116
+ "sacrebleu_ci_low": 0.29705055420750553,
1117
+ "sacrebleu_ci_high": 0.5590420438259449
1118
+ },
1119
+ "mt_flores_101_jpn_eng": {
1120
+ "num_of_instances": 6,
1121
+ "counts": [
1122
+ 120,
1123
+ 69,
1124
+ 42,
1125
+ 29
1126
+ ],
1127
+ "totals": [
1128
+ 182,
1129
+ 177,
1130
+ 172,
1131
+ 167
1132
+ ],
1133
+ "precisions": [
1134
+ 0.6593406593406592,
1135
+ 0.3898305084745763,
1136
+ 0.24418604651162792,
1137
+ 0.17365269461077845
1138
+ ],
1139
+ "bp": 0.8668778997501817,
1140
+ "sys_len": 182,
1141
+ "ref_len": 208,
1142
+ "sacrebleu": 0.28009473432383397,
1143
+ "score": 0.28009473432383397,
1144
+ "score_name": "sacrebleu",
1145
+ "score_ci_low": 0.12663787459170767,
1146
+ "score_ci_high": 0.3726473304411957,
1147
+ "sacrebleu_ci_low": 0.12663787459170767,
1148
+ "sacrebleu_ci_high": 0.3726473304411957
1149
+ },
1150
+ "mt_flores_101_kor_eng": {
1151
+ "num_of_instances": 6,
1152
+ "counts": [
1153
+ 122,
1154
+ 69,
1155
+ 41,
1156
+ 27
1157
+ ],
1158
+ "totals": [
1159
+ 191,
1160
+ 186,
1161
+ 181,
1162
+ 176
1163
+ ],
1164
+ "precisions": [
1165
+ 0.6387434554973822,
1166
+ 0.3709677419354838,
1167
+ 0.2265193370165746,
1168
+ 0.1534090909090909
1169
+ ],
1170
+ "bp": 0.9148407838195897,
1171
+ "sys_len": 191,
1172
+ "ref_len": 208,
1173
+ "sacrebleu": 0.2755816298698519,
1174
+ "score": 0.2755816298698519,
1175
+ "score_name": "sacrebleu",
1176
+ "score_ci_low": 0.1550375519752376,
1177
+ "score_ci_high": 0.4298533640603147,
1178
+ "sacrebleu_ci_low": 0.1550375519752376,
1179
+ "sacrebleu_ci_high": 0.4298533640603147
1180
+ },
1181
+ "mt_flores_101_por_eng": {
1182
+ "num_of_instances": 6,
1183
+ "counts": [
1184
+ 157,
1185
+ 114,
1186
+ 82,
1187
+ 59
1188
+ ],
1189
+ "totals": [
1190
+ 217,
1191
+ 211,
1192
+ 205,
1193
+ 199
1194
+ ],
1195
+ "precisions": [
1196
+ 0.7235023041474655,
1197
+ 0.5402843601895735,
1198
+ 0.4,
1199
+ 0.2964824120603015
1200
+ ],
1201
+ "bp": 1.0,
1202
+ "sys_len": 217,
1203
+ "ref_len": 208,
1204
+ "sacrebleu": 0.464013173269593,
1205
+ "score": 0.464013173269593,
1206
+ "score_name": "sacrebleu",
1207
+ "score_ci_low": 0.2908039757752357,
1208
+ "score_ci_high": 0.5973824527832076,
1209
+ "sacrebleu_ci_low": 0.2908039757752357,
1210
+ "sacrebleu_ci_high": 0.5973824527832076
1211
+ },
1212
+ "mt_flores_101_ron_eng": {
1213
+ "num_of_instances": 6,
1214
+ "counts": [
1215
+ 165,
1216
+ 116,
1217
+ 84,
1218
+ 59
1219
+ ],
1220
+ "totals": [
1221
+ 228,
1222
+ 222,
1223
+ 216,
1224
+ 210
1225
+ ],
1226
+ "precisions": [
1227
+ 0.7236842105263157,
1228
+ 0.5225225225225225,
1229
+ 0.38888888888888884,
1230
+ 0.28095238095238095
1231
+ ],
1232
+ "bp": 1.0,
1233
+ "sys_len": 228,
1234
+ "ref_len": 208,
1235
+ "sacrebleu": 0.4508458651239866,
1236
+ "score": 0.4508458651239866,
1237
+ "score_name": "sacrebleu",
1238
+ "score_ci_low": 0.31931249774136927,
1239
+ "score_ci_high": 0.5112780455453768,
1240
+ "sacrebleu_ci_low": 0.31931249774136927,
1241
+ "sacrebleu_ci_high": 0.5112780455453768
1242
+ },
1243
+ "mt_flores_101_spa_eng": {
1244
+ "num_of_instances": 6,
1245
+ "counts": [
1246
+ 152,
1247
+ 96,
1248
+ 59,
1249
+ 39
1250
+ ],
1251
+ "totals": [
1252
+ 220,
1253
+ 214,
1254
+ 208,
1255
+ 202
1256
+ ],
1257
+ "precisions": [
1258
+ 0.6909090909090909,
1259
+ 0.4485981308411215,
1260
+ 0.28365384615384615,
1261
+ 0.19306930693069307
1262
+ ],
1263
+ "bp": 1.0,
1264
+ "sys_len": 220,
1265
+ "ref_len": 208,
1266
+ "sacrebleu": 0.3609483578130376,
1267
+ "score": 0.3609483578130376,
1268
+ "score_name": "sacrebleu",
1269
+ "score_ci_low": 0.2927054521124969,
1270
+ "score_ci_high": 0.3901296105157999,
1271
+ "sacrebleu_ci_low": 0.2927054521124969,
1272
+ "sacrebleu_ci_high": 0.3901296105157999
1273
+ },
1274
+ "score": 0.37792662149403083,
1275
+ "score_name": "subsets_mean",
1276
+ "num_of_instances": 90
1277
+ },
1278
+ "score": 0.5385208337043375,
1279
+ "score_name": "subsets_mean",
1280
+ "num_of_instances": 1537
1281
+ }
1282
+ }