lewtun's picture
lewtun HF staff
Upload eval_results/HuggingFaceH4/zephyr-7b-beta-ift/v0.2/bbh/results_2024-03-18T20-33-46.216888.json with huggingface_hub
ba09d8b verified
raw
history blame
33.5 kB
{
"config_general": {
"lighteval_sha": "?",
"num_fewshot_seeds": 1,
"override_batch_size": 1,
"max_samples": null,
"job_id": "",
"start_time": 390765.741940286,
"end_time": 391011.411535347,
"total_evaluation_time_secondes": "245.66959506098647",
"model_name": "HuggingFaceH4/zephyr-7b-beta-ift",
"model_sha": "841a9223d05bc50686a493fbe441f0ef91b406b4",
"model_dtype": "torch.bfloat16",
"model_size": "13.99 GB",
"config": null
},
"results": {
"harness|bbh:causal_judgment|3": {
"em": 0.5026737967914439,
"em_stderr": 0.03666125454759919,
"qem": 0.5026737967914439,
"qem_stderr": 0.03666125454759919,
"pem": 0.5026737967914439,
"pem_stderr": 0.03666125454759919,
"pqem": 0.5026737967914439,
"pqem_stderr": 0.03666125454759919,
"perfect_em": 0.5026737967914439,
"perfect_em_stderr": 0.03666125454759919
},
"harness|bbh:date_understanding|3": {
"em": 0.456,
"em_stderr": 0.03156328506121339,
"qem": 0.456,
"qem_stderr": 0.03156328506121339,
"pem": 0.456,
"pem_stderr": 0.03156328506121339,
"pqem": 0.56,
"pqem_stderr": 0.031457244522235646,
"perfect_em": 0.456,
"perfect_em_stderr": 0.03156328506121339
},
"harness|bbh:disambiguation_qa|3": {
"em": 0.564,
"em_stderr": 0.03142556706028128,
"qem": 0.564,
"qem_stderr": 0.03142556706028128,
"pem": 0.572,
"pem_stderr": 0.03135596892377261,
"pqem": 0.696,
"pqem_stderr": 0.029150213374159677,
"perfect_em": 0.564,
"perfect_em_stderr": 0.03142556706028128
},
"harness|bbh:geometric_shapes|3": {
"em": 0.1,
"em_stderr": 0.019011727515734357,
"qem": 0.1,
"qem_stderr": 0.019011727515734357,
"pem": 0.2,
"pem_stderr": 0.025348970020979078,
"pqem": 0.2,
"pqem_stderr": 0.025348970020979078,
"perfect_em": 0.1,
"perfect_em_stderr": 0.019011727515734357
},
"harness|bbh:logical_deduction_five_objects|3": {
"em": 0.336,
"em_stderr": 0.029933259094191516,
"qem": 0.336,
"qem_stderr": 0.029933259094191516,
"pem": 0.336,
"pem_stderr": 0.029933259094191516,
"pqem": 0.456,
"pqem_stderr": 0.03156328506121339,
"perfect_em": 0.336,
"perfect_em_stderr": 0.029933259094191516
},
"harness|bbh:logical_deduction_seven_objects|3": {
"em": 0.256,
"em_stderr": 0.027657108718204915,
"qem": 0.256,
"qem_stderr": 0.027657108718204915,
"pem": 0.256,
"pem_stderr": 0.027657108718204915,
"pqem": 0.392,
"pqem_stderr": 0.030938207620401195,
"perfect_em": 0.256,
"perfect_em_stderr": 0.027657108718204915
},
"harness|bbh:logical_deduction_three_objects|3": {
"em": 0.436,
"em_stderr": 0.03142556706028128,
"qem": 0.436,
"qem_stderr": 0.03142556706028128,
"pem": 0.436,
"pem_stderr": 0.03142556706028128,
"pqem": 0.684,
"pqem_stderr": 0.029462657598578676,
"perfect_em": 0.436,
"perfect_em_stderr": 0.03142556706028128
},
"harness|bbh:movie_recommendation|3": {
"em": 0.642570281124498,
"em_stderr": 0.030431951782206115,
"qem": 0.642570281124498,
"qem_stderr": 0.030431951782206115,
"pem": 0.642570281124498,
"pem_stderr": 0.030431951782206115,
"pqem": 0.7469879518072289,
"pqem_stderr": 0.027605877680456924,
"perfect_em": 0.642570281124498,
"perfect_em_stderr": 0.030431951782206115
},
"harness|bbh:navigate|3": {
"em": 0.596,
"em_stderr": 0.031096688184825298,
"qem": 0.596,
"qem_stderr": 0.031096688184825298,
"pem": 0.596,
"pem_stderr": 0.031096688184825298,
"pqem": 0.596,
"pqem_stderr": 0.031096688184825298,
"perfect_em": 0.596,
"perfect_em_stderr": 0.031096688184825298
},
"harness|bbh:reasoning_about_colored_objects|3": {
"em": 0.092,
"em_stderr": 0.018316275379429644,
"qem": 0.092,
"qem_stderr": 0.018316275379429644,
"pem": 0.304,
"pem_stderr": 0.029150213374159673,
"pqem": 0.472,
"pqem_stderr": 0.031636489531544396,
"perfect_em": 0.092,
"perfect_em_stderr": 0.018316275379429644
},
"harness|bbh:ruin_names|3": {
"em": 0.3629032258064516,
"em_stderr": 0.030594942459036583,
"qem": 0.3629032258064516,
"qem_stderr": 0.030594942459036583,
"pem": 0.375,
"pem_stderr": 0.03080400363063401,
"pqem": 0.5483870967741935,
"pqem_stderr": 0.03166491365125692,
"perfect_em": 0.3629032258064516,
"perfect_em_stderr": 0.030594942459036583
},
"harness|bbh:salient_translation_error_detection|3": {
"em": 0.324,
"em_stderr": 0.029658294924545567,
"qem": 0.324,
"qem_stderr": 0.029658294924545567,
"pem": 0.324,
"pem_stderr": 0.029658294924545567,
"pqem": 0.448,
"pqem_stderr": 0.03151438761115355,
"perfect_em": 0.324,
"perfect_em_stderr": 0.029658294924545567
},
"harness|bbh:snarks|3": {
"em": 0.5674157303370787,
"em_stderr": 0.037239120377075136,
"qem": 0.5674157303370787,
"qem_stderr": 0.037239120377075136,
"pem": 0.5730337078651685,
"pem_stderr": 0.03717921762559316,
"pqem": 0.6348314606741573,
"pqem_stderr": 0.03619005678691266,
"perfect_em": 0.5674157303370787,
"perfect_em_stderr": 0.037239120377075136
},
"harness|bbh:sports_understanding|3": {
"em": 0.688,
"em_stderr": 0.029361067575219817,
"qem": 0.688,
"qem_stderr": 0.029361067575219817,
"pem": 0.768,
"pem_stderr": 0.026750070374865164,
"pqem": 0.768,
"pqem_stderr": 0.026750070374865164,
"perfect_em": 0.688,
"perfect_em_stderr": 0.029361067575219817
},
"harness|bbh:temporal_sequences|3": {
"em": 0.208,
"em_stderr": 0.025721398901416392,
"qem": 0.208,
"qem_stderr": 0.025721398901416392,
"pem": 0.208,
"pem_stderr": 0.025721398901416392,
"pqem": 0.468,
"pqem_stderr": 0.03162125257572551,
"perfect_em": 0.208,
"perfect_em_stderr": 0.025721398901416392
},
"harness|bbh:tracking_shuffled_objects_five_objects|3": {
"em": 0.196,
"em_stderr": 0.02515685731325595,
"qem": 0.196,
"qem_stderr": 0.02515685731325595,
"pem": 0.196,
"pem_stderr": 0.02515685731325595,
"pqem": 0.396,
"pqem_stderr": 0.030993197854577853,
"perfect_em": 0.196,
"perfect_em_stderr": 0.02515685731325595
},
"harness|bbh:tracking_shuffled_objects_seven_objects|3": {
"em": 0.132,
"em_stderr": 0.02145098082403812,
"qem": 0.132,
"qem_stderr": 0.02145098082403812,
"pem": 0.132,
"pem_stderr": 0.02145098082403812,
"pqem": 0.272,
"pqem_stderr": 0.02820008829631,
"perfect_em": 0.132,
"perfect_em_stderr": 0.02145098082403812
},
"harness|bbh:tracking_shuffled_objects_three_objects|3": {
"em": 0.308,
"em_stderr": 0.029256928606501864,
"qem": 0.308,
"qem_stderr": 0.029256928606501864,
"pem": 0.36,
"pem_stderr": 0.030418764025174978,
"pqem": 0.672,
"pqem_stderr": 0.02975239182447539,
"perfect_em": 0.308,
"perfect_em_stderr": 0.029256928606501864
},
"harness|bbh:_average|3": {
"em": 0.37597572411441504,
"em_stderr": 0.028664570854725364,
"qem": 0.37597572411441504,
"qem_stderr": 0.028664570854725364,
"pem": 0.4020709880989506,
"pem_stderr": 0.02954243635483091,
"pqem": 0.5284933503359459,
"pqem_stderr": 0.030644847062070588,
"perfect_em": 0.37597572411441504,
"perfect_em_stderr": 0.028664570854725364
}
},
"versions": {
"harness|bbh:causal_judgment|3": 0,
"harness|bbh:date_understanding|3": 0,
"harness|bbh:disambiguation_qa|3": 0,
"harness|bbh:geometric_shapes|3": 0,
"harness|bbh:logical_deduction_five_objects|3": 0,
"harness|bbh:logical_deduction_seven_objects|3": 0,
"harness|bbh:logical_deduction_three_objects|3": 0,
"harness|bbh:movie_recommendation|3": 0,
"harness|bbh:navigate|3": 0,
"harness|bbh:reasoning_about_colored_objects|3": 0,
"harness|bbh:ruin_names|3": 0,
"harness|bbh:salient_translation_error_detection|3": 0,
"harness|bbh:snarks|3": 0,
"harness|bbh:sports_understanding|3": 0,
"harness|bbh:temporal_sequences|3": 0,
"harness|bbh:tracking_shuffled_objects_five_objects|3": 0,
"harness|bbh:tracking_shuffled_objects_seven_objects|3": 0,
"harness|bbh:tracking_shuffled_objects_three_objects|3": 0
},
"config_tasks": {
"harness|bbh:causal_judgment": {
"name": "bbh:causal_judgment",
"prompt_function": "bbh_causal_judgment",
"hf_repo": "lukaemon/bbh",
"hf_subset": "causal_judgement",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 187,
"effective_num_docs": 187,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:date_understanding": {
"name": "bbh:date_understanding",
"prompt_function": "bbh_date_understanding",
"hf_repo": "lukaemon/bbh",
"hf_subset": "date_understanding",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:disambiguation_qa": {
"name": "bbh:disambiguation_qa",
"prompt_function": "bbh_disambiguation_qa",
"hf_repo": "lukaemon/bbh",
"hf_subset": "disambiguation_qa",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:geometric_shapes": {
"name": "bbh:geometric_shapes",
"prompt_function": "bbh_geometric_shapes",
"hf_repo": "lukaemon/bbh",
"hf_subset": "geometric_shapes",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:logical_deduction_five_objects": {
"name": "bbh:logical_deduction_five_objects",
"prompt_function": "bbh_logical_deduction_five_objects",
"hf_repo": "lukaemon/bbh",
"hf_subset": "logical_deduction_five_objects",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:logical_deduction_seven_objects": {
"name": "bbh:logical_deduction_seven_objects",
"prompt_function": "bbh_logical_deduction_seven_objects",
"hf_repo": "lukaemon/bbh",
"hf_subset": "logical_deduction_seven_objects",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:logical_deduction_three_objects": {
"name": "bbh:logical_deduction_three_objects",
"prompt_function": "bbh_logical_deduction_three_objects",
"hf_repo": "lukaemon/bbh",
"hf_subset": "logical_deduction_three_objects",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:movie_recommendation": {
"name": "bbh:movie_recommendation",
"prompt_function": "bbh_movie_recommendation",
"hf_repo": "lukaemon/bbh",
"hf_subset": "movie_recommendation",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 249,
"effective_num_docs": 249,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:navigate": {
"name": "bbh:navigate",
"prompt_function": "bbh_navigate",
"hf_repo": "lukaemon/bbh",
"hf_subset": "navigate",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:reasoning_about_colored_objects": {
"name": "bbh:reasoning_about_colored_objects",
"prompt_function": "bbh_reasoning_about_colored_objects",
"hf_repo": "lukaemon/bbh",
"hf_subset": "reasoning_about_colored_objects",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:ruin_names": {
"name": "bbh:ruin_names",
"prompt_function": "bbh_ruin_names",
"hf_repo": "lukaemon/bbh",
"hf_subset": "ruin_names",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 248,
"effective_num_docs": 248,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:salient_translation_error_detection": {
"name": "bbh:salient_translation_error_detection",
"prompt_function": "bbh_salient_translation_error_detection",
"hf_repo": "lukaemon/bbh",
"hf_subset": "salient_translation_error_detection",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:snarks": {
"name": "bbh:snarks",
"prompt_function": "bbh_snarks",
"hf_repo": "lukaemon/bbh",
"hf_subset": "snarks",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 178,
"effective_num_docs": 178,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:sports_understanding": {
"name": "bbh:sports_understanding",
"prompt_function": "bbh_sports_understanding",
"hf_repo": "lukaemon/bbh",
"hf_subset": "sports_understanding",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:temporal_sequences": {
"name": "bbh:temporal_sequences",
"prompt_function": "bbh_temporal_sequences",
"hf_repo": "lukaemon/bbh",
"hf_subset": "temporal_sequences",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:tracking_shuffled_objects_five_objects": {
"name": "bbh:tracking_shuffled_objects_five_objects",
"prompt_function": "bbh_tracking_shuffled_objects_five_objects",
"hf_repo": "lukaemon/bbh",
"hf_subset": "tracking_shuffled_objects_five_objects",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:tracking_shuffled_objects_seven_objects": {
"name": "bbh:tracking_shuffled_objects_seven_objects",
"prompt_function": "bbh_tracking_shuffled_objects_seven_objects",
"hf_repo": "lukaemon/bbh",
"hf_subset": "tracking_shuffled_objects_seven_objects",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:tracking_shuffled_objects_three_objects": {
"name": "bbh:tracking_shuffled_objects_three_objects",
"prompt_function": "bbh_tracking_shuffled_objects_three_objects",
"hf_repo": "lukaemon/bbh",
"hf_subset": "tracking_shuffled_objects_three_objects",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
}
},
"summary_tasks": {
"harness|bbh:causal_judgment|3": {
"hashes": {
"hash_examples": "63218f5ae055ab2b",
"hash_full_prompts": "fa8168f39a475fb0",
"hash_input_tokens": "787f75e06fd43c0d",
"hash_cont_tokens": "d38fce6b83cb6cf8"
},
"truncated": 187,
"non_truncated": 0,
"padded": 0,
"non_padded": 187,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:date_understanding|3": {
"hashes": {
"hash_examples": "f145c7a06def3c8e",
"hash_full_prompts": "2cceeea606638d49",
"hash_input_tokens": "10c13d6fb8af7c22",
"hash_cont_tokens": "afe1a458d01d6b49"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:disambiguation_qa|3": {
"hashes": {
"hash_examples": "19677fd1773f7eb9",
"hash_full_prompts": "d8f1ba70c22ae578",
"hash_input_tokens": "c21a88707f480cab",
"hash_cont_tokens": "975af0d0edb5e548"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:geometric_shapes|3": {
"hashes": {
"hash_examples": "76c7b11a13cc72a9",
"hash_full_prompts": "52a60ed1d0113b8b",
"hash_input_tokens": "10e113b2cf3fa584",
"hash_cont_tokens": "d7bd15f16aa3a69e"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:logical_deduction_five_objects|3": {
"hashes": {
"hash_examples": "0e958c856332a745",
"hash_full_prompts": "253aa9791c941909",
"hash_input_tokens": "0bc166cab0aed76a",
"hash_cont_tokens": "33afca6b1b153349"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:logical_deduction_seven_objects|3": {
"hashes": {
"hash_examples": "ab9de25a5eb40d09",
"hash_full_prompts": "aa6117f601cd268e",
"hash_input_tokens": "ab99c78b48e3a0bb",
"hash_cont_tokens": "cb0a829037414bc4"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:logical_deduction_three_objects|3": {
"hashes": {
"hash_examples": "3c6bf52517714218",
"hash_full_prompts": "1892b050bc7848a4",
"hash_input_tokens": "a720b56aa7c52551",
"hash_cont_tokens": "df1c7a934ff4d4fd"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:movie_recommendation|3": {
"hashes": {
"hash_examples": "2d9dc4975935d31a",
"hash_full_prompts": "8e00606ed3407167",
"hash_input_tokens": "c825ab1c99245a17",
"hash_cont_tokens": "8993eba556c19cb1"
},
"truncated": 249,
"non_truncated": 0,
"padded": 0,
"non_padded": 249,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:navigate|3": {
"hashes": {
"hash_examples": "ba91dcdb9a064255",
"hash_full_prompts": "8d50c5baf1df7aef",
"hash_input_tokens": "f234e6b28ea1fa49",
"hash_cont_tokens": "525142c03c67fa52"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:reasoning_about_colored_objects|3": {
"hashes": {
"hash_examples": "a6ba328c4c3385d2",
"hash_full_prompts": "3d2441a21c12a960",
"hash_input_tokens": "f3b577892955aa84",
"hash_cont_tokens": "70ffb1d11ef7cb70"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:ruin_names|3": {
"hashes": {
"hash_examples": "2ef28d5f2d4fdd25",
"hash_full_prompts": "ba95caa786f313b1",
"hash_input_tokens": "9954b30d4205604a",
"hash_cont_tokens": "117e9fc172132861"
},
"truncated": 248,
"non_truncated": 0,
"padded": 0,
"non_padded": 248,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:salient_translation_error_detection|3": {
"hashes": {
"hash_examples": "c13f25ec8ffed496",
"hash_full_prompts": "a8512d174e1cab8f",
"hash_input_tokens": "3e738df24b7eddf8",
"hash_cont_tokens": "78a8d7de4e56c3f6"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:snarks|3": {
"hashes": {
"hash_examples": "5f6db7bff7f6f22e",
"hash_full_prompts": "ff91d81466b9041f",
"hash_input_tokens": "21388b09e13d0208",
"hash_cont_tokens": "25008d3d52e836a9"
},
"truncated": 178,
"non_truncated": 0,
"padded": 0,
"non_padded": 178,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:sports_understanding|3": {
"hashes": {
"hash_examples": "042afbe5d9c1f02d",
"hash_full_prompts": "a59324d9eb37e0f5",
"hash_input_tokens": "0ad41bb8d2290a5b",
"hash_cont_tokens": "481132373d21794f"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:temporal_sequences|3": {
"hashes": {
"hash_examples": "803a05f352eb6afc",
"hash_full_prompts": "1b3971192bf481e7",
"hash_input_tokens": "3051b60940ccceab",
"hash_cont_tokens": "bc6468999bd8da8a"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:tracking_shuffled_objects_five_objects|3": {
"hashes": {
"hash_examples": "2bbac6db7ab0d527",
"hash_full_prompts": "7ef4567d2fcf5094",
"hash_input_tokens": "b841310ee5531238",
"hash_cont_tokens": "614b8cc82e5424f3"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:tracking_shuffled_objects_seven_objects|3": {
"hashes": {
"hash_examples": "845caf093ac2b58c",
"hash_full_prompts": "196a0f8712857624",
"hash_input_tokens": "3e738df24b7eddf8",
"hash_cont_tokens": "08ee1d03a1ca9cfa"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:tracking_shuffled_objects_three_objects|3": {
"hashes": {
"hash_examples": "9004f14d5a32b9a8",
"hash_full_prompts": "592a03f0518f17b6",
"hash_input_tokens": "19e0ef1dd5ae9d33",
"hash_cont_tokens": "ff481f64829c6e9d"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
}
},
"summary_general": {
"hashes": {
"hash_examples": "4ff1e3dc5703575d",
"hash_full_prompts": "0d80ce968d89d4ef",
"hash_input_tokens": "72bda1e7aeb34786",
"hash_cont_tokens": "5e3e2b1aa7251ed7"
},
"truncated": 4362,
"non_truncated": 0,
"padded": 0,
"non_padded": 4362,
"num_truncated_few_shots": 0
}
}