lewtun's picture
lewtun HF staff
Upload eval_results/teknium/OpenHermes-2.5-Mistral-7B/main/bbh/results_2024-03-18T19-49-31.908303.json with huggingface_hub
40f3905 verified
raw
history blame
33.5 kB
{
"config_general": {
"lighteval_sha": "?",
"num_fewshot_seeds": 1,
"override_batch_size": 1,
"max_samples": null,
"job_id": "",
"start_time": 1048188.318193615,
"end_time": 1048420.852422956,
"total_evaluation_time_secondes": "232.53422934107948",
"model_name": "teknium/OpenHermes-2.5-Mistral-7B",
"model_sha": "24c0bea14d53e6f67f1fbe2eca5bfe7cae389b33",
"model_dtype": "torch.bfloat16",
"model_size": "13.99 GB",
"config": null
},
"results": {
"harness|bbh:causal_judgment|3": {
"em": 0.5935828877005348,
"em_stderr": 0.036013904358574426,
"qem": 0.5935828877005348,
"qem_stderr": 0.036013904358574426,
"pem": 0.5935828877005348,
"pem_stderr": 0.036013904358574426,
"pqem": 0.5935828877005348,
"pqem_stderr": 0.036013904358574426,
"perfect_em": 0.5935828877005348,
"perfect_em_stderr": 0.036013904358574426
},
"harness|bbh:date_understanding|3": {
"em": 0.428,
"em_stderr": 0.031355968923772626,
"qem": 0.428,
"qem_stderr": 0.031355968923772626,
"pem": 0.428,
"pem_stderr": 0.031355968923772626,
"pqem": 0.528,
"pqem_stderr": 0.03163648953154439,
"perfect_em": 0.428,
"perfect_em_stderr": 0.031355968923772626
},
"harness|bbh:disambiguation_qa|3": {
"em": 0.664,
"em_stderr": 0.029933259094191516,
"qem": 0.664,
"qem_stderr": 0.029933259094191516,
"pem": 0.664,
"pem_stderr": 0.029933259094191516,
"pqem": 0.736,
"pqem_stderr": 0.027934518957690908,
"perfect_em": 0.664,
"perfect_em_stderr": 0.029933259094191516
},
"harness|bbh:geometric_shapes|3": {
"em": 0.252,
"em_stderr": 0.027513851933031363,
"qem": 0.252,
"qem_stderr": 0.027513851933031363,
"pem": 0.252,
"pem_stderr": 0.027513851933031363,
"pqem": 0.252,
"pqem_stderr": 0.027513851933031363,
"perfect_em": 0.252,
"perfect_em_stderr": 0.027513851933031363
},
"harness|bbh:logical_deduction_five_objects|3": {
"em": 0.48,
"em_stderr": 0.03166085340849519,
"qem": 0.48,
"qem_stderr": 0.03166085340849519,
"pem": 0.48,
"pem_stderr": 0.03166085340849519,
"pqem": 0.58,
"pqem_stderr": 0.03127799950463661,
"perfect_em": 0.48,
"perfect_em_stderr": 0.03166085340849519
},
"harness|bbh:logical_deduction_seven_objects|3": {
"em": 0.408,
"em_stderr": 0.031145209846548488,
"qem": 0.408,
"qem_stderr": 0.031145209846548488,
"pem": 0.408,
"pem_stderr": 0.031145209846548488,
"pqem": 0.516,
"pqem_stderr": 0.03166998503010741,
"perfect_em": 0.408,
"perfect_em_stderr": 0.031145209846548488
},
"harness|bbh:logical_deduction_three_objects|3": {
"em": 0.648,
"em_stderr": 0.030266288057359942,
"qem": 0.648,
"qem_stderr": 0.030266288057359942,
"pem": 0.648,
"pem_stderr": 0.030266288057359942,
"pqem": 0.84,
"pqem_stderr": 0.02323271478206066,
"perfect_em": 0.648,
"perfect_em_stderr": 0.030266288057359942
},
"harness|bbh:movie_recommendation|3": {
"em": 0.6867469879518072,
"em_stderr": 0.02945236466291991,
"qem": 0.6867469879518072,
"qem_stderr": 0.02945236466291991,
"pem": 0.6867469879518072,
"pem_stderr": 0.02945236466291991,
"pqem": 0.7349397590361446,
"pqem_stderr": 0.028026723251674716,
"perfect_em": 0.6867469879518072,
"perfect_em_stderr": 0.02945236466291991
},
"harness|bbh:navigate|3": {
"em": 0.572,
"em_stderr": 0.031355968923772626,
"qem": 0.572,
"qem_stderr": 0.031355968923772626,
"pem": 0.572,
"pem_stderr": 0.031355968923772626,
"pqem": 0.572,
"pqem_stderr": 0.031355968923772626,
"perfect_em": 0.572,
"perfect_em_stderr": 0.031355968923772626
},
"harness|bbh:reasoning_about_colored_objects|3": {
"em": 0.332,
"em_stderr": 0.02984403904746591,
"qem": 0.332,
"qem_stderr": 0.02984403904746591,
"pem": 0.428,
"pem_stderr": 0.03135596892377261,
"pqem": 0.528,
"pqem_stderr": 0.031636489531544396,
"perfect_em": 0.332,
"perfect_em_stderr": 0.02984403904746591
},
"harness|bbh:ruin_names|3": {
"em": 0.5403225806451613,
"em_stderr": 0.031710615183950554,
"qem": 0.5403225806451613,
"qem_stderr": 0.031710615183950554,
"pem": 0.5403225806451613,
"pem_stderr": 0.031710615183950554,
"pqem": 0.6330645161290323,
"pqem_stderr": 0.030666934450850083,
"perfect_em": 0.5403225806451613,
"perfect_em_stderr": 0.031710615183950554
},
"harness|bbh:salient_translation_error_detection|3": {
"em": 0.344,
"em_stderr": 0.03010450339231639,
"qem": 0.344,
"qem_stderr": 0.03010450339231639,
"pem": 0.344,
"pem_stderr": 0.03010450339231639,
"pqem": 0.484,
"pqem_stderr": 0.031669985030107414,
"perfect_em": 0.344,
"perfect_em_stderr": 0.03010450339231639
},
"harness|bbh:snarks|3": {
"em": 0.7696629213483146,
"em_stderr": 0.03164794946543343,
"qem": 0.7696629213483146,
"qem_stderr": 0.03164794946543343,
"pem": 0.7696629213483146,
"pem_stderr": 0.03164794946543343,
"pqem": 0.8426966292134831,
"pqem_stderr": 0.027366421373452483,
"perfect_em": 0.7696629213483146,
"perfect_em_stderr": 0.03164794946543343
},
"harness|bbh:sports_understanding|3": {
"em": 0.824,
"em_stderr": 0.024133497525457112,
"qem": 0.824,
"qem_stderr": 0.024133497525457112,
"pem": 0.824,
"pem_stderr": 0.024133497525457112,
"pqem": 0.824,
"pqem_stderr": 0.024133497525457112,
"perfect_em": 0.824,
"perfect_em_stderr": 0.024133497525457112
},
"harness|bbh:temporal_sequences|3": {
"em": 0.296,
"em_stderr": 0.028928939388379635,
"qem": 0.296,
"qem_stderr": 0.028928939388379635,
"pem": 0.296,
"pem_stderr": 0.028928939388379635,
"pqem": 0.472,
"pqem_stderr": 0.0316364895315444,
"perfect_em": 0.296,
"perfect_em_stderr": 0.028928939388379635
},
"harness|bbh:tracking_shuffled_objects_five_objects|3": {
"em": 0.2,
"em_stderr": 0.02534897002097908,
"qem": 0.2,
"qem_stderr": 0.02534897002097908,
"pem": 0.2,
"pem_stderr": 0.02534897002097908,
"pqem": 0.388,
"pqem_stderr": 0.030881038748993908,
"perfect_em": 0.2,
"perfect_em_stderr": 0.02534897002097908
},
"harness|bbh:tracking_shuffled_objects_seven_objects|3": {
"em": 0.12,
"em_stderr": 0.02059360059683994,
"qem": 0.12,
"qem_stderr": 0.02059360059683994,
"pem": 0.12,
"pem_stderr": 0.02059360059683994,
"pqem": 0.252,
"pqem_stderr": 0.02751385193303136,
"perfect_em": 0.12,
"perfect_em_stderr": 0.02059360059683994
},
"harness|bbh:tracking_shuffled_objects_three_objects|3": {
"em": 0.388,
"em_stderr": 0.03088103874899391,
"qem": 0.388,
"qem_stderr": 0.03088103874899391,
"pem": 0.388,
"pem_stderr": 0.03088103874899391,
"pqem": 0.684,
"pqem_stderr": 0.029462657598578676,
"perfect_em": 0.388,
"perfect_em_stderr": 0.03088103874899391
},
"harness|bbh:_average|3": {
"em": 0.47479529875810095,
"em_stderr": 0.02954949014324901,
"qem": 0.47479529875810095,
"qem_stderr": 0.02954949014324901,
"pem": 0.48012863209143425,
"pem_stderr": 0.02963348624748827,
"pqem": 0.5811268773377329,
"pqem_stderr": 0.029646084555369608,
"perfect_em": 0.47479529875810095,
"perfect_em_stderr": 0.02954949014324901
}
},
"versions": {
"harness|bbh:causal_judgment|3": 0,
"harness|bbh:date_understanding|3": 0,
"harness|bbh:disambiguation_qa|3": 0,
"harness|bbh:geometric_shapes|3": 0,
"harness|bbh:logical_deduction_five_objects|3": 0,
"harness|bbh:logical_deduction_seven_objects|3": 0,
"harness|bbh:logical_deduction_three_objects|3": 0,
"harness|bbh:movie_recommendation|3": 0,
"harness|bbh:navigate|3": 0,
"harness|bbh:reasoning_about_colored_objects|3": 0,
"harness|bbh:ruin_names|3": 0,
"harness|bbh:salient_translation_error_detection|3": 0,
"harness|bbh:snarks|3": 0,
"harness|bbh:sports_understanding|3": 0,
"harness|bbh:temporal_sequences|3": 0,
"harness|bbh:tracking_shuffled_objects_five_objects|3": 0,
"harness|bbh:tracking_shuffled_objects_seven_objects|3": 0,
"harness|bbh:tracking_shuffled_objects_three_objects|3": 0
},
"config_tasks": {
"harness|bbh:causal_judgment": {
"name": "bbh:causal_judgment",
"prompt_function": "bbh_causal_judgment",
"hf_repo": "lukaemon/bbh",
"hf_subset": "causal_judgement",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 187,
"effective_num_docs": 187,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:date_understanding": {
"name": "bbh:date_understanding",
"prompt_function": "bbh_date_understanding",
"hf_repo": "lukaemon/bbh",
"hf_subset": "date_understanding",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:disambiguation_qa": {
"name": "bbh:disambiguation_qa",
"prompt_function": "bbh_disambiguation_qa",
"hf_repo": "lukaemon/bbh",
"hf_subset": "disambiguation_qa",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:geometric_shapes": {
"name": "bbh:geometric_shapes",
"prompt_function": "bbh_geometric_shapes",
"hf_repo": "lukaemon/bbh",
"hf_subset": "geometric_shapes",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:logical_deduction_five_objects": {
"name": "bbh:logical_deduction_five_objects",
"prompt_function": "bbh_logical_deduction_five_objects",
"hf_repo": "lukaemon/bbh",
"hf_subset": "logical_deduction_five_objects",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:logical_deduction_seven_objects": {
"name": "bbh:logical_deduction_seven_objects",
"prompt_function": "bbh_logical_deduction_seven_objects",
"hf_repo": "lukaemon/bbh",
"hf_subset": "logical_deduction_seven_objects",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:logical_deduction_three_objects": {
"name": "bbh:logical_deduction_three_objects",
"prompt_function": "bbh_logical_deduction_three_objects",
"hf_repo": "lukaemon/bbh",
"hf_subset": "logical_deduction_three_objects",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:movie_recommendation": {
"name": "bbh:movie_recommendation",
"prompt_function": "bbh_movie_recommendation",
"hf_repo": "lukaemon/bbh",
"hf_subset": "movie_recommendation",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 249,
"effective_num_docs": 249,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:navigate": {
"name": "bbh:navigate",
"prompt_function": "bbh_navigate",
"hf_repo": "lukaemon/bbh",
"hf_subset": "navigate",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:reasoning_about_colored_objects": {
"name": "bbh:reasoning_about_colored_objects",
"prompt_function": "bbh_reasoning_about_colored_objects",
"hf_repo": "lukaemon/bbh",
"hf_subset": "reasoning_about_colored_objects",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:ruin_names": {
"name": "bbh:ruin_names",
"prompt_function": "bbh_ruin_names",
"hf_repo": "lukaemon/bbh",
"hf_subset": "ruin_names",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 248,
"effective_num_docs": 248,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:salient_translation_error_detection": {
"name": "bbh:salient_translation_error_detection",
"prompt_function": "bbh_salient_translation_error_detection",
"hf_repo": "lukaemon/bbh",
"hf_subset": "salient_translation_error_detection",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:snarks": {
"name": "bbh:snarks",
"prompt_function": "bbh_snarks",
"hf_repo": "lukaemon/bbh",
"hf_subset": "snarks",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 178,
"effective_num_docs": 178,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:sports_understanding": {
"name": "bbh:sports_understanding",
"prompt_function": "bbh_sports_understanding",
"hf_repo": "lukaemon/bbh",
"hf_subset": "sports_understanding",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:temporal_sequences": {
"name": "bbh:temporal_sequences",
"prompt_function": "bbh_temporal_sequences",
"hf_repo": "lukaemon/bbh",
"hf_subset": "temporal_sequences",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:tracking_shuffled_objects_five_objects": {
"name": "bbh:tracking_shuffled_objects_five_objects",
"prompt_function": "bbh_tracking_shuffled_objects_five_objects",
"hf_repo": "lukaemon/bbh",
"hf_subset": "tracking_shuffled_objects_five_objects",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:tracking_shuffled_objects_seven_objects": {
"name": "bbh:tracking_shuffled_objects_seven_objects",
"prompt_function": "bbh_tracking_shuffled_objects_seven_objects",
"hf_repo": "lukaemon/bbh",
"hf_subset": "tracking_shuffled_objects_seven_objects",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"harness|bbh:tracking_shuffled_objects_three_objects": {
"name": "bbh:tracking_shuffled_objects_three_objects",
"prompt_function": "bbh_tracking_shuffled_objects_three_objects",
"hf_repo": "lukaemon/bbh",
"hf_subset": "tracking_shuffled_objects_three_objects",
"metric": [
"exact_match",
"quasi_exact_match",
"prefix_exact_match",
"prefix_quasi_exact_match",
"perfect_exact_match"
],
"hf_avail_splits": [
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": 20,
"stop_sequence": [
"</s>",
"Q:",
"\n\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"harness"
],
"original_num_docs": 250,
"effective_num_docs": 250,
"trust_dataset": true,
"must_remove_duplicate_docs": null
}
},
"summary_tasks": {
"harness|bbh:causal_judgment|3": {
"hashes": {
"hash_examples": "63218f5ae055ab2b",
"hash_full_prompts": "7303fa1d0fe0b29a",
"hash_input_tokens": "94e6ca97dc7a8d65",
"hash_cont_tokens": "c3d0b9e4e0ee81b9"
},
"truncated": 187,
"non_truncated": 0,
"padded": 0,
"non_padded": 187,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:date_understanding|3": {
"hashes": {
"hash_examples": "f145c7a06def3c8e",
"hash_full_prompts": "69e60d10afa5a6f1",
"hash_input_tokens": "56c1b1dfb318cc75",
"hash_cont_tokens": "13813e073a67c71c"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:disambiguation_qa|3": {
"hashes": {
"hash_examples": "19677fd1773f7eb9",
"hash_full_prompts": "ae0a8fd428f9aee3",
"hash_input_tokens": "bc3e442621b75177",
"hash_cont_tokens": "cd37ffdb5b2c05eb"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:geometric_shapes|3": {
"hashes": {
"hash_examples": "76c7b11a13cc72a9",
"hash_full_prompts": "76633257f67207f9",
"hash_input_tokens": "18d576df2960751d",
"hash_cont_tokens": "665887996d172717"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:logical_deduction_five_objects|3": {
"hashes": {
"hash_examples": "0e958c856332a745",
"hash_full_prompts": "3c96645848786efd",
"hash_input_tokens": "36a60b866a1bf813",
"hash_cont_tokens": "314ac0615c6ba8b2"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:logical_deduction_seven_objects|3": {
"hashes": {
"hash_examples": "ab9de25a5eb40d09",
"hash_full_prompts": "185c5851c101ee66",
"hash_input_tokens": "c1e2e1d71455bb49",
"hash_cont_tokens": "8be80ca215d5b2a3"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:logical_deduction_three_objects|3": {
"hashes": {
"hash_examples": "3c6bf52517714218",
"hash_full_prompts": "8ba2d94357e589d0",
"hash_input_tokens": "70f1b3c78b924815",
"hash_cont_tokens": "9066a1f8bf0c0fc5"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:movie_recommendation|3": {
"hashes": {
"hash_examples": "2d9dc4975935d31a",
"hash_full_prompts": "a411e216d0f5f626",
"hash_input_tokens": "d671ce3b88ee45cd",
"hash_cont_tokens": "4ddad062def5e8ef"
},
"truncated": 249,
"non_truncated": 0,
"padded": 0,
"non_padded": 249,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:navigate|3": {
"hashes": {
"hash_examples": "ba91dcdb9a064255",
"hash_full_prompts": "ebb3084ecc78a46a",
"hash_input_tokens": "51743c1fef4a5482",
"hash_cont_tokens": "11beb4a48b985d44"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:reasoning_about_colored_objects|3": {
"hashes": {
"hash_examples": "a6ba328c4c3385d2",
"hash_full_prompts": "38328d016a4ebef3",
"hash_input_tokens": "6897c18acd616cb9",
"hash_cont_tokens": "675a7012cb001b34"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:ruin_names|3": {
"hashes": {
"hash_examples": "2ef28d5f2d4fdd25",
"hash_full_prompts": "9c7d0493c37182d6",
"hash_input_tokens": "4d618e950c8d013d",
"hash_cont_tokens": "8ede606d015dca4f"
},
"truncated": 248,
"non_truncated": 0,
"padded": 0,
"non_padded": 248,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:salient_translation_error_detection|3": {
"hashes": {
"hash_examples": "c13f25ec8ffed496",
"hash_full_prompts": "edccd4061b168b78",
"hash_input_tokens": "fcdd25281b1eba05",
"hash_cont_tokens": "cbf517be41f28f3d"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:snarks|3": {
"hashes": {
"hash_examples": "5f6db7bff7f6f22e",
"hash_full_prompts": "31cafd95ab850a44",
"hash_input_tokens": "16886c991ce348c1",
"hash_cont_tokens": "df20170bf621a2f3"
},
"truncated": 178,
"non_truncated": 0,
"padded": 0,
"non_padded": 178,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:sports_understanding|3": {
"hashes": {
"hash_examples": "042afbe5d9c1f02d",
"hash_full_prompts": "3d46581e9bbec2d0",
"hash_input_tokens": "117a2d8c0e6cb894",
"hash_cont_tokens": "ec37569600892a26"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:temporal_sequences|3": {
"hashes": {
"hash_examples": "803a05f352eb6afc",
"hash_full_prompts": "4a54db144a5dd222",
"hash_input_tokens": "70740a88f84e4a13",
"hash_cont_tokens": "41d96a59ba9c957b"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:tracking_shuffled_objects_five_objects|3": {
"hashes": {
"hash_examples": "2bbac6db7ab0d527",
"hash_full_prompts": "e3079106787cc311",
"hash_input_tokens": "c8d7203b8c369cb8",
"hash_cont_tokens": "fd6b5bd0d85dd2e6"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:tracking_shuffled_objects_seven_objects|3": {
"hashes": {
"hash_examples": "845caf093ac2b58c",
"hash_full_prompts": "6364e5b860590ec8",
"hash_input_tokens": "beb0b08cec3d048f",
"hash_cont_tokens": "2707546adcfc144d"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
},
"harness|bbh:tracking_shuffled_objects_three_objects|3": {
"hashes": {
"hash_examples": "9004f14d5a32b9a8",
"hash_full_prompts": "01aef56c4d1fe9fe",
"hash_input_tokens": "9642e09abf045647",
"hash_cont_tokens": "8468cb37dd7c4590"
},
"truncated": 250,
"non_truncated": 0,
"padded": 0,
"non_padded": 250,
"effective_few_shots": 3.0,
"num_truncated_few_shots": 0
}
},
"summary_general": {
"hashes": {
"hash_examples": "4ff1e3dc5703575d",
"hash_full_prompts": "1cbeab0a00117cb8",
"hash_input_tokens": "2a35e6d8f7c2fc79",
"hash_cont_tokens": "bd9a6fe0e1a8deb1"
},
"truncated": 4362,
"non_truncated": 0,
"padded": 0,
"non_padded": 4362,
"num_truncated_few_shots": 0
}
}