diff --git a/.gitattributes b/.gitattributes index f3b6bf5ab21d1def5b56ce5302c726963acac9b3..2465788072a5918bb3df099106914345d25de22e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -69,3 +69,19 @@ eval_results_finetunes/core/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_da eval_results_finetunes/core/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text eval_results_finetunes/core/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text eval_results_finetunes/core/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text +eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text +eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text +eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text +eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text +eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text +eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text +eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text +eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text +eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text +eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text +eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text +eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text +eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text +eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text +eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text +eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text diff --git a/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..d96dc2e3c47c0b8f4ef5d13d6532f8ee6def8aeb --- /dev/null +++ b/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,268 @@ +{ + "eval_type_id": "absorption_first_letter", + "eval_config": { + "model_name": "gemma-2-2b", + "random_seed": 42, + "f1_jump_threshold": 0.03, + "max_k_value": 10, + "prompt_template": "{word} has the first letter:", + "prompt_token_pos": -6, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "k_sparse_probe_l1_decay": 0.01, + "k_sparse_probe_batch_size": 4096, + "k_sparse_probe_num_epochs": 50 + }, + "eval_id": "ef0c46fe-510f-4912-be90-57aeae2de794", + "datetime_epoch_millis": 1740152315652, + "eval_result_metrics": { + "mean": { + "mean_absorption_fraction_score": 0.5196529448711236, + "mean_full_absorption_score": 0.3042496024774553, + "mean_num_split_features": 2.8461538461538463, + "std_dev_absorption_fraction_score": 0.2663894021350886, + "std_dev_full_absorption_score": 0.17641031485178768, + "std_dev_num_split_features": 1.6417626550097355 + } + }, + "eval_result_details": [ + { + "first_letter": "a", + "mean_absorption_fraction": 0.6889635895777194, + "full_absorption_rate": 0.3417065390749601, + "num_full_absorption": 857, + "num_probe_true_positives": 2508, + "num_split_features": 4 + }, + { + "first_letter": "b", + "mean_absorption_fraction": 0.679161422720634, + "full_absorption_rate": 0.3430609597924773, + "num_full_absorption": 529, + "num_probe_true_positives": 1542, + "num_split_features": 6 + }, + { + "first_letter": "c", + "mean_absorption_fraction": 0.8155963696018853, + "full_absorption_rate": 0.48698752228163994, + "num_full_absorption": 1366, + "num_probe_true_positives": 2805, + "num_split_features": 3 + }, + { + "first_letter": "d", + "mean_absorption_fraction": 0.7004096658239145, + "full_absorption_rate": 0.41807228915662653, + "num_full_absorption": 694, + "num_probe_true_positives": 1660, + "num_split_features": 3 + }, + { + "first_letter": "e", + "mean_absorption_fraction": 0.5585561782067426, + "full_absorption_rate": 0.3681930693069307, + "num_full_absorption": 595, + "num_probe_true_positives": 1616, + "num_split_features": 4 + }, + { + "first_letter": "f", + "mean_absorption_fraction": 0.8006965508885888, + "full_absorption_rate": 0.5872374798061389, + "num_full_absorption": 727, + "num_probe_true_positives": 1238, + "num_split_features": 3 + }, + { + "first_letter": "g", + "mean_absorption_fraction": 0.5635756145458141, + "full_absorption_rate": 0.34323144104803494, + "num_full_absorption": 393, + "num_probe_true_positives": 1145, + "num_split_features": 3 + }, + { + "first_letter": "h", + "mean_absorption_fraction": 0.7999027114878418, + "full_absorption_rate": 0.45893719806763283, + "num_full_absorption": 475, + "num_probe_true_positives": 1035, + "num_split_features": 3 + }, + { + "first_letter": "i", + "mean_absorption_fraction": 0.6117770347749325, + "full_absorption_rate": 0.358974358974359, + "num_full_absorption": 588, + "num_probe_true_positives": 1638, + "num_split_features": 2 + }, + { + "first_letter": "j", + "mean_absorption_fraction": 0.20771700655851907, + "full_absorption_rate": 0.06310679611650485, + "num_full_absorption": 26, + "num_probe_true_positives": 412, + "num_split_features": 1 + }, + { + "first_letter": "k", + "mean_absorption_fraction": 0.4378382360943496, + "full_absorption_rate": 0.16592592592592592, + "num_full_absorption": 112, + "num_probe_true_positives": 675, + "num_split_features": 3 + }, + { + "first_letter": "l", + "mean_absorption_fraction": 0.6610036837302684, + "full_absorption_rate": 0.3856041131105398, + "num_full_absorption": 450, + "num_probe_true_positives": 1167, + "num_split_features": 6 + }, + { + "first_letter": "m", + "mean_absorption_fraction": 0.7708970748978563, + "full_absorption_rate": 0.5172981878088962, + "num_full_absorption": 942, + "num_probe_true_positives": 1821, + "num_split_features": 2 + }, + { + "first_letter": "n", + "mean_absorption_fraction": 0.7095649014624135, + "full_absorption_rate": 0.3765743073047859, + "num_full_absorption": 299, + "num_probe_true_positives": 794, + "num_split_features": 3 + }, + { + "first_letter": "o", + "mean_absorption_fraction": 0.6244493390182382, + "full_absorption_rate": 0.3964386129334583, + "num_full_absorption": 423, + "num_probe_true_positives": 1067, + "num_split_features": 4 + }, + { + "first_letter": "p", + "mean_absorption_fraction": 0.6332302852454678, + "full_absorption_rate": 0.38650306748466257, + "num_full_absorption": 882, + "num_probe_true_positives": 2282, + "num_split_features": 7 + }, + { + "first_letter": "q", + "mean_absorption_fraction": 0.009843952976727736, + "full_absorption_rate": 0.010526315789473684, + "num_full_absorption": 2, + "num_probe_true_positives": 190, + "num_split_features": 1 + }, + { + "first_letter": "r", + "mean_absorption_fraction": 0.7078808696018212, + "full_absorption_rate": 0.4720752498530276, + "num_full_absorption": 803, + "num_probe_true_positives": 1701, + "num_split_features": 3 + }, + { + "first_letter": "s", + "mean_absorption_fraction": 0.7539738349846717, + "full_absorption_rate": 0.4912718204488778, + "num_full_absorption": 1379, + "num_probe_true_positives": 2807, + "num_split_features": 3 + }, + { + "first_letter": "t", + "mean_absorption_fraction": 0.655318360480264, + "full_absorption_rate": 0.3734513274336283, + "num_full_absorption": 633, + "num_probe_true_positives": 1695, + "num_split_features": 3 + }, + { + "first_letter": "u", + "mean_absorption_fraction": 0.3289562491496899, + "full_absorption_rate": 0.1986754966887417, + "num_full_absorption": 150, + "num_probe_true_positives": 755, + "num_split_features": 2 + }, + { + "first_letter": "v", + "mean_absorption_fraction": 0.3259209126950176, + "full_absorption_rate": 0.22813688212927757, + "num_full_absorption": 180, + "num_probe_true_positives": 789, + "num_split_features": 1 + }, + { + "first_letter": "w", + "mean_absorption_fraction": 0.06180711593258981, + "full_absorption_rate": 0.03581267217630854, + "num_full_absorption": 26, + "num_probe_true_positives": 726, + "num_split_features": 1 + }, + { + "first_letter": "x", + "mean_absorption_fraction": 0.37442147942939097, + "full_absorption_rate": 0.08849557522123894, + "num_full_absorption": 10, + "num_probe_true_positives": 113, + "num_split_features": 1 + }, + { + "first_letter": "y", + "mean_absorption_fraction": 0.022340281665225267, + "full_absorption_rate": 0.005681818181818182, + "num_full_absorption": 1, + "num_probe_true_positives": 176, + "num_split_features": 1 + }, + { + "first_letter": "z", + "mean_absorption_fraction": 0.007173845098628674, + "full_absorption_rate": 0.00851063829787234, + "num_full_absorption": 2, + "num_probe_true_positives": 235, + "num_split_features": 1 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file diff --git a/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..1e4cbc3808e5f38e4f4daa9430167f3ee9533b97 --- /dev/null +++ b/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,268 @@ +{ + "eval_type_id": "absorption_first_letter", + "eval_config": { + "model_name": "gemma-2-2b", + "random_seed": 42, + "f1_jump_threshold": 0.03, + "max_k_value": 10, + "prompt_template": "{word} has the first letter:", + "prompt_token_pos": -6, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "k_sparse_probe_l1_decay": 0.01, + "k_sparse_probe_batch_size": 4096, + "k_sparse_probe_num_epochs": 50 + }, + "eval_id": "a794207a-cbc8-4c1d-8ea8-36a54549f1c6", + "datetime_epoch_millis": 1740150709168, + "eval_result_metrics": { + "mean": { + "mean_absorption_fraction_score": 0.0834770439283162, + "mean_full_absorption_score": 0.06013611847978803, + "mean_num_split_features": 1.1538461538461537, + "std_dev_absorption_fraction_score": 0.09241618138074081, + "std_dev_full_absorption_score": 0.08150335423500922, + "std_dev_num_split_features": 0.36794648440311994 + } + }, + "eval_result_details": [ + { + "first_letter": "a", + "mean_absorption_fraction": 0.057028034407490984, + "full_absorption_rate": 0.017543859649122806, + "num_full_absorption": 44, + "num_probe_true_positives": 2508, + "num_split_features": 1 + }, + { + "first_letter": "b", + "mean_absorption_fraction": 0.0017792171984287038, + "full_absorption_rate": 0.0025940337224383916, + "num_full_absorption": 4, + "num_probe_true_positives": 1542, + "num_split_features": 1 + }, + { + "first_letter": "c", + "mean_absorption_fraction": 0.25592667322599083, + "full_absorption_rate": 0.1857397504456328, + "num_full_absorption": 521, + "num_probe_true_positives": 2805, + "num_split_features": 1 + }, + { + "first_letter": "d", + "mean_absorption_fraction": 0.24011997118028888, + "full_absorption_rate": 0.12590361445783133, + "num_full_absorption": 209, + "num_probe_true_positives": 1660, + "num_split_features": 1 + }, + { + "first_letter": "e", + "mean_absorption_fraction": 0.1110917028199224, + "full_absorption_rate": 0.15346534653465346, + "num_full_absorption": 248, + "num_probe_true_positives": 1616, + "num_split_features": 1 + }, + { + "first_letter": "f", + "mean_absorption_fraction": 0.0545493485524353, + "full_absorption_rate": 0.02665589660743134, + "num_full_absorption": 33, + "num_probe_true_positives": 1238, + "num_split_features": 1 + }, + { + "first_letter": "g", + "mean_absorption_fraction": 0.015559784132844725, + "full_absorption_rate": 0.0026200873362445414, + "num_full_absorption": 3, + "num_probe_true_positives": 1145, + "num_split_features": 2 + }, + { + "first_letter": "h", + "mean_absorption_fraction": 0.020966493314900192, + "full_absorption_rate": 0.005797101449275362, + "num_full_absorption": 6, + "num_probe_true_positives": 1035, + "num_split_features": 1 + }, + { + "first_letter": "i", + "mean_absorption_fraction": 0.22232954346440803, + "full_absorption_rate": 0.23321123321123322, + "num_full_absorption": 382, + "num_probe_true_positives": 1638, + "num_split_features": 2 + }, + { + "first_letter": "j", + "mean_absorption_fraction": 0.0033853270927236977, + "full_absorption_rate": 0.0024271844660194173, + "num_full_absorption": 1, + "num_probe_true_positives": 412, + "num_split_features": 1 + }, + { + "first_letter": "k", + "mean_absorption_fraction": 0.00362522533780823, + "full_absorption_rate": 0.005925925925925926, + "num_full_absorption": 4, + "num_probe_true_positives": 675, + "num_split_features": 1 + }, + { + "first_letter": "l", + "mean_absorption_fraction": 0.12461768062488289, + "full_absorption_rate": 0.06512425021422451, + "num_full_absorption": 76, + "num_probe_true_positives": 1167, + "num_split_features": 1 + }, + { + "first_letter": "m", + "mean_absorption_fraction": 0.0051840957147463755, + "full_absorption_rate": 0.008237232289950576, + "num_full_absorption": 15, + "num_probe_true_positives": 1821, + "num_split_features": 1 + }, + { + "first_letter": "n", + "mean_absorption_fraction": 0.05486755467875741, + "full_absorption_rate": 0.021410579345088162, + "num_full_absorption": 17, + "num_probe_true_positives": 794, + "num_split_features": 1 + }, + { + "first_letter": "o", + "mean_absorption_fraction": 0.11608643361066023, + "full_absorption_rate": 0.06560449859418932, + "num_full_absorption": 70, + "num_probe_true_positives": 1067, + "num_split_features": 1 + }, + { + "first_letter": "p", + "mean_absorption_fraction": 0.286828957721577, + "full_absorption_rate": 0.28702892199824714, + "num_full_absorption": 655, + "num_probe_true_positives": 2282, + "num_split_features": 1 + }, + { + "first_letter": "q", + "mean_absorption_fraction": 0.0012196642214486685, + "full_absorption_rate": 0.0, + "num_full_absorption": 0, + "num_probe_true_positives": 190, + "num_split_features": 2 + }, + { + "first_letter": "r", + "mean_absorption_fraction": 0.23541147654145936, + "full_absorption_rate": 0.1781305114638448, + "num_full_absorption": 303, + "num_probe_true_positives": 1701, + "num_split_features": 1 + }, + { + "first_letter": "s", + "mean_absorption_fraction": 0.15717383958699338, + "full_absorption_rate": 0.06840042750267189, + "num_full_absorption": 192, + "num_probe_true_positives": 2807, + "num_split_features": 1 + }, + { + "first_letter": "t", + "mean_absorption_fraction": 0.019660784963942177, + "full_absorption_rate": 0.008849557522123894, + "num_full_absorption": 15, + "num_probe_true_positives": 1695, + "num_split_features": 1 + }, + { + "first_letter": "u", + "mean_absorption_fraction": 0.02239216664758688, + "full_absorption_rate": 0.017218543046357615, + "num_full_absorption": 13, + "num_probe_true_positives": 755, + "num_split_features": 2 + }, + { + "first_letter": "v", + "mean_absorption_fraction": 0.000307858004094181, + "full_absorption_rate": 0.0, + "num_full_absorption": 0, + "num_probe_true_positives": 789, + "num_split_features": 1 + }, + { + "first_letter": "w", + "mean_absorption_fraction": 0.058073269829955455, + "full_absorption_rate": 0.0440771349862259, + "num_full_absorption": 32, + "num_probe_true_positives": 726, + "num_split_features": 1 + }, + { + "first_letter": "x", + "mean_absorption_fraction": 0.04631967164684324, + "full_absorption_rate": 0.017699115044247787, + "num_full_absorption": 2, + "num_probe_true_positives": 113, + "num_split_features": 1 + }, + { + "first_letter": "y", + "mean_absorption_fraction": 0.04255402129395246, + "full_absorption_rate": 0.011363636363636364, + "num_full_absorption": 2, + "num_probe_true_positives": 176, + "num_split_features": 1 + }, + { + "first_letter": "z", + "mean_absorption_fraction": 0.013344346322079504, + "full_absorption_rate": 0.00851063829787234, + "num_full_absorption": 2, + "num_probe_true_positives": 235, + "num_split_features": 1 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file diff --git a/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..44ef48a35aa28d28fa1fb6b52acf39daeaa98cec --- /dev/null +++ b/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,268 @@ +{ + "eval_type_id": "absorption_first_letter", + "eval_config": { + "model_name": "gemma-2-2b", + "random_seed": 42, + "f1_jump_threshold": 0.03, + "max_k_value": 10, + "prompt_template": "{word} has the first letter:", + "prompt_token_pos": -6, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "k_sparse_probe_l1_decay": 0.01, + "k_sparse_probe_batch_size": 4096, + "k_sparse_probe_num_epochs": 50 + }, + "eval_id": "3fd72f62-0cc2-4495-be28-9e81ace44644", + "datetime_epoch_millis": 1740149898408, + "eval_result_metrics": { + "mean": { + "mean_absorption_fraction_score": 0.6847120849800752, + "mean_full_absorption_score": 0.6661390205611261, + "mean_num_split_features": 2.6538461538461537, + "std_dev_absorption_fraction_score": 0.2003883916940885, + "std_dev_full_absorption_score": 0.2090506101541011, + "std_dev_num_split_features": 1.3249092857190696 + } + }, + "eval_result_details": [ + { + "first_letter": "a", + "mean_absorption_fraction": 0.7233867462553377, + "full_absorption_rate": 0.64792663476874, + "num_full_absorption": 1625, + "num_probe_true_positives": 2508, + "num_split_features": 4 + }, + { + "first_letter": "b", + "mean_absorption_fraction": 0.7168509009331099, + "full_absorption_rate": 0.7159533073929961, + "num_full_absorption": 1104, + "num_probe_true_positives": 1542, + "num_split_features": 4 + }, + { + "first_letter": "c", + "mean_absorption_fraction": 0.9156825137832028, + "full_absorption_rate": 0.8762923351158646, + "num_full_absorption": 2458, + "num_probe_true_positives": 2805, + "num_split_features": 1 + }, + { + "first_letter": "d", + "mean_absorption_fraction": 0.7605077790256731, + "full_absorption_rate": 0.7590361445783133, + "num_full_absorption": 1260, + "num_probe_true_positives": 1660, + "num_split_features": 2 + }, + { + "first_letter": "e", + "mean_absorption_fraction": 0.5875146098112013, + "full_absorption_rate": 0.7271039603960396, + "num_full_absorption": 1175, + "num_probe_true_positives": 1616, + "num_split_features": 3 + }, + { + "first_letter": "f", + "mean_absorption_fraction": 0.7127841577147391, + "full_absorption_rate": 0.7059773828756059, + "num_full_absorption": 874, + "num_probe_true_positives": 1238, + "num_split_features": 5 + }, + { + "first_letter": "g", + "mean_absorption_fraction": 0.7512413415830805, + "full_absorption_rate": 0.7362445414847162, + "num_full_absorption": 843, + "num_probe_true_positives": 1145, + "num_split_features": 2 + }, + { + "first_letter": "h", + "mean_absorption_fraction": 0.7663615485897274, + "full_absorption_rate": 0.7661835748792271, + "num_full_absorption": 793, + "num_probe_true_positives": 1035, + "num_split_features": 4 + }, + { + "first_letter": "i", + "mean_absorption_fraction": 0.7978799456118504, + "full_absorption_rate": 0.8028083028083028, + "num_full_absorption": 1315, + "num_probe_true_positives": 1638, + "num_split_features": 2 + }, + { + "first_letter": "j", + "mean_absorption_fraction": 0.7148716538061125, + "full_absorption_rate": 0.6820388349514563, + "num_full_absorption": 281, + "num_probe_true_positives": 412, + "num_split_features": 2 + }, + { + "first_letter": "k", + "mean_absorption_fraction": 0.5989374406326367, + "full_absorption_rate": 0.5555555555555556, + "num_full_absorption": 375, + "num_probe_true_positives": 675, + "num_split_features": 4 + }, + { + "first_letter": "l", + "mean_absorption_fraction": 0.7327207331271454, + "full_absorption_rate": 0.7446443873179092, + "num_full_absorption": 869, + "num_probe_true_positives": 1167, + "num_split_features": 3 + }, + { + "first_letter": "m", + "mean_absorption_fraction": 0.7807434725648233, + "full_absorption_rate": 0.8160351455244371, + "num_full_absorption": 1486, + "num_probe_true_positives": 1821, + "num_split_features": 2 + }, + { + "first_letter": "n", + "mean_absorption_fraction": 0.7722688362643463, + "full_absorption_rate": 0.7657430730478589, + "num_full_absorption": 608, + "num_probe_true_positives": 794, + "num_split_features": 3 + }, + { + "first_letter": "o", + "mean_absorption_fraction": 0.8168333418476779, + "full_absorption_rate": 0.7769447047797563, + "num_full_absorption": 829, + "num_probe_true_positives": 1067, + "num_split_features": 2 + }, + { + "first_letter": "p", + "mean_absorption_fraction": 0.8399943933692386, + "full_absorption_rate": 0.8273444347063978, + "num_full_absorption": 1888, + "num_probe_true_positives": 2282, + "num_split_features": 2 + }, + { + "first_letter": "q", + "mean_absorption_fraction": 0.7839560158065032, + "full_absorption_rate": 0.7157894736842105, + "num_full_absorption": 136, + "num_probe_true_positives": 190, + "num_split_features": 2 + }, + { + "first_letter": "r", + "mean_absorption_fraction": 0.8207604001227187, + "full_absorption_rate": 0.8300999412110524, + "num_full_absorption": 1412, + "num_probe_true_positives": 1701, + "num_split_features": 3 + }, + { + "first_letter": "s", + "mean_absorption_fraction": 0.9192547073005679, + "full_absorption_rate": 0.8696116850730317, + "num_full_absorption": 2441, + "num_probe_true_positives": 2807, + "num_split_features": 1 + }, + { + "first_letter": "t", + "mean_absorption_fraction": 0.7832159717277276, + "full_absorption_rate": 0.736283185840708, + "num_full_absorption": 1248, + "num_probe_true_positives": 1695, + "num_split_features": 2 + }, + { + "first_letter": "u", + "mean_absorption_fraction": 0.31044767910219884, + "full_absorption_rate": 0.3880794701986755, + "num_full_absorption": 293, + "num_probe_true_positives": 755, + "num_split_features": 6 + }, + { + "first_letter": "v", + "mean_absorption_fraction": 0.6288526333046588, + "full_absorption_rate": 0.5779467680608364, + "num_full_absorption": 456, + "num_probe_true_positives": 789, + "num_split_features": 4 + }, + { + "first_letter": "w", + "mean_absorption_fraction": 0.820589593812622, + "full_absorption_rate": 0.7630853994490359, + "num_full_absorption": 554, + "num_probe_true_positives": 726, + "num_split_features": 3 + }, + { + "first_letter": "x", + "mean_absorption_fraction": 0.2525570176189269, + "full_absorption_rate": 0.08849557522123894, + "num_full_absorption": 10, + "num_probe_true_positives": 113, + "num_split_features": 1 + }, + { + "first_letter": "y", + "mean_absorption_fraction": 0.3328708151340147, + "full_absorption_rate": 0.29545454545454547, + "num_full_absorption": 52, + "num_probe_true_positives": 176, + "num_split_features": 1 + }, + { + "first_letter": "z", + "mean_absorption_fraction": 0.16142996063211232, + "full_absorption_rate": 0.14893617021276595, + "num_full_absorption": 35, + "num_probe_true_positives": 235, + "num_split_features": 1 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file diff --git a/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..13e34557a52bb4a730f39a74d0c2e0f65c18d4d8 --- /dev/null +++ b/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,268 @@ +{ + "eval_type_id": "absorption_first_letter", + "eval_config": { + "model_name": "gemma-2-2b", + "random_seed": 42, + "f1_jump_threshold": 0.03, + "max_k_value": 10, + "prompt_template": "{word} has the first letter:", + "prompt_token_pos": -6, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "k_sparse_probe_l1_decay": 0.01, + "k_sparse_probe_batch_size": 4096, + "k_sparse_probe_num_epochs": 50 + }, + "eval_id": "e4f2dfbd-7bca-41a0-b7cc-7b89d14b8d8f", + "datetime_epoch_millis": 1740151507106, + "eval_result_metrics": { + "mean": { + "mean_absorption_fraction_score": 0.4524983861704342, + "mean_full_absorption_score": 0.5240608368125174, + "mean_num_split_features": 4.3076923076923075, + "std_dev_absorption_fraction_score": 0.17312136051231236, + "std_dev_full_absorption_score": 0.19044461427366993, + "std_dev_num_split_features": 2.412786451706504 + } + }, + "eval_result_details": [ + { + "first_letter": "a", + "mean_absorption_fraction": 0.5244886983482652, + "full_absorption_rate": 0.5231259968102073, + "num_full_absorption": 1312, + "num_probe_true_positives": 2508, + "num_split_features": 9 + }, + { + "first_letter": "b", + "mean_absorption_fraction": 0.5030903415851329, + "full_absorption_rate": 0.6848249027237354, + "num_full_absorption": 1056, + "num_probe_true_positives": 1542, + "num_split_features": 4 + }, + { + "first_letter": "c", + "mean_absorption_fraction": 0.6621675316077551, + "full_absorption_rate": 0.685204991087344, + "num_full_absorption": 1922, + "num_probe_true_positives": 2805, + "num_split_features": 7 + }, + { + "first_letter": "d", + "mean_absorption_fraction": 0.5686391650350938, + "full_absorption_rate": 0.6313253012048192, + "num_full_absorption": 1048, + "num_probe_true_positives": 1660, + "num_split_features": 5 + }, + { + "first_letter": "e", + "mean_absorption_fraction": 0.3860805878256704, + "full_absorption_rate": 0.5006188118811881, + "num_full_absorption": 809, + "num_probe_true_positives": 1616, + "num_split_features": 8 + }, + { + "first_letter": "f", + "mean_absorption_fraction": 0.4846739037643228, + "full_absorption_rate": 0.6058158319870759, + "num_full_absorption": 750, + "num_probe_true_positives": 1238, + "num_split_features": 6 + }, + { + "first_letter": "g", + "mean_absorption_fraction": 0.4674741077161477, + "full_absorption_rate": 0.5545851528384279, + "num_full_absorption": 635, + "num_probe_true_positives": 1145, + "num_split_features": 7 + }, + { + "first_letter": "h", + "mean_absorption_fraction": 0.4141624927147996, + "full_absorption_rate": 0.46956521739130436, + "num_full_absorption": 486, + "num_probe_true_positives": 1035, + "num_split_features": 5 + }, + { + "first_letter": "i", + "mean_absorption_fraction": 0.5685041971113255, + "full_absorption_rate": 0.6868131868131868, + "num_full_absorption": 1125, + "num_probe_true_positives": 1638, + "num_split_features": 1 + }, + { + "first_letter": "j", + "mean_absorption_fraction": 0.3294766484902135, + "full_absorption_rate": 0.34951456310679613, + "num_full_absorption": 144, + "num_probe_true_positives": 412, + "num_split_features": 3 + }, + { + "first_letter": "k", + "mean_absorption_fraction": 0.19406311445367022, + "full_absorption_rate": 0.2740740740740741, + "num_full_absorption": 185, + "num_probe_true_positives": 675, + "num_split_features": 3 + }, + { + "first_letter": "l", + "mean_absorption_fraction": 0.43236308517151945, + "full_absorption_rate": 0.48586118251928023, + "num_full_absorption": 567, + "num_probe_true_positives": 1167, + "num_split_features": 6 + }, + { + "first_letter": "m", + "mean_absorption_fraction": 0.6774860216572675, + "full_absorption_rate": 0.7957166392092258, + "num_full_absorption": 1449, + "num_probe_true_positives": 1821, + "num_split_features": 3 + }, + { + "first_letter": "n", + "mean_absorption_fraction": 0.48846356092785836, + "full_absorption_rate": 0.5604534005037783, + "num_full_absorption": 445, + "num_probe_true_positives": 794, + "num_split_features": 3 + }, + { + "first_letter": "o", + "mean_absorption_fraction": 0.591983877600817, + "full_absorption_rate": 0.6588566073102156, + "num_full_absorption": 703, + "num_probe_true_positives": 1067, + "num_split_features": 3 + }, + { + "first_letter": "p", + "mean_absorption_fraction": 0.692209094169467, + "full_absorption_rate": 0.7116564417177914, + "num_full_absorption": 1624, + "num_probe_true_positives": 2282, + "num_split_features": 8 + }, + { + "first_letter": "q", + "mean_absorption_fraction": 0.22241427485109083, + "full_absorption_rate": 0.23157894736842105, + "num_full_absorption": 44, + "num_probe_true_positives": 190, + "num_split_features": 2 + }, + { + "first_letter": "r", + "mean_absorption_fraction": 0.5417187972127966, + "full_absorption_rate": 0.5631981187536743, + "num_full_absorption": 958, + "num_probe_true_positives": 1701, + "num_split_features": 6 + }, + { + "first_letter": "s", + "mean_absorption_fraction": 0.7569173413620608, + "full_absorption_rate": 0.8065550409690061, + "num_full_absorption": 2264, + "num_probe_true_positives": 2807, + "num_split_features": 4 + }, + { + "first_letter": "t", + "mean_absorption_fraction": 0.5078413641592, + "full_absorption_rate": 0.5150442477876106, + "num_full_absorption": 873, + "num_probe_true_positives": 1695, + "num_split_features": 7 + }, + { + "first_letter": "u", + "mean_absorption_fraction": 0.36593236176612903, + "full_absorption_rate": 0.6079470198675496, + "num_full_absorption": 459, + "num_probe_true_positives": 755, + "num_split_features": 2 + }, + { + "first_letter": "v", + "mean_absorption_fraction": 0.33729577176136777, + "full_absorption_rate": 0.4664131812420786, + "num_full_absorption": 368, + "num_probe_true_positives": 789, + "num_split_features": 4 + }, + { + "first_letter": "w", + "mean_absorption_fraction": 0.5653991608245975, + "full_absorption_rate": 0.7052341597796143, + "num_full_absorption": 512, + "num_probe_true_positives": 726, + "num_split_features": 3 + }, + { + "first_letter": "x", + "mean_absorption_fraction": 0.09658828784853217, + "full_absorption_rate": 0.061946902654867256, + "num_full_absorption": 7, + "num_probe_true_positives": 113, + "num_split_features": 1 + }, + { + "first_letter": "y", + "mean_absorption_fraction": 0.1756755097631157, + "full_absorption_rate": 0.20454545454545456, + "num_full_absorption": 36, + "num_probe_true_positives": 176, + "num_split_features": 1 + }, + { + "first_letter": "z", + "mean_absorption_fraction": 0.20984874270307283, + "full_absorption_rate": 0.2851063829787234, + "num_full_absorption": 67, + "num_probe_true_positives": 235, + "num_split_features": 1 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file diff --git a/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b41b2c346c17077b8b6e8b133fb6eec6bada90b1 --- /dev/null +++ b/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,268 @@ +{ + "eval_type_id": "absorption_first_letter", + "eval_config": { + "model_name": "gemma-2-2b", + "random_seed": 42, + "f1_jump_threshold": 0.03, + "max_k_value": 10, + "prompt_template": "{word} has the first letter:", + "prompt_token_pos": -6, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "k_sparse_probe_l1_decay": 0.01, + "k_sparse_probe_batch_size": 4096, + "k_sparse_probe_num_epochs": 50 + }, + "eval_id": "4a874c85-b7b8-4548-be1b-9967bf32571a", + "datetime_epoch_millis": 1740154752792, + "eval_result_metrics": { + "mean": { + "mean_absorption_fraction_score": 0.6616852261361333, + "mean_full_absorption_score": 0.615820361178369, + "mean_num_split_features": 2.6923076923076925, + "std_dev_absorption_fraction_score": 0.19938859527080388, + "std_dev_full_absorption_score": 0.20106605664354318, + "std_dev_num_split_features": 1.7382573059068274 + } + }, + "eval_result_details": [ + { + "first_letter": "a", + "mean_absorption_fraction": 0.8075326747620873, + "full_absorption_rate": 0.6698564593301436, + "num_full_absorption": 1680, + "num_probe_true_positives": 2508, + "num_split_features": 2 + }, + { + "first_letter": "b", + "mean_absorption_fraction": 0.649433379164975, + "full_absorption_rate": 0.5719844357976653, + "num_full_absorption": 882, + "num_probe_true_positives": 1542, + "num_split_features": 6 + }, + { + "first_letter": "c", + "mean_absorption_fraction": 0.843160509366003, + "full_absorption_rate": 0.774331550802139, + "num_full_absorption": 2172, + "num_probe_true_positives": 2805, + "num_split_features": 3 + }, + { + "first_letter": "d", + "mean_absorption_fraction": 0.8053629676295623, + "full_absorption_rate": 0.7536144578313253, + "num_full_absorption": 1251, + "num_probe_true_positives": 1660, + "num_split_features": 2 + }, + { + "first_letter": "e", + "mean_absorption_fraction": 0.6165512751061573, + "full_absorption_rate": 0.6961633663366337, + "num_full_absorption": 1125, + "num_probe_true_positives": 1616, + "num_split_features": 2 + }, + { + "first_letter": "f", + "mean_absorption_fraction": 0.7104825885299954, + "full_absorption_rate": 0.6922455573505655, + "num_full_absorption": 857, + "num_probe_true_positives": 1238, + "num_split_features": 5 + }, + { + "first_letter": "g", + "mean_absorption_fraction": 0.7387823401404363, + "full_absorption_rate": 0.7397379912663755, + "num_full_absorption": 847, + "num_probe_true_positives": 1145, + "num_split_features": 1 + }, + { + "first_letter": "h", + "mean_absorption_fraction": 0.7596010190875191, + "full_absorption_rate": 0.702415458937198, + "num_full_absorption": 727, + "num_probe_true_positives": 1035, + "num_split_features": 4 + }, + { + "first_letter": "i", + "mean_absorption_fraction": 0.6661271457750678, + "full_absorption_rate": 0.6306471306471306, + "num_full_absorption": 1033, + "num_probe_true_positives": 1638, + "num_split_features": 4 + }, + { + "first_letter": "j", + "mean_absorption_fraction": 0.6232648925391066, + "full_absorption_rate": 0.5728155339805825, + "num_full_absorption": 236, + "num_probe_true_positives": 412, + "num_split_features": 2 + }, + { + "first_letter": "k", + "mean_absorption_fraction": 0.567028277479737, + "full_absorption_rate": 0.522962962962963, + "num_full_absorption": 353, + "num_probe_true_positives": 675, + "num_split_features": 1 + }, + { + "first_letter": "l", + "mean_absorption_fraction": 0.754960114510044, + "full_absorption_rate": 0.7343616109682948, + "num_full_absorption": 857, + "num_probe_true_positives": 1167, + "num_split_features": 3 + }, + { + "first_letter": "m", + "mean_absorption_fraction": 0.8203947792860443, + "full_absorption_rate": 0.7940691927512356, + "num_full_absorption": 1446, + "num_probe_true_positives": 1821, + "num_split_features": 1 + }, + { + "first_letter": "n", + "mean_absorption_fraction": 0.7950898206311865, + "full_absorption_rate": 0.7670025188916877, + "num_full_absorption": 609, + "num_probe_true_positives": 794, + "num_split_features": 3 + }, + { + "first_letter": "o", + "mean_absorption_fraction": 0.6647762399813312, + "full_absorption_rate": 0.521087160262418, + "num_full_absorption": 556, + "num_probe_true_positives": 1067, + "num_split_features": 6 + }, + { + "first_letter": "p", + "mean_absorption_fraction": 0.8176700167677401, + "full_absorption_rate": 0.7725679228746714, + "num_full_absorption": 1763, + "num_probe_true_positives": 2282, + "num_split_features": 5 + }, + { + "first_letter": "q", + "mean_absorption_fraction": 0.45423763895890557, + "full_absorption_rate": 0.3, + "num_full_absorption": 57, + "num_probe_true_positives": 190, + "num_split_features": 1 + }, + { + "first_letter": "r", + "mean_absorption_fraction": 0.7393716764914745, + "full_absorption_rate": 0.6843033509700176, + "num_full_absorption": 1164, + "num_probe_true_positives": 1701, + "num_split_features": 3 + }, + { + "first_letter": "s", + "mean_absorption_fraction": 0.8758378878071444, + "full_absorption_rate": 0.791948699679373, + "num_full_absorption": 2223, + "num_probe_true_positives": 2807, + "num_split_features": 1 + }, + { + "first_letter": "t", + "mean_absorption_fraction": 0.8617655260809893, + "full_absorption_rate": 0.743952802359882, + "num_full_absorption": 1261, + "num_probe_true_positives": 1695, + "num_split_features": 2 + }, + { + "first_letter": "u", + "mean_absorption_fraction": 0.7107051211722657, + "full_absorption_rate": 0.7841059602649006, + "num_full_absorption": 592, + "num_probe_true_positives": 755, + "num_split_features": 1 + }, + { + "first_letter": "v", + "mean_absorption_fraction": 0.5079324747158148, + "full_absorption_rate": 0.5057034220532319, + "num_full_absorption": 399, + "num_probe_true_positives": 789, + "num_split_features": 6 + }, + { + "first_letter": "w", + "mean_absorption_fraction": 0.7893559891999093, + "full_absorption_rate": 0.7741046831955923, + "num_full_absorption": 562, + "num_probe_true_positives": 726, + "num_split_features": 3 + }, + { + "first_letter": "x", + "mean_absorption_fraction": 0.2505387445283525, + "full_absorption_rate": 0.1592920353982301, + "num_full_absorption": 18, + "num_probe_true_positives": 113, + "num_split_features": 1 + }, + { + "first_letter": "y", + "mean_absorption_fraction": 0.2744647016381297, + "full_absorption_rate": 0.19886363636363635, + "num_full_absorption": 35, + "num_probe_true_positives": 176, + "num_split_features": 1 + }, + { + "first_letter": "z", + "mean_absorption_fraction": 0.09938807818948822, + "full_absorption_rate": 0.15319148936170213, + "num_full_absorption": 36, + "num_probe_true_positives": 235, + "num_split_features": 1 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file diff --git a/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..c66539d35b2992c0cbbb7a24de9267a859f17274 --- /dev/null +++ b/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,268 @@ +{ + "eval_type_id": "absorption_first_letter", + "eval_config": { + "model_name": "gemma-2-2b", + "random_seed": 42, + "f1_jump_threshold": 0.03, + "max_k_value": 10, + "prompt_template": "{word} has the first letter:", + "prompt_token_pos": -6, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "k_sparse_probe_l1_decay": 0.01, + "k_sparse_probe_batch_size": 4096, + "k_sparse_probe_num_epochs": 50 + }, + "eval_id": "2be8745f-1a57-4404-ac46-dce11e8b68ff", + "datetime_epoch_millis": 1740153947708, + "eval_result_metrics": { + "mean": { + "mean_absorption_fraction_score": 0.5128081449664995, + "mean_full_absorption_score": 0.5412368653055878, + "mean_num_split_features": 3.6538461538461537, + "std_dev_absorption_fraction_score": 0.22140493761969532, + "std_dev_full_absorption_score": 0.2331752415340005, + "std_dev_num_split_features": 2.077350383393378 + } + }, + "eval_result_details": [ + { + "first_letter": "a", + "mean_absorption_fraction": 0.7110525620728301, + "full_absorption_rate": 0.7256778309409888, + "num_full_absorption": 1820, + "num_probe_true_positives": 2508, + "num_split_features": 4 + }, + { + "first_letter": "b", + "mean_absorption_fraction": 0.5900107080679658, + "full_absorption_rate": 0.6504539559014267, + "num_full_absorption": 1003, + "num_probe_true_positives": 1542, + "num_split_features": 8 + }, + { + "first_letter": "c", + "mean_absorption_fraction": 0.7889150540988363, + "full_absorption_rate": 0.7992869875222817, + "num_full_absorption": 2242, + "num_probe_true_positives": 2805, + "num_split_features": 5 + }, + { + "first_letter": "d", + "mean_absorption_fraction": 0.7213593093818161, + "full_absorption_rate": 0.7777108433734939, + "num_full_absorption": 1291, + "num_probe_true_positives": 1660, + "num_split_features": 4 + }, + { + "first_letter": "e", + "mean_absorption_fraction": 0.41357739362504964, + "full_absorption_rate": 0.5705445544554455, + "num_full_absorption": 922, + "num_probe_true_positives": 1616, + "num_split_features": 5 + }, + { + "first_letter": "f", + "mean_absorption_fraction": 0.6904496997463666, + "full_absorption_rate": 0.7164781906300485, + "num_full_absorption": 887, + "num_probe_true_positives": 1238, + "num_split_features": 7 + }, + { + "first_letter": "g", + "mean_absorption_fraction": 0.6122705763206383, + "full_absorption_rate": 0.6462882096069869, + "num_full_absorption": 740, + "num_probe_true_positives": 1145, + "num_split_features": 6 + }, + { + "first_letter": "h", + "mean_absorption_fraction": 0.5132230230308134, + "full_absorption_rate": 0.5207729468599034, + "num_full_absorption": 539, + "num_probe_true_positives": 1035, + "num_split_features": 5 + }, + { + "first_letter": "i", + "mean_absorption_fraction": 0.5740640698732722, + "full_absorption_rate": 0.7197802197802198, + "num_full_absorption": 1179, + "num_probe_true_positives": 1638, + "num_split_features": 3 + }, + { + "first_letter": "j", + "mean_absorption_fraction": 0.366316889496265, + "full_absorption_rate": 0.3567961165048544, + "num_full_absorption": 147, + "num_probe_true_positives": 412, + "num_split_features": 1 + }, + { + "first_letter": "k", + "mean_absorption_fraction": 0.23633773281065845, + "full_absorption_rate": 0.23851851851851852, + "num_full_absorption": 161, + "num_probe_true_positives": 675, + "num_split_features": 1 + }, + { + "first_letter": "l", + "mean_absorption_fraction": 0.6116279835790608, + "full_absorption_rate": 0.6392459297343616, + "num_full_absorption": 746, + "num_probe_true_positives": 1167, + "num_split_features": 4 + }, + { + "first_letter": "m", + "mean_absorption_fraction": 0.6636309057144126, + "full_absorption_rate": 0.7177375068643602, + "num_full_absorption": 1307, + "num_probe_true_positives": 1821, + "num_split_features": 6 + }, + { + "first_letter": "n", + "mean_absorption_fraction": 0.577436439906812, + "full_absorption_rate": 0.5629722921914357, + "num_full_absorption": 447, + "num_probe_true_positives": 794, + "num_split_features": 4 + }, + { + "first_letter": "o", + "mean_absorption_fraction": 0.4891657688844448, + "full_absorption_rate": 0.5014058106841612, + "num_full_absorption": 535, + "num_probe_true_positives": 1067, + "num_split_features": 1 + }, + { + "first_letter": "p", + "mean_absorption_fraction": 0.8397761400133482, + "full_absorption_rate": 0.8347940403155127, + "num_full_absorption": 1905, + "num_probe_true_positives": 2282, + "num_split_features": 3 + }, + { + "first_letter": "q", + "mean_absorption_fraction": 0.33764789379308424, + "full_absorption_rate": 0.3105263157894737, + "num_full_absorption": 59, + "num_probe_true_positives": 190, + "num_split_features": 2 + }, + { + "first_letter": "r", + "mean_absorption_fraction": 0.6411389015355013, + "full_absorption_rate": 0.6431510875955321, + "num_full_absorption": 1094, + "num_probe_true_positives": 1701, + "num_split_features": 5 + }, + { + "first_letter": "s", + "mean_absorption_fraction": 0.7622223123066498, + "full_absorption_rate": 0.7716423227645173, + "num_full_absorption": 2166, + "num_probe_true_positives": 2807, + "num_split_features": 5 + }, + { + "first_letter": "t", + "mean_absorption_fraction": 0.7386404958442533, + "full_absorption_rate": 0.7480825958702065, + "num_full_absorption": 1268, + "num_probe_true_positives": 1695, + "num_split_features": 2 + }, + { + "first_letter": "u", + "mean_absorption_fraction": 0.19444390279769325, + "full_absorption_rate": 0.3258278145695364, + "num_full_absorption": 246, + "num_probe_true_positives": 755, + "num_split_features": 2 + }, + { + "first_letter": "v", + "mean_absorption_fraction": 0.36413111590891156, + "full_absorption_rate": 0.4435994930291508, + "num_full_absorption": 350, + "num_probe_true_positives": 789, + "num_split_features": 6 + }, + { + "first_letter": "w", + "mean_absorption_fraction": 0.5408512211581553, + "full_absorption_rate": 0.6060606060606061, + "num_full_absorption": 440, + "num_probe_true_positives": 726, + "num_split_features": 3 + }, + { + "first_letter": "x", + "mean_absorption_fraction": 0.09970442692753816, + "full_absorption_rate": 0.05309734513274336, + "num_full_absorption": 6, + "num_probe_true_positives": 113, + "num_split_features": 1 + }, + { + "first_letter": "y", + "mean_absorption_fraction": 0.20248330206807633, + "full_absorption_rate": 0.1534090909090909, + "num_full_absorption": 27, + "num_probe_true_positives": 176, + "num_split_features": 1 + }, + { + "first_letter": "z", + "mean_absorption_fraction": 0.05253394016653227, + "full_absorption_rate": 0.03829787234042553, + "num_full_absorption": 9, + "num_probe_true_positives": 235, + "num_split_features": 1 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file diff --git a/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..2b86b24e3a70e4f5ab6a775c27c4f5342ff21cd4 --- /dev/null +++ b/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,268 @@ +{ + "eval_type_id": "absorption_first_letter", + "eval_config": { + "model_name": "gemma-2-2b", + "random_seed": 42, + "f1_jump_threshold": 0.03, + "max_k_value": 10, + "prompt_template": "{word} has the first letter:", + "prompt_token_pos": -6, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "k_sparse_probe_l1_decay": 0.01, + "k_sparse_probe_batch_size": 4096, + "k_sparse_probe_num_epochs": 50 + }, + "eval_id": "301cd6f2-5394-4131-b827-9d8a5bbeac04", + "datetime_epoch_millis": 1740153129410, + "eval_result_metrics": { + "mean": { + "mean_absorption_fraction_score": 0.6363149713432454, + "mean_full_absorption_score": 0.4939130321779123, + "mean_num_split_features": 3.3846153846153846, + "std_dev_absorption_fraction_score": 0.19938409617149097, + "std_dev_full_absorption_score": 0.17387178972743858, + "std_dev_num_split_features": 1.6751578570850707 + } + }, + "eval_result_details": [ + { + "first_letter": "a", + "mean_absorption_fraction": 0.685226183958439, + "full_absorption_rate": 0.4597288676236045, + "num_full_absorption": 1153, + "num_probe_true_positives": 2508, + "num_split_features": 5 + }, + { + "first_letter": "b", + "mean_absorption_fraction": 0.6798069224264082, + "full_absorption_rate": 0.5012970168612192, + "num_full_absorption": 773, + "num_probe_true_positives": 1542, + "num_split_features": 4 + }, + { + "first_letter": "c", + "mean_absorption_fraction": 0.7703485846895846, + "full_absorption_rate": 0.6217468805704099, + "num_full_absorption": 1744, + "num_probe_true_positives": 2805, + "num_split_features": 2 + }, + { + "first_letter": "d", + "mean_absorption_fraction": 0.8082173823988077, + "full_absorption_rate": 0.6445783132530121, + "num_full_absorption": 1070, + "num_probe_true_positives": 1660, + "num_split_features": 3 + }, + { + "first_letter": "e", + "mean_absorption_fraction": 0.5316796151182803, + "full_absorption_rate": 0.49876237623762376, + "num_full_absorption": 806, + "num_probe_true_positives": 1616, + "num_split_features": 3 + }, + { + "first_letter": "f", + "mean_absorption_fraction": 0.6921068442218885, + "full_absorption_rate": 0.6106623586429726, + "num_full_absorption": 756, + "num_probe_true_positives": 1238, + "num_split_features": 5 + }, + { + "first_letter": "g", + "mean_absorption_fraction": 0.6652568101195918, + "full_absorption_rate": 0.537117903930131, + "num_full_absorption": 615, + "num_probe_true_positives": 1145, + "num_split_features": 5 + }, + { + "first_letter": "h", + "mean_absorption_fraction": 0.7274664367939123, + "full_absorption_rate": 0.5217391304347826, + "num_full_absorption": 540, + "num_probe_true_positives": 1035, + "num_split_features": 4 + }, + { + "first_letter": "i", + "mean_absorption_fraction": 0.5973165908932144, + "full_absorption_rate": 0.4896214896214896, + "num_full_absorption": 802, + "num_probe_true_positives": 1638, + "num_split_features": 3 + }, + { + "first_letter": "j", + "mean_absorption_fraction": 0.749237725468953, + "full_absorption_rate": 0.5072815533980582, + "num_full_absorption": 209, + "num_probe_true_positives": 412, + "num_split_features": 2 + }, + { + "first_letter": "k", + "mean_absorption_fraction": 0.7065567821922324, + "full_absorption_rate": 0.5955555555555555, + "num_full_absorption": 402, + "num_probe_true_positives": 675, + "num_split_features": 1 + }, + { + "first_letter": "l", + "mean_absorption_fraction": 0.7196233455554913, + "full_absorption_rate": 0.5355612682090831, + "num_full_absorption": 625, + "num_probe_true_positives": 1167, + "num_split_features": 4 + }, + { + "first_letter": "m", + "mean_absorption_fraction": 0.7175474373170088, + "full_absorption_rate": 0.4876441515650741, + "num_full_absorption": 888, + "num_probe_true_positives": 1821, + "num_split_features": 7 + }, + { + "first_letter": "n", + "mean_absorption_fraction": 0.7491123746163217, + "full_absorption_rate": 0.5654911838790933, + "num_full_absorption": 449, + "num_probe_true_positives": 794, + "num_split_features": 2 + }, + { + "first_letter": "o", + "mean_absorption_fraction": 0.5657368175863791, + "full_absorption_rate": 0.4217432052483599, + "num_full_absorption": 450, + "num_probe_true_positives": 1067, + "num_split_features": 7 + }, + { + "first_letter": "p", + "mean_absorption_fraction": 0.7904435804397044, + "full_absorption_rate": 0.6897458369851008, + "num_full_absorption": 1574, + "num_probe_true_positives": 2282, + "num_split_features": 3 + }, + { + "first_letter": "q", + "mean_absorption_fraction": 0.697611173748312, + "full_absorption_rate": 0.46842105263157896, + "num_full_absorption": 89, + "num_probe_true_positives": 190, + "num_split_features": 3 + }, + { + "first_letter": "r", + "mean_absorption_fraction": 0.6868794374337754, + "full_absorption_rate": 0.5955320399764844, + "num_full_absorption": 1013, + "num_probe_true_positives": 1701, + "num_split_features": 5 + }, + { + "first_letter": "s", + "mean_absorption_fraction": 0.8751669277159965, + "full_absorption_rate": 0.7064481653010332, + "num_full_absorption": 1983, + "num_probe_true_positives": 2807, + "num_split_features": 1 + }, + { + "first_letter": "t", + "mean_absorption_fraction": 0.7865680163892015, + "full_absorption_rate": 0.6070796460176991, + "num_full_absorption": 1029, + "num_probe_true_positives": 1695, + "num_split_features": 4 + }, + { + "first_letter": "u", + "mean_absorption_fraction": 0.5045876719987833, + "full_absorption_rate": 0.42251655629139073, + "num_full_absorption": 319, + "num_probe_true_positives": 755, + "num_split_features": 2 + }, + { + "first_letter": "v", + "mean_absorption_fraction": 0.573872745990794, + "full_absorption_rate": 0.47782002534854245, + "num_full_absorption": 377, + "num_probe_true_positives": 789, + "num_split_features": 4 + }, + { + "first_letter": "w", + "mean_absorption_fraction": 0.7610011483914709, + "full_absorption_rate": 0.6556473829201102, + "num_full_absorption": 476, + "num_probe_true_positives": 726, + "num_split_features": 4 + }, + { + "first_letter": "x", + "mean_absorption_fraction": 0.10157088518458829, + "full_absorption_rate": 0.035398230088495575, + "num_full_absorption": 4, + "num_probe_true_positives": 113, + "num_split_features": 1 + }, + { + "first_letter": "y", + "mean_absorption_fraction": 0.36839684040497356, + "full_absorption_rate": 0.14204545454545456, + "num_full_absorption": 25, + "num_probe_true_positives": 176, + "num_split_features": 3 + }, + { + "first_letter": "z", + "mean_absorption_fraction": 0.03285097387026813, + "full_absorption_rate": 0.0425531914893617, + "num_full_absorption": 10, + "num_probe_true_positives": 235, + "num_split_features": 1 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file diff --git a/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..29041a02f54b3e92bd765356b2cd2b87c05b153e --- /dev/null +++ b/eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,268 @@ +{ + "eval_type_id": "absorption_first_letter", + "eval_config": { + "model_name": "gemma-2-2b", + "random_seed": 42, + "f1_jump_threshold": 0.03, + "max_k_value": 10, + "prompt_template": "{word} has the first letter:", + "prompt_token_pos": -6, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "k_sparse_probe_l1_decay": 0.01, + "k_sparse_probe_batch_size": 4096, + "k_sparse_probe_num_epochs": 50 + }, + "eval_id": "748a0895-c8af-4e84-ae36-2f3eb01ee78c", + "datetime_epoch_millis": 1740155568982, + "eval_result_metrics": { + "mean": { + "mean_absorption_fraction_score": 0.32544343256195074, + "mean_full_absorption_score": 0.32888806627437994, + "mean_num_split_features": 1.6538461538461537, + "std_dev_absorption_fraction_score": 0.20824169704299758, + "std_dev_full_absorption_score": 0.22060249667862672, + "std_dev_num_split_features": 1.0933364602832083 + } + }, + "eval_result_details": [ + { + "first_letter": "a", + "mean_absorption_fraction": 0.6007277347131041, + "full_absorption_rate": 0.5251196172248804, + "num_full_absorption": 1317, + "num_probe_true_positives": 2508, + "num_split_features": 1 + }, + { + "first_letter": "b", + "mean_absorption_fraction": 0.29305024763176835, + "full_absorption_rate": 0.30804150453955903, + "num_full_absorption": 475, + "num_probe_true_positives": 1542, + "num_split_features": 1 + }, + { + "first_letter": "c", + "mean_absorption_fraction": 0.630569504589696, + "full_absorption_rate": 0.6559714795008913, + "num_full_absorption": 1840, + "num_probe_true_positives": 2805, + "num_split_features": 3 + }, + { + "first_letter": "d", + "mean_absorption_fraction": 0.398668071195782, + "full_absorption_rate": 0.4066265060240964, + "num_full_absorption": 675, + "num_probe_true_positives": 1660, + "num_split_features": 2 + }, + { + "first_letter": "e", + "mean_absorption_fraction": 0.3653745156003204, + "full_absorption_rate": 0.46410891089108913, + "num_full_absorption": 750, + "num_probe_true_positives": 1616, + "num_split_features": 3 + }, + { + "first_letter": "f", + "mean_absorption_fraction": 0.4766671026609379, + "full_absorption_rate": 0.5218093699515347, + "num_full_absorption": 646, + "num_probe_true_positives": 1238, + "num_split_features": 1 + }, + { + "first_letter": "g", + "mean_absorption_fraction": 0.20841239327824035, + "full_absorption_rate": 0.20611353711790392, + "num_full_absorption": 236, + "num_probe_true_positives": 1145, + "num_split_features": 1 + }, + { + "first_letter": "h", + "mean_absorption_fraction": 0.194335181953026, + "full_absorption_rate": 0.19033816425120773, + "num_full_absorption": 197, + "num_probe_true_positives": 1035, + "num_split_features": 1 + }, + { + "first_letter": "i", + "mean_absorption_fraction": 0.498854509106931, + "full_absorption_rate": 0.5384615384615384, + "num_full_absorption": 882, + "num_probe_true_positives": 1638, + "num_split_features": 3 + }, + { + "first_letter": "j", + "mean_absorption_fraction": 0.00809270254817472, + "full_absorption_rate": 0.012135922330097087, + "num_full_absorption": 5, + "num_probe_true_positives": 412, + "num_split_features": 1 + }, + { + "first_letter": "k", + "mean_absorption_fraction": 0.027364220505435494, + "full_absorption_rate": 0.022222222222222223, + "num_full_absorption": 15, + "num_probe_true_positives": 675, + "num_split_features": 1 + }, + { + "first_letter": "l", + "mean_absorption_fraction": 0.46655821897414346, + "full_absorption_rate": 0.46786632390745503, + "num_full_absorption": 546, + "num_probe_true_positives": 1167, + "num_split_features": 1 + }, + { + "first_letter": "m", + "mean_absorption_fraction": 0.48562595745684123, + "full_absorption_rate": 0.5332235035694673, + "num_full_absorption": 971, + "num_probe_true_positives": 1821, + "num_split_features": 2 + }, + { + "first_letter": "n", + "mean_absorption_fraction": 0.36360093234914864, + "full_absorption_rate": 0.3211586901763224, + "num_full_absorption": 255, + "num_probe_true_positives": 794, + "num_split_features": 4 + }, + { + "first_letter": "o", + "mean_absorption_fraction": 0.3188066412402324, + "full_absorption_rate": 0.3786316776007498, + "num_full_absorption": 404, + "num_probe_true_positives": 1067, + "num_split_features": 1 + }, + { + "first_letter": "p", + "mean_absorption_fraction": 0.6813615753502903, + "full_absorption_rate": 0.6919368974583698, + "num_full_absorption": 1579, + "num_probe_true_positives": 2282, + "num_split_features": 2 + }, + { + "first_letter": "q", + "mean_absorption_fraction": 0.030652585242806108, + "full_absorption_rate": 0.03684210526315789, + "num_full_absorption": 7, + "num_probe_true_positives": 190, + "num_split_features": 1 + }, + { + "first_letter": "r", + "mean_absorption_fraction": 0.4850781646941056, + "full_absorption_rate": 0.5202821869488536, + "num_full_absorption": 885, + "num_probe_true_positives": 1701, + "num_split_features": 1 + }, + { + "first_letter": "s", + "mean_absorption_fraction": 0.6328351318421925, + "full_absorption_rate": 0.6074100463127895, + "num_full_absorption": 1705, + "num_probe_true_positives": 2807, + "num_split_features": 5 + }, + { + "first_letter": "t", + "mean_absorption_fraction": 0.3754464221045167, + "full_absorption_rate": 0.31504424778761064, + "num_full_absorption": 534, + "num_probe_true_positives": 1695, + "num_split_features": 1 + }, + { + "first_letter": "u", + "mean_absorption_fraction": 0.33574647694082693, + "full_absorption_rate": 0.41456953642384103, + "num_full_absorption": 313, + "num_probe_true_positives": 755, + "num_split_features": 2 + }, + { + "first_letter": "v", + "mean_absorption_fraction": 0.07488270366700864, + "full_absorption_rate": 0.08745247148288973, + "num_full_absorption": 69, + "num_probe_true_positives": 789, + "num_split_features": 1 + }, + { + "first_letter": "w", + "mean_absorption_fraction": 0.24404390408289825, + "full_absorption_rate": 0.2327823691460055, + "num_full_absorption": 169, + "num_probe_true_positives": 726, + "num_split_features": 1 + }, + { + "first_letter": "x", + "mean_absorption_fraction": 0.17461273099505284, + "full_absorption_rate": 0.017699115044247787, + "num_full_absorption": 2, + "num_probe_true_positives": 113, + "num_split_features": 1 + }, + { + "first_letter": "y", + "mean_absorption_fraction": 0.07002365476192687, + "full_absorption_rate": 0.045454545454545456, + "num_full_absorption": 8, + "num_probe_true_positives": 176, + "num_split_features": 1 + }, + { + "first_letter": "z", + "mean_absorption_fraction": 0.020137963125312984, + "full_absorption_rate": 0.029787234042553193, + "num_full_absorption": 7, + "num_probe_true_positives": 235, + "num_split_features": 1 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file diff --git a/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..64475e7b404bb2c8c944954669a604b28d546ddb --- /dev/null +++ b/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ade8835091139a702090a82b583edafc17952df258ec2094ccae8475ffb0edd3 +size 26038784 diff --git a/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..3cbbef118e107a33ec80a08b4347d439d675ec7e --- /dev/null +++ b/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ed9ba6ea6fbc0311812466993b7423a455115141b96aa55d92b5ffd60a716e5 +size 26004823 diff --git a/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..7caf1eb21694f547f949e2a161ed56fecbb2beae --- /dev/null +++ b/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0e5eeaf5313765b9d51b5b018a3a14d86d0f405351fba34e5b131f893c59124 +size 25526733 diff --git a/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..22609a15758fb8c7d3160beb5e840384cd6fb9aa --- /dev/null +++ b/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:488c950c80e13463424f371370dba26e2461c059d86c19fbca50395bdf88341f +size 25600864 diff --git a/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..cb118b3311db2e6218de376d1c7b63ae5bd7b713 --- /dev/null +++ b/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:536ed4bc61cb843c976114f9a3c8bbe2767dc9126eff5af03e4263d82fbbdb63 +size 25576485 diff --git a/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..34bc9f587b72f29e2b8431c77f7fdfe7f235b59e --- /dev/null +++ b/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b5c2e8a088ce126cb3328b7b53055a573c73a6e3025fdbfa16eadebb694b70a +size 25376156 diff --git a/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..31924a2e700f0345d2ce17a393f622bf5152a24e --- /dev/null +++ b/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91176fdc720c6a207b11313f9efbd110ecbe9b42e1d31f237d5c567195c4ca73 +size 25873513 diff --git a/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..e5e8038fdce61f7f9f96d449a5f5818cf8664a51 --- /dev/null +++ b/eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e4b4b5355f82658f66abf51ca1610565246c09085f9666cf6ba0497eaf78ed8 +size 25938312 diff --git a/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..3d63114afb0d165466b9b7992afbce0541c58d3f --- /dev/null +++ b/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:655ab0b13ade7b4ab554c5c3eec0134aa39dd1a462045bf320a8ae2d72d47c6e +size 21672734 diff --git a/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..aadb42e8935ae108fade0defd120c659db2ac012 --- /dev/null +++ b/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e4fa8826a42d156a4678073845c4b6c449760fa1a0e0b66b2d370935f2417fa +size 21773774 diff --git a/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..ebc47723d1e00a802a959ce79d895205b7bf1c68 --- /dev/null +++ b/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b16f7aadd97950eb3f710d2dd1293998ff117f83c436ab19489a9f063abec1fe +size 21266212 diff --git a/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..fe0793d77b7148c5ffa9f11461f8bde060b54f66 --- /dev/null +++ b/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e1ee90bbe960ff9c633099c66a1c22fcbed7fe1ced711815c6035b4615010a2 +size 21208670 diff --git a/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..9ff0a1bf88835fd2cb152bf516449063c9bab472 --- /dev/null +++ b/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ea6375fb3f724b9d9f35c8f9da86a56cb07ffe09b252eebd68063e5af0549f9 +size 21505205 diff --git a/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..47e1f306d2989ef0e4246f10074cab2dcf2584ab --- /dev/null +++ b/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45fc340a10fb487df31d251f7b80d4d56629b38ccc606ce074441c8e2993478a +size 21585654 diff --git a/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..df7f5acc1e025aabb7f91973767230f63f2ae98a --- /dev/null +++ b/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1da2c27c4aed20429fa10b59ce05e61b689a887f52c352891e4ee7fa112da3a5 +size 21630563 diff --git a/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..c4c247c44379eaf79646e618d241a3939ee821e8 --- /dev/null +++ b/eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcac9aa9a10fc5e629bd04219f06be94fc721bec2c03f09d8f48ffc4cbeb469c +size 21697035 diff --git a/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..cebf492c2343460c88e089a69a4e342ece85bb2d --- /dev/null +++ b/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,323 @@ +{ + "eval_type_id": "scr", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "canrager/amazon_reviews_mcauley_1and5" + ], + "perform_scr": true, + "early_stopping_patience": 20, + "train_set_size": 4000, + "test_set_size": 1000, + "context_length": 128, + "probe_train_batch_size": 16, + "probe_test_batch_size": 500, + "probe_epochs": 20, + "probe_lr": 0.001, + "probe_l1_penalty": 0.001, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "lower_vram_usage": false, + "model_name": "gemma-2-2b", + "n_values": [ + 2, + 5, + 10, + 20, + 50, + 100, + 500 + ], + "column1_vals_lookup": { + "LabHC/bias_in_bios_class_set1": [ + [ + "professor", + "nurse" + ], + [ + "architect", + "journalist" + ], + [ + "surgeon", + "psychologist" + ], + [ + "attorney", + "teacher" + ] + ], + "canrager/amazon_reviews_mcauley_1and5": [ + [ + "Books", + "CDs_and_Vinyl" + ], + [ + "Software", + "Electronics" + ], + [ + "Pet_Supplies", + "Office_Products" + ], + [ + "Industrial_and_Scientific", + "Toys_and_Games" + ] + ] + } + }, + "eval_id": "76e1c3b5-c26c-4837-91c8-6d45919b460c", + "datetime_epoch_millis": 1740160660875, + "eval_result_metrics": { + "scr_metrics": { + "scr_dir1_threshold_2": 0.17937598651986972, + "scr_metric_threshold_2": 0.06750879575859692, + "scr_dir2_threshold_2": 0.07494545276237846, + "scr_dir1_threshold_5": 0.22402296524662565, + "scr_metric_threshold_5": 0.11204012359707471, + "scr_dir2_threshold_5": 0.11697061811839651, + "scr_dir1_threshold_10": 0.2336172994227871, + "scr_metric_threshold_10": 0.13293663214601245, + "scr_dir2_threshold_10": 0.1427465534868288, + "scr_dir1_threshold_20": 0.2505843441914149, + "scr_metric_threshold_20": 0.16341666656794324, + "scr_dir2_threshold_20": 0.17208719496326916, + "scr_dir1_threshold_50": 0.27174068905788434, + "scr_metric_threshold_50": 0.21225175803186222, + "scr_dir2_threshold_50": 0.2175195174361239, + "scr_dir1_threshold_100": 0.2464209622824807, + "scr_metric_threshold_100": 0.22702989060166226, + "scr_dir2_threshold_100": 0.23717704484868324, + "scr_dir1_threshold_500": 0.2697303489398643, + "scr_metric_threshold_500": 0.22331210595096943, + "scr_dir2_threshold_500": 0.23536249992024194 + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results", + "scr_dir1_threshold_2": 0.47619029597980633, + "scr_metric_threshold_2": 0.004914087313907592, + "scr_dir2_threshold_2": 0.004914087313907592, + "scr_dir1_threshold_5": 0.555555345309774, + "scr_metric_threshold_5": 0.012284998611665989, + "scr_dir2_threshold_5": 0.012284998611665989, + "scr_dir1_threshold_10": 0.555555345309774, + "scr_metric_threshold_10": 0.027027114104653437, + "scr_dir2_threshold_10": 0.027027114104653437, + "scr_dir1_threshold_20": 0.5238097040201937, + "scr_metric_threshold_20": 0.036855142283733294, + "scr_dir2_threshold_20": 0.036855142283733294, + "scr_dir1_threshold_50": 0.333333017964661, + "scr_metric_threshold_50": 0.06633908037223754, + "scr_dir2_threshold_50": 0.06633908037223754, + "scr_dir1_threshold_100": 0.31746019731987085, + "scr_metric_threshold_100": 0.10565119308855696, + "scr_dir2_threshold_100": 0.10565119308855696, + "scr_dir1_threshold_500": 0.2698407892794835, + "scr_metric_threshold_500": 0.23587223695171058, + "scr_dir2_threshold_500": 0.23587223695171058 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results", + "scr_dir1_threshold_2": 0.1313126752015721, + "scr_metric_threshold_2": 0.08498575820403938, + "scr_dir2_threshold_2": 0.08498575820403938, + "scr_dir1_threshold_5": 0.16161592443815084, + "scr_metric_threshold_5": 0.1161473379602074, + "scr_dir2_threshold_5": 0.1161473379602074, + "scr_dir1_threshold_10": 0.17171680682792437, + "scr_metric_threshold_10": 0.15014157370114636, + "scr_dir2_threshold_10": 0.15014157370114636, + "scr_dir1_threshold_20": 0.17171680682792437, + "scr_metric_threshold_20": 0.17847032862085146, + "scr_dir2_threshold_20": 0.17847032862085146, + "scr_dir1_threshold_50": 0.3737374649596856, + "scr_metric_threshold_50": 0.2096317395253275, + "scr_dir2_threshold_50": 0.2096317395253275, + "scr_dir1_threshold_100": 0.11111091042202506, + "scr_metric_threshold_100": 0.24362614411795844, + "scr_dir2_threshold_100": 0.24362614411795844, + "scr_dir1_threshold_500": 0.03030264716932058, + "scr_metric_threshold_500": 0.07082146517003281, + "scr_dir2_threshold_500": 0.07082146517003281 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results", + "scr_dir1_threshold_2": 0.516128349998606, + "scr_metric_threshold_2": 0.007575810028571093, + "scr_dir2_threshold_2": 0.007575810028571093, + "scr_dir1_threshold_5": 0.5483869727270193, + "scr_metric_threshold_5": 0.022727279568944055, + "scr_dir2_threshold_5": 0.022727279568944055, + "scr_dir1_threshold_10": 0.516128349998606, + "scr_metric_threshold_10": 0.03535352927204973, + "scr_dir2_threshold_10": 0.03535352927204973, + "scr_dir1_threshold_20": 0.48387068863579336, + "scr_metric_threshold_20": 0.07070705854409946, + "scr_dir2_threshold_20": 0.07070705854409946, + "scr_dir1_threshold_50": 0.46774185795438705, + "scr_metric_threshold_50": 0.1010101481416146, + "scr_dir2_threshold_50": 0.1010101481416146, + "scr_dir1_threshold_100": 0.46774185795438705, + "scr_metric_threshold_100": 0.17171720668571405, + "scr_dir2_threshold_100": 0.17171720668571405, + "scr_dir1_threshold_500": 0.40322557386316116, + "scr_metric_threshold_500": -0.012626249703105673, + "scr_dir2_threshold_500": -0.012626249703105673 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results", + "scr_dir1_threshold_2": 0.1707318964255635, + "scr_metric_threshold_2": 0.03519051639302713, + "scr_dir2_threshold_2": 0.03519051639302713, + "scr_dir1_threshold_5": 0.2845529991791037, + "scr_metric_threshold_5": 0.07331372148022297, + "scr_dir2_threshold_5": 0.07331372148022297, + "scr_dir1_threshold_10": 0.15447146198048264, + "scr_metric_threshold_10": 0.10263921007231215, + "scr_dir2_threshold_10": 0.10263921007231215, + "scr_dir1_threshold_20": 0.16260192149827632, + "scr_metric_threshold_20": 0.11730195436835675, + "scr_dir2_threshold_20": 0.11730195436835675, + "scr_dir1_threshold_50": 0.12195107768082737, + "scr_metric_threshold_50": 0.17302041765206616, + "scr_dir2_threshold_50": 0.17302041765206616, + "scr_dir1_threshold_100": 0.1707318964255635, + "scr_metric_threshold_100": -0.005865202594637679, + "scr_dir2_threshold_100": -0.005865202594637679, + "scr_dir1_threshold_500": 0.5447155889858393, + "scr_metric_threshold_500": -0.07624641017439167, + "scr_dir2_threshold_500": -0.07624641017439167 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results", + "scr_dir1_threshold_2": 0.00546440968122594, + "scr_metric_threshold_2": 0.21875017462295412, + "scr_dir2_threshold_2": 0.21875017462295412, + "scr_dir1_threshold_5": 0.0, + "scr_metric_threshold_5": 0.371093800931695, + "scr_dir2_threshold_5": 0.371093800931695, + "scr_dir1_threshold_10": 0.05464474822904205, + "scr_metric_threshold_10": 0.417968888243172, + "scr_dir2_threshold_10": 0.417968888243172, + "scr_dir1_threshold_20": 0.10382508677685816, + "scr_metric_threshold_20": 0.4531251455191284, + "scr_dir2_threshold_20": 0.4531251455191284, + "scr_dir1_threshold_50": 0.1803277994391953, + "scr_metric_threshold_50": 0.5156250291038257, + "scr_dir2_threshold_50": 0.5156250291038257, + "scr_dir1_threshold_100": 0.10382508677685816, + "scr_metric_threshold_100": 0.5625001164153027, + "scr_dir2_threshold_100": 0.5625001164153027, + "scr_dir1_threshold_500": -0.06557389329988525, + "scr_metric_threshold_500": 0.6054687718278693, + "scr_dir2_threshold_500": 0.6054687718278693 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results", + "scr_dir1_threshold_2": 0.046153559299134936, + "scr_metric_threshold_2": 0.06451625307937271, + "scr_dir2_threshold_2": 0.06451625307937271, + "scr_dir1_threshold_5": 0.08717928497042846, + "scr_metric_threshold_5": 0.08870972781349516, + "scr_dir2_threshold_5": 0.08870972781349516, + "scr_dir1_threshold_10": 0.15384601277006638, + "scr_metric_threshold_10": 0.1008065853511987, + "scr_dir2_threshold_10": 0.1008065853511987, + "scr_dir1_threshold_20": 0.158974152062764, + "scr_metric_threshold_20": 0.12500006008532116, + "scr_dir2_threshold_20": 0.12500006008532116, + "scr_dir1_threshold_50": 0.20512801702675515, + "scr_metric_threshold_50": 0.20161293036111277, + "scr_dir2_threshold_50": 0.20161293036111277, + "scr_dir1_threshold_100": 0.24615374269804866, + "scr_metric_threshold_100": 0.29032265817460795, + "scr_dir2_threshold_100": 0.29032265817460795, + "scr_dir1_threshold_500": 0.35384589050412385, + "scr_metric_threshold_500": 0.3991936549900859, + "scr_dir2_threshold_500": 0.3991936549900859 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results", + "scr_dir1_threshold_2": 0.06756750225933819, + "scr_metric_threshold_2": 0.10267856311319266, + "scr_dir2_threshold_2": 0.10267856311319266, + "scr_dir1_threshold_5": 0.09909916440732847, + "scr_metric_threshold_5": 0.15625009146916644, + "scr_dir2_threshold_5": 0.15625009146916644, + "scr_dir1_threshold_10": 0.19819806032526952, + "scr_metric_threshold_10": 0.16517854648243513, + "scr_dir2_threshold_10": 0.16517854648243513, + "scr_dir1_threshold_20": 0.2882883318271079, + "scr_metric_threshold_20": 0.214285581239654, + "scr_dir2_threshold_20": 0.214285581239654, + "scr_dir1_threshold_50": 0.3243244404278432, + "scr_metric_threshold_50": 0.2633928820889934, + "scr_dir2_threshold_50": 0.2633928820889934, + "scr_dir1_threshold_100": 0.3783783348395588, + "scr_metric_threshold_100": 0.27232133710226214, + "scr_dir2_threshold_100": 0.27232133710226214, + "scr_dir1_threshold_500": 0.43693694419340684, + "scr_metric_threshold_500": 0.37946412772208915, + "scr_dir2_threshold_500": 0.37946412772208915 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results", + "scr_dir1_threshold_2": 0.021459203313710703, + "scr_metric_threshold_2": 0.021459203313710703, + "scr_dir2_threshold_2": 0.08095245934396302, + "scr_dir1_threshold_5": 0.05579403094120067, + "scr_metric_threshold_5": 0.05579403094120067, + "scr_dir2_threshold_5": 0.09523798711177515, + "scr_dir1_threshold_10": 0.0643776099411321, + "scr_metric_threshold_10": 0.0643776099411321, + "scr_dir2_threshold_10": 0.14285698066766273, + "scr_dir1_threshold_20": 0.11158806188240133, + "scr_metric_threshold_20": 0.11158806188240133, + "scr_dir2_threshold_20": 0.18095228904500887, + "scr_dir1_threshold_50": 0.1673818370097199, + "scr_metric_threshold_50": 0.1673818370097199, + "scr_dir2_threshold_50": 0.20952391224381361, + "scr_dir1_threshold_100": 0.17596567182353345, + "scr_metric_threshold_100": 0.17596567182353345, + "scr_dir2_threshold_100": 0.25714290579970117, + "scr_dir1_threshold_500": 0.18454925082346488, + "scr_metric_threshold_500": 0.18454925082346488, + "scr_dir2_threshold_500": 0.280952402577645 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file diff --git a/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..e37d9cd7296d27e49bd984f6b84e2014db27d4a3 --- /dev/null +++ b/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,323 @@ +{ + "eval_type_id": "scr", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "canrager/amazon_reviews_mcauley_1and5" + ], + "perform_scr": true, + "early_stopping_patience": 20, + "train_set_size": 4000, + "test_set_size": 1000, + "context_length": 128, + "probe_train_batch_size": 16, + "probe_test_batch_size": 500, + "probe_epochs": 20, + "probe_lr": 0.001, + "probe_l1_penalty": 0.001, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "lower_vram_usage": false, + "model_name": "gemma-2-2b", + "n_values": [ + 2, + 5, + 10, + 20, + 50, + 100, + 500 + ], + "column1_vals_lookup": { + "LabHC/bias_in_bios_class_set1": [ + [ + "professor", + "nurse" + ], + [ + "architect", + "journalist" + ], + [ + "surgeon", + "psychologist" + ], + [ + "attorney", + "teacher" + ] + ], + "canrager/amazon_reviews_mcauley_1and5": [ + [ + "Books", + "CDs_and_Vinyl" + ], + [ + "Software", + "Electronics" + ], + [ + "Pet_Supplies", + "Office_Products" + ], + [ + "Industrial_and_Scientific", + "Toys_and_Games" + ] + ] + } + }, + "eval_id": "e41564f8-711c-402f-be1a-c2c1e1a26d9b", + "datetime_epoch_millis": 1740159778191, + "eval_result_metrics": { + "scr_metrics": { + "scr_dir1_threshold_2": 0.21646779635345523, + "scr_metric_threshold_2": 0.10981927993643228, + "scr_dir2_threshold_2": 0.11159734942875701, + "scr_dir1_threshold_5": 0.2386560380069923, + "scr_metric_threshold_5": 0.15915722723436423, + "scr_dir2_threshold_5": 0.1647417441944568, + "scr_dir1_threshold_10": 0.2763340942473081, + "scr_metric_threshold_10": 0.20696168095936596, + "scr_dir2_threshold_10": 0.20938604847717682, + "scr_dir1_threshold_20": 0.29245964775791145, + "scr_metric_threshold_20": 0.2572680056753961, + "scr_dir2_threshold_20": 0.2562308120870308, + "scr_dir1_threshold_50": 0.2917372572864733, + "scr_metric_threshold_50": 0.35979093988002914, + "scr_dir2_threshold_50": 0.3474876389705775, + "scr_dir1_threshold_100": 0.22457255710535257, + "scr_metric_threshold_100": 0.4050724113957374, + "scr_dir2_threshold_100": 0.40371842827480603, + "scr_dir1_threshold_500": 0.06777817985891049, + "scr_metric_threshold_500": 0.31473923877780835, + "scr_dir2_threshold_500": 0.30271441063618476 + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results", + "scr_dir1_threshold_2": 0.5079359372693867, + "scr_metric_threshold_2": 0.007371057746493725, + "scr_dir2_threshold_2": 0.007371057746493725, + "scr_dir1_threshold_5": 0.5396825246649839, + "scr_metric_threshold_5": 0.022113026790745845, + "scr_dir2_threshold_5": 0.022113026790745845, + "scr_dir1_threshold_10": 0.6031747533501614, + "scr_metric_threshold_10": 0.022113026790745845, + "scr_dir2_threshold_10": 0.022113026790745845, + "scr_dir1_threshold_20": 0.5873019327053712, + "scr_metric_threshold_20": 0.05651105219315768, + "scr_dir2_threshold_20": 0.05651105219315768, + "scr_dir1_threshold_50": 0.5714281659545642, + "scr_metric_threshold_50": 0.13513513117706122, + "scr_dir2_threshold_50": 0.13513513117706122, + "scr_dir1_threshold_100": 0.5238097040201937, + "scr_metric_threshold_100": 0.15479118753522092, + "scr_dir2_threshold_100": 0.15479118753522092, + "scr_dir1_threshold_500": 0.42857088793941894, + "scr_metric_threshold_500": -0.004913940865172265, + "scr_dir2_threshold_500": -0.004913940865172265 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results", + "scr_dir1_threshold_2": 0.2121209384542766, + "scr_metric_threshold_2": 0.0934844015651201, + "scr_dir2_threshold_2": 0.0934844015651201, + "scr_dir1_threshold_5": 0.25252507008062886, + "scr_metric_threshold_5": 0.15014157370114636, + "scr_dir2_threshold_5": 0.15014157370114636, + "scr_dir1_threshold_10": 0.31313096648652816, + "scr_metric_threshold_10": 0.20679891468886458, + "scr_dir2_threshold_10": 0.20679891468886458, + "scr_dir1_threshold_20": 0.3333333333333333, + "scr_metric_threshold_20": 0.23796032559334063, + "scr_dir2_threshold_20": 0.23796032559334063, + "scr_dir1_threshold_50": 0.15151504204837732, + "scr_metric_threshold_50": 0.3597733132264739, + "scr_dir2_threshold_50": 0.3597733132264739, + "scr_dir1_threshold_100": 0.11111091042202506, + "scr_metric_threshold_100": 0.43909342175758737, + "scr_dir2_threshold_100": 0.43909342175758737, + "scr_dir1_threshold_500": -0.8484855600188809, + "scr_metric_threshold_500": 0.10481586976266374, + "scr_dir2_threshold_500": 0.10481586976266374 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results", + "scr_dir1_threshold_2": 0.532258142045613, + "scr_metric_threshold_2": 0.017676839894409477, + "scr_dir2_threshold_2": 0.017676839894409477, + "scr_dir1_threshold_5": 0.5483869727270193, + "scr_metric_threshold_5": 0.03535352927204973, + "scr_dir2_threshold_5": 0.03535352927204973, + "scr_dir1_threshold_10": 0.5806446340898319, + "scr_metric_threshold_10": 0.06565661886956488, + "scr_dir2_threshold_10": 0.06565661886956488, + "scr_dir1_threshold_20": 0.48387068863579336, + "scr_metric_threshold_20": 0.12121220787329137, + "scr_dir2_threshold_20": 0.12121220787329137, + "scr_dir1_threshold_50": 0.532258142045613, + "scr_metric_threshold_50": 0.24242426522981353, + "scr_dir2_threshold_50": 0.24242426522981353, + "scr_dir1_threshold_100": 0.46774185795438705, + "scr_metric_threshold_100": 0.31060610393664567, + "scr_dir2_threshold_100": 0.31060610393664567, + "scr_dir1_threshold_500": 0.03225766136281265, + "scr_metric_threshold_500": 0.09090911827577622, + "scr_dir2_threshold_500": 0.09090911827577622 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results", + "scr_dir1_threshold_2": 0.2926829741063909, + "scr_metric_threshold_2": 0.0791787492811609, + "scr_dir2_threshold_2": 0.0791787492811609, + "scr_dir1_threshold_5": 0.2764230242518165, + "scr_metric_threshold_5": 0.15835767335602155, + "scr_dir2_threshold_5": 0.15835767335602155, + "scr_dir1_threshold_10": 0.2764230242518165, + "scr_metric_threshold_10": 0.21407613663973096, + "scr_dir2_threshold_10": 0.21407613663973096, + "scr_dir1_threshold_20": 0.1707318964255635, + "scr_metric_threshold_20": 0.28739003291365367, + "scr_dir2_threshold_20": 0.28739003291365367, + "scr_dir1_threshold_50": -0.06504076859931053, + "scr_metric_threshold_50": 0.39296193168013455, + "scr_dir2_threshold_50": 0.39296193168013455, + "scr_dir1_threshold_100": 0.032520384299655265, + "scr_metric_threshold_100": 0.4516129088643129, + "scr_dir2_threshold_100": 0.4516129088643129, + "scr_dir1_threshold_500": -0.22764220550708036, + "scr_metric_threshold_500": 0.09677418227137422, + "scr_dir2_threshold_500": 0.09677418227137422 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results", + "scr_dir1_threshold_2": 0.021857964433295084, + "scr_metric_threshold_2": 0.3671876018633899, + "scr_dir2_threshold_2": 0.3671876018633899, + "scr_dir1_threshold_5": 0.00546440968122594, + "scr_metric_threshold_5": 0.43750011641530273, + "scr_dir2_threshold_5": 0.43750011641530273, + "scr_dir1_threshold_10": 0.04918033854781611, + "scr_metric_threshold_10": 0.5078126309672156, + "scr_dir2_threshold_10": 0.5078126309672156, + "scr_dir1_threshold_20": 0.1803277994391953, + "scr_metric_threshold_20": 0.5664063154836078, + "scr_dir2_threshold_20": 0.5664063154836078, + "scr_dir1_threshold_50": 0.1912569445100385, + "scr_metric_threshold_50": 0.6562500582076514, + "scr_dir2_threshold_50": 0.6562500582076514, + "scr_dir1_threshold_100": -0.1092894964580841, + "scr_metric_threshold_100": 0.63281239813661, + "scr_dir2_threshold_100": 0.63281239813661, + "scr_dir1_threshold_500": -0.09289626741440628, + "scr_metric_threshold_500": 0.7812500582076514, + "scr_dir2_threshold_500": 0.7812500582076514 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results", + "scr_dir1_threshold_2": 0.07692300638503319, + "scr_metric_threshold_2": 0.056451601273808806, + "scr_dir2_threshold_2": 0.056451601273808806, + "scr_dir1_threshold_5": 0.12307687134902433, + "scr_metric_threshold_5": 0.0927419335456348, + "scr_dir2_threshold_5": 0.0927419335456348, + "scr_dir1_threshold_10": 0.15384601277006638, + "scr_metric_threshold_10": 0.12500006008532116, + "scr_dir2_threshold_10": 0.12500006008532116, + "scr_dir1_threshold_20": 0.23076901915509954, + "scr_metric_threshold_20": 0.16935480382142643, + "scr_dir2_threshold_20": 0.16935480382142643, + "scr_dir1_threshold_50": 0.3333333333333333, + "scr_metric_threshold_50": 0.3064517214444511, + "scr_dir2_threshold_50": 0.3064517214444511, + "scr_dir1_threshold_100": 0.36410247475437535, + "scr_metric_threshold_100": 0.39112900318452204, + "scr_dir2_threshold_100": 0.39112900318452204, + "scr_dir1_threshold_500": 0.45641020468235766, + "scr_metric_threshold_500": 0.43145154118848766, + "scr_dir2_threshold_500": 0.43145154118848766 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results", + "scr_dir1_threshold_2": 0.045045001506225466, + "scr_metric_threshold_2": 0.214285581239654, + "scr_dir2_threshold_2": 0.214285581239654, + "scr_dir1_threshold_5": 0.10360361086007354, + "scr_metric_threshold_5": 0.31696414435284664, + "scr_dir2_threshold_5": 0.31696414435284664, + "scr_dir1_threshold_10": 0.14414416591355395, + "scr_metric_threshold_10": 0.4241072010647942, + "scr_dir2_threshold_10": 0.4241072010647942, + "scr_dir1_threshold_20": 0.21171166817289214, + "scr_metric_threshold_20": 0.47767846332864744, + "scr_dir2_threshold_20": 0.47767846332864744, + "scr_dir1_threshold_50": 0.38738749623443636, + "scr_metric_threshold_50": 0.5535713953099135, + "scr_dir2_threshold_50": 0.5535713953099135, + "scr_dir1_threshold_100": 0.15765750527178912, + "scr_metric_threshold_100": 0.6116071511725216, + "scr_dir2_threshold_100": 0.6116071511725216, + "scr_dir1_threshold_500": 0.4549549984937745, + "scr_metric_threshold_500": 0.6785713620483984, + "scr_dir2_threshold_500": 0.6785713620483984 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results", + "scr_dir1_threshold_2": 0.042918406627421406, + "scr_metric_threshold_2": 0.042918406627421406, + "scr_dir2_threshold_2": 0.05714296256601923, + "scr_dir1_threshold_5": 0.060085820441166386, + "scr_metric_threshold_5": 0.060085820441166386, + "scr_dir2_threshold_5": 0.10476195612190681, + "scr_dir1_threshold_10": 0.09012885856869063, + "scr_metric_threshold_10": 0.09012885856869063, + "scr_dir2_threshold_10": 0.10952379871117751, + "scr_dir1_threshold_20": 0.1416308441960435, + "scr_metric_threshold_20": 0.1416308441960435, + "scr_dir2_threshold_20": 0.1333332954891213, + "scr_dir1_threshold_50": 0.23175970276473412, + "scr_metric_threshold_50": 0.23175970276473412, + "scr_dir2_threshold_50": 0.1333332954891213, + "scr_dir1_threshold_100": 0.2489271165784791, + "scr_metric_threshold_100": 0.2489271165784791, + "scr_dir2_threshold_100": 0.2380952516110281, + "scr_dir1_threshold_500": 0.33905571933328765, + "scr_metric_threshold_500": 0.33905571933328765, + "scr_dir2_threshold_500": 0.24285709420029883 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file diff --git a/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..f817d2074cd61798bef112b48698a2344210732c --- /dev/null +++ b/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,323 @@ +{ + "eval_type_id": "scr", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "canrager/amazon_reviews_mcauley_1and5" + ], + "perform_scr": true, + "early_stopping_patience": 20, + "train_set_size": 4000, + "test_set_size": 1000, + "context_length": 128, + "probe_train_batch_size": 16, + "probe_test_batch_size": 500, + "probe_epochs": 20, + "probe_lr": 0.001, + "probe_l1_penalty": 0.001, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "lower_vram_usage": false, + "model_name": "gemma-2-2b", + "n_values": [ + 2, + 5, + 10, + 20, + 50, + 100, + 500 + ], + "column1_vals_lookup": { + "LabHC/bias_in_bios_class_set1": [ + [ + "professor", + "nurse" + ], + [ + "architect", + "journalist" + ], + [ + "surgeon", + "psychologist" + ], + [ + "attorney", + "teacher" + ] + ], + "canrager/amazon_reviews_mcauley_1and5": [ + [ + "Books", + "CDs_and_Vinyl" + ], + [ + "Software", + "Electronics" + ], + [ + "Pet_Supplies", + "Office_Products" + ], + [ + "Industrial_and_Scientific", + "Toys_and_Games" + ] + ] + } + }, + "eval_id": "e84e2919-45d8-4739-935c-c4c49e6f01bd", + "datetime_epoch_millis": 1740159333994, + "eval_result_metrics": { + "scr_metrics": { + "scr_dir1_threshold_2": 0.13902672439841735, + "scr_metric_threshold_2": 0.052679827874983993, + "scr_dir2_threshold_2": 0.05023245750763616, + "scr_dir1_threshold_5": 0.15256902306733214, + "scr_metric_threshold_5": 0.08124373333242407, + "scr_dir2_threshold_5": 0.08373451444399863, + "scr_dir1_threshold_10": 0.18434644922627724, + "scr_metric_threshold_10": 0.11066224908587924, + "scr_dir2_threshold_10": 0.11285158655682409, + "scr_dir1_threshold_20": 0.1730640453138616, + "scr_metric_threshold_20": 0.13485719572436072, + "scr_dir2_threshold_20": 0.13591482918415104, + "scr_dir1_threshold_50": 0.16112645779738913, + "scr_metric_threshold_50": 0.17958703268862003, + "scr_dir2_threshold_50": 0.18450987025234128, + "scr_dir1_threshold_100": 0.16878289626201945, + "scr_metric_threshold_100": 0.17936453816540665, + "scr_dir2_threshold_100": 0.1792240185413299, + "scr_dir1_threshold_500": 0.0930010293225678, + "scr_metric_threshold_500": 0.1838392653323654, + "scr_dir2_threshold_500": 0.18834314598313068 + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results", + "scr_dir1_threshold_2": 0.2857136099242737, + "scr_metric_threshold_2": 0.01474211549298745, + "scr_dir2_threshold_2": 0.01474211549298745, + "scr_dir1_threshold_5": 0.31746019731987085, + "scr_metric_threshold_5": 0.017199085925573582, + "scr_dir2_threshold_5": 0.017199085925573582, + "scr_dir1_threshold_10": 0.3492058386094512, + "scr_metric_threshold_10": 0.024570143672067307, + "scr_dir2_threshold_10": 0.024570143672067307, + "scr_dir1_threshold_20": 0.222222327345113, + "scr_metric_threshold_20": 0.036855142283733294, + "scr_dir2_threshold_20": 0.036855142283733294, + "scr_dir1_threshold_50": 0.23809514798990317, + "scr_metric_threshold_50": 0.05651105219315768, + "scr_dir2_threshold_50": 0.05651105219315768, + "scr_dir1_threshold_100": 0.23809514798990317, + "scr_metric_threshold_100": 0.06879605080482366, + "scr_dir2_threshold_100": 0.06879605080482366, + "scr_dir1_threshold_500": 0.15873009865993543, + "scr_metric_threshold_500": 0.13022119031188895, + "scr_dir2_threshold_500": 0.13022119031188895 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results", + "scr_dir1_threshold_2": 0.2121209384542766, + "scr_metric_threshold_2": 0.05099152246310042, + "scr_dir2_threshold_2": 0.05099152246310042, + "scr_dir1_threshold_5": 0.19191917367472955, + "scr_metric_threshold_5": 0.07648728369465063, + "scr_dir2_threshold_5": 0.07648728369465063, + "scr_dir1_threshold_10": 0.2323233053010818, + "scr_metric_threshold_10": 0.10764869459912667, + "scr_dir2_threshold_10": 0.10764869459912667, + "scr_dir1_threshold_20": 0.2828283193172076, + "scr_metric_threshold_20": 0.15014157370114636, + "scr_dir2_threshold_20": 0.15014157370114636, + "scr_dir1_threshold_50": 0.20202005606450307, + "scr_metric_threshold_50": 0.18130315345731438, + "scr_dir2_threshold_50": 0.18130315345731438, + "scr_dir1_threshold_100": 0.16161592443815084, + "scr_metric_threshold_100": 0.21246456436179043, + "scr_dir2_threshold_100": 0.21246456436179043, + "scr_dir1_threshold_500": -0.07070738086293096, + "scr_metric_threshold_500": 0.07932010853111354, + "scr_dir2_threshold_500": 0.07932010853111354 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results", + "scr_dir1_threshold_2": 0.258064174999303, + "scr_metric_threshold_2": 0.017676839894409477, + "scr_dir2_threshold_2": 0.017676839894409477, + "scr_dir1_threshold_5": 0.29032183636211567, + "scr_metric_threshold_5": 0.03535352927204973, + "scr_dir2_threshold_5": 0.03535352927204973, + "scr_dir1_threshold_10": 0.33870928977193526, + "scr_metric_threshold_10": 0.06565661886956488, + "scr_dir2_threshold_10": 0.06565661886956488, + "scr_dir1_threshold_20": 0.37096695113474787, + "scr_metric_threshold_20": 0.06565661886956488, + "scr_dir2_threshold_20": 0.06565661886956488, + "scr_dir1_threshold_50": 0.20967672158948342, + "scr_metric_threshold_50": 0.08838389843850893, + "scr_dir2_threshold_50": 0.08838389843850893, + "scr_dir1_threshold_100": 0.22580651363649037, + "scr_metric_threshold_100": 0.0959595579503108, + "scr_dir2_threshold_100": 0.0959595579503108, + "scr_dir1_threshold_500": 0.11290277613544487, + "scr_metric_threshold_500": 0.04545455913788811, + "scr_dir2_threshold_500": 0.04545455913788811 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results", + "scr_dir1_threshold_2": 0.13821151212590824, + "scr_metric_threshold_2": 0.052785774589540695, + "scr_dir2_threshold_2": 0.052785774589540695, + "scr_dir1_threshold_5": 0.1869918462801379, + "scr_metric_threshold_5": 0.0791787492811609, + "scr_dir2_threshold_5": 0.0791787492811609, + "scr_dir1_threshold_10": 0.21951223057979316, + "scr_metric_threshold_10": 0.1114369265674188, + "scr_dir2_threshold_10": 0.1114369265674188, + "scr_dir1_threshold_20": 0.23577218043436754, + "scr_metric_threshold_20": 0.13782990125903902, + "scr_dir2_threshold_20": 0.13782990125903902, + "scr_dir1_threshold_50": 0.23577218043436754, + "scr_metric_threshold_50": 0.17008790375159719, + "scr_dir2_threshold_50": 0.17008790375159719, + "scr_dir1_threshold_100": 0.21951223057979316, + "scr_metric_threshold_100": 0.052785774589540695, + "scr_dir2_threshold_100": 0.052785774589540695, + "scr_dir1_threshold_500": 0.11382110275354018, + "scr_metric_threshold_500": 0.10850441266694984, + "scr_dir2_threshold_500": 0.10850441266694984 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results", + "scr_dir1_threshold_2": 0.00546440968122594, + "scr_metric_threshold_2": 0.10937497089617432, + "scr_dir2_threshold_2": 0.10937497089617432, + "scr_dir1_threshold_5": 0.021857964433295084, + "scr_metric_threshold_5": 0.21093754365573852, + "scr_dir2_threshold_5": 0.21093754365573852, + "scr_dir1_threshold_10": 0.05464474822904205, + "scr_metric_threshold_10": 0.28125005820765137, + "scr_dir2_threshold_10": 0.28125005820765137, + "scr_dir1_threshold_20": -0.06010915791026799, + "scr_metric_threshold_20": 0.3320313445874335, + "scr_dir2_threshold_20": 0.3320313445874335, + "scr_dir1_threshold_50": -0.04918033854781611, + "scr_metric_threshold_50": 0.3984374272404358, + "scr_dir2_threshold_50": 0.3984374272404358, + "scr_dir1_threshold_100": -0.016393554752069144, + "scr_metric_threshold_100": 0.42578128637978213, + "scr_dir2_threshold_100": 0.42578128637978213, + "scr_dir1_threshold_500": -0.03825151918536423, + "scr_metric_threshold_500": 0.48828116996447934, + "scr_dir2_threshold_500": 0.48828116996447934 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results", + "scr_dir1_threshold_2": 0.09743586922067994, + "scr_metric_threshold_2": 0.05241939554166917, + "scr_dir2_threshold_2": 0.05241939554166917, + "scr_dir1_threshold_5": 0.09743586922067994, + "scr_metric_threshold_5": 0.07661287027579163, + "scr_dir2_threshold_5": 0.07661287027579163, + "scr_dir1_threshold_10": 0.11282028709877284, + "scr_metric_threshold_10": 0.09677413927777444, + "scr_dir2_threshold_10": 0.09677413927777444, + "scr_dir1_threshold_20": 0.13846128922711723, + "scr_metric_threshold_20": 0.14516132908730398, + "scr_dir2_threshold_20": 0.14516132908730398, + "scr_dir1_threshold_50": 0.1692307363130155, + "scr_metric_threshold_50": 0.23790326263293876, + "scr_dir2_threshold_50": 0.23790326263293876, + "scr_dir1_threshold_100": 0.18974359914866223, + "scr_metric_threshold_100": 0.2580645316349216, + "scr_dir2_threshold_100": 0.2580645316349216, + "scr_dir1_threshold_500": 0.14871787347736873, + "scr_metric_threshold_500": 0.3064517214444511, + "scr_dir2_threshold_500": 0.3064517214444511 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results", + "scr_dir1_threshold_2": 0.07657666365421574, + "scr_metric_threshold_2": 0.08482138699453473, + "scr_dir2_threshold_2": 0.08482138699453473, + "scr_dir1_threshold_5": 0.06306305580659313, + "scr_metric_threshold_5": 0.10267856311319266, + "scr_dir2_threshold_5": 0.10267856311319266, + "scr_dir1_threshold_10": 0.09459444946519599, + "scr_metric_threshold_10": 0.12499996673848493, + "scr_dir2_threshold_10": 0.12499996673848493, + "scr_dir1_threshold_20": 0.11711721870769615, + "scr_metric_threshold_20": 0.13392842175175362, + "scr_dir2_threshold_20": 0.13392842175175362, + "scr_dir1_threshold_50": 0.18468472096703434, + "scr_metric_threshold_50": 0.20535712622638533, + "scr_dir2_threshold_50": 0.20535712622638533, + "scr_dir1_threshold_100": 0.21171166817289214, + "scr_metric_threshold_100": 0.20089289871975097, + "scr_dir2_threshold_100": 0.20089289871975097, + "scr_dir1_threshold_500": 0.20720722172014708, + "scr_metric_threshold_500": 0.20089289871975097, + "scr_dir2_threshold_500": 0.20089289871975097 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results", + "scr_dir1_threshold_2": 0.03862661712745569, + "scr_metric_threshold_2": 0.03862661712745569, + "scr_dir2_threshold_2": 0.019047654188673074, + "scr_dir1_threshold_5": 0.05150224144123495, + "scr_metric_threshold_5": 0.05150224144123495, + "scr_dir2_threshold_5": 0.07142849033383136, + "scr_dir1_threshold_10": 0.07296144475494565, + "scr_metric_threshold_10": 0.07296144475494565, + "scr_dir2_threshold_10": 0.09047614452250444, + "scr_dir1_threshold_20": 0.07725323425491137, + "scr_metric_threshold_20": 0.07725323425491137, + "scr_dir2_threshold_20": 0.08571430193323373, + "scr_dir1_threshold_50": 0.09871243756862208, + "scr_metric_threshold_50": 0.09871243756862208, + "scr_dir2_threshold_50": 0.13809513807839202, + "scr_dir1_threshold_100": 0.12017164088233277, + "scr_metric_threshold_100": 0.12017164088233277, + "scr_dir2_threshold_100": 0.11904748388971893, + "scr_dir1_threshold_500": 0.11158806188240133, + "scr_metric_threshold_500": 0.11158806188240133, + "scr_dir2_threshold_500": 0.14761910708852366 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file diff --git a/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..09a77d0a838df374b21c4e2a606cb665bdd9010b --- /dev/null +++ b/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,323 @@ +{ + "eval_type_id": "scr", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "canrager/amazon_reviews_mcauley_1and5" + ], + "perform_scr": true, + "early_stopping_patience": 20, + "train_set_size": 4000, + "test_set_size": 1000, + "context_length": 128, + "probe_train_batch_size": 16, + "probe_test_batch_size": 500, + "probe_epochs": 20, + "probe_lr": 0.001, + "probe_l1_penalty": 0.001, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "lower_vram_usage": false, + "model_name": "gemma-2-2b", + "n_values": [ + 2, + 5, + 10, + 20, + 50, + 100, + 500 + ], + "column1_vals_lookup": { + "LabHC/bias_in_bios_class_set1": [ + [ + "professor", + "nurse" + ], + [ + "architect", + "journalist" + ], + [ + "surgeon", + "psychologist" + ], + [ + "attorney", + "teacher" + ] + ], + "canrager/amazon_reviews_mcauley_1and5": [ + [ + "Books", + "CDs_and_Vinyl" + ], + [ + "Software", + "Electronics" + ], + [ + "Pet_Supplies", + "Office_Products" + ], + [ + "Industrial_and_Scientific", + "Toys_and_Games" + ] + ] + } + }, + "eval_id": "744efb4f-9f9c-4a65-bd36-c9b19a4ffd71", + "datetime_epoch_millis": 1740160219317, + "eval_result_metrics": { + "scr_metrics": { + "scr_dir1_threshold_2": 0.1334585580077991, + "scr_metric_threshold_2": 0.07676722148137892, + "scr_dir2_threshold_2": 0.07675956452377836, + "scr_dir1_threshold_5": 0.1798632781490608, + "scr_metric_threshold_5": 0.12875666458236273, + "scr_dir2_threshold_5": 0.13481889859378787, + "scr_dir1_threshold_10": 0.18356951723363868, + "scr_metric_threshold_10": 0.18907230490584973, + "scr_dir2_threshold_10": 0.1949506085489714, + "scr_dir1_threshold_20": 0.17099327770246797, + "scr_metric_threshold_20": 0.237307483183272, + "scr_dir2_threshold_20": 0.24175260369566065, + "scr_dir1_threshold_50": 0.07798598324296178, + "scr_metric_threshold_50": 0.2928094354498369, + "scr_dir2_threshold_50": 0.30373578236555543, + "scr_dir1_threshold_100": 0.0037336381161294126, + "scr_metric_threshold_100": 0.2911327859028171, + "scr_dir2_threshold_100": 0.3050353199157786, + "scr_dir1_threshold_500": -0.09587675427885789, + "scr_metric_threshold_500": 0.2517752817013341, + "scr_dir2_threshold_500": 0.2716966035025452 + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results", + "scr_dir1_threshold_2": 0.333333017964661, + "scr_metric_threshold_2": 0.009828028179079856, + "scr_dir2_threshold_2": 0.009828028179079856, + "scr_dir1_threshold_5": 0.3809524260050484, + "scr_metric_threshold_5": 0.019656056358159712, + "scr_dir2_threshold_5": 0.019656056358159712, + "scr_dir1_threshold_10": 0.3650786592542414, + "scr_metric_threshold_10": 0.05651105219315768, + "scr_dir2_threshold_10": 0.05651105219315768, + "scr_dir1_threshold_20": 0.31746019731987085, + "scr_metric_threshold_20": 0.09336619447689097, + "scr_dir2_threshold_20": 0.09336619447689097, + "scr_dir1_threshold_50": 0.1746029193047256, + "scr_metric_threshold_50": 0.12285013256539522, + "scr_dir2_threshold_50": 0.12285013256539522, + "scr_dir1_threshold_100": 0.20634856059430595, + "scr_metric_threshold_100": 0.1474201297887272, + "scr_dir2_threshold_100": 0.1474201297887272, + "scr_dir1_threshold_500": 0.14285633190912841, + "scr_metric_threshold_500": 0.05896816907447914, + "scr_dir2_threshold_500": 0.05896816907447914 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results", + "scr_dir1_threshold_2": 0.08080766118544634, + "scr_metric_threshold_2": 0.11048151943558958, + "scr_dir2_threshold_2": 0.11048151943558958, + "scr_dir1_threshold_5": 0.16161592443815084, + "scr_metric_threshold_5": 0.1671388604233078, + "scr_dir2_threshold_5": 0.1671388604233078, + "scr_dir1_threshold_10": 0.12121179281179859, + "scr_metric_threshold_10": 0.24079315042980354, + "scr_dir2_threshold_10": 0.24079315042980354, + "scr_dir1_threshold_20": 0.12121179281179859, + "scr_metric_threshold_20": 0.2861190232199781, + "scr_dir2_threshold_20": 0.2861190232199781, + "scr_dir1_threshold_50": -0.2828283193172076, + "scr_metric_threshold_50": 0.3456090201924673, + "scr_dir2_threshold_50": 0.3456090201924673, + "scr_dir1_threshold_100": -0.31313156855378627, + "scr_metric_threshold_100": 0.20679891468886458, + "scr_dir2_threshold_100": 0.20679891468886458, + "scr_dir1_threshold_500": -0.7373740475295977, + "scr_metric_threshold_500": 0.28895184805644103, + "scr_dir2_threshold_500": 0.28895184805644103 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results", + "scr_dir1_threshold_2": 0.37096695113474787, + "scr_metric_threshold_2": 0.010101029865838383, + "scr_dir2_threshold_2": 0.010101029865838383, + "scr_dir1_threshold_5": 0.41935440454456746, + "scr_metric_threshold_5": 0.042929339300620824, + "scr_dir2_threshold_5": 0.042929339300620824, + "scr_dir1_threshold_10": 0.45161302727298075, + "scr_metric_threshold_10": 0.09343433811304351, + "scr_dir2_threshold_10": 0.09343433811304351, + "scr_dir1_threshold_20": 0.37096695113474787, + "scr_metric_threshold_20": 0.11868683751925485, + "scr_dir2_threshold_20": 0.11868683751925485, + "scr_dir1_threshold_50": 0.16129022954526445, + "scr_metric_threshold_50": 0.16919198684844677, + "scr_dir2_threshold_50": 0.16919198684844677, + "scr_dir1_threshold_100": 0.09677394545403856, + "scr_metric_threshold_100": 0.2045455161204965, + "scr_dir2_threshold_100": 0.2045455161204965, + "scr_dir1_threshold_500": -0.016129792047006934, + "scr_metric_threshold_500": 0.08838389843850893, + "scr_dir2_threshold_500": 0.08838389843850893 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results", + "scr_dir1_threshold_2": 0.13008153719862106, + "scr_metric_threshold_2": 0.1671553898511282, + "scr_dir2_threshold_2": 0.1671553898511282, + "scr_dir1_threshold_5": 0.22764220550708036, + "scr_metric_threshold_5": 0.23167156962994426, + "scr_dir2_threshold_5": 0.23167156962994426, + "scr_dir1_threshold_10": 0.13821151212590824, + "scr_metric_threshold_10": 0.27565980251807803, + "scr_dir2_threshold_10": 0.27565980251807803, + "scr_dir1_threshold_20": 0.13821151212590824, + "scr_metric_threshold_20": 0.34604101009783206, + "scr_dir2_threshold_20": 0.34604101009783206, + "scr_dir1_threshold_50": 0.08130071845388491, + "scr_metric_threshold_50": 0.38416421518502786, + "scr_dir2_threshold_50": 0.38416421518502786, + "scr_dir1_threshold_100": -0.5365851294680456, + "scr_metric_threshold_100": 0.4105571898766481, + "scr_dir2_threshold_100": 0.4105571898766481, + "scr_dir1_threshold_500": -0.658536207148873, + "scr_metric_threshold_500": 0.1964808784432174, + "scr_dir2_threshold_500": 0.1964808784432174 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results", + "scr_dir1_threshold_2": 0.03825119347697291, + "scr_metric_threshold_2": 0.10156257275956422, + "scr_dir2_threshold_2": 0.10156257275956422, + "scr_dir1_threshold_5": 0.04371592886659017, + "scr_metric_threshold_5": 0.22265637369125918, + "scr_dir2_threshold_5": 0.22265637369125918, + "scr_dir1_threshold_10": 0.06557389329988525, + "scr_metric_threshold_10": 0.3320313445874335, + "scr_dir2_threshold_10": 0.3320313445874335, + "scr_dir1_threshold_20": 0.01092881936245188, + "scr_metric_threshold_20": 0.42578128637978213, + "scr_dir2_threshold_20": 0.42578128637978213, + "scr_dir1_threshold_50": -0.00546440968122594, + "scr_metric_threshold_50": 0.542968888243172, + "scr_dir2_threshold_50": 0.542968888243172, + "scr_dir1_threshold_100": 0.01092881936245188, + "scr_metric_threshold_100": 0.5859375436557386, + "scr_dir2_threshold_100": 0.5859375436557386, + "scr_dir1_threshold_500": -0.03825151918536423, + "scr_metric_threshold_500": 0.578124912688523, + "scr_dir2_threshold_500": 0.578124912688523 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results", + "scr_dir1_threshold_2": 0.03076914142104203, + "scr_metric_threshold_2": 0.060483807005948444, + "scr_dir2_threshold_2": 0.060483807005948444, + "scr_dir1_threshold_5": 0.08205114567773082, + "scr_metric_threshold_5": 0.08467752208135552, + "scr_dir2_threshold_5": 0.08467752208135552, + "scr_dir1_threshold_10": 0.12820501064172196, + "scr_metric_threshold_10": 0.1290322658174608, + "scr_dir2_threshold_10": 0.1290322658174608, + "scr_dir1_threshold_20": 0.14871787347736873, + "scr_metric_threshold_20": 0.17741945562699032, + "scr_dir2_threshold_20": 0.17741945562699032, + "scr_dir1_threshold_50": 0.09230742426312609, + "scr_metric_threshold_50": 0.2701613891726251, + "scr_dir2_threshold_50": 0.2701613891726251, + "scr_dir1_threshold_100": 0.15384601277006638, + "scr_metric_threshold_100": 0.3064517214444511, + "scr_dir2_threshold_100": 0.3064517214444511, + "scr_dir1_threshold_500": 0.15384601277006638, + "scr_metric_threshold_500": 0.3790323859881031, + "scr_dir2_threshold_500": 0.3790323859881031 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results", + "scr_dir1_threshold_2": 0.0405405550534804, + "scr_metric_threshold_2": 0.11160701812646134, + "scr_dir2_threshold_2": 0.11160701812646134, + "scr_dir1_threshold_5": 0.07207194871208326, + "scr_metric_threshold_5": 0.20982135373301966, + "scr_dir2_threshold_5": 0.20982135373301966, + "scr_dir1_threshold_10": 0.11711721870769615, + "scr_metric_threshold_10": 0.30357146183294365, + "scr_dir2_threshold_10": 0.30357146183294365, + "scr_dir1_threshold_20": 0.15315305881904406, + "scr_metric_threshold_20": 0.3437500415768938, + "scr_dir2_threshold_20": 0.3437500415768938, + "scr_dir1_threshold_50": 0.2567566696791176, + "scr_metric_threshold_50": 0.36160721769555176, + "scr_dir2_threshold_50": 0.36160721769555176, + "scr_dir1_threshold_100": 0.26576583107399515, + "scr_metric_threshold_100": 0.3214286379516016, + "scr_dir2_threshold_100": 0.3214286379516016, + "scr_dir1_threshold_500": 0.27927917043223033, + "scr_metric_threshold_500": 0.31696414435284664, + "scr_dir2_threshold_500": 0.31696414435284664 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results", + "scr_dir1_threshold_2": 0.042918406627421406, + "scr_metric_threshold_2": 0.042918406627421406, + "scr_dir2_threshold_2": 0.042857150966616867, + "scr_dir1_threshold_5": 0.05150224144123495, + "scr_metric_threshold_5": 0.05150224144123495, + "scr_dir2_threshold_5": 0.10000011353263609, + "scr_dir1_threshold_10": 0.08154502375487709, + "scr_metric_threshold_10": 0.08154502375487709, + "scr_dir2_threshold_10": 0.12857145289985059, + "scr_dir1_threshold_20": 0.10729601656855352, + "scr_metric_threshold_20": 0.10729601656855352, + "scr_dir2_threshold_20": 0.14285698066766273, + "scr_dir1_threshold_50": 0.1459226336960092, + "scr_metric_threshold_50": 0.1459226336960092, + "scr_dir2_threshold_50": 0.2333334090217574, + "scr_dir1_threshold_100": 0.1459226336960092, + "scr_metric_threshold_100": 0.1459226336960092, + "scr_dir2_threshold_100": 0.25714290579970117, + "scr_dir1_threshold_500": 0.10729601656855352, + "scr_metric_threshold_500": 0.10729601656855352, + "scr_dir2_threshold_500": 0.2666665909782426 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file diff --git a/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..cdc3eef4e26814bb9c5b28bf1be77f1faef7f8f7 --- /dev/null +++ b/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,323 @@ +{ + "eval_type_id": "scr", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "canrager/amazon_reviews_mcauley_1and5" + ], + "perform_scr": true, + "early_stopping_patience": 20, + "train_set_size": 4000, + "test_set_size": 1000, + "context_length": 128, + "probe_train_batch_size": 16, + "probe_test_batch_size": 500, + "probe_epochs": 20, + "probe_lr": 0.001, + "probe_l1_penalty": 0.001, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "lower_vram_usage": false, + "model_name": "gemma-2-2b", + "n_values": [ + 2, + 5, + 10, + 20, + 50, + 100, + 500 + ], + "column1_vals_lookup": { + "LabHC/bias_in_bios_class_set1": [ + [ + "professor", + "nurse" + ], + [ + "architect", + "journalist" + ], + [ + "surgeon", + "psychologist" + ], + [ + "attorney", + "teacher" + ] + ], + "canrager/amazon_reviews_mcauley_1and5": [ + [ + "Books", + "CDs_and_Vinyl" + ], + [ + "Software", + "Electronics" + ], + [ + "Pet_Supplies", + "Office_Products" + ], + [ + "Industrial_and_Scientific", + "Toys_and_Games" + ] + ] + } + }, + "eval_id": "4945c79f-5ec8-4cf1-817b-560bfcc8ec29", + "datetime_epoch_millis": 1740161984895, + "eval_result_metrics": { + "scr_metrics": { + "scr_dir1_threshold_2": 0.16161854897121994, + "scr_metric_threshold_2": 0.06784648898938839, + "scr_dir2_threshold_2": 0.0659943489456994, + "scr_dir1_threshold_5": 0.20247854428196788, + "scr_metric_threshold_5": 0.09403415996971003, + "scr_dir2_threshold_5": 0.0946217333357683, + "scr_dir1_threshold_10": 0.1762122452594689, + "scr_metric_threshold_10": 0.13623078368990285, + "scr_dir2_threshold_10": 0.14062480452372897, + "scr_dir1_threshold_20": 0.17380918952629806, + "scr_metric_threshold_20": 0.1723823540256287, + "scr_dir2_threshold_20": 0.17415276656035286, + "scr_dir1_threshold_50": 0.1849406260352424, + "scr_metric_threshold_50": 0.20896025265643603, + "scr_dir2_threshold_50": 0.21573523026384628, + "scr_dir1_threshold_100": 0.14974715894436086, + "scr_metric_threshold_100": 0.18597532300507813, + "scr_dir2_threshold_100": 0.1965567480802562, + "scr_dir1_threshold_500": 0.0778989845782214, + "scr_metric_threshold_500": 0.21980959861610377, + "scr_dir2_threshold_500": 0.23241178018807312 + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results", + "scr_dir1_threshold_2": 0.3650786592542414, + "scr_metric_threshold_2": -0.0024569704325861324, + "scr_dir2_threshold_2": -0.0024569704325861324, + "scr_dir1_threshold_5": 0.3968252466498386, + "scr_metric_threshold_5": 0.009828028179079856, + "scr_dir2_threshold_5": 0.009828028179079856, + "scr_dir1_threshold_10": 0.3809524260050484, + "scr_metric_threshold_10": 0.01474211549298745, + "scr_dir2_threshold_10": 0.01474211549298745, + "scr_dir1_threshold_20": 0.444444654690226, + "scr_metric_threshold_20": 0.027027114104653437, + "scr_dir2_threshold_20": 0.027027114104653437, + "scr_dir1_threshold_50": 0.3015873766750807, + "scr_metric_threshold_50": 0.05896816907447914, + "scr_dir2_threshold_50": 0.05896816907447914, + "scr_dir1_threshold_100": 0.2857136099242737, + "scr_metric_threshold_100": 0.1031940762072355, + "scr_dir2_threshold_100": 0.1031940762072355, + "scr_dir1_threshold_500": 0.19047573994951578, + "scr_metric_threshold_500": 0.16216224528171463, + "scr_dir2_threshold_500": 0.16216224528171463 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results", + "scr_dir1_threshold_2": 0.15151504204837732, + "scr_metric_threshold_2": 0.1161473379602074, + "scr_dir2_threshold_2": 0.1161473379602074, + "scr_dir1_threshold_5": 0.22222182084405012, + "scr_metric_threshold_5": 0.1359772806671398, + "scr_dir2_threshold_5": 0.1359772806671398, + "scr_dir1_threshold_10": 0.17171680682792437, + "scr_metric_threshold_10": 0.18413597829377729, + "scr_dir2_threshold_10": 0.18413597829377729, + "scr_dir1_threshold_20": 0.15151504204837732, + "scr_metric_threshold_20": 0.18980162796670313, + "scr_dir2_threshold_20": 0.18980162796670313, + "scr_dir1_threshold_50": 0.1818176892176979, + "scr_metric_threshold_50": 0.2549574434638101, + "scr_dir2_threshold_50": 0.2549574434638101, + "scr_dir1_threshold_100": 0.03030264716932058, + "scr_metric_threshold_100": 0.10764869459912667, + "scr_dir2_threshold_100": 0.10764869459912667, + "scr_dir1_threshold_500": -0.09090914564247801, + "scr_metric_threshold_500": 0.1161473379602074, + "scr_dir2_threshold_500": 0.1161473379602074 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results", + "scr_dir1_threshold_2": 0.41935440454456746, + "scr_metric_threshold_2": 0.020202059731676766, + "scr_dir2_threshold_2": 0.020202059731676766, + "scr_dir1_threshold_5": 0.45161302727298075, + "scr_metric_threshold_5": 0.03535352927204973, + "scr_dir2_threshold_5": 0.03535352927204973, + "scr_dir1_threshold_10": 0.33870928977193526, + "scr_metric_threshold_10": 0.04040411946335353, + "scr_dir2_threshold_10": 0.04040411946335353, + "scr_dir1_threshold_20": 0.2741930056807093, + "scr_metric_threshold_20": 0.08333330824720513, + "scr_dir2_threshold_20": 0.08333330824720513, + "scr_dir1_threshold_50": 0.2741930056807093, + "scr_metric_threshold_50": 0.11616161768198757, + "scr_dir2_threshold_50": 0.11616161768198757, + "scr_dir1_threshold_100": 0.3064516284091226, + "scr_metric_threshold_100": 0.13383845757639704, + "scr_dir2_threshold_100": 0.13383845757639704, + "scr_dir1_threshold_500": 0.04838649204421897, + "scr_metric_threshold_500": 0.06818183870683217, + "scr_dir2_threshold_500": 0.06818183870683217 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results", + "scr_dir1_threshold_2": 0.15447146198048264, + "scr_metric_threshold_2": 0.0557184632837094, + "scr_dir2_threshold_2": 0.0557184632837094, + "scr_dir1_threshold_5": 0.24390263995216122, + "scr_metric_threshold_5": 0.11730195436835675, + "scr_dir2_threshold_5": 0.11730195436835675, + "scr_dir1_threshold_10": 0.12195107768082737, + "scr_metric_threshold_10": 0.20527859493832404, + "scr_dir2_threshold_10": 0.20527859493832404, + "scr_dir1_threshold_20": 0.0894311779716786, + "scr_metric_threshold_20": 0.2434018000255199, + "scr_dir2_threshold_20": 0.2434018000255199, + "scr_dir1_threshold_50": 0.24390263995216122, + "scr_metric_threshold_50": 0.28445751901318467, + "scr_dir2_threshold_50": 0.28445751901318467, + "scr_dir1_threshold_100": -0.06504076859931053, + "scr_metric_threshold_100": 0.09384166837090524, + "scr_dir2_threshold_100": 0.09384166837090524, + "scr_dir1_threshold_500": -0.008129974927287195, + "scr_metric_threshold_500": 0.11730195436835675, + "scr_dir2_threshold_500": 0.11730195436835675 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results", + "scr_dir1_threshold_2": 0.03278678379574697, + "scr_metric_threshold_2": 0.12890643189891055, + "scr_dir2_threshold_2": 0.12890643189891055, + "scr_dir1_threshold_5": 0.04918033854781611, + "scr_metric_threshold_5": 0.16406245634426148, + "scr_dir2_threshold_5": 0.16406245634426148, + "scr_dir1_threshold_10": 0.016393554752069144, + "scr_metric_threshold_10": 0.2695312281721307, + "scr_dir2_threshold_10": 0.2695312281721307, + "scr_dir1_threshold_20": 0.03278678379574697, + "scr_metric_threshold_20": 0.36328140279508486, + "scr_dir2_threshold_20": 0.36328140279508486, + "scr_dir1_threshold_50": 0.00546440968122594, + "scr_metric_threshold_50": 0.41015625727595645, + "scr_dir2_threshold_50": 0.41015625727595645, + "scr_dir1_threshold_100": 0.06010915791026799, + "scr_metric_threshold_100": 0.4609375436557385, + "scr_dir2_threshold_100": 0.4609375436557385, + "scr_dir1_threshold_500": 0.05464474822904205, + "scr_metric_threshold_500": 0.5195312281721307, + "scr_dir2_threshold_500": 0.5195312281721307 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results", + "scr_dir1_threshold_2": 0.0410254200064373, + "scr_metric_threshold_2": 0.1008065853511987, + "scr_dir2_threshold_2": 0.1008065853511987, + "scr_dir1_threshold_5": 0.12307687134902433, + "scr_metric_threshold_5": 0.11290320254761761, + "scr_dir2_threshold_5": 0.11290320254761761, + "scr_dir1_threshold_10": 0.14871787347736873, + "scr_metric_threshold_10": 0.1370969176230247, + "scr_dir2_threshold_10": 0.1370969176230247, + "scr_dir1_threshold_20": 0.12307687134902433, + "scr_metric_threshold_20": 0.1854838670912696, + "scr_dir2_threshold_20": 0.1854838670912696, + "scr_dir1_threshold_50": 0.1692307363130155, + "scr_metric_threshold_50": 0.2056451360932524, + "scr_dir2_threshold_50": 0.2056451360932524, + "scr_dir1_threshold_100": 0.17435887560571312, + "scr_metric_threshold_100": 0.22983885116865949, + "scr_dir2_threshold_100": 0.22983885116865949, + "scr_dir1_threshold_500": 0.08205114567773082, + "scr_metric_threshold_500": 0.35483867091269605, + "scr_dir2_threshold_500": 0.35483867091269605 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results", + "scr_dir1_threshold_2": 0.09009000301245093, + "scr_metric_threshold_2": 0.08482138699453473, + "scr_dir2_threshold_2": 0.08482138699453473, + "scr_dir1_threshold_5": 0.09009000301245093, + "scr_metric_threshold_5": 0.13392842175175362, + "scr_dir2_threshold_5": 0.13392842175175362, + "scr_dir1_threshold_10": 0.17117111311941172, + "scr_metric_threshold_10": 0.17857149509445872, + "scr_dir2_threshold_10": 0.17857149509445872, + "scr_dir1_threshold_20": 0.1891891674197794, + "scr_metric_threshold_20": 0.20089289871975097, + "scr_dir2_threshold_20": 0.20089289871975097, + "scr_dir1_threshold_50": 0.2432433303208824, + "scr_metric_threshold_50": 0.28125005820765137, + "scr_dir2_threshold_50": 0.28125005820765137, + "scr_dir1_threshold_100": 0.32882888688058826, + "scr_metric_threshold_100": 0.28125005820765137, + "scr_dir2_threshold_100": 0.28125005820765137, + "scr_dir1_threshold_500": 0.25225222322637253, + "scr_metric_threshold_500": 0.3258928654582359, + "scr_dir2_threshold_500": 0.3258928654582359 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results", + "scr_dir1_threshold_2": 0.03862661712745569, + "scr_metric_threshold_2": 0.03862661712745569, + "scr_dir2_threshold_2": 0.02380949677794379, + "scr_dir1_threshold_5": 0.042918406627421406, + "scr_metric_threshold_5": 0.042918406627421406, + "scr_dir2_threshold_5": 0.04761899355588758, + "scr_dir1_threshold_10": 0.060085820441166386, + "scr_metric_threshold_10": 0.060085820441166386, + "scr_dir2_threshold_10": 0.09523798711177515, + "scr_dir1_threshold_20": 0.08583681325484281, + "scr_metric_threshold_20": 0.08583681325484281, + "scr_dir2_threshold_20": 0.10000011353263609, + "scr_dir1_threshold_50": 0.060085820441166386, + "scr_metric_threshold_50": 0.060085820441166386, + "scr_dir2_threshold_50": 0.11428564130044823, + "scr_dir1_threshold_100": 0.07725323425491137, + "scr_metric_threshold_100": 0.07725323425491137, + "scr_dir2_threshold_100": 0.1619046348563358, + "scr_dir1_threshold_500": 0.09442064806865635, + "scr_metric_threshold_500": 0.09442064806865635, + "scr_dir2_threshold_500": 0.19523810064441124 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file diff --git a/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..679d2c11c36f2d8ffcf5e5cf74734c9a58bd19ca --- /dev/null +++ b/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,323 @@ +{ + "eval_type_id": "scr", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "canrager/amazon_reviews_mcauley_1and5" + ], + "perform_scr": true, + "early_stopping_patience": 20, + "train_set_size": 4000, + "test_set_size": 1000, + "context_length": 128, + "probe_train_batch_size": 16, + "probe_test_batch_size": 500, + "probe_epochs": 20, + "probe_lr": 0.001, + "probe_l1_penalty": 0.001, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "lower_vram_usage": false, + "model_name": "gemma-2-2b", + "n_values": [ + 2, + 5, + 10, + 20, + 50, + 100, + 500 + ], + "column1_vals_lookup": { + "LabHC/bias_in_bios_class_set1": [ + [ + "professor", + "nurse" + ], + [ + "architect", + "journalist" + ], + [ + "surgeon", + "psychologist" + ], + [ + "attorney", + "teacher" + ] + ], + "canrager/amazon_reviews_mcauley_1and5": [ + [ + "Books", + "CDs_and_Vinyl" + ], + [ + "Software", + "Electronics" + ], + [ + "Pet_Supplies", + "Office_Products" + ], + [ + "Industrial_and_Scientific", + "Toys_and_Games" + ] + ] + } + }, + "eval_id": "b9f069ac-8088-45a3-9274-9b1e86e4771e", + "datetime_epoch_millis": 1740161543241, + "eval_result_metrics": { + "scr_metrics": { + "scr_dir1_threshold_2": 0.18865540875749043, + "scr_metric_threshold_2": 0.073516336917133, + "scr_dir2_threshold_2": 0.07225177023950229, + "scr_dir1_threshold_5": 0.23303036096673593, + "scr_metric_threshold_5": 0.1302813962622398, + "scr_dir2_threshold_5": 0.1305598690300678, + "scr_dir1_threshold_10": 0.23417802766807275, + "scr_metric_threshold_10": 0.1620880555443666, + "scr_dir2_threshold_10": 0.16742988549999255, + "scr_dir1_threshold_20": 0.2599068683780712, + "scr_metric_threshold_20": 0.21384163954000984, + "scr_dir2_threshold_20": 0.22096147153227652, + "scr_dir1_threshold_50": 0.1619087251521188, + "scr_metric_threshold_50": 0.2891392857052675, + "scr_dir2_threshold_50": 0.29613397944434255, + "scr_dir1_threshold_100": 0.08543218428340123, + "scr_metric_threshold_100": 0.31203469249820925, + "scr_dir2_threshold_100": 0.3178389255899667, + "scr_dir1_threshold_500": 0.07042284538823904, + "scr_metric_threshold_500": 0.2821800078466572, + "scr_dir2_threshold_500": 0.2820164853730435 + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results", + "scr_dir1_threshold_2": 0.3809524260050484, + "scr_metric_threshold_2": 0.012284998611665989, + "scr_dir2_threshold_2": 0.012284998611665989, + "scr_dir1_threshold_5": 0.41269806729462877, + "scr_metric_threshold_5": 0.019656056358159712, + "scr_dir2_threshold_5": 0.019656056358159712, + "scr_dir1_threshold_10": 0.41269806729462877, + "scr_metric_threshold_10": 0.061425139507065275, + "scr_dir2_threshold_10": 0.061425139507065275, + "scr_dir1_threshold_20": 0.444444654690226, + "scr_metric_threshold_20": 0.09336619447689097, + "scr_dir2_threshold_20": 0.09336619447689097, + "scr_dir1_threshold_50": 0.3809524260050484, + "scr_metric_threshold_50": 0.12039316213280908, + "scr_dir2_threshold_50": 0.12039316213280908, + "scr_dir1_threshold_100": 0.3650786592542414, + "scr_metric_threshold_100": 0.13759224805838266, + "scr_dir2_threshold_100": 0.13759224805838266, + "scr_dir1_threshold_500": 0.3015873766750807, + "scr_metric_threshold_500": 0.05896816907447914, + "scr_dir2_threshold_500": 0.05896816907447914 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results", + "scr_dir1_threshold_2": 0.16161592443815084, + "scr_metric_threshold_2": 0.14447592402822051, + "scr_dir2_threshold_2": 0.14447592402822051, + "scr_dir1_threshold_5": 0.19191917367472955, + "scr_metric_threshold_5": 0.2294616822322599, + "scr_dir2_threshold_5": 0.2294616822322599, + "scr_dir1_threshold_10": 0.20202005606450307, + "scr_metric_threshold_10": 0.2691217364978167, + "scr_dir2_threshold_10": 0.2691217364978167, + "scr_dir1_threshold_20": 0.2121209384542766, + "scr_metric_threshold_20": 0.31161478445152835, + "scr_dir2_threshold_20": 0.31161478445152835, + "scr_dir1_threshold_50": -0.343434817790365, + "scr_metric_threshold_50": 0.38243624962156114, + "scr_dir2_threshold_50": 0.38243624962156114, + "scr_dir1_threshold_100": -0.3737374649596856, + "scr_metric_threshold_100": 0.21813038288640824, + "scr_dir2_threshold_100": 0.21813038288640824, + "scr_dir1_threshold_500": -0.5151516246182893, + "scr_metric_threshold_500": 0.28895184805644103, + "scr_dir2_threshold_500": 0.28895184805644103 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results", + "scr_dir1_threshold_2": 0.4999995193171997, + "scr_metric_threshold_2": 0.022727279568944055, + "scr_dir2_threshold_2": 0.022727279568944055, + "scr_dir1_threshold_5": 0.5483869727270193, + "scr_metric_threshold_5": 0.0479797789751554, + "scr_dir2_threshold_5": 0.0479797789751554, + "scr_dir1_threshold_10": 0.5483869727270193, + "scr_metric_threshold_10": 0.07323242889813597, + "scr_dir2_threshold_10": 0.07323242889813597, + "scr_dir1_threshold_20": 0.46774185795438705, + "scr_metric_threshold_20": 0.15151514695403728, + "scr_dir2_threshold_20": 0.15151514695403728, + "scr_dir1_threshold_50": 0.29032183636211567, + "scr_metric_threshold_50": 0.21717176582360218, + "scr_dir2_threshold_50": 0.21717176582360218, + "scr_dir1_threshold_100": 0.2741930056807093, + "scr_metric_threshold_100": 0.21212117563229838, + "scr_dir2_threshold_100": 0.21212117563229838, + "scr_dir1_threshold_500": 0.16129022954526445, + "scr_metric_threshold_500": 0.06565661886956488, + "scr_dir2_threshold_500": 0.06565661886956488 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results", + "scr_dir1_threshold_2": 0.15447146198048264, + "scr_metric_threshold_2": 0.09970669617184318, + "scr_dir2_threshold_2": 0.09970669617184318, + "scr_dir1_threshold_5": 0.2764230242518165, + "scr_metric_threshold_5": 0.18475064804764177, + "scr_dir2_threshold_5": 0.18475064804764177, + "scr_dir1_threshold_10": 0.19512182120742508, + "scr_metric_threshold_10": 0.24633431392598884, + "scr_dir2_threshold_10": 0.24633431392598884, + "scr_dir1_threshold_20": 0.2682925647340228, + "scr_metric_threshold_20": 0.2727272886176091, + "scr_dir2_threshold_20": 0.2727272886176091, + "scr_dir1_threshold_50": 0.15447146198048264, + "scr_metric_threshold_50": 0.3665689569885143, + "scr_dir2_threshold_50": 0.3665689569885143, + "scr_dir1_threshold_100": -0.382113667487563, + "scr_metric_threshold_100": 0.40762467597617913, + "scr_dir2_threshold_100": 0.40762467597617913, + "scr_dir1_threshold_500": -0.487804795313816, + "scr_metric_threshold_500": 0.13196469866440133, + "scr_dir2_threshold_500": 0.13196469866440133 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results", + "scr_dir1_threshold_2": 0.021857964433295084, + "scr_metric_threshold_2": 0.09375017462295412, + "scr_dir2_threshold_2": 0.09375017462295412, + "scr_dir1_threshold_5": 0.03278678379574697, + "scr_metric_threshold_5": 0.1992187136202179, + "scr_dir2_threshold_5": 0.1992187136202179, + "scr_dir1_threshold_10": 0.04918033854781611, + "scr_metric_threshold_10": 0.21484374272404358, + "scr_dir2_threshold_10": 0.21484374272404358, + "scr_dir1_threshold_20": 0.09289626741440628, + "scr_metric_threshold_20": 0.28515625727595645, + "scr_dir2_threshold_20": 0.28515625727595645, + "scr_dir1_threshold_50": 0.12568305121015325, + "scr_metric_threshold_50": 0.4570313445874335, + "scr_dir2_threshold_50": 0.4570313445874335, + "scr_dir1_threshold_100": -0.00546440968122594, + "scr_metric_threshold_100": 0.5625001164153027, + "scr_dir2_threshold_100": 0.5625001164153027, + "scr_dir1_threshold_500": 0.07650271266233713, + "scr_metric_threshold_500": 0.6523438591393463, + "scr_dir2_threshold_500": 0.6523438591393463 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results", + "scr_dir1_threshold_2": 0.08717928497042846, + "scr_metric_threshold_2": 0.04435498407738989, + "scr_dir2_threshold_2": 0.04435498407738989, + "scr_dir1_threshold_5": 0.16410229135546164, + "scr_metric_threshold_5": 0.08870972781349516, + "scr_dir2_threshold_5": 0.08870972781349516, + "scr_dir1_threshold_10": 0.1999998777340575, + "scr_metric_threshold_10": 0.10483879108333834, + "scr_dir2_threshold_10": 0.10483879108333834, + "scr_dir1_threshold_20": 0.23076901915509954, + "scr_metric_threshold_20": 0.17741945562699032, + "scr_dir2_threshold_20": 0.17741945562699032, + "scr_dir1_threshold_50": 0.2358974641126534, + "scr_metric_threshold_50": 0.2983870696388872, + "scr_dir2_threshold_50": 0.2983870696388872, + "scr_dir1_threshold_100": 0.28205102341178834, + "scr_metric_threshold_100": 0.4112902721865048, + "scr_dir2_threshold_100": 0.4112902721865048, + "scr_dir1_threshold_500": 0.32307674908308187, + "scr_metric_threshold_500": 0.3508064651805564, + "scr_dir2_threshold_500": 0.3508064651805564 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results", + "scr_dir1_threshold_2": 0.1216216651604412, + "scr_metric_threshold_2": 0.08928561450116908, + "scr_dir2_threshold_2": 0.08928561450116908, + "scr_dir1_threshold_5": 0.13063055806593132, + "scr_metric_threshold_5": 0.16517854648243513, + "scr_dir2_threshold_5": 0.16517854648243513, + "scr_dir1_threshold_10": 0.18018027451428928, + "scr_metric_threshold_10": 0.24107147846370117, + "scr_dir2_threshold_10": 0.24107147846370117, + "scr_dir1_threshold_20": 0.23423416892600485, + "scr_metric_threshold_20": 0.29017851322092003, + "scr_dir2_threshold_20": 0.29017851322092003, + "scr_dir1_threshold_50": 0.2882883318271079, + "scr_metric_threshold_50": 0.308035689339578, + "scr_dir2_threshold_50": 0.308035689339578, + "scr_dir1_threshold_100": 0.3603602805391911, + "scr_metric_threshold_100": 0.38392862132084404, + "scr_dir2_threshold_100": 0.38392862132084404, + "scr_dir1_threshold_500": 0.4549549984937745, + "scr_metric_threshold_500": 0.45982128720998955, + "scr_dir2_threshold_500": 0.45982128720998955 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results", + "scr_dir1_threshold_2": 0.08154502375487709, + "scr_metric_threshold_2": 0.08154502375487709, + "scr_dir2_threshold_2": 0.07142849033383136, + "scr_dir1_threshold_5": 0.10729601656855352, + "scr_metric_threshold_5": 0.10729601656855352, + "scr_dir2_threshold_5": 0.10952379871117751, + "scr_dir1_threshold_10": 0.08583681325484281, + "scr_metric_threshold_10": 0.08583681325484281, + "scr_dir2_threshold_10": 0.12857145289985059, + "scr_dir1_threshold_20": 0.12875547569614632, + "scr_metric_threshold_20": 0.12875547569614632, + "scr_dir2_threshold_20": 0.18571413163427958, + "scr_dir1_threshold_50": 0.16309004750975417, + "scr_metric_threshold_50": 0.16309004750975417, + "scr_dir2_threshold_50": 0.21904759742235502, + "scr_dir1_threshold_100": 0.16309004750975417, + "scr_metric_threshold_100": 0.16309004750975417, + "scr_dir2_threshold_100": 0.20952391224381361, + "scr_dir1_threshold_500": 0.2489271165784791, + "scr_metric_threshold_500": 0.2489271165784791, + "scr_dir2_threshold_500": 0.24761893678956953 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file diff --git a/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..18a9cae671b0e4b524807f0e87c742d88744f86e --- /dev/null +++ b/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,323 @@ +{ + "eval_type_id": "scr", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "canrager/amazon_reviews_mcauley_1and5" + ], + "perform_scr": true, + "early_stopping_patience": 20, + "train_set_size": 4000, + "test_set_size": 1000, + "context_length": 128, + "probe_train_batch_size": 16, + "probe_test_batch_size": 500, + "probe_epochs": 20, + "probe_lr": 0.001, + "probe_l1_penalty": 0.001, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "lower_vram_usage": false, + "model_name": "gemma-2-2b", + "n_values": [ + 2, + 5, + 10, + 20, + 50, + 100, + 500 + ], + "column1_vals_lookup": { + "LabHC/bias_in_bios_class_set1": [ + [ + "professor", + "nurse" + ], + [ + "architect", + "journalist" + ], + [ + "surgeon", + "psychologist" + ], + [ + "attorney", + "teacher" + ] + ], + "canrager/amazon_reviews_mcauley_1and5": [ + [ + "Books", + "CDs_and_Vinyl" + ], + [ + "Software", + "Electronics" + ], + [ + "Pet_Supplies", + "Office_Products" + ], + [ + "Industrial_and_Scientific", + "Toys_and_Games" + ] + ] + } + }, + "eval_id": "21df4103-067e-4b1e-992d-4ee5b7c51b7b", + "datetime_epoch_millis": 1740161101577, + "eval_result_metrics": { + "scr_metrics": { + "scr_dir1_threshold_2": 0.2009693612201276, + "scr_metric_threshold_2": 0.08333806598832513, + "scr_dir2_threshold_2": 0.08053045986523572, + "scr_dir1_threshold_5": 0.23069728357870245, + "scr_metric_threshold_5": 0.12271415632570293, + "scr_dir2_threshold_5": 0.12228750697619764, + "scr_dir1_threshold_10": 0.24536139627163156, + "scr_metric_threshold_10": 0.14820108338134885, + "scr_dir2_threshold_10": 0.14854593776620528, + "scr_dir1_threshold_20": 0.26804181768674956, + "scr_metric_threshold_20": 0.16917277755556245, + "scr_dir2_threshold_20": 0.1705906112921456, + "scr_dir1_threshold_50": 0.2558397963519864, + "scr_metric_threshold_50": 0.2246913468132941, + "scr_dir2_threshold_50": 0.22294137414999496, + "scr_dir1_threshold_100": 0.2694834525923413, + "scr_metric_threshold_100": 0.22275277451374648, + "scr_dir2_threshold_100": 0.22045867120535106, + "scr_dir1_threshold_500": 0.23308094185248288, + "scr_metric_threshold_500": 0.23783672950733417, + "scr_dir2_threshold_500": 0.23189713822667268 + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results", + "scr_dir1_threshold_2": 0.42857088793941894, + "scr_metric_threshold_2": 0.0, + "scr_dir2_threshold_2": 0.0, + "scr_dir1_threshold_5": 0.47619029597980633, + "scr_metric_threshold_5": 0.0, + "scr_dir2_threshold_5": 0.0, + "scr_dir1_threshold_10": 0.46031747533501616, + "scr_metric_threshold_10": 0.0024571168813214595, + "scr_dir2_threshold_10": 0.0024571168813214595, + "scr_dir1_threshold_20": 0.47619029597980633, + "scr_metric_threshold_20": 0.039312112716319424, + "scr_dir2_threshold_20": 0.039312112716319424, + "scr_dir1_threshold_50": 0.42857088793941894, + "scr_metric_threshold_50": 0.05896816907447914, + "scr_dir2_threshold_50": 0.05896816907447914, + "scr_dir1_threshold_100": 0.3968252466498386, + "scr_metric_threshold_100": 0.09336619447689097, + "scr_dir2_threshold_100": 0.09336619447689097, + "scr_dir1_threshold_500": 0.3968252466498386, + "scr_metric_threshold_500": 0.022113026790745845, + "scr_dir2_threshold_500": 0.022113026790745845 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results", + "scr_dir1_threshold_2": 0.2121209384542766, + "scr_metric_threshold_2": 0.09631722640158302, + "scr_dir2_threshold_2": 0.09631722640158302, + "scr_dir1_threshold_5": 0.2121209384542766, + "scr_metric_threshold_5": 0.13031163099421397, + "scr_dir2_threshold_5": 0.13031163099421397, + "scr_dir1_threshold_10": 0.2121209384542766, + "scr_metric_threshold_10": 0.1671388604233078, + "scr_dir2_threshold_10": 0.1671388604233078, + "scr_dir1_threshold_20": 0.2828283193172076, + "scr_metric_threshold_20": 0.18130315345731438, + "scr_dir2_threshold_20": 0.18130315345731438, + "scr_dir1_threshold_50": 0.040403529559094105, + "scr_metric_threshold_50": 0.24362614411795844, + "scr_dir2_threshold_50": 0.24362614411795844, + "scr_dir1_threshold_100": 0.040403529559094105, + "scr_metric_threshold_100": 0.26345608682489086, + "scr_dir2_threshold_100": 0.26345608682489086, + "scr_dir1_threshold_500": -0.2828283193172076, + "scr_metric_threshold_500": 0.16147304189869, + "scr_dir2_threshold_500": 0.16147304189869 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results", + "scr_dir1_threshold_2": 0.5483869727270193, + "scr_metric_threshold_2": 0.015151620057142186, + "scr_dir2_threshold_2": 0.015151620057142186, + "scr_dir1_threshold_5": 0.5483869727270193, + "scr_metric_threshold_5": 0.012626249703105673, + "scr_dir2_threshold_5": 0.012626249703105673, + "scr_dir1_threshold_10": 0.4999995193171997, + "scr_metric_threshold_10": 0.04040411946335353, + "scr_dir2_threshold_10": 0.04040411946335353, + "scr_dir1_threshold_20": 0.532258142045613, + "scr_metric_threshold_20": 0.050505149329191916, + "scr_dir2_threshold_20": 0.050505149329191916, + "scr_dir1_threshold_50": 0.516128349998606, + "scr_metric_threshold_50": 0.0530303691664592, + "scr_dir2_threshold_50": 0.0530303691664592, + "scr_dir1_threshold_100": 0.48387068863579336, + "scr_metric_threshold_100": 0.08080808840993783, + "scr_dir2_threshold_100": 0.08080808840993783, + "scr_dir1_threshold_500": 0.45161302727298075, + "scr_metric_threshold_500": 0.03030308959751515, + "scr_dir2_threshold_500": 0.03030308959751515 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results", + "scr_dir1_threshold_2": 0.25203261487944845, + "scr_metric_threshold_2": 0.06451600498511631, + "scr_dir2_threshold_2": 0.06451600498511631, + "scr_dir1_threshold_5": 0.2845529991791037, + "scr_metric_threshold_5": 0.09090897967673653, + "scr_dir2_threshold_5": 0.09090897967673653, + "scr_dir1_threshold_10": 0.2601625898067356, + "scr_metric_threshold_10": 0.1524926455550836, + "scr_dir2_threshold_10": 0.1524926455550836, + "scr_dir1_threshold_20": 0.25203261487944845, + "scr_metric_threshold_20": 0.1964808784432174, + "scr_dir2_threshold_20": 0.1964808784432174, + "scr_dir1_threshold_50": 0.3089429239609653, + "scr_metric_threshold_50": 0.23460408353041323, + "scr_dir2_threshold_50": 0.23460408353041323, + "scr_dir1_threshold_100": 0.2682925647340228, + "scr_metric_threshold_100": 0.032258002492558155, + "scr_dir2_threshold_100": 0.032258002492558155, + "scr_dir1_threshold_500": 0.39024412700535666, + "scr_metric_threshold_500": 0.070381207579754, + "scr_dir2_threshold_500": 0.070381207579754 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results", + "scr_dir1_threshold_2": 0.016393554752069144, + "scr_metric_threshold_2": 0.16015625727595642, + "scr_dir2_threshold_2": 0.16015625727595642, + "scr_dir1_threshold_5": 0.07650271266233713, + "scr_metric_threshold_5": 0.3359375436557385, + "scr_dir2_threshold_5": 0.3359375436557385, + "scr_dir1_threshold_10": 0.10382508677685816, + "scr_metric_threshold_10": 0.371093800931695, + "scr_dir2_threshold_10": 0.371093800931695, + "scr_dir1_threshold_20": 0.1202186415289273, + "scr_metric_threshold_20": 0.42578128637978213, + "scr_dir2_threshold_20": 0.42578128637978213, + "scr_dir1_threshold_50": 0.10382508677685816, + "scr_metric_threshold_50": 0.48828116996447934, + "scr_dir2_threshold_50": 0.48828116996447934, + "scr_dir1_threshold_100": 0.1803277994391953, + "scr_metric_threshold_100": 0.5234374272404357, + "scr_dir2_threshold_100": 0.5234374272404357, + "scr_dir1_threshold_500": 0.07103830298111119, + "scr_metric_threshold_500": 0.6054687718278693, + "scr_dir2_threshold_500": 0.6054687718278693 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results", + "scr_dir1_threshold_2": 0.03589728071373967, + "scr_metric_threshold_2": 0.060483807005948444, + "scr_dir2_threshold_2": 0.060483807005948444, + "scr_dir1_threshold_5": 0.09743586922067994, + "scr_metric_threshold_5": 0.0927419335456348, + "scr_dir2_threshold_5": 0.0927419335456348, + "scr_dir1_threshold_10": 0.16410229135546164, + "scr_metric_threshold_10": 0.08870972781349516, + "scr_dir2_threshold_10": 0.08870972781349516, + "scr_dir1_threshold_20": 0.1999998777340575, + "scr_metric_threshold_20": 0.1008065853511987, + "scr_dir2_threshold_20": 0.1008065853511987, + "scr_dir1_threshold_50": 0.24102560340535104, + "scr_metric_threshold_50": 0.19758072462897314, + "scr_dir2_threshold_50": 0.19758072462897314, + "scr_dir1_threshold_100": 0.24102560340535104, + "scr_metric_threshold_100": 0.23790326263293876, + "scr_dir2_threshold_100": 0.23790326263293876, + "scr_dir1_threshold_500": 0.3282048883757795, + "scr_metric_threshold_500": 0.3225807847142943, + "scr_dir2_threshold_500": 0.3225807847142943 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results", + "scr_dir1_threshold_2": 0.05855860935384807, + "scr_metric_threshold_2": 0.214285581239654, + "scr_dir2_threshold_2": 0.214285581239654, + "scr_dir1_threshold_5": 0.09459444946519599, + "scr_metric_threshold_5": 0.2633928820889934, + "scr_dir2_threshold_5": 0.2633928820889934, + "scr_dir1_threshold_10": 0.19369361387252446, + "scr_metric_threshold_10": 0.2946427407275544, + "scr_dir2_threshold_10": 0.2946427407275544, + "scr_dir1_threshold_20": 0.22072082956776967, + "scr_metric_threshold_20": 0.2991072343263093, + "scr_dir2_threshold_20": 0.2991072343263093, + "scr_dir1_threshold_50": 0.27477472397948527, + "scr_metric_threshold_50": 0.3883928488274784, + "scr_dir2_threshold_50": 0.3883928488274784, + "scr_dir1_threshold_100": 0.3648647269919362, + "scr_metric_threshold_100": 0.37053567270882043, + "scr_dir2_threshold_100": 0.37053567270882043, + "scr_dir1_threshold_500": 0.24774777677362747, + "scr_metric_threshold_500": 0.42857142857142855, + "scr_dir2_threshold_500": 0.42857142857142855 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results", + "scr_dir1_threshold_2": 0.05579403094120067, + "scr_metric_threshold_2": 0.05579403094120067, + "scr_dir2_threshold_2": 0.033333181956485214, + "scr_dir1_threshold_5": 0.05579403094120067, + "scr_metric_threshold_5": 0.05579403094120067, + "scr_dir2_threshold_5": 0.05238083614515829, + "scr_dir1_threshold_10": 0.06866965525497992, + "scr_metric_threshold_10": 0.06866965525497992, + "scr_dir2_threshold_10": 0.07142849033383136, + "scr_dir1_threshold_20": 0.060085820441166386, + "scr_metric_threshold_20": 0.060085820441166386, + "scr_dir2_threshold_20": 0.07142849033383136, + "scr_dir1_threshold_50": 0.13304726519611204, + "scr_metric_threshold_50": 0.13304726519611204, + "scr_dir2_threshold_50": 0.11904748388971893, + "scr_dir1_threshold_100": 0.18025746132349915, + "scr_metric_threshold_100": 0.18025746132349915, + "scr_dir2_threshold_100": 0.1619046348563358, + "scr_dir1_threshold_500": 0.2618024850783763, + "scr_metric_threshold_500": 0.2618024850783763, + "scr_dir2_threshold_500": 0.21428575483308432 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file diff --git a/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..153fe0766444aefa560a4baa67135879cb852010 --- /dev/null +++ b/eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,323 @@ +{ + "eval_type_id": "scr", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "canrager/amazon_reviews_mcauley_1and5" + ], + "perform_scr": true, + "early_stopping_patience": 20, + "train_set_size": 4000, + "test_set_size": 1000, + "context_length": 128, + "probe_train_batch_size": 16, + "probe_test_batch_size": 500, + "probe_epochs": 20, + "probe_lr": 0.001, + "probe_l1_penalty": 0.001, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "lower_vram_usage": false, + "model_name": "gemma-2-2b", + "n_values": [ + 2, + 5, + 10, + 20, + 50, + 100, + 500 + ], + "column1_vals_lookup": { + "LabHC/bias_in_bios_class_set1": [ + [ + "professor", + "nurse" + ], + [ + "architect", + "journalist" + ], + [ + "surgeon", + "psychologist" + ], + [ + "attorney", + "teacher" + ] + ], + "canrager/amazon_reviews_mcauley_1and5": [ + [ + "Books", + "CDs_and_Vinyl" + ], + [ + "Software", + "Electronics" + ], + [ + "Pet_Supplies", + "Office_Products" + ], + [ + "Industrial_and_Scientific", + "Toys_and_Games" + ] + ] + } + }, + "eval_id": "2f284c7a-10d3-4f19-8264-97e91f4bb2f2", + "datetime_epoch_millis": 1740162428754, + "eval_result_metrics": { + "scr_metrics": { + "scr_dir1_threshold_2": 0.19644145298086652, + "scr_metric_threshold_2": 0.07734320571232689, + "scr_dir2_threshold_2": 0.0759023691262068, + "scr_dir1_threshold_5": 0.2317746907633326, + "scr_metric_threshold_5": 0.13025074424245692, + "scr_dir2_threshold_5": 0.13351306106512845, + "scr_dir1_threshold_10": 0.237756725215616, + "scr_metric_threshold_10": 0.1905854353466263, + "scr_dir2_threshold_10": 0.19497179922285185, + "scr_dir1_threshold_20": 0.24107600003630494, + "scr_metric_threshold_20": 0.24773476074996353, + "scr_dir2_threshold_20": 0.257654541907719, + "scr_dir1_threshold_50": 0.19159755118507713, + "scr_metric_threshold_50": 0.32827118012785683, + "scr_dir2_threshold_50": 0.3363388212419233, + "scr_dir1_threshold_100": 0.0755746565868114, + "scr_metric_threshold_100": 0.3171029788525398, + "scr_dir2_threshold_100": 0.3217090588604302, + "scr_dir1_threshold_500": 0.00475685136491143, + "scr_metric_threshold_500": 0.2958876177808421, + "scr_dir2_threshold_500": 0.3115094291710164 + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results", + "scr_dir1_threshold_2": 0.42857088793941894, + "scr_metric_threshold_2": 0.009828028179079856, + "scr_dir2_threshold_2": 0.009828028179079856, + "scr_dir1_threshold_5": 0.444444654690226, + "scr_metric_threshold_5": 0.017199085925573582, + "scr_dir2_threshold_5": 0.017199085925573582, + "scr_dir1_threshold_10": 0.47619029597980633, + "scr_metric_threshold_10": 0.04668317046281315, + "scr_dir2_threshold_10": 0.04668317046281315, + "scr_dir1_threshold_20": 0.5079359372693867, + "scr_metric_threshold_20": 0.08845210716298338, + "scr_dir2_threshold_20": 0.08845210716298338, + "scr_dir1_threshold_50": 0.47619029597980633, + "scr_metric_threshold_50": 0.10565119308855696, + "scr_dir2_threshold_50": 0.10565119308855696, + "scr_dir1_threshold_100": 0.3809524260050484, + "scr_metric_threshold_100": 0.14496315935614107, + "scr_dir2_threshold_100": 0.14496315935614107, + "scr_dir1_threshold_500": 0.31746019731987085, + "scr_metric_threshold_500": 0.02948408453723957, + "scr_dir2_threshold_500": 0.02948408453723957 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results", + "scr_dir1_threshold_2": 0.19191917367472955, + "scr_metric_threshold_2": 0.1359772806671398, + "scr_dir2_threshold_2": 0.1359772806671398, + "scr_dir1_threshold_5": 0.2828283193172076, + "scr_metric_threshold_5": 0.21529738919825334, + "scr_dir2_threshold_5": 0.21529738919825334, + "scr_dir1_threshold_10": 0.2828283193172076, + "scr_metric_threshold_10": 0.2832861983835152, + "scr_dir2_threshold_10": 0.2832861983835152, + "scr_dir1_threshold_20": 0.2929292017069811, + "scr_metric_threshold_20": 0.3371105456830785, + "scr_dir2_threshold_20": 0.3371105456830785, + "scr_dir1_threshold_50": -0.16161652650540898, + "scr_metric_threshold_50": 0.4277621224117357, + "scr_dir2_threshold_50": 0.4277621224117357, + "scr_dir1_threshold_100": -0.42424308104306946, + "scr_metric_threshold_100": 0.2096317395253275, + "scr_dir2_threshold_100": 0.2096317395253275, + "scr_dir1_threshold_500": -0.6767681511236984, + "scr_metric_threshold_500": 0.26345608682489086, + "scr_dir2_threshold_500": 0.26345608682489086 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results", + "scr_dir1_threshold_2": 0.532258142045613, + "scr_metric_threshold_2": 0.03030308959751515, + "scr_dir2_threshold_2": 0.03030308959751515, + "scr_dir1_threshold_5": 0.532258142045613, + "scr_metric_threshold_5": 0.0530303691664592, + "scr_dir2_threshold_5": 0.0530303691664592, + "scr_dir1_threshold_10": 0.516128349998606, + "scr_metric_threshold_10": 0.07828286857267056, + "scr_dir2_threshold_10": 0.07828286857267056, + "scr_dir1_threshold_20": 0.48387068863579336, + "scr_metric_threshold_20": 0.13383845757639704, + "scr_dir2_threshold_20": 0.13383845757639704, + "scr_dir1_threshold_50": 0.35483812045334157, + "scr_metric_threshold_50": 0.22979801552670784, + "scr_dir2_threshold_50": 0.22979801552670784, + "scr_dir1_threshold_100": 0.3064516284091226, + "scr_metric_threshold_100": 0.29797985423354, + "scr_dir2_threshold_100": 0.29797985423354, + "scr_dir1_threshold_500": 0.14516043749825752, + "scr_metric_threshold_500": 0.10606058781614919, + "scr_dir2_threshold_500": 0.10606058781614919 + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results", + "scr_dir1_threshold_2": 0.2601625898067356, + "scr_metric_threshold_2": 0.10850441266694984, + "scr_dir2_threshold_2": 0.10850441266694984, + "scr_dir1_threshold_5": 0.31707338347875896, + "scr_metric_threshold_5": 0.17302041765206616, + "scr_dir2_threshold_5": 0.17302041765206616, + "scr_dir1_threshold_10": 0.21138225565250596, + "scr_metric_threshold_10": 0.26099705822203345, + "scr_dir2_threshold_10": 0.26099705822203345, + "scr_dir1_threshold_20": 0.13008153719862106, + "scr_metric_threshold_20": 0.31964803540621184, + "scr_dir2_threshold_20": 0.31964803540621184, + "scr_dir1_threshold_50": 0.06504076859931053, + "scr_metric_threshold_50": 0.3900292429859658, + "scr_dir2_threshold_50": 0.3900292429859658, + "scr_dir1_threshold_100": -0.39024364241485016, + "scr_metric_threshold_100": 0.140762415159508, + "scr_dir2_threshold_100": 0.140762415159508, + "scr_dir1_threshold_500": -0.6341462823670114, + "scr_metric_threshold_500": 0.13489738735857004, + "scr_dir2_threshold_500": 0.13489738735857004 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results", + "scr_dir1_threshold_2": 0.0, + "scr_metric_threshold_2": 0.13671883003552063, + "scr_dir2_threshold_2": 0.13671883003552063, + "scr_dir1_threshold_5": 0.01092881936245188, + "scr_metric_threshold_5": 0.23437497089617432, + "scr_dir2_threshold_5": 0.23437497089617432, + "scr_dir1_threshold_10": 0.04918033854781611, + "scr_metric_threshold_10": 0.3085936845163922, + "scr_dir2_threshold_10": 0.3085936845163922, + "scr_dir1_threshold_20": 0.00546440968122594, + "scr_metric_threshold_20": 0.417968888243172, + "scr_dir2_threshold_20": 0.417968888243172, + "scr_dir1_threshold_50": 0.09836067709563222, + "scr_metric_threshold_50": 0.5351562572759564, + "scr_dir2_threshold_50": 0.5351562572759564, + "scr_dir1_threshold_100": -0.016393554752069144, + "scr_metric_threshold_100": 0.6015625727595643, + "scr_dir2_threshold_100": 0.6015625727595643, + "scr_dir1_threshold_500": 0.03825119347697291, + "scr_metric_threshold_500": 0.6953125145519129, + "scr_dir2_threshold_500": 0.6953125145519129 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results", + "scr_dir1_threshold_2": 0.03589728071373967, + "scr_metric_threshold_2": 0.04838718980952953, + "scr_dir2_threshold_2": 0.04838718980952953, + "scr_dir1_threshold_5": 0.10769214780607521, + "scr_metric_threshold_5": 0.0927419335456348, + "scr_dir2_threshold_5": 0.0927419335456348, + "scr_dir1_threshold_10": 0.12820501064172196, + "scr_metric_threshold_10": 0.1491935348194436, + "scr_dir2_threshold_10": 0.1491935348194436, + "scr_dir1_threshold_20": 0.18974359914866223, + "scr_metric_threshold_20": 0.18951607282340924, + "scr_dir2_threshold_20": 0.18951607282340924, + "scr_dir1_threshold_50": 0.22564087986240192, + "scr_metric_threshold_50": 0.3145161329087304, + "scr_dir2_threshold_50": 0.3145161329087304, + "scr_dir1_threshold_100": 0.27179474482639304, + "scr_metric_threshold_100": 0.43145154118848766, + "scr_dir2_threshold_100": 0.43145154118848766, + "scr_dir1_threshold_500": 0.30769233120498896, + "scr_metric_threshold_500": 0.4354839872619119, + "scr_dir2_threshold_500": 0.4354839872619119 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results", + "scr_dir1_threshold_2": 0.05405389441171559, + "scr_metric_threshold_2": 0.0803571594879004, + "scr_dir2_threshold_2": 0.0803571594879004, + "scr_dir1_threshold_5": 0.09459444946519599, + "scr_metric_threshold_5": 0.19196417761436174, + "scr_dir2_threshold_5": 0.19196417761436174, + "scr_dir1_threshold_10": 0.13513500451867638, + "scr_metric_threshold_10": 0.2946427407275544, + "scr_dir2_threshold_10": 0.2946427407275544, + "scr_dir1_threshold_20": 0.20270277526740202, + "scr_metric_threshold_20": 0.37946412772208915, + "scr_dir2_threshold_20": 0.37946412772208915, + "scr_dir1_threshold_50": 0.31981972548571075, + "scr_metric_threshold_50": 0.4687500083153788, + "scr_dir2_threshold_50": 0.4687500083153788, + "scr_dir1_threshold_100": 0.2702702775267402, + "scr_metric_threshold_100": 0.504464094460574, + "scr_dir2_threshold_100": 0.504464094460574, + "scr_dir1_threshold_500": 0.35585583408644605, + "scr_metric_threshold_500": 0.5178570430725976, + "scr_dir2_threshold_500": 0.5178570430725976 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results", + "scr_dir1_threshold_2": 0.06866965525497992, + "scr_metric_threshold_2": 0.06866965525497992, + "scr_dir2_threshold_2": 0.05714296256601923, + "scr_dir1_threshold_5": 0.0643776099411321, + "scr_metric_threshold_5": 0.0643776099411321, + "scr_dir2_threshold_5": 0.09047614452250444, + "scr_dir1_threshold_10": 0.10300422706858779, + "scr_metric_threshold_10": 0.10300422706858779, + "scr_dir2_threshold_10": 0.13809513807839202, + "scr_dir1_threshold_20": 0.11587985138236706, + "scr_metric_threshold_20": 0.11587985138236706, + "scr_dir2_threshold_20": 0.19523810064441124, + "scr_dir1_threshold_50": 0.15450646850982275, + "scr_metric_threshold_50": 0.15450646850982275, + "scr_dir2_threshold_50": 0.21904759742235502, + "scr_dir1_threshold_100": 0.20600845413717558, + "scr_metric_threshold_100": 0.20600845413717558, + "scr_dir2_threshold_100": 0.24285709420029883, + "scr_dir1_threshold_500": 0.18454925082346488, + "scr_metric_threshold_500": 0.18454925082346488, + "scr_dir2_threshold_500": 0.3095237419448595 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file diff --git a/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..585a6d888804b23931253fa6bb29fea2c70c71e8 --- /dev/null +++ b/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,670 @@ +{ + "eval_type_id": "sparse_probing", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "LabHC/bias_in_bios_class_set2", + "LabHC/bias_in_bios_class_set3", + "canrager/amazon_reviews_mcauley_1and5", + "canrager/amazon_reviews_mcauley_1and5_sentiment", + "codeparrot/github-code", + "fancyzhx/ag_news", + "Helsinki-NLP/europarl" + ], + "probe_train_set_size": 4000, + "probe_test_set_size": 1000, + "context_length": 128, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "model_name": "gemma-2-2b", + "k_values": [ + 1, + 2, + 5 + ], + "lower_vram_usage": false + }, + "eval_id": "fe2f63b4-1d22-4879-9af8-4e98fff7d830", + "datetime_epoch_millis": 1740164930002, + "eval_result_metrics": { + "llm": { + "llm_test_accuracy": 0.9595375448465346, + "llm_top_1_test_accuracy": 0.64956875, + "llm_top_2_test_accuracy": 0.72589375, + "llm_top_5_test_accuracy": 0.78265625, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null + }, + "sae": { + "sae_test_accuracy": 0.9598625473678112, + "sae_top_1_test_accuracy": 0.7501000000000001, + "sae_top_2_test_accuracy": 0.7998875, + "sae_top_5_test_accuracy": 0.8547375, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_results", + "llm_test_accuracy": 0.966800057888031, + "llm_top_1_test_accuracy": 0.6397999999999999, + "llm_top_2_test_accuracy": 0.6954, + "llm_top_5_test_accuracy": 0.7869999999999999, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9674000382423401, + "sae_top_1_test_accuracy": 0.7148000000000001, + "sae_top_2_test_accuracy": 0.7748, + "sae_top_5_test_accuracy": 0.8404, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set2_results", + "llm_test_accuracy": 0.9578000426292419, + "llm_top_1_test_accuracy": 0.6694000000000001, + "llm_top_2_test_accuracy": 0.725, + "llm_top_5_test_accuracy": 0.7654, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9522000312805176, + "sae_top_1_test_accuracy": 0.712, + "sae_top_2_test_accuracy": 0.7488, + "sae_top_5_test_accuracy": 0.8006, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set3_results", + "llm_test_accuracy": 0.9316000461578369, + "llm_top_1_test_accuracy": 0.687, + "llm_top_2_test_accuracy": 0.7492, + "llm_top_5_test_accuracy": 0.7704000000000001, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.935800039768219, + "sae_top_1_test_accuracy": 0.6752, + "sae_top_2_test_accuracy": 0.7123999999999999, + "sae_top_5_test_accuracy": 0.797, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_results", + "llm_test_accuracy": 0.9202000379562378, + "llm_top_1_test_accuracy": 0.599, + "llm_top_2_test_accuracy": 0.6474, + "llm_top_5_test_accuracy": 0.6734, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9280000567436218, + "sae_top_1_test_accuracy": 0.7084, + "sae_top_2_test_accuracy": 0.7418000000000001, + "sae_top_5_test_accuracy": 0.7746, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results", + "llm_test_accuracy": 0.9795000553131104, + "llm_top_1_test_accuracy": 0.673, + "llm_top_2_test_accuracy": 0.724, + "llm_top_5_test_accuracy": 0.766, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9715000689029694, + "sae_top_1_test_accuracy": 0.764, + "sae_top_2_test_accuracy": 0.898, + "sae_top_5_test_accuracy": 0.921, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "codeparrot/github-code_results", + "llm_test_accuracy": 0.9708000421524048, + "llm_top_1_test_accuracy": 0.6451999999999999, + "llm_top_2_test_accuracy": 0.6960000000000001, + "llm_top_5_test_accuracy": 0.7766, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9712000489234924, + "sae_top_1_test_accuracy": 0.6984, + "sae_top_2_test_accuracy": 0.7142000000000001, + "sae_top_5_test_accuracy": 0.8362, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "fancyzhx/ag_news_results", + "llm_test_accuracy": 0.9500000476837158, + "llm_top_1_test_accuracy": 0.63775, + "llm_top_2_test_accuracy": 0.78175, + "llm_top_5_test_accuracy": 0.82125, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9540000557899475, + "sae_top_1_test_accuracy": 0.813, + "sae_top_2_test_accuracy": 0.8325, + "sae_top_5_test_accuracy": 0.8765000000000001, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "Helsinki-NLP/europarl_results", + "llm_test_accuracy": 0.9996000289916992, + "llm_top_1_test_accuracy": 0.6454, + "llm_top_2_test_accuracy": 0.7884, + "llm_top_5_test_accuracy": 0.9012, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9988000392913818, + "sae_top_1_test_accuracy": 0.915, + "sae_top_2_test_accuracy": 0.9766, + "sae_top_5_test_accuracy": 0.9916, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": { + "LabHC/bias_in_bios_class_set1_results": { + "sae_test_accuracy": { + "0": 0.9500000476837158, + "1": 0.9700000286102295, + "2": 0.9520000219345093, + "6": 0.9880000352859497, + "9": 0.9770000576972961 + }, + "llm_test_accuracy": { + "0": 0.9510000348091125, + "1": 0.9670000672340393, + "2": 0.9530000686645508, + "6": 0.987000048160553, + "9": 0.9760000705718994 + }, + "llm_top_1_test_accuracy": { + "0": 0.577, + "1": 0.613, + "2": 0.662, + "6": 0.787, + "9": 0.56 + }, + "llm_top_2_test_accuracy": { + "0": 0.574, + "1": 0.66, + "2": 0.718, + "6": 0.811, + "9": 0.714 + }, + "llm_top_5_test_accuracy": { + "0": 0.713, + "1": 0.711, + "2": 0.755, + "6": 0.895, + "9": 0.861 + }, + "sae_top_1_test_accuracy": { + "0": 0.679, + "1": 0.57, + "2": 0.622, + "6": 0.76, + "9": 0.943 + }, + "sae_top_2_test_accuracy": { + "0": 0.676, + "1": 0.565, + "2": 0.924, + "6": 0.762, + "9": 0.947 + }, + "sae_top_5_test_accuracy": { + "0": 0.828, + "1": 0.63, + "2": 0.924, + "6": 0.878, + "9": 0.942 + } + }, + "LabHC/bias_in_bios_class_set2_results": { + "sae_test_accuracy": { + "11": 0.9570000171661377, + "13": 0.9520000219345093, + "14": 0.9520000219345093, + "18": 0.9320000410079956, + "19": 0.968000054359436 + }, + "llm_test_accuracy": { + "11": 0.9690000414848328, + "13": 0.9600000381469727, + "14": 0.9600000381469727, + "18": 0.9390000700950623, + "19": 0.9610000252723694 + }, + "llm_top_1_test_accuracy": { + "11": 0.555, + "13": 0.668, + "14": 0.638, + "18": 0.69, + "19": 0.796 + }, + "llm_top_2_test_accuracy": { + "11": 0.756, + "13": 0.714, + "14": 0.67, + "18": 0.717, + "19": 0.768 + }, + "llm_top_5_test_accuracy": { + "11": 0.794, + "13": 0.749, + "14": 0.723, + "18": 0.73, + "19": 0.831 + }, + "sae_top_1_test_accuracy": { + "11": 0.86, + "13": 0.646, + "14": 0.73, + "18": 0.628, + "19": 0.696 + }, + "sae_top_2_test_accuracy": { + "11": 0.854, + "13": 0.708, + "14": 0.728, + "18": 0.669, + "19": 0.785 + }, + "sae_top_5_test_accuracy": { + "11": 0.861, + "13": 0.728, + "14": 0.862, + "18": 0.714, + "19": 0.838 + } + }, + "LabHC/bias_in_bios_class_set3_results": { + "sae_test_accuracy": { + "20": 0.9550000429153442, + "21": 0.9260000586509705, + "22": 0.9290000200271606, + "25": 0.9750000238418579, + "26": 0.8940000534057617 + }, + "llm_test_accuracy": { + "20": 0.956000030040741, + "21": 0.9350000619888306, + "22": 0.9180000424385071, + "25": 0.9640000462532043, + "26": 0.8850000500679016 + }, + "llm_top_1_test_accuracy": { + "20": 0.693, + "21": 0.775, + "22": 0.645, + "25": 0.706, + "26": 0.616 + }, + "llm_top_2_test_accuracy": { + "20": 0.827, + "21": 0.761, + "22": 0.694, + "25": 0.778, + "26": 0.686 + }, + "llm_top_5_test_accuracy": { + "20": 0.855, + "21": 0.791, + "22": 0.725, + "25": 0.809, + "26": 0.672 + }, + "sae_top_1_test_accuracy": { + "20": 0.869, + "21": 0.59, + "22": 0.661, + "25": 0.633, + "26": 0.623 + }, + "sae_top_2_test_accuracy": { + "20": 0.861, + "21": 0.616, + "22": 0.689, + "25": 0.665, + "26": 0.731 + }, + "sae_top_5_test_accuracy": { + "20": 0.914, + "21": 0.831, + "22": 0.719, + "25": 0.765, + "26": 0.756 + } + }, + "canrager/amazon_reviews_mcauley_1and5_results": { + "sae_test_accuracy": { + "1": 0.9530000686645508, + "2": 0.940000057220459, + "3": 0.9160000681877136, + "5": 0.9390000700950623, + "6": 0.8920000195503235 + }, + "llm_test_accuracy": { + "1": 0.9580000638961792, + "2": 0.9330000281333923, + "3": 0.9280000329017639, + "5": 0.9200000166893005, + "6": 0.862000048160553 + }, + "llm_top_1_test_accuracy": { + "1": 0.647, + "2": 0.603, + "3": 0.598, + "5": 0.555, + "6": 0.592 + }, + "llm_top_2_test_accuracy": { + "1": 0.75, + "2": 0.648, + "3": 0.607, + "5": 0.606, + "6": 0.626 + }, + "llm_top_5_test_accuracy": { + "1": 0.767, + "2": 0.641, + "3": 0.645, + "5": 0.638, + "6": 0.676 + }, + "sae_top_1_test_accuracy": { + "1": 0.854, + "2": 0.806, + "3": 0.667, + "5": 0.543, + "6": 0.672 + }, + "sae_top_2_test_accuracy": { + "1": 0.891, + "2": 0.793, + "3": 0.753, + "5": 0.592, + "6": 0.68 + }, + "sae_top_5_test_accuracy": { + "1": 0.885, + "2": 0.809, + "3": 0.758, + "5": 0.703, + "6": 0.718 + } + }, + "canrager/amazon_reviews_mcauley_1and5_sentiment_results": { + "sae_test_accuracy": { + "1.0": 0.971000075340271, + "5.0": 0.9720000624656677 + }, + "llm_test_accuracy": { + "1.0": 0.9780000448226929, + "5.0": 0.9810000658035278 + }, + "llm_top_1_test_accuracy": { + "1.0": 0.673, + "5.0": 0.673 + }, + "llm_top_2_test_accuracy": { + "1.0": 0.724, + "5.0": 0.724 + }, + "llm_top_5_test_accuracy": { + "1.0": 0.766, + "5.0": 0.766 + }, + "sae_top_1_test_accuracy": { + "1.0": 0.764, + "5.0": 0.764 + }, + "sae_top_2_test_accuracy": { + "1.0": 0.898, + "5.0": 0.898 + }, + "sae_top_5_test_accuracy": { + "1.0": 0.921, + "5.0": 0.921 + } + }, + "codeparrot/github-code_results": { + "sae_test_accuracy": { + "C": 0.9530000686645508, + "Python": 0.984000027179718, + "HTML": 0.9860000610351562, + "Java": 0.9750000238418579, + "PHP": 0.9580000638961792 + }, + "llm_test_accuracy": { + "C": 0.956000030040741, + "Python": 0.984000027179718, + "HTML": 0.9900000691413879, + "Java": 0.9670000672340393, + "PHP": 0.9570000171661377 + }, + "llm_top_1_test_accuracy": { + "C": 0.666, + "Python": 0.626, + "HTML": 0.721, + "Java": 0.619, + "PHP": 0.594 + }, + "llm_top_2_test_accuracy": { + "C": 0.679, + "Python": 0.674, + "HTML": 0.8, + "Java": 0.676, + "PHP": 0.651 + }, + "llm_top_5_test_accuracy": { + "C": 0.783, + "Python": 0.717, + "HTML": 0.935, + "Java": 0.733, + "PHP": 0.715 + }, + "sae_top_1_test_accuracy": { + "C": 0.598, + "Python": 0.566, + "HTML": 0.742, + "Java": 0.656, + "PHP": 0.93 + }, + "sae_top_2_test_accuracy": { + "C": 0.654, + "Python": 0.612, + "HTML": 0.738, + "Java": 0.637, + "PHP": 0.93 + }, + "sae_top_5_test_accuracy": { + "C": 0.704, + "Python": 0.917, + "HTML": 0.938, + "Java": 0.688, + "PHP": 0.934 + } + }, + "fancyzhx/ag_news_results": { + "sae_test_accuracy": { + "0": 0.9500000476837158, + "1": 0.987000048160553, + "2": 0.9350000619888306, + "3": 0.9440000653266907 + }, + "llm_test_accuracy": { + "0": 0.940000057220459, + "1": 0.9860000610351562, + "2": 0.9200000166893005, + "3": 0.9540000557899475 + }, + "llm_top_1_test_accuracy": { + "0": 0.573, + "1": 0.671, + "2": 0.672, + "3": 0.635 + }, + "llm_top_2_test_accuracy": { + "0": 0.802, + "1": 0.808, + "2": 0.701, + "3": 0.816 + }, + "llm_top_5_test_accuracy": { + "0": 0.81, + "1": 0.891, + "2": 0.752, + "3": 0.832 + }, + "sae_top_1_test_accuracy": { + "0": 0.793, + "1": 0.972, + "2": 0.817, + "3": 0.67 + }, + "sae_top_2_test_accuracy": { + "0": 0.838, + "1": 0.972, + "2": 0.818, + "3": 0.702 + }, + "sae_top_5_test_accuracy": { + "0": 0.871, + "1": 0.976, + "2": 0.821, + "3": 0.838 + } + }, + "Helsinki-NLP/europarl_results": { + "sae_test_accuracy": { + "en": 0.999000072479248, + "fr": 1.0, + "de": 1.0, + "es": 0.999000072479248, + "nl": 0.9960000514984131 + }, + "llm_test_accuracy": { + "en": 1.0, + "fr": 0.999000072479248, + "de": 1.0, + "es": 0.999000072479248, + "nl": 1.0 + }, + "llm_top_1_test_accuracy": { + "en": 0.739, + "fr": 0.585, + "de": 0.758, + "es": 0.496, + "nl": 0.649 + }, + "llm_top_2_test_accuracy": { + "en": 0.829, + "fr": 0.582, + "de": 0.82, + "es": 0.958, + "nl": 0.753 + }, + "llm_top_5_test_accuracy": { + "en": 0.892, + "fr": 0.888, + "de": 0.894, + "es": 0.98, + "nl": 0.852 + }, + "sae_top_1_test_accuracy": { + "en": 0.996, + "fr": 0.994, + "de": 0.926, + "es": 0.839, + "nl": 0.82 + }, + "sae_top_2_test_accuracy": { + "en": 0.996, + "fr": 0.995, + "de": 0.959, + "es": 0.938, + "nl": 0.995 + }, + "sae_top_5_test_accuracy": { + "en": 0.996, + "fr": 0.997, + "de": 0.975, + "es": 0.995, + "nl": 0.995 + } + } + } +} \ No newline at end of file diff --git a/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..a4095bdf04234e74eeb753a9ac008262c7660597 --- /dev/null +++ b/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,670 @@ +{ + "eval_type_id": "sparse_probing", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "LabHC/bias_in_bios_class_set2", + "LabHC/bias_in_bios_class_set3", + "canrager/amazon_reviews_mcauley_1and5", + "canrager/amazon_reviews_mcauley_1and5_sentiment", + "codeparrot/github-code", + "fancyzhx/ag_news", + "Helsinki-NLP/europarl" + ], + "probe_train_set_size": 4000, + "probe_test_set_size": 1000, + "context_length": 128, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "model_name": "gemma-2-2b", + "k_values": [ + 1, + 2, + 5 + ], + "lower_vram_usage": false + }, + "eval_id": "5fcf2fd9-d1b6-4284-8363-b605ba094c6c", + "datetime_epoch_millis": 1740164658660, + "eval_result_metrics": { + "llm": { + "llm_test_accuracy": 0.9595375448465346, + "llm_top_1_test_accuracy": 0.64956875, + "llm_top_2_test_accuracy": 0.72589375, + "llm_top_5_test_accuracy": 0.78265625, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null + }, + "sae": { + "sae_test_accuracy": 0.9564437940716745, + "sae_top_1_test_accuracy": 0.68616875, + "sae_top_2_test_accuracy": 0.7763499999999999, + "sae_top_5_test_accuracy": 0.87125, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_results", + "llm_test_accuracy": 0.966800057888031, + "llm_top_1_test_accuracy": 0.6397999999999999, + "llm_top_2_test_accuracy": 0.6954, + "llm_top_5_test_accuracy": 0.7869999999999999, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9612000465393067, + "sae_top_1_test_accuracy": 0.6982, + "sae_top_2_test_accuracy": 0.8256, + "sae_top_5_test_accuracy": 0.8992000000000001, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set2_results", + "llm_test_accuracy": 0.9578000426292419, + "llm_top_1_test_accuracy": 0.6694000000000001, + "llm_top_2_test_accuracy": 0.725, + "llm_top_5_test_accuracy": 0.7654, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9466000437736511, + "sae_top_1_test_accuracy": 0.6604, + "sae_top_2_test_accuracy": 0.7849999999999999, + "sae_top_5_test_accuracy": 0.8326, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set3_results", + "llm_test_accuracy": 0.9316000461578369, + "llm_top_1_test_accuracy": 0.687, + "llm_top_2_test_accuracy": 0.7492, + "llm_top_5_test_accuracy": 0.7704000000000001, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9314000368118286, + "sae_top_1_test_accuracy": 0.6974, + "sae_top_2_test_accuracy": 0.7984, + "sae_top_5_test_accuracy": 0.853, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_results", + "llm_test_accuracy": 0.9202000379562378, + "llm_top_1_test_accuracy": 0.599, + "llm_top_2_test_accuracy": 0.6474, + "llm_top_5_test_accuracy": 0.6734, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9216000437736511, + "sae_top_1_test_accuracy": 0.6776000000000001, + "sae_top_2_test_accuracy": 0.7224, + "sae_top_5_test_accuracy": 0.8231999999999999, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results", + "llm_test_accuracy": 0.9795000553131104, + "llm_top_1_test_accuracy": 0.673, + "llm_top_2_test_accuracy": 0.724, + "llm_top_5_test_accuracy": 0.766, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9725000560283661, + "sae_top_1_test_accuracy": 0.644, + "sae_top_2_test_accuracy": 0.661, + "sae_top_5_test_accuracy": 0.909, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "codeparrot/github-code_results", + "llm_test_accuracy": 0.9708000421524048, + "llm_top_1_test_accuracy": 0.6451999999999999, + "llm_top_2_test_accuracy": 0.6960000000000001, + "llm_top_5_test_accuracy": 0.7766, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9682000517845154, + "sae_top_1_test_accuracy": 0.6364000000000001, + "sae_top_2_test_accuracy": 0.7253999999999999, + "sae_top_5_test_accuracy": 0.8042, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "fancyzhx/ag_news_results", + "llm_test_accuracy": 0.9500000476837158, + "llm_top_1_test_accuracy": 0.63775, + "llm_top_2_test_accuracy": 0.78175, + "llm_top_5_test_accuracy": 0.82125, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9502500593662262, + "sae_top_1_test_accuracy": 0.6407499999999999, + "sae_top_2_test_accuracy": 0.7829999999999999, + "sae_top_5_test_accuracy": 0.8530000000000001, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "Helsinki-NLP/europarl_results", + "llm_test_accuracy": 0.9996000289916992, + "llm_top_1_test_accuracy": 0.6454, + "llm_top_2_test_accuracy": 0.7884, + "llm_top_5_test_accuracy": 0.9012, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9998000144958497, + "sae_top_1_test_accuracy": 0.8346, + "sae_top_2_test_accuracy": 0.9099999999999999, + "sae_top_5_test_accuracy": 0.9957999999999998, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": { + "LabHC/bias_in_bios_class_set1_results": { + "sae_test_accuracy": { + "0": 0.9450000524520874, + "1": 0.9540000557899475, + "2": 0.9440000653266907, + "6": 0.984000027179718, + "9": 0.9790000319480896 + }, + "llm_test_accuracy": { + "0": 0.9510000348091125, + "1": 0.9670000672340393, + "2": 0.9530000686645508, + "6": 0.987000048160553, + "9": 0.9760000705718994 + }, + "llm_top_1_test_accuracy": { + "0": 0.577, + "1": 0.613, + "2": 0.662, + "6": 0.787, + "9": 0.56 + }, + "llm_top_2_test_accuracy": { + "0": 0.574, + "1": 0.66, + "2": 0.718, + "6": 0.811, + "9": 0.714 + }, + "llm_top_5_test_accuracy": { + "0": 0.713, + "1": 0.711, + "2": 0.755, + "6": 0.895, + "9": 0.861 + }, + "sae_top_1_test_accuracy": { + "0": 0.547, + "1": 0.666, + "2": 0.849, + "6": 0.82, + "9": 0.609 + }, + "sae_top_2_test_accuracy": { + "0": 0.806, + "1": 0.662, + "2": 0.89, + "6": 0.832, + "9": 0.938 + }, + "sae_top_5_test_accuracy": { + "0": 0.811, + "1": 0.849, + "2": 0.9, + "6": 0.991, + "9": 0.945 + } + }, + "LabHC/bias_in_bios_class_set2_results": { + "sae_test_accuracy": { + "11": 0.9540000557899475, + "13": 0.9420000314712524, + "14": 0.9500000476837158, + "18": 0.9240000247955322, + "19": 0.9630000591278076 + }, + "llm_test_accuracy": { + "11": 0.9690000414848328, + "13": 0.9600000381469727, + "14": 0.9600000381469727, + "18": 0.9390000700950623, + "19": 0.9610000252723694 + }, + "llm_top_1_test_accuracy": { + "11": 0.555, + "13": 0.668, + "14": 0.638, + "18": 0.69, + "19": 0.796 + }, + "llm_top_2_test_accuracy": { + "11": 0.756, + "13": 0.714, + "14": 0.67, + "18": 0.717, + "19": 0.768 + }, + "llm_top_5_test_accuracy": { + "11": 0.794, + "13": 0.749, + "14": 0.723, + "18": 0.73, + "19": 0.831 + }, + "sae_top_1_test_accuracy": { + "11": 0.565, + "13": 0.656, + "14": 0.628, + "18": 0.646, + "19": 0.807 + }, + "sae_top_2_test_accuracy": { + "11": 0.854, + "13": 0.679, + "14": 0.869, + "18": 0.699, + "19": 0.824 + }, + "sae_top_5_test_accuracy": { + "11": 0.953, + "13": 0.758, + "14": 0.891, + "18": 0.73, + "19": 0.831 + } + }, + "LabHC/bias_in_bios_class_set3_results": { + "sae_test_accuracy": { + "20": 0.9600000381469727, + "21": 0.9290000200271606, + "22": 0.9120000600814819, + "25": 0.9700000286102295, + "26": 0.8860000371932983 + }, + "llm_test_accuracy": { + "20": 0.956000030040741, + "21": 0.9350000619888306, + "22": 0.9180000424385071, + "25": 0.9640000462532043, + "26": 0.8850000500679016 + }, + "llm_top_1_test_accuracy": { + "20": 0.693, + "21": 0.775, + "22": 0.645, + "25": 0.706, + "26": 0.616 + }, + "llm_top_2_test_accuracy": { + "20": 0.827, + "21": 0.761, + "22": 0.694, + "25": 0.778, + "26": 0.686 + }, + "llm_top_5_test_accuracy": { + "20": 0.855, + "21": 0.791, + "22": 0.725, + "25": 0.809, + "26": 0.672 + }, + "sae_top_1_test_accuracy": { + "20": 0.568, + "21": 0.75, + "22": 0.858, + "25": 0.715, + "26": 0.596 + }, + "sae_top_2_test_accuracy": { + "20": 0.841, + "21": 0.782, + "22": 0.887, + "25": 0.865, + "26": 0.617 + }, + "sae_top_5_test_accuracy": { + "20": 0.913, + "21": 0.842, + "22": 0.883, + "25": 0.882, + "26": 0.745 + } + }, + "canrager/amazon_reviews_mcauley_1and5_results": { + "sae_test_accuracy": { + "1": 0.9500000476837158, + "2": 0.9350000619888306, + "3": 0.9240000247955322, + "5": 0.9180000424385071, + "6": 0.8810000419616699 + }, + "llm_test_accuracy": { + "1": 0.9580000638961792, + "2": 0.9330000281333923, + "3": 0.9280000329017639, + "5": 0.9200000166893005, + "6": 0.862000048160553 + }, + "llm_top_1_test_accuracy": { + "1": 0.647, + "2": 0.603, + "3": 0.598, + "5": 0.555, + "6": 0.592 + }, + "llm_top_2_test_accuracy": { + "1": 0.75, + "2": 0.648, + "3": 0.607, + "5": 0.606, + "6": 0.626 + }, + "llm_top_5_test_accuracy": { + "1": 0.767, + "2": 0.641, + "3": 0.645, + "5": 0.638, + "6": 0.676 + }, + "sae_top_1_test_accuracy": { + "1": 0.902, + "2": 0.852, + "3": 0.52, + "5": 0.555, + "6": 0.559 + }, + "sae_top_2_test_accuracy": { + "1": 0.907, + "2": 0.852, + "3": 0.553, + "5": 0.749, + "6": 0.551 + }, + "sae_top_5_test_accuracy": { + "1": 0.925, + "2": 0.882, + "3": 0.73, + "5": 0.852, + "6": 0.727 + } + }, + "canrager/amazon_reviews_mcauley_1and5_sentiment_results": { + "sae_test_accuracy": { + "1.0": 0.9720000624656677, + "5.0": 0.9730000495910645 + }, + "llm_test_accuracy": { + "1.0": 0.9780000448226929, + "5.0": 0.9810000658035278 + }, + "llm_top_1_test_accuracy": { + "1.0": 0.673, + "5.0": 0.673 + }, + "llm_top_2_test_accuracy": { + "1.0": 0.724, + "5.0": 0.724 + }, + "llm_top_5_test_accuracy": { + "1.0": 0.766, + "5.0": 0.766 + }, + "sae_top_1_test_accuracy": { + "1.0": 0.644, + "5.0": 0.644 + }, + "sae_top_2_test_accuracy": { + "1.0": 0.661, + "5.0": 0.661 + }, + "sae_top_5_test_accuracy": { + "1.0": 0.909, + "5.0": 0.909 + } + }, + "codeparrot/github-code_results": { + "sae_test_accuracy": { + "C": 0.9540000557899475, + "Python": 0.9860000610351562, + "HTML": 0.9820000529289246, + "Java": 0.9620000720024109, + "PHP": 0.9570000171661377 + }, + "llm_test_accuracy": { + "C": 0.956000030040741, + "Python": 0.984000027179718, + "HTML": 0.9900000691413879, + "Java": 0.9670000672340393, + "PHP": 0.9570000171661377 + }, + "llm_top_1_test_accuracy": { + "C": 0.666, + "Python": 0.626, + "HTML": 0.721, + "Java": 0.619, + "PHP": 0.594 + }, + "llm_top_2_test_accuracy": { + "C": 0.679, + "Python": 0.674, + "HTML": 0.8, + "Java": 0.676, + "PHP": 0.651 + }, + "llm_top_5_test_accuracy": { + "C": 0.783, + "Python": 0.717, + "HTML": 0.935, + "Java": 0.733, + "PHP": 0.715 + }, + "sae_top_1_test_accuracy": { + "C": 0.623, + "Python": 0.623, + "HTML": 0.689, + "Java": 0.632, + "PHP": 0.615 + }, + "sae_top_2_test_accuracy": { + "C": 0.617, + "Python": 0.657, + "HTML": 0.811, + "Java": 0.629, + "PHP": 0.913 + }, + "sae_top_5_test_accuracy": { + "C": 0.681, + "Python": 0.94, + "HTML": 0.833, + "Java": 0.645, + "PHP": 0.922 + } + }, + "fancyzhx/ag_news_results": { + "sae_test_accuracy": { + "0": 0.9390000700950623, + "1": 0.9790000319480896, + "2": 0.9390000700950623, + "3": 0.9440000653266907 + }, + "llm_test_accuracy": { + "0": 0.940000057220459, + "1": 0.9860000610351562, + "2": 0.9200000166893005, + "3": 0.9540000557899475 + }, + "llm_top_1_test_accuracy": { + "0": 0.573, + "1": 0.671, + "2": 0.672, + "3": 0.635 + }, + "llm_top_2_test_accuracy": { + "0": 0.802, + "1": 0.808, + "2": 0.701, + "3": 0.816 + }, + "llm_top_5_test_accuracy": { + "0": 0.81, + "1": 0.891, + "2": 0.752, + "3": 0.832 + }, + "sae_top_1_test_accuracy": { + "0": 0.67, + "1": 0.641, + "2": 0.546, + "3": 0.706 + }, + "sae_top_2_test_accuracy": { + "0": 0.695, + "1": 0.94, + "2": 0.76, + "3": 0.737 + }, + "sae_top_5_test_accuracy": { + "0": 0.842, + "1": 0.96, + "2": 0.809, + "3": 0.801 + } + }, + "Helsinki-NLP/europarl_results": { + "sae_test_accuracy": { + "en": 1.0, + "fr": 1.0, + "de": 1.0, + "es": 1.0, + "nl": 0.999000072479248 + }, + "llm_test_accuracy": { + "en": 1.0, + "fr": 0.999000072479248, + "de": 1.0, + "es": 0.999000072479248, + "nl": 1.0 + }, + "llm_top_1_test_accuracy": { + "en": 0.739, + "fr": 0.585, + "de": 0.758, + "es": 0.496, + "nl": 0.649 + }, + "llm_top_2_test_accuracy": { + "en": 0.829, + "fr": 0.582, + "de": 0.82, + "es": 0.958, + "nl": 0.753 + }, + "llm_top_5_test_accuracy": { + "en": 0.892, + "fr": 0.888, + "de": 0.894, + "es": 0.98, + "nl": 0.852 + }, + "sae_top_1_test_accuracy": { + "en": 0.718, + "fr": 0.993, + "de": 0.9, + "es": 0.902, + "nl": 0.66 + }, + "sae_top_2_test_accuracy": { + "en": 0.739, + "fr": 0.994, + "de": 0.907, + "es": 0.913, + "nl": 0.997 + }, + "sae_top_5_test_accuracy": { + "en": 0.998, + "fr": 0.996, + "de": 0.992, + "es": 0.993, + "nl": 1.0 + } + } + } +} \ No newline at end of file diff --git a/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..ad4b050a0c3c0cf5bf838ef9a1b27bb430dda109 --- /dev/null +++ b/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,670 @@ +{ + "eval_type_id": "sparse_probing", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "LabHC/bias_in_bios_class_set2", + "LabHC/bias_in_bios_class_set3", + "canrager/amazon_reviews_mcauley_1and5", + "canrager/amazon_reviews_mcauley_1and5_sentiment", + "codeparrot/github-code", + "fancyzhx/ag_news", + "Helsinki-NLP/europarl" + ], + "probe_train_set_size": 4000, + "probe_test_set_size": 1000, + "context_length": 128, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "model_name": "gemma-2-2b", + "k_values": [ + 1, + 2, + 5 + ], + "lower_vram_usage": false + }, + "eval_id": "82b9980d-82e4-4ea8-9e22-8e6990b9b64e", + "datetime_epoch_millis": 1740164519938, + "eval_result_metrics": { + "llm": { + "llm_test_accuracy": 0.9595375448465346, + "llm_top_1_test_accuracy": 0.64956875, + "llm_top_2_test_accuracy": 0.72589375, + "llm_top_5_test_accuracy": 0.78265625, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null + }, + "sae": { + "sae_test_accuracy": 0.9534375388175249, + "sae_top_1_test_accuracy": 0.7455499999999999, + "sae_top_2_test_accuracy": 0.81490625, + "sae_top_5_test_accuracy": 0.8733625, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_results", + "llm_test_accuracy": 0.966800057888031, + "llm_top_1_test_accuracy": 0.6397999999999999, + "llm_top_2_test_accuracy": 0.6954, + "llm_top_5_test_accuracy": 0.7869999999999999, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9592000484466553, + "sae_top_1_test_accuracy": 0.7891999999999999, + "sae_top_2_test_accuracy": 0.8113999999999999, + "sae_top_5_test_accuracy": 0.9046, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set2_results", + "llm_test_accuracy": 0.9578000426292419, + "llm_top_1_test_accuracy": 0.6694000000000001, + "llm_top_2_test_accuracy": 0.725, + "llm_top_5_test_accuracy": 0.7654, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9442000389099121, + "sae_top_1_test_accuracy": 0.6958, + "sae_top_2_test_accuracy": 0.8019999999999999, + "sae_top_5_test_accuracy": 0.8480000000000001, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set3_results", + "llm_test_accuracy": 0.9316000461578369, + "llm_top_1_test_accuracy": 0.687, + "llm_top_2_test_accuracy": 0.7492, + "llm_top_5_test_accuracy": 0.7704000000000001, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9224000334739685, + "sae_top_1_test_accuracy": 0.7988, + "sae_top_2_test_accuracy": 0.836, + "sae_top_5_test_accuracy": 0.8666, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_results", + "llm_test_accuracy": 0.9202000379562378, + "llm_top_1_test_accuracy": 0.599, + "llm_top_2_test_accuracy": 0.6474, + "llm_top_5_test_accuracy": 0.6734, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9188000440597535, + "sae_top_1_test_accuracy": 0.6394, + "sae_top_2_test_accuracy": 0.7183999999999999, + "sae_top_5_test_accuracy": 0.798, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results", + "llm_test_accuracy": 0.9795000553131104, + "llm_top_1_test_accuracy": 0.673, + "llm_top_2_test_accuracy": 0.724, + "llm_top_5_test_accuracy": 0.766, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9685000479221344, + "sae_top_1_test_accuracy": 0.76, + "sae_top_2_test_accuracy": 0.931, + "sae_top_5_test_accuracy": 0.939, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "codeparrot/github-code_results", + "llm_test_accuracy": 0.9708000421524048, + "llm_top_1_test_accuracy": 0.6451999999999999, + "llm_top_2_test_accuracy": 0.6960000000000001, + "llm_top_5_test_accuracy": 0.7766, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9700000286102295, + "sae_top_1_test_accuracy": 0.6275999999999999, + "sae_top_2_test_accuracy": 0.7262000000000001, + "sae_top_5_test_accuracy": 0.7979999999999999, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "fancyzhx/ag_news_results", + "llm_test_accuracy": 0.9500000476837158, + "llm_top_1_test_accuracy": 0.63775, + "llm_top_2_test_accuracy": 0.78175, + "llm_top_5_test_accuracy": 0.82125, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9460000246763229, + "sae_top_1_test_accuracy": 0.713, + "sae_top_2_test_accuracy": 0.75225, + "sae_top_5_test_accuracy": 0.8434999999999999, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "Helsinki-NLP/europarl_results", + "llm_test_accuracy": 0.9996000289916992, + "llm_top_1_test_accuracy": 0.6454, + "llm_top_2_test_accuracy": 0.7884, + "llm_top_5_test_accuracy": 0.9012, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9984000444412231, + "sae_top_1_test_accuracy": 0.9406000000000001, + "sae_top_2_test_accuracy": 0.942, + "sae_top_5_test_accuracy": 0.9892, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": { + "LabHC/bias_in_bios_class_set1_results": { + "sae_test_accuracy": { + "0": 0.9390000700950623, + "1": 0.9580000638961792, + "2": 0.9500000476837158, + "6": 0.984000027179718, + "9": 0.9650000333786011 + }, + "llm_test_accuracy": { + "0": 0.9510000348091125, + "1": 0.9670000672340393, + "2": 0.9530000686645508, + "6": 0.987000048160553, + "9": 0.9760000705718994 + }, + "llm_top_1_test_accuracy": { + "0": 0.577, + "1": 0.613, + "2": 0.662, + "6": 0.787, + "9": 0.56 + }, + "llm_top_2_test_accuracy": { + "0": 0.574, + "1": 0.66, + "2": 0.718, + "6": 0.811, + "9": 0.714 + }, + "llm_top_5_test_accuracy": { + "0": 0.713, + "1": 0.711, + "2": 0.755, + "6": 0.895, + "9": 0.861 + }, + "sae_top_1_test_accuracy": { + "0": 0.573, + "1": 0.579, + "2": 0.896, + "6": 0.961, + "9": 0.937 + }, + "sae_top_2_test_accuracy": { + "0": 0.624, + "1": 0.633, + "2": 0.905, + "6": 0.961, + "9": 0.934 + }, + "sae_top_5_test_accuracy": { + "0": 0.856, + "1": 0.83, + "2": 0.91, + "6": 0.984, + "9": 0.943 + } + }, + "LabHC/bias_in_bios_class_set2_results": { + "sae_test_accuracy": { + "11": 0.9520000219345093, + "13": 0.9470000267028809, + "14": 0.940000057220459, + "18": 0.9190000295639038, + "19": 0.9630000591278076 + }, + "llm_test_accuracy": { + "11": 0.9690000414848328, + "13": 0.9600000381469727, + "14": 0.9600000381469727, + "18": 0.9390000700950623, + "19": 0.9610000252723694 + }, + "llm_top_1_test_accuracy": { + "11": 0.555, + "13": 0.668, + "14": 0.638, + "18": 0.69, + "19": 0.796 + }, + "llm_top_2_test_accuracy": { + "11": 0.756, + "13": 0.714, + "14": 0.67, + "18": 0.717, + "19": 0.768 + }, + "llm_top_5_test_accuracy": { + "11": 0.794, + "13": 0.749, + "14": 0.723, + "18": 0.73, + "19": 0.831 + }, + "sae_top_1_test_accuracy": { + "11": 0.618, + "13": 0.646, + "14": 0.731, + "18": 0.641, + "19": 0.843 + }, + "sae_top_2_test_accuracy": { + "11": 0.723, + "13": 0.689, + "14": 0.854, + "18": 0.882, + "19": 0.862 + }, + "sae_top_5_test_accuracy": { + "11": 0.835, + "13": 0.746, + "14": 0.858, + "18": 0.899, + "19": 0.902 + } + }, + "LabHC/bias_in_bios_class_set3_results": { + "sae_test_accuracy": { + "20": 0.9520000219345093, + "21": 0.9160000681877136, + "22": 0.9100000262260437, + "25": 0.9470000267028809, + "26": 0.8870000243186951 + }, + "llm_test_accuracy": { + "20": 0.956000030040741, + "21": 0.9350000619888306, + "22": 0.9180000424385071, + "25": 0.9640000462532043, + "26": 0.8850000500679016 + }, + "llm_top_1_test_accuracy": { + "20": 0.693, + "21": 0.775, + "22": 0.645, + "25": 0.706, + "26": 0.616 + }, + "llm_top_2_test_accuracy": { + "20": 0.827, + "21": 0.761, + "22": 0.694, + "25": 0.778, + "26": 0.686 + }, + "llm_top_5_test_accuracy": { + "20": 0.855, + "21": 0.791, + "22": 0.725, + "25": 0.809, + "26": 0.672 + }, + "sae_top_1_test_accuracy": { + "20": 0.909, + "21": 0.795, + "22": 0.799, + "25": 0.876, + "26": 0.615 + }, + "sae_top_2_test_accuracy": { + "20": 0.914, + "21": 0.812, + "22": 0.872, + "25": 0.887, + "26": 0.695 + }, + "sae_top_5_test_accuracy": { + "20": 0.921, + "21": 0.817, + "22": 0.86, + "25": 0.918, + "26": 0.817 + } + }, + "canrager/amazon_reviews_mcauley_1and5_results": { + "sae_test_accuracy": { + "1": 0.9510000348091125, + "2": 0.937000036239624, + "3": 0.9130000472068787, + "5": 0.921000063419342, + "6": 0.8720000386238098 + }, + "llm_test_accuracy": { + "1": 0.9580000638961792, + "2": 0.9330000281333923, + "3": 0.9280000329017639, + "5": 0.9200000166893005, + "6": 0.862000048160553 + }, + "llm_top_1_test_accuracy": { + "1": 0.647, + "2": 0.603, + "3": 0.598, + "5": 0.555, + "6": 0.592 + }, + "llm_top_2_test_accuracy": { + "1": 0.75, + "2": 0.648, + "3": 0.607, + "5": 0.606, + "6": 0.626 + }, + "llm_top_5_test_accuracy": { + "1": 0.767, + "2": 0.641, + "3": 0.645, + "5": 0.638, + "6": 0.676 + }, + "sae_top_1_test_accuracy": { + "1": 0.724, + "2": 0.626, + "3": 0.577, + "5": 0.553, + "6": 0.717 + }, + "sae_top_2_test_accuracy": { + "1": 0.733, + "2": 0.859, + "3": 0.615, + "5": 0.666, + "6": 0.719 + }, + "sae_top_5_test_accuracy": { + "1": 0.817, + "2": 0.887, + "3": 0.704, + "5": 0.826, + "6": 0.756 + } + }, + "canrager/amazon_reviews_mcauley_1and5_sentiment_results": { + "sae_test_accuracy": { + "1.0": 0.9690000414848328, + "5.0": 0.968000054359436 + }, + "llm_test_accuracy": { + "1.0": 0.9780000448226929, + "5.0": 0.9810000658035278 + }, + "llm_top_1_test_accuracy": { + "1.0": 0.673, + "5.0": 0.673 + }, + "llm_top_2_test_accuracy": { + "1.0": 0.724, + "5.0": 0.724 + }, + "llm_top_5_test_accuracy": { + "1.0": 0.766, + "5.0": 0.766 + }, + "sae_top_1_test_accuracy": { + "1.0": 0.76, + "5.0": 0.76 + }, + "sae_top_2_test_accuracy": { + "1.0": 0.931, + "5.0": 0.931 + }, + "sae_top_5_test_accuracy": { + "1.0": 0.939, + "5.0": 0.939 + } + }, + "codeparrot/github-code_results": { + "sae_test_accuracy": { + "C": 0.956000030040741, + "Python": 0.9890000224113464, + "HTML": 0.984000027179718, + "Java": 0.9650000333786011, + "PHP": 0.956000030040741 + }, + "llm_test_accuracy": { + "C": 0.956000030040741, + "Python": 0.984000027179718, + "HTML": 0.9900000691413879, + "Java": 0.9670000672340393, + "PHP": 0.9570000171661377 + }, + "llm_top_1_test_accuracy": { + "C": 0.666, + "Python": 0.626, + "HTML": 0.721, + "Java": 0.619, + "PHP": 0.594 + }, + "llm_top_2_test_accuracy": { + "C": 0.679, + "Python": 0.674, + "HTML": 0.8, + "Java": 0.676, + "PHP": 0.651 + }, + "llm_top_5_test_accuracy": { + "C": 0.783, + "Python": 0.717, + "HTML": 0.935, + "Java": 0.733, + "PHP": 0.715 + }, + "sae_top_1_test_accuracy": { + "C": 0.576, + "Python": 0.547, + "HTML": 0.768, + "Java": 0.652, + "PHP": 0.595 + }, + "sae_top_2_test_accuracy": { + "C": 0.595, + "Python": 0.699, + "HTML": 0.777, + "Java": 0.638, + "PHP": 0.922 + }, + "sae_top_5_test_accuracy": { + "C": 0.705, + "Python": 0.715, + "HTML": 0.936, + "Java": 0.714, + "PHP": 0.92 + } + }, + "fancyzhx/ag_news_results": { + "sae_test_accuracy": { + "0": 0.9340000152587891, + "1": 0.9800000190734863, + "2": 0.9200000166893005, + "3": 0.9500000476837158 + }, + "llm_test_accuracy": { + "0": 0.940000057220459, + "1": 0.9860000610351562, + "2": 0.9200000166893005, + "3": 0.9540000557899475 + }, + "llm_top_1_test_accuracy": { + "0": 0.573, + "1": 0.671, + "2": 0.672, + "3": 0.635 + }, + "llm_top_2_test_accuracy": { + "0": 0.802, + "1": 0.808, + "2": 0.701, + "3": 0.816 + }, + "llm_top_5_test_accuracy": { + "0": 0.81, + "1": 0.891, + "2": 0.752, + "3": 0.832 + }, + "sae_top_1_test_accuracy": { + "0": 0.67, + "1": 0.811, + "2": 0.741, + "3": 0.63 + }, + "sae_top_2_test_accuracy": { + "0": 0.745, + "1": 0.909, + "2": 0.734, + "3": 0.621 + }, + "sae_top_5_test_accuracy": { + "0": 0.847, + "1": 0.927, + "2": 0.813, + "3": 0.787 + } + }, + "Helsinki-NLP/europarl_results": { + "sae_test_accuracy": { + "en": 0.9980000257492065, + "fr": 0.999000072479248, + "de": 0.9980000257492065, + "es": 0.999000072479248, + "nl": 0.9980000257492065 + }, + "llm_test_accuracy": { + "en": 1.0, + "fr": 0.999000072479248, + "de": 1.0, + "es": 0.999000072479248, + "nl": 1.0 + }, + "llm_top_1_test_accuracy": { + "en": 0.739, + "fr": 0.585, + "de": 0.758, + "es": 0.496, + "nl": 0.649 + }, + "llm_top_2_test_accuracy": { + "en": 0.829, + "fr": 0.582, + "de": 0.82, + "es": 0.958, + "nl": 0.753 + }, + "llm_top_5_test_accuracy": { + "en": 0.892, + "fr": 0.888, + "de": 0.894, + "es": 0.98, + "nl": 0.852 + }, + "sae_top_1_test_accuracy": { + "en": 0.739, + "fr": 0.991, + "de": 0.98, + "es": 0.995, + "nl": 0.998 + }, + "sae_top_2_test_accuracy": { + "en": 0.747, + "fr": 0.99, + "de": 0.98, + "es": 0.995, + "nl": 0.998 + }, + "sae_top_5_test_accuracy": { + "en": 0.97, + "fr": 0.996, + "de": 0.988, + "es": 0.994, + "nl": 0.998 + } + } + } +} \ No newline at end of file diff --git a/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..c634b76cda3d3da4982cc1e58c5011c7ad0ab741 --- /dev/null +++ b/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,670 @@ +{ + "eval_type_id": "sparse_probing", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "LabHC/bias_in_bios_class_set2", + "LabHC/bias_in_bios_class_set3", + "canrager/amazon_reviews_mcauley_1and5", + "canrager/amazon_reviews_mcauley_1and5_sentiment", + "codeparrot/github-code", + "fancyzhx/ag_news", + "Helsinki-NLP/europarl" + ], + "probe_train_set_size": 4000, + "probe_test_set_size": 1000, + "context_length": 128, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "model_name": "gemma-2-2b", + "k_values": [ + 1, + 2, + 5 + ], + "lower_vram_usage": false + }, + "eval_id": "ead5eae1-ac4a-495a-aa9d-6980c16e8482", + "datetime_epoch_millis": 1740164809112, + "eval_result_metrics": { + "llm": { + "llm_test_accuracy": 0.9595375448465346, + "llm_top_1_test_accuracy": 0.64956875, + "llm_top_2_test_accuracy": 0.72589375, + "llm_top_5_test_accuracy": 0.78265625, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null + }, + "sae": { + "sae_test_accuracy": 0.9516125384718179, + "sae_top_1_test_accuracy": 0.7007125, + "sae_top_2_test_accuracy": 0.808825, + "sae_top_5_test_accuracy": 0.8754875000000001, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_results", + "llm_test_accuracy": 0.966800057888031, + "llm_top_1_test_accuracy": 0.6397999999999999, + "llm_top_2_test_accuracy": 0.6954, + "llm_top_5_test_accuracy": 0.7869999999999999, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9568000435829163, + "sae_top_1_test_accuracy": 0.6923999999999999, + "sae_top_2_test_accuracy": 0.8591999999999999, + "sae_top_5_test_accuracy": 0.9028, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set2_results", + "llm_test_accuracy": 0.9578000426292419, + "llm_top_1_test_accuracy": 0.6694000000000001, + "llm_top_2_test_accuracy": 0.725, + "llm_top_5_test_accuracy": 0.7654, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9420000433921814, + "sae_top_1_test_accuracy": 0.671, + "sae_top_2_test_accuracy": 0.7924, + "sae_top_5_test_accuracy": 0.8718, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set3_results", + "llm_test_accuracy": 0.9316000461578369, + "llm_top_1_test_accuracy": 0.687, + "llm_top_2_test_accuracy": 0.7492, + "llm_top_5_test_accuracy": 0.7704000000000001, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9232000350952149, + "sae_top_1_test_accuracy": 0.7499999999999999, + "sae_top_2_test_accuracy": 0.7924, + "sae_top_5_test_accuracy": 0.8560000000000001, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_results", + "llm_test_accuracy": 0.9202000379562378, + "llm_top_1_test_accuracy": 0.599, + "llm_top_2_test_accuracy": 0.6474, + "llm_top_5_test_accuracy": 0.6734, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9094000339508057, + "sae_top_1_test_accuracy": 0.6928, + "sae_top_2_test_accuracy": 0.744, + "sae_top_5_test_accuracy": 0.8098000000000001, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results", + "llm_test_accuracy": 0.9795000553131104, + "llm_top_1_test_accuracy": 0.673, + "llm_top_2_test_accuracy": 0.724, + "llm_top_5_test_accuracy": 0.766, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9650000333786011, + "sae_top_1_test_accuracy": 0.745, + "sae_top_2_test_accuracy": 0.895, + "sae_top_5_test_accuracy": 0.936, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "codeparrot/github-code_results", + "llm_test_accuracy": 0.9708000421524048, + "llm_top_1_test_accuracy": 0.6451999999999999, + "llm_top_2_test_accuracy": 0.6960000000000001, + "llm_top_5_test_accuracy": 0.7766, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9684000372886657, + "sae_top_1_test_accuracy": 0.5955999999999999, + "sae_top_2_test_accuracy": 0.7612, + "sae_top_5_test_accuracy": 0.7986000000000001, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "fancyzhx/ag_news_results", + "llm_test_accuracy": 0.9500000476837158, + "llm_top_1_test_accuracy": 0.63775, + "llm_top_2_test_accuracy": 0.78175, + "llm_top_5_test_accuracy": 0.82125, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9495000392198563, + "sae_top_1_test_accuracy": 0.6034999999999999, + "sae_top_2_test_accuracy": 0.6970000000000001, + "sae_top_5_test_accuracy": 0.8434999999999999, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "Helsinki-NLP/europarl_results", + "llm_test_accuracy": 0.9996000289916992, + "llm_top_1_test_accuracy": 0.6454, + "llm_top_2_test_accuracy": 0.7884, + "llm_top_5_test_accuracy": 0.9012, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9986000418663025, + "sae_top_1_test_accuracy": 0.8554, + "sae_top_2_test_accuracy": 0.9294, + "sae_top_5_test_accuracy": 0.9853999999999999, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": { + "LabHC/bias_in_bios_class_set1_results": { + "sae_test_accuracy": { + "0": 0.9350000619888306, + "1": 0.9600000381469727, + "2": 0.937000036239624, + "6": 0.987000048160553, + "9": 0.9650000333786011 + }, + "llm_test_accuracy": { + "0": 0.9510000348091125, + "1": 0.9670000672340393, + "2": 0.9530000686645508, + "6": 0.987000048160553, + "9": 0.9760000705718994 + }, + "llm_top_1_test_accuracy": { + "0": 0.577, + "1": 0.613, + "2": 0.662, + "6": 0.787, + "9": 0.56 + }, + "llm_top_2_test_accuracy": { + "0": 0.574, + "1": 0.66, + "2": 0.718, + "6": 0.811, + "9": 0.714 + }, + "llm_top_5_test_accuracy": { + "0": 0.713, + "1": 0.711, + "2": 0.755, + "6": 0.895, + "9": 0.861 + }, + "sae_top_1_test_accuracy": { + "0": 0.574, + "1": 0.632, + "2": 0.891, + "6": 0.803, + "9": 0.562 + }, + "sae_top_2_test_accuracy": { + "0": 0.874, + "1": 0.741, + "2": 0.898, + "6": 0.94, + "9": 0.843 + }, + "sae_top_5_test_accuracy": { + "0": 0.879, + "1": 0.872, + "2": 0.895, + "6": 0.967, + "9": 0.901 + } + }, + "LabHC/bias_in_bios_class_set2_results": { + "sae_test_accuracy": { + "11": 0.9510000348091125, + "13": 0.9440000653266907, + "14": 0.9460000395774841, + "18": 0.9120000600814819, + "19": 0.9570000171661377 + }, + "llm_test_accuracy": { + "11": 0.9690000414848328, + "13": 0.9600000381469727, + "14": 0.9600000381469727, + "18": 0.9390000700950623, + "19": 0.9610000252723694 + }, + "llm_top_1_test_accuracy": { + "11": 0.555, + "13": 0.668, + "14": 0.638, + "18": 0.69, + "19": 0.796 + }, + "llm_top_2_test_accuracy": { + "11": 0.756, + "13": 0.714, + "14": 0.67, + "18": 0.717, + "19": 0.768 + }, + "llm_top_5_test_accuracy": { + "11": 0.794, + "13": 0.749, + "14": 0.723, + "18": 0.73, + "19": 0.831 + }, + "sae_top_1_test_accuracy": { + "11": 0.55, + "13": 0.667, + "14": 0.652, + "18": 0.697, + "19": 0.789 + }, + "sae_top_2_test_accuracy": { + "11": 0.853, + "13": 0.705, + "14": 0.88, + "18": 0.732, + "19": 0.792 + }, + "sae_top_5_test_accuracy": { + "11": 0.916, + "13": 0.749, + "14": 0.883, + "18": 0.917, + "19": 0.894 + } + }, + "LabHC/bias_in_bios_class_set3_results": { + "sae_test_accuracy": { + "20": 0.9530000686645508, + "21": 0.9240000247955322, + "22": 0.9010000228881836, + "25": 0.9520000219345093, + "26": 0.8860000371932983 + }, + "llm_test_accuracy": { + "20": 0.956000030040741, + "21": 0.9350000619888306, + "22": 0.9180000424385071, + "25": 0.9640000462532043, + "26": 0.8850000500679016 + }, + "llm_top_1_test_accuracy": { + "20": 0.693, + "21": 0.775, + "22": 0.645, + "25": 0.706, + "26": 0.616 + }, + "llm_top_2_test_accuracy": { + "20": 0.827, + "21": 0.761, + "22": 0.694, + "25": 0.778, + "26": 0.686 + }, + "llm_top_5_test_accuracy": { + "20": 0.855, + "21": 0.791, + "22": 0.725, + "25": 0.809, + "26": 0.672 + }, + "sae_top_1_test_accuracy": { + "20": 0.852, + "21": 0.736, + "22": 0.829, + "25": 0.699, + "26": 0.634 + }, + "sae_top_2_test_accuracy": { + "20": 0.86, + "21": 0.78, + "22": 0.852, + "25": 0.837, + "26": 0.633 + }, + "sae_top_5_test_accuracy": { + "20": 0.93, + "21": 0.833, + "22": 0.851, + "25": 0.875, + "26": 0.791 + } + }, + "canrager/amazon_reviews_mcauley_1and5_results": { + "sae_test_accuracy": { + "1": 0.9360000491142273, + "2": 0.9330000281333923, + "3": 0.9150000214576721, + "5": 0.8990000486373901, + "6": 0.8640000224113464 + }, + "llm_test_accuracy": { + "1": 0.9580000638961792, + "2": 0.9330000281333923, + "3": 0.9280000329017639, + "5": 0.9200000166893005, + "6": 0.862000048160553 + }, + "llm_top_1_test_accuracy": { + "1": 0.647, + "2": 0.603, + "3": 0.598, + "5": 0.555, + "6": 0.592 + }, + "llm_top_2_test_accuracy": { + "1": 0.75, + "2": 0.648, + "3": 0.607, + "5": 0.606, + "6": 0.626 + }, + "llm_top_5_test_accuracy": { + "1": 0.767, + "2": 0.641, + "3": 0.645, + "5": 0.638, + "6": 0.676 + }, + "sae_top_1_test_accuracy": { + "1": 0.882, + "2": 0.869, + "3": 0.551, + "5": 0.537, + "6": 0.625 + }, + "sae_top_2_test_accuracy": { + "1": 0.885, + "2": 0.866, + "3": 0.657, + "5": 0.606, + "6": 0.706 + }, + "sae_top_5_test_accuracy": { + "1": 0.908, + "2": 0.887, + "3": 0.656, + "5": 0.869, + "6": 0.729 + } + }, + "canrager/amazon_reviews_mcauley_1and5_sentiment_results": { + "sae_test_accuracy": { + "1.0": 0.9640000462532043, + "5.0": 0.9660000205039978 + }, + "llm_test_accuracy": { + "1.0": 0.9780000448226929, + "5.0": 0.9810000658035278 + }, + "llm_top_1_test_accuracy": { + "1.0": 0.673, + "5.0": 0.673 + }, + "llm_top_2_test_accuracy": { + "1.0": 0.724, + "5.0": 0.724 + }, + "llm_top_5_test_accuracy": { + "1.0": 0.766, + "5.0": 0.766 + }, + "sae_top_1_test_accuracy": { + "1.0": 0.745, + "5.0": 0.745 + }, + "sae_top_2_test_accuracy": { + "1.0": 0.895, + "5.0": 0.895 + }, + "sae_top_5_test_accuracy": { + "1.0": 0.936, + "5.0": 0.936 + } + }, + "codeparrot/github-code_results": { + "sae_test_accuracy": { + "C": 0.9570000171661377, + "Python": 0.9830000400543213, + "HTML": 0.9810000658035278, + "Java": 0.9640000462532043, + "PHP": 0.9570000171661377 + }, + "llm_test_accuracy": { + "C": 0.956000030040741, + "Python": 0.984000027179718, + "HTML": 0.9900000691413879, + "Java": 0.9670000672340393, + "PHP": 0.9570000171661377 + }, + "llm_top_1_test_accuracy": { + "C": 0.666, + "Python": 0.626, + "HTML": 0.721, + "Java": 0.619, + "PHP": 0.594 + }, + "llm_top_2_test_accuracy": { + "C": 0.679, + "Python": 0.674, + "HTML": 0.8, + "Java": 0.676, + "PHP": 0.651 + }, + "llm_top_5_test_accuracy": { + "C": 0.783, + "Python": 0.717, + "HTML": 0.935, + "Java": 0.733, + "PHP": 0.715 + }, + "sae_top_1_test_accuracy": { + "C": 0.523, + "Python": 0.641, + "HTML": 0.596, + "Java": 0.623, + "PHP": 0.595 + }, + "sae_top_2_test_accuracy": { + "C": 0.61, + "Python": 0.881, + "HTML": 0.74, + "Java": 0.66, + "PHP": 0.915 + }, + "sae_top_5_test_accuracy": { + "C": 0.643, + "Python": 0.883, + "HTML": 0.897, + "Java": 0.657, + "PHP": 0.913 + } + }, + "fancyzhx/ag_news_results": { + "sae_test_accuracy": { + "0": 0.9320000410079956, + "1": 0.9830000400543213, + "2": 0.9290000200271606, + "3": 0.9540000557899475 + }, + "llm_test_accuracy": { + "0": 0.940000057220459, + "1": 0.9860000610351562, + "2": 0.9200000166893005, + "3": 0.9540000557899475 + }, + "llm_top_1_test_accuracy": { + "0": 0.573, + "1": 0.671, + "2": 0.672, + "3": 0.635 + }, + "llm_top_2_test_accuracy": { + "0": 0.802, + "1": 0.808, + "2": 0.701, + "3": 0.816 + }, + "llm_top_5_test_accuracy": { + "0": 0.81, + "1": 0.891, + "2": 0.752, + "3": 0.832 + }, + "sae_top_1_test_accuracy": { + "0": 0.583, + "1": 0.652, + "2": 0.554, + "3": 0.625 + }, + "sae_top_2_test_accuracy": { + "0": 0.667, + "1": 0.689, + "2": 0.748, + "3": 0.684 + }, + "sae_top_5_test_accuracy": { + "0": 0.81, + "1": 0.915, + "2": 0.841, + "3": 0.808 + } + }, + "Helsinki-NLP/europarl_results": { + "sae_test_accuracy": { + "en": 0.9980000257492065, + "fr": 0.999000072479248, + "de": 1.0, + "es": 0.9970000386238098, + "nl": 0.999000072479248 + }, + "llm_test_accuracy": { + "en": 1.0, + "fr": 0.999000072479248, + "de": 1.0, + "es": 0.999000072479248, + "nl": 1.0 + }, + "llm_top_1_test_accuracy": { + "en": 0.739, + "fr": 0.585, + "de": 0.758, + "es": 0.496, + "nl": 0.649 + }, + "llm_top_2_test_accuracy": { + "en": 0.829, + "fr": 0.582, + "de": 0.82, + "es": 0.958, + "nl": 0.753 + }, + "llm_top_5_test_accuracy": { + "en": 0.892, + "fr": 0.888, + "de": 0.894, + "es": 0.98, + "nl": 0.852 + }, + "sae_top_1_test_accuracy": { + "en": 0.737, + "fr": 0.985, + "de": 0.93, + "es": 0.986, + "nl": 0.639 + }, + "sae_top_2_test_accuracy": { + "en": 0.972, + "fr": 0.991, + "de": 0.937, + "es": 0.989, + "nl": 0.758 + }, + "sae_top_5_test_accuracy": { + "en": 0.998, + "fr": 0.996, + "de": 0.944, + "es": 0.993, + "nl": 0.996 + } + } + } +} \ No newline at end of file diff --git a/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..7ef0093373caede7bb3705b8879698190a9df5ff --- /dev/null +++ b/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,670 @@ +{ + "eval_type_id": "sparse_probing", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "LabHC/bias_in_bios_class_set2", + "LabHC/bias_in_bios_class_set3", + "canrager/amazon_reviews_mcauley_1and5", + "canrager/amazon_reviews_mcauley_1and5_sentiment", + "codeparrot/github-code", + "fancyzhx/ag_news", + "Helsinki-NLP/europarl" + ], + "probe_train_set_size": 4000, + "probe_test_set_size": 1000, + "context_length": 128, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "model_name": "gemma-2-2b", + "k_values": [ + 1, + 2, + 5 + ], + "lower_vram_usage": false + }, + "eval_id": "01df6242-51fa-47d4-af93-9c80a172184d", + "datetime_epoch_millis": 1740165335112, + "eval_result_metrics": { + "llm": { + "llm_test_accuracy": 0.9595375448465346, + "llm_top_1_test_accuracy": 0.64956875, + "llm_top_2_test_accuracy": 0.72589375, + "llm_top_5_test_accuracy": 0.78265625, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null + }, + "sae": { + "sae_test_accuracy": 0.9563000392168761, + "sae_top_1_test_accuracy": 0.7563249999999999, + "sae_top_2_test_accuracy": 0.80766875, + "sae_top_5_test_accuracy": 0.861175, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_results", + "llm_test_accuracy": 0.966800057888031, + "llm_top_1_test_accuracy": 0.6397999999999999, + "llm_top_2_test_accuracy": 0.6954, + "llm_top_5_test_accuracy": 0.7869999999999999, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9606000423431397, + "sae_top_1_test_accuracy": 0.735, + "sae_top_2_test_accuracy": 0.8328, + "sae_top_5_test_accuracy": 0.8700000000000001, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set2_results", + "llm_test_accuracy": 0.9578000426292419, + "llm_top_1_test_accuracy": 0.6694000000000001, + "llm_top_2_test_accuracy": 0.725, + "llm_top_5_test_accuracy": 0.7654, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9454000473022461, + "sae_top_1_test_accuracy": 0.7318, + "sae_top_2_test_accuracy": 0.7548, + "sae_top_5_test_accuracy": 0.8513999999999999, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set3_results", + "llm_test_accuracy": 0.9316000461578369, + "llm_top_1_test_accuracy": 0.687, + "llm_top_2_test_accuracy": 0.7492, + "llm_top_5_test_accuracy": 0.7704000000000001, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.931600034236908, + "sae_top_1_test_accuracy": 0.7748000000000002, + "sae_top_2_test_accuracy": 0.8088000000000001, + "sae_top_5_test_accuracy": 0.8513999999999999, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_results", + "llm_test_accuracy": 0.9202000379562378, + "llm_top_1_test_accuracy": 0.599, + "llm_top_2_test_accuracy": 0.6474, + "llm_top_5_test_accuracy": 0.6734, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9192000389099121, + "sae_top_1_test_accuracy": 0.683, + "sae_top_2_test_accuracy": 0.6950000000000001, + "sae_top_5_test_accuracy": 0.7598, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results", + "llm_test_accuracy": 0.9795000553131104, + "llm_top_1_test_accuracy": 0.673, + "llm_top_2_test_accuracy": 0.724, + "llm_top_5_test_accuracy": 0.766, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9755000472068787, + "sae_top_1_test_accuracy": 0.94, + "sae_top_2_test_accuracy": 0.941, + "sae_top_5_test_accuracy": 0.95, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "codeparrot/github-code_results", + "llm_test_accuracy": 0.9708000421524048, + "llm_top_1_test_accuracy": 0.6451999999999999, + "llm_top_2_test_accuracy": 0.6960000000000001, + "llm_top_5_test_accuracy": 0.7766, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9714000463485718, + "sae_top_1_test_accuracy": 0.6077999999999999, + "sae_top_2_test_accuracy": 0.7322, + "sae_top_5_test_accuracy": 0.7504, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "fancyzhx/ag_news_results", + "llm_test_accuracy": 0.9500000476837158, + "llm_top_1_test_accuracy": 0.63775, + "llm_top_2_test_accuracy": 0.78175, + "llm_top_5_test_accuracy": 0.82125, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9485000222921371, + "sae_top_1_test_accuracy": 0.6849999999999999, + "sae_top_2_test_accuracy": 0.74675, + "sae_top_5_test_accuracy": 0.859, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "Helsinki-NLP/europarl_results", + "llm_test_accuracy": 0.9996000289916992, + "llm_top_1_test_accuracy": 0.6454, + "llm_top_2_test_accuracy": 0.7884, + "llm_top_5_test_accuracy": 0.9012, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9982000350952148, + "sae_top_1_test_accuracy": 0.8932, + "sae_top_2_test_accuracy": 0.95, + "sae_top_5_test_accuracy": 0.9974000000000001, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": { + "LabHC/bias_in_bios_class_set1_results": { + "sae_test_accuracy": { + "0": 0.9460000395774841, + "1": 0.9550000429153442, + "2": 0.9440000653266907, + "6": 0.9830000400543213, + "9": 0.9750000238418579 + }, + "llm_test_accuracy": { + "0": 0.9510000348091125, + "1": 0.9670000672340393, + "2": 0.9530000686645508, + "6": 0.987000048160553, + "9": 0.9760000705718994 + }, + "llm_top_1_test_accuracy": { + "0": 0.577, + "1": 0.613, + "2": 0.662, + "6": 0.787, + "9": 0.56 + }, + "llm_top_2_test_accuracy": { + "0": 0.574, + "1": 0.66, + "2": 0.718, + "6": 0.811, + "9": 0.714 + }, + "llm_top_5_test_accuracy": { + "0": 0.713, + "1": 0.711, + "2": 0.755, + "6": 0.895, + "9": 0.861 + }, + "sae_top_1_test_accuracy": { + "0": 0.876, + "1": 0.586, + "2": 0.852, + "6": 0.751, + "9": 0.61 + }, + "sae_top_2_test_accuracy": { + "0": 0.878, + "1": 0.607, + "2": 0.848, + "6": 0.976, + "9": 0.855 + }, + "sae_top_5_test_accuracy": { + "0": 0.884, + "1": 0.696, + "2": 0.864, + "6": 0.982, + "9": 0.924 + } + }, + "LabHC/bias_in_bios_class_set2_results": { + "sae_test_accuracy": { + "11": 0.9500000476837158, + "13": 0.9490000605583191, + "14": 0.9550000429153442, + "18": 0.9130000472068787, + "19": 0.9600000381469727 + }, + "llm_test_accuracy": { + "11": 0.9690000414848328, + "13": 0.9600000381469727, + "14": 0.9600000381469727, + "18": 0.9390000700950623, + "19": 0.9610000252723694 + }, + "llm_top_1_test_accuracy": { + "11": 0.555, + "13": 0.668, + "14": 0.638, + "18": 0.69, + "19": 0.796 + }, + "llm_top_2_test_accuracy": { + "11": 0.756, + "13": 0.714, + "14": 0.67, + "18": 0.717, + "19": 0.768 + }, + "llm_top_5_test_accuracy": { + "11": 0.794, + "13": 0.749, + "14": 0.723, + "18": 0.73, + "19": 0.831 + }, + "sae_top_1_test_accuracy": { + "11": 0.855, + "13": 0.662, + "14": 0.618, + "18": 0.672, + "19": 0.852 + }, + "sae_top_2_test_accuracy": { + "11": 0.853, + "13": 0.641, + "14": 0.736, + "18": 0.69, + "19": 0.854 + }, + "sae_top_5_test_accuracy": { + "11": 0.847, + "13": 0.769, + "14": 0.872, + "18": 0.907, + "19": 0.862 + } + }, + "LabHC/bias_in_bios_class_set3_results": { + "sae_test_accuracy": { + "20": 0.9580000638961792, + "21": 0.9240000247955322, + "22": 0.9190000295639038, + "25": 0.9570000171661377, + "26": 0.9000000357627869 + }, + "llm_test_accuracy": { + "20": 0.956000030040741, + "21": 0.9350000619888306, + "22": 0.9180000424385071, + "25": 0.9640000462532043, + "26": 0.8850000500679016 + }, + "llm_top_1_test_accuracy": { + "20": 0.693, + "21": 0.775, + "22": 0.645, + "25": 0.706, + "26": 0.616 + }, + "llm_top_2_test_accuracy": { + "20": 0.827, + "21": 0.761, + "22": 0.694, + "25": 0.778, + "26": 0.686 + }, + "llm_top_5_test_accuracy": { + "20": 0.855, + "21": 0.791, + "22": 0.725, + "25": 0.809, + "26": 0.672 + }, + "sae_top_1_test_accuracy": { + "20": 0.889, + "21": 0.61, + "22": 0.877, + "25": 0.881, + "26": 0.617 + }, + "sae_top_2_test_accuracy": { + "20": 0.866, + "21": 0.782, + "22": 0.873, + "25": 0.898, + "26": 0.625 + }, + "sae_top_5_test_accuracy": { + "20": 0.905, + "21": 0.789, + "22": 0.885, + "25": 0.899, + "26": 0.779 + } + }, + "canrager/amazon_reviews_mcauley_1and5_results": { + "sae_test_accuracy": { + "1": 0.9550000429153442, + "2": 0.9360000491142273, + "3": 0.9180000424385071, + "5": 0.9240000247955322, + "6": 0.8630000352859497 + }, + "llm_test_accuracy": { + "1": 0.9580000638961792, + "2": 0.9330000281333923, + "3": 0.9280000329017639, + "5": 0.9200000166893005, + "6": 0.862000048160553 + }, + "llm_top_1_test_accuracy": { + "1": 0.647, + "2": 0.603, + "3": 0.598, + "5": 0.555, + "6": 0.592 + }, + "llm_top_2_test_accuracy": { + "1": 0.75, + "2": 0.648, + "3": 0.607, + "5": 0.606, + "6": 0.626 + }, + "llm_top_5_test_accuracy": { + "1": 0.767, + "2": 0.641, + "3": 0.645, + "5": 0.638, + "6": 0.676 + }, + "sae_top_1_test_accuracy": { + "1": 0.792, + "2": 0.591, + "3": 0.568, + "5": 0.829, + "6": 0.635 + }, + "sae_top_2_test_accuracy": { + "1": 0.822, + "2": 0.621, + "3": 0.571, + "5": 0.823, + "6": 0.638 + }, + "sae_top_5_test_accuracy": { + "1": 0.838, + "2": 0.862, + "3": 0.609, + "5": 0.825, + "6": 0.665 + } + }, + "canrager/amazon_reviews_mcauley_1and5_sentiment_results": { + "sae_test_accuracy": { + "1.0": 0.9770000576972961, + "5.0": 0.9740000367164612 + }, + "llm_test_accuracy": { + "1.0": 0.9780000448226929, + "5.0": 0.9810000658035278 + }, + "llm_top_1_test_accuracy": { + "1.0": 0.673, + "5.0": 0.673 + }, + "llm_top_2_test_accuracy": { + "1.0": 0.724, + "5.0": 0.724 + }, + "llm_top_5_test_accuracy": { + "1.0": 0.766, + "5.0": 0.766 + }, + "sae_top_1_test_accuracy": { + "1.0": 0.94, + "5.0": 0.94 + }, + "sae_top_2_test_accuracy": { + "1.0": 0.941, + "5.0": 0.941 + }, + "sae_top_5_test_accuracy": { + "1.0": 0.95, + "5.0": 0.95 + } + }, + "codeparrot/github-code_results": { + "sae_test_accuracy": { + "C": 0.9600000381469727, + "Python": 0.984000027179718, + "HTML": 0.9920000433921814, + "Java": 0.9670000672340393, + "PHP": 0.9540000557899475 + }, + "llm_test_accuracy": { + "C": 0.956000030040741, + "Python": 0.984000027179718, + "HTML": 0.9900000691413879, + "Java": 0.9670000672340393, + "PHP": 0.9570000171661377 + }, + "llm_top_1_test_accuracy": { + "C": 0.666, + "Python": 0.626, + "HTML": 0.721, + "Java": 0.619, + "PHP": 0.594 + }, + "llm_top_2_test_accuracy": { + "C": 0.679, + "Python": 0.674, + "HTML": 0.8, + "Java": 0.676, + "PHP": 0.651 + }, + "llm_top_5_test_accuracy": { + "C": 0.783, + "Python": 0.717, + "HTML": 0.935, + "Java": 0.733, + "PHP": 0.715 + }, + "sae_top_1_test_accuracy": { + "C": 0.595, + "Python": 0.629, + "HTML": 0.565, + "Java": 0.647, + "PHP": 0.603 + }, + "sae_top_2_test_accuracy": { + "C": 0.622, + "Python": 0.654, + "HTML": 0.824, + "Java": 0.649, + "PHP": 0.912 + }, + "sae_top_5_test_accuracy": { + "C": 0.654, + "Python": 0.7, + "HTML": 0.807, + "Java": 0.674, + "PHP": 0.917 + } + }, + "fancyzhx/ag_news_results": { + "sae_test_accuracy": { + "0": 0.9380000233650208, + "1": 0.9800000190734863, + "2": 0.9330000281333923, + "3": 0.9430000185966492 + }, + "llm_test_accuracy": { + "0": 0.940000057220459, + "1": 0.9860000610351562, + "2": 0.9200000166893005, + "3": 0.9540000557899475 + }, + "llm_top_1_test_accuracy": { + "0": 0.573, + "1": 0.671, + "2": 0.672, + "3": 0.635 + }, + "llm_top_2_test_accuracy": { + "0": 0.802, + "1": 0.808, + "2": 0.701, + "3": 0.816 + }, + "llm_top_5_test_accuracy": { + "0": 0.81, + "1": 0.891, + "2": 0.752, + "3": 0.832 + }, + "sae_top_1_test_accuracy": { + "0": 0.591, + "1": 0.934, + "2": 0.562, + "3": 0.653 + }, + "sae_top_2_test_accuracy": { + "0": 0.704, + "1": 0.937, + "2": 0.69, + "3": 0.656 + }, + "sae_top_5_test_accuracy": { + "0": 0.835, + "1": 0.954, + "2": 0.834, + "3": 0.813 + } + }, + "Helsinki-NLP/europarl_results": { + "sae_test_accuracy": { + "en": 0.999000072479248, + "fr": 0.9970000386238098, + "de": 0.9980000257492065, + "es": 1.0, + "nl": 0.9970000386238098 + }, + "llm_test_accuracy": { + "en": 1.0, + "fr": 0.999000072479248, + "de": 1.0, + "es": 0.999000072479248, + "nl": 1.0 + }, + "llm_top_1_test_accuracy": { + "en": 0.739, + "fr": 0.585, + "de": 0.758, + "es": 0.496, + "nl": 0.649 + }, + "llm_top_2_test_accuracy": { + "en": 0.829, + "fr": 0.582, + "de": 0.82, + "es": 0.958, + "nl": 0.753 + }, + "llm_top_5_test_accuracy": { + "en": 0.892, + "fr": 0.888, + "de": 0.894, + "es": 0.98, + "nl": 0.852 + }, + "sae_top_1_test_accuracy": { + "en": 0.651, + "fr": 0.996, + "de": 0.925, + "es": 0.897, + "nl": 0.997 + }, + "sae_top_2_test_accuracy": { + "en": 0.776, + "fr": 0.997, + "de": 0.988, + "es": 0.992, + "nl": 0.997 + }, + "sae_top_5_test_accuracy": { + "en": 0.998, + "fr": 0.997, + "de": 0.997, + "es": 0.996, + "nl": 0.999 + } + } + } +} \ No newline at end of file diff --git a/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..73cca95bf43adfae7ee36942b9c10c5d5cf22961 --- /dev/null +++ b/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,670 @@ +{ + "eval_type_id": "sparse_probing", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "LabHC/bias_in_bios_class_set2", + "LabHC/bias_in_bios_class_set3", + "canrager/amazon_reviews_mcauley_1and5", + "canrager/amazon_reviews_mcauley_1and5_sentiment", + "codeparrot/github-code", + "fancyzhx/ag_news", + "Helsinki-NLP/europarl" + ], + "probe_train_set_size": 4000, + "probe_test_set_size": 1000, + "context_length": 128, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "model_name": "gemma-2-2b", + "k_values": [ + 1, + 2, + 5 + ], + "lower_vram_usage": false + }, + "eval_id": "c9b1c992-08d4-4128-85b3-7725ac95e5a6", + "datetime_epoch_millis": 1740165200108, + "eval_result_metrics": { + "llm": { + "llm_test_accuracy": 0.9595375448465346, + "llm_top_1_test_accuracy": 0.64956875, + "llm_top_2_test_accuracy": 0.72589375, + "llm_top_5_test_accuracy": 0.78265625, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null + }, + "sae": { + "sae_test_accuracy": 0.9555687937885523, + "sae_top_1_test_accuracy": 0.71979375, + "sae_top_2_test_accuracy": 0.8020937500000002, + "sae_top_5_test_accuracy": 0.86435625, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_results", + "llm_test_accuracy": 0.966800057888031, + "llm_top_1_test_accuracy": 0.6397999999999999, + "llm_top_2_test_accuracy": 0.6954, + "llm_top_5_test_accuracy": 0.7869999999999999, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.962000048160553, + "sae_top_1_test_accuracy": 0.6808, + "sae_top_2_test_accuracy": 0.8480000000000001, + "sae_top_5_test_accuracy": 0.9032, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set2_results", + "llm_test_accuracy": 0.9578000426292419, + "llm_top_1_test_accuracy": 0.6694000000000001, + "llm_top_2_test_accuracy": 0.725, + "llm_top_5_test_accuracy": 0.7654, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9456000447273254, + "sae_top_1_test_accuracy": 0.669, + "sae_top_2_test_accuracy": 0.7938000000000001, + "sae_top_5_test_accuracy": 0.8597999999999999, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set3_results", + "llm_test_accuracy": 0.9316000461578369, + "llm_top_1_test_accuracy": 0.687, + "llm_top_2_test_accuracy": 0.7492, + "llm_top_5_test_accuracy": 0.7704000000000001, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9292000293731689, + "sae_top_1_test_accuracy": 0.704, + "sae_top_2_test_accuracy": 0.8064, + "sae_top_5_test_accuracy": 0.8610000000000001, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_results", + "llm_test_accuracy": 0.9202000379562378, + "llm_top_1_test_accuracy": 0.599, + "llm_top_2_test_accuracy": 0.6474, + "llm_top_5_test_accuracy": 0.6734, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9170000433921814, + "sae_top_1_test_accuracy": 0.7496, + "sae_top_2_test_accuracy": 0.774, + "sae_top_5_test_accuracy": 0.8013999999999999, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results", + "llm_test_accuracy": 0.9795000553131104, + "llm_top_1_test_accuracy": 0.673, + "llm_top_2_test_accuracy": 0.724, + "llm_top_5_test_accuracy": 0.766, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9750000536441803, + "sae_top_1_test_accuracy": 0.847, + "sae_top_2_test_accuracy": 0.847, + "sae_top_5_test_accuracy": 0.929, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "codeparrot/github-code_results", + "llm_test_accuracy": 0.9708000421524048, + "llm_top_1_test_accuracy": 0.6451999999999999, + "llm_top_2_test_accuracy": 0.6960000000000001, + "llm_top_5_test_accuracy": 0.7766, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9676000475883484, + "sae_top_1_test_accuracy": 0.64, + "sae_top_2_test_accuracy": 0.6508, + "sae_top_5_test_accuracy": 0.7672, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "fancyzhx/ag_news_results", + "llm_test_accuracy": 0.9500000476837158, + "llm_top_1_test_accuracy": 0.63775, + "llm_top_2_test_accuracy": 0.78175, + "llm_top_5_test_accuracy": 0.82125, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9497500509023666, + "sae_top_1_test_accuracy": 0.60775, + "sae_top_2_test_accuracy": 0.70375, + "sae_top_5_test_accuracy": 0.7982499999999999, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "Helsinki-NLP/europarl_results", + "llm_test_accuracy": 0.9996000289916992, + "llm_top_1_test_accuracy": 0.6454, + "llm_top_2_test_accuracy": 0.7884, + "llm_top_5_test_accuracy": 0.9012, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9984000325202942, + "sae_top_1_test_accuracy": 0.8602000000000001, + "sae_top_2_test_accuracy": 0.993, + "sae_top_5_test_accuracy": 0.9950000000000001, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": { + "LabHC/bias_in_bios_class_set1_results": { + "sae_test_accuracy": { + "0": 0.9450000524520874, + "1": 0.9600000381469727, + "2": 0.9470000267028809, + "6": 0.9860000610351562, + "9": 0.9720000624656677 + }, + "llm_test_accuracy": { + "0": 0.9510000348091125, + "1": 0.9670000672340393, + "2": 0.9530000686645508, + "6": 0.987000048160553, + "9": 0.9760000705718994 + }, + "llm_top_1_test_accuracy": { + "0": 0.577, + "1": 0.613, + "2": 0.662, + "6": 0.787, + "9": 0.56 + }, + "llm_top_2_test_accuracy": { + "0": 0.574, + "1": 0.66, + "2": 0.718, + "6": 0.811, + "9": 0.714 + }, + "llm_top_5_test_accuracy": { + "0": 0.713, + "1": 0.711, + "2": 0.755, + "6": 0.895, + "9": 0.861 + }, + "sae_top_1_test_accuracy": { + "0": 0.571, + "1": 0.631, + "2": 0.835, + "6": 0.808, + "9": 0.559 + }, + "sae_top_2_test_accuracy": { + "0": 0.856, + "1": 0.809, + "2": 0.843, + "6": 0.976, + "9": 0.756 + }, + "sae_top_5_test_accuracy": { + "0": 0.869, + "1": 0.851, + "2": 0.864, + "6": 0.99, + "9": 0.942 + } + }, + "LabHC/bias_in_bios_class_set2_results": { + "sae_test_accuracy": { + "11": 0.9580000638961792, + "13": 0.9470000267028809, + "14": 0.9460000395774841, + "18": 0.9220000505447388, + "19": 0.9550000429153442 + }, + "llm_test_accuracy": { + "11": 0.9690000414848328, + "13": 0.9600000381469727, + "14": 0.9600000381469727, + "18": 0.9390000700950623, + "19": 0.9610000252723694 + }, + "llm_top_1_test_accuracy": { + "11": 0.555, + "13": 0.668, + "14": 0.638, + "18": 0.69, + "19": 0.796 + }, + "llm_top_2_test_accuracy": { + "11": 0.756, + "13": 0.714, + "14": 0.67, + "18": 0.717, + "19": 0.768 + }, + "llm_top_5_test_accuracy": { + "11": 0.794, + "13": 0.749, + "14": 0.723, + "18": 0.73, + "19": 0.831 + }, + "sae_top_1_test_accuracy": { + "11": 0.539, + "13": 0.658, + "14": 0.648, + "18": 0.703, + "19": 0.797 + }, + "sae_top_2_test_accuracy": { + "11": 0.858, + "13": 0.675, + "14": 0.878, + "18": 0.729, + "19": 0.829 + }, + "sae_top_5_test_accuracy": { + "11": 0.871, + "13": 0.792, + "14": 0.876, + "18": 0.897, + "19": 0.863 + } + }, + "LabHC/bias_in_bios_class_set3_results": { + "sae_test_accuracy": { + "20": 0.9520000219345093, + "21": 0.9220000505447388, + "22": 0.9240000247955322, + "25": 0.956000030040741, + "26": 0.8920000195503235 + }, + "llm_test_accuracy": { + "20": 0.956000030040741, + "21": 0.9350000619888306, + "22": 0.9180000424385071, + "25": 0.9640000462532043, + "26": 0.8850000500679016 + }, + "llm_top_1_test_accuracy": { + "20": 0.693, + "21": 0.775, + "22": 0.645, + "25": 0.706, + "26": 0.616 + }, + "llm_top_2_test_accuracy": { + "20": 0.827, + "21": 0.761, + "22": 0.694, + "25": 0.778, + "26": 0.686 + }, + "llm_top_5_test_accuracy": { + "20": 0.855, + "21": 0.791, + "22": 0.725, + "25": 0.809, + "26": 0.672 + }, + "sae_top_1_test_accuracy": { + "20": 0.84, + "21": 0.476, + "22": 0.88, + "25": 0.693, + "26": 0.631 + }, + "sae_top_2_test_accuracy": { + "20": 0.849, + "21": 0.747, + "22": 0.881, + "25": 0.849, + "26": 0.706 + }, + "sae_top_5_test_accuracy": { + "20": 0.911, + "21": 0.844, + "22": 0.874, + "25": 0.893, + "26": 0.783 + } + }, + "canrager/amazon_reviews_mcauley_1and5_results": { + "sae_test_accuracy": { + "1": 0.9510000348091125, + "2": 0.9460000395774841, + "3": 0.9110000729560852, + "5": 0.9200000166893005, + "6": 0.8570000529289246 + }, + "llm_test_accuracy": { + "1": 0.9580000638961792, + "2": 0.9330000281333923, + "3": 0.9280000329017639, + "5": 0.9200000166893005, + "6": 0.862000048160553 + }, + "llm_top_1_test_accuracy": { + "1": 0.647, + "2": 0.603, + "3": 0.598, + "5": 0.555, + "6": 0.592 + }, + "llm_top_2_test_accuracy": { + "1": 0.75, + "2": 0.648, + "3": 0.607, + "5": 0.606, + "6": 0.626 + }, + "llm_top_5_test_accuracy": { + "1": 0.767, + "2": 0.641, + "3": 0.645, + "5": 0.638, + "6": 0.676 + }, + "sae_top_1_test_accuracy": { + "1": 0.856, + "2": 0.862, + "3": 0.592, + "5": 0.819, + "6": 0.619 + }, + "sae_top_2_test_accuracy": { + "1": 0.905, + "2": 0.867, + "3": 0.612, + "5": 0.818, + "6": 0.668 + }, + "sae_top_5_test_accuracy": { + "1": 0.908, + "2": 0.862, + "3": 0.63, + "5": 0.873, + "6": 0.734 + } + }, + "canrager/amazon_reviews_mcauley_1and5_sentiment_results": { + "sae_test_accuracy": { + "1.0": 0.9760000705718994, + "5.0": 0.9740000367164612 + }, + "llm_test_accuracy": { + "1.0": 0.9780000448226929, + "5.0": 0.9810000658035278 + }, + "llm_top_1_test_accuracy": { + "1.0": 0.673, + "5.0": 0.673 + }, + "llm_top_2_test_accuracy": { + "1.0": 0.724, + "5.0": 0.724 + }, + "llm_top_5_test_accuracy": { + "1.0": 0.766, + "5.0": 0.766 + }, + "sae_top_1_test_accuracy": { + "1.0": 0.847, + "5.0": 0.847 + }, + "sae_top_2_test_accuracy": { + "1.0": 0.847, + "5.0": 0.847 + }, + "sae_top_5_test_accuracy": { + "1.0": 0.929, + "5.0": 0.929 + } + }, + "codeparrot/github-code_results": { + "sae_test_accuracy": { + "C": 0.9540000557899475, + "Python": 0.984000027179718, + "HTML": 0.9810000658035278, + "Java": 0.9640000462532043, + "PHP": 0.9550000429153442 + }, + "llm_test_accuracy": { + "C": 0.956000030040741, + "Python": 0.984000027179718, + "HTML": 0.9900000691413879, + "Java": 0.9670000672340393, + "PHP": 0.9570000171661377 + }, + "llm_top_1_test_accuracy": { + "C": 0.666, + "Python": 0.626, + "HTML": 0.721, + "Java": 0.619, + "PHP": 0.594 + }, + "llm_top_2_test_accuracy": { + "C": 0.679, + "Python": 0.674, + "HTML": 0.8, + "Java": 0.676, + "PHP": 0.651 + }, + "llm_top_5_test_accuracy": { + "C": 0.783, + "Python": 0.717, + "HTML": 0.935, + "Java": 0.733, + "PHP": 0.715 + }, + "sae_top_1_test_accuracy": { + "C": 0.622, + "Python": 0.661, + "HTML": 0.692, + "Java": 0.629, + "PHP": 0.596 + }, + "sae_top_2_test_accuracy": { + "C": 0.599, + "Python": 0.65, + "HTML": 0.795, + "Java": 0.628, + "PHP": 0.582 + }, + "sae_top_5_test_accuracy": { + "C": 0.671, + "Python": 0.684, + "HTML": 0.865, + "Java": 0.705, + "PHP": 0.911 + } + }, + "fancyzhx/ag_news_results": { + "sae_test_accuracy": { + "0": 0.9360000491142273, + "1": 0.9850000739097595, + "2": 0.9330000281333923, + "3": 0.9450000524520874 + }, + "llm_test_accuracy": { + "0": 0.940000057220459, + "1": 0.9860000610351562, + "2": 0.9200000166893005, + "3": 0.9540000557899475 + }, + "llm_top_1_test_accuracy": { + "0": 0.573, + "1": 0.671, + "2": 0.672, + "3": 0.635 + }, + "llm_top_2_test_accuracy": { + "0": 0.802, + "1": 0.808, + "2": 0.701, + "3": 0.816 + }, + "llm_top_5_test_accuracy": { + "0": 0.81, + "1": 0.891, + "2": 0.752, + "3": 0.832 + }, + "sae_top_1_test_accuracy": { + "0": 0.578, + "1": 0.664, + "2": 0.552, + "3": 0.637 + }, + "sae_top_2_test_accuracy": { + "0": 0.795, + "1": 0.697, + "2": 0.673, + "3": 0.65 + }, + "sae_top_5_test_accuracy": { + "0": 0.822, + "1": 0.869, + "2": 0.691, + "3": 0.811 + } + }, + "Helsinki-NLP/europarl_results": { + "sae_test_accuracy": { + "en": 0.999000072479248, + "fr": 0.9980000257492065, + "de": 0.9970000386238098, + "es": 1.0, + "nl": 0.9980000257492065 + }, + "llm_test_accuracy": { + "en": 1.0, + "fr": 0.999000072479248, + "de": 1.0, + "es": 0.999000072479248, + "nl": 1.0 + }, + "llm_top_1_test_accuracy": { + "en": 0.739, + "fr": 0.585, + "de": 0.758, + "es": 0.496, + "nl": 0.649 + }, + "llm_top_2_test_accuracy": { + "en": 0.829, + "fr": 0.582, + "de": 0.82, + "es": 0.958, + "nl": 0.753 + }, + "llm_top_5_test_accuracy": { + "en": 0.892, + "fr": 0.888, + "de": 0.894, + "es": 0.98, + "nl": 0.852 + }, + "sae_top_1_test_accuracy": { + "en": 0.749, + "fr": 0.991, + "de": 0.932, + "es": 0.99, + "nl": 0.639 + }, + "sae_top_2_test_accuracy": { + "en": 0.995, + "fr": 0.994, + "de": 0.988, + "es": 0.99, + "nl": 0.998 + }, + "sae_top_5_test_accuracy": { + "en": 0.998, + "fr": 0.997, + "de": 0.988, + "es": 0.994, + "nl": 0.998 + } + } + } +} \ No newline at end of file diff --git a/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..d1fc002dfc05d60edf246c57cb544536fe7e3116 --- /dev/null +++ b/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,670 @@ +{ + "eval_type_id": "sparse_probing", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "LabHC/bias_in_bios_class_set2", + "LabHC/bias_in_bios_class_set3", + "canrager/amazon_reviews_mcauley_1and5", + "canrager/amazon_reviews_mcauley_1and5_sentiment", + "codeparrot/github-code", + "fancyzhx/ag_news", + "Helsinki-NLP/europarl" + ], + "probe_train_set_size": 4000, + "probe_test_set_size": 1000, + "context_length": 128, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "model_name": "gemma-2-2b", + "k_values": [ + 1, + 2, + 5 + ], + "lower_vram_usage": false + }, + "eval_id": "d3e0c3ec-e2e5-4d60-ae48-22cfd7d5fba2", + "datetime_epoch_millis": 1740165066678, + "eval_result_metrics": { + "llm": { + "llm_test_accuracy": 0.9595375448465346, + "llm_top_1_test_accuracy": 0.64956875, + "llm_top_2_test_accuracy": 0.72589375, + "llm_top_5_test_accuracy": 0.78265625, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null + }, + "sae": { + "sae_test_accuracy": 0.9570625454187394, + "sae_top_1_test_accuracy": 0.7210749999999999, + "sae_top_2_test_accuracy": 0.7653625, + "sae_top_5_test_accuracy": 0.8490062500000001, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_results", + "llm_test_accuracy": 0.966800057888031, + "llm_top_1_test_accuracy": 0.6397999999999999, + "llm_top_2_test_accuracy": 0.6954, + "llm_top_5_test_accuracy": 0.7869999999999999, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9642000436782837, + "sae_top_1_test_accuracy": 0.7083999999999999, + "sae_top_2_test_accuracy": 0.7636000000000001, + "sae_top_5_test_accuracy": 0.8354000000000001, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set2_results", + "llm_test_accuracy": 0.9578000426292419, + "llm_top_1_test_accuracy": 0.6694000000000001, + "llm_top_2_test_accuracy": 0.725, + "llm_top_5_test_accuracy": 0.7654, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9512000441551208, + "sae_top_1_test_accuracy": 0.7248000000000001, + "sae_top_2_test_accuracy": 0.7267999999999999, + "sae_top_5_test_accuracy": 0.828, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set3_results", + "llm_test_accuracy": 0.9316000461578369, + "llm_top_1_test_accuracy": 0.687, + "llm_top_2_test_accuracy": 0.7492, + "llm_top_5_test_accuracy": 0.7704000000000001, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9294000387191772, + "sae_top_1_test_accuracy": 0.7074, + "sae_top_2_test_accuracy": 0.784, + "sae_top_5_test_accuracy": 0.825, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_results", + "llm_test_accuracy": 0.9202000379562378, + "llm_top_1_test_accuracy": 0.599, + "llm_top_2_test_accuracy": 0.6474, + "llm_top_5_test_accuracy": 0.6734, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9192000508308411, + "sae_top_1_test_accuracy": 0.6622, + "sae_top_2_test_accuracy": 0.6678000000000001, + "sae_top_5_test_accuracy": 0.7878000000000001, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results", + "llm_test_accuracy": 0.9795000553131104, + "llm_top_1_test_accuracy": 0.673, + "llm_top_2_test_accuracy": 0.724, + "llm_top_5_test_accuracy": 0.766, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9725000560283661, + "sae_top_1_test_accuracy": 0.6, + "sae_top_2_test_accuracy": 0.764, + "sae_top_5_test_accuracy": 0.942, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "codeparrot/github-code_results", + "llm_test_accuracy": 0.9708000421524048, + "llm_top_1_test_accuracy": 0.6451999999999999, + "llm_top_2_test_accuracy": 0.6960000000000001, + "llm_top_5_test_accuracy": 0.7766, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.968600058555603, + "sae_top_1_test_accuracy": 0.6384000000000001, + "sae_top_2_test_accuracy": 0.643, + "sae_top_5_test_accuracy": 0.7448, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "fancyzhx/ag_news_results", + "llm_test_accuracy": 0.9500000476837158, + "llm_top_1_test_accuracy": 0.63775, + "llm_top_2_test_accuracy": 0.78175, + "llm_top_5_test_accuracy": 0.82125, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9520000517368317, + "sae_top_1_test_accuracy": 0.77, + "sae_top_2_test_accuracy": 0.7875000000000001, + "sae_top_5_test_accuracy": 0.8322499999999999, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "Helsinki-NLP/europarl_results", + "llm_test_accuracy": 0.9996000289916992, + "llm_top_1_test_accuracy": 0.6454, + "llm_top_2_test_accuracy": 0.7884, + "llm_top_5_test_accuracy": 0.9012, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9994000196456909, + "sae_top_1_test_accuracy": 0.9574, + "sae_top_2_test_accuracy": 0.9862, + "sae_top_5_test_accuracy": 0.9968, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": { + "LabHC/bias_in_bios_class_set1_results": { + "sae_test_accuracy": { + "0": 0.9480000734329224, + "1": 0.9660000205039978, + "2": 0.9480000734329224, + "6": 0.984000027179718, + "9": 0.9750000238418579 + }, + "llm_test_accuracy": { + "0": 0.9510000348091125, + "1": 0.9670000672340393, + "2": 0.9530000686645508, + "6": 0.987000048160553, + "9": 0.9760000705718994 + }, + "llm_top_1_test_accuracy": { + "0": 0.577, + "1": 0.613, + "2": 0.662, + "6": 0.787, + "9": 0.56 + }, + "llm_top_2_test_accuracy": { + "0": 0.574, + "1": 0.66, + "2": 0.718, + "6": 0.811, + "9": 0.714 + }, + "llm_top_5_test_accuracy": { + "0": 0.713, + "1": 0.711, + "2": 0.755, + "6": 0.895, + "9": 0.861 + }, + "sae_top_1_test_accuracy": { + "0": 0.588, + "1": 0.612, + "2": 0.887, + "6": 0.751, + "9": 0.704 + }, + "sae_top_2_test_accuracy": { + "0": 0.627, + "1": 0.621, + "2": 0.881, + "6": 0.766, + "9": 0.923 + }, + "sae_top_5_test_accuracy": { + "0": 0.703, + "1": 0.7, + "2": 0.878, + "6": 0.971, + "9": 0.925 + } + }, + "LabHC/bias_in_bios_class_set2_results": { + "sae_test_accuracy": { + "11": 0.9580000638961792, + "13": 0.9510000348091125, + "14": 0.9530000686645508, + "18": 0.9290000200271606, + "19": 0.9650000333786011 + }, + "llm_test_accuracy": { + "11": 0.9690000414848328, + "13": 0.9600000381469727, + "14": 0.9600000381469727, + "18": 0.9390000700950623, + "19": 0.9610000252723694 + }, + "llm_top_1_test_accuracy": { + "11": 0.555, + "13": 0.668, + "14": 0.638, + "18": 0.69, + "19": 0.796 + }, + "llm_top_2_test_accuracy": { + "11": 0.756, + "13": 0.714, + "14": 0.67, + "18": 0.717, + "19": 0.768 + }, + "llm_top_5_test_accuracy": { + "11": 0.794, + "13": 0.749, + "14": 0.723, + "18": 0.73, + "19": 0.831 + }, + "sae_top_1_test_accuracy": { + "11": 0.849, + "13": 0.677, + "14": 0.638, + "18": 0.627, + "19": 0.833 + }, + "sae_top_2_test_accuracy": { + "11": 0.849, + "13": 0.684, + "14": 0.614, + "18": 0.666, + "19": 0.821 + }, + "sae_top_5_test_accuracy": { + "11": 0.921, + "13": 0.762, + "14": 0.874, + "18": 0.761, + "19": 0.822 + } + }, + "LabHC/bias_in_bios_class_set3_results": { + "sae_test_accuracy": { + "20": 0.9520000219345093, + "21": 0.9220000505447388, + "22": 0.9120000600814819, + "25": 0.9610000252723694, + "26": 0.9000000357627869 + }, + "llm_test_accuracy": { + "20": 0.956000030040741, + "21": 0.9350000619888306, + "22": 0.9180000424385071, + "25": 0.9640000462532043, + "26": 0.8850000500679016 + }, + "llm_top_1_test_accuracy": { + "20": 0.693, + "21": 0.775, + "22": 0.645, + "25": 0.706, + "26": 0.616 + }, + "llm_top_2_test_accuracy": { + "20": 0.827, + "21": 0.761, + "22": 0.694, + "25": 0.778, + "26": 0.686 + }, + "llm_top_5_test_accuracy": { + "20": 0.855, + "21": 0.791, + "22": 0.725, + "25": 0.809, + "26": 0.672 + }, + "sae_top_1_test_accuracy": { + "20": 0.814, + "21": 0.618, + "22": 0.82, + "25": 0.664, + "26": 0.621 + }, + "sae_top_2_test_accuracy": { + "20": 0.841, + "21": 0.779, + "22": 0.817, + "25": 0.86, + "26": 0.623 + }, + "sae_top_5_test_accuracy": { + "20": 0.932, + "21": 0.8, + "22": 0.859, + "25": 0.86, + "26": 0.674 + } + }, + "canrager/amazon_reviews_mcauley_1and5_results": { + "sae_test_accuracy": { + "1": 0.9480000734329224, + "2": 0.9380000233650208, + "3": 0.9190000295639038, + "5": 0.921000063419342, + "6": 0.8700000643730164 + }, + "llm_test_accuracy": { + "1": 0.9580000638961792, + "2": 0.9330000281333923, + "3": 0.9280000329017639, + "5": 0.9200000166893005, + "6": 0.862000048160553 + }, + "llm_top_1_test_accuracy": { + "1": 0.647, + "2": 0.603, + "3": 0.598, + "5": 0.555, + "6": 0.592 + }, + "llm_top_2_test_accuracy": { + "1": 0.75, + "2": 0.648, + "3": 0.607, + "5": 0.606, + "6": 0.626 + }, + "llm_top_5_test_accuracy": { + "1": 0.767, + "2": 0.641, + "3": 0.645, + "5": 0.638, + "6": 0.676 + }, + "sae_top_1_test_accuracy": { + "1": 0.82, + "2": 0.647, + "3": 0.557, + "5": 0.547, + "6": 0.74 + }, + "sae_top_2_test_accuracy": { + "1": 0.817, + "2": 0.639, + "3": 0.584, + "5": 0.555, + "6": 0.744 + }, + "sae_top_5_test_accuracy": { + "1": 0.87, + "2": 0.872, + "3": 0.641, + "5": 0.8, + "6": 0.756 + } + }, + "canrager/amazon_reviews_mcauley_1and5_sentiment_results": { + "sae_test_accuracy": { + "1.0": 0.9730000495910645, + "5.0": 0.9720000624656677 + }, + "llm_test_accuracy": { + "1.0": 0.9780000448226929, + "5.0": 0.9810000658035278 + }, + "llm_top_1_test_accuracy": { + "1.0": 0.673, + "5.0": 0.673 + }, + "llm_top_2_test_accuracy": { + "1.0": 0.724, + "5.0": 0.724 + }, + "llm_top_5_test_accuracy": { + "1.0": 0.766, + "5.0": 0.766 + }, + "sae_top_1_test_accuracy": { + "1.0": 0.6, + "5.0": 0.6 + }, + "sae_top_2_test_accuracy": { + "1.0": 0.764, + "5.0": 0.764 + }, + "sae_top_5_test_accuracy": { + "1.0": 0.942, + "5.0": 0.942 + } + }, + "codeparrot/github-code_results": { + "sae_test_accuracy": { + "C": 0.9590000510215759, + "Python": 0.9860000610351562, + "HTML": 0.9820000529289246, + "Java": 0.9630000591278076, + "PHP": 0.9530000686645508 + }, + "llm_test_accuracy": { + "C": 0.956000030040741, + "Python": 0.984000027179718, + "HTML": 0.9900000691413879, + "Java": 0.9670000672340393, + "PHP": 0.9570000171661377 + }, + "llm_top_1_test_accuracy": { + "C": 0.666, + "Python": 0.626, + "HTML": 0.721, + "Java": 0.619, + "PHP": 0.594 + }, + "llm_top_2_test_accuracy": { + "C": 0.679, + "Python": 0.674, + "HTML": 0.8, + "Java": 0.676, + "PHP": 0.651 + }, + "llm_top_5_test_accuracy": { + "C": 0.783, + "Python": 0.717, + "HTML": 0.935, + "Java": 0.733, + "PHP": 0.715 + }, + "sae_top_1_test_accuracy": { + "C": 0.607, + "Python": 0.648, + "HTML": 0.689, + "Java": 0.633, + "PHP": 0.615 + }, + "sae_top_2_test_accuracy": { + "C": 0.601, + "Python": 0.685, + "HTML": 0.674, + "Java": 0.638, + "PHP": 0.617 + }, + "sae_top_5_test_accuracy": { + "C": 0.607, + "Python": 0.696, + "HTML": 0.879, + "Java": 0.656, + "PHP": 0.886 + } + }, + "fancyzhx/ag_news_results": { + "sae_test_accuracy": { + "0": 0.940000057220459, + "1": 0.9850000739097595, + "2": 0.9310000538825989, + "3": 0.9520000219345093 + }, + "llm_test_accuracy": { + "0": 0.940000057220459, + "1": 0.9860000610351562, + "2": 0.9200000166893005, + "3": 0.9540000557899475 + }, + "llm_top_1_test_accuracy": { + "0": 0.573, + "1": 0.671, + "2": 0.672, + "3": 0.635 + }, + "llm_top_2_test_accuracy": { + "0": 0.802, + "1": 0.808, + "2": 0.701, + "3": 0.816 + }, + "llm_top_5_test_accuracy": { + "0": 0.81, + "1": 0.891, + "2": 0.752, + "3": 0.832 + }, + "sae_top_1_test_accuracy": { + "0": 0.661, + "1": 0.948, + "2": 0.815, + "3": 0.656 + }, + "sae_top_2_test_accuracy": { + "0": 0.658, + "1": 0.958, + "2": 0.83, + "3": 0.704 + }, + "sae_top_5_test_accuracy": { + "0": 0.764, + "1": 0.96, + "2": 0.83, + "3": 0.775 + } + }, + "Helsinki-NLP/europarl_results": { + "sae_test_accuracy": { + "en": 0.9980000257492065, + "fr": 1.0, + "de": 1.0, + "es": 1.0, + "nl": 0.999000072479248 + }, + "llm_test_accuracy": { + "en": 1.0, + "fr": 0.999000072479248, + "de": 1.0, + "es": 0.999000072479248, + "nl": 1.0 + }, + "llm_top_1_test_accuracy": { + "en": 0.739, + "fr": 0.585, + "de": 0.758, + "es": 0.496, + "nl": 0.649 + }, + "llm_top_2_test_accuracy": { + "en": 0.829, + "fr": 0.582, + "de": 0.82, + "es": 0.958, + "nl": 0.753 + }, + "llm_top_5_test_accuracy": { + "en": 0.892, + "fr": 0.888, + "de": 0.894, + "es": 0.98, + "nl": 0.852 + }, + "sae_top_1_test_accuracy": { + "en": 0.998, + "fr": 0.84, + "de": 0.96, + "es": 0.992, + "nl": 0.997 + }, + "sae_top_2_test_accuracy": { + "en": 0.997, + "fr": 0.989, + "de": 0.956, + "es": 0.99, + "nl": 0.999 + }, + "sae_top_5_test_accuracy": { + "en": 0.999, + "fr": 0.993, + "de": 0.998, + "es": 0.995, + "nl": 0.999 + } + } + } +} \ No newline at end of file diff --git a/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..4fbed887a47cca544a01d4f0cbfd293a4bfd9912 --- /dev/null +++ b/eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,670 @@ +{ + "eval_type_id": "sparse_probing", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "LabHC/bias_in_bios_class_set2", + "LabHC/bias_in_bios_class_set3", + "canrager/amazon_reviews_mcauley_1and5", + "canrager/amazon_reviews_mcauley_1and5_sentiment", + "codeparrot/github-code", + "fancyzhx/ag_news", + "Helsinki-NLP/europarl" + ], + "probe_train_set_size": 4000, + "probe_test_set_size": 1000, + "context_length": 128, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "model_name": "gemma-2-2b", + "k_values": [ + 1, + 2, + 5 + ], + "lower_vram_usage": false + }, + "eval_id": "5982f843-09ae-423f-897e-88fdc5e9b765", + "datetime_epoch_millis": 1740165484719, + "eval_result_metrics": { + "llm": { + "llm_test_accuracy": 0.9595375448465346, + "llm_top_1_test_accuracy": 0.64956875, + "llm_top_2_test_accuracy": 0.72589375, + "llm_top_5_test_accuracy": 0.78265625, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null + }, + "sae": { + "sae_test_accuracy": 0.9565250385552645, + "sae_top_1_test_accuracy": 0.70693125, + "sae_top_2_test_accuracy": 0.8177062500000001, + "sae_top_5_test_accuracy": 0.8643937500000001, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_results", + "llm_test_accuracy": 0.966800057888031, + "llm_top_1_test_accuracy": 0.6397999999999999, + "llm_top_2_test_accuracy": 0.6954, + "llm_top_5_test_accuracy": 0.7869999999999999, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9606000423431397, + "sae_top_1_test_accuracy": 0.677, + "sae_top_2_test_accuracy": 0.8896000000000001, + "sae_top_5_test_accuracy": 0.9054, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set2_results", + "llm_test_accuracy": 0.9578000426292419, + "llm_top_1_test_accuracy": 0.6694000000000001, + "llm_top_2_test_accuracy": 0.725, + "llm_top_5_test_accuracy": 0.7654, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9488000273704529, + "sae_top_1_test_accuracy": 0.679, + "sae_top_2_test_accuracy": 0.758, + "sae_top_5_test_accuracy": 0.8614, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "LabHC/bias_in_bios_class_set3_results", + "llm_test_accuracy": 0.9316000461578369, + "llm_top_1_test_accuracy": 0.687, + "llm_top_2_test_accuracy": 0.7492, + "llm_top_5_test_accuracy": 0.7704000000000001, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.929200041294098, + "sae_top_1_test_accuracy": 0.7186, + "sae_top_2_test_accuracy": 0.8170000000000002, + "sae_top_5_test_accuracy": 0.8568, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_results", + "llm_test_accuracy": 0.9202000379562378, + "llm_top_1_test_accuracy": 0.599, + "llm_top_2_test_accuracy": 0.6474, + "llm_top_5_test_accuracy": 0.6734, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9192000389099121, + "sae_top_1_test_accuracy": 0.6487999999999999, + "sae_top_2_test_accuracy": 0.7392, + "sae_top_5_test_accuracy": 0.7898, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results", + "llm_test_accuracy": 0.9795000553131104, + "llm_top_1_test_accuracy": 0.673, + "llm_top_2_test_accuracy": 0.724, + "llm_top_5_test_accuracy": 0.766, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9725000560283661, + "sae_top_1_test_accuracy": 0.885, + "sae_top_2_test_accuracy": 0.889, + "sae_top_5_test_accuracy": 0.931, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "codeparrot/github-code_results", + "llm_test_accuracy": 0.9708000421524048, + "llm_top_1_test_accuracy": 0.6451999999999999, + "llm_top_2_test_accuracy": 0.6960000000000001, + "llm_top_5_test_accuracy": 0.7766, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9698000431060791, + "sae_top_1_test_accuracy": 0.5916, + "sae_top_2_test_accuracy": 0.756, + "sae_top_5_test_accuracy": 0.8038000000000001, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "fancyzhx/ag_news_results", + "llm_test_accuracy": 0.9500000476837158, + "llm_top_1_test_accuracy": 0.63775, + "llm_top_2_test_accuracy": 0.78175, + "llm_top_5_test_accuracy": 0.82125, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9525000303983688, + "sae_top_1_test_accuracy": 0.61025, + "sae_top_2_test_accuracy": 0.7112499999999999, + "sae_top_5_test_accuracy": 0.7747499999999999, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + }, + { + "dataset_name": "Helsinki-NLP/europarl_results", + "llm_test_accuracy": 0.9996000289916992, + "llm_top_1_test_accuracy": 0.6454, + "llm_top_2_test_accuracy": 0.7884, + "llm_top_5_test_accuracy": 0.9012, + "llm_top_10_test_accuracy": null, + "llm_top_20_test_accuracy": null, + "llm_top_50_test_accuracy": null, + "llm_top_100_test_accuracy": null, + "sae_test_accuracy": 0.9996000289916992, + "sae_top_1_test_accuracy": 0.8452, + "sae_top_2_test_accuracy": 0.9816, + "sae_top_5_test_accuracy": 0.9922000000000001, + "sae_top_10_test_accuracy": null, + "sae_top_20_test_accuracy": null, + "sae_top_50_test_accuracy": null, + "sae_top_100_test_accuracy": null + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": { + "LabHC/bias_in_bios_class_set1_results": { + "sae_test_accuracy": { + "0": 0.9420000314712524, + "1": 0.9530000686645508, + "2": 0.9470000267028809, + "6": 0.984000027179718, + "9": 0.9770000576972961 + }, + "llm_test_accuracy": { + "0": 0.9510000348091125, + "1": 0.9670000672340393, + "2": 0.9530000686645508, + "6": 0.987000048160553, + "9": 0.9760000705718994 + }, + "llm_top_1_test_accuracy": { + "0": 0.577, + "1": 0.613, + "2": 0.662, + "6": 0.787, + "9": 0.56 + }, + "llm_top_2_test_accuracy": { + "0": 0.574, + "1": 0.66, + "2": 0.718, + "6": 0.811, + "9": 0.714 + }, + "llm_top_5_test_accuracy": { + "0": 0.713, + "1": 0.711, + "2": 0.755, + "6": 0.895, + "9": 0.861 + }, + "sae_top_1_test_accuracy": { + "0": 0.566, + "1": 0.643, + "2": 0.826, + "6": 0.797, + "9": 0.553 + }, + "sae_top_2_test_accuracy": { + "0": 0.868, + "1": 0.806, + "2": 0.853, + "6": 0.981, + "9": 0.94 + }, + "sae_top_5_test_accuracy": { + "0": 0.881, + "1": 0.853, + "2": 0.858, + "6": 0.989, + "9": 0.946 + } + }, + "LabHC/bias_in_bios_class_set2_results": { + "sae_test_accuracy": { + "11": 0.9660000205039978, + "13": 0.9520000219345093, + "14": 0.9430000185966492, + "18": 0.9230000376701355, + "19": 0.9600000381469727 + }, + "llm_test_accuracy": { + "11": 0.9690000414848328, + "13": 0.9600000381469727, + "14": 0.9600000381469727, + "18": 0.9390000700950623, + "19": 0.9610000252723694 + }, + "llm_top_1_test_accuracy": { + "11": 0.555, + "13": 0.668, + "14": 0.638, + "18": 0.69, + "19": 0.796 + }, + "llm_top_2_test_accuracy": { + "11": 0.756, + "13": 0.714, + "14": 0.67, + "18": 0.717, + "19": 0.768 + }, + "llm_top_5_test_accuracy": { + "11": 0.794, + "13": 0.749, + "14": 0.723, + "18": 0.73, + "19": 0.831 + }, + "sae_top_1_test_accuracy": { + "11": 0.555, + "13": 0.666, + "14": 0.661, + "18": 0.712, + "19": 0.801 + }, + "sae_top_2_test_accuracy": { + "11": 0.736, + "13": 0.697, + "14": 0.795, + "18": 0.737, + "19": 0.825 + }, + "sae_top_5_test_accuracy": { + "11": 0.909, + "13": 0.745, + "14": 0.904, + "18": 0.897, + "19": 0.852 + } + }, + "LabHC/bias_in_bios_class_set3_results": { + "sae_test_accuracy": { + "20": 0.9510000348091125, + "21": 0.9240000247955322, + "22": 0.9180000424385071, + "25": 0.9510000348091125, + "26": 0.9020000696182251 + }, + "llm_test_accuracy": { + "20": 0.956000030040741, + "21": 0.9350000619888306, + "22": 0.9180000424385071, + "25": 0.9640000462532043, + "26": 0.8850000500679016 + }, + "llm_top_1_test_accuracy": { + "20": 0.693, + "21": 0.775, + "22": 0.645, + "25": 0.706, + "26": 0.616 + }, + "llm_top_2_test_accuracy": { + "20": 0.827, + "21": 0.761, + "22": 0.694, + "25": 0.778, + "26": 0.686 + }, + "llm_top_5_test_accuracy": { + "20": 0.855, + "21": 0.791, + "22": 0.725, + "25": 0.809, + "26": 0.672 + }, + "sae_top_1_test_accuracy": { + "20": 0.856, + "21": 0.501, + "22": 0.893, + "25": 0.695, + "26": 0.648 + }, + "sae_top_2_test_accuracy": { + "20": 0.863, + "21": 0.737, + "22": 0.893, + "25": 0.874, + "26": 0.718 + }, + "sae_top_5_test_accuracy": { + "20": 0.897, + "21": 0.841, + "22": 0.886, + "25": 0.899, + "26": 0.761 + } + }, + "canrager/amazon_reviews_mcauley_1and5_results": { + "sae_test_accuracy": { + "1": 0.9510000348091125, + "2": 0.9300000667572021, + "3": 0.9200000166893005, + "5": 0.9270000457763672, + "6": 0.8680000305175781 + }, + "llm_test_accuracy": { + "1": 0.9580000638961792, + "2": 0.9330000281333923, + "3": 0.9280000329017639, + "5": 0.9200000166893005, + "6": 0.862000048160553 + }, + "llm_top_1_test_accuracy": { + "1": 0.647, + "2": 0.603, + "3": 0.598, + "5": 0.555, + "6": 0.592 + }, + "llm_top_2_test_accuracy": { + "1": 0.75, + "2": 0.648, + "3": 0.607, + "5": 0.606, + "6": 0.626 + }, + "llm_top_5_test_accuracy": { + "1": 0.767, + "2": 0.641, + "3": 0.645, + "5": 0.638, + "6": 0.676 + }, + "sae_top_1_test_accuracy": { + "1": 0.842, + "2": 0.729, + "3": 0.54, + "5": 0.536, + "6": 0.597 + }, + "sae_top_2_test_accuracy": { + "1": 0.906, + "2": 0.84, + "3": 0.569, + "5": 0.791, + "6": 0.59 + }, + "sae_top_5_test_accuracy": { + "1": 0.918, + "2": 0.844, + "3": 0.6, + "5": 0.877, + "6": 0.71 + } + }, + "canrager/amazon_reviews_mcauley_1and5_sentiment_results": { + "sae_test_accuracy": { + "1.0": 0.9720000624656677, + "5.0": 0.9730000495910645 + }, + "llm_test_accuracy": { + "1.0": 0.9780000448226929, + "5.0": 0.9810000658035278 + }, + "llm_top_1_test_accuracy": { + "1.0": 0.673, + "5.0": 0.673 + }, + "llm_top_2_test_accuracy": { + "1.0": 0.724, + "5.0": 0.724 + }, + "llm_top_5_test_accuracy": { + "1.0": 0.766, + "5.0": 0.766 + }, + "sae_top_1_test_accuracy": { + "1.0": 0.885, + "5.0": 0.885 + }, + "sae_top_2_test_accuracy": { + "1.0": 0.889, + "5.0": 0.889 + }, + "sae_top_5_test_accuracy": { + "1.0": 0.931, + "5.0": 0.931 + } + }, + "codeparrot/github-code_results": { + "sae_test_accuracy": { + "C": 0.9530000686645508, + "Python": 0.9930000305175781, + "HTML": 0.984000027179718, + "Java": 0.9630000591278076, + "PHP": 0.956000030040741 + }, + "llm_test_accuracy": { + "C": 0.956000030040741, + "Python": 0.984000027179718, + "HTML": 0.9900000691413879, + "Java": 0.9670000672340393, + "PHP": 0.9570000171661377 + }, + "llm_top_1_test_accuracy": { + "C": 0.666, + "Python": 0.626, + "HTML": 0.721, + "Java": 0.619, + "PHP": 0.594 + }, + "llm_top_2_test_accuracy": { + "C": 0.679, + "Python": 0.674, + "HTML": 0.8, + "Java": 0.676, + "PHP": 0.651 + }, + "llm_top_5_test_accuracy": { + "C": 0.783, + "Python": 0.717, + "HTML": 0.935, + "Java": 0.733, + "PHP": 0.715 + }, + "sae_top_1_test_accuracy": { + "C": 0.536, + "Python": 0.645, + "HTML": 0.579, + "Java": 0.613, + "PHP": 0.585 + }, + "sae_top_2_test_accuracy": { + "C": 0.634, + "Python": 0.94, + "HTML": 0.691, + "Java": 0.617, + "PHP": 0.898 + }, + "sae_top_5_test_accuracy": { + "C": 0.687, + "Python": 0.943, + "HTML": 0.804, + "Java": 0.675, + "PHP": 0.91 + } + }, + "fancyzhx/ag_news_results": { + "sae_test_accuracy": { + "0": 0.9380000233650208, + "1": 0.984000027179718, + "2": 0.9330000281333923, + "3": 0.9550000429153442 + }, + "llm_test_accuracy": { + "0": 0.940000057220459, + "1": 0.9860000610351562, + "2": 0.9200000166893005, + "3": 0.9540000557899475 + }, + "llm_top_1_test_accuracy": { + "0": 0.573, + "1": 0.671, + "2": 0.672, + "3": 0.635 + }, + "llm_top_2_test_accuracy": { + "0": 0.802, + "1": 0.808, + "2": 0.701, + "3": 0.816 + }, + "llm_top_5_test_accuracy": { + "0": 0.81, + "1": 0.891, + "2": 0.752, + "3": 0.832 + }, + "sae_top_1_test_accuracy": { + "0": 0.575, + "1": 0.667, + "2": 0.566, + "3": 0.633 + }, + "sae_top_2_test_accuracy": { + "0": 0.737, + "1": 0.706, + "2": 0.708, + "3": 0.694 + }, + "sae_top_5_test_accuracy": { + "0": 0.799, + "1": 0.749, + "2": 0.821, + "3": 0.73 + } + }, + "Helsinki-NLP/europarl_results": { + "sae_test_accuracy": { + "en": 1.0, + "fr": 0.999000072479248, + "de": 1.0, + "es": 1.0, + "nl": 0.999000072479248 + }, + "llm_test_accuracy": { + "en": 1.0, + "fr": 0.999000072479248, + "de": 1.0, + "es": 0.999000072479248, + "nl": 1.0 + }, + "llm_top_1_test_accuracy": { + "en": 0.739, + "fr": 0.585, + "de": 0.758, + "es": 0.496, + "nl": 0.649 + }, + "llm_top_2_test_accuracy": { + "en": 0.829, + "fr": 0.582, + "de": 0.82, + "es": 0.958, + "nl": 0.753 + }, + "llm_top_5_test_accuracy": { + "en": 0.892, + "fr": 0.888, + "de": 0.894, + "es": 0.98, + "nl": 0.852 + }, + "sae_top_1_test_accuracy": { + "en": 0.76, + "fr": 0.994, + "de": 0.923, + "es": 0.884, + "nl": 0.665 + }, + "sae_top_2_test_accuracy": { + "en": 0.997, + "fr": 0.994, + "de": 0.924, + "es": 0.996, + "nl": 0.997 + }, + "sae_top_5_test_accuracy": { + "en": 0.998, + "fr": 0.997, + "de": 0.975, + "es": 0.995, + "nl": 0.996 + } + } + } +} \ No newline at end of file diff --git a/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..6b8f783cb29b0d4507814ee8e8019c95fc434203 --- /dev/null +++ b/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,414 @@ +{ + "eval_type_id": "tpp", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "canrager/amazon_reviews_mcauley_1and5" + ], + "perform_scr": false, + "early_stopping_patience": 20, + "train_set_size": 4000, + "test_set_size": 1000, + "context_length": 128, + "probe_train_batch_size": 16, + "probe_test_batch_size": 500, + "probe_epochs": 20, + "probe_lr": 0.001, + "probe_l1_penalty": 0.001, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "lower_vram_usage": false, + "model_name": "gemma-2-2b", + "n_values": [ + 2, + 5, + 10, + 20, + 50, + 100, + 500 + ], + "column1_vals_lookup": { + "LabHC/bias_in_bios_class_set1": [ + [ + "professor", + "nurse" + ], + [ + "architect", + "journalist" + ], + [ + "surgeon", + "psychologist" + ], + [ + "attorney", + "teacher" + ] + ], + "canrager/amazon_reviews_mcauley_1and5": [ + [ + "Books", + "CDs_and_Vinyl" + ], + [ + "Software", + "Electronics" + ], + [ + "Pet_Supplies", + "Office_Products" + ], + [ + "Industrial_and_Scientific", + "Toys_and_Games" + ] + ] + } + }, + "eval_id": "8a95660d-45b7-41a4-a525-961bf9e6596a", + "datetime_epoch_millis": 1740163272675, + "eval_result_metrics": { + "tpp_metrics": { + "tpp_threshold_2_total_metric": 0.002374991774559021, + "tpp_threshold_2_intended_diff_only": 0.004799991846084595, + "tpp_threshold_2_unintended_diff_only": 0.0024250000715255737, + "tpp_threshold_5_total_metric": 0.0023500144481658934, + "tpp_threshold_5_intended_diff_only": 0.0051000118255615234, + "tpp_threshold_5_unintended_diff_only": 0.00274999737739563, + "tpp_threshold_10_total_metric": 0.007099992036819458, + "tpp_threshold_10_intended_diff_only": 0.010999995470046996, + "tpp_threshold_10_unintended_diff_only": 0.003900003433227539, + "tpp_threshold_20_total_metric": 0.018574997782707214, + "tpp_threshold_20_intended_diff_only": 0.025499999523162842, + "tpp_threshold_20_unintended_diff_only": 0.0069250017404556274, + "tpp_threshold_50_total_metric": 0.04237500578165054, + "tpp_threshold_50_intended_diff_only": 0.05090000629425048, + "tpp_threshold_50_unintended_diff_only": 0.008525000512599945, + "tpp_threshold_100_total_metric": 0.08157499581575393, + "tpp_threshold_100_intended_diff_only": 0.0940999984741211, + "tpp_threshold_100_unintended_diff_only": 0.012525002658367156, + "tpp_threshold_500_total_metric": 0.2862000107765198, + "tpp_threshold_500_intended_diff_only": 0.30840001106262205, + "tpp_threshold_500_unintended_diff_only": 0.022200000286102296 + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results", + "tpp_threshold_2_total_metric": 0.0029499828815460205, + "tpp_threshold_2_intended_diff_only": 0.0045999884605407715, + "tpp_threshold_2_unintended_diff_only": 0.001650005578994751, + "tpp_threshold_5_total_metric": 0.0015500158071517945, + "tpp_threshold_5_intended_diff_only": 0.004000020027160644, + "tpp_threshold_5_unintended_diff_only": 0.00245000422000885, + "tpp_threshold_10_total_metric": 0.0034999847412109375, + "tpp_threshold_10_intended_diff_only": 0.0054000020027160645, + "tpp_threshold_10_unintended_diff_only": 0.001900017261505127, + "tpp_threshold_20_total_metric": 0.008800002932548522, + "tpp_threshold_20_intended_diff_only": 0.01100001335144043, + "tpp_threshold_20_unintended_diff_only": 0.002200010418891907, + "tpp_threshold_50_total_metric": 0.020700007677078247, + "tpp_threshold_50_intended_diff_only": 0.022600018978118898, + "tpp_threshold_50_unintended_diff_only": 0.0019000113010406495, + "tpp_threshold_100_total_metric": 0.05239999294281006, + "tpp_threshold_100_intended_diff_only": 0.05540000200271607, + "tpp_threshold_100_unintended_diff_only": 0.003000009059906006, + "tpp_threshold_500_total_metric": 0.2711500138044357, + "tpp_threshold_500_intended_diff_only": 0.27920001745224, + "tpp_threshold_500_unintended_diff_only": 0.00805000364780426 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results", + "tpp_threshold_2_total_metric": 0.0018000006675720215, + "tpp_threshold_2_intended_diff_only": 0.004999995231628418, + "tpp_threshold_2_unintended_diff_only": 0.0031999945640563965, + "tpp_threshold_5_total_metric": 0.0031500130891799925, + "tpp_threshold_5_intended_diff_only": 0.0062000036239624025, + "tpp_threshold_5_unintended_diff_only": 0.0030499905347824096, + "tpp_threshold_10_total_metric": 0.01069999933242798, + "tpp_threshold_10_intended_diff_only": 0.01659998893737793, + "tpp_threshold_10_unintended_diff_only": 0.005899989604949951, + "tpp_threshold_20_total_metric": 0.028349992632865906, + "tpp_threshold_20_intended_diff_only": 0.039999985694885255, + "tpp_threshold_20_unintended_diff_only": 0.011649993062019349, + "tpp_threshold_50_total_metric": 0.06405000388622284, + "tpp_threshold_50_intended_diff_only": 0.07919999361038207, + "tpp_threshold_50_unintended_diff_only": 0.01514998972415924, + "tpp_threshold_100_total_metric": 0.11074999868869781, + "tpp_threshold_100_intended_diff_only": 0.13279999494552613, + "tpp_threshold_100_unintended_diff_only": 0.022049996256828307, + "tpp_threshold_500_total_metric": 0.30125000774860383, + "tpp_threshold_500_intended_diff_only": 0.33760000467300416, + "tpp_threshold_500_unintended_diff_only": 0.03634999692440033 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": { + "LabHC/bias_in_bios_class_set1": { + "0": { + "tpp_threshold_2_total_metric": 0.006750002503395081, + "tpp_threshold_2_intended_diff_only": 0.009000003337860107, + "tpp_threshold_2_unintended_diff_only": 0.002250000834465027, + "tpp_threshold_5_total_metric": 0.011500045657157898, + "tpp_threshold_5_intended_diff_only": 0.01500004529953003, + "tpp_threshold_5_unintended_diff_only": 0.0034999996423721313, + "tpp_threshold_10_total_metric": 0.004999995231628418, + "tpp_threshold_10_intended_diff_only": 0.008000016212463379, + "tpp_threshold_10_unintended_diff_only": 0.003000020980834961, + "tpp_threshold_20_total_metric": 0.01800002157688141, + "tpp_threshold_20_intended_diff_only": 0.020000040531158447, + "tpp_threshold_20_unintended_diff_only": 0.0020000189542770386, + "tpp_threshold_50_total_metric": 0.0350000262260437, + "tpp_threshold_50_intended_diff_only": 0.03800004720687866, + "tpp_threshold_50_unintended_diff_only": 0.003000020980834961, + "tpp_threshold_100_total_metric": 0.11850003898143768, + "tpp_threshold_100_intended_diff_only": 0.12200003862380981, + "tpp_threshold_100_unintended_diff_only": 0.0034999996423721313, + "tpp_threshold_500_total_metric": 0.39775002002716064, + "tpp_threshold_500_intended_diff_only": 0.406000018119812, + "tpp_threshold_500_unintended_diff_only": 0.008249998092651367 + }, + "1": { + "tpp_threshold_2_total_metric": 0.0045000165700912476, + "tpp_threshold_2_intended_diff_only": 0.0040000081062316895, + "tpp_threshold_2_unintended_diff_only": -0.0005000084638595581, + "tpp_threshold_5_total_metric": -0.0002499520778656006, + "tpp_threshold_5_intended_diff_only": 0.001000046730041504, + "tpp_threshold_5_unintended_diff_only": 0.0012499988079071045, + "tpp_threshold_10_total_metric": 0.003750026226043701, + "tpp_threshold_10_intended_diff_only": 0.0020000338554382324, + "tpp_threshold_10_unintended_diff_only": -0.0017499923706054688, + "tpp_threshold_20_total_metric": -0.0012499690055847168, + "tpp_threshold_20_intended_diff_only": 0.0020000338554382324, + "tpp_threshold_20_unintended_diff_only": 0.0032500028610229492, + "tpp_threshold_50_total_metric": 0.0025000572204589844, + "tpp_threshold_50_intended_diff_only": 0.005000054836273193, + "tpp_threshold_50_unintended_diff_only": 0.002499997615814209, + "tpp_threshold_100_total_metric": 0.009000018239021301, + "tpp_threshold_100_intended_diff_only": 0.012000024318695068, + "tpp_threshold_100_unintended_diff_only": 0.003000006079673767, + "tpp_threshold_500_total_metric": 0.1807500571012497, + "tpp_threshold_500_intended_diff_only": 0.18500006198883057, + "tpp_threshold_500_unintended_diff_only": 0.004250004887580872 + }, + "2": { + "tpp_threshold_2_total_metric": -0.00025004148483276367, + "tpp_threshold_2_intended_diff_only": 0.001999974250793457, + "tpp_threshold_2_unintended_diff_only": 0.0022500157356262207, + "tpp_threshold_5_total_metric": -0.004250004887580872, + "tpp_threshold_5_intended_diff_only": 0.0, + "tpp_threshold_5_unintended_diff_only": 0.004250004887580872, + "tpp_threshold_10_total_metric": 0.009249955415725708, + "tpp_threshold_10_intended_diff_only": 0.010999977588653564, + "tpp_threshold_10_unintended_diff_only": 0.0017500221729278564, + "tpp_threshold_20_total_metric": 0.01299998164176941, + "tpp_threshold_20_intended_diff_only": 0.013999998569488525, + "tpp_threshold_20_unintended_diff_only": 0.0010000169277191162, + "tpp_threshold_50_total_metric": 0.03224998712539673, + "tpp_threshold_50_intended_diff_only": 0.03200000524520874, + "tpp_threshold_50_unintended_diff_only": -0.0002499818801879883, + "tpp_threshold_100_total_metric": 0.05349995195865631, + "tpp_threshold_100_intended_diff_only": 0.05299997329711914, + "tpp_threshold_100_unintended_diff_only": -0.0004999786615371704, + "tpp_threshold_500_total_metric": 0.3799999952316284, + "tpp_threshold_500_intended_diff_only": 0.3889999985694885, + "tpp_threshold_500_unintended_diff_only": 0.009000003337860107 + }, + "6": { + "tpp_threshold_2_total_metric": -0.000500023365020752, + "tpp_threshold_2_intended_diff_only": 0.0009999871253967285, + "tpp_threshold_2_unintended_diff_only": 0.0015000104904174805, + "tpp_threshold_5_total_metric": 0.000250011682510376, + "tpp_threshold_5_intended_diff_only": -0.0009999871253967285, + "tpp_threshold_5_unintended_diff_only": -0.0012499988079071045, + "tpp_threshold_10_total_metric": 0.0012499988079071045, + "tpp_threshold_10_intended_diff_only": 0.003000020980834961, + "tpp_threshold_10_unintended_diff_only": 0.0017500221729278564, + "tpp_threshold_20_total_metric": 0.001999974250793457, + "tpp_threshold_20_intended_diff_only": 0.0009999871253967285, + "tpp_threshold_20_unintended_diff_only": -0.0009999871253967285, + "tpp_threshold_50_total_metric": 0.002750009298324585, + "tpp_threshold_50_intended_diff_only": 0.003000020980834961, + "tpp_threshold_50_unintended_diff_only": 0.000250011682510376, + "tpp_threshold_100_total_metric": 0.002749994397163391, + "tpp_threshold_100_intended_diff_only": 0.004999995231628418, + "tpp_threshold_100_unintended_diff_only": 0.002250000834465027, + "tpp_threshold_500_total_metric": 0.016249999403953552, + "tpp_threshold_500_intended_diff_only": 0.023000001907348633, + "tpp_threshold_500_unintended_diff_only": 0.006750002503395081 + }, + "9": { + "tpp_threshold_2_total_metric": 0.00424996018409729, + "tpp_threshold_2_intended_diff_only": 0.006999969482421875, + "tpp_threshold_2_unintended_diff_only": 0.002750009298324585, + "tpp_threshold_5_total_metric": 0.0004999786615371704, + "tpp_threshold_5_intended_diff_only": 0.004999995231628418, + "tpp_threshold_5_unintended_diff_only": 0.0045000165700912476, + "tpp_threshold_10_total_metric": -0.0017500519752502441, + "tpp_threshold_10_intended_diff_only": 0.0029999613761901855, + "tpp_threshold_10_unintended_diff_only": 0.00475001335144043, + "tpp_threshold_20_total_metric": 0.012250006198883057, + "tpp_threshold_20_intended_diff_only": 0.018000006675720215, + "tpp_threshold_20_unintended_diff_only": 0.005750000476837158, + "tpp_threshold_50_total_metric": 0.030999958515167236, + "tpp_threshold_50_intended_diff_only": 0.034999966621398926, + "tpp_threshold_50_unintended_diff_only": 0.0040000081062316895, + "tpp_threshold_100_total_metric": 0.0782499611377716, + "tpp_threshold_100_intended_diff_only": 0.08499997854232788, + "tpp_threshold_100_unintended_diff_only": 0.006750017404556274, + "tpp_threshold_500_total_metric": 0.38099999725818634, + "tpp_threshold_500_intended_diff_only": 0.3930000066757202, + "tpp_threshold_500_unintended_diff_only": 0.012000009417533875 + } + }, + "canrager/amazon_reviews_mcauley_1and5": { + "1": { + "tpp_threshold_2_total_metric": 0.009749948978424072, + "tpp_threshold_2_intended_diff_only": 0.012999951839447021, + "tpp_threshold_2_unintended_diff_only": 0.0032500028610229492, + "tpp_threshold_5_total_metric": 0.004499971866607666, + "tpp_threshold_5_intended_diff_only": 0.006999969482421875, + "tpp_threshold_5_unintended_diff_only": 0.002499997615814209, + "tpp_threshold_10_total_metric": -0.0015000104904174805, + "tpp_threshold_10_intended_diff_only": 0.006999969482421875, + "tpp_threshold_10_unintended_diff_only": 0.008499979972839355, + "tpp_threshold_20_total_metric": 0.0014999806880950928, + "tpp_threshold_20_intended_diff_only": 0.006999969482421875, + "tpp_threshold_20_unintended_diff_only": 0.005499988794326782, + "tpp_threshold_50_total_metric": 0.014499977231025696, + "tpp_threshold_50_intended_diff_only": 0.01699995994567871, + "tpp_threshold_50_unintended_diff_only": 0.002499982714653015, + "tpp_threshold_100_total_metric": 0.024999961256980896, + "tpp_threshold_100_intended_diff_only": 0.034999966621398926, + "tpp_threshold_100_unintended_diff_only": 0.01000000536441803, + "tpp_threshold_500_total_metric": 0.1757499873638153, + "tpp_threshold_500_intended_diff_only": 0.18599998950958252, + "tpp_threshold_500_unintended_diff_only": 0.010250002145767212 + }, + "2": { + "tpp_threshold_2_total_metric": 0.001249954104423523, + "tpp_threshold_2_intended_diff_only": 0.0029999613761901855, + "tpp_threshold_2_unintended_diff_only": 0.0017500072717666626, + "tpp_threshold_5_total_metric": -0.004749983549118042, + "tpp_threshold_5_intended_diff_only": 0.0040000081062316895, + "tpp_threshold_5_unintended_diff_only": 0.008749991655349731, + "tpp_threshold_10_total_metric": 0.011499986052513123, + "tpp_threshold_10_intended_diff_only": 0.015999972820281982, + "tpp_threshold_10_unintended_diff_only": 0.00449998676776886, + "tpp_threshold_20_total_metric": 0.020249977707862854, + "tpp_threshold_20_intended_diff_only": 0.02499997615814209, + "tpp_threshold_20_unintended_diff_only": 0.004749998450279236, + "tpp_threshold_50_total_metric": 0.07249999046325684, + "tpp_threshold_50_intended_diff_only": 0.08499997854232788, + "tpp_threshold_50_unintended_diff_only": 0.012499988079071045, + "tpp_threshold_100_total_metric": 0.12825001776218414, + "tpp_threshold_100_intended_diff_only": 0.15200001001358032, + "tpp_threshold_100_unintended_diff_only": 0.02374999225139618, + "tpp_threshold_500_total_metric": 0.3604999780654907, + "tpp_threshold_500_intended_diff_only": 0.3999999761581421, + "tpp_threshold_500_unintended_diff_only": 0.03949999809265137 + }, + "3": { + "tpp_threshold_2_total_metric": -0.008499979972839355, + "tpp_threshold_2_intended_diff_only": -0.0059999823570251465, + "tpp_threshold_2_unintended_diff_only": 0.002499997615814209, + "tpp_threshold_5_total_metric": 0.0005000084638595581, + "tpp_threshold_5_intended_diff_only": 0.0, + "tpp_threshold_5_unintended_diff_only": -0.0005000084638595581, + "tpp_threshold_10_total_metric": 0.01424996554851532, + "tpp_threshold_10_intended_diff_only": 0.01699995994567871, + "tpp_threshold_10_unintended_diff_only": 0.002749994397163391, + "tpp_threshold_20_total_metric": 0.008999988436698914, + "tpp_threshold_20_intended_diff_only": 0.014999985694885254, + "tpp_threshold_20_unintended_diff_only": 0.00599999725818634, + "tpp_threshold_50_total_metric": 0.03624999523162842, + "tpp_threshold_50_intended_diff_only": 0.041999995708465576, + "tpp_threshold_50_unintended_diff_only": 0.005750000476837158, + "tpp_threshold_100_total_metric": 0.07375001907348633, + "tpp_threshold_100_intended_diff_only": 0.08700001239776611, + "tpp_threshold_100_unintended_diff_only": 0.013249993324279785, + "tpp_threshold_500_total_metric": 0.29850004613399506, + "tpp_threshold_500_intended_diff_only": 0.3400000333786011, + "tpp_threshold_500_unintended_diff_only": 0.04149998724460602 + }, + "5": { + "tpp_threshold_2_total_metric": -0.006499916315078735, + "tpp_threshold_2_intended_diff_only": -0.003999948501586914, + "tpp_threshold_2_unintended_diff_only": 0.0024999678134918213, + "tpp_threshold_5_total_metric": -0.007999926805496216, + "tpp_threshold_5_intended_diff_only": -0.003999948501586914, + "tpp_threshold_5_unintended_diff_only": 0.003999978303909302, + "tpp_threshold_10_total_metric": -0.014499947428703308, + "tpp_threshold_10_intended_diff_only": -0.007999956607818604, + "tpp_threshold_10_unintended_diff_only": 0.006499990820884705, + "tpp_threshold_20_total_metric": 0.03825005888938904, + "tpp_threshold_20_intended_diff_only": 0.06700003147125244, + "tpp_threshold_20_unintended_diff_only": 0.028749972581863403, + "tpp_threshold_50_total_metric": 0.07950006425380707, + "tpp_threshold_50_intended_diff_only": 0.12200003862380981, + "tpp_threshold_50_unintended_diff_only": 0.04249997437000275, + "tpp_threshold_100_total_metric": 0.13325001299381256, + "tpp_threshold_100_intended_diff_only": 0.18000000715255737, + "tpp_threshold_100_unintended_diff_only": 0.04674999415874481, + "tpp_threshold_500_total_metric": 0.3255000412464142, + "tpp_threshold_500_intended_diff_only": 0.39100003242492676, + "tpp_threshold_500_unintended_diff_only": 0.06549999117851257 + }, + "6": { + "tpp_threshold_2_total_metric": 0.012999996542930603, + "tpp_threshold_2_intended_diff_only": 0.018999993801116943, + "tpp_threshold_2_unintended_diff_only": 0.00599999725818634, + "tpp_threshold_5_total_metric": 0.023499995470046997, + "tpp_threshold_5_intended_diff_only": 0.02399998903274536, + "tpp_threshold_5_unintended_diff_only": 0.0004999935626983643, + "tpp_threshold_10_total_metric": 0.04375000298023224, + "tpp_threshold_10_intended_diff_only": 0.050999999046325684, + "tpp_threshold_10_unintended_diff_only": 0.007249996066093445, + "tpp_threshold_20_total_metric": 0.07274995744228363, + "tpp_threshold_20_intended_diff_only": 0.08599996566772461, + "tpp_threshold_20_unintended_diff_only": 0.013250008225440979, + "tpp_threshold_50_total_metric": 0.11749999225139618, + "tpp_threshold_50_intended_diff_only": 0.12999999523162842, + "tpp_threshold_50_unintended_diff_only": 0.012500002980232239, + "tpp_threshold_100_total_metric": 0.19349998235702515, + "tpp_threshold_100_intended_diff_only": 0.20999997854232788, + "tpp_threshold_100_unintended_diff_only": 0.016499996185302734, + "tpp_threshold_500_total_metric": 0.34599998593330383, + "tpp_threshold_500_intended_diff_only": 0.3709999918937683, + "tpp_threshold_500_unintended_diff_only": 0.025000005960464478 + } + } + } +} \ No newline at end of file diff --git a/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..a55c68442dddf3d8ced62bb94092b61a5925b707 --- /dev/null +++ b/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,414 @@ +{ + "eval_type_id": "tpp", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "canrager/amazon_reviews_mcauley_1and5" + ], + "perform_scr": false, + "early_stopping_patience": 20, + "train_set_size": 4000, + "test_set_size": 1000, + "context_length": 128, + "probe_train_batch_size": 16, + "probe_test_batch_size": 500, + "probe_epochs": 20, + "probe_lr": 0.001, + "probe_l1_penalty": 0.001, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "lower_vram_usage": false, + "model_name": "gemma-2-2b", + "n_values": [ + 2, + 5, + 10, + 20, + 50, + 100, + 500 + ], + "column1_vals_lookup": { + "LabHC/bias_in_bios_class_set1": [ + [ + "professor", + "nurse" + ], + [ + "architect", + "journalist" + ], + [ + "surgeon", + "psychologist" + ], + [ + "attorney", + "teacher" + ] + ], + "canrager/amazon_reviews_mcauley_1and5": [ + [ + "Books", + "CDs_and_Vinyl" + ], + [ + "Software", + "Electronics" + ], + [ + "Pet_Supplies", + "Office_Products" + ], + [ + "Industrial_and_Scientific", + "Toys_and_Games" + ] + ] + } + }, + "eval_id": "2072bfe8-9d3d-4573-8939-241c618278fe", + "datetime_epoch_millis": 1740162955007, + "eval_result_metrics": { + "tpp_metrics": { + "tpp_threshold_2_total_metric": 0.0027249947190284727, + "tpp_threshold_2_intended_diff_only": 0.005399996042251587, + "tpp_threshold_2_unintended_diff_only": 0.002675001323223114, + "tpp_threshold_5_total_metric": 0.004999993741512299, + "tpp_threshold_5_intended_diff_only": 0.007799994945526124, + "tpp_threshold_5_unintended_diff_only": 0.002800001204013825, + "tpp_threshold_10_total_metric": 0.011049999296665192, + "tpp_threshold_10_intended_diff_only": 0.014899998903274536, + "tpp_threshold_10_unintended_diff_only": 0.0038499996066093446, + "tpp_threshold_20_total_metric": 0.023100003600120544, + "tpp_threshold_20_intended_diff_only": 0.0281000018119812, + "tpp_threshold_20_unintended_diff_only": 0.004999998211860657, + "tpp_threshold_50_total_metric": 0.0674250066280365, + "tpp_threshold_50_intended_diff_only": 0.07430000305175781, + "tpp_threshold_50_unintended_diff_only": 0.006874996423721313, + "tpp_threshold_100_total_metric": 0.14430001527071, + "tpp_threshold_100_intended_diff_only": 0.15460001230239867, + "tpp_threshold_100_unintended_diff_only": 0.01029999703168869, + "tpp_threshold_500_total_metric": 0.3798500135540962, + "tpp_threshold_500_intended_diff_only": 0.4061000108718872, + "tpp_threshold_500_unintended_diff_only": 0.026249997317790985 + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results", + "tpp_threshold_2_total_metric": 0.003049987554550171, + "tpp_threshold_2_intended_diff_only": 0.00559999942779541, + "tpp_threshold_2_unintended_diff_only": 0.002550011873245239, + "tpp_threshold_5_total_metric": 0.00589999258518219, + "tpp_threshold_5_intended_diff_only": 0.008800005912780762, + "tpp_threshold_5_unintended_diff_only": 0.002900013327598572, + "tpp_threshold_10_total_metric": 0.011350002884864808, + "tpp_threshold_10_intended_diff_only": 0.014000010490417481, + "tpp_threshold_10_unintended_diff_only": 0.0026500076055526733, + "tpp_threshold_20_total_metric": 0.026500004529953002, + "tpp_threshold_20_intended_diff_only": 0.02960001230239868, + "tpp_threshold_20_unintended_diff_only": 0.0031000077724456787, + "tpp_threshold_50_total_metric": 0.0689500093460083, + "tpp_threshold_50_intended_diff_only": 0.07320001125335693, + "tpp_threshold_50_unintended_diff_only": 0.004250001907348633, + "tpp_threshold_100_total_metric": 0.15995001494884492, + "tpp_threshold_100_intended_diff_only": 0.16520001888275146, + "tpp_threshold_100_unintended_diff_only": 0.005250003933906555, + "tpp_threshold_500_total_metric": 0.4329000234603882, + "tpp_threshold_500_intended_diff_only": 0.44420002698898314, + "tpp_threshold_500_unintended_diff_only": 0.011300003528594971 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results", + "tpp_threshold_2_total_metric": 0.002400001883506775, + "tpp_threshold_2_intended_diff_only": 0.005199992656707763, + "tpp_threshold_2_unintended_diff_only": 0.002799990773200989, + "tpp_threshold_5_total_metric": 0.004099994897842407, + "tpp_threshold_5_intended_diff_only": 0.0067999839782714845, + "tpp_threshold_5_unintended_diff_only": 0.0026999890804290773, + "tpp_threshold_10_total_metric": 0.010749995708465576, + "tpp_threshold_10_intended_diff_only": 0.01579998731613159, + "tpp_threshold_10_unintended_diff_only": 0.005049991607666016, + "tpp_threshold_20_total_metric": 0.019700002670288087, + "tpp_threshold_20_intended_diff_only": 0.02659999132156372, + "tpp_threshold_20_unintended_diff_only": 0.006899988651275635, + "tpp_threshold_50_total_metric": 0.0659000039100647, + "tpp_threshold_50_intended_diff_only": 0.07539999485015869, + "tpp_threshold_50_unintended_diff_only": 0.009499990940093994, + "tpp_threshold_100_total_metric": 0.12865001559257508, + "tpp_threshold_100_intended_diff_only": 0.1440000057220459, + "tpp_threshold_100_unintended_diff_only": 0.015349990129470826, + "tpp_threshold_500_total_metric": 0.32680000364780426, + "tpp_threshold_500_intended_diff_only": 0.3679999947547913, + "tpp_threshold_500_unintended_diff_only": 0.041199991106987 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": { + "LabHC/bias_in_bios_class_set1": { + "0": { + "tpp_threshold_2_total_metric": 0.0037500113248825073, + "tpp_threshold_2_intended_diff_only": 0.008000016212463379, + "tpp_threshold_2_unintended_diff_only": 0.004250004887580872, + "tpp_threshold_5_total_metric": 0.011250019073486328, + "tpp_threshold_5_intended_diff_only": 0.016000032424926758, + "tpp_threshold_5_unintended_diff_only": 0.00475001335144043, + "tpp_threshold_10_total_metric": 0.010500013828277588, + "tpp_threshold_10_intended_diff_only": 0.016000032424926758, + "tpp_threshold_10_unintended_diff_only": 0.00550001859664917, + "tpp_threshold_20_total_metric": 0.02700003981590271, + "tpp_threshold_20_intended_diff_only": 0.030000030994415283, + "tpp_threshold_20_unintended_diff_only": 0.0029999911785125732, + "tpp_threshold_50_total_metric": 0.06300003826618195, + "tpp_threshold_50_intended_diff_only": 0.06700003147125244, + "tpp_threshold_50_unintended_diff_only": 0.003999993205070496, + "tpp_threshold_100_total_metric": 0.1420000195503235, + "tpp_threshold_100_intended_diff_only": 0.1470000147819519, + "tpp_threshold_100_unintended_diff_only": 0.004999995231628418, + "tpp_threshold_500_total_metric": 0.427000030875206, + "tpp_threshold_500_intended_diff_only": 0.44200003147125244, + "tpp_threshold_500_unintended_diff_only": 0.015000000596046448 + }, + "1": { + "tpp_threshold_2_total_metric": 0.00700002908706665, + "tpp_threshold_2_intended_diff_only": 0.00700002908706665, + "tpp_threshold_2_unintended_diff_only": 0.0, + "tpp_threshold_5_total_metric": 0.001999989151954651, + "tpp_threshold_5_intended_diff_only": 0.0040000081062316895, + "tpp_threshold_5_unintended_diff_only": 0.0020000189542770386, + "tpp_threshold_10_total_metric": 0.005500048398971558, + "tpp_threshold_10_intended_diff_only": 0.005000054836273193, + "tpp_threshold_10_unintended_diff_only": -0.0004999935626983643, + "tpp_threshold_20_total_metric": 0.0072500258684158325, + "tpp_threshold_20_intended_diff_only": 0.012000024318695068, + "tpp_threshold_20_unintended_diff_only": 0.004749998450279236, + "tpp_threshold_50_total_metric": 0.061750054359436035, + "tpp_threshold_50_intended_diff_only": 0.0700000524520874, + "tpp_threshold_50_unintended_diff_only": 0.008249998092651367, + "tpp_threshold_100_total_metric": 0.11425001919269562, + "tpp_threshold_100_intended_diff_only": 0.12200003862380981, + "tpp_threshold_100_unintended_diff_only": 0.007750019431114197, + "tpp_threshold_500_total_metric": 0.4257500469684601, + "tpp_threshold_500_intended_diff_only": 0.4360000491142273, + "tpp_threshold_500_unintended_diff_only": 0.010250002145767212 + }, + "2": { + "tpp_threshold_2_total_metric": -0.0020000338554382324, + "tpp_threshold_2_intended_diff_only": 0.0009999871253967285, + "tpp_threshold_2_unintended_diff_only": 0.003000020980834961, + "tpp_threshold_5_total_metric": 0.0057499706745147705, + "tpp_threshold_5_intended_diff_only": 0.009999990463256836, + "tpp_threshold_5_unintended_diff_only": 0.004250019788742065, + "tpp_threshold_10_total_metric": 0.02674996852874756, + "tpp_threshold_10_intended_diff_only": 0.02899998426437378, + "tpp_threshold_10_unintended_diff_only": 0.0022500157356262207, + "tpp_threshold_20_total_metric": 0.04725000262260437, + "tpp_threshold_20_intended_diff_only": 0.04900002479553223, + "tpp_threshold_20_unintended_diff_only": 0.0017500221729278564, + "tpp_threshold_50_total_metric": 0.08799996972084045, + "tpp_threshold_50_intended_diff_only": 0.08899998664855957, + "tpp_threshold_50_unintended_diff_only": 0.0010000169277191162, + "tpp_threshold_100_total_metric": 0.19200001657009125, + "tpp_threshold_100_intended_diff_only": 0.19300001859664917, + "tpp_threshold_100_unintended_diff_only": 0.0010000020265579224, + "tpp_threshold_500_total_metric": 0.43150001764297485, + "tpp_threshold_500_intended_diff_only": 0.43800002336502075, + "tpp_threshold_500_unintended_diff_only": 0.0065000057220458984 + }, + "6": { + "tpp_threshold_2_total_metric": 0.001999989151954651, + "tpp_threshold_2_intended_diff_only": 0.0040000081062316895, + "tpp_threshold_2_unintended_diff_only": 0.0020000189542770386, + "tpp_threshold_5_total_metric": 0.003500029444694519, + "tpp_threshold_5_intended_diff_only": 0.0020000338554382324, + "tpp_threshold_5_unintended_diff_only": -0.0014999955892562866, + "tpp_threshold_10_total_metric": 0.00449998676776886, + "tpp_threshold_10_intended_diff_only": 0.004999995231628418, + "tpp_threshold_10_unintended_diff_only": 0.0005000084638595581, + "tpp_threshold_20_total_metric": 0.0052499920129776, + "tpp_threshold_20_intended_diff_only": 0.0040000081062316895, + "tpp_threshold_20_unintended_diff_only": -0.0012499839067459106, + "tpp_threshold_50_total_metric": 0.016000032424926758, + "tpp_threshold_50_intended_diff_only": 0.017000019550323486, + "tpp_threshold_50_unintended_diff_only": 0.0009999871253967285, + "tpp_threshold_100_total_metric": 0.1157500147819519, + "tpp_threshold_100_intended_diff_only": 0.11900001764297485, + "tpp_threshold_100_unintended_diff_only": 0.0032500028610229492, + "tpp_threshold_500_total_metric": 0.42000001668930054, + "tpp_threshold_500_intended_diff_only": 0.42900002002716064, + "tpp_threshold_500_unintended_diff_only": 0.009000003337860107 + }, + "9": { + "tpp_threshold_2_total_metric": 0.004499942064285278, + "tpp_threshold_2_intended_diff_only": 0.007999956607818604, + "tpp_threshold_2_unintended_diff_only": 0.003500014543533325, + "tpp_threshold_5_total_metric": 0.006999954581260681, + "tpp_threshold_5_intended_diff_only": 0.011999964714050293, + "tpp_threshold_5_unintended_diff_only": 0.005000010132789612, + "tpp_threshold_10_total_metric": 0.009499996900558472, + "tpp_threshold_10_intended_diff_only": 0.014999985694885254, + "tpp_threshold_10_unintended_diff_only": 0.005499988794326782, + "tpp_threshold_20_total_metric": 0.0457499623298645, + "tpp_threshold_20_intended_diff_only": 0.05299997329711914, + "tpp_threshold_20_unintended_diff_only": 0.007250010967254639, + "tpp_threshold_50_total_metric": 0.11599995195865631, + "tpp_threshold_50_intended_diff_only": 0.12299996614456177, + "tpp_threshold_50_unintended_diff_only": 0.0070000141859054565, + "tpp_threshold_100_total_metric": 0.2357500046491623, + "tpp_threshold_100_intended_diff_only": 0.24500000476837158, + "tpp_threshold_100_unintended_diff_only": 0.00925000011920929, + "tpp_threshold_500_total_metric": 0.46025000512599945, + "tpp_threshold_500_intended_diff_only": 0.47600001096725464, + "tpp_threshold_500_unintended_diff_only": 0.015750005841255188 + } + }, + "canrager/amazon_reviews_mcauley_1and5": { + "1": { + "tpp_threshold_2_total_metric": 0.00475001335144043, + "tpp_threshold_2_intended_diff_only": 0.009000003337860107, + "tpp_threshold_2_unintended_diff_only": 0.004249989986419678, + "tpp_threshold_5_total_metric": 0.0020000040531158447, + "tpp_threshold_5_intended_diff_only": 0.004999995231628418, + "tpp_threshold_5_unintended_diff_only": 0.0029999911785125732, + "tpp_threshold_10_total_metric": 0.004999950528144836, + "tpp_threshold_10_intended_diff_only": 0.012999951839447021, + "tpp_threshold_10_unintended_diff_only": 0.008000001311302185, + "tpp_threshold_20_total_metric": 0.005249962210655212, + "tpp_threshold_20_intended_diff_only": 0.011999964714050293, + "tpp_threshold_20_unintended_diff_only": 0.006750002503395081, + "tpp_threshold_50_total_metric": 0.01299998164176941, + "tpp_threshold_50_intended_diff_only": 0.014999985694885254, + "tpp_threshold_50_unintended_diff_only": 0.0020000040531158447, + "tpp_threshold_100_total_metric": 0.022250011563301086, + "tpp_threshold_100_intended_diff_only": 0.03700000047683716, + "tpp_threshold_100_unintended_diff_only": 0.014749988913536072, + "tpp_threshold_500_total_metric": 0.2149999886751175, + "tpp_threshold_500_intended_diff_only": 0.2749999761581421, + "tpp_threshold_500_unintended_diff_only": 0.0599999874830246 + }, + "2": { + "tpp_threshold_2_total_metric": 0.0037499964237213135, + "tpp_threshold_2_intended_diff_only": 0.004999995231628418, + "tpp_threshold_2_unintended_diff_only": 0.0012499988079071045, + "tpp_threshold_5_total_metric": 0.0007499754428863525, + "tpp_threshold_5_intended_diff_only": 0.007999956607818604, + "tpp_threshold_5_unintended_diff_only": 0.007249981164932251, + "tpp_threshold_10_total_metric": 0.014500007033348083, + "tpp_threshold_10_intended_diff_only": 0.018999993801116943, + "tpp_threshold_10_unintended_diff_only": 0.00449998676776886, + "tpp_threshold_20_total_metric": 0.032000020146369934, + "tpp_threshold_20_intended_diff_only": 0.03600001335144043, + "tpp_threshold_20_unintended_diff_only": 0.003999993205070496, + "tpp_threshold_50_total_metric": 0.0845000296831131, + "tpp_threshold_50_intended_diff_only": 0.09600001573562622, + "tpp_threshold_50_unintended_diff_only": 0.011499986052513123, + "tpp_threshold_100_total_metric": 0.17149998247623444, + "tpp_threshold_100_intended_diff_only": 0.19099998474121094, + "tpp_threshold_100_unintended_diff_only": 0.0195000022649765, + "tpp_threshold_500_total_metric": 0.3877500146627426, + "tpp_threshold_500_intended_diff_only": 0.41600000858306885, + "tpp_threshold_500_unintended_diff_only": 0.028249993920326233 + }, + "3": { + "tpp_threshold_2_total_metric": -0.01000000536441803, + "tpp_threshold_2_intended_diff_only": -0.008000016212463379, + "tpp_threshold_2_unintended_diff_only": 0.001999989151954651, + "tpp_threshold_5_total_metric": -2.9802322387695312e-08, + "tpp_threshold_5_intended_diff_only": -0.0020000338554382324, + "tpp_threshold_5_unintended_diff_only": -0.0020000040531158447, + "tpp_threshold_10_total_metric": 0.009499981999397278, + "tpp_threshold_10_intended_diff_only": 0.010999977588653564, + "tpp_threshold_10_unintended_diff_only": 0.0014999955892562866, + "tpp_threshold_20_total_metric": 0.002249985933303833, + "tpp_threshold_20_intended_diff_only": 0.011999964714050293, + "tpp_threshold_20_unintended_diff_only": 0.00974997878074646, + "tpp_threshold_50_total_metric": 0.0455000102519989, + "tpp_threshold_50_intended_diff_only": 0.0559999942779541, + "tpp_threshold_50_unintended_diff_only": 0.0104999840259552, + "tpp_threshold_100_total_metric": 0.0767500251531601, + "tpp_threshold_100_intended_diff_only": 0.0910000205039978, + "tpp_threshold_100_unintended_diff_only": 0.014249995350837708, + "tpp_threshold_500_total_metric": 0.3192499876022339, + "tpp_threshold_500_intended_diff_only": 0.3579999804496765, + "tpp_threshold_500_unintended_diff_only": 0.03874999284744263 + }, + "5": { + "tpp_threshold_2_total_metric": 2.9802322387695312e-08, + "tpp_threshold_2_intended_diff_only": 0.003000020980834961, + "tpp_threshold_2_unintended_diff_only": 0.0029999911785125732, + "tpp_threshold_5_total_metric": 0.006750032305717468, + "tpp_threshold_5_intended_diff_only": 0.013000011444091797, + "tpp_threshold_5_unintended_diff_only": 0.006249979138374329, + "tpp_threshold_10_total_metric": 0.004500046372413635, + "tpp_threshold_10_intended_diff_only": 0.012000024318695068, + "tpp_threshold_10_unintended_diff_only": 0.007499977946281433, + "tpp_threshold_20_total_metric": 0.01575005054473877, + "tpp_threshold_20_intended_diff_only": 0.025000035762786865, + "tpp_threshold_20_unintended_diff_only": 0.009249985218048096, + "tpp_threshold_50_total_metric": 0.08250001072883606, + "tpp_threshold_50_intended_diff_only": 0.09700000286102295, + "tpp_threshold_50_unintended_diff_only": 0.01449999213218689, + "tpp_threshold_100_total_metric": 0.15550003945827484, + "tpp_threshold_100_intended_diff_only": 0.17500001192092896, + "tpp_threshold_100_unintended_diff_only": 0.019499972462654114, + "tpp_threshold_500_total_metric": 0.35050003230571747, + "tpp_threshold_500_intended_diff_only": 0.4150000214576721, + "tpp_threshold_500_unintended_diff_only": 0.06449998915195465 + }, + "6": { + "tpp_threshold_2_total_metric": 0.013499975204467773, + "tpp_threshold_2_intended_diff_only": 0.01699995994567871, + "tpp_threshold_2_unintended_diff_only": 0.0034999847412109375, + "tpp_threshold_5_total_metric": 0.010999992489814758, + "tpp_threshold_5_intended_diff_only": 0.009999990463256836, + "tpp_threshold_5_unintended_diff_only": -0.0010000020265579224, + "tpp_threshold_10_total_metric": 0.020249992609024048, + "tpp_threshold_10_intended_diff_only": 0.02399998903274536, + "tpp_threshold_10_unintended_diff_only": 0.0037499964237213135, + "tpp_threshold_20_total_metric": 0.04324999451637268, + "tpp_threshold_20_intended_diff_only": 0.04799997806549072, + "tpp_threshold_20_unintended_diff_only": 0.004749983549118042, + "tpp_threshold_50_total_metric": 0.10399998724460602, + "tpp_threshold_50_intended_diff_only": 0.11299997568130493, + "tpp_threshold_50_unintended_diff_only": 0.008999988436698914, + "tpp_threshold_100_total_metric": 0.2172500193119049, + "tpp_threshold_100_intended_diff_only": 0.22600001096725464, + "tpp_threshold_100_unintended_diff_only": 0.008749991655349731, + "tpp_threshold_500_total_metric": 0.36149999499320984, + "tpp_threshold_500_intended_diff_only": 0.37599998712539673, + "tpp_threshold_500_unintended_diff_only": 0.01449999213218689 + } + } + } +} \ No newline at end of file diff --git a/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..90dbffa38bc149c7a745e80be0e1d2ce071a10c1 --- /dev/null +++ b/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,414 @@ +{ + "eval_type_id": "tpp", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "canrager/amazon_reviews_mcauley_1and5" + ], + "perform_scr": false, + "early_stopping_patience": 20, + "train_set_size": 4000, + "test_set_size": 1000, + "context_length": 128, + "probe_train_batch_size": 16, + "probe_test_batch_size": 500, + "probe_epochs": 20, + "probe_lr": 0.001, + "probe_l1_penalty": 0.001, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "lower_vram_usage": false, + "model_name": "gemma-2-2b", + "n_values": [ + 2, + 5, + 10, + 20, + 50, + 100, + 500 + ], + "column1_vals_lookup": { + "LabHC/bias_in_bios_class_set1": [ + [ + "professor", + "nurse" + ], + [ + "architect", + "journalist" + ], + [ + "surgeon", + "psychologist" + ], + [ + "attorney", + "teacher" + ] + ], + "canrager/amazon_reviews_mcauley_1and5": [ + [ + "Books", + "CDs_and_Vinyl" + ], + [ + "Software", + "Electronics" + ], + [ + "Pet_Supplies", + "Office_Products" + ], + [ + "Industrial_and_Scientific", + "Toys_and_Games" + ] + ] + } + }, + "eval_id": "f2bf09ac-6740-414f-aa61-a62e38a23b92", + "datetime_epoch_millis": 1740162794981, + "eval_result_metrics": { + "tpp_metrics": { + "tpp_threshold_2_total_metric": 0.002074998617172241, + "tpp_threshold_2_intended_diff_only": 0.004600000381469726, + "tpp_threshold_2_unintended_diff_only": 0.0025250017642974854, + "tpp_threshold_5_total_metric": 0.003899991512298584, + "tpp_threshold_5_intended_diff_only": 0.0056999921798706055, + "tpp_threshold_5_unintended_diff_only": 0.0018000006675720215, + "tpp_threshold_10_total_metric": 2.5008618831634565e-05, + "tpp_threshold_10_intended_diff_only": 0.002800005674362183, + "tpp_threshold_10_unintended_diff_only": 0.002774997055530548, + "tpp_threshold_20_total_metric": 0.003749997913837433, + "tpp_threshold_20_intended_diff_only": 0.007200002670288086, + "tpp_threshold_20_unintended_diff_only": 0.0034500047564506534, + "tpp_threshold_50_total_metric": 0.012599988281726836, + "tpp_threshold_50_intended_diff_only": 0.016999995708465575, + "tpp_threshold_50_unintended_diff_only": 0.004400007426738739, + "tpp_threshold_100_total_metric": 0.02380000501871109, + "tpp_threshold_100_intended_diff_only": 0.02910000681877136, + "tpp_threshold_100_unintended_diff_only": 0.005300001800060272, + "tpp_threshold_500_total_metric": 0.07820001393556594, + "tpp_threshold_500_intended_diff_only": 0.08660001158714295, + "tpp_threshold_500_unintended_diff_only": 0.008399997651576997 + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results", + "tpp_threshold_2_total_metric": 0.005099990963935852, + "tpp_threshold_2_intended_diff_only": 0.007200002670288086, + "tpp_threshold_2_unintended_diff_only": 0.002100011706352234, + "tpp_threshold_5_total_metric": 0.002500000596046448, + "tpp_threshold_5_intended_diff_only": 0.0048000097274780275, + "tpp_threshold_5_unintended_diff_only": 0.0023000091314315796, + "tpp_threshold_10_total_metric": 0.0022500097751617433, + "tpp_threshold_10_intended_diff_only": 0.004400014877319336, + "tpp_threshold_10_unintended_diff_only": 0.0021500051021575927, + "tpp_threshold_20_total_metric": 0.0045499980449676515, + "tpp_threshold_20_intended_diff_only": 0.007200014591217041, + "tpp_threshold_20_unintended_diff_only": 0.0026500165462493897, + "tpp_threshold_50_total_metric": 0.014249974489212036, + "tpp_threshold_50_intended_diff_only": 0.016999995708465575, + "tpp_threshold_50_unintended_diff_only": 0.00275002121925354, + "tpp_threshold_100_total_metric": 0.021900007128715517, + "tpp_threshold_100_intended_diff_only": 0.0254000186920166, + "tpp_threshold_100_unintended_diff_only": 0.0035000115633010863, + "tpp_threshold_500_total_metric": 0.07305001318454743, + "tpp_threshold_500_intended_diff_only": 0.07700002193450928, + "tpp_threshold_500_unintended_diff_only": 0.003950008749961853 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results", + "tpp_threshold_2_total_metric": -0.0009499937295913696, + "tpp_threshold_2_intended_diff_only": 0.0019999980926513673, + "tpp_threshold_2_unintended_diff_only": 0.002949991822242737, + "tpp_threshold_5_total_metric": 0.00529998242855072, + "tpp_threshold_5_intended_diff_only": 0.006599974632263183, + "tpp_threshold_5_unintended_diff_only": 0.0012999922037124634, + "tpp_threshold_10_total_metric": -0.002199992537498474, + "tpp_threshold_10_intended_diff_only": 0.0011999964714050292, + "tpp_threshold_10_unintended_diff_only": 0.0033999890089035033, + "tpp_threshold_20_total_metric": 0.0029499977827072144, + "tpp_threshold_20_intended_diff_only": 0.007199990749359131, + "tpp_threshold_20_unintended_diff_only": 0.004249992966651917, + "tpp_threshold_50_total_metric": 0.010950002074241637, + "tpp_threshold_50_intended_diff_only": 0.016999995708465575, + "tpp_threshold_50_unintended_diff_only": 0.006049993634223938, + "tpp_threshold_100_total_metric": 0.025700002908706665, + "tpp_threshold_100_intended_diff_only": 0.03279999494552612, + "tpp_threshold_100_unintended_diff_only": 0.007099992036819458, + "tpp_threshold_500_total_metric": 0.08335001468658447, + "tpp_threshold_500_intended_diff_only": 0.09620000123977661, + "tpp_threshold_500_unintended_diff_only": 0.012849986553192139 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": { + "LabHC/bias_in_bios_class_set1": { + "0": { + "tpp_threshold_2_total_metric": 0.005000010132789612, + "tpp_threshold_2_intended_diff_only": 0.00700002908706665, + "tpp_threshold_2_unintended_diff_only": 0.0020000189542770386, + "tpp_threshold_5_total_metric": 0.004249975085258484, + "tpp_threshold_5_intended_diff_only": 0.004999995231628418, + "tpp_threshold_5_unintended_diff_only": 0.0007500201463699341, + "tpp_threshold_10_total_metric": 0.0072500258684158325, + "tpp_threshold_10_intended_diff_only": 0.00700002908706665, + "tpp_threshold_10_unintended_diff_only": -0.00024999678134918213, + "tpp_threshold_20_total_metric": 0.004749998450279236, + "tpp_threshold_20_intended_diff_only": 0.008000016212463379, + "tpp_threshold_20_unintended_diff_only": 0.003250017762184143, + "tpp_threshold_50_total_metric": 0.022749990224838257, + "tpp_threshold_50_intended_diff_only": 0.02799999713897705, + "tpp_threshold_50_unintended_diff_only": 0.005250006914138794, + "tpp_threshold_100_total_metric": 0.038000017404556274, + "tpp_threshold_100_intended_diff_only": 0.04400002956390381, + "tpp_threshold_100_unintended_diff_only": 0.006000012159347534, + "tpp_threshold_500_total_metric": 0.14850004017353058, + "tpp_threshold_500_intended_diff_only": 0.15400004386901855, + "tpp_threshold_500_unintended_diff_only": 0.005500003695487976 + }, + "1": { + "tpp_threshold_2_total_metric": 0.00875002145767212, + "tpp_threshold_2_intended_diff_only": 0.012000024318695068, + "tpp_threshold_2_unintended_diff_only": 0.0032500028610229492, + "tpp_threshold_5_total_metric": -0.0012499988079071045, + "tpp_threshold_5_intended_diff_only": 0.0040000081062316895, + "tpp_threshold_5_unintended_diff_only": 0.005250006914138794, + "tpp_threshold_10_total_metric": -0.0009999573230743408, + "tpp_threshold_10_intended_diff_only": 0.001000046730041504, + "tpp_threshold_10_unintended_diff_only": 0.0020000040531158447, + "tpp_threshold_20_total_metric": -0.00024996697902679443, + "tpp_threshold_20_intended_diff_only": 0.001000046730041504, + "tpp_threshold_20_unintended_diff_only": 0.0012500137090682983, + "tpp_threshold_50_total_metric": -0.004250004887580872, + "tpp_threshold_50_intended_diff_only": -0.0009999871253967285, + "tpp_threshold_50_unintended_diff_only": 0.003250017762184143, + "tpp_threshold_100_total_metric": 0.007249981164932251, + "tpp_threshold_100_intended_diff_only": 0.009000003337860107, + "tpp_threshold_100_unintended_diff_only": 0.0017500221729278564, + "tpp_threshold_500_total_metric": 0.03200000524520874, + "tpp_threshold_500_intended_diff_only": 0.03700000047683716, + "tpp_threshold_500_unintended_diff_only": 0.004999995231628418 + }, + "2": { + "tpp_threshold_2_total_metric": 0.0062499940395355225, + "tpp_threshold_2_intended_diff_only": 0.009000003337860107, + "tpp_threshold_2_unintended_diff_only": 0.002750009298324585, + "tpp_threshold_5_total_metric": 0.0065000057220458984, + "tpp_threshold_5_intended_diff_only": 0.008000016212463379, + "tpp_threshold_5_unintended_diff_only": 0.0015000104904174805, + "tpp_threshold_10_total_metric": 0.0007500052452087402, + "tpp_threshold_10_intended_diff_only": 0.0040000081062316895, + "tpp_threshold_10_unintended_diff_only": 0.0032500028610229492, + "tpp_threshold_20_total_metric": 0.007999971508979797, + "tpp_threshold_20_intended_diff_only": 0.009999990463256836, + "tpp_threshold_20_unintended_diff_only": 0.0020000189542770386, + "tpp_threshold_50_total_metric": 0.014999955892562866, + "tpp_threshold_50_intended_diff_only": 0.015999972820281982, + "tpp_threshold_50_unintended_diff_only": 0.0010000169277191162, + "tpp_threshold_100_total_metric": 0.008500009775161743, + "tpp_threshold_100_intended_diff_only": 0.013000011444091797, + "tpp_threshold_100_unintended_diff_only": 0.004500001668930054, + "tpp_threshold_500_total_metric": 0.0455000102519989, + "tpp_threshold_500_intended_diff_only": 0.04900002479553223, + "tpp_threshold_500_unintended_diff_only": 0.003500014543533325 + }, + "6": { + "tpp_threshold_2_total_metric": 0.0014999806880950928, + "tpp_threshold_2_intended_diff_only": 0.0009999871253967285, + "tpp_threshold_2_unintended_diff_only": -0.0004999935626983643, + "tpp_threshold_5_total_metric": -0.0004999637603759766, + "tpp_threshold_5_intended_diff_only": 0.0020000338554382324, + "tpp_threshold_5_unintended_diff_only": 0.002499997615814209, + "tpp_threshold_10_total_metric": 0.0032500028610229492, + "tpp_threshold_10_intended_diff_only": 0.0040000081062316895, + "tpp_threshold_10_unintended_diff_only": 0.0007500052452087402, + "tpp_threshold_20_total_metric": -0.0004999935626983643, + "tpp_threshold_20_intended_diff_only": 0.003000020980834961, + "tpp_threshold_20_unintended_diff_only": 0.003500014543533325, + "tpp_threshold_50_total_metric": 0.0027499794960021973, + "tpp_threshold_50_intended_diff_only": 0.0040000081062316895, + "tpp_threshold_50_unintended_diff_only": 0.0012500286102294922, + "tpp_threshold_100_total_metric": 0.0015000253915786743, + "tpp_threshold_100_intended_diff_only": 0.0020000338554382324, + "tpp_threshold_100_unintended_diff_only": 0.0005000084638595581, + "tpp_threshold_500_total_metric": 0.004250019788742065, + "tpp_threshold_500_intended_diff_only": 0.00700002908706665, + "tpp_threshold_500_unintended_diff_only": 0.002750009298324585 + }, + "9": { + "tpp_threshold_2_total_metric": 0.003999948501586914, + "tpp_threshold_2_intended_diff_only": 0.006999969482421875, + "tpp_threshold_2_unintended_diff_only": 0.003000020980834961, + "tpp_threshold_5_total_metric": 0.0034999847412109375, + "tpp_threshold_5_intended_diff_only": 0.004999995231628418, + "tpp_threshold_5_unintended_diff_only": 0.0015000104904174805, + "tpp_threshold_10_total_metric": 0.0009999722242355347, + "tpp_threshold_10_intended_diff_only": 0.0059999823570251465, + "tpp_threshold_10_unintended_diff_only": 0.005000010132789612, + "tpp_threshold_20_total_metric": 0.010749980807304382, + "tpp_threshold_20_intended_diff_only": 0.013999998569488525, + "tpp_threshold_20_unintended_diff_only": 0.003250017762184143, + "tpp_threshold_50_total_metric": 0.03499995172023773, + "tpp_threshold_50_intended_diff_only": 0.03799998760223389, + "tpp_threshold_50_unintended_diff_only": 0.003000035881996155, + "tpp_threshold_100_total_metric": 0.05425000190734863, + "tpp_threshold_100_intended_diff_only": 0.05900001525878906, + "tpp_threshold_100_unintended_diff_only": 0.00475001335144043, + "tpp_threshold_500_total_metric": 0.13499999046325684, + "tpp_threshold_500_intended_diff_only": 0.1380000114440918, + "tpp_threshold_500_unintended_diff_only": 0.003000020980834961 + } + }, + "canrager/amazon_reviews_mcauley_1and5": { + "1": { + "tpp_threshold_2_total_metric": 0.0007500052452087402, + "tpp_threshold_2_intended_diff_only": 0.004999995231628418, + "tpp_threshold_2_unintended_diff_only": 0.004249989986419678, + "tpp_threshold_5_total_metric": 0.0037499815225601196, + "tpp_threshold_5_intended_diff_only": 0.006999969482421875, + "tpp_threshold_5_unintended_diff_only": 0.0032499879598617554, + "tpp_threshold_10_total_metric": -0.0002499818801879883, + "tpp_threshold_10_intended_diff_only": 0.004999995231628418, + "tpp_threshold_10_unintended_diff_only": 0.005249977111816406, + "tpp_threshold_20_total_metric": -0.003250032663345337, + "tpp_threshold_20_intended_diff_only": 0.0029999613761901855, + "tpp_threshold_20_unintended_diff_only": 0.0062499940395355225, + "tpp_threshold_50_total_metric": 0.0009999871253967285, + "tpp_threshold_50_intended_diff_only": 0.006999969482421875, + "tpp_threshold_50_unintended_diff_only": 0.0059999823570251465, + "tpp_threshold_100_total_metric": 0.002499997615814209, + "tpp_threshold_100_intended_diff_only": 0.0059999823570251465, + "tpp_threshold_100_unintended_diff_only": 0.0034999847412109375, + "tpp_threshold_500_total_metric": 0.015999972820281982, + "tpp_threshold_500_intended_diff_only": 0.02199995517730713, + "tpp_threshold_500_unintended_diff_only": 0.0059999823570251465 + }, + "2": { + "tpp_threshold_2_total_metric": 0.0005000084638595581, + "tpp_threshold_2_intended_diff_only": 0.004999995231628418, + "tpp_threshold_2_unintended_diff_only": 0.00449998676776886, + "tpp_threshold_5_total_metric": 0.012999966740608215, + "tpp_threshold_5_intended_diff_only": 0.011999964714050293, + "tpp_threshold_5_unintended_diff_only": -0.0010000020265579224, + "tpp_threshold_10_total_metric": 0.004749983549118042, + "tpp_threshold_10_intended_diff_only": 0.006999969482421875, + "tpp_threshold_10_unintended_diff_only": 0.002249985933303833, + "tpp_threshold_20_total_metric": 0.013750016689300537, + "tpp_threshold_20_intended_diff_only": 0.018000006675720215, + "tpp_threshold_20_unintended_diff_only": 0.004249989986419678, + "tpp_threshold_50_total_metric": 0.0062499940395355225, + "tpp_threshold_50_intended_diff_only": 0.013999998569488525, + "tpp_threshold_50_unintended_diff_only": 0.007750004529953003, + "tpp_threshold_100_total_metric": 0.023999974131584167, + "tpp_threshold_100_intended_diff_only": 0.029999971389770508, + "tpp_threshold_100_unintended_diff_only": 0.00599999725818634, + "tpp_threshold_500_total_metric": 0.07625000178813934, + "tpp_threshold_500_intended_diff_only": 0.08899998664855957, + "tpp_threshold_500_unintended_diff_only": 0.012749984860420227 + }, + "3": { + "tpp_threshold_2_total_metric": 0.0004999637603759766, + "tpp_threshold_2_intended_diff_only": -0.0020000338554382324, + "tpp_threshold_2_unintended_diff_only": -0.002499997615814209, + "tpp_threshold_5_total_metric": -0.003500029444694519, + "tpp_threshold_5_intended_diff_only": -0.0020000338554382324, + "tpp_threshold_5_unintended_diff_only": 0.0014999955892562866, + "tpp_threshold_10_total_metric": -0.008749991655349731, + "tpp_threshold_10_intended_diff_only": -0.004999995231628418, + "tpp_threshold_10_unintended_diff_only": 0.0037499964237213135, + "tpp_threshold_20_total_metric": -0.000500023365020752, + "tpp_threshold_20_intended_diff_only": 0.001999974250793457, + "tpp_threshold_20_unintended_diff_only": 0.002499997615814209, + "tpp_threshold_50_total_metric": 0.011249974370002747, + "tpp_threshold_50_intended_diff_only": 0.011999964714050293, + "tpp_threshold_50_unintended_diff_only": 0.0007499903440475464, + "tpp_threshold_100_total_metric": 0.024500012397766113, + "tpp_threshold_100_intended_diff_only": 0.027000010013580322, + "tpp_threshold_100_unintended_diff_only": 0.002499997615814209, + "tpp_threshold_500_total_metric": 0.07074999809265137, + "tpp_threshold_500_intended_diff_only": 0.08300000429153442, + "tpp_threshold_500_unintended_diff_only": 0.012250006198883057 + }, + "5": { + "tpp_threshold_2_total_metric": -0.014499962329864502, + "tpp_threshold_2_intended_diff_only": -0.006999969482421875, + "tpp_threshold_2_unintended_diff_only": 0.007499992847442627, + "tpp_threshold_5_total_metric": -0.008249983191490173, + "tpp_threshold_5_intended_diff_only": -0.004999995231628418, + "tpp_threshold_5_unintended_diff_only": 0.0032499879598617554, + "tpp_threshold_10_total_metric": -0.012499943375587463, + "tpp_threshold_10_intended_diff_only": -0.007999956607818604, + "tpp_threshold_10_unintended_diff_only": 0.00449998676776886, + "tpp_threshold_20_total_metric": -0.009499967098236084, + "tpp_threshold_20_intended_diff_only": -0.0009999871253967285, + "tpp_threshold_20_unintended_diff_only": 0.008499979972839355, + "tpp_threshold_50_total_metric": 0.004000052809715271, + "tpp_threshold_50_intended_diff_only": 0.01500004529953003, + "tpp_threshold_50_unintended_diff_only": 0.010999992489814758, + "tpp_threshold_100_total_metric": 0.01875002682209015, + "tpp_threshold_100_intended_diff_only": 0.03600001335144043, + "tpp_threshold_100_unintended_diff_only": 0.01724998652935028, + "tpp_threshold_500_total_metric": 0.09225007891654968, + "tpp_threshold_500_intended_diff_only": 0.11200004816055298, + "tpp_threshold_500_unintended_diff_only": 0.019749969244003296 + }, + "6": { + "tpp_threshold_2_total_metric": 0.008000016212463379, + "tpp_threshold_2_intended_diff_only": 0.009000003337860107, + "tpp_threshold_2_unintended_diff_only": 0.0009999871253967285, + "tpp_threshold_5_total_metric": 0.02149997651576996, + "tpp_threshold_5_intended_diff_only": 0.0209999680519104, + "tpp_threshold_5_unintended_diff_only": -0.0005000084638595581, + "tpp_threshold_10_total_metric": 0.0057499706745147705, + "tpp_threshold_10_intended_diff_only": 0.006999969482421875, + "tpp_threshold_10_unintended_diff_only": 0.0012499988079071045, + "tpp_threshold_20_total_metric": 0.014249995350837708, + "tpp_threshold_20_intended_diff_only": 0.013999998569488525, + "tpp_threshold_20_unintended_diff_only": -0.00024999678134918213, + "tpp_threshold_50_total_metric": 0.03225000202655792, + "tpp_threshold_50_intended_diff_only": 0.03700000047683716, + "tpp_threshold_50_unintended_diff_only": 0.004749998450279236, + "tpp_threshold_100_total_metric": 0.058750003576278687, + "tpp_threshold_100_intended_diff_only": 0.06499999761581421, + "tpp_threshold_100_unintended_diff_only": 0.0062499940395355225, + "tpp_threshold_500_total_metric": 0.1615000218153, + "tpp_threshold_500_intended_diff_only": 0.17500001192092896, + "tpp_threshold_500_unintended_diff_only": 0.013499990105628967 + } + } + } +} \ No newline at end of file diff --git a/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..ed9ef1e2c2400cfe5442e0c18b2e38c456d14191 --- /dev/null +++ b/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,414 @@ +{ + "eval_type_id": "tpp", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "canrager/amazon_reviews_mcauley_1and5" + ], + "perform_scr": false, + "early_stopping_patience": 20, + "train_set_size": 4000, + "test_set_size": 1000, + "context_length": 128, + "probe_train_batch_size": 16, + "probe_test_batch_size": 500, + "probe_epochs": 20, + "probe_lr": 0.001, + "probe_l1_penalty": 0.001, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "lower_vram_usage": false, + "model_name": "gemma-2-2b", + "n_values": [ + 2, + 5, + 10, + 20, + 50, + 100, + 500 + ], + "column1_vals_lookup": { + "LabHC/bias_in_bios_class_set1": [ + [ + "professor", + "nurse" + ], + [ + "architect", + "journalist" + ], + [ + "surgeon", + "psychologist" + ], + [ + "attorney", + "teacher" + ] + ], + "canrager/amazon_reviews_mcauley_1and5": [ + [ + "Books", + "CDs_and_Vinyl" + ], + [ + "Software", + "Electronics" + ], + [ + "Pet_Supplies", + "Office_Products" + ], + [ + "Industrial_and_Scientific", + "Toys_and_Games" + ] + ] + } + }, + "eval_id": "072d027b-019d-4c30-82dd-a58126cb07ee", + "datetime_epoch_millis": 1740163113677, + "eval_result_metrics": { + "tpp_metrics": { + "tpp_threshold_2_total_metric": 0.0036749929189682003, + "tpp_threshold_2_intended_diff_only": 0.00629999041557312, + "tpp_threshold_2_unintended_diff_only": 0.002624997496604919, + "tpp_threshold_5_total_metric": 0.005574998259544372, + "tpp_threshold_5_intended_diff_only": 0.008499997854232787, + "tpp_threshold_5_unintended_diff_only": 0.0029249995946884154, + "tpp_threshold_10_total_metric": 0.009825007617473602, + "tpp_threshold_10_intended_diff_only": 0.013400006294250488, + "tpp_threshold_10_unintended_diff_only": 0.003574998676776886, + "tpp_threshold_20_total_metric": 0.014200010895729066, + "tpp_threshold_20_intended_diff_only": 0.01830000877380371, + "tpp_threshold_20_unintended_diff_only": 0.004099997878074646, + "tpp_threshold_50_total_metric": 0.027800002694129942, + "tpp_threshold_50_intended_diff_only": 0.033399999141693115, + "tpp_threshold_50_unintended_diff_only": 0.005599996447563172, + "tpp_threshold_100_total_metric": 0.04107500612735748, + "tpp_threshold_100_intended_diff_only": 0.04900000095367432, + "tpp_threshold_100_unintended_diff_only": 0.007924994826316834, + "tpp_threshold_500_total_metric": 0.1252000018954277, + "tpp_threshold_500_intended_diff_only": 0.13830000162124634, + "tpp_threshold_500_unintended_diff_only": 0.013099999725818634 + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results", + "tpp_threshold_2_total_metric": 0.006199979782104492, + "tpp_threshold_2_intended_diff_only": 0.008199989795684814, + "tpp_threshold_2_unintended_diff_only": 0.002000010013580322, + "tpp_threshold_5_total_metric": 0.007949993014335632, + "tpp_threshold_5_intended_diff_only": 0.010800004005432129, + "tpp_threshold_5_unintended_diff_only": 0.0028500109910964966, + "tpp_threshold_10_total_metric": 0.012450012564659118, + "tpp_threshold_10_intended_diff_only": 0.014600014686584473, + "tpp_threshold_10_unintended_diff_only": 0.002150002121925354, + "tpp_threshold_20_total_metric": 0.018750008940696717, + "tpp_threshold_20_intended_diff_only": 0.02100001573562622, + "tpp_threshold_20_unintended_diff_only": 0.0022500067949295043, + "tpp_threshold_50_total_metric": 0.03235000073909759, + "tpp_threshold_50_intended_diff_only": 0.03760000467300415, + "tpp_threshold_50_unintended_diff_only": 0.005250003933906555, + "tpp_threshold_100_total_metric": 0.05085000991821289, + "tpp_threshold_100_intended_diff_only": 0.05620001554489136, + "tpp_threshold_100_unintended_diff_only": 0.005350005626678467, + "tpp_threshold_500_total_metric": 0.1507999986410141, + "tpp_threshold_500_intended_diff_only": 0.15760000944137573, + "tpp_threshold_500_unintended_diff_only": 0.006800010800361633 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results", + "tpp_threshold_2_total_metric": 0.0011500060558319093, + "tpp_threshold_2_intended_diff_only": 0.004399991035461426, + "tpp_threshold_2_unintended_diff_only": 0.0032499849796295164, + "tpp_threshold_5_total_metric": 0.003200003504753113, + "tpp_threshold_5_intended_diff_only": 0.006199991703033448, + "tpp_threshold_5_unintended_diff_only": 0.0029999881982803343, + "tpp_threshold_10_total_metric": 0.007200002670288086, + "tpp_threshold_10_intended_diff_only": 0.012199997901916504, + "tpp_threshold_10_unintended_diff_only": 0.004999995231628418, + "tpp_threshold_20_total_metric": 0.009650012850761414, + "tpp_threshold_20_intended_diff_only": 0.015600001811981202, + "tpp_threshold_20_unintended_diff_only": 0.005949988961219788, + "tpp_threshold_50_total_metric": 0.02325000464916229, + "tpp_threshold_50_intended_diff_only": 0.02919999361038208, + "tpp_threshold_50_unintended_diff_only": 0.005949988961219788, + "tpp_threshold_100_total_metric": 0.03130000233650208, + "tpp_threshold_100_intended_diff_only": 0.04179998636245728, + "tpp_threshold_100_unintended_diff_only": 0.0104999840259552, + "tpp_threshold_500_total_metric": 0.09960000514984131, + "tpp_threshold_500_intended_diff_only": 0.11899999380111695, + "tpp_threshold_500_unintended_diff_only": 0.019399988651275634 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": { + "LabHC/bias_in_bios_class_set1": { + "0": { + "tpp_threshold_2_total_metric": 0.01124998927116394, + "tpp_threshold_2_intended_diff_only": 0.013999998569488525, + "tpp_threshold_2_unintended_diff_only": 0.002750009298324585, + "tpp_threshold_5_total_metric": 0.014999985694885254, + "tpp_threshold_5_intended_diff_only": 0.018999993801116943, + "tpp_threshold_5_unintended_diff_only": 0.0040000081062316895, + "tpp_threshold_10_total_metric": 0.010249987244606018, + "tpp_threshold_10_intended_diff_only": 0.013999998569488525, + "tpp_threshold_10_unintended_diff_only": 0.0037500113248825073, + "tpp_threshold_20_total_metric": 0.021500006318092346, + "tpp_threshold_20_intended_diff_only": 0.023000001907348633, + "tpp_threshold_20_unintended_diff_only": 0.0014999955892562866, + "tpp_threshold_50_total_metric": 0.04150000214576721, + "tpp_threshold_50_intended_diff_only": 0.05500000715255737, + "tpp_threshold_50_unintended_diff_only": 0.013500005006790161, + "tpp_threshold_100_total_metric": 0.06099998950958252, + "tpp_threshold_100_intended_diff_only": 0.07400000095367432, + "tpp_threshold_100_unintended_diff_only": 0.013000011444091797, + "tpp_threshold_500_total_metric": 0.19425000250339508, + "tpp_threshold_500_intended_diff_only": 0.2070000171661377, + "tpp_threshold_500_unintended_diff_only": 0.012750014662742615 + }, + "1": { + "tpp_threshold_2_total_metric": 0.009250015020370483, + "tpp_threshold_2_intended_diff_only": 0.009000003337860107, + "tpp_threshold_2_unintended_diff_only": -0.000250011682510376, + "tpp_threshold_5_total_metric": 0.008250042796134949, + "tpp_threshold_5_intended_diff_only": 0.010000050067901611, + "tpp_threshold_5_unintended_diff_only": 0.0017500072717666626, + "tpp_threshold_10_total_metric": 0.006750047206878662, + "tpp_threshold_10_intended_diff_only": 0.006000041961669922, + "tpp_threshold_10_unintended_diff_only": -0.0007500052452087402, + "tpp_threshold_20_total_metric": 0.004000052809715271, + "tpp_threshold_20_intended_diff_only": 0.006000041961669922, + "tpp_threshold_20_unintended_diff_only": 0.001999989151954651, + "tpp_threshold_50_total_metric": 0.006000041961669922, + "tpp_threshold_50_intended_diff_only": 0.01100003719329834, + "tpp_threshold_50_unintended_diff_only": 0.004999995231628418, + "tpp_threshold_100_total_metric": 0.018000051379203796, + "tpp_threshold_100_intended_diff_only": 0.024000048637390137, + "tpp_threshold_100_unintended_diff_only": 0.00599999725818634, + "tpp_threshold_500_total_metric": 0.09325002133846283, + "tpp_threshold_500_intended_diff_only": 0.10000002384185791, + "tpp_threshold_500_unintended_diff_only": 0.006750002503395081 + }, + "2": { + "tpp_threshold_2_total_metric": 0.006749957799911499, + "tpp_threshold_2_intended_diff_only": 0.010999977588653564, + "tpp_threshold_2_unintended_diff_only": 0.004250019788742065, + "tpp_threshold_5_total_metric": 0.014499962329864502, + "tpp_threshold_5_intended_diff_only": 0.019999980926513672, + "tpp_threshold_5_unintended_diff_only": 0.00550001859664917, + "tpp_threshold_10_total_metric": 0.038000017404556274, + "tpp_threshold_10_intended_diff_only": 0.04000002145767212, + "tpp_threshold_10_unintended_diff_only": 0.0020000040531158447, + "tpp_threshold_20_total_metric": 0.047749996185302734, + "tpp_threshold_20_intended_diff_only": 0.050000011920928955, + "tpp_threshold_20_unintended_diff_only": 0.0022500157356262207, + "tpp_threshold_50_total_metric": 0.06199999153614044, + "tpp_threshold_50_intended_diff_only": 0.06400001049041748, + "tpp_threshold_50_unintended_diff_only": 0.0020000189542770386, + "tpp_threshold_100_total_metric": 0.08500000834465027, + "tpp_threshold_100_intended_diff_only": 0.08700001239776611, + "tpp_threshold_100_unintended_diff_only": 0.0020000040531158447, + "tpp_threshold_500_total_metric": 0.26524999737739563, + "tpp_threshold_500_intended_diff_only": 0.2670000195503235, + "tpp_threshold_500_unintended_diff_only": 0.0017500221729278564 + }, + "6": { + "tpp_threshold_2_total_metric": -0.0002500265836715698, + "tpp_threshold_2_intended_diff_only": 0.0009999871253967285, + "tpp_threshold_2_unintended_diff_only": 0.0012500137090682983, + "tpp_threshold_5_total_metric": 0.000250011682510376, + "tpp_threshold_5_intended_diff_only": -0.0009999871253967285, + "tpp_threshold_5_unintended_diff_only": -0.0012499988079071045, + "tpp_threshold_10_total_metric": 0.003000006079673767, + "tpp_threshold_10_intended_diff_only": 0.0040000081062316895, + "tpp_threshold_10_unintended_diff_only": 0.0010000020265579224, + "tpp_threshold_20_total_metric": 0.003250017762184143, + "tpp_threshold_20_intended_diff_only": 0.0020000338554382324, + "tpp_threshold_20_unintended_diff_only": -0.0012499839067459106, + "tpp_threshold_50_total_metric": 0.004250004887580872, + "tpp_threshold_50_intended_diff_only": 0.004999995231628418, + "tpp_threshold_50_unintended_diff_only": 0.0007499903440475464, + "tpp_threshold_100_total_metric": 0.011750027537345886, + "tpp_threshold_100_intended_diff_only": 0.01100003719329834, + "tpp_threshold_100_unintended_diff_only": -0.0007499903440475464, + "tpp_threshold_500_total_metric": 0.015999987721443176, + "tpp_threshold_500_intended_diff_only": 0.019999980926513672, + "tpp_threshold_500_unintended_diff_only": 0.003999993205070496 + }, + "9": { + "tpp_threshold_2_total_metric": 0.003999963402748108, + "tpp_threshold_2_intended_diff_only": 0.0059999823570251465, + "tpp_threshold_2_unintended_diff_only": 0.0020000189542770386, + "tpp_threshold_5_total_metric": 0.001749962568283081, + "tpp_threshold_5_intended_diff_only": 0.0059999823570251465, + "tpp_threshold_5_unintended_diff_only": 0.004250019788742065, + "tpp_threshold_10_total_metric": 0.004250004887580872, + "tpp_threshold_10_intended_diff_only": 0.009000003337860107, + "tpp_threshold_10_unintended_diff_only": 0.004749998450279236, + "tpp_threshold_20_total_metric": 0.017249971628189087, + "tpp_threshold_20_intended_diff_only": 0.02399998903274536, + "tpp_threshold_20_unintended_diff_only": 0.006750017404556274, + "tpp_threshold_50_total_metric": 0.04799996316432953, + "tpp_threshold_50_intended_diff_only": 0.05299997329711914, + "tpp_threshold_50_unintended_diff_only": 0.005000010132789612, + "tpp_threshold_100_total_metric": 0.07849997282028198, + "tpp_threshold_100_intended_diff_only": 0.08499997854232788, + "tpp_threshold_100_unintended_diff_only": 0.0065000057220458984, + "tpp_threshold_500_total_metric": 0.18524998426437378, + "tpp_threshold_500_intended_diff_only": 0.1940000057220459, + "tpp_threshold_500_unintended_diff_only": 0.00875002145767212 + } + }, + "canrager/amazon_reviews_mcauley_1and5": { + "1": { + "tpp_threshold_2_total_metric": 0.0062499940395355225, + "tpp_threshold_2_intended_diff_only": 0.010999977588653564, + "tpp_threshold_2_unintended_diff_only": 0.004749983549118042, + "tpp_threshold_5_total_metric": 0.0015000104904174805, + "tpp_threshold_5_intended_diff_only": 0.004999995231628418, + "tpp_threshold_5_unintended_diff_only": 0.0034999847412109375, + "tpp_threshold_10_total_metric": 0.0032499730587005615, + "tpp_threshold_10_intended_diff_only": 0.010999977588653564, + "tpp_threshold_10_unintended_diff_only": 0.007750004529953003, + "tpp_threshold_20_total_metric": 0.001749977469444275, + "tpp_threshold_20_intended_diff_only": 0.006999969482421875, + "tpp_threshold_20_unintended_diff_only": 0.0052499920129776, + "tpp_threshold_50_total_metric": 0.001499965786933899, + "tpp_threshold_50_intended_diff_only": 0.003999948501586914, + "tpp_threshold_50_unintended_diff_only": 0.002499982714653015, + "tpp_threshold_100_total_metric": 0.001749962568283081, + "tpp_threshold_100_intended_diff_only": 0.007999956607818604, + "tpp_threshold_100_unintended_diff_only": 0.0062499940395355225, + "tpp_threshold_500_total_metric": 0.027499958872795105, + "tpp_threshold_500_intended_diff_only": 0.030999958515167236, + "tpp_threshold_500_unintended_diff_only": 0.0034999996423721313 + }, + "2": { + "tpp_threshold_2_total_metric": 0.00475001335144043, + "tpp_threshold_2_intended_diff_only": 0.004999995231628418, + "tpp_threshold_2_unintended_diff_only": 0.0002499818801879883, + "tpp_threshold_5_total_metric": 0.001499965786933899, + "tpp_threshold_5_intended_diff_only": 0.007999956607818604, + "tpp_threshold_5_unintended_diff_only": 0.006499990820884705, + "tpp_threshold_10_total_metric": 0.008749961853027344, + "tpp_threshold_10_intended_diff_only": 0.011999964714050293, + "tpp_threshold_10_unintended_diff_only": 0.0032500028610229492, + "tpp_threshold_20_total_metric": 0.005750015377998352, + "tpp_threshold_20_intended_diff_only": 0.009000003337860107, + "tpp_threshold_20_unintended_diff_only": 0.0032499879598617554, + "tpp_threshold_50_total_metric": 0.012500002980232239, + "tpp_threshold_50_intended_diff_only": 0.018000006675720215, + "tpp_threshold_50_unintended_diff_only": 0.005500003695487976, + "tpp_threshold_100_total_metric": 0.019000008702278137, + "tpp_threshold_100_intended_diff_only": 0.02899998426437378, + "tpp_threshold_100_unintended_diff_only": 0.009999975562095642, + "tpp_threshold_500_total_metric": 0.08375000953674316, + "tpp_threshold_500_intended_diff_only": 0.09299999475479126, + "tpp_threshold_500_unintended_diff_only": 0.009249985218048096 + }, + "3": { + "tpp_threshold_2_total_metric": -0.00849999487400055, + "tpp_threshold_2_intended_diff_only": -0.004999995231628418, + "tpp_threshold_2_unintended_diff_only": 0.0034999996423721313, + "tpp_threshold_5_total_metric": 0.005000025033950806, + "tpp_threshold_5_intended_diff_only": 0.0040000081062316895, + "tpp_threshold_5_unintended_diff_only": -0.0010000169277191162, + "tpp_threshold_10_total_metric": 0.006750002503395081, + "tpp_threshold_10_intended_diff_only": 0.009999990463256836, + "tpp_threshold_10_unintended_diff_only": 0.0032499879598617554, + "tpp_threshold_20_total_metric": 0.003250017762184143, + "tpp_threshold_20_intended_diff_only": 0.009000003337860107, + "tpp_threshold_20_unintended_diff_only": 0.005749985575675964, + "tpp_threshold_50_total_metric": 0.016000017523765564, + "tpp_threshold_50_intended_diff_only": 0.018000006675720215, + "tpp_threshold_50_unintended_diff_only": 0.001999989151954651, + "tpp_threshold_100_total_metric": 0.023000001907348633, + "tpp_threshold_100_intended_diff_only": 0.03299999237060547, + "tpp_threshold_100_unintended_diff_only": 0.009999990463256836, + "tpp_threshold_500_total_metric": 0.0832500159740448, + "tpp_threshold_500_intended_diff_only": 0.09700000286102295, + "tpp_threshold_500_unintended_diff_only": 0.01374998688697815 + }, + "5": { + "tpp_threshold_2_total_metric": -0.004999950528144836, + "tpp_threshold_2_intended_diff_only": -0.001999974250793457, + "tpp_threshold_2_unintended_diff_only": 0.0029999762773513794, + "tpp_threshold_5_total_metric": -0.0009999871253967285, + "tpp_threshold_5_intended_diff_only": 0.004999995231628418, + "tpp_threshold_5_unintended_diff_only": 0.0059999823570251465, + "tpp_threshold_10_total_metric": -0.0074999332427978516, + "tpp_threshold_10_intended_diff_only": 0.001000046730041504, + "tpp_threshold_10_unintended_diff_only": 0.008499979972839355, + "tpp_threshold_20_total_metric": 0.0055000633001327515, + "tpp_threshold_20_intended_diff_only": 0.01500004529953003, + "tpp_threshold_20_unintended_diff_only": 0.009499981999397278, + "tpp_threshold_50_total_metric": 0.03925004601478577, + "tpp_threshold_50_intended_diff_only": 0.04900002479553223, + "tpp_threshold_50_unintended_diff_only": 0.00974997878074646, + "tpp_threshold_100_total_metric": 0.04725003242492676, + "tpp_threshold_100_intended_diff_only": 0.06400001049041748, + "tpp_threshold_100_unintended_diff_only": 0.016749978065490723, + "tpp_threshold_500_total_metric": 0.08625003695487976, + "tpp_threshold_500_intended_diff_only": 0.14600002765655518, + "tpp_threshold_500_unintended_diff_only": 0.059749990701675415 + }, + "6": { + "tpp_threshold_2_total_metric": 0.00824996829032898, + "tpp_threshold_2_intended_diff_only": 0.012999951839447021, + "tpp_threshold_2_unintended_diff_only": 0.004749983549118042, + "tpp_threshold_5_total_metric": 0.009000003337860107, + "tpp_threshold_5_intended_diff_only": 0.009000003337860107, + "tpp_threshold_5_unintended_diff_only": 0.0, + "tpp_threshold_10_total_metric": 0.024750009179115295, + "tpp_threshold_10_intended_diff_only": 0.027000010013580322, + "tpp_threshold_10_unintended_diff_only": 0.002250000834465027, + "tpp_threshold_20_total_metric": 0.031999990344047546, + "tpp_threshold_20_intended_diff_only": 0.03799998760223389, + "tpp_threshold_20_unintended_diff_only": 0.00599999725818634, + "tpp_threshold_50_total_metric": 0.046999990940093994, + "tpp_threshold_50_intended_diff_only": 0.05699998140335083, + "tpp_threshold_50_unintended_diff_only": 0.009999990463256836, + "tpp_threshold_100_total_metric": 0.06550000607967377, + "tpp_threshold_100_intended_diff_only": 0.07499998807907104, + "tpp_threshold_100_unintended_diff_only": 0.009499981999397278, + "tpp_threshold_500_total_metric": 0.2172500044107437, + "tpp_threshold_500_intended_diff_only": 0.2279999852180481, + "tpp_threshold_500_unintended_diff_only": 0.010749980807304382 + } + } + } +} \ No newline at end of file diff --git a/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..60697517fe3e350ae3ee5d081d44154a32f1ae90 --- /dev/null +++ b/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,414 @@ +{ + "eval_type_id": "tpp", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "canrager/amazon_reviews_mcauley_1and5" + ], + "perform_scr": false, + "early_stopping_patience": 20, + "train_set_size": 4000, + "test_set_size": 1000, + "context_length": 128, + "probe_train_batch_size": 16, + "probe_test_batch_size": 500, + "probe_epochs": 20, + "probe_lr": 0.001, + "probe_l1_penalty": 0.001, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "lower_vram_usage": false, + "model_name": "gemma-2-2b", + "n_values": [ + 2, + 5, + 10, + 20, + 50, + 100, + 500 + ], + "column1_vals_lookup": { + "LabHC/bias_in_bios_class_set1": [ + [ + "professor", + "nurse" + ], + [ + "architect", + "journalist" + ], + [ + "surgeon", + "psychologist" + ], + [ + "attorney", + "teacher" + ] + ], + "canrager/amazon_reviews_mcauley_1and5": [ + [ + "Books", + "CDs_and_Vinyl" + ], + [ + "Software", + "Electronics" + ], + [ + "Pet_Supplies", + "Office_Products" + ], + [ + "Industrial_and_Scientific", + "Toys_and_Games" + ] + ] + } + }, + "eval_id": "6d523acd-6837-4bf6-8169-4ebea2aedf9e", + "datetime_epoch_millis": 1740163747509, + "eval_result_metrics": { + "tpp_metrics": { + "tpp_threshold_2_total_metric": 0.001275007426738739, + "tpp_threshold_2_intended_diff_only": 0.0037000060081481935, + "tpp_threshold_2_unintended_diff_only": 0.0024249985814094547, + "tpp_threshold_5_total_metric": 0.0007249891757965087, + "tpp_threshold_5_intended_diff_only": 0.0034999907016754154, + "tpp_threshold_5_unintended_diff_only": 0.0027750015258789064, + "tpp_threshold_10_total_metric": 0.004174999892711639, + "tpp_threshold_10_intended_diff_only": 0.0078000009059906, + "tpp_threshold_10_unintended_diff_only": 0.0036250010132789614, + "tpp_threshold_20_total_metric": 0.0060749977827072145, + "tpp_threshold_20_intended_diff_only": 0.009899997711181642, + "tpp_threshold_20_unintended_diff_only": 0.003824999928474426, + "tpp_threshold_50_total_metric": 0.016099993884563447, + "tpp_threshold_50_intended_diff_only": 0.022099995613098146, + "tpp_threshold_50_unintended_diff_only": 0.006000001728534699, + "tpp_threshold_100_total_metric": 0.02952500283718109, + "tpp_threshold_100_intended_diff_only": 0.039000004529953, + "tpp_threshold_100_unintended_diff_only": 0.009475001692771911, + "tpp_threshold_500_total_metric": 0.12557500153779982, + "tpp_threshold_500_intended_diff_only": 0.1384999990463257, + "tpp_threshold_500_unintended_diff_only": 0.012924997508525847 + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results", + "tpp_threshold_2_total_metric": 0.0031000018119812013, + "tpp_threshold_2_intended_diff_only": 0.0048000097274780275, + "tpp_threshold_2_unintended_diff_only": 0.0017000079154968263, + "tpp_threshold_5_total_metric": 0.0022999852895736693, + "tpp_threshold_5_intended_diff_only": 0.004799997806549073, + "tpp_threshold_5_unintended_diff_only": 0.002500012516975403, + "tpp_threshold_10_total_metric": 0.006099998950958252, + "tpp_threshold_10_intended_diff_only": 0.008200013637542724, + "tpp_threshold_10_unintended_diff_only": 0.0021000146865844727, + "tpp_threshold_20_total_metric": 0.01099998652935028, + "tpp_threshold_20_intended_diff_only": 0.013199996948242188, + "tpp_threshold_20_unintended_diff_only": 0.002200010418891907, + "tpp_threshold_50_total_metric": 0.0169999897480011, + "tpp_threshold_50_intended_diff_only": 0.020399999618530274, + "tpp_threshold_50_unintended_diff_only": 0.0034000098705291746, + "tpp_threshold_100_total_metric": 0.03155000209808349, + "tpp_threshold_100_intended_diff_only": 0.035600018501281736, + "tpp_threshold_100_unintended_diff_only": 0.004050016403198242, + "tpp_threshold_500_total_metric": 0.12450000941753388, + "tpp_threshold_500_intended_diff_only": 0.13040001392364503, + "tpp_threshold_500_unintended_diff_only": 0.005900004506111145 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results", + "tpp_threshold_2_total_metric": -0.0005499869585037231, + "tpp_threshold_2_intended_diff_only": 0.0026000022888183595, + "tpp_threshold_2_unintended_diff_only": 0.0031499892473220827, + "tpp_threshold_5_total_metric": -0.0008500069379806519, + "tpp_threshold_5_intended_diff_only": 0.0021999835968017577, + "tpp_threshold_5_unintended_diff_only": 0.0030499905347824096, + "tpp_threshold_10_total_metric": 0.002250000834465027, + "tpp_threshold_10_intended_diff_only": 0.007399988174438476, + "tpp_threshold_10_unintended_diff_only": 0.0051499873399734495, + "tpp_threshold_20_total_metric": 0.001150009036064148, + "tpp_threshold_20_intended_diff_only": 0.006599998474121094, + "tpp_threshold_20_unintended_diff_only": 0.0054499894380569455, + "tpp_threshold_50_total_metric": 0.015199998021125793, + "tpp_threshold_50_intended_diff_only": 0.023799991607666014, + "tpp_threshold_50_unintended_diff_only": 0.008599993586540223, + "tpp_threshold_100_total_metric": 0.027500003576278687, + "tpp_threshold_100_intended_diff_only": 0.04239999055862427, + "tpp_threshold_100_unintended_diff_only": 0.01489998698234558, + "tpp_threshold_500_total_metric": 0.12664999365806578, + "tpp_threshold_500_intended_diff_only": 0.14659998416900635, + "tpp_threshold_500_unintended_diff_only": 0.01994999051094055 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": { + "LabHC/bias_in_bios_class_set1": { + "0": { + "tpp_threshold_2_total_metric": 0.007249996066093445, + "tpp_threshold_2_intended_diff_only": 0.009000003337860107, + "tpp_threshold_2_unintended_diff_only": 0.0017500072717666626, + "tpp_threshold_5_total_metric": 0.00925000011920929, + "tpp_threshold_5_intended_diff_only": 0.012000024318695068, + "tpp_threshold_5_unintended_diff_only": 0.002750024199485779, + "tpp_threshold_10_total_metric": 0.003000006079673767, + "tpp_threshold_10_intended_diff_only": 0.00700002908706665, + "tpp_threshold_10_unintended_diff_only": 0.004000023007392883, + "tpp_threshold_20_total_metric": 0.0169999897480011, + "tpp_threshold_20_intended_diff_only": 0.018999993801116943, + "tpp_threshold_20_unintended_diff_only": 0.0020000040531158447, + "tpp_threshold_50_total_metric": 0.028999969363212585, + "tpp_threshold_50_intended_diff_only": 0.03299999237060547, + "tpp_threshold_50_unintended_diff_only": 0.004000023007392883, + "tpp_threshold_100_total_metric": 0.05250002443790436, + "tpp_threshold_100_intended_diff_only": 0.058000028133392334, + "tpp_threshold_100_unintended_diff_only": 0.005500003695487976, + "tpp_threshold_500_total_metric": 0.23200003802776337, + "tpp_threshold_500_intended_diff_only": 0.23700004816055298, + "tpp_threshold_500_unintended_diff_only": 0.005000010132789612 + }, + "1": { + "tpp_threshold_2_total_metric": 0.007750034332275391, + "tpp_threshold_2_intended_diff_only": 0.00700002908706665, + "tpp_threshold_2_unintended_diff_only": -0.0007500052452087402, + "tpp_threshold_5_total_metric": 0.0015000104904174805, + "tpp_threshold_5_intended_diff_only": 0.003000020980834961, + "tpp_threshold_5_unintended_diff_only": 0.0015000104904174805, + "tpp_threshold_10_total_metric": 0.006750032305717468, + "tpp_threshold_10_intended_diff_only": 0.006000041961669922, + "tpp_threshold_10_unintended_diff_only": -0.0007499903440475464, + "tpp_threshold_20_total_metric": -0.0002499818801879883, + "tpp_threshold_20_intended_diff_only": 0.003000020980834961, + "tpp_threshold_20_unintended_diff_only": 0.0032500028610229492, + "tpp_threshold_50_total_metric": 0.002750024199485779, + "tpp_threshold_50_intended_diff_only": 0.00700002908706665, + "tpp_threshold_50_unintended_diff_only": 0.004250004887580872, + "tpp_threshold_100_total_metric": 0.0017500221729278564, + "tpp_threshold_100_intended_diff_only": 0.006000041961669922, + "tpp_threshold_100_unintended_diff_only": 0.004250019788742065, + "tpp_threshold_500_total_metric": 0.059500038623809814, + "tpp_threshold_500_intended_diff_only": 0.06700003147125244, + "tpp_threshold_500_unintended_diff_only": 0.007499992847442627 + }, + "2": { + "tpp_threshold_2_total_metric": -0.002750009298324585, + "tpp_threshold_2_intended_diff_only": 0.0, + "tpp_threshold_2_unintended_diff_only": 0.002750009298324585, + "tpp_threshold_5_total_metric": -0.003250032663345337, + "tpp_threshold_5_intended_diff_only": 0.0009999871253967285, + "tpp_threshold_5_unintended_diff_only": 0.004250019788742065, + "tpp_threshold_10_total_metric": 0.013499975204467773, + "tpp_threshold_10_intended_diff_only": 0.014999985694885254, + "tpp_threshold_10_unintended_diff_only": 0.0015000104904174805, + "tpp_threshold_20_total_metric": 0.02374996244907379, + "tpp_threshold_20_intended_diff_only": 0.02499997615814209, + "tpp_threshold_20_unintended_diff_only": 0.0012500137090682983, + "tpp_threshold_50_total_metric": 0.022499963641166687, + "tpp_threshold_50_intended_diff_only": 0.02499997615814209, + "tpp_threshold_50_unintended_diff_only": 0.002500012516975403, + "tpp_threshold_100_total_metric": 0.03275001049041748, + "tpp_threshold_100_intended_diff_only": 0.0350000262260437, + "tpp_threshold_100_unintended_diff_only": 0.0022500157356262207, + "tpp_threshold_500_total_metric": 0.148250013589859, + "tpp_threshold_500_intended_diff_only": 0.15200001001358032, + "tpp_threshold_500_unintended_diff_only": 0.0037499964237213135 + }, + "6": { + "tpp_threshold_2_total_metric": 1.4901161193847656e-08, + "tpp_threshold_2_intended_diff_only": 0.0020000338554382324, + "tpp_threshold_2_unintended_diff_only": 0.0020000189542770386, + "tpp_threshold_5_total_metric": 0.001999989151954651, + "tpp_threshold_5_intended_diff_only": 0.0009999871253967285, + "tpp_threshold_5_unintended_diff_only": -0.0010000020265579224, + "tpp_threshold_10_total_metric": 0.002749994397163391, + "tpp_threshold_10_intended_diff_only": 0.0040000081062316895, + "tpp_threshold_10_unintended_diff_only": 0.0012500137090682983, + "tpp_threshold_20_total_metric": 0.003250017762184143, + "tpp_threshold_20_intended_diff_only": 0.0020000338554382324, + "tpp_threshold_20_unintended_diff_only": -0.0012499839067459106, + "tpp_threshold_50_total_metric": 0.003000006079673767, + "tpp_threshold_50_intended_diff_only": 0.0040000081062316895, + "tpp_threshold_50_unintended_diff_only": 0.0010000020265579224, + "tpp_threshold_100_total_metric": 0.0034999847412109375, + "tpp_threshold_100_intended_diff_only": 0.0040000081062316895, + "tpp_threshold_100_unintended_diff_only": 0.000500023365020752, + "tpp_threshold_500_total_metric": 0.012749999761581421, + "tpp_threshold_500_intended_diff_only": 0.018000006675720215, + "tpp_threshold_500_unintended_diff_only": 0.005250006914138794 + }, + "9": { + "tpp_threshold_2_total_metric": 0.0032499730587005615, + "tpp_threshold_2_intended_diff_only": 0.0059999823570251465, + "tpp_threshold_2_unintended_diff_only": 0.002750009298324585, + "tpp_threshold_5_total_metric": 0.001999959349632263, + "tpp_threshold_5_intended_diff_only": 0.006999969482421875, + "tpp_threshold_5_unintended_diff_only": 0.005000010132789612, + "tpp_threshold_10_total_metric": 0.00449998676776886, + "tpp_threshold_10_intended_diff_only": 0.009000003337860107, + "tpp_threshold_10_unintended_diff_only": 0.0045000165700912476, + "tpp_threshold_20_total_metric": 0.011249944567680359, + "tpp_threshold_20_intended_diff_only": 0.01699995994567871, + "tpp_threshold_20_unintended_diff_only": 0.005750015377998352, + "tpp_threshold_50_total_metric": 0.027749985456466675, + "tpp_threshold_50_intended_diff_only": 0.03299999237060547, + "tpp_threshold_50_unintended_diff_only": 0.005250006914138794, + "tpp_threshold_100_total_metric": 0.06724996864795685, + "tpp_threshold_100_intended_diff_only": 0.07499998807907104, + "tpp_threshold_100_unintended_diff_only": 0.007750019431114197, + "tpp_threshold_500_total_metric": 0.16999995708465576, + "tpp_threshold_500_intended_diff_only": 0.17799997329711914, + "tpp_threshold_500_unintended_diff_only": 0.008000016212463379 + } + }, + "canrager/amazon_reviews_mcauley_1and5": { + "1": { + "tpp_threshold_2_total_metric": 0.00599999725818634, + "tpp_threshold_2_intended_diff_only": 0.009999990463256836, + "tpp_threshold_2_unintended_diff_only": 0.003999993205070496, + "tpp_threshold_5_total_metric": 0.003999963402748108, + "tpp_threshold_5_intended_diff_only": 0.006999969482421875, + "tpp_threshold_5_unintended_diff_only": 0.003000006079673767, + "tpp_threshold_10_total_metric": 0.003000006079673767, + "tpp_threshold_10_intended_diff_only": 0.009999990463256836, + "tpp_threshold_10_unintended_diff_only": 0.006999984383583069, + "tpp_threshold_20_total_metric": -0.0025000274181365967, + "tpp_threshold_20_intended_diff_only": 0.0029999613761901855, + "tpp_threshold_20_unintended_diff_only": 0.005499988794326782, + "tpp_threshold_50_total_metric": 0.00349995493888855, + "tpp_threshold_50_intended_diff_only": 0.003999948501586914, + "tpp_threshold_50_unintended_diff_only": 0.0004999935626983643, + "tpp_threshold_100_total_metric": 0.0020000189542770386, + "tpp_threshold_100_intended_diff_only": 0.009000003337860107, + "tpp_threshold_100_unintended_diff_only": 0.006999984383583069, + "tpp_threshold_500_total_metric": 0.022749975323677063, + "tpp_threshold_500_intended_diff_only": 0.042999982833862305, + "tpp_threshold_500_unintended_diff_only": 0.02025000751018524 + }, + "2": { + "tpp_threshold_2_total_metric": -0.0004999935626983643, + "tpp_threshold_2_intended_diff_only": 0.0, + "tpp_threshold_2_unintended_diff_only": 0.0004999935626983643, + "tpp_threshold_5_total_metric": -0.005500033497810364, + "tpp_threshold_5_intended_diff_only": 0.0029999613761901855, + "tpp_threshold_5_unintended_diff_only": 0.00849999487400055, + "tpp_threshold_10_total_metric": 0.0014999955892562866, + "tpp_threshold_10_intended_diff_only": 0.006999969482421875, + "tpp_threshold_10_unintended_diff_only": 0.005499973893165588, + "tpp_threshold_20_total_metric": -0.0009999722242355347, + "tpp_threshold_20_intended_diff_only": 0.0040000081062316895, + "tpp_threshold_20_unintended_diff_only": 0.004999980330467224, + "tpp_threshold_50_total_metric": 0.008999988436698914, + "tpp_threshold_50_intended_diff_only": 0.014999985694885254, + "tpp_threshold_50_unintended_diff_only": 0.00599999725818634, + "tpp_threshold_100_total_metric": 0.01150001585483551, + "tpp_threshold_100_intended_diff_only": 0.023000001907348633, + "tpp_threshold_100_unintended_diff_only": 0.011499986052513123, + "tpp_threshold_500_total_metric": 0.08924996852874756, + "tpp_threshold_500_intended_diff_only": 0.10499995946884155, + "tpp_threshold_500_unintended_diff_only": 0.015749990940093994 + }, + "3": { + "tpp_threshold_2_total_metric": -0.009249970316886902, + "tpp_threshold_2_intended_diff_only": -0.0059999823570251465, + "tpp_threshold_2_unintended_diff_only": 0.0032499879598617554, + "tpp_threshold_5_total_metric": -0.004499971866607666, + "tpp_threshold_5_intended_diff_only": -0.004999995231628418, + "tpp_threshold_5_unintended_diff_only": -0.000500023365020752, + "tpp_threshold_10_total_metric": 0.004249989986419678, + "tpp_threshold_10_intended_diff_only": 0.0059999823570251465, + "tpp_threshold_10_unintended_diff_only": 0.0017499923706054688, + "tpp_threshold_20_total_metric": -0.005499973893165588, + "tpp_threshold_20_intended_diff_only": -0.0009999871253967285, + "tpp_threshold_20_unintended_diff_only": 0.00449998676776886, + "tpp_threshold_50_total_metric": 0.013749971985816956, + "tpp_threshold_50_intended_diff_only": 0.034999966621398926, + "tpp_threshold_50_unintended_diff_only": 0.02124999463558197, + "tpp_threshold_100_total_metric": 0.029999971389770508, + "tpp_threshold_100_intended_diff_only": 0.05799996852874756, + "tpp_threshold_100_unintended_diff_only": 0.02799999713897705, + "tpp_threshold_500_total_metric": 0.10924999415874481, + "tpp_threshold_500_intended_diff_only": 0.13599997758865356, + "tpp_threshold_500_unintended_diff_only": 0.026749983429908752 + }, + "5": { + "tpp_threshold_2_total_metric": -0.005499929189682007, + "tpp_threshold_2_intended_diff_only": -0.0029999613761901855, + "tpp_threshold_2_unintended_diff_only": 0.0024999678134918213, + "tpp_threshold_5_total_metric": -0.006249964237213135, + "tpp_threshold_5_intended_diff_only": -0.0009999871253967285, + "tpp_threshold_5_unintended_diff_only": 0.005249977111816406, + "tpp_threshold_10_total_metric": -0.009999975562095642, + "tpp_threshold_10_intended_diff_only": -0.0009999871253967285, + "tpp_threshold_10_unintended_diff_only": 0.008999988436698914, + "tpp_threshold_20_total_metric": -0.006999969482421875, + "tpp_threshold_20_intended_diff_only": 0.003000020980834961, + "tpp_threshold_20_unintended_diff_only": 0.009999990463256836, + "tpp_threshold_50_total_metric": 0.01650005578994751, + "tpp_threshold_50_intended_diff_only": 0.024000048637390137, + "tpp_threshold_50_unintended_diff_only": 0.007499992847442627, + "tpp_threshold_100_total_metric": 0.04000003635883331, + "tpp_threshold_100_intended_diff_only": 0.06300002336502075, + "tpp_threshold_100_unintended_diff_only": 0.02299998700618744, + "tpp_threshold_500_total_metric": 0.16475005447864532, + "tpp_threshold_500_intended_diff_only": 0.19200003147125244, + "tpp_threshold_500_unintended_diff_only": 0.027249976992607117 + }, + "6": { + "tpp_threshold_2_total_metric": 0.006499961018562317, + "tpp_threshold_2_intended_diff_only": 0.011999964714050293, + "tpp_threshold_2_unintended_diff_only": 0.005500003695487976, + "tpp_threshold_5_total_metric": 0.007999971508979797, + "tpp_threshold_5_intended_diff_only": 0.006999969482421875, + "tpp_threshold_5_unintended_diff_only": -0.0010000020265579224, + "tpp_threshold_10_total_metric": 0.012499988079071045, + "tpp_threshold_10_intended_diff_only": 0.014999985694885254, + "tpp_threshold_10_unintended_diff_only": 0.002499997615814209, + "tpp_threshold_20_total_metric": 0.021749988198280334, + "tpp_threshold_20_intended_diff_only": 0.02399998903274536, + "tpp_threshold_20_unintended_diff_only": 0.002250000834465027, + "tpp_threshold_50_total_metric": 0.03325001895427704, + "tpp_threshold_50_intended_diff_only": 0.04100000858306885, + "tpp_threshold_50_unintended_diff_only": 0.007749989628791809, + "tpp_threshold_100_total_metric": 0.05399997532367706, + "tpp_threshold_100_intended_diff_only": 0.05899995565414429, + "tpp_threshold_100_unintended_diff_only": 0.004999980330467224, + "tpp_threshold_500_total_metric": 0.24724997580051422, + "tpp_threshold_500_intended_diff_only": 0.2569999694824219, + "tpp_threshold_500_unintended_diff_only": 0.009749993681907654 + } + } + } +} \ No newline at end of file diff --git a/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..10b9ab8ae65f34319ac028bf18ef57c58988cf05 --- /dev/null +++ b/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,414 @@ +{ + "eval_type_id": "tpp", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "canrager/amazon_reviews_mcauley_1and5" + ], + "perform_scr": false, + "early_stopping_patience": 20, + "train_set_size": 4000, + "test_set_size": 1000, + "context_length": 128, + "probe_train_batch_size": 16, + "probe_test_batch_size": 500, + "probe_epochs": 20, + "probe_lr": 0.001, + "probe_l1_penalty": 0.001, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "lower_vram_usage": false, + "model_name": "gemma-2-2b", + "n_values": [ + 2, + 5, + 10, + 20, + 50, + 100, + 500 + ], + "column1_vals_lookup": { + "LabHC/bias_in_bios_class_set1": [ + [ + "professor", + "nurse" + ], + [ + "architect", + "journalist" + ], + [ + "surgeon", + "psychologist" + ], + [ + "attorney", + "teacher" + ] + ], + "canrager/amazon_reviews_mcauley_1and5": [ + [ + "Books", + "CDs_and_Vinyl" + ], + [ + "Software", + "Electronics" + ], + [ + "Pet_Supplies", + "Office_Products" + ], + [ + "Industrial_and_Scientific", + "Toys_and_Games" + ] + ] + } + }, + "eval_id": "16072d61-c7c6-4047-91dd-5fff05bf32c4", + "datetime_epoch_millis": 1740163589499, + "eval_result_metrics": { + "tpp_metrics": { + "tpp_threshold_2_total_metric": 0.0028499945998191833, + "tpp_threshold_2_intended_diff_only": 0.00549999475479126, + "tpp_threshold_2_unintended_diff_only": 0.0026500001549720764, + "tpp_threshold_5_total_metric": 0.004700003564357758, + "tpp_threshold_5_intended_diff_only": 0.0076000034809112545, + "tpp_threshold_5_unintended_diff_only": 0.0028999999165534975, + "tpp_threshold_10_total_metric": 0.01082499623298645, + "tpp_threshold_10_intended_diff_only": 0.014699995517730713, + "tpp_threshold_10_unintended_diff_only": 0.0038749992847442625, + "tpp_threshold_20_total_metric": 0.017949993908405303, + "tpp_threshold_20_intended_diff_only": 0.02199999690055847, + "tpp_threshold_20_unintended_diff_only": 0.004050002992153167, + "tpp_threshold_50_total_metric": 0.03577501326799393, + "tpp_threshold_50_intended_diff_only": 0.04050000905990601, + "tpp_threshold_50_unintended_diff_only": 0.0047249957919120785, + "tpp_threshold_100_total_metric": 0.06557500511407852, + "tpp_threshold_100_intended_diff_only": 0.07450000643730165, + "tpp_threshold_100_unintended_diff_only": 0.008925001323223113, + "tpp_threshold_500_total_metric": 0.21700000911951065, + "tpp_threshold_500_intended_diff_only": 0.23130001425743102, + "tpp_threshold_500_unintended_diff_only": 0.01430000513792038 + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results", + "tpp_threshold_2_total_metric": 0.003350001573562622, + "tpp_threshold_2_intended_diff_only": 0.005600011348724366, + "tpp_threshold_2_unintended_diff_only": 0.0022500097751617433, + "tpp_threshold_5_total_metric": 0.006550008058547973, + "tpp_threshold_5_intended_diff_only": 0.009600019454956055, + "tpp_threshold_5_unintended_diff_only": 0.003050011396408081, + "tpp_threshold_10_total_metric": 0.01145000457763672, + "tpp_threshold_10_intended_diff_only": 0.014200007915496827, + "tpp_threshold_10_unintended_diff_only": 0.0027500033378601075, + "tpp_threshold_20_total_metric": 0.01974998414516449, + "tpp_threshold_20_intended_diff_only": 0.02239999771118164, + "tpp_threshold_20_unintended_diff_only": 0.002650013566017151, + "tpp_threshold_50_total_metric": 0.03780001997947693, + "tpp_threshold_50_intended_diff_only": 0.04080002307891846, + "tpp_threshold_50_unintended_diff_only": 0.003000003099441528, + "tpp_threshold_100_total_metric": 0.06720000505447388, + "tpp_threshold_100_intended_diff_only": 0.07420001029968262, + "tpp_threshold_100_unintended_diff_only": 0.007000005245208741, + "tpp_threshold_500_total_metric": 0.24940000772476195, + "tpp_threshold_500_intended_diff_only": 0.26020002365112305, + "tpp_threshold_500_unintended_diff_only": 0.010800015926361085 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results", + "tpp_threshold_2_total_metric": 0.0023499876260757446, + "tpp_threshold_2_intended_diff_only": 0.005399978160858155, + "tpp_threshold_2_unintended_diff_only": 0.0030499905347824096, + "tpp_threshold_5_total_metric": 0.0028499990701675417, + "tpp_threshold_5_intended_diff_only": 0.005599987506866455, + "tpp_threshold_5_unintended_diff_only": 0.0027499884366989137, + "tpp_threshold_10_total_metric": 0.010199987888336181, + "tpp_threshold_10_intended_diff_only": 0.0151999831199646, + "tpp_threshold_10_unintended_diff_only": 0.004999995231628418, + "tpp_threshold_20_total_metric": 0.016150003671646117, + "tpp_threshold_20_intended_diff_only": 0.021599996089935302, + "tpp_threshold_20_unintended_diff_only": 0.005449992418289184, + "tpp_threshold_50_total_metric": 0.03375000655651093, + "tpp_threshold_50_intended_diff_only": 0.040199995040893555, + "tpp_threshold_50_unintended_diff_only": 0.00644998848438263, + "tpp_threshold_100_total_metric": 0.06395000517368317, + "tpp_threshold_100_intended_diff_only": 0.07480000257492066, + "tpp_threshold_100_unintended_diff_only": 0.010849997401237488, + "tpp_threshold_500_total_metric": 0.18460001051425934, + "tpp_threshold_500_intended_diff_only": 0.20240000486373902, + "tpp_threshold_500_unintended_diff_only": 0.017799994349479674 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": { + "LabHC/bias_in_bios_class_set1": { + "0": { + "tpp_threshold_2_total_metric": 0.0072500258684158325, + "tpp_threshold_2_intended_diff_only": 0.01100003719329834, + "tpp_threshold_2_unintended_diff_only": 0.0037500113248825073, + "tpp_threshold_5_total_metric": 0.016750022768974304, + "tpp_threshold_5_intended_diff_only": 0.021000027656555176, + "tpp_threshold_5_unintended_diff_only": 0.004250004887580872, + "tpp_threshold_10_total_metric": 0.012500002980232239, + "tpp_threshold_10_intended_diff_only": 0.017000019550323486, + "tpp_threshold_10_unintended_diff_only": 0.0045000165700912476, + "tpp_threshold_20_total_metric": 0.030749991536140442, + "tpp_threshold_20_intended_diff_only": 0.03299999237060547, + "tpp_threshold_20_unintended_diff_only": 0.002250000834465027, + "tpp_threshold_50_total_metric": 0.04200001060962677, + "tpp_threshold_50_intended_diff_only": 0.04500001668930054, + "tpp_threshold_50_unintended_diff_only": 0.003000006079673767, + "tpp_threshold_100_total_metric": 0.08250001072883606, + "tpp_threshold_100_intended_diff_only": 0.10199999809265137, + "tpp_threshold_100_unintended_diff_only": 0.019499987363815308, + "tpp_threshold_500_total_metric": 0.3044999986886978, + "tpp_threshold_500_intended_diff_only": 0.328000009059906, + "tpp_threshold_500_unintended_diff_only": 0.02350001037120819 + }, + "1": { + "tpp_threshold_2_total_metric": 0.006250038743019104, + "tpp_threshold_2_intended_diff_only": 0.006000041961669922, + "tpp_threshold_2_unintended_diff_only": -0.00024999678134918213, + "tpp_threshold_5_total_metric": 0.0077500492334365845, + "tpp_threshold_5_intended_diff_only": 0.010000050067901611, + "tpp_threshold_5_unintended_diff_only": 0.002250000834465027, + "tpp_threshold_10_total_metric": 0.012000024318695068, + "tpp_threshold_10_intended_diff_only": 0.012000024318695068, + "tpp_threshold_10_unintended_diff_only": 0.0, + "tpp_threshold_20_total_metric": 0.005749985575675964, + "tpp_threshold_20_intended_diff_only": 0.009000003337860107, + "tpp_threshold_20_unintended_diff_only": 0.003250017762184143, + "tpp_threshold_50_total_metric": 0.015250056982040405, + "tpp_threshold_50_intended_diff_only": 0.01900005340576172, + "tpp_threshold_50_unintended_diff_only": 0.0037499964237213135, + "tpp_threshold_100_total_metric": 0.026749998331069946, + "tpp_threshold_100_intended_diff_only": 0.03100001811981201, + "tpp_threshold_100_unintended_diff_only": 0.004250019788742065, + "tpp_threshold_500_total_metric": 0.14800003170967102, + "tpp_threshold_500_intended_diff_only": 0.15400004386901855, + "tpp_threshold_500_unintended_diff_only": 0.006000012159347534 + }, + "2": { + "tpp_threshold_2_total_metric": -0.0032500028610229492, + "tpp_threshold_2_intended_diff_only": 0.0, + "tpp_threshold_2_unintended_diff_only": 0.0032500028610229492, + "tpp_threshold_5_total_metric": 0.0007499754428863525, + "tpp_threshold_5_intended_diff_only": 0.004999995231628418, + "tpp_threshold_5_unintended_diff_only": 0.004250019788742065, + "tpp_threshold_10_total_metric": 0.02400001883506775, + "tpp_threshold_10_intended_diff_only": 0.026000022888183594, + "tpp_threshold_10_unintended_diff_only": 0.0020000040531158447, + "tpp_threshold_20_total_metric": 0.03149998188018799, + "tpp_threshold_20_intended_diff_only": 0.03299999237060547, + "tpp_threshold_20_unintended_diff_only": 0.0015000104904174805, + "tpp_threshold_50_total_metric": 0.05425000190734863, + "tpp_threshold_50_intended_diff_only": 0.0559999942779541, + "tpp_threshold_50_unintended_diff_only": 0.0017499923706054688, + "tpp_threshold_100_total_metric": 0.07874995470046997, + "tpp_threshold_100_intended_diff_only": 0.07999998331069946, + "tpp_threshold_100_unintended_diff_only": 0.0012500286102294922, + "tpp_threshold_500_total_metric": 0.2527500092983246, + "tpp_threshold_500_intended_diff_only": 0.25700002908706665, + "tpp_threshold_500_unintended_diff_only": 0.004250019788742065 + }, + "6": { + "tpp_threshold_2_total_metric": 0.001999989151954651, + "tpp_threshold_2_intended_diff_only": 0.0040000081062316895, + "tpp_threshold_2_unintended_diff_only": 0.0020000189542770386, + "tpp_threshold_5_total_metric": 0.002750024199485779, + "tpp_threshold_5_intended_diff_only": 0.0020000338554382324, + "tpp_threshold_5_unintended_diff_only": -0.0007499903440475464, + "tpp_threshold_10_total_metric": 0.004749983549118042, + "tpp_threshold_10_intended_diff_only": 0.0059999823570251465, + "tpp_threshold_10_unintended_diff_only": 0.0012499988079071045, + "tpp_threshold_20_total_metric": 0.004999995231628418, + "tpp_threshold_20_intended_diff_only": 0.0040000081062316895, + "tpp_threshold_20_unintended_diff_only": -0.0009999871253967285, + "tpp_threshold_50_total_metric": 0.010000035166740417, + "tpp_threshold_50_intended_diff_only": 0.01100003719329834, + "tpp_threshold_50_unintended_diff_only": 0.0010000020265579224, + "tpp_threshold_100_total_metric": 0.03125004470348358, + "tpp_threshold_100_intended_diff_only": 0.03400003910064697, + "tpp_threshold_100_unintended_diff_only": 0.002749994397163391, + "tpp_threshold_500_total_metric": 0.19425003230571747, + "tpp_threshold_500_intended_diff_only": 0.20100003480911255, + "tpp_threshold_500_unintended_diff_only": 0.006750002503395081 + }, + "9": { + "tpp_threshold_2_total_metric": 0.004499956965446472, + "tpp_threshold_2_intended_diff_only": 0.006999969482421875, + "tpp_threshold_2_unintended_diff_only": 0.002500012516975403, + "tpp_threshold_5_total_metric": 0.004749968647956848, + "tpp_threshold_5_intended_diff_only": 0.009999990463256836, + "tpp_threshold_5_unintended_diff_only": 0.005250021815299988, + "tpp_threshold_10_total_metric": 0.003999993205070496, + "tpp_threshold_10_intended_diff_only": 0.009999990463256836, + "tpp_threshold_10_unintended_diff_only": 0.00599999725818634, + "tpp_threshold_20_total_metric": 0.025749966502189636, + "tpp_threshold_20_intended_diff_only": 0.03299999237060547, + "tpp_threshold_20_unintended_diff_only": 0.0072500258684158325, + "tpp_threshold_50_total_metric": 0.06749999523162842, + "tpp_threshold_50_intended_diff_only": 0.07300001382827759, + "tpp_threshold_50_unintended_diff_only": 0.00550001859664917, + "tpp_threshold_100_total_metric": 0.11675001680850983, + "tpp_threshold_100_intended_diff_only": 0.12400001287460327, + "tpp_threshold_100_unintended_diff_only": 0.007249996066093445, + "tpp_threshold_500_total_metric": 0.3474999666213989, + "tpp_threshold_500_intended_diff_only": 0.3610000014305115, + "tpp_threshold_500_unintended_diff_only": 0.013500034809112549 + } + }, + "canrager/amazon_reviews_mcauley_1and5": { + "1": { + "tpp_threshold_2_total_metric": 0.0037499666213989258, + "tpp_threshold_2_intended_diff_only": 0.007999956607818604, + "tpp_threshold_2_unintended_diff_only": 0.004249989986419678, + "tpp_threshold_5_total_metric": -0.00025004148483276367, + "tpp_threshold_5_intended_diff_only": 0.003999948501586914, + "tpp_threshold_5_unintended_diff_only": 0.004249989986419678, + "tpp_threshold_10_total_metric": 0.0017500072717666626, + "tpp_threshold_10_intended_diff_only": 0.009000003337860107, + "tpp_threshold_10_unintended_diff_only": 0.007249996066093445, + "tpp_threshold_20_total_metric": -0.000250011682510376, + "tpp_threshold_20_intended_diff_only": 0.0059999823570251465, + "tpp_threshold_20_unintended_diff_only": 0.0062499940395355225, + "tpp_threshold_50_total_metric": 0.010500013828277588, + "tpp_threshold_50_intended_diff_only": 0.013999998569488525, + "tpp_threshold_50_unintended_diff_only": 0.0034999847412109375, + "tpp_threshold_100_total_metric": 0.004749983549118042, + "tpp_threshold_100_intended_diff_only": 0.014999985694885254, + "tpp_threshold_100_unintended_diff_only": 0.010250002145767212, + "tpp_threshold_500_total_metric": 0.08899998664855957, + "tpp_threshold_500_intended_diff_only": 0.09799998998641968, + "tpp_threshold_500_unintended_diff_only": 0.009000003337860107 + }, + "2": { + "tpp_threshold_2_total_metric": 0.0014999806880950928, + "tpp_threshold_2_intended_diff_only": 0.001999974250793457, + "tpp_threshold_2_unintended_diff_only": 0.0004999935626983643, + "tpp_threshold_5_total_metric": -0.002499997615814209, + "tpp_threshold_5_intended_diff_only": 0.004999995231628418, + "tpp_threshold_5_unintended_diff_only": 0.007499992847442627, + "tpp_threshold_10_total_metric": 0.01099996268749237, + "tpp_threshold_10_intended_diff_only": 0.01699995994567871, + "tpp_threshold_10_unintended_diff_only": 0.00599999725818634, + "tpp_threshold_20_total_metric": 0.011499986052513123, + "tpp_threshold_20_intended_diff_only": 0.014999985694885254, + "tpp_threshold_20_unintended_diff_only": 0.0034999996423721313, + "tpp_threshold_50_total_metric": 0.03499998152256012, + "tpp_threshold_50_intended_diff_only": 0.042999982833862305, + "tpp_threshold_50_unintended_diff_only": 0.008000001311302185, + "tpp_threshold_100_total_metric": 0.04949997365474701, + "tpp_threshold_100_intended_diff_only": 0.06299996376037598, + "tpp_threshold_100_unintended_diff_only": 0.013499990105628967, + "tpp_threshold_500_total_metric": 0.1442500203847885, + "tpp_threshold_500_intended_diff_only": 0.16100001335144043, + "tpp_threshold_500_unintended_diff_only": 0.016749992966651917 + }, + "3": { + "tpp_threshold_2_total_metric": -0.010000020265579224, + "tpp_threshold_2_intended_diff_only": -0.00700002908706665, + "tpp_threshold_2_unintended_diff_only": 0.0029999911785125732, + "tpp_threshold_5_total_metric": 0.006999999284744263, + "tpp_threshold_5_intended_diff_only": 0.004999995231628418, + "tpp_threshold_5_unintended_diff_only": -0.0020000040531158447, + "tpp_threshold_10_total_metric": 0.013249978423118591, + "tpp_threshold_10_intended_diff_only": 0.014999985694885254, + "tpp_threshold_10_unintended_diff_only": 0.0017500072717666626, + "tpp_threshold_20_total_metric": 0.0052499920129776, + "tpp_threshold_20_intended_diff_only": 0.009999990463256836, + "tpp_threshold_20_unintended_diff_only": 0.004749998450279236, + "tpp_threshold_50_total_metric": 0.02750001847743988, + "tpp_threshold_50_intended_diff_only": 0.03100001811981201, + "tpp_threshold_50_unintended_diff_only": 0.0034999996423721313, + "tpp_threshold_100_total_metric": 0.04750002920627594, + "tpp_threshold_100_intended_diff_only": 0.054000020027160645, + "tpp_threshold_100_unintended_diff_only": 0.006499990820884705, + "tpp_threshold_500_total_metric": 0.13974998891353607, + "tpp_threshold_500_intended_diff_only": 0.1629999876022339, + "tpp_threshold_500_unintended_diff_only": 0.023249998688697815 + }, + "5": { + "tpp_threshold_2_total_metric": 0.00025004148483276367, + "tpp_threshold_2_intended_diff_only": 0.003000020980834961, + "tpp_threshold_2_unintended_diff_only": 0.0027499794960021973, + "tpp_threshold_5_total_metric": 0.0012500584125518799, + "tpp_threshold_5_intended_diff_only": 0.00700002908706665, + "tpp_threshold_5_unintended_diff_only": 0.0057499706745147705, + "tpp_threshold_10_total_metric": -0.004249989986419678, + "tpp_threshold_10_intended_diff_only": 0.0040000081062316895, + "tpp_threshold_10_unintended_diff_only": 0.008249998092651367, + "tpp_threshold_20_total_metric": 0.020750045776367188, + "tpp_threshold_20_intended_diff_only": 0.030000030994415283, + "tpp_threshold_20_unintended_diff_only": 0.009249985218048096, + "tpp_threshold_50_total_metric": 0.031500041484832764, + "tpp_threshold_50_intended_diff_only": 0.04000002145767212, + "tpp_threshold_50_unintended_diff_only": 0.008499979972839355, + "tpp_threshold_100_total_metric": 0.11225005984306335, + "tpp_threshold_100_intended_diff_only": 0.1300000548362732, + "tpp_threshold_100_unintended_diff_only": 0.01774999499320984, + "tpp_threshold_500_total_metric": 0.24350006878376007, + "tpp_threshold_500_intended_diff_only": 0.2690000534057617, + "tpp_threshold_500_unintended_diff_only": 0.025499984622001648 + }, + "6": { + "tpp_threshold_2_total_metric": 0.016249969601631165, + "tpp_threshold_2_intended_diff_only": 0.0209999680519104, + "tpp_threshold_2_unintended_diff_only": 0.004749998450279236, + "tpp_threshold_5_total_metric": 0.008749976754188538, + "tpp_threshold_5_intended_diff_only": 0.006999969482421875, + "tpp_threshold_5_unintended_diff_only": -0.0017500072717666626, + "tpp_threshold_10_total_metric": 0.02924998104572296, + "tpp_threshold_10_intended_diff_only": 0.030999958515167236, + "tpp_threshold_10_unintended_diff_only": 0.001749977469444275, + "tpp_threshold_20_total_metric": 0.04350000619888306, + "tpp_threshold_20_intended_diff_only": 0.046999990940093994, + "tpp_threshold_20_unintended_diff_only": 0.0034999847412109375, + "tpp_threshold_50_total_metric": 0.06424997746944427, + "tpp_threshold_50_intended_diff_only": 0.07299995422363281, + "tpp_threshold_50_unintended_diff_only": 0.008749976754188538, + "tpp_threshold_100_total_metric": 0.10574997961521149, + "tpp_threshold_100_intended_diff_only": 0.1119999885559082, + "tpp_threshold_100_unintended_diff_only": 0.006250008940696716, + "tpp_threshold_500_total_metric": 0.30649998784065247, + "tpp_threshold_500_intended_diff_only": 0.32099997997283936, + "tpp_threshold_500_unintended_diff_only": 0.01449999213218689 + } + } + } +} \ No newline at end of file diff --git a/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..93f694819bf21bdcb43e2aa771210aaee61e7d91 --- /dev/null +++ b/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,414 @@ +{ + "eval_type_id": "tpp", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "canrager/amazon_reviews_mcauley_1and5" + ], + "perform_scr": false, + "early_stopping_patience": 20, + "train_set_size": 4000, + "test_set_size": 1000, + "context_length": 128, + "probe_train_batch_size": 16, + "probe_test_batch_size": 500, + "probe_epochs": 20, + "probe_lr": 0.001, + "probe_l1_penalty": 0.001, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "lower_vram_usage": false, + "model_name": "gemma-2-2b", + "n_values": [ + 2, + 5, + 10, + 20, + 50, + 100, + 500 + ], + "column1_vals_lookup": { + "LabHC/bias_in_bios_class_set1": [ + [ + "professor", + "nurse" + ], + [ + "architect", + "journalist" + ], + [ + "surgeon", + "psychologist" + ], + [ + "attorney", + "teacher" + ] + ], + "canrager/amazon_reviews_mcauley_1and5": [ + [ + "Books", + "CDs_and_Vinyl" + ], + [ + "Software", + "Electronics" + ], + [ + "Pet_Supplies", + "Office_Products" + ], + [ + "Industrial_and_Scientific", + "Toys_and_Games" + ] + ] + } + }, + "eval_id": "0cf78e95-df49-4332-8aa4-30ee2712619b", + "datetime_epoch_millis": 1740163430683, + "eval_result_metrics": { + "tpp_metrics": { + "tpp_threshold_2_total_metric": 0.001475006341934204, + "tpp_threshold_2_intended_diff_only": 0.003700006008148193, + "tpp_threshold_2_unintended_diff_only": 0.0022249996662139894, + "tpp_threshold_5_total_metric": 0.0010999992489814758, + "tpp_threshold_5_intended_diff_only": 0.003600001335144043, + "tpp_threshold_5_unintended_diff_only": 0.002500002086162567, + "tpp_threshold_10_total_metric": 0.004625001549720764, + "tpp_threshold_10_intended_diff_only": 0.00790000557899475, + "tpp_threshold_10_unintended_diff_only": 0.0032750040292739866, + "tpp_threshold_20_total_metric": 0.008000005781650544, + "tpp_threshold_20_intended_diff_only": 0.01170000433921814, + "tpp_threshold_20_unintended_diff_only": 0.003699998557567596, + "tpp_threshold_50_total_metric": 0.024850000441074372, + "tpp_threshold_50_intended_diff_only": 0.03130000233650208, + "tpp_threshold_50_unintended_diff_only": 0.0064500018954277046, + "tpp_threshold_100_total_metric": 0.042700006067752844, + "tpp_threshold_100_intended_diff_only": 0.05190000534057617, + "tpp_threshold_100_unintended_diff_only": 0.009199999272823334, + "tpp_threshold_500_total_metric": 0.19910001307725905, + "tpp_threshold_500_intended_diff_only": 0.21640000939369203, + "tpp_threshold_500_unintended_diff_only": 0.017299996316432954 + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results", + "tpp_threshold_2_total_metric": 0.003000003099441528, + "tpp_threshold_2_intended_diff_only": 0.004600012302398681, + "tpp_threshold_2_unintended_diff_only": 0.0016000092029571534, + "tpp_threshold_5_total_metric": 0.0025999963283538817, + "tpp_threshold_5_intended_diff_only": 0.005000007152557373, + "tpp_threshold_5_unintended_diff_only": 0.002400010824203491, + "tpp_threshold_10_total_metric": 0.005449992418289184, + "tpp_threshold_10_intended_diff_only": 0.007800006866455078, + "tpp_threshold_10_unintended_diff_only": 0.0023500144481658934, + "tpp_threshold_20_total_metric": 0.010450014472007751, + "tpp_threshold_20_intended_diff_only": 0.012800014019012452, + "tpp_threshold_20_unintended_diff_only": 0.0023499995470046995, + "tpp_threshold_50_total_metric": 0.01789999008178711, + "tpp_threshold_50_intended_diff_only": 0.02080000638961792, + "tpp_threshold_50_unintended_diff_only": 0.0029000163078308104, + "tpp_threshold_100_total_metric": 0.0331000030040741, + "tpp_threshold_100_intended_diff_only": 0.036400008201599124, + "tpp_threshold_100_unintended_diff_only": 0.0033000051975250245, + "tpp_threshold_500_total_metric": 0.19535001814365388, + "tpp_threshold_500_intended_diff_only": 0.20200002193450928, + "tpp_threshold_500_unintended_diff_only": 0.006650003790855408 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results", + "tpp_threshold_2_total_metric": -4.999041557312012e-05, + "tpp_threshold_2_intended_diff_only": 0.002799999713897705, + "tpp_threshold_2_unintended_diff_only": 0.0028499901294708253, + "tpp_threshold_5_total_metric": -0.0003999978303909302, + "tpp_threshold_5_intended_diff_only": 0.002199995517730713, + "tpp_threshold_5_unintended_diff_only": 0.002599993348121643, + "tpp_threshold_10_total_metric": 0.0038000106811523437, + "tpp_threshold_10_intended_diff_only": 0.008000004291534423, + "tpp_threshold_10_unintended_diff_only": 0.00419999361038208, + "tpp_threshold_20_total_metric": 0.005549997091293335, + "tpp_threshold_20_intended_diff_only": 0.010599994659423828, + "tpp_threshold_20_unintended_diff_only": 0.005049997568130493, + "tpp_threshold_50_total_metric": 0.031800010800361635, + "tpp_threshold_50_intended_diff_only": 0.04179999828338623, + "tpp_threshold_50_unintended_diff_only": 0.009999987483024598, + "tpp_threshold_100_total_metric": 0.05230000913143158, + "tpp_threshold_100_intended_diff_only": 0.06740000247955322, + "tpp_threshold_100_unintended_diff_only": 0.015099993348121643, + "tpp_threshold_500_total_metric": 0.20285000801086425, + "tpp_threshold_500_intended_diff_only": 0.23079999685287475, + "tpp_threshold_500_unintended_diff_only": 0.027949988842010498 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": { + "LabHC/bias_in_bios_class_set1": { + "0": { + "tpp_threshold_2_total_metric": 0.008750036358833313, + "tpp_threshold_2_intended_diff_only": 0.01100003719329834, + "tpp_threshold_2_unintended_diff_only": 0.002250000834465027, + "tpp_threshold_5_total_metric": 0.012000024318695068, + "tpp_threshold_5_intended_diff_only": 0.01500004529953003, + "tpp_threshold_5_unintended_diff_only": 0.003000020980834961, + "tpp_threshold_10_total_metric": 0.006749972701072693, + "tpp_threshold_10_intended_diff_only": 0.009999990463256836, + "tpp_threshold_10_unintended_diff_only": 0.003250017762184143, + "tpp_threshold_20_total_metric": 0.01975002884864807, + "tpp_threshold_20_intended_diff_only": 0.022000014781951904, + "tpp_threshold_20_unintended_diff_only": 0.002249985933303833, + "tpp_threshold_50_total_metric": 0.03349998593330383, + "tpp_threshold_50_intended_diff_only": 0.03700000047683716, + "tpp_threshold_50_unintended_diff_only": 0.003500014543533325, + "tpp_threshold_100_total_metric": 0.06024999916553497, + "tpp_threshold_100_intended_diff_only": 0.06400001049041748, + "tpp_threshold_100_unintended_diff_only": 0.0037500113248825073, + "tpp_threshold_500_total_metric": 0.3145000487565994, + "tpp_threshold_500_intended_diff_only": 0.3200000524520874, + "tpp_threshold_500_unintended_diff_only": 0.005500003695487976 + }, + "1": { + "tpp_threshold_2_total_metric": 0.0055000633001327515, + "tpp_threshold_2_intended_diff_only": 0.005000054836273193, + "tpp_threshold_2_unintended_diff_only": -0.0005000084638595581, + "tpp_threshold_5_total_metric": -0.0010000020265579224, + "tpp_threshold_5_intended_diff_only": 0.0, + "tpp_threshold_5_unintended_diff_only": 0.0010000020265579224, + "tpp_threshold_10_total_metric": 0.0022500455379486084, + "tpp_threshold_10_intended_diff_only": 0.001000046730041504, + "tpp_threshold_10_unintended_diff_only": -0.0012499988079071045, + "tpp_threshold_20_total_metric": 0.0002500265836715698, + "tpp_threshold_20_intended_diff_only": 0.003000020980834961, + "tpp_threshold_20_unintended_diff_only": 0.002749994397163391, + "tpp_threshold_50_total_metric": 0.004500031471252441, + "tpp_threshold_50_intended_diff_only": 0.00700002908706665, + "tpp_threshold_50_unintended_diff_only": 0.002499997615814209, + "tpp_threshold_100_total_metric": 0.014500007033348083, + "tpp_threshold_100_intended_diff_only": 0.018000006675720215, + "tpp_threshold_100_unintended_diff_only": 0.0034999996423721313, + "tpp_threshold_500_total_metric": 0.13175006210803986, + "tpp_threshold_500_intended_diff_only": 0.1350000500679016, + "tpp_threshold_500_unintended_diff_only": 0.0032499879598617554 + }, + "2": { + "tpp_threshold_2_total_metric": -0.0005000531673431396, + "tpp_threshold_2_intended_diff_only": 0.001999974250793457, + "tpp_threshold_2_unintended_diff_only": 0.0025000274181365967, + "tpp_threshold_5_total_metric": -1.4901161193847656e-08, + "tpp_threshold_5_intended_diff_only": 0.004999995231628418, + "tpp_threshold_5_unintended_diff_only": 0.005000010132789612, + "tpp_threshold_10_total_metric": 0.015000000596046448, + "tpp_threshold_10_intended_diff_only": 0.017000019550323486, + "tpp_threshold_10_unintended_diff_only": 0.0020000189542770386, + "tpp_threshold_20_total_metric": 0.02025000751018524, + "tpp_threshold_20_intended_diff_only": 0.022000014781951904, + "tpp_threshold_20_unintended_diff_only": 0.0017500072717666626, + "tpp_threshold_50_total_metric": 0.025249987840652466, + "tpp_threshold_50_intended_diff_only": 0.027000010013580322, + "tpp_threshold_50_unintended_diff_only": 0.0017500221729278564, + "tpp_threshold_100_total_metric": 0.04474999010562897, + "tpp_threshold_100_intended_diff_only": 0.046999990940093994, + "tpp_threshold_100_unintended_diff_only": 0.002250000834465027, + "tpp_threshold_500_total_metric": 0.30299998819828033, + "tpp_threshold_500_intended_diff_only": 0.3100000023841858, + "tpp_threshold_500_unintended_diff_only": 0.0070000141859054565 + }, + "6": { + "tpp_threshold_2_total_metric": -0.0015000104904174805, + "tpp_threshold_2_intended_diff_only": 0.0, + "tpp_threshold_2_unintended_diff_only": 0.0015000104904174805, + "tpp_threshold_5_total_metric": 0.000250011682510376, + "tpp_threshold_5_intended_diff_only": -0.0009999871253967285, + "tpp_threshold_5_unintended_diff_only": -0.0012499988079071045, + "tpp_threshold_10_total_metric": 0.002249985933303833, + "tpp_threshold_10_intended_diff_only": 0.0040000081062316895, + "tpp_threshold_10_unintended_diff_only": 0.0017500221729278564, + "tpp_threshold_20_total_metric": 0.00475001335144043, + "tpp_threshold_20_intended_diff_only": 0.003000020980834961, + "tpp_threshold_20_unintended_diff_only": -0.0017499923706054688, + "tpp_threshold_50_total_metric": 0.004249989986419678, + "tpp_threshold_50_intended_diff_only": 0.0040000081062316895, + "tpp_threshold_50_unintended_diff_only": -0.0002499818801879883, + "tpp_threshold_100_total_metric": 0.0037500113248825073, + "tpp_threshold_100_intended_diff_only": 0.003000020980834961, + "tpp_threshold_100_unintended_diff_only": -0.0007499903440475464, + "tpp_threshold_500_total_metric": 0.030749976634979248, + "tpp_threshold_500_intended_diff_only": 0.03799998760223389, + "tpp_threshold_500_unintended_diff_only": 0.007250010967254639 + }, + "9": { + "tpp_threshold_2_total_metric": 0.0027499794960021973, + "tpp_threshold_2_intended_diff_only": 0.004999995231628418, + "tpp_threshold_2_unintended_diff_only": 0.0022500157356262207, + "tpp_threshold_5_total_metric": 0.001749962568283081, + "tpp_threshold_5_intended_diff_only": 0.0059999823570251465, + "tpp_threshold_5_unintended_diff_only": 0.004250019788742065, + "tpp_threshold_10_total_metric": 0.0009999573230743408, + "tpp_threshold_10_intended_diff_only": 0.006999969482421875, + "tpp_threshold_10_unintended_diff_only": 0.006000012159347534, + "tpp_threshold_20_total_metric": 0.007249996066093445, + "tpp_threshold_20_intended_diff_only": 0.013999998569488525, + "tpp_threshold_20_unintended_diff_only": 0.006750002503395081, + "tpp_threshold_50_total_metric": 0.02199995517730713, + "tpp_threshold_50_intended_diff_only": 0.02899998426437378, + "tpp_threshold_50_unintended_diff_only": 0.00700002908706665, + "tpp_threshold_100_total_metric": 0.04225000739097595, + "tpp_threshold_100_intended_diff_only": 0.050000011920928955, + "tpp_threshold_100_unintended_diff_only": 0.007750004529953003, + "tpp_threshold_500_total_metric": 0.19675001502037048, + "tpp_threshold_500_intended_diff_only": 0.2070000171661377, + "tpp_threshold_500_unintended_diff_only": 0.010250002145767212 + } + }, + "canrager/amazon_reviews_mcauley_1and5": { + "1": { + "tpp_threshold_2_total_metric": 0.007499992847442627, + "tpp_threshold_2_intended_diff_only": 0.010999977588653564, + "tpp_threshold_2_unintended_diff_only": 0.0034999847412109375, + "tpp_threshold_5_total_metric": 0.004249975085258484, + "tpp_threshold_5_intended_diff_only": 0.0059999823570251465, + "tpp_threshold_5_unintended_diff_only": 0.0017500072717666626, + "tpp_threshold_10_total_metric": 0.0022500157356262207, + "tpp_threshold_10_intended_diff_only": 0.009000003337860107, + "tpp_threshold_10_unintended_diff_only": 0.006749987602233887, + "tpp_threshold_20_total_metric": 0.00024999678134918213, + "tpp_threshold_20_intended_diff_only": 0.004999995231628418, + "tpp_threshold_20_unintended_diff_only": 0.004749998450279236, + "tpp_threshold_50_total_metric": 0.010250002145767212, + "tpp_threshold_50_intended_diff_only": 0.013999998569488525, + "tpp_threshold_50_unintended_diff_only": 0.0037499964237213135, + "tpp_threshold_100_total_metric": 0.008000016212463379, + "tpp_threshold_100_intended_diff_only": 0.018000006675720215, + "tpp_threshold_100_unintended_diff_only": 0.009999990463256836, + "tpp_threshold_500_total_metric": 0.08074997365474701, + "tpp_threshold_500_intended_diff_only": 0.09099996089935303, + "tpp_threshold_500_unintended_diff_only": 0.010249987244606018 + }, + "2": { + "tpp_threshold_2_total_metric": 0.0004999935626983643, + "tpp_threshold_2_intended_diff_only": 0.0009999871253967285, + "tpp_threshold_2_unintended_diff_only": 0.0004999935626983643, + "tpp_threshold_5_total_metric": -0.0070000141859054565, + "tpp_threshold_5_intended_diff_only": 0.0009999871253967285, + "tpp_threshold_5_unintended_diff_only": 0.008000001311302185, + "tpp_threshold_10_total_metric": 0.002249971032142639, + "tpp_threshold_10_intended_diff_only": 0.007999956607818604, + "tpp_threshold_10_unintended_diff_only": 0.005749985575675964, + "tpp_threshold_20_total_metric": 0.0029999762773513794, + "tpp_threshold_20_intended_diff_only": 0.0059999823570251465, + "tpp_threshold_20_unintended_diff_only": 0.003000006079673767, + "tpp_threshold_50_total_metric": 0.016999974846839905, + "tpp_threshold_50_intended_diff_only": 0.02599996328353882, + "tpp_threshold_50_unintended_diff_only": 0.008999988436698914, + "tpp_threshold_100_total_metric": 0.028250008821487427, + "tpp_threshold_100_intended_diff_only": 0.041999995708465576, + "tpp_threshold_100_unintended_diff_only": 0.01374998688697815, + "tpp_threshold_500_total_metric": 0.19774997234344482, + "tpp_threshold_500_intended_diff_only": 0.21999996900558472, + "tpp_threshold_500_unintended_diff_only": 0.022249996662139893 + }, + "3": { + "tpp_threshold_2_total_metric": -0.008249998092651367, + "tpp_threshold_2_intended_diff_only": -0.004999995231628418, + "tpp_threshold_2_unintended_diff_only": 0.0032500028610229492, + "tpp_threshold_5_total_metric": 0.000250011682510376, + "tpp_threshold_5_intended_diff_only": 0.0, + "tpp_threshold_5_unintended_diff_only": -0.000250011682510376, + "tpp_threshold_10_total_metric": 0.0072500258684158325, + "tpp_threshold_10_intended_diff_only": 0.008000016212463379, + "tpp_threshold_10_unintended_diff_only": 0.0007499903440475464, + "tpp_threshold_20_total_metric": -0.0015000253915786743, + "tpp_threshold_20_intended_diff_only": 0.001999974250793457, + "tpp_threshold_20_unintended_diff_only": 0.0034999996423721313, + "tpp_threshold_50_total_metric": 0.012000039219856262, + "tpp_threshold_50_intended_diff_only": 0.013000011444091797, + "tpp_threshold_50_unintended_diff_only": 0.0009999722242355347, + "tpp_threshold_100_total_metric": 0.02925001084804535, + "tpp_threshold_100_intended_diff_only": 0.04100000858306885, + "tpp_threshold_100_unintended_diff_only": 0.011749997735023499, + "tpp_threshold_500_total_metric": 0.1612500250339508, + "tpp_threshold_500_intended_diff_only": 0.2160000205039978, + "tpp_threshold_500_unintended_diff_only": 0.054749995470047 + }, + "5": { + "tpp_threshold_2_total_metric": -0.006749927997589111, + "tpp_threshold_2_intended_diff_only": -0.003999948501586914, + "tpp_threshold_2_unintended_diff_only": 0.0027499794960021973, + "tpp_threshold_5_total_metric": -0.006249949336051941, + "tpp_threshold_5_intended_diff_only": -0.001999974250793457, + "tpp_threshold_5_unintended_diff_only": 0.004249975085258484, + "tpp_threshold_10_total_metric": -0.008999958634376526, + "tpp_threshold_10_intended_diff_only": -0.0029999613761901855, + "tpp_threshold_10_unintended_diff_only": 0.00599999725818634, + "tpp_threshold_20_total_metric": -0.009499937295913696, + "tpp_threshold_20_intended_diff_only": 0.001000046730041504, + "tpp_threshold_20_unintended_diff_only": 0.0104999840259552, + "tpp_threshold_50_total_metric": 0.05525003373622894, + "tpp_threshold_50_intended_diff_only": 0.0820000171661377, + "tpp_threshold_50_unintended_diff_only": 0.026749983429908752, + "tpp_threshold_100_total_metric": 0.08600002527236938, + "tpp_threshold_100_intended_diff_only": 0.11800003051757812, + "tpp_threshold_100_unintended_diff_only": 0.03200000524520874, + "tpp_threshold_500_total_metric": 0.2552500516176224, + "tpp_threshold_500_intended_diff_only": 0.2940000295639038, + "tpp_threshold_500_unintended_diff_only": 0.03874997794628143 + }, + "6": { + "tpp_threshold_2_total_metric": 0.006749987602233887, + "tpp_threshold_2_intended_diff_only": 0.010999977588653564, + "tpp_threshold_2_unintended_diff_only": 0.004249989986419678, + "tpp_threshold_5_total_metric": 0.006749987602233887, + "tpp_threshold_5_intended_diff_only": 0.0059999823570251465, + "tpp_threshold_5_unintended_diff_only": -0.0007500052452087402, + "tpp_threshold_10_total_metric": 0.016249999403953552, + "tpp_threshold_10_intended_diff_only": 0.018000006675720215, + "tpp_threshold_10_unintended_diff_only": 0.0017500072717666626, + "tpp_threshold_20_total_metric": 0.035499975085258484, + "tpp_threshold_20_intended_diff_only": 0.038999974727630615, + "tpp_threshold_20_unintended_diff_only": 0.0034999996423721313, + "tpp_threshold_50_total_metric": 0.06450000405311584, + "tpp_threshold_50_intended_diff_only": 0.07400000095367432, + "tpp_threshold_50_unintended_diff_only": 0.009499996900558472, + "tpp_threshold_100_total_metric": 0.10999998450279236, + "tpp_threshold_100_intended_diff_only": 0.11799997091293335, + "tpp_threshold_100_unintended_diff_only": 0.007999986410140991, + "tpp_threshold_500_total_metric": 0.3192500174045563, + "tpp_threshold_500_intended_diff_only": 0.3330000042915344, + "tpp_threshold_500_unintended_diff_only": 0.01374998688697815 + } + } + } +} \ No newline at end of file diff --git a/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..afc02c1aa1c129575e7328ed260fc277df510174 --- /dev/null +++ b/eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,414 @@ +{ + "eval_type_id": "tpp", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "LabHC/bias_in_bios_class_set1", + "canrager/amazon_reviews_mcauley_1and5" + ], + "perform_scr": false, + "early_stopping_patience": 20, + "train_set_size": 4000, + "test_set_size": 1000, + "context_length": 128, + "probe_train_batch_size": 16, + "probe_test_batch_size": 500, + "probe_epochs": 20, + "probe_lr": 0.001, + "probe_l1_penalty": 0.001, + "sae_batch_size": 125, + "llm_batch_size": 32, + "llm_dtype": "bfloat16", + "lower_vram_usage": false, + "model_name": "gemma-2-2b", + "n_values": [ + 2, + 5, + 10, + 20, + 50, + 100, + 500 + ], + "column1_vals_lookup": { + "LabHC/bias_in_bios_class_set1": [ + [ + "professor", + "nurse" + ], + [ + "architect", + "journalist" + ], + [ + "surgeon", + "psychologist" + ], + [ + "attorney", + "teacher" + ] + ], + "canrager/amazon_reviews_mcauley_1and5": [ + [ + "Books", + "CDs_and_Vinyl" + ], + [ + "Software", + "Electronics" + ], + [ + "Pet_Supplies", + "Office_Products" + ], + [ + "Industrial_and_Scientific", + "Toys_and_Games" + ] + ] + } + }, + "eval_id": "496eec06-563d-45a6-8ad8-e68c3fad1008", + "datetime_epoch_millis": 1740163906732, + "eval_result_metrics": { + "tpp_metrics": { + "tpp_threshold_2_total_metric": 0.003649994730949402, + "tpp_threshold_2_intended_diff_only": 0.006199997663497925, + "tpp_threshold_2_unintended_diff_only": 0.0025500029325485228, + "tpp_threshold_5_total_metric": 0.0050999939441680915, + "tpp_threshold_5_intended_diff_only": 0.007999992370605467, + "tpp_threshold_5_unintended_diff_only": 0.0028999984264373776, + "tpp_threshold_10_total_metric": 0.013499999046325683, + "tpp_threshold_10_intended_diff_only": 0.017399996519088745, + "tpp_threshold_10_unintended_diff_only": 0.0038999974727630614, + "tpp_threshold_20_total_metric": 0.02162499576807022, + "tpp_threshold_20_intended_diff_only": 0.02619999647140503, + "tpp_threshold_20_unintended_diff_only": 0.004575000703334808, + "tpp_threshold_50_total_metric": 0.051074995100498205, + "tpp_threshold_50_intended_diff_only": 0.057199996709823606, + "tpp_threshold_50_unintended_diff_only": 0.006125001609325409, + "tpp_threshold_100_total_metric": 0.09944999665021896, + "tpp_threshold_100_intended_diff_only": 0.10920000076293945, + "tpp_threshold_100_unintended_diff_only": 0.009750004112720489, + "tpp_threshold_500_total_metric": 0.310000017285347, + "tpp_threshold_500_intended_diff_only": 0.3243000149726868, + "tpp_threshold_500_unintended_diff_only": 0.014299997687339782 + } + }, + "eval_result_details": [ + { + "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results", + "tpp_threshold_2_total_metric": 0.004200002551078797, + "tpp_threshold_2_intended_diff_only": 0.006200015544891357, + "tpp_threshold_2_unintended_diff_only": 0.002000012993812561, + "tpp_threshold_5_total_metric": 0.0056500047445297245, + "tpp_threshold_5_intended_diff_only": 0.00840001106262207, + "tpp_threshold_5_unintended_diff_only": 0.002750006318092346, + "tpp_threshold_10_total_metric": 0.01315000057220459, + "tpp_threshold_10_intended_diff_only": 0.015400004386901856, + "tpp_threshold_10_unintended_diff_only": 0.002250003814697266, + "tpp_threshold_20_total_metric": 0.02469998598098755, + "tpp_threshold_20_intended_diff_only": 0.02799999713897705, + "tpp_threshold_20_unintended_diff_only": 0.003300011157989502, + "tpp_threshold_50_total_metric": 0.05049999952316284, + "tpp_threshold_50_intended_diff_only": 0.053800010681152345, + "tpp_threshold_50_unintended_diff_only": 0.003300011157989502, + "tpp_threshold_100_total_metric": 0.09890000522136688, + "tpp_threshold_100_intended_diff_only": 0.103600013256073, + "tpp_threshold_100_unintended_diff_only": 0.004700008034706116, + "tpp_threshold_500_total_metric": 0.3669000148773193, + "tpp_threshold_500_intended_diff_only": 0.37420002222061155, + "tpp_threshold_500_unintended_diff_only": 0.0073000073432922365 + }, + { + "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results", + "tpp_threshold_2_total_metric": 0.0030999869108200074, + "tpp_threshold_2_intended_diff_only": 0.006199979782104492, + "tpp_threshold_2_unintended_diff_only": 0.003099992871284485, + "tpp_threshold_5_total_metric": 0.004549983143806458, + "tpp_threshold_5_intended_diff_only": 0.007599973678588867, + "tpp_threshold_5_unintended_diff_only": 0.0030499905347824096, + "tpp_threshold_10_total_metric": 0.013849997520446777, + "tpp_threshold_10_intended_diff_only": 0.019399988651275634, + "tpp_threshold_10_unintended_diff_only": 0.005549991130828857, + "tpp_threshold_20_total_metric": 0.018550005555152894, + "tpp_threshold_20_intended_diff_only": 0.024399995803833008, + "tpp_threshold_20_unintended_diff_only": 0.005849990248680115, + "tpp_threshold_50_total_metric": 0.05164999067783356, + "tpp_threshold_50_intended_diff_only": 0.06059998273849487, + "tpp_threshold_50_unintended_diff_only": 0.008949992060661317, + "tpp_threshold_100_total_metric": 0.09999998807907104, + "tpp_threshold_100_intended_diff_only": 0.1147999882698059, + "tpp_threshold_100_unintended_diff_only": 0.014800000190734863, + "tpp_threshold_500_total_metric": 0.25310001969337464, + "tpp_threshold_500_intended_diff_only": 0.274400007724762, + "tpp_threshold_500_unintended_diff_only": 0.02129998803138733 + } + ], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": { + "LabHC/bias_in_bios_class_set1": { + "0": { + "tpp_threshold_2_total_metric": 0.006749987602233887, + "tpp_threshold_2_intended_diff_only": 0.009999990463256836, + "tpp_threshold_2_unintended_diff_only": 0.0032500028610229492, + "tpp_threshold_5_total_metric": 0.017000019550323486, + "tpp_threshold_5_intended_diff_only": 0.021000027656555176, + "tpp_threshold_5_unintended_diff_only": 0.0040000081062316895, + "tpp_threshold_10_total_metric": 0.009750038385391235, + "tpp_threshold_10_intended_diff_only": 0.01500004529953003, + "tpp_threshold_10_unintended_diff_only": 0.005250006914138794, + "tpp_threshold_20_total_metric": 0.029749974608421326, + "tpp_threshold_20_intended_diff_only": 0.03299999237060547, + "tpp_threshold_20_unintended_diff_only": 0.003250017762184143, + "tpp_threshold_50_total_metric": 0.046999990940093994, + "tpp_threshold_50_intended_diff_only": 0.050999999046325684, + "tpp_threshold_50_unintended_diff_only": 0.0040000081062316895, + "tpp_threshold_100_total_metric": 0.12275001406669617, + "tpp_threshold_100_intended_diff_only": 0.12800002098083496, + "tpp_threshold_100_unintended_diff_only": 0.005250006914138794, + "tpp_threshold_500_total_metric": 0.398250013589859, + "tpp_threshold_500_intended_diff_only": 0.4020000100135803, + "tpp_threshold_500_unintended_diff_only": 0.0037499964237213135 + }, + "1": { + "tpp_threshold_2_total_metric": 0.0050000399351119995, + "tpp_threshold_2_intended_diff_only": 0.005000054836273193, + "tpp_threshold_2_unintended_diff_only": 1.4901161193847656e-08, + "tpp_threshold_5_total_metric": 0.0025000274181365967, + "tpp_threshold_5_intended_diff_only": 0.003000020980834961, + "tpp_threshold_5_unintended_diff_only": 0.0004999935626983643, + "tpp_threshold_10_total_metric": 0.010499998927116394, + "tpp_threshold_10_intended_diff_only": 0.009000003337860107, + "tpp_threshold_10_unintended_diff_only": -0.0014999955892562866, + "tpp_threshold_20_total_metric": 0.0104999840259552, + "tpp_threshold_20_intended_diff_only": 0.013999998569488525, + "tpp_threshold_20_unintended_diff_only": 0.003500014543533325, + "tpp_threshold_50_total_metric": 0.04275001585483551, + "tpp_threshold_50_intended_diff_only": 0.0480000376701355, + "tpp_threshold_50_unintended_diff_only": 0.005250021815299988, + "tpp_threshold_100_total_metric": 0.0625, + "tpp_threshold_100_intended_diff_only": 0.0690000057220459, + "tpp_threshold_100_unintended_diff_only": 0.0065000057220458984, + "tpp_threshold_500_total_metric": 0.31550003588199615, + "tpp_threshold_500_intended_diff_only": 0.32600003480911255, + "tpp_threshold_500_unintended_diff_only": 0.010499998927116394 + }, + "2": { + "tpp_threshold_2_total_metric": 0.0007500052452087402, + "tpp_threshold_2_intended_diff_only": 0.003000020980834961, + "tpp_threshold_2_unintended_diff_only": 0.0022500157356262207, + "tpp_threshold_5_total_metric": 0.001499950885772705, + "tpp_threshold_5_intended_diff_only": 0.006999969482421875, + "tpp_threshold_5_unintended_diff_only": 0.00550001859664917, + "tpp_threshold_10_total_metric": 0.0337500125169754, + "tpp_threshold_10_intended_diff_only": 0.03600001335144043, + "tpp_threshold_10_unintended_diff_only": 0.002250000834465027, + "tpp_threshold_20_total_metric": 0.047749996185302734, + "tpp_threshold_20_intended_diff_only": 0.050999999046325684, + "tpp_threshold_20_unintended_diff_only": 0.0032500028610229492, + "tpp_threshold_50_total_metric": 0.07749998569488525, + "tpp_threshold_50_intended_diff_only": 0.07899999618530273, + "tpp_threshold_50_unintended_diff_only": 0.0015000104904174805, + "tpp_threshold_100_total_metric": 0.13199999928474426, + "tpp_threshold_100_intended_diff_only": 0.1340000033378601, + "tpp_threshold_100_unintended_diff_only": 0.0020000040531158447, + "tpp_threshold_500_total_metric": 0.37299999594688416, + "tpp_threshold_500_intended_diff_only": 0.3790000081062317, + "tpp_threshold_500_unintended_diff_only": 0.006000012159347534 + }, + "6": { + "tpp_threshold_2_total_metric": 0.0015000104904174805, + "tpp_threshold_2_intended_diff_only": 0.003000020980834961, + "tpp_threshold_2_unintended_diff_only": 0.0015000104904174805, + "tpp_threshold_5_total_metric": 0.003000035881996155, + "tpp_threshold_5_intended_diff_only": 0.0020000338554382324, + "tpp_threshold_5_unintended_diff_only": -0.0010000020265579224, + "tpp_threshold_10_total_metric": 0.005249977111816406, + "tpp_threshold_10_intended_diff_only": 0.0059999823570251465, + "tpp_threshold_10_unintended_diff_only": 0.0007500052452087402, + "tpp_threshold_20_total_metric": 0.004999995231628418, + "tpp_threshold_20_intended_diff_only": 0.004999995231628418, + "tpp_threshold_20_unintended_diff_only": 0.0, + "tpp_threshold_50_total_metric": 0.008500009775161743, + "tpp_threshold_50_intended_diff_only": 0.009000003337860107, + "tpp_threshold_50_unintended_diff_only": 0.0004999935626983643, + "tpp_threshold_100_total_metric": 0.02300003170967102, + "tpp_threshold_100_intended_diff_only": 0.025000035762786865, + "tpp_threshold_100_unintended_diff_only": 0.0020000040531158447, + "tpp_threshold_500_total_metric": 0.320000022649765, + "tpp_threshold_500_intended_diff_only": 0.32600003480911255, + "tpp_threshold_500_unintended_diff_only": 0.006000012159347534 + }, + "9": { + "tpp_threshold_2_total_metric": 0.006999969482421875, + "tpp_threshold_2_intended_diff_only": 0.009999990463256836, + "tpp_threshold_2_unintended_diff_only": 0.003000020980834961, + "tpp_threshold_5_total_metric": 0.004249989986419678, + "tpp_threshold_5_intended_diff_only": 0.009000003337860107, + "tpp_threshold_5_unintended_diff_only": 0.00475001335144043, + "tpp_threshold_10_total_metric": 0.006499975919723511, + "tpp_threshold_10_intended_diff_only": 0.010999977588653564, + "tpp_threshold_10_unintended_diff_only": 0.004500001668930054, + "tpp_threshold_20_total_metric": 0.030499979853630066, + "tpp_threshold_20_intended_diff_only": 0.03700000047683716, + "tpp_threshold_20_unintended_diff_only": 0.006500020623207092, + "tpp_threshold_50_total_metric": 0.07674999535083771, + "tpp_threshold_50_intended_diff_only": 0.0820000171661377, + "tpp_threshold_50_unintended_diff_only": 0.005250021815299988, + "tpp_threshold_100_total_metric": 0.15424998104572296, + "tpp_threshold_100_intended_diff_only": 0.16200000047683716, + "tpp_threshold_100_unintended_diff_only": 0.007750019431114197, + "tpp_threshold_500_total_metric": 0.42775000631809235, + "tpp_threshold_500_intended_diff_only": 0.43800002336502075, + "tpp_threshold_500_unintended_diff_only": 0.010250017046928406 + } + }, + "canrager/amazon_reviews_mcauley_1and5": { + "1": { + "tpp_threshold_2_total_metric": 0.00424996018409729, + "tpp_threshold_2_intended_diff_only": 0.007999956607818604, + "tpp_threshold_2_unintended_diff_only": 0.0037499964237213135, + "tpp_threshold_5_total_metric": 0.0015000104904174805, + "tpp_threshold_5_intended_diff_only": 0.004999995231628418, + "tpp_threshold_5_unintended_diff_only": 0.0034999847412109375, + "tpp_threshold_10_total_metric": 0.0007499605417251587, + "tpp_threshold_10_intended_diff_only": 0.007999956607818604, + "tpp_threshold_10_unintended_diff_only": 0.007249996066093445, + "tpp_threshold_20_total_metric": 0.001749962568283081, + "tpp_threshold_20_intended_diff_only": 0.006999969482421875, + "tpp_threshold_20_unintended_diff_only": 0.005250006914138794, + "tpp_threshold_50_total_metric": 0.011249944567680359, + "tpp_threshold_50_intended_diff_only": 0.012999951839447021, + "tpp_threshold_50_unintended_diff_only": 0.0017500072717666626, + "tpp_threshold_100_total_metric": 0.011499956250190735, + "tpp_threshold_100_intended_diff_only": 0.0209999680519104, + "tpp_threshold_100_unintended_diff_only": 0.009500011801719666, + "tpp_threshold_500_total_metric": 0.1262499988079071, + "tpp_threshold_500_intended_diff_only": 0.13899999856948853, + "tpp_threshold_500_unintended_diff_only": 0.012749999761581421 + }, + "2": { + "tpp_threshold_2_total_metric": 0.0040000081062316895, + "tpp_threshold_2_intended_diff_only": 0.004999995231628418, + "tpp_threshold_2_unintended_diff_only": 0.0009999871253967285, + "tpp_threshold_5_total_metric": -0.004500031471252441, + "tpp_threshold_5_intended_diff_only": 0.0029999613761901855, + "tpp_threshold_5_unintended_diff_only": 0.007499992847442627, + "tpp_threshold_10_total_metric": 0.00449998676776886, + "tpp_threshold_10_intended_diff_only": 0.010999977588653564, + "tpp_threshold_10_unintended_diff_only": 0.006499990820884705, + "tpp_threshold_20_total_metric": 0.009250015020370483, + "tpp_threshold_20_intended_diff_only": 0.013999998569488525, + "tpp_threshold_20_unintended_diff_only": 0.004749983549118042, + "tpp_threshold_50_total_metric": 0.03349998593330383, + "tpp_threshold_50_intended_diff_only": 0.04399996995925903, + "tpp_threshold_50_unintended_diff_only": 0.0104999840259552, + "tpp_threshold_100_total_metric": 0.06824997067451477, + "tpp_threshold_100_intended_diff_only": 0.08499997854232788, + "tpp_threshold_100_unintended_diff_only": 0.01675000786781311, + "tpp_threshold_500_total_metric": 0.25550003349781036, + "tpp_threshold_500_intended_diff_only": 0.2720000147819519, + "tpp_threshold_500_unintended_diff_only": 0.01649998128414154 + }, + "3": { + "tpp_threshold_2_total_metric": -0.006250008940696716, + "tpp_threshold_2_intended_diff_only": -0.003000020980834961, + "tpp_threshold_2_unintended_diff_only": 0.0032499879598617554, + "tpp_threshold_5_total_metric": 0.0027499645948410034, + "tpp_threshold_5_intended_diff_only": 0.0029999613761901855, + "tpp_threshold_5_unintended_diff_only": 0.00024999678134918213, + "tpp_threshold_10_total_metric": 0.0065000057220458984, + "tpp_threshold_10_intended_diff_only": 0.009999990463256836, + "tpp_threshold_10_unintended_diff_only": 0.0034999847412109375, + "tpp_threshold_20_total_metric": 0.006000012159347534, + "tpp_threshold_20_intended_diff_only": 0.013000011444091797, + "tpp_threshold_20_unintended_diff_only": 0.006999999284744263, + "tpp_threshold_50_total_metric": 0.02499997615814209, + "tpp_threshold_50_intended_diff_only": 0.029999971389770508, + "tpp_threshold_50_unintended_diff_only": 0.004999995231628418, + "tpp_threshold_100_total_metric": 0.06025001406669617, + "tpp_threshold_100_intended_diff_only": 0.07300001382827759, + "tpp_threshold_100_unintended_diff_only": 0.012749999761581421, + "tpp_threshold_500_total_metric": 0.19625000655651093, + "tpp_threshold_500_intended_diff_only": 0.22699999809265137, + "tpp_threshold_500_unintended_diff_only": 0.030749991536140442 + }, + "5": { + "tpp_threshold_2_total_metric": 0.006000012159347534, + "tpp_threshold_2_intended_diff_only": 0.009000003337860107, + "tpp_threshold_2_unintended_diff_only": 0.0029999911785125732, + "tpp_threshold_5_total_metric": 0.008500009775161743, + "tpp_threshold_5_intended_diff_only": 0.013999998569488525, + "tpp_threshold_5_unintended_diff_only": 0.005499988794326782, + "tpp_threshold_10_total_metric": 0.018500030040740967, + "tpp_threshold_10_intended_diff_only": 0.026000022888183594, + "tpp_threshold_10_unintended_diff_only": 0.007499992847442627, + "tpp_threshold_20_total_metric": 0.027250036597251892, + "tpp_threshold_20_intended_diff_only": 0.03600001335144043, + "tpp_threshold_20_unintended_diff_only": 0.008749976754188538, + "tpp_threshold_50_total_metric": 0.09700007736682892, + "tpp_threshold_50_intended_diff_only": 0.11600005626678467, + "tpp_threshold_50_unintended_diff_only": 0.01899997889995575, + "tpp_threshold_100_total_metric": 0.17525003850460052, + "tpp_threshold_100_intended_diff_only": 0.20200002193450928, + "tpp_threshold_100_unintended_diff_only": 0.026749983429908752, + "tpp_threshold_500_total_metric": 0.34125006198883057, + "tpp_threshold_500_intended_diff_only": 0.3720000386238098, + "tpp_threshold_500_unintended_diff_only": 0.030749976634979248 + }, + "6": { + "tpp_threshold_2_total_metric": 0.007499963045120239, + "tpp_threshold_2_intended_diff_only": 0.011999964714050293, + "tpp_threshold_2_unintended_diff_only": 0.004500001668930054, + "tpp_threshold_5_total_metric": 0.014499962329864502, + "tpp_threshold_5_intended_diff_only": 0.012999951839447021, + "tpp_threshold_5_unintended_diff_only": -0.0015000104904174805, + "tpp_threshold_10_total_metric": 0.039000004529953, + "tpp_threshold_10_intended_diff_only": 0.041999995708465576, + "tpp_threshold_10_unintended_diff_only": 0.0029999911785125732, + "tpp_threshold_20_total_metric": 0.048500001430511475, + "tpp_threshold_20_intended_diff_only": 0.05199998617172241, + "tpp_threshold_20_unintended_diff_only": 0.0034999847412109375, + "tpp_threshold_50_total_metric": 0.09149996936321259, + "tpp_threshold_50_intended_diff_only": 0.09999996423721313, + "tpp_threshold_50_unintended_diff_only": 0.00849999487400055, + "tpp_threshold_100_total_metric": 0.18474996089935303, + "tpp_threshold_100_intended_diff_only": 0.1929999589920044, + "tpp_threshold_100_unintended_diff_only": 0.008249998092651367, + "tpp_threshold_500_total_metric": 0.3462499976158142, + "tpp_threshold_500_intended_diff_only": 0.3619999885559082, + "tpp_threshold_500_unintended_diff_only": 0.015749990940093994 + } + } + } +} \ No newline at end of file diff --git a/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..9ba9d02198e868b6031126a80f393c7ca4998f68 --- /dev/null +++ b/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,74 @@ +{ + "eval_type_id": "unlearning", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "wmdp-bio", + "high_school_us_history", + "college_computer_science", + "high_school_geography", + "human_aging" + ], + "intervention_method": "clamp_feature_activation", + "retain_thresholds": [ + 0.001, + 0.01 + ], + "n_features_list": [ + 10, + 20 + ], + "multipliers": [ + 25, + 50, + 100, + 200 + ], + "dataset_size": 1024, + "seq_len": 1024, + "n_batch_loss_added": 50, + "target_metric": "correct", + "save_metrics": true, + "model_name": "gemma-2-2b-it", + "llm_batch_size": 4, + "llm_dtype": "bfloat16" + }, + "eval_id": "9c4fef1a-ae28-4280-b511-c8d59c94496f", + "datetime_epoch_millis": 1740170757553, + "eval_result_metrics": { + "unlearning": { + "unlearning_score": 0.022514045238494873 + } + }, + "eval_result_details": [], + "sae_bench_commit_hash": "d91a218b4cc4ac6c164d0e1b739c8437901c7acd", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file diff --git a/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..d550b334443fd330331d55d23b9c582dbaf5b5e2 --- /dev/null +++ b/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,74 @@ +{ + "eval_type_id": "unlearning", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "wmdp-bio", + "high_school_us_history", + "college_computer_science", + "high_school_geography", + "human_aging" + ], + "intervention_method": "clamp_feature_activation", + "retain_thresholds": [ + 0.001, + 0.01 + ], + "n_features_list": [ + 10, + 20 + ], + "multipliers": [ + 25, + 50, + 100, + 200 + ], + "dataset_size": 1024, + "seq_len": 1024, + "n_batch_loss_added": 50, + "target_metric": "correct", + "save_metrics": true, + "model_name": "gemma-2-2b-it", + "llm_batch_size": 4, + "llm_dtype": "bfloat16" + }, + "eval_id": "2aab83aa-2bf6-43cb-89d0-39f635cc846b", + "datetime_epoch_millis": 1740169824113, + "eval_result_metrics": { + "unlearning": { + "unlearning_score": 0.0863039493560791 + } + }, + "eval_result_details": [], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file diff --git a/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..d159e113874f60354fcf73f9c60c9bf46988f66f --- /dev/null +++ b/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,74 @@ +{ + "eval_type_id": "unlearning", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "wmdp-bio", + "high_school_us_history", + "college_computer_science", + "high_school_geography", + "human_aging" + ], + "intervention_method": "clamp_feature_activation", + "retain_thresholds": [ + 0.001, + 0.01 + ], + "n_features_list": [ + 10, + 20 + ], + "multipliers": [ + 25, + 50, + 100, + 200 + ], + "dataset_size": 1024, + "seq_len": 1024, + "n_batch_loss_added": 50, + "target_metric": "correct", + "save_metrics": true, + "model_name": "gemma-2-2b-it", + "llm_batch_size": 4, + "llm_dtype": "bfloat16" + }, + "eval_id": "d0440f14-e67c-4842-99d5-81fcd5cf3274", + "datetime_epoch_millis": 1740169349304, + "eval_result_metrics": { + "unlearning": { + "unlearning_score": 0.06003749370574951 + } + }, + "eval_result_details": [], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file diff --git a/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..5c6e41e870e48ee7f4a6f3503f0944e6e54ef42f --- /dev/null +++ b/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,74 @@ +{ + "eval_type_id": "unlearning", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "wmdp-bio", + "high_school_us_history", + "college_computer_science", + "high_school_geography", + "human_aging" + ], + "intervention_method": "clamp_feature_activation", + "retain_thresholds": [ + 0.001, + 0.01 + ], + "n_features_list": [ + 10, + 20 + ], + "multipliers": [ + 25, + 50, + 100, + 200 + ], + "dataset_size": 1024, + "seq_len": 1024, + "n_batch_loss_added": 50, + "target_metric": "correct", + "save_metrics": true, + "model_name": "gemma-2-2b-it", + "llm_batch_size": 4, + "llm_dtype": "bfloat16" + }, + "eval_id": "15a4207f-7a9a-4b11-89df-797716ea8fed", + "datetime_epoch_millis": 1740170291329, + "eval_result_metrics": { + "unlearning": { + "unlearning_score": 0.030018746852874756 + } + }, + "eval_result_details": [], + "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file diff --git a/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..57a94ecccf064d20abdb12171157cace3f1a4611 --- /dev/null +++ b/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,74 @@ +{ + "eval_type_id": "unlearning", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "wmdp-bio", + "high_school_us_history", + "college_computer_science", + "high_school_geography", + "human_aging" + ], + "intervention_method": "clamp_feature_activation", + "retain_thresholds": [ + 0.001, + 0.01 + ], + "n_features_list": [ + 10, + 20 + ], + "multipliers": [ + 25, + 50, + 100, + 200 + ], + "dataset_size": 1024, + "seq_len": 1024, + "n_batch_loss_added": 50, + "target_metric": "correct", + "save_metrics": true, + "model_name": "gemma-2-2b-it", + "llm_batch_size": 4, + "llm_dtype": "bfloat16" + }, + "eval_id": "94e770c0-ff2e-428f-83b6-79d0857562ae", + "datetime_epoch_millis": 1740172452592, + "eval_result_metrics": { + "unlearning": { + "unlearning_score": 0.09193247556686401 + } + }, + "eval_result_details": [], + "sae_bench_commit_hash": "d91a218b4cc4ac6c164d0e1b739c8437901c7acd", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file diff --git a/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..51f80fc05f688550272b632764556b3774efde9c --- /dev/null +++ b/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,74 @@ +{ + "eval_type_id": "unlearning", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "wmdp-bio", + "high_school_us_history", + "college_computer_science", + "high_school_geography", + "human_aging" + ], + "intervention_method": "clamp_feature_activation", + "retain_thresholds": [ + 0.001, + 0.01 + ], + "n_features_list": [ + 10, + 20 + ], + "multipliers": [ + 25, + 50, + 100, + 200 + ], + "dataset_size": 1024, + "seq_len": 1024, + "n_batch_loss_added": 50, + "target_metric": "correct", + "save_metrics": true, + "model_name": "gemma-2-2b-it", + "llm_batch_size": 4, + "llm_dtype": "bfloat16" + }, + "eval_id": "a2875445-7b1e-4eee-9c51-dd5d4e93e9bb", + "datetime_epoch_millis": 1740171944398, + "eval_result_metrics": { + "unlearning": { + "unlearning_score": 0.03752344846725464 + } + }, + "eval_result_details": [], + "sae_bench_commit_hash": "d91a218b4cc4ac6c164d0e1b739c8437901c7acd", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file diff --git a/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json b/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..f46feb0fc00123be77a40342bfbb349ea88851d1 --- /dev/null +++ b/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json @@ -0,0 +1,74 @@ +{ + "eval_type_id": "unlearning", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "wmdp-bio", + "high_school_us_history", + "college_computer_science", + "high_school_geography", + "human_aging" + ], + "intervention_method": "clamp_feature_activation", + "retain_thresholds": [ + 0.001, + 0.01 + ], + "n_features_list": [ + 10, + 20 + ], + "multipliers": [ + 25, + 50, + 100, + 200 + ], + "dataset_size": 1024, + "seq_len": 1024, + "n_batch_loss_added": 50, + "target_metric": "correct", + "save_metrics": true, + "model_name": "gemma-2-2b-it", + "llm_batch_size": 4, + "llm_dtype": "bfloat16" + }, + "eval_id": "9bdf877c-184e-4521-ba8e-b8eb6ec7d495", + "datetime_epoch_millis": 1740171262970, + "eval_result_metrics": { + "unlearning": { + "unlearning_score": 0.03564727306365967 + } + }, + "eval_result_details": [], + "sae_bench_commit_hash": "d91a218b4cc4ac6c164d0e1b739c8437901c7acd", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file diff --git a/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json b/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..e81327c928a6338b5dd881cefc58c9a0c406b442 --- /dev/null +++ b/eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json @@ -0,0 +1,74 @@ +{ + "eval_type_id": "unlearning", + "eval_config": { + "random_seed": 42, + "dataset_names": [ + "wmdp-bio", + "high_school_us_history", + "college_computer_science", + "high_school_geography", + "human_aging" + ], + "intervention_method": "clamp_feature_activation", + "retain_thresholds": [ + 0.001, + 0.01 + ], + "n_features_list": [ + 10, + 20 + ], + "multipliers": [ + 25, + 50, + 100, + 200 + ], + "dataset_size": 1024, + "seq_len": 1024, + "n_batch_loss_added": 50, + "target_metric": "correct", + "save_metrics": true, + "model_name": "gemma-2-2b-it", + "llm_batch_size": 4, + "llm_dtype": "bfloat16" + }, + "eval_id": "59aa219f-9aae-46c7-bd1d-742764abafa0", + "datetime_epoch_millis": 1740172929065, + "eval_result_metrics": { + "unlearning": { + "unlearning_score": 0.058161377906799316 + } + }, + "eval_result_details": [], + "sae_bench_commit_hash": "d91a218b4cc4ac6c164d0e1b739c8437901c7acd", + "sae_lens_id": "custom_sae", + "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0", + "sae_lens_version": "5.4.2", + "sae_cfg_dict": { + "model_name": "gemma-2-2b", + "d_in": 2304, + "d_sae": 65536, + "hook_layer": 12, + "hook_name": "blocks.12.hook_resid_post", + "context_size": null, + "hook_head_index": null, + "architecture": "topk", + "apply_b_dec_to_input": null, + "finetuning_scaling_factor": null, + "activation_fn_str": "", + "prepend_bos": true, + "normalize_activations": "none", + "dtype": "bfloat16", + "device": "", + "dataset_path": "", + "dataset_trust_remote_code": true, + "seqpos_slice": [ + null + ], + "training_tokens": -100000, + "sae_lens_training_version": null, + "neuronpedia_id": null + }, + "eval_result_unstructured": null +} \ No newline at end of file