Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +16 -0
- eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +268 -0
- eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +268 -0
- eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +268 -0
- eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +268 -0
- eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +268 -0
- eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +268 -0
- eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +268 -0
- eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +268 -0
- eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +323 -0
- eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +323 -0
- eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +323 -0
- eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +323 -0
- eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +323 -0
- eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +323 -0
- eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +323 -0
- eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +323 -0
- eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +670 -0
- eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +670 -0
- eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +670 -0
- eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +670 -0
- eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +670 -0
- eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +670 -0
- eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +670 -0
- eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +670 -0
- eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +414 -0
- eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +414 -0
- eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +414 -0
- eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +414 -0
- eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +414 -0
- eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +414 -0
- eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +414 -0
- eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +414 -0
- eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +74 -0
.gitattributes
CHANGED
@@ -69,3 +69,19 @@ eval_results_finetunes/core/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_da
|
|
69 |
eval_results_finetunes/core/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
70 |
eval_results_finetunes/core/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
71 |
eval_results_finetunes/core/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
eval_results_finetunes/core/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
70 |
eval_results_finetunes/core/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
71 |
eval_results_finetunes/core/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
72 |
+
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
73 |
+
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
74 |
+
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
75 |
+
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
76 |
+
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
77 |
+
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
78 |
+
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
79 |
+
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
80 |
+
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
81 |
+
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
82 |
+
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
83 |
+
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
84 |
+
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
85 |
+
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
86 |
+
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
87 |
+
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "ef0c46fe-510f-4912-be90-57aeae2de794",
|
17 |
+
"datetime_epoch_millis": 1740152315652,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.5196529448711236,
|
21 |
+
"mean_full_absorption_score": 0.3042496024774553,
|
22 |
+
"mean_num_split_features": 2.8461538461538463,
|
23 |
+
"std_dev_absorption_fraction_score": 0.2663894021350886,
|
24 |
+
"std_dev_full_absorption_score": 0.17641031485178768,
|
25 |
+
"std_dev_num_split_features": 1.6417626550097355
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.6889635895777194,
|
32 |
+
"full_absorption_rate": 0.3417065390749601,
|
33 |
+
"num_full_absorption": 857,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 4
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.679161422720634,
|
40 |
+
"full_absorption_rate": 0.3430609597924773,
|
41 |
+
"num_full_absorption": 529,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 6
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.8155963696018853,
|
48 |
+
"full_absorption_rate": 0.48698752228163994,
|
49 |
+
"num_full_absorption": 1366,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 3
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.7004096658239145,
|
56 |
+
"full_absorption_rate": 0.41807228915662653,
|
57 |
+
"num_full_absorption": 694,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 3
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.5585561782067426,
|
64 |
+
"full_absorption_rate": 0.3681930693069307,
|
65 |
+
"num_full_absorption": 595,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 4
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.8006965508885888,
|
72 |
+
"full_absorption_rate": 0.5872374798061389,
|
73 |
+
"num_full_absorption": 727,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 3
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.5635756145458141,
|
80 |
+
"full_absorption_rate": 0.34323144104803494,
|
81 |
+
"num_full_absorption": 393,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 3
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.7999027114878418,
|
88 |
+
"full_absorption_rate": 0.45893719806763283,
|
89 |
+
"num_full_absorption": 475,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 3
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.6117770347749325,
|
96 |
+
"full_absorption_rate": 0.358974358974359,
|
97 |
+
"num_full_absorption": 588,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 2
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.20771700655851907,
|
104 |
+
"full_absorption_rate": 0.06310679611650485,
|
105 |
+
"num_full_absorption": 26,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.4378382360943496,
|
112 |
+
"full_absorption_rate": 0.16592592592592592,
|
113 |
+
"num_full_absorption": 112,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 3
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.6610036837302684,
|
120 |
+
"full_absorption_rate": 0.3856041131105398,
|
121 |
+
"num_full_absorption": 450,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 6
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.7708970748978563,
|
128 |
+
"full_absorption_rate": 0.5172981878088962,
|
129 |
+
"num_full_absorption": 942,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 2
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.7095649014624135,
|
136 |
+
"full_absorption_rate": 0.3765743073047859,
|
137 |
+
"num_full_absorption": 299,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 3
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.6244493390182382,
|
144 |
+
"full_absorption_rate": 0.3964386129334583,
|
145 |
+
"num_full_absorption": 423,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 4
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.6332302852454678,
|
152 |
+
"full_absorption_rate": 0.38650306748466257,
|
153 |
+
"num_full_absorption": 882,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 7
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.009843952976727736,
|
160 |
+
"full_absorption_rate": 0.010526315789473684,
|
161 |
+
"num_full_absorption": 2,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.7078808696018212,
|
168 |
+
"full_absorption_rate": 0.4720752498530276,
|
169 |
+
"num_full_absorption": 803,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 3
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.7539738349846717,
|
176 |
+
"full_absorption_rate": 0.4912718204488778,
|
177 |
+
"num_full_absorption": 1379,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 3
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.655318360480264,
|
184 |
+
"full_absorption_rate": 0.3734513274336283,
|
185 |
+
"num_full_absorption": 633,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 3
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.3289562491496899,
|
192 |
+
"full_absorption_rate": 0.1986754966887417,
|
193 |
+
"num_full_absorption": 150,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 2
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.3259209126950176,
|
200 |
+
"full_absorption_rate": 0.22813688212927757,
|
201 |
+
"num_full_absorption": 180,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.06180711593258981,
|
208 |
+
"full_absorption_rate": 0.03581267217630854,
|
209 |
+
"num_full_absorption": 26,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.37442147942939097,
|
216 |
+
"full_absorption_rate": 0.08849557522123894,
|
217 |
+
"num_full_absorption": 10,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.022340281665225267,
|
224 |
+
"full_absorption_rate": 0.005681818181818182,
|
225 |
+
"num_full_absorption": 1,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.007173845098628674,
|
232 |
+
"full_absorption_rate": 0.00851063829787234,
|
233 |
+
"num_full_absorption": 2,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "a794207a-cbc8-4c1d-8ea8-36a54549f1c6",
|
17 |
+
"datetime_epoch_millis": 1740150709168,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.0834770439283162,
|
21 |
+
"mean_full_absorption_score": 0.06013611847978803,
|
22 |
+
"mean_num_split_features": 1.1538461538461537,
|
23 |
+
"std_dev_absorption_fraction_score": 0.09241618138074081,
|
24 |
+
"std_dev_full_absorption_score": 0.08150335423500922,
|
25 |
+
"std_dev_num_split_features": 0.36794648440311994
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.057028034407490984,
|
32 |
+
"full_absorption_rate": 0.017543859649122806,
|
33 |
+
"num_full_absorption": 44,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.0017792171984287038,
|
40 |
+
"full_absorption_rate": 0.0025940337224383916,
|
41 |
+
"num_full_absorption": 4,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.25592667322599083,
|
48 |
+
"full_absorption_rate": 0.1857397504456328,
|
49 |
+
"num_full_absorption": 521,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 1
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.24011997118028888,
|
56 |
+
"full_absorption_rate": 0.12590361445783133,
|
57 |
+
"num_full_absorption": 209,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 1
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.1110917028199224,
|
64 |
+
"full_absorption_rate": 0.15346534653465346,
|
65 |
+
"num_full_absorption": 248,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 1
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.0545493485524353,
|
72 |
+
"full_absorption_rate": 0.02665589660743134,
|
73 |
+
"num_full_absorption": 33,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.015559784132844725,
|
80 |
+
"full_absorption_rate": 0.0026200873362445414,
|
81 |
+
"num_full_absorption": 3,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 2
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.020966493314900192,
|
88 |
+
"full_absorption_rate": 0.005797101449275362,
|
89 |
+
"num_full_absorption": 6,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.22232954346440803,
|
96 |
+
"full_absorption_rate": 0.23321123321123322,
|
97 |
+
"num_full_absorption": 382,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 2
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.0033853270927236977,
|
104 |
+
"full_absorption_rate": 0.0024271844660194173,
|
105 |
+
"num_full_absorption": 1,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.00362522533780823,
|
112 |
+
"full_absorption_rate": 0.005925925925925926,
|
113 |
+
"num_full_absorption": 4,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.12461768062488289,
|
120 |
+
"full_absorption_rate": 0.06512425021422451,
|
121 |
+
"num_full_absorption": 76,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.0051840957147463755,
|
128 |
+
"full_absorption_rate": 0.008237232289950576,
|
129 |
+
"num_full_absorption": 15,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.05486755467875741,
|
136 |
+
"full_absorption_rate": 0.021410579345088162,
|
137 |
+
"num_full_absorption": 17,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.11608643361066023,
|
144 |
+
"full_absorption_rate": 0.06560449859418932,
|
145 |
+
"num_full_absorption": 70,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.286828957721577,
|
152 |
+
"full_absorption_rate": 0.28702892199824714,
|
153 |
+
"num_full_absorption": 655,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.0012196642214486685,
|
160 |
+
"full_absorption_rate": 0.0,
|
161 |
+
"num_full_absorption": 0,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 2
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.23541147654145936,
|
168 |
+
"full_absorption_rate": 0.1781305114638448,
|
169 |
+
"num_full_absorption": 303,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 1
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.15717383958699338,
|
176 |
+
"full_absorption_rate": 0.06840042750267189,
|
177 |
+
"num_full_absorption": 192,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 1
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.019660784963942177,
|
184 |
+
"full_absorption_rate": 0.008849557522123894,
|
185 |
+
"num_full_absorption": 15,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.02239216664758688,
|
192 |
+
"full_absorption_rate": 0.017218543046357615,
|
193 |
+
"num_full_absorption": 13,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 2
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.000307858004094181,
|
200 |
+
"full_absorption_rate": 0.0,
|
201 |
+
"num_full_absorption": 0,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.058073269829955455,
|
208 |
+
"full_absorption_rate": 0.0440771349862259,
|
209 |
+
"num_full_absorption": 32,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.04631967164684324,
|
216 |
+
"full_absorption_rate": 0.017699115044247787,
|
217 |
+
"num_full_absorption": 2,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.04255402129395246,
|
224 |
+
"full_absorption_rate": 0.011363636363636364,
|
225 |
+
"num_full_absorption": 2,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.013344346322079504,
|
232 |
+
"full_absorption_rate": 0.00851063829787234,
|
233 |
+
"num_full_absorption": 2,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "3fd72f62-0cc2-4495-be28-9e81ace44644",
|
17 |
+
"datetime_epoch_millis": 1740149898408,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.6847120849800752,
|
21 |
+
"mean_full_absorption_score": 0.6661390205611261,
|
22 |
+
"mean_num_split_features": 2.6538461538461537,
|
23 |
+
"std_dev_absorption_fraction_score": 0.2003883916940885,
|
24 |
+
"std_dev_full_absorption_score": 0.2090506101541011,
|
25 |
+
"std_dev_num_split_features": 1.3249092857190696
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.7233867462553377,
|
32 |
+
"full_absorption_rate": 0.64792663476874,
|
33 |
+
"num_full_absorption": 1625,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 4
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.7168509009331099,
|
40 |
+
"full_absorption_rate": 0.7159533073929961,
|
41 |
+
"num_full_absorption": 1104,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 4
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.9156825137832028,
|
48 |
+
"full_absorption_rate": 0.8762923351158646,
|
49 |
+
"num_full_absorption": 2458,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 1
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.7605077790256731,
|
56 |
+
"full_absorption_rate": 0.7590361445783133,
|
57 |
+
"num_full_absorption": 1260,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 2
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.5875146098112013,
|
64 |
+
"full_absorption_rate": 0.7271039603960396,
|
65 |
+
"num_full_absorption": 1175,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 3
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.7127841577147391,
|
72 |
+
"full_absorption_rate": 0.7059773828756059,
|
73 |
+
"num_full_absorption": 874,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 5
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.7512413415830805,
|
80 |
+
"full_absorption_rate": 0.7362445414847162,
|
81 |
+
"num_full_absorption": 843,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 2
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.7663615485897274,
|
88 |
+
"full_absorption_rate": 0.7661835748792271,
|
89 |
+
"num_full_absorption": 793,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 4
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.7978799456118504,
|
96 |
+
"full_absorption_rate": 0.8028083028083028,
|
97 |
+
"num_full_absorption": 1315,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 2
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.7148716538061125,
|
104 |
+
"full_absorption_rate": 0.6820388349514563,
|
105 |
+
"num_full_absorption": 281,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 2
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.5989374406326367,
|
112 |
+
"full_absorption_rate": 0.5555555555555556,
|
113 |
+
"num_full_absorption": 375,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 4
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.7327207331271454,
|
120 |
+
"full_absorption_rate": 0.7446443873179092,
|
121 |
+
"num_full_absorption": 869,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 3
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.7807434725648233,
|
128 |
+
"full_absorption_rate": 0.8160351455244371,
|
129 |
+
"num_full_absorption": 1486,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 2
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.7722688362643463,
|
136 |
+
"full_absorption_rate": 0.7657430730478589,
|
137 |
+
"num_full_absorption": 608,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 3
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.8168333418476779,
|
144 |
+
"full_absorption_rate": 0.7769447047797563,
|
145 |
+
"num_full_absorption": 829,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 2
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.8399943933692386,
|
152 |
+
"full_absorption_rate": 0.8273444347063978,
|
153 |
+
"num_full_absorption": 1888,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 2
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.7839560158065032,
|
160 |
+
"full_absorption_rate": 0.7157894736842105,
|
161 |
+
"num_full_absorption": 136,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 2
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.8207604001227187,
|
168 |
+
"full_absorption_rate": 0.8300999412110524,
|
169 |
+
"num_full_absorption": 1412,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 3
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.9192547073005679,
|
176 |
+
"full_absorption_rate": 0.8696116850730317,
|
177 |
+
"num_full_absorption": 2441,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 1
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.7832159717277276,
|
184 |
+
"full_absorption_rate": 0.736283185840708,
|
185 |
+
"num_full_absorption": 1248,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 2
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.31044767910219884,
|
192 |
+
"full_absorption_rate": 0.3880794701986755,
|
193 |
+
"num_full_absorption": 293,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 6
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.6288526333046588,
|
200 |
+
"full_absorption_rate": 0.5779467680608364,
|
201 |
+
"num_full_absorption": 456,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 4
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.820589593812622,
|
208 |
+
"full_absorption_rate": 0.7630853994490359,
|
209 |
+
"num_full_absorption": 554,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 3
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.2525570176189269,
|
216 |
+
"full_absorption_rate": 0.08849557522123894,
|
217 |
+
"num_full_absorption": 10,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.3328708151340147,
|
224 |
+
"full_absorption_rate": 0.29545454545454547,
|
225 |
+
"num_full_absorption": 52,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.16142996063211232,
|
232 |
+
"full_absorption_rate": 0.14893617021276595,
|
233 |
+
"num_full_absorption": 35,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "e4f2dfbd-7bca-41a0-b7cc-7b89d14b8d8f",
|
17 |
+
"datetime_epoch_millis": 1740151507106,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.4524983861704342,
|
21 |
+
"mean_full_absorption_score": 0.5240608368125174,
|
22 |
+
"mean_num_split_features": 4.3076923076923075,
|
23 |
+
"std_dev_absorption_fraction_score": 0.17312136051231236,
|
24 |
+
"std_dev_full_absorption_score": 0.19044461427366993,
|
25 |
+
"std_dev_num_split_features": 2.412786451706504
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.5244886983482652,
|
32 |
+
"full_absorption_rate": 0.5231259968102073,
|
33 |
+
"num_full_absorption": 1312,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 9
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.5030903415851329,
|
40 |
+
"full_absorption_rate": 0.6848249027237354,
|
41 |
+
"num_full_absorption": 1056,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 4
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.6621675316077551,
|
48 |
+
"full_absorption_rate": 0.685204991087344,
|
49 |
+
"num_full_absorption": 1922,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 7
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.5686391650350938,
|
56 |
+
"full_absorption_rate": 0.6313253012048192,
|
57 |
+
"num_full_absorption": 1048,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 5
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.3860805878256704,
|
64 |
+
"full_absorption_rate": 0.5006188118811881,
|
65 |
+
"num_full_absorption": 809,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 8
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.4846739037643228,
|
72 |
+
"full_absorption_rate": 0.6058158319870759,
|
73 |
+
"num_full_absorption": 750,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 6
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.4674741077161477,
|
80 |
+
"full_absorption_rate": 0.5545851528384279,
|
81 |
+
"num_full_absorption": 635,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 7
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.4141624927147996,
|
88 |
+
"full_absorption_rate": 0.46956521739130436,
|
89 |
+
"num_full_absorption": 486,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 5
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.5685041971113255,
|
96 |
+
"full_absorption_rate": 0.6868131868131868,
|
97 |
+
"num_full_absorption": 1125,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 1
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.3294766484902135,
|
104 |
+
"full_absorption_rate": 0.34951456310679613,
|
105 |
+
"num_full_absorption": 144,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 3
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.19406311445367022,
|
112 |
+
"full_absorption_rate": 0.2740740740740741,
|
113 |
+
"num_full_absorption": 185,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 3
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.43236308517151945,
|
120 |
+
"full_absorption_rate": 0.48586118251928023,
|
121 |
+
"num_full_absorption": 567,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 6
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.6774860216572675,
|
128 |
+
"full_absorption_rate": 0.7957166392092258,
|
129 |
+
"num_full_absorption": 1449,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 3
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.48846356092785836,
|
136 |
+
"full_absorption_rate": 0.5604534005037783,
|
137 |
+
"num_full_absorption": 445,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 3
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.591983877600817,
|
144 |
+
"full_absorption_rate": 0.6588566073102156,
|
145 |
+
"num_full_absorption": 703,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 3
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.692209094169467,
|
152 |
+
"full_absorption_rate": 0.7116564417177914,
|
153 |
+
"num_full_absorption": 1624,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 8
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.22241427485109083,
|
160 |
+
"full_absorption_rate": 0.23157894736842105,
|
161 |
+
"num_full_absorption": 44,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 2
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.5417187972127966,
|
168 |
+
"full_absorption_rate": 0.5631981187536743,
|
169 |
+
"num_full_absorption": 958,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 6
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.7569173413620608,
|
176 |
+
"full_absorption_rate": 0.8065550409690061,
|
177 |
+
"num_full_absorption": 2264,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 4
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.5078413641592,
|
184 |
+
"full_absorption_rate": 0.5150442477876106,
|
185 |
+
"num_full_absorption": 873,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 7
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.36593236176612903,
|
192 |
+
"full_absorption_rate": 0.6079470198675496,
|
193 |
+
"num_full_absorption": 459,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 2
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.33729577176136777,
|
200 |
+
"full_absorption_rate": 0.4664131812420786,
|
201 |
+
"num_full_absorption": 368,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 4
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.5653991608245975,
|
208 |
+
"full_absorption_rate": 0.7052341597796143,
|
209 |
+
"num_full_absorption": 512,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 3
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.09658828784853217,
|
216 |
+
"full_absorption_rate": 0.061946902654867256,
|
217 |
+
"num_full_absorption": 7,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.1756755097631157,
|
224 |
+
"full_absorption_rate": 0.20454545454545456,
|
225 |
+
"num_full_absorption": 36,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.20984874270307283,
|
232 |
+
"full_absorption_rate": 0.2851063829787234,
|
233 |
+
"num_full_absorption": 67,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "4a874c85-b7b8-4548-be1b-9967bf32571a",
|
17 |
+
"datetime_epoch_millis": 1740154752792,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.6616852261361333,
|
21 |
+
"mean_full_absorption_score": 0.615820361178369,
|
22 |
+
"mean_num_split_features": 2.6923076923076925,
|
23 |
+
"std_dev_absorption_fraction_score": 0.19938859527080388,
|
24 |
+
"std_dev_full_absorption_score": 0.20106605664354318,
|
25 |
+
"std_dev_num_split_features": 1.7382573059068274
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.8075326747620873,
|
32 |
+
"full_absorption_rate": 0.6698564593301436,
|
33 |
+
"num_full_absorption": 1680,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 2
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.649433379164975,
|
40 |
+
"full_absorption_rate": 0.5719844357976653,
|
41 |
+
"num_full_absorption": 882,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 6
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.843160509366003,
|
48 |
+
"full_absorption_rate": 0.774331550802139,
|
49 |
+
"num_full_absorption": 2172,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 3
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.8053629676295623,
|
56 |
+
"full_absorption_rate": 0.7536144578313253,
|
57 |
+
"num_full_absorption": 1251,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 2
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.6165512751061573,
|
64 |
+
"full_absorption_rate": 0.6961633663366337,
|
65 |
+
"num_full_absorption": 1125,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 2
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.7104825885299954,
|
72 |
+
"full_absorption_rate": 0.6922455573505655,
|
73 |
+
"num_full_absorption": 857,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 5
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.7387823401404363,
|
80 |
+
"full_absorption_rate": 0.7397379912663755,
|
81 |
+
"num_full_absorption": 847,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 1
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.7596010190875191,
|
88 |
+
"full_absorption_rate": 0.702415458937198,
|
89 |
+
"num_full_absorption": 727,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 4
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.6661271457750678,
|
96 |
+
"full_absorption_rate": 0.6306471306471306,
|
97 |
+
"num_full_absorption": 1033,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 4
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.6232648925391066,
|
104 |
+
"full_absorption_rate": 0.5728155339805825,
|
105 |
+
"num_full_absorption": 236,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 2
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.567028277479737,
|
112 |
+
"full_absorption_rate": 0.522962962962963,
|
113 |
+
"num_full_absorption": 353,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.754960114510044,
|
120 |
+
"full_absorption_rate": 0.7343616109682948,
|
121 |
+
"num_full_absorption": 857,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 3
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.8203947792860443,
|
128 |
+
"full_absorption_rate": 0.7940691927512356,
|
129 |
+
"num_full_absorption": 1446,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.7950898206311865,
|
136 |
+
"full_absorption_rate": 0.7670025188916877,
|
137 |
+
"num_full_absorption": 609,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 3
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.6647762399813312,
|
144 |
+
"full_absorption_rate": 0.521087160262418,
|
145 |
+
"num_full_absorption": 556,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 6
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.8176700167677401,
|
152 |
+
"full_absorption_rate": 0.7725679228746714,
|
153 |
+
"num_full_absorption": 1763,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 5
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.45423763895890557,
|
160 |
+
"full_absorption_rate": 0.3,
|
161 |
+
"num_full_absorption": 57,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.7393716764914745,
|
168 |
+
"full_absorption_rate": 0.6843033509700176,
|
169 |
+
"num_full_absorption": 1164,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 3
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.8758378878071444,
|
176 |
+
"full_absorption_rate": 0.791948699679373,
|
177 |
+
"num_full_absorption": 2223,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 1
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.8617655260809893,
|
184 |
+
"full_absorption_rate": 0.743952802359882,
|
185 |
+
"num_full_absorption": 1261,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 2
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.7107051211722657,
|
192 |
+
"full_absorption_rate": 0.7841059602649006,
|
193 |
+
"num_full_absorption": 592,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.5079324747158148,
|
200 |
+
"full_absorption_rate": 0.5057034220532319,
|
201 |
+
"num_full_absorption": 399,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 6
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.7893559891999093,
|
208 |
+
"full_absorption_rate": 0.7741046831955923,
|
209 |
+
"num_full_absorption": 562,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 3
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.2505387445283525,
|
216 |
+
"full_absorption_rate": 0.1592920353982301,
|
217 |
+
"num_full_absorption": 18,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.2744647016381297,
|
224 |
+
"full_absorption_rate": 0.19886363636363635,
|
225 |
+
"num_full_absorption": 35,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.09938807818948822,
|
232 |
+
"full_absorption_rate": 0.15319148936170213,
|
233 |
+
"num_full_absorption": 36,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "2be8745f-1a57-4404-ac46-dce11e8b68ff",
|
17 |
+
"datetime_epoch_millis": 1740153947708,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.5128081449664995,
|
21 |
+
"mean_full_absorption_score": 0.5412368653055878,
|
22 |
+
"mean_num_split_features": 3.6538461538461537,
|
23 |
+
"std_dev_absorption_fraction_score": 0.22140493761969532,
|
24 |
+
"std_dev_full_absorption_score": 0.2331752415340005,
|
25 |
+
"std_dev_num_split_features": 2.077350383393378
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.7110525620728301,
|
32 |
+
"full_absorption_rate": 0.7256778309409888,
|
33 |
+
"num_full_absorption": 1820,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 4
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.5900107080679658,
|
40 |
+
"full_absorption_rate": 0.6504539559014267,
|
41 |
+
"num_full_absorption": 1003,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 8
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.7889150540988363,
|
48 |
+
"full_absorption_rate": 0.7992869875222817,
|
49 |
+
"num_full_absorption": 2242,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 5
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.7213593093818161,
|
56 |
+
"full_absorption_rate": 0.7777108433734939,
|
57 |
+
"num_full_absorption": 1291,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 4
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.41357739362504964,
|
64 |
+
"full_absorption_rate": 0.5705445544554455,
|
65 |
+
"num_full_absorption": 922,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 5
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.6904496997463666,
|
72 |
+
"full_absorption_rate": 0.7164781906300485,
|
73 |
+
"num_full_absorption": 887,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 7
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.6122705763206383,
|
80 |
+
"full_absorption_rate": 0.6462882096069869,
|
81 |
+
"num_full_absorption": 740,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 6
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.5132230230308134,
|
88 |
+
"full_absorption_rate": 0.5207729468599034,
|
89 |
+
"num_full_absorption": 539,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 5
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.5740640698732722,
|
96 |
+
"full_absorption_rate": 0.7197802197802198,
|
97 |
+
"num_full_absorption": 1179,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 3
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.366316889496265,
|
104 |
+
"full_absorption_rate": 0.3567961165048544,
|
105 |
+
"num_full_absorption": 147,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.23633773281065845,
|
112 |
+
"full_absorption_rate": 0.23851851851851852,
|
113 |
+
"num_full_absorption": 161,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.6116279835790608,
|
120 |
+
"full_absorption_rate": 0.6392459297343616,
|
121 |
+
"num_full_absorption": 746,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 4
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.6636309057144126,
|
128 |
+
"full_absorption_rate": 0.7177375068643602,
|
129 |
+
"num_full_absorption": 1307,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 6
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.577436439906812,
|
136 |
+
"full_absorption_rate": 0.5629722921914357,
|
137 |
+
"num_full_absorption": 447,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 4
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.4891657688844448,
|
144 |
+
"full_absorption_rate": 0.5014058106841612,
|
145 |
+
"num_full_absorption": 535,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.8397761400133482,
|
152 |
+
"full_absorption_rate": 0.8347940403155127,
|
153 |
+
"num_full_absorption": 1905,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 3
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.33764789379308424,
|
160 |
+
"full_absorption_rate": 0.3105263157894737,
|
161 |
+
"num_full_absorption": 59,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 2
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.6411389015355013,
|
168 |
+
"full_absorption_rate": 0.6431510875955321,
|
169 |
+
"num_full_absorption": 1094,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 5
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.7622223123066498,
|
176 |
+
"full_absorption_rate": 0.7716423227645173,
|
177 |
+
"num_full_absorption": 2166,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 5
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.7386404958442533,
|
184 |
+
"full_absorption_rate": 0.7480825958702065,
|
185 |
+
"num_full_absorption": 1268,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 2
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.19444390279769325,
|
192 |
+
"full_absorption_rate": 0.3258278145695364,
|
193 |
+
"num_full_absorption": 246,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 2
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.36413111590891156,
|
200 |
+
"full_absorption_rate": 0.4435994930291508,
|
201 |
+
"num_full_absorption": 350,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 6
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.5408512211581553,
|
208 |
+
"full_absorption_rate": 0.6060606060606061,
|
209 |
+
"num_full_absorption": 440,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 3
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.09970442692753816,
|
216 |
+
"full_absorption_rate": 0.05309734513274336,
|
217 |
+
"num_full_absorption": 6,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.20248330206807633,
|
224 |
+
"full_absorption_rate": 0.1534090909090909,
|
225 |
+
"num_full_absorption": 27,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.05253394016653227,
|
232 |
+
"full_absorption_rate": 0.03829787234042553,
|
233 |
+
"num_full_absorption": 9,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "301cd6f2-5394-4131-b827-9d8a5bbeac04",
|
17 |
+
"datetime_epoch_millis": 1740153129410,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.6363149713432454,
|
21 |
+
"mean_full_absorption_score": 0.4939130321779123,
|
22 |
+
"mean_num_split_features": 3.3846153846153846,
|
23 |
+
"std_dev_absorption_fraction_score": 0.19938409617149097,
|
24 |
+
"std_dev_full_absorption_score": 0.17387178972743858,
|
25 |
+
"std_dev_num_split_features": 1.6751578570850707
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.685226183958439,
|
32 |
+
"full_absorption_rate": 0.4597288676236045,
|
33 |
+
"num_full_absorption": 1153,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 5
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.6798069224264082,
|
40 |
+
"full_absorption_rate": 0.5012970168612192,
|
41 |
+
"num_full_absorption": 773,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 4
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.7703485846895846,
|
48 |
+
"full_absorption_rate": 0.6217468805704099,
|
49 |
+
"num_full_absorption": 1744,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 2
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.8082173823988077,
|
56 |
+
"full_absorption_rate": 0.6445783132530121,
|
57 |
+
"num_full_absorption": 1070,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 3
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.5316796151182803,
|
64 |
+
"full_absorption_rate": 0.49876237623762376,
|
65 |
+
"num_full_absorption": 806,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 3
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.6921068442218885,
|
72 |
+
"full_absorption_rate": 0.6106623586429726,
|
73 |
+
"num_full_absorption": 756,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 5
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.6652568101195918,
|
80 |
+
"full_absorption_rate": 0.537117903930131,
|
81 |
+
"num_full_absorption": 615,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 5
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.7274664367939123,
|
88 |
+
"full_absorption_rate": 0.5217391304347826,
|
89 |
+
"num_full_absorption": 540,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 4
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.5973165908932144,
|
96 |
+
"full_absorption_rate": 0.4896214896214896,
|
97 |
+
"num_full_absorption": 802,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 3
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.749237725468953,
|
104 |
+
"full_absorption_rate": 0.5072815533980582,
|
105 |
+
"num_full_absorption": 209,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 2
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.7065567821922324,
|
112 |
+
"full_absorption_rate": 0.5955555555555555,
|
113 |
+
"num_full_absorption": 402,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.7196233455554913,
|
120 |
+
"full_absorption_rate": 0.5355612682090831,
|
121 |
+
"num_full_absorption": 625,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 4
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.7175474373170088,
|
128 |
+
"full_absorption_rate": 0.4876441515650741,
|
129 |
+
"num_full_absorption": 888,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 7
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.7491123746163217,
|
136 |
+
"full_absorption_rate": 0.5654911838790933,
|
137 |
+
"num_full_absorption": 449,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 2
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.5657368175863791,
|
144 |
+
"full_absorption_rate": 0.4217432052483599,
|
145 |
+
"num_full_absorption": 450,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 7
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.7904435804397044,
|
152 |
+
"full_absorption_rate": 0.6897458369851008,
|
153 |
+
"num_full_absorption": 1574,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 3
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.697611173748312,
|
160 |
+
"full_absorption_rate": 0.46842105263157896,
|
161 |
+
"num_full_absorption": 89,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 3
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.6868794374337754,
|
168 |
+
"full_absorption_rate": 0.5955320399764844,
|
169 |
+
"num_full_absorption": 1013,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 5
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.8751669277159965,
|
176 |
+
"full_absorption_rate": 0.7064481653010332,
|
177 |
+
"num_full_absorption": 1983,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 1
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.7865680163892015,
|
184 |
+
"full_absorption_rate": 0.6070796460176991,
|
185 |
+
"num_full_absorption": 1029,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 4
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.5045876719987833,
|
192 |
+
"full_absorption_rate": 0.42251655629139073,
|
193 |
+
"num_full_absorption": 319,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 2
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.573872745990794,
|
200 |
+
"full_absorption_rate": 0.47782002534854245,
|
201 |
+
"num_full_absorption": 377,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 4
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.7610011483914709,
|
208 |
+
"full_absorption_rate": 0.6556473829201102,
|
209 |
+
"num_full_absorption": 476,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 4
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.10157088518458829,
|
216 |
+
"full_absorption_rate": 0.035398230088495575,
|
217 |
+
"num_full_absorption": 4,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.36839684040497356,
|
224 |
+
"full_absorption_rate": 0.14204545454545456,
|
225 |
+
"num_full_absorption": 25,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 3
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.03285097387026813,
|
232 |
+
"full_absorption_rate": 0.0425531914893617,
|
233 |
+
"num_full_absorption": 10,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "748a0895-c8af-4e84-ae36-2f3eb01ee78c",
|
17 |
+
"datetime_epoch_millis": 1740155568982,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.32544343256195074,
|
21 |
+
"mean_full_absorption_score": 0.32888806627437994,
|
22 |
+
"mean_num_split_features": 1.6538461538461537,
|
23 |
+
"std_dev_absorption_fraction_score": 0.20824169704299758,
|
24 |
+
"std_dev_full_absorption_score": 0.22060249667862672,
|
25 |
+
"std_dev_num_split_features": 1.0933364602832083
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.6007277347131041,
|
32 |
+
"full_absorption_rate": 0.5251196172248804,
|
33 |
+
"num_full_absorption": 1317,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.29305024763176835,
|
40 |
+
"full_absorption_rate": 0.30804150453955903,
|
41 |
+
"num_full_absorption": 475,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.630569504589696,
|
48 |
+
"full_absorption_rate": 0.6559714795008913,
|
49 |
+
"num_full_absorption": 1840,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 3
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.398668071195782,
|
56 |
+
"full_absorption_rate": 0.4066265060240964,
|
57 |
+
"num_full_absorption": 675,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 2
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.3653745156003204,
|
64 |
+
"full_absorption_rate": 0.46410891089108913,
|
65 |
+
"num_full_absorption": 750,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 3
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.4766671026609379,
|
72 |
+
"full_absorption_rate": 0.5218093699515347,
|
73 |
+
"num_full_absorption": 646,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.20841239327824035,
|
80 |
+
"full_absorption_rate": 0.20611353711790392,
|
81 |
+
"num_full_absorption": 236,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 1
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.194335181953026,
|
88 |
+
"full_absorption_rate": 0.19033816425120773,
|
89 |
+
"num_full_absorption": 197,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.498854509106931,
|
96 |
+
"full_absorption_rate": 0.5384615384615384,
|
97 |
+
"num_full_absorption": 882,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 3
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.00809270254817472,
|
104 |
+
"full_absorption_rate": 0.012135922330097087,
|
105 |
+
"num_full_absorption": 5,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.027364220505435494,
|
112 |
+
"full_absorption_rate": 0.022222222222222223,
|
113 |
+
"num_full_absorption": 15,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.46655821897414346,
|
120 |
+
"full_absorption_rate": 0.46786632390745503,
|
121 |
+
"num_full_absorption": 546,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.48562595745684123,
|
128 |
+
"full_absorption_rate": 0.5332235035694673,
|
129 |
+
"num_full_absorption": 971,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 2
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.36360093234914864,
|
136 |
+
"full_absorption_rate": 0.3211586901763224,
|
137 |
+
"num_full_absorption": 255,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 4
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.3188066412402324,
|
144 |
+
"full_absorption_rate": 0.3786316776007498,
|
145 |
+
"num_full_absorption": 404,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.6813615753502903,
|
152 |
+
"full_absorption_rate": 0.6919368974583698,
|
153 |
+
"num_full_absorption": 1579,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 2
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.030652585242806108,
|
160 |
+
"full_absorption_rate": 0.03684210526315789,
|
161 |
+
"num_full_absorption": 7,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.4850781646941056,
|
168 |
+
"full_absorption_rate": 0.5202821869488536,
|
169 |
+
"num_full_absorption": 885,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 1
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.6328351318421925,
|
176 |
+
"full_absorption_rate": 0.6074100463127895,
|
177 |
+
"num_full_absorption": 1705,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 5
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.3754464221045167,
|
184 |
+
"full_absorption_rate": 0.31504424778761064,
|
185 |
+
"num_full_absorption": 534,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.33574647694082693,
|
192 |
+
"full_absorption_rate": 0.41456953642384103,
|
193 |
+
"num_full_absorption": 313,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 2
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.07488270366700864,
|
200 |
+
"full_absorption_rate": 0.08745247148288973,
|
201 |
+
"num_full_absorption": 69,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.24404390408289825,
|
208 |
+
"full_absorption_rate": 0.2327823691460055,
|
209 |
+
"num_full_absorption": 169,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.17461273099505284,
|
216 |
+
"full_absorption_rate": 0.017699115044247787,
|
217 |
+
"num_full_absorption": 2,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.07002365476192687,
|
224 |
+
"full_absorption_rate": 0.045454545454545456,
|
225 |
+
"num_full_absorption": 8,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.020137963125312984,
|
232 |
+
"full_absorption_rate": 0.029787234042553193,
|
233 |
+
"num_full_absorption": 7,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ade8835091139a702090a82b583edafc17952df258ec2094ccae8475ffb0edd3
|
3 |
+
size 26038784
|
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0ed9ba6ea6fbc0311812466993b7423a455115141b96aa55d92b5ffd60a716e5
|
3 |
+
size 26004823
|
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a0e5eeaf5313765b9d51b5b018a3a14d86d0f405351fba34e5b131f893c59124
|
3 |
+
size 25526733
|
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:488c950c80e13463424f371370dba26e2461c059d86c19fbca50395bdf88341f
|
3 |
+
size 25600864
|
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:536ed4bc61cb843c976114f9a3c8bbe2767dc9126eff5af03e4263d82fbbdb63
|
3 |
+
size 25576485
|
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9b5c2e8a088ce126cb3328b7b53055a573c73a6e3025fdbfa16eadebb694b70a
|
3 |
+
size 25376156
|
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:91176fdc720c6a207b11313f9efbd110ecbe9b42e1d31f237d5c567195c4ca73
|
3 |
+
size 25873513
|
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0e4b4b5355f82658f66abf51ca1610565246c09085f9666cf6ba0497eaf78ed8
|
3 |
+
size 25938312
|
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:655ab0b13ade7b4ab554c5c3eec0134aa39dd1a462045bf320a8ae2d72d47c6e
|
3 |
+
size 21672734
|
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9e4fa8826a42d156a4678073845c4b6c449760fa1a0e0b66b2d370935f2417fa
|
3 |
+
size 21773774
|
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b16f7aadd97950eb3f710d2dd1293998ff117f83c436ab19489a9f063abec1fe
|
3 |
+
size 21266212
|
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7e1ee90bbe960ff9c633099c66a1c22fcbed7fe1ced711815c6035b4615010a2
|
3 |
+
size 21208670
|
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5ea6375fb3f724b9d9f35c8f9da86a56cb07ffe09b252eebd68063e5af0549f9
|
3 |
+
size 21505205
|
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:45fc340a10fb487df31d251f7b80d4d56629b38ccc606ce074441c8e2993478a
|
3 |
+
size 21585654
|
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1da2c27c4aed20429fa10b59ce05e61b689a887f52c352891e4ee7fa112da3a5
|
3 |
+
size 21630563
|
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bcac9aa9a10fc5e629bd04219f06be94fc721bec2c03f09d8f48ffc4cbeb469c
|
3 |
+
size 21697035
|
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "scr",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": true,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "76e1c3b5-c26c-4837-91c8-6d45919b460c",
|
73 |
+
"datetime_epoch_millis": 1740160660875,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"scr_metrics": {
|
76 |
+
"scr_dir1_threshold_2": 0.17937598651986972,
|
77 |
+
"scr_metric_threshold_2": 0.06750879575859692,
|
78 |
+
"scr_dir2_threshold_2": 0.07494545276237846,
|
79 |
+
"scr_dir1_threshold_5": 0.22402296524662565,
|
80 |
+
"scr_metric_threshold_5": 0.11204012359707471,
|
81 |
+
"scr_dir2_threshold_5": 0.11697061811839651,
|
82 |
+
"scr_dir1_threshold_10": 0.2336172994227871,
|
83 |
+
"scr_metric_threshold_10": 0.13293663214601245,
|
84 |
+
"scr_dir2_threshold_10": 0.1427465534868288,
|
85 |
+
"scr_dir1_threshold_20": 0.2505843441914149,
|
86 |
+
"scr_metric_threshold_20": 0.16341666656794324,
|
87 |
+
"scr_dir2_threshold_20": 0.17208719496326916,
|
88 |
+
"scr_dir1_threshold_50": 0.27174068905788434,
|
89 |
+
"scr_metric_threshold_50": 0.21225175803186222,
|
90 |
+
"scr_dir2_threshold_50": 0.2175195174361239,
|
91 |
+
"scr_dir1_threshold_100": 0.2464209622824807,
|
92 |
+
"scr_metric_threshold_100": 0.22702989060166226,
|
93 |
+
"scr_dir2_threshold_100": 0.23717704484868324,
|
94 |
+
"scr_dir1_threshold_500": 0.2697303489398643,
|
95 |
+
"scr_metric_threshold_500": 0.22331210595096943,
|
96 |
+
"scr_dir2_threshold_500": 0.23536249992024194
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
|
102 |
+
"scr_dir1_threshold_2": 0.47619029597980633,
|
103 |
+
"scr_metric_threshold_2": 0.004914087313907592,
|
104 |
+
"scr_dir2_threshold_2": 0.004914087313907592,
|
105 |
+
"scr_dir1_threshold_5": 0.555555345309774,
|
106 |
+
"scr_metric_threshold_5": 0.012284998611665989,
|
107 |
+
"scr_dir2_threshold_5": 0.012284998611665989,
|
108 |
+
"scr_dir1_threshold_10": 0.555555345309774,
|
109 |
+
"scr_metric_threshold_10": 0.027027114104653437,
|
110 |
+
"scr_dir2_threshold_10": 0.027027114104653437,
|
111 |
+
"scr_dir1_threshold_20": 0.5238097040201937,
|
112 |
+
"scr_metric_threshold_20": 0.036855142283733294,
|
113 |
+
"scr_dir2_threshold_20": 0.036855142283733294,
|
114 |
+
"scr_dir1_threshold_50": 0.333333017964661,
|
115 |
+
"scr_metric_threshold_50": 0.06633908037223754,
|
116 |
+
"scr_dir2_threshold_50": 0.06633908037223754,
|
117 |
+
"scr_dir1_threshold_100": 0.31746019731987085,
|
118 |
+
"scr_metric_threshold_100": 0.10565119308855696,
|
119 |
+
"scr_dir2_threshold_100": 0.10565119308855696,
|
120 |
+
"scr_dir1_threshold_500": 0.2698407892794835,
|
121 |
+
"scr_metric_threshold_500": 0.23587223695171058,
|
122 |
+
"scr_dir2_threshold_500": 0.23587223695171058
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
|
126 |
+
"scr_dir1_threshold_2": 0.1313126752015721,
|
127 |
+
"scr_metric_threshold_2": 0.08498575820403938,
|
128 |
+
"scr_dir2_threshold_2": 0.08498575820403938,
|
129 |
+
"scr_dir1_threshold_5": 0.16161592443815084,
|
130 |
+
"scr_metric_threshold_5": 0.1161473379602074,
|
131 |
+
"scr_dir2_threshold_5": 0.1161473379602074,
|
132 |
+
"scr_dir1_threshold_10": 0.17171680682792437,
|
133 |
+
"scr_metric_threshold_10": 0.15014157370114636,
|
134 |
+
"scr_dir2_threshold_10": 0.15014157370114636,
|
135 |
+
"scr_dir1_threshold_20": 0.17171680682792437,
|
136 |
+
"scr_metric_threshold_20": 0.17847032862085146,
|
137 |
+
"scr_dir2_threshold_20": 0.17847032862085146,
|
138 |
+
"scr_dir1_threshold_50": 0.3737374649596856,
|
139 |
+
"scr_metric_threshold_50": 0.2096317395253275,
|
140 |
+
"scr_dir2_threshold_50": 0.2096317395253275,
|
141 |
+
"scr_dir1_threshold_100": 0.11111091042202506,
|
142 |
+
"scr_metric_threshold_100": 0.24362614411795844,
|
143 |
+
"scr_dir2_threshold_100": 0.24362614411795844,
|
144 |
+
"scr_dir1_threshold_500": 0.03030264716932058,
|
145 |
+
"scr_metric_threshold_500": 0.07082146517003281,
|
146 |
+
"scr_dir2_threshold_500": 0.07082146517003281
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
|
150 |
+
"scr_dir1_threshold_2": 0.516128349998606,
|
151 |
+
"scr_metric_threshold_2": 0.007575810028571093,
|
152 |
+
"scr_dir2_threshold_2": 0.007575810028571093,
|
153 |
+
"scr_dir1_threshold_5": 0.5483869727270193,
|
154 |
+
"scr_metric_threshold_5": 0.022727279568944055,
|
155 |
+
"scr_dir2_threshold_5": 0.022727279568944055,
|
156 |
+
"scr_dir1_threshold_10": 0.516128349998606,
|
157 |
+
"scr_metric_threshold_10": 0.03535352927204973,
|
158 |
+
"scr_dir2_threshold_10": 0.03535352927204973,
|
159 |
+
"scr_dir1_threshold_20": 0.48387068863579336,
|
160 |
+
"scr_metric_threshold_20": 0.07070705854409946,
|
161 |
+
"scr_dir2_threshold_20": 0.07070705854409946,
|
162 |
+
"scr_dir1_threshold_50": 0.46774185795438705,
|
163 |
+
"scr_metric_threshold_50": 0.1010101481416146,
|
164 |
+
"scr_dir2_threshold_50": 0.1010101481416146,
|
165 |
+
"scr_dir1_threshold_100": 0.46774185795438705,
|
166 |
+
"scr_metric_threshold_100": 0.17171720668571405,
|
167 |
+
"scr_dir2_threshold_100": 0.17171720668571405,
|
168 |
+
"scr_dir1_threshold_500": 0.40322557386316116,
|
169 |
+
"scr_metric_threshold_500": -0.012626249703105673,
|
170 |
+
"scr_dir2_threshold_500": -0.012626249703105673
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
|
174 |
+
"scr_dir1_threshold_2": 0.1707318964255635,
|
175 |
+
"scr_metric_threshold_2": 0.03519051639302713,
|
176 |
+
"scr_dir2_threshold_2": 0.03519051639302713,
|
177 |
+
"scr_dir1_threshold_5": 0.2845529991791037,
|
178 |
+
"scr_metric_threshold_5": 0.07331372148022297,
|
179 |
+
"scr_dir2_threshold_5": 0.07331372148022297,
|
180 |
+
"scr_dir1_threshold_10": 0.15447146198048264,
|
181 |
+
"scr_metric_threshold_10": 0.10263921007231215,
|
182 |
+
"scr_dir2_threshold_10": 0.10263921007231215,
|
183 |
+
"scr_dir1_threshold_20": 0.16260192149827632,
|
184 |
+
"scr_metric_threshold_20": 0.11730195436835675,
|
185 |
+
"scr_dir2_threshold_20": 0.11730195436835675,
|
186 |
+
"scr_dir1_threshold_50": 0.12195107768082737,
|
187 |
+
"scr_metric_threshold_50": 0.17302041765206616,
|
188 |
+
"scr_dir2_threshold_50": 0.17302041765206616,
|
189 |
+
"scr_dir1_threshold_100": 0.1707318964255635,
|
190 |
+
"scr_metric_threshold_100": -0.005865202594637679,
|
191 |
+
"scr_dir2_threshold_100": -0.005865202594637679,
|
192 |
+
"scr_dir1_threshold_500": 0.5447155889858393,
|
193 |
+
"scr_metric_threshold_500": -0.07624641017439167,
|
194 |
+
"scr_dir2_threshold_500": -0.07624641017439167
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
|
198 |
+
"scr_dir1_threshold_2": 0.00546440968122594,
|
199 |
+
"scr_metric_threshold_2": 0.21875017462295412,
|
200 |
+
"scr_dir2_threshold_2": 0.21875017462295412,
|
201 |
+
"scr_dir1_threshold_5": 0.0,
|
202 |
+
"scr_metric_threshold_5": 0.371093800931695,
|
203 |
+
"scr_dir2_threshold_5": 0.371093800931695,
|
204 |
+
"scr_dir1_threshold_10": 0.05464474822904205,
|
205 |
+
"scr_metric_threshold_10": 0.417968888243172,
|
206 |
+
"scr_dir2_threshold_10": 0.417968888243172,
|
207 |
+
"scr_dir1_threshold_20": 0.10382508677685816,
|
208 |
+
"scr_metric_threshold_20": 0.4531251455191284,
|
209 |
+
"scr_dir2_threshold_20": 0.4531251455191284,
|
210 |
+
"scr_dir1_threshold_50": 0.1803277994391953,
|
211 |
+
"scr_metric_threshold_50": 0.5156250291038257,
|
212 |
+
"scr_dir2_threshold_50": 0.5156250291038257,
|
213 |
+
"scr_dir1_threshold_100": 0.10382508677685816,
|
214 |
+
"scr_metric_threshold_100": 0.5625001164153027,
|
215 |
+
"scr_dir2_threshold_100": 0.5625001164153027,
|
216 |
+
"scr_dir1_threshold_500": -0.06557389329988525,
|
217 |
+
"scr_metric_threshold_500": 0.6054687718278693,
|
218 |
+
"scr_dir2_threshold_500": 0.6054687718278693
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
|
222 |
+
"scr_dir1_threshold_2": 0.046153559299134936,
|
223 |
+
"scr_metric_threshold_2": 0.06451625307937271,
|
224 |
+
"scr_dir2_threshold_2": 0.06451625307937271,
|
225 |
+
"scr_dir1_threshold_5": 0.08717928497042846,
|
226 |
+
"scr_metric_threshold_5": 0.08870972781349516,
|
227 |
+
"scr_dir2_threshold_5": 0.08870972781349516,
|
228 |
+
"scr_dir1_threshold_10": 0.15384601277006638,
|
229 |
+
"scr_metric_threshold_10": 0.1008065853511987,
|
230 |
+
"scr_dir2_threshold_10": 0.1008065853511987,
|
231 |
+
"scr_dir1_threshold_20": 0.158974152062764,
|
232 |
+
"scr_metric_threshold_20": 0.12500006008532116,
|
233 |
+
"scr_dir2_threshold_20": 0.12500006008532116,
|
234 |
+
"scr_dir1_threshold_50": 0.20512801702675515,
|
235 |
+
"scr_metric_threshold_50": 0.20161293036111277,
|
236 |
+
"scr_dir2_threshold_50": 0.20161293036111277,
|
237 |
+
"scr_dir1_threshold_100": 0.24615374269804866,
|
238 |
+
"scr_metric_threshold_100": 0.29032265817460795,
|
239 |
+
"scr_dir2_threshold_100": 0.29032265817460795,
|
240 |
+
"scr_dir1_threshold_500": 0.35384589050412385,
|
241 |
+
"scr_metric_threshold_500": 0.3991936549900859,
|
242 |
+
"scr_dir2_threshold_500": 0.3991936549900859
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
|
246 |
+
"scr_dir1_threshold_2": 0.06756750225933819,
|
247 |
+
"scr_metric_threshold_2": 0.10267856311319266,
|
248 |
+
"scr_dir2_threshold_2": 0.10267856311319266,
|
249 |
+
"scr_dir1_threshold_5": 0.09909916440732847,
|
250 |
+
"scr_metric_threshold_5": 0.15625009146916644,
|
251 |
+
"scr_dir2_threshold_5": 0.15625009146916644,
|
252 |
+
"scr_dir1_threshold_10": 0.19819806032526952,
|
253 |
+
"scr_metric_threshold_10": 0.16517854648243513,
|
254 |
+
"scr_dir2_threshold_10": 0.16517854648243513,
|
255 |
+
"scr_dir1_threshold_20": 0.2882883318271079,
|
256 |
+
"scr_metric_threshold_20": 0.214285581239654,
|
257 |
+
"scr_dir2_threshold_20": 0.214285581239654,
|
258 |
+
"scr_dir1_threshold_50": 0.3243244404278432,
|
259 |
+
"scr_metric_threshold_50": 0.2633928820889934,
|
260 |
+
"scr_dir2_threshold_50": 0.2633928820889934,
|
261 |
+
"scr_dir1_threshold_100": 0.3783783348395588,
|
262 |
+
"scr_metric_threshold_100": 0.27232133710226214,
|
263 |
+
"scr_dir2_threshold_100": 0.27232133710226214,
|
264 |
+
"scr_dir1_threshold_500": 0.43693694419340684,
|
265 |
+
"scr_metric_threshold_500": 0.37946412772208915,
|
266 |
+
"scr_dir2_threshold_500": 0.37946412772208915
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
|
270 |
+
"scr_dir1_threshold_2": 0.021459203313710703,
|
271 |
+
"scr_metric_threshold_2": 0.021459203313710703,
|
272 |
+
"scr_dir2_threshold_2": 0.08095245934396302,
|
273 |
+
"scr_dir1_threshold_5": 0.05579403094120067,
|
274 |
+
"scr_metric_threshold_5": 0.05579403094120067,
|
275 |
+
"scr_dir2_threshold_5": 0.09523798711177515,
|
276 |
+
"scr_dir1_threshold_10": 0.0643776099411321,
|
277 |
+
"scr_metric_threshold_10": 0.0643776099411321,
|
278 |
+
"scr_dir2_threshold_10": 0.14285698066766273,
|
279 |
+
"scr_dir1_threshold_20": 0.11158806188240133,
|
280 |
+
"scr_metric_threshold_20": 0.11158806188240133,
|
281 |
+
"scr_dir2_threshold_20": 0.18095228904500887,
|
282 |
+
"scr_dir1_threshold_50": 0.1673818370097199,
|
283 |
+
"scr_metric_threshold_50": 0.1673818370097199,
|
284 |
+
"scr_dir2_threshold_50": 0.20952391224381361,
|
285 |
+
"scr_dir1_threshold_100": 0.17596567182353345,
|
286 |
+
"scr_metric_threshold_100": 0.17596567182353345,
|
287 |
+
"scr_dir2_threshold_100": 0.25714290579970117,
|
288 |
+
"scr_dir1_threshold_500": 0.18454925082346488,
|
289 |
+
"scr_metric_threshold_500": 0.18454925082346488,
|
290 |
+
"scr_dir2_threshold_500": 0.280952402577645
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
294 |
+
"sae_lens_id": "custom_sae",
|
295 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0",
|
296 |
+
"sae_lens_version": "5.4.2",
|
297 |
+
"sae_cfg_dict": {
|
298 |
+
"model_name": "gemma-2-2b",
|
299 |
+
"d_in": 2304,
|
300 |
+
"d_sae": 65536,
|
301 |
+
"hook_layer": 12,
|
302 |
+
"hook_name": "blocks.12.hook_resid_post",
|
303 |
+
"context_size": null,
|
304 |
+
"hook_head_index": null,
|
305 |
+
"architecture": "topk",
|
306 |
+
"apply_b_dec_to_input": null,
|
307 |
+
"finetuning_scaling_factor": null,
|
308 |
+
"activation_fn_str": "",
|
309 |
+
"prepend_bos": true,
|
310 |
+
"normalize_activations": "none",
|
311 |
+
"dtype": "bfloat16",
|
312 |
+
"device": "",
|
313 |
+
"dataset_path": "",
|
314 |
+
"dataset_trust_remote_code": true,
|
315 |
+
"seqpos_slice": [
|
316 |
+
null
|
317 |
+
],
|
318 |
+
"training_tokens": -100000,
|
319 |
+
"sae_lens_training_version": null,
|
320 |
+
"neuronpedia_id": null
|
321 |
+
},
|
322 |
+
"eval_result_unstructured": null
|
323 |
+
}
|
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "scr",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": true,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "e41564f8-711c-402f-be1a-c2c1e1a26d9b",
|
73 |
+
"datetime_epoch_millis": 1740159778191,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"scr_metrics": {
|
76 |
+
"scr_dir1_threshold_2": 0.21646779635345523,
|
77 |
+
"scr_metric_threshold_2": 0.10981927993643228,
|
78 |
+
"scr_dir2_threshold_2": 0.11159734942875701,
|
79 |
+
"scr_dir1_threshold_5": 0.2386560380069923,
|
80 |
+
"scr_metric_threshold_5": 0.15915722723436423,
|
81 |
+
"scr_dir2_threshold_5": 0.1647417441944568,
|
82 |
+
"scr_dir1_threshold_10": 0.2763340942473081,
|
83 |
+
"scr_metric_threshold_10": 0.20696168095936596,
|
84 |
+
"scr_dir2_threshold_10": 0.20938604847717682,
|
85 |
+
"scr_dir1_threshold_20": 0.29245964775791145,
|
86 |
+
"scr_metric_threshold_20": 0.2572680056753961,
|
87 |
+
"scr_dir2_threshold_20": 0.2562308120870308,
|
88 |
+
"scr_dir1_threshold_50": 0.2917372572864733,
|
89 |
+
"scr_metric_threshold_50": 0.35979093988002914,
|
90 |
+
"scr_dir2_threshold_50": 0.3474876389705775,
|
91 |
+
"scr_dir1_threshold_100": 0.22457255710535257,
|
92 |
+
"scr_metric_threshold_100": 0.4050724113957374,
|
93 |
+
"scr_dir2_threshold_100": 0.40371842827480603,
|
94 |
+
"scr_dir1_threshold_500": 0.06777817985891049,
|
95 |
+
"scr_metric_threshold_500": 0.31473923877780835,
|
96 |
+
"scr_dir2_threshold_500": 0.30271441063618476
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
|
102 |
+
"scr_dir1_threshold_2": 0.5079359372693867,
|
103 |
+
"scr_metric_threshold_2": 0.007371057746493725,
|
104 |
+
"scr_dir2_threshold_2": 0.007371057746493725,
|
105 |
+
"scr_dir1_threshold_5": 0.5396825246649839,
|
106 |
+
"scr_metric_threshold_5": 0.022113026790745845,
|
107 |
+
"scr_dir2_threshold_5": 0.022113026790745845,
|
108 |
+
"scr_dir1_threshold_10": 0.6031747533501614,
|
109 |
+
"scr_metric_threshold_10": 0.022113026790745845,
|
110 |
+
"scr_dir2_threshold_10": 0.022113026790745845,
|
111 |
+
"scr_dir1_threshold_20": 0.5873019327053712,
|
112 |
+
"scr_metric_threshold_20": 0.05651105219315768,
|
113 |
+
"scr_dir2_threshold_20": 0.05651105219315768,
|
114 |
+
"scr_dir1_threshold_50": 0.5714281659545642,
|
115 |
+
"scr_metric_threshold_50": 0.13513513117706122,
|
116 |
+
"scr_dir2_threshold_50": 0.13513513117706122,
|
117 |
+
"scr_dir1_threshold_100": 0.5238097040201937,
|
118 |
+
"scr_metric_threshold_100": 0.15479118753522092,
|
119 |
+
"scr_dir2_threshold_100": 0.15479118753522092,
|
120 |
+
"scr_dir1_threshold_500": 0.42857088793941894,
|
121 |
+
"scr_metric_threshold_500": -0.004913940865172265,
|
122 |
+
"scr_dir2_threshold_500": -0.004913940865172265
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
|
126 |
+
"scr_dir1_threshold_2": 0.2121209384542766,
|
127 |
+
"scr_metric_threshold_2": 0.0934844015651201,
|
128 |
+
"scr_dir2_threshold_2": 0.0934844015651201,
|
129 |
+
"scr_dir1_threshold_5": 0.25252507008062886,
|
130 |
+
"scr_metric_threshold_5": 0.15014157370114636,
|
131 |
+
"scr_dir2_threshold_5": 0.15014157370114636,
|
132 |
+
"scr_dir1_threshold_10": 0.31313096648652816,
|
133 |
+
"scr_metric_threshold_10": 0.20679891468886458,
|
134 |
+
"scr_dir2_threshold_10": 0.20679891468886458,
|
135 |
+
"scr_dir1_threshold_20": 0.3333333333333333,
|
136 |
+
"scr_metric_threshold_20": 0.23796032559334063,
|
137 |
+
"scr_dir2_threshold_20": 0.23796032559334063,
|
138 |
+
"scr_dir1_threshold_50": 0.15151504204837732,
|
139 |
+
"scr_metric_threshold_50": 0.3597733132264739,
|
140 |
+
"scr_dir2_threshold_50": 0.3597733132264739,
|
141 |
+
"scr_dir1_threshold_100": 0.11111091042202506,
|
142 |
+
"scr_metric_threshold_100": 0.43909342175758737,
|
143 |
+
"scr_dir2_threshold_100": 0.43909342175758737,
|
144 |
+
"scr_dir1_threshold_500": -0.8484855600188809,
|
145 |
+
"scr_metric_threshold_500": 0.10481586976266374,
|
146 |
+
"scr_dir2_threshold_500": 0.10481586976266374
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
|
150 |
+
"scr_dir1_threshold_2": 0.532258142045613,
|
151 |
+
"scr_metric_threshold_2": 0.017676839894409477,
|
152 |
+
"scr_dir2_threshold_2": 0.017676839894409477,
|
153 |
+
"scr_dir1_threshold_5": 0.5483869727270193,
|
154 |
+
"scr_metric_threshold_5": 0.03535352927204973,
|
155 |
+
"scr_dir2_threshold_5": 0.03535352927204973,
|
156 |
+
"scr_dir1_threshold_10": 0.5806446340898319,
|
157 |
+
"scr_metric_threshold_10": 0.06565661886956488,
|
158 |
+
"scr_dir2_threshold_10": 0.06565661886956488,
|
159 |
+
"scr_dir1_threshold_20": 0.48387068863579336,
|
160 |
+
"scr_metric_threshold_20": 0.12121220787329137,
|
161 |
+
"scr_dir2_threshold_20": 0.12121220787329137,
|
162 |
+
"scr_dir1_threshold_50": 0.532258142045613,
|
163 |
+
"scr_metric_threshold_50": 0.24242426522981353,
|
164 |
+
"scr_dir2_threshold_50": 0.24242426522981353,
|
165 |
+
"scr_dir1_threshold_100": 0.46774185795438705,
|
166 |
+
"scr_metric_threshold_100": 0.31060610393664567,
|
167 |
+
"scr_dir2_threshold_100": 0.31060610393664567,
|
168 |
+
"scr_dir1_threshold_500": 0.03225766136281265,
|
169 |
+
"scr_metric_threshold_500": 0.09090911827577622,
|
170 |
+
"scr_dir2_threshold_500": 0.09090911827577622
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
|
174 |
+
"scr_dir1_threshold_2": 0.2926829741063909,
|
175 |
+
"scr_metric_threshold_2": 0.0791787492811609,
|
176 |
+
"scr_dir2_threshold_2": 0.0791787492811609,
|
177 |
+
"scr_dir1_threshold_5": 0.2764230242518165,
|
178 |
+
"scr_metric_threshold_5": 0.15835767335602155,
|
179 |
+
"scr_dir2_threshold_5": 0.15835767335602155,
|
180 |
+
"scr_dir1_threshold_10": 0.2764230242518165,
|
181 |
+
"scr_metric_threshold_10": 0.21407613663973096,
|
182 |
+
"scr_dir2_threshold_10": 0.21407613663973096,
|
183 |
+
"scr_dir1_threshold_20": 0.1707318964255635,
|
184 |
+
"scr_metric_threshold_20": 0.28739003291365367,
|
185 |
+
"scr_dir2_threshold_20": 0.28739003291365367,
|
186 |
+
"scr_dir1_threshold_50": -0.06504076859931053,
|
187 |
+
"scr_metric_threshold_50": 0.39296193168013455,
|
188 |
+
"scr_dir2_threshold_50": 0.39296193168013455,
|
189 |
+
"scr_dir1_threshold_100": 0.032520384299655265,
|
190 |
+
"scr_metric_threshold_100": 0.4516129088643129,
|
191 |
+
"scr_dir2_threshold_100": 0.4516129088643129,
|
192 |
+
"scr_dir1_threshold_500": -0.22764220550708036,
|
193 |
+
"scr_metric_threshold_500": 0.09677418227137422,
|
194 |
+
"scr_dir2_threshold_500": 0.09677418227137422
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
|
198 |
+
"scr_dir1_threshold_2": 0.021857964433295084,
|
199 |
+
"scr_metric_threshold_2": 0.3671876018633899,
|
200 |
+
"scr_dir2_threshold_2": 0.3671876018633899,
|
201 |
+
"scr_dir1_threshold_5": 0.00546440968122594,
|
202 |
+
"scr_metric_threshold_5": 0.43750011641530273,
|
203 |
+
"scr_dir2_threshold_5": 0.43750011641530273,
|
204 |
+
"scr_dir1_threshold_10": 0.04918033854781611,
|
205 |
+
"scr_metric_threshold_10": 0.5078126309672156,
|
206 |
+
"scr_dir2_threshold_10": 0.5078126309672156,
|
207 |
+
"scr_dir1_threshold_20": 0.1803277994391953,
|
208 |
+
"scr_metric_threshold_20": 0.5664063154836078,
|
209 |
+
"scr_dir2_threshold_20": 0.5664063154836078,
|
210 |
+
"scr_dir1_threshold_50": 0.1912569445100385,
|
211 |
+
"scr_metric_threshold_50": 0.6562500582076514,
|
212 |
+
"scr_dir2_threshold_50": 0.6562500582076514,
|
213 |
+
"scr_dir1_threshold_100": -0.1092894964580841,
|
214 |
+
"scr_metric_threshold_100": 0.63281239813661,
|
215 |
+
"scr_dir2_threshold_100": 0.63281239813661,
|
216 |
+
"scr_dir1_threshold_500": -0.09289626741440628,
|
217 |
+
"scr_metric_threshold_500": 0.7812500582076514,
|
218 |
+
"scr_dir2_threshold_500": 0.7812500582076514
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
|
222 |
+
"scr_dir1_threshold_2": 0.07692300638503319,
|
223 |
+
"scr_metric_threshold_2": 0.056451601273808806,
|
224 |
+
"scr_dir2_threshold_2": 0.056451601273808806,
|
225 |
+
"scr_dir1_threshold_5": 0.12307687134902433,
|
226 |
+
"scr_metric_threshold_5": 0.0927419335456348,
|
227 |
+
"scr_dir2_threshold_5": 0.0927419335456348,
|
228 |
+
"scr_dir1_threshold_10": 0.15384601277006638,
|
229 |
+
"scr_metric_threshold_10": 0.12500006008532116,
|
230 |
+
"scr_dir2_threshold_10": 0.12500006008532116,
|
231 |
+
"scr_dir1_threshold_20": 0.23076901915509954,
|
232 |
+
"scr_metric_threshold_20": 0.16935480382142643,
|
233 |
+
"scr_dir2_threshold_20": 0.16935480382142643,
|
234 |
+
"scr_dir1_threshold_50": 0.3333333333333333,
|
235 |
+
"scr_metric_threshold_50": 0.3064517214444511,
|
236 |
+
"scr_dir2_threshold_50": 0.3064517214444511,
|
237 |
+
"scr_dir1_threshold_100": 0.36410247475437535,
|
238 |
+
"scr_metric_threshold_100": 0.39112900318452204,
|
239 |
+
"scr_dir2_threshold_100": 0.39112900318452204,
|
240 |
+
"scr_dir1_threshold_500": 0.45641020468235766,
|
241 |
+
"scr_metric_threshold_500": 0.43145154118848766,
|
242 |
+
"scr_dir2_threshold_500": 0.43145154118848766
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
|
246 |
+
"scr_dir1_threshold_2": 0.045045001506225466,
|
247 |
+
"scr_metric_threshold_2": 0.214285581239654,
|
248 |
+
"scr_dir2_threshold_2": 0.214285581239654,
|
249 |
+
"scr_dir1_threshold_5": 0.10360361086007354,
|
250 |
+
"scr_metric_threshold_5": 0.31696414435284664,
|
251 |
+
"scr_dir2_threshold_5": 0.31696414435284664,
|
252 |
+
"scr_dir1_threshold_10": 0.14414416591355395,
|
253 |
+
"scr_metric_threshold_10": 0.4241072010647942,
|
254 |
+
"scr_dir2_threshold_10": 0.4241072010647942,
|
255 |
+
"scr_dir1_threshold_20": 0.21171166817289214,
|
256 |
+
"scr_metric_threshold_20": 0.47767846332864744,
|
257 |
+
"scr_dir2_threshold_20": 0.47767846332864744,
|
258 |
+
"scr_dir1_threshold_50": 0.38738749623443636,
|
259 |
+
"scr_metric_threshold_50": 0.5535713953099135,
|
260 |
+
"scr_dir2_threshold_50": 0.5535713953099135,
|
261 |
+
"scr_dir1_threshold_100": 0.15765750527178912,
|
262 |
+
"scr_metric_threshold_100": 0.6116071511725216,
|
263 |
+
"scr_dir2_threshold_100": 0.6116071511725216,
|
264 |
+
"scr_dir1_threshold_500": 0.4549549984937745,
|
265 |
+
"scr_metric_threshold_500": 0.6785713620483984,
|
266 |
+
"scr_dir2_threshold_500": 0.6785713620483984
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
|
270 |
+
"scr_dir1_threshold_2": 0.042918406627421406,
|
271 |
+
"scr_metric_threshold_2": 0.042918406627421406,
|
272 |
+
"scr_dir2_threshold_2": 0.05714296256601923,
|
273 |
+
"scr_dir1_threshold_5": 0.060085820441166386,
|
274 |
+
"scr_metric_threshold_5": 0.060085820441166386,
|
275 |
+
"scr_dir2_threshold_5": 0.10476195612190681,
|
276 |
+
"scr_dir1_threshold_10": 0.09012885856869063,
|
277 |
+
"scr_metric_threshold_10": 0.09012885856869063,
|
278 |
+
"scr_dir2_threshold_10": 0.10952379871117751,
|
279 |
+
"scr_dir1_threshold_20": 0.1416308441960435,
|
280 |
+
"scr_metric_threshold_20": 0.1416308441960435,
|
281 |
+
"scr_dir2_threshold_20": 0.1333332954891213,
|
282 |
+
"scr_dir1_threshold_50": 0.23175970276473412,
|
283 |
+
"scr_metric_threshold_50": 0.23175970276473412,
|
284 |
+
"scr_dir2_threshold_50": 0.1333332954891213,
|
285 |
+
"scr_dir1_threshold_100": 0.2489271165784791,
|
286 |
+
"scr_metric_threshold_100": 0.2489271165784791,
|
287 |
+
"scr_dir2_threshold_100": 0.2380952516110281,
|
288 |
+
"scr_dir1_threshold_500": 0.33905571933328765,
|
289 |
+
"scr_metric_threshold_500": 0.33905571933328765,
|
290 |
+
"scr_dir2_threshold_500": 0.24285709420029883
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
294 |
+
"sae_lens_id": "custom_sae",
|
295 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0",
|
296 |
+
"sae_lens_version": "5.4.2",
|
297 |
+
"sae_cfg_dict": {
|
298 |
+
"model_name": "gemma-2-2b",
|
299 |
+
"d_in": 2304,
|
300 |
+
"d_sae": 65536,
|
301 |
+
"hook_layer": 12,
|
302 |
+
"hook_name": "blocks.12.hook_resid_post",
|
303 |
+
"context_size": null,
|
304 |
+
"hook_head_index": null,
|
305 |
+
"architecture": "topk",
|
306 |
+
"apply_b_dec_to_input": null,
|
307 |
+
"finetuning_scaling_factor": null,
|
308 |
+
"activation_fn_str": "",
|
309 |
+
"prepend_bos": true,
|
310 |
+
"normalize_activations": "none",
|
311 |
+
"dtype": "bfloat16",
|
312 |
+
"device": "",
|
313 |
+
"dataset_path": "",
|
314 |
+
"dataset_trust_remote_code": true,
|
315 |
+
"seqpos_slice": [
|
316 |
+
null
|
317 |
+
],
|
318 |
+
"training_tokens": -100000,
|
319 |
+
"sae_lens_training_version": null,
|
320 |
+
"neuronpedia_id": null
|
321 |
+
},
|
322 |
+
"eval_result_unstructured": null
|
323 |
+
}
|
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "scr",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": true,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "e84e2919-45d8-4739-935c-c4c49e6f01bd",
|
73 |
+
"datetime_epoch_millis": 1740159333994,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"scr_metrics": {
|
76 |
+
"scr_dir1_threshold_2": 0.13902672439841735,
|
77 |
+
"scr_metric_threshold_2": 0.052679827874983993,
|
78 |
+
"scr_dir2_threshold_2": 0.05023245750763616,
|
79 |
+
"scr_dir1_threshold_5": 0.15256902306733214,
|
80 |
+
"scr_metric_threshold_5": 0.08124373333242407,
|
81 |
+
"scr_dir2_threshold_5": 0.08373451444399863,
|
82 |
+
"scr_dir1_threshold_10": 0.18434644922627724,
|
83 |
+
"scr_metric_threshold_10": 0.11066224908587924,
|
84 |
+
"scr_dir2_threshold_10": 0.11285158655682409,
|
85 |
+
"scr_dir1_threshold_20": 0.1730640453138616,
|
86 |
+
"scr_metric_threshold_20": 0.13485719572436072,
|
87 |
+
"scr_dir2_threshold_20": 0.13591482918415104,
|
88 |
+
"scr_dir1_threshold_50": 0.16112645779738913,
|
89 |
+
"scr_metric_threshold_50": 0.17958703268862003,
|
90 |
+
"scr_dir2_threshold_50": 0.18450987025234128,
|
91 |
+
"scr_dir1_threshold_100": 0.16878289626201945,
|
92 |
+
"scr_metric_threshold_100": 0.17936453816540665,
|
93 |
+
"scr_dir2_threshold_100": 0.1792240185413299,
|
94 |
+
"scr_dir1_threshold_500": 0.0930010293225678,
|
95 |
+
"scr_metric_threshold_500": 0.1838392653323654,
|
96 |
+
"scr_dir2_threshold_500": 0.18834314598313068
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
|
102 |
+
"scr_dir1_threshold_2": 0.2857136099242737,
|
103 |
+
"scr_metric_threshold_2": 0.01474211549298745,
|
104 |
+
"scr_dir2_threshold_2": 0.01474211549298745,
|
105 |
+
"scr_dir1_threshold_5": 0.31746019731987085,
|
106 |
+
"scr_metric_threshold_5": 0.017199085925573582,
|
107 |
+
"scr_dir2_threshold_5": 0.017199085925573582,
|
108 |
+
"scr_dir1_threshold_10": 0.3492058386094512,
|
109 |
+
"scr_metric_threshold_10": 0.024570143672067307,
|
110 |
+
"scr_dir2_threshold_10": 0.024570143672067307,
|
111 |
+
"scr_dir1_threshold_20": 0.222222327345113,
|
112 |
+
"scr_metric_threshold_20": 0.036855142283733294,
|
113 |
+
"scr_dir2_threshold_20": 0.036855142283733294,
|
114 |
+
"scr_dir1_threshold_50": 0.23809514798990317,
|
115 |
+
"scr_metric_threshold_50": 0.05651105219315768,
|
116 |
+
"scr_dir2_threshold_50": 0.05651105219315768,
|
117 |
+
"scr_dir1_threshold_100": 0.23809514798990317,
|
118 |
+
"scr_metric_threshold_100": 0.06879605080482366,
|
119 |
+
"scr_dir2_threshold_100": 0.06879605080482366,
|
120 |
+
"scr_dir1_threshold_500": 0.15873009865993543,
|
121 |
+
"scr_metric_threshold_500": 0.13022119031188895,
|
122 |
+
"scr_dir2_threshold_500": 0.13022119031188895
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
|
126 |
+
"scr_dir1_threshold_2": 0.2121209384542766,
|
127 |
+
"scr_metric_threshold_2": 0.05099152246310042,
|
128 |
+
"scr_dir2_threshold_2": 0.05099152246310042,
|
129 |
+
"scr_dir1_threshold_5": 0.19191917367472955,
|
130 |
+
"scr_metric_threshold_5": 0.07648728369465063,
|
131 |
+
"scr_dir2_threshold_5": 0.07648728369465063,
|
132 |
+
"scr_dir1_threshold_10": 0.2323233053010818,
|
133 |
+
"scr_metric_threshold_10": 0.10764869459912667,
|
134 |
+
"scr_dir2_threshold_10": 0.10764869459912667,
|
135 |
+
"scr_dir1_threshold_20": 0.2828283193172076,
|
136 |
+
"scr_metric_threshold_20": 0.15014157370114636,
|
137 |
+
"scr_dir2_threshold_20": 0.15014157370114636,
|
138 |
+
"scr_dir1_threshold_50": 0.20202005606450307,
|
139 |
+
"scr_metric_threshold_50": 0.18130315345731438,
|
140 |
+
"scr_dir2_threshold_50": 0.18130315345731438,
|
141 |
+
"scr_dir1_threshold_100": 0.16161592443815084,
|
142 |
+
"scr_metric_threshold_100": 0.21246456436179043,
|
143 |
+
"scr_dir2_threshold_100": 0.21246456436179043,
|
144 |
+
"scr_dir1_threshold_500": -0.07070738086293096,
|
145 |
+
"scr_metric_threshold_500": 0.07932010853111354,
|
146 |
+
"scr_dir2_threshold_500": 0.07932010853111354
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
|
150 |
+
"scr_dir1_threshold_2": 0.258064174999303,
|
151 |
+
"scr_metric_threshold_2": 0.017676839894409477,
|
152 |
+
"scr_dir2_threshold_2": 0.017676839894409477,
|
153 |
+
"scr_dir1_threshold_5": 0.29032183636211567,
|
154 |
+
"scr_metric_threshold_5": 0.03535352927204973,
|
155 |
+
"scr_dir2_threshold_5": 0.03535352927204973,
|
156 |
+
"scr_dir1_threshold_10": 0.33870928977193526,
|
157 |
+
"scr_metric_threshold_10": 0.06565661886956488,
|
158 |
+
"scr_dir2_threshold_10": 0.06565661886956488,
|
159 |
+
"scr_dir1_threshold_20": 0.37096695113474787,
|
160 |
+
"scr_metric_threshold_20": 0.06565661886956488,
|
161 |
+
"scr_dir2_threshold_20": 0.06565661886956488,
|
162 |
+
"scr_dir1_threshold_50": 0.20967672158948342,
|
163 |
+
"scr_metric_threshold_50": 0.08838389843850893,
|
164 |
+
"scr_dir2_threshold_50": 0.08838389843850893,
|
165 |
+
"scr_dir1_threshold_100": 0.22580651363649037,
|
166 |
+
"scr_metric_threshold_100": 0.0959595579503108,
|
167 |
+
"scr_dir2_threshold_100": 0.0959595579503108,
|
168 |
+
"scr_dir1_threshold_500": 0.11290277613544487,
|
169 |
+
"scr_metric_threshold_500": 0.04545455913788811,
|
170 |
+
"scr_dir2_threshold_500": 0.04545455913788811
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
|
174 |
+
"scr_dir1_threshold_2": 0.13821151212590824,
|
175 |
+
"scr_metric_threshold_2": 0.052785774589540695,
|
176 |
+
"scr_dir2_threshold_2": 0.052785774589540695,
|
177 |
+
"scr_dir1_threshold_5": 0.1869918462801379,
|
178 |
+
"scr_metric_threshold_5": 0.0791787492811609,
|
179 |
+
"scr_dir2_threshold_5": 0.0791787492811609,
|
180 |
+
"scr_dir1_threshold_10": 0.21951223057979316,
|
181 |
+
"scr_metric_threshold_10": 0.1114369265674188,
|
182 |
+
"scr_dir2_threshold_10": 0.1114369265674188,
|
183 |
+
"scr_dir1_threshold_20": 0.23577218043436754,
|
184 |
+
"scr_metric_threshold_20": 0.13782990125903902,
|
185 |
+
"scr_dir2_threshold_20": 0.13782990125903902,
|
186 |
+
"scr_dir1_threshold_50": 0.23577218043436754,
|
187 |
+
"scr_metric_threshold_50": 0.17008790375159719,
|
188 |
+
"scr_dir2_threshold_50": 0.17008790375159719,
|
189 |
+
"scr_dir1_threshold_100": 0.21951223057979316,
|
190 |
+
"scr_metric_threshold_100": 0.052785774589540695,
|
191 |
+
"scr_dir2_threshold_100": 0.052785774589540695,
|
192 |
+
"scr_dir1_threshold_500": 0.11382110275354018,
|
193 |
+
"scr_metric_threshold_500": 0.10850441266694984,
|
194 |
+
"scr_dir2_threshold_500": 0.10850441266694984
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
|
198 |
+
"scr_dir1_threshold_2": 0.00546440968122594,
|
199 |
+
"scr_metric_threshold_2": 0.10937497089617432,
|
200 |
+
"scr_dir2_threshold_2": 0.10937497089617432,
|
201 |
+
"scr_dir1_threshold_5": 0.021857964433295084,
|
202 |
+
"scr_metric_threshold_5": 0.21093754365573852,
|
203 |
+
"scr_dir2_threshold_5": 0.21093754365573852,
|
204 |
+
"scr_dir1_threshold_10": 0.05464474822904205,
|
205 |
+
"scr_metric_threshold_10": 0.28125005820765137,
|
206 |
+
"scr_dir2_threshold_10": 0.28125005820765137,
|
207 |
+
"scr_dir1_threshold_20": -0.06010915791026799,
|
208 |
+
"scr_metric_threshold_20": 0.3320313445874335,
|
209 |
+
"scr_dir2_threshold_20": 0.3320313445874335,
|
210 |
+
"scr_dir1_threshold_50": -0.04918033854781611,
|
211 |
+
"scr_metric_threshold_50": 0.3984374272404358,
|
212 |
+
"scr_dir2_threshold_50": 0.3984374272404358,
|
213 |
+
"scr_dir1_threshold_100": -0.016393554752069144,
|
214 |
+
"scr_metric_threshold_100": 0.42578128637978213,
|
215 |
+
"scr_dir2_threshold_100": 0.42578128637978213,
|
216 |
+
"scr_dir1_threshold_500": -0.03825151918536423,
|
217 |
+
"scr_metric_threshold_500": 0.48828116996447934,
|
218 |
+
"scr_dir2_threshold_500": 0.48828116996447934
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
|
222 |
+
"scr_dir1_threshold_2": 0.09743586922067994,
|
223 |
+
"scr_metric_threshold_2": 0.05241939554166917,
|
224 |
+
"scr_dir2_threshold_2": 0.05241939554166917,
|
225 |
+
"scr_dir1_threshold_5": 0.09743586922067994,
|
226 |
+
"scr_metric_threshold_5": 0.07661287027579163,
|
227 |
+
"scr_dir2_threshold_5": 0.07661287027579163,
|
228 |
+
"scr_dir1_threshold_10": 0.11282028709877284,
|
229 |
+
"scr_metric_threshold_10": 0.09677413927777444,
|
230 |
+
"scr_dir2_threshold_10": 0.09677413927777444,
|
231 |
+
"scr_dir1_threshold_20": 0.13846128922711723,
|
232 |
+
"scr_metric_threshold_20": 0.14516132908730398,
|
233 |
+
"scr_dir2_threshold_20": 0.14516132908730398,
|
234 |
+
"scr_dir1_threshold_50": 0.1692307363130155,
|
235 |
+
"scr_metric_threshold_50": 0.23790326263293876,
|
236 |
+
"scr_dir2_threshold_50": 0.23790326263293876,
|
237 |
+
"scr_dir1_threshold_100": 0.18974359914866223,
|
238 |
+
"scr_metric_threshold_100": 0.2580645316349216,
|
239 |
+
"scr_dir2_threshold_100": 0.2580645316349216,
|
240 |
+
"scr_dir1_threshold_500": 0.14871787347736873,
|
241 |
+
"scr_metric_threshold_500": 0.3064517214444511,
|
242 |
+
"scr_dir2_threshold_500": 0.3064517214444511
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
|
246 |
+
"scr_dir1_threshold_2": 0.07657666365421574,
|
247 |
+
"scr_metric_threshold_2": 0.08482138699453473,
|
248 |
+
"scr_dir2_threshold_2": 0.08482138699453473,
|
249 |
+
"scr_dir1_threshold_5": 0.06306305580659313,
|
250 |
+
"scr_metric_threshold_5": 0.10267856311319266,
|
251 |
+
"scr_dir2_threshold_5": 0.10267856311319266,
|
252 |
+
"scr_dir1_threshold_10": 0.09459444946519599,
|
253 |
+
"scr_metric_threshold_10": 0.12499996673848493,
|
254 |
+
"scr_dir2_threshold_10": 0.12499996673848493,
|
255 |
+
"scr_dir1_threshold_20": 0.11711721870769615,
|
256 |
+
"scr_metric_threshold_20": 0.13392842175175362,
|
257 |
+
"scr_dir2_threshold_20": 0.13392842175175362,
|
258 |
+
"scr_dir1_threshold_50": 0.18468472096703434,
|
259 |
+
"scr_metric_threshold_50": 0.20535712622638533,
|
260 |
+
"scr_dir2_threshold_50": 0.20535712622638533,
|
261 |
+
"scr_dir1_threshold_100": 0.21171166817289214,
|
262 |
+
"scr_metric_threshold_100": 0.20089289871975097,
|
263 |
+
"scr_dir2_threshold_100": 0.20089289871975097,
|
264 |
+
"scr_dir1_threshold_500": 0.20720722172014708,
|
265 |
+
"scr_metric_threshold_500": 0.20089289871975097,
|
266 |
+
"scr_dir2_threshold_500": 0.20089289871975097
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
|
270 |
+
"scr_dir1_threshold_2": 0.03862661712745569,
|
271 |
+
"scr_metric_threshold_2": 0.03862661712745569,
|
272 |
+
"scr_dir2_threshold_2": 0.019047654188673074,
|
273 |
+
"scr_dir1_threshold_5": 0.05150224144123495,
|
274 |
+
"scr_metric_threshold_5": 0.05150224144123495,
|
275 |
+
"scr_dir2_threshold_5": 0.07142849033383136,
|
276 |
+
"scr_dir1_threshold_10": 0.07296144475494565,
|
277 |
+
"scr_metric_threshold_10": 0.07296144475494565,
|
278 |
+
"scr_dir2_threshold_10": 0.09047614452250444,
|
279 |
+
"scr_dir1_threshold_20": 0.07725323425491137,
|
280 |
+
"scr_metric_threshold_20": 0.07725323425491137,
|
281 |
+
"scr_dir2_threshold_20": 0.08571430193323373,
|
282 |
+
"scr_dir1_threshold_50": 0.09871243756862208,
|
283 |
+
"scr_metric_threshold_50": 0.09871243756862208,
|
284 |
+
"scr_dir2_threshold_50": 0.13809513807839202,
|
285 |
+
"scr_dir1_threshold_100": 0.12017164088233277,
|
286 |
+
"scr_metric_threshold_100": 0.12017164088233277,
|
287 |
+
"scr_dir2_threshold_100": 0.11904748388971893,
|
288 |
+
"scr_dir1_threshold_500": 0.11158806188240133,
|
289 |
+
"scr_metric_threshold_500": 0.11158806188240133,
|
290 |
+
"scr_dir2_threshold_500": 0.14761910708852366
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
294 |
+
"sae_lens_id": "custom_sae",
|
295 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0",
|
296 |
+
"sae_lens_version": "5.4.2",
|
297 |
+
"sae_cfg_dict": {
|
298 |
+
"model_name": "gemma-2-2b",
|
299 |
+
"d_in": 2304,
|
300 |
+
"d_sae": 65536,
|
301 |
+
"hook_layer": 12,
|
302 |
+
"hook_name": "blocks.12.hook_resid_post",
|
303 |
+
"context_size": null,
|
304 |
+
"hook_head_index": null,
|
305 |
+
"architecture": "topk",
|
306 |
+
"apply_b_dec_to_input": null,
|
307 |
+
"finetuning_scaling_factor": null,
|
308 |
+
"activation_fn_str": "",
|
309 |
+
"prepend_bos": true,
|
310 |
+
"normalize_activations": "none",
|
311 |
+
"dtype": "bfloat16",
|
312 |
+
"device": "",
|
313 |
+
"dataset_path": "",
|
314 |
+
"dataset_trust_remote_code": true,
|
315 |
+
"seqpos_slice": [
|
316 |
+
null
|
317 |
+
],
|
318 |
+
"training_tokens": -100000,
|
319 |
+
"sae_lens_training_version": null,
|
320 |
+
"neuronpedia_id": null
|
321 |
+
},
|
322 |
+
"eval_result_unstructured": null
|
323 |
+
}
|
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "scr",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": true,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "744efb4f-9f9c-4a65-bd36-c9b19a4ffd71",
|
73 |
+
"datetime_epoch_millis": 1740160219317,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"scr_metrics": {
|
76 |
+
"scr_dir1_threshold_2": 0.1334585580077991,
|
77 |
+
"scr_metric_threshold_2": 0.07676722148137892,
|
78 |
+
"scr_dir2_threshold_2": 0.07675956452377836,
|
79 |
+
"scr_dir1_threshold_5": 0.1798632781490608,
|
80 |
+
"scr_metric_threshold_5": 0.12875666458236273,
|
81 |
+
"scr_dir2_threshold_5": 0.13481889859378787,
|
82 |
+
"scr_dir1_threshold_10": 0.18356951723363868,
|
83 |
+
"scr_metric_threshold_10": 0.18907230490584973,
|
84 |
+
"scr_dir2_threshold_10": 0.1949506085489714,
|
85 |
+
"scr_dir1_threshold_20": 0.17099327770246797,
|
86 |
+
"scr_metric_threshold_20": 0.237307483183272,
|
87 |
+
"scr_dir2_threshold_20": 0.24175260369566065,
|
88 |
+
"scr_dir1_threshold_50": 0.07798598324296178,
|
89 |
+
"scr_metric_threshold_50": 0.2928094354498369,
|
90 |
+
"scr_dir2_threshold_50": 0.30373578236555543,
|
91 |
+
"scr_dir1_threshold_100": 0.0037336381161294126,
|
92 |
+
"scr_metric_threshold_100": 0.2911327859028171,
|
93 |
+
"scr_dir2_threshold_100": 0.3050353199157786,
|
94 |
+
"scr_dir1_threshold_500": -0.09587675427885789,
|
95 |
+
"scr_metric_threshold_500": 0.2517752817013341,
|
96 |
+
"scr_dir2_threshold_500": 0.2716966035025452
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
|
102 |
+
"scr_dir1_threshold_2": 0.333333017964661,
|
103 |
+
"scr_metric_threshold_2": 0.009828028179079856,
|
104 |
+
"scr_dir2_threshold_2": 0.009828028179079856,
|
105 |
+
"scr_dir1_threshold_5": 0.3809524260050484,
|
106 |
+
"scr_metric_threshold_5": 0.019656056358159712,
|
107 |
+
"scr_dir2_threshold_5": 0.019656056358159712,
|
108 |
+
"scr_dir1_threshold_10": 0.3650786592542414,
|
109 |
+
"scr_metric_threshold_10": 0.05651105219315768,
|
110 |
+
"scr_dir2_threshold_10": 0.05651105219315768,
|
111 |
+
"scr_dir1_threshold_20": 0.31746019731987085,
|
112 |
+
"scr_metric_threshold_20": 0.09336619447689097,
|
113 |
+
"scr_dir2_threshold_20": 0.09336619447689097,
|
114 |
+
"scr_dir1_threshold_50": 0.1746029193047256,
|
115 |
+
"scr_metric_threshold_50": 0.12285013256539522,
|
116 |
+
"scr_dir2_threshold_50": 0.12285013256539522,
|
117 |
+
"scr_dir1_threshold_100": 0.20634856059430595,
|
118 |
+
"scr_metric_threshold_100": 0.1474201297887272,
|
119 |
+
"scr_dir2_threshold_100": 0.1474201297887272,
|
120 |
+
"scr_dir1_threshold_500": 0.14285633190912841,
|
121 |
+
"scr_metric_threshold_500": 0.05896816907447914,
|
122 |
+
"scr_dir2_threshold_500": 0.05896816907447914
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
|
126 |
+
"scr_dir1_threshold_2": 0.08080766118544634,
|
127 |
+
"scr_metric_threshold_2": 0.11048151943558958,
|
128 |
+
"scr_dir2_threshold_2": 0.11048151943558958,
|
129 |
+
"scr_dir1_threshold_5": 0.16161592443815084,
|
130 |
+
"scr_metric_threshold_5": 0.1671388604233078,
|
131 |
+
"scr_dir2_threshold_5": 0.1671388604233078,
|
132 |
+
"scr_dir1_threshold_10": 0.12121179281179859,
|
133 |
+
"scr_metric_threshold_10": 0.24079315042980354,
|
134 |
+
"scr_dir2_threshold_10": 0.24079315042980354,
|
135 |
+
"scr_dir1_threshold_20": 0.12121179281179859,
|
136 |
+
"scr_metric_threshold_20": 0.2861190232199781,
|
137 |
+
"scr_dir2_threshold_20": 0.2861190232199781,
|
138 |
+
"scr_dir1_threshold_50": -0.2828283193172076,
|
139 |
+
"scr_metric_threshold_50": 0.3456090201924673,
|
140 |
+
"scr_dir2_threshold_50": 0.3456090201924673,
|
141 |
+
"scr_dir1_threshold_100": -0.31313156855378627,
|
142 |
+
"scr_metric_threshold_100": 0.20679891468886458,
|
143 |
+
"scr_dir2_threshold_100": 0.20679891468886458,
|
144 |
+
"scr_dir1_threshold_500": -0.7373740475295977,
|
145 |
+
"scr_metric_threshold_500": 0.28895184805644103,
|
146 |
+
"scr_dir2_threshold_500": 0.28895184805644103
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
|
150 |
+
"scr_dir1_threshold_2": 0.37096695113474787,
|
151 |
+
"scr_metric_threshold_2": 0.010101029865838383,
|
152 |
+
"scr_dir2_threshold_2": 0.010101029865838383,
|
153 |
+
"scr_dir1_threshold_5": 0.41935440454456746,
|
154 |
+
"scr_metric_threshold_5": 0.042929339300620824,
|
155 |
+
"scr_dir2_threshold_5": 0.042929339300620824,
|
156 |
+
"scr_dir1_threshold_10": 0.45161302727298075,
|
157 |
+
"scr_metric_threshold_10": 0.09343433811304351,
|
158 |
+
"scr_dir2_threshold_10": 0.09343433811304351,
|
159 |
+
"scr_dir1_threshold_20": 0.37096695113474787,
|
160 |
+
"scr_metric_threshold_20": 0.11868683751925485,
|
161 |
+
"scr_dir2_threshold_20": 0.11868683751925485,
|
162 |
+
"scr_dir1_threshold_50": 0.16129022954526445,
|
163 |
+
"scr_metric_threshold_50": 0.16919198684844677,
|
164 |
+
"scr_dir2_threshold_50": 0.16919198684844677,
|
165 |
+
"scr_dir1_threshold_100": 0.09677394545403856,
|
166 |
+
"scr_metric_threshold_100": 0.2045455161204965,
|
167 |
+
"scr_dir2_threshold_100": 0.2045455161204965,
|
168 |
+
"scr_dir1_threshold_500": -0.016129792047006934,
|
169 |
+
"scr_metric_threshold_500": 0.08838389843850893,
|
170 |
+
"scr_dir2_threshold_500": 0.08838389843850893
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
|
174 |
+
"scr_dir1_threshold_2": 0.13008153719862106,
|
175 |
+
"scr_metric_threshold_2": 0.1671553898511282,
|
176 |
+
"scr_dir2_threshold_2": 0.1671553898511282,
|
177 |
+
"scr_dir1_threshold_5": 0.22764220550708036,
|
178 |
+
"scr_metric_threshold_5": 0.23167156962994426,
|
179 |
+
"scr_dir2_threshold_5": 0.23167156962994426,
|
180 |
+
"scr_dir1_threshold_10": 0.13821151212590824,
|
181 |
+
"scr_metric_threshold_10": 0.27565980251807803,
|
182 |
+
"scr_dir2_threshold_10": 0.27565980251807803,
|
183 |
+
"scr_dir1_threshold_20": 0.13821151212590824,
|
184 |
+
"scr_metric_threshold_20": 0.34604101009783206,
|
185 |
+
"scr_dir2_threshold_20": 0.34604101009783206,
|
186 |
+
"scr_dir1_threshold_50": 0.08130071845388491,
|
187 |
+
"scr_metric_threshold_50": 0.38416421518502786,
|
188 |
+
"scr_dir2_threshold_50": 0.38416421518502786,
|
189 |
+
"scr_dir1_threshold_100": -0.5365851294680456,
|
190 |
+
"scr_metric_threshold_100": 0.4105571898766481,
|
191 |
+
"scr_dir2_threshold_100": 0.4105571898766481,
|
192 |
+
"scr_dir1_threshold_500": -0.658536207148873,
|
193 |
+
"scr_metric_threshold_500": 0.1964808784432174,
|
194 |
+
"scr_dir2_threshold_500": 0.1964808784432174
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
|
198 |
+
"scr_dir1_threshold_2": 0.03825119347697291,
|
199 |
+
"scr_metric_threshold_2": 0.10156257275956422,
|
200 |
+
"scr_dir2_threshold_2": 0.10156257275956422,
|
201 |
+
"scr_dir1_threshold_5": 0.04371592886659017,
|
202 |
+
"scr_metric_threshold_5": 0.22265637369125918,
|
203 |
+
"scr_dir2_threshold_5": 0.22265637369125918,
|
204 |
+
"scr_dir1_threshold_10": 0.06557389329988525,
|
205 |
+
"scr_metric_threshold_10": 0.3320313445874335,
|
206 |
+
"scr_dir2_threshold_10": 0.3320313445874335,
|
207 |
+
"scr_dir1_threshold_20": 0.01092881936245188,
|
208 |
+
"scr_metric_threshold_20": 0.42578128637978213,
|
209 |
+
"scr_dir2_threshold_20": 0.42578128637978213,
|
210 |
+
"scr_dir1_threshold_50": -0.00546440968122594,
|
211 |
+
"scr_metric_threshold_50": 0.542968888243172,
|
212 |
+
"scr_dir2_threshold_50": 0.542968888243172,
|
213 |
+
"scr_dir1_threshold_100": 0.01092881936245188,
|
214 |
+
"scr_metric_threshold_100": 0.5859375436557386,
|
215 |
+
"scr_dir2_threshold_100": 0.5859375436557386,
|
216 |
+
"scr_dir1_threshold_500": -0.03825151918536423,
|
217 |
+
"scr_metric_threshold_500": 0.578124912688523,
|
218 |
+
"scr_dir2_threshold_500": 0.578124912688523
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
|
222 |
+
"scr_dir1_threshold_2": 0.03076914142104203,
|
223 |
+
"scr_metric_threshold_2": 0.060483807005948444,
|
224 |
+
"scr_dir2_threshold_2": 0.060483807005948444,
|
225 |
+
"scr_dir1_threshold_5": 0.08205114567773082,
|
226 |
+
"scr_metric_threshold_5": 0.08467752208135552,
|
227 |
+
"scr_dir2_threshold_5": 0.08467752208135552,
|
228 |
+
"scr_dir1_threshold_10": 0.12820501064172196,
|
229 |
+
"scr_metric_threshold_10": 0.1290322658174608,
|
230 |
+
"scr_dir2_threshold_10": 0.1290322658174608,
|
231 |
+
"scr_dir1_threshold_20": 0.14871787347736873,
|
232 |
+
"scr_metric_threshold_20": 0.17741945562699032,
|
233 |
+
"scr_dir2_threshold_20": 0.17741945562699032,
|
234 |
+
"scr_dir1_threshold_50": 0.09230742426312609,
|
235 |
+
"scr_metric_threshold_50": 0.2701613891726251,
|
236 |
+
"scr_dir2_threshold_50": 0.2701613891726251,
|
237 |
+
"scr_dir1_threshold_100": 0.15384601277006638,
|
238 |
+
"scr_metric_threshold_100": 0.3064517214444511,
|
239 |
+
"scr_dir2_threshold_100": 0.3064517214444511,
|
240 |
+
"scr_dir1_threshold_500": 0.15384601277006638,
|
241 |
+
"scr_metric_threshold_500": 0.3790323859881031,
|
242 |
+
"scr_dir2_threshold_500": 0.3790323859881031
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
|
246 |
+
"scr_dir1_threshold_2": 0.0405405550534804,
|
247 |
+
"scr_metric_threshold_2": 0.11160701812646134,
|
248 |
+
"scr_dir2_threshold_2": 0.11160701812646134,
|
249 |
+
"scr_dir1_threshold_5": 0.07207194871208326,
|
250 |
+
"scr_metric_threshold_5": 0.20982135373301966,
|
251 |
+
"scr_dir2_threshold_5": 0.20982135373301966,
|
252 |
+
"scr_dir1_threshold_10": 0.11711721870769615,
|
253 |
+
"scr_metric_threshold_10": 0.30357146183294365,
|
254 |
+
"scr_dir2_threshold_10": 0.30357146183294365,
|
255 |
+
"scr_dir1_threshold_20": 0.15315305881904406,
|
256 |
+
"scr_metric_threshold_20": 0.3437500415768938,
|
257 |
+
"scr_dir2_threshold_20": 0.3437500415768938,
|
258 |
+
"scr_dir1_threshold_50": 0.2567566696791176,
|
259 |
+
"scr_metric_threshold_50": 0.36160721769555176,
|
260 |
+
"scr_dir2_threshold_50": 0.36160721769555176,
|
261 |
+
"scr_dir1_threshold_100": 0.26576583107399515,
|
262 |
+
"scr_metric_threshold_100": 0.3214286379516016,
|
263 |
+
"scr_dir2_threshold_100": 0.3214286379516016,
|
264 |
+
"scr_dir1_threshold_500": 0.27927917043223033,
|
265 |
+
"scr_metric_threshold_500": 0.31696414435284664,
|
266 |
+
"scr_dir2_threshold_500": 0.31696414435284664
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
|
270 |
+
"scr_dir1_threshold_2": 0.042918406627421406,
|
271 |
+
"scr_metric_threshold_2": 0.042918406627421406,
|
272 |
+
"scr_dir2_threshold_2": 0.042857150966616867,
|
273 |
+
"scr_dir1_threshold_5": 0.05150224144123495,
|
274 |
+
"scr_metric_threshold_5": 0.05150224144123495,
|
275 |
+
"scr_dir2_threshold_5": 0.10000011353263609,
|
276 |
+
"scr_dir1_threshold_10": 0.08154502375487709,
|
277 |
+
"scr_metric_threshold_10": 0.08154502375487709,
|
278 |
+
"scr_dir2_threshold_10": 0.12857145289985059,
|
279 |
+
"scr_dir1_threshold_20": 0.10729601656855352,
|
280 |
+
"scr_metric_threshold_20": 0.10729601656855352,
|
281 |
+
"scr_dir2_threshold_20": 0.14285698066766273,
|
282 |
+
"scr_dir1_threshold_50": 0.1459226336960092,
|
283 |
+
"scr_metric_threshold_50": 0.1459226336960092,
|
284 |
+
"scr_dir2_threshold_50": 0.2333334090217574,
|
285 |
+
"scr_dir1_threshold_100": 0.1459226336960092,
|
286 |
+
"scr_metric_threshold_100": 0.1459226336960092,
|
287 |
+
"scr_dir2_threshold_100": 0.25714290579970117,
|
288 |
+
"scr_dir1_threshold_500": 0.10729601656855352,
|
289 |
+
"scr_metric_threshold_500": 0.10729601656855352,
|
290 |
+
"scr_dir2_threshold_500": 0.2666665909782426
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
294 |
+
"sae_lens_id": "custom_sae",
|
295 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0",
|
296 |
+
"sae_lens_version": "5.4.2",
|
297 |
+
"sae_cfg_dict": {
|
298 |
+
"model_name": "gemma-2-2b",
|
299 |
+
"d_in": 2304,
|
300 |
+
"d_sae": 65536,
|
301 |
+
"hook_layer": 12,
|
302 |
+
"hook_name": "blocks.12.hook_resid_post",
|
303 |
+
"context_size": null,
|
304 |
+
"hook_head_index": null,
|
305 |
+
"architecture": "topk",
|
306 |
+
"apply_b_dec_to_input": null,
|
307 |
+
"finetuning_scaling_factor": null,
|
308 |
+
"activation_fn_str": "",
|
309 |
+
"prepend_bos": true,
|
310 |
+
"normalize_activations": "none",
|
311 |
+
"dtype": "bfloat16",
|
312 |
+
"device": "",
|
313 |
+
"dataset_path": "",
|
314 |
+
"dataset_trust_remote_code": true,
|
315 |
+
"seqpos_slice": [
|
316 |
+
null
|
317 |
+
],
|
318 |
+
"training_tokens": -100000,
|
319 |
+
"sae_lens_training_version": null,
|
320 |
+
"neuronpedia_id": null
|
321 |
+
},
|
322 |
+
"eval_result_unstructured": null
|
323 |
+
}
|
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "scr",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": true,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "4945c79f-5ec8-4cf1-817b-560bfcc8ec29",
|
73 |
+
"datetime_epoch_millis": 1740161984895,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"scr_metrics": {
|
76 |
+
"scr_dir1_threshold_2": 0.16161854897121994,
|
77 |
+
"scr_metric_threshold_2": 0.06784648898938839,
|
78 |
+
"scr_dir2_threshold_2": 0.0659943489456994,
|
79 |
+
"scr_dir1_threshold_5": 0.20247854428196788,
|
80 |
+
"scr_metric_threshold_5": 0.09403415996971003,
|
81 |
+
"scr_dir2_threshold_5": 0.0946217333357683,
|
82 |
+
"scr_dir1_threshold_10": 0.1762122452594689,
|
83 |
+
"scr_metric_threshold_10": 0.13623078368990285,
|
84 |
+
"scr_dir2_threshold_10": 0.14062480452372897,
|
85 |
+
"scr_dir1_threshold_20": 0.17380918952629806,
|
86 |
+
"scr_metric_threshold_20": 0.1723823540256287,
|
87 |
+
"scr_dir2_threshold_20": 0.17415276656035286,
|
88 |
+
"scr_dir1_threshold_50": 0.1849406260352424,
|
89 |
+
"scr_metric_threshold_50": 0.20896025265643603,
|
90 |
+
"scr_dir2_threshold_50": 0.21573523026384628,
|
91 |
+
"scr_dir1_threshold_100": 0.14974715894436086,
|
92 |
+
"scr_metric_threshold_100": 0.18597532300507813,
|
93 |
+
"scr_dir2_threshold_100": 0.1965567480802562,
|
94 |
+
"scr_dir1_threshold_500": 0.0778989845782214,
|
95 |
+
"scr_metric_threshold_500": 0.21980959861610377,
|
96 |
+
"scr_dir2_threshold_500": 0.23241178018807312
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
|
102 |
+
"scr_dir1_threshold_2": 0.3650786592542414,
|
103 |
+
"scr_metric_threshold_2": -0.0024569704325861324,
|
104 |
+
"scr_dir2_threshold_2": -0.0024569704325861324,
|
105 |
+
"scr_dir1_threshold_5": 0.3968252466498386,
|
106 |
+
"scr_metric_threshold_5": 0.009828028179079856,
|
107 |
+
"scr_dir2_threshold_5": 0.009828028179079856,
|
108 |
+
"scr_dir1_threshold_10": 0.3809524260050484,
|
109 |
+
"scr_metric_threshold_10": 0.01474211549298745,
|
110 |
+
"scr_dir2_threshold_10": 0.01474211549298745,
|
111 |
+
"scr_dir1_threshold_20": 0.444444654690226,
|
112 |
+
"scr_metric_threshold_20": 0.027027114104653437,
|
113 |
+
"scr_dir2_threshold_20": 0.027027114104653437,
|
114 |
+
"scr_dir1_threshold_50": 0.3015873766750807,
|
115 |
+
"scr_metric_threshold_50": 0.05896816907447914,
|
116 |
+
"scr_dir2_threshold_50": 0.05896816907447914,
|
117 |
+
"scr_dir1_threshold_100": 0.2857136099242737,
|
118 |
+
"scr_metric_threshold_100": 0.1031940762072355,
|
119 |
+
"scr_dir2_threshold_100": 0.1031940762072355,
|
120 |
+
"scr_dir1_threshold_500": 0.19047573994951578,
|
121 |
+
"scr_metric_threshold_500": 0.16216224528171463,
|
122 |
+
"scr_dir2_threshold_500": 0.16216224528171463
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
|
126 |
+
"scr_dir1_threshold_2": 0.15151504204837732,
|
127 |
+
"scr_metric_threshold_2": 0.1161473379602074,
|
128 |
+
"scr_dir2_threshold_2": 0.1161473379602074,
|
129 |
+
"scr_dir1_threshold_5": 0.22222182084405012,
|
130 |
+
"scr_metric_threshold_5": 0.1359772806671398,
|
131 |
+
"scr_dir2_threshold_5": 0.1359772806671398,
|
132 |
+
"scr_dir1_threshold_10": 0.17171680682792437,
|
133 |
+
"scr_metric_threshold_10": 0.18413597829377729,
|
134 |
+
"scr_dir2_threshold_10": 0.18413597829377729,
|
135 |
+
"scr_dir1_threshold_20": 0.15151504204837732,
|
136 |
+
"scr_metric_threshold_20": 0.18980162796670313,
|
137 |
+
"scr_dir2_threshold_20": 0.18980162796670313,
|
138 |
+
"scr_dir1_threshold_50": 0.1818176892176979,
|
139 |
+
"scr_metric_threshold_50": 0.2549574434638101,
|
140 |
+
"scr_dir2_threshold_50": 0.2549574434638101,
|
141 |
+
"scr_dir1_threshold_100": 0.03030264716932058,
|
142 |
+
"scr_metric_threshold_100": 0.10764869459912667,
|
143 |
+
"scr_dir2_threshold_100": 0.10764869459912667,
|
144 |
+
"scr_dir1_threshold_500": -0.09090914564247801,
|
145 |
+
"scr_metric_threshold_500": 0.1161473379602074,
|
146 |
+
"scr_dir2_threshold_500": 0.1161473379602074
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
|
150 |
+
"scr_dir1_threshold_2": 0.41935440454456746,
|
151 |
+
"scr_metric_threshold_2": 0.020202059731676766,
|
152 |
+
"scr_dir2_threshold_2": 0.020202059731676766,
|
153 |
+
"scr_dir1_threshold_5": 0.45161302727298075,
|
154 |
+
"scr_metric_threshold_5": 0.03535352927204973,
|
155 |
+
"scr_dir2_threshold_5": 0.03535352927204973,
|
156 |
+
"scr_dir1_threshold_10": 0.33870928977193526,
|
157 |
+
"scr_metric_threshold_10": 0.04040411946335353,
|
158 |
+
"scr_dir2_threshold_10": 0.04040411946335353,
|
159 |
+
"scr_dir1_threshold_20": 0.2741930056807093,
|
160 |
+
"scr_metric_threshold_20": 0.08333330824720513,
|
161 |
+
"scr_dir2_threshold_20": 0.08333330824720513,
|
162 |
+
"scr_dir1_threshold_50": 0.2741930056807093,
|
163 |
+
"scr_metric_threshold_50": 0.11616161768198757,
|
164 |
+
"scr_dir2_threshold_50": 0.11616161768198757,
|
165 |
+
"scr_dir1_threshold_100": 0.3064516284091226,
|
166 |
+
"scr_metric_threshold_100": 0.13383845757639704,
|
167 |
+
"scr_dir2_threshold_100": 0.13383845757639704,
|
168 |
+
"scr_dir1_threshold_500": 0.04838649204421897,
|
169 |
+
"scr_metric_threshold_500": 0.06818183870683217,
|
170 |
+
"scr_dir2_threshold_500": 0.06818183870683217
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
|
174 |
+
"scr_dir1_threshold_2": 0.15447146198048264,
|
175 |
+
"scr_metric_threshold_2": 0.0557184632837094,
|
176 |
+
"scr_dir2_threshold_2": 0.0557184632837094,
|
177 |
+
"scr_dir1_threshold_5": 0.24390263995216122,
|
178 |
+
"scr_metric_threshold_5": 0.11730195436835675,
|
179 |
+
"scr_dir2_threshold_5": 0.11730195436835675,
|
180 |
+
"scr_dir1_threshold_10": 0.12195107768082737,
|
181 |
+
"scr_metric_threshold_10": 0.20527859493832404,
|
182 |
+
"scr_dir2_threshold_10": 0.20527859493832404,
|
183 |
+
"scr_dir1_threshold_20": 0.0894311779716786,
|
184 |
+
"scr_metric_threshold_20": 0.2434018000255199,
|
185 |
+
"scr_dir2_threshold_20": 0.2434018000255199,
|
186 |
+
"scr_dir1_threshold_50": 0.24390263995216122,
|
187 |
+
"scr_metric_threshold_50": 0.28445751901318467,
|
188 |
+
"scr_dir2_threshold_50": 0.28445751901318467,
|
189 |
+
"scr_dir1_threshold_100": -0.06504076859931053,
|
190 |
+
"scr_metric_threshold_100": 0.09384166837090524,
|
191 |
+
"scr_dir2_threshold_100": 0.09384166837090524,
|
192 |
+
"scr_dir1_threshold_500": -0.008129974927287195,
|
193 |
+
"scr_metric_threshold_500": 0.11730195436835675,
|
194 |
+
"scr_dir2_threshold_500": 0.11730195436835675
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
|
198 |
+
"scr_dir1_threshold_2": 0.03278678379574697,
|
199 |
+
"scr_metric_threshold_2": 0.12890643189891055,
|
200 |
+
"scr_dir2_threshold_2": 0.12890643189891055,
|
201 |
+
"scr_dir1_threshold_5": 0.04918033854781611,
|
202 |
+
"scr_metric_threshold_5": 0.16406245634426148,
|
203 |
+
"scr_dir2_threshold_5": 0.16406245634426148,
|
204 |
+
"scr_dir1_threshold_10": 0.016393554752069144,
|
205 |
+
"scr_metric_threshold_10": 0.2695312281721307,
|
206 |
+
"scr_dir2_threshold_10": 0.2695312281721307,
|
207 |
+
"scr_dir1_threshold_20": 0.03278678379574697,
|
208 |
+
"scr_metric_threshold_20": 0.36328140279508486,
|
209 |
+
"scr_dir2_threshold_20": 0.36328140279508486,
|
210 |
+
"scr_dir1_threshold_50": 0.00546440968122594,
|
211 |
+
"scr_metric_threshold_50": 0.41015625727595645,
|
212 |
+
"scr_dir2_threshold_50": 0.41015625727595645,
|
213 |
+
"scr_dir1_threshold_100": 0.06010915791026799,
|
214 |
+
"scr_metric_threshold_100": 0.4609375436557385,
|
215 |
+
"scr_dir2_threshold_100": 0.4609375436557385,
|
216 |
+
"scr_dir1_threshold_500": 0.05464474822904205,
|
217 |
+
"scr_metric_threshold_500": 0.5195312281721307,
|
218 |
+
"scr_dir2_threshold_500": 0.5195312281721307
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
|
222 |
+
"scr_dir1_threshold_2": 0.0410254200064373,
|
223 |
+
"scr_metric_threshold_2": 0.1008065853511987,
|
224 |
+
"scr_dir2_threshold_2": 0.1008065853511987,
|
225 |
+
"scr_dir1_threshold_5": 0.12307687134902433,
|
226 |
+
"scr_metric_threshold_5": 0.11290320254761761,
|
227 |
+
"scr_dir2_threshold_5": 0.11290320254761761,
|
228 |
+
"scr_dir1_threshold_10": 0.14871787347736873,
|
229 |
+
"scr_metric_threshold_10": 0.1370969176230247,
|
230 |
+
"scr_dir2_threshold_10": 0.1370969176230247,
|
231 |
+
"scr_dir1_threshold_20": 0.12307687134902433,
|
232 |
+
"scr_metric_threshold_20": 0.1854838670912696,
|
233 |
+
"scr_dir2_threshold_20": 0.1854838670912696,
|
234 |
+
"scr_dir1_threshold_50": 0.1692307363130155,
|
235 |
+
"scr_metric_threshold_50": 0.2056451360932524,
|
236 |
+
"scr_dir2_threshold_50": 0.2056451360932524,
|
237 |
+
"scr_dir1_threshold_100": 0.17435887560571312,
|
238 |
+
"scr_metric_threshold_100": 0.22983885116865949,
|
239 |
+
"scr_dir2_threshold_100": 0.22983885116865949,
|
240 |
+
"scr_dir1_threshold_500": 0.08205114567773082,
|
241 |
+
"scr_metric_threshold_500": 0.35483867091269605,
|
242 |
+
"scr_dir2_threshold_500": 0.35483867091269605
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
|
246 |
+
"scr_dir1_threshold_2": 0.09009000301245093,
|
247 |
+
"scr_metric_threshold_2": 0.08482138699453473,
|
248 |
+
"scr_dir2_threshold_2": 0.08482138699453473,
|
249 |
+
"scr_dir1_threshold_5": 0.09009000301245093,
|
250 |
+
"scr_metric_threshold_5": 0.13392842175175362,
|
251 |
+
"scr_dir2_threshold_5": 0.13392842175175362,
|
252 |
+
"scr_dir1_threshold_10": 0.17117111311941172,
|
253 |
+
"scr_metric_threshold_10": 0.17857149509445872,
|
254 |
+
"scr_dir2_threshold_10": 0.17857149509445872,
|
255 |
+
"scr_dir1_threshold_20": 0.1891891674197794,
|
256 |
+
"scr_metric_threshold_20": 0.20089289871975097,
|
257 |
+
"scr_dir2_threshold_20": 0.20089289871975097,
|
258 |
+
"scr_dir1_threshold_50": 0.2432433303208824,
|
259 |
+
"scr_metric_threshold_50": 0.28125005820765137,
|
260 |
+
"scr_dir2_threshold_50": 0.28125005820765137,
|
261 |
+
"scr_dir1_threshold_100": 0.32882888688058826,
|
262 |
+
"scr_metric_threshold_100": 0.28125005820765137,
|
263 |
+
"scr_dir2_threshold_100": 0.28125005820765137,
|
264 |
+
"scr_dir1_threshold_500": 0.25225222322637253,
|
265 |
+
"scr_metric_threshold_500": 0.3258928654582359,
|
266 |
+
"scr_dir2_threshold_500": 0.3258928654582359
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
|
270 |
+
"scr_dir1_threshold_2": 0.03862661712745569,
|
271 |
+
"scr_metric_threshold_2": 0.03862661712745569,
|
272 |
+
"scr_dir2_threshold_2": 0.02380949677794379,
|
273 |
+
"scr_dir1_threshold_5": 0.042918406627421406,
|
274 |
+
"scr_metric_threshold_5": 0.042918406627421406,
|
275 |
+
"scr_dir2_threshold_5": 0.04761899355588758,
|
276 |
+
"scr_dir1_threshold_10": 0.060085820441166386,
|
277 |
+
"scr_metric_threshold_10": 0.060085820441166386,
|
278 |
+
"scr_dir2_threshold_10": 0.09523798711177515,
|
279 |
+
"scr_dir1_threshold_20": 0.08583681325484281,
|
280 |
+
"scr_metric_threshold_20": 0.08583681325484281,
|
281 |
+
"scr_dir2_threshold_20": 0.10000011353263609,
|
282 |
+
"scr_dir1_threshold_50": 0.060085820441166386,
|
283 |
+
"scr_metric_threshold_50": 0.060085820441166386,
|
284 |
+
"scr_dir2_threshold_50": 0.11428564130044823,
|
285 |
+
"scr_dir1_threshold_100": 0.07725323425491137,
|
286 |
+
"scr_metric_threshold_100": 0.07725323425491137,
|
287 |
+
"scr_dir2_threshold_100": 0.1619046348563358,
|
288 |
+
"scr_dir1_threshold_500": 0.09442064806865635,
|
289 |
+
"scr_metric_threshold_500": 0.09442064806865635,
|
290 |
+
"scr_dir2_threshold_500": 0.19523810064441124
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
294 |
+
"sae_lens_id": "custom_sae",
|
295 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0",
|
296 |
+
"sae_lens_version": "5.4.2",
|
297 |
+
"sae_cfg_dict": {
|
298 |
+
"model_name": "gemma-2-2b",
|
299 |
+
"d_in": 2304,
|
300 |
+
"d_sae": 65536,
|
301 |
+
"hook_layer": 12,
|
302 |
+
"hook_name": "blocks.12.hook_resid_post",
|
303 |
+
"context_size": null,
|
304 |
+
"hook_head_index": null,
|
305 |
+
"architecture": "topk",
|
306 |
+
"apply_b_dec_to_input": null,
|
307 |
+
"finetuning_scaling_factor": null,
|
308 |
+
"activation_fn_str": "",
|
309 |
+
"prepend_bos": true,
|
310 |
+
"normalize_activations": "none",
|
311 |
+
"dtype": "bfloat16",
|
312 |
+
"device": "",
|
313 |
+
"dataset_path": "",
|
314 |
+
"dataset_trust_remote_code": true,
|
315 |
+
"seqpos_slice": [
|
316 |
+
null
|
317 |
+
],
|
318 |
+
"training_tokens": -100000,
|
319 |
+
"sae_lens_training_version": null,
|
320 |
+
"neuronpedia_id": null
|
321 |
+
},
|
322 |
+
"eval_result_unstructured": null
|
323 |
+
}
|
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "scr",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": true,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "b9f069ac-8088-45a3-9274-9b1e86e4771e",
|
73 |
+
"datetime_epoch_millis": 1740161543241,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"scr_metrics": {
|
76 |
+
"scr_dir1_threshold_2": 0.18865540875749043,
|
77 |
+
"scr_metric_threshold_2": 0.073516336917133,
|
78 |
+
"scr_dir2_threshold_2": 0.07225177023950229,
|
79 |
+
"scr_dir1_threshold_5": 0.23303036096673593,
|
80 |
+
"scr_metric_threshold_5": 0.1302813962622398,
|
81 |
+
"scr_dir2_threshold_5": 0.1305598690300678,
|
82 |
+
"scr_dir1_threshold_10": 0.23417802766807275,
|
83 |
+
"scr_metric_threshold_10": 0.1620880555443666,
|
84 |
+
"scr_dir2_threshold_10": 0.16742988549999255,
|
85 |
+
"scr_dir1_threshold_20": 0.2599068683780712,
|
86 |
+
"scr_metric_threshold_20": 0.21384163954000984,
|
87 |
+
"scr_dir2_threshold_20": 0.22096147153227652,
|
88 |
+
"scr_dir1_threshold_50": 0.1619087251521188,
|
89 |
+
"scr_metric_threshold_50": 0.2891392857052675,
|
90 |
+
"scr_dir2_threshold_50": 0.29613397944434255,
|
91 |
+
"scr_dir1_threshold_100": 0.08543218428340123,
|
92 |
+
"scr_metric_threshold_100": 0.31203469249820925,
|
93 |
+
"scr_dir2_threshold_100": 0.3178389255899667,
|
94 |
+
"scr_dir1_threshold_500": 0.07042284538823904,
|
95 |
+
"scr_metric_threshold_500": 0.2821800078466572,
|
96 |
+
"scr_dir2_threshold_500": 0.2820164853730435
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
|
102 |
+
"scr_dir1_threshold_2": 0.3809524260050484,
|
103 |
+
"scr_metric_threshold_2": 0.012284998611665989,
|
104 |
+
"scr_dir2_threshold_2": 0.012284998611665989,
|
105 |
+
"scr_dir1_threshold_5": 0.41269806729462877,
|
106 |
+
"scr_metric_threshold_5": 0.019656056358159712,
|
107 |
+
"scr_dir2_threshold_5": 0.019656056358159712,
|
108 |
+
"scr_dir1_threshold_10": 0.41269806729462877,
|
109 |
+
"scr_metric_threshold_10": 0.061425139507065275,
|
110 |
+
"scr_dir2_threshold_10": 0.061425139507065275,
|
111 |
+
"scr_dir1_threshold_20": 0.444444654690226,
|
112 |
+
"scr_metric_threshold_20": 0.09336619447689097,
|
113 |
+
"scr_dir2_threshold_20": 0.09336619447689097,
|
114 |
+
"scr_dir1_threshold_50": 0.3809524260050484,
|
115 |
+
"scr_metric_threshold_50": 0.12039316213280908,
|
116 |
+
"scr_dir2_threshold_50": 0.12039316213280908,
|
117 |
+
"scr_dir1_threshold_100": 0.3650786592542414,
|
118 |
+
"scr_metric_threshold_100": 0.13759224805838266,
|
119 |
+
"scr_dir2_threshold_100": 0.13759224805838266,
|
120 |
+
"scr_dir1_threshold_500": 0.3015873766750807,
|
121 |
+
"scr_metric_threshold_500": 0.05896816907447914,
|
122 |
+
"scr_dir2_threshold_500": 0.05896816907447914
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
|
126 |
+
"scr_dir1_threshold_2": 0.16161592443815084,
|
127 |
+
"scr_metric_threshold_2": 0.14447592402822051,
|
128 |
+
"scr_dir2_threshold_2": 0.14447592402822051,
|
129 |
+
"scr_dir1_threshold_5": 0.19191917367472955,
|
130 |
+
"scr_metric_threshold_5": 0.2294616822322599,
|
131 |
+
"scr_dir2_threshold_5": 0.2294616822322599,
|
132 |
+
"scr_dir1_threshold_10": 0.20202005606450307,
|
133 |
+
"scr_metric_threshold_10": 0.2691217364978167,
|
134 |
+
"scr_dir2_threshold_10": 0.2691217364978167,
|
135 |
+
"scr_dir1_threshold_20": 0.2121209384542766,
|
136 |
+
"scr_metric_threshold_20": 0.31161478445152835,
|
137 |
+
"scr_dir2_threshold_20": 0.31161478445152835,
|
138 |
+
"scr_dir1_threshold_50": -0.343434817790365,
|
139 |
+
"scr_metric_threshold_50": 0.38243624962156114,
|
140 |
+
"scr_dir2_threshold_50": 0.38243624962156114,
|
141 |
+
"scr_dir1_threshold_100": -0.3737374649596856,
|
142 |
+
"scr_metric_threshold_100": 0.21813038288640824,
|
143 |
+
"scr_dir2_threshold_100": 0.21813038288640824,
|
144 |
+
"scr_dir1_threshold_500": -0.5151516246182893,
|
145 |
+
"scr_metric_threshold_500": 0.28895184805644103,
|
146 |
+
"scr_dir2_threshold_500": 0.28895184805644103
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
|
150 |
+
"scr_dir1_threshold_2": 0.4999995193171997,
|
151 |
+
"scr_metric_threshold_2": 0.022727279568944055,
|
152 |
+
"scr_dir2_threshold_2": 0.022727279568944055,
|
153 |
+
"scr_dir1_threshold_5": 0.5483869727270193,
|
154 |
+
"scr_metric_threshold_5": 0.0479797789751554,
|
155 |
+
"scr_dir2_threshold_5": 0.0479797789751554,
|
156 |
+
"scr_dir1_threshold_10": 0.5483869727270193,
|
157 |
+
"scr_metric_threshold_10": 0.07323242889813597,
|
158 |
+
"scr_dir2_threshold_10": 0.07323242889813597,
|
159 |
+
"scr_dir1_threshold_20": 0.46774185795438705,
|
160 |
+
"scr_metric_threshold_20": 0.15151514695403728,
|
161 |
+
"scr_dir2_threshold_20": 0.15151514695403728,
|
162 |
+
"scr_dir1_threshold_50": 0.29032183636211567,
|
163 |
+
"scr_metric_threshold_50": 0.21717176582360218,
|
164 |
+
"scr_dir2_threshold_50": 0.21717176582360218,
|
165 |
+
"scr_dir1_threshold_100": 0.2741930056807093,
|
166 |
+
"scr_metric_threshold_100": 0.21212117563229838,
|
167 |
+
"scr_dir2_threshold_100": 0.21212117563229838,
|
168 |
+
"scr_dir1_threshold_500": 0.16129022954526445,
|
169 |
+
"scr_metric_threshold_500": 0.06565661886956488,
|
170 |
+
"scr_dir2_threshold_500": 0.06565661886956488
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
|
174 |
+
"scr_dir1_threshold_2": 0.15447146198048264,
|
175 |
+
"scr_metric_threshold_2": 0.09970669617184318,
|
176 |
+
"scr_dir2_threshold_2": 0.09970669617184318,
|
177 |
+
"scr_dir1_threshold_5": 0.2764230242518165,
|
178 |
+
"scr_metric_threshold_5": 0.18475064804764177,
|
179 |
+
"scr_dir2_threshold_5": 0.18475064804764177,
|
180 |
+
"scr_dir1_threshold_10": 0.19512182120742508,
|
181 |
+
"scr_metric_threshold_10": 0.24633431392598884,
|
182 |
+
"scr_dir2_threshold_10": 0.24633431392598884,
|
183 |
+
"scr_dir1_threshold_20": 0.2682925647340228,
|
184 |
+
"scr_metric_threshold_20": 0.2727272886176091,
|
185 |
+
"scr_dir2_threshold_20": 0.2727272886176091,
|
186 |
+
"scr_dir1_threshold_50": 0.15447146198048264,
|
187 |
+
"scr_metric_threshold_50": 0.3665689569885143,
|
188 |
+
"scr_dir2_threshold_50": 0.3665689569885143,
|
189 |
+
"scr_dir1_threshold_100": -0.382113667487563,
|
190 |
+
"scr_metric_threshold_100": 0.40762467597617913,
|
191 |
+
"scr_dir2_threshold_100": 0.40762467597617913,
|
192 |
+
"scr_dir1_threshold_500": -0.487804795313816,
|
193 |
+
"scr_metric_threshold_500": 0.13196469866440133,
|
194 |
+
"scr_dir2_threshold_500": 0.13196469866440133
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
|
198 |
+
"scr_dir1_threshold_2": 0.021857964433295084,
|
199 |
+
"scr_metric_threshold_2": 0.09375017462295412,
|
200 |
+
"scr_dir2_threshold_2": 0.09375017462295412,
|
201 |
+
"scr_dir1_threshold_5": 0.03278678379574697,
|
202 |
+
"scr_metric_threshold_5": 0.1992187136202179,
|
203 |
+
"scr_dir2_threshold_5": 0.1992187136202179,
|
204 |
+
"scr_dir1_threshold_10": 0.04918033854781611,
|
205 |
+
"scr_metric_threshold_10": 0.21484374272404358,
|
206 |
+
"scr_dir2_threshold_10": 0.21484374272404358,
|
207 |
+
"scr_dir1_threshold_20": 0.09289626741440628,
|
208 |
+
"scr_metric_threshold_20": 0.28515625727595645,
|
209 |
+
"scr_dir2_threshold_20": 0.28515625727595645,
|
210 |
+
"scr_dir1_threshold_50": 0.12568305121015325,
|
211 |
+
"scr_metric_threshold_50": 0.4570313445874335,
|
212 |
+
"scr_dir2_threshold_50": 0.4570313445874335,
|
213 |
+
"scr_dir1_threshold_100": -0.00546440968122594,
|
214 |
+
"scr_metric_threshold_100": 0.5625001164153027,
|
215 |
+
"scr_dir2_threshold_100": 0.5625001164153027,
|
216 |
+
"scr_dir1_threshold_500": 0.07650271266233713,
|
217 |
+
"scr_metric_threshold_500": 0.6523438591393463,
|
218 |
+
"scr_dir2_threshold_500": 0.6523438591393463
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
|
222 |
+
"scr_dir1_threshold_2": 0.08717928497042846,
|
223 |
+
"scr_metric_threshold_2": 0.04435498407738989,
|
224 |
+
"scr_dir2_threshold_2": 0.04435498407738989,
|
225 |
+
"scr_dir1_threshold_5": 0.16410229135546164,
|
226 |
+
"scr_metric_threshold_5": 0.08870972781349516,
|
227 |
+
"scr_dir2_threshold_5": 0.08870972781349516,
|
228 |
+
"scr_dir1_threshold_10": 0.1999998777340575,
|
229 |
+
"scr_metric_threshold_10": 0.10483879108333834,
|
230 |
+
"scr_dir2_threshold_10": 0.10483879108333834,
|
231 |
+
"scr_dir1_threshold_20": 0.23076901915509954,
|
232 |
+
"scr_metric_threshold_20": 0.17741945562699032,
|
233 |
+
"scr_dir2_threshold_20": 0.17741945562699032,
|
234 |
+
"scr_dir1_threshold_50": 0.2358974641126534,
|
235 |
+
"scr_metric_threshold_50": 0.2983870696388872,
|
236 |
+
"scr_dir2_threshold_50": 0.2983870696388872,
|
237 |
+
"scr_dir1_threshold_100": 0.28205102341178834,
|
238 |
+
"scr_metric_threshold_100": 0.4112902721865048,
|
239 |
+
"scr_dir2_threshold_100": 0.4112902721865048,
|
240 |
+
"scr_dir1_threshold_500": 0.32307674908308187,
|
241 |
+
"scr_metric_threshold_500": 0.3508064651805564,
|
242 |
+
"scr_dir2_threshold_500": 0.3508064651805564
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
|
246 |
+
"scr_dir1_threshold_2": 0.1216216651604412,
|
247 |
+
"scr_metric_threshold_2": 0.08928561450116908,
|
248 |
+
"scr_dir2_threshold_2": 0.08928561450116908,
|
249 |
+
"scr_dir1_threshold_5": 0.13063055806593132,
|
250 |
+
"scr_metric_threshold_5": 0.16517854648243513,
|
251 |
+
"scr_dir2_threshold_5": 0.16517854648243513,
|
252 |
+
"scr_dir1_threshold_10": 0.18018027451428928,
|
253 |
+
"scr_metric_threshold_10": 0.24107147846370117,
|
254 |
+
"scr_dir2_threshold_10": 0.24107147846370117,
|
255 |
+
"scr_dir1_threshold_20": 0.23423416892600485,
|
256 |
+
"scr_metric_threshold_20": 0.29017851322092003,
|
257 |
+
"scr_dir2_threshold_20": 0.29017851322092003,
|
258 |
+
"scr_dir1_threshold_50": 0.2882883318271079,
|
259 |
+
"scr_metric_threshold_50": 0.308035689339578,
|
260 |
+
"scr_dir2_threshold_50": 0.308035689339578,
|
261 |
+
"scr_dir1_threshold_100": 0.3603602805391911,
|
262 |
+
"scr_metric_threshold_100": 0.38392862132084404,
|
263 |
+
"scr_dir2_threshold_100": 0.38392862132084404,
|
264 |
+
"scr_dir1_threshold_500": 0.4549549984937745,
|
265 |
+
"scr_metric_threshold_500": 0.45982128720998955,
|
266 |
+
"scr_dir2_threshold_500": 0.45982128720998955
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
|
270 |
+
"scr_dir1_threshold_2": 0.08154502375487709,
|
271 |
+
"scr_metric_threshold_2": 0.08154502375487709,
|
272 |
+
"scr_dir2_threshold_2": 0.07142849033383136,
|
273 |
+
"scr_dir1_threshold_5": 0.10729601656855352,
|
274 |
+
"scr_metric_threshold_5": 0.10729601656855352,
|
275 |
+
"scr_dir2_threshold_5": 0.10952379871117751,
|
276 |
+
"scr_dir1_threshold_10": 0.08583681325484281,
|
277 |
+
"scr_metric_threshold_10": 0.08583681325484281,
|
278 |
+
"scr_dir2_threshold_10": 0.12857145289985059,
|
279 |
+
"scr_dir1_threshold_20": 0.12875547569614632,
|
280 |
+
"scr_metric_threshold_20": 0.12875547569614632,
|
281 |
+
"scr_dir2_threshold_20": 0.18571413163427958,
|
282 |
+
"scr_dir1_threshold_50": 0.16309004750975417,
|
283 |
+
"scr_metric_threshold_50": 0.16309004750975417,
|
284 |
+
"scr_dir2_threshold_50": 0.21904759742235502,
|
285 |
+
"scr_dir1_threshold_100": 0.16309004750975417,
|
286 |
+
"scr_metric_threshold_100": 0.16309004750975417,
|
287 |
+
"scr_dir2_threshold_100": 0.20952391224381361,
|
288 |
+
"scr_dir1_threshold_500": 0.2489271165784791,
|
289 |
+
"scr_metric_threshold_500": 0.2489271165784791,
|
290 |
+
"scr_dir2_threshold_500": 0.24761893678956953
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
294 |
+
"sae_lens_id": "custom_sae",
|
295 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0",
|
296 |
+
"sae_lens_version": "5.4.2",
|
297 |
+
"sae_cfg_dict": {
|
298 |
+
"model_name": "gemma-2-2b",
|
299 |
+
"d_in": 2304,
|
300 |
+
"d_sae": 65536,
|
301 |
+
"hook_layer": 12,
|
302 |
+
"hook_name": "blocks.12.hook_resid_post",
|
303 |
+
"context_size": null,
|
304 |
+
"hook_head_index": null,
|
305 |
+
"architecture": "topk",
|
306 |
+
"apply_b_dec_to_input": null,
|
307 |
+
"finetuning_scaling_factor": null,
|
308 |
+
"activation_fn_str": "",
|
309 |
+
"prepend_bos": true,
|
310 |
+
"normalize_activations": "none",
|
311 |
+
"dtype": "bfloat16",
|
312 |
+
"device": "",
|
313 |
+
"dataset_path": "",
|
314 |
+
"dataset_trust_remote_code": true,
|
315 |
+
"seqpos_slice": [
|
316 |
+
null
|
317 |
+
],
|
318 |
+
"training_tokens": -100000,
|
319 |
+
"sae_lens_training_version": null,
|
320 |
+
"neuronpedia_id": null
|
321 |
+
},
|
322 |
+
"eval_result_unstructured": null
|
323 |
+
}
|
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "scr",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": true,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "21df4103-067e-4b1e-992d-4ee5b7c51b7b",
|
73 |
+
"datetime_epoch_millis": 1740161101577,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"scr_metrics": {
|
76 |
+
"scr_dir1_threshold_2": 0.2009693612201276,
|
77 |
+
"scr_metric_threshold_2": 0.08333806598832513,
|
78 |
+
"scr_dir2_threshold_2": 0.08053045986523572,
|
79 |
+
"scr_dir1_threshold_5": 0.23069728357870245,
|
80 |
+
"scr_metric_threshold_5": 0.12271415632570293,
|
81 |
+
"scr_dir2_threshold_5": 0.12228750697619764,
|
82 |
+
"scr_dir1_threshold_10": 0.24536139627163156,
|
83 |
+
"scr_metric_threshold_10": 0.14820108338134885,
|
84 |
+
"scr_dir2_threshold_10": 0.14854593776620528,
|
85 |
+
"scr_dir1_threshold_20": 0.26804181768674956,
|
86 |
+
"scr_metric_threshold_20": 0.16917277755556245,
|
87 |
+
"scr_dir2_threshold_20": 0.1705906112921456,
|
88 |
+
"scr_dir1_threshold_50": 0.2558397963519864,
|
89 |
+
"scr_metric_threshold_50": 0.2246913468132941,
|
90 |
+
"scr_dir2_threshold_50": 0.22294137414999496,
|
91 |
+
"scr_dir1_threshold_100": 0.2694834525923413,
|
92 |
+
"scr_metric_threshold_100": 0.22275277451374648,
|
93 |
+
"scr_dir2_threshold_100": 0.22045867120535106,
|
94 |
+
"scr_dir1_threshold_500": 0.23308094185248288,
|
95 |
+
"scr_metric_threshold_500": 0.23783672950733417,
|
96 |
+
"scr_dir2_threshold_500": 0.23189713822667268
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
|
102 |
+
"scr_dir1_threshold_2": 0.42857088793941894,
|
103 |
+
"scr_metric_threshold_2": 0.0,
|
104 |
+
"scr_dir2_threshold_2": 0.0,
|
105 |
+
"scr_dir1_threshold_5": 0.47619029597980633,
|
106 |
+
"scr_metric_threshold_5": 0.0,
|
107 |
+
"scr_dir2_threshold_5": 0.0,
|
108 |
+
"scr_dir1_threshold_10": 0.46031747533501616,
|
109 |
+
"scr_metric_threshold_10": 0.0024571168813214595,
|
110 |
+
"scr_dir2_threshold_10": 0.0024571168813214595,
|
111 |
+
"scr_dir1_threshold_20": 0.47619029597980633,
|
112 |
+
"scr_metric_threshold_20": 0.039312112716319424,
|
113 |
+
"scr_dir2_threshold_20": 0.039312112716319424,
|
114 |
+
"scr_dir1_threshold_50": 0.42857088793941894,
|
115 |
+
"scr_metric_threshold_50": 0.05896816907447914,
|
116 |
+
"scr_dir2_threshold_50": 0.05896816907447914,
|
117 |
+
"scr_dir1_threshold_100": 0.3968252466498386,
|
118 |
+
"scr_metric_threshold_100": 0.09336619447689097,
|
119 |
+
"scr_dir2_threshold_100": 0.09336619447689097,
|
120 |
+
"scr_dir1_threshold_500": 0.3968252466498386,
|
121 |
+
"scr_metric_threshold_500": 0.022113026790745845,
|
122 |
+
"scr_dir2_threshold_500": 0.022113026790745845
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
|
126 |
+
"scr_dir1_threshold_2": 0.2121209384542766,
|
127 |
+
"scr_metric_threshold_2": 0.09631722640158302,
|
128 |
+
"scr_dir2_threshold_2": 0.09631722640158302,
|
129 |
+
"scr_dir1_threshold_5": 0.2121209384542766,
|
130 |
+
"scr_metric_threshold_5": 0.13031163099421397,
|
131 |
+
"scr_dir2_threshold_5": 0.13031163099421397,
|
132 |
+
"scr_dir1_threshold_10": 0.2121209384542766,
|
133 |
+
"scr_metric_threshold_10": 0.1671388604233078,
|
134 |
+
"scr_dir2_threshold_10": 0.1671388604233078,
|
135 |
+
"scr_dir1_threshold_20": 0.2828283193172076,
|
136 |
+
"scr_metric_threshold_20": 0.18130315345731438,
|
137 |
+
"scr_dir2_threshold_20": 0.18130315345731438,
|
138 |
+
"scr_dir1_threshold_50": 0.040403529559094105,
|
139 |
+
"scr_metric_threshold_50": 0.24362614411795844,
|
140 |
+
"scr_dir2_threshold_50": 0.24362614411795844,
|
141 |
+
"scr_dir1_threshold_100": 0.040403529559094105,
|
142 |
+
"scr_metric_threshold_100": 0.26345608682489086,
|
143 |
+
"scr_dir2_threshold_100": 0.26345608682489086,
|
144 |
+
"scr_dir1_threshold_500": -0.2828283193172076,
|
145 |
+
"scr_metric_threshold_500": 0.16147304189869,
|
146 |
+
"scr_dir2_threshold_500": 0.16147304189869
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
|
150 |
+
"scr_dir1_threshold_2": 0.5483869727270193,
|
151 |
+
"scr_metric_threshold_2": 0.015151620057142186,
|
152 |
+
"scr_dir2_threshold_2": 0.015151620057142186,
|
153 |
+
"scr_dir1_threshold_5": 0.5483869727270193,
|
154 |
+
"scr_metric_threshold_5": 0.012626249703105673,
|
155 |
+
"scr_dir2_threshold_5": 0.012626249703105673,
|
156 |
+
"scr_dir1_threshold_10": 0.4999995193171997,
|
157 |
+
"scr_metric_threshold_10": 0.04040411946335353,
|
158 |
+
"scr_dir2_threshold_10": 0.04040411946335353,
|
159 |
+
"scr_dir1_threshold_20": 0.532258142045613,
|
160 |
+
"scr_metric_threshold_20": 0.050505149329191916,
|
161 |
+
"scr_dir2_threshold_20": 0.050505149329191916,
|
162 |
+
"scr_dir1_threshold_50": 0.516128349998606,
|
163 |
+
"scr_metric_threshold_50": 0.0530303691664592,
|
164 |
+
"scr_dir2_threshold_50": 0.0530303691664592,
|
165 |
+
"scr_dir1_threshold_100": 0.48387068863579336,
|
166 |
+
"scr_metric_threshold_100": 0.08080808840993783,
|
167 |
+
"scr_dir2_threshold_100": 0.08080808840993783,
|
168 |
+
"scr_dir1_threshold_500": 0.45161302727298075,
|
169 |
+
"scr_metric_threshold_500": 0.03030308959751515,
|
170 |
+
"scr_dir2_threshold_500": 0.03030308959751515
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
|
174 |
+
"scr_dir1_threshold_2": 0.25203261487944845,
|
175 |
+
"scr_metric_threshold_2": 0.06451600498511631,
|
176 |
+
"scr_dir2_threshold_2": 0.06451600498511631,
|
177 |
+
"scr_dir1_threshold_5": 0.2845529991791037,
|
178 |
+
"scr_metric_threshold_5": 0.09090897967673653,
|
179 |
+
"scr_dir2_threshold_5": 0.09090897967673653,
|
180 |
+
"scr_dir1_threshold_10": 0.2601625898067356,
|
181 |
+
"scr_metric_threshold_10": 0.1524926455550836,
|
182 |
+
"scr_dir2_threshold_10": 0.1524926455550836,
|
183 |
+
"scr_dir1_threshold_20": 0.25203261487944845,
|
184 |
+
"scr_metric_threshold_20": 0.1964808784432174,
|
185 |
+
"scr_dir2_threshold_20": 0.1964808784432174,
|
186 |
+
"scr_dir1_threshold_50": 0.3089429239609653,
|
187 |
+
"scr_metric_threshold_50": 0.23460408353041323,
|
188 |
+
"scr_dir2_threshold_50": 0.23460408353041323,
|
189 |
+
"scr_dir1_threshold_100": 0.2682925647340228,
|
190 |
+
"scr_metric_threshold_100": 0.032258002492558155,
|
191 |
+
"scr_dir2_threshold_100": 0.032258002492558155,
|
192 |
+
"scr_dir1_threshold_500": 0.39024412700535666,
|
193 |
+
"scr_metric_threshold_500": 0.070381207579754,
|
194 |
+
"scr_dir2_threshold_500": 0.070381207579754
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
|
198 |
+
"scr_dir1_threshold_2": 0.016393554752069144,
|
199 |
+
"scr_metric_threshold_2": 0.16015625727595642,
|
200 |
+
"scr_dir2_threshold_2": 0.16015625727595642,
|
201 |
+
"scr_dir1_threshold_5": 0.07650271266233713,
|
202 |
+
"scr_metric_threshold_5": 0.3359375436557385,
|
203 |
+
"scr_dir2_threshold_5": 0.3359375436557385,
|
204 |
+
"scr_dir1_threshold_10": 0.10382508677685816,
|
205 |
+
"scr_metric_threshold_10": 0.371093800931695,
|
206 |
+
"scr_dir2_threshold_10": 0.371093800931695,
|
207 |
+
"scr_dir1_threshold_20": 0.1202186415289273,
|
208 |
+
"scr_metric_threshold_20": 0.42578128637978213,
|
209 |
+
"scr_dir2_threshold_20": 0.42578128637978213,
|
210 |
+
"scr_dir1_threshold_50": 0.10382508677685816,
|
211 |
+
"scr_metric_threshold_50": 0.48828116996447934,
|
212 |
+
"scr_dir2_threshold_50": 0.48828116996447934,
|
213 |
+
"scr_dir1_threshold_100": 0.1803277994391953,
|
214 |
+
"scr_metric_threshold_100": 0.5234374272404357,
|
215 |
+
"scr_dir2_threshold_100": 0.5234374272404357,
|
216 |
+
"scr_dir1_threshold_500": 0.07103830298111119,
|
217 |
+
"scr_metric_threshold_500": 0.6054687718278693,
|
218 |
+
"scr_dir2_threshold_500": 0.6054687718278693
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
|
222 |
+
"scr_dir1_threshold_2": 0.03589728071373967,
|
223 |
+
"scr_metric_threshold_2": 0.060483807005948444,
|
224 |
+
"scr_dir2_threshold_2": 0.060483807005948444,
|
225 |
+
"scr_dir1_threshold_5": 0.09743586922067994,
|
226 |
+
"scr_metric_threshold_5": 0.0927419335456348,
|
227 |
+
"scr_dir2_threshold_5": 0.0927419335456348,
|
228 |
+
"scr_dir1_threshold_10": 0.16410229135546164,
|
229 |
+
"scr_metric_threshold_10": 0.08870972781349516,
|
230 |
+
"scr_dir2_threshold_10": 0.08870972781349516,
|
231 |
+
"scr_dir1_threshold_20": 0.1999998777340575,
|
232 |
+
"scr_metric_threshold_20": 0.1008065853511987,
|
233 |
+
"scr_dir2_threshold_20": 0.1008065853511987,
|
234 |
+
"scr_dir1_threshold_50": 0.24102560340535104,
|
235 |
+
"scr_metric_threshold_50": 0.19758072462897314,
|
236 |
+
"scr_dir2_threshold_50": 0.19758072462897314,
|
237 |
+
"scr_dir1_threshold_100": 0.24102560340535104,
|
238 |
+
"scr_metric_threshold_100": 0.23790326263293876,
|
239 |
+
"scr_dir2_threshold_100": 0.23790326263293876,
|
240 |
+
"scr_dir1_threshold_500": 0.3282048883757795,
|
241 |
+
"scr_metric_threshold_500": 0.3225807847142943,
|
242 |
+
"scr_dir2_threshold_500": 0.3225807847142943
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
|
246 |
+
"scr_dir1_threshold_2": 0.05855860935384807,
|
247 |
+
"scr_metric_threshold_2": 0.214285581239654,
|
248 |
+
"scr_dir2_threshold_2": 0.214285581239654,
|
249 |
+
"scr_dir1_threshold_5": 0.09459444946519599,
|
250 |
+
"scr_metric_threshold_5": 0.2633928820889934,
|
251 |
+
"scr_dir2_threshold_5": 0.2633928820889934,
|
252 |
+
"scr_dir1_threshold_10": 0.19369361387252446,
|
253 |
+
"scr_metric_threshold_10": 0.2946427407275544,
|
254 |
+
"scr_dir2_threshold_10": 0.2946427407275544,
|
255 |
+
"scr_dir1_threshold_20": 0.22072082956776967,
|
256 |
+
"scr_metric_threshold_20": 0.2991072343263093,
|
257 |
+
"scr_dir2_threshold_20": 0.2991072343263093,
|
258 |
+
"scr_dir1_threshold_50": 0.27477472397948527,
|
259 |
+
"scr_metric_threshold_50": 0.3883928488274784,
|
260 |
+
"scr_dir2_threshold_50": 0.3883928488274784,
|
261 |
+
"scr_dir1_threshold_100": 0.3648647269919362,
|
262 |
+
"scr_metric_threshold_100": 0.37053567270882043,
|
263 |
+
"scr_dir2_threshold_100": 0.37053567270882043,
|
264 |
+
"scr_dir1_threshold_500": 0.24774777677362747,
|
265 |
+
"scr_metric_threshold_500": 0.42857142857142855,
|
266 |
+
"scr_dir2_threshold_500": 0.42857142857142855
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
|
270 |
+
"scr_dir1_threshold_2": 0.05579403094120067,
|
271 |
+
"scr_metric_threshold_2": 0.05579403094120067,
|
272 |
+
"scr_dir2_threshold_2": 0.033333181956485214,
|
273 |
+
"scr_dir1_threshold_5": 0.05579403094120067,
|
274 |
+
"scr_metric_threshold_5": 0.05579403094120067,
|
275 |
+
"scr_dir2_threshold_5": 0.05238083614515829,
|
276 |
+
"scr_dir1_threshold_10": 0.06866965525497992,
|
277 |
+
"scr_metric_threshold_10": 0.06866965525497992,
|
278 |
+
"scr_dir2_threshold_10": 0.07142849033383136,
|
279 |
+
"scr_dir1_threshold_20": 0.060085820441166386,
|
280 |
+
"scr_metric_threshold_20": 0.060085820441166386,
|
281 |
+
"scr_dir2_threshold_20": 0.07142849033383136,
|
282 |
+
"scr_dir1_threshold_50": 0.13304726519611204,
|
283 |
+
"scr_metric_threshold_50": 0.13304726519611204,
|
284 |
+
"scr_dir2_threshold_50": 0.11904748388971893,
|
285 |
+
"scr_dir1_threshold_100": 0.18025746132349915,
|
286 |
+
"scr_metric_threshold_100": 0.18025746132349915,
|
287 |
+
"scr_dir2_threshold_100": 0.1619046348563358,
|
288 |
+
"scr_dir1_threshold_500": 0.2618024850783763,
|
289 |
+
"scr_metric_threshold_500": 0.2618024850783763,
|
290 |
+
"scr_dir2_threshold_500": 0.21428575483308432
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
294 |
+
"sae_lens_id": "custom_sae",
|
295 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0",
|
296 |
+
"sae_lens_version": "5.4.2",
|
297 |
+
"sae_cfg_dict": {
|
298 |
+
"model_name": "gemma-2-2b",
|
299 |
+
"d_in": 2304,
|
300 |
+
"d_sae": 65536,
|
301 |
+
"hook_layer": 12,
|
302 |
+
"hook_name": "blocks.12.hook_resid_post",
|
303 |
+
"context_size": null,
|
304 |
+
"hook_head_index": null,
|
305 |
+
"architecture": "topk",
|
306 |
+
"apply_b_dec_to_input": null,
|
307 |
+
"finetuning_scaling_factor": null,
|
308 |
+
"activation_fn_str": "",
|
309 |
+
"prepend_bos": true,
|
310 |
+
"normalize_activations": "none",
|
311 |
+
"dtype": "bfloat16",
|
312 |
+
"device": "",
|
313 |
+
"dataset_path": "",
|
314 |
+
"dataset_trust_remote_code": true,
|
315 |
+
"seqpos_slice": [
|
316 |
+
null
|
317 |
+
],
|
318 |
+
"training_tokens": -100000,
|
319 |
+
"sae_lens_training_version": null,
|
320 |
+
"neuronpedia_id": null
|
321 |
+
},
|
322 |
+
"eval_result_unstructured": null
|
323 |
+
}
|
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "scr",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": true,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "2f284c7a-10d3-4f19-8264-97e91f4bb2f2",
|
73 |
+
"datetime_epoch_millis": 1740162428754,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"scr_metrics": {
|
76 |
+
"scr_dir1_threshold_2": 0.19644145298086652,
|
77 |
+
"scr_metric_threshold_2": 0.07734320571232689,
|
78 |
+
"scr_dir2_threshold_2": 0.0759023691262068,
|
79 |
+
"scr_dir1_threshold_5": 0.2317746907633326,
|
80 |
+
"scr_metric_threshold_5": 0.13025074424245692,
|
81 |
+
"scr_dir2_threshold_5": 0.13351306106512845,
|
82 |
+
"scr_dir1_threshold_10": 0.237756725215616,
|
83 |
+
"scr_metric_threshold_10": 0.1905854353466263,
|
84 |
+
"scr_dir2_threshold_10": 0.19497179922285185,
|
85 |
+
"scr_dir1_threshold_20": 0.24107600003630494,
|
86 |
+
"scr_metric_threshold_20": 0.24773476074996353,
|
87 |
+
"scr_dir2_threshold_20": 0.257654541907719,
|
88 |
+
"scr_dir1_threshold_50": 0.19159755118507713,
|
89 |
+
"scr_metric_threshold_50": 0.32827118012785683,
|
90 |
+
"scr_dir2_threshold_50": 0.3363388212419233,
|
91 |
+
"scr_dir1_threshold_100": 0.0755746565868114,
|
92 |
+
"scr_metric_threshold_100": 0.3171029788525398,
|
93 |
+
"scr_dir2_threshold_100": 0.3217090588604302,
|
94 |
+
"scr_dir1_threshold_500": 0.00475685136491143,
|
95 |
+
"scr_metric_threshold_500": 0.2958876177808421,
|
96 |
+
"scr_dir2_threshold_500": 0.3115094291710164
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
|
102 |
+
"scr_dir1_threshold_2": 0.42857088793941894,
|
103 |
+
"scr_metric_threshold_2": 0.009828028179079856,
|
104 |
+
"scr_dir2_threshold_2": 0.009828028179079856,
|
105 |
+
"scr_dir1_threshold_5": 0.444444654690226,
|
106 |
+
"scr_metric_threshold_5": 0.017199085925573582,
|
107 |
+
"scr_dir2_threshold_5": 0.017199085925573582,
|
108 |
+
"scr_dir1_threshold_10": 0.47619029597980633,
|
109 |
+
"scr_metric_threshold_10": 0.04668317046281315,
|
110 |
+
"scr_dir2_threshold_10": 0.04668317046281315,
|
111 |
+
"scr_dir1_threshold_20": 0.5079359372693867,
|
112 |
+
"scr_metric_threshold_20": 0.08845210716298338,
|
113 |
+
"scr_dir2_threshold_20": 0.08845210716298338,
|
114 |
+
"scr_dir1_threshold_50": 0.47619029597980633,
|
115 |
+
"scr_metric_threshold_50": 0.10565119308855696,
|
116 |
+
"scr_dir2_threshold_50": 0.10565119308855696,
|
117 |
+
"scr_dir1_threshold_100": 0.3809524260050484,
|
118 |
+
"scr_metric_threshold_100": 0.14496315935614107,
|
119 |
+
"scr_dir2_threshold_100": 0.14496315935614107,
|
120 |
+
"scr_dir1_threshold_500": 0.31746019731987085,
|
121 |
+
"scr_metric_threshold_500": 0.02948408453723957,
|
122 |
+
"scr_dir2_threshold_500": 0.02948408453723957
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
|
126 |
+
"scr_dir1_threshold_2": 0.19191917367472955,
|
127 |
+
"scr_metric_threshold_2": 0.1359772806671398,
|
128 |
+
"scr_dir2_threshold_2": 0.1359772806671398,
|
129 |
+
"scr_dir1_threshold_5": 0.2828283193172076,
|
130 |
+
"scr_metric_threshold_5": 0.21529738919825334,
|
131 |
+
"scr_dir2_threshold_5": 0.21529738919825334,
|
132 |
+
"scr_dir1_threshold_10": 0.2828283193172076,
|
133 |
+
"scr_metric_threshold_10": 0.2832861983835152,
|
134 |
+
"scr_dir2_threshold_10": 0.2832861983835152,
|
135 |
+
"scr_dir1_threshold_20": 0.2929292017069811,
|
136 |
+
"scr_metric_threshold_20": 0.3371105456830785,
|
137 |
+
"scr_dir2_threshold_20": 0.3371105456830785,
|
138 |
+
"scr_dir1_threshold_50": -0.16161652650540898,
|
139 |
+
"scr_metric_threshold_50": 0.4277621224117357,
|
140 |
+
"scr_dir2_threshold_50": 0.4277621224117357,
|
141 |
+
"scr_dir1_threshold_100": -0.42424308104306946,
|
142 |
+
"scr_metric_threshold_100": 0.2096317395253275,
|
143 |
+
"scr_dir2_threshold_100": 0.2096317395253275,
|
144 |
+
"scr_dir1_threshold_500": -0.6767681511236984,
|
145 |
+
"scr_metric_threshold_500": 0.26345608682489086,
|
146 |
+
"scr_dir2_threshold_500": 0.26345608682489086
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
|
150 |
+
"scr_dir1_threshold_2": 0.532258142045613,
|
151 |
+
"scr_metric_threshold_2": 0.03030308959751515,
|
152 |
+
"scr_dir2_threshold_2": 0.03030308959751515,
|
153 |
+
"scr_dir1_threshold_5": 0.532258142045613,
|
154 |
+
"scr_metric_threshold_5": 0.0530303691664592,
|
155 |
+
"scr_dir2_threshold_5": 0.0530303691664592,
|
156 |
+
"scr_dir1_threshold_10": 0.516128349998606,
|
157 |
+
"scr_metric_threshold_10": 0.07828286857267056,
|
158 |
+
"scr_dir2_threshold_10": 0.07828286857267056,
|
159 |
+
"scr_dir1_threshold_20": 0.48387068863579336,
|
160 |
+
"scr_metric_threshold_20": 0.13383845757639704,
|
161 |
+
"scr_dir2_threshold_20": 0.13383845757639704,
|
162 |
+
"scr_dir1_threshold_50": 0.35483812045334157,
|
163 |
+
"scr_metric_threshold_50": 0.22979801552670784,
|
164 |
+
"scr_dir2_threshold_50": 0.22979801552670784,
|
165 |
+
"scr_dir1_threshold_100": 0.3064516284091226,
|
166 |
+
"scr_metric_threshold_100": 0.29797985423354,
|
167 |
+
"scr_dir2_threshold_100": 0.29797985423354,
|
168 |
+
"scr_dir1_threshold_500": 0.14516043749825752,
|
169 |
+
"scr_metric_threshold_500": 0.10606058781614919,
|
170 |
+
"scr_dir2_threshold_500": 0.10606058781614919
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
|
174 |
+
"scr_dir1_threshold_2": 0.2601625898067356,
|
175 |
+
"scr_metric_threshold_2": 0.10850441266694984,
|
176 |
+
"scr_dir2_threshold_2": 0.10850441266694984,
|
177 |
+
"scr_dir1_threshold_5": 0.31707338347875896,
|
178 |
+
"scr_metric_threshold_5": 0.17302041765206616,
|
179 |
+
"scr_dir2_threshold_5": 0.17302041765206616,
|
180 |
+
"scr_dir1_threshold_10": 0.21138225565250596,
|
181 |
+
"scr_metric_threshold_10": 0.26099705822203345,
|
182 |
+
"scr_dir2_threshold_10": 0.26099705822203345,
|
183 |
+
"scr_dir1_threshold_20": 0.13008153719862106,
|
184 |
+
"scr_metric_threshold_20": 0.31964803540621184,
|
185 |
+
"scr_dir2_threshold_20": 0.31964803540621184,
|
186 |
+
"scr_dir1_threshold_50": 0.06504076859931053,
|
187 |
+
"scr_metric_threshold_50": 0.3900292429859658,
|
188 |
+
"scr_dir2_threshold_50": 0.3900292429859658,
|
189 |
+
"scr_dir1_threshold_100": -0.39024364241485016,
|
190 |
+
"scr_metric_threshold_100": 0.140762415159508,
|
191 |
+
"scr_dir2_threshold_100": 0.140762415159508,
|
192 |
+
"scr_dir1_threshold_500": -0.6341462823670114,
|
193 |
+
"scr_metric_threshold_500": 0.13489738735857004,
|
194 |
+
"scr_dir2_threshold_500": 0.13489738735857004
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
|
198 |
+
"scr_dir1_threshold_2": 0.0,
|
199 |
+
"scr_metric_threshold_2": 0.13671883003552063,
|
200 |
+
"scr_dir2_threshold_2": 0.13671883003552063,
|
201 |
+
"scr_dir1_threshold_5": 0.01092881936245188,
|
202 |
+
"scr_metric_threshold_5": 0.23437497089617432,
|
203 |
+
"scr_dir2_threshold_5": 0.23437497089617432,
|
204 |
+
"scr_dir1_threshold_10": 0.04918033854781611,
|
205 |
+
"scr_metric_threshold_10": 0.3085936845163922,
|
206 |
+
"scr_dir2_threshold_10": 0.3085936845163922,
|
207 |
+
"scr_dir1_threshold_20": 0.00546440968122594,
|
208 |
+
"scr_metric_threshold_20": 0.417968888243172,
|
209 |
+
"scr_dir2_threshold_20": 0.417968888243172,
|
210 |
+
"scr_dir1_threshold_50": 0.09836067709563222,
|
211 |
+
"scr_metric_threshold_50": 0.5351562572759564,
|
212 |
+
"scr_dir2_threshold_50": 0.5351562572759564,
|
213 |
+
"scr_dir1_threshold_100": -0.016393554752069144,
|
214 |
+
"scr_metric_threshold_100": 0.6015625727595643,
|
215 |
+
"scr_dir2_threshold_100": 0.6015625727595643,
|
216 |
+
"scr_dir1_threshold_500": 0.03825119347697291,
|
217 |
+
"scr_metric_threshold_500": 0.6953125145519129,
|
218 |
+
"scr_dir2_threshold_500": 0.6953125145519129
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
|
222 |
+
"scr_dir1_threshold_2": 0.03589728071373967,
|
223 |
+
"scr_metric_threshold_2": 0.04838718980952953,
|
224 |
+
"scr_dir2_threshold_2": 0.04838718980952953,
|
225 |
+
"scr_dir1_threshold_5": 0.10769214780607521,
|
226 |
+
"scr_metric_threshold_5": 0.0927419335456348,
|
227 |
+
"scr_dir2_threshold_5": 0.0927419335456348,
|
228 |
+
"scr_dir1_threshold_10": 0.12820501064172196,
|
229 |
+
"scr_metric_threshold_10": 0.1491935348194436,
|
230 |
+
"scr_dir2_threshold_10": 0.1491935348194436,
|
231 |
+
"scr_dir1_threshold_20": 0.18974359914866223,
|
232 |
+
"scr_metric_threshold_20": 0.18951607282340924,
|
233 |
+
"scr_dir2_threshold_20": 0.18951607282340924,
|
234 |
+
"scr_dir1_threshold_50": 0.22564087986240192,
|
235 |
+
"scr_metric_threshold_50": 0.3145161329087304,
|
236 |
+
"scr_dir2_threshold_50": 0.3145161329087304,
|
237 |
+
"scr_dir1_threshold_100": 0.27179474482639304,
|
238 |
+
"scr_metric_threshold_100": 0.43145154118848766,
|
239 |
+
"scr_dir2_threshold_100": 0.43145154118848766,
|
240 |
+
"scr_dir1_threshold_500": 0.30769233120498896,
|
241 |
+
"scr_metric_threshold_500": 0.4354839872619119,
|
242 |
+
"scr_dir2_threshold_500": 0.4354839872619119
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
|
246 |
+
"scr_dir1_threshold_2": 0.05405389441171559,
|
247 |
+
"scr_metric_threshold_2": 0.0803571594879004,
|
248 |
+
"scr_dir2_threshold_2": 0.0803571594879004,
|
249 |
+
"scr_dir1_threshold_5": 0.09459444946519599,
|
250 |
+
"scr_metric_threshold_5": 0.19196417761436174,
|
251 |
+
"scr_dir2_threshold_5": 0.19196417761436174,
|
252 |
+
"scr_dir1_threshold_10": 0.13513500451867638,
|
253 |
+
"scr_metric_threshold_10": 0.2946427407275544,
|
254 |
+
"scr_dir2_threshold_10": 0.2946427407275544,
|
255 |
+
"scr_dir1_threshold_20": 0.20270277526740202,
|
256 |
+
"scr_metric_threshold_20": 0.37946412772208915,
|
257 |
+
"scr_dir2_threshold_20": 0.37946412772208915,
|
258 |
+
"scr_dir1_threshold_50": 0.31981972548571075,
|
259 |
+
"scr_metric_threshold_50": 0.4687500083153788,
|
260 |
+
"scr_dir2_threshold_50": 0.4687500083153788,
|
261 |
+
"scr_dir1_threshold_100": 0.2702702775267402,
|
262 |
+
"scr_metric_threshold_100": 0.504464094460574,
|
263 |
+
"scr_dir2_threshold_100": 0.504464094460574,
|
264 |
+
"scr_dir1_threshold_500": 0.35585583408644605,
|
265 |
+
"scr_metric_threshold_500": 0.5178570430725976,
|
266 |
+
"scr_dir2_threshold_500": 0.5178570430725976
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
|
270 |
+
"scr_dir1_threshold_2": 0.06866965525497992,
|
271 |
+
"scr_metric_threshold_2": 0.06866965525497992,
|
272 |
+
"scr_dir2_threshold_2": 0.05714296256601923,
|
273 |
+
"scr_dir1_threshold_5": 0.0643776099411321,
|
274 |
+
"scr_metric_threshold_5": 0.0643776099411321,
|
275 |
+
"scr_dir2_threshold_5": 0.09047614452250444,
|
276 |
+
"scr_dir1_threshold_10": 0.10300422706858779,
|
277 |
+
"scr_metric_threshold_10": 0.10300422706858779,
|
278 |
+
"scr_dir2_threshold_10": 0.13809513807839202,
|
279 |
+
"scr_dir1_threshold_20": 0.11587985138236706,
|
280 |
+
"scr_metric_threshold_20": 0.11587985138236706,
|
281 |
+
"scr_dir2_threshold_20": 0.19523810064441124,
|
282 |
+
"scr_dir1_threshold_50": 0.15450646850982275,
|
283 |
+
"scr_metric_threshold_50": 0.15450646850982275,
|
284 |
+
"scr_dir2_threshold_50": 0.21904759742235502,
|
285 |
+
"scr_dir1_threshold_100": 0.20600845413717558,
|
286 |
+
"scr_metric_threshold_100": 0.20600845413717558,
|
287 |
+
"scr_dir2_threshold_100": 0.24285709420029883,
|
288 |
+
"scr_dir1_threshold_500": 0.18454925082346488,
|
289 |
+
"scr_metric_threshold_500": 0.18454925082346488,
|
290 |
+
"scr_dir2_threshold_500": 0.3095237419448595
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
294 |
+
"sae_lens_id": "custom_sae",
|
295 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0",
|
296 |
+
"sae_lens_version": "5.4.2",
|
297 |
+
"sae_cfg_dict": {
|
298 |
+
"model_name": "gemma-2-2b",
|
299 |
+
"d_in": 2304,
|
300 |
+
"d_sae": 65536,
|
301 |
+
"hook_layer": 12,
|
302 |
+
"hook_name": "blocks.12.hook_resid_post",
|
303 |
+
"context_size": null,
|
304 |
+
"hook_head_index": null,
|
305 |
+
"architecture": "topk",
|
306 |
+
"apply_b_dec_to_input": null,
|
307 |
+
"finetuning_scaling_factor": null,
|
308 |
+
"activation_fn_str": "",
|
309 |
+
"prepend_bos": true,
|
310 |
+
"normalize_activations": "none",
|
311 |
+
"dtype": "bfloat16",
|
312 |
+
"device": "",
|
313 |
+
"dataset_path": "",
|
314 |
+
"dataset_trust_remote_code": true,
|
315 |
+
"seqpos_slice": [
|
316 |
+
null
|
317 |
+
],
|
318 |
+
"training_tokens": -100000,
|
319 |
+
"sae_lens_training_version": null,
|
320 |
+
"neuronpedia_id": null
|
321 |
+
},
|
322 |
+
"eval_result_unstructured": null
|
323 |
+
}
|
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "sparse_probing",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"LabHC/bias_in_bios_class_set2",
|
8 |
+
"LabHC/bias_in_bios_class_set3",
|
9 |
+
"canrager/amazon_reviews_mcauley_1and5",
|
10 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment",
|
11 |
+
"codeparrot/github-code",
|
12 |
+
"fancyzhx/ag_news",
|
13 |
+
"Helsinki-NLP/europarl"
|
14 |
+
],
|
15 |
+
"probe_train_set_size": 4000,
|
16 |
+
"probe_test_set_size": 1000,
|
17 |
+
"context_length": 128,
|
18 |
+
"sae_batch_size": 125,
|
19 |
+
"llm_batch_size": 32,
|
20 |
+
"llm_dtype": "bfloat16",
|
21 |
+
"model_name": "gemma-2-2b",
|
22 |
+
"k_values": [
|
23 |
+
1,
|
24 |
+
2,
|
25 |
+
5
|
26 |
+
],
|
27 |
+
"lower_vram_usage": false
|
28 |
+
},
|
29 |
+
"eval_id": "fe2f63b4-1d22-4879-9af8-4e98fff7d830",
|
30 |
+
"datetime_epoch_millis": 1740164930002,
|
31 |
+
"eval_result_metrics": {
|
32 |
+
"llm": {
|
33 |
+
"llm_test_accuracy": 0.9595375448465346,
|
34 |
+
"llm_top_1_test_accuracy": 0.64956875,
|
35 |
+
"llm_top_2_test_accuracy": 0.72589375,
|
36 |
+
"llm_top_5_test_accuracy": 0.78265625,
|
37 |
+
"llm_top_10_test_accuracy": null,
|
38 |
+
"llm_top_20_test_accuracy": null,
|
39 |
+
"llm_top_50_test_accuracy": null,
|
40 |
+
"llm_top_100_test_accuracy": null
|
41 |
+
},
|
42 |
+
"sae": {
|
43 |
+
"sae_test_accuracy": 0.9598625473678112,
|
44 |
+
"sae_top_1_test_accuracy": 0.7501000000000001,
|
45 |
+
"sae_top_2_test_accuracy": 0.7998875,
|
46 |
+
"sae_top_5_test_accuracy": 0.8547375,
|
47 |
+
"sae_top_10_test_accuracy": null,
|
48 |
+
"sae_top_20_test_accuracy": null,
|
49 |
+
"sae_top_50_test_accuracy": null,
|
50 |
+
"sae_top_100_test_accuracy": null
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"eval_result_details": [
|
54 |
+
{
|
55 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
|
56 |
+
"llm_test_accuracy": 0.966800057888031,
|
57 |
+
"llm_top_1_test_accuracy": 0.6397999999999999,
|
58 |
+
"llm_top_2_test_accuracy": 0.6954,
|
59 |
+
"llm_top_5_test_accuracy": 0.7869999999999999,
|
60 |
+
"llm_top_10_test_accuracy": null,
|
61 |
+
"llm_top_20_test_accuracy": null,
|
62 |
+
"llm_top_50_test_accuracy": null,
|
63 |
+
"llm_top_100_test_accuracy": null,
|
64 |
+
"sae_test_accuracy": 0.9674000382423401,
|
65 |
+
"sae_top_1_test_accuracy": 0.7148000000000001,
|
66 |
+
"sae_top_2_test_accuracy": 0.7748,
|
67 |
+
"sae_top_5_test_accuracy": 0.8404,
|
68 |
+
"sae_top_10_test_accuracy": null,
|
69 |
+
"sae_top_20_test_accuracy": null,
|
70 |
+
"sae_top_50_test_accuracy": null,
|
71 |
+
"sae_top_100_test_accuracy": null
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
|
75 |
+
"llm_test_accuracy": 0.9578000426292419,
|
76 |
+
"llm_top_1_test_accuracy": 0.6694000000000001,
|
77 |
+
"llm_top_2_test_accuracy": 0.725,
|
78 |
+
"llm_top_5_test_accuracy": 0.7654,
|
79 |
+
"llm_top_10_test_accuracy": null,
|
80 |
+
"llm_top_20_test_accuracy": null,
|
81 |
+
"llm_top_50_test_accuracy": null,
|
82 |
+
"llm_top_100_test_accuracy": null,
|
83 |
+
"sae_test_accuracy": 0.9522000312805176,
|
84 |
+
"sae_top_1_test_accuracy": 0.712,
|
85 |
+
"sae_top_2_test_accuracy": 0.7488,
|
86 |
+
"sae_top_5_test_accuracy": 0.8006,
|
87 |
+
"sae_top_10_test_accuracy": null,
|
88 |
+
"sae_top_20_test_accuracy": null,
|
89 |
+
"sae_top_50_test_accuracy": null,
|
90 |
+
"sae_top_100_test_accuracy": null
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
|
94 |
+
"llm_test_accuracy": 0.9316000461578369,
|
95 |
+
"llm_top_1_test_accuracy": 0.687,
|
96 |
+
"llm_top_2_test_accuracy": 0.7492,
|
97 |
+
"llm_top_5_test_accuracy": 0.7704000000000001,
|
98 |
+
"llm_top_10_test_accuracy": null,
|
99 |
+
"llm_top_20_test_accuracy": null,
|
100 |
+
"llm_top_50_test_accuracy": null,
|
101 |
+
"llm_top_100_test_accuracy": null,
|
102 |
+
"sae_test_accuracy": 0.935800039768219,
|
103 |
+
"sae_top_1_test_accuracy": 0.6752,
|
104 |
+
"sae_top_2_test_accuracy": 0.7123999999999999,
|
105 |
+
"sae_top_5_test_accuracy": 0.797,
|
106 |
+
"sae_top_10_test_accuracy": null,
|
107 |
+
"sae_top_20_test_accuracy": null,
|
108 |
+
"sae_top_50_test_accuracy": null,
|
109 |
+
"sae_top_100_test_accuracy": null
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
|
113 |
+
"llm_test_accuracy": 0.9202000379562378,
|
114 |
+
"llm_top_1_test_accuracy": 0.599,
|
115 |
+
"llm_top_2_test_accuracy": 0.6474,
|
116 |
+
"llm_top_5_test_accuracy": 0.6734,
|
117 |
+
"llm_top_10_test_accuracy": null,
|
118 |
+
"llm_top_20_test_accuracy": null,
|
119 |
+
"llm_top_50_test_accuracy": null,
|
120 |
+
"llm_top_100_test_accuracy": null,
|
121 |
+
"sae_test_accuracy": 0.9280000567436218,
|
122 |
+
"sae_top_1_test_accuracy": 0.7084,
|
123 |
+
"sae_top_2_test_accuracy": 0.7418000000000001,
|
124 |
+
"sae_top_5_test_accuracy": 0.7746,
|
125 |
+
"sae_top_10_test_accuracy": null,
|
126 |
+
"sae_top_20_test_accuracy": null,
|
127 |
+
"sae_top_50_test_accuracy": null,
|
128 |
+
"sae_top_100_test_accuracy": null
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
|
132 |
+
"llm_test_accuracy": 0.9795000553131104,
|
133 |
+
"llm_top_1_test_accuracy": 0.673,
|
134 |
+
"llm_top_2_test_accuracy": 0.724,
|
135 |
+
"llm_top_5_test_accuracy": 0.766,
|
136 |
+
"llm_top_10_test_accuracy": null,
|
137 |
+
"llm_top_20_test_accuracy": null,
|
138 |
+
"llm_top_50_test_accuracy": null,
|
139 |
+
"llm_top_100_test_accuracy": null,
|
140 |
+
"sae_test_accuracy": 0.9715000689029694,
|
141 |
+
"sae_top_1_test_accuracy": 0.764,
|
142 |
+
"sae_top_2_test_accuracy": 0.898,
|
143 |
+
"sae_top_5_test_accuracy": 0.921,
|
144 |
+
"sae_top_10_test_accuracy": null,
|
145 |
+
"sae_top_20_test_accuracy": null,
|
146 |
+
"sae_top_50_test_accuracy": null,
|
147 |
+
"sae_top_100_test_accuracy": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"dataset_name": "codeparrot/github-code_results",
|
151 |
+
"llm_test_accuracy": 0.9708000421524048,
|
152 |
+
"llm_top_1_test_accuracy": 0.6451999999999999,
|
153 |
+
"llm_top_2_test_accuracy": 0.6960000000000001,
|
154 |
+
"llm_top_5_test_accuracy": 0.7766,
|
155 |
+
"llm_top_10_test_accuracy": null,
|
156 |
+
"llm_top_20_test_accuracy": null,
|
157 |
+
"llm_top_50_test_accuracy": null,
|
158 |
+
"llm_top_100_test_accuracy": null,
|
159 |
+
"sae_test_accuracy": 0.9712000489234924,
|
160 |
+
"sae_top_1_test_accuracy": 0.6984,
|
161 |
+
"sae_top_2_test_accuracy": 0.7142000000000001,
|
162 |
+
"sae_top_5_test_accuracy": 0.8362,
|
163 |
+
"sae_top_10_test_accuracy": null,
|
164 |
+
"sae_top_20_test_accuracy": null,
|
165 |
+
"sae_top_50_test_accuracy": null,
|
166 |
+
"sae_top_100_test_accuracy": null
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"dataset_name": "fancyzhx/ag_news_results",
|
170 |
+
"llm_test_accuracy": 0.9500000476837158,
|
171 |
+
"llm_top_1_test_accuracy": 0.63775,
|
172 |
+
"llm_top_2_test_accuracy": 0.78175,
|
173 |
+
"llm_top_5_test_accuracy": 0.82125,
|
174 |
+
"llm_top_10_test_accuracy": null,
|
175 |
+
"llm_top_20_test_accuracy": null,
|
176 |
+
"llm_top_50_test_accuracy": null,
|
177 |
+
"llm_top_100_test_accuracy": null,
|
178 |
+
"sae_test_accuracy": 0.9540000557899475,
|
179 |
+
"sae_top_1_test_accuracy": 0.813,
|
180 |
+
"sae_top_2_test_accuracy": 0.8325,
|
181 |
+
"sae_top_5_test_accuracy": 0.8765000000000001,
|
182 |
+
"sae_top_10_test_accuracy": null,
|
183 |
+
"sae_top_20_test_accuracy": null,
|
184 |
+
"sae_top_50_test_accuracy": null,
|
185 |
+
"sae_top_100_test_accuracy": null
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"dataset_name": "Helsinki-NLP/europarl_results",
|
189 |
+
"llm_test_accuracy": 0.9996000289916992,
|
190 |
+
"llm_top_1_test_accuracy": 0.6454,
|
191 |
+
"llm_top_2_test_accuracy": 0.7884,
|
192 |
+
"llm_top_5_test_accuracy": 0.9012,
|
193 |
+
"llm_top_10_test_accuracy": null,
|
194 |
+
"llm_top_20_test_accuracy": null,
|
195 |
+
"llm_top_50_test_accuracy": null,
|
196 |
+
"llm_top_100_test_accuracy": null,
|
197 |
+
"sae_test_accuracy": 0.9988000392913818,
|
198 |
+
"sae_top_1_test_accuracy": 0.915,
|
199 |
+
"sae_top_2_test_accuracy": 0.9766,
|
200 |
+
"sae_top_5_test_accuracy": 0.9916,
|
201 |
+
"sae_top_10_test_accuracy": null,
|
202 |
+
"sae_top_20_test_accuracy": null,
|
203 |
+
"sae_top_50_test_accuracy": null,
|
204 |
+
"sae_top_100_test_accuracy": null
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
208 |
+
"sae_lens_id": "custom_sae",
|
209 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0",
|
210 |
+
"sae_lens_version": "5.4.2",
|
211 |
+
"sae_cfg_dict": {
|
212 |
+
"model_name": "gemma-2-2b",
|
213 |
+
"d_in": 2304,
|
214 |
+
"d_sae": 65536,
|
215 |
+
"hook_layer": 12,
|
216 |
+
"hook_name": "blocks.12.hook_resid_post",
|
217 |
+
"context_size": null,
|
218 |
+
"hook_head_index": null,
|
219 |
+
"architecture": "topk",
|
220 |
+
"apply_b_dec_to_input": null,
|
221 |
+
"finetuning_scaling_factor": null,
|
222 |
+
"activation_fn_str": "",
|
223 |
+
"prepend_bos": true,
|
224 |
+
"normalize_activations": "none",
|
225 |
+
"dtype": "bfloat16",
|
226 |
+
"device": "",
|
227 |
+
"dataset_path": "",
|
228 |
+
"dataset_trust_remote_code": true,
|
229 |
+
"seqpos_slice": [
|
230 |
+
null
|
231 |
+
],
|
232 |
+
"training_tokens": -100000,
|
233 |
+
"sae_lens_training_version": null,
|
234 |
+
"neuronpedia_id": null
|
235 |
+
},
|
236 |
+
"eval_result_unstructured": {
|
237 |
+
"LabHC/bias_in_bios_class_set1_results": {
|
238 |
+
"sae_test_accuracy": {
|
239 |
+
"0": 0.9500000476837158,
|
240 |
+
"1": 0.9700000286102295,
|
241 |
+
"2": 0.9520000219345093,
|
242 |
+
"6": 0.9880000352859497,
|
243 |
+
"9": 0.9770000576972961
|
244 |
+
},
|
245 |
+
"llm_test_accuracy": {
|
246 |
+
"0": 0.9510000348091125,
|
247 |
+
"1": 0.9670000672340393,
|
248 |
+
"2": 0.9530000686645508,
|
249 |
+
"6": 0.987000048160553,
|
250 |
+
"9": 0.9760000705718994
|
251 |
+
},
|
252 |
+
"llm_top_1_test_accuracy": {
|
253 |
+
"0": 0.577,
|
254 |
+
"1": 0.613,
|
255 |
+
"2": 0.662,
|
256 |
+
"6": 0.787,
|
257 |
+
"9": 0.56
|
258 |
+
},
|
259 |
+
"llm_top_2_test_accuracy": {
|
260 |
+
"0": 0.574,
|
261 |
+
"1": 0.66,
|
262 |
+
"2": 0.718,
|
263 |
+
"6": 0.811,
|
264 |
+
"9": 0.714
|
265 |
+
},
|
266 |
+
"llm_top_5_test_accuracy": {
|
267 |
+
"0": 0.713,
|
268 |
+
"1": 0.711,
|
269 |
+
"2": 0.755,
|
270 |
+
"6": 0.895,
|
271 |
+
"9": 0.861
|
272 |
+
},
|
273 |
+
"sae_top_1_test_accuracy": {
|
274 |
+
"0": 0.679,
|
275 |
+
"1": 0.57,
|
276 |
+
"2": 0.622,
|
277 |
+
"6": 0.76,
|
278 |
+
"9": 0.943
|
279 |
+
},
|
280 |
+
"sae_top_2_test_accuracy": {
|
281 |
+
"0": 0.676,
|
282 |
+
"1": 0.565,
|
283 |
+
"2": 0.924,
|
284 |
+
"6": 0.762,
|
285 |
+
"9": 0.947
|
286 |
+
},
|
287 |
+
"sae_top_5_test_accuracy": {
|
288 |
+
"0": 0.828,
|
289 |
+
"1": 0.63,
|
290 |
+
"2": 0.924,
|
291 |
+
"6": 0.878,
|
292 |
+
"9": 0.942
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"LabHC/bias_in_bios_class_set2_results": {
|
296 |
+
"sae_test_accuracy": {
|
297 |
+
"11": 0.9570000171661377,
|
298 |
+
"13": 0.9520000219345093,
|
299 |
+
"14": 0.9520000219345093,
|
300 |
+
"18": 0.9320000410079956,
|
301 |
+
"19": 0.968000054359436
|
302 |
+
},
|
303 |
+
"llm_test_accuracy": {
|
304 |
+
"11": 0.9690000414848328,
|
305 |
+
"13": 0.9600000381469727,
|
306 |
+
"14": 0.9600000381469727,
|
307 |
+
"18": 0.9390000700950623,
|
308 |
+
"19": 0.9610000252723694
|
309 |
+
},
|
310 |
+
"llm_top_1_test_accuracy": {
|
311 |
+
"11": 0.555,
|
312 |
+
"13": 0.668,
|
313 |
+
"14": 0.638,
|
314 |
+
"18": 0.69,
|
315 |
+
"19": 0.796
|
316 |
+
},
|
317 |
+
"llm_top_2_test_accuracy": {
|
318 |
+
"11": 0.756,
|
319 |
+
"13": 0.714,
|
320 |
+
"14": 0.67,
|
321 |
+
"18": 0.717,
|
322 |
+
"19": 0.768
|
323 |
+
},
|
324 |
+
"llm_top_5_test_accuracy": {
|
325 |
+
"11": 0.794,
|
326 |
+
"13": 0.749,
|
327 |
+
"14": 0.723,
|
328 |
+
"18": 0.73,
|
329 |
+
"19": 0.831
|
330 |
+
},
|
331 |
+
"sae_top_1_test_accuracy": {
|
332 |
+
"11": 0.86,
|
333 |
+
"13": 0.646,
|
334 |
+
"14": 0.73,
|
335 |
+
"18": 0.628,
|
336 |
+
"19": 0.696
|
337 |
+
},
|
338 |
+
"sae_top_2_test_accuracy": {
|
339 |
+
"11": 0.854,
|
340 |
+
"13": 0.708,
|
341 |
+
"14": 0.728,
|
342 |
+
"18": 0.669,
|
343 |
+
"19": 0.785
|
344 |
+
},
|
345 |
+
"sae_top_5_test_accuracy": {
|
346 |
+
"11": 0.861,
|
347 |
+
"13": 0.728,
|
348 |
+
"14": 0.862,
|
349 |
+
"18": 0.714,
|
350 |
+
"19": 0.838
|
351 |
+
}
|
352 |
+
},
|
353 |
+
"LabHC/bias_in_bios_class_set3_results": {
|
354 |
+
"sae_test_accuracy": {
|
355 |
+
"20": 0.9550000429153442,
|
356 |
+
"21": 0.9260000586509705,
|
357 |
+
"22": 0.9290000200271606,
|
358 |
+
"25": 0.9750000238418579,
|
359 |
+
"26": 0.8940000534057617
|
360 |
+
},
|
361 |
+
"llm_test_accuracy": {
|
362 |
+
"20": 0.956000030040741,
|
363 |
+
"21": 0.9350000619888306,
|
364 |
+
"22": 0.9180000424385071,
|
365 |
+
"25": 0.9640000462532043,
|
366 |
+
"26": 0.8850000500679016
|
367 |
+
},
|
368 |
+
"llm_top_1_test_accuracy": {
|
369 |
+
"20": 0.693,
|
370 |
+
"21": 0.775,
|
371 |
+
"22": 0.645,
|
372 |
+
"25": 0.706,
|
373 |
+
"26": 0.616
|
374 |
+
},
|
375 |
+
"llm_top_2_test_accuracy": {
|
376 |
+
"20": 0.827,
|
377 |
+
"21": 0.761,
|
378 |
+
"22": 0.694,
|
379 |
+
"25": 0.778,
|
380 |
+
"26": 0.686
|
381 |
+
},
|
382 |
+
"llm_top_5_test_accuracy": {
|
383 |
+
"20": 0.855,
|
384 |
+
"21": 0.791,
|
385 |
+
"22": 0.725,
|
386 |
+
"25": 0.809,
|
387 |
+
"26": 0.672
|
388 |
+
},
|
389 |
+
"sae_top_1_test_accuracy": {
|
390 |
+
"20": 0.869,
|
391 |
+
"21": 0.59,
|
392 |
+
"22": 0.661,
|
393 |
+
"25": 0.633,
|
394 |
+
"26": 0.623
|
395 |
+
},
|
396 |
+
"sae_top_2_test_accuracy": {
|
397 |
+
"20": 0.861,
|
398 |
+
"21": 0.616,
|
399 |
+
"22": 0.689,
|
400 |
+
"25": 0.665,
|
401 |
+
"26": 0.731
|
402 |
+
},
|
403 |
+
"sae_top_5_test_accuracy": {
|
404 |
+
"20": 0.914,
|
405 |
+
"21": 0.831,
|
406 |
+
"22": 0.719,
|
407 |
+
"25": 0.765,
|
408 |
+
"26": 0.756
|
409 |
+
}
|
410 |
+
},
|
411 |
+
"canrager/amazon_reviews_mcauley_1and5_results": {
|
412 |
+
"sae_test_accuracy": {
|
413 |
+
"1": 0.9530000686645508,
|
414 |
+
"2": 0.940000057220459,
|
415 |
+
"3": 0.9160000681877136,
|
416 |
+
"5": 0.9390000700950623,
|
417 |
+
"6": 0.8920000195503235
|
418 |
+
},
|
419 |
+
"llm_test_accuracy": {
|
420 |
+
"1": 0.9580000638961792,
|
421 |
+
"2": 0.9330000281333923,
|
422 |
+
"3": 0.9280000329017639,
|
423 |
+
"5": 0.9200000166893005,
|
424 |
+
"6": 0.862000048160553
|
425 |
+
},
|
426 |
+
"llm_top_1_test_accuracy": {
|
427 |
+
"1": 0.647,
|
428 |
+
"2": 0.603,
|
429 |
+
"3": 0.598,
|
430 |
+
"5": 0.555,
|
431 |
+
"6": 0.592
|
432 |
+
},
|
433 |
+
"llm_top_2_test_accuracy": {
|
434 |
+
"1": 0.75,
|
435 |
+
"2": 0.648,
|
436 |
+
"3": 0.607,
|
437 |
+
"5": 0.606,
|
438 |
+
"6": 0.626
|
439 |
+
},
|
440 |
+
"llm_top_5_test_accuracy": {
|
441 |
+
"1": 0.767,
|
442 |
+
"2": 0.641,
|
443 |
+
"3": 0.645,
|
444 |
+
"5": 0.638,
|
445 |
+
"6": 0.676
|
446 |
+
},
|
447 |
+
"sae_top_1_test_accuracy": {
|
448 |
+
"1": 0.854,
|
449 |
+
"2": 0.806,
|
450 |
+
"3": 0.667,
|
451 |
+
"5": 0.543,
|
452 |
+
"6": 0.672
|
453 |
+
},
|
454 |
+
"sae_top_2_test_accuracy": {
|
455 |
+
"1": 0.891,
|
456 |
+
"2": 0.793,
|
457 |
+
"3": 0.753,
|
458 |
+
"5": 0.592,
|
459 |
+
"6": 0.68
|
460 |
+
},
|
461 |
+
"sae_top_5_test_accuracy": {
|
462 |
+
"1": 0.885,
|
463 |
+
"2": 0.809,
|
464 |
+
"3": 0.758,
|
465 |
+
"5": 0.703,
|
466 |
+
"6": 0.718
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
|
470 |
+
"sae_test_accuracy": {
|
471 |
+
"1.0": 0.971000075340271,
|
472 |
+
"5.0": 0.9720000624656677
|
473 |
+
},
|
474 |
+
"llm_test_accuracy": {
|
475 |
+
"1.0": 0.9780000448226929,
|
476 |
+
"5.0": 0.9810000658035278
|
477 |
+
},
|
478 |
+
"llm_top_1_test_accuracy": {
|
479 |
+
"1.0": 0.673,
|
480 |
+
"5.0": 0.673
|
481 |
+
},
|
482 |
+
"llm_top_2_test_accuracy": {
|
483 |
+
"1.0": 0.724,
|
484 |
+
"5.0": 0.724
|
485 |
+
},
|
486 |
+
"llm_top_5_test_accuracy": {
|
487 |
+
"1.0": 0.766,
|
488 |
+
"5.0": 0.766
|
489 |
+
},
|
490 |
+
"sae_top_1_test_accuracy": {
|
491 |
+
"1.0": 0.764,
|
492 |
+
"5.0": 0.764
|
493 |
+
},
|
494 |
+
"sae_top_2_test_accuracy": {
|
495 |
+
"1.0": 0.898,
|
496 |
+
"5.0": 0.898
|
497 |
+
},
|
498 |
+
"sae_top_5_test_accuracy": {
|
499 |
+
"1.0": 0.921,
|
500 |
+
"5.0": 0.921
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"codeparrot/github-code_results": {
|
504 |
+
"sae_test_accuracy": {
|
505 |
+
"C": 0.9530000686645508,
|
506 |
+
"Python": 0.984000027179718,
|
507 |
+
"HTML": 0.9860000610351562,
|
508 |
+
"Java": 0.9750000238418579,
|
509 |
+
"PHP": 0.9580000638961792
|
510 |
+
},
|
511 |
+
"llm_test_accuracy": {
|
512 |
+
"C": 0.956000030040741,
|
513 |
+
"Python": 0.984000027179718,
|
514 |
+
"HTML": 0.9900000691413879,
|
515 |
+
"Java": 0.9670000672340393,
|
516 |
+
"PHP": 0.9570000171661377
|
517 |
+
},
|
518 |
+
"llm_top_1_test_accuracy": {
|
519 |
+
"C": 0.666,
|
520 |
+
"Python": 0.626,
|
521 |
+
"HTML": 0.721,
|
522 |
+
"Java": 0.619,
|
523 |
+
"PHP": 0.594
|
524 |
+
},
|
525 |
+
"llm_top_2_test_accuracy": {
|
526 |
+
"C": 0.679,
|
527 |
+
"Python": 0.674,
|
528 |
+
"HTML": 0.8,
|
529 |
+
"Java": 0.676,
|
530 |
+
"PHP": 0.651
|
531 |
+
},
|
532 |
+
"llm_top_5_test_accuracy": {
|
533 |
+
"C": 0.783,
|
534 |
+
"Python": 0.717,
|
535 |
+
"HTML": 0.935,
|
536 |
+
"Java": 0.733,
|
537 |
+
"PHP": 0.715
|
538 |
+
},
|
539 |
+
"sae_top_1_test_accuracy": {
|
540 |
+
"C": 0.598,
|
541 |
+
"Python": 0.566,
|
542 |
+
"HTML": 0.742,
|
543 |
+
"Java": 0.656,
|
544 |
+
"PHP": 0.93
|
545 |
+
},
|
546 |
+
"sae_top_2_test_accuracy": {
|
547 |
+
"C": 0.654,
|
548 |
+
"Python": 0.612,
|
549 |
+
"HTML": 0.738,
|
550 |
+
"Java": 0.637,
|
551 |
+
"PHP": 0.93
|
552 |
+
},
|
553 |
+
"sae_top_5_test_accuracy": {
|
554 |
+
"C": 0.704,
|
555 |
+
"Python": 0.917,
|
556 |
+
"HTML": 0.938,
|
557 |
+
"Java": 0.688,
|
558 |
+
"PHP": 0.934
|
559 |
+
}
|
560 |
+
},
|
561 |
+
"fancyzhx/ag_news_results": {
|
562 |
+
"sae_test_accuracy": {
|
563 |
+
"0": 0.9500000476837158,
|
564 |
+
"1": 0.987000048160553,
|
565 |
+
"2": 0.9350000619888306,
|
566 |
+
"3": 0.9440000653266907
|
567 |
+
},
|
568 |
+
"llm_test_accuracy": {
|
569 |
+
"0": 0.940000057220459,
|
570 |
+
"1": 0.9860000610351562,
|
571 |
+
"2": 0.9200000166893005,
|
572 |
+
"3": 0.9540000557899475
|
573 |
+
},
|
574 |
+
"llm_top_1_test_accuracy": {
|
575 |
+
"0": 0.573,
|
576 |
+
"1": 0.671,
|
577 |
+
"2": 0.672,
|
578 |
+
"3": 0.635
|
579 |
+
},
|
580 |
+
"llm_top_2_test_accuracy": {
|
581 |
+
"0": 0.802,
|
582 |
+
"1": 0.808,
|
583 |
+
"2": 0.701,
|
584 |
+
"3": 0.816
|
585 |
+
},
|
586 |
+
"llm_top_5_test_accuracy": {
|
587 |
+
"0": 0.81,
|
588 |
+
"1": 0.891,
|
589 |
+
"2": 0.752,
|
590 |
+
"3": 0.832
|
591 |
+
},
|
592 |
+
"sae_top_1_test_accuracy": {
|
593 |
+
"0": 0.793,
|
594 |
+
"1": 0.972,
|
595 |
+
"2": 0.817,
|
596 |
+
"3": 0.67
|
597 |
+
},
|
598 |
+
"sae_top_2_test_accuracy": {
|
599 |
+
"0": 0.838,
|
600 |
+
"1": 0.972,
|
601 |
+
"2": 0.818,
|
602 |
+
"3": 0.702
|
603 |
+
},
|
604 |
+
"sae_top_5_test_accuracy": {
|
605 |
+
"0": 0.871,
|
606 |
+
"1": 0.976,
|
607 |
+
"2": 0.821,
|
608 |
+
"3": 0.838
|
609 |
+
}
|
610 |
+
},
|
611 |
+
"Helsinki-NLP/europarl_results": {
|
612 |
+
"sae_test_accuracy": {
|
613 |
+
"en": 0.999000072479248,
|
614 |
+
"fr": 1.0,
|
615 |
+
"de": 1.0,
|
616 |
+
"es": 0.999000072479248,
|
617 |
+
"nl": 0.9960000514984131
|
618 |
+
},
|
619 |
+
"llm_test_accuracy": {
|
620 |
+
"en": 1.0,
|
621 |
+
"fr": 0.999000072479248,
|
622 |
+
"de": 1.0,
|
623 |
+
"es": 0.999000072479248,
|
624 |
+
"nl": 1.0
|
625 |
+
},
|
626 |
+
"llm_top_1_test_accuracy": {
|
627 |
+
"en": 0.739,
|
628 |
+
"fr": 0.585,
|
629 |
+
"de": 0.758,
|
630 |
+
"es": 0.496,
|
631 |
+
"nl": 0.649
|
632 |
+
},
|
633 |
+
"llm_top_2_test_accuracy": {
|
634 |
+
"en": 0.829,
|
635 |
+
"fr": 0.582,
|
636 |
+
"de": 0.82,
|
637 |
+
"es": 0.958,
|
638 |
+
"nl": 0.753
|
639 |
+
},
|
640 |
+
"llm_top_5_test_accuracy": {
|
641 |
+
"en": 0.892,
|
642 |
+
"fr": 0.888,
|
643 |
+
"de": 0.894,
|
644 |
+
"es": 0.98,
|
645 |
+
"nl": 0.852
|
646 |
+
},
|
647 |
+
"sae_top_1_test_accuracy": {
|
648 |
+
"en": 0.996,
|
649 |
+
"fr": 0.994,
|
650 |
+
"de": 0.926,
|
651 |
+
"es": 0.839,
|
652 |
+
"nl": 0.82
|
653 |
+
},
|
654 |
+
"sae_top_2_test_accuracy": {
|
655 |
+
"en": 0.996,
|
656 |
+
"fr": 0.995,
|
657 |
+
"de": 0.959,
|
658 |
+
"es": 0.938,
|
659 |
+
"nl": 0.995
|
660 |
+
},
|
661 |
+
"sae_top_5_test_accuracy": {
|
662 |
+
"en": 0.996,
|
663 |
+
"fr": 0.997,
|
664 |
+
"de": 0.975,
|
665 |
+
"es": 0.995,
|
666 |
+
"nl": 0.995
|
667 |
+
}
|
668 |
+
}
|
669 |
+
}
|
670 |
+
}
|
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "sparse_probing",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"LabHC/bias_in_bios_class_set2",
|
8 |
+
"LabHC/bias_in_bios_class_set3",
|
9 |
+
"canrager/amazon_reviews_mcauley_1and5",
|
10 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment",
|
11 |
+
"codeparrot/github-code",
|
12 |
+
"fancyzhx/ag_news",
|
13 |
+
"Helsinki-NLP/europarl"
|
14 |
+
],
|
15 |
+
"probe_train_set_size": 4000,
|
16 |
+
"probe_test_set_size": 1000,
|
17 |
+
"context_length": 128,
|
18 |
+
"sae_batch_size": 125,
|
19 |
+
"llm_batch_size": 32,
|
20 |
+
"llm_dtype": "bfloat16",
|
21 |
+
"model_name": "gemma-2-2b",
|
22 |
+
"k_values": [
|
23 |
+
1,
|
24 |
+
2,
|
25 |
+
5
|
26 |
+
],
|
27 |
+
"lower_vram_usage": false
|
28 |
+
},
|
29 |
+
"eval_id": "5fcf2fd9-d1b6-4284-8363-b605ba094c6c",
|
30 |
+
"datetime_epoch_millis": 1740164658660,
|
31 |
+
"eval_result_metrics": {
|
32 |
+
"llm": {
|
33 |
+
"llm_test_accuracy": 0.9595375448465346,
|
34 |
+
"llm_top_1_test_accuracy": 0.64956875,
|
35 |
+
"llm_top_2_test_accuracy": 0.72589375,
|
36 |
+
"llm_top_5_test_accuracy": 0.78265625,
|
37 |
+
"llm_top_10_test_accuracy": null,
|
38 |
+
"llm_top_20_test_accuracy": null,
|
39 |
+
"llm_top_50_test_accuracy": null,
|
40 |
+
"llm_top_100_test_accuracy": null
|
41 |
+
},
|
42 |
+
"sae": {
|
43 |
+
"sae_test_accuracy": 0.9564437940716745,
|
44 |
+
"sae_top_1_test_accuracy": 0.68616875,
|
45 |
+
"sae_top_2_test_accuracy": 0.7763499999999999,
|
46 |
+
"sae_top_5_test_accuracy": 0.87125,
|
47 |
+
"sae_top_10_test_accuracy": null,
|
48 |
+
"sae_top_20_test_accuracy": null,
|
49 |
+
"sae_top_50_test_accuracy": null,
|
50 |
+
"sae_top_100_test_accuracy": null
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"eval_result_details": [
|
54 |
+
{
|
55 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
|
56 |
+
"llm_test_accuracy": 0.966800057888031,
|
57 |
+
"llm_top_1_test_accuracy": 0.6397999999999999,
|
58 |
+
"llm_top_2_test_accuracy": 0.6954,
|
59 |
+
"llm_top_5_test_accuracy": 0.7869999999999999,
|
60 |
+
"llm_top_10_test_accuracy": null,
|
61 |
+
"llm_top_20_test_accuracy": null,
|
62 |
+
"llm_top_50_test_accuracy": null,
|
63 |
+
"llm_top_100_test_accuracy": null,
|
64 |
+
"sae_test_accuracy": 0.9612000465393067,
|
65 |
+
"sae_top_1_test_accuracy": 0.6982,
|
66 |
+
"sae_top_2_test_accuracy": 0.8256,
|
67 |
+
"sae_top_5_test_accuracy": 0.8992000000000001,
|
68 |
+
"sae_top_10_test_accuracy": null,
|
69 |
+
"sae_top_20_test_accuracy": null,
|
70 |
+
"sae_top_50_test_accuracy": null,
|
71 |
+
"sae_top_100_test_accuracy": null
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
|
75 |
+
"llm_test_accuracy": 0.9578000426292419,
|
76 |
+
"llm_top_1_test_accuracy": 0.6694000000000001,
|
77 |
+
"llm_top_2_test_accuracy": 0.725,
|
78 |
+
"llm_top_5_test_accuracy": 0.7654,
|
79 |
+
"llm_top_10_test_accuracy": null,
|
80 |
+
"llm_top_20_test_accuracy": null,
|
81 |
+
"llm_top_50_test_accuracy": null,
|
82 |
+
"llm_top_100_test_accuracy": null,
|
83 |
+
"sae_test_accuracy": 0.9466000437736511,
|
84 |
+
"sae_top_1_test_accuracy": 0.6604,
|
85 |
+
"sae_top_2_test_accuracy": 0.7849999999999999,
|
86 |
+
"sae_top_5_test_accuracy": 0.8326,
|
87 |
+
"sae_top_10_test_accuracy": null,
|
88 |
+
"sae_top_20_test_accuracy": null,
|
89 |
+
"sae_top_50_test_accuracy": null,
|
90 |
+
"sae_top_100_test_accuracy": null
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
|
94 |
+
"llm_test_accuracy": 0.9316000461578369,
|
95 |
+
"llm_top_1_test_accuracy": 0.687,
|
96 |
+
"llm_top_2_test_accuracy": 0.7492,
|
97 |
+
"llm_top_5_test_accuracy": 0.7704000000000001,
|
98 |
+
"llm_top_10_test_accuracy": null,
|
99 |
+
"llm_top_20_test_accuracy": null,
|
100 |
+
"llm_top_50_test_accuracy": null,
|
101 |
+
"llm_top_100_test_accuracy": null,
|
102 |
+
"sae_test_accuracy": 0.9314000368118286,
|
103 |
+
"sae_top_1_test_accuracy": 0.6974,
|
104 |
+
"sae_top_2_test_accuracy": 0.7984,
|
105 |
+
"sae_top_5_test_accuracy": 0.853,
|
106 |
+
"sae_top_10_test_accuracy": null,
|
107 |
+
"sae_top_20_test_accuracy": null,
|
108 |
+
"sae_top_50_test_accuracy": null,
|
109 |
+
"sae_top_100_test_accuracy": null
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
|
113 |
+
"llm_test_accuracy": 0.9202000379562378,
|
114 |
+
"llm_top_1_test_accuracy": 0.599,
|
115 |
+
"llm_top_2_test_accuracy": 0.6474,
|
116 |
+
"llm_top_5_test_accuracy": 0.6734,
|
117 |
+
"llm_top_10_test_accuracy": null,
|
118 |
+
"llm_top_20_test_accuracy": null,
|
119 |
+
"llm_top_50_test_accuracy": null,
|
120 |
+
"llm_top_100_test_accuracy": null,
|
121 |
+
"sae_test_accuracy": 0.9216000437736511,
|
122 |
+
"sae_top_1_test_accuracy": 0.6776000000000001,
|
123 |
+
"sae_top_2_test_accuracy": 0.7224,
|
124 |
+
"sae_top_5_test_accuracy": 0.8231999999999999,
|
125 |
+
"sae_top_10_test_accuracy": null,
|
126 |
+
"sae_top_20_test_accuracy": null,
|
127 |
+
"sae_top_50_test_accuracy": null,
|
128 |
+
"sae_top_100_test_accuracy": null
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
|
132 |
+
"llm_test_accuracy": 0.9795000553131104,
|
133 |
+
"llm_top_1_test_accuracy": 0.673,
|
134 |
+
"llm_top_2_test_accuracy": 0.724,
|
135 |
+
"llm_top_5_test_accuracy": 0.766,
|
136 |
+
"llm_top_10_test_accuracy": null,
|
137 |
+
"llm_top_20_test_accuracy": null,
|
138 |
+
"llm_top_50_test_accuracy": null,
|
139 |
+
"llm_top_100_test_accuracy": null,
|
140 |
+
"sae_test_accuracy": 0.9725000560283661,
|
141 |
+
"sae_top_1_test_accuracy": 0.644,
|
142 |
+
"sae_top_2_test_accuracy": 0.661,
|
143 |
+
"sae_top_5_test_accuracy": 0.909,
|
144 |
+
"sae_top_10_test_accuracy": null,
|
145 |
+
"sae_top_20_test_accuracy": null,
|
146 |
+
"sae_top_50_test_accuracy": null,
|
147 |
+
"sae_top_100_test_accuracy": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"dataset_name": "codeparrot/github-code_results",
|
151 |
+
"llm_test_accuracy": 0.9708000421524048,
|
152 |
+
"llm_top_1_test_accuracy": 0.6451999999999999,
|
153 |
+
"llm_top_2_test_accuracy": 0.6960000000000001,
|
154 |
+
"llm_top_5_test_accuracy": 0.7766,
|
155 |
+
"llm_top_10_test_accuracy": null,
|
156 |
+
"llm_top_20_test_accuracy": null,
|
157 |
+
"llm_top_50_test_accuracy": null,
|
158 |
+
"llm_top_100_test_accuracy": null,
|
159 |
+
"sae_test_accuracy": 0.9682000517845154,
|
160 |
+
"sae_top_1_test_accuracy": 0.6364000000000001,
|
161 |
+
"sae_top_2_test_accuracy": 0.7253999999999999,
|
162 |
+
"sae_top_5_test_accuracy": 0.8042,
|
163 |
+
"sae_top_10_test_accuracy": null,
|
164 |
+
"sae_top_20_test_accuracy": null,
|
165 |
+
"sae_top_50_test_accuracy": null,
|
166 |
+
"sae_top_100_test_accuracy": null
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"dataset_name": "fancyzhx/ag_news_results",
|
170 |
+
"llm_test_accuracy": 0.9500000476837158,
|
171 |
+
"llm_top_1_test_accuracy": 0.63775,
|
172 |
+
"llm_top_2_test_accuracy": 0.78175,
|
173 |
+
"llm_top_5_test_accuracy": 0.82125,
|
174 |
+
"llm_top_10_test_accuracy": null,
|
175 |
+
"llm_top_20_test_accuracy": null,
|
176 |
+
"llm_top_50_test_accuracy": null,
|
177 |
+
"llm_top_100_test_accuracy": null,
|
178 |
+
"sae_test_accuracy": 0.9502500593662262,
|
179 |
+
"sae_top_1_test_accuracy": 0.6407499999999999,
|
180 |
+
"sae_top_2_test_accuracy": 0.7829999999999999,
|
181 |
+
"sae_top_5_test_accuracy": 0.8530000000000001,
|
182 |
+
"sae_top_10_test_accuracy": null,
|
183 |
+
"sae_top_20_test_accuracy": null,
|
184 |
+
"sae_top_50_test_accuracy": null,
|
185 |
+
"sae_top_100_test_accuracy": null
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"dataset_name": "Helsinki-NLP/europarl_results",
|
189 |
+
"llm_test_accuracy": 0.9996000289916992,
|
190 |
+
"llm_top_1_test_accuracy": 0.6454,
|
191 |
+
"llm_top_2_test_accuracy": 0.7884,
|
192 |
+
"llm_top_5_test_accuracy": 0.9012,
|
193 |
+
"llm_top_10_test_accuracy": null,
|
194 |
+
"llm_top_20_test_accuracy": null,
|
195 |
+
"llm_top_50_test_accuracy": null,
|
196 |
+
"llm_top_100_test_accuracy": null,
|
197 |
+
"sae_test_accuracy": 0.9998000144958497,
|
198 |
+
"sae_top_1_test_accuracy": 0.8346,
|
199 |
+
"sae_top_2_test_accuracy": 0.9099999999999999,
|
200 |
+
"sae_top_5_test_accuracy": 0.9957999999999998,
|
201 |
+
"sae_top_10_test_accuracy": null,
|
202 |
+
"sae_top_20_test_accuracy": null,
|
203 |
+
"sae_top_50_test_accuracy": null,
|
204 |
+
"sae_top_100_test_accuracy": null
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
208 |
+
"sae_lens_id": "custom_sae",
|
209 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0",
|
210 |
+
"sae_lens_version": "5.4.2",
|
211 |
+
"sae_cfg_dict": {
|
212 |
+
"model_name": "gemma-2-2b",
|
213 |
+
"d_in": 2304,
|
214 |
+
"d_sae": 65536,
|
215 |
+
"hook_layer": 12,
|
216 |
+
"hook_name": "blocks.12.hook_resid_post",
|
217 |
+
"context_size": null,
|
218 |
+
"hook_head_index": null,
|
219 |
+
"architecture": "topk",
|
220 |
+
"apply_b_dec_to_input": null,
|
221 |
+
"finetuning_scaling_factor": null,
|
222 |
+
"activation_fn_str": "",
|
223 |
+
"prepend_bos": true,
|
224 |
+
"normalize_activations": "none",
|
225 |
+
"dtype": "bfloat16",
|
226 |
+
"device": "",
|
227 |
+
"dataset_path": "",
|
228 |
+
"dataset_trust_remote_code": true,
|
229 |
+
"seqpos_slice": [
|
230 |
+
null
|
231 |
+
],
|
232 |
+
"training_tokens": -100000,
|
233 |
+
"sae_lens_training_version": null,
|
234 |
+
"neuronpedia_id": null
|
235 |
+
},
|
236 |
+
"eval_result_unstructured": {
|
237 |
+
"LabHC/bias_in_bios_class_set1_results": {
|
238 |
+
"sae_test_accuracy": {
|
239 |
+
"0": 0.9450000524520874,
|
240 |
+
"1": 0.9540000557899475,
|
241 |
+
"2": 0.9440000653266907,
|
242 |
+
"6": 0.984000027179718,
|
243 |
+
"9": 0.9790000319480896
|
244 |
+
},
|
245 |
+
"llm_test_accuracy": {
|
246 |
+
"0": 0.9510000348091125,
|
247 |
+
"1": 0.9670000672340393,
|
248 |
+
"2": 0.9530000686645508,
|
249 |
+
"6": 0.987000048160553,
|
250 |
+
"9": 0.9760000705718994
|
251 |
+
},
|
252 |
+
"llm_top_1_test_accuracy": {
|
253 |
+
"0": 0.577,
|
254 |
+
"1": 0.613,
|
255 |
+
"2": 0.662,
|
256 |
+
"6": 0.787,
|
257 |
+
"9": 0.56
|
258 |
+
},
|
259 |
+
"llm_top_2_test_accuracy": {
|
260 |
+
"0": 0.574,
|
261 |
+
"1": 0.66,
|
262 |
+
"2": 0.718,
|
263 |
+
"6": 0.811,
|
264 |
+
"9": 0.714
|
265 |
+
},
|
266 |
+
"llm_top_5_test_accuracy": {
|
267 |
+
"0": 0.713,
|
268 |
+
"1": 0.711,
|
269 |
+
"2": 0.755,
|
270 |
+
"6": 0.895,
|
271 |
+
"9": 0.861
|
272 |
+
},
|
273 |
+
"sae_top_1_test_accuracy": {
|
274 |
+
"0": 0.547,
|
275 |
+
"1": 0.666,
|
276 |
+
"2": 0.849,
|
277 |
+
"6": 0.82,
|
278 |
+
"9": 0.609
|
279 |
+
},
|
280 |
+
"sae_top_2_test_accuracy": {
|
281 |
+
"0": 0.806,
|
282 |
+
"1": 0.662,
|
283 |
+
"2": 0.89,
|
284 |
+
"6": 0.832,
|
285 |
+
"9": 0.938
|
286 |
+
},
|
287 |
+
"sae_top_5_test_accuracy": {
|
288 |
+
"0": 0.811,
|
289 |
+
"1": 0.849,
|
290 |
+
"2": 0.9,
|
291 |
+
"6": 0.991,
|
292 |
+
"9": 0.945
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"LabHC/bias_in_bios_class_set2_results": {
|
296 |
+
"sae_test_accuracy": {
|
297 |
+
"11": 0.9540000557899475,
|
298 |
+
"13": 0.9420000314712524,
|
299 |
+
"14": 0.9500000476837158,
|
300 |
+
"18": 0.9240000247955322,
|
301 |
+
"19": 0.9630000591278076
|
302 |
+
},
|
303 |
+
"llm_test_accuracy": {
|
304 |
+
"11": 0.9690000414848328,
|
305 |
+
"13": 0.9600000381469727,
|
306 |
+
"14": 0.9600000381469727,
|
307 |
+
"18": 0.9390000700950623,
|
308 |
+
"19": 0.9610000252723694
|
309 |
+
},
|
310 |
+
"llm_top_1_test_accuracy": {
|
311 |
+
"11": 0.555,
|
312 |
+
"13": 0.668,
|
313 |
+
"14": 0.638,
|
314 |
+
"18": 0.69,
|
315 |
+
"19": 0.796
|
316 |
+
},
|
317 |
+
"llm_top_2_test_accuracy": {
|
318 |
+
"11": 0.756,
|
319 |
+
"13": 0.714,
|
320 |
+
"14": 0.67,
|
321 |
+
"18": 0.717,
|
322 |
+
"19": 0.768
|
323 |
+
},
|
324 |
+
"llm_top_5_test_accuracy": {
|
325 |
+
"11": 0.794,
|
326 |
+
"13": 0.749,
|
327 |
+
"14": 0.723,
|
328 |
+
"18": 0.73,
|
329 |
+
"19": 0.831
|
330 |
+
},
|
331 |
+
"sae_top_1_test_accuracy": {
|
332 |
+
"11": 0.565,
|
333 |
+
"13": 0.656,
|
334 |
+
"14": 0.628,
|
335 |
+
"18": 0.646,
|
336 |
+
"19": 0.807
|
337 |
+
},
|
338 |
+
"sae_top_2_test_accuracy": {
|
339 |
+
"11": 0.854,
|
340 |
+
"13": 0.679,
|
341 |
+
"14": 0.869,
|
342 |
+
"18": 0.699,
|
343 |
+
"19": 0.824
|
344 |
+
},
|
345 |
+
"sae_top_5_test_accuracy": {
|
346 |
+
"11": 0.953,
|
347 |
+
"13": 0.758,
|
348 |
+
"14": 0.891,
|
349 |
+
"18": 0.73,
|
350 |
+
"19": 0.831
|
351 |
+
}
|
352 |
+
},
|
353 |
+
"LabHC/bias_in_bios_class_set3_results": {
|
354 |
+
"sae_test_accuracy": {
|
355 |
+
"20": 0.9600000381469727,
|
356 |
+
"21": 0.9290000200271606,
|
357 |
+
"22": 0.9120000600814819,
|
358 |
+
"25": 0.9700000286102295,
|
359 |
+
"26": 0.8860000371932983
|
360 |
+
},
|
361 |
+
"llm_test_accuracy": {
|
362 |
+
"20": 0.956000030040741,
|
363 |
+
"21": 0.9350000619888306,
|
364 |
+
"22": 0.9180000424385071,
|
365 |
+
"25": 0.9640000462532043,
|
366 |
+
"26": 0.8850000500679016
|
367 |
+
},
|
368 |
+
"llm_top_1_test_accuracy": {
|
369 |
+
"20": 0.693,
|
370 |
+
"21": 0.775,
|
371 |
+
"22": 0.645,
|
372 |
+
"25": 0.706,
|
373 |
+
"26": 0.616
|
374 |
+
},
|
375 |
+
"llm_top_2_test_accuracy": {
|
376 |
+
"20": 0.827,
|
377 |
+
"21": 0.761,
|
378 |
+
"22": 0.694,
|
379 |
+
"25": 0.778,
|
380 |
+
"26": 0.686
|
381 |
+
},
|
382 |
+
"llm_top_5_test_accuracy": {
|
383 |
+
"20": 0.855,
|
384 |
+
"21": 0.791,
|
385 |
+
"22": 0.725,
|
386 |
+
"25": 0.809,
|
387 |
+
"26": 0.672
|
388 |
+
},
|
389 |
+
"sae_top_1_test_accuracy": {
|
390 |
+
"20": 0.568,
|
391 |
+
"21": 0.75,
|
392 |
+
"22": 0.858,
|
393 |
+
"25": 0.715,
|
394 |
+
"26": 0.596
|
395 |
+
},
|
396 |
+
"sae_top_2_test_accuracy": {
|
397 |
+
"20": 0.841,
|
398 |
+
"21": 0.782,
|
399 |
+
"22": 0.887,
|
400 |
+
"25": 0.865,
|
401 |
+
"26": 0.617
|
402 |
+
},
|
403 |
+
"sae_top_5_test_accuracy": {
|
404 |
+
"20": 0.913,
|
405 |
+
"21": 0.842,
|
406 |
+
"22": 0.883,
|
407 |
+
"25": 0.882,
|
408 |
+
"26": 0.745
|
409 |
+
}
|
410 |
+
},
|
411 |
+
"canrager/amazon_reviews_mcauley_1and5_results": {
|
412 |
+
"sae_test_accuracy": {
|
413 |
+
"1": 0.9500000476837158,
|
414 |
+
"2": 0.9350000619888306,
|
415 |
+
"3": 0.9240000247955322,
|
416 |
+
"5": 0.9180000424385071,
|
417 |
+
"6": 0.8810000419616699
|
418 |
+
},
|
419 |
+
"llm_test_accuracy": {
|
420 |
+
"1": 0.9580000638961792,
|
421 |
+
"2": 0.9330000281333923,
|
422 |
+
"3": 0.9280000329017639,
|
423 |
+
"5": 0.9200000166893005,
|
424 |
+
"6": 0.862000048160553
|
425 |
+
},
|
426 |
+
"llm_top_1_test_accuracy": {
|
427 |
+
"1": 0.647,
|
428 |
+
"2": 0.603,
|
429 |
+
"3": 0.598,
|
430 |
+
"5": 0.555,
|
431 |
+
"6": 0.592
|
432 |
+
},
|
433 |
+
"llm_top_2_test_accuracy": {
|
434 |
+
"1": 0.75,
|
435 |
+
"2": 0.648,
|
436 |
+
"3": 0.607,
|
437 |
+
"5": 0.606,
|
438 |
+
"6": 0.626
|
439 |
+
},
|
440 |
+
"llm_top_5_test_accuracy": {
|
441 |
+
"1": 0.767,
|
442 |
+
"2": 0.641,
|
443 |
+
"3": 0.645,
|
444 |
+
"5": 0.638,
|
445 |
+
"6": 0.676
|
446 |
+
},
|
447 |
+
"sae_top_1_test_accuracy": {
|
448 |
+
"1": 0.902,
|
449 |
+
"2": 0.852,
|
450 |
+
"3": 0.52,
|
451 |
+
"5": 0.555,
|
452 |
+
"6": 0.559
|
453 |
+
},
|
454 |
+
"sae_top_2_test_accuracy": {
|
455 |
+
"1": 0.907,
|
456 |
+
"2": 0.852,
|
457 |
+
"3": 0.553,
|
458 |
+
"5": 0.749,
|
459 |
+
"6": 0.551
|
460 |
+
},
|
461 |
+
"sae_top_5_test_accuracy": {
|
462 |
+
"1": 0.925,
|
463 |
+
"2": 0.882,
|
464 |
+
"3": 0.73,
|
465 |
+
"5": 0.852,
|
466 |
+
"6": 0.727
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
|
470 |
+
"sae_test_accuracy": {
|
471 |
+
"1.0": 0.9720000624656677,
|
472 |
+
"5.0": 0.9730000495910645
|
473 |
+
},
|
474 |
+
"llm_test_accuracy": {
|
475 |
+
"1.0": 0.9780000448226929,
|
476 |
+
"5.0": 0.9810000658035278
|
477 |
+
},
|
478 |
+
"llm_top_1_test_accuracy": {
|
479 |
+
"1.0": 0.673,
|
480 |
+
"5.0": 0.673
|
481 |
+
},
|
482 |
+
"llm_top_2_test_accuracy": {
|
483 |
+
"1.0": 0.724,
|
484 |
+
"5.0": 0.724
|
485 |
+
},
|
486 |
+
"llm_top_5_test_accuracy": {
|
487 |
+
"1.0": 0.766,
|
488 |
+
"5.0": 0.766
|
489 |
+
},
|
490 |
+
"sae_top_1_test_accuracy": {
|
491 |
+
"1.0": 0.644,
|
492 |
+
"5.0": 0.644
|
493 |
+
},
|
494 |
+
"sae_top_2_test_accuracy": {
|
495 |
+
"1.0": 0.661,
|
496 |
+
"5.0": 0.661
|
497 |
+
},
|
498 |
+
"sae_top_5_test_accuracy": {
|
499 |
+
"1.0": 0.909,
|
500 |
+
"5.0": 0.909
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"codeparrot/github-code_results": {
|
504 |
+
"sae_test_accuracy": {
|
505 |
+
"C": 0.9540000557899475,
|
506 |
+
"Python": 0.9860000610351562,
|
507 |
+
"HTML": 0.9820000529289246,
|
508 |
+
"Java": 0.9620000720024109,
|
509 |
+
"PHP": 0.9570000171661377
|
510 |
+
},
|
511 |
+
"llm_test_accuracy": {
|
512 |
+
"C": 0.956000030040741,
|
513 |
+
"Python": 0.984000027179718,
|
514 |
+
"HTML": 0.9900000691413879,
|
515 |
+
"Java": 0.9670000672340393,
|
516 |
+
"PHP": 0.9570000171661377
|
517 |
+
},
|
518 |
+
"llm_top_1_test_accuracy": {
|
519 |
+
"C": 0.666,
|
520 |
+
"Python": 0.626,
|
521 |
+
"HTML": 0.721,
|
522 |
+
"Java": 0.619,
|
523 |
+
"PHP": 0.594
|
524 |
+
},
|
525 |
+
"llm_top_2_test_accuracy": {
|
526 |
+
"C": 0.679,
|
527 |
+
"Python": 0.674,
|
528 |
+
"HTML": 0.8,
|
529 |
+
"Java": 0.676,
|
530 |
+
"PHP": 0.651
|
531 |
+
},
|
532 |
+
"llm_top_5_test_accuracy": {
|
533 |
+
"C": 0.783,
|
534 |
+
"Python": 0.717,
|
535 |
+
"HTML": 0.935,
|
536 |
+
"Java": 0.733,
|
537 |
+
"PHP": 0.715
|
538 |
+
},
|
539 |
+
"sae_top_1_test_accuracy": {
|
540 |
+
"C": 0.623,
|
541 |
+
"Python": 0.623,
|
542 |
+
"HTML": 0.689,
|
543 |
+
"Java": 0.632,
|
544 |
+
"PHP": 0.615
|
545 |
+
},
|
546 |
+
"sae_top_2_test_accuracy": {
|
547 |
+
"C": 0.617,
|
548 |
+
"Python": 0.657,
|
549 |
+
"HTML": 0.811,
|
550 |
+
"Java": 0.629,
|
551 |
+
"PHP": 0.913
|
552 |
+
},
|
553 |
+
"sae_top_5_test_accuracy": {
|
554 |
+
"C": 0.681,
|
555 |
+
"Python": 0.94,
|
556 |
+
"HTML": 0.833,
|
557 |
+
"Java": 0.645,
|
558 |
+
"PHP": 0.922
|
559 |
+
}
|
560 |
+
},
|
561 |
+
"fancyzhx/ag_news_results": {
|
562 |
+
"sae_test_accuracy": {
|
563 |
+
"0": 0.9390000700950623,
|
564 |
+
"1": 0.9790000319480896,
|
565 |
+
"2": 0.9390000700950623,
|
566 |
+
"3": 0.9440000653266907
|
567 |
+
},
|
568 |
+
"llm_test_accuracy": {
|
569 |
+
"0": 0.940000057220459,
|
570 |
+
"1": 0.9860000610351562,
|
571 |
+
"2": 0.9200000166893005,
|
572 |
+
"3": 0.9540000557899475
|
573 |
+
},
|
574 |
+
"llm_top_1_test_accuracy": {
|
575 |
+
"0": 0.573,
|
576 |
+
"1": 0.671,
|
577 |
+
"2": 0.672,
|
578 |
+
"3": 0.635
|
579 |
+
},
|
580 |
+
"llm_top_2_test_accuracy": {
|
581 |
+
"0": 0.802,
|
582 |
+
"1": 0.808,
|
583 |
+
"2": 0.701,
|
584 |
+
"3": 0.816
|
585 |
+
},
|
586 |
+
"llm_top_5_test_accuracy": {
|
587 |
+
"0": 0.81,
|
588 |
+
"1": 0.891,
|
589 |
+
"2": 0.752,
|
590 |
+
"3": 0.832
|
591 |
+
},
|
592 |
+
"sae_top_1_test_accuracy": {
|
593 |
+
"0": 0.67,
|
594 |
+
"1": 0.641,
|
595 |
+
"2": 0.546,
|
596 |
+
"3": 0.706
|
597 |
+
},
|
598 |
+
"sae_top_2_test_accuracy": {
|
599 |
+
"0": 0.695,
|
600 |
+
"1": 0.94,
|
601 |
+
"2": 0.76,
|
602 |
+
"3": 0.737
|
603 |
+
},
|
604 |
+
"sae_top_5_test_accuracy": {
|
605 |
+
"0": 0.842,
|
606 |
+
"1": 0.96,
|
607 |
+
"2": 0.809,
|
608 |
+
"3": 0.801
|
609 |
+
}
|
610 |
+
},
|
611 |
+
"Helsinki-NLP/europarl_results": {
|
612 |
+
"sae_test_accuracy": {
|
613 |
+
"en": 1.0,
|
614 |
+
"fr": 1.0,
|
615 |
+
"de": 1.0,
|
616 |
+
"es": 1.0,
|
617 |
+
"nl": 0.999000072479248
|
618 |
+
},
|
619 |
+
"llm_test_accuracy": {
|
620 |
+
"en": 1.0,
|
621 |
+
"fr": 0.999000072479248,
|
622 |
+
"de": 1.0,
|
623 |
+
"es": 0.999000072479248,
|
624 |
+
"nl": 1.0
|
625 |
+
},
|
626 |
+
"llm_top_1_test_accuracy": {
|
627 |
+
"en": 0.739,
|
628 |
+
"fr": 0.585,
|
629 |
+
"de": 0.758,
|
630 |
+
"es": 0.496,
|
631 |
+
"nl": 0.649
|
632 |
+
},
|
633 |
+
"llm_top_2_test_accuracy": {
|
634 |
+
"en": 0.829,
|
635 |
+
"fr": 0.582,
|
636 |
+
"de": 0.82,
|
637 |
+
"es": 0.958,
|
638 |
+
"nl": 0.753
|
639 |
+
},
|
640 |
+
"llm_top_5_test_accuracy": {
|
641 |
+
"en": 0.892,
|
642 |
+
"fr": 0.888,
|
643 |
+
"de": 0.894,
|
644 |
+
"es": 0.98,
|
645 |
+
"nl": 0.852
|
646 |
+
},
|
647 |
+
"sae_top_1_test_accuracy": {
|
648 |
+
"en": 0.718,
|
649 |
+
"fr": 0.993,
|
650 |
+
"de": 0.9,
|
651 |
+
"es": 0.902,
|
652 |
+
"nl": 0.66
|
653 |
+
},
|
654 |
+
"sae_top_2_test_accuracy": {
|
655 |
+
"en": 0.739,
|
656 |
+
"fr": 0.994,
|
657 |
+
"de": 0.907,
|
658 |
+
"es": 0.913,
|
659 |
+
"nl": 0.997
|
660 |
+
},
|
661 |
+
"sae_top_5_test_accuracy": {
|
662 |
+
"en": 0.998,
|
663 |
+
"fr": 0.996,
|
664 |
+
"de": 0.992,
|
665 |
+
"es": 0.993,
|
666 |
+
"nl": 1.0
|
667 |
+
}
|
668 |
+
}
|
669 |
+
}
|
670 |
+
}
|
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "sparse_probing",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"LabHC/bias_in_bios_class_set2",
|
8 |
+
"LabHC/bias_in_bios_class_set3",
|
9 |
+
"canrager/amazon_reviews_mcauley_1and5",
|
10 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment",
|
11 |
+
"codeparrot/github-code",
|
12 |
+
"fancyzhx/ag_news",
|
13 |
+
"Helsinki-NLP/europarl"
|
14 |
+
],
|
15 |
+
"probe_train_set_size": 4000,
|
16 |
+
"probe_test_set_size": 1000,
|
17 |
+
"context_length": 128,
|
18 |
+
"sae_batch_size": 125,
|
19 |
+
"llm_batch_size": 32,
|
20 |
+
"llm_dtype": "bfloat16",
|
21 |
+
"model_name": "gemma-2-2b",
|
22 |
+
"k_values": [
|
23 |
+
1,
|
24 |
+
2,
|
25 |
+
5
|
26 |
+
],
|
27 |
+
"lower_vram_usage": false
|
28 |
+
},
|
29 |
+
"eval_id": "82b9980d-82e4-4ea8-9e22-8e6990b9b64e",
|
30 |
+
"datetime_epoch_millis": 1740164519938,
|
31 |
+
"eval_result_metrics": {
|
32 |
+
"llm": {
|
33 |
+
"llm_test_accuracy": 0.9595375448465346,
|
34 |
+
"llm_top_1_test_accuracy": 0.64956875,
|
35 |
+
"llm_top_2_test_accuracy": 0.72589375,
|
36 |
+
"llm_top_5_test_accuracy": 0.78265625,
|
37 |
+
"llm_top_10_test_accuracy": null,
|
38 |
+
"llm_top_20_test_accuracy": null,
|
39 |
+
"llm_top_50_test_accuracy": null,
|
40 |
+
"llm_top_100_test_accuracy": null
|
41 |
+
},
|
42 |
+
"sae": {
|
43 |
+
"sae_test_accuracy": 0.9534375388175249,
|
44 |
+
"sae_top_1_test_accuracy": 0.7455499999999999,
|
45 |
+
"sae_top_2_test_accuracy": 0.81490625,
|
46 |
+
"sae_top_5_test_accuracy": 0.8733625,
|
47 |
+
"sae_top_10_test_accuracy": null,
|
48 |
+
"sae_top_20_test_accuracy": null,
|
49 |
+
"sae_top_50_test_accuracy": null,
|
50 |
+
"sae_top_100_test_accuracy": null
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"eval_result_details": [
|
54 |
+
{
|
55 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
|
56 |
+
"llm_test_accuracy": 0.966800057888031,
|
57 |
+
"llm_top_1_test_accuracy": 0.6397999999999999,
|
58 |
+
"llm_top_2_test_accuracy": 0.6954,
|
59 |
+
"llm_top_5_test_accuracy": 0.7869999999999999,
|
60 |
+
"llm_top_10_test_accuracy": null,
|
61 |
+
"llm_top_20_test_accuracy": null,
|
62 |
+
"llm_top_50_test_accuracy": null,
|
63 |
+
"llm_top_100_test_accuracy": null,
|
64 |
+
"sae_test_accuracy": 0.9592000484466553,
|
65 |
+
"sae_top_1_test_accuracy": 0.7891999999999999,
|
66 |
+
"sae_top_2_test_accuracy": 0.8113999999999999,
|
67 |
+
"sae_top_5_test_accuracy": 0.9046,
|
68 |
+
"sae_top_10_test_accuracy": null,
|
69 |
+
"sae_top_20_test_accuracy": null,
|
70 |
+
"sae_top_50_test_accuracy": null,
|
71 |
+
"sae_top_100_test_accuracy": null
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
|
75 |
+
"llm_test_accuracy": 0.9578000426292419,
|
76 |
+
"llm_top_1_test_accuracy": 0.6694000000000001,
|
77 |
+
"llm_top_2_test_accuracy": 0.725,
|
78 |
+
"llm_top_5_test_accuracy": 0.7654,
|
79 |
+
"llm_top_10_test_accuracy": null,
|
80 |
+
"llm_top_20_test_accuracy": null,
|
81 |
+
"llm_top_50_test_accuracy": null,
|
82 |
+
"llm_top_100_test_accuracy": null,
|
83 |
+
"sae_test_accuracy": 0.9442000389099121,
|
84 |
+
"sae_top_1_test_accuracy": 0.6958,
|
85 |
+
"sae_top_2_test_accuracy": 0.8019999999999999,
|
86 |
+
"sae_top_5_test_accuracy": 0.8480000000000001,
|
87 |
+
"sae_top_10_test_accuracy": null,
|
88 |
+
"sae_top_20_test_accuracy": null,
|
89 |
+
"sae_top_50_test_accuracy": null,
|
90 |
+
"sae_top_100_test_accuracy": null
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
|
94 |
+
"llm_test_accuracy": 0.9316000461578369,
|
95 |
+
"llm_top_1_test_accuracy": 0.687,
|
96 |
+
"llm_top_2_test_accuracy": 0.7492,
|
97 |
+
"llm_top_5_test_accuracy": 0.7704000000000001,
|
98 |
+
"llm_top_10_test_accuracy": null,
|
99 |
+
"llm_top_20_test_accuracy": null,
|
100 |
+
"llm_top_50_test_accuracy": null,
|
101 |
+
"llm_top_100_test_accuracy": null,
|
102 |
+
"sae_test_accuracy": 0.9224000334739685,
|
103 |
+
"sae_top_1_test_accuracy": 0.7988,
|
104 |
+
"sae_top_2_test_accuracy": 0.836,
|
105 |
+
"sae_top_5_test_accuracy": 0.8666,
|
106 |
+
"sae_top_10_test_accuracy": null,
|
107 |
+
"sae_top_20_test_accuracy": null,
|
108 |
+
"sae_top_50_test_accuracy": null,
|
109 |
+
"sae_top_100_test_accuracy": null
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
|
113 |
+
"llm_test_accuracy": 0.9202000379562378,
|
114 |
+
"llm_top_1_test_accuracy": 0.599,
|
115 |
+
"llm_top_2_test_accuracy": 0.6474,
|
116 |
+
"llm_top_5_test_accuracy": 0.6734,
|
117 |
+
"llm_top_10_test_accuracy": null,
|
118 |
+
"llm_top_20_test_accuracy": null,
|
119 |
+
"llm_top_50_test_accuracy": null,
|
120 |
+
"llm_top_100_test_accuracy": null,
|
121 |
+
"sae_test_accuracy": 0.9188000440597535,
|
122 |
+
"sae_top_1_test_accuracy": 0.6394,
|
123 |
+
"sae_top_2_test_accuracy": 0.7183999999999999,
|
124 |
+
"sae_top_5_test_accuracy": 0.798,
|
125 |
+
"sae_top_10_test_accuracy": null,
|
126 |
+
"sae_top_20_test_accuracy": null,
|
127 |
+
"sae_top_50_test_accuracy": null,
|
128 |
+
"sae_top_100_test_accuracy": null
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
|
132 |
+
"llm_test_accuracy": 0.9795000553131104,
|
133 |
+
"llm_top_1_test_accuracy": 0.673,
|
134 |
+
"llm_top_2_test_accuracy": 0.724,
|
135 |
+
"llm_top_5_test_accuracy": 0.766,
|
136 |
+
"llm_top_10_test_accuracy": null,
|
137 |
+
"llm_top_20_test_accuracy": null,
|
138 |
+
"llm_top_50_test_accuracy": null,
|
139 |
+
"llm_top_100_test_accuracy": null,
|
140 |
+
"sae_test_accuracy": 0.9685000479221344,
|
141 |
+
"sae_top_1_test_accuracy": 0.76,
|
142 |
+
"sae_top_2_test_accuracy": 0.931,
|
143 |
+
"sae_top_5_test_accuracy": 0.939,
|
144 |
+
"sae_top_10_test_accuracy": null,
|
145 |
+
"sae_top_20_test_accuracy": null,
|
146 |
+
"sae_top_50_test_accuracy": null,
|
147 |
+
"sae_top_100_test_accuracy": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"dataset_name": "codeparrot/github-code_results",
|
151 |
+
"llm_test_accuracy": 0.9708000421524048,
|
152 |
+
"llm_top_1_test_accuracy": 0.6451999999999999,
|
153 |
+
"llm_top_2_test_accuracy": 0.6960000000000001,
|
154 |
+
"llm_top_5_test_accuracy": 0.7766,
|
155 |
+
"llm_top_10_test_accuracy": null,
|
156 |
+
"llm_top_20_test_accuracy": null,
|
157 |
+
"llm_top_50_test_accuracy": null,
|
158 |
+
"llm_top_100_test_accuracy": null,
|
159 |
+
"sae_test_accuracy": 0.9700000286102295,
|
160 |
+
"sae_top_1_test_accuracy": 0.6275999999999999,
|
161 |
+
"sae_top_2_test_accuracy": 0.7262000000000001,
|
162 |
+
"sae_top_5_test_accuracy": 0.7979999999999999,
|
163 |
+
"sae_top_10_test_accuracy": null,
|
164 |
+
"sae_top_20_test_accuracy": null,
|
165 |
+
"sae_top_50_test_accuracy": null,
|
166 |
+
"sae_top_100_test_accuracy": null
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"dataset_name": "fancyzhx/ag_news_results",
|
170 |
+
"llm_test_accuracy": 0.9500000476837158,
|
171 |
+
"llm_top_1_test_accuracy": 0.63775,
|
172 |
+
"llm_top_2_test_accuracy": 0.78175,
|
173 |
+
"llm_top_5_test_accuracy": 0.82125,
|
174 |
+
"llm_top_10_test_accuracy": null,
|
175 |
+
"llm_top_20_test_accuracy": null,
|
176 |
+
"llm_top_50_test_accuracy": null,
|
177 |
+
"llm_top_100_test_accuracy": null,
|
178 |
+
"sae_test_accuracy": 0.9460000246763229,
|
179 |
+
"sae_top_1_test_accuracy": 0.713,
|
180 |
+
"sae_top_2_test_accuracy": 0.75225,
|
181 |
+
"sae_top_5_test_accuracy": 0.8434999999999999,
|
182 |
+
"sae_top_10_test_accuracy": null,
|
183 |
+
"sae_top_20_test_accuracy": null,
|
184 |
+
"sae_top_50_test_accuracy": null,
|
185 |
+
"sae_top_100_test_accuracy": null
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"dataset_name": "Helsinki-NLP/europarl_results",
|
189 |
+
"llm_test_accuracy": 0.9996000289916992,
|
190 |
+
"llm_top_1_test_accuracy": 0.6454,
|
191 |
+
"llm_top_2_test_accuracy": 0.7884,
|
192 |
+
"llm_top_5_test_accuracy": 0.9012,
|
193 |
+
"llm_top_10_test_accuracy": null,
|
194 |
+
"llm_top_20_test_accuracy": null,
|
195 |
+
"llm_top_50_test_accuracy": null,
|
196 |
+
"llm_top_100_test_accuracy": null,
|
197 |
+
"sae_test_accuracy": 0.9984000444412231,
|
198 |
+
"sae_top_1_test_accuracy": 0.9406000000000001,
|
199 |
+
"sae_top_2_test_accuracy": 0.942,
|
200 |
+
"sae_top_5_test_accuracy": 0.9892,
|
201 |
+
"sae_top_10_test_accuracy": null,
|
202 |
+
"sae_top_20_test_accuracy": null,
|
203 |
+
"sae_top_50_test_accuracy": null,
|
204 |
+
"sae_top_100_test_accuracy": null
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
208 |
+
"sae_lens_id": "custom_sae",
|
209 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0",
|
210 |
+
"sae_lens_version": "5.4.2",
|
211 |
+
"sae_cfg_dict": {
|
212 |
+
"model_name": "gemma-2-2b",
|
213 |
+
"d_in": 2304,
|
214 |
+
"d_sae": 65536,
|
215 |
+
"hook_layer": 12,
|
216 |
+
"hook_name": "blocks.12.hook_resid_post",
|
217 |
+
"context_size": null,
|
218 |
+
"hook_head_index": null,
|
219 |
+
"architecture": "topk",
|
220 |
+
"apply_b_dec_to_input": null,
|
221 |
+
"finetuning_scaling_factor": null,
|
222 |
+
"activation_fn_str": "",
|
223 |
+
"prepend_bos": true,
|
224 |
+
"normalize_activations": "none",
|
225 |
+
"dtype": "bfloat16",
|
226 |
+
"device": "",
|
227 |
+
"dataset_path": "",
|
228 |
+
"dataset_trust_remote_code": true,
|
229 |
+
"seqpos_slice": [
|
230 |
+
null
|
231 |
+
],
|
232 |
+
"training_tokens": -100000,
|
233 |
+
"sae_lens_training_version": null,
|
234 |
+
"neuronpedia_id": null
|
235 |
+
},
|
236 |
+
"eval_result_unstructured": {
|
237 |
+
"LabHC/bias_in_bios_class_set1_results": {
|
238 |
+
"sae_test_accuracy": {
|
239 |
+
"0": 0.9390000700950623,
|
240 |
+
"1": 0.9580000638961792,
|
241 |
+
"2": 0.9500000476837158,
|
242 |
+
"6": 0.984000027179718,
|
243 |
+
"9": 0.9650000333786011
|
244 |
+
},
|
245 |
+
"llm_test_accuracy": {
|
246 |
+
"0": 0.9510000348091125,
|
247 |
+
"1": 0.9670000672340393,
|
248 |
+
"2": 0.9530000686645508,
|
249 |
+
"6": 0.987000048160553,
|
250 |
+
"9": 0.9760000705718994
|
251 |
+
},
|
252 |
+
"llm_top_1_test_accuracy": {
|
253 |
+
"0": 0.577,
|
254 |
+
"1": 0.613,
|
255 |
+
"2": 0.662,
|
256 |
+
"6": 0.787,
|
257 |
+
"9": 0.56
|
258 |
+
},
|
259 |
+
"llm_top_2_test_accuracy": {
|
260 |
+
"0": 0.574,
|
261 |
+
"1": 0.66,
|
262 |
+
"2": 0.718,
|
263 |
+
"6": 0.811,
|
264 |
+
"9": 0.714
|
265 |
+
},
|
266 |
+
"llm_top_5_test_accuracy": {
|
267 |
+
"0": 0.713,
|
268 |
+
"1": 0.711,
|
269 |
+
"2": 0.755,
|
270 |
+
"6": 0.895,
|
271 |
+
"9": 0.861
|
272 |
+
},
|
273 |
+
"sae_top_1_test_accuracy": {
|
274 |
+
"0": 0.573,
|
275 |
+
"1": 0.579,
|
276 |
+
"2": 0.896,
|
277 |
+
"6": 0.961,
|
278 |
+
"9": 0.937
|
279 |
+
},
|
280 |
+
"sae_top_2_test_accuracy": {
|
281 |
+
"0": 0.624,
|
282 |
+
"1": 0.633,
|
283 |
+
"2": 0.905,
|
284 |
+
"6": 0.961,
|
285 |
+
"9": 0.934
|
286 |
+
},
|
287 |
+
"sae_top_5_test_accuracy": {
|
288 |
+
"0": 0.856,
|
289 |
+
"1": 0.83,
|
290 |
+
"2": 0.91,
|
291 |
+
"6": 0.984,
|
292 |
+
"9": 0.943
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"LabHC/bias_in_bios_class_set2_results": {
|
296 |
+
"sae_test_accuracy": {
|
297 |
+
"11": 0.9520000219345093,
|
298 |
+
"13": 0.9470000267028809,
|
299 |
+
"14": 0.940000057220459,
|
300 |
+
"18": 0.9190000295639038,
|
301 |
+
"19": 0.9630000591278076
|
302 |
+
},
|
303 |
+
"llm_test_accuracy": {
|
304 |
+
"11": 0.9690000414848328,
|
305 |
+
"13": 0.9600000381469727,
|
306 |
+
"14": 0.9600000381469727,
|
307 |
+
"18": 0.9390000700950623,
|
308 |
+
"19": 0.9610000252723694
|
309 |
+
},
|
310 |
+
"llm_top_1_test_accuracy": {
|
311 |
+
"11": 0.555,
|
312 |
+
"13": 0.668,
|
313 |
+
"14": 0.638,
|
314 |
+
"18": 0.69,
|
315 |
+
"19": 0.796
|
316 |
+
},
|
317 |
+
"llm_top_2_test_accuracy": {
|
318 |
+
"11": 0.756,
|
319 |
+
"13": 0.714,
|
320 |
+
"14": 0.67,
|
321 |
+
"18": 0.717,
|
322 |
+
"19": 0.768
|
323 |
+
},
|
324 |
+
"llm_top_5_test_accuracy": {
|
325 |
+
"11": 0.794,
|
326 |
+
"13": 0.749,
|
327 |
+
"14": 0.723,
|
328 |
+
"18": 0.73,
|
329 |
+
"19": 0.831
|
330 |
+
},
|
331 |
+
"sae_top_1_test_accuracy": {
|
332 |
+
"11": 0.618,
|
333 |
+
"13": 0.646,
|
334 |
+
"14": 0.731,
|
335 |
+
"18": 0.641,
|
336 |
+
"19": 0.843
|
337 |
+
},
|
338 |
+
"sae_top_2_test_accuracy": {
|
339 |
+
"11": 0.723,
|
340 |
+
"13": 0.689,
|
341 |
+
"14": 0.854,
|
342 |
+
"18": 0.882,
|
343 |
+
"19": 0.862
|
344 |
+
},
|
345 |
+
"sae_top_5_test_accuracy": {
|
346 |
+
"11": 0.835,
|
347 |
+
"13": 0.746,
|
348 |
+
"14": 0.858,
|
349 |
+
"18": 0.899,
|
350 |
+
"19": 0.902
|
351 |
+
}
|
352 |
+
},
|
353 |
+
"LabHC/bias_in_bios_class_set3_results": {
|
354 |
+
"sae_test_accuracy": {
|
355 |
+
"20": 0.9520000219345093,
|
356 |
+
"21": 0.9160000681877136,
|
357 |
+
"22": 0.9100000262260437,
|
358 |
+
"25": 0.9470000267028809,
|
359 |
+
"26": 0.8870000243186951
|
360 |
+
},
|
361 |
+
"llm_test_accuracy": {
|
362 |
+
"20": 0.956000030040741,
|
363 |
+
"21": 0.9350000619888306,
|
364 |
+
"22": 0.9180000424385071,
|
365 |
+
"25": 0.9640000462532043,
|
366 |
+
"26": 0.8850000500679016
|
367 |
+
},
|
368 |
+
"llm_top_1_test_accuracy": {
|
369 |
+
"20": 0.693,
|
370 |
+
"21": 0.775,
|
371 |
+
"22": 0.645,
|
372 |
+
"25": 0.706,
|
373 |
+
"26": 0.616
|
374 |
+
},
|
375 |
+
"llm_top_2_test_accuracy": {
|
376 |
+
"20": 0.827,
|
377 |
+
"21": 0.761,
|
378 |
+
"22": 0.694,
|
379 |
+
"25": 0.778,
|
380 |
+
"26": 0.686
|
381 |
+
},
|
382 |
+
"llm_top_5_test_accuracy": {
|
383 |
+
"20": 0.855,
|
384 |
+
"21": 0.791,
|
385 |
+
"22": 0.725,
|
386 |
+
"25": 0.809,
|
387 |
+
"26": 0.672
|
388 |
+
},
|
389 |
+
"sae_top_1_test_accuracy": {
|
390 |
+
"20": 0.909,
|
391 |
+
"21": 0.795,
|
392 |
+
"22": 0.799,
|
393 |
+
"25": 0.876,
|
394 |
+
"26": 0.615
|
395 |
+
},
|
396 |
+
"sae_top_2_test_accuracy": {
|
397 |
+
"20": 0.914,
|
398 |
+
"21": 0.812,
|
399 |
+
"22": 0.872,
|
400 |
+
"25": 0.887,
|
401 |
+
"26": 0.695
|
402 |
+
},
|
403 |
+
"sae_top_5_test_accuracy": {
|
404 |
+
"20": 0.921,
|
405 |
+
"21": 0.817,
|
406 |
+
"22": 0.86,
|
407 |
+
"25": 0.918,
|
408 |
+
"26": 0.817
|
409 |
+
}
|
410 |
+
},
|
411 |
+
"canrager/amazon_reviews_mcauley_1and5_results": {
|
412 |
+
"sae_test_accuracy": {
|
413 |
+
"1": 0.9510000348091125,
|
414 |
+
"2": 0.937000036239624,
|
415 |
+
"3": 0.9130000472068787,
|
416 |
+
"5": 0.921000063419342,
|
417 |
+
"6": 0.8720000386238098
|
418 |
+
},
|
419 |
+
"llm_test_accuracy": {
|
420 |
+
"1": 0.9580000638961792,
|
421 |
+
"2": 0.9330000281333923,
|
422 |
+
"3": 0.9280000329017639,
|
423 |
+
"5": 0.9200000166893005,
|
424 |
+
"6": 0.862000048160553
|
425 |
+
},
|
426 |
+
"llm_top_1_test_accuracy": {
|
427 |
+
"1": 0.647,
|
428 |
+
"2": 0.603,
|
429 |
+
"3": 0.598,
|
430 |
+
"5": 0.555,
|
431 |
+
"6": 0.592
|
432 |
+
},
|
433 |
+
"llm_top_2_test_accuracy": {
|
434 |
+
"1": 0.75,
|
435 |
+
"2": 0.648,
|
436 |
+
"3": 0.607,
|
437 |
+
"5": 0.606,
|
438 |
+
"6": 0.626
|
439 |
+
},
|
440 |
+
"llm_top_5_test_accuracy": {
|
441 |
+
"1": 0.767,
|
442 |
+
"2": 0.641,
|
443 |
+
"3": 0.645,
|
444 |
+
"5": 0.638,
|
445 |
+
"6": 0.676
|
446 |
+
},
|
447 |
+
"sae_top_1_test_accuracy": {
|
448 |
+
"1": 0.724,
|
449 |
+
"2": 0.626,
|
450 |
+
"3": 0.577,
|
451 |
+
"5": 0.553,
|
452 |
+
"6": 0.717
|
453 |
+
},
|
454 |
+
"sae_top_2_test_accuracy": {
|
455 |
+
"1": 0.733,
|
456 |
+
"2": 0.859,
|
457 |
+
"3": 0.615,
|
458 |
+
"5": 0.666,
|
459 |
+
"6": 0.719
|
460 |
+
},
|
461 |
+
"sae_top_5_test_accuracy": {
|
462 |
+
"1": 0.817,
|
463 |
+
"2": 0.887,
|
464 |
+
"3": 0.704,
|
465 |
+
"5": 0.826,
|
466 |
+
"6": 0.756
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
|
470 |
+
"sae_test_accuracy": {
|
471 |
+
"1.0": 0.9690000414848328,
|
472 |
+
"5.0": 0.968000054359436
|
473 |
+
},
|
474 |
+
"llm_test_accuracy": {
|
475 |
+
"1.0": 0.9780000448226929,
|
476 |
+
"5.0": 0.9810000658035278
|
477 |
+
},
|
478 |
+
"llm_top_1_test_accuracy": {
|
479 |
+
"1.0": 0.673,
|
480 |
+
"5.0": 0.673
|
481 |
+
},
|
482 |
+
"llm_top_2_test_accuracy": {
|
483 |
+
"1.0": 0.724,
|
484 |
+
"5.0": 0.724
|
485 |
+
},
|
486 |
+
"llm_top_5_test_accuracy": {
|
487 |
+
"1.0": 0.766,
|
488 |
+
"5.0": 0.766
|
489 |
+
},
|
490 |
+
"sae_top_1_test_accuracy": {
|
491 |
+
"1.0": 0.76,
|
492 |
+
"5.0": 0.76
|
493 |
+
},
|
494 |
+
"sae_top_2_test_accuracy": {
|
495 |
+
"1.0": 0.931,
|
496 |
+
"5.0": 0.931
|
497 |
+
},
|
498 |
+
"sae_top_5_test_accuracy": {
|
499 |
+
"1.0": 0.939,
|
500 |
+
"5.0": 0.939
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"codeparrot/github-code_results": {
|
504 |
+
"sae_test_accuracy": {
|
505 |
+
"C": 0.956000030040741,
|
506 |
+
"Python": 0.9890000224113464,
|
507 |
+
"HTML": 0.984000027179718,
|
508 |
+
"Java": 0.9650000333786011,
|
509 |
+
"PHP": 0.956000030040741
|
510 |
+
},
|
511 |
+
"llm_test_accuracy": {
|
512 |
+
"C": 0.956000030040741,
|
513 |
+
"Python": 0.984000027179718,
|
514 |
+
"HTML": 0.9900000691413879,
|
515 |
+
"Java": 0.9670000672340393,
|
516 |
+
"PHP": 0.9570000171661377
|
517 |
+
},
|
518 |
+
"llm_top_1_test_accuracy": {
|
519 |
+
"C": 0.666,
|
520 |
+
"Python": 0.626,
|
521 |
+
"HTML": 0.721,
|
522 |
+
"Java": 0.619,
|
523 |
+
"PHP": 0.594
|
524 |
+
},
|
525 |
+
"llm_top_2_test_accuracy": {
|
526 |
+
"C": 0.679,
|
527 |
+
"Python": 0.674,
|
528 |
+
"HTML": 0.8,
|
529 |
+
"Java": 0.676,
|
530 |
+
"PHP": 0.651
|
531 |
+
},
|
532 |
+
"llm_top_5_test_accuracy": {
|
533 |
+
"C": 0.783,
|
534 |
+
"Python": 0.717,
|
535 |
+
"HTML": 0.935,
|
536 |
+
"Java": 0.733,
|
537 |
+
"PHP": 0.715
|
538 |
+
},
|
539 |
+
"sae_top_1_test_accuracy": {
|
540 |
+
"C": 0.576,
|
541 |
+
"Python": 0.547,
|
542 |
+
"HTML": 0.768,
|
543 |
+
"Java": 0.652,
|
544 |
+
"PHP": 0.595
|
545 |
+
},
|
546 |
+
"sae_top_2_test_accuracy": {
|
547 |
+
"C": 0.595,
|
548 |
+
"Python": 0.699,
|
549 |
+
"HTML": 0.777,
|
550 |
+
"Java": 0.638,
|
551 |
+
"PHP": 0.922
|
552 |
+
},
|
553 |
+
"sae_top_5_test_accuracy": {
|
554 |
+
"C": 0.705,
|
555 |
+
"Python": 0.715,
|
556 |
+
"HTML": 0.936,
|
557 |
+
"Java": 0.714,
|
558 |
+
"PHP": 0.92
|
559 |
+
}
|
560 |
+
},
|
561 |
+
"fancyzhx/ag_news_results": {
|
562 |
+
"sae_test_accuracy": {
|
563 |
+
"0": 0.9340000152587891,
|
564 |
+
"1": 0.9800000190734863,
|
565 |
+
"2": 0.9200000166893005,
|
566 |
+
"3": 0.9500000476837158
|
567 |
+
},
|
568 |
+
"llm_test_accuracy": {
|
569 |
+
"0": 0.940000057220459,
|
570 |
+
"1": 0.9860000610351562,
|
571 |
+
"2": 0.9200000166893005,
|
572 |
+
"3": 0.9540000557899475
|
573 |
+
},
|
574 |
+
"llm_top_1_test_accuracy": {
|
575 |
+
"0": 0.573,
|
576 |
+
"1": 0.671,
|
577 |
+
"2": 0.672,
|
578 |
+
"3": 0.635
|
579 |
+
},
|
580 |
+
"llm_top_2_test_accuracy": {
|
581 |
+
"0": 0.802,
|
582 |
+
"1": 0.808,
|
583 |
+
"2": 0.701,
|
584 |
+
"3": 0.816
|
585 |
+
},
|
586 |
+
"llm_top_5_test_accuracy": {
|
587 |
+
"0": 0.81,
|
588 |
+
"1": 0.891,
|
589 |
+
"2": 0.752,
|
590 |
+
"3": 0.832
|
591 |
+
},
|
592 |
+
"sae_top_1_test_accuracy": {
|
593 |
+
"0": 0.67,
|
594 |
+
"1": 0.811,
|
595 |
+
"2": 0.741,
|
596 |
+
"3": 0.63
|
597 |
+
},
|
598 |
+
"sae_top_2_test_accuracy": {
|
599 |
+
"0": 0.745,
|
600 |
+
"1": 0.909,
|
601 |
+
"2": 0.734,
|
602 |
+
"3": 0.621
|
603 |
+
},
|
604 |
+
"sae_top_5_test_accuracy": {
|
605 |
+
"0": 0.847,
|
606 |
+
"1": 0.927,
|
607 |
+
"2": 0.813,
|
608 |
+
"3": 0.787
|
609 |
+
}
|
610 |
+
},
|
611 |
+
"Helsinki-NLP/europarl_results": {
|
612 |
+
"sae_test_accuracy": {
|
613 |
+
"en": 0.9980000257492065,
|
614 |
+
"fr": 0.999000072479248,
|
615 |
+
"de": 0.9980000257492065,
|
616 |
+
"es": 0.999000072479248,
|
617 |
+
"nl": 0.9980000257492065
|
618 |
+
},
|
619 |
+
"llm_test_accuracy": {
|
620 |
+
"en": 1.0,
|
621 |
+
"fr": 0.999000072479248,
|
622 |
+
"de": 1.0,
|
623 |
+
"es": 0.999000072479248,
|
624 |
+
"nl": 1.0
|
625 |
+
},
|
626 |
+
"llm_top_1_test_accuracy": {
|
627 |
+
"en": 0.739,
|
628 |
+
"fr": 0.585,
|
629 |
+
"de": 0.758,
|
630 |
+
"es": 0.496,
|
631 |
+
"nl": 0.649
|
632 |
+
},
|
633 |
+
"llm_top_2_test_accuracy": {
|
634 |
+
"en": 0.829,
|
635 |
+
"fr": 0.582,
|
636 |
+
"de": 0.82,
|
637 |
+
"es": 0.958,
|
638 |
+
"nl": 0.753
|
639 |
+
},
|
640 |
+
"llm_top_5_test_accuracy": {
|
641 |
+
"en": 0.892,
|
642 |
+
"fr": 0.888,
|
643 |
+
"de": 0.894,
|
644 |
+
"es": 0.98,
|
645 |
+
"nl": 0.852
|
646 |
+
},
|
647 |
+
"sae_top_1_test_accuracy": {
|
648 |
+
"en": 0.739,
|
649 |
+
"fr": 0.991,
|
650 |
+
"de": 0.98,
|
651 |
+
"es": 0.995,
|
652 |
+
"nl": 0.998
|
653 |
+
},
|
654 |
+
"sae_top_2_test_accuracy": {
|
655 |
+
"en": 0.747,
|
656 |
+
"fr": 0.99,
|
657 |
+
"de": 0.98,
|
658 |
+
"es": 0.995,
|
659 |
+
"nl": 0.998
|
660 |
+
},
|
661 |
+
"sae_top_5_test_accuracy": {
|
662 |
+
"en": 0.97,
|
663 |
+
"fr": 0.996,
|
664 |
+
"de": 0.988,
|
665 |
+
"es": 0.994,
|
666 |
+
"nl": 0.998
|
667 |
+
}
|
668 |
+
}
|
669 |
+
}
|
670 |
+
}
|
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "sparse_probing",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"LabHC/bias_in_bios_class_set2",
|
8 |
+
"LabHC/bias_in_bios_class_set3",
|
9 |
+
"canrager/amazon_reviews_mcauley_1and5",
|
10 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment",
|
11 |
+
"codeparrot/github-code",
|
12 |
+
"fancyzhx/ag_news",
|
13 |
+
"Helsinki-NLP/europarl"
|
14 |
+
],
|
15 |
+
"probe_train_set_size": 4000,
|
16 |
+
"probe_test_set_size": 1000,
|
17 |
+
"context_length": 128,
|
18 |
+
"sae_batch_size": 125,
|
19 |
+
"llm_batch_size": 32,
|
20 |
+
"llm_dtype": "bfloat16",
|
21 |
+
"model_name": "gemma-2-2b",
|
22 |
+
"k_values": [
|
23 |
+
1,
|
24 |
+
2,
|
25 |
+
5
|
26 |
+
],
|
27 |
+
"lower_vram_usage": false
|
28 |
+
},
|
29 |
+
"eval_id": "ead5eae1-ac4a-495a-aa9d-6980c16e8482",
|
30 |
+
"datetime_epoch_millis": 1740164809112,
|
31 |
+
"eval_result_metrics": {
|
32 |
+
"llm": {
|
33 |
+
"llm_test_accuracy": 0.9595375448465346,
|
34 |
+
"llm_top_1_test_accuracy": 0.64956875,
|
35 |
+
"llm_top_2_test_accuracy": 0.72589375,
|
36 |
+
"llm_top_5_test_accuracy": 0.78265625,
|
37 |
+
"llm_top_10_test_accuracy": null,
|
38 |
+
"llm_top_20_test_accuracy": null,
|
39 |
+
"llm_top_50_test_accuracy": null,
|
40 |
+
"llm_top_100_test_accuracy": null
|
41 |
+
},
|
42 |
+
"sae": {
|
43 |
+
"sae_test_accuracy": 0.9516125384718179,
|
44 |
+
"sae_top_1_test_accuracy": 0.7007125,
|
45 |
+
"sae_top_2_test_accuracy": 0.808825,
|
46 |
+
"sae_top_5_test_accuracy": 0.8754875000000001,
|
47 |
+
"sae_top_10_test_accuracy": null,
|
48 |
+
"sae_top_20_test_accuracy": null,
|
49 |
+
"sae_top_50_test_accuracy": null,
|
50 |
+
"sae_top_100_test_accuracy": null
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"eval_result_details": [
|
54 |
+
{
|
55 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
|
56 |
+
"llm_test_accuracy": 0.966800057888031,
|
57 |
+
"llm_top_1_test_accuracy": 0.6397999999999999,
|
58 |
+
"llm_top_2_test_accuracy": 0.6954,
|
59 |
+
"llm_top_5_test_accuracy": 0.7869999999999999,
|
60 |
+
"llm_top_10_test_accuracy": null,
|
61 |
+
"llm_top_20_test_accuracy": null,
|
62 |
+
"llm_top_50_test_accuracy": null,
|
63 |
+
"llm_top_100_test_accuracy": null,
|
64 |
+
"sae_test_accuracy": 0.9568000435829163,
|
65 |
+
"sae_top_1_test_accuracy": 0.6923999999999999,
|
66 |
+
"sae_top_2_test_accuracy": 0.8591999999999999,
|
67 |
+
"sae_top_5_test_accuracy": 0.9028,
|
68 |
+
"sae_top_10_test_accuracy": null,
|
69 |
+
"sae_top_20_test_accuracy": null,
|
70 |
+
"sae_top_50_test_accuracy": null,
|
71 |
+
"sae_top_100_test_accuracy": null
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
|
75 |
+
"llm_test_accuracy": 0.9578000426292419,
|
76 |
+
"llm_top_1_test_accuracy": 0.6694000000000001,
|
77 |
+
"llm_top_2_test_accuracy": 0.725,
|
78 |
+
"llm_top_5_test_accuracy": 0.7654,
|
79 |
+
"llm_top_10_test_accuracy": null,
|
80 |
+
"llm_top_20_test_accuracy": null,
|
81 |
+
"llm_top_50_test_accuracy": null,
|
82 |
+
"llm_top_100_test_accuracy": null,
|
83 |
+
"sae_test_accuracy": 0.9420000433921814,
|
84 |
+
"sae_top_1_test_accuracy": 0.671,
|
85 |
+
"sae_top_2_test_accuracy": 0.7924,
|
86 |
+
"sae_top_5_test_accuracy": 0.8718,
|
87 |
+
"sae_top_10_test_accuracy": null,
|
88 |
+
"sae_top_20_test_accuracy": null,
|
89 |
+
"sae_top_50_test_accuracy": null,
|
90 |
+
"sae_top_100_test_accuracy": null
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
|
94 |
+
"llm_test_accuracy": 0.9316000461578369,
|
95 |
+
"llm_top_1_test_accuracy": 0.687,
|
96 |
+
"llm_top_2_test_accuracy": 0.7492,
|
97 |
+
"llm_top_5_test_accuracy": 0.7704000000000001,
|
98 |
+
"llm_top_10_test_accuracy": null,
|
99 |
+
"llm_top_20_test_accuracy": null,
|
100 |
+
"llm_top_50_test_accuracy": null,
|
101 |
+
"llm_top_100_test_accuracy": null,
|
102 |
+
"sae_test_accuracy": 0.9232000350952149,
|
103 |
+
"sae_top_1_test_accuracy": 0.7499999999999999,
|
104 |
+
"sae_top_2_test_accuracy": 0.7924,
|
105 |
+
"sae_top_5_test_accuracy": 0.8560000000000001,
|
106 |
+
"sae_top_10_test_accuracy": null,
|
107 |
+
"sae_top_20_test_accuracy": null,
|
108 |
+
"sae_top_50_test_accuracy": null,
|
109 |
+
"sae_top_100_test_accuracy": null
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
|
113 |
+
"llm_test_accuracy": 0.9202000379562378,
|
114 |
+
"llm_top_1_test_accuracy": 0.599,
|
115 |
+
"llm_top_2_test_accuracy": 0.6474,
|
116 |
+
"llm_top_5_test_accuracy": 0.6734,
|
117 |
+
"llm_top_10_test_accuracy": null,
|
118 |
+
"llm_top_20_test_accuracy": null,
|
119 |
+
"llm_top_50_test_accuracy": null,
|
120 |
+
"llm_top_100_test_accuracy": null,
|
121 |
+
"sae_test_accuracy": 0.9094000339508057,
|
122 |
+
"sae_top_1_test_accuracy": 0.6928,
|
123 |
+
"sae_top_2_test_accuracy": 0.744,
|
124 |
+
"sae_top_5_test_accuracy": 0.8098000000000001,
|
125 |
+
"sae_top_10_test_accuracy": null,
|
126 |
+
"sae_top_20_test_accuracy": null,
|
127 |
+
"sae_top_50_test_accuracy": null,
|
128 |
+
"sae_top_100_test_accuracy": null
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
|
132 |
+
"llm_test_accuracy": 0.9795000553131104,
|
133 |
+
"llm_top_1_test_accuracy": 0.673,
|
134 |
+
"llm_top_2_test_accuracy": 0.724,
|
135 |
+
"llm_top_5_test_accuracy": 0.766,
|
136 |
+
"llm_top_10_test_accuracy": null,
|
137 |
+
"llm_top_20_test_accuracy": null,
|
138 |
+
"llm_top_50_test_accuracy": null,
|
139 |
+
"llm_top_100_test_accuracy": null,
|
140 |
+
"sae_test_accuracy": 0.9650000333786011,
|
141 |
+
"sae_top_1_test_accuracy": 0.745,
|
142 |
+
"sae_top_2_test_accuracy": 0.895,
|
143 |
+
"sae_top_5_test_accuracy": 0.936,
|
144 |
+
"sae_top_10_test_accuracy": null,
|
145 |
+
"sae_top_20_test_accuracy": null,
|
146 |
+
"sae_top_50_test_accuracy": null,
|
147 |
+
"sae_top_100_test_accuracy": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"dataset_name": "codeparrot/github-code_results",
|
151 |
+
"llm_test_accuracy": 0.9708000421524048,
|
152 |
+
"llm_top_1_test_accuracy": 0.6451999999999999,
|
153 |
+
"llm_top_2_test_accuracy": 0.6960000000000001,
|
154 |
+
"llm_top_5_test_accuracy": 0.7766,
|
155 |
+
"llm_top_10_test_accuracy": null,
|
156 |
+
"llm_top_20_test_accuracy": null,
|
157 |
+
"llm_top_50_test_accuracy": null,
|
158 |
+
"llm_top_100_test_accuracy": null,
|
159 |
+
"sae_test_accuracy": 0.9684000372886657,
|
160 |
+
"sae_top_1_test_accuracy": 0.5955999999999999,
|
161 |
+
"sae_top_2_test_accuracy": 0.7612,
|
162 |
+
"sae_top_5_test_accuracy": 0.7986000000000001,
|
163 |
+
"sae_top_10_test_accuracy": null,
|
164 |
+
"sae_top_20_test_accuracy": null,
|
165 |
+
"sae_top_50_test_accuracy": null,
|
166 |
+
"sae_top_100_test_accuracy": null
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"dataset_name": "fancyzhx/ag_news_results",
|
170 |
+
"llm_test_accuracy": 0.9500000476837158,
|
171 |
+
"llm_top_1_test_accuracy": 0.63775,
|
172 |
+
"llm_top_2_test_accuracy": 0.78175,
|
173 |
+
"llm_top_5_test_accuracy": 0.82125,
|
174 |
+
"llm_top_10_test_accuracy": null,
|
175 |
+
"llm_top_20_test_accuracy": null,
|
176 |
+
"llm_top_50_test_accuracy": null,
|
177 |
+
"llm_top_100_test_accuracy": null,
|
178 |
+
"sae_test_accuracy": 0.9495000392198563,
|
179 |
+
"sae_top_1_test_accuracy": 0.6034999999999999,
|
180 |
+
"sae_top_2_test_accuracy": 0.6970000000000001,
|
181 |
+
"sae_top_5_test_accuracy": 0.8434999999999999,
|
182 |
+
"sae_top_10_test_accuracy": null,
|
183 |
+
"sae_top_20_test_accuracy": null,
|
184 |
+
"sae_top_50_test_accuracy": null,
|
185 |
+
"sae_top_100_test_accuracy": null
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"dataset_name": "Helsinki-NLP/europarl_results",
|
189 |
+
"llm_test_accuracy": 0.9996000289916992,
|
190 |
+
"llm_top_1_test_accuracy": 0.6454,
|
191 |
+
"llm_top_2_test_accuracy": 0.7884,
|
192 |
+
"llm_top_5_test_accuracy": 0.9012,
|
193 |
+
"llm_top_10_test_accuracy": null,
|
194 |
+
"llm_top_20_test_accuracy": null,
|
195 |
+
"llm_top_50_test_accuracy": null,
|
196 |
+
"llm_top_100_test_accuracy": null,
|
197 |
+
"sae_test_accuracy": 0.9986000418663025,
|
198 |
+
"sae_top_1_test_accuracy": 0.8554,
|
199 |
+
"sae_top_2_test_accuracy": 0.9294,
|
200 |
+
"sae_top_5_test_accuracy": 0.9853999999999999,
|
201 |
+
"sae_top_10_test_accuracy": null,
|
202 |
+
"sae_top_20_test_accuracy": null,
|
203 |
+
"sae_top_50_test_accuracy": null,
|
204 |
+
"sae_top_100_test_accuracy": null
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
208 |
+
"sae_lens_id": "custom_sae",
|
209 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0",
|
210 |
+
"sae_lens_version": "5.4.2",
|
211 |
+
"sae_cfg_dict": {
|
212 |
+
"model_name": "gemma-2-2b",
|
213 |
+
"d_in": 2304,
|
214 |
+
"d_sae": 65536,
|
215 |
+
"hook_layer": 12,
|
216 |
+
"hook_name": "blocks.12.hook_resid_post",
|
217 |
+
"context_size": null,
|
218 |
+
"hook_head_index": null,
|
219 |
+
"architecture": "topk",
|
220 |
+
"apply_b_dec_to_input": null,
|
221 |
+
"finetuning_scaling_factor": null,
|
222 |
+
"activation_fn_str": "",
|
223 |
+
"prepend_bos": true,
|
224 |
+
"normalize_activations": "none",
|
225 |
+
"dtype": "bfloat16",
|
226 |
+
"device": "",
|
227 |
+
"dataset_path": "",
|
228 |
+
"dataset_trust_remote_code": true,
|
229 |
+
"seqpos_slice": [
|
230 |
+
null
|
231 |
+
],
|
232 |
+
"training_tokens": -100000,
|
233 |
+
"sae_lens_training_version": null,
|
234 |
+
"neuronpedia_id": null
|
235 |
+
},
|
236 |
+
"eval_result_unstructured": {
|
237 |
+
"LabHC/bias_in_bios_class_set1_results": {
|
238 |
+
"sae_test_accuracy": {
|
239 |
+
"0": 0.9350000619888306,
|
240 |
+
"1": 0.9600000381469727,
|
241 |
+
"2": 0.937000036239624,
|
242 |
+
"6": 0.987000048160553,
|
243 |
+
"9": 0.9650000333786011
|
244 |
+
},
|
245 |
+
"llm_test_accuracy": {
|
246 |
+
"0": 0.9510000348091125,
|
247 |
+
"1": 0.9670000672340393,
|
248 |
+
"2": 0.9530000686645508,
|
249 |
+
"6": 0.987000048160553,
|
250 |
+
"9": 0.9760000705718994
|
251 |
+
},
|
252 |
+
"llm_top_1_test_accuracy": {
|
253 |
+
"0": 0.577,
|
254 |
+
"1": 0.613,
|
255 |
+
"2": 0.662,
|
256 |
+
"6": 0.787,
|
257 |
+
"9": 0.56
|
258 |
+
},
|
259 |
+
"llm_top_2_test_accuracy": {
|
260 |
+
"0": 0.574,
|
261 |
+
"1": 0.66,
|
262 |
+
"2": 0.718,
|
263 |
+
"6": 0.811,
|
264 |
+
"9": 0.714
|
265 |
+
},
|
266 |
+
"llm_top_5_test_accuracy": {
|
267 |
+
"0": 0.713,
|
268 |
+
"1": 0.711,
|
269 |
+
"2": 0.755,
|
270 |
+
"6": 0.895,
|
271 |
+
"9": 0.861
|
272 |
+
},
|
273 |
+
"sae_top_1_test_accuracy": {
|
274 |
+
"0": 0.574,
|
275 |
+
"1": 0.632,
|
276 |
+
"2": 0.891,
|
277 |
+
"6": 0.803,
|
278 |
+
"9": 0.562
|
279 |
+
},
|
280 |
+
"sae_top_2_test_accuracy": {
|
281 |
+
"0": 0.874,
|
282 |
+
"1": 0.741,
|
283 |
+
"2": 0.898,
|
284 |
+
"6": 0.94,
|
285 |
+
"9": 0.843
|
286 |
+
},
|
287 |
+
"sae_top_5_test_accuracy": {
|
288 |
+
"0": 0.879,
|
289 |
+
"1": 0.872,
|
290 |
+
"2": 0.895,
|
291 |
+
"6": 0.967,
|
292 |
+
"9": 0.901
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"LabHC/bias_in_bios_class_set2_results": {
|
296 |
+
"sae_test_accuracy": {
|
297 |
+
"11": 0.9510000348091125,
|
298 |
+
"13": 0.9440000653266907,
|
299 |
+
"14": 0.9460000395774841,
|
300 |
+
"18": 0.9120000600814819,
|
301 |
+
"19": 0.9570000171661377
|
302 |
+
},
|
303 |
+
"llm_test_accuracy": {
|
304 |
+
"11": 0.9690000414848328,
|
305 |
+
"13": 0.9600000381469727,
|
306 |
+
"14": 0.9600000381469727,
|
307 |
+
"18": 0.9390000700950623,
|
308 |
+
"19": 0.9610000252723694
|
309 |
+
},
|
310 |
+
"llm_top_1_test_accuracy": {
|
311 |
+
"11": 0.555,
|
312 |
+
"13": 0.668,
|
313 |
+
"14": 0.638,
|
314 |
+
"18": 0.69,
|
315 |
+
"19": 0.796
|
316 |
+
},
|
317 |
+
"llm_top_2_test_accuracy": {
|
318 |
+
"11": 0.756,
|
319 |
+
"13": 0.714,
|
320 |
+
"14": 0.67,
|
321 |
+
"18": 0.717,
|
322 |
+
"19": 0.768
|
323 |
+
},
|
324 |
+
"llm_top_5_test_accuracy": {
|
325 |
+
"11": 0.794,
|
326 |
+
"13": 0.749,
|
327 |
+
"14": 0.723,
|
328 |
+
"18": 0.73,
|
329 |
+
"19": 0.831
|
330 |
+
},
|
331 |
+
"sae_top_1_test_accuracy": {
|
332 |
+
"11": 0.55,
|
333 |
+
"13": 0.667,
|
334 |
+
"14": 0.652,
|
335 |
+
"18": 0.697,
|
336 |
+
"19": 0.789
|
337 |
+
},
|
338 |
+
"sae_top_2_test_accuracy": {
|
339 |
+
"11": 0.853,
|
340 |
+
"13": 0.705,
|
341 |
+
"14": 0.88,
|
342 |
+
"18": 0.732,
|
343 |
+
"19": 0.792
|
344 |
+
},
|
345 |
+
"sae_top_5_test_accuracy": {
|
346 |
+
"11": 0.916,
|
347 |
+
"13": 0.749,
|
348 |
+
"14": 0.883,
|
349 |
+
"18": 0.917,
|
350 |
+
"19": 0.894
|
351 |
+
}
|
352 |
+
},
|
353 |
+
"LabHC/bias_in_bios_class_set3_results": {
|
354 |
+
"sae_test_accuracy": {
|
355 |
+
"20": 0.9530000686645508,
|
356 |
+
"21": 0.9240000247955322,
|
357 |
+
"22": 0.9010000228881836,
|
358 |
+
"25": 0.9520000219345093,
|
359 |
+
"26": 0.8860000371932983
|
360 |
+
},
|
361 |
+
"llm_test_accuracy": {
|
362 |
+
"20": 0.956000030040741,
|
363 |
+
"21": 0.9350000619888306,
|
364 |
+
"22": 0.9180000424385071,
|
365 |
+
"25": 0.9640000462532043,
|
366 |
+
"26": 0.8850000500679016
|
367 |
+
},
|
368 |
+
"llm_top_1_test_accuracy": {
|
369 |
+
"20": 0.693,
|
370 |
+
"21": 0.775,
|
371 |
+
"22": 0.645,
|
372 |
+
"25": 0.706,
|
373 |
+
"26": 0.616
|
374 |
+
},
|
375 |
+
"llm_top_2_test_accuracy": {
|
376 |
+
"20": 0.827,
|
377 |
+
"21": 0.761,
|
378 |
+
"22": 0.694,
|
379 |
+
"25": 0.778,
|
380 |
+
"26": 0.686
|
381 |
+
},
|
382 |
+
"llm_top_5_test_accuracy": {
|
383 |
+
"20": 0.855,
|
384 |
+
"21": 0.791,
|
385 |
+
"22": 0.725,
|
386 |
+
"25": 0.809,
|
387 |
+
"26": 0.672
|
388 |
+
},
|
389 |
+
"sae_top_1_test_accuracy": {
|
390 |
+
"20": 0.852,
|
391 |
+
"21": 0.736,
|
392 |
+
"22": 0.829,
|
393 |
+
"25": 0.699,
|
394 |
+
"26": 0.634
|
395 |
+
},
|
396 |
+
"sae_top_2_test_accuracy": {
|
397 |
+
"20": 0.86,
|
398 |
+
"21": 0.78,
|
399 |
+
"22": 0.852,
|
400 |
+
"25": 0.837,
|
401 |
+
"26": 0.633
|
402 |
+
},
|
403 |
+
"sae_top_5_test_accuracy": {
|
404 |
+
"20": 0.93,
|
405 |
+
"21": 0.833,
|
406 |
+
"22": 0.851,
|
407 |
+
"25": 0.875,
|
408 |
+
"26": 0.791
|
409 |
+
}
|
410 |
+
},
|
411 |
+
"canrager/amazon_reviews_mcauley_1and5_results": {
|
412 |
+
"sae_test_accuracy": {
|
413 |
+
"1": 0.9360000491142273,
|
414 |
+
"2": 0.9330000281333923,
|
415 |
+
"3": 0.9150000214576721,
|
416 |
+
"5": 0.8990000486373901,
|
417 |
+
"6": 0.8640000224113464
|
418 |
+
},
|
419 |
+
"llm_test_accuracy": {
|
420 |
+
"1": 0.9580000638961792,
|
421 |
+
"2": 0.9330000281333923,
|
422 |
+
"3": 0.9280000329017639,
|
423 |
+
"5": 0.9200000166893005,
|
424 |
+
"6": 0.862000048160553
|
425 |
+
},
|
426 |
+
"llm_top_1_test_accuracy": {
|
427 |
+
"1": 0.647,
|
428 |
+
"2": 0.603,
|
429 |
+
"3": 0.598,
|
430 |
+
"5": 0.555,
|
431 |
+
"6": 0.592
|
432 |
+
},
|
433 |
+
"llm_top_2_test_accuracy": {
|
434 |
+
"1": 0.75,
|
435 |
+
"2": 0.648,
|
436 |
+
"3": 0.607,
|
437 |
+
"5": 0.606,
|
438 |
+
"6": 0.626
|
439 |
+
},
|
440 |
+
"llm_top_5_test_accuracy": {
|
441 |
+
"1": 0.767,
|
442 |
+
"2": 0.641,
|
443 |
+
"3": 0.645,
|
444 |
+
"5": 0.638,
|
445 |
+
"6": 0.676
|
446 |
+
},
|
447 |
+
"sae_top_1_test_accuracy": {
|
448 |
+
"1": 0.882,
|
449 |
+
"2": 0.869,
|
450 |
+
"3": 0.551,
|
451 |
+
"5": 0.537,
|
452 |
+
"6": 0.625
|
453 |
+
},
|
454 |
+
"sae_top_2_test_accuracy": {
|
455 |
+
"1": 0.885,
|
456 |
+
"2": 0.866,
|
457 |
+
"3": 0.657,
|
458 |
+
"5": 0.606,
|
459 |
+
"6": 0.706
|
460 |
+
},
|
461 |
+
"sae_top_5_test_accuracy": {
|
462 |
+
"1": 0.908,
|
463 |
+
"2": 0.887,
|
464 |
+
"3": 0.656,
|
465 |
+
"5": 0.869,
|
466 |
+
"6": 0.729
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
|
470 |
+
"sae_test_accuracy": {
|
471 |
+
"1.0": 0.9640000462532043,
|
472 |
+
"5.0": 0.9660000205039978
|
473 |
+
},
|
474 |
+
"llm_test_accuracy": {
|
475 |
+
"1.0": 0.9780000448226929,
|
476 |
+
"5.0": 0.9810000658035278
|
477 |
+
},
|
478 |
+
"llm_top_1_test_accuracy": {
|
479 |
+
"1.0": 0.673,
|
480 |
+
"5.0": 0.673
|
481 |
+
},
|
482 |
+
"llm_top_2_test_accuracy": {
|
483 |
+
"1.0": 0.724,
|
484 |
+
"5.0": 0.724
|
485 |
+
},
|
486 |
+
"llm_top_5_test_accuracy": {
|
487 |
+
"1.0": 0.766,
|
488 |
+
"5.0": 0.766
|
489 |
+
},
|
490 |
+
"sae_top_1_test_accuracy": {
|
491 |
+
"1.0": 0.745,
|
492 |
+
"5.0": 0.745
|
493 |
+
},
|
494 |
+
"sae_top_2_test_accuracy": {
|
495 |
+
"1.0": 0.895,
|
496 |
+
"5.0": 0.895
|
497 |
+
},
|
498 |
+
"sae_top_5_test_accuracy": {
|
499 |
+
"1.0": 0.936,
|
500 |
+
"5.0": 0.936
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"codeparrot/github-code_results": {
|
504 |
+
"sae_test_accuracy": {
|
505 |
+
"C": 0.9570000171661377,
|
506 |
+
"Python": 0.9830000400543213,
|
507 |
+
"HTML": 0.9810000658035278,
|
508 |
+
"Java": 0.9640000462532043,
|
509 |
+
"PHP": 0.9570000171661377
|
510 |
+
},
|
511 |
+
"llm_test_accuracy": {
|
512 |
+
"C": 0.956000030040741,
|
513 |
+
"Python": 0.984000027179718,
|
514 |
+
"HTML": 0.9900000691413879,
|
515 |
+
"Java": 0.9670000672340393,
|
516 |
+
"PHP": 0.9570000171661377
|
517 |
+
},
|
518 |
+
"llm_top_1_test_accuracy": {
|
519 |
+
"C": 0.666,
|
520 |
+
"Python": 0.626,
|
521 |
+
"HTML": 0.721,
|
522 |
+
"Java": 0.619,
|
523 |
+
"PHP": 0.594
|
524 |
+
},
|
525 |
+
"llm_top_2_test_accuracy": {
|
526 |
+
"C": 0.679,
|
527 |
+
"Python": 0.674,
|
528 |
+
"HTML": 0.8,
|
529 |
+
"Java": 0.676,
|
530 |
+
"PHP": 0.651
|
531 |
+
},
|
532 |
+
"llm_top_5_test_accuracy": {
|
533 |
+
"C": 0.783,
|
534 |
+
"Python": 0.717,
|
535 |
+
"HTML": 0.935,
|
536 |
+
"Java": 0.733,
|
537 |
+
"PHP": 0.715
|
538 |
+
},
|
539 |
+
"sae_top_1_test_accuracy": {
|
540 |
+
"C": 0.523,
|
541 |
+
"Python": 0.641,
|
542 |
+
"HTML": 0.596,
|
543 |
+
"Java": 0.623,
|
544 |
+
"PHP": 0.595
|
545 |
+
},
|
546 |
+
"sae_top_2_test_accuracy": {
|
547 |
+
"C": 0.61,
|
548 |
+
"Python": 0.881,
|
549 |
+
"HTML": 0.74,
|
550 |
+
"Java": 0.66,
|
551 |
+
"PHP": 0.915
|
552 |
+
},
|
553 |
+
"sae_top_5_test_accuracy": {
|
554 |
+
"C": 0.643,
|
555 |
+
"Python": 0.883,
|
556 |
+
"HTML": 0.897,
|
557 |
+
"Java": 0.657,
|
558 |
+
"PHP": 0.913
|
559 |
+
}
|
560 |
+
},
|
561 |
+
"fancyzhx/ag_news_results": {
|
562 |
+
"sae_test_accuracy": {
|
563 |
+
"0": 0.9320000410079956,
|
564 |
+
"1": 0.9830000400543213,
|
565 |
+
"2": 0.9290000200271606,
|
566 |
+
"3": 0.9540000557899475
|
567 |
+
},
|
568 |
+
"llm_test_accuracy": {
|
569 |
+
"0": 0.940000057220459,
|
570 |
+
"1": 0.9860000610351562,
|
571 |
+
"2": 0.9200000166893005,
|
572 |
+
"3": 0.9540000557899475
|
573 |
+
},
|
574 |
+
"llm_top_1_test_accuracy": {
|
575 |
+
"0": 0.573,
|
576 |
+
"1": 0.671,
|
577 |
+
"2": 0.672,
|
578 |
+
"3": 0.635
|
579 |
+
},
|
580 |
+
"llm_top_2_test_accuracy": {
|
581 |
+
"0": 0.802,
|
582 |
+
"1": 0.808,
|
583 |
+
"2": 0.701,
|
584 |
+
"3": 0.816
|
585 |
+
},
|
586 |
+
"llm_top_5_test_accuracy": {
|
587 |
+
"0": 0.81,
|
588 |
+
"1": 0.891,
|
589 |
+
"2": 0.752,
|
590 |
+
"3": 0.832
|
591 |
+
},
|
592 |
+
"sae_top_1_test_accuracy": {
|
593 |
+
"0": 0.583,
|
594 |
+
"1": 0.652,
|
595 |
+
"2": 0.554,
|
596 |
+
"3": 0.625
|
597 |
+
},
|
598 |
+
"sae_top_2_test_accuracy": {
|
599 |
+
"0": 0.667,
|
600 |
+
"1": 0.689,
|
601 |
+
"2": 0.748,
|
602 |
+
"3": 0.684
|
603 |
+
},
|
604 |
+
"sae_top_5_test_accuracy": {
|
605 |
+
"0": 0.81,
|
606 |
+
"1": 0.915,
|
607 |
+
"2": 0.841,
|
608 |
+
"3": 0.808
|
609 |
+
}
|
610 |
+
},
|
611 |
+
"Helsinki-NLP/europarl_results": {
|
612 |
+
"sae_test_accuracy": {
|
613 |
+
"en": 0.9980000257492065,
|
614 |
+
"fr": 0.999000072479248,
|
615 |
+
"de": 1.0,
|
616 |
+
"es": 0.9970000386238098,
|
617 |
+
"nl": 0.999000072479248
|
618 |
+
},
|
619 |
+
"llm_test_accuracy": {
|
620 |
+
"en": 1.0,
|
621 |
+
"fr": 0.999000072479248,
|
622 |
+
"de": 1.0,
|
623 |
+
"es": 0.999000072479248,
|
624 |
+
"nl": 1.0
|
625 |
+
},
|
626 |
+
"llm_top_1_test_accuracy": {
|
627 |
+
"en": 0.739,
|
628 |
+
"fr": 0.585,
|
629 |
+
"de": 0.758,
|
630 |
+
"es": 0.496,
|
631 |
+
"nl": 0.649
|
632 |
+
},
|
633 |
+
"llm_top_2_test_accuracy": {
|
634 |
+
"en": 0.829,
|
635 |
+
"fr": 0.582,
|
636 |
+
"de": 0.82,
|
637 |
+
"es": 0.958,
|
638 |
+
"nl": 0.753
|
639 |
+
},
|
640 |
+
"llm_top_5_test_accuracy": {
|
641 |
+
"en": 0.892,
|
642 |
+
"fr": 0.888,
|
643 |
+
"de": 0.894,
|
644 |
+
"es": 0.98,
|
645 |
+
"nl": 0.852
|
646 |
+
},
|
647 |
+
"sae_top_1_test_accuracy": {
|
648 |
+
"en": 0.737,
|
649 |
+
"fr": 0.985,
|
650 |
+
"de": 0.93,
|
651 |
+
"es": 0.986,
|
652 |
+
"nl": 0.639
|
653 |
+
},
|
654 |
+
"sae_top_2_test_accuracy": {
|
655 |
+
"en": 0.972,
|
656 |
+
"fr": 0.991,
|
657 |
+
"de": 0.937,
|
658 |
+
"es": 0.989,
|
659 |
+
"nl": 0.758
|
660 |
+
},
|
661 |
+
"sae_top_5_test_accuracy": {
|
662 |
+
"en": 0.998,
|
663 |
+
"fr": 0.996,
|
664 |
+
"de": 0.944,
|
665 |
+
"es": 0.993,
|
666 |
+
"nl": 0.996
|
667 |
+
}
|
668 |
+
}
|
669 |
+
}
|
670 |
+
}
|
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "sparse_probing",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"LabHC/bias_in_bios_class_set2",
|
8 |
+
"LabHC/bias_in_bios_class_set3",
|
9 |
+
"canrager/amazon_reviews_mcauley_1and5",
|
10 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment",
|
11 |
+
"codeparrot/github-code",
|
12 |
+
"fancyzhx/ag_news",
|
13 |
+
"Helsinki-NLP/europarl"
|
14 |
+
],
|
15 |
+
"probe_train_set_size": 4000,
|
16 |
+
"probe_test_set_size": 1000,
|
17 |
+
"context_length": 128,
|
18 |
+
"sae_batch_size": 125,
|
19 |
+
"llm_batch_size": 32,
|
20 |
+
"llm_dtype": "bfloat16",
|
21 |
+
"model_name": "gemma-2-2b",
|
22 |
+
"k_values": [
|
23 |
+
1,
|
24 |
+
2,
|
25 |
+
5
|
26 |
+
],
|
27 |
+
"lower_vram_usage": false
|
28 |
+
},
|
29 |
+
"eval_id": "01df6242-51fa-47d4-af93-9c80a172184d",
|
30 |
+
"datetime_epoch_millis": 1740165335112,
|
31 |
+
"eval_result_metrics": {
|
32 |
+
"llm": {
|
33 |
+
"llm_test_accuracy": 0.9595375448465346,
|
34 |
+
"llm_top_1_test_accuracy": 0.64956875,
|
35 |
+
"llm_top_2_test_accuracy": 0.72589375,
|
36 |
+
"llm_top_5_test_accuracy": 0.78265625,
|
37 |
+
"llm_top_10_test_accuracy": null,
|
38 |
+
"llm_top_20_test_accuracy": null,
|
39 |
+
"llm_top_50_test_accuracy": null,
|
40 |
+
"llm_top_100_test_accuracy": null
|
41 |
+
},
|
42 |
+
"sae": {
|
43 |
+
"sae_test_accuracy": 0.9563000392168761,
|
44 |
+
"sae_top_1_test_accuracy": 0.7563249999999999,
|
45 |
+
"sae_top_2_test_accuracy": 0.80766875,
|
46 |
+
"sae_top_5_test_accuracy": 0.861175,
|
47 |
+
"sae_top_10_test_accuracy": null,
|
48 |
+
"sae_top_20_test_accuracy": null,
|
49 |
+
"sae_top_50_test_accuracy": null,
|
50 |
+
"sae_top_100_test_accuracy": null
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"eval_result_details": [
|
54 |
+
{
|
55 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
|
56 |
+
"llm_test_accuracy": 0.966800057888031,
|
57 |
+
"llm_top_1_test_accuracy": 0.6397999999999999,
|
58 |
+
"llm_top_2_test_accuracy": 0.6954,
|
59 |
+
"llm_top_5_test_accuracy": 0.7869999999999999,
|
60 |
+
"llm_top_10_test_accuracy": null,
|
61 |
+
"llm_top_20_test_accuracy": null,
|
62 |
+
"llm_top_50_test_accuracy": null,
|
63 |
+
"llm_top_100_test_accuracy": null,
|
64 |
+
"sae_test_accuracy": 0.9606000423431397,
|
65 |
+
"sae_top_1_test_accuracy": 0.735,
|
66 |
+
"sae_top_2_test_accuracy": 0.8328,
|
67 |
+
"sae_top_5_test_accuracy": 0.8700000000000001,
|
68 |
+
"sae_top_10_test_accuracy": null,
|
69 |
+
"sae_top_20_test_accuracy": null,
|
70 |
+
"sae_top_50_test_accuracy": null,
|
71 |
+
"sae_top_100_test_accuracy": null
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
|
75 |
+
"llm_test_accuracy": 0.9578000426292419,
|
76 |
+
"llm_top_1_test_accuracy": 0.6694000000000001,
|
77 |
+
"llm_top_2_test_accuracy": 0.725,
|
78 |
+
"llm_top_5_test_accuracy": 0.7654,
|
79 |
+
"llm_top_10_test_accuracy": null,
|
80 |
+
"llm_top_20_test_accuracy": null,
|
81 |
+
"llm_top_50_test_accuracy": null,
|
82 |
+
"llm_top_100_test_accuracy": null,
|
83 |
+
"sae_test_accuracy": 0.9454000473022461,
|
84 |
+
"sae_top_1_test_accuracy": 0.7318,
|
85 |
+
"sae_top_2_test_accuracy": 0.7548,
|
86 |
+
"sae_top_5_test_accuracy": 0.8513999999999999,
|
87 |
+
"sae_top_10_test_accuracy": null,
|
88 |
+
"sae_top_20_test_accuracy": null,
|
89 |
+
"sae_top_50_test_accuracy": null,
|
90 |
+
"sae_top_100_test_accuracy": null
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
|
94 |
+
"llm_test_accuracy": 0.9316000461578369,
|
95 |
+
"llm_top_1_test_accuracy": 0.687,
|
96 |
+
"llm_top_2_test_accuracy": 0.7492,
|
97 |
+
"llm_top_5_test_accuracy": 0.7704000000000001,
|
98 |
+
"llm_top_10_test_accuracy": null,
|
99 |
+
"llm_top_20_test_accuracy": null,
|
100 |
+
"llm_top_50_test_accuracy": null,
|
101 |
+
"llm_top_100_test_accuracy": null,
|
102 |
+
"sae_test_accuracy": 0.931600034236908,
|
103 |
+
"sae_top_1_test_accuracy": 0.7748000000000002,
|
104 |
+
"sae_top_2_test_accuracy": 0.8088000000000001,
|
105 |
+
"sae_top_5_test_accuracy": 0.8513999999999999,
|
106 |
+
"sae_top_10_test_accuracy": null,
|
107 |
+
"sae_top_20_test_accuracy": null,
|
108 |
+
"sae_top_50_test_accuracy": null,
|
109 |
+
"sae_top_100_test_accuracy": null
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
|
113 |
+
"llm_test_accuracy": 0.9202000379562378,
|
114 |
+
"llm_top_1_test_accuracy": 0.599,
|
115 |
+
"llm_top_2_test_accuracy": 0.6474,
|
116 |
+
"llm_top_5_test_accuracy": 0.6734,
|
117 |
+
"llm_top_10_test_accuracy": null,
|
118 |
+
"llm_top_20_test_accuracy": null,
|
119 |
+
"llm_top_50_test_accuracy": null,
|
120 |
+
"llm_top_100_test_accuracy": null,
|
121 |
+
"sae_test_accuracy": 0.9192000389099121,
|
122 |
+
"sae_top_1_test_accuracy": 0.683,
|
123 |
+
"sae_top_2_test_accuracy": 0.6950000000000001,
|
124 |
+
"sae_top_5_test_accuracy": 0.7598,
|
125 |
+
"sae_top_10_test_accuracy": null,
|
126 |
+
"sae_top_20_test_accuracy": null,
|
127 |
+
"sae_top_50_test_accuracy": null,
|
128 |
+
"sae_top_100_test_accuracy": null
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
|
132 |
+
"llm_test_accuracy": 0.9795000553131104,
|
133 |
+
"llm_top_1_test_accuracy": 0.673,
|
134 |
+
"llm_top_2_test_accuracy": 0.724,
|
135 |
+
"llm_top_5_test_accuracy": 0.766,
|
136 |
+
"llm_top_10_test_accuracy": null,
|
137 |
+
"llm_top_20_test_accuracy": null,
|
138 |
+
"llm_top_50_test_accuracy": null,
|
139 |
+
"llm_top_100_test_accuracy": null,
|
140 |
+
"sae_test_accuracy": 0.9755000472068787,
|
141 |
+
"sae_top_1_test_accuracy": 0.94,
|
142 |
+
"sae_top_2_test_accuracy": 0.941,
|
143 |
+
"sae_top_5_test_accuracy": 0.95,
|
144 |
+
"sae_top_10_test_accuracy": null,
|
145 |
+
"sae_top_20_test_accuracy": null,
|
146 |
+
"sae_top_50_test_accuracy": null,
|
147 |
+
"sae_top_100_test_accuracy": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"dataset_name": "codeparrot/github-code_results",
|
151 |
+
"llm_test_accuracy": 0.9708000421524048,
|
152 |
+
"llm_top_1_test_accuracy": 0.6451999999999999,
|
153 |
+
"llm_top_2_test_accuracy": 0.6960000000000001,
|
154 |
+
"llm_top_5_test_accuracy": 0.7766,
|
155 |
+
"llm_top_10_test_accuracy": null,
|
156 |
+
"llm_top_20_test_accuracy": null,
|
157 |
+
"llm_top_50_test_accuracy": null,
|
158 |
+
"llm_top_100_test_accuracy": null,
|
159 |
+
"sae_test_accuracy": 0.9714000463485718,
|
160 |
+
"sae_top_1_test_accuracy": 0.6077999999999999,
|
161 |
+
"sae_top_2_test_accuracy": 0.7322,
|
162 |
+
"sae_top_5_test_accuracy": 0.7504,
|
163 |
+
"sae_top_10_test_accuracy": null,
|
164 |
+
"sae_top_20_test_accuracy": null,
|
165 |
+
"sae_top_50_test_accuracy": null,
|
166 |
+
"sae_top_100_test_accuracy": null
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"dataset_name": "fancyzhx/ag_news_results",
|
170 |
+
"llm_test_accuracy": 0.9500000476837158,
|
171 |
+
"llm_top_1_test_accuracy": 0.63775,
|
172 |
+
"llm_top_2_test_accuracy": 0.78175,
|
173 |
+
"llm_top_5_test_accuracy": 0.82125,
|
174 |
+
"llm_top_10_test_accuracy": null,
|
175 |
+
"llm_top_20_test_accuracy": null,
|
176 |
+
"llm_top_50_test_accuracy": null,
|
177 |
+
"llm_top_100_test_accuracy": null,
|
178 |
+
"sae_test_accuracy": 0.9485000222921371,
|
179 |
+
"sae_top_1_test_accuracy": 0.6849999999999999,
|
180 |
+
"sae_top_2_test_accuracy": 0.74675,
|
181 |
+
"sae_top_5_test_accuracy": 0.859,
|
182 |
+
"sae_top_10_test_accuracy": null,
|
183 |
+
"sae_top_20_test_accuracy": null,
|
184 |
+
"sae_top_50_test_accuracy": null,
|
185 |
+
"sae_top_100_test_accuracy": null
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"dataset_name": "Helsinki-NLP/europarl_results",
|
189 |
+
"llm_test_accuracy": 0.9996000289916992,
|
190 |
+
"llm_top_1_test_accuracy": 0.6454,
|
191 |
+
"llm_top_2_test_accuracy": 0.7884,
|
192 |
+
"llm_top_5_test_accuracy": 0.9012,
|
193 |
+
"llm_top_10_test_accuracy": null,
|
194 |
+
"llm_top_20_test_accuracy": null,
|
195 |
+
"llm_top_50_test_accuracy": null,
|
196 |
+
"llm_top_100_test_accuracy": null,
|
197 |
+
"sae_test_accuracy": 0.9982000350952148,
|
198 |
+
"sae_top_1_test_accuracy": 0.8932,
|
199 |
+
"sae_top_2_test_accuracy": 0.95,
|
200 |
+
"sae_top_5_test_accuracy": 0.9974000000000001,
|
201 |
+
"sae_top_10_test_accuracy": null,
|
202 |
+
"sae_top_20_test_accuracy": null,
|
203 |
+
"sae_top_50_test_accuracy": null,
|
204 |
+
"sae_top_100_test_accuracy": null
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
208 |
+
"sae_lens_id": "custom_sae",
|
209 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0",
|
210 |
+
"sae_lens_version": "5.4.2",
|
211 |
+
"sae_cfg_dict": {
|
212 |
+
"model_name": "gemma-2-2b",
|
213 |
+
"d_in": 2304,
|
214 |
+
"d_sae": 65536,
|
215 |
+
"hook_layer": 12,
|
216 |
+
"hook_name": "blocks.12.hook_resid_post",
|
217 |
+
"context_size": null,
|
218 |
+
"hook_head_index": null,
|
219 |
+
"architecture": "topk",
|
220 |
+
"apply_b_dec_to_input": null,
|
221 |
+
"finetuning_scaling_factor": null,
|
222 |
+
"activation_fn_str": "",
|
223 |
+
"prepend_bos": true,
|
224 |
+
"normalize_activations": "none",
|
225 |
+
"dtype": "bfloat16",
|
226 |
+
"device": "",
|
227 |
+
"dataset_path": "",
|
228 |
+
"dataset_trust_remote_code": true,
|
229 |
+
"seqpos_slice": [
|
230 |
+
null
|
231 |
+
],
|
232 |
+
"training_tokens": -100000,
|
233 |
+
"sae_lens_training_version": null,
|
234 |
+
"neuronpedia_id": null
|
235 |
+
},
|
236 |
+
"eval_result_unstructured": {
|
237 |
+
"LabHC/bias_in_bios_class_set1_results": {
|
238 |
+
"sae_test_accuracy": {
|
239 |
+
"0": 0.9460000395774841,
|
240 |
+
"1": 0.9550000429153442,
|
241 |
+
"2": 0.9440000653266907,
|
242 |
+
"6": 0.9830000400543213,
|
243 |
+
"9": 0.9750000238418579
|
244 |
+
},
|
245 |
+
"llm_test_accuracy": {
|
246 |
+
"0": 0.9510000348091125,
|
247 |
+
"1": 0.9670000672340393,
|
248 |
+
"2": 0.9530000686645508,
|
249 |
+
"6": 0.987000048160553,
|
250 |
+
"9": 0.9760000705718994
|
251 |
+
},
|
252 |
+
"llm_top_1_test_accuracy": {
|
253 |
+
"0": 0.577,
|
254 |
+
"1": 0.613,
|
255 |
+
"2": 0.662,
|
256 |
+
"6": 0.787,
|
257 |
+
"9": 0.56
|
258 |
+
},
|
259 |
+
"llm_top_2_test_accuracy": {
|
260 |
+
"0": 0.574,
|
261 |
+
"1": 0.66,
|
262 |
+
"2": 0.718,
|
263 |
+
"6": 0.811,
|
264 |
+
"9": 0.714
|
265 |
+
},
|
266 |
+
"llm_top_5_test_accuracy": {
|
267 |
+
"0": 0.713,
|
268 |
+
"1": 0.711,
|
269 |
+
"2": 0.755,
|
270 |
+
"6": 0.895,
|
271 |
+
"9": 0.861
|
272 |
+
},
|
273 |
+
"sae_top_1_test_accuracy": {
|
274 |
+
"0": 0.876,
|
275 |
+
"1": 0.586,
|
276 |
+
"2": 0.852,
|
277 |
+
"6": 0.751,
|
278 |
+
"9": 0.61
|
279 |
+
},
|
280 |
+
"sae_top_2_test_accuracy": {
|
281 |
+
"0": 0.878,
|
282 |
+
"1": 0.607,
|
283 |
+
"2": 0.848,
|
284 |
+
"6": 0.976,
|
285 |
+
"9": 0.855
|
286 |
+
},
|
287 |
+
"sae_top_5_test_accuracy": {
|
288 |
+
"0": 0.884,
|
289 |
+
"1": 0.696,
|
290 |
+
"2": 0.864,
|
291 |
+
"6": 0.982,
|
292 |
+
"9": 0.924
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"LabHC/bias_in_bios_class_set2_results": {
|
296 |
+
"sae_test_accuracy": {
|
297 |
+
"11": 0.9500000476837158,
|
298 |
+
"13": 0.9490000605583191,
|
299 |
+
"14": 0.9550000429153442,
|
300 |
+
"18": 0.9130000472068787,
|
301 |
+
"19": 0.9600000381469727
|
302 |
+
},
|
303 |
+
"llm_test_accuracy": {
|
304 |
+
"11": 0.9690000414848328,
|
305 |
+
"13": 0.9600000381469727,
|
306 |
+
"14": 0.9600000381469727,
|
307 |
+
"18": 0.9390000700950623,
|
308 |
+
"19": 0.9610000252723694
|
309 |
+
},
|
310 |
+
"llm_top_1_test_accuracy": {
|
311 |
+
"11": 0.555,
|
312 |
+
"13": 0.668,
|
313 |
+
"14": 0.638,
|
314 |
+
"18": 0.69,
|
315 |
+
"19": 0.796
|
316 |
+
},
|
317 |
+
"llm_top_2_test_accuracy": {
|
318 |
+
"11": 0.756,
|
319 |
+
"13": 0.714,
|
320 |
+
"14": 0.67,
|
321 |
+
"18": 0.717,
|
322 |
+
"19": 0.768
|
323 |
+
},
|
324 |
+
"llm_top_5_test_accuracy": {
|
325 |
+
"11": 0.794,
|
326 |
+
"13": 0.749,
|
327 |
+
"14": 0.723,
|
328 |
+
"18": 0.73,
|
329 |
+
"19": 0.831
|
330 |
+
},
|
331 |
+
"sae_top_1_test_accuracy": {
|
332 |
+
"11": 0.855,
|
333 |
+
"13": 0.662,
|
334 |
+
"14": 0.618,
|
335 |
+
"18": 0.672,
|
336 |
+
"19": 0.852
|
337 |
+
},
|
338 |
+
"sae_top_2_test_accuracy": {
|
339 |
+
"11": 0.853,
|
340 |
+
"13": 0.641,
|
341 |
+
"14": 0.736,
|
342 |
+
"18": 0.69,
|
343 |
+
"19": 0.854
|
344 |
+
},
|
345 |
+
"sae_top_5_test_accuracy": {
|
346 |
+
"11": 0.847,
|
347 |
+
"13": 0.769,
|
348 |
+
"14": 0.872,
|
349 |
+
"18": 0.907,
|
350 |
+
"19": 0.862
|
351 |
+
}
|
352 |
+
},
|
353 |
+
"LabHC/bias_in_bios_class_set3_results": {
|
354 |
+
"sae_test_accuracy": {
|
355 |
+
"20": 0.9580000638961792,
|
356 |
+
"21": 0.9240000247955322,
|
357 |
+
"22": 0.9190000295639038,
|
358 |
+
"25": 0.9570000171661377,
|
359 |
+
"26": 0.9000000357627869
|
360 |
+
},
|
361 |
+
"llm_test_accuracy": {
|
362 |
+
"20": 0.956000030040741,
|
363 |
+
"21": 0.9350000619888306,
|
364 |
+
"22": 0.9180000424385071,
|
365 |
+
"25": 0.9640000462532043,
|
366 |
+
"26": 0.8850000500679016
|
367 |
+
},
|
368 |
+
"llm_top_1_test_accuracy": {
|
369 |
+
"20": 0.693,
|
370 |
+
"21": 0.775,
|
371 |
+
"22": 0.645,
|
372 |
+
"25": 0.706,
|
373 |
+
"26": 0.616
|
374 |
+
},
|
375 |
+
"llm_top_2_test_accuracy": {
|
376 |
+
"20": 0.827,
|
377 |
+
"21": 0.761,
|
378 |
+
"22": 0.694,
|
379 |
+
"25": 0.778,
|
380 |
+
"26": 0.686
|
381 |
+
},
|
382 |
+
"llm_top_5_test_accuracy": {
|
383 |
+
"20": 0.855,
|
384 |
+
"21": 0.791,
|
385 |
+
"22": 0.725,
|
386 |
+
"25": 0.809,
|
387 |
+
"26": 0.672
|
388 |
+
},
|
389 |
+
"sae_top_1_test_accuracy": {
|
390 |
+
"20": 0.889,
|
391 |
+
"21": 0.61,
|
392 |
+
"22": 0.877,
|
393 |
+
"25": 0.881,
|
394 |
+
"26": 0.617
|
395 |
+
},
|
396 |
+
"sae_top_2_test_accuracy": {
|
397 |
+
"20": 0.866,
|
398 |
+
"21": 0.782,
|
399 |
+
"22": 0.873,
|
400 |
+
"25": 0.898,
|
401 |
+
"26": 0.625
|
402 |
+
},
|
403 |
+
"sae_top_5_test_accuracy": {
|
404 |
+
"20": 0.905,
|
405 |
+
"21": 0.789,
|
406 |
+
"22": 0.885,
|
407 |
+
"25": 0.899,
|
408 |
+
"26": 0.779
|
409 |
+
}
|
410 |
+
},
|
411 |
+
"canrager/amazon_reviews_mcauley_1and5_results": {
|
412 |
+
"sae_test_accuracy": {
|
413 |
+
"1": 0.9550000429153442,
|
414 |
+
"2": 0.9360000491142273,
|
415 |
+
"3": 0.9180000424385071,
|
416 |
+
"5": 0.9240000247955322,
|
417 |
+
"6": 0.8630000352859497
|
418 |
+
},
|
419 |
+
"llm_test_accuracy": {
|
420 |
+
"1": 0.9580000638961792,
|
421 |
+
"2": 0.9330000281333923,
|
422 |
+
"3": 0.9280000329017639,
|
423 |
+
"5": 0.9200000166893005,
|
424 |
+
"6": 0.862000048160553
|
425 |
+
},
|
426 |
+
"llm_top_1_test_accuracy": {
|
427 |
+
"1": 0.647,
|
428 |
+
"2": 0.603,
|
429 |
+
"3": 0.598,
|
430 |
+
"5": 0.555,
|
431 |
+
"6": 0.592
|
432 |
+
},
|
433 |
+
"llm_top_2_test_accuracy": {
|
434 |
+
"1": 0.75,
|
435 |
+
"2": 0.648,
|
436 |
+
"3": 0.607,
|
437 |
+
"5": 0.606,
|
438 |
+
"6": 0.626
|
439 |
+
},
|
440 |
+
"llm_top_5_test_accuracy": {
|
441 |
+
"1": 0.767,
|
442 |
+
"2": 0.641,
|
443 |
+
"3": 0.645,
|
444 |
+
"5": 0.638,
|
445 |
+
"6": 0.676
|
446 |
+
},
|
447 |
+
"sae_top_1_test_accuracy": {
|
448 |
+
"1": 0.792,
|
449 |
+
"2": 0.591,
|
450 |
+
"3": 0.568,
|
451 |
+
"5": 0.829,
|
452 |
+
"6": 0.635
|
453 |
+
},
|
454 |
+
"sae_top_2_test_accuracy": {
|
455 |
+
"1": 0.822,
|
456 |
+
"2": 0.621,
|
457 |
+
"3": 0.571,
|
458 |
+
"5": 0.823,
|
459 |
+
"6": 0.638
|
460 |
+
},
|
461 |
+
"sae_top_5_test_accuracy": {
|
462 |
+
"1": 0.838,
|
463 |
+
"2": 0.862,
|
464 |
+
"3": 0.609,
|
465 |
+
"5": 0.825,
|
466 |
+
"6": 0.665
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
|
470 |
+
"sae_test_accuracy": {
|
471 |
+
"1.0": 0.9770000576972961,
|
472 |
+
"5.0": 0.9740000367164612
|
473 |
+
},
|
474 |
+
"llm_test_accuracy": {
|
475 |
+
"1.0": 0.9780000448226929,
|
476 |
+
"5.0": 0.9810000658035278
|
477 |
+
},
|
478 |
+
"llm_top_1_test_accuracy": {
|
479 |
+
"1.0": 0.673,
|
480 |
+
"5.0": 0.673
|
481 |
+
},
|
482 |
+
"llm_top_2_test_accuracy": {
|
483 |
+
"1.0": 0.724,
|
484 |
+
"5.0": 0.724
|
485 |
+
},
|
486 |
+
"llm_top_5_test_accuracy": {
|
487 |
+
"1.0": 0.766,
|
488 |
+
"5.0": 0.766
|
489 |
+
},
|
490 |
+
"sae_top_1_test_accuracy": {
|
491 |
+
"1.0": 0.94,
|
492 |
+
"5.0": 0.94
|
493 |
+
},
|
494 |
+
"sae_top_2_test_accuracy": {
|
495 |
+
"1.0": 0.941,
|
496 |
+
"5.0": 0.941
|
497 |
+
},
|
498 |
+
"sae_top_5_test_accuracy": {
|
499 |
+
"1.0": 0.95,
|
500 |
+
"5.0": 0.95
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"codeparrot/github-code_results": {
|
504 |
+
"sae_test_accuracy": {
|
505 |
+
"C": 0.9600000381469727,
|
506 |
+
"Python": 0.984000027179718,
|
507 |
+
"HTML": 0.9920000433921814,
|
508 |
+
"Java": 0.9670000672340393,
|
509 |
+
"PHP": 0.9540000557899475
|
510 |
+
},
|
511 |
+
"llm_test_accuracy": {
|
512 |
+
"C": 0.956000030040741,
|
513 |
+
"Python": 0.984000027179718,
|
514 |
+
"HTML": 0.9900000691413879,
|
515 |
+
"Java": 0.9670000672340393,
|
516 |
+
"PHP": 0.9570000171661377
|
517 |
+
},
|
518 |
+
"llm_top_1_test_accuracy": {
|
519 |
+
"C": 0.666,
|
520 |
+
"Python": 0.626,
|
521 |
+
"HTML": 0.721,
|
522 |
+
"Java": 0.619,
|
523 |
+
"PHP": 0.594
|
524 |
+
},
|
525 |
+
"llm_top_2_test_accuracy": {
|
526 |
+
"C": 0.679,
|
527 |
+
"Python": 0.674,
|
528 |
+
"HTML": 0.8,
|
529 |
+
"Java": 0.676,
|
530 |
+
"PHP": 0.651
|
531 |
+
},
|
532 |
+
"llm_top_5_test_accuracy": {
|
533 |
+
"C": 0.783,
|
534 |
+
"Python": 0.717,
|
535 |
+
"HTML": 0.935,
|
536 |
+
"Java": 0.733,
|
537 |
+
"PHP": 0.715
|
538 |
+
},
|
539 |
+
"sae_top_1_test_accuracy": {
|
540 |
+
"C": 0.595,
|
541 |
+
"Python": 0.629,
|
542 |
+
"HTML": 0.565,
|
543 |
+
"Java": 0.647,
|
544 |
+
"PHP": 0.603
|
545 |
+
},
|
546 |
+
"sae_top_2_test_accuracy": {
|
547 |
+
"C": 0.622,
|
548 |
+
"Python": 0.654,
|
549 |
+
"HTML": 0.824,
|
550 |
+
"Java": 0.649,
|
551 |
+
"PHP": 0.912
|
552 |
+
},
|
553 |
+
"sae_top_5_test_accuracy": {
|
554 |
+
"C": 0.654,
|
555 |
+
"Python": 0.7,
|
556 |
+
"HTML": 0.807,
|
557 |
+
"Java": 0.674,
|
558 |
+
"PHP": 0.917
|
559 |
+
}
|
560 |
+
},
|
561 |
+
"fancyzhx/ag_news_results": {
|
562 |
+
"sae_test_accuracy": {
|
563 |
+
"0": 0.9380000233650208,
|
564 |
+
"1": 0.9800000190734863,
|
565 |
+
"2": 0.9330000281333923,
|
566 |
+
"3": 0.9430000185966492
|
567 |
+
},
|
568 |
+
"llm_test_accuracy": {
|
569 |
+
"0": 0.940000057220459,
|
570 |
+
"1": 0.9860000610351562,
|
571 |
+
"2": 0.9200000166893005,
|
572 |
+
"3": 0.9540000557899475
|
573 |
+
},
|
574 |
+
"llm_top_1_test_accuracy": {
|
575 |
+
"0": 0.573,
|
576 |
+
"1": 0.671,
|
577 |
+
"2": 0.672,
|
578 |
+
"3": 0.635
|
579 |
+
},
|
580 |
+
"llm_top_2_test_accuracy": {
|
581 |
+
"0": 0.802,
|
582 |
+
"1": 0.808,
|
583 |
+
"2": 0.701,
|
584 |
+
"3": 0.816
|
585 |
+
},
|
586 |
+
"llm_top_5_test_accuracy": {
|
587 |
+
"0": 0.81,
|
588 |
+
"1": 0.891,
|
589 |
+
"2": 0.752,
|
590 |
+
"3": 0.832
|
591 |
+
},
|
592 |
+
"sae_top_1_test_accuracy": {
|
593 |
+
"0": 0.591,
|
594 |
+
"1": 0.934,
|
595 |
+
"2": 0.562,
|
596 |
+
"3": 0.653
|
597 |
+
},
|
598 |
+
"sae_top_2_test_accuracy": {
|
599 |
+
"0": 0.704,
|
600 |
+
"1": 0.937,
|
601 |
+
"2": 0.69,
|
602 |
+
"3": 0.656
|
603 |
+
},
|
604 |
+
"sae_top_5_test_accuracy": {
|
605 |
+
"0": 0.835,
|
606 |
+
"1": 0.954,
|
607 |
+
"2": 0.834,
|
608 |
+
"3": 0.813
|
609 |
+
}
|
610 |
+
},
|
611 |
+
"Helsinki-NLP/europarl_results": {
|
612 |
+
"sae_test_accuracy": {
|
613 |
+
"en": 0.999000072479248,
|
614 |
+
"fr": 0.9970000386238098,
|
615 |
+
"de": 0.9980000257492065,
|
616 |
+
"es": 1.0,
|
617 |
+
"nl": 0.9970000386238098
|
618 |
+
},
|
619 |
+
"llm_test_accuracy": {
|
620 |
+
"en": 1.0,
|
621 |
+
"fr": 0.999000072479248,
|
622 |
+
"de": 1.0,
|
623 |
+
"es": 0.999000072479248,
|
624 |
+
"nl": 1.0
|
625 |
+
},
|
626 |
+
"llm_top_1_test_accuracy": {
|
627 |
+
"en": 0.739,
|
628 |
+
"fr": 0.585,
|
629 |
+
"de": 0.758,
|
630 |
+
"es": 0.496,
|
631 |
+
"nl": 0.649
|
632 |
+
},
|
633 |
+
"llm_top_2_test_accuracy": {
|
634 |
+
"en": 0.829,
|
635 |
+
"fr": 0.582,
|
636 |
+
"de": 0.82,
|
637 |
+
"es": 0.958,
|
638 |
+
"nl": 0.753
|
639 |
+
},
|
640 |
+
"llm_top_5_test_accuracy": {
|
641 |
+
"en": 0.892,
|
642 |
+
"fr": 0.888,
|
643 |
+
"de": 0.894,
|
644 |
+
"es": 0.98,
|
645 |
+
"nl": 0.852
|
646 |
+
},
|
647 |
+
"sae_top_1_test_accuracy": {
|
648 |
+
"en": 0.651,
|
649 |
+
"fr": 0.996,
|
650 |
+
"de": 0.925,
|
651 |
+
"es": 0.897,
|
652 |
+
"nl": 0.997
|
653 |
+
},
|
654 |
+
"sae_top_2_test_accuracy": {
|
655 |
+
"en": 0.776,
|
656 |
+
"fr": 0.997,
|
657 |
+
"de": 0.988,
|
658 |
+
"es": 0.992,
|
659 |
+
"nl": 0.997
|
660 |
+
},
|
661 |
+
"sae_top_5_test_accuracy": {
|
662 |
+
"en": 0.998,
|
663 |
+
"fr": 0.997,
|
664 |
+
"de": 0.997,
|
665 |
+
"es": 0.996,
|
666 |
+
"nl": 0.999
|
667 |
+
}
|
668 |
+
}
|
669 |
+
}
|
670 |
+
}
|
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "sparse_probing",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"LabHC/bias_in_bios_class_set2",
|
8 |
+
"LabHC/bias_in_bios_class_set3",
|
9 |
+
"canrager/amazon_reviews_mcauley_1and5",
|
10 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment",
|
11 |
+
"codeparrot/github-code",
|
12 |
+
"fancyzhx/ag_news",
|
13 |
+
"Helsinki-NLP/europarl"
|
14 |
+
],
|
15 |
+
"probe_train_set_size": 4000,
|
16 |
+
"probe_test_set_size": 1000,
|
17 |
+
"context_length": 128,
|
18 |
+
"sae_batch_size": 125,
|
19 |
+
"llm_batch_size": 32,
|
20 |
+
"llm_dtype": "bfloat16",
|
21 |
+
"model_name": "gemma-2-2b",
|
22 |
+
"k_values": [
|
23 |
+
1,
|
24 |
+
2,
|
25 |
+
5
|
26 |
+
],
|
27 |
+
"lower_vram_usage": false
|
28 |
+
},
|
29 |
+
"eval_id": "c9b1c992-08d4-4128-85b3-7725ac95e5a6",
|
30 |
+
"datetime_epoch_millis": 1740165200108,
|
31 |
+
"eval_result_metrics": {
|
32 |
+
"llm": {
|
33 |
+
"llm_test_accuracy": 0.9595375448465346,
|
34 |
+
"llm_top_1_test_accuracy": 0.64956875,
|
35 |
+
"llm_top_2_test_accuracy": 0.72589375,
|
36 |
+
"llm_top_5_test_accuracy": 0.78265625,
|
37 |
+
"llm_top_10_test_accuracy": null,
|
38 |
+
"llm_top_20_test_accuracy": null,
|
39 |
+
"llm_top_50_test_accuracy": null,
|
40 |
+
"llm_top_100_test_accuracy": null
|
41 |
+
},
|
42 |
+
"sae": {
|
43 |
+
"sae_test_accuracy": 0.9555687937885523,
|
44 |
+
"sae_top_1_test_accuracy": 0.71979375,
|
45 |
+
"sae_top_2_test_accuracy": 0.8020937500000002,
|
46 |
+
"sae_top_5_test_accuracy": 0.86435625,
|
47 |
+
"sae_top_10_test_accuracy": null,
|
48 |
+
"sae_top_20_test_accuracy": null,
|
49 |
+
"sae_top_50_test_accuracy": null,
|
50 |
+
"sae_top_100_test_accuracy": null
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"eval_result_details": [
|
54 |
+
{
|
55 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
|
56 |
+
"llm_test_accuracy": 0.966800057888031,
|
57 |
+
"llm_top_1_test_accuracy": 0.6397999999999999,
|
58 |
+
"llm_top_2_test_accuracy": 0.6954,
|
59 |
+
"llm_top_5_test_accuracy": 0.7869999999999999,
|
60 |
+
"llm_top_10_test_accuracy": null,
|
61 |
+
"llm_top_20_test_accuracy": null,
|
62 |
+
"llm_top_50_test_accuracy": null,
|
63 |
+
"llm_top_100_test_accuracy": null,
|
64 |
+
"sae_test_accuracy": 0.962000048160553,
|
65 |
+
"sae_top_1_test_accuracy": 0.6808,
|
66 |
+
"sae_top_2_test_accuracy": 0.8480000000000001,
|
67 |
+
"sae_top_5_test_accuracy": 0.9032,
|
68 |
+
"sae_top_10_test_accuracy": null,
|
69 |
+
"sae_top_20_test_accuracy": null,
|
70 |
+
"sae_top_50_test_accuracy": null,
|
71 |
+
"sae_top_100_test_accuracy": null
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
|
75 |
+
"llm_test_accuracy": 0.9578000426292419,
|
76 |
+
"llm_top_1_test_accuracy": 0.6694000000000001,
|
77 |
+
"llm_top_2_test_accuracy": 0.725,
|
78 |
+
"llm_top_5_test_accuracy": 0.7654,
|
79 |
+
"llm_top_10_test_accuracy": null,
|
80 |
+
"llm_top_20_test_accuracy": null,
|
81 |
+
"llm_top_50_test_accuracy": null,
|
82 |
+
"llm_top_100_test_accuracy": null,
|
83 |
+
"sae_test_accuracy": 0.9456000447273254,
|
84 |
+
"sae_top_1_test_accuracy": 0.669,
|
85 |
+
"sae_top_2_test_accuracy": 0.7938000000000001,
|
86 |
+
"sae_top_5_test_accuracy": 0.8597999999999999,
|
87 |
+
"sae_top_10_test_accuracy": null,
|
88 |
+
"sae_top_20_test_accuracy": null,
|
89 |
+
"sae_top_50_test_accuracy": null,
|
90 |
+
"sae_top_100_test_accuracy": null
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
|
94 |
+
"llm_test_accuracy": 0.9316000461578369,
|
95 |
+
"llm_top_1_test_accuracy": 0.687,
|
96 |
+
"llm_top_2_test_accuracy": 0.7492,
|
97 |
+
"llm_top_5_test_accuracy": 0.7704000000000001,
|
98 |
+
"llm_top_10_test_accuracy": null,
|
99 |
+
"llm_top_20_test_accuracy": null,
|
100 |
+
"llm_top_50_test_accuracy": null,
|
101 |
+
"llm_top_100_test_accuracy": null,
|
102 |
+
"sae_test_accuracy": 0.9292000293731689,
|
103 |
+
"sae_top_1_test_accuracy": 0.704,
|
104 |
+
"sae_top_2_test_accuracy": 0.8064,
|
105 |
+
"sae_top_5_test_accuracy": 0.8610000000000001,
|
106 |
+
"sae_top_10_test_accuracy": null,
|
107 |
+
"sae_top_20_test_accuracy": null,
|
108 |
+
"sae_top_50_test_accuracy": null,
|
109 |
+
"sae_top_100_test_accuracy": null
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
|
113 |
+
"llm_test_accuracy": 0.9202000379562378,
|
114 |
+
"llm_top_1_test_accuracy": 0.599,
|
115 |
+
"llm_top_2_test_accuracy": 0.6474,
|
116 |
+
"llm_top_5_test_accuracy": 0.6734,
|
117 |
+
"llm_top_10_test_accuracy": null,
|
118 |
+
"llm_top_20_test_accuracy": null,
|
119 |
+
"llm_top_50_test_accuracy": null,
|
120 |
+
"llm_top_100_test_accuracy": null,
|
121 |
+
"sae_test_accuracy": 0.9170000433921814,
|
122 |
+
"sae_top_1_test_accuracy": 0.7496,
|
123 |
+
"sae_top_2_test_accuracy": 0.774,
|
124 |
+
"sae_top_5_test_accuracy": 0.8013999999999999,
|
125 |
+
"sae_top_10_test_accuracy": null,
|
126 |
+
"sae_top_20_test_accuracy": null,
|
127 |
+
"sae_top_50_test_accuracy": null,
|
128 |
+
"sae_top_100_test_accuracy": null
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
|
132 |
+
"llm_test_accuracy": 0.9795000553131104,
|
133 |
+
"llm_top_1_test_accuracy": 0.673,
|
134 |
+
"llm_top_2_test_accuracy": 0.724,
|
135 |
+
"llm_top_5_test_accuracy": 0.766,
|
136 |
+
"llm_top_10_test_accuracy": null,
|
137 |
+
"llm_top_20_test_accuracy": null,
|
138 |
+
"llm_top_50_test_accuracy": null,
|
139 |
+
"llm_top_100_test_accuracy": null,
|
140 |
+
"sae_test_accuracy": 0.9750000536441803,
|
141 |
+
"sae_top_1_test_accuracy": 0.847,
|
142 |
+
"sae_top_2_test_accuracy": 0.847,
|
143 |
+
"sae_top_5_test_accuracy": 0.929,
|
144 |
+
"sae_top_10_test_accuracy": null,
|
145 |
+
"sae_top_20_test_accuracy": null,
|
146 |
+
"sae_top_50_test_accuracy": null,
|
147 |
+
"sae_top_100_test_accuracy": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"dataset_name": "codeparrot/github-code_results",
|
151 |
+
"llm_test_accuracy": 0.9708000421524048,
|
152 |
+
"llm_top_1_test_accuracy": 0.6451999999999999,
|
153 |
+
"llm_top_2_test_accuracy": 0.6960000000000001,
|
154 |
+
"llm_top_5_test_accuracy": 0.7766,
|
155 |
+
"llm_top_10_test_accuracy": null,
|
156 |
+
"llm_top_20_test_accuracy": null,
|
157 |
+
"llm_top_50_test_accuracy": null,
|
158 |
+
"llm_top_100_test_accuracy": null,
|
159 |
+
"sae_test_accuracy": 0.9676000475883484,
|
160 |
+
"sae_top_1_test_accuracy": 0.64,
|
161 |
+
"sae_top_2_test_accuracy": 0.6508,
|
162 |
+
"sae_top_5_test_accuracy": 0.7672,
|
163 |
+
"sae_top_10_test_accuracy": null,
|
164 |
+
"sae_top_20_test_accuracy": null,
|
165 |
+
"sae_top_50_test_accuracy": null,
|
166 |
+
"sae_top_100_test_accuracy": null
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"dataset_name": "fancyzhx/ag_news_results",
|
170 |
+
"llm_test_accuracy": 0.9500000476837158,
|
171 |
+
"llm_top_1_test_accuracy": 0.63775,
|
172 |
+
"llm_top_2_test_accuracy": 0.78175,
|
173 |
+
"llm_top_5_test_accuracy": 0.82125,
|
174 |
+
"llm_top_10_test_accuracy": null,
|
175 |
+
"llm_top_20_test_accuracy": null,
|
176 |
+
"llm_top_50_test_accuracy": null,
|
177 |
+
"llm_top_100_test_accuracy": null,
|
178 |
+
"sae_test_accuracy": 0.9497500509023666,
|
179 |
+
"sae_top_1_test_accuracy": 0.60775,
|
180 |
+
"sae_top_2_test_accuracy": 0.70375,
|
181 |
+
"sae_top_5_test_accuracy": 0.7982499999999999,
|
182 |
+
"sae_top_10_test_accuracy": null,
|
183 |
+
"sae_top_20_test_accuracy": null,
|
184 |
+
"sae_top_50_test_accuracy": null,
|
185 |
+
"sae_top_100_test_accuracy": null
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"dataset_name": "Helsinki-NLP/europarl_results",
|
189 |
+
"llm_test_accuracy": 0.9996000289916992,
|
190 |
+
"llm_top_1_test_accuracy": 0.6454,
|
191 |
+
"llm_top_2_test_accuracy": 0.7884,
|
192 |
+
"llm_top_5_test_accuracy": 0.9012,
|
193 |
+
"llm_top_10_test_accuracy": null,
|
194 |
+
"llm_top_20_test_accuracy": null,
|
195 |
+
"llm_top_50_test_accuracy": null,
|
196 |
+
"llm_top_100_test_accuracy": null,
|
197 |
+
"sae_test_accuracy": 0.9984000325202942,
|
198 |
+
"sae_top_1_test_accuracy": 0.8602000000000001,
|
199 |
+
"sae_top_2_test_accuracy": 0.993,
|
200 |
+
"sae_top_5_test_accuracy": 0.9950000000000001,
|
201 |
+
"sae_top_10_test_accuracy": null,
|
202 |
+
"sae_top_20_test_accuracy": null,
|
203 |
+
"sae_top_50_test_accuracy": null,
|
204 |
+
"sae_top_100_test_accuracy": null
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
208 |
+
"sae_lens_id": "custom_sae",
|
209 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0",
|
210 |
+
"sae_lens_version": "5.4.2",
|
211 |
+
"sae_cfg_dict": {
|
212 |
+
"model_name": "gemma-2-2b",
|
213 |
+
"d_in": 2304,
|
214 |
+
"d_sae": 65536,
|
215 |
+
"hook_layer": 12,
|
216 |
+
"hook_name": "blocks.12.hook_resid_post",
|
217 |
+
"context_size": null,
|
218 |
+
"hook_head_index": null,
|
219 |
+
"architecture": "topk",
|
220 |
+
"apply_b_dec_to_input": null,
|
221 |
+
"finetuning_scaling_factor": null,
|
222 |
+
"activation_fn_str": "",
|
223 |
+
"prepend_bos": true,
|
224 |
+
"normalize_activations": "none",
|
225 |
+
"dtype": "bfloat16",
|
226 |
+
"device": "",
|
227 |
+
"dataset_path": "",
|
228 |
+
"dataset_trust_remote_code": true,
|
229 |
+
"seqpos_slice": [
|
230 |
+
null
|
231 |
+
],
|
232 |
+
"training_tokens": -100000,
|
233 |
+
"sae_lens_training_version": null,
|
234 |
+
"neuronpedia_id": null
|
235 |
+
},
|
236 |
+
"eval_result_unstructured": {
|
237 |
+
"LabHC/bias_in_bios_class_set1_results": {
|
238 |
+
"sae_test_accuracy": {
|
239 |
+
"0": 0.9450000524520874,
|
240 |
+
"1": 0.9600000381469727,
|
241 |
+
"2": 0.9470000267028809,
|
242 |
+
"6": 0.9860000610351562,
|
243 |
+
"9": 0.9720000624656677
|
244 |
+
},
|
245 |
+
"llm_test_accuracy": {
|
246 |
+
"0": 0.9510000348091125,
|
247 |
+
"1": 0.9670000672340393,
|
248 |
+
"2": 0.9530000686645508,
|
249 |
+
"6": 0.987000048160553,
|
250 |
+
"9": 0.9760000705718994
|
251 |
+
},
|
252 |
+
"llm_top_1_test_accuracy": {
|
253 |
+
"0": 0.577,
|
254 |
+
"1": 0.613,
|
255 |
+
"2": 0.662,
|
256 |
+
"6": 0.787,
|
257 |
+
"9": 0.56
|
258 |
+
},
|
259 |
+
"llm_top_2_test_accuracy": {
|
260 |
+
"0": 0.574,
|
261 |
+
"1": 0.66,
|
262 |
+
"2": 0.718,
|
263 |
+
"6": 0.811,
|
264 |
+
"9": 0.714
|
265 |
+
},
|
266 |
+
"llm_top_5_test_accuracy": {
|
267 |
+
"0": 0.713,
|
268 |
+
"1": 0.711,
|
269 |
+
"2": 0.755,
|
270 |
+
"6": 0.895,
|
271 |
+
"9": 0.861
|
272 |
+
},
|
273 |
+
"sae_top_1_test_accuracy": {
|
274 |
+
"0": 0.571,
|
275 |
+
"1": 0.631,
|
276 |
+
"2": 0.835,
|
277 |
+
"6": 0.808,
|
278 |
+
"9": 0.559
|
279 |
+
},
|
280 |
+
"sae_top_2_test_accuracy": {
|
281 |
+
"0": 0.856,
|
282 |
+
"1": 0.809,
|
283 |
+
"2": 0.843,
|
284 |
+
"6": 0.976,
|
285 |
+
"9": 0.756
|
286 |
+
},
|
287 |
+
"sae_top_5_test_accuracy": {
|
288 |
+
"0": 0.869,
|
289 |
+
"1": 0.851,
|
290 |
+
"2": 0.864,
|
291 |
+
"6": 0.99,
|
292 |
+
"9": 0.942
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"LabHC/bias_in_bios_class_set2_results": {
|
296 |
+
"sae_test_accuracy": {
|
297 |
+
"11": 0.9580000638961792,
|
298 |
+
"13": 0.9470000267028809,
|
299 |
+
"14": 0.9460000395774841,
|
300 |
+
"18": 0.9220000505447388,
|
301 |
+
"19": 0.9550000429153442
|
302 |
+
},
|
303 |
+
"llm_test_accuracy": {
|
304 |
+
"11": 0.9690000414848328,
|
305 |
+
"13": 0.9600000381469727,
|
306 |
+
"14": 0.9600000381469727,
|
307 |
+
"18": 0.9390000700950623,
|
308 |
+
"19": 0.9610000252723694
|
309 |
+
},
|
310 |
+
"llm_top_1_test_accuracy": {
|
311 |
+
"11": 0.555,
|
312 |
+
"13": 0.668,
|
313 |
+
"14": 0.638,
|
314 |
+
"18": 0.69,
|
315 |
+
"19": 0.796
|
316 |
+
},
|
317 |
+
"llm_top_2_test_accuracy": {
|
318 |
+
"11": 0.756,
|
319 |
+
"13": 0.714,
|
320 |
+
"14": 0.67,
|
321 |
+
"18": 0.717,
|
322 |
+
"19": 0.768
|
323 |
+
},
|
324 |
+
"llm_top_5_test_accuracy": {
|
325 |
+
"11": 0.794,
|
326 |
+
"13": 0.749,
|
327 |
+
"14": 0.723,
|
328 |
+
"18": 0.73,
|
329 |
+
"19": 0.831
|
330 |
+
},
|
331 |
+
"sae_top_1_test_accuracy": {
|
332 |
+
"11": 0.539,
|
333 |
+
"13": 0.658,
|
334 |
+
"14": 0.648,
|
335 |
+
"18": 0.703,
|
336 |
+
"19": 0.797
|
337 |
+
},
|
338 |
+
"sae_top_2_test_accuracy": {
|
339 |
+
"11": 0.858,
|
340 |
+
"13": 0.675,
|
341 |
+
"14": 0.878,
|
342 |
+
"18": 0.729,
|
343 |
+
"19": 0.829
|
344 |
+
},
|
345 |
+
"sae_top_5_test_accuracy": {
|
346 |
+
"11": 0.871,
|
347 |
+
"13": 0.792,
|
348 |
+
"14": 0.876,
|
349 |
+
"18": 0.897,
|
350 |
+
"19": 0.863
|
351 |
+
}
|
352 |
+
},
|
353 |
+
"LabHC/bias_in_bios_class_set3_results": {
|
354 |
+
"sae_test_accuracy": {
|
355 |
+
"20": 0.9520000219345093,
|
356 |
+
"21": 0.9220000505447388,
|
357 |
+
"22": 0.9240000247955322,
|
358 |
+
"25": 0.956000030040741,
|
359 |
+
"26": 0.8920000195503235
|
360 |
+
},
|
361 |
+
"llm_test_accuracy": {
|
362 |
+
"20": 0.956000030040741,
|
363 |
+
"21": 0.9350000619888306,
|
364 |
+
"22": 0.9180000424385071,
|
365 |
+
"25": 0.9640000462532043,
|
366 |
+
"26": 0.8850000500679016
|
367 |
+
},
|
368 |
+
"llm_top_1_test_accuracy": {
|
369 |
+
"20": 0.693,
|
370 |
+
"21": 0.775,
|
371 |
+
"22": 0.645,
|
372 |
+
"25": 0.706,
|
373 |
+
"26": 0.616
|
374 |
+
},
|
375 |
+
"llm_top_2_test_accuracy": {
|
376 |
+
"20": 0.827,
|
377 |
+
"21": 0.761,
|
378 |
+
"22": 0.694,
|
379 |
+
"25": 0.778,
|
380 |
+
"26": 0.686
|
381 |
+
},
|
382 |
+
"llm_top_5_test_accuracy": {
|
383 |
+
"20": 0.855,
|
384 |
+
"21": 0.791,
|
385 |
+
"22": 0.725,
|
386 |
+
"25": 0.809,
|
387 |
+
"26": 0.672
|
388 |
+
},
|
389 |
+
"sae_top_1_test_accuracy": {
|
390 |
+
"20": 0.84,
|
391 |
+
"21": 0.476,
|
392 |
+
"22": 0.88,
|
393 |
+
"25": 0.693,
|
394 |
+
"26": 0.631
|
395 |
+
},
|
396 |
+
"sae_top_2_test_accuracy": {
|
397 |
+
"20": 0.849,
|
398 |
+
"21": 0.747,
|
399 |
+
"22": 0.881,
|
400 |
+
"25": 0.849,
|
401 |
+
"26": 0.706
|
402 |
+
},
|
403 |
+
"sae_top_5_test_accuracy": {
|
404 |
+
"20": 0.911,
|
405 |
+
"21": 0.844,
|
406 |
+
"22": 0.874,
|
407 |
+
"25": 0.893,
|
408 |
+
"26": 0.783
|
409 |
+
}
|
410 |
+
},
|
411 |
+
"canrager/amazon_reviews_mcauley_1and5_results": {
|
412 |
+
"sae_test_accuracy": {
|
413 |
+
"1": 0.9510000348091125,
|
414 |
+
"2": 0.9460000395774841,
|
415 |
+
"3": 0.9110000729560852,
|
416 |
+
"5": 0.9200000166893005,
|
417 |
+
"6": 0.8570000529289246
|
418 |
+
},
|
419 |
+
"llm_test_accuracy": {
|
420 |
+
"1": 0.9580000638961792,
|
421 |
+
"2": 0.9330000281333923,
|
422 |
+
"3": 0.9280000329017639,
|
423 |
+
"5": 0.9200000166893005,
|
424 |
+
"6": 0.862000048160553
|
425 |
+
},
|
426 |
+
"llm_top_1_test_accuracy": {
|
427 |
+
"1": 0.647,
|
428 |
+
"2": 0.603,
|
429 |
+
"3": 0.598,
|
430 |
+
"5": 0.555,
|
431 |
+
"6": 0.592
|
432 |
+
},
|
433 |
+
"llm_top_2_test_accuracy": {
|
434 |
+
"1": 0.75,
|
435 |
+
"2": 0.648,
|
436 |
+
"3": 0.607,
|
437 |
+
"5": 0.606,
|
438 |
+
"6": 0.626
|
439 |
+
},
|
440 |
+
"llm_top_5_test_accuracy": {
|
441 |
+
"1": 0.767,
|
442 |
+
"2": 0.641,
|
443 |
+
"3": 0.645,
|
444 |
+
"5": 0.638,
|
445 |
+
"6": 0.676
|
446 |
+
},
|
447 |
+
"sae_top_1_test_accuracy": {
|
448 |
+
"1": 0.856,
|
449 |
+
"2": 0.862,
|
450 |
+
"3": 0.592,
|
451 |
+
"5": 0.819,
|
452 |
+
"6": 0.619
|
453 |
+
},
|
454 |
+
"sae_top_2_test_accuracy": {
|
455 |
+
"1": 0.905,
|
456 |
+
"2": 0.867,
|
457 |
+
"3": 0.612,
|
458 |
+
"5": 0.818,
|
459 |
+
"6": 0.668
|
460 |
+
},
|
461 |
+
"sae_top_5_test_accuracy": {
|
462 |
+
"1": 0.908,
|
463 |
+
"2": 0.862,
|
464 |
+
"3": 0.63,
|
465 |
+
"5": 0.873,
|
466 |
+
"6": 0.734
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
|
470 |
+
"sae_test_accuracy": {
|
471 |
+
"1.0": 0.9760000705718994,
|
472 |
+
"5.0": 0.9740000367164612
|
473 |
+
},
|
474 |
+
"llm_test_accuracy": {
|
475 |
+
"1.0": 0.9780000448226929,
|
476 |
+
"5.0": 0.9810000658035278
|
477 |
+
},
|
478 |
+
"llm_top_1_test_accuracy": {
|
479 |
+
"1.0": 0.673,
|
480 |
+
"5.0": 0.673
|
481 |
+
},
|
482 |
+
"llm_top_2_test_accuracy": {
|
483 |
+
"1.0": 0.724,
|
484 |
+
"5.0": 0.724
|
485 |
+
},
|
486 |
+
"llm_top_5_test_accuracy": {
|
487 |
+
"1.0": 0.766,
|
488 |
+
"5.0": 0.766
|
489 |
+
},
|
490 |
+
"sae_top_1_test_accuracy": {
|
491 |
+
"1.0": 0.847,
|
492 |
+
"5.0": 0.847
|
493 |
+
},
|
494 |
+
"sae_top_2_test_accuracy": {
|
495 |
+
"1.0": 0.847,
|
496 |
+
"5.0": 0.847
|
497 |
+
},
|
498 |
+
"sae_top_5_test_accuracy": {
|
499 |
+
"1.0": 0.929,
|
500 |
+
"5.0": 0.929
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"codeparrot/github-code_results": {
|
504 |
+
"sae_test_accuracy": {
|
505 |
+
"C": 0.9540000557899475,
|
506 |
+
"Python": 0.984000027179718,
|
507 |
+
"HTML": 0.9810000658035278,
|
508 |
+
"Java": 0.9640000462532043,
|
509 |
+
"PHP": 0.9550000429153442
|
510 |
+
},
|
511 |
+
"llm_test_accuracy": {
|
512 |
+
"C": 0.956000030040741,
|
513 |
+
"Python": 0.984000027179718,
|
514 |
+
"HTML": 0.9900000691413879,
|
515 |
+
"Java": 0.9670000672340393,
|
516 |
+
"PHP": 0.9570000171661377
|
517 |
+
},
|
518 |
+
"llm_top_1_test_accuracy": {
|
519 |
+
"C": 0.666,
|
520 |
+
"Python": 0.626,
|
521 |
+
"HTML": 0.721,
|
522 |
+
"Java": 0.619,
|
523 |
+
"PHP": 0.594
|
524 |
+
},
|
525 |
+
"llm_top_2_test_accuracy": {
|
526 |
+
"C": 0.679,
|
527 |
+
"Python": 0.674,
|
528 |
+
"HTML": 0.8,
|
529 |
+
"Java": 0.676,
|
530 |
+
"PHP": 0.651
|
531 |
+
},
|
532 |
+
"llm_top_5_test_accuracy": {
|
533 |
+
"C": 0.783,
|
534 |
+
"Python": 0.717,
|
535 |
+
"HTML": 0.935,
|
536 |
+
"Java": 0.733,
|
537 |
+
"PHP": 0.715
|
538 |
+
},
|
539 |
+
"sae_top_1_test_accuracy": {
|
540 |
+
"C": 0.622,
|
541 |
+
"Python": 0.661,
|
542 |
+
"HTML": 0.692,
|
543 |
+
"Java": 0.629,
|
544 |
+
"PHP": 0.596
|
545 |
+
},
|
546 |
+
"sae_top_2_test_accuracy": {
|
547 |
+
"C": 0.599,
|
548 |
+
"Python": 0.65,
|
549 |
+
"HTML": 0.795,
|
550 |
+
"Java": 0.628,
|
551 |
+
"PHP": 0.582
|
552 |
+
},
|
553 |
+
"sae_top_5_test_accuracy": {
|
554 |
+
"C": 0.671,
|
555 |
+
"Python": 0.684,
|
556 |
+
"HTML": 0.865,
|
557 |
+
"Java": 0.705,
|
558 |
+
"PHP": 0.911
|
559 |
+
}
|
560 |
+
},
|
561 |
+
"fancyzhx/ag_news_results": {
|
562 |
+
"sae_test_accuracy": {
|
563 |
+
"0": 0.9360000491142273,
|
564 |
+
"1": 0.9850000739097595,
|
565 |
+
"2": 0.9330000281333923,
|
566 |
+
"3": 0.9450000524520874
|
567 |
+
},
|
568 |
+
"llm_test_accuracy": {
|
569 |
+
"0": 0.940000057220459,
|
570 |
+
"1": 0.9860000610351562,
|
571 |
+
"2": 0.9200000166893005,
|
572 |
+
"3": 0.9540000557899475
|
573 |
+
},
|
574 |
+
"llm_top_1_test_accuracy": {
|
575 |
+
"0": 0.573,
|
576 |
+
"1": 0.671,
|
577 |
+
"2": 0.672,
|
578 |
+
"3": 0.635
|
579 |
+
},
|
580 |
+
"llm_top_2_test_accuracy": {
|
581 |
+
"0": 0.802,
|
582 |
+
"1": 0.808,
|
583 |
+
"2": 0.701,
|
584 |
+
"3": 0.816
|
585 |
+
},
|
586 |
+
"llm_top_5_test_accuracy": {
|
587 |
+
"0": 0.81,
|
588 |
+
"1": 0.891,
|
589 |
+
"2": 0.752,
|
590 |
+
"3": 0.832
|
591 |
+
},
|
592 |
+
"sae_top_1_test_accuracy": {
|
593 |
+
"0": 0.578,
|
594 |
+
"1": 0.664,
|
595 |
+
"2": 0.552,
|
596 |
+
"3": 0.637
|
597 |
+
},
|
598 |
+
"sae_top_2_test_accuracy": {
|
599 |
+
"0": 0.795,
|
600 |
+
"1": 0.697,
|
601 |
+
"2": 0.673,
|
602 |
+
"3": 0.65
|
603 |
+
},
|
604 |
+
"sae_top_5_test_accuracy": {
|
605 |
+
"0": 0.822,
|
606 |
+
"1": 0.869,
|
607 |
+
"2": 0.691,
|
608 |
+
"3": 0.811
|
609 |
+
}
|
610 |
+
},
|
611 |
+
"Helsinki-NLP/europarl_results": {
|
612 |
+
"sae_test_accuracy": {
|
613 |
+
"en": 0.999000072479248,
|
614 |
+
"fr": 0.9980000257492065,
|
615 |
+
"de": 0.9970000386238098,
|
616 |
+
"es": 1.0,
|
617 |
+
"nl": 0.9980000257492065
|
618 |
+
},
|
619 |
+
"llm_test_accuracy": {
|
620 |
+
"en": 1.0,
|
621 |
+
"fr": 0.999000072479248,
|
622 |
+
"de": 1.0,
|
623 |
+
"es": 0.999000072479248,
|
624 |
+
"nl": 1.0
|
625 |
+
},
|
626 |
+
"llm_top_1_test_accuracy": {
|
627 |
+
"en": 0.739,
|
628 |
+
"fr": 0.585,
|
629 |
+
"de": 0.758,
|
630 |
+
"es": 0.496,
|
631 |
+
"nl": 0.649
|
632 |
+
},
|
633 |
+
"llm_top_2_test_accuracy": {
|
634 |
+
"en": 0.829,
|
635 |
+
"fr": 0.582,
|
636 |
+
"de": 0.82,
|
637 |
+
"es": 0.958,
|
638 |
+
"nl": 0.753
|
639 |
+
},
|
640 |
+
"llm_top_5_test_accuracy": {
|
641 |
+
"en": 0.892,
|
642 |
+
"fr": 0.888,
|
643 |
+
"de": 0.894,
|
644 |
+
"es": 0.98,
|
645 |
+
"nl": 0.852
|
646 |
+
},
|
647 |
+
"sae_top_1_test_accuracy": {
|
648 |
+
"en": 0.749,
|
649 |
+
"fr": 0.991,
|
650 |
+
"de": 0.932,
|
651 |
+
"es": 0.99,
|
652 |
+
"nl": 0.639
|
653 |
+
},
|
654 |
+
"sae_top_2_test_accuracy": {
|
655 |
+
"en": 0.995,
|
656 |
+
"fr": 0.994,
|
657 |
+
"de": 0.988,
|
658 |
+
"es": 0.99,
|
659 |
+
"nl": 0.998
|
660 |
+
},
|
661 |
+
"sae_top_5_test_accuracy": {
|
662 |
+
"en": 0.998,
|
663 |
+
"fr": 0.997,
|
664 |
+
"de": 0.988,
|
665 |
+
"es": 0.994,
|
666 |
+
"nl": 0.998
|
667 |
+
}
|
668 |
+
}
|
669 |
+
}
|
670 |
+
}
|
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "sparse_probing",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"LabHC/bias_in_bios_class_set2",
|
8 |
+
"LabHC/bias_in_bios_class_set3",
|
9 |
+
"canrager/amazon_reviews_mcauley_1and5",
|
10 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment",
|
11 |
+
"codeparrot/github-code",
|
12 |
+
"fancyzhx/ag_news",
|
13 |
+
"Helsinki-NLP/europarl"
|
14 |
+
],
|
15 |
+
"probe_train_set_size": 4000,
|
16 |
+
"probe_test_set_size": 1000,
|
17 |
+
"context_length": 128,
|
18 |
+
"sae_batch_size": 125,
|
19 |
+
"llm_batch_size": 32,
|
20 |
+
"llm_dtype": "bfloat16",
|
21 |
+
"model_name": "gemma-2-2b",
|
22 |
+
"k_values": [
|
23 |
+
1,
|
24 |
+
2,
|
25 |
+
5
|
26 |
+
],
|
27 |
+
"lower_vram_usage": false
|
28 |
+
},
|
29 |
+
"eval_id": "d3e0c3ec-e2e5-4d60-ae48-22cfd7d5fba2",
|
30 |
+
"datetime_epoch_millis": 1740165066678,
|
31 |
+
"eval_result_metrics": {
|
32 |
+
"llm": {
|
33 |
+
"llm_test_accuracy": 0.9595375448465346,
|
34 |
+
"llm_top_1_test_accuracy": 0.64956875,
|
35 |
+
"llm_top_2_test_accuracy": 0.72589375,
|
36 |
+
"llm_top_5_test_accuracy": 0.78265625,
|
37 |
+
"llm_top_10_test_accuracy": null,
|
38 |
+
"llm_top_20_test_accuracy": null,
|
39 |
+
"llm_top_50_test_accuracy": null,
|
40 |
+
"llm_top_100_test_accuracy": null
|
41 |
+
},
|
42 |
+
"sae": {
|
43 |
+
"sae_test_accuracy": 0.9570625454187394,
|
44 |
+
"sae_top_1_test_accuracy": 0.7210749999999999,
|
45 |
+
"sae_top_2_test_accuracy": 0.7653625,
|
46 |
+
"sae_top_5_test_accuracy": 0.8490062500000001,
|
47 |
+
"sae_top_10_test_accuracy": null,
|
48 |
+
"sae_top_20_test_accuracy": null,
|
49 |
+
"sae_top_50_test_accuracy": null,
|
50 |
+
"sae_top_100_test_accuracy": null
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"eval_result_details": [
|
54 |
+
{
|
55 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
|
56 |
+
"llm_test_accuracy": 0.966800057888031,
|
57 |
+
"llm_top_1_test_accuracy": 0.6397999999999999,
|
58 |
+
"llm_top_2_test_accuracy": 0.6954,
|
59 |
+
"llm_top_5_test_accuracy": 0.7869999999999999,
|
60 |
+
"llm_top_10_test_accuracy": null,
|
61 |
+
"llm_top_20_test_accuracy": null,
|
62 |
+
"llm_top_50_test_accuracy": null,
|
63 |
+
"llm_top_100_test_accuracy": null,
|
64 |
+
"sae_test_accuracy": 0.9642000436782837,
|
65 |
+
"sae_top_1_test_accuracy": 0.7083999999999999,
|
66 |
+
"sae_top_2_test_accuracy": 0.7636000000000001,
|
67 |
+
"sae_top_5_test_accuracy": 0.8354000000000001,
|
68 |
+
"sae_top_10_test_accuracy": null,
|
69 |
+
"sae_top_20_test_accuracy": null,
|
70 |
+
"sae_top_50_test_accuracy": null,
|
71 |
+
"sae_top_100_test_accuracy": null
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
|
75 |
+
"llm_test_accuracy": 0.9578000426292419,
|
76 |
+
"llm_top_1_test_accuracy": 0.6694000000000001,
|
77 |
+
"llm_top_2_test_accuracy": 0.725,
|
78 |
+
"llm_top_5_test_accuracy": 0.7654,
|
79 |
+
"llm_top_10_test_accuracy": null,
|
80 |
+
"llm_top_20_test_accuracy": null,
|
81 |
+
"llm_top_50_test_accuracy": null,
|
82 |
+
"llm_top_100_test_accuracy": null,
|
83 |
+
"sae_test_accuracy": 0.9512000441551208,
|
84 |
+
"sae_top_1_test_accuracy": 0.7248000000000001,
|
85 |
+
"sae_top_2_test_accuracy": 0.7267999999999999,
|
86 |
+
"sae_top_5_test_accuracy": 0.828,
|
87 |
+
"sae_top_10_test_accuracy": null,
|
88 |
+
"sae_top_20_test_accuracy": null,
|
89 |
+
"sae_top_50_test_accuracy": null,
|
90 |
+
"sae_top_100_test_accuracy": null
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
|
94 |
+
"llm_test_accuracy": 0.9316000461578369,
|
95 |
+
"llm_top_1_test_accuracy": 0.687,
|
96 |
+
"llm_top_2_test_accuracy": 0.7492,
|
97 |
+
"llm_top_5_test_accuracy": 0.7704000000000001,
|
98 |
+
"llm_top_10_test_accuracy": null,
|
99 |
+
"llm_top_20_test_accuracy": null,
|
100 |
+
"llm_top_50_test_accuracy": null,
|
101 |
+
"llm_top_100_test_accuracy": null,
|
102 |
+
"sae_test_accuracy": 0.9294000387191772,
|
103 |
+
"sae_top_1_test_accuracy": 0.7074,
|
104 |
+
"sae_top_2_test_accuracy": 0.784,
|
105 |
+
"sae_top_5_test_accuracy": 0.825,
|
106 |
+
"sae_top_10_test_accuracy": null,
|
107 |
+
"sae_top_20_test_accuracy": null,
|
108 |
+
"sae_top_50_test_accuracy": null,
|
109 |
+
"sae_top_100_test_accuracy": null
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
|
113 |
+
"llm_test_accuracy": 0.9202000379562378,
|
114 |
+
"llm_top_1_test_accuracy": 0.599,
|
115 |
+
"llm_top_2_test_accuracy": 0.6474,
|
116 |
+
"llm_top_5_test_accuracy": 0.6734,
|
117 |
+
"llm_top_10_test_accuracy": null,
|
118 |
+
"llm_top_20_test_accuracy": null,
|
119 |
+
"llm_top_50_test_accuracy": null,
|
120 |
+
"llm_top_100_test_accuracy": null,
|
121 |
+
"sae_test_accuracy": 0.9192000508308411,
|
122 |
+
"sae_top_1_test_accuracy": 0.6622,
|
123 |
+
"sae_top_2_test_accuracy": 0.6678000000000001,
|
124 |
+
"sae_top_5_test_accuracy": 0.7878000000000001,
|
125 |
+
"sae_top_10_test_accuracy": null,
|
126 |
+
"sae_top_20_test_accuracy": null,
|
127 |
+
"sae_top_50_test_accuracy": null,
|
128 |
+
"sae_top_100_test_accuracy": null
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
|
132 |
+
"llm_test_accuracy": 0.9795000553131104,
|
133 |
+
"llm_top_1_test_accuracy": 0.673,
|
134 |
+
"llm_top_2_test_accuracy": 0.724,
|
135 |
+
"llm_top_5_test_accuracy": 0.766,
|
136 |
+
"llm_top_10_test_accuracy": null,
|
137 |
+
"llm_top_20_test_accuracy": null,
|
138 |
+
"llm_top_50_test_accuracy": null,
|
139 |
+
"llm_top_100_test_accuracy": null,
|
140 |
+
"sae_test_accuracy": 0.9725000560283661,
|
141 |
+
"sae_top_1_test_accuracy": 0.6,
|
142 |
+
"sae_top_2_test_accuracy": 0.764,
|
143 |
+
"sae_top_5_test_accuracy": 0.942,
|
144 |
+
"sae_top_10_test_accuracy": null,
|
145 |
+
"sae_top_20_test_accuracy": null,
|
146 |
+
"sae_top_50_test_accuracy": null,
|
147 |
+
"sae_top_100_test_accuracy": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"dataset_name": "codeparrot/github-code_results",
|
151 |
+
"llm_test_accuracy": 0.9708000421524048,
|
152 |
+
"llm_top_1_test_accuracy": 0.6451999999999999,
|
153 |
+
"llm_top_2_test_accuracy": 0.6960000000000001,
|
154 |
+
"llm_top_5_test_accuracy": 0.7766,
|
155 |
+
"llm_top_10_test_accuracy": null,
|
156 |
+
"llm_top_20_test_accuracy": null,
|
157 |
+
"llm_top_50_test_accuracy": null,
|
158 |
+
"llm_top_100_test_accuracy": null,
|
159 |
+
"sae_test_accuracy": 0.968600058555603,
|
160 |
+
"sae_top_1_test_accuracy": 0.6384000000000001,
|
161 |
+
"sae_top_2_test_accuracy": 0.643,
|
162 |
+
"sae_top_5_test_accuracy": 0.7448,
|
163 |
+
"sae_top_10_test_accuracy": null,
|
164 |
+
"sae_top_20_test_accuracy": null,
|
165 |
+
"sae_top_50_test_accuracy": null,
|
166 |
+
"sae_top_100_test_accuracy": null
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"dataset_name": "fancyzhx/ag_news_results",
|
170 |
+
"llm_test_accuracy": 0.9500000476837158,
|
171 |
+
"llm_top_1_test_accuracy": 0.63775,
|
172 |
+
"llm_top_2_test_accuracy": 0.78175,
|
173 |
+
"llm_top_5_test_accuracy": 0.82125,
|
174 |
+
"llm_top_10_test_accuracy": null,
|
175 |
+
"llm_top_20_test_accuracy": null,
|
176 |
+
"llm_top_50_test_accuracy": null,
|
177 |
+
"llm_top_100_test_accuracy": null,
|
178 |
+
"sae_test_accuracy": 0.9520000517368317,
|
179 |
+
"sae_top_1_test_accuracy": 0.77,
|
180 |
+
"sae_top_2_test_accuracy": 0.7875000000000001,
|
181 |
+
"sae_top_5_test_accuracy": 0.8322499999999999,
|
182 |
+
"sae_top_10_test_accuracy": null,
|
183 |
+
"sae_top_20_test_accuracy": null,
|
184 |
+
"sae_top_50_test_accuracy": null,
|
185 |
+
"sae_top_100_test_accuracy": null
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"dataset_name": "Helsinki-NLP/europarl_results",
|
189 |
+
"llm_test_accuracy": 0.9996000289916992,
|
190 |
+
"llm_top_1_test_accuracy": 0.6454,
|
191 |
+
"llm_top_2_test_accuracy": 0.7884,
|
192 |
+
"llm_top_5_test_accuracy": 0.9012,
|
193 |
+
"llm_top_10_test_accuracy": null,
|
194 |
+
"llm_top_20_test_accuracy": null,
|
195 |
+
"llm_top_50_test_accuracy": null,
|
196 |
+
"llm_top_100_test_accuracy": null,
|
197 |
+
"sae_test_accuracy": 0.9994000196456909,
|
198 |
+
"sae_top_1_test_accuracy": 0.9574,
|
199 |
+
"sae_top_2_test_accuracy": 0.9862,
|
200 |
+
"sae_top_5_test_accuracy": 0.9968,
|
201 |
+
"sae_top_10_test_accuracy": null,
|
202 |
+
"sae_top_20_test_accuracy": null,
|
203 |
+
"sae_top_50_test_accuracy": null,
|
204 |
+
"sae_top_100_test_accuracy": null
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
208 |
+
"sae_lens_id": "custom_sae",
|
209 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0",
|
210 |
+
"sae_lens_version": "5.4.2",
|
211 |
+
"sae_cfg_dict": {
|
212 |
+
"model_name": "gemma-2-2b",
|
213 |
+
"d_in": 2304,
|
214 |
+
"d_sae": 65536,
|
215 |
+
"hook_layer": 12,
|
216 |
+
"hook_name": "blocks.12.hook_resid_post",
|
217 |
+
"context_size": null,
|
218 |
+
"hook_head_index": null,
|
219 |
+
"architecture": "topk",
|
220 |
+
"apply_b_dec_to_input": null,
|
221 |
+
"finetuning_scaling_factor": null,
|
222 |
+
"activation_fn_str": "",
|
223 |
+
"prepend_bos": true,
|
224 |
+
"normalize_activations": "none",
|
225 |
+
"dtype": "bfloat16",
|
226 |
+
"device": "",
|
227 |
+
"dataset_path": "",
|
228 |
+
"dataset_trust_remote_code": true,
|
229 |
+
"seqpos_slice": [
|
230 |
+
null
|
231 |
+
],
|
232 |
+
"training_tokens": -100000,
|
233 |
+
"sae_lens_training_version": null,
|
234 |
+
"neuronpedia_id": null
|
235 |
+
},
|
236 |
+
"eval_result_unstructured": {
|
237 |
+
"LabHC/bias_in_bios_class_set1_results": {
|
238 |
+
"sae_test_accuracy": {
|
239 |
+
"0": 0.9480000734329224,
|
240 |
+
"1": 0.9660000205039978,
|
241 |
+
"2": 0.9480000734329224,
|
242 |
+
"6": 0.984000027179718,
|
243 |
+
"9": 0.9750000238418579
|
244 |
+
},
|
245 |
+
"llm_test_accuracy": {
|
246 |
+
"0": 0.9510000348091125,
|
247 |
+
"1": 0.9670000672340393,
|
248 |
+
"2": 0.9530000686645508,
|
249 |
+
"6": 0.987000048160553,
|
250 |
+
"9": 0.9760000705718994
|
251 |
+
},
|
252 |
+
"llm_top_1_test_accuracy": {
|
253 |
+
"0": 0.577,
|
254 |
+
"1": 0.613,
|
255 |
+
"2": 0.662,
|
256 |
+
"6": 0.787,
|
257 |
+
"9": 0.56
|
258 |
+
},
|
259 |
+
"llm_top_2_test_accuracy": {
|
260 |
+
"0": 0.574,
|
261 |
+
"1": 0.66,
|
262 |
+
"2": 0.718,
|
263 |
+
"6": 0.811,
|
264 |
+
"9": 0.714
|
265 |
+
},
|
266 |
+
"llm_top_5_test_accuracy": {
|
267 |
+
"0": 0.713,
|
268 |
+
"1": 0.711,
|
269 |
+
"2": 0.755,
|
270 |
+
"6": 0.895,
|
271 |
+
"9": 0.861
|
272 |
+
},
|
273 |
+
"sae_top_1_test_accuracy": {
|
274 |
+
"0": 0.588,
|
275 |
+
"1": 0.612,
|
276 |
+
"2": 0.887,
|
277 |
+
"6": 0.751,
|
278 |
+
"9": 0.704
|
279 |
+
},
|
280 |
+
"sae_top_2_test_accuracy": {
|
281 |
+
"0": 0.627,
|
282 |
+
"1": 0.621,
|
283 |
+
"2": 0.881,
|
284 |
+
"6": 0.766,
|
285 |
+
"9": 0.923
|
286 |
+
},
|
287 |
+
"sae_top_5_test_accuracy": {
|
288 |
+
"0": 0.703,
|
289 |
+
"1": 0.7,
|
290 |
+
"2": 0.878,
|
291 |
+
"6": 0.971,
|
292 |
+
"9": 0.925
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"LabHC/bias_in_bios_class_set2_results": {
|
296 |
+
"sae_test_accuracy": {
|
297 |
+
"11": 0.9580000638961792,
|
298 |
+
"13": 0.9510000348091125,
|
299 |
+
"14": 0.9530000686645508,
|
300 |
+
"18": 0.9290000200271606,
|
301 |
+
"19": 0.9650000333786011
|
302 |
+
},
|
303 |
+
"llm_test_accuracy": {
|
304 |
+
"11": 0.9690000414848328,
|
305 |
+
"13": 0.9600000381469727,
|
306 |
+
"14": 0.9600000381469727,
|
307 |
+
"18": 0.9390000700950623,
|
308 |
+
"19": 0.9610000252723694
|
309 |
+
},
|
310 |
+
"llm_top_1_test_accuracy": {
|
311 |
+
"11": 0.555,
|
312 |
+
"13": 0.668,
|
313 |
+
"14": 0.638,
|
314 |
+
"18": 0.69,
|
315 |
+
"19": 0.796
|
316 |
+
},
|
317 |
+
"llm_top_2_test_accuracy": {
|
318 |
+
"11": 0.756,
|
319 |
+
"13": 0.714,
|
320 |
+
"14": 0.67,
|
321 |
+
"18": 0.717,
|
322 |
+
"19": 0.768
|
323 |
+
},
|
324 |
+
"llm_top_5_test_accuracy": {
|
325 |
+
"11": 0.794,
|
326 |
+
"13": 0.749,
|
327 |
+
"14": 0.723,
|
328 |
+
"18": 0.73,
|
329 |
+
"19": 0.831
|
330 |
+
},
|
331 |
+
"sae_top_1_test_accuracy": {
|
332 |
+
"11": 0.849,
|
333 |
+
"13": 0.677,
|
334 |
+
"14": 0.638,
|
335 |
+
"18": 0.627,
|
336 |
+
"19": 0.833
|
337 |
+
},
|
338 |
+
"sae_top_2_test_accuracy": {
|
339 |
+
"11": 0.849,
|
340 |
+
"13": 0.684,
|
341 |
+
"14": 0.614,
|
342 |
+
"18": 0.666,
|
343 |
+
"19": 0.821
|
344 |
+
},
|
345 |
+
"sae_top_5_test_accuracy": {
|
346 |
+
"11": 0.921,
|
347 |
+
"13": 0.762,
|
348 |
+
"14": 0.874,
|
349 |
+
"18": 0.761,
|
350 |
+
"19": 0.822
|
351 |
+
}
|
352 |
+
},
|
353 |
+
"LabHC/bias_in_bios_class_set3_results": {
|
354 |
+
"sae_test_accuracy": {
|
355 |
+
"20": 0.9520000219345093,
|
356 |
+
"21": 0.9220000505447388,
|
357 |
+
"22": 0.9120000600814819,
|
358 |
+
"25": 0.9610000252723694,
|
359 |
+
"26": 0.9000000357627869
|
360 |
+
},
|
361 |
+
"llm_test_accuracy": {
|
362 |
+
"20": 0.956000030040741,
|
363 |
+
"21": 0.9350000619888306,
|
364 |
+
"22": 0.9180000424385071,
|
365 |
+
"25": 0.9640000462532043,
|
366 |
+
"26": 0.8850000500679016
|
367 |
+
},
|
368 |
+
"llm_top_1_test_accuracy": {
|
369 |
+
"20": 0.693,
|
370 |
+
"21": 0.775,
|
371 |
+
"22": 0.645,
|
372 |
+
"25": 0.706,
|
373 |
+
"26": 0.616
|
374 |
+
},
|
375 |
+
"llm_top_2_test_accuracy": {
|
376 |
+
"20": 0.827,
|
377 |
+
"21": 0.761,
|
378 |
+
"22": 0.694,
|
379 |
+
"25": 0.778,
|
380 |
+
"26": 0.686
|
381 |
+
},
|
382 |
+
"llm_top_5_test_accuracy": {
|
383 |
+
"20": 0.855,
|
384 |
+
"21": 0.791,
|
385 |
+
"22": 0.725,
|
386 |
+
"25": 0.809,
|
387 |
+
"26": 0.672
|
388 |
+
},
|
389 |
+
"sae_top_1_test_accuracy": {
|
390 |
+
"20": 0.814,
|
391 |
+
"21": 0.618,
|
392 |
+
"22": 0.82,
|
393 |
+
"25": 0.664,
|
394 |
+
"26": 0.621
|
395 |
+
},
|
396 |
+
"sae_top_2_test_accuracy": {
|
397 |
+
"20": 0.841,
|
398 |
+
"21": 0.779,
|
399 |
+
"22": 0.817,
|
400 |
+
"25": 0.86,
|
401 |
+
"26": 0.623
|
402 |
+
},
|
403 |
+
"sae_top_5_test_accuracy": {
|
404 |
+
"20": 0.932,
|
405 |
+
"21": 0.8,
|
406 |
+
"22": 0.859,
|
407 |
+
"25": 0.86,
|
408 |
+
"26": 0.674
|
409 |
+
}
|
410 |
+
},
|
411 |
+
"canrager/amazon_reviews_mcauley_1and5_results": {
|
412 |
+
"sae_test_accuracy": {
|
413 |
+
"1": 0.9480000734329224,
|
414 |
+
"2": 0.9380000233650208,
|
415 |
+
"3": 0.9190000295639038,
|
416 |
+
"5": 0.921000063419342,
|
417 |
+
"6": 0.8700000643730164
|
418 |
+
},
|
419 |
+
"llm_test_accuracy": {
|
420 |
+
"1": 0.9580000638961792,
|
421 |
+
"2": 0.9330000281333923,
|
422 |
+
"3": 0.9280000329017639,
|
423 |
+
"5": 0.9200000166893005,
|
424 |
+
"6": 0.862000048160553
|
425 |
+
},
|
426 |
+
"llm_top_1_test_accuracy": {
|
427 |
+
"1": 0.647,
|
428 |
+
"2": 0.603,
|
429 |
+
"3": 0.598,
|
430 |
+
"5": 0.555,
|
431 |
+
"6": 0.592
|
432 |
+
},
|
433 |
+
"llm_top_2_test_accuracy": {
|
434 |
+
"1": 0.75,
|
435 |
+
"2": 0.648,
|
436 |
+
"3": 0.607,
|
437 |
+
"5": 0.606,
|
438 |
+
"6": 0.626
|
439 |
+
},
|
440 |
+
"llm_top_5_test_accuracy": {
|
441 |
+
"1": 0.767,
|
442 |
+
"2": 0.641,
|
443 |
+
"3": 0.645,
|
444 |
+
"5": 0.638,
|
445 |
+
"6": 0.676
|
446 |
+
},
|
447 |
+
"sae_top_1_test_accuracy": {
|
448 |
+
"1": 0.82,
|
449 |
+
"2": 0.647,
|
450 |
+
"3": 0.557,
|
451 |
+
"5": 0.547,
|
452 |
+
"6": 0.74
|
453 |
+
},
|
454 |
+
"sae_top_2_test_accuracy": {
|
455 |
+
"1": 0.817,
|
456 |
+
"2": 0.639,
|
457 |
+
"3": 0.584,
|
458 |
+
"5": 0.555,
|
459 |
+
"6": 0.744
|
460 |
+
},
|
461 |
+
"sae_top_5_test_accuracy": {
|
462 |
+
"1": 0.87,
|
463 |
+
"2": 0.872,
|
464 |
+
"3": 0.641,
|
465 |
+
"5": 0.8,
|
466 |
+
"6": 0.756
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
|
470 |
+
"sae_test_accuracy": {
|
471 |
+
"1.0": 0.9730000495910645,
|
472 |
+
"5.0": 0.9720000624656677
|
473 |
+
},
|
474 |
+
"llm_test_accuracy": {
|
475 |
+
"1.0": 0.9780000448226929,
|
476 |
+
"5.0": 0.9810000658035278
|
477 |
+
},
|
478 |
+
"llm_top_1_test_accuracy": {
|
479 |
+
"1.0": 0.673,
|
480 |
+
"5.0": 0.673
|
481 |
+
},
|
482 |
+
"llm_top_2_test_accuracy": {
|
483 |
+
"1.0": 0.724,
|
484 |
+
"5.0": 0.724
|
485 |
+
},
|
486 |
+
"llm_top_5_test_accuracy": {
|
487 |
+
"1.0": 0.766,
|
488 |
+
"5.0": 0.766
|
489 |
+
},
|
490 |
+
"sae_top_1_test_accuracy": {
|
491 |
+
"1.0": 0.6,
|
492 |
+
"5.0": 0.6
|
493 |
+
},
|
494 |
+
"sae_top_2_test_accuracy": {
|
495 |
+
"1.0": 0.764,
|
496 |
+
"5.0": 0.764
|
497 |
+
},
|
498 |
+
"sae_top_5_test_accuracy": {
|
499 |
+
"1.0": 0.942,
|
500 |
+
"5.0": 0.942
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"codeparrot/github-code_results": {
|
504 |
+
"sae_test_accuracy": {
|
505 |
+
"C": 0.9590000510215759,
|
506 |
+
"Python": 0.9860000610351562,
|
507 |
+
"HTML": 0.9820000529289246,
|
508 |
+
"Java": 0.9630000591278076,
|
509 |
+
"PHP": 0.9530000686645508
|
510 |
+
},
|
511 |
+
"llm_test_accuracy": {
|
512 |
+
"C": 0.956000030040741,
|
513 |
+
"Python": 0.984000027179718,
|
514 |
+
"HTML": 0.9900000691413879,
|
515 |
+
"Java": 0.9670000672340393,
|
516 |
+
"PHP": 0.9570000171661377
|
517 |
+
},
|
518 |
+
"llm_top_1_test_accuracy": {
|
519 |
+
"C": 0.666,
|
520 |
+
"Python": 0.626,
|
521 |
+
"HTML": 0.721,
|
522 |
+
"Java": 0.619,
|
523 |
+
"PHP": 0.594
|
524 |
+
},
|
525 |
+
"llm_top_2_test_accuracy": {
|
526 |
+
"C": 0.679,
|
527 |
+
"Python": 0.674,
|
528 |
+
"HTML": 0.8,
|
529 |
+
"Java": 0.676,
|
530 |
+
"PHP": 0.651
|
531 |
+
},
|
532 |
+
"llm_top_5_test_accuracy": {
|
533 |
+
"C": 0.783,
|
534 |
+
"Python": 0.717,
|
535 |
+
"HTML": 0.935,
|
536 |
+
"Java": 0.733,
|
537 |
+
"PHP": 0.715
|
538 |
+
},
|
539 |
+
"sae_top_1_test_accuracy": {
|
540 |
+
"C": 0.607,
|
541 |
+
"Python": 0.648,
|
542 |
+
"HTML": 0.689,
|
543 |
+
"Java": 0.633,
|
544 |
+
"PHP": 0.615
|
545 |
+
},
|
546 |
+
"sae_top_2_test_accuracy": {
|
547 |
+
"C": 0.601,
|
548 |
+
"Python": 0.685,
|
549 |
+
"HTML": 0.674,
|
550 |
+
"Java": 0.638,
|
551 |
+
"PHP": 0.617
|
552 |
+
},
|
553 |
+
"sae_top_5_test_accuracy": {
|
554 |
+
"C": 0.607,
|
555 |
+
"Python": 0.696,
|
556 |
+
"HTML": 0.879,
|
557 |
+
"Java": 0.656,
|
558 |
+
"PHP": 0.886
|
559 |
+
}
|
560 |
+
},
|
561 |
+
"fancyzhx/ag_news_results": {
|
562 |
+
"sae_test_accuracy": {
|
563 |
+
"0": 0.940000057220459,
|
564 |
+
"1": 0.9850000739097595,
|
565 |
+
"2": 0.9310000538825989,
|
566 |
+
"3": 0.9520000219345093
|
567 |
+
},
|
568 |
+
"llm_test_accuracy": {
|
569 |
+
"0": 0.940000057220459,
|
570 |
+
"1": 0.9860000610351562,
|
571 |
+
"2": 0.9200000166893005,
|
572 |
+
"3": 0.9540000557899475
|
573 |
+
},
|
574 |
+
"llm_top_1_test_accuracy": {
|
575 |
+
"0": 0.573,
|
576 |
+
"1": 0.671,
|
577 |
+
"2": 0.672,
|
578 |
+
"3": 0.635
|
579 |
+
},
|
580 |
+
"llm_top_2_test_accuracy": {
|
581 |
+
"0": 0.802,
|
582 |
+
"1": 0.808,
|
583 |
+
"2": 0.701,
|
584 |
+
"3": 0.816
|
585 |
+
},
|
586 |
+
"llm_top_5_test_accuracy": {
|
587 |
+
"0": 0.81,
|
588 |
+
"1": 0.891,
|
589 |
+
"2": 0.752,
|
590 |
+
"3": 0.832
|
591 |
+
},
|
592 |
+
"sae_top_1_test_accuracy": {
|
593 |
+
"0": 0.661,
|
594 |
+
"1": 0.948,
|
595 |
+
"2": 0.815,
|
596 |
+
"3": 0.656
|
597 |
+
},
|
598 |
+
"sae_top_2_test_accuracy": {
|
599 |
+
"0": 0.658,
|
600 |
+
"1": 0.958,
|
601 |
+
"2": 0.83,
|
602 |
+
"3": 0.704
|
603 |
+
},
|
604 |
+
"sae_top_5_test_accuracy": {
|
605 |
+
"0": 0.764,
|
606 |
+
"1": 0.96,
|
607 |
+
"2": 0.83,
|
608 |
+
"3": 0.775
|
609 |
+
}
|
610 |
+
},
|
611 |
+
"Helsinki-NLP/europarl_results": {
|
612 |
+
"sae_test_accuracy": {
|
613 |
+
"en": 0.9980000257492065,
|
614 |
+
"fr": 1.0,
|
615 |
+
"de": 1.0,
|
616 |
+
"es": 1.0,
|
617 |
+
"nl": 0.999000072479248
|
618 |
+
},
|
619 |
+
"llm_test_accuracy": {
|
620 |
+
"en": 1.0,
|
621 |
+
"fr": 0.999000072479248,
|
622 |
+
"de": 1.0,
|
623 |
+
"es": 0.999000072479248,
|
624 |
+
"nl": 1.0
|
625 |
+
},
|
626 |
+
"llm_top_1_test_accuracy": {
|
627 |
+
"en": 0.739,
|
628 |
+
"fr": 0.585,
|
629 |
+
"de": 0.758,
|
630 |
+
"es": 0.496,
|
631 |
+
"nl": 0.649
|
632 |
+
},
|
633 |
+
"llm_top_2_test_accuracy": {
|
634 |
+
"en": 0.829,
|
635 |
+
"fr": 0.582,
|
636 |
+
"de": 0.82,
|
637 |
+
"es": 0.958,
|
638 |
+
"nl": 0.753
|
639 |
+
},
|
640 |
+
"llm_top_5_test_accuracy": {
|
641 |
+
"en": 0.892,
|
642 |
+
"fr": 0.888,
|
643 |
+
"de": 0.894,
|
644 |
+
"es": 0.98,
|
645 |
+
"nl": 0.852
|
646 |
+
},
|
647 |
+
"sae_top_1_test_accuracy": {
|
648 |
+
"en": 0.998,
|
649 |
+
"fr": 0.84,
|
650 |
+
"de": 0.96,
|
651 |
+
"es": 0.992,
|
652 |
+
"nl": 0.997
|
653 |
+
},
|
654 |
+
"sae_top_2_test_accuracy": {
|
655 |
+
"en": 0.997,
|
656 |
+
"fr": 0.989,
|
657 |
+
"de": 0.956,
|
658 |
+
"es": 0.99,
|
659 |
+
"nl": 0.999
|
660 |
+
},
|
661 |
+
"sae_top_5_test_accuracy": {
|
662 |
+
"en": 0.999,
|
663 |
+
"fr": 0.993,
|
664 |
+
"de": 0.998,
|
665 |
+
"es": 0.995,
|
666 |
+
"nl": 0.999
|
667 |
+
}
|
668 |
+
}
|
669 |
+
}
|
670 |
+
}
|
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "sparse_probing",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"LabHC/bias_in_bios_class_set2",
|
8 |
+
"LabHC/bias_in_bios_class_set3",
|
9 |
+
"canrager/amazon_reviews_mcauley_1and5",
|
10 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment",
|
11 |
+
"codeparrot/github-code",
|
12 |
+
"fancyzhx/ag_news",
|
13 |
+
"Helsinki-NLP/europarl"
|
14 |
+
],
|
15 |
+
"probe_train_set_size": 4000,
|
16 |
+
"probe_test_set_size": 1000,
|
17 |
+
"context_length": 128,
|
18 |
+
"sae_batch_size": 125,
|
19 |
+
"llm_batch_size": 32,
|
20 |
+
"llm_dtype": "bfloat16",
|
21 |
+
"model_name": "gemma-2-2b",
|
22 |
+
"k_values": [
|
23 |
+
1,
|
24 |
+
2,
|
25 |
+
5
|
26 |
+
],
|
27 |
+
"lower_vram_usage": false
|
28 |
+
},
|
29 |
+
"eval_id": "5982f843-09ae-423f-897e-88fdc5e9b765",
|
30 |
+
"datetime_epoch_millis": 1740165484719,
|
31 |
+
"eval_result_metrics": {
|
32 |
+
"llm": {
|
33 |
+
"llm_test_accuracy": 0.9595375448465346,
|
34 |
+
"llm_top_1_test_accuracy": 0.64956875,
|
35 |
+
"llm_top_2_test_accuracy": 0.72589375,
|
36 |
+
"llm_top_5_test_accuracy": 0.78265625,
|
37 |
+
"llm_top_10_test_accuracy": null,
|
38 |
+
"llm_top_20_test_accuracy": null,
|
39 |
+
"llm_top_50_test_accuracy": null,
|
40 |
+
"llm_top_100_test_accuracy": null
|
41 |
+
},
|
42 |
+
"sae": {
|
43 |
+
"sae_test_accuracy": 0.9565250385552645,
|
44 |
+
"sae_top_1_test_accuracy": 0.70693125,
|
45 |
+
"sae_top_2_test_accuracy": 0.8177062500000001,
|
46 |
+
"sae_top_5_test_accuracy": 0.8643937500000001,
|
47 |
+
"sae_top_10_test_accuracy": null,
|
48 |
+
"sae_top_20_test_accuracy": null,
|
49 |
+
"sae_top_50_test_accuracy": null,
|
50 |
+
"sae_top_100_test_accuracy": null
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"eval_result_details": [
|
54 |
+
{
|
55 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
|
56 |
+
"llm_test_accuracy": 0.966800057888031,
|
57 |
+
"llm_top_1_test_accuracy": 0.6397999999999999,
|
58 |
+
"llm_top_2_test_accuracy": 0.6954,
|
59 |
+
"llm_top_5_test_accuracy": 0.7869999999999999,
|
60 |
+
"llm_top_10_test_accuracy": null,
|
61 |
+
"llm_top_20_test_accuracy": null,
|
62 |
+
"llm_top_50_test_accuracy": null,
|
63 |
+
"llm_top_100_test_accuracy": null,
|
64 |
+
"sae_test_accuracy": 0.9606000423431397,
|
65 |
+
"sae_top_1_test_accuracy": 0.677,
|
66 |
+
"sae_top_2_test_accuracy": 0.8896000000000001,
|
67 |
+
"sae_top_5_test_accuracy": 0.9054,
|
68 |
+
"sae_top_10_test_accuracy": null,
|
69 |
+
"sae_top_20_test_accuracy": null,
|
70 |
+
"sae_top_50_test_accuracy": null,
|
71 |
+
"sae_top_100_test_accuracy": null
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
|
75 |
+
"llm_test_accuracy": 0.9578000426292419,
|
76 |
+
"llm_top_1_test_accuracy": 0.6694000000000001,
|
77 |
+
"llm_top_2_test_accuracy": 0.725,
|
78 |
+
"llm_top_5_test_accuracy": 0.7654,
|
79 |
+
"llm_top_10_test_accuracy": null,
|
80 |
+
"llm_top_20_test_accuracy": null,
|
81 |
+
"llm_top_50_test_accuracy": null,
|
82 |
+
"llm_top_100_test_accuracy": null,
|
83 |
+
"sae_test_accuracy": 0.9488000273704529,
|
84 |
+
"sae_top_1_test_accuracy": 0.679,
|
85 |
+
"sae_top_2_test_accuracy": 0.758,
|
86 |
+
"sae_top_5_test_accuracy": 0.8614,
|
87 |
+
"sae_top_10_test_accuracy": null,
|
88 |
+
"sae_top_20_test_accuracy": null,
|
89 |
+
"sae_top_50_test_accuracy": null,
|
90 |
+
"sae_top_100_test_accuracy": null
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
|
94 |
+
"llm_test_accuracy": 0.9316000461578369,
|
95 |
+
"llm_top_1_test_accuracy": 0.687,
|
96 |
+
"llm_top_2_test_accuracy": 0.7492,
|
97 |
+
"llm_top_5_test_accuracy": 0.7704000000000001,
|
98 |
+
"llm_top_10_test_accuracy": null,
|
99 |
+
"llm_top_20_test_accuracy": null,
|
100 |
+
"llm_top_50_test_accuracy": null,
|
101 |
+
"llm_top_100_test_accuracy": null,
|
102 |
+
"sae_test_accuracy": 0.929200041294098,
|
103 |
+
"sae_top_1_test_accuracy": 0.7186,
|
104 |
+
"sae_top_2_test_accuracy": 0.8170000000000002,
|
105 |
+
"sae_top_5_test_accuracy": 0.8568,
|
106 |
+
"sae_top_10_test_accuracy": null,
|
107 |
+
"sae_top_20_test_accuracy": null,
|
108 |
+
"sae_top_50_test_accuracy": null,
|
109 |
+
"sae_top_100_test_accuracy": null
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
|
113 |
+
"llm_test_accuracy": 0.9202000379562378,
|
114 |
+
"llm_top_1_test_accuracy": 0.599,
|
115 |
+
"llm_top_2_test_accuracy": 0.6474,
|
116 |
+
"llm_top_5_test_accuracy": 0.6734,
|
117 |
+
"llm_top_10_test_accuracy": null,
|
118 |
+
"llm_top_20_test_accuracy": null,
|
119 |
+
"llm_top_50_test_accuracy": null,
|
120 |
+
"llm_top_100_test_accuracy": null,
|
121 |
+
"sae_test_accuracy": 0.9192000389099121,
|
122 |
+
"sae_top_1_test_accuracy": 0.6487999999999999,
|
123 |
+
"sae_top_2_test_accuracy": 0.7392,
|
124 |
+
"sae_top_5_test_accuracy": 0.7898,
|
125 |
+
"sae_top_10_test_accuracy": null,
|
126 |
+
"sae_top_20_test_accuracy": null,
|
127 |
+
"sae_top_50_test_accuracy": null,
|
128 |
+
"sae_top_100_test_accuracy": null
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
|
132 |
+
"llm_test_accuracy": 0.9795000553131104,
|
133 |
+
"llm_top_1_test_accuracy": 0.673,
|
134 |
+
"llm_top_2_test_accuracy": 0.724,
|
135 |
+
"llm_top_5_test_accuracy": 0.766,
|
136 |
+
"llm_top_10_test_accuracy": null,
|
137 |
+
"llm_top_20_test_accuracy": null,
|
138 |
+
"llm_top_50_test_accuracy": null,
|
139 |
+
"llm_top_100_test_accuracy": null,
|
140 |
+
"sae_test_accuracy": 0.9725000560283661,
|
141 |
+
"sae_top_1_test_accuracy": 0.885,
|
142 |
+
"sae_top_2_test_accuracy": 0.889,
|
143 |
+
"sae_top_5_test_accuracy": 0.931,
|
144 |
+
"sae_top_10_test_accuracy": null,
|
145 |
+
"sae_top_20_test_accuracy": null,
|
146 |
+
"sae_top_50_test_accuracy": null,
|
147 |
+
"sae_top_100_test_accuracy": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"dataset_name": "codeparrot/github-code_results",
|
151 |
+
"llm_test_accuracy": 0.9708000421524048,
|
152 |
+
"llm_top_1_test_accuracy": 0.6451999999999999,
|
153 |
+
"llm_top_2_test_accuracy": 0.6960000000000001,
|
154 |
+
"llm_top_5_test_accuracy": 0.7766,
|
155 |
+
"llm_top_10_test_accuracy": null,
|
156 |
+
"llm_top_20_test_accuracy": null,
|
157 |
+
"llm_top_50_test_accuracy": null,
|
158 |
+
"llm_top_100_test_accuracy": null,
|
159 |
+
"sae_test_accuracy": 0.9698000431060791,
|
160 |
+
"sae_top_1_test_accuracy": 0.5916,
|
161 |
+
"sae_top_2_test_accuracy": 0.756,
|
162 |
+
"sae_top_5_test_accuracy": 0.8038000000000001,
|
163 |
+
"sae_top_10_test_accuracy": null,
|
164 |
+
"sae_top_20_test_accuracy": null,
|
165 |
+
"sae_top_50_test_accuracy": null,
|
166 |
+
"sae_top_100_test_accuracy": null
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"dataset_name": "fancyzhx/ag_news_results",
|
170 |
+
"llm_test_accuracy": 0.9500000476837158,
|
171 |
+
"llm_top_1_test_accuracy": 0.63775,
|
172 |
+
"llm_top_2_test_accuracy": 0.78175,
|
173 |
+
"llm_top_5_test_accuracy": 0.82125,
|
174 |
+
"llm_top_10_test_accuracy": null,
|
175 |
+
"llm_top_20_test_accuracy": null,
|
176 |
+
"llm_top_50_test_accuracy": null,
|
177 |
+
"llm_top_100_test_accuracy": null,
|
178 |
+
"sae_test_accuracy": 0.9525000303983688,
|
179 |
+
"sae_top_1_test_accuracy": 0.61025,
|
180 |
+
"sae_top_2_test_accuracy": 0.7112499999999999,
|
181 |
+
"sae_top_5_test_accuracy": 0.7747499999999999,
|
182 |
+
"sae_top_10_test_accuracy": null,
|
183 |
+
"sae_top_20_test_accuracy": null,
|
184 |
+
"sae_top_50_test_accuracy": null,
|
185 |
+
"sae_top_100_test_accuracy": null
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"dataset_name": "Helsinki-NLP/europarl_results",
|
189 |
+
"llm_test_accuracy": 0.9996000289916992,
|
190 |
+
"llm_top_1_test_accuracy": 0.6454,
|
191 |
+
"llm_top_2_test_accuracy": 0.7884,
|
192 |
+
"llm_top_5_test_accuracy": 0.9012,
|
193 |
+
"llm_top_10_test_accuracy": null,
|
194 |
+
"llm_top_20_test_accuracy": null,
|
195 |
+
"llm_top_50_test_accuracy": null,
|
196 |
+
"llm_top_100_test_accuracy": null,
|
197 |
+
"sae_test_accuracy": 0.9996000289916992,
|
198 |
+
"sae_top_1_test_accuracy": 0.8452,
|
199 |
+
"sae_top_2_test_accuracy": 0.9816,
|
200 |
+
"sae_top_5_test_accuracy": 0.9922000000000001,
|
201 |
+
"sae_top_10_test_accuracy": null,
|
202 |
+
"sae_top_20_test_accuracy": null,
|
203 |
+
"sae_top_50_test_accuracy": null,
|
204 |
+
"sae_top_100_test_accuracy": null
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
208 |
+
"sae_lens_id": "custom_sae",
|
209 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0",
|
210 |
+
"sae_lens_version": "5.4.2",
|
211 |
+
"sae_cfg_dict": {
|
212 |
+
"model_name": "gemma-2-2b",
|
213 |
+
"d_in": 2304,
|
214 |
+
"d_sae": 65536,
|
215 |
+
"hook_layer": 12,
|
216 |
+
"hook_name": "blocks.12.hook_resid_post",
|
217 |
+
"context_size": null,
|
218 |
+
"hook_head_index": null,
|
219 |
+
"architecture": "topk",
|
220 |
+
"apply_b_dec_to_input": null,
|
221 |
+
"finetuning_scaling_factor": null,
|
222 |
+
"activation_fn_str": "",
|
223 |
+
"prepend_bos": true,
|
224 |
+
"normalize_activations": "none",
|
225 |
+
"dtype": "bfloat16",
|
226 |
+
"device": "",
|
227 |
+
"dataset_path": "",
|
228 |
+
"dataset_trust_remote_code": true,
|
229 |
+
"seqpos_slice": [
|
230 |
+
null
|
231 |
+
],
|
232 |
+
"training_tokens": -100000,
|
233 |
+
"sae_lens_training_version": null,
|
234 |
+
"neuronpedia_id": null
|
235 |
+
},
|
236 |
+
"eval_result_unstructured": {
|
237 |
+
"LabHC/bias_in_bios_class_set1_results": {
|
238 |
+
"sae_test_accuracy": {
|
239 |
+
"0": 0.9420000314712524,
|
240 |
+
"1": 0.9530000686645508,
|
241 |
+
"2": 0.9470000267028809,
|
242 |
+
"6": 0.984000027179718,
|
243 |
+
"9": 0.9770000576972961
|
244 |
+
},
|
245 |
+
"llm_test_accuracy": {
|
246 |
+
"0": 0.9510000348091125,
|
247 |
+
"1": 0.9670000672340393,
|
248 |
+
"2": 0.9530000686645508,
|
249 |
+
"6": 0.987000048160553,
|
250 |
+
"9": 0.9760000705718994
|
251 |
+
},
|
252 |
+
"llm_top_1_test_accuracy": {
|
253 |
+
"0": 0.577,
|
254 |
+
"1": 0.613,
|
255 |
+
"2": 0.662,
|
256 |
+
"6": 0.787,
|
257 |
+
"9": 0.56
|
258 |
+
},
|
259 |
+
"llm_top_2_test_accuracy": {
|
260 |
+
"0": 0.574,
|
261 |
+
"1": 0.66,
|
262 |
+
"2": 0.718,
|
263 |
+
"6": 0.811,
|
264 |
+
"9": 0.714
|
265 |
+
},
|
266 |
+
"llm_top_5_test_accuracy": {
|
267 |
+
"0": 0.713,
|
268 |
+
"1": 0.711,
|
269 |
+
"2": 0.755,
|
270 |
+
"6": 0.895,
|
271 |
+
"9": 0.861
|
272 |
+
},
|
273 |
+
"sae_top_1_test_accuracy": {
|
274 |
+
"0": 0.566,
|
275 |
+
"1": 0.643,
|
276 |
+
"2": 0.826,
|
277 |
+
"6": 0.797,
|
278 |
+
"9": 0.553
|
279 |
+
},
|
280 |
+
"sae_top_2_test_accuracy": {
|
281 |
+
"0": 0.868,
|
282 |
+
"1": 0.806,
|
283 |
+
"2": 0.853,
|
284 |
+
"6": 0.981,
|
285 |
+
"9": 0.94
|
286 |
+
},
|
287 |
+
"sae_top_5_test_accuracy": {
|
288 |
+
"0": 0.881,
|
289 |
+
"1": 0.853,
|
290 |
+
"2": 0.858,
|
291 |
+
"6": 0.989,
|
292 |
+
"9": 0.946
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"LabHC/bias_in_bios_class_set2_results": {
|
296 |
+
"sae_test_accuracy": {
|
297 |
+
"11": 0.9660000205039978,
|
298 |
+
"13": 0.9520000219345093,
|
299 |
+
"14": 0.9430000185966492,
|
300 |
+
"18": 0.9230000376701355,
|
301 |
+
"19": 0.9600000381469727
|
302 |
+
},
|
303 |
+
"llm_test_accuracy": {
|
304 |
+
"11": 0.9690000414848328,
|
305 |
+
"13": 0.9600000381469727,
|
306 |
+
"14": 0.9600000381469727,
|
307 |
+
"18": 0.9390000700950623,
|
308 |
+
"19": 0.9610000252723694
|
309 |
+
},
|
310 |
+
"llm_top_1_test_accuracy": {
|
311 |
+
"11": 0.555,
|
312 |
+
"13": 0.668,
|
313 |
+
"14": 0.638,
|
314 |
+
"18": 0.69,
|
315 |
+
"19": 0.796
|
316 |
+
},
|
317 |
+
"llm_top_2_test_accuracy": {
|
318 |
+
"11": 0.756,
|
319 |
+
"13": 0.714,
|
320 |
+
"14": 0.67,
|
321 |
+
"18": 0.717,
|
322 |
+
"19": 0.768
|
323 |
+
},
|
324 |
+
"llm_top_5_test_accuracy": {
|
325 |
+
"11": 0.794,
|
326 |
+
"13": 0.749,
|
327 |
+
"14": 0.723,
|
328 |
+
"18": 0.73,
|
329 |
+
"19": 0.831
|
330 |
+
},
|
331 |
+
"sae_top_1_test_accuracy": {
|
332 |
+
"11": 0.555,
|
333 |
+
"13": 0.666,
|
334 |
+
"14": 0.661,
|
335 |
+
"18": 0.712,
|
336 |
+
"19": 0.801
|
337 |
+
},
|
338 |
+
"sae_top_2_test_accuracy": {
|
339 |
+
"11": 0.736,
|
340 |
+
"13": 0.697,
|
341 |
+
"14": 0.795,
|
342 |
+
"18": 0.737,
|
343 |
+
"19": 0.825
|
344 |
+
},
|
345 |
+
"sae_top_5_test_accuracy": {
|
346 |
+
"11": 0.909,
|
347 |
+
"13": 0.745,
|
348 |
+
"14": 0.904,
|
349 |
+
"18": 0.897,
|
350 |
+
"19": 0.852
|
351 |
+
}
|
352 |
+
},
|
353 |
+
"LabHC/bias_in_bios_class_set3_results": {
|
354 |
+
"sae_test_accuracy": {
|
355 |
+
"20": 0.9510000348091125,
|
356 |
+
"21": 0.9240000247955322,
|
357 |
+
"22": 0.9180000424385071,
|
358 |
+
"25": 0.9510000348091125,
|
359 |
+
"26": 0.9020000696182251
|
360 |
+
},
|
361 |
+
"llm_test_accuracy": {
|
362 |
+
"20": 0.956000030040741,
|
363 |
+
"21": 0.9350000619888306,
|
364 |
+
"22": 0.9180000424385071,
|
365 |
+
"25": 0.9640000462532043,
|
366 |
+
"26": 0.8850000500679016
|
367 |
+
},
|
368 |
+
"llm_top_1_test_accuracy": {
|
369 |
+
"20": 0.693,
|
370 |
+
"21": 0.775,
|
371 |
+
"22": 0.645,
|
372 |
+
"25": 0.706,
|
373 |
+
"26": 0.616
|
374 |
+
},
|
375 |
+
"llm_top_2_test_accuracy": {
|
376 |
+
"20": 0.827,
|
377 |
+
"21": 0.761,
|
378 |
+
"22": 0.694,
|
379 |
+
"25": 0.778,
|
380 |
+
"26": 0.686
|
381 |
+
},
|
382 |
+
"llm_top_5_test_accuracy": {
|
383 |
+
"20": 0.855,
|
384 |
+
"21": 0.791,
|
385 |
+
"22": 0.725,
|
386 |
+
"25": 0.809,
|
387 |
+
"26": 0.672
|
388 |
+
},
|
389 |
+
"sae_top_1_test_accuracy": {
|
390 |
+
"20": 0.856,
|
391 |
+
"21": 0.501,
|
392 |
+
"22": 0.893,
|
393 |
+
"25": 0.695,
|
394 |
+
"26": 0.648
|
395 |
+
},
|
396 |
+
"sae_top_2_test_accuracy": {
|
397 |
+
"20": 0.863,
|
398 |
+
"21": 0.737,
|
399 |
+
"22": 0.893,
|
400 |
+
"25": 0.874,
|
401 |
+
"26": 0.718
|
402 |
+
},
|
403 |
+
"sae_top_5_test_accuracy": {
|
404 |
+
"20": 0.897,
|
405 |
+
"21": 0.841,
|
406 |
+
"22": 0.886,
|
407 |
+
"25": 0.899,
|
408 |
+
"26": 0.761
|
409 |
+
}
|
410 |
+
},
|
411 |
+
"canrager/amazon_reviews_mcauley_1and5_results": {
|
412 |
+
"sae_test_accuracy": {
|
413 |
+
"1": 0.9510000348091125,
|
414 |
+
"2": 0.9300000667572021,
|
415 |
+
"3": 0.9200000166893005,
|
416 |
+
"5": 0.9270000457763672,
|
417 |
+
"6": 0.8680000305175781
|
418 |
+
},
|
419 |
+
"llm_test_accuracy": {
|
420 |
+
"1": 0.9580000638961792,
|
421 |
+
"2": 0.9330000281333923,
|
422 |
+
"3": 0.9280000329017639,
|
423 |
+
"5": 0.9200000166893005,
|
424 |
+
"6": 0.862000048160553
|
425 |
+
},
|
426 |
+
"llm_top_1_test_accuracy": {
|
427 |
+
"1": 0.647,
|
428 |
+
"2": 0.603,
|
429 |
+
"3": 0.598,
|
430 |
+
"5": 0.555,
|
431 |
+
"6": 0.592
|
432 |
+
},
|
433 |
+
"llm_top_2_test_accuracy": {
|
434 |
+
"1": 0.75,
|
435 |
+
"2": 0.648,
|
436 |
+
"3": 0.607,
|
437 |
+
"5": 0.606,
|
438 |
+
"6": 0.626
|
439 |
+
},
|
440 |
+
"llm_top_5_test_accuracy": {
|
441 |
+
"1": 0.767,
|
442 |
+
"2": 0.641,
|
443 |
+
"3": 0.645,
|
444 |
+
"5": 0.638,
|
445 |
+
"6": 0.676
|
446 |
+
},
|
447 |
+
"sae_top_1_test_accuracy": {
|
448 |
+
"1": 0.842,
|
449 |
+
"2": 0.729,
|
450 |
+
"3": 0.54,
|
451 |
+
"5": 0.536,
|
452 |
+
"6": 0.597
|
453 |
+
},
|
454 |
+
"sae_top_2_test_accuracy": {
|
455 |
+
"1": 0.906,
|
456 |
+
"2": 0.84,
|
457 |
+
"3": 0.569,
|
458 |
+
"5": 0.791,
|
459 |
+
"6": 0.59
|
460 |
+
},
|
461 |
+
"sae_top_5_test_accuracy": {
|
462 |
+
"1": 0.918,
|
463 |
+
"2": 0.844,
|
464 |
+
"3": 0.6,
|
465 |
+
"5": 0.877,
|
466 |
+
"6": 0.71
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
|
470 |
+
"sae_test_accuracy": {
|
471 |
+
"1.0": 0.9720000624656677,
|
472 |
+
"5.0": 0.9730000495910645
|
473 |
+
},
|
474 |
+
"llm_test_accuracy": {
|
475 |
+
"1.0": 0.9780000448226929,
|
476 |
+
"5.0": 0.9810000658035278
|
477 |
+
},
|
478 |
+
"llm_top_1_test_accuracy": {
|
479 |
+
"1.0": 0.673,
|
480 |
+
"5.0": 0.673
|
481 |
+
},
|
482 |
+
"llm_top_2_test_accuracy": {
|
483 |
+
"1.0": 0.724,
|
484 |
+
"5.0": 0.724
|
485 |
+
},
|
486 |
+
"llm_top_5_test_accuracy": {
|
487 |
+
"1.0": 0.766,
|
488 |
+
"5.0": 0.766
|
489 |
+
},
|
490 |
+
"sae_top_1_test_accuracy": {
|
491 |
+
"1.0": 0.885,
|
492 |
+
"5.0": 0.885
|
493 |
+
},
|
494 |
+
"sae_top_2_test_accuracy": {
|
495 |
+
"1.0": 0.889,
|
496 |
+
"5.0": 0.889
|
497 |
+
},
|
498 |
+
"sae_top_5_test_accuracy": {
|
499 |
+
"1.0": 0.931,
|
500 |
+
"5.0": 0.931
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"codeparrot/github-code_results": {
|
504 |
+
"sae_test_accuracy": {
|
505 |
+
"C": 0.9530000686645508,
|
506 |
+
"Python": 0.9930000305175781,
|
507 |
+
"HTML": 0.984000027179718,
|
508 |
+
"Java": 0.9630000591278076,
|
509 |
+
"PHP": 0.956000030040741
|
510 |
+
},
|
511 |
+
"llm_test_accuracy": {
|
512 |
+
"C": 0.956000030040741,
|
513 |
+
"Python": 0.984000027179718,
|
514 |
+
"HTML": 0.9900000691413879,
|
515 |
+
"Java": 0.9670000672340393,
|
516 |
+
"PHP": 0.9570000171661377
|
517 |
+
},
|
518 |
+
"llm_top_1_test_accuracy": {
|
519 |
+
"C": 0.666,
|
520 |
+
"Python": 0.626,
|
521 |
+
"HTML": 0.721,
|
522 |
+
"Java": 0.619,
|
523 |
+
"PHP": 0.594
|
524 |
+
},
|
525 |
+
"llm_top_2_test_accuracy": {
|
526 |
+
"C": 0.679,
|
527 |
+
"Python": 0.674,
|
528 |
+
"HTML": 0.8,
|
529 |
+
"Java": 0.676,
|
530 |
+
"PHP": 0.651
|
531 |
+
},
|
532 |
+
"llm_top_5_test_accuracy": {
|
533 |
+
"C": 0.783,
|
534 |
+
"Python": 0.717,
|
535 |
+
"HTML": 0.935,
|
536 |
+
"Java": 0.733,
|
537 |
+
"PHP": 0.715
|
538 |
+
},
|
539 |
+
"sae_top_1_test_accuracy": {
|
540 |
+
"C": 0.536,
|
541 |
+
"Python": 0.645,
|
542 |
+
"HTML": 0.579,
|
543 |
+
"Java": 0.613,
|
544 |
+
"PHP": 0.585
|
545 |
+
},
|
546 |
+
"sae_top_2_test_accuracy": {
|
547 |
+
"C": 0.634,
|
548 |
+
"Python": 0.94,
|
549 |
+
"HTML": 0.691,
|
550 |
+
"Java": 0.617,
|
551 |
+
"PHP": 0.898
|
552 |
+
},
|
553 |
+
"sae_top_5_test_accuracy": {
|
554 |
+
"C": 0.687,
|
555 |
+
"Python": 0.943,
|
556 |
+
"HTML": 0.804,
|
557 |
+
"Java": 0.675,
|
558 |
+
"PHP": 0.91
|
559 |
+
}
|
560 |
+
},
|
561 |
+
"fancyzhx/ag_news_results": {
|
562 |
+
"sae_test_accuracy": {
|
563 |
+
"0": 0.9380000233650208,
|
564 |
+
"1": 0.984000027179718,
|
565 |
+
"2": 0.9330000281333923,
|
566 |
+
"3": 0.9550000429153442
|
567 |
+
},
|
568 |
+
"llm_test_accuracy": {
|
569 |
+
"0": 0.940000057220459,
|
570 |
+
"1": 0.9860000610351562,
|
571 |
+
"2": 0.9200000166893005,
|
572 |
+
"3": 0.9540000557899475
|
573 |
+
},
|
574 |
+
"llm_top_1_test_accuracy": {
|
575 |
+
"0": 0.573,
|
576 |
+
"1": 0.671,
|
577 |
+
"2": 0.672,
|
578 |
+
"3": 0.635
|
579 |
+
},
|
580 |
+
"llm_top_2_test_accuracy": {
|
581 |
+
"0": 0.802,
|
582 |
+
"1": 0.808,
|
583 |
+
"2": 0.701,
|
584 |
+
"3": 0.816
|
585 |
+
},
|
586 |
+
"llm_top_5_test_accuracy": {
|
587 |
+
"0": 0.81,
|
588 |
+
"1": 0.891,
|
589 |
+
"2": 0.752,
|
590 |
+
"3": 0.832
|
591 |
+
},
|
592 |
+
"sae_top_1_test_accuracy": {
|
593 |
+
"0": 0.575,
|
594 |
+
"1": 0.667,
|
595 |
+
"2": 0.566,
|
596 |
+
"3": 0.633
|
597 |
+
},
|
598 |
+
"sae_top_2_test_accuracy": {
|
599 |
+
"0": 0.737,
|
600 |
+
"1": 0.706,
|
601 |
+
"2": 0.708,
|
602 |
+
"3": 0.694
|
603 |
+
},
|
604 |
+
"sae_top_5_test_accuracy": {
|
605 |
+
"0": 0.799,
|
606 |
+
"1": 0.749,
|
607 |
+
"2": 0.821,
|
608 |
+
"3": 0.73
|
609 |
+
}
|
610 |
+
},
|
611 |
+
"Helsinki-NLP/europarl_results": {
|
612 |
+
"sae_test_accuracy": {
|
613 |
+
"en": 1.0,
|
614 |
+
"fr": 0.999000072479248,
|
615 |
+
"de": 1.0,
|
616 |
+
"es": 1.0,
|
617 |
+
"nl": 0.999000072479248
|
618 |
+
},
|
619 |
+
"llm_test_accuracy": {
|
620 |
+
"en": 1.0,
|
621 |
+
"fr": 0.999000072479248,
|
622 |
+
"de": 1.0,
|
623 |
+
"es": 0.999000072479248,
|
624 |
+
"nl": 1.0
|
625 |
+
},
|
626 |
+
"llm_top_1_test_accuracy": {
|
627 |
+
"en": 0.739,
|
628 |
+
"fr": 0.585,
|
629 |
+
"de": 0.758,
|
630 |
+
"es": 0.496,
|
631 |
+
"nl": 0.649
|
632 |
+
},
|
633 |
+
"llm_top_2_test_accuracy": {
|
634 |
+
"en": 0.829,
|
635 |
+
"fr": 0.582,
|
636 |
+
"de": 0.82,
|
637 |
+
"es": 0.958,
|
638 |
+
"nl": 0.753
|
639 |
+
},
|
640 |
+
"llm_top_5_test_accuracy": {
|
641 |
+
"en": 0.892,
|
642 |
+
"fr": 0.888,
|
643 |
+
"de": 0.894,
|
644 |
+
"es": 0.98,
|
645 |
+
"nl": 0.852
|
646 |
+
},
|
647 |
+
"sae_top_1_test_accuracy": {
|
648 |
+
"en": 0.76,
|
649 |
+
"fr": 0.994,
|
650 |
+
"de": 0.923,
|
651 |
+
"es": 0.884,
|
652 |
+
"nl": 0.665
|
653 |
+
},
|
654 |
+
"sae_top_2_test_accuracy": {
|
655 |
+
"en": 0.997,
|
656 |
+
"fr": 0.994,
|
657 |
+
"de": 0.924,
|
658 |
+
"es": 0.996,
|
659 |
+
"nl": 0.997
|
660 |
+
},
|
661 |
+
"sae_top_5_test_accuracy": {
|
662 |
+
"en": 0.998,
|
663 |
+
"fr": 0.997,
|
664 |
+
"de": 0.975,
|
665 |
+
"es": 0.995,
|
666 |
+
"nl": 0.996
|
667 |
+
}
|
668 |
+
}
|
669 |
+
}
|
670 |
+
}
|
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "tpp",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": false,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "8a95660d-45b7-41a4-a525-961bf9e6596a",
|
73 |
+
"datetime_epoch_millis": 1740163272675,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"tpp_metrics": {
|
76 |
+
"tpp_threshold_2_total_metric": 0.002374991774559021,
|
77 |
+
"tpp_threshold_2_intended_diff_only": 0.004799991846084595,
|
78 |
+
"tpp_threshold_2_unintended_diff_only": 0.0024250000715255737,
|
79 |
+
"tpp_threshold_5_total_metric": 0.0023500144481658934,
|
80 |
+
"tpp_threshold_5_intended_diff_only": 0.0051000118255615234,
|
81 |
+
"tpp_threshold_5_unintended_diff_only": 0.00274999737739563,
|
82 |
+
"tpp_threshold_10_total_metric": 0.007099992036819458,
|
83 |
+
"tpp_threshold_10_intended_diff_only": 0.010999995470046996,
|
84 |
+
"tpp_threshold_10_unintended_diff_only": 0.003900003433227539,
|
85 |
+
"tpp_threshold_20_total_metric": 0.018574997782707214,
|
86 |
+
"tpp_threshold_20_intended_diff_only": 0.025499999523162842,
|
87 |
+
"tpp_threshold_20_unintended_diff_only": 0.0069250017404556274,
|
88 |
+
"tpp_threshold_50_total_metric": 0.04237500578165054,
|
89 |
+
"tpp_threshold_50_intended_diff_only": 0.05090000629425048,
|
90 |
+
"tpp_threshold_50_unintended_diff_only": 0.008525000512599945,
|
91 |
+
"tpp_threshold_100_total_metric": 0.08157499581575393,
|
92 |
+
"tpp_threshold_100_intended_diff_only": 0.0940999984741211,
|
93 |
+
"tpp_threshold_100_unintended_diff_only": 0.012525002658367156,
|
94 |
+
"tpp_threshold_500_total_metric": 0.2862000107765198,
|
95 |
+
"tpp_threshold_500_intended_diff_only": 0.30840001106262205,
|
96 |
+
"tpp_threshold_500_unintended_diff_only": 0.022200000286102296
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
|
102 |
+
"tpp_threshold_2_total_metric": 0.0029499828815460205,
|
103 |
+
"tpp_threshold_2_intended_diff_only": 0.0045999884605407715,
|
104 |
+
"tpp_threshold_2_unintended_diff_only": 0.001650005578994751,
|
105 |
+
"tpp_threshold_5_total_metric": 0.0015500158071517945,
|
106 |
+
"tpp_threshold_5_intended_diff_only": 0.004000020027160644,
|
107 |
+
"tpp_threshold_5_unintended_diff_only": 0.00245000422000885,
|
108 |
+
"tpp_threshold_10_total_metric": 0.0034999847412109375,
|
109 |
+
"tpp_threshold_10_intended_diff_only": 0.0054000020027160645,
|
110 |
+
"tpp_threshold_10_unintended_diff_only": 0.001900017261505127,
|
111 |
+
"tpp_threshold_20_total_metric": 0.008800002932548522,
|
112 |
+
"tpp_threshold_20_intended_diff_only": 0.01100001335144043,
|
113 |
+
"tpp_threshold_20_unintended_diff_only": 0.002200010418891907,
|
114 |
+
"tpp_threshold_50_total_metric": 0.020700007677078247,
|
115 |
+
"tpp_threshold_50_intended_diff_only": 0.022600018978118898,
|
116 |
+
"tpp_threshold_50_unintended_diff_only": 0.0019000113010406495,
|
117 |
+
"tpp_threshold_100_total_metric": 0.05239999294281006,
|
118 |
+
"tpp_threshold_100_intended_diff_only": 0.05540000200271607,
|
119 |
+
"tpp_threshold_100_unintended_diff_only": 0.003000009059906006,
|
120 |
+
"tpp_threshold_500_total_metric": 0.2711500138044357,
|
121 |
+
"tpp_threshold_500_intended_diff_only": 0.27920001745224,
|
122 |
+
"tpp_threshold_500_unintended_diff_only": 0.00805000364780426
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
|
126 |
+
"tpp_threshold_2_total_metric": 0.0018000006675720215,
|
127 |
+
"tpp_threshold_2_intended_diff_only": 0.004999995231628418,
|
128 |
+
"tpp_threshold_2_unintended_diff_only": 0.0031999945640563965,
|
129 |
+
"tpp_threshold_5_total_metric": 0.0031500130891799925,
|
130 |
+
"tpp_threshold_5_intended_diff_only": 0.0062000036239624025,
|
131 |
+
"tpp_threshold_5_unintended_diff_only": 0.0030499905347824096,
|
132 |
+
"tpp_threshold_10_total_metric": 0.01069999933242798,
|
133 |
+
"tpp_threshold_10_intended_diff_only": 0.01659998893737793,
|
134 |
+
"tpp_threshold_10_unintended_diff_only": 0.005899989604949951,
|
135 |
+
"tpp_threshold_20_total_metric": 0.028349992632865906,
|
136 |
+
"tpp_threshold_20_intended_diff_only": 0.039999985694885255,
|
137 |
+
"tpp_threshold_20_unintended_diff_only": 0.011649993062019349,
|
138 |
+
"tpp_threshold_50_total_metric": 0.06405000388622284,
|
139 |
+
"tpp_threshold_50_intended_diff_only": 0.07919999361038207,
|
140 |
+
"tpp_threshold_50_unintended_diff_only": 0.01514998972415924,
|
141 |
+
"tpp_threshold_100_total_metric": 0.11074999868869781,
|
142 |
+
"tpp_threshold_100_intended_diff_only": 0.13279999494552613,
|
143 |
+
"tpp_threshold_100_unintended_diff_only": 0.022049996256828307,
|
144 |
+
"tpp_threshold_500_total_metric": 0.30125000774860383,
|
145 |
+
"tpp_threshold_500_intended_diff_only": 0.33760000467300416,
|
146 |
+
"tpp_threshold_500_unintended_diff_only": 0.03634999692440033
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
150 |
+
"sae_lens_id": "custom_sae",
|
151 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0",
|
152 |
+
"sae_lens_version": "5.4.2",
|
153 |
+
"sae_cfg_dict": {
|
154 |
+
"model_name": "gemma-2-2b",
|
155 |
+
"d_in": 2304,
|
156 |
+
"d_sae": 65536,
|
157 |
+
"hook_layer": 12,
|
158 |
+
"hook_name": "blocks.12.hook_resid_post",
|
159 |
+
"context_size": null,
|
160 |
+
"hook_head_index": null,
|
161 |
+
"architecture": "topk",
|
162 |
+
"apply_b_dec_to_input": null,
|
163 |
+
"finetuning_scaling_factor": null,
|
164 |
+
"activation_fn_str": "",
|
165 |
+
"prepend_bos": true,
|
166 |
+
"normalize_activations": "none",
|
167 |
+
"dtype": "bfloat16",
|
168 |
+
"device": "",
|
169 |
+
"dataset_path": "",
|
170 |
+
"dataset_trust_remote_code": true,
|
171 |
+
"seqpos_slice": [
|
172 |
+
null
|
173 |
+
],
|
174 |
+
"training_tokens": -100000,
|
175 |
+
"sae_lens_training_version": null,
|
176 |
+
"neuronpedia_id": null
|
177 |
+
},
|
178 |
+
"eval_result_unstructured": {
|
179 |
+
"LabHC/bias_in_bios_class_set1": {
|
180 |
+
"0": {
|
181 |
+
"tpp_threshold_2_total_metric": 0.006750002503395081,
|
182 |
+
"tpp_threshold_2_intended_diff_only": 0.009000003337860107,
|
183 |
+
"tpp_threshold_2_unintended_diff_only": 0.002250000834465027,
|
184 |
+
"tpp_threshold_5_total_metric": 0.011500045657157898,
|
185 |
+
"tpp_threshold_5_intended_diff_only": 0.01500004529953003,
|
186 |
+
"tpp_threshold_5_unintended_diff_only": 0.0034999996423721313,
|
187 |
+
"tpp_threshold_10_total_metric": 0.004999995231628418,
|
188 |
+
"tpp_threshold_10_intended_diff_only": 0.008000016212463379,
|
189 |
+
"tpp_threshold_10_unintended_diff_only": 0.003000020980834961,
|
190 |
+
"tpp_threshold_20_total_metric": 0.01800002157688141,
|
191 |
+
"tpp_threshold_20_intended_diff_only": 0.020000040531158447,
|
192 |
+
"tpp_threshold_20_unintended_diff_only": 0.0020000189542770386,
|
193 |
+
"tpp_threshold_50_total_metric": 0.0350000262260437,
|
194 |
+
"tpp_threshold_50_intended_diff_only": 0.03800004720687866,
|
195 |
+
"tpp_threshold_50_unintended_diff_only": 0.003000020980834961,
|
196 |
+
"tpp_threshold_100_total_metric": 0.11850003898143768,
|
197 |
+
"tpp_threshold_100_intended_diff_only": 0.12200003862380981,
|
198 |
+
"tpp_threshold_100_unintended_diff_only": 0.0034999996423721313,
|
199 |
+
"tpp_threshold_500_total_metric": 0.39775002002716064,
|
200 |
+
"tpp_threshold_500_intended_diff_only": 0.406000018119812,
|
201 |
+
"tpp_threshold_500_unintended_diff_only": 0.008249998092651367
|
202 |
+
},
|
203 |
+
"1": {
|
204 |
+
"tpp_threshold_2_total_metric": 0.0045000165700912476,
|
205 |
+
"tpp_threshold_2_intended_diff_only": 0.0040000081062316895,
|
206 |
+
"tpp_threshold_2_unintended_diff_only": -0.0005000084638595581,
|
207 |
+
"tpp_threshold_5_total_metric": -0.0002499520778656006,
|
208 |
+
"tpp_threshold_5_intended_diff_only": 0.001000046730041504,
|
209 |
+
"tpp_threshold_5_unintended_diff_only": 0.0012499988079071045,
|
210 |
+
"tpp_threshold_10_total_metric": 0.003750026226043701,
|
211 |
+
"tpp_threshold_10_intended_diff_only": 0.0020000338554382324,
|
212 |
+
"tpp_threshold_10_unintended_diff_only": -0.0017499923706054688,
|
213 |
+
"tpp_threshold_20_total_metric": -0.0012499690055847168,
|
214 |
+
"tpp_threshold_20_intended_diff_only": 0.0020000338554382324,
|
215 |
+
"tpp_threshold_20_unintended_diff_only": 0.0032500028610229492,
|
216 |
+
"tpp_threshold_50_total_metric": 0.0025000572204589844,
|
217 |
+
"tpp_threshold_50_intended_diff_only": 0.005000054836273193,
|
218 |
+
"tpp_threshold_50_unintended_diff_only": 0.002499997615814209,
|
219 |
+
"tpp_threshold_100_total_metric": 0.009000018239021301,
|
220 |
+
"tpp_threshold_100_intended_diff_only": 0.012000024318695068,
|
221 |
+
"tpp_threshold_100_unintended_diff_only": 0.003000006079673767,
|
222 |
+
"tpp_threshold_500_total_metric": 0.1807500571012497,
|
223 |
+
"tpp_threshold_500_intended_diff_only": 0.18500006198883057,
|
224 |
+
"tpp_threshold_500_unintended_diff_only": 0.004250004887580872
|
225 |
+
},
|
226 |
+
"2": {
|
227 |
+
"tpp_threshold_2_total_metric": -0.00025004148483276367,
|
228 |
+
"tpp_threshold_2_intended_diff_only": 0.001999974250793457,
|
229 |
+
"tpp_threshold_2_unintended_diff_only": 0.0022500157356262207,
|
230 |
+
"tpp_threshold_5_total_metric": -0.004250004887580872,
|
231 |
+
"tpp_threshold_5_intended_diff_only": 0.0,
|
232 |
+
"tpp_threshold_5_unintended_diff_only": 0.004250004887580872,
|
233 |
+
"tpp_threshold_10_total_metric": 0.009249955415725708,
|
234 |
+
"tpp_threshold_10_intended_diff_only": 0.010999977588653564,
|
235 |
+
"tpp_threshold_10_unintended_diff_only": 0.0017500221729278564,
|
236 |
+
"tpp_threshold_20_total_metric": 0.01299998164176941,
|
237 |
+
"tpp_threshold_20_intended_diff_only": 0.013999998569488525,
|
238 |
+
"tpp_threshold_20_unintended_diff_only": 0.0010000169277191162,
|
239 |
+
"tpp_threshold_50_total_metric": 0.03224998712539673,
|
240 |
+
"tpp_threshold_50_intended_diff_only": 0.03200000524520874,
|
241 |
+
"tpp_threshold_50_unintended_diff_only": -0.0002499818801879883,
|
242 |
+
"tpp_threshold_100_total_metric": 0.05349995195865631,
|
243 |
+
"tpp_threshold_100_intended_diff_only": 0.05299997329711914,
|
244 |
+
"tpp_threshold_100_unintended_diff_only": -0.0004999786615371704,
|
245 |
+
"tpp_threshold_500_total_metric": 0.3799999952316284,
|
246 |
+
"tpp_threshold_500_intended_diff_only": 0.3889999985694885,
|
247 |
+
"tpp_threshold_500_unintended_diff_only": 0.009000003337860107
|
248 |
+
},
|
249 |
+
"6": {
|
250 |
+
"tpp_threshold_2_total_metric": -0.000500023365020752,
|
251 |
+
"tpp_threshold_2_intended_diff_only": 0.0009999871253967285,
|
252 |
+
"tpp_threshold_2_unintended_diff_only": 0.0015000104904174805,
|
253 |
+
"tpp_threshold_5_total_metric": 0.000250011682510376,
|
254 |
+
"tpp_threshold_5_intended_diff_only": -0.0009999871253967285,
|
255 |
+
"tpp_threshold_5_unintended_diff_only": -0.0012499988079071045,
|
256 |
+
"tpp_threshold_10_total_metric": 0.0012499988079071045,
|
257 |
+
"tpp_threshold_10_intended_diff_only": 0.003000020980834961,
|
258 |
+
"tpp_threshold_10_unintended_diff_only": 0.0017500221729278564,
|
259 |
+
"tpp_threshold_20_total_metric": 0.001999974250793457,
|
260 |
+
"tpp_threshold_20_intended_diff_only": 0.0009999871253967285,
|
261 |
+
"tpp_threshold_20_unintended_diff_only": -0.0009999871253967285,
|
262 |
+
"tpp_threshold_50_total_metric": 0.002750009298324585,
|
263 |
+
"tpp_threshold_50_intended_diff_only": 0.003000020980834961,
|
264 |
+
"tpp_threshold_50_unintended_diff_only": 0.000250011682510376,
|
265 |
+
"tpp_threshold_100_total_metric": 0.002749994397163391,
|
266 |
+
"tpp_threshold_100_intended_diff_only": 0.004999995231628418,
|
267 |
+
"tpp_threshold_100_unintended_diff_only": 0.002250000834465027,
|
268 |
+
"tpp_threshold_500_total_metric": 0.016249999403953552,
|
269 |
+
"tpp_threshold_500_intended_diff_only": 0.023000001907348633,
|
270 |
+
"tpp_threshold_500_unintended_diff_only": 0.006750002503395081
|
271 |
+
},
|
272 |
+
"9": {
|
273 |
+
"tpp_threshold_2_total_metric": 0.00424996018409729,
|
274 |
+
"tpp_threshold_2_intended_diff_only": 0.006999969482421875,
|
275 |
+
"tpp_threshold_2_unintended_diff_only": 0.002750009298324585,
|
276 |
+
"tpp_threshold_5_total_metric": 0.0004999786615371704,
|
277 |
+
"tpp_threshold_5_intended_diff_only": 0.004999995231628418,
|
278 |
+
"tpp_threshold_5_unintended_diff_only": 0.0045000165700912476,
|
279 |
+
"tpp_threshold_10_total_metric": -0.0017500519752502441,
|
280 |
+
"tpp_threshold_10_intended_diff_only": 0.0029999613761901855,
|
281 |
+
"tpp_threshold_10_unintended_diff_only": 0.00475001335144043,
|
282 |
+
"tpp_threshold_20_total_metric": 0.012250006198883057,
|
283 |
+
"tpp_threshold_20_intended_diff_only": 0.018000006675720215,
|
284 |
+
"tpp_threshold_20_unintended_diff_only": 0.005750000476837158,
|
285 |
+
"tpp_threshold_50_total_metric": 0.030999958515167236,
|
286 |
+
"tpp_threshold_50_intended_diff_only": 0.034999966621398926,
|
287 |
+
"tpp_threshold_50_unintended_diff_only": 0.0040000081062316895,
|
288 |
+
"tpp_threshold_100_total_metric": 0.0782499611377716,
|
289 |
+
"tpp_threshold_100_intended_diff_only": 0.08499997854232788,
|
290 |
+
"tpp_threshold_100_unintended_diff_only": 0.006750017404556274,
|
291 |
+
"tpp_threshold_500_total_metric": 0.38099999725818634,
|
292 |
+
"tpp_threshold_500_intended_diff_only": 0.3930000066757202,
|
293 |
+
"tpp_threshold_500_unintended_diff_only": 0.012000009417533875
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"canrager/amazon_reviews_mcauley_1and5": {
|
297 |
+
"1": {
|
298 |
+
"tpp_threshold_2_total_metric": 0.009749948978424072,
|
299 |
+
"tpp_threshold_2_intended_diff_only": 0.012999951839447021,
|
300 |
+
"tpp_threshold_2_unintended_diff_only": 0.0032500028610229492,
|
301 |
+
"tpp_threshold_5_total_metric": 0.004499971866607666,
|
302 |
+
"tpp_threshold_5_intended_diff_only": 0.006999969482421875,
|
303 |
+
"tpp_threshold_5_unintended_diff_only": 0.002499997615814209,
|
304 |
+
"tpp_threshold_10_total_metric": -0.0015000104904174805,
|
305 |
+
"tpp_threshold_10_intended_diff_only": 0.006999969482421875,
|
306 |
+
"tpp_threshold_10_unintended_diff_only": 0.008499979972839355,
|
307 |
+
"tpp_threshold_20_total_metric": 0.0014999806880950928,
|
308 |
+
"tpp_threshold_20_intended_diff_only": 0.006999969482421875,
|
309 |
+
"tpp_threshold_20_unintended_diff_only": 0.005499988794326782,
|
310 |
+
"tpp_threshold_50_total_metric": 0.014499977231025696,
|
311 |
+
"tpp_threshold_50_intended_diff_only": 0.01699995994567871,
|
312 |
+
"tpp_threshold_50_unintended_diff_only": 0.002499982714653015,
|
313 |
+
"tpp_threshold_100_total_metric": 0.024999961256980896,
|
314 |
+
"tpp_threshold_100_intended_diff_only": 0.034999966621398926,
|
315 |
+
"tpp_threshold_100_unintended_diff_only": 0.01000000536441803,
|
316 |
+
"tpp_threshold_500_total_metric": 0.1757499873638153,
|
317 |
+
"tpp_threshold_500_intended_diff_only": 0.18599998950958252,
|
318 |
+
"tpp_threshold_500_unintended_diff_only": 0.010250002145767212
|
319 |
+
},
|
320 |
+
"2": {
|
321 |
+
"tpp_threshold_2_total_metric": 0.001249954104423523,
|
322 |
+
"tpp_threshold_2_intended_diff_only": 0.0029999613761901855,
|
323 |
+
"tpp_threshold_2_unintended_diff_only": 0.0017500072717666626,
|
324 |
+
"tpp_threshold_5_total_metric": -0.004749983549118042,
|
325 |
+
"tpp_threshold_5_intended_diff_only": 0.0040000081062316895,
|
326 |
+
"tpp_threshold_5_unintended_diff_only": 0.008749991655349731,
|
327 |
+
"tpp_threshold_10_total_metric": 0.011499986052513123,
|
328 |
+
"tpp_threshold_10_intended_diff_only": 0.015999972820281982,
|
329 |
+
"tpp_threshold_10_unintended_diff_only": 0.00449998676776886,
|
330 |
+
"tpp_threshold_20_total_metric": 0.020249977707862854,
|
331 |
+
"tpp_threshold_20_intended_diff_only": 0.02499997615814209,
|
332 |
+
"tpp_threshold_20_unintended_diff_only": 0.004749998450279236,
|
333 |
+
"tpp_threshold_50_total_metric": 0.07249999046325684,
|
334 |
+
"tpp_threshold_50_intended_diff_only": 0.08499997854232788,
|
335 |
+
"tpp_threshold_50_unintended_diff_only": 0.012499988079071045,
|
336 |
+
"tpp_threshold_100_total_metric": 0.12825001776218414,
|
337 |
+
"tpp_threshold_100_intended_diff_only": 0.15200001001358032,
|
338 |
+
"tpp_threshold_100_unintended_diff_only": 0.02374999225139618,
|
339 |
+
"tpp_threshold_500_total_metric": 0.3604999780654907,
|
340 |
+
"tpp_threshold_500_intended_diff_only": 0.3999999761581421,
|
341 |
+
"tpp_threshold_500_unintended_diff_only": 0.03949999809265137
|
342 |
+
},
|
343 |
+
"3": {
|
344 |
+
"tpp_threshold_2_total_metric": -0.008499979972839355,
|
345 |
+
"tpp_threshold_2_intended_diff_only": -0.0059999823570251465,
|
346 |
+
"tpp_threshold_2_unintended_diff_only": 0.002499997615814209,
|
347 |
+
"tpp_threshold_5_total_metric": 0.0005000084638595581,
|
348 |
+
"tpp_threshold_5_intended_diff_only": 0.0,
|
349 |
+
"tpp_threshold_5_unintended_diff_only": -0.0005000084638595581,
|
350 |
+
"tpp_threshold_10_total_metric": 0.01424996554851532,
|
351 |
+
"tpp_threshold_10_intended_diff_only": 0.01699995994567871,
|
352 |
+
"tpp_threshold_10_unintended_diff_only": 0.002749994397163391,
|
353 |
+
"tpp_threshold_20_total_metric": 0.008999988436698914,
|
354 |
+
"tpp_threshold_20_intended_diff_only": 0.014999985694885254,
|
355 |
+
"tpp_threshold_20_unintended_diff_only": 0.00599999725818634,
|
356 |
+
"tpp_threshold_50_total_metric": 0.03624999523162842,
|
357 |
+
"tpp_threshold_50_intended_diff_only": 0.041999995708465576,
|
358 |
+
"tpp_threshold_50_unintended_diff_only": 0.005750000476837158,
|
359 |
+
"tpp_threshold_100_total_metric": 0.07375001907348633,
|
360 |
+
"tpp_threshold_100_intended_diff_only": 0.08700001239776611,
|
361 |
+
"tpp_threshold_100_unintended_diff_only": 0.013249993324279785,
|
362 |
+
"tpp_threshold_500_total_metric": 0.29850004613399506,
|
363 |
+
"tpp_threshold_500_intended_diff_only": 0.3400000333786011,
|
364 |
+
"tpp_threshold_500_unintended_diff_only": 0.04149998724460602
|
365 |
+
},
|
366 |
+
"5": {
|
367 |
+
"tpp_threshold_2_total_metric": -0.006499916315078735,
|
368 |
+
"tpp_threshold_2_intended_diff_only": -0.003999948501586914,
|
369 |
+
"tpp_threshold_2_unintended_diff_only": 0.0024999678134918213,
|
370 |
+
"tpp_threshold_5_total_metric": -0.007999926805496216,
|
371 |
+
"tpp_threshold_5_intended_diff_only": -0.003999948501586914,
|
372 |
+
"tpp_threshold_5_unintended_diff_only": 0.003999978303909302,
|
373 |
+
"tpp_threshold_10_total_metric": -0.014499947428703308,
|
374 |
+
"tpp_threshold_10_intended_diff_only": -0.007999956607818604,
|
375 |
+
"tpp_threshold_10_unintended_diff_only": 0.006499990820884705,
|
376 |
+
"tpp_threshold_20_total_metric": 0.03825005888938904,
|
377 |
+
"tpp_threshold_20_intended_diff_only": 0.06700003147125244,
|
378 |
+
"tpp_threshold_20_unintended_diff_only": 0.028749972581863403,
|
379 |
+
"tpp_threshold_50_total_metric": 0.07950006425380707,
|
380 |
+
"tpp_threshold_50_intended_diff_only": 0.12200003862380981,
|
381 |
+
"tpp_threshold_50_unintended_diff_only": 0.04249997437000275,
|
382 |
+
"tpp_threshold_100_total_metric": 0.13325001299381256,
|
383 |
+
"tpp_threshold_100_intended_diff_only": 0.18000000715255737,
|
384 |
+
"tpp_threshold_100_unintended_diff_only": 0.04674999415874481,
|
385 |
+
"tpp_threshold_500_total_metric": 0.3255000412464142,
|
386 |
+
"tpp_threshold_500_intended_diff_only": 0.39100003242492676,
|
387 |
+
"tpp_threshold_500_unintended_diff_only": 0.06549999117851257
|
388 |
+
},
|
389 |
+
"6": {
|
390 |
+
"tpp_threshold_2_total_metric": 0.012999996542930603,
|
391 |
+
"tpp_threshold_2_intended_diff_only": 0.018999993801116943,
|
392 |
+
"tpp_threshold_2_unintended_diff_only": 0.00599999725818634,
|
393 |
+
"tpp_threshold_5_total_metric": 0.023499995470046997,
|
394 |
+
"tpp_threshold_5_intended_diff_only": 0.02399998903274536,
|
395 |
+
"tpp_threshold_5_unintended_diff_only": 0.0004999935626983643,
|
396 |
+
"tpp_threshold_10_total_metric": 0.04375000298023224,
|
397 |
+
"tpp_threshold_10_intended_diff_only": 0.050999999046325684,
|
398 |
+
"tpp_threshold_10_unintended_diff_only": 0.007249996066093445,
|
399 |
+
"tpp_threshold_20_total_metric": 0.07274995744228363,
|
400 |
+
"tpp_threshold_20_intended_diff_only": 0.08599996566772461,
|
401 |
+
"tpp_threshold_20_unintended_diff_only": 0.013250008225440979,
|
402 |
+
"tpp_threshold_50_total_metric": 0.11749999225139618,
|
403 |
+
"tpp_threshold_50_intended_diff_only": 0.12999999523162842,
|
404 |
+
"tpp_threshold_50_unintended_diff_only": 0.012500002980232239,
|
405 |
+
"tpp_threshold_100_total_metric": 0.19349998235702515,
|
406 |
+
"tpp_threshold_100_intended_diff_only": 0.20999997854232788,
|
407 |
+
"tpp_threshold_100_unintended_diff_only": 0.016499996185302734,
|
408 |
+
"tpp_threshold_500_total_metric": 0.34599998593330383,
|
409 |
+
"tpp_threshold_500_intended_diff_only": 0.3709999918937683,
|
410 |
+
"tpp_threshold_500_unintended_diff_only": 0.025000005960464478
|
411 |
+
}
|
412 |
+
}
|
413 |
+
}
|
414 |
+
}
|
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "tpp",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": false,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "2072bfe8-9d3d-4573-8939-241c618278fe",
|
73 |
+
"datetime_epoch_millis": 1740162955007,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"tpp_metrics": {
|
76 |
+
"tpp_threshold_2_total_metric": 0.0027249947190284727,
|
77 |
+
"tpp_threshold_2_intended_diff_only": 0.005399996042251587,
|
78 |
+
"tpp_threshold_2_unintended_diff_only": 0.002675001323223114,
|
79 |
+
"tpp_threshold_5_total_metric": 0.004999993741512299,
|
80 |
+
"tpp_threshold_5_intended_diff_only": 0.007799994945526124,
|
81 |
+
"tpp_threshold_5_unintended_diff_only": 0.002800001204013825,
|
82 |
+
"tpp_threshold_10_total_metric": 0.011049999296665192,
|
83 |
+
"tpp_threshold_10_intended_diff_only": 0.014899998903274536,
|
84 |
+
"tpp_threshold_10_unintended_diff_only": 0.0038499996066093446,
|
85 |
+
"tpp_threshold_20_total_metric": 0.023100003600120544,
|
86 |
+
"tpp_threshold_20_intended_diff_only": 0.0281000018119812,
|
87 |
+
"tpp_threshold_20_unintended_diff_only": 0.004999998211860657,
|
88 |
+
"tpp_threshold_50_total_metric": 0.0674250066280365,
|
89 |
+
"tpp_threshold_50_intended_diff_only": 0.07430000305175781,
|
90 |
+
"tpp_threshold_50_unintended_diff_only": 0.006874996423721313,
|
91 |
+
"tpp_threshold_100_total_metric": 0.14430001527071,
|
92 |
+
"tpp_threshold_100_intended_diff_only": 0.15460001230239867,
|
93 |
+
"tpp_threshold_100_unintended_diff_only": 0.01029999703168869,
|
94 |
+
"tpp_threshold_500_total_metric": 0.3798500135540962,
|
95 |
+
"tpp_threshold_500_intended_diff_only": 0.4061000108718872,
|
96 |
+
"tpp_threshold_500_unintended_diff_only": 0.026249997317790985
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
|
102 |
+
"tpp_threshold_2_total_metric": 0.003049987554550171,
|
103 |
+
"tpp_threshold_2_intended_diff_only": 0.00559999942779541,
|
104 |
+
"tpp_threshold_2_unintended_diff_only": 0.002550011873245239,
|
105 |
+
"tpp_threshold_5_total_metric": 0.00589999258518219,
|
106 |
+
"tpp_threshold_5_intended_diff_only": 0.008800005912780762,
|
107 |
+
"tpp_threshold_5_unintended_diff_only": 0.002900013327598572,
|
108 |
+
"tpp_threshold_10_total_metric": 0.011350002884864808,
|
109 |
+
"tpp_threshold_10_intended_diff_only": 0.014000010490417481,
|
110 |
+
"tpp_threshold_10_unintended_diff_only": 0.0026500076055526733,
|
111 |
+
"tpp_threshold_20_total_metric": 0.026500004529953002,
|
112 |
+
"tpp_threshold_20_intended_diff_only": 0.02960001230239868,
|
113 |
+
"tpp_threshold_20_unintended_diff_only": 0.0031000077724456787,
|
114 |
+
"tpp_threshold_50_total_metric": 0.0689500093460083,
|
115 |
+
"tpp_threshold_50_intended_diff_only": 0.07320001125335693,
|
116 |
+
"tpp_threshold_50_unintended_diff_only": 0.004250001907348633,
|
117 |
+
"tpp_threshold_100_total_metric": 0.15995001494884492,
|
118 |
+
"tpp_threshold_100_intended_diff_only": 0.16520001888275146,
|
119 |
+
"tpp_threshold_100_unintended_diff_only": 0.005250003933906555,
|
120 |
+
"tpp_threshold_500_total_metric": 0.4329000234603882,
|
121 |
+
"tpp_threshold_500_intended_diff_only": 0.44420002698898314,
|
122 |
+
"tpp_threshold_500_unintended_diff_only": 0.011300003528594971
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
|
126 |
+
"tpp_threshold_2_total_metric": 0.002400001883506775,
|
127 |
+
"tpp_threshold_2_intended_diff_only": 0.005199992656707763,
|
128 |
+
"tpp_threshold_2_unintended_diff_only": 0.002799990773200989,
|
129 |
+
"tpp_threshold_5_total_metric": 0.004099994897842407,
|
130 |
+
"tpp_threshold_5_intended_diff_only": 0.0067999839782714845,
|
131 |
+
"tpp_threshold_5_unintended_diff_only": 0.0026999890804290773,
|
132 |
+
"tpp_threshold_10_total_metric": 0.010749995708465576,
|
133 |
+
"tpp_threshold_10_intended_diff_only": 0.01579998731613159,
|
134 |
+
"tpp_threshold_10_unintended_diff_only": 0.005049991607666016,
|
135 |
+
"tpp_threshold_20_total_metric": 0.019700002670288087,
|
136 |
+
"tpp_threshold_20_intended_diff_only": 0.02659999132156372,
|
137 |
+
"tpp_threshold_20_unintended_diff_only": 0.006899988651275635,
|
138 |
+
"tpp_threshold_50_total_metric": 0.0659000039100647,
|
139 |
+
"tpp_threshold_50_intended_diff_only": 0.07539999485015869,
|
140 |
+
"tpp_threshold_50_unintended_diff_only": 0.009499990940093994,
|
141 |
+
"tpp_threshold_100_total_metric": 0.12865001559257508,
|
142 |
+
"tpp_threshold_100_intended_diff_only": 0.1440000057220459,
|
143 |
+
"tpp_threshold_100_unintended_diff_only": 0.015349990129470826,
|
144 |
+
"tpp_threshold_500_total_metric": 0.32680000364780426,
|
145 |
+
"tpp_threshold_500_intended_diff_only": 0.3679999947547913,
|
146 |
+
"tpp_threshold_500_unintended_diff_only": 0.041199991106987
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
150 |
+
"sae_lens_id": "custom_sae",
|
151 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0",
|
152 |
+
"sae_lens_version": "5.4.2",
|
153 |
+
"sae_cfg_dict": {
|
154 |
+
"model_name": "gemma-2-2b",
|
155 |
+
"d_in": 2304,
|
156 |
+
"d_sae": 65536,
|
157 |
+
"hook_layer": 12,
|
158 |
+
"hook_name": "blocks.12.hook_resid_post",
|
159 |
+
"context_size": null,
|
160 |
+
"hook_head_index": null,
|
161 |
+
"architecture": "topk",
|
162 |
+
"apply_b_dec_to_input": null,
|
163 |
+
"finetuning_scaling_factor": null,
|
164 |
+
"activation_fn_str": "",
|
165 |
+
"prepend_bos": true,
|
166 |
+
"normalize_activations": "none",
|
167 |
+
"dtype": "bfloat16",
|
168 |
+
"device": "",
|
169 |
+
"dataset_path": "",
|
170 |
+
"dataset_trust_remote_code": true,
|
171 |
+
"seqpos_slice": [
|
172 |
+
null
|
173 |
+
],
|
174 |
+
"training_tokens": -100000,
|
175 |
+
"sae_lens_training_version": null,
|
176 |
+
"neuronpedia_id": null
|
177 |
+
},
|
178 |
+
"eval_result_unstructured": {
|
179 |
+
"LabHC/bias_in_bios_class_set1": {
|
180 |
+
"0": {
|
181 |
+
"tpp_threshold_2_total_metric": 0.0037500113248825073,
|
182 |
+
"tpp_threshold_2_intended_diff_only": 0.008000016212463379,
|
183 |
+
"tpp_threshold_2_unintended_diff_only": 0.004250004887580872,
|
184 |
+
"tpp_threshold_5_total_metric": 0.011250019073486328,
|
185 |
+
"tpp_threshold_5_intended_diff_only": 0.016000032424926758,
|
186 |
+
"tpp_threshold_5_unintended_diff_only": 0.00475001335144043,
|
187 |
+
"tpp_threshold_10_total_metric": 0.010500013828277588,
|
188 |
+
"tpp_threshold_10_intended_diff_only": 0.016000032424926758,
|
189 |
+
"tpp_threshold_10_unintended_diff_only": 0.00550001859664917,
|
190 |
+
"tpp_threshold_20_total_metric": 0.02700003981590271,
|
191 |
+
"tpp_threshold_20_intended_diff_only": 0.030000030994415283,
|
192 |
+
"tpp_threshold_20_unintended_diff_only": 0.0029999911785125732,
|
193 |
+
"tpp_threshold_50_total_metric": 0.06300003826618195,
|
194 |
+
"tpp_threshold_50_intended_diff_only": 0.06700003147125244,
|
195 |
+
"tpp_threshold_50_unintended_diff_only": 0.003999993205070496,
|
196 |
+
"tpp_threshold_100_total_metric": 0.1420000195503235,
|
197 |
+
"tpp_threshold_100_intended_diff_only": 0.1470000147819519,
|
198 |
+
"tpp_threshold_100_unintended_diff_only": 0.004999995231628418,
|
199 |
+
"tpp_threshold_500_total_metric": 0.427000030875206,
|
200 |
+
"tpp_threshold_500_intended_diff_only": 0.44200003147125244,
|
201 |
+
"tpp_threshold_500_unintended_diff_only": 0.015000000596046448
|
202 |
+
},
|
203 |
+
"1": {
|
204 |
+
"tpp_threshold_2_total_metric": 0.00700002908706665,
|
205 |
+
"tpp_threshold_2_intended_diff_only": 0.00700002908706665,
|
206 |
+
"tpp_threshold_2_unintended_diff_only": 0.0,
|
207 |
+
"tpp_threshold_5_total_metric": 0.001999989151954651,
|
208 |
+
"tpp_threshold_5_intended_diff_only": 0.0040000081062316895,
|
209 |
+
"tpp_threshold_5_unintended_diff_only": 0.0020000189542770386,
|
210 |
+
"tpp_threshold_10_total_metric": 0.005500048398971558,
|
211 |
+
"tpp_threshold_10_intended_diff_only": 0.005000054836273193,
|
212 |
+
"tpp_threshold_10_unintended_diff_only": -0.0004999935626983643,
|
213 |
+
"tpp_threshold_20_total_metric": 0.0072500258684158325,
|
214 |
+
"tpp_threshold_20_intended_diff_only": 0.012000024318695068,
|
215 |
+
"tpp_threshold_20_unintended_diff_only": 0.004749998450279236,
|
216 |
+
"tpp_threshold_50_total_metric": 0.061750054359436035,
|
217 |
+
"tpp_threshold_50_intended_diff_only": 0.0700000524520874,
|
218 |
+
"tpp_threshold_50_unintended_diff_only": 0.008249998092651367,
|
219 |
+
"tpp_threshold_100_total_metric": 0.11425001919269562,
|
220 |
+
"tpp_threshold_100_intended_diff_only": 0.12200003862380981,
|
221 |
+
"tpp_threshold_100_unintended_diff_only": 0.007750019431114197,
|
222 |
+
"tpp_threshold_500_total_metric": 0.4257500469684601,
|
223 |
+
"tpp_threshold_500_intended_diff_only": 0.4360000491142273,
|
224 |
+
"tpp_threshold_500_unintended_diff_only": 0.010250002145767212
|
225 |
+
},
|
226 |
+
"2": {
|
227 |
+
"tpp_threshold_2_total_metric": -0.0020000338554382324,
|
228 |
+
"tpp_threshold_2_intended_diff_only": 0.0009999871253967285,
|
229 |
+
"tpp_threshold_2_unintended_diff_only": 0.003000020980834961,
|
230 |
+
"tpp_threshold_5_total_metric": 0.0057499706745147705,
|
231 |
+
"tpp_threshold_5_intended_diff_only": 0.009999990463256836,
|
232 |
+
"tpp_threshold_5_unintended_diff_only": 0.004250019788742065,
|
233 |
+
"tpp_threshold_10_total_metric": 0.02674996852874756,
|
234 |
+
"tpp_threshold_10_intended_diff_only": 0.02899998426437378,
|
235 |
+
"tpp_threshold_10_unintended_diff_only": 0.0022500157356262207,
|
236 |
+
"tpp_threshold_20_total_metric": 0.04725000262260437,
|
237 |
+
"tpp_threshold_20_intended_diff_only": 0.04900002479553223,
|
238 |
+
"tpp_threshold_20_unintended_diff_only": 0.0017500221729278564,
|
239 |
+
"tpp_threshold_50_total_metric": 0.08799996972084045,
|
240 |
+
"tpp_threshold_50_intended_diff_only": 0.08899998664855957,
|
241 |
+
"tpp_threshold_50_unintended_diff_only": 0.0010000169277191162,
|
242 |
+
"tpp_threshold_100_total_metric": 0.19200001657009125,
|
243 |
+
"tpp_threshold_100_intended_diff_only": 0.19300001859664917,
|
244 |
+
"tpp_threshold_100_unintended_diff_only": 0.0010000020265579224,
|
245 |
+
"tpp_threshold_500_total_metric": 0.43150001764297485,
|
246 |
+
"tpp_threshold_500_intended_diff_only": 0.43800002336502075,
|
247 |
+
"tpp_threshold_500_unintended_diff_only": 0.0065000057220458984
|
248 |
+
},
|
249 |
+
"6": {
|
250 |
+
"tpp_threshold_2_total_metric": 0.001999989151954651,
|
251 |
+
"tpp_threshold_2_intended_diff_only": 0.0040000081062316895,
|
252 |
+
"tpp_threshold_2_unintended_diff_only": 0.0020000189542770386,
|
253 |
+
"tpp_threshold_5_total_metric": 0.003500029444694519,
|
254 |
+
"tpp_threshold_5_intended_diff_only": 0.0020000338554382324,
|
255 |
+
"tpp_threshold_5_unintended_diff_only": -0.0014999955892562866,
|
256 |
+
"tpp_threshold_10_total_metric": 0.00449998676776886,
|
257 |
+
"tpp_threshold_10_intended_diff_only": 0.004999995231628418,
|
258 |
+
"tpp_threshold_10_unintended_diff_only": 0.0005000084638595581,
|
259 |
+
"tpp_threshold_20_total_metric": 0.0052499920129776,
|
260 |
+
"tpp_threshold_20_intended_diff_only": 0.0040000081062316895,
|
261 |
+
"tpp_threshold_20_unintended_diff_only": -0.0012499839067459106,
|
262 |
+
"tpp_threshold_50_total_metric": 0.016000032424926758,
|
263 |
+
"tpp_threshold_50_intended_diff_only": 0.017000019550323486,
|
264 |
+
"tpp_threshold_50_unintended_diff_only": 0.0009999871253967285,
|
265 |
+
"tpp_threshold_100_total_metric": 0.1157500147819519,
|
266 |
+
"tpp_threshold_100_intended_diff_only": 0.11900001764297485,
|
267 |
+
"tpp_threshold_100_unintended_diff_only": 0.0032500028610229492,
|
268 |
+
"tpp_threshold_500_total_metric": 0.42000001668930054,
|
269 |
+
"tpp_threshold_500_intended_diff_only": 0.42900002002716064,
|
270 |
+
"tpp_threshold_500_unintended_diff_only": 0.009000003337860107
|
271 |
+
},
|
272 |
+
"9": {
|
273 |
+
"tpp_threshold_2_total_metric": 0.004499942064285278,
|
274 |
+
"tpp_threshold_2_intended_diff_only": 0.007999956607818604,
|
275 |
+
"tpp_threshold_2_unintended_diff_only": 0.003500014543533325,
|
276 |
+
"tpp_threshold_5_total_metric": 0.006999954581260681,
|
277 |
+
"tpp_threshold_5_intended_diff_only": 0.011999964714050293,
|
278 |
+
"tpp_threshold_5_unintended_diff_only": 0.005000010132789612,
|
279 |
+
"tpp_threshold_10_total_metric": 0.009499996900558472,
|
280 |
+
"tpp_threshold_10_intended_diff_only": 0.014999985694885254,
|
281 |
+
"tpp_threshold_10_unintended_diff_only": 0.005499988794326782,
|
282 |
+
"tpp_threshold_20_total_metric": 0.0457499623298645,
|
283 |
+
"tpp_threshold_20_intended_diff_only": 0.05299997329711914,
|
284 |
+
"tpp_threshold_20_unintended_diff_only": 0.007250010967254639,
|
285 |
+
"tpp_threshold_50_total_metric": 0.11599995195865631,
|
286 |
+
"tpp_threshold_50_intended_diff_only": 0.12299996614456177,
|
287 |
+
"tpp_threshold_50_unintended_diff_only": 0.0070000141859054565,
|
288 |
+
"tpp_threshold_100_total_metric": 0.2357500046491623,
|
289 |
+
"tpp_threshold_100_intended_diff_only": 0.24500000476837158,
|
290 |
+
"tpp_threshold_100_unintended_diff_only": 0.00925000011920929,
|
291 |
+
"tpp_threshold_500_total_metric": 0.46025000512599945,
|
292 |
+
"tpp_threshold_500_intended_diff_only": 0.47600001096725464,
|
293 |
+
"tpp_threshold_500_unintended_diff_only": 0.015750005841255188
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"canrager/amazon_reviews_mcauley_1and5": {
|
297 |
+
"1": {
|
298 |
+
"tpp_threshold_2_total_metric": 0.00475001335144043,
|
299 |
+
"tpp_threshold_2_intended_diff_only": 0.009000003337860107,
|
300 |
+
"tpp_threshold_2_unintended_diff_only": 0.004249989986419678,
|
301 |
+
"tpp_threshold_5_total_metric": 0.0020000040531158447,
|
302 |
+
"tpp_threshold_5_intended_diff_only": 0.004999995231628418,
|
303 |
+
"tpp_threshold_5_unintended_diff_only": 0.0029999911785125732,
|
304 |
+
"tpp_threshold_10_total_metric": 0.004999950528144836,
|
305 |
+
"tpp_threshold_10_intended_diff_only": 0.012999951839447021,
|
306 |
+
"tpp_threshold_10_unintended_diff_only": 0.008000001311302185,
|
307 |
+
"tpp_threshold_20_total_metric": 0.005249962210655212,
|
308 |
+
"tpp_threshold_20_intended_diff_only": 0.011999964714050293,
|
309 |
+
"tpp_threshold_20_unintended_diff_only": 0.006750002503395081,
|
310 |
+
"tpp_threshold_50_total_metric": 0.01299998164176941,
|
311 |
+
"tpp_threshold_50_intended_diff_only": 0.014999985694885254,
|
312 |
+
"tpp_threshold_50_unintended_diff_only": 0.0020000040531158447,
|
313 |
+
"tpp_threshold_100_total_metric": 0.022250011563301086,
|
314 |
+
"tpp_threshold_100_intended_diff_only": 0.03700000047683716,
|
315 |
+
"tpp_threshold_100_unintended_diff_only": 0.014749988913536072,
|
316 |
+
"tpp_threshold_500_total_metric": 0.2149999886751175,
|
317 |
+
"tpp_threshold_500_intended_diff_only": 0.2749999761581421,
|
318 |
+
"tpp_threshold_500_unintended_diff_only": 0.0599999874830246
|
319 |
+
},
|
320 |
+
"2": {
|
321 |
+
"tpp_threshold_2_total_metric": 0.0037499964237213135,
|
322 |
+
"tpp_threshold_2_intended_diff_only": 0.004999995231628418,
|
323 |
+
"tpp_threshold_2_unintended_diff_only": 0.0012499988079071045,
|
324 |
+
"tpp_threshold_5_total_metric": 0.0007499754428863525,
|
325 |
+
"tpp_threshold_5_intended_diff_only": 0.007999956607818604,
|
326 |
+
"tpp_threshold_5_unintended_diff_only": 0.007249981164932251,
|
327 |
+
"tpp_threshold_10_total_metric": 0.014500007033348083,
|
328 |
+
"tpp_threshold_10_intended_diff_only": 0.018999993801116943,
|
329 |
+
"tpp_threshold_10_unintended_diff_only": 0.00449998676776886,
|
330 |
+
"tpp_threshold_20_total_metric": 0.032000020146369934,
|
331 |
+
"tpp_threshold_20_intended_diff_only": 0.03600001335144043,
|
332 |
+
"tpp_threshold_20_unintended_diff_only": 0.003999993205070496,
|
333 |
+
"tpp_threshold_50_total_metric": 0.0845000296831131,
|
334 |
+
"tpp_threshold_50_intended_diff_only": 0.09600001573562622,
|
335 |
+
"tpp_threshold_50_unintended_diff_only": 0.011499986052513123,
|
336 |
+
"tpp_threshold_100_total_metric": 0.17149998247623444,
|
337 |
+
"tpp_threshold_100_intended_diff_only": 0.19099998474121094,
|
338 |
+
"tpp_threshold_100_unintended_diff_only": 0.0195000022649765,
|
339 |
+
"tpp_threshold_500_total_metric": 0.3877500146627426,
|
340 |
+
"tpp_threshold_500_intended_diff_only": 0.41600000858306885,
|
341 |
+
"tpp_threshold_500_unintended_diff_only": 0.028249993920326233
|
342 |
+
},
|
343 |
+
"3": {
|
344 |
+
"tpp_threshold_2_total_metric": -0.01000000536441803,
|
345 |
+
"tpp_threshold_2_intended_diff_only": -0.008000016212463379,
|
346 |
+
"tpp_threshold_2_unintended_diff_only": 0.001999989151954651,
|
347 |
+
"tpp_threshold_5_total_metric": -2.9802322387695312e-08,
|
348 |
+
"tpp_threshold_5_intended_diff_only": -0.0020000338554382324,
|
349 |
+
"tpp_threshold_5_unintended_diff_only": -0.0020000040531158447,
|
350 |
+
"tpp_threshold_10_total_metric": 0.009499981999397278,
|
351 |
+
"tpp_threshold_10_intended_diff_only": 0.010999977588653564,
|
352 |
+
"tpp_threshold_10_unintended_diff_only": 0.0014999955892562866,
|
353 |
+
"tpp_threshold_20_total_metric": 0.002249985933303833,
|
354 |
+
"tpp_threshold_20_intended_diff_only": 0.011999964714050293,
|
355 |
+
"tpp_threshold_20_unintended_diff_only": 0.00974997878074646,
|
356 |
+
"tpp_threshold_50_total_metric": 0.0455000102519989,
|
357 |
+
"tpp_threshold_50_intended_diff_only": 0.0559999942779541,
|
358 |
+
"tpp_threshold_50_unintended_diff_only": 0.0104999840259552,
|
359 |
+
"tpp_threshold_100_total_metric": 0.0767500251531601,
|
360 |
+
"tpp_threshold_100_intended_diff_only": 0.0910000205039978,
|
361 |
+
"tpp_threshold_100_unintended_diff_only": 0.014249995350837708,
|
362 |
+
"tpp_threshold_500_total_metric": 0.3192499876022339,
|
363 |
+
"tpp_threshold_500_intended_diff_only": 0.3579999804496765,
|
364 |
+
"tpp_threshold_500_unintended_diff_only": 0.03874999284744263
|
365 |
+
},
|
366 |
+
"5": {
|
367 |
+
"tpp_threshold_2_total_metric": 2.9802322387695312e-08,
|
368 |
+
"tpp_threshold_2_intended_diff_only": 0.003000020980834961,
|
369 |
+
"tpp_threshold_2_unintended_diff_only": 0.0029999911785125732,
|
370 |
+
"tpp_threshold_5_total_metric": 0.006750032305717468,
|
371 |
+
"tpp_threshold_5_intended_diff_only": 0.013000011444091797,
|
372 |
+
"tpp_threshold_5_unintended_diff_only": 0.006249979138374329,
|
373 |
+
"tpp_threshold_10_total_metric": 0.004500046372413635,
|
374 |
+
"tpp_threshold_10_intended_diff_only": 0.012000024318695068,
|
375 |
+
"tpp_threshold_10_unintended_diff_only": 0.007499977946281433,
|
376 |
+
"tpp_threshold_20_total_metric": 0.01575005054473877,
|
377 |
+
"tpp_threshold_20_intended_diff_only": 0.025000035762786865,
|
378 |
+
"tpp_threshold_20_unintended_diff_only": 0.009249985218048096,
|
379 |
+
"tpp_threshold_50_total_metric": 0.08250001072883606,
|
380 |
+
"tpp_threshold_50_intended_diff_only": 0.09700000286102295,
|
381 |
+
"tpp_threshold_50_unintended_diff_only": 0.01449999213218689,
|
382 |
+
"tpp_threshold_100_total_metric": 0.15550003945827484,
|
383 |
+
"tpp_threshold_100_intended_diff_only": 0.17500001192092896,
|
384 |
+
"tpp_threshold_100_unintended_diff_only": 0.019499972462654114,
|
385 |
+
"tpp_threshold_500_total_metric": 0.35050003230571747,
|
386 |
+
"tpp_threshold_500_intended_diff_only": 0.4150000214576721,
|
387 |
+
"tpp_threshold_500_unintended_diff_only": 0.06449998915195465
|
388 |
+
},
|
389 |
+
"6": {
|
390 |
+
"tpp_threshold_2_total_metric": 0.013499975204467773,
|
391 |
+
"tpp_threshold_2_intended_diff_only": 0.01699995994567871,
|
392 |
+
"tpp_threshold_2_unintended_diff_only": 0.0034999847412109375,
|
393 |
+
"tpp_threshold_5_total_metric": 0.010999992489814758,
|
394 |
+
"tpp_threshold_5_intended_diff_only": 0.009999990463256836,
|
395 |
+
"tpp_threshold_5_unintended_diff_only": -0.0010000020265579224,
|
396 |
+
"tpp_threshold_10_total_metric": 0.020249992609024048,
|
397 |
+
"tpp_threshold_10_intended_diff_only": 0.02399998903274536,
|
398 |
+
"tpp_threshold_10_unintended_diff_only": 0.0037499964237213135,
|
399 |
+
"tpp_threshold_20_total_metric": 0.04324999451637268,
|
400 |
+
"tpp_threshold_20_intended_diff_only": 0.04799997806549072,
|
401 |
+
"tpp_threshold_20_unintended_diff_only": 0.004749983549118042,
|
402 |
+
"tpp_threshold_50_total_metric": 0.10399998724460602,
|
403 |
+
"tpp_threshold_50_intended_diff_only": 0.11299997568130493,
|
404 |
+
"tpp_threshold_50_unintended_diff_only": 0.008999988436698914,
|
405 |
+
"tpp_threshold_100_total_metric": 0.2172500193119049,
|
406 |
+
"tpp_threshold_100_intended_diff_only": 0.22600001096725464,
|
407 |
+
"tpp_threshold_100_unintended_diff_only": 0.008749991655349731,
|
408 |
+
"tpp_threshold_500_total_metric": 0.36149999499320984,
|
409 |
+
"tpp_threshold_500_intended_diff_only": 0.37599998712539673,
|
410 |
+
"tpp_threshold_500_unintended_diff_only": 0.01449999213218689
|
411 |
+
}
|
412 |
+
}
|
413 |
+
}
|
414 |
+
}
|
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "tpp",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": false,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "f2bf09ac-6740-414f-aa61-a62e38a23b92",
|
73 |
+
"datetime_epoch_millis": 1740162794981,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"tpp_metrics": {
|
76 |
+
"tpp_threshold_2_total_metric": 0.002074998617172241,
|
77 |
+
"tpp_threshold_2_intended_diff_only": 0.004600000381469726,
|
78 |
+
"tpp_threshold_2_unintended_diff_only": 0.0025250017642974854,
|
79 |
+
"tpp_threshold_5_total_metric": 0.003899991512298584,
|
80 |
+
"tpp_threshold_5_intended_diff_only": 0.0056999921798706055,
|
81 |
+
"tpp_threshold_5_unintended_diff_only": 0.0018000006675720215,
|
82 |
+
"tpp_threshold_10_total_metric": 2.5008618831634565e-05,
|
83 |
+
"tpp_threshold_10_intended_diff_only": 0.002800005674362183,
|
84 |
+
"tpp_threshold_10_unintended_diff_only": 0.002774997055530548,
|
85 |
+
"tpp_threshold_20_total_metric": 0.003749997913837433,
|
86 |
+
"tpp_threshold_20_intended_diff_only": 0.007200002670288086,
|
87 |
+
"tpp_threshold_20_unintended_diff_only": 0.0034500047564506534,
|
88 |
+
"tpp_threshold_50_total_metric": 0.012599988281726836,
|
89 |
+
"tpp_threshold_50_intended_diff_only": 0.016999995708465575,
|
90 |
+
"tpp_threshold_50_unintended_diff_only": 0.004400007426738739,
|
91 |
+
"tpp_threshold_100_total_metric": 0.02380000501871109,
|
92 |
+
"tpp_threshold_100_intended_diff_only": 0.02910000681877136,
|
93 |
+
"tpp_threshold_100_unintended_diff_only": 0.005300001800060272,
|
94 |
+
"tpp_threshold_500_total_metric": 0.07820001393556594,
|
95 |
+
"tpp_threshold_500_intended_diff_only": 0.08660001158714295,
|
96 |
+
"tpp_threshold_500_unintended_diff_only": 0.008399997651576997
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
|
102 |
+
"tpp_threshold_2_total_metric": 0.005099990963935852,
|
103 |
+
"tpp_threshold_2_intended_diff_only": 0.007200002670288086,
|
104 |
+
"tpp_threshold_2_unintended_diff_only": 0.002100011706352234,
|
105 |
+
"tpp_threshold_5_total_metric": 0.002500000596046448,
|
106 |
+
"tpp_threshold_5_intended_diff_only": 0.0048000097274780275,
|
107 |
+
"tpp_threshold_5_unintended_diff_only": 0.0023000091314315796,
|
108 |
+
"tpp_threshold_10_total_metric": 0.0022500097751617433,
|
109 |
+
"tpp_threshold_10_intended_diff_only": 0.004400014877319336,
|
110 |
+
"tpp_threshold_10_unintended_diff_only": 0.0021500051021575927,
|
111 |
+
"tpp_threshold_20_total_metric": 0.0045499980449676515,
|
112 |
+
"tpp_threshold_20_intended_diff_only": 0.007200014591217041,
|
113 |
+
"tpp_threshold_20_unintended_diff_only": 0.0026500165462493897,
|
114 |
+
"tpp_threshold_50_total_metric": 0.014249974489212036,
|
115 |
+
"tpp_threshold_50_intended_diff_only": 0.016999995708465575,
|
116 |
+
"tpp_threshold_50_unintended_diff_only": 0.00275002121925354,
|
117 |
+
"tpp_threshold_100_total_metric": 0.021900007128715517,
|
118 |
+
"tpp_threshold_100_intended_diff_only": 0.0254000186920166,
|
119 |
+
"tpp_threshold_100_unintended_diff_only": 0.0035000115633010863,
|
120 |
+
"tpp_threshold_500_total_metric": 0.07305001318454743,
|
121 |
+
"tpp_threshold_500_intended_diff_only": 0.07700002193450928,
|
122 |
+
"tpp_threshold_500_unintended_diff_only": 0.003950008749961853
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
|
126 |
+
"tpp_threshold_2_total_metric": -0.0009499937295913696,
|
127 |
+
"tpp_threshold_2_intended_diff_only": 0.0019999980926513673,
|
128 |
+
"tpp_threshold_2_unintended_diff_only": 0.002949991822242737,
|
129 |
+
"tpp_threshold_5_total_metric": 0.00529998242855072,
|
130 |
+
"tpp_threshold_5_intended_diff_only": 0.006599974632263183,
|
131 |
+
"tpp_threshold_5_unintended_diff_only": 0.0012999922037124634,
|
132 |
+
"tpp_threshold_10_total_metric": -0.002199992537498474,
|
133 |
+
"tpp_threshold_10_intended_diff_only": 0.0011999964714050292,
|
134 |
+
"tpp_threshold_10_unintended_diff_only": 0.0033999890089035033,
|
135 |
+
"tpp_threshold_20_total_metric": 0.0029499977827072144,
|
136 |
+
"tpp_threshold_20_intended_diff_only": 0.007199990749359131,
|
137 |
+
"tpp_threshold_20_unintended_diff_only": 0.004249992966651917,
|
138 |
+
"tpp_threshold_50_total_metric": 0.010950002074241637,
|
139 |
+
"tpp_threshold_50_intended_diff_only": 0.016999995708465575,
|
140 |
+
"tpp_threshold_50_unintended_diff_only": 0.006049993634223938,
|
141 |
+
"tpp_threshold_100_total_metric": 0.025700002908706665,
|
142 |
+
"tpp_threshold_100_intended_diff_only": 0.03279999494552612,
|
143 |
+
"tpp_threshold_100_unintended_diff_only": 0.007099992036819458,
|
144 |
+
"tpp_threshold_500_total_metric": 0.08335001468658447,
|
145 |
+
"tpp_threshold_500_intended_diff_only": 0.09620000123977661,
|
146 |
+
"tpp_threshold_500_unintended_diff_only": 0.012849986553192139
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
150 |
+
"sae_lens_id": "custom_sae",
|
151 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0",
|
152 |
+
"sae_lens_version": "5.4.2",
|
153 |
+
"sae_cfg_dict": {
|
154 |
+
"model_name": "gemma-2-2b",
|
155 |
+
"d_in": 2304,
|
156 |
+
"d_sae": 65536,
|
157 |
+
"hook_layer": 12,
|
158 |
+
"hook_name": "blocks.12.hook_resid_post",
|
159 |
+
"context_size": null,
|
160 |
+
"hook_head_index": null,
|
161 |
+
"architecture": "topk",
|
162 |
+
"apply_b_dec_to_input": null,
|
163 |
+
"finetuning_scaling_factor": null,
|
164 |
+
"activation_fn_str": "",
|
165 |
+
"prepend_bos": true,
|
166 |
+
"normalize_activations": "none",
|
167 |
+
"dtype": "bfloat16",
|
168 |
+
"device": "",
|
169 |
+
"dataset_path": "",
|
170 |
+
"dataset_trust_remote_code": true,
|
171 |
+
"seqpos_slice": [
|
172 |
+
null
|
173 |
+
],
|
174 |
+
"training_tokens": -100000,
|
175 |
+
"sae_lens_training_version": null,
|
176 |
+
"neuronpedia_id": null
|
177 |
+
},
|
178 |
+
"eval_result_unstructured": {
|
179 |
+
"LabHC/bias_in_bios_class_set1": {
|
180 |
+
"0": {
|
181 |
+
"tpp_threshold_2_total_metric": 0.005000010132789612,
|
182 |
+
"tpp_threshold_2_intended_diff_only": 0.00700002908706665,
|
183 |
+
"tpp_threshold_2_unintended_diff_only": 0.0020000189542770386,
|
184 |
+
"tpp_threshold_5_total_metric": 0.004249975085258484,
|
185 |
+
"tpp_threshold_5_intended_diff_only": 0.004999995231628418,
|
186 |
+
"tpp_threshold_5_unintended_diff_only": 0.0007500201463699341,
|
187 |
+
"tpp_threshold_10_total_metric": 0.0072500258684158325,
|
188 |
+
"tpp_threshold_10_intended_diff_only": 0.00700002908706665,
|
189 |
+
"tpp_threshold_10_unintended_diff_only": -0.00024999678134918213,
|
190 |
+
"tpp_threshold_20_total_metric": 0.004749998450279236,
|
191 |
+
"tpp_threshold_20_intended_diff_only": 0.008000016212463379,
|
192 |
+
"tpp_threshold_20_unintended_diff_only": 0.003250017762184143,
|
193 |
+
"tpp_threshold_50_total_metric": 0.022749990224838257,
|
194 |
+
"tpp_threshold_50_intended_diff_only": 0.02799999713897705,
|
195 |
+
"tpp_threshold_50_unintended_diff_only": 0.005250006914138794,
|
196 |
+
"tpp_threshold_100_total_metric": 0.038000017404556274,
|
197 |
+
"tpp_threshold_100_intended_diff_only": 0.04400002956390381,
|
198 |
+
"tpp_threshold_100_unintended_diff_only": 0.006000012159347534,
|
199 |
+
"tpp_threshold_500_total_metric": 0.14850004017353058,
|
200 |
+
"tpp_threshold_500_intended_diff_only": 0.15400004386901855,
|
201 |
+
"tpp_threshold_500_unintended_diff_only": 0.005500003695487976
|
202 |
+
},
|
203 |
+
"1": {
|
204 |
+
"tpp_threshold_2_total_metric": 0.00875002145767212,
|
205 |
+
"tpp_threshold_2_intended_diff_only": 0.012000024318695068,
|
206 |
+
"tpp_threshold_2_unintended_diff_only": 0.0032500028610229492,
|
207 |
+
"tpp_threshold_5_total_metric": -0.0012499988079071045,
|
208 |
+
"tpp_threshold_5_intended_diff_only": 0.0040000081062316895,
|
209 |
+
"tpp_threshold_5_unintended_diff_only": 0.005250006914138794,
|
210 |
+
"tpp_threshold_10_total_metric": -0.0009999573230743408,
|
211 |
+
"tpp_threshold_10_intended_diff_only": 0.001000046730041504,
|
212 |
+
"tpp_threshold_10_unintended_diff_only": 0.0020000040531158447,
|
213 |
+
"tpp_threshold_20_total_metric": -0.00024996697902679443,
|
214 |
+
"tpp_threshold_20_intended_diff_only": 0.001000046730041504,
|
215 |
+
"tpp_threshold_20_unintended_diff_only": 0.0012500137090682983,
|
216 |
+
"tpp_threshold_50_total_metric": -0.004250004887580872,
|
217 |
+
"tpp_threshold_50_intended_diff_only": -0.0009999871253967285,
|
218 |
+
"tpp_threshold_50_unintended_diff_only": 0.003250017762184143,
|
219 |
+
"tpp_threshold_100_total_metric": 0.007249981164932251,
|
220 |
+
"tpp_threshold_100_intended_diff_only": 0.009000003337860107,
|
221 |
+
"tpp_threshold_100_unintended_diff_only": 0.0017500221729278564,
|
222 |
+
"tpp_threshold_500_total_metric": 0.03200000524520874,
|
223 |
+
"tpp_threshold_500_intended_diff_only": 0.03700000047683716,
|
224 |
+
"tpp_threshold_500_unintended_diff_only": 0.004999995231628418
|
225 |
+
},
|
226 |
+
"2": {
|
227 |
+
"tpp_threshold_2_total_metric": 0.0062499940395355225,
|
228 |
+
"tpp_threshold_2_intended_diff_only": 0.009000003337860107,
|
229 |
+
"tpp_threshold_2_unintended_diff_only": 0.002750009298324585,
|
230 |
+
"tpp_threshold_5_total_metric": 0.0065000057220458984,
|
231 |
+
"tpp_threshold_5_intended_diff_only": 0.008000016212463379,
|
232 |
+
"tpp_threshold_5_unintended_diff_only": 0.0015000104904174805,
|
233 |
+
"tpp_threshold_10_total_metric": 0.0007500052452087402,
|
234 |
+
"tpp_threshold_10_intended_diff_only": 0.0040000081062316895,
|
235 |
+
"tpp_threshold_10_unintended_diff_only": 0.0032500028610229492,
|
236 |
+
"tpp_threshold_20_total_metric": 0.007999971508979797,
|
237 |
+
"tpp_threshold_20_intended_diff_only": 0.009999990463256836,
|
238 |
+
"tpp_threshold_20_unintended_diff_only": 0.0020000189542770386,
|
239 |
+
"tpp_threshold_50_total_metric": 0.014999955892562866,
|
240 |
+
"tpp_threshold_50_intended_diff_only": 0.015999972820281982,
|
241 |
+
"tpp_threshold_50_unintended_diff_only": 0.0010000169277191162,
|
242 |
+
"tpp_threshold_100_total_metric": 0.008500009775161743,
|
243 |
+
"tpp_threshold_100_intended_diff_only": 0.013000011444091797,
|
244 |
+
"tpp_threshold_100_unintended_diff_only": 0.004500001668930054,
|
245 |
+
"tpp_threshold_500_total_metric": 0.0455000102519989,
|
246 |
+
"tpp_threshold_500_intended_diff_only": 0.04900002479553223,
|
247 |
+
"tpp_threshold_500_unintended_diff_only": 0.003500014543533325
|
248 |
+
},
|
249 |
+
"6": {
|
250 |
+
"tpp_threshold_2_total_metric": 0.0014999806880950928,
|
251 |
+
"tpp_threshold_2_intended_diff_only": 0.0009999871253967285,
|
252 |
+
"tpp_threshold_2_unintended_diff_only": -0.0004999935626983643,
|
253 |
+
"tpp_threshold_5_total_metric": -0.0004999637603759766,
|
254 |
+
"tpp_threshold_5_intended_diff_only": 0.0020000338554382324,
|
255 |
+
"tpp_threshold_5_unintended_diff_only": 0.002499997615814209,
|
256 |
+
"tpp_threshold_10_total_metric": 0.0032500028610229492,
|
257 |
+
"tpp_threshold_10_intended_diff_only": 0.0040000081062316895,
|
258 |
+
"tpp_threshold_10_unintended_diff_only": 0.0007500052452087402,
|
259 |
+
"tpp_threshold_20_total_metric": -0.0004999935626983643,
|
260 |
+
"tpp_threshold_20_intended_diff_only": 0.003000020980834961,
|
261 |
+
"tpp_threshold_20_unintended_diff_only": 0.003500014543533325,
|
262 |
+
"tpp_threshold_50_total_metric": 0.0027499794960021973,
|
263 |
+
"tpp_threshold_50_intended_diff_only": 0.0040000081062316895,
|
264 |
+
"tpp_threshold_50_unintended_diff_only": 0.0012500286102294922,
|
265 |
+
"tpp_threshold_100_total_metric": 0.0015000253915786743,
|
266 |
+
"tpp_threshold_100_intended_diff_only": 0.0020000338554382324,
|
267 |
+
"tpp_threshold_100_unintended_diff_only": 0.0005000084638595581,
|
268 |
+
"tpp_threshold_500_total_metric": 0.004250019788742065,
|
269 |
+
"tpp_threshold_500_intended_diff_only": 0.00700002908706665,
|
270 |
+
"tpp_threshold_500_unintended_diff_only": 0.002750009298324585
|
271 |
+
},
|
272 |
+
"9": {
|
273 |
+
"tpp_threshold_2_total_metric": 0.003999948501586914,
|
274 |
+
"tpp_threshold_2_intended_diff_only": 0.006999969482421875,
|
275 |
+
"tpp_threshold_2_unintended_diff_only": 0.003000020980834961,
|
276 |
+
"tpp_threshold_5_total_metric": 0.0034999847412109375,
|
277 |
+
"tpp_threshold_5_intended_diff_only": 0.004999995231628418,
|
278 |
+
"tpp_threshold_5_unintended_diff_only": 0.0015000104904174805,
|
279 |
+
"tpp_threshold_10_total_metric": 0.0009999722242355347,
|
280 |
+
"tpp_threshold_10_intended_diff_only": 0.0059999823570251465,
|
281 |
+
"tpp_threshold_10_unintended_diff_only": 0.005000010132789612,
|
282 |
+
"tpp_threshold_20_total_metric": 0.010749980807304382,
|
283 |
+
"tpp_threshold_20_intended_diff_only": 0.013999998569488525,
|
284 |
+
"tpp_threshold_20_unintended_diff_only": 0.003250017762184143,
|
285 |
+
"tpp_threshold_50_total_metric": 0.03499995172023773,
|
286 |
+
"tpp_threshold_50_intended_diff_only": 0.03799998760223389,
|
287 |
+
"tpp_threshold_50_unintended_diff_only": 0.003000035881996155,
|
288 |
+
"tpp_threshold_100_total_metric": 0.05425000190734863,
|
289 |
+
"tpp_threshold_100_intended_diff_only": 0.05900001525878906,
|
290 |
+
"tpp_threshold_100_unintended_diff_only": 0.00475001335144043,
|
291 |
+
"tpp_threshold_500_total_metric": 0.13499999046325684,
|
292 |
+
"tpp_threshold_500_intended_diff_only": 0.1380000114440918,
|
293 |
+
"tpp_threshold_500_unintended_diff_only": 0.003000020980834961
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"canrager/amazon_reviews_mcauley_1and5": {
|
297 |
+
"1": {
|
298 |
+
"tpp_threshold_2_total_metric": 0.0007500052452087402,
|
299 |
+
"tpp_threshold_2_intended_diff_only": 0.004999995231628418,
|
300 |
+
"tpp_threshold_2_unintended_diff_only": 0.004249989986419678,
|
301 |
+
"tpp_threshold_5_total_metric": 0.0037499815225601196,
|
302 |
+
"tpp_threshold_5_intended_diff_only": 0.006999969482421875,
|
303 |
+
"tpp_threshold_5_unintended_diff_only": 0.0032499879598617554,
|
304 |
+
"tpp_threshold_10_total_metric": -0.0002499818801879883,
|
305 |
+
"tpp_threshold_10_intended_diff_only": 0.004999995231628418,
|
306 |
+
"tpp_threshold_10_unintended_diff_only": 0.005249977111816406,
|
307 |
+
"tpp_threshold_20_total_metric": -0.003250032663345337,
|
308 |
+
"tpp_threshold_20_intended_diff_only": 0.0029999613761901855,
|
309 |
+
"tpp_threshold_20_unintended_diff_only": 0.0062499940395355225,
|
310 |
+
"tpp_threshold_50_total_metric": 0.0009999871253967285,
|
311 |
+
"tpp_threshold_50_intended_diff_only": 0.006999969482421875,
|
312 |
+
"tpp_threshold_50_unintended_diff_only": 0.0059999823570251465,
|
313 |
+
"tpp_threshold_100_total_metric": 0.002499997615814209,
|
314 |
+
"tpp_threshold_100_intended_diff_only": 0.0059999823570251465,
|
315 |
+
"tpp_threshold_100_unintended_diff_only": 0.0034999847412109375,
|
316 |
+
"tpp_threshold_500_total_metric": 0.015999972820281982,
|
317 |
+
"tpp_threshold_500_intended_diff_only": 0.02199995517730713,
|
318 |
+
"tpp_threshold_500_unintended_diff_only": 0.0059999823570251465
|
319 |
+
},
|
320 |
+
"2": {
|
321 |
+
"tpp_threshold_2_total_metric": 0.0005000084638595581,
|
322 |
+
"tpp_threshold_2_intended_diff_only": 0.004999995231628418,
|
323 |
+
"tpp_threshold_2_unintended_diff_only": 0.00449998676776886,
|
324 |
+
"tpp_threshold_5_total_metric": 0.012999966740608215,
|
325 |
+
"tpp_threshold_5_intended_diff_only": 0.011999964714050293,
|
326 |
+
"tpp_threshold_5_unintended_diff_only": -0.0010000020265579224,
|
327 |
+
"tpp_threshold_10_total_metric": 0.004749983549118042,
|
328 |
+
"tpp_threshold_10_intended_diff_only": 0.006999969482421875,
|
329 |
+
"tpp_threshold_10_unintended_diff_only": 0.002249985933303833,
|
330 |
+
"tpp_threshold_20_total_metric": 0.013750016689300537,
|
331 |
+
"tpp_threshold_20_intended_diff_only": 0.018000006675720215,
|
332 |
+
"tpp_threshold_20_unintended_diff_only": 0.004249989986419678,
|
333 |
+
"tpp_threshold_50_total_metric": 0.0062499940395355225,
|
334 |
+
"tpp_threshold_50_intended_diff_only": 0.013999998569488525,
|
335 |
+
"tpp_threshold_50_unintended_diff_only": 0.007750004529953003,
|
336 |
+
"tpp_threshold_100_total_metric": 0.023999974131584167,
|
337 |
+
"tpp_threshold_100_intended_diff_only": 0.029999971389770508,
|
338 |
+
"tpp_threshold_100_unintended_diff_only": 0.00599999725818634,
|
339 |
+
"tpp_threshold_500_total_metric": 0.07625000178813934,
|
340 |
+
"tpp_threshold_500_intended_diff_only": 0.08899998664855957,
|
341 |
+
"tpp_threshold_500_unintended_diff_only": 0.012749984860420227
|
342 |
+
},
|
343 |
+
"3": {
|
344 |
+
"tpp_threshold_2_total_metric": 0.0004999637603759766,
|
345 |
+
"tpp_threshold_2_intended_diff_only": -0.0020000338554382324,
|
346 |
+
"tpp_threshold_2_unintended_diff_only": -0.002499997615814209,
|
347 |
+
"tpp_threshold_5_total_metric": -0.003500029444694519,
|
348 |
+
"tpp_threshold_5_intended_diff_only": -0.0020000338554382324,
|
349 |
+
"tpp_threshold_5_unintended_diff_only": 0.0014999955892562866,
|
350 |
+
"tpp_threshold_10_total_metric": -0.008749991655349731,
|
351 |
+
"tpp_threshold_10_intended_diff_only": -0.004999995231628418,
|
352 |
+
"tpp_threshold_10_unintended_diff_only": 0.0037499964237213135,
|
353 |
+
"tpp_threshold_20_total_metric": -0.000500023365020752,
|
354 |
+
"tpp_threshold_20_intended_diff_only": 0.001999974250793457,
|
355 |
+
"tpp_threshold_20_unintended_diff_only": 0.002499997615814209,
|
356 |
+
"tpp_threshold_50_total_metric": 0.011249974370002747,
|
357 |
+
"tpp_threshold_50_intended_diff_only": 0.011999964714050293,
|
358 |
+
"tpp_threshold_50_unintended_diff_only": 0.0007499903440475464,
|
359 |
+
"tpp_threshold_100_total_metric": 0.024500012397766113,
|
360 |
+
"tpp_threshold_100_intended_diff_only": 0.027000010013580322,
|
361 |
+
"tpp_threshold_100_unintended_diff_only": 0.002499997615814209,
|
362 |
+
"tpp_threshold_500_total_metric": 0.07074999809265137,
|
363 |
+
"tpp_threshold_500_intended_diff_only": 0.08300000429153442,
|
364 |
+
"tpp_threshold_500_unintended_diff_only": 0.012250006198883057
|
365 |
+
},
|
366 |
+
"5": {
|
367 |
+
"tpp_threshold_2_total_metric": -0.014499962329864502,
|
368 |
+
"tpp_threshold_2_intended_diff_only": -0.006999969482421875,
|
369 |
+
"tpp_threshold_2_unintended_diff_only": 0.007499992847442627,
|
370 |
+
"tpp_threshold_5_total_metric": -0.008249983191490173,
|
371 |
+
"tpp_threshold_5_intended_diff_only": -0.004999995231628418,
|
372 |
+
"tpp_threshold_5_unintended_diff_only": 0.0032499879598617554,
|
373 |
+
"tpp_threshold_10_total_metric": -0.012499943375587463,
|
374 |
+
"tpp_threshold_10_intended_diff_only": -0.007999956607818604,
|
375 |
+
"tpp_threshold_10_unintended_diff_only": 0.00449998676776886,
|
376 |
+
"tpp_threshold_20_total_metric": -0.009499967098236084,
|
377 |
+
"tpp_threshold_20_intended_diff_only": -0.0009999871253967285,
|
378 |
+
"tpp_threshold_20_unintended_diff_only": 0.008499979972839355,
|
379 |
+
"tpp_threshold_50_total_metric": 0.004000052809715271,
|
380 |
+
"tpp_threshold_50_intended_diff_only": 0.01500004529953003,
|
381 |
+
"tpp_threshold_50_unintended_diff_only": 0.010999992489814758,
|
382 |
+
"tpp_threshold_100_total_metric": 0.01875002682209015,
|
383 |
+
"tpp_threshold_100_intended_diff_only": 0.03600001335144043,
|
384 |
+
"tpp_threshold_100_unintended_diff_only": 0.01724998652935028,
|
385 |
+
"tpp_threshold_500_total_metric": 0.09225007891654968,
|
386 |
+
"tpp_threshold_500_intended_diff_only": 0.11200004816055298,
|
387 |
+
"tpp_threshold_500_unintended_diff_only": 0.019749969244003296
|
388 |
+
},
|
389 |
+
"6": {
|
390 |
+
"tpp_threshold_2_total_metric": 0.008000016212463379,
|
391 |
+
"tpp_threshold_2_intended_diff_only": 0.009000003337860107,
|
392 |
+
"tpp_threshold_2_unintended_diff_only": 0.0009999871253967285,
|
393 |
+
"tpp_threshold_5_total_metric": 0.02149997651576996,
|
394 |
+
"tpp_threshold_5_intended_diff_only": 0.0209999680519104,
|
395 |
+
"tpp_threshold_5_unintended_diff_only": -0.0005000084638595581,
|
396 |
+
"tpp_threshold_10_total_metric": 0.0057499706745147705,
|
397 |
+
"tpp_threshold_10_intended_diff_only": 0.006999969482421875,
|
398 |
+
"tpp_threshold_10_unintended_diff_only": 0.0012499988079071045,
|
399 |
+
"tpp_threshold_20_total_metric": 0.014249995350837708,
|
400 |
+
"tpp_threshold_20_intended_diff_only": 0.013999998569488525,
|
401 |
+
"tpp_threshold_20_unintended_diff_only": -0.00024999678134918213,
|
402 |
+
"tpp_threshold_50_total_metric": 0.03225000202655792,
|
403 |
+
"tpp_threshold_50_intended_diff_only": 0.03700000047683716,
|
404 |
+
"tpp_threshold_50_unintended_diff_only": 0.004749998450279236,
|
405 |
+
"tpp_threshold_100_total_metric": 0.058750003576278687,
|
406 |
+
"tpp_threshold_100_intended_diff_only": 0.06499999761581421,
|
407 |
+
"tpp_threshold_100_unintended_diff_only": 0.0062499940395355225,
|
408 |
+
"tpp_threshold_500_total_metric": 0.1615000218153,
|
409 |
+
"tpp_threshold_500_intended_diff_only": 0.17500001192092896,
|
410 |
+
"tpp_threshold_500_unintended_diff_only": 0.013499990105628967
|
411 |
+
}
|
412 |
+
}
|
413 |
+
}
|
414 |
+
}
|
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "tpp",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": false,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "072d027b-019d-4c30-82dd-a58126cb07ee",
|
73 |
+
"datetime_epoch_millis": 1740163113677,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"tpp_metrics": {
|
76 |
+
"tpp_threshold_2_total_metric": 0.0036749929189682003,
|
77 |
+
"tpp_threshold_2_intended_diff_only": 0.00629999041557312,
|
78 |
+
"tpp_threshold_2_unintended_diff_only": 0.002624997496604919,
|
79 |
+
"tpp_threshold_5_total_metric": 0.005574998259544372,
|
80 |
+
"tpp_threshold_5_intended_diff_only": 0.008499997854232787,
|
81 |
+
"tpp_threshold_5_unintended_diff_only": 0.0029249995946884154,
|
82 |
+
"tpp_threshold_10_total_metric": 0.009825007617473602,
|
83 |
+
"tpp_threshold_10_intended_diff_only": 0.013400006294250488,
|
84 |
+
"tpp_threshold_10_unintended_diff_only": 0.003574998676776886,
|
85 |
+
"tpp_threshold_20_total_metric": 0.014200010895729066,
|
86 |
+
"tpp_threshold_20_intended_diff_only": 0.01830000877380371,
|
87 |
+
"tpp_threshold_20_unintended_diff_only": 0.004099997878074646,
|
88 |
+
"tpp_threshold_50_total_metric": 0.027800002694129942,
|
89 |
+
"tpp_threshold_50_intended_diff_only": 0.033399999141693115,
|
90 |
+
"tpp_threshold_50_unintended_diff_only": 0.005599996447563172,
|
91 |
+
"tpp_threshold_100_total_metric": 0.04107500612735748,
|
92 |
+
"tpp_threshold_100_intended_diff_only": 0.04900000095367432,
|
93 |
+
"tpp_threshold_100_unintended_diff_only": 0.007924994826316834,
|
94 |
+
"tpp_threshold_500_total_metric": 0.1252000018954277,
|
95 |
+
"tpp_threshold_500_intended_diff_only": 0.13830000162124634,
|
96 |
+
"tpp_threshold_500_unintended_diff_only": 0.013099999725818634
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
|
102 |
+
"tpp_threshold_2_total_metric": 0.006199979782104492,
|
103 |
+
"tpp_threshold_2_intended_diff_only": 0.008199989795684814,
|
104 |
+
"tpp_threshold_2_unintended_diff_only": 0.002000010013580322,
|
105 |
+
"tpp_threshold_5_total_metric": 0.007949993014335632,
|
106 |
+
"tpp_threshold_5_intended_diff_only": 0.010800004005432129,
|
107 |
+
"tpp_threshold_5_unintended_diff_only": 0.0028500109910964966,
|
108 |
+
"tpp_threshold_10_total_metric": 0.012450012564659118,
|
109 |
+
"tpp_threshold_10_intended_diff_only": 0.014600014686584473,
|
110 |
+
"tpp_threshold_10_unintended_diff_only": 0.002150002121925354,
|
111 |
+
"tpp_threshold_20_total_metric": 0.018750008940696717,
|
112 |
+
"tpp_threshold_20_intended_diff_only": 0.02100001573562622,
|
113 |
+
"tpp_threshold_20_unintended_diff_only": 0.0022500067949295043,
|
114 |
+
"tpp_threshold_50_total_metric": 0.03235000073909759,
|
115 |
+
"tpp_threshold_50_intended_diff_only": 0.03760000467300415,
|
116 |
+
"tpp_threshold_50_unintended_diff_only": 0.005250003933906555,
|
117 |
+
"tpp_threshold_100_total_metric": 0.05085000991821289,
|
118 |
+
"tpp_threshold_100_intended_diff_only": 0.05620001554489136,
|
119 |
+
"tpp_threshold_100_unintended_diff_only": 0.005350005626678467,
|
120 |
+
"tpp_threshold_500_total_metric": 0.1507999986410141,
|
121 |
+
"tpp_threshold_500_intended_diff_only": 0.15760000944137573,
|
122 |
+
"tpp_threshold_500_unintended_diff_only": 0.006800010800361633
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
|
126 |
+
"tpp_threshold_2_total_metric": 0.0011500060558319093,
|
127 |
+
"tpp_threshold_2_intended_diff_only": 0.004399991035461426,
|
128 |
+
"tpp_threshold_2_unintended_diff_only": 0.0032499849796295164,
|
129 |
+
"tpp_threshold_5_total_metric": 0.003200003504753113,
|
130 |
+
"tpp_threshold_5_intended_diff_only": 0.006199991703033448,
|
131 |
+
"tpp_threshold_5_unintended_diff_only": 0.0029999881982803343,
|
132 |
+
"tpp_threshold_10_total_metric": 0.007200002670288086,
|
133 |
+
"tpp_threshold_10_intended_diff_only": 0.012199997901916504,
|
134 |
+
"tpp_threshold_10_unintended_diff_only": 0.004999995231628418,
|
135 |
+
"tpp_threshold_20_total_metric": 0.009650012850761414,
|
136 |
+
"tpp_threshold_20_intended_diff_only": 0.015600001811981202,
|
137 |
+
"tpp_threshold_20_unintended_diff_only": 0.005949988961219788,
|
138 |
+
"tpp_threshold_50_total_metric": 0.02325000464916229,
|
139 |
+
"tpp_threshold_50_intended_diff_only": 0.02919999361038208,
|
140 |
+
"tpp_threshold_50_unintended_diff_only": 0.005949988961219788,
|
141 |
+
"tpp_threshold_100_total_metric": 0.03130000233650208,
|
142 |
+
"tpp_threshold_100_intended_diff_only": 0.04179998636245728,
|
143 |
+
"tpp_threshold_100_unintended_diff_only": 0.0104999840259552,
|
144 |
+
"tpp_threshold_500_total_metric": 0.09960000514984131,
|
145 |
+
"tpp_threshold_500_intended_diff_only": 0.11899999380111695,
|
146 |
+
"tpp_threshold_500_unintended_diff_only": 0.019399988651275634
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
150 |
+
"sae_lens_id": "custom_sae",
|
151 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0",
|
152 |
+
"sae_lens_version": "5.4.2",
|
153 |
+
"sae_cfg_dict": {
|
154 |
+
"model_name": "gemma-2-2b",
|
155 |
+
"d_in": 2304,
|
156 |
+
"d_sae": 65536,
|
157 |
+
"hook_layer": 12,
|
158 |
+
"hook_name": "blocks.12.hook_resid_post",
|
159 |
+
"context_size": null,
|
160 |
+
"hook_head_index": null,
|
161 |
+
"architecture": "topk",
|
162 |
+
"apply_b_dec_to_input": null,
|
163 |
+
"finetuning_scaling_factor": null,
|
164 |
+
"activation_fn_str": "",
|
165 |
+
"prepend_bos": true,
|
166 |
+
"normalize_activations": "none",
|
167 |
+
"dtype": "bfloat16",
|
168 |
+
"device": "",
|
169 |
+
"dataset_path": "",
|
170 |
+
"dataset_trust_remote_code": true,
|
171 |
+
"seqpos_slice": [
|
172 |
+
null
|
173 |
+
],
|
174 |
+
"training_tokens": -100000,
|
175 |
+
"sae_lens_training_version": null,
|
176 |
+
"neuronpedia_id": null
|
177 |
+
},
|
178 |
+
"eval_result_unstructured": {
|
179 |
+
"LabHC/bias_in_bios_class_set1": {
|
180 |
+
"0": {
|
181 |
+
"tpp_threshold_2_total_metric": 0.01124998927116394,
|
182 |
+
"tpp_threshold_2_intended_diff_only": 0.013999998569488525,
|
183 |
+
"tpp_threshold_2_unintended_diff_only": 0.002750009298324585,
|
184 |
+
"tpp_threshold_5_total_metric": 0.014999985694885254,
|
185 |
+
"tpp_threshold_5_intended_diff_only": 0.018999993801116943,
|
186 |
+
"tpp_threshold_5_unintended_diff_only": 0.0040000081062316895,
|
187 |
+
"tpp_threshold_10_total_metric": 0.010249987244606018,
|
188 |
+
"tpp_threshold_10_intended_diff_only": 0.013999998569488525,
|
189 |
+
"tpp_threshold_10_unintended_diff_only": 0.0037500113248825073,
|
190 |
+
"tpp_threshold_20_total_metric": 0.021500006318092346,
|
191 |
+
"tpp_threshold_20_intended_diff_only": 0.023000001907348633,
|
192 |
+
"tpp_threshold_20_unintended_diff_only": 0.0014999955892562866,
|
193 |
+
"tpp_threshold_50_total_metric": 0.04150000214576721,
|
194 |
+
"tpp_threshold_50_intended_diff_only": 0.05500000715255737,
|
195 |
+
"tpp_threshold_50_unintended_diff_only": 0.013500005006790161,
|
196 |
+
"tpp_threshold_100_total_metric": 0.06099998950958252,
|
197 |
+
"tpp_threshold_100_intended_diff_only": 0.07400000095367432,
|
198 |
+
"tpp_threshold_100_unintended_diff_only": 0.013000011444091797,
|
199 |
+
"tpp_threshold_500_total_metric": 0.19425000250339508,
|
200 |
+
"tpp_threshold_500_intended_diff_only": 0.2070000171661377,
|
201 |
+
"tpp_threshold_500_unintended_diff_only": 0.012750014662742615
|
202 |
+
},
|
203 |
+
"1": {
|
204 |
+
"tpp_threshold_2_total_metric": 0.009250015020370483,
|
205 |
+
"tpp_threshold_2_intended_diff_only": 0.009000003337860107,
|
206 |
+
"tpp_threshold_2_unintended_diff_only": -0.000250011682510376,
|
207 |
+
"tpp_threshold_5_total_metric": 0.008250042796134949,
|
208 |
+
"tpp_threshold_5_intended_diff_only": 0.010000050067901611,
|
209 |
+
"tpp_threshold_5_unintended_diff_only": 0.0017500072717666626,
|
210 |
+
"tpp_threshold_10_total_metric": 0.006750047206878662,
|
211 |
+
"tpp_threshold_10_intended_diff_only": 0.006000041961669922,
|
212 |
+
"tpp_threshold_10_unintended_diff_only": -0.0007500052452087402,
|
213 |
+
"tpp_threshold_20_total_metric": 0.004000052809715271,
|
214 |
+
"tpp_threshold_20_intended_diff_only": 0.006000041961669922,
|
215 |
+
"tpp_threshold_20_unintended_diff_only": 0.001999989151954651,
|
216 |
+
"tpp_threshold_50_total_metric": 0.006000041961669922,
|
217 |
+
"tpp_threshold_50_intended_diff_only": 0.01100003719329834,
|
218 |
+
"tpp_threshold_50_unintended_diff_only": 0.004999995231628418,
|
219 |
+
"tpp_threshold_100_total_metric": 0.018000051379203796,
|
220 |
+
"tpp_threshold_100_intended_diff_only": 0.024000048637390137,
|
221 |
+
"tpp_threshold_100_unintended_diff_only": 0.00599999725818634,
|
222 |
+
"tpp_threshold_500_total_metric": 0.09325002133846283,
|
223 |
+
"tpp_threshold_500_intended_diff_only": 0.10000002384185791,
|
224 |
+
"tpp_threshold_500_unintended_diff_only": 0.006750002503395081
|
225 |
+
},
|
226 |
+
"2": {
|
227 |
+
"tpp_threshold_2_total_metric": 0.006749957799911499,
|
228 |
+
"tpp_threshold_2_intended_diff_only": 0.010999977588653564,
|
229 |
+
"tpp_threshold_2_unintended_diff_only": 0.004250019788742065,
|
230 |
+
"tpp_threshold_5_total_metric": 0.014499962329864502,
|
231 |
+
"tpp_threshold_5_intended_diff_only": 0.019999980926513672,
|
232 |
+
"tpp_threshold_5_unintended_diff_only": 0.00550001859664917,
|
233 |
+
"tpp_threshold_10_total_metric": 0.038000017404556274,
|
234 |
+
"tpp_threshold_10_intended_diff_only": 0.04000002145767212,
|
235 |
+
"tpp_threshold_10_unintended_diff_only": 0.0020000040531158447,
|
236 |
+
"tpp_threshold_20_total_metric": 0.047749996185302734,
|
237 |
+
"tpp_threshold_20_intended_diff_only": 0.050000011920928955,
|
238 |
+
"tpp_threshold_20_unintended_diff_only": 0.0022500157356262207,
|
239 |
+
"tpp_threshold_50_total_metric": 0.06199999153614044,
|
240 |
+
"tpp_threshold_50_intended_diff_only": 0.06400001049041748,
|
241 |
+
"tpp_threshold_50_unintended_diff_only": 0.0020000189542770386,
|
242 |
+
"tpp_threshold_100_total_metric": 0.08500000834465027,
|
243 |
+
"tpp_threshold_100_intended_diff_only": 0.08700001239776611,
|
244 |
+
"tpp_threshold_100_unintended_diff_only": 0.0020000040531158447,
|
245 |
+
"tpp_threshold_500_total_metric": 0.26524999737739563,
|
246 |
+
"tpp_threshold_500_intended_diff_only": 0.2670000195503235,
|
247 |
+
"tpp_threshold_500_unintended_diff_only": 0.0017500221729278564
|
248 |
+
},
|
249 |
+
"6": {
|
250 |
+
"tpp_threshold_2_total_metric": -0.0002500265836715698,
|
251 |
+
"tpp_threshold_2_intended_diff_only": 0.0009999871253967285,
|
252 |
+
"tpp_threshold_2_unintended_diff_only": 0.0012500137090682983,
|
253 |
+
"tpp_threshold_5_total_metric": 0.000250011682510376,
|
254 |
+
"tpp_threshold_5_intended_diff_only": -0.0009999871253967285,
|
255 |
+
"tpp_threshold_5_unintended_diff_only": -0.0012499988079071045,
|
256 |
+
"tpp_threshold_10_total_metric": 0.003000006079673767,
|
257 |
+
"tpp_threshold_10_intended_diff_only": 0.0040000081062316895,
|
258 |
+
"tpp_threshold_10_unintended_diff_only": 0.0010000020265579224,
|
259 |
+
"tpp_threshold_20_total_metric": 0.003250017762184143,
|
260 |
+
"tpp_threshold_20_intended_diff_only": 0.0020000338554382324,
|
261 |
+
"tpp_threshold_20_unintended_diff_only": -0.0012499839067459106,
|
262 |
+
"tpp_threshold_50_total_metric": 0.004250004887580872,
|
263 |
+
"tpp_threshold_50_intended_diff_only": 0.004999995231628418,
|
264 |
+
"tpp_threshold_50_unintended_diff_only": 0.0007499903440475464,
|
265 |
+
"tpp_threshold_100_total_metric": 0.011750027537345886,
|
266 |
+
"tpp_threshold_100_intended_diff_only": 0.01100003719329834,
|
267 |
+
"tpp_threshold_100_unintended_diff_only": -0.0007499903440475464,
|
268 |
+
"tpp_threshold_500_total_metric": 0.015999987721443176,
|
269 |
+
"tpp_threshold_500_intended_diff_only": 0.019999980926513672,
|
270 |
+
"tpp_threshold_500_unintended_diff_only": 0.003999993205070496
|
271 |
+
},
|
272 |
+
"9": {
|
273 |
+
"tpp_threshold_2_total_metric": 0.003999963402748108,
|
274 |
+
"tpp_threshold_2_intended_diff_only": 0.0059999823570251465,
|
275 |
+
"tpp_threshold_2_unintended_diff_only": 0.0020000189542770386,
|
276 |
+
"tpp_threshold_5_total_metric": 0.001749962568283081,
|
277 |
+
"tpp_threshold_5_intended_diff_only": 0.0059999823570251465,
|
278 |
+
"tpp_threshold_5_unintended_diff_only": 0.004250019788742065,
|
279 |
+
"tpp_threshold_10_total_metric": 0.004250004887580872,
|
280 |
+
"tpp_threshold_10_intended_diff_only": 0.009000003337860107,
|
281 |
+
"tpp_threshold_10_unintended_diff_only": 0.004749998450279236,
|
282 |
+
"tpp_threshold_20_total_metric": 0.017249971628189087,
|
283 |
+
"tpp_threshold_20_intended_diff_only": 0.02399998903274536,
|
284 |
+
"tpp_threshold_20_unintended_diff_only": 0.006750017404556274,
|
285 |
+
"tpp_threshold_50_total_metric": 0.04799996316432953,
|
286 |
+
"tpp_threshold_50_intended_diff_only": 0.05299997329711914,
|
287 |
+
"tpp_threshold_50_unintended_diff_only": 0.005000010132789612,
|
288 |
+
"tpp_threshold_100_total_metric": 0.07849997282028198,
|
289 |
+
"tpp_threshold_100_intended_diff_only": 0.08499997854232788,
|
290 |
+
"tpp_threshold_100_unintended_diff_only": 0.0065000057220458984,
|
291 |
+
"tpp_threshold_500_total_metric": 0.18524998426437378,
|
292 |
+
"tpp_threshold_500_intended_diff_only": 0.1940000057220459,
|
293 |
+
"tpp_threshold_500_unintended_diff_only": 0.00875002145767212
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"canrager/amazon_reviews_mcauley_1and5": {
|
297 |
+
"1": {
|
298 |
+
"tpp_threshold_2_total_metric": 0.0062499940395355225,
|
299 |
+
"tpp_threshold_2_intended_diff_only": 0.010999977588653564,
|
300 |
+
"tpp_threshold_2_unintended_diff_only": 0.004749983549118042,
|
301 |
+
"tpp_threshold_5_total_metric": 0.0015000104904174805,
|
302 |
+
"tpp_threshold_5_intended_diff_only": 0.004999995231628418,
|
303 |
+
"tpp_threshold_5_unintended_diff_only": 0.0034999847412109375,
|
304 |
+
"tpp_threshold_10_total_metric": 0.0032499730587005615,
|
305 |
+
"tpp_threshold_10_intended_diff_only": 0.010999977588653564,
|
306 |
+
"tpp_threshold_10_unintended_diff_only": 0.007750004529953003,
|
307 |
+
"tpp_threshold_20_total_metric": 0.001749977469444275,
|
308 |
+
"tpp_threshold_20_intended_diff_only": 0.006999969482421875,
|
309 |
+
"tpp_threshold_20_unintended_diff_only": 0.0052499920129776,
|
310 |
+
"tpp_threshold_50_total_metric": 0.001499965786933899,
|
311 |
+
"tpp_threshold_50_intended_diff_only": 0.003999948501586914,
|
312 |
+
"tpp_threshold_50_unintended_diff_only": 0.002499982714653015,
|
313 |
+
"tpp_threshold_100_total_metric": 0.001749962568283081,
|
314 |
+
"tpp_threshold_100_intended_diff_only": 0.007999956607818604,
|
315 |
+
"tpp_threshold_100_unintended_diff_only": 0.0062499940395355225,
|
316 |
+
"tpp_threshold_500_total_metric": 0.027499958872795105,
|
317 |
+
"tpp_threshold_500_intended_diff_only": 0.030999958515167236,
|
318 |
+
"tpp_threshold_500_unintended_diff_only": 0.0034999996423721313
|
319 |
+
},
|
320 |
+
"2": {
|
321 |
+
"tpp_threshold_2_total_metric": 0.00475001335144043,
|
322 |
+
"tpp_threshold_2_intended_diff_only": 0.004999995231628418,
|
323 |
+
"tpp_threshold_2_unintended_diff_only": 0.0002499818801879883,
|
324 |
+
"tpp_threshold_5_total_metric": 0.001499965786933899,
|
325 |
+
"tpp_threshold_5_intended_diff_only": 0.007999956607818604,
|
326 |
+
"tpp_threshold_5_unintended_diff_only": 0.006499990820884705,
|
327 |
+
"tpp_threshold_10_total_metric": 0.008749961853027344,
|
328 |
+
"tpp_threshold_10_intended_diff_only": 0.011999964714050293,
|
329 |
+
"tpp_threshold_10_unintended_diff_only": 0.0032500028610229492,
|
330 |
+
"tpp_threshold_20_total_metric": 0.005750015377998352,
|
331 |
+
"tpp_threshold_20_intended_diff_only": 0.009000003337860107,
|
332 |
+
"tpp_threshold_20_unintended_diff_only": 0.0032499879598617554,
|
333 |
+
"tpp_threshold_50_total_metric": 0.012500002980232239,
|
334 |
+
"tpp_threshold_50_intended_diff_only": 0.018000006675720215,
|
335 |
+
"tpp_threshold_50_unintended_diff_only": 0.005500003695487976,
|
336 |
+
"tpp_threshold_100_total_metric": 0.019000008702278137,
|
337 |
+
"tpp_threshold_100_intended_diff_only": 0.02899998426437378,
|
338 |
+
"tpp_threshold_100_unintended_diff_only": 0.009999975562095642,
|
339 |
+
"tpp_threshold_500_total_metric": 0.08375000953674316,
|
340 |
+
"tpp_threshold_500_intended_diff_only": 0.09299999475479126,
|
341 |
+
"tpp_threshold_500_unintended_diff_only": 0.009249985218048096
|
342 |
+
},
|
343 |
+
"3": {
|
344 |
+
"tpp_threshold_2_total_metric": -0.00849999487400055,
|
345 |
+
"tpp_threshold_2_intended_diff_only": -0.004999995231628418,
|
346 |
+
"tpp_threshold_2_unintended_diff_only": 0.0034999996423721313,
|
347 |
+
"tpp_threshold_5_total_metric": 0.005000025033950806,
|
348 |
+
"tpp_threshold_5_intended_diff_only": 0.0040000081062316895,
|
349 |
+
"tpp_threshold_5_unintended_diff_only": -0.0010000169277191162,
|
350 |
+
"tpp_threshold_10_total_metric": 0.006750002503395081,
|
351 |
+
"tpp_threshold_10_intended_diff_only": 0.009999990463256836,
|
352 |
+
"tpp_threshold_10_unintended_diff_only": 0.0032499879598617554,
|
353 |
+
"tpp_threshold_20_total_metric": 0.003250017762184143,
|
354 |
+
"tpp_threshold_20_intended_diff_only": 0.009000003337860107,
|
355 |
+
"tpp_threshold_20_unintended_diff_only": 0.005749985575675964,
|
356 |
+
"tpp_threshold_50_total_metric": 0.016000017523765564,
|
357 |
+
"tpp_threshold_50_intended_diff_only": 0.018000006675720215,
|
358 |
+
"tpp_threshold_50_unintended_diff_only": 0.001999989151954651,
|
359 |
+
"tpp_threshold_100_total_metric": 0.023000001907348633,
|
360 |
+
"tpp_threshold_100_intended_diff_only": 0.03299999237060547,
|
361 |
+
"tpp_threshold_100_unintended_diff_only": 0.009999990463256836,
|
362 |
+
"tpp_threshold_500_total_metric": 0.0832500159740448,
|
363 |
+
"tpp_threshold_500_intended_diff_only": 0.09700000286102295,
|
364 |
+
"tpp_threshold_500_unintended_diff_only": 0.01374998688697815
|
365 |
+
},
|
366 |
+
"5": {
|
367 |
+
"tpp_threshold_2_total_metric": -0.004999950528144836,
|
368 |
+
"tpp_threshold_2_intended_diff_only": -0.001999974250793457,
|
369 |
+
"tpp_threshold_2_unintended_diff_only": 0.0029999762773513794,
|
370 |
+
"tpp_threshold_5_total_metric": -0.0009999871253967285,
|
371 |
+
"tpp_threshold_5_intended_diff_only": 0.004999995231628418,
|
372 |
+
"tpp_threshold_5_unintended_diff_only": 0.0059999823570251465,
|
373 |
+
"tpp_threshold_10_total_metric": -0.0074999332427978516,
|
374 |
+
"tpp_threshold_10_intended_diff_only": 0.001000046730041504,
|
375 |
+
"tpp_threshold_10_unintended_diff_only": 0.008499979972839355,
|
376 |
+
"tpp_threshold_20_total_metric": 0.0055000633001327515,
|
377 |
+
"tpp_threshold_20_intended_diff_only": 0.01500004529953003,
|
378 |
+
"tpp_threshold_20_unintended_diff_only": 0.009499981999397278,
|
379 |
+
"tpp_threshold_50_total_metric": 0.03925004601478577,
|
380 |
+
"tpp_threshold_50_intended_diff_only": 0.04900002479553223,
|
381 |
+
"tpp_threshold_50_unintended_diff_only": 0.00974997878074646,
|
382 |
+
"tpp_threshold_100_total_metric": 0.04725003242492676,
|
383 |
+
"tpp_threshold_100_intended_diff_only": 0.06400001049041748,
|
384 |
+
"tpp_threshold_100_unintended_diff_only": 0.016749978065490723,
|
385 |
+
"tpp_threshold_500_total_metric": 0.08625003695487976,
|
386 |
+
"tpp_threshold_500_intended_diff_only": 0.14600002765655518,
|
387 |
+
"tpp_threshold_500_unintended_diff_only": 0.059749990701675415
|
388 |
+
},
|
389 |
+
"6": {
|
390 |
+
"tpp_threshold_2_total_metric": 0.00824996829032898,
|
391 |
+
"tpp_threshold_2_intended_diff_only": 0.012999951839447021,
|
392 |
+
"tpp_threshold_2_unintended_diff_only": 0.004749983549118042,
|
393 |
+
"tpp_threshold_5_total_metric": 0.009000003337860107,
|
394 |
+
"tpp_threshold_5_intended_diff_only": 0.009000003337860107,
|
395 |
+
"tpp_threshold_5_unintended_diff_only": 0.0,
|
396 |
+
"tpp_threshold_10_total_metric": 0.024750009179115295,
|
397 |
+
"tpp_threshold_10_intended_diff_only": 0.027000010013580322,
|
398 |
+
"tpp_threshold_10_unintended_diff_only": 0.002250000834465027,
|
399 |
+
"tpp_threshold_20_total_metric": 0.031999990344047546,
|
400 |
+
"tpp_threshold_20_intended_diff_only": 0.03799998760223389,
|
401 |
+
"tpp_threshold_20_unintended_diff_only": 0.00599999725818634,
|
402 |
+
"tpp_threshold_50_total_metric": 0.046999990940093994,
|
403 |
+
"tpp_threshold_50_intended_diff_only": 0.05699998140335083,
|
404 |
+
"tpp_threshold_50_unintended_diff_only": 0.009999990463256836,
|
405 |
+
"tpp_threshold_100_total_metric": 0.06550000607967377,
|
406 |
+
"tpp_threshold_100_intended_diff_only": 0.07499998807907104,
|
407 |
+
"tpp_threshold_100_unintended_diff_only": 0.009499981999397278,
|
408 |
+
"tpp_threshold_500_total_metric": 0.2172500044107437,
|
409 |
+
"tpp_threshold_500_intended_diff_only": 0.2279999852180481,
|
410 |
+
"tpp_threshold_500_unintended_diff_only": 0.010749980807304382
|
411 |
+
}
|
412 |
+
}
|
413 |
+
}
|
414 |
+
}
|
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "tpp",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": false,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "6d523acd-6837-4bf6-8169-4ebea2aedf9e",
|
73 |
+
"datetime_epoch_millis": 1740163747509,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"tpp_metrics": {
|
76 |
+
"tpp_threshold_2_total_metric": 0.001275007426738739,
|
77 |
+
"tpp_threshold_2_intended_diff_only": 0.0037000060081481935,
|
78 |
+
"tpp_threshold_2_unintended_diff_only": 0.0024249985814094547,
|
79 |
+
"tpp_threshold_5_total_metric": 0.0007249891757965087,
|
80 |
+
"tpp_threshold_5_intended_diff_only": 0.0034999907016754154,
|
81 |
+
"tpp_threshold_5_unintended_diff_only": 0.0027750015258789064,
|
82 |
+
"tpp_threshold_10_total_metric": 0.004174999892711639,
|
83 |
+
"tpp_threshold_10_intended_diff_only": 0.0078000009059906,
|
84 |
+
"tpp_threshold_10_unintended_diff_only": 0.0036250010132789614,
|
85 |
+
"tpp_threshold_20_total_metric": 0.0060749977827072145,
|
86 |
+
"tpp_threshold_20_intended_diff_only": 0.009899997711181642,
|
87 |
+
"tpp_threshold_20_unintended_diff_only": 0.003824999928474426,
|
88 |
+
"tpp_threshold_50_total_metric": 0.016099993884563447,
|
89 |
+
"tpp_threshold_50_intended_diff_only": 0.022099995613098146,
|
90 |
+
"tpp_threshold_50_unintended_diff_only": 0.006000001728534699,
|
91 |
+
"tpp_threshold_100_total_metric": 0.02952500283718109,
|
92 |
+
"tpp_threshold_100_intended_diff_only": 0.039000004529953,
|
93 |
+
"tpp_threshold_100_unintended_diff_only": 0.009475001692771911,
|
94 |
+
"tpp_threshold_500_total_metric": 0.12557500153779982,
|
95 |
+
"tpp_threshold_500_intended_diff_only": 0.1384999990463257,
|
96 |
+
"tpp_threshold_500_unintended_diff_only": 0.012924997508525847
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
|
102 |
+
"tpp_threshold_2_total_metric": 0.0031000018119812013,
|
103 |
+
"tpp_threshold_2_intended_diff_only": 0.0048000097274780275,
|
104 |
+
"tpp_threshold_2_unintended_diff_only": 0.0017000079154968263,
|
105 |
+
"tpp_threshold_5_total_metric": 0.0022999852895736693,
|
106 |
+
"tpp_threshold_5_intended_diff_only": 0.004799997806549073,
|
107 |
+
"tpp_threshold_5_unintended_diff_only": 0.002500012516975403,
|
108 |
+
"tpp_threshold_10_total_metric": 0.006099998950958252,
|
109 |
+
"tpp_threshold_10_intended_diff_only": 0.008200013637542724,
|
110 |
+
"tpp_threshold_10_unintended_diff_only": 0.0021000146865844727,
|
111 |
+
"tpp_threshold_20_total_metric": 0.01099998652935028,
|
112 |
+
"tpp_threshold_20_intended_diff_only": 0.013199996948242188,
|
113 |
+
"tpp_threshold_20_unintended_diff_only": 0.002200010418891907,
|
114 |
+
"tpp_threshold_50_total_metric": 0.0169999897480011,
|
115 |
+
"tpp_threshold_50_intended_diff_only": 0.020399999618530274,
|
116 |
+
"tpp_threshold_50_unintended_diff_only": 0.0034000098705291746,
|
117 |
+
"tpp_threshold_100_total_metric": 0.03155000209808349,
|
118 |
+
"tpp_threshold_100_intended_diff_only": 0.035600018501281736,
|
119 |
+
"tpp_threshold_100_unintended_diff_only": 0.004050016403198242,
|
120 |
+
"tpp_threshold_500_total_metric": 0.12450000941753388,
|
121 |
+
"tpp_threshold_500_intended_diff_only": 0.13040001392364503,
|
122 |
+
"tpp_threshold_500_unintended_diff_only": 0.005900004506111145
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
|
126 |
+
"tpp_threshold_2_total_metric": -0.0005499869585037231,
|
127 |
+
"tpp_threshold_2_intended_diff_only": 0.0026000022888183595,
|
128 |
+
"tpp_threshold_2_unintended_diff_only": 0.0031499892473220827,
|
129 |
+
"tpp_threshold_5_total_metric": -0.0008500069379806519,
|
130 |
+
"tpp_threshold_5_intended_diff_only": 0.0021999835968017577,
|
131 |
+
"tpp_threshold_5_unintended_diff_only": 0.0030499905347824096,
|
132 |
+
"tpp_threshold_10_total_metric": 0.002250000834465027,
|
133 |
+
"tpp_threshold_10_intended_diff_only": 0.007399988174438476,
|
134 |
+
"tpp_threshold_10_unintended_diff_only": 0.0051499873399734495,
|
135 |
+
"tpp_threshold_20_total_metric": 0.001150009036064148,
|
136 |
+
"tpp_threshold_20_intended_diff_only": 0.006599998474121094,
|
137 |
+
"tpp_threshold_20_unintended_diff_only": 0.0054499894380569455,
|
138 |
+
"tpp_threshold_50_total_metric": 0.015199998021125793,
|
139 |
+
"tpp_threshold_50_intended_diff_only": 0.023799991607666014,
|
140 |
+
"tpp_threshold_50_unintended_diff_only": 0.008599993586540223,
|
141 |
+
"tpp_threshold_100_total_metric": 0.027500003576278687,
|
142 |
+
"tpp_threshold_100_intended_diff_only": 0.04239999055862427,
|
143 |
+
"tpp_threshold_100_unintended_diff_only": 0.01489998698234558,
|
144 |
+
"tpp_threshold_500_total_metric": 0.12664999365806578,
|
145 |
+
"tpp_threshold_500_intended_diff_only": 0.14659998416900635,
|
146 |
+
"tpp_threshold_500_unintended_diff_only": 0.01994999051094055
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
150 |
+
"sae_lens_id": "custom_sae",
|
151 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0",
|
152 |
+
"sae_lens_version": "5.4.2",
|
153 |
+
"sae_cfg_dict": {
|
154 |
+
"model_name": "gemma-2-2b",
|
155 |
+
"d_in": 2304,
|
156 |
+
"d_sae": 65536,
|
157 |
+
"hook_layer": 12,
|
158 |
+
"hook_name": "blocks.12.hook_resid_post",
|
159 |
+
"context_size": null,
|
160 |
+
"hook_head_index": null,
|
161 |
+
"architecture": "topk",
|
162 |
+
"apply_b_dec_to_input": null,
|
163 |
+
"finetuning_scaling_factor": null,
|
164 |
+
"activation_fn_str": "",
|
165 |
+
"prepend_bos": true,
|
166 |
+
"normalize_activations": "none",
|
167 |
+
"dtype": "bfloat16",
|
168 |
+
"device": "",
|
169 |
+
"dataset_path": "",
|
170 |
+
"dataset_trust_remote_code": true,
|
171 |
+
"seqpos_slice": [
|
172 |
+
null
|
173 |
+
],
|
174 |
+
"training_tokens": -100000,
|
175 |
+
"sae_lens_training_version": null,
|
176 |
+
"neuronpedia_id": null
|
177 |
+
},
|
178 |
+
"eval_result_unstructured": {
|
179 |
+
"LabHC/bias_in_bios_class_set1": {
|
180 |
+
"0": {
|
181 |
+
"tpp_threshold_2_total_metric": 0.007249996066093445,
|
182 |
+
"tpp_threshold_2_intended_diff_only": 0.009000003337860107,
|
183 |
+
"tpp_threshold_2_unintended_diff_only": 0.0017500072717666626,
|
184 |
+
"tpp_threshold_5_total_metric": 0.00925000011920929,
|
185 |
+
"tpp_threshold_5_intended_diff_only": 0.012000024318695068,
|
186 |
+
"tpp_threshold_5_unintended_diff_only": 0.002750024199485779,
|
187 |
+
"tpp_threshold_10_total_metric": 0.003000006079673767,
|
188 |
+
"tpp_threshold_10_intended_diff_only": 0.00700002908706665,
|
189 |
+
"tpp_threshold_10_unintended_diff_only": 0.004000023007392883,
|
190 |
+
"tpp_threshold_20_total_metric": 0.0169999897480011,
|
191 |
+
"tpp_threshold_20_intended_diff_only": 0.018999993801116943,
|
192 |
+
"tpp_threshold_20_unintended_diff_only": 0.0020000040531158447,
|
193 |
+
"tpp_threshold_50_total_metric": 0.028999969363212585,
|
194 |
+
"tpp_threshold_50_intended_diff_only": 0.03299999237060547,
|
195 |
+
"tpp_threshold_50_unintended_diff_only": 0.004000023007392883,
|
196 |
+
"tpp_threshold_100_total_metric": 0.05250002443790436,
|
197 |
+
"tpp_threshold_100_intended_diff_only": 0.058000028133392334,
|
198 |
+
"tpp_threshold_100_unintended_diff_only": 0.005500003695487976,
|
199 |
+
"tpp_threshold_500_total_metric": 0.23200003802776337,
|
200 |
+
"tpp_threshold_500_intended_diff_only": 0.23700004816055298,
|
201 |
+
"tpp_threshold_500_unintended_diff_only": 0.005000010132789612
|
202 |
+
},
|
203 |
+
"1": {
|
204 |
+
"tpp_threshold_2_total_metric": 0.007750034332275391,
|
205 |
+
"tpp_threshold_2_intended_diff_only": 0.00700002908706665,
|
206 |
+
"tpp_threshold_2_unintended_diff_only": -0.0007500052452087402,
|
207 |
+
"tpp_threshold_5_total_metric": 0.0015000104904174805,
|
208 |
+
"tpp_threshold_5_intended_diff_only": 0.003000020980834961,
|
209 |
+
"tpp_threshold_5_unintended_diff_only": 0.0015000104904174805,
|
210 |
+
"tpp_threshold_10_total_metric": 0.006750032305717468,
|
211 |
+
"tpp_threshold_10_intended_diff_only": 0.006000041961669922,
|
212 |
+
"tpp_threshold_10_unintended_diff_only": -0.0007499903440475464,
|
213 |
+
"tpp_threshold_20_total_metric": -0.0002499818801879883,
|
214 |
+
"tpp_threshold_20_intended_diff_only": 0.003000020980834961,
|
215 |
+
"tpp_threshold_20_unintended_diff_only": 0.0032500028610229492,
|
216 |
+
"tpp_threshold_50_total_metric": 0.002750024199485779,
|
217 |
+
"tpp_threshold_50_intended_diff_only": 0.00700002908706665,
|
218 |
+
"tpp_threshold_50_unintended_diff_only": 0.004250004887580872,
|
219 |
+
"tpp_threshold_100_total_metric": 0.0017500221729278564,
|
220 |
+
"tpp_threshold_100_intended_diff_only": 0.006000041961669922,
|
221 |
+
"tpp_threshold_100_unintended_diff_only": 0.004250019788742065,
|
222 |
+
"tpp_threshold_500_total_metric": 0.059500038623809814,
|
223 |
+
"tpp_threshold_500_intended_diff_only": 0.06700003147125244,
|
224 |
+
"tpp_threshold_500_unintended_diff_only": 0.007499992847442627
|
225 |
+
},
|
226 |
+
"2": {
|
227 |
+
"tpp_threshold_2_total_metric": -0.002750009298324585,
|
228 |
+
"tpp_threshold_2_intended_diff_only": 0.0,
|
229 |
+
"tpp_threshold_2_unintended_diff_only": 0.002750009298324585,
|
230 |
+
"tpp_threshold_5_total_metric": -0.003250032663345337,
|
231 |
+
"tpp_threshold_5_intended_diff_only": 0.0009999871253967285,
|
232 |
+
"tpp_threshold_5_unintended_diff_only": 0.004250019788742065,
|
233 |
+
"tpp_threshold_10_total_metric": 0.013499975204467773,
|
234 |
+
"tpp_threshold_10_intended_diff_only": 0.014999985694885254,
|
235 |
+
"tpp_threshold_10_unintended_diff_only": 0.0015000104904174805,
|
236 |
+
"tpp_threshold_20_total_metric": 0.02374996244907379,
|
237 |
+
"tpp_threshold_20_intended_diff_only": 0.02499997615814209,
|
238 |
+
"tpp_threshold_20_unintended_diff_only": 0.0012500137090682983,
|
239 |
+
"tpp_threshold_50_total_metric": 0.022499963641166687,
|
240 |
+
"tpp_threshold_50_intended_diff_only": 0.02499997615814209,
|
241 |
+
"tpp_threshold_50_unintended_diff_only": 0.002500012516975403,
|
242 |
+
"tpp_threshold_100_total_metric": 0.03275001049041748,
|
243 |
+
"tpp_threshold_100_intended_diff_only": 0.0350000262260437,
|
244 |
+
"tpp_threshold_100_unintended_diff_only": 0.0022500157356262207,
|
245 |
+
"tpp_threshold_500_total_metric": 0.148250013589859,
|
246 |
+
"tpp_threshold_500_intended_diff_only": 0.15200001001358032,
|
247 |
+
"tpp_threshold_500_unintended_diff_only": 0.0037499964237213135
|
248 |
+
},
|
249 |
+
"6": {
|
250 |
+
"tpp_threshold_2_total_metric": 1.4901161193847656e-08,
|
251 |
+
"tpp_threshold_2_intended_diff_only": 0.0020000338554382324,
|
252 |
+
"tpp_threshold_2_unintended_diff_only": 0.0020000189542770386,
|
253 |
+
"tpp_threshold_5_total_metric": 0.001999989151954651,
|
254 |
+
"tpp_threshold_5_intended_diff_only": 0.0009999871253967285,
|
255 |
+
"tpp_threshold_5_unintended_diff_only": -0.0010000020265579224,
|
256 |
+
"tpp_threshold_10_total_metric": 0.002749994397163391,
|
257 |
+
"tpp_threshold_10_intended_diff_only": 0.0040000081062316895,
|
258 |
+
"tpp_threshold_10_unintended_diff_only": 0.0012500137090682983,
|
259 |
+
"tpp_threshold_20_total_metric": 0.003250017762184143,
|
260 |
+
"tpp_threshold_20_intended_diff_only": 0.0020000338554382324,
|
261 |
+
"tpp_threshold_20_unintended_diff_only": -0.0012499839067459106,
|
262 |
+
"tpp_threshold_50_total_metric": 0.003000006079673767,
|
263 |
+
"tpp_threshold_50_intended_diff_only": 0.0040000081062316895,
|
264 |
+
"tpp_threshold_50_unintended_diff_only": 0.0010000020265579224,
|
265 |
+
"tpp_threshold_100_total_metric": 0.0034999847412109375,
|
266 |
+
"tpp_threshold_100_intended_diff_only": 0.0040000081062316895,
|
267 |
+
"tpp_threshold_100_unintended_diff_only": 0.000500023365020752,
|
268 |
+
"tpp_threshold_500_total_metric": 0.012749999761581421,
|
269 |
+
"tpp_threshold_500_intended_diff_only": 0.018000006675720215,
|
270 |
+
"tpp_threshold_500_unintended_diff_only": 0.005250006914138794
|
271 |
+
},
|
272 |
+
"9": {
|
273 |
+
"tpp_threshold_2_total_metric": 0.0032499730587005615,
|
274 |
+
"tpp_threshold_2_intended_diff_only": 0.0059999823570251465,
|
275 |
+
"tpp_threshold_2_unintended_diff_only": 0.002750009298324585,
|
276 |
+
"tpp_threshold_5_total_metric": 0.001999959349632263,
|
277 |
+
"tpp_threshold_5_intended_diff_only": 0.006999969482421875,
|
278 |
+
"tpp_threshold_5_unintended_diff_only": 0.005000010132789612,
|
279 |
+
"tpp_threshold_10_total_metric": 0.00449998676776886,
|
280 |
+
"tpp_threshold_10_intended_diff_only": 0.009000003337860107,
|
281 |
+
"tpp_threshold_10_unintended_diff_only": 0.0045000165700912476,
|
282 |
+
"tpp_threshold_20_total_metric": 0.011249944567680359,
|
283 |
+
"tpp_threshold_20_intended_diff_only": 0.01699995994567871,
|
284 |
+
"tpp_threshold_20_unintended_diff_only": 0.005750015377998352,
|
285 |
+
"tpp_threshold_50_total_metric": 0.027749985456466675,
|
286 |
+
"tpp_threshold_50_intended_diff_only": 0.03299999237060547,
|
287 |
+
"tpp_threshold_50_unintended_diff_only": 0.005250006914138794,
|
288 |
+
"tpp_threshold_100_total_metric": 0.06724996864795685,
|
289 |
+
"tpp_threshold_100_intended_diff_only": 0.07499998807907104,
|
290 |
+
"tpp_threshold_100_unintended_diff_only": 0.007750019431114197,
|
291 |
+
"tpp_threshold_500_total_metric": 0.16999995708465576,
|
292 |
+
"tpp_threshold_500_intended_diff_only": 0.17799997329711914,
|
293 |
+
"tpp_threshold_500_unintended_diff_only": 0.008000016212463379
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"canrager/amazon_reviews_mcauley_1and5": {
|
297 |
+
"1": {
|
298 |
+
"tpp_threshold_2_total_metric": 0.00599999725818634,
|
299 |
+
"tpp_threshold_2_intended_diff_only": 0.009999990463256836,
|
300 |
+
"tpp_threshold_2_unintended_diff_only": 0.003999993205070496,
|
301 |
+
"tpp_threshold_5_total_metric": 0.003999963402748108,
|
302 |
+
"tpp_threshold_5_intended_diff_only": 0.006999969482421875,
|
303 |
+
"tpp_threshold_5_unintended_diff_only": 0.003000006079673767,
|
304 |
+
"tpp_threshold_10_total_metric": 0.003000006079673767,
|
305 |
+
"tpp_threshold_10_intended_diff_only": 0.009999990463256836,
|
306 |
+
"tpp_threshold_10_unintended_diff_only": 0.006999984383583069,
|
307 |
+
"tpp_threshold_20_total_metric": -0.0025000274181365967,
|
308 |
+
"tpp_threshold_20_intended_diff_only": 0.0029999613761901855,
|
309 |
+
"tpp_threshold_20_unintended_diff_only": 0.005499988794326782,
|
310 |
+
"tpp_threshold_50_total_metric": 0.00349995493888855,
|
311 |
+
"tpp_threshold_50_intended_diff_only": 0.003999948501586914,
|
312 |
+
"tpp_threshold_50_unintended_diff_only": 0.0004999935626983643,
|
313 |
+
"tpp_threshold_100_total_metric": 0.0020000189542770386,
|
314 |
+
"tpp_threshold_100_intended_diff_only": 0.009000003337860107,
|
315 |
+
"tpp_threshold_100_unintended_diff_only": 0.006999984383583069,
|
316 |
+
"tpp_threshold_500_total_metric": 0.022749975323677063,
|
317 |
+
"tpp_threshold_500_intended_diff_only": 0.042999982833862305,
|
318 |
+
"tpp_threshold_500_unintended_diff_only": 0.02025000751018524
|
319 |
+
},
|
320 |
+
"2": {
|
321 |
+
"tpp_threshold_2_total_metric": -0.0004999935626983643,
|
322 |
+
"tpp_threshold_2_intended_diff_only": 0.0,
|
323 |
+
"tpp_threshold_2_unintended_diff_only": 0.0004999935626983643,
|
324 |
+
"tpp_threshold_5_total_metric": -0.005500033497810364,
|
325 |
+
"tpp_threshold_5_intended_diff_only": 0.0029999613761901855,
|
326 |
+
"tpp_threshold_5_unintended_diff_only": 0.00849999487400055,
|
327 |
+
"tpp_threshold_10_total_metric": 0.0014999955892562866,
|
328 |
+
"tpp_threshold_10_intended_diff_only": 0.006999969482421875,
|
329 |
+
"tpp_threshold_10_unintended_diff_only": 0.005499973893165588,
|
330 |
+
"tpp_threshold_20_total_metric": -0.0009999722242355347,
|
331 |
+
"tpp_threshold_20_intended_diff_only": 0.0040000081062316895,
|
332 |
+
"tpp_threshold_20_unintended_diff_only": 0.004999980330467224,
|
333 |
+
"tpp_threshold_50_total_metric": 0.008999988436698914,
|
334 |
+
"tpp_threshold_50_intended_diff_only": 0.014999985694885254,
|
335 |
+
"tpp_threshold_50_unintended_diff_only": 0.00599999725818634,
|
336 |
+
"tpp_threshold_100_total_metric": 0.01150001585483551,
|
337 |
+
"tpp_threshold_100_intended_diff_only": 0.023000001907348633,
|
338 |
+
"tpp_threshold_100_unintended_diff_only": 0.011499986052513123,
|
339 |
+
"tpp_threshold_500_total_metric": 0.08924996852874756,
|
340 |
+
"tpp_threshold_500_intended_diff_only": 0.10499995946884155,
|
341 |
+
"tpp_threshold_500_unintended_diff_only": 0.015749990940093994
|
342 |
+
},
|
343 |
+
"3": {
|
344 |
+
"tpp_threshold_2_total_metric": -0.009249970316886902,
|
345 |
+
"tpp_threshold_2_intended_diff_only": -0.0059999823570251465,
|
346 |
+
"tpp_threshold_2_unintended_diff_only": 0.0032499879598617554,
|
347 |
+
"tpp_threshold_5_total_metric": -0.004499971866607666,
|
348 |
+
"tpp_threshold_5_intended_diff_only": -0.004999995231628418,
|
349 |
+
"tpp_threshold_5_unintended_diff_only": -0.000500023365020752,
|
350 |
+
"tpp_threshold_10_total_metric": 0.004249989986419678,
|
351 |
+
"tpp_threshold_10_intended_diff_only": 0.0059999823570251465,
|
352 |
+
"tpp_threshold_10_unintended_diff_only": 0.0017499923706054688,
|
353 |
+
"tpp_threshold_20_total_metric": -0.005499973893165588,
|
354 |
+
"tpp_threshold_20_intended_diff_only": -0.0009999871253967285,
|
355 |
+
"tpp_threshold_20_unintended_diff_only": 0.00449998676776886,
|
356 |
+
"tpp_threshold_50_total_metric": 0.013749971985816956,
|
357 |
+
"tpp_threshold_50_intended_diff_only": 0.034999966621398926,
|
358 |
+
"tpp_threshold_50_unintended_diff_only": 0.02124999463558197,
|
359 |
+
"tpp_threshold_100_total_metric": 0.029999971389770508,
|
360 |
+
"tpp_threshold_100_intended_diff_only": 0.05799996852874756,
|
361 |
+
"tpp_threshold_100_unintended_diff_only": 0.02799999713897705,
|
362 |
+
"tpp_threshold_500_total_metric": 0.10924999415874481,
|
363 |
+
"tpp_threshold_500_intended_diff_only": 0.13599997758865356,
|
364 |
+
"tpp_threshold_500_unintended_diff_only": 0.026749983429908752
|
365 |
+
},
|
366 |
+
"5": {
|
367 |
+
"tpp_threshold_2_total_metric": -0.005499929189682007,
|
368 |
+
"tpp_threshold_2_intended_diff_only": -0.0029999613761901855,
|
369 |
+
"tpp_threshold_2_unintended_diff_only": 0.0024999678134918213,
|
370 |
+
"tpp_threshold_5_total_metric": -0.006249964237213135,
|
371 |
+
"tpp_threshold_5_intended_diff_only": -0.0009999871253967285,
|
372 |
+
"tpp_threshold_5_unintended_diff_only": 0.005249977111816406,
|
373 |
+
"tpp_threshold_10_total_metric": -0.009999975562095642,
|
374 |
+
"tpp_threshold_10_intended_diff_only": -0.0009999871253967285,
|
375 |
+
"tpp_threshold_10_unintended_diff_only": 0.008999988436698914,
|
376 |
+
"tpp_threshold_20_total_metric": -0.006999969482421875,
|
377 |
+
"tpp_threshold_20_intended_diff_only": 0.003000020980834961,
|
378 |
+
"tpp_threshold_20_unintended_diff_only": 0.009999990463256836,
|
379 |
+
"tpp_threshold_50_total_metric": 0.01650005578994751,
|
380 |
+
"tpp_threshold_50_intended_diff_only": 0.024000048637390137,
|
381 |
+
"tpp_threshold_50_unintended_diff_only": 0.007499992847442627,
|
382 |
+
"tpp_threshold_100_total_metric": 0.04000003635883331,
|
383 |
+
"tpp_threshold_100_intended_diff_only": 0.06300002336502075,
|
384 |
+
"tpp_threshold_100_unintended_diff_only": 0.02299998700618744,
|
385 |
+
"tpp_threshold_500_total_metric": 0.16475005447864532,
|
386 |
+
"tpp_threshold_500_intended_diff_only": 0.19200003147125244,
|
387 |
+
"tpp_threshold_500_unintended_diff_only": 0.027249976992607117
|
388 |
+
},
|
389 |
+
"6": {
|
390 |
+
"tpp_threshold_2_total_metric": 0.006499961018562317,
|
391 |
+
"tpp_threshold_2_intended_diff_only": 0.011999964714050293,
|
392 |
+
"tpp_threshold_2_unintended_diff_only": 0.005500003695487976,
|
393 |
+
"tpp_threshold_5_total_metric": 0.007999971508979797,
|
394 |
+
"tpp_threshold_5_intended_diff_only": 0.006999969482421875,
|
395 |
+
"tpp_threshold_5_unintended_diff_only": -0.0010000020265579224,
|
396 |
+
"tpp_threshold_10_total_metric": 0.012499988079071045,
|
397 |
+
"tpp_threshold_10_intended_diff_only": 0.014999985694885254,
|
398 |
+
"tpp_threshold_10_unintended_diff_only": 0.002499997615814209,
|
399 |
+
"tpp_threshold_20_total_metric": 0.021749988198280334,
|
400 |
+
"tpp_threshold_20_intended_diff_only": 0.02399998903274536,
|
401 |
+
"tpp_threshold_20_unintended_diff_only": 0.002250000834465027,
|
402 |
+
"tpp_threshold_50_total_metric": 0.03325001895427704,
|
403 |
+
"tpp_threshold_50_intended_diff_only": 0.04100000858306885,
|
404 |
+
"tpp_threshold_50_unintended_diff_only": 0.007749989628791809,
|
405 |
+
"tpp_threshold_100_total_metric": 0.05399997532367706,
|
406 |
+
"tpp_threshold_100_intended_diff_only": 0.05899995565414429,
|
407 |
+
"tpp_threshold_100_unintended_diff_only": 0.004999980330467224,
|
408 |
+
"tpp_threshold_500_total_metric": 0.24724997580051422,
|
409 |
+
"tpp_threshold_500_intended_diff_only": 0.2569999694824219,
|
410 |
+
"tpp_threshold_500_unintended_diff_only": 0.009749993681907654
|
411 |
+
}
|
412 |
+
}
|
413 |
+
}
|
414 |
+
}
|
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "tpp",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": false,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "16072d61-c7c6-4047-91dd-5fff05bf32c4",
|
73 |
+
"datetime_epoch_millis": 1740163589499,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"tpp_metrics": {
|
76 |
+
"tpp_threshold_2_total_metric": 0.0028499945998191833,
|
77 |
+
"tpp_threshold_2_intended_diff_only": 0.00549999475479126,
|
78 |
+
"tpp_threshold_2_unintended_diff_only": 0.0026500001549720764,
|
79 |
+
"tpp_threshold_5_total_metric": 0.004700003564357758,
|
80 |
+
"tpp_threshold_5_intended_diff_only": 0.0076000034809112545,
|
81 |
+
"tpp_threshold_5_unintended_diff_only": 0.0028999999165534975,
|
82 |
+
"tpp_threshold_10_total_metric": 0.01082499623298645,
|
83 |
+
"tpp_threshold_10_intended_diff_only": 0.014699995517730713,
|
84 |
+
"tpp_threshold_10_unintended_diff_only": 0.0038749992847442625,
|
85 |
+
"tpp_threshold_20_total_metric": 0.017949993908405303,
|
86 |
+
"tpp_threshold_20_intended_diff_only": 0.02199999690055847,
|
87 |
+
"tpp_threshold_20_unintended_diff_only": 0.004050002992153167,
|
88 |
+
"tpp_threshold_50_total_metric": 0.03577501326799393,
|
89 |
+
"tpp_threshold_50_intended_diff_only": 0.04050000905990601,
|
90 |
+
"tpp_threshold_50_unintended_diff_only": 0.0047249957919120785,
|
91 |
+
"tpp_threshold_100_total_metric": 0.06557500511407852,
|
92 |
+
"tpp_threshold_100_intended_diff_only": 0.07450000643730165,
|
93 |
+
"tpp_threshold_100_unintended_diff_only": 0.008925001323223113,
|
94 |
+
"tpp_threshold_500_total_metric": 0.21700000911951065,
|
95 |
+
"tpp_threshold_500_intended_diff_only": 0.23130001425743102,
|
96 |
+
"tpp_threshold_500_unintended_diff_only": 0.01430000513792038
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
|
102 |
+
"tpp_threshold_2_total_metric": 0.003350001573562622,
|
103 |
+
"tpp_threshold_2_intended_diff_only": 0.005600011348724366,
|
104 |
+
"tpp_threshold_2_unintended_diff_only": 0.0022500097751617433,
|
105 |
+
"tpp_threshold_5_total_metric": 0.006550008058547973,
|
106 |
+
"tpp_threshold_5_intended_diff_only": 0.009600019454956055,
|
107 |
+
"tpp_threshold_5_unintended_diff_only": 0.003050011396408081,
|
108 |
+
"tpp_threshold_10_total_metric": 0.01145000457763672,
|
109 |
+
"tpp_threshold_10_intended_diff_only": 0.014200007915496827,
|
110 |
+
"tpp_threshold_10_unintended_diff_only": 0.0027500033378601075,
|
111 |
+
"tpp_threshold_20_total_metric": 0.01974998414516449,
|
112 |
+
"tpp_threshold_20_intended_diff_only": 0.02239999771118164,
|
113 |
+
"tpp_threshold_20_unintended_diff_only": 0.002650013566017151,
|
114 |
+
"tpp_threshold_50_total_metric": 0.03780001997947693,
|
115 |
+
"tpp_threshold_50_intended_diff_only": 0.04080002307891846,
|
116 |
+
"tpp_threshold_50_unintended_diff_only": 0.003000003099441528,
|
117 |
+
"tpp_threshold_100_total_metric": 0.06720000505447388,
|
118 |
+
"tpp_threshold_100_intended_diff_only": 0.07420001029968262,
|
119 |
+
"tpp_threshold_100_unintended_diff_only": 0.007000005245208741,
|
120 |
+
"tpp_threshold_500_total_metric": 0.24940000772476195,
|
121 |
+
"tpp_threshold_500_intended_diff_only": 0.26020002365112305,
|
122 |
+
"tpp_threshold_500_unintended_diff_only": 0.010800015926361085
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
|
126 |
+
"tpp_threshold_2_total_metric": 0.0023499876260757446,
|
127 |
+
"tpp_threshold_2_intended_diff_only": 0.005399978160858155,
|
128 |
+
"tpp_threshold_2_unintended_diff_only": 0.0030499905347824096,
|
129 |
+
"tpp_threshold_5_total_metric": 0.0028499990701675417,
|
130 |
+
"tpp_threshold_5_intended_diff_only": 0.005599987506866455,
|
131 |
+
"tpp_threshold_5_unintended_diff_only": 0.0027499884366989137,
|
132 |
+
"tpp_threshold_10_total_metric": 0.010199987888336181,
|
133 |
+
"tpp_threshold_10_intended_diff_only": 0.0151999831199646,
|
134 |
+
"tpp_threshold_10_unintended_diff_only": 0.004999995231628418,
|
135 |
+
"tpp_threshold_20_total_metric": 0.016150003671646117,
|
136 |
+
"tpp_threshold_20_intended_diff_only": 0.021599996089935302,
|
137 |
+
"tpp_threshold_20_unintended_diff_only": 0.005449992418289184,
|
138 |
+
"tpp_threshold_50_total_metric": 0.03375000655651093,
|
139 |
+
"tpp_threshold_50_intended_diff_only": 0.040199995040893555,
|
140 |
+
"tpp_threshold_50_unintended_diff_only": 0.00644998848438263,
|
141 |
+
"tpp_threshold_100_total_metric": 0.06395000517368317,
|
142 |
+
"tpp_threshold_100_intended_diff_only": 0.07480000257492066,
|
143 |
+
"tpp_threshold_100_unintended_diff_only": 0.010849997401237488,
|
144 |
+
"tpp_threshold_500_total_metric": 0.18460001051425934,
|
145 |
+
"tpp_threshold_500_intended_diff_only": 0.20240000486373902,
|
146 |
+
"tpp_threshold_500_unintended_diff_only": 0.017799994349479674
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
150 |
+
"sae_lens_id": "custom_sae",
|
151 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0",
|
152 |
+
"sae_lens_version": "5.4.2",
|
153 |
+
"sae_cfg_dict": {
|
154 |
+
"model_name": "gemma-2-2b",
|
155 |
+
"d_in": 2304,
|
156 |
+
"d_sae": 65536,
|
157 |
+
"hook_layer": 12,
|
158 |
+
"hook_name": "blocks.12.hook_resid_post",
|
159 |
+
"context_size": null,
|
160 |
+
"hook_head_index": null,
|
161 |
+
"architecture": "topk",
|
162 |
+
"apply_b_dec_to_input": null,
|
163 |
+
"finetuning_scaling_factor": null,
|
164 |
+
"activation_fn_str": "",
|
165 |
+
"prepend_bos": true,
|
166 |
+
"normalize_activations": "none",
|
167 |
+
"dtype": "bfloat16",
|
168 |
+
"device": "",
|
169 |
+
"dataset_path": "",
|
170 |
+
"dataset_trust_remote_code": true,
|
171 |
+
"seqpos_slice": [
|
172 |
+
null
|
173 |
+
],
|
174 |
+
"training_tokens": -100000,
|
175 |
+
"sae_lens_training_version": null,
|
176 |
+
"neuronpedia_id": null
|
177 |
+
},
|
178 |
+
"eval_result_unstructured": {
|
179 |
+
"LabHC/bias_in_bios_class_set1": {
|
180 |
+
"0": {
|
181 |
+
"tpp_threshold_2_total_metric": 0.0072500258684158325,
|
182 |
+
"tpp_threshold_2_intended_diff_only": 0.01100003719329834,
|
183 |
+
"tpp_threshold_2_unintended_diff_only": 0.0037500113248825073,
|
184 |
+
"tpp_threshold_5_total_metric": 0.016750022768974304,
|
185 |
+
"tpp_threshold_5_intended_diff_only": 0.021000027656555176,
|
186 |
+
"tpp_threshold_5_unintended_diff_only": 0.004250004887580872,
|
187 |
+
"tpp_threshold_10_total_metric": 0.012500002980232239,
|
188 |
+
"tpp_threshold_10_intended_diff_only": 0.017000019550323486,
|
189 |
+
"tpp_threshold_10_unintended_diff_only": 0.0045000165700912476,
|
190 |
+
"tpp_threshold_20_total_metric": 0.030749991536140442,
|
191 |
+
"tpp_threshold_20_intended_diff_only": 0.03299999237060547,
|
192 |
+
"tpp_threshold_20_unintended_diff_only": 0.002250000834465027,
|
193 |
+
"tpp_threshold_50_total_metric": 0.04200001060962677,
|
194 |
+
"tpp_threshold_50_intended_diff_only": 0.04500001668930054,
|
195 |
+
"tpp_threshold_50_unintended_diff_only": 0.003000006079673767,
|
196 |
+
"tpp_threshold_100_total_metric": 0.08250001072883606,
|
197 |
+
"tpp_threshold_100_intended_diff_only": 0.10199999809265137,
|
198 |
+
"tpp_threshold_100_unintended_diff_only": 0.019499987363815308,
|
199 |
+
"tpp_threshold_500_total_metric": 0.3044999986886978,
|
200 |
+
"tpp_threshold_500_intended_diff_only": 0.328000009059906,
|
201 |
+
"tpp_threshold_500_unintended_diff_only": 0.02350001037120819
|
202 |
+
},
|
203 |
+
"1": {
|
204 |
+
"tpp_threshold_2_total_metric": 0.006250038743019104,
|
205 |
+
"tpp_threshold_2_intended_diff_only": 0.006000041961669922,
|
206 |
+
"tpp_threshold_2_unintended_diff_only": -0.00024999678134918213,
|
207 |
+
"tpp_threshold_5_total_metric": 0.0077500492334365845,
|
208 |
+
"tpp_threshold_5_intended_diff_only": 0.010000050067901611,
|
209 |
+
"tpp_threshold_5_unintended_diff_only": 0.002250000834465027,
|
210 |
+
"tpp_threshold_10_total_metric": 0.012000024318695068,
|
211 |
+
"tpp_threshold_10_intended_diff_only": 0.012000024318695068,
|
212 |
+
"tpp_threshold_10_unintended_diff_only": 0.0,
|
213 |
+
"tpp_threshold_20_total_metric": 0.005749985575675964,
|
214 |
+
"tpp_threshold_20_intended_diff_only": 0.009000003337860107,
|
215 |
+
"tpp_threshold_20_unintended_diff_only": 0.003250017762184143,
|
216 |
+
"tpp_threshold_50_total_metric": 0.015250056982040405,
|
217 |
+
"tpp_threshold_50_intended_diff_only": 0.01900005340576172,
|
218 |
+
"tpp_threshold_50_unintended_diff_only": 0.0037499964237213135,
|
219 |
+
"tpp_threshold_100_total_metric": 0.026749998331069946,
|
220 |
+
"tpp_threshold_100_intended_diff_only": 0.03100001811981201,
|
221 |
+
"tpp_threshold_100_unintended_diff_only": 0.004250019788742065,
|
222 |
+
"tpp_threshold_500_total_metric": 0.14800003170967102,
|
223 |
+
"tpp_threshold_500_intended_diff_only": 0.15400004386901855,
|
224 |
+
"tpp_threshold_500_unintended_diff_only": 0.006000012159347534
|
225 |
+
},
|
226 |
+
"2": {
|
227 |
+
"tpp_threshold_2_total_metric": -0.0032500028610229492,
|
228 |
+
"tpp_threshold_2_intended_diff_only": 0.0,
|
229 |
+
"tpp_threshold_2_unintended_diff_only": 0.0032500028610229492,
|
230 |
+
"tpp_threshold_5_total_metric": 0.0007499754428863525,
|
231 |
+
"tpp_threshold_5_intended_diff_only": 0.004999995231628418,
|
232 |
+
"tpp_threshold_5_unintended_diff_only": 0.004250019788742065,
|
233 |
+
"tpp_threshold_10_total_metric": 0.02400001883506775,
|
234 |
+
"tpp_threshold_10_intended_diff_only": 0.026000022888183594,
|
235 |
+
"tpp_threshold_10_unintended_diff_only": 0.0020000040531158447,
|
236 |
+
"tpp_threshold_20_total_metric": 0.03149998188018799,
|
237 |
+
"tpp_threshold_20_intended_diff_only": 0.03299999237060547,
|
238 |
+
"tpp_threshold_20_unintended_diff_only": 0.0015000104904174805,
|
239 |
+
"tpp_threshold_50_total_metric": 0.05425000190734863,
|
240 |
+
"tpp_threshold_50_intended_diff_only": 0.0559999942779541,
|
241 |
+
"tpp_threshold_50_unintended_diff_only": 0.0017499923706054688,
|
242 |
+
"tpp_threshold_100_total_metric": 0.07874995470046997,
|
243 |
+
"tpp_threshold_100_intended_diff_only": 0.07999998331069946,
|
244 |
+
"tpp_threshold_100_unintended_diff_only": 0.0012500286102294922,
|
245 |
+
"tpp_threshold_500_total_metric": 0.2527500092983246,
|
246 |
+
"tpp_threshold_500_intended_diff_only": 0.25700002908706665,
|
247 |
+
"tpp_threshold_500_unintended_diff_only": 0.004250019788742065
|
248 |
+
},
|
249 |
+
"6": {
|
250 |
+
"tpp_threshold_2_total_metric": 0.001999989151954651,
|
251 |
+
"tpp_threshold_2_intended_diff_only": 0.0040000081062316895,
|
252 |
+
"tpp_threshold_2_unintended_diff_only": 0.0020000189542770386,
|
253 |
+
"tpp_threshold_5_total_metric": 0.002750024199485779,
|
254 |
+
"tpp_threshold_5_intended_diff_only": 0.0020000338554382324,
|
255 |
+
"tpp_threshold_5_unintended_diff_only": -0.0007499903440475464,
|
256 |
+
"tpp_threshold_10_total_metric": 0.004749983549118042,
|
257 |
+
"tpp_threshold_10_intended_diff_only": 0.0059999823570251465,
|
258 |
+
"tpp_threshold_10_unintended_diff_only": 0.0012499988079071045,
|
259 |
+
"tpp_threshold_20_total_metric": 0.004999995231628418,
|
260 |
+
"tpp_threshold_20_intended_diff_only": 0.0040000081062316895,
|
261 |
+
"tpp_threshold_20_unintended_diff_only": -0.0009999871253967285,
|
262 |
+
"tpp_threshold_50_total_metric": 0.010000035166740417,
|
263 |
+
"tpp_threshold_50_intended_diff_only": 0.01100003719329834,
|
264 |
+
"tpp_threshold_50_unintended_diff_only": 0.0010000020265579224,
|
265 |
+
"tpp_threshold_100_total_metric": 0.03125004470348358,
|
266 |
+
"tpp_threshold_100_intended_diff_only": 0.03400003910064697,
|
267 |
+
"tpp_threshold_100_unintended_diff_only": 0.002749994397163391,
|
268 |
+
"tpp_threshold_500_total_metric": 0.19425003230571747,
|
269 |
+
"tpp_threshold_500_intended_diff_only": 0.20100003480911255,
|
270 |
+
"tpp_threshold_500_unintended_diff_only": 0.006750002503395081
|
271 |
+
},
|
272 |
+
"9": {
|
273 |
+
"tpp_threshold_2_total_metric": 0.004499956965446472,
|
274 |
+
"tpp_threshold_2_intended_diff_only": 0.006999969482421875,
|
275 |
+
"tpp_threshold_2_unintended_diff_only": 0.002500012516975403,
|
276 |
+
"tpp_threshold_5_total_metric": 0.004749968647956848,
|
277 |
+
"tpp_threshold_5_intended_diff_only": 0.009999990463256836,
|
278 |
+
"tpp_threshold_5_unintended_diff_only": 0.005250021815299988,
|
279 |
+
"tpp_threshold_10_total_metric": 0.003999993205070496,
|
280 |
+
"tpp_threshold_10_intended_diff_only": 0.009999990463256836,
|
281 |
+
"tpp_threshold_10_unintended_diff_only": 0.00599999725818634,
|
282 |
+
"tpp_threshold_20_total_metric": 0.025749966502189636,
|
283 |
+
"tpp_threshold_20_intended_diff_only": 0.03299999237060547,
|
284 |
+
"tpp_threshold_20_unintended_diff_only": 0.0072500258684158325,
|
285 |
+
"tpp_threshold_50_total_metric": 0.06749999523162842,
|
286 |
+
"tpp_threshold_50_intended_diff_only": 0.07300001382827759,
|
287 |
+
"tpp_threshold_50_unintended_diff_only": 0.00550001859664917,
|
288 |
+
"tpp_threshold_100_total_metric": 0.11675001680850983,
|
289 |
+
"tpp_threshold_100_intended_diff_only": 0.12400001287460327,
|
290 |
+
"tpp_threshold_100_unintended_diff_only": 0.007249996066093445,
|
291 |
+
"tpp_threshold_500_total_metric": 0.3474999666213989,
|
292 |
+
"tpp_threshold_500_intended_diff_only": 0.3610000014305115,
|
293 |
+
"tpp_threshold_500_unintended_diff_only": 0.013500034809112549
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"canrager/amazon_reviews_mcauley_1and5": {
|
297 |
+
"1": {
|
298 |
+
"tpp_threshold_2_total_metric": 0.0037499666213989258,
|
299 |
+
"tpp_threshold_2_intended_diff_only": 0.007999956607818604,
|
300 |
+
"tpp_threshold_2_unintended_diff_only": 0.004249989986419678,
|
301 |
+
"tpp_threshold_5_total_metric": -0.00025004148483276367,
|
302 |
+
"tpp_threshold_5_intended_diff_only": 0.003999948501586914,
|
303 |
+
"tpp_threshold_5_unintended_diff_only": 0.004249989986419678,
|
304 |
+
"tpp_threshold_10_total_metric": 0.0017500072717666626,
|
305 |
+
"tpp_threshold_10_intended_diff_only": 0.009000003337860107,
|
306 |
+
"tpp_threshold_10_unintended_diff_only": 0.007249996066093445,
|
307 |
+
"tpp_threshold_20_total_metric": -0.000250011682510376,
|
308 |
+
"tpp_threshold_20_intended_diff_only": 0.0059999823570251465,
|
309 |
+
"tpp_threshold_20_unintended_diff_only": 0.0062499940395355225,
|
310 |
+
"tpp_threshold_50_total_metric": 0.010500013828277588,
|
311 |
+
"tpp_threshold_50_intended_diff_only": 0.013999998569488525,
|
312 |
+
"tpp_threshold_50_unintended_diff_only": 0.0034999847412109375,
|
313 |
+
"tpp_threshold_100_total_metric": 0.004749983549118042,
|
314 |
+
"tpp_threshold_100_intended_diff_only": 0.014999985694885254,
|
315 |
+
"tpp_threshold_100_unintended_diff_only": 0.010250002145767212,
|
316 |
+
"tpp_threshold_500_total_metric": 0.08899998664855957,
|
317 |
+
"tpp_threshold_500_intended_diff_only": 0.09799998998641968,
|
318 |
+
"tpp_threshold_500_unintended_diff_only": 0.009000003337860107
|
319 |
+
},
|
320 |
+
"2": {
|
321 |
+
"tpp_threshold_2_total_metric": 0.0014999806880950928,
|
322 |
+
"tpp_threshold_2_intended_diff_only": 0.001999974250793457,
|
323 |
+
"tpp_threshold_2_unintended_diff_only": 0.0004999935626983643,
|
324 |
+
"tpp_threshold_5_total_metric": -0.002499997615814209,
|
325 |
+
"tpp_threshold_5_intended_diff_only": 0.004999995231628418,
|
326 |
+
"tpp_threshold_5_unintended_diff_only": 0.007499992847442627,
|
327 |
+
"tpp_threshold_10_total_metric": 0.01099996268749237,
|
328 |
+
"tpp_threshold_10_intended_diff_only": 0.01699995994567871,
|
329 |
+
"tpp_threshold_10_unintended_diff_only": 0.00599999725818634,
|
330 |
+
"tpp_threshold_20_total_metric": 0.011499986052513123,
|
331 |
+
"tpp_threshold_20_intended_diff_only": 0.014999985694885254,
|
332 |
+
"tpp_threshold_20_unintended_diff_only": 0.0034999996423721313,
|
333 |
+
"tpp_threshold_50_total_metric": 0.03499998152256012,
|
334 |
+
"tpp_threshold_50_intended_diff_only": 0.042999982833862305,
|
335 |
+
"tpp_threshold_50_unintended_diff_only": 0.008000001311302185,
|
336 |
+
"tpp_threshold_100_total_metric": 0.04949997365474701,
|
337 |
+
"tpp_threshold_100_intended_diff_only": 0.06299996376037598,
|
338 |
+
"tpp_threshold_100_unintended_diff_only": 0.013499990105628967,
|
339 |
+
"tpp_threshold_500_total_metric": 0.1442500203847885,
|
340 |
+
"tpp_threshold_500_intended_diff_only": 0.16100001335144043,
|
341 |
+
"tpp_threshold_500_unintended_diff_only": 0.016749992966651917
|
342 |
+
},
|
343 |
+
"3": {
|
344 |
+
"tpp_threshold_2_total_metric": -0.010000020265579224,
|
345 |
+
"tpp_threshold_2_intended_diff_only": -0.00700002908706665,
|
346 |
+
"tpp_threshold_2_unintended_diff_only": 0.0029999911785125732,
|
347 |
+
"tpp_threshold_5_total_metric": 0.006999999284744263,
|
348 |
+
"tpp_threshold_5_intended_diff_only": 0.004999995231628418,
|
349 |
+
"tpp_threshold_5_unintended_diff_only": -0.0020000040531158447,
|
350 |
+
"tpp_threshold_10_total_metric": 0.013249978423118591,
|
351 |
+
"tpp_threshold_10_intended_diff_only": 0.014999985694885254,
|
352 |
+
"tpp_threshold_10_unintended_diff_only": 0.0017500072717666626,
|
353 |
+
"tpp_threshold_20_total_metric": 0.0052499920129776,
|
354 |
+
"tpp_threshold_20_intended_diff_only": 0.009999990463256836,
|
355 |
+
"tpp_threshold_20_unintended_diff_only": 0.004749998450279236,
|
356 |
+
"tpp_threshold_50_total_metric": 0.02750001847743988,
|
357 |
+
"tpp_threshold_50_intended_diff_only": 0.03100001811981201,
|
358 |
+
"tpp_threshold_50_unintended_diff_only": 0.0034999996423721313,
|
359 |
+
"tpp_threshold_100_total_metric": 0.04750002920627594,
|
360 |
+
"tpp_threshold_100_intended_diff_only": 0.054000020027160645,
|
361 |
+
"tpp_threshold_100_unintended_diff_only": 0.006499990820884705,
|
362 |
+
"tpp_threshold_500_total_metric": 0.13974998891353607,
|
363 |
+
"tpp_threshold_500_intended_diff_only": 0.1629999876022339,
|
364 |
+
"tpp_threshold_500_unintended_diff_only": 0.023249998688697815
|
365 |
+
},
|
366 |
+
"5": {
|
367 |
+
"tpp_threshold_2_total_metric": 0.00025004148483276367,
|
368 |
+
"tpp_threshold_2_intended_diff_only": 0.003000020980834961,
|
369 |
+
"tpp_threshold_2_unintended_diff_only": 0.0027499794960021973,
|
370 |
+
"tpp_threshold_5_total_metric": 0.0012500584125518799,
|
371 |
+
"tpp_threshold_5_intended_diff_only": 0.00700002908706665,
|
372 |
+
"tpp_threshold_5_unintended_diff_only": 0.0057499706745147705,
|
373 |
+
"tpp_threshold_10_total_metric": -0.004249989986419678,
|
374 |
+
"tpp_threshold_10_intended_diff_only": 0.0040000081062316895,
|
375 |
+
"tpp_threshold_10_unintended_diff_only": 0.008249998092651367,
|
376 |
+
"tpp_threshold_20_total_metric": 0.020750045776367188,
|
377 |
+
"tpp_threshold_20_intended_diff_only": 0.030000030994415283,
|
378 |
+
"tpp_threshold_20_unintended_diff_only": 0.009249985218048096,
|
379 |
+
"tpp_threshold_50_total_metric": 0.031500041484832764,
|
380 |
+
"tpp_threshold_50_intended_diff_only": 0.04000002145767212,
|
381 |
+
"tpp_threshold_50_unintended_diff_only": 0.008499979972839355,
|
382 |
+
"tpp_threshold_100_total_metric": 0.11225005984306335,
|
383 |
+
"tpp_threshold_100_intended_diff_only": 0.1300000548362732,
|
384 |
+
"tpp_threshold_100_unintended_diff_only": 0.01774999499320984,
|
385 |
+
"tpp_threshold_500_total_metric": 0.24350006878376007,
|
386 |
+
"tpp_threshold_500_intended_diff_only": 0.2690000534057617,
|
387 |
+
"tpp_threshold_500_unintended_diff_only": 0.025499984622001648
|
388 |
+
},
|
389 |
+
"6": {
|
390 |
+
"tpp_threshold_2_total_metric": 0.016249969601631165,
|
391 |
+
"tpp_threshold_2_intended_diff_only": 0.0209999680519104,
|
392 |
+
"tpp_threshold_2_unintended_diff_only": 0.004749998450279236,
|
393 |
+
"tpp_threshold_5_total_metric": 0.008749976754188538,
|
394 |
+
"tpp_threshold_5_intended_diff_only": 0.006999969482421875,
|
395 |
+
"tpp_threshold_5_unintended_diff_only": -0.0017500072717666626,
|
396 |
+
"tpp_threshold_10_total_metric": 0.02924998104572296,
|
397 |
+
"tpp_threshold_10_intended_diff_only": 0.030999958515167236,
|
398 |
+
"tpp_threshold_10_unintended_diff_only": 0.001749977469444275,
|
399 |
+
"tpp_threshold_20_total_metric": 0.04350000619888306,
|
400 |
+
"tpp_threshold_20_intended_diff_only": 0.046999990940093994,
|
401 |
+
"tpp_threshold_20_unintended_diff_only": 0.0034999847412109375,
|
402 |
+
"tpp_threshold_50_total_metric": 0.06424997746944427,
|
403 |
+
"tpp_threshold_50_intended_diff_only": 0.07299995422363281,
|
404 |
+
"tpp_threshold_50_unintended_diff_only": 0.008749976754188538,
|
405 |
+
"tpp_threshold_100_total_metric": 0.10574997961521149,
|
406 |
+
"tpp_threshold_100_intended_diff_only": 0.1119999885559082,
|
407 |
+
"tpp_threshold_100_unintended_diff_only": 0.006250008940696716,
|
408 |
+
"tpp_threshold_500_total_metric": 0.30649998784065247,
|
409 |
+
"tpp_threshold_500_intended_diff_only": 0.32099997997283936,
|
410 |
+
"tpp_threshold_500_unintended_diff_only": 0.01449999213218689
|
411 |
+
}
|
412 |
+
}
|
413 |
+
}
|
414 |
+
}
|
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "tpp",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": false,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "0cf78e95-df49-4332-8aa4-30ee2712619b",
|
73 |
+
"datetime_epoch_millis": 1740163430683,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"tpp_metrics": {
|
76 |
+
"tpp_threshold_2_total_metric": 0.001475006341934204,
|
77 |
+
"tpp_threshold_2_intended_diff_only": 0.003700006008148193,
|
78 |
+
"tpp_threshold_2_unintended_diff_only": 0.0022249996662139894,
|
79 |
+
"tpp_threshold_5_total_metric": 0.0010999992489814758,
|
80 |
+
"tpp_threshold_5_intended_diff_only": 0.003600001335144043,
|
81 |
+
"tpp_threshold_5_unintended_diff_only": 0.002500002086162567,
|
82 |
+
"tpp_threshold_10_total_metric": 0.004625001549720764,
|
83 |
+
"tpp_threshold_10_intended_diff_only": 0.00790000557899475,
|
84 |
+
"tpp_threshold_10_unintended_diff_only": 0.0032750040292739866,
|
85 |
+
"tpp_threshold_20_total_metric": 0.008000005781650544,
|
86 |
+
"tpp_threshold_20_intended_diff_only": 0.01170000433921814,
|
87 |
+
"tpp_threshold_20_unintended_diff_only": 0.003699998557567596,
|
88 |
+
"tpp_threshold_50_total_metric": 0.024850000441074372,
|
89 |
+
"tpp_threshold_50_intended_diff_only": 0.03130000233650208,
|
90 |
+
"tpp_threshold_50_unintended_diff_only": 0.0064500018954277046,
|
91 |
+
"tpp_threshold_100_total_metric": 0.042700006067752844,
|
92 |
+
"tpp_threshold_100_intended_diff_only": 0.05190000534057617,
|
93 |
+
"tpp_threshold_100_unintended_diff_only": 0.009199999272823334,
|
94 |
+
"tpp_threshold_500_total_metric": 0.19910001307725905,
|
95 |
+
"tpp_threshold_500_intended_diff_only": 0.21640000939369203,
|
96 |
+
"tpp_threshold_500_unintended_diff_only": 0.017299996316432954
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
|
102 |
+
"tpp_threshold_2_total_metric": 0.003000003099441528,
|
103 |
+
"tpp_threshold_2_intended_diff_only": 0.004600012302398681,
|
104 |
+
"tpp_threshold_2_unintended_diff_only": 0.0016000092029571534,
|
105 |
+
"tpp_threshold_5_total_metric": 0.0025999963283538817,
|
106 |
+
"tpp_threshold_5_intended_diff_only": 0.005000007152557373,
|
107 |
+
"tpp_threshold_5_unintended_diff_only": 0.002400010824203491,
|
108 |
+
"tpp_threshold_10_total_metric": 0.005449992418289184,
|
109 |
+
"tpp_threshold_10_intended_diff_only": 0.007800006866455078,
|
110 |
+
"tpp_threshold_10_unintended_diff_only": 0.0023500144481658934,
|
111 |
+
"tpp_threshold_20_total_metric": 0.010450014472007751,
|
112 |
+
"tpp_threshold_20_intended_diff_only": 0.012800014019012452,
|
113 |
+
"tpp_threshold_20_unintended_diff_only": 0.0023499995470046995,
|
114 |
+
"tpp_threshold_50_total_metric": 0.01789999008178711,
|
115 |
+
"tpp_threshold_50_intended_diff_only": 0.02080000638961792,
|
116 |
+
"tpp_threshold_50_unintended_diff_only": 0.0029000163078308104,
|
117 |
+
"tpp_threshold_100_total_metric": 0.0331000030040741,
|
118 |
+
"tpp_threshold_100_intended_diff_only": 0.036400008201599124,
|
119 |
+
"tpp_threshold_100_unintended_diff_only": 0.0033000051975250245,
|
120 |
+
"tpp_threshold_500_total_metric": 0.19535001814365388,
|
121 |
+
"tpp_threshold_500_intended_diff_only": 0.20200002193450928,
|
122 |
+
"tpp_threshold_500_unintended_diff_only": 0.006650003790855408
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
|
126 |
+
"tpp_threshold_2_total_metric": -4.999041557312012e-05,
|
127 |
+
"tpp_threshold_2_intended_diff_only": 0.002799999713897705,
|
128 |
+
"tpp_threshold_2_unintended_diff_only": 0.0028499901294708253,
|
129 |
+
"tpp_threshold_5_total_metric": -0.0003999978303909302,
|
130 |
+
"tpp_threshold_5_intended_diff_only": 0.002199995517730713,
|
131 |
+
"tpp_threshold_5_unintended_diff_only": 0.002599993348121643,
|
132 |
+
"tpp_threshold_10_total_metric": 0.0038000106811523437,
|
133 |
+
"tpp_threshold_10_intended_diff_only": 0.008000004291534423,
|
134 |
+
"tpp_threshold_10_unintended_diff_only": 0.00419999361038208,
|
135 |
+
"tpp_threshold_20_total_metric": 0.005549997091293335,
|
136 |
+
"tpp_threshold_20_intended_diff_only": 0.010599994659423828,
|
137 |
+
"tpp_threshold_20_unintended_diff_only": 0.005049997568130493,
|
138 |
+
"tpp_threshold_50_total_metric": 0.031800010800361635,
|
139 |
+
"tpp_threshold_50_intended_diff_only": 0.04179999828338623,
|
140 |
+
"tpp_threshold_50_unintended_diff_only": 0.009999987483024598,
|
141 |
+
"tpp_threshold_100_total_metric": 0.05230000913143158,
|
142 |
+
"tpp_threshold_100_intended_diff_only": 0.06740000247955322,
|
143 |
+
"tpp_threshold_100_unintended_diff_only": 0.015099993348121643,
|
144 |
+
"tpp_threshold_500_total_metric": 0.20285000801086425,
|
145 |
+
"tpp_threshold_500_intended_diff_only": 0.23079999685287475,
|
146 |
+
"tpp_threshold_500_unintended_diff_only": 0.027949988842010498
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
150 |
+
"sae_lens_id": "custom_sae",
|
151 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0",
|
152 |
+
"sae_lens_version": "5.4.2",
|
153 |
+
"sae_cfg_dict": {
|
154 |
+
"model_name": "gemma-2-2b",
|
155 |
+
"d_in": 2304,
|
156 |
+
"d_sae": 65536,
|
157 |
+
"hook_layer": 12,
|
158 |
+
"hook_name": "blocks.12.hook_resid_post",
|
159 |
+
"context_size": null,
|
160 |
+
"hook_head_index": null,
|
161 |
+
"architecture": "topk",
|
162 |
+
"apply_b_dec_to_input": null,
|
163 |
+
"finetuning_scaling_factor": null,
|
164 |
+
"activation_fn_str": "",
|
165 |
+
"prepend_bos": true,
|
166 |
+
"normalize_activations": "none",
|
167 |
+
"dtype": "bfloat16",
|
168 |
+
"device": "",
|
169 |
+
"dataset_path": "",
|
170 |
+
"dataset_trust_remote_code": true,
|
171 |
+
"seqpos_slice": [
|
172 |
+
null
|
173 |
+
],
|
174 |
+
"training_tokens": -100000,
|
175 |
+
"sae_lens_training_version": null,
|
176 |
+
"neuronpedia_id": null
|
177 |
+
},
|
178 |
+
"eval_result_unstructured": {
|
179 |
+
"LabHC/bias_in_bios_class_set1": {
|
180 |
+
"0": {
|
181 |
+
"tpp_threshold_2_total_metric": 0.008750036358833313,
|
182 |
+
"tpp_threshold_2_intended_diff_only": 0.01100003719329834,
|
183 |
+
"tpp_threshold_2_unintended_diff_only": 0.002250000834465027,
|
184 |
+
"tpp_threshold_5_total_metric": 0.012000024318695068,
|
185 |
+
"tpp_threshold_5_intended_diff_only": 0.01500004529953003,
|
186 |
+
"tpp_threshold_5_unintended_diff_only": 0.003000020980834961,
|
187 |
+
"tpp_threshold_10_total_metric": 0.006749972701072693,
|
188 |
+
"tpp_threshold_10_intended_diff_only": 0.009999990463256836,
|
189 |
+
"tpp_threshold_10_unintended_diff_only": 0.003250017762184143,
|
190 |
+
"tpp_threshold_20_total_metric": 0.01975002884864807,
|
191 |
+
"tpp_threshold_20_intended_diff_only": 0.022000014781951904,
|
192 |
+
"tpp_threshold_20_unintended_diff_only": 0.002249985933303833,
|
193 |
+
"tpp_threshold_50_total_metric": 0.03349998593330383,
|
194 |
+
"tpp_threshold_50_intended_diff_only": 0.03700000047683716,
|
195 |
+
"tpp_threshold_50_unintended_diff_only": 0.003500014543533325,
|
196 |
+
"tpp_threshold_100_total_metric": 0.06024999916553497,
|
197 |
+
"tpp_threshold_100_intended_diff_only": 0.06400001049041748,
|
198 |
+
"tpp_threshold_100_unintended_diff_only": 0.0037500113248825073,
|
199 |
+
"tpp_threshold_500_total_metric": 0.3145000487565994,
|
200 |
+
"tpp_threshold_500_intended_diff_only": 0.3200000524520874,
|
201 |
+
"tpp_threshold_500_unintended_diff_only": 0.005500003695487976
|
202 |
+
},
|
203 |
+
"1": {
|
204 |
+
"tpp_threshold_2_total_metric": 0.0055000633001327515,
|
205 |
+
"tpp_threshold_2_intended_diff_only": 0.005000054836273193,
|
206 |
+
"tpp_threshold_2_unintended_diff_only": -0.0005000084638595581,
|
207 |
+
"tpp_threshold_5_total_metric": -0.0010000020265579224,
|
208 |
+
"tpp_threshold_5_intended_diff_only": 0.0,
|
209 |
+
"tpp_threshold_5_unintended_diff_only": 0.0010000020265579224,
|
210 |
+
"tpp_threshold_10_total_metric": 0.0022500455379486084,
|
211 |
+
"tpp_threshold_10_intended_diff_only": 0.001000046730041504,
|
212 |
+
"tpp_threshold_10_unintended_diff_only": -0.0012499988079071045,
|
213 |
+
"tpp_threshold_20_total_metric": 0.0002500265836715698,
|
214 |
+
"tpp_threshold_20_intended_diff_only": 0.003000020980834961,
|
215 |
+
"tpp_threshold_20_unintended_diff_only": 0.002749994397163391,
|
216 |
+
"tpp_threshold_50_total_metric": 0.004500031471252441,
|
217 |
+
"tpp_threshold_50_intended_diff_only": 0.00700002908706665,
|
218 |
+
"tpp_threshold_50_unintended_diff_only": 0.002499997615814209,
|
219 |
+
"tpp_threshold_100_total_metric": 0.014500007033348083,
|
220 |
+
"tpp_threshold_100_intended_diff_only": 0.018000006675720215,
|
221 |
+
"tpp_threshold_100_unintended_diff_only": 0.0034999996423721313,
|
222 |
+
"tpp_threshold_500_total_metric": 0.13175006210803986,
|
223 |
+
"tpp_threshold_500_intended_diff_only": 0.1350000500679016,
|
224 |
+
"tpp_threshold_500_unintended_diff_only": 0.0032499879598617554
|
225 |
+
},
|
226 |
+
"2": {
|
227 |
+
"tpp_threshold_2_total_metric": -0.0005000531673431396,
|
228 |
+
"tpp_threshold_2_intended_diff_only": 0.001999974250793457,
|
229 |
+
"tpp_threshold_2_unintended_diff_only": 0.0025000274181365967,
|
230 |
+
"tpp_threshold_5_total_metric": -1.4901161193847656e-08,
|
231 |
+
"tpp_threshold_5_intended_diff_only": 0.004999995231628418,
|
232 |
+
"tpp_threshold_5_unintended_diff_only": 0.005000010132789612,
|
233 |
+
"tpp_threshold_10_total_metric": 0.015000000596046448,
|
234 |
+
"tpp_threshold_10_intended_diff_only": 0.017000019550323486,
|
235 |
+
"tpp_threshold_10_unintended_diff_only": 0.0020000189542770386,
|
236 |
+
"tpp_threshold_20_total_metric": 0.02025000751018524,
|
237 |
+
"tpp_threshold_20_intended_diff_only": 0.022000014781951904,
|
238 |
+
"tpp_threshold_20_unintended_diff_only": 0.0017500072717666626,
|
239 |
+
"tpp_threshold_50_total_metric": 0.025249987840652466,
|
240 |
+
"tpp_threshold_50_intended_diff_only": 0.027000010013580322,
|
241 |
+
"tpp_threshold_50_unintended_diff_only": 0.0017500221729278564,
|
242 |
+
"tpp_threshold_100_total_metric": 0.04474999010562897,
|
243 |
+
"tpp_threshold_100_intended_diff_only": 0.046999990940093994,
|
244 |
+
"tpp_threshold_100_unintended_diff_only": 0.002250000834465027,
|
245 |
+
"tpp_threshold_500_total_metric": 0.30299998819828033,
|
246 |
+
"tpp_threshold_500_intended_diff_only": 0.3100000023841858,
|
247 |
+
"tpp_threshold_500_unintended_diff_only": 0.0070000141859054565
|
248 |
+
},
|
249 |
+
"6": {
|
250 |
+
"tpp_threshold_2_total_metric": -0.0015000104904174805,
|
251 |
+
"tpp_threshold_2_intended_diff_only": 0.0,
|
252 |
+
"tpp_threshold_2_unintended_diff_only": 0.0015000104904174805,
|
253 |
+
"tpp_threshold_5_total_metric": 0.000250011682510376,
|
254 |
+
"tpp_threshold_5_intended_diff_only": -0.0009999871253967285,
|
255 |
+
"tpp_threshold_5_unintended_diff_only": -0.0012499988079071045,
|
256 |
+
"tpp_threshold_10_total_metric": 0.002249985933303833,
|
257 |
+
"tpp_threshold_10_intended_diff_only": 0.0040000081062316895,
|
258 |
+
"tpp_threshold_10_unintended_diff_only": 0.0017500221729278564,
|
259 |
+
"tpp_threshold_20_total_metric": 0.00475001335144043,
|
260 |
+
"tpp_threshold_20_intended_diff_only": 0.003000020980834961,
|
261 |
+
"tpp_threshold_20_unintended_diff_only": -0.0017499923706054688,
|
262 |
+
"tpp_threshold_50_total_metric": 0.004249989986419678,
|
263 |
+
"tpp_threshold_50_intended_diff_only": 0.0040000081062316895,
|
264 |
+
"tpp_threshold_50_unintended_diff_only": -0.0002499818801879883,
|
265 |
+
"tpp_threshold_100_total_metric": 0.0037500113248825073,
|
266 |
+
"tpp_threshold_100_intended_diff_only": 0.003000020980834961,
|
267 |
+
"tpp_threshold_100_unintended_diff_only": -0.0007499903440475464,
|
268 |
+
"tpp_threshold_500_total_metric": 0.030749976634979248,
|
269 |
+
"tpp_threshold_500_intended_diff_only": 0.03799998760223389,
|
270 |
+
"tpp_threshold_500_unintended_diff_only": 0.007250010967254639
|
271 |
+
},
|
272 |
+
"9": {
|
273 |
+
"tpp_threshold_2_total_metric": 0.0027499794960021973,
|
274 |
+
"tpp_threshold_2_intended_diff_only": 0.004999995231628418,
|
275 |
+
"tpp_threshold_2_unintended_diff_only": 0.0022500157356262207,
|
276 |
+
"tpp_threshold_5_total_metric": 0.001749962568283081,
|
277 |
+
"tpp_threshold_5_intended_diff_only": 0.0059999823570251465,
|
278 |
+
"tpp_threshold_5_unintended_diff_only": 0.004250019788742065,
|
279 |
+
"tpp_threshold_10_total_metric": 0.0009999573230743408,
|
280 |
+
"tpp_threshold_10_intended_diff_only": 0.006999969482421875,
|
281 |
+
"tpp_threshold_10_unintended_diff_only": 0.006000012159347534,
|
282 |
+
"tpp_threshold_20_total_metric": 0.007249996066093445,
|
283 |
+
"tpp_threshold_20_intended_diff_only": 0.013999998569488525,
|
284 |
+
"tpp_threshold_20_unintended_diff_only": 0.006750002503395081,
|
285 |
+
"tpp_threshold_50_total_metric": 0.02199995517730713,
|
286 |
+
"tpp_threshold_50_intended_diff_only": 0.02899998426437378,
|
287 |
+
"tpp_threshold_50_unintended_diff_only": 0.00700002908706665,
|
288 |
+
"tpp_threshold_100_total_metric": 0.04225000739097595,
|
289 |
+
"tpp_threshold_100_intended_diff_only": 0.050000011920928955,
|
290 |
+
"tpp_threshold_100_unintended_diff_only": 0.007750004529953003,
|
291 |
+
"tpp_threshold_500_total_metric": 0.19675001502037048,
|
292 |
+
"tpp_threshold_500_intended_diff_only": 0.2070000171661377,
|
293 |
+
"tpp_threshold_500_unintended_diff_only": 0.010250002145767212
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"canrager/amazon_reviews_mcauley_1and5": {
|
297 |
+
"1": {
|
298 |
+
"tpp_threshold_2_total_metric": 0.007499992847442627,
|
299 |
+
"tpp_threshold_2_intended_diff_only": 0.010999977588653564,
|
300 |
+
"tpp_threshold_2_unintended_diff_only": 0.0034999847412109375,
|
301 |
+
"tpp_threshold_5_total_metric": 0.004249975085258484,
|
302 |
+
"tpp_threshold_5_intended_diff_only": 0.0059999823570251465,
|
303 |
+
"tpp_threshold_5_unintended_diff_only": 0.0017500072717666626,
|
304 |
+
"tpp_threshold_10_total_metric": 0.0022500157356262207,
|
305 |
+
"tpp_threshold_10_intended_diff_only": 0.009000003337860107,
|
306 |
+
"tpp_threshold_10_unintended_diff_only": 0.006749987602233887,
|
307 |
+
"tpp_threshold_20_total_metric": 0.00024999678134918213,
|
308 |
+
"tpp_threshold_20_intended_diff_only": 0.004999995231628418,
|
309 |
+
"tpp_threshold_20_unintended_diff_only": 0.004749998450279236,
|
310 |
+
"tpp_threshold_50_total_metric": 0.010250002145767212,
|
311 |
+
"tpp_threshold_50_intended_diff_only": 0.013999998569488525,
|
312 |
+
"tpp_threshold_50_unintended_diff_only": 0.0037499964237213135,
|
313 |
+
"tpp_threshold_100_total_metric": 0.008000016212463379,
|
314 |
+
"tpp_threshold_100_intended_diff_only": 0.018000006675720215,
|
315 |
+
"tpp_threshold_100_unintended_diff_only": 0.009999990463256836,
|
316 |
+
"tpp_threshold_500_total_metric": 0.08074997365474701,
|
317 |
+
"tpp_threshold_500_intended_diff_only": 0.09099996089935303,
|
318 |
+
"tpp_threshold_500_unintended_diff_only": 0.010249987244606018
|
319 |
+
},
|
320 |
+
"2": {
|
321 |
+
"tpp_threshold_2_total_metric": 0.0004999935626983643,
|
322 |
+
"tpp_threshold_2_intended_diff_only": 0.0009999871253967285,
|
323 |
+
"tpp_threshold_2_unintended_diff_only": 0.0004999935626983643,
|
324 |
+
"tpp_threshold_5_total_metric": -0.0070000141859054565,
|
325 |
+
"tpp_threshold_5_intended_diff_only": 0.0009999871253967285,
|
326 |
+
"tpp_threshold_5_unintended_diff_only": 0.008000001311302185,
|
327 |
+
"tpp_threshold_10_total_metric": 0.002249971032142639,
|
328 |
+
"tpp_threshold_10_intended_diff_only": 0.007999956607818604,
|
329 |
+
"tpp_threshold_10_unintended_diff_only": 0.005749985575675964,
|
330 |
+
"tpp_threshold_20_total_metric": 0.0029999762773513794,
|
331 |
+
"tpp_threshold_20_intended_diff_only": 0.0059999823570251465,
|
332 |
+
"tpp_threshold_20_unintended_diff_only": 0.003000006079673767,
|
333 |
+
"tpp_threshold_50_total_metric": 0.016999974846839905,
|
334 |
+
"tpp_threshold_50_intended_diff_only": 0.02599996328353882,
|
335 |
+
"tpp_threshold_50_unintended_diff_only": 0.008999988436698914,
|
336 |
+
"tpp_threshold_100_total_metric": 0.028250008821487427,
|
337 |
+
"tpp_threshold_100_intended_diff_only": 0.041999995708465576,
|
338 |
+
"tpp_threshold_100_unintended_diff_only": 0.01374998688697815,
|
339 |
+
"tpp_threshold_500_total_metric": 0.19774997234344482,
|
340 |
+
"tpp_threshold_500_intended_diff_only": 0.21999996900558472,
|
341 |
+
"tpp_threshold_500_unintended_diff_only": 0.022249996662139893
|
342 |
+
},
|
343 |
+
"3": {
|
344 |
+
"tpp_threshold_2_total_metric": -0.008249998092651367,
|
345 |
+
"tpp_threshold_2_intended_diff_only": -0.004999995231628418,
|
346 |
+
"tpp_threshold_2_unintended_diff_only": 0.0032500028610229492,
|
347 |
+
"tpp_threshold_5_total_metric": 0.000250011682510376,
|
348 |
+
"tpp_threshold_5_intended_diff_only": 0.0,
|
349 |
+
"tpp_threshold_5_unintended_diff_only": -0.000250011682510376,
|
350 |
+
"tpp_threshold_10_total_metric": 0.0072500258684158325,
|
351 |
+
"tpp_threshold_10_intended_diff_only": 0.008000016212463379,
|
352 |
+
"tpp_threshold_10_unintended_diff_only": 0.0007499903440475464,
|
353 |
+
"tpp_threshold_20_total_metric": -0.0015000253915786743,
|
354 |
+
"tpp_threshold_20_intended_diff_only": 0.001999974250793457,
|
355 |
+
"tpp_threshold_20_unintended_diff_only": 0.0034999996423721313,
|
356 |
+
"tpp_threshold_50_total_metric": 0.012000039219856262,
|
357 |
+
"tpp_threshold_50_intended_diff_only": 0.013000011444091797,
|
358 |
+
"tpp_threshold_50_unintended_diff_only": 0.0009999722242355347,
|
359 |
+
"tpp_threshold_100_total_metric": 0.02925001084804535,
|
360 |
+
"tpp_threshold_100_intended_diff_only": 0.04100000858306885,
|
361 |
+
"tpp_threshold_100_unintended_diff_only": 0.011749997735023499,
|
362 |
+
"tpp_threshold_500_total_metric": 0.1612500250339508,
|
363 |
+
"tpp_threshold_500_intended_diff_only": 0.2160000205039978,
|
364 |
+
"tpp_threshold_500_unintended_diff_only": 0.054749995470047
|
365 |
+
},
|
366 |
+
"5": {
|
367 |
+
"tpp_threshold_2_total_metric": -0.006749927997589111,
|
368 |
+
"tpp_threshold_2_intended_diff_only": -0.003999948501586914,
|
369 |
+
"tpp_threshold_2_unintended_diff_only": 0.0027499794960021973,
|
370 |
+
"tpp_threshold_5_total_metric": -0.006249949336051941,
|
371 |
+
"tpp_threshold_5_intended_diff_only": -0.001999974250793457,
|
372 |
+
"tpp_threshold_5_unintended_diff_only": 0.004249975085258484,
|
373 |
+
"tpp_threshold_10_total_metric": -0.008999958634376526,
|
374 |
+
"tpp_threshold_10_intended_diff_only": -0.0029999613761901855,
|
375 |
+
"tpp_threshold_10_unintended_diff_only": 0.00599999725818634,
|
376 |
+
"tpp_threshold_20_total_metric": -0.009499937295913696,
|
377 |
+
"tpp_threshold_20_intended_diff_only": 0.001000046730041504,
|
378 |
+
"tpp_threshold_20_unintended_diff_only": 0.0104999840259552,
|
379 |
+
"tpp_threshold_50_total_metric": 0.05525003373622894,
|
380 |
+
"tpp_threshold_50_intended_diff_only": 0.0820000171661377,
|
381 |
+
"tpp_threshold_50_unintended_diff_only": 0.026749983429908752,
|
382 |
+
"tpp_threshold_100_total_metric": 0.08600002527236938,
|
383 |
+
"tpp_threshold_100_intended_diff_only": 0.11800003051757812,
|
384 |
+
"tpp_threshold_100_unintended_diff_only": 0.03200000524520874,
|
385 |
+
"tpp_threshold_500_total_metric": 0.2552500516176224,
|
386 |
+
"tpp_threshold_500_intended_diff_only": 0.2940000295639038,
|
387 |
+
"tpp_threshold_500_unintended_diff_only": 0.03874997794628143
|
388 |
+
},
|
389 |
+
"6": {
|
390 |
+
"tpp_threshold_2_total_metric": 0.006749987602233887,
|
391 |
+
"tpp_threshold_2_intended_diff_only": 0.010999977588653564,
|
392 |
+
"tpp_threshold_2_unintended_diff_only": 0.004249989986419678,
|
393 |
+
"tpp_threshold_5_total_metric": 0.006749987602233887,
|
394 |
+
"tpp_threshold_5_intended_diff_only": 0.0059999823570251465,
|
395 |
+
"tpp_threshold_5_unintended_diff_only": -0.0007500052452087402,
|
396 |
+
"tpp_threshold_10_total_metric": 0.016249999403953552,
|
397 |
+
"tpp_threshold_10_intended_diff_only": 0.018000006675720215,
|
398 |
+
"tpp_threshold_10_unintended_diff_only": 0.0017500072717666626,
|
399 |
+
"tpp_threshold_20_total_metric": 0.035499975085258484,
|
400 |
+
"tpp_threshold_20_intended_diff_only": 0.038999974727630615,
|
401 |
+
"tpp_threshold_20_unintended_diff_only": 0.0034999996423721313,
|
402 |
+
"tpp_threshold_50_total_metric": 0.06450000405311584,
|
403 |
+
"tpp_threshold_50_intended_diff_only": 0.07400000095367432,
|
404 |
+
"tpp_threshold_50_unintended_diff_only": 0.009499996900558472,
|
405 |
+
"tpp_threshold_100_total_metric": 0.10999998450279236,
|
406 |
+
"tpp_threshold_100_intended_diff_only": 0.11799997091293335,
|
407 |
+
"tpp_threshold_100_unintended_diff_only": 0.007999986410140991,
|
408 |
+
"tpp_threshold_500_total_metric": 0.3192500174045563,
|
409 |
+
"tpp_threshold_500_intended_diff_only": 0.3330000042915344,
|
410 |
+
"tpp_threshold_500_unintended_diff_only": 0.01374998688697815
|
411 |
+
}
|
412 |
+
}
|
413 |
+
}
|
414 |
+
}
|
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "tpp",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": false,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "496eec06-563d-45a6-8ad8-e68c3fad1008",
|
73 |
+
"datetime_epoch_millis": 1740163906732,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"tpp_metrics": {
|
76 |
+
"tpp_threshold_2_total_metric": 0.003649994730949402,
|
77 |
+
"tpp_threshold_2_intended_diff_only": 0.006199997663497925,
|
78 |
+
"tpp_threshold_2_unintended_diff_only": 0.0025500029325485228,
|
79 |
+
"tpp_threshold_5_total_metric": 0.0050999939441680915,
|
80 |
+
"tpp_threshold_5_intended_diff_only": 0.007999992370605467,
|
81 |
+
"tpp_threshold_5_unintended_diff_only": 0.0028999984264373776,
|
82 |
+
"tpp_threshold_10_total_metric": 0.013499999046325683,
|
83 |
+
"tpp_threshold_10_intended_diff_only": 0.017399996519088745,
|
84 |
+
"tpp_threshold_10_unintended_diff_only": 0.0038999974727630614,
|
85 |
+
"tpp_threshold_20_total_metric": 0.02162499576807022,
|
86 |
+
"tpp_threshold_20_intended_diff_only": 0.02619999647140503,
|
87 |
+
"tpp_threshold_20_unintended_diff_only": 0.004575000703334808,
|
88 |
+
"tpp_threshold_50_total_metric": 0.051074995100498205,
|
89 |
+
"tpp_threshold_50_intended_diff_only": 0.057199996709823606,
|
90 |
+
"tpp_threshold_50_unintended_diff_only": 0.006125001609325409,
|
91 |
+
"tpp_threshold_100_total_metric": 0.09944999665021896,
|
92 |
+
"tpp_threshold_100_intended_diff_only": 0.10920000076293945,
|
93 |
+
"tpp_threshold_100_unintended_diff_only": 0.009750004112720489,
|
94 |
+
"tpp_threshold_500_total_metric": 0.310000017285347,
|
95 |
+
"tpp_threshold_500_intended_diff_only": 0.3243000149726868,
|
96 |
+
"tpp_threshold_500_unintended_diff_only": 0.014299997687339782
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
|
102 |
+
"tpp_threshold_2_total_metric": 0.004200002551078797,
|
103 |
+
"tpp_threshold_2_intended_diff_only": 0.006200015544891357,
|
104 |
+
"tpp_threshold_2_unintended_diff_only": 0.002000012993812561,
|
105 |
+
"tpp_threshold_5_total_metric": 0.0056500047445297245,
|
106 |
+
"tpp_threshold_5_intended_diff_only": 0.00840001106262207,
|
107 |
+
"tpp_threshold_5_unintended_diff_only": 0.002750006318092346,
|
108 |
+
"tpp_threshold_10_total_metric": 0.01315000057220459,
|
109 |
+
"tpp_threshold_10_intended_diff_only": 0.015400004386901856,
|
110 |
+
"tpp_threshold_10_unintended_diff_only": 0.002250003814697266,
|
111 |
+
"tpp_threshold_20_total_metric": 0.02469998598098755,
|
112 |
+
"tpp_threshold_20_intended_diff_only": 0.02799999713897705,
|
113 |
+
"tpp_threshold_20_unintended_diff_only": 0.003300011157989502,
|
114 |
+
"tpp_threshold_50_total_metric": 0.05049999952316284,
|
115 |
+
"tpp_threshold_50_intended_diff_only": 0.053800010681152345,
|
116 |
+
"tpp_threshold_50_unintended_diff_only": 0.003300011157989502,
|
117 |
+
"tpp_threshold_100_total_metric": 0.09890000522136688,
|
118 |
+
"tpp_threshold_100_intended_diff_only": 0.103600013256073,
|
119 |
+
"tpp_threshold_100_unintended_diff_only": 0.004700008034706116,
|
120 |
+
"tpp_threshold_500_total_metric": 0.3669000148773193,
|
121 |
+
"tpp_threshold_500_intended_diff_only": 0.37420002222061155,
|
122 |
+
"tpp_threshold_500_unintended_diff_only": 0.0073000073432922365
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
|
126 |
+
"tpp_threshold_2_total_metric": 0.0030999869108200074,
|
127 |
+
"tpp_threshold_2_intended_diff_only": 0.006199979782104492,
|
128 |
+
"tpp_threshold_2_unintended_diff_only": 0.003099992871284485,
|
129 |
+
"tpp_threshold_5_total_metric": 0.004549983143806458,
|
130 |
+
"tpp_threshold_5_intended_diff_only": 0.007599973678588867,
|
131 |
+
"tpp_threshold_5_unintended_diff_only": 0.0030499905347824096,
|
132 |
+
"tpp_threshold_10_total_metric": 0.013849997520446777,
|
133 |
+
"tpp_threshold_10_intended_diff_only": 0.019399988651275634,
|
134 |
+
"tpp_threshold_10_unintended_diff_only": 0.005549991130828857,
|
135 |
+
"tpp_threshold_20_total_metric": 0.018550005555152894,
|
136 |
+
"tpp_threshold_20_intended_diff_only": 0.024399995803833008,
|
137 |
+
"tpp_threshold_20_unintended_diff_only": 0.005849990248680115,
|
138 |
+
"tpp_threshold_50_total_metric": 0.05164999067783356,
|
139 |
+
"tpp_threshold_50_intended_diff_only": 0.06059998273849487,
|
140 |
+
"tpp_threshold_50_unintended_diff_only": 0.008949992060661317,
|
141 |
+
"tpp_threshold_100_total_metric": 0.09999998807907104,
|
142 |
+
"tpp_threshold_100_intended_diff_only": 0.1147999882698059,
|
143 |
+
"tpp_threshold_100_unintended_diff_only": 0.014800000190734863,
|
144 |
+
"tpp_threshold_500_total_metric": 0.25310001969337464,
|
145 |
+
"tpp_threshold_500_intended_diff_only": 0.274400007724762,
|
146 |
+
"tpp_threshold_500_unintended_diff_only": 0.02129998803138733
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
150 |
+
"sae_lens_id": "custom_sae",
|
151 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0",
|
152 |
+
"sae_lens_version": "5.4.2",
|
153 |
+
"sae_cfg_dict": {
|
154 |
+
"model_name": "gemma-2-2b",
|
155 |
+
"d_in": 2304,
|
156 |
+
"d_sae": 65536,
|
157 |
+
"hook_layer": 12,
|
158 |
+
"hook_name": "blocks.12.hook_resid_post",
|
159 |
+
"context_size": null,
|
160 |
+
"hook_head_index": null,
|
161 |
+
"architecture": "topk",
|
162 |
+
"apply_b_dec_to_input": null,
|
163 |
+
"finetuning_scaling_factor": null,
|
164 |
+
"activation_fn_str": "",
|
165 |
+
"prepend_bos": true,
|
166 |
+
"normalize_activations": "none",
|
167 |
+
"dtype": "bfloat16",
|
168 |
+
"device": "",
|
169 |
+
"dataset_path": "",
|
170 |
+
"dataset_trust_remote_code": true,
|
171 |
+
"seqpos_slice": [
|
172 |
+
null
|
173 |
+
],
|
174 |
+
"training_tokens": -100000,
|
175 |
+
"sae_lens_training_version": null,
|
176 |
+
"neuronpedia_id": null
|
177 |
+
},
|
178 |
+
"eval_result_unstructured": {
|
179 |
+
"LabHC/bias_in_bios_class_set1": {
|
180 |
+
"0": {
|
181 |
+
"tpp_threshold_2_total_metric": 0.006749987602233887,
|
182 |
+
"tpp_threshold_2_intended_diff_only": 0.009999990463256836,
|
183 |
+
"tpp_threshold_2_unintended_diff_only": 0.0032500028610229492,
|
184 |
+
"tpp_threshold_5_total_metric": 0.017000019550323486,
|
185 |
+
"tpp_threshold_5_intended_diff_only": 0.021000027656555176,
|
186 |
+
"tpp_threshold_5_unintended_diff_only": 0.0040000081062316895,
|
187 |
+
"tpp_threshold_10_total_metric": 0.009750038385391235,
|
188 |
+
"tpp_threshold_10_intended_diff_only": 0.01500004529953003,
|
189 |
+
"tpp_threshold_10_unintended_diff_only": 0.005250006914138794,
|
190 |
+
"tpp_threshold_20_total_metric": 0.029749974608421326,
|
191 |
+
"tpp_threshold_20_intended_diff_only": 0.03299999237060547,
|
192 |
+
"tpp_threshold_20_unintended_diff_only": 0.003250017762184143,
|
193 |
+
"tpp_threshold_50_total_metric": 0.046999990940093994,
|
194 |
+
"tpp_threshold_50_intended_diff_only": 0.050999999046325684,
|
195 |
+
"tpp_threshold_50_unintended_diff_only": 0.0040000081062316895,
|
196 |
+
"tpp_threshold_100_total_metric": 0.12275001406669617,
|
197 |
+
"tpp_threshold_100_intended_diff_only": 0.12800002098083496,
|
198 |
+
"tpp_threshold_100_unintended_diff_only": 0.005250006914138794,
|
199 |
+
"tpp_threshold_500_total_metric": 0.398250013589859,
|
200 |
+
"tpp_threshold_500_intended_diff_only": 0.4020000100135803,
|
201 |
+
"tpp_threshold_500_unintended_diff_only": 0.0037499964237213135
|
202 |
+
},
|
203 |
+
"1": {
|
204 |
+
"tpp_threshold_2_total_metric": 0.0050000399351119995,
|
205 |
+
"tpp_threshold_2_intended_diff_only": 0.005000054836273193,
|
206 |
+
"tpp_threshold_2_unintended_diff_only": 1.4901161193847656e-08,
|
207 |
+
"tpp_threshold_5_total_metric": 0.0025000274181365967,
|
208 |
+
"tpp_threshold_5_intended_diff_only": 0.003000020980834961,
|
209 |
+
"tpp_threshold_5_unintended_diff_only": 0.0004999935626983643,
|
210 |
+
"tpp_threshold_10_total_metric": 0.010499998927116394,
|
211 |
+
"tpp_threshold_10_intended_diff_only": 0.009000003337860107,
|
212 |
+
"tpp_threshold_10_unintended_diff_only": -0.0014999955892562866,
|
213 |
+
"tpp_threshold_20_total_metric": 0.0104999840259552,
|
214 |
+
"tpp_threshold_20_intended_diff_only": 0.013999998569488525,
|
215 |
+
"tpp_threshold_20_unintended_diff_only": 0.003500014543533325,
|
216 |
+
"tpp_threshold_50_total_metric": 0.04275001585483551,
|
217 |
+
"tpp_threshold_50_intended_diff_only": 0.0480000376701355,
|
218 |
+
"tpp_threshold_50_unintended_diff_only": 0.005250021815299988,
|
219 |
+
"tpp_threshold_100_total_metric": 0.0625,
|
220 |
+
"tpp_threshold_100_intended_diff_only": 0.0690000057220459,
|
221 |
+
"tpp_threshold_100_unintended_diff_only": 0.0065000057220458984,
|
222 |
+
"tpp_threshold_500_total_metric": 0.31550003588199615,
|
223 |
+
"tpp_threshold_500_intended_diff_only": 0.32600003480911255,
|
224 |
+
"tpp_threshold_500_unintended_diff_only": 0.010499998927116394
|
225 |
+
},
|
226 |
+
"2": {
|
227 |
+
"tpp_threshold_2_total_metric": 0.0007500052452087402,
|
228 |
+
"tpp_threshold_2_intended_diff_only": 0.003000020980834961,
|
229 |
+
"tpp_threshold_2_unintended_diff_only": 0.0022500157356262207,
|
230 |
+
"tpp_threshold_5_total_metric": 0.001499950885772705,
|
231 |
+
"tpp_threshold_5_intended_diff_only": 0.006999969482421875,
|
232 |
+
"tpp_threshold_5_unintended_diff_only": 0.00550001859664917,
|
233 |
+
"tpp_threshold_10_total_metric": 0.0337500125169754,
|
234 |
+
"tpp_threshold_10_intended_diff_only": 0.03600001335144043,
|
235 |
+
"tpp_threshold_10_unintended_diff_only": 0.002250000834465027,
|
236 |
+
"tpp_threshold_20_total_metric": 0.047749996185302734,
|
237 |
+
"tpp_threshold_20_intended_diff_only": 0.050999999046325684,
|
238 |
+
"tpp_threshold_20_unintended_diff_only": 0.0032500028610229492,
|
239 |
+
"tpp_threshold_50_total_metric": 0.07749998569488525,
|
240 |
+
"tpp_threshold_50_intended_diff_only": 0.07899999618530273,
|
241 |
+
"tpp_threshold_50_unintended_diff_only": 0.0015000104904174805,
|
242 |
+
"tpp_threshold_100_total_metric": 0.13199999928474426,
|
243 |
+
"tpp_threshold_100_intended_diff_only": 0.1340000033378601,
|
244 |
+
"tpp_threshold_100_unintended_diff_only": 0.0020000040531158447,
|
245 |
+
"tpp_threshold_500_total_metric": 0.37299999594688416,
|
246 |
+
"tpp_threshold_500_intended_diff_only": 0.3790000081062317,
|
247 |
+
"tpp_threshold_500_unintended_diff_only": 0.006000012159347534
|
248 |
+
},
|
249 |
+
"6": {
|
250 |
+
"tpp_threshold_2_total_metric": 0.0015000104904174805,
|
251 |
+
"tpp_threshold_2_intended_diff_only": 0.003000020980834961,
|
252 |
+
"tpp_threshold_2_unintended_diff_only": 0.0015000104904174805,
|
253 |
+
"tpp_threshold_5_total_metric": 0.003000035881996155,
|
254 |
+
"tpp_threshold_5_intended_diff_only": 0.0020000338554382324,
|
255 |
+
"tpp_threshold_5_unintended_diff_only": -0.0010000020265579224,
|
256 |
+
"tpp_threshold_10_total_metric": 0.005249977111816406,
|
257 |
+
"tpp_threshold_10_intended_diff_only": 0.0059999823570251465,
|
258 |
+
"tpp_threshold_10_unintended_diff_only": 0.0007500052452087402,
|
259 |
+
"tpp_threshold_20_total_metric": 0.004999995231628418,
|
260 |
+
"tpp_threshold_20_intended_diff_only": 0.004999995231628418,
|
261 |
+
"tpp_threshold_20_unintended_diff_only": 0.0,
|
262 |
+
"tpp_threshold_50_total_metric": 0.008500009775161743,
|
263 |
+
"tpp_threshold_50_intended_diff_only": 0.009000003337860107,
|
264 |
+
"tpp_threshold_50_unintended_diff_only": 0.0004999935626983643,
|
265 |
+
"tpp_threshold_100_total_metric": 0.02300003170967102,
|
266 |
+
"tpp_threshold_100_intended_diff_only": 0.025000035762786865,
|
267 |
+
"tpp_threshold_100_unintended_diff_only": 0.0020000040531158447,
|
268 |
+
"tpp_threshold_500_total_metric": 0.320000022649765,
|
269 |
+
"tpp_threshold_500_intended_diff_only": 0.32600003480911255,
|
270 |
+
"tpp_threshold_500_unintended_diff_only": 0.006000012159347534
|
271 |
+
},
|
272 |
+
"9": {
|
273 |
+
"tpp_threshold_2_total_metric": 0.006999969482421875,
|
274 |
+
"tpp_threshold_2_intended_diff_only": 0.009999990463256836,
|
275 |
+
"tpp_threshold_2_unintended_diff_only": 0.003000020980834961,
|
276 |
+
"tpp_threshold_5_total_metric": 0.004249989986419678,
|
277 |
+
"tpp_threshold_5_intended_diff_only": 0.009000003337860107,
|
278 |
+
"tpp_threshold_5_unintended_diff_only": 0.00475001335144043,
|
279 |
+
"tpp_threshold_10_total_metric": 0.006499975919723511,
|
280 |
+
"tpp_threshold_10_intended_diff_only": 0.010999977588653564,
|
281 |
+
"tpp_threshold_10_unintended_diff_only": 0.004500001668930054,
|
282 |
+
"tpp_threshold_20_total_metric": 0.030499979853630066,
|
283 |
+
"tpp_threshold_20_intended_diff_only": 0.03700000047683716,
|
284 |
+
"tpp_threshold_20_unintended_diff_only": 0.006500020623207092,
|
285 |
+
"tpp_threshold_50_total_metric": 0.07674999535083771,
|
286 |
+
"tpp_threshold_50_intended_diff_only": 0.0820000171661377,
|
287 |
+
"tpp_threshold_50_unintended_diff_only": 0.005250021815299988,
|
288 |
+
"tpp_threshold_100_total_metric": 0.15424998104572296,
|
289 |
+
"tpp_threshold_100_intended_diff_only": 0.16200000047683716,
|
290 |
+
"tpp_threshold_100_unintended_diff_only": 0.007750019431114197,
|
291 |
+
"tpp_threshold_500_total_metric": 0.42775000631809235,
|
292 |
+
"tpp_threshold_500_intended_diff_only": 0.43800002336502075,
|
293 |
+
"tpp_threshold_500_unintended_diff_only": 0.010250017046928406
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"canrager/amazon_reviews_mcauley_1and5": {
|
297 |
+
"1": {
|
298 |
+
"tpp_threshold_2_total_metric": 0.00424996018409729,
|
299 |
+
"tpp_threshold_2_intended_diff_only": 0.007999956607818604,
|
300 |
+
"tpp_threshold_2_unintended_diff_only": 0.0037499964237213135,
|
301 |
+
"tpp_threshold_5_total_metric": 0.0015000104904174805,
|
302 |
+
"tpp_threshold_5_intended_diff_only": 0.004999995231628418,
|
303 |
+
"tpp_threshold_5_unintended_diff_only": 0.0034999847412109375,
|
304 |
+
"tpp_threshold_10_total_metric": 0.0007499605417251587,
|
305 |
+
"tpp_threshold_10_intended_diff_only": 0.007999956607818604,
|
306 |
+
"tpp_threshold_10_unintended_diff_only": 0.007249996066093445,
|
307 |
+
"tpp_threshold_20_total_metric": 0.001749962568283081,
|
308 |
+
"tpp_threshold_20_intended_diff_only": 0.006999969482421875,
|
309 |
+
"tpp_threshold_20_unintended_diff_only": 0.005250006914138794,
|
310 |
+
"tpp_threshold_50_total_metric": 0.011249944567680359,
|
311 |
+
"tpp_threshold_50_intended_diff_only": 0.012999951839447021,
|
312 |
+
"tpp_threshold_50_unintended_diff_only": 0.0017500072717666626,
|
313 |
+
"tpp_threshold_100_total_metric": 0.011499956250190735,
|
314 |
+
"tpp_threshold_100_intended_diff_only": 0.0209999680519104,
|
315 |
+
"tpp_threshold_100_unintended_diff_only": 0.009500011801719666,
|
316 |
+
"tpp_threshold_500_total_metric": 0.1262499988079071,
|
317 |
+
"tpp_threshold_500_intended_diff_only": 0.13899999856948853,
|
318 |
+
"tpp_threshold_500_unintended_diff_only": 0.012749999761581421
|
319 |
+
},
|
320 |
+
"2": {
|
321 |
+
"tpp_threshold_2_total_metric": 0.0040000081062316895,
|
322 |
+
"tpp_threshold_2_intended_diff_only": 0.004999995231628418,
|
323 |
+
"tpp_threshold_2_unintended_diff_only": 0.0009999871253967285,
|
324 |
+
"tpp_threshold_5_total_metric": -0.004500031471252441,
|
325 |
+
"tpp_threshold_5_intended_diff_only": 0.0029999613761901855,
|
326 |
+
"tpp_threshold_5_unintended_diff_only": 0.007499992847442627,
|
327 |
+
"tpp_threshold_10_total_metric": 0.00449998676776886,
|
328 |
+
"tpp_threshold_10_intended_diff_only": 0.010999977588653564,
|
329 |
+
"tpp_threshold_10_unintended_diff_only": 0.006499990820884705,
|
330 |
+
"tpp_threshold_20_total_metric": 0.009250015020370483,
|
331 |
+
"tpp_threshold_20_intended_diff_only": 0.013999998569488525,
|
332 |
+
"tpp_threshold_20_unintended_diff_only": 0.004749983549118042,
|
333 |
+
"tpp_threshold_50_total_metric": 0.03349998593330383,
|
334 |
+
"tpp_threshold_50_intended_diff_only": 0.04399996995925903,
|
335 |
+
"tpp_threshold_50_unintended_diff_only": 0.0104999840259552,
|
336 |
+
"tpp_threshold_100_total_metric": 0.06824997067451477,
|
337 |
+
"tpp_threshold_100_intended_diff_only": 0.08499997854232788,
|
338 |
+
"tpp_threshold_100_unintended_diff_only": 0.01675000786781311,
|
339 |
+
"tpp_threshold_500_total_metric": 0.25550003349781036,
|
340 |
+
"tpp_threshold_500_intended_diff_only": 0.2720000147819519,
|
341 |
+
"tpp_threshold_500_unintended_diff_only": 0.01649998128414154
|
342 |
+
},
|
343 |
+
"3": {
|
344 |
+
"tpp_threshold_2_total_metric": -0.006250008940696716,
|
345 |
+
"tpp_threshold_2_intended_diff_only": -0.003000020980834961,
|
346 |
+
"tpp_threshold_2_unintended_diff_only": 0.0032499879598617554,
|
347 |
+
"tpp_threshold_5_total_metric": 0.0027499645948410034,
|
348 |
+
"tpp_threshold_5_intended_diff_only": 0.0029999613761901855,
|
349 |
+
"tpp_threshold_5_unintended_diff_only": 0.00024999678134918213,
|
350 |
+
"tpp_threshold_10_total_metric": 0.0065000057220458984,
|
351 |
+
"tpp_threshold_10_intended_diff_only": 0.009999990463256836,
|
352 |
+
"tpp_threshold_10_unintended_diff_only": 0.0034999847412109375,
|
353 |
+
"tpp_threshold_20_total_metric": 0.006000012159347534,
|
354 |
+
"tpp_threshold_20_intended_diff_only": 0.013000011444091797,
|
355 |
+
"tpp_threshold_20_unintended_diff_only": 0.006999999284744263,
|
356 |
+
"tpp_threshold_50_total_metric": 0.02499997615814209,
|
357 |
+
"tpp_threshold_50_intended_diff_only": 0.029999971389770508,
|
358 |
+
"tpp_threshold_50_unintended_diff_only": 0.004999995231628418,
|
359 |
+
"tpp_threshold_100_total_metric": 0.06025001406669617,
|
360 |
+
"tpp_threshold_100_intended_diff_only": 0.07300001382827759,
|
361 |
+
"tpp_threshold_100_unintended_diff_only": 0.012749999761581421,
|
362 |
+
"tpp_threshold_500_total_metric": 0.19625000655651093,
|
363 |
+
"tpp_threshold_500_intended_diff_only": 0.22699999809265137,
|
364 |
+
"tpp_threshold_500_unintended_diff_only": 0.030749991536140442
|
365 |
+
},
|
366 |
+
"5": {
|
367 |
+
"tpp_threshold_2_total_metric": 0.006000012159347534,
|
368 |
+
"tpp_threshold_2_intended_diff_only": 0.009000003337860107,
|
369 |
+
"tpp_threshold_2_unintended_diff_only": 0.0029999911785125732,
|
370 |
+
"tpp_threshold_5_total_metric": 0.008500009775161743,
|
371 |
+
"tpp_threshold_5_intended_diff_only": 0.013999998569488525,
|
372 |
+
"tpp_threshold_5_unintended_diff_only": 0.005499988794326782,
|
373 |
+
"tpp_threshold_10_total_metric": 0.018500030040740967,
|
374 |
+
"tpp_threshold_10_intended_diff_only": 0.026000022888183594,
|
375 |
+
"tpp_threshold_10_unintended_diff_only": 0.007499992847442627,
|
376 |
+
"tpp_threshold_20_total_metric": 0.027250036597251892,
|
377 |
+
"tpp_threshold_20_intended_diff_only": 0.03600001335144043,
|
378 |
+
"tpp_threshold_20_unintended_diff_only": 0.008749976754188538,
|
379 |
+
"tpp_threshold_50_total_metric": 0.09700007736682892,
|
380 |
+
"tpp_threshold_50_intended_diff_only": 0.11600005626678467,
|
381 |
+
"tpp_threshold_50_unintended_diff_only": 0.01899997889995575,
|
382 |
+
"tpp_threshold_100_total_metric": 0.17525003850460052,
|
383 |
+
"tpp_threshold_100_intended_diff_only": 0.20200002193450928,
|
384 |
+
"tpp_threshold_100_unintended_diff_only": 0.026749983429908752,
|
385 |
+
"tpp_threshold_500_total_metric": 0.34125006198883057,
|
386 |
+
"tpp_threshold_500_intended_diff_only": 0.3720000386238098,
|
387 |
+
"tpp_threshold_500_unintended_diff_only": 0.030749976634979248
|
388 |
+
},
|
389 |
+
"6": {
|
390 |
+
"tpp_threshold_2_total_metric": 0.007499963045120239,
|
391 |
+
"tpp_threshold_2_intended_diff_only": 0.011999964714050293,
|
392 |
+
"tpp_threshold_2_unintended_diff_only": 0.004500001668930054,
|
393 |
+
"tpp_threshold_5_total_metric": 0.014499962329864502,
|
394 |
+
"tpp_threshold_5_intended_diff_only": 0.012999951839447021,
|
395 |
+
"tpp_threshold_5_unintended_diff_only": -0.0015000104904174805,
|
396 |
+
"tpp_threshold_10_total_metric": 0.039000004529953,
|
397 |
+
"tpp_threshold_10_intended_diff_only": 0.041999995708465576,
|
398 |
+
"tpp_threshold_10_unintended_diff_only": 0.0029999911785125732,
|
399 |
+
"tpp_threshold_20_total_metric": 0.048500001430511475,
|
400 |
+
"tpp_threshold_20_intended_diff_only": 0.05199998617172241,
|
401 |
+
"tpp_threshold_20_unintended_diff_only": 0.0034999847412109375,
|
402 |
+
"tpp_threshold_50_total_metric": 0.09149996936321259,
|
403 |
+
"tpp_threshold_50_intended_diff_only": 0.09999996423721313,
|
404 |
+
"tpp_threshold_50_unintended_diff_only": 0.00849999487400055,
|
405 |
+
"tpp_threshold_100_total_metric": 0.18474996089935303,
|
406 |
+
"tpp_threshold_100_intended_diff_only": 0.1929999589920044,
|
407 |
+
"tpp_threshold_100_unintended_diff_only": 0.008249998092651367,
|
408 |
+
"tpp_threshold_500_total_metric": 0.3462499976158142,
|
409 |
+
"tpp_threshold_500_intended_diff_only": 0.3619999885559082,
|
410 |
+
"tpp_threshold_500_unintended_diff_only": 0.015749990940093994
|
411 |
+
}
|
412 |
+
}
|
413 |
+
}
|
414 |
+
}
|
eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "unlearning",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"wmdp-bio",
|
7 |
+
"high_school_us_history",
|
8 |
+
"college_computer_science",
|
9 |
+
"high_school_geography",
|
10 |
+
"human_aging"
|
11 |
+
],
|
12 |
+
"intervention_method": "clamp_feature_activation",
|
13 |
+
"retain_thresholds": [
|
14 |
+
0.001,
|
15 |
+
0.01
|
16 |
+
],
|
17 |
+
"n_features_list": [
|
18 |
+
10,
|
19 |
+
20
|
20 |
+
],
|
21 |
+
"multipliers": [
|
22 |
+
25,
|
23 |
+
50,
|
24 |
+
100,
|
25 |
+
200
|
26 |
+
],
|
27 |
+
"dataset_size": 1024,
|
28 |
+
"seq_len": 1024,
|
29 |
+
"n_batch_loss_added": 50,
|
30 |
+
"target_metric": "correct",
|
31 |
+
"save_metrics": true,
|
32 |
+
"model_name": "gemma-2-2b-it",
|
33 |
+
"llm_batch_size": 4,
|
34 |
+
"llm_dtype": "bfloat16"
|
35 |
+
},
|
36 |
+
"eval_id": "9c4fef1a-ae28-4280-b511-c8d59c94496f",
|
37 |
+
"datetime_epoch_millis": 1740170757553,
|
38 |
+
"eval_result_metrics": {
|
39 |
+
"unlearning": {
|
40 |
+
"unlearning_score": 0.022514045238494873
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"eval_result_details": [],
|
44 |
+
"sae_bench_commit_hash": "d91a218b4cc4ac6c164d0e1b739c8437901c7acd",
|
45 |
+
"sae_lens_id": "custom_sae",
|
46 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0",
|
47 |
+
"sae_lens_version": "5.4.2",
|
48 |
+
"sae_cfg_dict": {
|
49 |
+
"model_name": "gemma-2-2b",
|
50 |
+
"d_in": 2304,
|
51 |
+
"d_sae": 65536,
|
52 |
+
"hook_layer": 12,
|
53 |
+
"hook_name": "blocks.12.hook_resid_post",
|
54 |
+
"context_size": null,
|
55 |
+
"hook_head_index": null,
|
56 |
+
"architecture": "topk",
|
57 |
+
"apply_b_dec_to_input": null,
|
58 |
+
"finetuning_scaling_factor": null,
|
59 |
+
"activation_fn_str": "",
|
60 |
+
"prepend_bos": true,
|
61 |
+
"normalize_activations": "none",
|
62 |
+
"dtype": "bfloat16",
|
63 |
+
"device": "",
|
64 |
+
"dataset_path": "",
|
65 |
+
"dataset_trust_remote_code": true,
|
66 |
+
"seqpos_slice": [
|
67 |
+
null
|
68 |
+
],
|
69 |
+
"training_tokens": -100000,
|
70 |
+
"sae_lens_training_version": null,
|
71 |
+
"neuronpedia_id": null
|
72 |
+
},
|
73 |
+
"eval_result_unstructured": null
|
74 |
+
}
|