adamkarvonen committed
Commit 5f9c6ec · verified · 1 Parent(s): 6355eb3

Add files using upload-large-folder tool

Files changed (29)
  1. .gitattributes +8 -0
  2. eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json +268 -0
  3. eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json +268 -0
  4. eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json +268 -0
  5. eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json +268 -0
  6. eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json +3 -0
  7. eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json +3 -0
  8. eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json +3 -0
  9. eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json +3 -0
  10. eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json +3 -0
  11. eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json +3 -0
  12. eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json +3 -0
  13. eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json +3 -0
  14. eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json +323 -0
  15. eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json +323 -0
  16. eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json +323 -0
  17. eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json +323 -0
  18. eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json +670 -0
  19. eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json +670 -0
  20. eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json +670 -0
  21. eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json +670 -0
  22. eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json +414 -0
  23. eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json +414 -0
  24. eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json +414 -0
  25. eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json +414 -0
  26. eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json +74 -0
  27. eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json +74 -0
  28. eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json +74 -0
  29. eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json +74 -0
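All of the added files are plain JSON (several stored via Git LFS, per the `.gitattributes` changes below). As a quick, hedged illustration — not part of this commit — here is one way to pull a single result file from the Hub and inspect it with `huggingface_hub`. The repo id is a hypothetical placeholder, since the commit page does not name the repository:

```python
import json
from huggingface_hub import hf_hub_download

# Hypothetical repo id -- substitute the repository this commit belongs to
# (pass repo_type="dataset" if it is a dataset repo rather than a model repo).
REPO_ID = "<namespace>/<repo-name>"

# Download one of the newly added absorption eval results (LFS-backed files
# are resolved transparently by hf_hub_download).
path = hf_hub_download(
    repo_id=REPO_ID,
    filename="eval_results_from_scratch/absorption/"
    "kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json",
)
with open(path) as f:
    result = json.load(f)
print(result["eval_type_id"])  # "absorption_first_letter"
```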
.gitattributes CHANGED
@@ -85,3 +85,11 @@ eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_16
85
  eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
86
  eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
87
  eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
88
+ eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
89
+ eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
90
+ eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
91
+ eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
92
+ eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
93
+ eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
94
+ eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
95
+ eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,268 @@
1
+ {
2
+ "eval_type_id": "absorption_first_letter",
3
+ "eval_config": {
4
+ "model_name": "gemma-2-2b",
5
+ "random_seed": 42,
6
+ "f1_jump_threshold": 0.03,
7
+ "max_k_value": 10,
8
+ "prompt_template": "{word} has the first letter:",
9
+ "prompt_token_pos": -6,
10
+ "llm_batch_size": 32,
11
+ "llm_dtype": "bfloat16",
12
+ "k_sparse_probe_l1_decay": 0.01,
13
+ "k_sparse_probe_batch_size": 4096,
14
+ "k_sparse_probe_num_epochs": 50
15
+ },
16
+ "eval_id": "24efad56-2464-4fc7-a72f-d8ee75af8889",
17
+ "datetime_epoch_millis": 1740196513100,
18
+ "eval_result_metrics": {
19
+ "mean": {
20
+ "mean_absorption_fraction_score": 0.07986090984327109,
21
+ "mean_full_absorption_score": 0.05125248877852254,
22
+ "mean_num_split_features": 1.1153846153846154,
23
+ "std_dev_absorption_fraction_score": 0.09178111702879053,
24
+ "std_dev_full_absorption_score": 0.06575471363588886,
25
+ "std_dev_num_split_features": 0.3258125936084211
26
+ }
27
+ },
28
+ "eval_result_details": [
29
+ {
30
+ "first_letter": "a",
31
+ "mean_absorption_fraction": 0.028337216815245637,
32
+ "full_absorption_rate": 0.02044753086419753,
33
+ "num_full_absorption": 53,
34
+ "num_probe_true_positives": 2592,
35
+ "num_split_features": 1
36
+ },
37
+ {
38
+ "first_letter": "b",
39
+ "mean_absorption_fraction": 0.0018002834587106894,
40
+ "full_absorption_rate": 0.0031308703819661866,
41
+ "num_full_absorption": 5,
42
+ "num_probe_true_positives": 1597,
43
+ "num_split_features": 1
44
+ },
45
+ {
46
+ "first_letter": "c",
47
+ "mean_absorption_fraction": 0.25239018623716264,
48
+ "full_absorption_rate": 0.16895803183791605,
49
+ "num_full_absorption": 467,
50
+ "num_probe_true_positives": 2764,
51
+ "num_split_features": 1
52
+ },
53
+ {
54
+ "first_letter": "d",
55
+ "mean_absorption_fraction": 0.21202494352131684,
56
+ "full_absorption_rate": 0.1261904761904762,
57
+ "num_full_absorption": 212,
58
+ "num_probe_true_positives": 1680,
59
+ "num_split_features": 1
60
+ },
61
+ {
62
+ "first_letter": "e",
63
+ "mean_absorption_fraction": 0.1599800731731651,
64
+ "full_absorption_rate": 0.09768009768009768,
65
+ "num_full_absorption": 160,
66
+ "num_probe_true_positives": 1638,
67
+ "num_split_features": 1
68
+ },
69
+ {
70
+ "first_letter": "f",
71
+ "mean_absorption_fraction": 0.08533490557098641,
72
+ "full_absorption_rate": 0.030032467532467532,
73
+ "num_full_absorption": 37,
74
+ "num_probe_true_positives": 1232,
75
+ "num_split_features": 1
76
+ },
77
+ {
78
+ "first_letter": "g",
79
+ "mean_absorption_fraction": 0.01762098136284656,
80
+ "full_absorption_rate": 0.011583011583011582,
81
+ "num_full_absorption": 12,
82
+ "num_probe_true_positives": 1036,
83
+ "num_split_features": 2
84
+ },
85
+ {
86
+ "first_letter": "h",
87
+ "mean_absorption_fraction": 0.017284095093548002,
88
+ "full_absorption_rate": 0.004945598417408506,
89
+ "num_full_absorption": 5,
90
+ "num_probe_true_positives": 1011,
91
+ "num_split_features": 1
92
+ },
93
+ {
94
+ "first_letter": "i",
95
+ "mean_absorption_fraction": 0.21843708576706966,
96
+ "full_absorption_rate": 0.1754278728606357,
97
+ "num_full_absorption": 287,
98
+ "num_probe_true_positives": 1636,
99
+ "num_split_features": 2
100
+ },
101
+ {
102
+ "first_letter": "j",
103
+ "mean_absorption_fraction": 0.002670673585986552,
104
+ "full_absorption_rate": 0.004576659038901602,
105
+ "num_full_absorption": 2,
106
+ "num_probe_true_positives": 437,
107
+ "num_split_features": 1
108
+ },
109
+ {
110
+ "first_letter": "k",
111
+ "mean_absorption_fraction": 0.0009236539068285315,
112
+ "full_absorption_rate": 0.004379562043795621,
113
+ "num_full_absorption": 3,
114
+ "num_probe_true_positives": 685,
115
+ "num_split_features": 1
116
+ },
117
+ {
118
+ "first_letter": "l",
119
+ "mean_absorption_fraction": 0.09799425185837078,
120
+ "full_absorption_rate": 0.05155482815057283,
121
+ "num_full_absorption": 63,
122
+ "num_probe_true_positives": 1222,
123
+ "num_split_features": 1
124
+ },
125
+ {
126
+ "first_letter": "m",
127
+ "mean_absorption_fraction": 0.00829845786933557,
128
+ "full_absorption_rate": 0.011791128579449747,
129
+ "num_full_absorption": 21,
130
+ "num_probe_true_positives": 1781,
131
+ "num_split_features": 1
132
+ },
133
+ {
134
+ "first_letter": "n",
135
+ "mean_absorption_fraction": 0.07769420525556889,
136
+ "full_absorption_rate": 0.03904282115869018,
137
+ "num_full_absorption": 31,
138
+ "num_probe_true_positives": 794,
139
+ "num_split_features": 1
140
+ },
141
+ {
142
+ "first_letter": "o",
143
+ "mean_absorption_fraction": 0.09555476160586396,
144
+ "full_absorption_rate": 0.0508637236084453,
145
+ "num_full_absorption": 53,
146
+ "num_probe_true_positives": 1042,
147
+ "num_split_features": 1
148
+ },
149
+ {
150
+ "first_letter": "p",
151
+ "mean_absorption_fraction": 0.31828712961790506,
152
+ "full_absorption_rate": 0.23675357443229605,
153
+ "num_full_absorption": 563,
154
+ "num_probe_true_positives": 2378,
155
+ "num_split_features": 1
156
+ },
157
+ {
158
+ "first_letter": "q",
159
+ "mean_absorption_fraction": 0.0,
160
+ "full_absorption_rate": 0.0,
161
+ "num_full_absorption": 0,
162
+ "num_probe_true_positives": 180,
163
+ "num_split_features": 1
164
+ },
165
+ {
166
+ "first_letter": "r",
167
+ "mean_absorption_fraction": 0.1849465735936363,
168
+ "full_absorption_rate": 0.14627994955863807,
169
+ "num_full_absorption": 232,
170
+ "num_probe_true_positives": 1586,
171
+ "num_split_features": 1
172
+ },
173
+ {
174
+ "first_letter": "s",
175
+ "mean_absorption_fraction": 0.13957874484003893,
176
+ "full_absorption_rate": 0.05545927209705372,
177
+ "num_full_absorption": 160,
178
+ "num_probe_true_positives": 2885,
179
+ "num_split_features": 1
180
+ },
181
+ {
182
+ "first_letter": "t",
183
+ "mean_absorption_fraction": 0.016794345867815363,
184
+ "full_absorption_rate": 0.0029222676797194622,
185
+ "num_full_absorption": 5,
186
+ "num_probe_true_positives": 1711,
187
+ "num_split_features": 1
188
+ },
189
+ {
190
+ "first_letter": "u",
191
+ "mean_absorption_fraction": 0.016594745549382752,
192
+ "full_absorption_rate": 0.011811023622047244,
193
+ "num_full_absorption": 9,
194
+ "num_probe_true_positives": 762,
195
+ "num_split_features": 2
196
+ },
197
+ {
198
+ "first_letter": "v",
199
+ "mean_absorption_fraction": 0.0009117410273845578,
200
+ "full_absorption_rate": 0.0012642225031605564,
201
+ "num_full_absorption": 1,
202
+ "num_probe_true_positives": 791,
203
+ "num_split_features": 1
204
+ },
205
+ {
206
+ "first_letter": "w",
207
+ "mean_absorption_fraction": 0.05020130296913499,
208
+ "full_absorption_rate": 0.038461538461538464,
209
+ "num_full_absorption": 26,
210
+ "num_probe_true_positives": 676,
211
+ "num_split_features": 1
212
+ },
213
+ {
214
+ "first_letter": "x",
215
+ "mean_absorption_fraction": 0.025096508373739165,
216
+ "full_absorption_rate": 0.0,
217
+ "num_full_absorption": 0,
218
+ "num_probe_true_positives": 85,
219
+ "num_split_features": 1
220
+ },
221
+ {
222
+ "first_letter": "y",
223
+ "mean_absorption_fraction": 0.04695990283036635,
224
+ "full_absorption_rate": 0.03067484662576687,
225
+ "num_full_absorption": 5,
226
+ "num_probe_true_positives": 163,
227
+ "num_split_features": 1
228
+ },
229
+ {
230
+ "first_letter": "z",
231
+ "mean_absorption_fraction": 0.0006668861736390532,
232
+ "full_absorption_rate": 0.008333333333333333,
233
+ "num_full_absorption": 2,
234
+ "num_probe_true_positives": 240,
235
+ "num_split_features": 1
236
+ }
237
+ ],
238
+ "sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
239
+ "sae_lens_id": "custom_sae",
240
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000",
241
+ "sae_lens_version": "5.5.0",
242
+ "sae_cfg_dict": {
243
+ "model_name": "gemma-2-2b",
244
+ "d_in": 2304,
245
+ "d_sae": 65536,
246
+ "hook_layer": 12,
247
+ "hook_name": "blocks.12.hook_resid_post",
248
+ "context_size": null,
249
+ "hook_head_index": null,
250
+ "architecture": "topk",
251
+ "apply_b_dec_to_input": null,
252
+ "finetuning_scaling_factor": null,
253
+ "activation_fn_str": "",
254
+ "prepend_bos": true,
255
+ "normalize_activations": "none",
256
+ "dtype": "bfloat16",
257
+ "device": "",
258
+ "dataset_path": "",
259
+ "dataset_trust_remote_code": true,
260
+ "seqpos_slice": [
261
+ null
262
+ ],
263
+ "training_tokens": -100000,
264
+ "sae_lens_training_version": null,
265
+ "neuronpedia_id": null
266
+ },
267
+ "eval_result_unstructured": null
268
+ }
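For orientation, the headline means in this file appear to be unweighted averages over the 26 per-letter entries in `eval_result_details`. A minimal sketch (assuming the file has been downloaded as in the earlier snippet, path relative to the repo root) that recomputes `mean_full_absorption_score` from the details:

```python
import json

# Recompute the aggregate from the per-letter details of the trainer_160
# absorption result shown above.
path = (
    "eval_results_from_scratch/absorption/"
    "kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json"
)
with open(path) as f:
    result = json.load(f)

rates = [d["full_absorption_rate"] for d in result["eval_result_details"]]
print(sum(rates) / len(rates))  # ~0.0513, unweighted mean over letters a-z
print(result["eval_result_metrics"]["mean"]["mean_full_absorption_score"])  # 0.05125...
```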
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,268 @@
1
+ {
2
+ "eval_type_id": "absorption_first_letter",
3
+ "eval_config": {
4
+ "model_name": "gemma-2-2b",
5
+ "random_seed": 42,
6
+ "f1_jump_threshold": 0.03,
7
+ "max_k_value": 10,
8
+ "prompt_template": "{word} has the first letter:",
9
+ "prompt_token_pos": -6,
10
+ "llm_batch_size": 32,
11
+ "llm_dtype": "bfloat16",
12
+ "k_sparse_probe_l1_decay": 0.01,
13
+ "k_sparse_probe_batch_size": 4096,
14
+ "k_sparse_probe_num_epochs": 50
15
+ },
16
+ "eval_id": "2075299c-bf70-47cc-9514-e6ff4f19bb19",
17
+ "datetime_epoch_millis": 1740198096538,
18
+ "eval_result_metrics": {
19
+ "mean": {
20
+ "mean_absorption_fraction_score": 0.3657679681798776,
21
+ "mean_full_absorption_score": 0.4750635142269141,
22
+ "mean_num_split_features": 4.884615384615385,
23
+ "std_dev_absorption_fraction_score": 0.15876472157027252,
24
+ "std_dev_full_absorption_score": 0.17905931379618015,
25
+ "std_dev_num_split_features": 2.303509028884811
26
+ }
27
+ },
28
+ "eval_result_details": [
29
+ {
30
+ "first_letter": "a",
31
+ "mean_absorption_fraction": 0.42724656970095964,
32
+ "full_absorption_rate": 0.46103395061728397,
33
+ "num_full_absorption": 1195,
34
+ "num_probe_true_positives": 2592,
35
+ "num_split_features": 9
36
+ },
37
+ {
38
+ "first_letter": "b",
39
+ "mean_absorption_fraction": 0.6019079325716311,
40
+ "full_absorption_rate": 0.7013149655604258,
41
+ "num_full_absorption": 1120,
42
+ "num_probe_true_positives": 1597,
43
+ "num_split_features": 4
44
+ },
45
+ {
46
+ "first_letter": "c",
47
+ "mean_absorption_fraction": 0.4751708724747572,
48
+ "full_absorption_rate": 0.6425470332850941,
49
+ "num_full_absorption": 1776,
50
+ "num_probe_true_positives": 2764,
51
+ "num_split_features": 8
52
+ },
53
+ {
54
+ "first_letter": "d",
55
+ "mean_absorption_fraction": 0.45846605418570807,
56
+ "full_absorption_rate": 0.5779761904761904,
57
+ "num_full_absorption": 971,
58
+ "num_probe_true_positives": 1680,
59
+ "num_split_features": 6
60
+ },
61
+ {
62
+ "first_letter": "e",
63
+ "mean_absorption_fraction": 0.46395411641440426,
64
+ "full_absorption_rate": 0.5500610500610501,
65
+ "num_full_absorption": 901,
66
+ "num_probe_true_positives": 1638,
67
+ "num_split_features": 6
68
+ },
69
+ {
70
+ "first_letter": "f",
71
+ "mean_absorption_fraction": 0.4687472478503038,
72
+ "full_absorption_rate": 0.5787337662337663,
73
+ "num_full_absorption": 713,
74
+ "num_probe_true_positives": 1232,
75
+ "num_split_features": 7
76
+ },
77
+ {
78
+ "first_letter": "g",
79
+ "mean_absorption_fraction": 0.5120442444837915,
80
+ "full_absorption_rate": 0.6621621621621622,
81
+ "num_full_absorption": 686,
82
+ "num_probe_true_positives": 1036,
83
+ "num_split_features": 4
84
+ },
85
+ {
86
+ "first_letter": "h",
87
+ "mean_absorption_fraction": 0.3676333184285843,
88
+ "full_absorption_rate": 0.4540059347181009,
89
+ "num_full_absorption": 459,
90
+ "num_probe_true_positives": 1011,
91
+ "num_split_features": 5
92
+ },
93
+ {
94
+ "first_letter": "i",
95
+ "mean_absorption_fraction": 0.354487705235415,
96
+ "full_absorption_rate": 0.597799511002445,
97
+ "num_full_absorption": 978,
98
+ "num_probe_true_positives": 1636,
99
+ "num_split_features": 4
100
+ },
101
+ {
102
+ "first_letter": "j",
103
+ "mean_absorption_fraction": 0.2174656084226122,
104
+ "full_absorption_rate": 0.30663615560640733,
105
+ "num_full_absorption": 134,
106
+ "num_probe_true_positives": 437,
107
+ "num_split_features": 4
108
+ },
109
+ {
110
+ "first_letter": "k",
111
+ "mean_absorption_fraction": 0.10844747417238029,
112
+ "full_absorption_rate": 0.21021897810218979,
113
+ "num_full_absorption": 144,
114
+ "num_probe_true_positives": 685,
115
+ "num_split_features": 3
116
+ },
117
+ {
118
+ "first_letter": "l",
119
+ "mean_absorption_fraction": 0.3507451077733594,
120
+ "full_absorption_rate": 0.41080196399345337,
121
+ "num_full_absorption": 502,
122
+ "num_probe_true_positives": 1222,
123
+ "num_split_features": 6
124
+ },
125
+ {
126
+ "first_letter": "m",
127
+ "mean_absorption_fraction": 0.4872792444667222,
128
+ "full_absorption_rate": 0.6254912970241437,
129
+ "num_full_absorption": 1114,
130
+ "num_probe_true_positives": 1781,
131
+ "num_split_features": 6
132
+ },
133
+ {
134
+ "first_letter": "n",
135
+ "mean_absorption_fraction": 0.35957856873086463,
136
+ "full_absorption_rate": 0.44962216624685136,
137
+ "num_full_absorption": 357,
138
+ "num_probe_true_positives": 794,
139
+ "num_split_features": 4
140
+ },
141
+ {
142
+ "first_letter": "o",
143
+ "mean_absorption_fraction": 0.3379671565466484,
144
+ "full_absorption_rate": 0.5930902111324377,
145
+ "num_full_absorption": 618,
146
+ "num_probe_true_positives": 1042,
147
+ "num_split_features": 4
148
+ },
149
+ {
150
+ "first_letter": "p",
151
+ "mean_absorption_fraction": 0.6400315149674903,
152
+ "full_absorption_rate": 0.6703111858704794,
153
+ "num_full_absorption": 1594,
154
+ "num_probe_true_positives": 2378,
155
+ "num_split_features": 8
156
+ },
157
+ {
158
+ "first_letter": "q",
159
+ "mean_absorption_fraction": 0.14011099917784295,
160
+ "full_absorption_rate": 0.21666666666666667,
161
+ "num_full_absorption": 39,
162
+ "num_probe_true_positives": 180,
163
+ "num_split_features": 2
164
+ },
165
+ {
166
+ "first_letter": "r",
167
+ "mean_absorption_fraction": 0.5426072959801443,
168
+ "full_absorption_rate": 0.6204287515762925,
169
+ "num_full_absorption": 984,
170
+ "num_probe_true_positives": 1586,
171
+ "num_split_features": 4
172
+ },
173
+ {
174
+ "first_letter": "s",
175
+ "mean_absorption_fraction": 0.5787110174312542,
176
+ "full_absorption_rate": 0.638474870017331,
177
+ "num_full_absorption": 1842,
178
+ "num_probe_true_positives": 2885,
179
+ "num_split_features": 9
180
+ },
181
+ {
182
+ "first_letter": "t",
183
+ "mean_absorption_fraction": 0.4268379794908569,
184
+ "full_absorption_rate": 0.47808299240210406,
185
+ "num_full_absorption": 818,
186
+ "num_probe_true_positives": 1711,
187
+ "num_split_features": 7
188
+ },
189
+ {
190
+ "first_letter": "u",
191
+ "mean_absorption_fraction": 0.23241182613430145,
192
+ "full_absorption_rate": 0.5183727034120735,
193
+ "num_full_absorption": 395,
194
+ "num_probe_true_positives": 762,
195
+ "num_split_features": 4
196
+ },
197
+ {
198
+ "first_letter": "v",
199
+ "mean_absorption_fraction": 0.16568475161369217,
200
+ "full_absorption_rate": 0.31352718078381797,
201
+ "num_full_absorption": 248,
202
+ "num_probe_true_positives": 791,
203
+ "num_split_features": 6
204
+ },
205
+ {
206
+ "first_letter": "w",
207
+ "mean_absorption_fraction": 0.33654925106797057,
208
+ "full_absorption_rate": 0.5976331360946746,
209
+ "num_full_absorption": 404,
210
+ "num_probe_true_positives": 676,
211
+ "num_split_features": 4
212
+ },
213
+ {
214
+ "first_letter": "x",
215
+ "mean_absorption_fraction": 0.18780014519391397,
216
+ "full_absorption_rate": 0.09411764705882353,
217
+ "num_full_absorption": 8,
218
+ "num_probe_true_positives": 85,
219
+ "num_split_features": 1
220
+ },
221
+ {
222
+ "first_letter": "y",
223
+ "mean_absorption_fraction": 0.13871370413905518,
224
+ "full_absorption_rate": 0.15337423312883436,
225
+ "num_full_absorption": 25,
226
+ "num_probe_true_positives": 163,
227
+ "num_split_features": 1
228
+ },
229
+ {
230
+ "first_letter": "z",
231
+ "mean_absorption_fraction": 0.12936746602215327,
232
+ "full_absorption_rate": 0.22916666666666666,
233
+ "num_full_absorption": 55,
234
+ "num_probe_true_positives": 240,
235
+ "num_split_features": 1
236
+ }
237
+ ],
238
+ "sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
239
+ "sae_lens_id": "custom_sae",
240
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000",
241
+ "sae_lens_version": "5.5.0",
242
+ "sae_cfg_dict": {
243
+ "model_name": "gemma-2-2b",
244
+ "d_in": 2304,
245
+ "d_sae": 65536,
246
+ "hook_layer": 12,
247
+ "hook_name": "blocks.12.hook_resid_post",
248
+ "context_size": null,
249
+ "hook_head_index": null,
250
+ "architecture": "topk",
251
+ "apply_b_dec_to_input": null,
252
+ "finetuning_scaling_factor": null,
253
+ "activation_fn_str": "",
254
+ "prepend_bos": true,
255
+ "normalize_activations": "none",
256
+ "dtype": "bfloat16",
257
+ "device": "",
258
+ "dataset_path": "",
259
+ "dataset_trust_remote_code": true,
260
+ "seqpos_slice": [
261
+ null
262
+ ],
263
+ "training_tokens": -100000,
264
+ "sae_lens_training_version": null,
265
+ "neuronpedia_id": null
266
+ },
267
+ "eval_result_unstructured": null
268
+ }
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,268 @@
1
+ {
2
+ "eval_type_id": "absorption_first_letter",
3
+ "eval_config": {
4
+ "model_name": "gemma-2-2b",
5
+ "random_seed": 42,
6
+ "f1_jump_threshold": 0.03,
7
+ "max_k_value": 10,
8
+ "prompt_template": "{word} has the first letter:",
9
+ "prompt_token_pos": -6,
10
+ "llm_batch_size": 32,
11
+ "llm_dtype": "bfloat16",
12
+ "k_sparse_probe_l1_decay": 0.01,
13
+ "k_sparse_probe_batch_size": 4096,
14
+ "k_sparse_probe_num_epochs": 50
15
+ },
16
+ "eval_id": "fdaa2ac8-02d7-4f25-8e15-9292e5b6ab72",
17
+ "datetime_epoch_millis": 1740197304227,
18
+ "eval_result_metrics": {
19
+ "mean": {
20
+ "mean_absorption_fraction_score": 0.4712541829224532,
21
+ "mean_full_absorption_score": 0.5038687495232235,
22
+ "mean_num_split_features": 3.423076923076923,
23
+ "std_dev_absorption_fraction_score": 0.2176478202450696,
24
+ "std_dev_full_absorption_score": 0.23505900853406136,
25
+ "std_dev_num_split_features": 1.747525723371806
26
+ }
27
+ },
28
+ "eval_result_details": [
29
+ {
30
+ "first_letter": "a",
31
+ "mean_absorption_fraction": 0.5003749369137354,
32
+ "full_absorption_rate": 0.42746913580246915,
33
+ "num_full_absorption": 1108,
34
+ "num_probe_true_positives": 2592,
35
+ "num_split_features": 6
36
+ },
37
+ {
38
+ "first_letter": "b",
39
+ "mean_absorption_fraction": 0.6772489720427498,
40
+ "full_absorption_rate": 0.7025673137132122,
41
+ "num_full_absorption": 1122,
42
+ "num_probe_true_positives": 1597,
43
+ "num_split_features": 5
44
+ },
45
+ {
46
+ "first_letter": "c",
47
+ "mean_absorption_fraction": 0.6902377294696528,
48
+ "full_absorption_rate": 0.7373371924746743,
49
+ "num_full_absorption": 2038,
50
+ "num_probe_true_positives": 2764,
51
+ "num_split_features": 6
52
+ },
53
+ {
54
+ "first_letter": "d",
55
+ "mean_absorption_fraction": 0.5995046475493223,
56
+ "full_absorption_rate": 0.7053571428571429,
57
+ "num_full_absorption": 1185,
58
+ "num_probe_true_positives": 1680,
59
+ "num_split_features": 5
60
+ },
61
+ {
62
+ "first_letter": "e",
63
+ "mean_absorption_fraction": 0.5325509916043534,
64
+ "full_absorption_rate": 0.5738705738705738,
65
+ "num_full_absorption": 940,
66
+ "num_probe_true_positives": 1638,
67
+ "num_split_features": 4
68
+ },
69
+ {
70
+ "first_letter": "f",
71
+ "mean_absorption_fraction": 0.6960053818306784,
72
+ "full_absorption_rate": 0.698051948051948,
73
+ "num_full_absorption": 860,
74
+ "num_probe_true_positives": 1232,
75
+ "num_split_features": 6
76
+ },
77
+ {
78
+ "first_letter": "g",
79
+ "mean_absorption_fraction": 0.6161939126364128,
80
+ "full_absorption_rate": 0.6998069498069498,
81
+ "num_full_absorption": 725,
82
+ "num_probe_true_positives": 1036,
83
+ "num_split_features": 4
84
+ },
85
+ {
86
+ "first_letter": "h",
87
+ "mean_absorption_fraction": 0.522152641756954,
88
+ "full_absorption_rate": 0.5588526211671613,
89
+ "num_full_absorption": 565,
90
+ "num_probe_true_positives": 1011,
91
+ "num_split_features": 4
92
+ },
93
+ {
94
+ "first_letter": "i",
95
+ "mean_absorption_fraction": 0.5450258516855058,
96
+ "full_absorption_rate": 0.7377750611246944,
97
+ "num_full_absorption": 1207,
98
+ "num_probe_true_positives": 1636,
99
+ "num_split_features": 2
100
+ },
101
+ {
102
+ "first_letter": "j",
103
+ "mean_absorption_fraction": 0.34805945001583916,
104
+ "full_absorption_rate": 0.30892448512585813,
105
+ "num_full_absorption": 135,
106
+ "num_probe_true_positives": 437,
107
+ "num_split_features": 1
108
+ },
109
+ {
110
+ "first_letter": "k",
111
+ "mean_absorption_fraction": 0.12442012974468852,
112
+ "full_absorption_rate": 0.16496350364963502,
113
+ "num_full_absorption": 113,
114
+ "num_probe_true_positives": 685,
115
+ "num_split_features": 1
116
+ },
117
+ {
118
+ "first_letter": "l",
119
+ "mean_absorption_fraction": 0.5697497391831701,
120
+ "full_absorption_rate": 0.6039279869067103,
121
+ "num_full_absorption": 738,
122
+ "num_probe_true_positives": 1222,
123
+ "num_split_features": 4
124
+ },
125
+ {
126
+ "first_letter": "m",
127
+ "mean_absorption_fraction": 0.6777915542597618,
128
+ "full_absorption_rate": 0.7422796181920269,
129
+ "num_full_absorption": 1322,
130
+ "num_probe_true_positives": 1781,
131
+ "num_split_features": 3
132
+ },
133
+ {
134
+ "first_letter": "n",
135
+ "mean_absorption_fraction": 0.535242959534133,
136
+ "full_absorption_rate": 0.5289672544080605,
137
+ "num_full_absorption": 420,
138
+ "num_probe_true_positives": 794,
139
+ "num_split_features": 4
140
+ },
141
+ {
142
+ "first_letter": "o",
143
+ "mean_absorption_fraction": 0.36678618140490626,
144
+ "full_absorption_rate": 0.45681381957773515,
145
+ "num_full_absorption": 476,
146
+ "num_probe_true_positives": 1042,
147
+ "num_split_features": 1
148
+ },
149
+ {
150
+ "first_letter": "p",
151
+ "mean_absorption_fraction": 0.7778720671159048,
152
+ "full_absorption_rate": 0.7653490328006728,
153
+ "num_full_absorption": 1820,
154
+ "num_probe_true_positives": 2378,
155
+ "num_split_features": 4
156
+ },
157
+ {
158
+ "first_letter": "q",
159
+ "mean_absorption_fraction": 0.30341799266122393,
160
+ "full_absorption_rate": 0.25,
161
+ "num_full_absorption": 45,
162
+ "num_probe_true_positives": 180,
163
+ "num_split_features": 2
164
+ },
165
+ {
166
+ "first_letter": "r",
167
+ "mean_absorption_fraction": 0.5465185579059652,
168
+ "full_absorption_rate": 0.6141235813366961,
169
+ "num_full_absorption": 974,
170
+ "num_probe_true_positives": 1586,
171
+ "num_split_features": 5
172
+ },
173
+ {
174
+ "first_letter": "s",
175
+ "mean_absorption_fraction": 0.7579124661305765,
176
+ "full_absorption_rate": 0.7632582322357019,
177
+ "num_full_absorption": 2202,
178
+ "num_probe_true_positives": 2885,
179
+ "num_split_features": 4
180
+ },
181
+ {
182
+ "first_letter": "t",
183
+ "mean_absorption_fraction": 0.6113684686981942,
184
+ "full_absorption_rate": 0.6060783167738165,
185
+ "num_full_absorption": 1037,
186
+ "num_probe_true_positives": 1711,
187
+ "num_split_features": 5
188
+ },
189
+ {
190
+ "first_letter": "u",
191
+ "mean_absorption_fraction": 0.15134342914968058,
192
+ "full_absorption_rate": 0.29396325459317585,
193
+ "num_full_absorption": 224,
194
+ "num_probe_true_positives": 762,
195
+ "num_split_features": 2
196
+ },
197
+ {
198
+ "first_letter": "v",
199
+ "mean_absorption_fraction": 0.3218639830617784,
200
+ "full_absorption_rate": 0.37926675094816686,
201
+ "num_full_absorption": 300,
202
+ "num_probe_true_positives": 791,
203
+ "num_split_features": 5
204
+ },
205
+ {
206
+ "first_letter": "w",
207
+ "mean_absorption_fraction": 0.4937193345242905,
208
+ "full_absorption_rate": 0.5872781065088757,
209
+ "num_full_absorption": 397,
210
+ "num_probe_true_positives": 676,
211
+ "num_split_features": 3
212
+ },
213
+ {
214
+ "first_letter": "x",
215
+ "mean_absorption_fraction": 0.08372842617309774,
216
+ "full_absorption_rate": 0.011764705882352941,
217
+ "num_full_absorption": 1,
218
+ "num_probe_true_positives": 85,
219
+ "num_split_features": 1
220
+ },
221
+ {
222
+ "first_letter": "y",
223
+ "mean_absorption_fraction": 0.18045047824415694,
224
+ "full_absorption_rate": 0.15337423312883436,
225
+ "num_full_absorption": 25,
226
+ "num_probe_true_positives": 163,
227
+ "num_split_features": 1
228
+ },
229
+ {
230
+ "first_letter": "z",
231
+ "mean_absorption_fraction": 0.02306847268705111,
232
+ "full_absorption_rate": 0.029166666666666667,
233
+ "num_full_absorption": 7,
234
+ "num_probe_true_positives": 240,
235
+ "num_split_features": 1
236
+ }
237
+ ],
238
+ "sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
239
+ "sae_lens_id": "custom_sae",
240
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000",
241
+ "sae_lens_version": "5.5.0",
242
+ "sae_cfg_dict": {
243
+ "model_name": "gemma-2-2b",
244
+ "d_in": 2304,
245
+ "d_sae": 65536,
246
+ "hook_layer": 12,
247
+ "hook_name": "blocks.12.hook_resid_post",
248
+ "context_size": null,
249
+ "hook_head_index": null,
250
+ "architecture": "topk",
251
+ "apply_b_dec_to_input": null,
252
+ "finetuning_scaling_factor": null,
253
+ "activation_fn_str": "",
254
+ "prepend_bos": true,
255
+ "normalize_activations": "none",
256
+ "dtype": "bfloat16",
257
+ "device": "",
258
+ "dataset_path": "",
259
+ "dataset_trust_remote_code": true,
260
+ "seqpos_slice": [
261
+ null
262
+ ],
263
+ "training_tokens": -100000,
264
+ "sae_lens_training_version": null,
265
+ "neuronpedia_id": null
266
+ },
267
+ "eval_result_unstructured": null
268
+ }
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,268 @@
1
+ {
2
+ "eval_type_id": "absorption_first_letter",
3
+ "eval_config": {
4
+ "model_name": "gemma-2-2b",
5
+ "random_seed": 42,
6
+ "f1_jump_threshold": 0.03,
7
+ "max_k_value": 10,
8
+ "prompt_template": "{word} has the first letter:",
9
+ "prompt_token_pos": -6,
10
+ "llm_batch_size": 32,
11
+ "llm_dtype": "bfloat16",
12
+ "k_sparse_probe_l1_decay": 0.01,
13
+ "k_sparse_probe_batch_size": 4096,
14
+ "k_sparse_probe_num_epochs": 50
15
+ },
16
+ "eval_id": "0acc62cf-3e4f-4222-bb14-633ee7a3b344",
17
+ "datetime_epoch_millis": 1740195695924,
18
+ "eval_result_metrics": {
19
+ "mean": {
20
+ "mean_absorption_fraction_score": 0.2956776053959005,
21
+ "mean_full_absorption_score": 0.30409644176184997,
22
+ "mean_num_split_features": 1.5384615384615385,
23
+ "std_dev_absorption_fraction_score": 0.20040407999198337,
24
+ "std_dev_full_absorption_score": 0.2151226601623177,
25
+ "std_dev_num_split_features": 1.2076678096486377
26
+ }
27
+ },
28
+ "eval_result_details": [
29
+ {
30
+ "first_letter": "a",
31
+ "mean_absorption_fraction": 0.37592651585433173,
32
+ "full_absorption_rate": 0.41435185185185186,
33
+ "num_full_absorption": 1074,
34
+ "num_probe_true_positives": 2592,
35
+ "num_split_features": 1
36
+ },
37
+ {
38
+ "first_letter": "b",
39
+ "mean_absorption_fraction": 0.3279063865305123,
40
+ "full_absorption_rate": 0.2686286787726988,
41
+ "num_full_absorption": 429,
42
+ "num_probe_true_positives": 1597,
43
+ "num_split_features": 1
44
+ },
45
+ {
46
+ "first_letter": "c",
47
+ "mean_absorption_fraction": 0.6351888347806534,
48
+ "full_absorption_rate": 0.6772793053545586,
49
+ "num_full_absorption": 1872,
50
+ "num_probe_true_positives": 2764,
51
+ "num_split_features": 2
52
+ },
53
+ {
54
+ "first_letter": "d",
55
+ "mean_absorption_fraction": 0.35041653145091295,
56
+ "full_absorption_rate": 0.3755952380952381,
57
+ "num_full_absorption": 631,
58
+ "num_probe_true_positives": 1680,
59
+ "num_split_features": 2
60
+ },
61
+ {
62
+ "first_letter": "e",
63
+ "mean_absorption_fraction": 0.38590072216293675,
64
+ "full_absorption_rate": 0.43223443223443225,
65
+ "num_full_absorption": 708,
66
+ "num_probe_true_positives": 1638,
67
+ "num_split_features": 3
68
+ },
69
+ {
70
+ "first_letter": "f",
71
+ "mean_absorption_fraction": 0.4643418922512006,
72
+ "full_absorption_rate": 0.450487012987013,
73
+ "num_full_absorption": 555,
74
+ "num_probe_true_positives": 1232,
75
+ "num_split_features": 1
76
+ },
77
+ {
78
+ "first_letter": "g",
79
+ "mean_absorption_fraction": 0.18654193040142905,
80
+ "full_absorption_rate": 0.1805019305019305,
81
+ "num_full_absorption": 187,
82
+ "num_probe_true_positives": 1036,
83
+ "num_split_features": 1
84
+ },
85
+ {
86
+ "first_letter": "h",
87
+ "mean_absorption_fraction": 0.16308714809331934,
88
+ "full_absorption_rate": 0.1552917903066271,
89
+ "num_full_absorption": 157,
90
+ "num_probe_true_positives": 1011,
91
+ "num_split_features": 1
92
+ },
93
+ {
94
+ "first_letter": "i",
95
+ "mean_absorption_fraction": 0.4303391938395123,
96
+ "full_absorption_rate": 0.5623471882640587,
97
+ "num_full_absorption": 920,
98
+ "num_probe_true_positives": 1636,
99
+ "num_split_features": 3
100
+ },
101
+ {
102
+ "first_letter": "j",
103
+ "mean_absorption_fraction": 0.011237078817775636,
104
+ "full_absorption_rate": 0.016018306636155607,
105
+ "num_full_absorption": 7,
106
+ "num_probe_true_positives": 437,
107
+ "num_split_features": 1
108
+ },
109
+ {
110
+ "first_letter": "k",
111
+ "mean_absorption_fraction": 0.011207543180270212,
112
+ "full_absorption_rate": 0.027737226277372264,
113
+ "num_full_absorption": 19,
114
+ "num_probe_true_positives": 685,
115
+ "num_split_features": 1
116
+ },
117
+ {
118
+ "first_letter": "l",
119
+ "mean_absorption_fraction": 0.37143542853909217,
120
+ "full_absorption_rate": 0.37561374795417346,
121
+ "num_full_absorption": 459,
122
+ "num_probe_true_positives": 1222,
123
+ "num_split_features": 1
124
+ },
125
+ {
126
+ "first_letter": "m",
127
+ "mean_absorption_fraction": 0.5060310945441595,
128
+ "full_absorption_rate": 0.5384615384615384,
129
+ "num_full_absorption": 959,
130
+ "num_probe_true_positives": 1781,
131
+ "num_split_features": 1
132
+ },
133
+ {
134
+ "first_letter": "n",
135
+ "mean_absorption_fraction": 0.3325042979830935,
136
+ "full_absorption_rate": 0.29345088161209065,
137
+ "num_full_absorption": 233,
138
+ "num_probe_true_positives": 794,
139
+ "num_split_features": 4
140
+ },
141
+ {
142
+ "first_letter": "o",
143
+ "mean_absorption_fraction": 0.225787069641184,
144
+ "full_absorption_rate": 0.2744721689059501,
145
+ "num_full_absorption": 286,
146
+ "num_probe_true_positives": 1042,
147
+ "num_split_features": 1
148
+ },
149
+ {
150
+ "first_letter": "p",
151
+ "mean_absorption_fraction": 0.7353015458594555,
152
+ "full_absorption_rate": 0.7283431455004206,
153
+ "num_full_absorption": 1732,
154
+ "num_probe_true_positives": 2378,
155
+ "num_split_features": 1
156
+ },
157
+ {
158
+ "first_letter": "q",
159
+ "mean_absorption_fraction": 0.019577668010399153,
160
+ "full_absorption_rate": 0.022222222222222223,
161
+ "num_full_absorption": 4,
162
+ "num_probe_true_positives": 180,
163
+ "num_split_features": 1
164
+ },
165
+ {
166
+ "first_letter": "r",
167
+ "mean_absorption_fraction": 0.40605372278006896,
168
+ "full_absorption_rate": 0.45081967213114754,
169
+ "num_full_absorption": 715,
170
+ "num_probe_true_positives": 1586,
171
+ "num_split_features": 1
172
+ },
173
+ {
174
+ "first_letter": "s",
175
+ "mean_absorption_fraction": 0.5598209126630196,
176
+ "full_absorption_rate": 0.49740034662045063,
177
+ "num_full_absorption": 1435,
178
+ "num_probe_true_positives": 2885,
179
+ "num_split_features": 6
180
+ },
181
+ {
182
+ "first_letter": "t",
183
+ "mean_absorption_fraction": 0.35543202018081804,
184
+ "full_absorption_rate": 0.31209818819403856,
185
+ "num_full_absorption": 534,
186
+ "num_probe_true_positives": 1711,
187
+ "num_split_features": 1
188
+ },
189
+ {
190
+ "first_letter": "u",
191
+ "mean_absorption_fraction": 0.3328107394590775,
192
+ "full_absorption_rate": 0.4671916010498688,
193
+ "num_full_absorption": 356,
194
+ "num_probe_true_positives": 762,
195
+ "num_split_features": 1
196
+ },
197
+ {
198
+ "first_letter": "v",
199
+ "mean_absorption_fraction": 0.054853668853618774,
200
+ "full_absorption_rate": 0.061946902654867256,
201
+ "num_full_absorption": 49,
202
+ "num_probe_true_positives": 791,
203
+ "num_split_features": 1
204
+ },
205
+ {
206
+ "first_letter": "w",
207
+ "mean_absorption_fraction": 0.21344579189584933,
208
+ "full_absorption_rate": 0.23372781065088757,
209
+ "num_full_absorption": 158,
210
+ "num_probe_true_positives": 676,
211
+ "num_split_features": 1
212
+ },
213
+ {
214
+ "first_letter": "x",
215
+ "mean_absorption_fraction": 0.1554455330699017,
216
+ "full_absorption_rate": 0.0,
217
+ "num_full_absorption": 0,
218
+ "num_probe_true_positives": 85,
219
+ "num_split_features": 1
220
+ },
221
+ {
222
+ "first_letter": "y",
223
+ "mean_absorption_fraction": 0.07682376512280702,
224
+ "full_absorption_rate": 0.0736196319018405,
225
+ "num_full_absorption": 12,
226
+ "num_probe_true_positives": 163,
227
+ "num_split_features": 1
228
+ },
229
+ {
230
+ "first_letter": "z",
231
+ "mean_absorption_fraction": 0.0002007043280144281,
232
+ "full_absorption_rate": 0.016666666666666666,
233
+ "num_full_absorption": 4,
234
+ "num_probe_true_positives": 240,
235
+ "num_split_features": 1
236
+ }
237
+ ],
238
+ "sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
239
+ "sae_lens_id": "custom_sae",
240
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000",
241
+ "sae_lens_version": "5.5.0",
242
+ "sae_cfg_dict": {
243
+ "model_name": "gemma-2-2b",
244
+ "d_in": 2304,
245
+ "d_sae": 65536,
246
+ "hook_layer": 12,
247
+ "hook_name": "blocks.12.hook_resid_post",
248
+ "context_size": null,
249
+ "hook_head_index": null,
250
+ "architecture": "topk",
251
+ "apply_b_dec_to_input": null,
252
+ "finetuning_scaling_factor": null,
253
+ "activation_fn_str": "",
254
+ "prepend_bos": true,
255
+ "normalize_activations": "none",
256
+ "dtype": "bfloat16",
257
+ "device": "",
258
+ "dataset_path": "",
259
+ "dataset_trust_remote_code": true,
260
+ "seqpos_slice": [
261
+ null
262
+ ],
263
+ "training_tokens": -100000,
264
+ "sae_lens_training_version": null,
265
+ "neuronpedia_id": null
266
+ },
267
+ "eval_result_unstructured": null
268
+ }
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf17d005ae5204272b4c662cb93db6a0bcde577a14e57310a7e2109e39b69107
3
+ size 25991318
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b6ea85cd6c507c1763e9dd60c7e44b2df4bd2f6b1de54e785d1000c4b498bcb
3
+ size 25835979
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7478f4580ec833ae1ad9c530373287986c4503a95360ca57bd6e5890fc9fd24
3
+ size 25528227
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a9a8db4efe22a9bf8a3e7aa395e2f22f7e993697e7ede40fb5172370ee56c4e
3
+ size 25820187
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3df2b9e39a5d68fedd246832bc864c25d7818f47ab55f9d3bf510a45d50ff462
3
+ size 21796194
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd7f12343cbfa1d8071df229a205f103952c4afa96d5375f935381e52a4c8ba7
3
+ size 21209969
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:741a051b33165047cd4820f6d6fe7053639e0ba7554cb65b3064828cfbeb18c5
3
+ size 21598398
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:564b11414d3b95135bf4757715dd45bb98953d953656ebb5e9b6b62baf2d6231
3
+ size 21713941
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,323 @@
1
+ {
2
+ "eval_type_id": "scr",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": true,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "5a1c4654-7650-404c-b3f2-2c2e726acd7b",
73
+ "datetime_epoch_millis": 1740201072988,
74
+ "eval_result_metrics": {
75
+ "scr_metrics": {
76
+ "scr_dir1_threshold_2": 0.20733915350751708,
77
+ "scr_metric_threshold_2": 0.11930950078221424,
78
+ "scr_dir2_threshold_2": 0.12406375737178194,
79
+ "scr_dir1_threshold_5": 0.22061339364104127,
80
+ "scr_metric_threshold_5": 0.17187754396157023,
81
+ "scr_dir2_threshold_5": 0.18389215216748134,
82
+ "scr_dir1_threshold_10": 0.27419770961645995,
83
+ "scr_metric_threshold_10": 0.21920456810879457,
84
+ "scr_dir2_threshold_10": 0.23032246687146835,
85
+ "scr_dir1_threshold_20": 0.3095246723854987,
86
+ "scr_metric_threshold_20": 0.2684238692342566,
87
+ "scr_dir2_threshold_20": 0.27399307227914804,
88
+ "scr_dir1_threshold_50": 0.2617380192076716,
89
+ "scr_metric_threshold_50": 0.37010005307771054,
90
+ "scr_dir2_threshold_50": 0.37541888018379954,
91
+ "scr_dir1_threshold_100": 0.12272683889285801,
92
+ "scr_metric_threshold_100": 0.42974841039466505,
93
+ "scr_dir2_threshold_100": 0.4320246013307986,
94
+ "scr_dir1_threshold_500": -0.0015497384482677581,
95
+ "scr_metric_threshold_500": 0.35600559749255273,
96
+ "scr_dir2_threshold_500": 0.36566734575390525
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
102
+ "scr_dir1_threshold_2": 0.46031747533501616,
103
+ "scr_metric_threshold_2": 0.009828028179079856,
104
+ "scr_dir2_threshold_2": 0.009828028179079856,
105
+ "scr_dir1_threshold_5": 0.5238097040201937,
106
+ "scr_metric_threshold_5": 0.024570143672067307,
107
+ "scr_dir2_threshold_5": 0.024570143672067307,
108
+ "scr_dir1_threshold_10": 0.555555345309774,
109
+ "scr_metric_threshold_10": 0.0319410549698257,
110
+ "scr_dir2_threshold_10": 0.0319410549698257,
111
+ "scr_dir1_threshold_20": 0.5873019327053712,
112
+ "scr_metric_threshold_20": 0.061425139507065275,
113
+ "scr_dir2_threshold_20": 0.061425139507065275,
114
+ "scr_dir1_threshold_50": 0.5396825246649839,
115
+ "scr_metric_threshold_50": 0.16953315657947304,
116
+ "scr_dir2_threshold_50": 0.16953315657947304,
117
+ "scr_dir1_threshold_100": 0.3650786592542414,
118
+ "scr_metric_threshold_100": 0.21621618059355086,
119
+ "scr_dir2_threshold_100": 0.21621618059355086,
120
+ "scr_dir1_threshold_500": 0.3968252466498386,
121
+ "scr_metric_threshold_500": 0.26535632148895016,
122
+ "scr_dir2_threshold_500": 0.26535632148895016
123
+ },
124
+ {
125
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
126
+ "scr_dir1_threshold_2": 0.19191917367472955,
127
+ "scr_metric_threshold_2": 0.09915005123804593,
128
+ "scr_dir2_threshold_2": 0.09915005123804593,
129
+ "scr_dir1_threshold_5": 0.19191917367472955,
130
+ "scr_metric_threshold_5": 0.18130315345731438,
131
+ "scr_dir2_threshold_5": 0.18130315345731438,
132
+ "scr_dir1_threshold_10": 0.2626259524704024,
133
+ "scr_metric_threshold_10": 0.21246456436179043,
134
+ "scr_dir2_threshold_10": 0.21246456436179043,
135
+ "scr_dir1_threshold_20": 0.34343421572310684,
136
+ "scr_metric_threshold_20": 0.2294616822322599,
137
+ "scr_dir2_threshold_20": 0.2294616822322599,
138
+ "scr_dir1_threshold_50": 0.3232324509435598,
139
+ "scr_metric_threshold_50": 0.3456090201924673,
140
+ "scr_dir2_threshold_50": 0.3456090201924673,
141
+ "scr_dir1_threshold_100": 0.09090914564247801,
142
+ "scr_metric_threshold_100": 0.43909342175758737,
143
+ "scr_dir2_threshold_100": 0.43909342175758737,
144
+ "scr_dir1_threshold_500": -0.48484897744896877,
145
+ "scr_metric_threshold_500": 0.16997168525977072,
146
+ "scr_dir2_threshold_500": 0.16997168525977072
147
+ },
148
+ {
149
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
150
+ "scr_dir1_threshold_2": 0.532258142045613,
151
+ "scr_metric_threshold_2": 0.020202059731676766,
152
+ "scr_dir2_threshold_2": 0.020202059731676766,
153
+ "scr_dir1_threshold_5": 0.5483869727270193,
154
+ "scr_metric_threshold_5": 0.03282830943478244,
155
+ "scr_dir2_threshold_5": 0.03282830943478244,
156
+ "scr_dir1_threshold_10": 0.5645158034084256,
157
+ "scr_metric_threshold_10": 0.07070705854409946,
158
+ "scr_dir2_threshold_10": 0.07070705854409946,
159
+ "scr_dir1_threshold_20": 0.5806446340898319,
160
+ "scr_metric_threshold_20": 0.1010101481416146,
161
+ "scr_dir2_threshold_20": 0.1010101481416146,
162
+ "scr_dir1_threshold_50": 0.3870967431817548,
163
+ "scr_metric_threshold_50": 0.22474757585217325,
164
+ "scr_dir2_threshold_50": 0.22474757585217325,
165
+ "scr_dir1_threshold_100": 0.35483812045334157,
166
+ "scr_metric_threshold_100": 0.3055555137453419,
167
+ "scr_dir2_threshold_100": 0.3055555137453419,
168
+ "scr_dir1_threshold_500": 0.2741930056807093,
169
+ "scr_metric_threshold_500": 0.12373742771055866,
170
+ "scr_dir2_threshold_500": 0.12373742771055866
171
+ },
172
+ {
173
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
174
+ "scr_dir1_threshold_2": 0.2845529991791037,
175
+ "scr_metric_threshold_2": 0.07624623538069193,
176
+ "scr_dir2_threshold_2": 0.07624623538069193,
177
+ "scr_dir1_threshold_5": 0.25203261487944845,
178
+ "scr_metric_threshold_5": 0.17008790375159719,
179
+ "scr_dir2_threshold_5": 0.17008790375159719,
180
+ "scr_dir1_threshold_10": 0.3333333333333333,
181
+ "scr_metric_threshold_10": 0.23460408353041323,
182
+ "scr_dir2_threshold_10": 0.23460408353041323,
183
+ "scr_dir1_threshold_20": 0.1707318964255635,
184
+ "scr_metric_threshold_20": 0.32551323800084947,
185
+ "scr_dir2_threshold_20": 0.32551323800084947,
186
+ "scr_dir1_threshold_50": -0.05691030908151685,
187
+ "scr_metric_threshold_50": 0.4193549063717547,
188
+ "scr_dir2_threshold_50": 0.4193549063717547,
189
+ "scr_dir1_threshold_100": -0.2682925647340228,
190
+ "scr_metric_threshold_100": 0.5073313721480223,
191
+ "scr_dir2_threshold_100": 0.5073313721480223,
192
+ "scr_dir1_threshold_500": -1.0243899247818615,
193
+ "scr_metric_threshold_500": 0.1260996708634634,
194
+ "scr_dir2_threshold_500": 0.1260996708634634
195
+ },
196
+ {
197
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
198
+ "scr_dir1_threshold_2": 0.016393554752069144,
199
+ "scr_metric_threshold_2": 0.417968888243172,
200
+ "scr_dir2_threshold_2": 0.417968888243172,
201
+ "scr_dir1_threshold_5": 0.021857964433295084,
202
+ "scr_metric_threshold_5": 0.496093800931695,
203
+ "scr_dir2_threshold_5": 0.496093800931695,
204
+ "scr_dir1_threshold_10": 0.06557389329988525,
205
+ "scr_metric_threshold_10": 0.5742187136202179,
206
+ "scr_dir2_threshold_10": 0.5742187136202179,
207
+ "scr_dir1_threshold_20": 0.18579220912042124,
208
+ "scr_metric_threshold_20": 0.6406250291038257,
209
+ "scr_dir2_threshold_20": 0.6406250291038257,
210
+ "scr_dir1_threshold_50": 0.14754101564344832,
211
+ "scr_metric_threshold_50": 0.7148437427240436,
212
+ "scr_dir2_threshold_50": 0.7148437427240436,
213
+ "scr_dir1_threshold_100": -0.36612033426800783,
214
+ "scr_metric_threshold_100": 0.621093800931695,
215
+ "scr_dir2_threshold_100": 0.621093800931695,
216
+ "scr_dir1_threshold_500": -0.3060108506493485,
217
+ "scr_metric_threshold_500": 0.703124912688523,
218
+ "scr_dir2_threshold_500": 0.703124912688523
219
+ },
220
+ {
221
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
222
+ "scr_dir1_threshold_2": 0.07179486709233554,
223
+ "scr_metric_threshold_2": 0.060483807005948444,
224
+ "scr_dir2_threshold_2": 0.060483807005948444,
225
+ "scr_dir1_threshold_5": 0.07179486709233554,
226
+ "scr_metric_threshold_5": 0.0927419335456348,
227
+ "scr_dir2_threshold_5": 0.0927419335456348,
228
+ "scr_dir1_threshold_10": 0.19487173844135988,
229
+ "scr_metric_threshold_10": 0.1370969176230247,
230
+ "scr_dir2_threshold_10": 0.1370969176230247,
231
+ "scr_dir1_threshold_20": 0.24615374269804866,
232
+ "scr_metric_threshold_20": 0.16129039235714715,
233
+ "scr_dir2_threshold_20": 0.16129039235714715,
234
+ "scr_dir1_threshold_50": 0.3179486097903842,
235
+ "scr_metric_threshold_50": 0.31854833864087,
236
+ "scr_dir2_threshold_50": 0.31854833864087,
237
+ "scr_dir1_threshold_100": 0.3179486097903842,
238
+ "scr_metric_threshold_100": 0.4838709367301568,
239
+ "scr_dir2_threshold_100": 0.4838709367301568,
240
+ "scr_dir1_threshold_500": 0.4358973418467109,
241
+ "scr_metric_threshold_500": 0.5403225380039657,
242
+ "scr_dir2_threshold_500": 0.5403225380039657
243
+ },
244
+ {
245
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
246
+ "scr_dir1_threshold_2": 0.05855860935384807,
247
+ "scr_metric_threshold_2": 0.22767852985167758,
248
+ "scr_dir2_threshold_2": 0.22767852985167758,
249
+ "scr_dir1_threshold_5": 0.10360361086007354,
250
+ "scr_metric_threshold_5": 0.3258928654582359,
251
+ "scr_dir2_threshold_5": 0.3258928654582359,
252
+ "scr_dir1_threshold_10": 0.14414416591355395,
253
+ "scr_metric_threshold_10": 0.4196427074660393,
254
+ "scr_dir2_threshold_10": 0.4196427074660393,
255
+ "scr_dir1_threshold_20": 0.2162161146256372,
256
+ "scr_metric_threshold_20": 0.4821426908352818,
257
+ "scr_dir2_threshold_20": 0.4821426908352818,
258
+ "scr_dir1_threshold_50": 0.22072082956776967,
259
+ "scr_metric_threshold_50": 0.5535713953099135,
260
+ "scr_dir2_threshold_50": 0.5535713953099135,
261
+ "scr_dir1_threshold_100": 0.23423416892600485,
262
+ "scr_metric_threshold_100": 0.6116071511725216,
263
+ "scr_dir2_threshold_100": 0.6116071511725216,
264
+ "scr_dir1_threshold_500": 0.4684683378520097,
265
+ "scr_metric_threshold_500": 0.691964310660422,
266
+ "scr_dir2_threshold_500": 0.691964310660422
267
+ },
268
+ {
269
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
270
+ "scr_dir1_threshold_2": 0.042918406627421406,
271
+ "scr_metric_threshold_2": 0.042918406627421406,
272
+ "scr_dir2_threshold_2": 0.08095245934396302,
273
+ "scr_dir1_threshold_5": 0.05150224144123495,
274
+ "scr_metric_threshold_5": 0.05150224144123495,
275
+ "scr_dir2_threshold_5": 0.14761910708852366,
276
+ "scr_dir1_threshold_10": 0.07296144475494565,
277
+ "scr_metric_threshold_10": 0.07296144475494565,
278
+ "scr_dir2_threshold_10": 0.1619046348563358,
279
+ "scr_dir1_threshold_20": 0.1459226336960092,
280
+ "scr_metric_threshold_20": 0.1459226336960092,
281
+ "scr_dir2_threshold_20": 0.19047625805514054,
282
+ "scr_dir1_threshold_50": 0.21459228895098914,
283
+ "scr_metric_threshold_50": 0.21459228895098914,
284
+ "scr_dir2_threshold_50": 0.25714290579970117,
285
+ "scr_dir1_threshold_100": 0.2532189060784448,
286
+ "scr_metric_threshold_100": 0.2532189060784448,
287
+ "scr_dir2_threshold_100": 0.27142843356751334,
288
+ "scr_dir1_threshold_500": 0.2274679132647684,
289
+ "scr_metric_threshold_500": 0.2274679132647684,
290
+ "scr_dir2_threshold_500": 0.3047618993555888
291
+ }
292
+ ],
293
+ "sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
294
+ "sae_lens_id": "custom_sae",
295
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000",
296
+ "sae_lens_version": "5.5.0",
297
+ "sae_cfg_dict": {
298
+ "model_name": "gemma-2-2b",
299
+ "d_in": 2304,
300
+ "d_sae": 65536,
301
+ "hook_layer": 12,
302
+ "hook_name": "blocks.12.hook_resid_post",
303
+ "context_size": null,
304
+ "hook_head_index": null,
305
+ "architecture": "topk",
306
+ "apply_b_dec_to_input": null,
307
+ "finetuning_scaling_factor": null,
308
+ "activation_fn_str": "",
309
+ "prepend_bos": true,
310
+ "normalize_activations": "none",
311
+ "dtype": "bfloat16",
312
+ "device": "",
313
+ "dataset_path": "",
314
+ "dataset_trust_remote_code": true,
315
+ "seqpos_slice": [
316
+ null
317
+ ],
318
+ "training_tokens": -100000,
319
+ "sae_lens_training_version": null,
320
+ "neuronpedia_id": null
321
+ },
322
+ "eval_result_unstructured": null
323
+ }
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,323 @@
1
+ {
2
+ "eval_type_id": "scr",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": true,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "755bb8ea-c7ff-4fa3-97a2-31b4cab1f07b",
73
+ "datetime_epoch_millis": 1740201946199,
74
+ "eval_result_metrics": {
75
+ "scr_metrics": {
76
+ "scr_dir1_threshold_2": 0.12652894537606835,
77
+ "scr_metric_threshold_2": 0.0907272315324803,
78
+ "scr_dir2_threshold_2": 0.09369576167212271,
79
+ "scr_dir1_threshold_5": 0.17040759173008782,
80
+ "scr_metric_threshold_5": 0.15632193837867778,
81
+ "scr_dir2_threshold_5": 0.1618476987026072,
82
+ "scr_dir1_threshold_10": 0.16350941954437448,
83
+ "scr_metric_threshold_10": 0.21441616120887247,
84
+ "scr_dir2_threshold_10": 0.21743575905034218,
85
+ "scr_dir1_threshold_20": 0.13848990984582848,
86
+ "scr_metric_threshold_20": 0.25758062356316186,
87
+ "scr_dir2_threshold_20": 0.2642891875768084,
88
+ "scr_dir1_threshold_50": 0.04031027019135072,
89
+ "scr_metric_threshold_50": 0.31428981658344224,
90
+ "scr_dir2_threshold_50": 0.32129216728011795,
91
+ "scr_dir1_threshold_100": -0.05118550304462503,
92
+ "scr_metric_threshold_100": 0.2917722375027481,
93
+ "scr_dir2_threshold_100": 0.3017507752966668,
94
+ "scr_dir1_threshold_500": -0.173869850419616,
95
+ "scr_metric_threshold_500": 0.27581343492090155,
96
+ "scr_dir2_threshold_500": 0.28674743879422065
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
102
+ "scr_dir1_threshold_2": 0.2857136099242737,
103
+ "scr_metric_threshold_2": 0.004914087313907592,
104
+ "scr_dir2_threshold_2": 0.004914087313907592,
105
+ "scr_dir1_threshold_5": 0.333333017964661,
106
+ "scr_metric_threshold_5": 0.0319410549698257,
107
+ "scr_dir2_threshold_5": 0.0319410549698257,
108
+ "scr_dir1_threshold_10": 0.3492058386094512,
109
+ "scr_metric_threshold_10": 0.04914014089539928,
110
+ "scr_dir2_threshold_10": 0.04914014089539928,
111
+ "scr_dir1_threshold_20": 0.2698407892794835,
112
+ "scr_metric_threshold_20": 0.11793619170022296,
113
+ "scr_dir2_threshold_20": 0.11793619170022296,
114
+ "scr_dir1_threshold_50": 0.23809514798990317,
115
+ "scr_metric_threshold_50": 0.15724815796780706,
116
+ "scr_dir2_threshold_50": 0.15724815796780706,
117
+ "scr_dir1_threshold_100": 0.11111069061954806,
118
+ "scr_metric_threshold_100": 0.17444724389338062,
119
+ "scr_dir2_threshold_100": 0.17444724389338062,
120
+ "scr_dir1_threshold_500": 0.04761846193437052,
121
+ "scr_metric_threshold_500": 0.11547922126763682,
122
+ "scr_dir2_threshold_500": 0.11547922126763682
123
+ },
124
+ {
125
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
126
+ "scr_dir1_threshold_2": 0.07070677879567282,
127
+ "scr_metric_threshold_2": 0.11898016279667031,
128
+ "scr_dir2_threshold_2": 0.11898016279667031,
129
+ "scr_dir1_threshold_5": 0.15151504204837732,
130
+ "scr_metric_threshold_5": 0.18130315345731438,
131
+ "scr_dir2_threshold_5": 0.18130315345731438,
132
+ "scr_dir1_threshold_10": 0.1313126752015721,
133
+ "scr_metric_threshold_10": 0.23796032559334063,
134
+ "scr_dir2_threshold_10": 0.23796032559334063,
135
+ "scr_dir1_threshold_20": 0.06060589640589929,
136
+ "scr_metric_threshold_20": 0.26062326198842795,
137
+ "scr_dir2_threshold_20": 0.26062326198842795,
138
+ "scr_dir1_threshold_50": -0.3333339354005915,
139
+ "scr_metric_threshold_50": 0.32861190232199783,
140
+ "scr_dir2_threshold_50": 0.32861190232199783,
141
+ "scr_dir1_threshold_100": -0.3333339354005915,
142
+ "scr_metric_threshold_100": 0.1869688031302402,
143
+ "scr_dir2_threshold_100": 0.1869688031302402,
144
+ "scr_dir1_threshold_500": -0.41414159658603783,
145
+ "scr_metric_threshold_500": 0.26062326198842795,
146
+ "scr_dir2_threshold_500": 0.26062326198842795
147
+ },
148
+ {
149
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
150
+ "scr_dir1_threshold_2": 0.3870967431817548,
151
+ "scr_metric_threshold_2": 0.010101029865838383,
152
+ "scr_dir2_threshold_2": 0.010101029865838383,
153
+ "scr_dir1_threshold_5": 0.40322557386316116,
154
+ "scr_metric_threshold_5": 0.042929339300620824,
155
+ "scr_dir2_threshold_5": 0.042929339300620824,
156
+ "scr_dir1_threshold_10": 0.40322557386316116,
157
+ "scr_metric_threshold_10": 0.07828286857267056,
158
+ "scr_dir2_threshold_10": 0.07828286857267056,
159
+ "scr_dir1_threshold_20": 0.37096695113474787,
160
+ "scr_metric_threshold_20": 0.1085859581701857,
161
+ "scr_dir2_threshold_20": 0.1085859581701857,
162
+ "scr_dir1_threshold_50": 0.03225766136281265,
163
+ "scr_metric_threshold_50": 0.17424242652298136,
164
+ "scr_dir2_threshold_50": 0.17424242652298136,
165
+ "scr_dir1_threshold_100": 0.016128830681406324,
166
+ "scr_metric_threshold_100": 0.2045455161204965,
167
+ "scr_dir2_threshold_100": 0.2045455161204965,
168
+ "scr_dir1_threshold_500": -0.20967768295508404,
169
+ "scr_metric_threshold_500": 0.0959595579503108,
170
+ "scr_dir2_threshold_500": 0.0959595579503108
171
+ },
172
+ {
173
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
174
+ "scr_dir1_threshold_2": 0.10569112782625298,
175
+ "scr_metric_threshold_2": 0.17008790375159719,
176
+ "scr_dir2_threshold_2": 0.17008790375159719,
177
+ "scr_dir1_threshold_5": 0.22764220550708036,
178
+ "scr_metric_threshold_5": 0.208211108838793,
179
+ "scr_dir2_threshold_5": 0.208211108838793,
180
+ "scr_dir1_threshold_10": 0.08130071845388491,
181
+ "scr_metric_threshold_10": 0.27859231641854704,
182
+ "scr_dir2_threshold_10": 0.27859231641854704,
183
+ "scr_dir1_threshold_20": 0.12195107768082737,
184
+ "scr_metric_threshold_20": 0.34310849619736306,
185
+ "scr_dir2_threshold_20": 0.34310849619736306,
186
+ "scr_dir1_threshold_50": 0.06504076859931053,
187
+ "scr_metric_threshold_50": 0.39882695948107244,
188
+ "scr_dir2_threshold_50": 0.39882695948107244,
189
+ "scr_dir1_threshold_100": -0.5121947200956776,
190
+ "scr_metric_threshold_100": 0.4134897037771171,
191
+ "scr_dir2_threshold_100": 0.4134897037771171,
192
+ "scr_dir1_threshold_500": -0.7154470008208963,
193
+ "scr_metric_threshold_500": 0.23167156962994426,
194
+ "scr_dir2_threshold_500": 0.23167156962994426
195
+ },
196
+ {
197
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
198
+ "scr_dir1_threshold_2": 0.03278678379574697,
199
+ "scr_metric_threshold_2": 0.14062502910382568,
200
+ "scr_dir2_threshold_2": 0.14062502910382568,
201
+ "scr_dir1_threshold_5": 0.03825119347697291,
202
+ "scr_metric_threshold_5": 0.33984374272404355,
203
+ "scr_dir2_threshold_5": 0.33984374272404355,
204
+ "scr_dir1_threshold_10": 0.06010915791026799,
205
+ "scr_metric_threshold_10": 0.4570313445874335,
206
+ "scr_dir2_threshold_10": 0.4570313445874335,
207
+ "scr_dir1_threshold_20": 0.01092881936245188,
208
+ "scr_metric_threshold_20": 0.5117188300355207,
209
+ "scr_dir2_threshold_20": 0.5117188300355207,
210
+ "scr_dir1_threshold_50": -0.06010915791026799,
211
+ "scr_metric_threshold_50": 0.5859375436557386,
212
+ "scr_dir2_threshold_50": 0.5859375436557386,
213
+ "scr_dir1_threshold_100": -0.14754101564344832,
214
+ "scr_metric_threshold_100": 0.61718760186339,
215
+ "scr_dir2_threshold_100": 0.61718760186339,
216
+ "scr_dir1_threshold_500": -0.35519118919716464,
217
+ "scr_metric_threshold_500": 0.6562500582076514,
218
+ "scr_dir2_threshold_500": 0.6562500582076514
219
+ },
220
+ {
221
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
222
+ "scr_dir1_threshold_2": 0.05128200425668879,
223
+ "scr_metric_threshold_2": 0.06854845881151235,
224
+ "scr_dir2_threshold_2": 0.06854845881151235,
225
+ "scr_dir1_threshold_5": 0.07692300638503319,
226
+ "scr_metric_threshold_5": 0.10483879108333834,
227
+ "scr_dir2_threshold_5": 0.10483879108333834,
228
+ "scr_dir1_threshold_10": 0.06666642213478169,
229
+ "scr_metric_threshold_10": 0.1491935348194436,
230
+ "scr_dir2_threshold_10": 0.1491935348194436,
231
+ "scr_dir1_threshold_20": 0.03076914142104203,
232
+ "scr_metric_threshold_20": 0.2137097878988163,
233
+ "scr_dir2_threshold_20": 0.2137097878988163,
234
+ "scr_dir1_threshold_50": 0.06666642213478169,
235
+ "scr_metric_threshold_50": 0.2983870696388872,
236
+ "scr_dir2_threshold_50": 0.2983870696388872,
237
+ "scr_dir1_threshold_100": 0.09743586922067994,
238
+ "scr_metric_threshold_100": 0.23790326263293876,
239
+ "scr_dir2_threshold_100": 0.23790326263293876,
240
+ "scr_dir1_threshold_500": 0.07179486709233554,
241
+ "scr_metric_threshold_500": 0.31048392717659073,
242
+ "scr_dir2_threshold_500": 0.31048392717659073
243
+ },
244
+ {
245
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
246
+ "scr_dir1_threshold_2": 0.03603610860073534,
247
+ "scr_metric_threshold_2": 0.16964277398906946,
248
+ "scr_dir2_threshold_2": 0.16964277398906946,
249
+ "scr_dir1_threshold_5": 0.07657666365421574,
250
+ "scr_metric_threshold_5": 0.2857142857142857,
251
+ "scr_dir2_threshold_5": 0.2857142857142857,
252
+ "scr_dir1_threshold_10": 0.12612611161318626,
253
+ "scr_metric_threshold_10": 0.37499990021545476,
254
+ "scr_dir2_threshold_10": 0.37499990021545476,
255
+ "scr_dir1_threshold_20": 0.14414416591355395,
256
+ "scr_metric_threshold_20": 0.4062500249461363,
257
+ "scr_dir2_threshold_20": 0.4062500249461363,
258
+ "scr_dir1_threshold_50": 0.19369361387252446,
259
+ "scr_metric_threshold_50": 0.4508928321967208,
260
+ "scr_dir2_threshold_50": 0.4508928321967208,
261
+ "scr_dir1_threshold_100": 0.23873861537874994,
262
+ "scr_metric_threshold_100": 0.37946412772208915,
263
+ "scr_dir2_threshold_100": 0.37946412772208915,
264
+ "scr_dir1_threshold_500": 0.0810811101069608,
265
+ "scr_metric_threshold_500": 0.43303565607806294,
266
+ "scr_dir2_threshold_500": 0.43303565607806294
267
+ },
268
+ {
269
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
270
+ "scr_dir1_threshold_2": 0.042918406627421406,
271
+ "scr_metric_threshold_2": 0.042918406627421406,
272
+ "scr_dir2_threshold_2": 0.06666664774456064,
273
+ "scr_dir1_threshold_5": 0.05579403094120067,
274
+ "scr_metric_threshold_5": 0.05579403094120067,
275
+ "scr_dir2_threshold_5": 0.10000011353263609,
276
+ "scr_dir1_threshold_10": 0.09012885856869063,
277
+ "scr_metric_threshold_10": 0.09012885856869063,
278
+ "scr_dir2_threshold_10": 0.11428564130044823,
279
+ "scr_dir1_threshold_20": 0.09871243756862208,
280
+ "scr_metric_threshold_20": 0.09871243756862208,
281
+ "scr_dir2_threshold_20": 0.1523809496777944,
282
+ "scr_dir1_threshold_50": 0.12017164088233277,
283
+ "scr_metric_threshold_50": 0.12017164088233277,
284
+ "scr_dir2_threshold_50": 0.17619044645573817,
285
+ "scr_dir1_threshold_100": 0.12017164088233277,
286
+ "scr_metric_threshold_100": 0.12017164088233277,
287
+ "scr_dir2_threshold_100": 0.19999994323368195,
288
+ "scr_dir1_threshold_500": 0.10300422706858779,
289
+ "scr_metric_threshold_500": 0.10300422706858779,
290
+ "scr_dir2_threshold_500": 0.19047625805514054
291
+ }
292
+ ],
293
+ "sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
294
+ "sae_lens_id": "custom_sae",
295
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000",
296
+ "sae_lens_version": "5.5.0",
297
+ "sae_cfg_dict": {
298
+ "model_name": "gemma-2-2b",
299
+ "d_in": 2304,
300
+ "d_sae": 65536,
301
+ "hook_layer": 12,
302
+ "hook_name": "blocks.12.hook_resid_post",
303
+ "context_size": null,
304
+ "hook_head_index": null,
305
+ "architecture": "topk",
306
+ "apply_b_dec_to_input": null,
307
+ "finetuning_scaling_factor": null,
308
+ "activation_fn_str": "",
309
+ "prepend_bos": true,
310
+ "normalize_activations": "none",
311
+ "dtype": "bfloat16",
312
+ "device": "",
313
+ "dataset_path": "",
314
+ "dataset_trust_remote_code": true,
315
+ "seqpos_slice": [
316
+ null
317
+ ],
318
+ "training_tokens": -100000,
319
+ "sae_lens_training_version": null,
320
+ "neuronpedia_id": null
321
+ },
322
+ "eval_result_unstructured": null
323
+ }
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,323 @@
1
+ {
2
+ "eval_type_id": "scr",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": true,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "abc92c56-c28d-443e-a67d-598723b6587c",
73
+ "datetime_epoch_millis": 1740201509460,
74
+ "eval_result_metrics": {
75
+ "scr_metrics": {
76
+ "scr_dir1_threshold_2": 0.18683727638317182,
77
+ "scr_metric_threshold_2": 0.08404231131018772,
78
+ "scr_dir2_threshold_2": 0.08224127094506128,
79
+ "scr_dir1_threshold_5": 0.2253563520666828,
80
+ "scr_metric_threshold_5": 0.13959451018246774,
81
+ "scr_dir2_threshold_5": 0.1419013964046876,
82
+ "scr_dir1_threshold_10": 0.2258952276339641,
83
+ "scr_metric_threshold_10": 0.1895312922582373,
84
+ "scr_dir2_threshold_10": 0.19236695973140355,
85
+ "scr_dir1_threshold_20": 0.24793378644576078,
86
+ "scr_metric_threshold_20": 0.25111043795982735,
87
+ "scr_dir2_threshold_20": 0.2575763149690073,
88
+ "scr_dir1_threshold_50": 0.1565240815441055,
89
+ "scr_metric_threshold_50": 0.3408563171460069,
90
+ "scr_dir2_threshold_50": 0.34439707475977116,
91
+ "scr_dir1_threshold_100": 0.0584345012425294,
92
+ "scr_metric_threshold_100": 0.3621166850027227,
93
+ "scr_dir2_threshold_100": 0.36213712487414773,
94
+ "scr_dir1_threshold_500": 0.00039541249184461835,
95
+ "scr_metric_threshold_500": 0.31633581888474427,
96
+ "scr_dir2_threshold_500": 0.31587085277050087
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
102
+ "scr_dir1_threshold_2": 0.3968252466498386,
103
+ "scr_metric_threshold_2": 0.009828028179079856,
104
+ "scr_dir2_threshold_2": 0.009828028179079856,
105
+ "scr_dir1_threshold_5": 0.42857088793941894,
106
+ "scr_metric_threshold_5": 0.03439802540241183,
107
+ "scr_dir2_threshold_5": 0.03439802540241183,
108
+ "scr_dir1_threshold_10": 0.3968252466498386,
109
+ "scr_metric_threshold_10": 0.05405408176057155,
110
+ "scr_dir2_threshold_10": 0.05405408176057155,
111
+ "scr_dir1_threshold_20": 0.41269806729462877,
112
+ "scr_metric_threshold_20": 0.11302210438631537,
113
+ "scr_dir2_threshold_20": 0.11302210438631537,
114
+ "scr_dir1_threshold_50": 0.3809524260050484,
115
+ "scr_metric_threshold_50": 0.13759224805838266,
116
+ "scr_dir2_threshold_50": 0.13759224805838266,
117
+ "scr_dir1_threshold_100": 0.333333017964661,
118
+ "scr_metric_threshold_100": 0.16216224528171463,
119
+ "scr_dir2_threshold_100": 0.16216224528171463,
120
+ "scr_dir1_threshold_500": -0.1269844573703551,
121
+ "scr_metric_threshold_500": 0.1031940762072355,
122
+ "scr_dir2_threshold_500": 0.1031940762072355
123
+ },
124
+ {
125
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
126
+ "scr_dir1_threshold_2": 0.10101002803225154,
127
+ "scr_metric_threshold_2": 0.1416430991917576,
128
+ "scr_dir2_threshold_2": 0.1416430991917576,
129
+ "scr_dir1_threshold_5": 0.16161592443815084,
130
+ "scr_metric_threshold_5": 0.21813038288640824,
131
+ "scr_dir2_threshold_5": 0.21813038288640824,
132
+ "scr_dir1_threshold_10": 0.15151504204837732,
133
+ "scr_metric_threshold_10": 0.27478755502243446,
134
+ "scr_dir2_threshold_10": 0.27478755502243446,
135
+ "scr_dir1_threshold_20": 0.1414141596586038,
136
+ "scr_metric_threshold_20": 0.2832861983835152,
137
+ "scr_dir2_threshold_20": 0.2832861983835152,
138
+ "scr_dir1_threshold_50": 0.09090914564247801,
139
+ "scr_metric_threshold_50": 0.43059494724819863,
140
+ "scr_dir2_threshold_50": 0.43059494724819863,
141
+ "scr_dir1_threshold_100": -0.38383894941671726,
142
+ "scr_metric_threshold_100": 0.24645896895442135,
143
+ "scr_dir2_threshold_100": 0.24645896895442135,
144
+ "scr_dir1_threshold_500": -0.2828283193172076,
145
+ "scr_metric_threshold_500": 0.3371105456830785,
146
+ "scr_dir2_threshold_500": 0.3371105456830785
147
+ },
148
+ {
149
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
150
+ "scr_dir1_threshold_2": 0.4999995193171997,
151
+ "scr_metric_threshold_2": 0.022727279568944055,
152
+ "scr_dir2_threshold_2": 0.022727279568944055,
153
+ "scr_dir1_threshold_5": 0.5483869727270193,
154
+ "scr_metric_threshold_5": 0.0530303691664592,
155
+ "scr_dir2_threshold_5": 0.0530303691664592,
156
+ "scr_dir1_threshold_10": 0.5483869727270193,
157
+ "scr_metric_threshold_10": 0.08080808840993783,
158
+ "scr_dir2_threshold_10": 0.08080808840993783,
159
+ "scr_dir1_threshold_20": 0.5645158034084256,
160
+ "scr_metric_threshold_20": 0.16919198684844677,
161
+ "scr_dir2_threshold_20": 0.16919198684844677,
162
+ "scr_dir1_threshold_50": 0.2741930056807093,
163
+ "scr_metric_threshold_50": 0.23989904539254622,
164
+ "scr_dir2_threshold_50": 0.23989904539254622,
165
+ "scr_dir1_threshold_100": 0.0645153227256253,
166
+ "scr_metric_threshold_100": 0.25000007525838464,
167
+ "scr_dir2_threshold_100": 0.25000007525838464,
168
+ "scr_dir1_threshold_500": 0.016128830681406324,
169
+ "scr_metric_threshold_500": 0.07575764873540326,
170
+ "scr_dir2_threshold_500": 0.07575764873540326
171
+ },
172
+ {
173
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
174
+ "scr_dir1_threshold_2": 0.14634148705319544,
175
+ "scr_metric_threshold_2": 0.12023446826882572,
176
+ "scr_dir2_threshold_2": 0.12023446826882572,
177
+ "scr_dir1_threshold_5": 0.21951223057979316,
178
+ "scr_metric_threshold_5": 0.2375365974308822,
179
+ "scr_dir2_threshold_5": 0.2375365974308822,
180
+ "scr_dir1_threshold_10": 0.10569112782625298,
181
+ "scr_metric_threshold_10": 0.28739003291365367,
182
+ "scr_dir2_threshold_10": 0.28739003291365367,
183
+ "scr_dir1_threshold_20": 0.1869918462801379,
184
+ "scr_metric_threshold_20": 0.3665689569885143,
185
+ "scr_dir2_threshold_20": 0.3665689569885143,
186
+ "scr_dir1_threshold_50": -0.34146330826062055,
187
+ "scr_metric_threshold_50": 0.4398826784687373,
188
+ "scr_dir2_threshold_50": 0.4398826784687373,
189
+ "scr_dir1_threshold_100": -0.4390244611595863,
190
+ "scr_metric_threshold_100": 0.5395893746405804,
191
+ "scr_dir2_threshold_100": 0.5395893746405804,
192
+ "scr_dir1_threshold_500": -0.5284551545407584,
193
+ "scr_metric_threshold_500": 0.26099705822203345,
194
+ "scr_dir2_threshold_500": 0.26099705822203345
195
+ },
196
+ {
197
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
198
+ "scr_dir1_threshold_2": 0.021857964433295084,
199
+ "scr_metric_threshold_2": 0.12890643189891055,
200
+ "scr_dir2_threshold_2": 0.12890643189891055,
201
+ "scr_dir1_threshold_5": 0.027322374114521025,
202
+ "scr_metric_threshold_5": 0.2265625727595642,
203
+ "scr_dir2_threshold_5": 0.2265625727595642,
204
+ "scr_dir1_threshold_10": 0.06010915791026799,
205
+ "scr_metric_threshold_10": 0.3164063154836078,
206
+ "scr_dir2_threshold_10": 0.3164063154836078,
207
+ "scr_dir1_threshold_20": 0.06010915791026799,
208
+ "scr_metric_threshold_20": 0.41015625727595645,
209
+ "scr_dir2_threshold_20": 0.41015625727595645,
210
+ "scr_dir1_threshold_50": 0.08743153202478901,
211
+ "scr_metric_threshold_50": 0.5703125145519129,
212
+ "scr_dir2_threshold_50": 0.5703125145519129,
213
+ "scr_dir1_threshold_100": -0.00546440968122594,
214
+ "scr_metric_threshold_100": 0.6562500582076514,
215
+ "scr_dir2_threshold_100": 0.6562500582076514,
216
+ "scr_dir1_threshold_500": -0.016393554752069144,
217
+ "scr_metric_threshold_500": 0.6757812863797821,
218
+ "scr_dir2_threshold_500": 0.6757812863797821
219
+ },
220
+ {
221
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
222
+ "scr_dir1_threshold_2": 0.10769214780607521,
223
+ "scr_metric_threshold_2": 0.060483807005948444,
224
+ "scr_dir2_threshold_2": 0.060483807005948444,
225
+ "scr_dir1_threshold_5": 0.1692307363130155,
226
+ "scr_metric_threshold_5": 0.10483879108333834,
227
+ "scr_dir2_threshold_5": 0.10483879108333834,
228
+ "scr_dir1_threshold_10": 0.23076901915509954,
229
+ "scr_metric_threshold_10": 0.1370969176230247,
230
+ "scr_dir2_threshold_10": 0.1370969176230247,
231
+ "scr_dir1_threshold_20": 0.24102560340535104,
232
+ "scr_metric_threshold_20": 0.18951607282340924,
233
+ "scr_dir2_threshold_20": 0.18951607282340924,
234
+ "scr_dir1_threshold_50": 0.24615374269804866,
235
+ "scr_metric_threshold_50": 0.3306451961785736,
236
+ "scr_dir2_threshold_50": 0.3306451961785736,
237
+ "scr_dir1_threshold_100": 0.28717946836934216,
238
+ "scr_metric_threshold_100": 0.35483867091269605,
239
+ "scr_dir2_threshold_100": 0.35483867091269605,
240
+ "scr_dir1_threshold_500": 0.29743574695473746,
241
+ "scr_metric_threshold_500": 0.33870960764285285,
242
+ "scr_dir2_threshold_500": 0.33870960764285285
243
+ },
244
+ {
245
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
246
+ "scr_dir1_threshold_2": 0.13513500451867638,
247
+ "scr_metric_threshold_2": 0.10267856311319266,
248
+ "scr_dir2_threshold_2": 0.10267856311319266,
249
+ "scr_dir1_threshold_5": 0.16666666666666666,
250
+ "scr_metric_threshold_5": 0.1607143189758008,
251
+ "scr_dir2_threshold_5": 0.1607143189758008,
252
+ "scr_dir1_threshold_10": 0.19369361387252446,
253
+ "scr_metric_threshold_10": 0.24553570597033553,
254
+ "scr_dir2_threshold_10": 0.24553570597033553,
255
+ "scr_dir1_threshold_20": 0.25225222322637253,
256
+ "scr_metric_threshold_20": 0.35267849659016254,
257
+ "scr_dir2_threshold_20": 0.35267849659016254,
258
+ "scr_dir1_threshold_50": 0.34234222623882343,
259
+ "scr_metric_threshold_50": 0.4062500249461363,
260
+ "scr_dir2_threshold_50": 0.4062500249461363,
261
+ "scr_dir1_threshold_100": 0.3918919426871814,
262
+ "scr_metric_threshold_100": 0.4687500083153788,
263
+ "scr_dir2_threshold_100": 0.4687500083153788,
264
+ "scr_dir1_threshold_500": 0.37387388838681374,
265
+ "scr_metric_threshold_500": 0.4687500083153788,
266
+ "scr_dir2_threshold_500": 0.4687500083153788
267
+ },
268
+ {
269
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
270
+ "scr_dir1_threshold_2": 0.08583681325484281,
271
+ "scr_metric_threshold_2": 0.08583681325484281,
272
+ "scr_dir2_threshold_2": 0.07142849033383136,
273
+ "scr_dir1_threshold_5": 0.08154502375487709,
274
+ "scr_metric_threshold_5": 0.08154502375487709,
275
+ "scr_dir2_threshold_5": 0.10000011353263609,
276
+ "scr_dir1_threshold_10": 0.12017164088233277,
277
+ "scr_metric_threshold_10": 0.12017164088233277,
278
+ "scr_dir2_threshold_10": 0.14285698066766273,
279
+ "scr_dir1_threshold_20": 0.1244634303822985,
280
+ "scr_metric_threshold_20": 0.1244634303822985,
281
+ "scr_dir2_threshold_20": 0.17619044645573817,
282
+ "scr_dir1_threshold_50": 0.17167388232356773,
283
+ "scr_metric_threshold_50": 0.17167388232356773,
284
+ "scr_dir2_threshold_50": 0.19999994323368195,
285
+ "scr_dir1_threshold_100": 0.21888407845095484,
286
+ "scr_metric_threshold_100": 0.21888407845095484,
287
+ "scr_dir2_threshold_100": 0.21904759742235502,
288
+ "scr_dir1_threshold_500": 0.2703863198921898,
289
+ "scr_metric_threshold_500": 0.2703863198921898,
290
+ "scr_dir2_threshold_500": 0.2666665909782426
291
+ }
292
+ ],
293
+ "sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
294
+ "sae_lens_id": "custom_sae",
295
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000",
296
+ "sae_lens_version": "5.5.0",
297
+ "sae_cfg_dict": {
298
+ "model_name": "gemma-2-2b",
299
+ "d_in": 2304,
300
+ "d_sae": 65536,
301
+ "hook_layer": 12,
302
+ "hook_name": "blocks.12.hook_resid_post",
303
+ "context_size": null,
304
+ "hook_head_index": null,
305
+ "architecture": "topk",
306
+ "apply_b_dec_to_input": null,
307
+ "finetuning_scaling_factor": null,
308
+ "activation_fn_str": "",
309
+ "prepend_bos": true,
310
+ "normalize_activations": "none",
311
+ "dtype": "bfloat16",
312
+ "device": "",
313
+ "dataset_path": "",
314
+ "dataset_trust_remote_code": true,
315
+ "seqpos_slice": [
316
+ null
317
+ ],
318
+ "training_tokens": -100000,
319
+ "sae_lens_training_version": null,
320
+ "neuronpedia_id": null
321
+ },
322
+ "eval_result_unstructured": null
323
+ }
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,323 @@
1
+ {
2
+ "eval_type_id": "scr",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": true,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "7def7f32-ddef-4567-ac07-ad6074134cf7",
73
+ "datetime_epoch_millis": 1740200633241,
74
+ "eval_result_metrics": {
75
+ "scr_metrics": {
76
+ "scr_dir1_threshold_2": 0.20793359576572354,
77
+ "scr_metric_threshold_2": 0.08837145336971095,
78
+ "scr_dir2_threshold_2": 0.08633535098098324,
79
+ "scr_dir1_threshold_5": 0.23885799540887315,
80
+ "scr_metric_threshold_5": 0.14267344527111614,
81
+ "scr_dir2_threshold_5": 0.1444438578058403,
82
+ "scr_dir1_threshold_10": 0.2422475369969285,
83
+ "scr_metric_threshold_10": 0.20760578487732154,
84
+ "scr_dir2_threshold_10": 0.21431434889096806,
85
+ "scr_dir1_threshold_20": 0.25614441733043103,
86
+ "scr_metric_threshold_20": 0.2715601857041677,
87
+ "scr_dir2_threshold_20": 0.28106097447001843,
88
+ "scr_dir1_threshold_50": 0.1546741739649193,
89
+ "scr_metric_threshold_50": 0.35566136240159857,
90
+ "scr_dir2_threshold_50": 0.36247209379563494,
91
+ "scr_dir1_threshold_100": 0.1450762302274518,
92
+ "scr_metric_threshold_100": 0.34184354784903986,
93
+ "scr_dir2_threshold_100": 0.3450164482284107,
94
+ "scr_dir1_threshold_500": -0.007035319664922636,
95
+ "scr_metric_threshold_500": 0.3299446478569424,
96
+ "scr_dir2_threshold_500": 0.3331686479148758
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
102
+ "scr_dir1_threshold_2": 0.444444654690226,
103
+ "scr_metric_threshold_2": 0.004914087313907592,
104
+ "scr_dir2_threshold_2": 0.004914087313907592,
105
+ "scr_dir1_threshold_5": 0.47619029597980633,
106
+ "scr_metric_threshold_5": 0.017199085925573582,
107
+ "scr_dir2_threshold_5": 0.017199085925573582,
108
+ "scr_dir1_threshold_10": 0.4920631166245965,
109
+ "scr_metric_threshold_10": 0.04914014089539928,
110
+ "scr_dir2_threshold_10": 0.04914014089539928,
111
+ "scr_dir1_threshold_20": 0.4920631166245965,
112
+ "scr_metric_threshold_20": 0.12039316213280908,
113
+ "scr_dir2_threshold_20": 0.12039316213280908,
114
+ "scr_dir1_threshold_50": 0.3492058386094512,
115
+ "scr_metric_threshold_50": 0.14250618892355493,
116
+ "scr_dir2_threshold_50": 0.14250618892355493,
117
+ "scr_dir1_threshold_100": 0.42857088793941894,
118
+ "scr_metric_threshold_100": 0.20147421154929873,
119
+ "scr_dir2_threshold_100": 0.20147421154929873,
120
+ "scr_dir1_threshold_500": 0.19047573994951578,
121
+ "scr_metric_threshold_500": 0.09828013534206324,
122
+ "scr_dir2_threshold_500": 0.09828013534206324
123
+ },
124
+ {
125
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
126
+ "scr_dir1_threshold_2": 0.17171680682792437,
127
+ "scr_metric_threshold_2": 0.14447592402822051,
128
+ "scr_dir2_threshold_2": 0.14447592402822051,
129
+ "scr_dir1_threshold_5": 0.24242418769085533,
130
+ "scr_metric_threshold_5": 0.22096320772287115,
131
+ "scr_dir2_threshold_5": 0.22096320772287115,
132
+ "scr_dir1_threshold_10": 0.2323233053010818,
133
+ "scr_metric_threshold_10": 0.28895184805644103,
134
+ "scr_dir2_threshold_10": 0.28895184805644103,
135
+ "scr_dir1_threshold_20": 0.2121209384542766,
136
+ "scr_metric_threshold_20": 0.3031161410904476,
137
+ "scr_dir2_threshold_20": 0.3031161410904476,
138
+ "scr_dir1_threshold_50": 0.11111091042202506,
139
+ "scr_metric_threshold_50": 0.419263479050655,
140
+ "scr_dir2_threshold_50": 0.419263479050655,
141
+ "scr_dir1_threshold_100": 0.010100882389773526,
142
+ "scr_metric_threshold_100": 0.18130315345731438,
143
+ "scr_dir2_threshold_100": 0.18130315345731438,
144
+ "scr_dir1_threshold_500": -0.5555557562446416,
145
+ "scr_metric_threshold_500": 0.24929179379088426,
146
+ "scr_dir2_threshold_500": 0.24929179379088426
147
+ },
148
+ {
149
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
150
+ "scr_dir1_threshold_2": 0.5806446340898319,
151
+ "scr_metric_threshold_2": 0.03030308959751515,
152
+ "scr_dir2_threshold_2": 0.03030308959751515,
153
+ "scr_dir1_threshold_5": 0.5645158034084256,
154
+ "scr_metric_threshold_5": 0.0479797789751554,
155
+ "scr_dir2_threshold_5": 0.0479797789751554,
156
+ "scr_dir1_threshold_10": 0.5483869727270193,
157
+ "scr_metric_threshold_10": 0.06060602867826107,
158
+ "scr_dir2_threshold_10": 0.06060602867826107,
159
+ "scr_dir1_threshold_20": 0.5967744261368388,
160
+ "scr_metric_threshold_20": 0.14141411708819893,
161
+ "scr_dir2_threshold_20": 0.14141411708819893,
162
+ "scr_dir1_threshold_50": 0.40322557386316116,
163
+ "scr_metric_threshold_50": 0.19949492592919268,
164
+ "scr_dir2_threshold_50": 0.19949492592919268,
165
+ "scr_dir1_threshold_100": 0.258064174999303,
166
+ "scr_metric_threshold_100": 0.2904040442049689,
167
+ "scr_dir2_threshold_100": 0.2904040442049689,
168
+ "scr_dir1_threshold_500": 0.016128830681406324,
169
+ "scr_metric_threshold_500": 0.09848492830434731,
170
+ "scr_dir2_threshold_500": 0.09848492830434731
171
+ },
172
+ {
173
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
174
+ "scr_dir1_threshold_2": 0.23577218043436754,
175
+ "scr_metric_threshold_2": 0.10850441266694984,
176
+ "scr_dir2_threshold_2": 0.10850441266694984,
177
+ "scr_dir1_threshold_5": 0.2682925647340228,
178
+ "scr_metric_threshold_5": 0.21994133923436865,
179
+ "scr_dir2_threshold_5": 0.21994133923436865,
180
+ "scr_dir1_threshold_10": 0.15447146198048264,
181
+ "scr_metric_threshold_10": 0.28445751901318467,
182
+ "scr_dir2_threshold_10": 0.28445751901318467,
183
+ "scr_dir1_threshold_20": 0.0894311779716786,
184
+ "scr_metric_threshold_20": 0.3724339847894522,
185
+ "scr_dir2_threshold_20": 0.3724339847894522,
186
+ "scr_dir1_threshold_50": -0.32520287381553964,
187
+ "scr_metric_threshold_50": 0.4545454227647819,
188
+ "scr_dir2_threshold_50": 0.4545454227647819,
189
+ "scr_dir1_threshold_100": -0.39024364241485016,
190
+ "scr_metric_threshold_100": 0.14662744296044594,
191
+ "scr_dir2_threshold_100": 0.14662744296044594,
192
+ "scr_dir1_threshold_500": -0.764227334975126,
193
+ "scr_metric_threshold_500": 0.19354836454274843,
194
+ "scr_dir2_threshold_500": 0.19354836454274843
195
+ },
196
+ {
197
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
198
+ "scr_dir1_threshold_2": 0.00546440968122594,
199
+ "scr_metric_threshold_2": 0.1914063154836078,
200
+ "scr_dir2_threshold_2": 0.1914063154836078,
201
+ "scr_dir1_threshold_5": 0.03278678379574697,
202
+ "scr_metric_threshold_5": 0.2656250291038257,
203
+ "scr_dir2_threshold_5": 0.2656250291038257,
204
+ "scr_dir1_threshold_10": 0.09289626741440628,
205
+ "scr_metric_threshold_10": 0.42578128637978213,
206
+ "scr_dir2_threshold_10": 0.42578128637978213,
207
+ "scr_dir1_threshold_20": 0.1092894964580841,
208
+ "scr_metric_threshold_20": 0.5195312281721307,
209
+ "scr_dir2_threshold_20": 0.5195312281721307,
210
+ "scr_dir1_threshold_50": -0.016393554752069144,
211
+ "scr_metric_threshold_50": 0.6562500582076514,
212
+ "scr_dir2_threshold_50": 0.6562500582076514,
213
+ "scr_dir1_threshold_100": -0.06010915791026799,
214
+ "scr_metric_threshold_100": 0.7187499417923486,
215
+ "scr_dir2_threshold_100": 0.7187499417923486,
216
+ "scr_dir1_threshold_500": -0.05464474822904205,
217
+ "scr_metric_threshold_500": 0.7343749708961743,
218
+ "scr_dir2_threshold_500": 0.7343749708961743
219
+ },
220
+ {
221
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
222
+ "scr_dir1_threshold_2": 0.06666642213478169,
223
+ "scr_metric_threshold_2": 0.060483807005948444,
224
+ "scr_dir2_threshold_2": 0.060483807005948444,
225
+ "scr_dir1_threshold_5": 0.12820501064172196,
226
+ "scr_metric_threshold_5": 0.1008065853511987,
227
+ "scr_dir2_threshold_5": 0.1008065853511987,
228
+ "scr_dir1_threshold_10": 0.17948701489841076,
229
+ "scr_metric_threshold_10": 0.14516132908730398,
230
+ "scr_dir2_threshold_10": 0.14516132908730398,
231
+ "scr_dir1_threshold_20": 0.1999998777340575,
232
+ "scr_metric_threshold_20": 0.18951607282340924,
233
+ "scr_dir2_threshold_20": 0.18951607282340924,
234
+ "scr_dir1_threshold_50": 0.22051274056970427,
235
+ "scr_metric_threshold_50": 0.30241927537102686,
236
+ "scr_dir2_threshold_50": 0.30241927537102686,
237
+ "scr_dir1_threshold_100": 0.24102560340535104,
238
+ "scr_metric_threshold_100": 0.4153227182599291,
239
+ "scr_dir2_threshold_100": 0.4153227182599291,
240
+ "scr_dir1_threshold_500": 0.36410247475437535,
241
+ "scr_metric_threshold_500": 0.37499993991467884,
242
+ "scr_dir2_threshold_500": 0.37499993991467884
243
+ },
244
+ {
245
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
246
+ "scr_dir1_threshold_2": 0.09009000301245093,
247
+ "scr_metric_threshold_2": 0.09821433560655832,
248
+ "scr_dir2_threshold_2": 0.09821433560655832,
249
+ "scr_dir1_threshold_5": 0.11261250376556366,
250
+ "scr_metric_threshold_5": 0.18303572260109305,
251
+ "scr_dir2_threshold_5": 0.18303572260109305,
252
+ "scr_dir1_threshold_10": 0.13963971946080886,
253
+ "scr_metric_threshold_10": 0.308035689339578,
254
+ "scr_dir2_threshold_10": 0.308035689339578,
255
+ "scr_dir1_threshold_20": 0.22072082956776967,
256
+ "scr_metric_threshold_20": 0.39732130384074704,
257
+ "scr_dir2_threshold_20": 0.39732130384074704,
258
+ "scr_dir1_threshold_50": 0.30180167118534307,
259
+ "scr_metric_threshold_50": 0.47767846332864744,
260
+ "scr_dir2_threshold_50": 0.47767846332864744,
261
+ "scr_dir1_threshold_100": 0.44144139064615195,
262
+ "scr_metric_threshold_100": 0.5491071678032792,
263
+ "scr_dir2_threshold_100": 0.5491071678032792,
264
+ "scr_dir1_threshold_500": 0.4684683378520097,
265
+ "scr_metric_threshold_500": 0.6116071511725216,
266
+ "scr_dir2_threshold_500": 0.6116071511725216
267
+ },
268
+ {
269
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
270
+ "scr_dir1_threshold_2": 0.06866965525497992,
271
+ "scr_metric_threshold_2": 0.06866965525497992,
272
+ "scr_dir2_threshold_2": 0.05238083614515829,
273
+ "scr_dir1_threshold_5": 0.08583681325484281,
274
+ "scr_metric_threshold_5": 0.08583681325484281,
275
+ "scr_dir2_threshold_5": 0.10000011353263609,
276
+ "scr_dir1_threshold_10": 0.09871243756862208,
277
+ "scr_metric_threshold_10": 0.09871243756862208,
278
+ "scr_dir2_threshold_10": 0.1523809496777944,
279
+ "scr_dir1_threshold_20": 0.12875547569614632,
280
+ "scr_metric_threshold_20": 0.12875547569614632,
281
+ "scr_dir2_threshold_20": 0.20476178582295265,
282
+ "scr_dir1_threshold_50": 0.19313308563727843,
283
+ "scr_metric_threshold_50": 0.19313308563727843,
284
+ "scr_dir2_threshold_50": 0.24761893678956953,
285
+ "scr_dir1_threshold_100": 0.23175970276473412,
286
+ "scr_metric_threshold_100": 0.23175970276473412,
287
+ "scr_dir2_threshold_100": 0.25714290579970117,
288
+ "scr_dir1_threshold_500": 0.27896989889212126,
289
+ "scr_metric_threshold_500": 0.27896989889212126,
290
+ "scr_dir2_threshold_500": 0.3047618993555888
291
+ }
292
+ ],
293
+ "sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
294
+ "sae_lens_id": "custom_sae",
295
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000",
296
+ "sae_lens_version": "5.5.0",
297
+ "sae_cfg_dict": {
298
+ "model_name": "gemma-2-2b",
299
+ "d_in": 2304,
300
+ "d_sae": 65536,
301
+ "hook_layer": 12,
302
+ "hook_name": "blocks.12.hook_resid_post",
303
+ "context_size": null,
304
+ "hook_head_index": null,
305
+ "architecture": "topk",
306
+ "apply_b_dec_to_input": null,
307
+ "finetuning_scaling_factor": null,
308
+ "activation_fn_str": "",
309
+ "prepend_bos": true,
310
+ "normalize_activations": "none",
311
+ "dtype": "bfloat16",
312
+ "device": "",
313
+ "dataset_path": "",
314
+ "dataset_trust_remote_code": true,
315
+ "seqpos_slice": [
316
+ null
317
+ ],
318
+ "training_tokens": -100000,
319
+ "sae_lens_training_version": null,
320
+ "neuronpedia_id": null
321
+ },
322
+ "eval_result_unstructured": null
323
+ }
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,670 @@
1
+ {
2
+ "eval_type_id": "sparse_probing",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "LabHC/bias_in_bios_class_set2",
8
+ "LabHC/bias_in_bios_class_set3",
9
+ "canrager/amazon_reviews_mcauley_1and5",
10
+ "canrager/amazon_reviews_mcauley_1and5_sentiment",
11
+ "codeparrot/github-code",
12
+ "fancyzhx/ag_news",
13
+ "Helsinki-NLP/europarl"
14
+ ],
15
+ "probe_train_set_size": 4000,
16
+ "probe_test_set_size": 1000,
17
+ "context_length": 128,
18
+ "sae_batch_size": 125,
19
+ "llm_batch_size": 32,
20
+ "llm_dtype": "bfloat16",
21
+ "model_name": "gemma-2-2b",
22
+ "k_values": [
23
+ 1,
24
+ 2,
25
+ 5
26
+ ],
27
+ "lower_vram_usage": false
28
+ },
29
+ "eval_id": "0eeb0d78-c565-4180-bea5-6e755bb53289",
30
+ "datetime_epoch_millis": 1740203520129,
31
+ "eval_result_metrics": {
32
+ "llm": {
33
+ "llm_test_accuracy": 0.9587437950074672,
34
+ "llm_top_1_test_accuracy": 0.6508312500000001,
35
+ "llm_top_2_test_accuracy": 0.7267250000000001,
36
+ "llm_top_5_test_accuracy": 0.77896875,
37
+ "llm_top_10_test_accuracy": null,
38
+ "llm_top_20_test_accuracy": null,
39
+ "llm_top_50_test_accuracy": null,
40
+ "llm_top_100_test_accuracy": null
41
+ },
42
+ "sae": {
43
+ "sae_test_accuracy": 0.9576437868177892,
44
+ "sae_top_1_test_accuracy": 0.6868375000000001,
45
+ "sae_top_2_test_accuracy": 0.7738562499999999,
46
+ "sae_top_5_test_accuracy": 0.86161875,
47
+ "sae_top_10_test_accuracy": null,
48
+ "sae_top_20_test_accuracy": null,
49
+ "sae_top_50_test_accuracy": null,
50
+ "sae_top_100_test_accuracy": null
51
+ }
52
+ },
53
+ "eval_result_details": [
54
+ {
55
+ "dataset_name": "LabHC/bias_in_bios_class_set1_results",
56
+ "llm_test_accuracy": 0.966800057888031,
57
+ "llm_top_1_test_accuracy": 0.6397999999999999,
58
+ "llm_top_2_test_accuracy": 0.6954,
59
+ "llm_top_5_test_accuracy": 0.7869999999999999,
60
+ "llm_top_10_test_accuracy": null,
61
+ "llm_top_20_test_accuracy": null,
62
+ "llm_top_50_test_accuracy": null,
63
+ "llm_top_100_test_accuracy": null,
64
+ "sae_test_accuracy": 0.9654000401496887,
65
+ "sae_top_1_test_accuracy": 0.6992,
66
+ "sae_top_2_test_accuracy": 0.7709999999999999,
67
+ "sae_top_5_test_accuracy": 0.8725999999999999,
68
+ "sae_top_10_test_accuracy": null,
69
+ "sae_top_20_test_accuracy": null,
70
+ "sae_top_50_test_accuracy": null,
71
+ "sae_top_100_test_accuracy": null
72
+ },
73
+ {
74
+ "dataset_name": "LabHC/bias_in_bios_class_set2_results",
75
+ "llm_test_accuracy": 0.9544000387191772,
76
+ "llm_top_1_test_accuracy": 0.6744000000000001,
77
+ "llm_top_2_test_accuracy": 0.7334,
78
+ "llm_top_5_test_accuracy": 0.763,
79
+ "llm_top_10_test_accuracy": null,
80
+ "llm_top_20_test_accuracy": null,
81
+ "llm_top_50_test_accuracy": null,
82
+ "llm_top_100_test_accuracy": null,
83
+ "sae_test_accuracy": 0.9496000289916993,
84
+ "sae_top_1_test_accuracy": 0.6658,
85
+ "sae_top_2_test_accuracy": 0.7876,
86
+ "sae_top_5_test_accuracy": 0.8301999999999999,
87
+ "sae_top_10_test_accuracy": null,
88
+ "sae_top_20_test_accuracy": null,
89
+ "sae_top_50_test_accuracy": null,
90
+ "sae_top_100_test_accuracy": null
91
+ },
92
+ {
93
+ "dataset_name": "LabHC/bias_in_bios_class_set3_results",
94
+ "llm_test_accuracy": 0.9310000538825989,
95
+ "llm_top_1_test_accuracy": 0.6864000000000001,
96
+ "llm_top_2_test_accuracy": 0.7436,
97
+ "llm_top_5_test_accuracy": 0.763,
98
+ "llm_top_10_test_accuracy": null,
99
+ "llm_top_20_test_accuracy": null,
100
+ "llm_top_50_test_accuracy": null,
101
+ "llm_top_100_test_accuracy": null,
102
+ "sae_test_accuracy": 0.9298000454902648,
103
+ "sae_top_1_test_accuracy": 0.7092,
104
+ "sae_top_2_test_accuracy": 0.8158,
105
+ "sae_top_5_test_accuracy": 0.8566,
106
+ "sae_top_10_test_accuracy": null,
107
+ "sae_top_20_test_accuracy": null,
108
+ "sae_top_50_test_accuracy": null,
109
+ "sae_top_100_test_accuracy": null
110
+ },
111
+ {
112
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
113
+ "llm_test_accuracy": 0.915600061416626,
114
+ "llm_top_1_test_accuracy": 0.599,
115
+ "llm_top_2_test_accuracy": 0.6476000000000001,
116
+ "llm_top_5_test_accuracy": 0.6708000000000001,
117
+ "llm_top_10_test_accuracy": null,
118
+ "llm_top_20_test_accuracy": null,
119
+ "llm_top_50_test_accuracy": null,
120
+ "llm_top_100_test_accuracy": null,
121
+ "sae_test_accuracy": 0.9200000286102294,
122
+ "sae_top_1_test_accuracy": 0.7190000000000001,
123
+ "sae_top_2_test_accuracy": 0.7762,
124
+ "sae_top_5_test_accuracy": 0.8204,
125
+ "sae_top_10_test_accuracy": null,
126
+ "sae_top_20_test_accuracy": null,
127
+ "sae_top_50_test_accuracy": null,
128
+ "sae_top_100_test_accuracy": null
129
+ },
130
+ {
131
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
132
+ "llm_test_accuracy": 0.9820000529289246,
133
+ "llm_top_1_test_accuracy": 0.673,
134
+ "llm_top_2_test_accuracy": 0.724,
135
+ "llm_top_5_test_accuracy": 0.766,
136
+ "llm_top_10_test_accuracy": null,
137
+ "llm_top_20_test_accuracy": null,
138
+ "llm_top_50_test_accuracy": null,
139
+ "llm_top_100_test_accuracy": null,
140
+ "sae_test_accuracy": 0.9730000495910645,
141
+ "sae_top_1_test_accuracy": 0.591,
142
+ "sae_top_2_test_accuracy": 0.664,
143
+ "sae_top_5_test_accuracy": 0.904,
144
+ "sae_top_10_test_accuracy": null,
145
+ "sae_top_20_test_accuracy": null,
146
+ "sae_top_50_test_accuracy": null,
147
+ "sae_top_100_test_accuracy": null
148
+ },
149
+ {
150
+ "dataset_name": "codeparrot/github-code_results",
151
+ "llm_test_accuracy": 0.9708000302314759,
152
+ "llm_top_1_test_accuracy": 0.649,
153
+ "llm_top_2_test_accuracy": 0.6958,
154
+ "llm_top_5_test_accuracy": 0.7556,
155
+ "llm_top_10_test_accuracy": null,
156
+ "llm_top_20_test_accuracy": null,
157
+ "llm_top_50_test_accuracy": null,
158
+ "llm_top_100_test_accuracy": null,
159
+ "sae_test_accuracy": 0.9706000328063965,
160
+ "sae_top_1_test_accuracy": 0.6342000000000001,
161
+ "sae_top_2_test_accuracy": 0.7213999999999999,
162
+ "sae_top_5_test_accuracy": 0.8103999999999999,
163
+ "sae_top_10_test_accuracy": null,
164
+ "sae_top_20_test_accuracy": null,
165
+ "sae_top_50_test_accuracy": null,
166
+ "sae_top_100_test_accuracy": null
167
+ },
168
+ {
169
+ "dataset_name": "fancyzhx/ag_news_results",
170
+ "llm_test_accuracy": 0.9497500360012054,
171
+ "llm_top_1_test_accuracy": 0.63425,
172
+ "llm_top_2_test_accuracy": 0.782,
173
+ "llm_top_5_test_accuracy": 0.8247499999999999,
174
+ "llm_top_10_test_accuracy": null,
175
+ "llm_top_20_test_accuracy": null,
176
+ "llm_top_50_test_accuracy": null,
177
+ "llm_top_100_test_accuracy": null,
178
+ "sae_test_accuracy": 0.9537500441074371,
179
+ "sae_top_1_test_accuracy": 0.6445000000000001,
180
+ "sae_top_2_test_accuracy": 0.74825,
181
+ "sae_top_5_test_accuracy": 0.81875,
182
+ "sae_top_10_test_accuracy": null,
183
+ "sae_top_20_test_accuracy": null,
184
+ "sae_top_50_test_accuracy": null,
185
+ "sae_top_100_test_accuracy": null
186
+ },
187
+ {
188
+ "dataset_name": "Helsinki-NLP/europarl_results",
189
+ "llm_test_accuracy": 0.9996000289916992,
190
+ "llm_top_1_test_accuracy": 0.6508,
191
+ "llm_top_2_test_accuracy": 0.792,
192
+ "llm_top_5_test_accuracy": 0.9016,
193
+ "llm_top_10_test_accuracy": null,
194
+ "llm_top_20_test_accuracy": null,
195
+ "llm_top_50_test_accuracy": null,
196
+ "llm_top_100_test_accuracy": null,
197
+ "sae_test_accuracy": 0.9990000247955322,
198
+ "sae_top_1_test_accuracy": 0.8318,
199
+ "sae_top_2_test_accuracy": 0.9065999999999999,
200
+ "sae_top_5_test_accuracy": 0.9800000000000001,
201
+ "sae_top_10_test_accuracy": null,
202
+ "sae_top_20_test_accuracy": null,
203
+ "sae_top_50_test_accuracy": null,
204
+ "sae_top_100_test_accuracy": null
205
+ }
206
+ ],
207
+ "sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
208
+ "sae_lens_id": "custom_sae",
209
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000",
210
+ "sae_lens_version": "5.5.0",
211
+ "sae_cfg_dict": {
212
+ "model_name": "gemma-2-2b",
213
+ "d_in": 2304,
214
+ "d_sae": 65536,
215
+ "hook_layer": 12,
216
+ "hook_name": "blocks.12.hook_resid_post",
217
+ "context_size": null,
218
+ "hook_head_index": null,
219
+ "architecture": "topk",
220
+ "apply_b_dec_to_input": null,
221
+ "finetuning_scaling_factor": null,
222
+ "activation_fn_str": "",
223
+ "prepend_bos": true,
224
+ "normalize_activations": "none",
225
+ "dtype": "bfloat16",
226
+ "device": "",
227
+ "dataset_path": "",
228
+ "dataset_trust_remote_code": true,
229
+ "seqpos_slice": [
230
+ null
231
+ ],
232
+ "training_tokens": -100000,
233
+ "sae_lens_training_version": null,
234
+ "neuronpedia_id": null
235
+ },
236
+ "eval_result_unstructured": {
237
+ "LabHC/bias_in_bios_class_set1_results": {
238
+ "sae_test_accuracy": {
239
+ "0": 0.9430000185966492,
240
+ "1": 0.9650000333786011,
241
+ "2": 0.9550000429153442,
242
+ "6": 0.987000048160553,
243
+ "9": 0.9770000576972961
244
+ },
245
+ "llm_test_accuracy": {
246
+ "0": 0.9510000348091125,
247
+ "1": 0.9670000672340393,
248
+ "2": 0.9530000686645508,
249
+ "6": 0.987000048160553,
250
+ "9": 0.9760000705718994
251
+ },
252
+ "llm_top_1_test_accuracy": {
253
+ "0": 0.577,
254
+ "1": 0.613,
255
+ "2": 0.662,
256
+ "6": 0.787,
257
+ "9": 0.56
258
+ },
259
+ "llm_top_2_test_accuracy": {
260
+ "0": 0.574,
261
+ "1": 0.66,
262
+ "2": 0.718,
263
+ "6": 0.811,
264
+ "9": 0.714
265
+ },
266
+ "llm_top_5_test_accuracy": {
267
+ "0": 0.713,
268
+ "1": 0.711,
269
+ "2": 0.755,
270
+ "6": 0.895,
271
+ "9": 0.861
272
+ },
273
+ "sae_top_1_test_accuracy": {
274
+ "0": 0.555,
275
+ "1": 0.659,
276
+ "2": 0.847,
277
+ "6": 0.817,
278
+ "9": 0.618
279
+ },
280
+ "sae_top_2_test_accuracy": {
281
+ "0": 0.57,
282
+ "1": 0.658,
283
+ "2": 0.859,
284
+ "6": 0.828,
285
+ "9": 0.94
286
+ },
287
+ "sae_top_5_test_accuracy": {
288
+ "0": 0.689,
289
+ "1": 0.85,
290
+ "2": 0.909,
291
+ "6": 0.972,
292
+ "9": 0.943
293
+ }
294
+ },
295
+ "LabHC/bias_in_bios_class_set2_results": {
296
+ "sae_test_accuracy": {
297
+ "11": 0.9650000333786011,
298
+ "13": 0.9470000267028809,
299
+ "14": 0.9470000267028809,
300
+ "18": 0.9290000200271606,
301
+ "19": 0.9600000381469727
302
+ },
303
+ "llm_test_accuracy": {
304
+ "11": 0.9660000205039978,
305
+ "13": 0.9510000348091125,
306
+ "14": 0.9540000557899475,
307
+ "18": 0.940000057220459,
308
+ "19": 0.9610000252723694
309
+ },
310
+ "llm_top_1_test_accuracy": {
311
+ "11": 0.553,
312
+ "13": 0.673,
313
+ "14": 0.651,
314
+ "18": 0.706,
315
+ "19": 0.789
316
+ },
317
+ "llm_top_2_test_accuracy": {
318
+ "11": 0.77,
319
+ "13": 0.719,
320
+ "14": 0.672,
321
+ "18": 0.717,
322
+ "19": 0.789
323
+ },
324
+ "llm_top_5_test_accuracy": {
325
+ "11": 0.793,
326
+ "13": 0.739,
327
+ "14": 0.732,
328
+ "18": 0.723,
329
+ "19": 0.828
330
+ },
331
+ "sae_top_1_test_accuracy": {
332
+ "11": 0.573,
333
+ "13": 0.657,
334
+ "14": 0.625,
335
+ "18": 0.662,
336
+ "19": 0.812
337
+ },
338
+ "sae_top_2_test_accuracy": {
339
+ "11": 0.86,
340
+ "13": 0.681,
341
+ "14": 0.872,
342
+ "18": 0.695,
343
+ "19": 0.83
344
+ },
345
+ "sae_top_5_test_accuracy": {
346
+ "11": 0.945,
347
+ "13": 0.755,
348
+ "14": 0.889,
349
+ "18": 0.74,
350
+ "19": 0.822
351
+ }
352
+ },
353
+ "LabHC/bias_in_bios_class_set3_results": {
354
+ "sae_test_accuracy": {
355
+ "20": 0.9570000171661377,
356
+ "21": 0.9220000505447388,
357
+ "22": 0.9170000553131104,
358
+ "25": 0.9540000557899475,
359
+ "26": 0.8990000486373901
360
+ },
361
+ "llm_test_accuracy": {
362
+ "20": 0.9650000333786011,
363
+ "21": 0.9250000715255737,
364
+ "22": 0.9140000343322754,
365
+ "25": 0.9670000672340393,
366
+ "26": 0.8840000629425049
367
+ },
368
+ "llm_top_1_test_accuracy": {
369
+ "20": 0.709,
370
+ "21": 0.762,
371
+ "22": 0.653,
372
+ "25": 0.683,
373
+ "26": 0.625
374
+ },
375
+ "llm_top_2_test_accuracy": {
376
+ "20": 0.811,
377
+ "21": 0.769,
378
+ "22": 0.688,
379
+ "25": 0.766,
380
+ "26": 0.684
381
+ },
382
+ "llm_top_5_test_accuracy": {
383
+ "20": 0.815,
384
+ "21": 0.794,
385
+ "22": 0.706,
386
+ "25": 0.803,
387
+ "26": 0.697
388
+ },
389
+ "sae_top_1_test_accuracy": {
390
+ "20": 0.573,
391
+ "21": 0.769,
392
+ "22": 0.867,
393
+ "25": 0.729,
394
+ "26": 0.608
395
+ },
396
+ "sae_top_2_test_accuracy": {
397
+ "20": 0.875,
398
+ "21": 0.84,
399
+ "22": 0.888,
400
+ "25": 0.861,
401
+ "26": 0.615
402
+ },
403
+ "sae_top_5_test_accuracy": {
404
+ "20": 0.897,
405
+ "21": 0.844,
406
+ "22": 0.891,
407
+ "25": 0.888,
408
+ "26": 0.763
409
+ }
410
+ },
411
+ "canrager/amazon_reviews_mcauley_1and5_results": {
412
+ "sae_test_accuracy": {
413
+ "1": 0.9380000233650208,
414
+ "2": 0.9420000314712524,
415
+ "3": 0.9200000166893005,
416
+ "5": 0.9290000200271606,
417
+ "6": 0.8710000514984131
418
+ },
419
+ "llm_test_accuracy": {
420
+ "1": 0.9490000605583191,
421
+ "2": 0.9300000667572021,
422
+ "3": 0.9120000600814819,
423
+ "5": 0.9250000715255737,
424
+ "6": 0.862000048160553
425
+ },
426
+ "llm_top_1_test_accuracy": {
427
+ "1": 0.684,
428
+ "2": 0.592,
429
+ "3": 0.583,
430
+ "5": 0.551,
431
+ "6": 0.585
432
+ },
433
+ "llm_top_2_test_accuracy": {
434
+ "1": 0.739,
435
+ "2": 0.635,
436
+ "3": 0.609,
437
+ "5": 0.635,
438
+ "6": 0.62
439
+ },
440
+ "llm_top_5_test_accuracy": {
441
+ "1": 0.757,
442
+ "2": 0.636,
443
+ "3": 0.622,
444
+ "5": 0.657,
445
+ "6": 0.682
446
+ },
447
+ "sae_top_1_test_accuracy": {
448
+ "1": 0.908,
449
+ "2": 0.844,
450
+ "3": 0.539,
451
+ "5": 0.769,
452
+ "6": 0.535
453
+ },
454
+ "sae_top_2_test_accuracy": {
455
+ "1": 0.907,
456
+ "2": 0.853,
457
+ "3": 0.703,
458
+ "5": 0.846,
459
+ "6": 0.572
460
+ },
461
+ "sae_top_5_test_accuracy": {
462
+ "1": 0.92,
463
+ "2": 0.881,
464
+ "3": 0.764,
465
+ "5": 0.878,
466
+ "6": 0.659
467
+ }
468
+ },
469
+ "canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
470
+ "sae_test_accuracy": {
471
+ "1.0": 0.9730000495910645,
472
+ "5.0": 0.9730000495910645
473
+ },
474
+ "llm_test_accuracy": {
475
+ "1.0": 0.9820000529289246,
476
+ "5.0": 0.9820000529289246
477
+ },
478
+ "llm_top_1_test_accuracy": {
479
+ "1.0": 0.673,
480
+ "5.0": 0.673
481
+ },
482
+ "llm_top_2_test_accuracy": {
483
+ "1.0": 0.724,
484
+ "5.0": 0.724
485
+ },
486
+ "llm_top_5_test_accuracy": {
487
+ "1.0": 0.766,
488
+ "5.0": 0.766
489
+ },
490
+ "sae_top_1_test_accuracy": {
491
+ "1.0": 0.591,
492
+ "5.0": 0.591
493
+ },
494
+ "sae_top_2_test_accuracy": {
495
+ "1.0": 0.664,
496
+ "5.0": 0.664
497
+ },
498
+ "sae_top_5_test_accuracy": {
499
+ "1.0": 0.904,
500
+ "5.0": 0.904
501
+ }
502
+ },
503
+ "codeparrot/github-code_results": {
504
+ "sae_test_accuracy": {
505
+ "C": 0.9610000252723694,
506
+ "Python": 0.9830000400543213,
507
+ "HTML": 0.9890000224113464,
508
+ "Java": 0.9660000205039978,
509
+ "PHP": 0.9540000557899475
510
+ },
511
+ "llm_test_accuracy": {
512
+ "C": 0.956000030040741,
513
+ "Python": 0.9830000400543213,
514
+ "HTML": 0.984000027179718,
515
+ "Java": 0.9700000286102295,
516
+ "PHP": 0.9610000252723694
517
+ },
518
+ "llm_top_1_test_accuracy": {
519
+ "C": 0.663,
520
+ "Python": 0.655,
521
+ "HTML": 0.72,
522
+ "Java": 0.612,
523
+ "PHP": 0.595
524
+ },
525
+ "llm_top_2_test_accuracy": {
526
+ "C": 0.68,
527
+ "Python": 0.683,
528
+ "HTML": 0.798,
529
+ "Java": 0.681,
530
+ "PHP": 0.637
531
+ },
532
+ "llm_top_5_test_accuracy": {
533
+ "C": 0.74,
534
+ "Python": 0.724,
535
+ "HTML": 0.902,
536
+ "Java": 0.739,
537
+ "PHP": 0.673
538
+ },
539
+ "sae_top_1_test_accuracy": {
540
+ "C": 0.623,
541
+ "Python": 0.641,
542
+ "HTML": 0.687,
543
+ "Java": 0.613,
544
+ "PHP": 0.607
545
+ },
546
+ "sae_top_2_test_accuracy": {
547
+ "C": 0.627,
548
+ "Python": 0.661,
549
+ "HTML": 0.813,
550
+ "Java": 0.598,
551
+ "PHP": 0.908
552
+ },
553
+ "sae_top_5_test_accuracy": {
554
+ "C": 0.691,
555
+ "Python": 0.939,
556
+ "HTML": 0.835,
557
+ "Java": 0.662,
558
+ "PHP": 0.925
559
+ }
560
+ },
561
+ "fancyzhx/ag_news_results": {
562
+ "sae_test_accuracy": {
563
+ "0": 0.9450000524520874,
564
+ "1": 0.984000027179718,
565
+ "2": 0.940000057220459,
566
+ "3": 0.9460000395774841
567
+ },
568
+ "llm_test_accuracy": {
569
+ "0": 0.9450000524520874,
570
+ "1": 0.9890000224113464,
571
+ "2": 0.9200000166893005,
572
+ "3": 0.9450000524520874
573
+ },
574
+ "llm_top_1_test_accuracy": {
575
+ "0": 0.559,
576
+ "1": 0.66,
577
+ "2": 0.668,
578
+ "3": 0.65
579
+ },
580
+ "llm_top_2_test_accuracy": {
581
+ "0": 0.807,
582
+ "1": 0.799,
583
+ "2": 0.699,
584
+ "3": 0.823
585
+ },
586
+ "llm_top_5_test_accuracy": {
587
+ "0": 0.815,
588
+ "1": 0.88,
589
+ "2": 0.756,
590
+ "3": 0.848
591
+ },
592
+ "sae_top_1_test_accuracy": {
593
+ "0": 0.673,
594
+ "1": 0.637,
595
+ "2": 0.558,
596
+ "3": 0.71
597
+ },
598
+ "sae_top_2_test_accuracy": {
599
+ "0": 0.837,
600
+ "1": 0.657,
601
+ "2": 0.777,
602
+ "3": 0.722
603
+ },
604
+ "sae_top_5_test_accuracy": {
605
+ "0": 0.84,
606
+ "1": 0.872,
607
+ "2": 0.798,
608
+ "3": 0.765
609
+ }
610
+ },
611
+ "Helsinki-NLP/europarl_results": {
612
+ "sae_test_accuracy": {
613
+ "en": 0.9980000257492065,
614
+ "fr": 1.0,
615
+ "de": 0.999000072479248,
616
+ "es": 1.0,
617
+ "nl": 0.9980000257492065
618
+ },
619
+ "llm_test_accuracy": {
620
+ "en": 0.999000072479248,
621
+ "fr": 1.0,
622
+ "de": 1.0,
623
+ "es": 1.0,
624
+ "nl": 0.999000072479248
625
+ },
626
+ "llm_top_1_test_accuracy": {
627
+ "en": 0.741,
628
+ "fr": 0.609,
629
+ "de": 0.754,
630
+ "es": 0.494,
631
+ "nl": 0.656
632
+ },
633
+ "llm_top_2_test_accuracy": {
634
+ "en": 0.832,
635
+ "fr": 0.597,
636
+ "de": 0.826,
637
+ "es": 0.964,
638
+ "nl": 0.741
639
+ },
640
+ "llm_top_5_test_accuracy": {
641
+ "en": 0.879,
642
+ "fr": 0.909,
643
+ "de": 0.874,
644
+ "es": 0.979,
645
+ "nl": 0.867
646
+ },
647
+ "sae_top_1_test_accuracy": {
648
+ "en": 0.718,
649
+ "fr": 0.993,
650
+ "de": 0.891,
651
+ "es": 0.897,
652
+ "nl": 0.66
653
+ },
654
+ "sae_top_2_test_accuracy": {
655
+ "en": 0.722,
656
+ "fr": 0.997,
657
+ "de": 0.905,
658
+ "es": 0.911,
659
+ "nl": 0.998
660
+ },
661
+ "sae_top_5_test_accuracy": {
662
+ "en": 0.999,
663
+ "fr": 0.995,
664
+ "de": 0.98,
665
+ "es": 0.931,
666
+ "nl": 0.995
667
+ }
668
+ }
669
+ }
670
+ }
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,670 @@
1
+ {
2
+ "eval_type_id": "sparse_probing",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "LabHC/bias_in_bios_class_set2",
8
+ "LabHC/bias_in_bios_class_set3",
9
+ "canrager/amazon_reviews_mcauley_1and5",
10
+ "canrager/amazon_reviews_mcauley_1and5_sentiment",
11
+ "codeparrot/github-code",
12
+ "fancyzhx/ag_news",
13
+ "Helsinki-NLP/europarl"
14
+ ],
15
+ "probe_train_set_size": 4000,
16
+ "probe_test_set_size": 1000,
17
+ "context_length": 128,
18
+ "sae_batch_size": 125,
19
+ "llm_batch_size": 32,
20
+ "llm_dtype": "bfloat16",
21
+ "model_name": "gemma-2-2b",
22
+ "k_values": [
23
+ 1,
24
+ 2,
25
+ 5
26
+ ],
27
+ "lower_vram_usage": false
28
+ },
29
+ "eval_id": "efe2ab1e-0e1b-47f8-a56e-d7f2f99e4ac0",
30
+ "datetime_epoch_millis": 1740203784246,
31
+ "eval_result_metrics": {
32
+ "llm": {
33
+ "llm_test_accuracy": 0.9587437950074672,
34
+ "llm_top_1_test_accuracy": 0.6508312500000001,
35
+ "llm_top_2_test_accuracy": 0.7267250000000001,
36
+ "llm_top_5_test_accuracy": 0.77896875,
37
+ "llm_top_10_test_accuracy": null,
38
+ "llm_top_20_test_accuracy": null,
39
+ "llm_top_50_test_accuracy": null,
40
+ "llm_top_100_test_accuracy": null
41
+ },
42
+ "sae": {
43
+ "sae_test_accuracy": 0.9502250470221043,
44
+ "sae_top_1_test_accuracy": 0.69245,
45
+ "sae_top_2_test_accuracy": 0.77648125,
46
+ "sae_top_5_test_accuracy": 0.85739375,
47
+ "sae_top_10_test_accuracy": null,
48
+ "sae_top_20_test_accuracy": null,
49
+ "sae_top_50_test_accuracy": null,
50
+ "sae_top_100_test_accuracy": null
51
+ }
52
+ },
53
+ "eval_result_details": [
54
+ {
55
+ "dataset_name": "LabHC/bias_in_bios_class_set1_results",
56
+ "llm_test_accuracy": 0.966800057888031,
57
+ "llm_top_1_test_accuracy": 0.6397999999999999,
58
+ "llm_top_2_test_accuracy": 0.6954,
59
+ "llm_top_5_test_accuracy": 0.7869999999999999,
60
+ "llm_top_10_test_accuracy": null,
61
+ "llm_top_20_test_accuracy": null,
62
+ "llm_top_50_test_accuracy": null,
63
+ "llm_top_100_test_accuracy": null,
64
+ "sae_test_accuracy": 0.955400037765503,
65
+ "sae_top_1_test_accuracy": 0.6858,
66
+ "sae_top_2_test_accuracy": 0.8606,
67
+ "sae_top_5_test_accuracy": 0.8960000000000001,
68
+ "sae_top_10_test_accuracy": null,
69
+ "sae_top_20_test_accuracy": null,
70
+ "sae_top_50_test_accuracy": null,
71
+ "sae_top_100_test_accuracy": null
72
+ },
73
+ {
74
+ "dataset_name": "LabHC/bias_in_bios_class_set2_results",
75
+ "llm_test_accuracy": 0.9544000387191772,
76
+ "llm_top_1_test_accuracy": 0.6744000000000001,
77
+ "llm_top_2_test_accuracy": 0.7334,
78
+ "llm_top_5_test_accuracy": 0.763,
79
+ "llm_top_10_test_accuracy": null,
80
+ "llm_top_20_test_accuracy": null,
81
+ "llm_top_50_test_accuracy": null,
82
+ "llm_top_100_test_accuracy": null,
83
+ "sae_test_accuracy": 0.9468000531196594,
84
+ "sae_top_1_test_accuracy": 0.6689999999999999,
85
+ "sae_top_2_test_accuracy": 0.8225999999999999,
86
+ "sae_top_5_test_accuracy": 0.8782,
87
+ "sae_top_10_test_accuracy": null,
88
+ "sae_top_20_test_accuracy": null,
89
+ "sae_top_50_test_accuracy": null,
90
+ "sae_top_100_test_accuracy": null
91
+ },
92
+ {
93
+ "dataset_name": "LabHC/bias_in_bios_class_set3_results",
94
+ "llm_test_accuracy": 0.9310000538825989,
95
+ "llm_top_1_test_accuracy": 0.6864000000000001,
96
+ "llm_top_2_test_accuracy": 0.7436,
97
+ "llm_top_5_test_accuracy": 0.763,
98
+ "llm_top_10_test_accuracy": null,
99
+ "llm_top_20_test_accuracy": null,
100
+ "llm_top_50_test_accuracy": null,
101
+ "llm_top_100_test_accuracy": null,
102
+ "sae_test_accuracy": 0.92160005569458,
103
+ "sae_top_1_test_accuracy": 0.7544,
104
+ "sae_top_2_test_accuracy": 0.7952,
105
+ "sae_top_5_test_accuracy": 0.8654,
106
+ "sae_top_10_test_accuracy": null,
107
+ "sae_top_20_test_accuracy": null,
108
+ "sae_top_50_test_accuracy": null,
109
+ "sae_top_100_test_accuracy": null
110
+ },
111
+ {
112
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
113
+ "llm_test_accuracy": 0.915600061416626,
114
+ "llm_top_1_test_accuracy": 0.599,
115
+ "llm_top_2_test_accuracy": 0.6476000000000001,
116
+ "llm_top_5_test_accuracy": 0.6708000000000001,
117
+ "llm_top_10_test_accuracy": null,
118
+ "llm_top_20_test_accuracy": null,
119
+ "llm_top_50_test_accuracy": null,
120
+ "llm_top_100_test_accuracy": null,
121
+ "sae_test_accuracy": 0.9052000403404236,
122
+ "sae_top_1_test_accuracy": 0.713,
123
+ "sae_top_2_test_accuracy": 0.7246,
124
+ "sae_top_5_test_accuracy": 0.8096,
125
+ "sae_top_10_test_accuracy": null,
126
+ "sae_top_20_test_accuracy": null,
127
+ "sae_top_50_test_accuracy": null,
128
+ "sae_top_100_test_accuracy": null
129
+ },
130
+ {
131
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
132
+ "llm_test_accuracy": 0.9820000529289246,
133
+ "llm_top_1_test_accuracy": 0.673,
134
+ "llm_top_2_test_accuracy": 0.724,
135
+ "llm_top_5_test_accuracy": 0.766,
136
+ "llm_top_10_test_accuracy": null,
137
+ "llm_top_20_test_accuracy": null,
138
+ "llm_top_50_test_accuracy": null,
139
+ "llm_top_100_test_accuracy": null,
140
+ "sae_test_accuracy": 0.9630000591278076,
141
+ "sae_top_1_test_accuracy": 0.742,
142
+ "sae_top_2_test_accuracy": 0.752,
143
+ "sae_top_5_test_accuracy": 0.797,
144
+ "sae_top_10_test_accuracy": null,
145
+ "sae_top_20_test_accuracy": null,
146
+ "sae_top_50_test_accuracy": null,
147
+ "sae_top_100_test_accuracy": null
148
+ },
149
+ {
150
+ "dataset_name": "codeparrot/github-code_results",
151
+ "llm_test_accuracy": 0.9708000302314759,
152
+ "llm_top_1_test_accuracy": 0.649,
153
+ "llm_top_2_test_accuracy": 0.6958,
154
+ "llm_top_5_test_accuracy": 0.7556,
155
+ "llm_top_10_test_accuracy": null,
156
+ "llm_top_20_test_accuracy": null,
157
+ "llm_top_50_test_accuracy": null,
158
+ "llm_top_100_test_accuracy": null,
159
+ "sae_test_accuracy": 0.9660000443458557,
160
+ "sae_top_1_test_accuracy": 0.5998,
161
+ "sae_top_2_test_accuracy": 0.6456000000000001,
162
+ "sae_top_5_test_accuracy": 0.797,
163
+ "sae_top_10_test_accuracy": null,
164
+ "sae_top_20_test_accuracy": null,
165
+ "sae_top_50_test_accuracy": null,
166
+ "sae_top_100_test_accuracy": null
167
+ },
168
+ {
169
+ "dataset_name": "fancyzhx/ag_news_results",
170
+ "llm_test_accuracy": 0.9497500360012054,
171
+ "llm_top_1_test_accuracy": 0.63425,
172
+ "llm_top_2_test_accuracy": 0.782,
173
+ "llm_top_5_test_accuracy": 0.8247499999999999,
174
+ "llm_top_10_test_accuracy": null,
175
+ "llm_top_20_test_accuracy": null,
176
+ "llm_top_50_test_accuracy": null,
177
+ "llm_top_100_test_accuracy": null,
178
+ "sae_test_accuracy": 0.9480000436306,
179
+ "sae_top_1_test_accuracy": 0.605,
180
+ "sae_top_2_test_accuracy": 0.69525,
181
+ "sae_top_5_test_accuracy": 0.83375,
182
+ "sae_top_10_test_accuracy": null,
183
+ "sae_top_20_test_accuracy": null,
184
+ "sae_top_50_test_accuracy": null,
185
+ "sae_top_100_test_accuracy": null
186
+ },
187
+ {
188
+ "dataset_name": "Helsinki-NLP/europarl_results",
189
+ "llm_test_accuracy": 0.9996000289916992,
190
+ "llm_top_1_test_accuracy": 0.6508,
191
+ "llm_top_2_test_accuracy": 0.792,
192
+ "llm_top_5_test_accuracy": 0.9016,
193
+ "llm_top_10_test_accuracy": null,
194
+ "llm_top_20_test_accuracy": null,
195
+ "llm_top_50_test_accuracy": null,
196
+ "llm_top_100_test_accuracy": null,
197
+ "sae_test_accuracy": 0.9958000421524048,
198
+ "sae_top_1_test_accuracy": 0.7706000000000001,
199
+ "sae_top_2_test_accuracy": 0.916,
200
+ "sae_top_5_test_accuracy": 0.9822000000000001,
201
+ "sae_top_10_test_accuracy": null,
202
+ "sae_top_20_test_accuracy": null,
203
+ "sae_top_50_test_accuracy": null,
204
+ "sae_top_100_test_accuracy": null
205
+ }
206
+ ],
207
+ "sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
208
+ "sae_lens_id": "custom_sae",
209
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000",
210
+ "sae_lens_version": "5.5.0",
211
+ "sae_cfg_dict": {
212
+ "model_name": "gemma-2-2b",
213
+ "d_in": 2304,
214
+ "d_sae": 65536,
215
+ "hook_layer": 12,
216
+ "hook_name": "blocks.12.hook_resid_post",
217
+ "context_size": null,
218
+ "hook_head_index": null,
219
+ "architecture": "topk",
220
+ "apply_b_dec_to_input": null,
221
+ "finetuning_scaling_factor": null,
222
+ "activation_fn_str": "",
223
+ "prepend_bos": true,
224
+ "normalize_activations": "none",
225
+ "dtype": "bfloat16",
226
+ "device": "",
227
+ "dataset_path": "",
228
+ "dataset_trust_remote_code": true,
229
+ "seqpos_slice": [
230
+ null
231
+ ],
232
+ "training_tokens": -100000,
233
+ "sae_lens_training_version": null,
234
+ "neuronpedia_id": null
235
+ },
236
+ "eval_result_unstructured": {
237
+ "LabHC/bias_in_bios_class_set1_results": {
238
+ "sae_test_accuracy": {
239
+ "0": 0.9330000281333923,
240
+ "1": 0.9500000476837158,
241
+ "2": 0.9430000185966492,
242
+ "6": 0.9830000400543213,
243
+ "9": 0.968000054359436
244
+ },
245
+ "llm_test_accuracy": {
246
+ "0": 0.9510000348091125,
247
+ "1": 0.9670000672340393,
248
+ "2": 0.9530000686645508,
249
+ "6": 0.987000048160553,
250
+ "9": 0.9760000705718994
251
+ },
252
+ "llm_top_1_test_accuracy": {
253
+ "0": 0.577,
254
+ "1": 0.613,
255
+ "2": 0.662,
256
+ "6": 0.787,
257
+ "9": 0.56
258
+ },
259
+ "llm_top_2_test_accuracy": {
260
+ "0": 0.574,
261
+ "1": 0.66,
262
+ "2": 0.718,
263
+ "6": 0.811,
264
+ "9": 0.714
265
+ },
266
+ "llm_top_5_test_accuracy": {
267
+ "0": 0.713,
268
+ "1": 0.711,
269
+ "2": 0.755,
270
+ "6": 0.895,
271
+ "9": 0.861
272
+ },
273
+ "sae_top_1_test_accuracy": {
274
+ "0": 0.569,
275
+ "1": 0.639,
276
+ "2": 0.886,
277
+ "6": 0.787,
278
+ "9": 0.548
279
+ },
280
+ "sae_top_2_test_accuracy": {
281
+ "0": 0.878,
282
+ "1": 0.744,
283
+ "2": 0.884,
284
+ "6": 0.953,
285
+ "9": 0.844
286
+ },
287
+ "sae_top_5_test_accuracy": {
288
+ "0": 0.866,
289
+ "1": 0.853,
290
+ "2": 0.901,
291
+ "6": 0.983,
292
+ "9": 0.877
293
+ }
294
+ },
295
+ "LabHC/bias_in_bios_class_set2_results": {
296
+ "sae_test_accuracy": {
297
+ "11": 0.9600000381469727,
298
+ "13": 0.9550000429153442,
299
+ "14": 0.9480000734329224,
300
+ "18": 0.9130000472068787,
301
+ "19": 0.9580000638961792
302
+ },
303
+ "llm_test_accuracy": {
304
+ "11": 0.9660000205039978,
305
+ "13": 0.9510000348091125,
306
+ "14": 0.9540000557899475,
307
+ "18": 0.940000057220459,
308
+ "19": 0.9610000252723694
309
+ },
310
+ "llm_top_1_test_accuracy": {
311
+ "11": 0.553,
312
+ "13": 0.673,
313
+ "14": 0.651,
314
+ "18": 0.706,
315
+ "19": 0.789
316
+ },
317
+ "llm_top_2_test_accuracy": {
318
+ "11": 0.77,
319
+ "13": 0.719,
320
+ "14": 0.672,
321
+ "18": 0.717,
322
+ "19": 0.789
323
+ },
324
+ "llm_top_5_test_accuracy": {
325
+ "11": 0.793,
326
+ "13": 0.739,
327
+ "14": 0.732,
328
+ "18": 0.723,
329
+ "19": 0.828
330
+ },
331
+ "sae_top_1_test_accuracy": {
332
+ "11": 0.552,
333
+ "13": 0.674,
334
+ "14": 0.646,
335
+ "18": 0.689,
336
+ "19": 0.784
337
+ },
338
+ "sae_top_2_test_accuracy": {
339
+ "11": 0.848,
340
+ "13": 0.69,
341
+ "14": 0.878,
342
+ "18": 0.915,
343
+ "19": 0.782
344
+ },
345
+ "sae_top_5_test_accuracy": {
346
+ "11": 0.925,
347
+ "13": 0.77,
348
+ "14": 0.881,
349
+ "18": 0.916,
350
+ "19": 0.899
351
+ }
352
+ },
353
+ "LabHC/bias_in_bios_class_set3_results": {
354
+ "sae_test_accuracy": {
355
+ "20": 0.9480000734329224,
356
+ "21": 0.921000063419342,
357
+ "22": 0.906000018119812,
358
+ "25": 0.9480000734329224,
359
+ "26": 0.8850000500679016
360
+ },
361
+ "llm_test_accuracy": {
362
+ "20": 0.9650000333786011,
363
+ "21": 0.9250000715255737,
364
+ "22": 0.9140000343322754,
365
+ "25": 0.9670000672340393,
366
+ "26": 0.8840000629425049
367
+ },
368
+ "llm_top_1_test_accuracy": {
369
+ "20": 0.709,
370
+ "21": 0.762,
371
+ "22": 0.653,
372
+ "25": 0.683,
373
+ "26": 0.625
374
+ },
375
+ "llm_top_2_test_accuracy": {
376
+ "20": 0.811,
377
+ "21": 0.769,
378
+ "22": 0.688,
379
+ "25": 0.766,
380
+ "26": 0.684
381
+ },
382
+ "llm_top_5_test_accuracy": {
383
+ "20": 0.815,
384
+ "21": 0.794,
385
+ "22": 0.706,
386
+ "25": 0.803,
387
+ "26": 0.697
388
+ },
389
+ "sae_top_1_test_accuracy": {
390
+ "20": 0.851,
391
+ "21": 0.738,
392
+ "22": 0.835,
393
+ "25": 0.703,
394
+ "26": 0.645
395
+ },
396
+ "sae_top_2_test_accuracy": {
397
+ "20": 0.883,
398
+ "21": 0.75,
399
+ "22": 0.859,
400
+ "25": 0.852,
401
+ "26": 0.632
402
+ },
403
+ "sae_top_5_test_accuracy": {
404
+ "20": 0.941,
405
+ "21": 0.843,
406
+ "22": 0.869,
407
+ "25": 0.873,
408
+ "26": 0.801
409
+ }
410
+ },
411
+ "canrager/amazon_reviews_mcauley_1and5_results": {
412
+ "sae_test_accuracy": {
413
+ "1": 0.9300000667572021,
414
+ "2": 0.9230000376701355,
415
+ "3": 0.906000018119812,
416
+ "5": 0.9180000424385071,
417
+ "6": 0.8490000367164612
418
+ },
419
+ "llm_test_accuracy": {
420
+ "1": 0.9490000605583191,
421
+ "2": 0.9300000667572021,
422
+ "3": 0.9120000600814819,
423
+ "5": 0.9250000715255737,
424
+ "6": 0.862000048160553
425
+ },
426
+ "llm_top_1_test_accuracy": {
427
+ "1": 0.684,
428
+ "2": 0.592,
429
+ "3": 0.583,
430
+ "5": 0.551,
431
+ "6": 0.585
432
+ },
433
+ "llm_top_2_test_accuracy": {
434
+ "1": 0.739,
435
+ "2": 0.635,
436
+ "3": 0.609,
437
+ "5": 0.635,
438
+ "6": 0.62
439
+ },
440
+ "llm_top_5_test_accuracy": {
441
+ "1": 0.757,
442
+ "2": 0.636,
443
+ "3": 0.622,
444
+ "5": 0.657,
445
+ "6": 0.682
446
+ },
447
+ "sae_top_1_test_accuracy": {
448
+ "1": 0.862,
449
+ "2": 0.864,
450
+ "3": 0.686,
451
+ "5": 0.527,
452
+ "6": 0.626
453
+ },
454
+ "sae_top_2_test_accuracy": {
455
+ "1": 0.861,
456
+ "2": 0.865,
457
+ "3": 0.67,
458
+ "5": 0.522,
459
+ "6": 0.705
460
+ },
461
+ "sae_top_5_test_accuracy": {
462
+ "1": 0.885,
463
+ "2": 0.874,
464
+ "3": 0.688,
465
+ "5": 0.87,
466
+ "6": 0.731
467
+ }
468
+ },
469
+ "canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
470
+ "sae_test_accuracy": {
471
+ "1.0": 0.9640000462532043,
472
+ "5.0": 0.9620000720024109
473
+ },
474
+ "llm_test_accuracy": {
475
+ "1.0": 0.9820000529289246,
476
+ "5.0": 0.9820000529289246
477
+ },
478
+ "llm_top_1_test_accuracy": {
479
+ "1.0": 0.673,
480
+ "5.0": 0.673
481
+ },
482
+ "llm_top_2_test_accuracy": {
483
+ "1.0": 0.724,
484
+ "5.0": 0.724
485
+ },
486
+ "llm_top_5_test_accuracy": {
487
+ "1.0": 0.766,
488
+ "5.0": 0.766
489
+ },
490
+ "sae_top_1_test_accuracy": {
491
+ "1.0": 0.742,
492
+ "5.0": 0.742
493
+ },
494
+ "sae_top_2_test_accuracy": {
495
+ "1.0": 0.752,
496
+ "5.0": 0.752
497
+ },
498
+ "sae_top_5_test_accuracy": {
499
+ "1.0": 0.797,
500
+ "5.0": 0.797
501
+ }
502
+ },
503
+ "codeparrot/github-code_results": {
504
+ "sae_test_accuracy": {
505
+ "C": 0.9490000605583191,
506
+ "Python": 0.9790000319480896,
507
+ "HTML": 0.984000027179718,
508
+ "Java": 0.971000075340271,
509
+ "PHP": 0.9470000267028809
510
+ },
511
+ "llm_test_accuracy": {
512
+ "C": 0.956000030040741,
513
+ "Python": 0.9830000400543213,
514
+ "HTML": 0.984000027179718,
515
+ "Java": 0.9700000286102295,
516
+ "PHP": 0.9610000252723694
517
+ },
518
+ "llm_top_1_test_accuracy": {
519
+ "C": 0.663,
520
+ "Python": 0.655,
521
+ "HTML": 0.72,
522
+ "Java": 0.612,
523
+ "PHP": 0.595
524
+ },
525
+ "llm_top_2_test_accuracy": {
526
+ "C": 0.68,
527
+ "Python": 0.683,
528
+ "HTML": 0.798,
529
+ "Java": 0.681,
530
+ "PHP": 0.637
531
+ },
532
+ "llm_top_5_test_accuracy": {
533
+ "C": 0.74,
534
+ "Python": 0.724,
535
+ "HTML": 0.902,
536
+ "Java": 0.739,
537
+ "PHP": 0.673
538
+ },
539
+ "sae_top_1_test_accuracy": {
540
+ "C": 0.545,
541
+ "Python": 0.646,
542
+ "HTML": 0.602,
543
+ "Java": 0.616,
544
+ "PHP": 0.59
545
+ },
546
+ "sae_top_2_test_accuracy": {
547
+ "C": 0.573,
548
+ "Python": 0.68,
549
+ "HTML": 0.748,
550
+ "Java": 0.646,
551
+ "PHP": 0.581
552
+ },
553
+ "sae_top_5_test_accuracy": {
554
+ "C": 0.656,
555
+ "Python": 0.855,
556
+ "HTML": 0.896,
557
+ "Java": 0.669,
558
+ "PHP": 0.909
559
+ }
560
+ },
561
+ "fancyzhx/ag_news_results": {
562
+ "sae_test_accuracy": {
563
+ "0": 0.9420000314712524,
564
+ "1": 0.9820000529289246,
565
+ "2": 0.9270000457763672,
566
+ "3": 0.9410000443458557
567
+ },
568
+ "llm_test_accuracy": {
569
+ "0": 0.9450000524520874,
570
+ "1": 0.9890000224113464,
571
+ "2": 0.9200000166893005,
572
+ "3": 0.9450000524520874
573
+ },
574
+ "llm_top_1_test_accuracy": {
575
+ "0": 0.559,
576
+ "1": 0.66,
577
+ "2": 0.668,
578
+ "3": 0.65
579
+ },
580
+ "llm_top_2_test_accuracy": {
581
+ "0": 0.807,
582
+ "1": 0.799,
583
+ "2": 0.699,
584
+ "3": 0.823
585
+ },
586
+ "llm_top_5_test_accuracy": {
587
+ "0": 0.815,
588
+ "1": 0.88,
589
+ "2": 0.756,
590
+ "3": 0.848
591
+ },
592
+ "sae_top_1_test_accuracy": {
593
+ "0": 0.582,
594
+ "1": 0.656,
595
+ "2": 0.544,
596
+ "3": 0.638
597
+ },
598
+ "sae_top_2_test_accuracy": {
599
+ "0": 0.669,
600
+ "1": 0.693,
601
+ "2": 0.754,
602
+ "3": 0.665
603
+ },
604
+ "sae_top_5_test_accuracy": {
605
+ "0": 0.796,
606
+ "1": 0.934,
607
+ "2": 0.802,
608
+ "3": 0.803
609
+ }
610
+ },
611
+ "Helsinki-NLP/europarl_results": {
612
+ "sae_test_accuracy": {
613
+ "en": 0.9970000386238098,
614
+ "fr": 0.9980000257492065,
615
+ "de": 0.9930000305175781,
616
+ "es": 0.9950000643730164,
617
+ "nl": 0.9960000514984131
618
+ },
619
+ "llm_test_accuracy": {
620
+ "en": 0.999000072479248,
621
+ "fr": 1.0,
622
+ "de": 1.0,
623
+ "es": 1.0,
624
+ "nl": 0.999000072479248
625
+ },
626
+ "llm_top_1_test_accuracy": {
627
+ "en": 0.741,
628
+ "fr": 0.609,
629
+ "de": 0.754,
630
+ "es": 0.494,
631
+ "nl": 0.656
632
+ },
633
+ "llm_top_2_test_accuracy": {
634
+ "en": 0.832,
635
+ "fr": 0.597,
636
+ "de": 0.826,
637
+ "es": 0.964,
638
+ "nl": 0.741
639
+ },
640
+ "llm_top_5_test_accuracy": {
641
+ "en": 0.879,
642
+ "fr": 0.909,
643
+ "de": 0.874,
644
+ "es": 0.979,
645
+ "nl": 0.867
646
+ },
647
+ "sae_top_1_test_accuracy": {
648
+ "en": 0.741,
649
+ "fr": 0.595,
650
+ "de": 0.936,
651
+ "es": 0.925,
652
+ "nl": 0.656
653
+ },
654
+ "sae_top_2_test_accuracy": {
655
+ "en": 0.974,
656
+ "fr": 0.978,
657
+ "de": 0.936,
658
+ "es": 0.919,
659
+ "nl": 0.773
660
+ },
661
+ "sae_top_5_test_accuracy": {
662
+ "en": 0.998,
663
+ "fr": 0.98,
664
+ "de": 0.95,
665
+ "es": 0.99,
666
+ "nl": 0.993
667
+ }
668
+ }
669
+ }
670
+ }
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,670 @@
1
+ {
2
+ "eval_type_id": "sparse_probing",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "LabHC/bias_in_bios_class_set2",
8
+ "LabHC/bias_in_bios_class_set3",
9
+ "canrager/amazon_reviews_mcauley_1and5",
10
+ "canrager/amazon_reviews_mcauley_1and5_sentiment",
11
+ "codeparrot/github-code",
12
+ "fancyzhx/ag_news",
13
+ "Helsinki-NLP/europarl"
14
+ ],
15
+ "probe_train_set_size": 4000,
16
+ "probe_test_set_size": 1000,
17
+ "context_length": 128,
18
+ "sae_batch_size": 125,
19
+ "llm_batch_size": 32,
20
+ "llm_dtype": "bfloat16",
21
+ "model_name": "gemma-2-2b",
22
+ "k_values": [
23
+ 1,
24
+ 2,
25
+ 5
26
+ ],
27
+ "lower_vram_usage": false
28
+ },
29
+ "eval_id": "46c18a2b-109a-432c-a208-824f33f6831a",
30
+ "datetime_epoch_millis": 1740203656830,
31
+ "eval_result_metrics": {
32
+ "llm": {
33
+ "llm_test_accuracy": 0.9587437950074672,
34
+ "llm_top_1_test_accuracy": 0.6508312500000001,
35
+ "llm_top_2_test_accuracy": 0.7267250000000001,
36
+ "llm_top_5_test_accuracy": 0.77896875,
37
+ "llm_top_10_test_accuracy": null,
38
+ "llm_top_20_test_accuracy": null,
39
+ "llm_top_50_test_accuracy": null,
40
+ "llm_top_100_test_accuracy": null
41
+ },
42
+ "sae": {
43
+ "sae_test_accuracy": 0.9546437960118056,
44
+ "sae_top_1_test_accuracy": 0.7031999999999999,
45
+ "sae_top_2_test_accuracy": 0.80226875,
46
+ "sae_top_5_test_accuracy": 0.8621687499999999,
47
+ "sae_top_10_test_accuracy": null,
48
+ "sae_top_20_test_accuracy": null,
49
+ "sae_top_50_test_accuracy": null,
50
+ "sae_top_100_test_accuracy": null
51
+ }
52
+ },
53
+ "eval_result_details": [
54
+ {
55
+ "dataset_name": "LabHC/bias_in_bios_class_set1_results",
56
+ "llm_test_accuracy": 0.966800057888031,
57
+ "llm_top_1_test_accuracy": 0.6397999999999999,
58
+ "llm_top_2_test_accuracy": 0.6954,
59
+ "llm_top_5_test_accuracy": 0.7869999999999999,
60
+ "llm_top_10_test_accuracy": null,
61
+ "llm_top_20_test_accuracy": null,
62
+ "llm_top_50_test_accuracy": null,
63
+ "llm_top_100_test_accuracy": null,
64
+ "sae_test_accuracy": 0.9562000513076783,
65
+ "sae_top_1_test_accuracy": 0.6838,
66
+ "sae_top_2_test_accuracy": 0.8674,
67
+ "sae_top_5_test_accuracy": 0.8904,
68
+ "sae_top_10_test_accuracy": null,
69
+ "sae_top_20_test_accuracy": null,
70
+ "sae_top_50_test_accuracy": null,
71
+ "sae_top_100_test_accuracy": null
72
+ },
73
+ {
74
+ "dataset_name": "LabHC/bias_in_bios_class_set2_results",
75
+ "llm_test_accuracy": 0.9544000387191772,
76
+ "llm_top_1_test_accuracy": 0.6744000000000001,
77
+ "llm_top_2_test_accuracy": 0.7334,
78
+ "llm_top_5_test_accuracy": 0.763,
79
+ "llm_top_10_test_accuracy": null,
80
+ "llm_top_20_test_accuracy": null,
81
+ "llm_top_50_test_accuracy": null,
82
+ "llm_top_100_test_accuracy": null,
83
+ "sae_test_accuracy": 0.9466000437736511,
84
+ "sae_top_1_test_accuracy": 0.6722,
85
+ "sae_top_2_test_accuracy": 0.7876000000000001,
86
+ "sae_top_5_test_accuracy": 0.8619999999999999,
87
+ "sae_top_10_test_accuracy": null,
88
+ "sae_top_20_test_accuracy": null,
89
+ "sae_top_50_test_accuracy": null,
90
+ "sae_top_100_test_accuracy": null
91
+ },
92
+ {
93
+ "dataset_name": "LabHC/bias_in_bios_class_set3_results",
94
+ "llm_test_accuracy": 0.9310000538825989,
95
+ "llm_top_1_test_accuracy": 0.6864000000000001,
96
+ "llm_top_2_test_accuracy": 0.7436,
97
+ "llm_top_5_test_accuracy": 0.763,
98
+ "llm_top_10_test_accuracy": null,
99
+ "llm_top_20_test_accuracy": null,
100
+ "llm_top_50_test_accuracy": null,
101
+ "llm_top_100_test_accuracy": null,
102
+ "sae_test_accuracy": 0.9256000518798828,
103
+ "sae_top_1_test_accuracy": 0.7188,
104
+ "sae_top_2_test_accuracy": 0.7922,
105
+ "sae_top_5_test_accuracy": 0.8664,
106
+ "sae_top_10_test_accuracy": null,
107
+ "sae_top_20_test_accuracy": null,
108
+ "sae_top_50_test_accuracy": null,
109
+ "sae_top_100_test_accuracy": null
110
+ },
111
+ {
112
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
113
+ "llm_test_accuracy": 0.915600061416626,
114
+ "llm_top_1_test_accuracy": 0.599,
115
+ "llm_top_2_test_accuracy": 0.6476000000000001,
116
+ "llm_top_5_test_accuracy": 0.6708000000000001,
117
+ "llm_top_10_test_accuracy": null,
118
+ "llm_top_20_test_accuracy": null,
119
+ "llm_top_50_test_accuracy": null,
120
+ "llm_top_100_test_accuracy": null,
121
+ "sae_test_accuracy": 0.9176000356674194,
122
+ "sae_top_1_test_accuracy": 0.7352000000000001,
123
+ "sae_top_2_test_accuracy": 0.772,
124
+ "sae_top_5_test_accuracy": 0.7998,
125
+ "sae_top_10_test_accuracy": null,
126
+ "sae_top_20_test_accuracy": null,
127
+ "sae_top_50_test_accuracy": null,
128
+ "sae_top_100_test_accuracy": null
129
+ },
130
+ {
131
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
132
+ "llm_test_accuracy": 0.9820000529289246,
133
+ "llm_top_1_test_accuracy": 0.673,
134
+ "llm_top_2_test_accuracy": 0.724,
135
+ "llm_top_5_test_accuracy": 0.766,
136
+ "llm_top_10_test_accuracy": null,
137
+ "llm_top_20_test_accuracy": null,
138
+ "llm_top_50_test_accuracy": null,
139
+ "llm_top_100_test_accuracy": null,
140
+ "sae_test_accuracy": 0.9705000519752502,
141
+ "sae_top_1_test_accuracy": 0.832,
142
+ "sae_top_2_test_accuracy": 0.83,
143
+ "sae_top_5_test_accuracy": 0.919,
144
+ "sae_top_10_test_accuracy": null,
145
+ "sae_top_20_test_accuracy": null,
146
+ "sae_top_50_test_accuracy": null,
147
+ "sae_top_100_test_accuracy": null
148
+ },
149
+ {
150
+ "dataset_name": "codeparrot/github-code_results",
151
+ "llm_test_accuracy": 0.9708000302314759,
152
+ "llm_top_1_test_accuracy": 0.649,
153
+ "llm_top_2_test_accuracy": 0.6958,
154
+ "llm_top_5_test_accuracy": 0.7556,
155
+ "llm_top_10_test_accuracy": null,
156
+ "llm_top_20_test_accuracy": null,
157
+ "llm_top_50_test_accuracy": null,
158
+ "llm_top_100_test_accuracy": null,
159
+ "sae_test_accuracy": 0.9708000421524048,
160
+ "sae_top_1_test_accuracy": 0.614,
161
+ "sae_top_2_test_accuracy": 0.6594,
162
+ "sae_top_5_test_accuracy": 0.7447999999999999,
163
+ "sae_top_10_test_accuracy": null,
164
+ "sae_top_20_test_accuracy": null,
165
+ "sae_top_50_test_accuracy": null,
166
+ "sae_top_100_test_accuracy": null
167
+ },
168
+ {
169
+ "dataset_name": "fancyzhx/ag_news_results",
170
+ "llm_test_accuracy": 0.9497500360012054,
171
+ "llm_top_1_test_accuracy": 0.63425,
172
+ "llm_top_2_test_accuracy": 0.782,
173
+ "llm_top_5_test_accuracy": 0.8247499999999999,
174
+ "llm_top_10_test_accuracy": null,
175
+ "llm_top_20_test_accuracy": null,
176
+ "llm_top_50_test_accuracy": null,
177
+ "llm_top_100_test_accuracy": null,
178
+ "sae_test_accuracy": 0.9522500485181808,
179
+ "sae_top_1_test_accuracy": 0.6060000000000001,
180
+ "sae_top_2_test_accuracy": 0.71875,
181
+ "sae_top_5_test_accuracy": 0.81875,
182
+ "sae_top_10_test_accuracy": null,
183
+ "sae_top_20_test_accuracy": null,
184
+ "sae_top_50_test_accuracy": null,
185
+ "sae_top_100_test_accuracy": null
186
+ },
187
+ {
188
+ "dataset_name": "Helsinki-NLP/europarl_results",
189
+ "llm_test_accuracy": 0.9996000289916992,
190
+ "llm_top_1_test_accuracy": 0.6508,
191
+ "llm_top_2_test_accuracy": 0.792,
192
+ "llm_top_5_test_accuracy": 0.9016,
193
+ "llm_top_10_test_accuracy": null,
194
+ "llm_top_20_test_accuracy": null,
195
+ "llm_top_50_test_accuracy": null,
196
+ "llm_top_100_test_accuracy": null,
197
+ "sae_test_accuracy": 0.9976000428199768,
198
+ "sae_top_1_test_accuracy": 0.7636000000000001,
199
+ "sae_top_2_test_accuracy": 0.9907999999999999,
200
+ "sae_top_5_test_accuracy": 0.9962,
201
+ "sae_top_10_test_accuracy": null,
202
+ "sae_top_20_test_accuracy": null,
203
+ "sae_top_50_test_accuracy": null,
204
+ "sae_top_100_test_accuracy": null
205
+ }
206
+ ],
207
+ "sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
208
+ "sae_lens_id": "custom_sae",
209
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000",
210
+ "sae_lens_version": "5.5.0",
211
+ "sae_cfg_dict": {
212
+ "model_name": "gemma-2-2b",
213
+ "d_in": 2304,
214
+ "d_sae": 65536,
215
+ "hook_layer": 12,
216
+ "hook_name": "blocks.12.hook_resid_post",
217
+ "context_size": null,
218
+ "hook_head_index": null,
219
+ "architecture": "topk",
220
+ "apply_b_dec_to_input": null,
221
+ "finetuning_scaling_factor": null,
222
+ "activation_fn_str": "",
223
+ "prepend_bos": true,
224
+ "normalize_activations": "none",
225
+ "dtype": "bfloat16",
226
+ "device": "",
227
+ "dataset_path": "",
228
+ "dataset_trust_remote_code": true,
229
+ "seqpos_slice": [
230
+ null
231
+ ],
232
+ "training_tokens": -100000,
233
+ "sae_lens_training_version": null,
234
+ "neuronpedia_id": null
235
+ },
236
+ "eval_result_unstructured": {
237
+ "LabHC/bias_in_bios_class_set1_results": {
238
+ "sae_test_accuracy": {
239
+ "0": 0.940000057220459,
240
+ "1": 0.9530000686645508,
241
+ "2": 0.9430000185966492,
242
+ "6": 0.9830000400543213,
243
+ "9": 0.9620000720024109
244
+ },
245
+ "llm_test_accuracy": {
246
+ "0": 0.9510000348091125,
247
+ "1": 0.9670000672340393,
248
+ "2": 0.9530000686645508,
249
+ "6": 0.987000048160553,
250
+ "9": 0.9760000705718994
251
+ },
252
+ "llm_top_1_test_accuracy": {
253
+ "0": 0.577,
254
+ "1": 0.613,
255
+ "2": 0.662,
256
+ "6": 0.787,
257
+ "9": 0.56
258
+ },
259
+ "llm_top_2_test_accuracy": {
260
+ "0": 0.574,
261
+ "1": 0.66,
262
+ "2": 0.718,
263
+ "6": 0.811,
264
+ "9": 0.714
265
+ },
266
+ "llm_top_5_test_accuracy": {
267
+ "0": 0.713,
268
+ "1": 0.711,
269
+ "2": 0.755,
270
+ "6": 0.895,
271
+ "9": 0.861
272
+ },
273
+ "sae_top_1_test_accuracy": {
274
+ "0": 0.563,
275
+ "1": 0.64,
276
+ "2": 0.84,
277
+ "6": 0.814,
278
+ "9": 0.562
279
+ },
280
+ "sae_top_2_test_accuracy": {
281
+ "0": 0.867,
282
+ "1": 0.819,
283
+ "2": 0.84,
284
+ "6": 0.976,
285
+ "9": 0.835
286
+ },
287
+ "sae_top_5_test_accuracy": {
288
+ "0": 0.874,
289
+ "1": 0.861,
290
+ "2": 0.859,
291
+ "6": 0.988,
292
+ "9": 0.87
293
+ }
294
+ },
295
+ "LabHC/bias_in_bios_class_set2_results": {
296
+ "sae_test_accuracy": {
297
+ "11": 0.9550000429153442,
298
+ "13": 0.9530000686645508,
299
+ "14": 0.9460000395774841,
300
+ "18": 0.9220000505447388,
301
+ "19": 0.9570000171661377
302
+ },
303
+ "llm_test_accuracy": {
304
+ "11": 0.9660000205039978,
305
+ "13": 0.9510000348091125,
306
+ "14": 0.9540000557899475,
307
+ "18": 0.940000057220459,
308
+ "19": 0.9610000252723694
309
+ },
310
+ "llm_top_1_test_accuracy": {
311
+ "11": 0.553,
312
+ "13": 0.673,
313
+ "14": 0.651,
314
+ "18": 0.706,
315
+ "19": 0.789
316
+ },
317
+ "llm_top_2_test_accuracy": {
318
+ "11": 0.77,
319
+ "13": 0.719,
320
+ "14": 0.672,
321
+ "18": 0.717,
322
+ "19": 0.789
323
+ },
324
+ "llm_top_5_test_accuracy": {
325
+ "11": 0.793,
326
+ "13": 0.739,
327
+ "14": 0.732,
328
+ "18": 0.723,
329
+ "19": 0.828
330
+ },
331
+ "sae_top_1_test_accuracy": {
332
+ "11": 0.562,
333
+ "13": 0.667,
334
+ "14": 0.635,
335
+ "18": 0.704,
336
+ "19": 0.793
337
+ },
338
+ "sae_top_2_test_accuracy": {
339
+ "11": 0.855,
340
+ "13": 0.668,
341
+ "14": 0.865,
342
+ "18": 0.732,
343
+ "19": 0.818
344
+ },
345
+ "sae_top_5_test_accuracy": {
346
+ "11": 0.947,
347
+ "13": 0.718,
348
+ "14": 0.884,
349
+ "18": 0.896,
350
+ "19": 0.865
351
+ }
352
+ },
353
+ "LabHC/bias_in_bios_class_set3_results": {
354
+ "sae_test_accuracy": {
355
+ "20": 0.9470000267028809,
356
+ "21": 0.9270000457763672,
357
+ "22": 0.9120000600814819,
358
+ "25": 0.9580000638961792,
359
+ "26": 0.8840000629425049
360
+ },
361
+ "llm_test_accuracy": {
362
+ "20": 0.9650000333786011,
363
+ "21": 0.9250000715255737,
364
+ "22": 0.9140000343322754,
365
+ "25": 0.9670000672340393,
366
+ "26": 0.8840000629425049
367
+ },
368
+ "llm_top_1_test_accuracy": {
369
+ "20": 0.709,
370
+ "21": 0.762,
371
+ "22": 0.653,
372
+ "25": 0.683,
373
+ "26": 0.625
374
+ },
375
+ "llm_top_2_test_accuracy": {
376
+ "20": 0.811,
377
+ "21": 0.769,
378
+ "22": 0.688,
379
+ "25": 0.766,
380
+ "26": 0.684
381
+ },
382
+ "llm_top_5_test_accuracy": {
383
+ "20": 0.815,
384
+ "21": 0.794,
385
+ "22": 0.706,
386
+ "25": 0.803,
387
+ "26": 0.697
388
+ },
389
+ "sae_top_1_test_accuracy": {
390
+ "20": 0.841,
391
+ "21": 0.504,
392
+ "22": 0.887,
393
+ "25": 0.712,
394
+ "26": 0.65
395
+ },
396
+ "sae_top_2_test_accuracy": {
397
+ "20": 0.855,
398
+ "21": 0.737,
399
+ "22": 0.89,
400
+ "25": 0.848,
401
+ "26": 0.631
402
+ },
403
+ "sae_top_5_test_accuracy": {
404
+ "20": 0.917,
405
+ "21": 0.85,
406
+ "22": 0.886,
407
+ "25": 0.889,
408
+ "26": 0.79
409
+ }
410
+ },
411
+ "canrager/amazon_reviews_mcauley_1and5_results": {
412
+ "sae_test_accuracy": {
413
+ "1": 0.9450000524520874,
414
+ "2": 0.9350000619888306,
415
+ "3": 0.9200000166893005,
416
+ "5": 0.9200000166893005,
417
+ "6": 0.8680000305175781
418
+ },
419
+ "llm_test_accuracy": {
420
+ "1": 0.9490000605583191,
421
+ "2": 0.9300000667572021,
422
+ "3": 0.9120000600814819,
423
+ "5": 0.9250000715255737,
424
+ "6": 0.862000048160553
425
+ },
426
+ "llm_top_1_test_accuracy": {
427
+ "1": 0.684,
428
+ "2": 0.592,
429
+ "3": 0.583,
430
+ "5": 0.551,
431
+ "6": 0.585
432
+ },
433
+ "llm_top_2_test_accuracy": {
434
+ "1": 0.739,
435
+ "2": 0.635,
436
+ "3": 0.609,
437
+ "5": 0.635,
438
+ "6": 0.62
439
+ },
440
+ "llm_top_5_test_accuracy": {
441
+ "1": 0.757,
442
+ "2": 0.636,
443
+ "3": 0.622,
444
+ "5": 0.657,
445
+ "6": 0.682
446
+ },
447
+ "sae_top_1_test_accuracy": {
448
+ "1": 0.861,
449
+ "2": 0.843,
450
+ "3": 0.542,
451
+ "5": 0.83,
452
+ "6": 0.6
453
+ },
454
+ "sae_top_2_test_accuracy": {
455
+ "1": 0.885,
456
+ "2": 0.848,
457
+ "3": 0.603,
458
+ "5": 0.884,
459
+ "6": 0.64
460
+ },
461
+ "sae_top_5_test_accuracy": {
462
+ "1": 0.904,
463
+ "2": 0.846,
464
+ "3": 0.632,
465
+ "5": 0.881,
466
+ "6": 0.736
467
+ }
468
+ },
469
+ "canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
470
+ "sae_test_accuracy": {
471
+ "1.0": 0.9700000286102295,
472
+ "5.0": 0.971000075340271
473
+ },
474
+ "llm_test_accuracy": {
475
+ "1.0": 0.9820000529289246,
476
+ "5.0": 0.9820000529289246
477
+ },
478
+ "llm_top_1_test_accuracy": {
479
+ "1.0": 0.673,
480
+ "5.0": 0.673
481
+ },
482
+ "llm_top_2_test_accuracy": {
483
+ "1.0": 0.724,
484
+ "5.0": 0.724
485
+ },
486
+ "llm_top_5_test_accuracy": {
487
+ "1.0": 0.766,
488
+ "5.0": 0.766
489
+ },
490
+ "sae_top_1_test_accuracy": {
491
+ "1.0": 0.832,
492
+ "5.0": 0.832
493
+ },
494
+ "sae_top_2_test_accuracy": {
495
+ "1.0": 0.83,
496
+ "5.0": 0.83
497
+ },
498
+ "sae_top_5_test_accuracy": {
499
+ "1.0": 0.919,
500
+ "5.0": 0.919
501
+ }
502
+ },
503
+ "codeparrot/github-code_results": {
504
+ "sae_test_accuracy": {
505
+ "C": 0.9580000638961792,
506
+ "Python": 0.9860000610351562,
507
+ "HTML": 0.984000027179718,
508
+ "Java": 0.9650000333786011,
509
+ "PHP": 0.9610000252723694
510
+ },
511
+ "llm_test_accuracy": {
512
+ "C": 0.956000030040741,
513
+ "Python": 0.9830000400543213,
514
+ "HTML": 0.984000027179718,
515
+ "Java": 0.9700000286102295,
516
+ "PHP": 0.9610000252723694
517
+ },
518
+ "llm_top_1_test_accuracy": {
519
+ "C": 0.663,
520
+ "Python": 0.655,
521
+ "HTML": 0.72,
522
+ "Java": 0.612,
523
+ "PHP": 0.595
524
+ },
525
+ "llm_top_2_test_accuracy": {
526
+ "C": 0.68,
527
+ "Python": 0.683,
528
+ "HTML": 0.798,
529
+ "Java": 0.681,
530
+ "PHP": 0.637
531
+ },
532
+ "llm_top_5_test_accuracy": {
533
+ "C": 0.74,
534
+ "Python": 0.724,
535
+ "HTML": 0.902,
536
+ "Java": 0.739,
537
+ "PHP": 0.673
538
+ },
539
+ "sae_top_1_test_accuracy": {
540
+ "C": 0.624,
541
+ "Python": 0.653,
542
+ "HTML": 0.578,
543
+ "Java": 0.623,
544
+ "PHP": 0.592
545
+ },
546
+ "sae_top_2_test_accuracy": {
547
+ "C": 0.591,
548
+ "Python": 0.652,
549
+ "HTML": 0.81,
550
+ "Java": 0.663,
551
+ "PHP": 0.581
552
+ },
553
+ "sae_top_5_test_accuracy": {
554
+ "C": 0.628,
555
+ "Python": 0.66,
556
+ "HTML": 0.826,
557
+ "Java": 0.702,
558
+ "PHP": 0.908
559
+ }
560
+ },
561
+ "fancyzhx/ag_news_results": {
562
+ "sae_test_accuracy": {
563
+ "0": 0.9430000185966492,
564
+ "1": 0.9810000658035278,
565
+ "2": 0.9320000410079956,
566
+ "3": 0.9530000686645508
567
+ },
568
+ "llm_test_accuracy": {
569
+ "0": 0.9450000524520874,
570
+ "1": 0.9890000224113464,
571
+ "2": 0.9200000166893005,
572
+ "3": 0.9450000524520874
573
+ },
574
+ "llm_top_1_test_accuracy": {
575
+ "0": 0.559,
576
+ "1": 0.66,
577
+ "2": 0.668,
578
+ "3": 0.65
579
+ },
580
+ "llm_top_2_test_accuracy": {
581
+ "0": 0.807,
582
+ "1": 0.799,
583
+ "2": 0.699,
584
+ "3": 0.823
585
+ },
586
+ "llm_top_5_test_accuracy": {
587
+ "0": 0.815,
588
+ "1": 0.88,
589
+ "2": 0.756,
590
+ "3": 0.848
591
+ },
592
+ "sae_top_1_test_accuracy": {
593
+ "0": 0.585,
594
+ "1": 0.656,
595
+ "2": 0.543,
596
+ "3": 0.64
597
+ },
598
+ "sae_top_2_test_accuracy": {
599
+ "0": 0.805,
600
+ "1": 0.716,
601
+ "2": 0.618,
602
+ "3": 0.736
603
+ },
604
+ "sae_top_5_test_accuracy": {
605
+ "0": 0.834,
606
+ "1": 0.802,
607
+ "2": 0.823,
608
+ "3": 0.816
609
+ }
610
+ },
611
+ "Helsinki-NLP/europarl_results": {
612
+ "sae_test_accuracy": {
613
+ "en": 0.9980000257492065,
614
+ "fr": 0.9950000643730164,
615
+ "de": 0.9980000257492065,
616
+ "es": 0.9980000257492065,
617
+ "nl": 0.999000072479248
618
+ },
619
+ "llm_test_accuracy": {
620
+ "en": 0.999000072479248,
621
+ "fr": 1.0,
622
+ "de": 1.0,
623
+ "es": 1.0,
624
+ "nl": 0.999000072479248
625
+ },
626
+ "llm_top_1_test_accuracy": {
627
+ "en": 0.741,
628
+ "fr": 0.609,
629
+ "de": 0.754,
630
+ "es": 0.494,
631
+ "nl": 0.656
632
+ },
633
+ "llm_top_2_test_accuracy": {
634
+ "en": 0.832,
635
+ "fr": 0.597,
636
+ "de": 0.826,
637
+ "es": 0.964,
638
+ "nl": 0.741
639
+ },
640
+ "llm_top_5_test_accuracy": {
641
+ "en": 0.879,
642
+ "fr": 0.909,
643
+ "de": 0.874,
644
+ "es": 0.979,
645
+ "nl": 0.867
646
+ },
647
+ "sae_top_1_test_accuracy": {
648
+ "en": 0.747,
649
+ "fr": 0.598,
650
+ "de": 0.909,
651
+ "es": 0.91,
652
+ "nl": 0.654
653
+ },
654
+ "sae_top_2_test_accuracy": {
655
+ "en": 0.995,
656
+ "fr": 0.984,
657
+ "de": 0.988,
658
+ "es": 0.989,
659
+ "nl": 0.998
660
+ },
661
+ "sae_top_5_test_accuracy": {
662
+ "en": 0.999,
663
+ "fr": 0.996,
664
+ "de": 0.993,
665
+ "es": 0.993,
666
+ "nl": 1.0
667
+ }
668
+ }
669
+ }
670
+ }
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,670 @@
1
+ {
2
+ "eval_type_id": "sparse_probing",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "LabHC/bias_in_bios_class_set2",
8
+ "LabHC/bias_in_bios_class_set3",
9
+ "canrager/amazon_reviews_mcauley_1and5",
10
+ "canrager/amazon_reviews_mcauley_1and5_sentiment",
11
+ "codeparrot/github-code",
12
+ "fancyzhx/ag_news",
13
+ "Helsinki-NLP/europarl"
14
+ ],
15
+ "probe_train_set_size": 4000,
16
+ "probe_test_set_size": 1000,
17
+ "context_length": 128,
18
+ "sae_batch_size": 125,
19
+ "llm_batch_size": 32,
20
+ "llm_dtype": "bfloat16",
21
+ "model_name": "gemma-2-2b",
22
+ "k_values": [
23
+ 1,
24
+ 2,
25
+ 5
26
+ ],
27
+ "lower_vram_usage": false
28
+ },
29
+ "eval_id": "fccca889-102f-47b0-9e93-59ed0335697a",
30
+ "datetime_epoch_millis": 1740203387785,
31
+ "eval_result_metrics": {
32
+ "llm": {
33
+ "llm_test_accuracy": 0.9587437950074672,
34
+ "llm_top_1_test_accuracy": 0.6508312500000001,
35
+ "llm_top_2_test_accuracy": 0.7267250000000001,
36
+ "llm_top_5_test_accuracy": 0.77896875,
37
+ "llm_top_10_test_accuracy": null,
38
+ "llm_top_20_test_accuracy": null,
39
+ "llm_top_50_test_accuracy": null,
40
+ "llm_top_100_test_accuracy": null
41
+ },
42
+ "sae": {
43
+ "sae_test_accuracy": 0.9568687990307807,
44
+ "sae_top_1_test_accuracy": 0.7255,
45
+ "sae_top_2_test_accuracy": 0.8132124999999998,
46
+ "sae_top_5_test_accuracy": 0.869575,
47
+ "sae_top_10_test_accuracy": null,
48
+ "sae_top_20_test_accuracy": null,
49
+ "sae_top_50_test_accuracy": null,
50
+ "sae_top_100_test_accuracy": null
51
+ }
52
+ },
53
+ "eval_result_details": [
54
+ {
55
+ "dataset_name": "LabHC/bias_in_bios_class_set1_results",
56
+ "llm_test_accuracy": 0.966800057888031,
57
+ "llm_top_1_test_accuracy": 0.6397999999999999,
58
+ "llm_top_2_test_accuracy": 0.6954,
59
+ "llm_top_5_test_accuracy": 0.7869999999999999,
60
+ "llm_top_10_test_accuracy": null,
61
+ "llm_top_20_test_accuracy": null,
62
+ "llm_top_50_test_accuracy": null,
63
+ "llm_top_100_test_accuracy": null,
64
+ "sae_test_accuracy": 0.9588000655174256,
65
+ "sae_top_1_test_accuracy": 0.6734,
66
+ "sae_top_2_test_accuracy": 0.8904,
67
+ "sae_top_5_test_accuracy": 0.9034000000000001,
68
+ "sae_top_10_test_accuracy": null,
69
+ "sae_top_20_test_accuracy": null,
70
+ "sae_top_50_test_accuracy": null,
71
+ "sae_top_100_test_accuracy": null
72
+ },
73
+ {
74
+ "dataset_name": "LabHC/bias_in_bios_class_set2_results",
75
+ "llm_test_accuracy": 0.9544000387191772,
76
+ "llm_top_1_test_accuracy": 0.6744000000000001,
77
+ "llm_top_2_test_accuracy": 0.7334,
78
+ "llm_top_5_test_accuracy": 0.763,
79
+ "llm_top_10_test_accuracy": null,
80
+ "llm_top_20_test_accuracy": null,
81
+ "llm_top_50_test_accuracy": null,
82
+ "llm_top_100_test_accuracy": null,
83
+ "sae_test_accuracy": 0.9468000531196594,
84
+ "sae_top_1_test_accuracy": 0.6684,
85
+ "sae_top_2_test_accuracy": 0.7619999999999999,
86
+ "sae_top_5_test_accuracy": 0.8596,
87
+ "sae_top_10_test_accuracy": null,
88
+ "sae_top_20_test_accuracy": null,
89
+ "sae_top_50_test_accuracy": null,
90
+ "sae_top_100_test_accuracy": null
91
+ },
92
+ {
93
+ "dataset_name": "LabHC/bias_in_bios_class_set3_results",
94
+ "llm_test_accuracy": 0.9310000538825989,
95
+ "llm_top_1_test_accuracy": 0.6864000000000001,
96
+ "llm_top_2_test_accuracy": 0.7436,
97
+ "llm_top_5_test_accuracy": 0.763,
98
+ "llm_top_10_test_accuracy": null,
99
+ "llm_top_20_test_accuracy": null,
100
+ "llm_top_50_test_accuracy": null,
101
+ "llm_top_100_test_accuracy": null,
102
+ "sae_test_accuracy": 0.9338000416755676,
103
+ "sae_top_1_test_accuracy": 0.7182000000000001,
104
+ "sae_top_2_test_accuracy": 0.805,
105
+ "sae_top_5_test_accuracy": 0.8632000000000002,
106
+ "sae_top_10_test_accuracy": null,
107
+ "sae_top_20_test_accuracy": null,
108
+ "sae_top_50_test_accuracy": null,
109
+ "sae_top_100_test_accuracy": null
110
+ },
111
+ {
112
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
113
+ "llm_test_accuracy": 0.915600061416626,
114
+ "llm_top_1_test_accuracy": 0.599,
115
+ "llm_top_2_test_accuracy": 0.6476000000000001,
116
+ "llm_top_5_test_accuracy": 0.6708000000000001,
117
+ "llm_top_10_test_accuracy": null,
118
+ "llm_top_20_test_accuracy": null,
119
+ "llm_top_50_test_accuracy": null,
120
+ "llm_top_100_test_accuracy": null,
121
+ "sae_test_accuracy": 0.9242000460624695,
122
+ "sae_top_1_test_accuracy": 0.7218000000000001,
123
+ "sae_top_2_test_accuracy": 0.7634000000000001,
124
+ "sae_top_5_test_accuracy": 0.8140000000000001,
125
+ "sae_top_10_test_accuracy": null,
126
+ "sae_top_20_test_accuracy": null,
127
+ "sae_top_50_test_accuracy": null,
128
+ "sae_top_100_test_accuracy": null
129
+ },
130
+ {
131
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
132
+ "llm_test_accuracy": 0.9820000529289246,
133
+ "llm_top_1_test_accuracy": 0.673,
134
+ "llm_top_2_test_accuracy": 0.724,
135
+ "llm_top_5_test_accuracy": 0.766,
136
+ "llm_top_10_test_accuracy": null,
137
+ "llm_top_20_test_accuracy": null,
138
+ "llm_top_50_test_accuracy": null,
139
+ "llm_top_100_test_accuracy": null,
140
+ "sae_test_accuracy": 0.9735000431537628,
141
+ "sae_top_1_test_accuracy": 0.92,
142
+ "sae_top_2_test_accuracy": 0.919,
143
+ "sae_top_5_test_accuracy": 0.944,
144
+ "sae_top_10_test_accuracy": null,
145
+ "sae_top_20_test_accuracy": null,
146
+ "sae_top_50_test_accuracy": null,
147
+ "sae_top_100_test_accuracy": null
148
+ },
149
+ {
150
+ "dataset_name": "codeparrot/github-code_results",
151
+ "llm_test_accuracy": 0.9708000302314759,
152
+ "llm_top_1_test_accuracy": 0.649,
153
+ "llm_top_2_test_accuracy": 0.6958,
154
+ "llm_top_5_test_accuracy": 0.7556,
155
+ "llm_top_10_test_accuracy": null,
156
+ "llm_top_20_test_accuracy": null,
157
+ "llm_top_50_test_accuracy": null,
158
+ "llm_top_100_test_accuracy": null,
159
+ "sae_test_accuracy": 0.9690000414848328,
160
+ "sae_top_1_test_accuracy": 0.6166,
161
+ "sae_top_2_test_accuracy": 0.6898,
162
+ "sae_top_5_test_accuracy": 0.7966,
163
+ "sae_top_10_test_accuracy": null,
164
+ "sae_top_20_test_accuracy": null,
165
+ "sae_top_50_test_accuracy": null,
166
+ "sae_top_100_test_accuracy": null
167
+ },
168
+ {
169
+ "dataset_name": "fancyzhx/ag_news_results",
170
+ "llm_test_accuracy": 0.9497500360012054,
171
+ "llm_top_1_test_accuracy": 0.63425,
172
+ "llm_top_2_test_accuracy": 0.782,
173
+ "llm_top_5_test_accuracy": 0.8247499999999999,
174
+ "llm_top_10_test_accuracy": null,
175
+ "llm_top_20_test_accuracy": null,
176
+ "llm_top_50_test_accuracy": null,
177
+ "llm_top_100_test_accuracy": null,
178
+ "sae_test_accuracy": 0.9502500593662262,
179
+ "sae_top_1_test_accuracy": 0.649,
180
+ "sae_top_2_test_accuracy": 0.7204999999999999,
181
+ "sae_top_5_test_accuracy": 0.795,
182
+ "sae_top_10_test_accuracy": null,
183
+ "sae_top_20_test_accuracy": null,
184
+ "sae_top_50_test_accuracy": null,
185
+ "sae_top_100_test_accuracy": null
186
+ },
187
+ {
188
+ "dataset_name": "Helsinki-NLP/europarl_results",
189
+ "llm_test_accuracy": 0.9996000289916992,
190
+ "llm_top_1_test_accuracy": 0.6508,
191
+ "llm_top_2_test_accuracy": 0.792,
192
+ "llm_top_5_test_accuracy": 0.9016,
193
+ "llm_top_10_test_accuracy": null,
194
+ "llm_top_20_test_accuracy": null,
195
+ "llm_top_50_test_accuracy": null,
196
+ "llm_top_100_test_accuracy": null,
197
+ "sae_test_accuracy": 0.9986000418663025,
198
+ "sae_top_1_test_accuracy": 0.8366,
199
+ "sae_top_2_test_accuracy": 0.9555999999999999,
200
+ "sae_top_5_test_accuracy": 0.9808,
201
+ "sae_top_10_test_accuracy": null,
202
+ "sae_top_20_test_accuracy": null,
203
+ "sae_top_50_test_accuracy": null,
204
+ "sae_top_100_test_accuracy": null
205
+ }
206
+ ],
207
+ "sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
208
+ "sae_lens_id": "custom_sae",
209
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000",
210
+ "sae_lens_version": "5.5.0",
211
+ "sae_cfg_dict": {
212
+ "model_name": "gemma-2-2b",
213
+ "d_in": 2304,
214
+ "d_sae": 65536,
215
+ "hook_layer": 12,
216
+ "hook_name": "blocks.12.hook_resid_post",
217
+ "context_size": null,
218
+ "hook_head_index": null,
219
+ "architecture": "topk",
220
+ "apply_b_dec_to_input": null,
221
+ "finetuning_scaling_factor": null,
222
+ "activation_fn_str": "",
223
+ "prepend_bos": true,
224
+ "normalize_activations": "none",
225
+ "dtype": "bfloat16",
226
+ "device": "",
227
+ "dataset_path": "",
228
+ "dataset_trust_remote_code": true,
229
+ "seqpos_slice": [
230
+ null
231
+ ],
232
+ "training_tokens": -100000,
233
+ "sae_lens_training_version": null,
234
+ "neuronpedia_id": null
235
+ },
236
+ "eval_result_unstructured": {
237
+ "LabHC/bias_in_bios_class_set1_results": {
238
+ "sae_test_accuracy": {
239
+ "0": 0.9390000700950623,
240
+ "1": 0.9600000381469727,
241
+ "2": 0.9390000700950623,
242
+ "6": 0.9850000739097595,
243
+ "9": 0.971000075340271
244
+ },
245
+ "llm_test_accuracy": {
246
+ "0": 0.9510000348091125,
247
+ "1": 0.9670000672340393,
248
+ "2": 0.9530000686645508,
249
+ "6": 0.987000048160553,
250
+ "9": 0.9760000705718994
251
+ },
252
+ "llm_top_1_test_accuracy": {
253
+ "0": 0.577,
254
+ "1": 0.613,
255
+ "2": 0.662,
256
+ "6": 0.787,
257
+ "9": 0.56
258
+ },
259
+ "llm_top_2_test_accuracy": {
260
+ "0": 0.574,
261
+ "1": 0.66,
262
+ "2": 0.718,
263
+ "6": 0.811,
264
+ "9": 0.714
265
+ },
266
+ "llm_top_5_test_accuracy": {
267
+ "0": 0.713,
268
+ "1": 0.711,
269
+ "2": 0.755,
270
+ "6": 0.895,
271
+ "9": 0.861
272
+ },
273
+ "sae_top_1_test_accuracy": {
274
+ "0": 0.577,
275
+ "1": 0.62,
276
+ "2": 0.834,
277
+ "6": 0.81,
278
+ "9": 0.526
279
+ },
280
+ "sae_top_2_test_accuracy": {
281
+ "0": 0.876,
282
+ "1": 0.815,
283
+ "2": 0.84,
284
+ "6": 0.976,
285
+ "9": 0.945
286
+ },
287
+ "sae_top_5_test_accuracy": {
288
+ "0": 0.88,
289
+ "1": 0.864,
290
+ "2": 0.839,
291
+ "6": 0.987,
292
+ "9": 0.947
293
+ }
294
+ },
295
+ "LabHC/bias_in_bios_class_set2_results": {
296
+ "sae_test_accuracy": {
297
+ "11": 0.9470000267028809,
298
+ "13": 0.9580000638961792,
299
+ "14": 0.9450000524520874,
300
+ "18": 0.9220000505447388,
301
+ "19": 0.9620000720024109
302
+ },
303
+ "llm_test_accuracy": {
304
+ "11": 0.9660000205039978,
305
+ "13": 0.9510000348091125,
306
+ "14": 0.9540000557899475,
307
+ "18": 0.940000057220459,
308
+ "19": 0.9610000252723694
309
+ },
310
+ "llm_top_1_test_accuracy": {
311
+ "11": 0.553,
312
+ "13": 0.673,
313
+ "14": 0.651,
314
+ "18": 0.706,
315
+ "19": 0.789
316
+ },
317
+ "llm_top_2_test_accuracy": {
318
+ "11": 0.77,
319
+ "13": 0.719,
320
+ "14": 0.672,
321
+ "18": 0.717,
322
+ "19": 0.789
323
+ },
324
+ "llm_top_5_test_accuracy": {
325
+ "11": 0.793,
326
+ "13": 0.739,
327
+ "14": 0.732,
328
+ "18": 0.723,
329
+ "19": 0.828
330
+ },
331
+ "sae_top_1_test_accuracy": {
332
+ "11": 0.552,
333
+ "13": 0.673,
334
+ "14": 0.627,
335
+ "18": 0.7,
336
+ "19": 0.79
337
+ },
338
+ "sae_top_2_test_accuracy": {
339
+ "11": 0.749,
340
+ "13": 0.7,
341
+ "14": 0.79,
342
+ "18": 0.743,
343
+ "19": 0.828
344
+ },
345
+ "sae_top_5_test_accuracy": {
346
+ "11": 0.906,
347
+ "13": 0.742,
348
+ "14": 0.908,
349
+ "18": 0.894,
350
+ "19": 0.848
351
+ }
352
+ },
353
+ "LabHC/bias_in_bios_class_set3_results": {
354
+ "sae_test_accuracy": {
355
+ "20": 0.9630000591278076,
356
+ "21": 0.9280000329017639,
357
+ "22": 0.9250000715255737,
358
+ "25": 0.9610000252723694,
359
+ "26": 0.8920000195503235
360
+ },
361
+ "llm_test_accuracy": {
362
+ "20": 0.9650000333786011,
363
+ "21": 0.9250000715255737,
364
+ "22": 0.9140000343322754,
365
+ "25": 0.9670000672340393,
366
+ "26": 0.8840000629425049
367
+ },
368
+ "llm_top_1_test_accuracy": {
369
+ "20": 0.709,
370
+ "21": 0.762,
371
+ "22": 0.653,
372
+ "25": 0.683,
373
+ "26": 0.625
374
+ },
375
+ "llm_top_2_test_accuracy": {
376
+ "20": 0.811,
377
+ "21": 0.769,
378
+ "22": 0.688,
379
+ "25": 0.766,
380
+ "26": 0.684
381
+ },
382
+ "llm_top_5_test_accuracy": {
383
+ "20": 0.815,
384
+ "21": 0.794,
385
+ "22": 0.706,
386
+ "25": 0.803,
387
+ "26": 0.697
388
+ },
389
+ "sae_top_1_test_accuracy": {
390
+ "20": 0.866,
391
+ "21": 0.499,
392
+ "22": 0.877,
393
+ "25": 0.705,
394
+ "26": 0.644
395
+ },
396
+ "sae_top_2_test_accuracy": {
397
+ "20": 0.882,
398
+ "21": 0.735,
399
+ "22": 0.885,
400
+ "25": 0.878,
401
+ "26": 0.645
402
+ },
403
+ "sae_top_5_test_accuracy": {
404
+ "20": 0.904,
405
+ "21": 0.853,
406
+ "22": 0.901,
407
+ "25": 0.894,
408
+ "26": 0.764
409
+ }
410
+ },
411
+ "canrager/amazon_reviews_mcauley_1and5_results": {
412
+ "sae_test_accuracy": {
413
+ "1": 0.9480000734329224,
414
+ "2": 0.9420000314712524,
415
+ "3": 0.9220000505447388,
416
+ "5": 0.9270000457763672,
417
+ "6": 0.8820000290870667
418
+ },
419
+ "llm_test_accuracy": {
420
+ "1": 0.9490000605583191,
421
+ "2": 0.9300000667572021,
422
+ "3": 0.9120000600814819,
423
+ "5": 0.9250000715255737,
424
+ "6": 0.862000048160553
425
+ },
426
+ "llm_top_1_test_accuracy": {
427
+ "1": 0.684,
428
+ "2": 0.592,
429
+ "3": 0.583,
430
+ "5": 0.551,
431
+ "6": 0.585
432
+ },
433
+ "llm_top_2_test_accuracy": {
434
+ "1": 0.739,
435
+ "2": 0.635,
436
+ "3": 0.609,
437
+ "5": 0.635,
438
+ "6": 0.62
439
+ },
440
+ "llm_top_5_test_accuracy": {
441
+ "1": 0.757,
442
+ "2": 0.636,
443
+ "3": 0.622,
444
+ "5": 0.657,
445
+ "6": 0.682
446
+ },
447
+ "sae_top_1_test_accuracy": {
448
+ "1": 0.867,
449
+ "2": 0.765,
450
+ "3": 0.547,
451
+ "5": 0.814,
452
+ "6": 0.616
453
+ },
454
+ "sae_top_2_test_accuracy": {
455
+ "1": 0.9,
456
+ "2": 0.785,
457
+ "3": 0.608,
458
+ "5": 0.812,
459
+ "6": 0.712
460
+ },
461
+ "sae_top_5_test_accuracy": {
462
+ "1": 0.914,
463
+ "2": 0.869,
464
+ "3": 0.68,
465
+ "5": 0.874,
466
+ "6": 0.733
467
+ }
468
+ },
469
+ "canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
470
+ "sae_test_accuracy": {
471
+ "1.0": 0.9730000495910645,
472
+ "5.0": 0.9740000367164612
473
+ },
474
+ "llm_test_accuracy": {
475
+ "1.0": 0.9820000529289246,
476
+ "5.0": 0.9820000529289246
477
+ },
478
+ "llm_top_1_test_accuracy": {
479
+ "1.0": 0.673,
480
+ "5.0": 0.673
481
+ },
482
+ "llm_top_2_test_accuracy": {
483
+ "1.0": 0.724,
484
+ "5.0": 0.724
485
+ },
486
+ "llm_top_5_test_accuracy": {
487
+ "1.0": 0.766,
488
+ "5.0": 0.766
489
+ },
490
+ "sae_top_1_test_accuracy": {
491
+ "1.0": 0.92,
492
+ "5.0": 0.92
493
+ },
494
+ "sae_top_2_test_accuracy": {
495
+ "1.0": 0.919,
496
+ "5.0": 0.919
497
+ },
498
+ "sae_top_5_test_accuracy": {
499
+ "1.0": 0.944,
500
+ "5.0": 0.944
501
+ }
502
+ },
503
+ "codeparrot/github-code_results": {
504
+ "sae_test_accuracy": {
505
+ "C": 0.9520000219345093,
506
+ "Python": 0.9890000224113464,
507
+ "HTML": 0.9830000400543213,
508
+ "Java": 0.9670000672340393,
509
+ "PHP": 0.9540000557899475
510
+ },
511
+ "llm_test_accuracy": {
512
+ "C": 0.956000030040741,
513
+ "Python": 0.9830000400543213,
514
+ "HTML": 0.984000027179718,
515
+ "Java": 0.9700000286102295,
516
+ "PHP": 0.9610000252723694
517
+ },
518
+ "llm_top_1_test_accuracy": {
519
+ "C": 0.663,
520
+ "Python": 0.655,
521
+ "HTML": 0.72,
522
+ "Java": 0.612,
523
+ "PHP": 0.595
524
+ },
525
+ "llm_top_2_test_accuracy": {
526
+ "C": 0.68,
527
+ "Python": 0.683,
528
+ "HTML": 0.798,
529
+ "Java": 0.681,
530
+ "PHP": 0.637
531
+ },
532
+ "llm_top_5_test_accuracy": {
533
+ "C": 0.74,
534
+ "Python": 0.724,
535
+ "HTML": 0.902,
536
+ "Java": 0.739,
537
+ "PHP": 0.673
538
+ },
539
+ "sae_top_1_test_accuracy": {
540
+ "C": 0.635,
541
+ "Python": 0.65,
542
+ "HTML": 0.583,
543
+ "Java": 0.627,
544
+ "PHP": 0.588
545
+ },
546
+ "sae_top_2_test_accuracy": {
547
+ "C": 0.583,
548
+ "Python": 0.639,
549
+ "HTML": 0.701,
550
+ "Java": 0.622,
551
+ "PHP": 0.904
552
+ },
553
+ "sae_top_5_test_accuracy": {
554
+ "C": 0.657,
555
+ "Python": 0.921,
556
+ "HTML": 0.815,
557
+ "Java": 0.681,
558
+ "PHP": 0.909
559
+ }
560
+ },
561
+ "fancyzhx/ag_news_results": {
562
+ "sae_test_accuracy": {
563
+ "0": 0.9390000700950623,
564
+ "1": 0.9830000400543213,
565
+ "2": 0.9300000667572021,
566
+ "3": 0.9490000605583191
567
+ },
568
+ "llm_test_accuracy": {
569
+ "0": 0.9450000524520874,
570
+ "1": 0.9890000224113464,
571
+ "2": 0.9200000166893005,
572
+ "3": 0.9450000524520874
573
+ },
574
+ "llm_top_1_test_accuracy": {
575
+ "0": 0.559,
576
+ "1": 0.66,
577
+ "2": 0.668,
578
+ "3": 0.65
579
+ },
580
+ "llm_top_2_test_accuracy": {
581
+ "0": 0.807,
582
+ "1": 0.799,
583
+ "2": 0.699,
584
+ "3": 0.823
585
+ },
586
+ "llm_top_5_test_accuracy": {
587
+ "0": 0.815,
588
+ "1": 0.88,
589
+ "2": 0.756,
590
+ "3": 0.848
591
+ },
592
+ "sae_top_1_test_accuracy": {
593
+ "0": 0.754,
594
+ "1": 0.668,
595
+ "2": 0.532,
596
+ "3": 0.642
597
+ },
598
+ "sae_top_2_test_accuracy": {
599
+ "0": 0.754,
600
+ "1": 0.709,
601
+ "2": 0.715,
602
+ "3": 0.704
603
+ },
604
+ "sae_top_5_test_accuracy": {
605
+ "0": 0.797,
606
+ "1": 0.806,
607
+ "2": 0.819,
608
+ "3": 0.758
609
+ }
610
+ },
611
+ "Helsinki-NLP/europarl_results": {
612
+ "sae_test_accuracy": {
613
+ "en": 0.9970000386238098,
614
+ "fr": 0.999000072479248,
615
+ "de": 0.9980000257492065,
616
+ "es": 1.0,
617
+ "nl": 0.999000072479248
618
+ },
619
+ "llm_test_accuracy": {
620
+ "en": 0.999000072479248,
621
+ "fr": 1.0,
622
+ "de": 1.0,
623
+ "es": 1.0,
624
+ "nl": 0.999000072479248
625
+ },
626
+ "llm_top_1_test_accuracy": {
627
+ "en": 0.741,
628
+ "fr": 0.609,
629
+ "de": 0.754,
630
+ "es": 0.494,
631
+ "nl": 0.656
632
+ },
633
+ "llm_top_2_test_accuracy": {
634
+ "en": 0.832,
635
+ "fr": 0.597,
636
+ "de": 0.826,
637
+ "es": 0.964,
638
+ "nl": 0.741
639
+ },
640
+ "llm_top_5_test_accuracy": {
641
+ "en": 0.879,
642
+ "fr": 0.909,
643
+ "de": 0.874,
644
+ "es": 0.979,
645
+ "nl": 0.867
646
+ },
647
+ "sae_top_1_test_accuracy": {
648
+ "en": 0.751,
649
+ "fr": 0.988,
650
+ "de": 0.923,
651
+ "es": 0.877,
652
+ "nl": 0.644
653
+ },
654
+ "sae_top_2_test_accuracy": {
655
+ "en": 0.999,
656
+ "fr": 0.992,
657
+ "de": 0.913,
658
+ "es": 0.875,
659
+ "nl": 0.999
660
+ },
661
+ "sae_top_5_test_accuracy": {
662
+ "en": 0.999,
663
+ "fr": 0.995,
664
+ "de": 0.912,
665
+ "es": 0.998,
666
+ "nl": 1.0
667
+ }
668
+ }
669
+ }
670
+ }
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,414 @@
1
+ {
2
+ "eval_type_id": "tpp",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": false,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "b9a9adb1-7a85-4d3d-bb84-63c1a6ab3b28",
73
+ "datetime_epoch_millis": 1740202474022,
74
+ "eval_result_metrics": {
75
+ "tpp_metrics": {
76
+ "tpp_threshold_2_total_metric": 0.004200004041194916,
77
+ "tpp_threshold_2_intended_diff_only": 0.007200002670288086,
78
+ "tpp_threshold_2_unintended_diff_only": 0.00299999862909317,
79
+ "tpp_threshold_5_total_metric": 0.006300006806850434,
80
+ "tpp_threshold_5_intended_diff_only": 0.009600007534027101,
81
+ "tpp_threshold_5_unintended_diff_only": 0.003300000727176666,
82
+ "tpp_threshold_10_total_metric": 0.015000005066394807,
83
+ "tpp_threshold_10_intended_diff_only": 0.020000004768371583,
84
+ "tpp_threshold_10_unintended_diff_only": 0.004999999701976777,
85
+ "tpp_threshold_20_total_metric": 0.03174999803304672,
86
+ "tpp_threshold_20_intended_diff_only": 0.03789999485015869,
87
+ "tpp_threshold_20_unintended_diff_only": 0.006149996817111969,
88
+ "tpp_threshold_50_total_metric": 0.08190000057220459,
89
+ "tpp_threshold_50_intended_diff_only": 0.08960000276565552,
90
+ "tpp_threshold_50_unintended_diff_only": 0.007700002193450928,
91
+ "tpp_threshold_100_total_metric": 0.1676749989390373,
92
+ "tpp_threshold_100_intended_diff_only": 0.18299999833106995,
93
+ "tpp_threshold_100_unintended_diff_only": 0.015324999392032624,
94
+ "tpp_threshold_500_total_metric": 0.3794000178575516,
95
+ "tpp_threshold_500_intended_diff_only": 0.4046000182628632,
96
+ "tpp_threshold_500_unintended_diff_only": 0.025200000405311587
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
102
+ "tpp_threshold_2_total_metric": 0.003600001335144043,
103
+ "tpp_threshold_2_intended_diff_only": 0.006000006198883056,
104
+ "tpp_threshold_2_unintended_diff_only": 0.0024000048637390138,
105
+ "tpp_threshold_5_total_metric": 0.006100001931190491,
106
+ "tpp_threshold_5_intended_diff_only": 0.009000015258789063,
107
+ "tpp_threshold_5_unintended_diff_only": 0.002900013327598572,
108
+ "tpp_threshold_10_total_metric": 0.014000010490417481,
109
+ "tpp_threshold_10_intended_diff_only": 0.017000019550323486,
110
+ "tpp_threshold_10_unintended_diff_only": 0.003000009059906006,
111
+ "tpp_threshold_20_total_metric": 0.03419999182224274,
112
+ "tpp_threshold_20_intended_diff_only": 0.037199997901916505,
113
+ "tpp_threshold_20_unintended_diff_only": 0.003000006079673767,
114
+ "tpp_threshold_50_total_metric": 0.08610000610351562,
115
+ "tpp_threshold_50_intended_diff_only": 0.09040001630783082,
116
+ "tpp_threshold_50_unintended_diff_only": 0.004300010204315185,
117
+ "tpp_threshold_100_total_metric": 0.17919999957084656,
118
+ "tpp_threshold_100_intended_diff_only": 0.18480000495910645,
119
+ "tpp_threshold_100_unintended_diff_only": 0.005600005388259888,
120
+ "tpp_threshold_500_total_metric": 0.43065003752708436,
121
+ "tpp_threshold_500_intended_diff_only": 0.4422000408172607,
122
+ "tpp_threshold_500_unintended_diff_only": 0.011550003290176391
123
+ },
124
+ {
125
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
126
+ "tpp_threshold_2_total_metric": 0.004800006747245789,
127
+ "tpp_threshold_2_intended_diff_only": 0.008399999141693116,
128
+ "tpp_threshold_2_unintended_diff_only": 0.0035999923944473266,
129
+ "tpp_threshold_5_total_metric": 0.006500011682510376,
130
+ "tpp_threshold_5_intended_diff_only": 0.010199999809265137,
131
+ "tpp_threshold_5_unintended_diff_only": 0.0036999881267547607,
132
+ "tpp_threshold_10_total_metric": 0.015999999642372132,
133
+ "tpp_threshold_10_intended_diff_only": 0.022999989986419677,
134
+ "tpp_threshold_10_unintended_diff_only": 0.006999990344047547,
135
+ "tpp_threshold_20_total_metric": 0.029300004243850708,
136
+ "tpp_threshold_20_intended_diff_only": 0.03859999179840088,
137
+ "tpp_threshold_20_unintended_diff_only": 0.00929998755455017,
138
+ "tpp_threshold_50_total_metric": 0.07769999504089356,
139
+ "tpp_threshold_50_intended_diff_only": 0.08879998922348023,
140
+ "tpp_threshold_50_unintended_diff_only": 0.01109999418258667,
141
+ "tpp_threshold_100_total_metric": 0.1561499983072281,
142
+ "tpp_threshold_100_intended_diff_only": 0.18119999170303344,
143
+ "tpp_threshold_100_unintended_diff_only": 0.02504999339580536,
144
+ "tpp_threshold_500_total_metric": 0.3281499981880188,
145
+ "tpp_threshold_500_intended_diff_only": 0.3669999957084656,
146
+ "tpp_threshold_500_unintended_diff_only": 0.03884999752044678
147
+ }
148
+ ],
149
+ "sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
150
+ "sae_lens_id": "custom_sae",
151
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000",
152
+ "sae_lens_version": "5.5.0",
153
+ "sae_cfg_dict": {
154
+ "model_name": "gemma-2-2b",
155
+ "d_in": 2304,
156
+ "d_sae": 65536,
157
+ "hook_layer": 12,
158
+ "hook_name": "blocks.12.hook_resid_post",
159
+ "context_size": null,
160
+ "hook_head_index": null,
161
+ "architecture": "topk",
162
+ "apply_b_dec_to_input": null,
163
+ "finetuning_scaling_factor": null,
164
+ "activation_fn_str": "",
165
+ "prepend_bos": true,
166
+ "normalize_activations": "none",
167
+ "dtype": "bfloat16",
168
+ "device": "",
169
+ "dataset_path": "",
170
+ "dataset_trust_remote_code": true,
171
+ "seqpos_slice": [
172
+ null
173
+ ],
174
+ "training_tokens": -100000,
175
+ "sae_lens_training_version": null,
176
+ "neuronpedia_id": null
177
+ },
178
+ "eval_result_unstructured": {
179
+ "LabHC/bias_in_bios_class_set1": {
180
+ "0": {
181
+ "tpp_threshold_2_total_metric": 0.0059999823570251465,
182
+ "tpp_threshold_2_intended_diff_only": 0.009999990463256836,
183
+ "tpp_threshold_2_unintended_diff_only": 0.0040000081062316895,
184
+ "tpp_threshold_5_total_metric": 0.012749999761581421,
185
+ "tpp_threshold_5_intended_diff_only": 0.017000019550323486,
186
+ "tpp_threshold_5_unintended_diff_only": 0.004250019788742065,
187
+ "tpp_threshold_10_total_metric": 0.011500000953674316,
188
+ "tpp_threshold_10_intended_diff_only": 0.017000019550323486,
189
+ "tpp_threshold_10_unintended_diff_only": 0.00550001859664917,
190
+ "tpp_threshold_20_total_metric": 0.029499992728233337,
191
+ "tpp_threshold_20_intended_diff_only": 0.03299999237060547,
192
+ "tpp_threshold_20_unintended_diff_only": 0.0034999996423721313,
193
+ "tpp_threshold_50_total_metric": 0.0765000432729721,
194
+ "tpp_threshold_50_intended_diff_only": 0.08000004291534424,
195
+ "tpp_threshold_50_unintended_diff_only": 0.0034999996423721313,
196
+ "tpp_threshold_100_total_metric": 0.1547500193119049,
197
+ "tpp_threshold_100_intended_diff_only": 0.1600000262260437,
198
+ "tpp_threshold_100_unintended_diff_only": 0.005250006914138794,
199
+ "tpp_threshold_500_total_metric": 0.4282500445842743,
200
+ "tpp_threshold_500_intended_diff_only": 0.4410000443458557,
201
+ "tpp_threshold_500_unintended_diff_only": 0.012749999761581421
202
+ },
203
+ "1": {
204
+ "tpp_threshold_2_total_metric": 0.006250053644180298,
205
+ "tpp_threshold_2_intended_diff_only": 0.006000041961669922,
206
+ "tpp_threshold_2_unintended_diff_only": -0.000250011682510376,
207
+ "tpp_threshold_5_total_metric": 0.0027500391006469727,
208
+ "tpp_threshold_5_intended_diff_only": 0.005000054836273193,
209
+ "tpp_threshold_5_unintended_diff_only": 0.0022500157356262207,
210
+ "tpp_threshold_10_total_metric": 0.006000041961669922,
211
+ "tpp_threshold_10_intended_diff_only": 0.006000041961669922,
212
+ "tpp_threshold_10_unintended_diff_only": 0.0,
213
+ "tpp_threshold_20_total_metric": 0.014750048518180847,
214
+ "tpp_threshold_20_intended_diff_only": 0.01900005340576172,
215
+ "tpp_threshold_20_unintended_diff_only": 0.004250004887580872,
216
+ "tpp_threshold_50_total_metric": 0.07450000941753387,
217
+ "tpp_threshold_50_intended_diff_only": 0.0820000171661377,
218
+ "tpp_threshold_50_unintended_diff_only": 0.007500007748603821,
219
+ "tpp_threshold_100_total_metric": 0.13900001347064972,
220
+ "tpp_threshold_100_intended_diff_only": 0.1470000147819519,
221
+ "tpp_threshold_100_unintended_diff_only": 0.008000001311302185,
222
+ "tpp_threshold_500_total_metric": 0.43675006926059723,
223
+ "tpp_threshold_500_intended_diff_only": 0.4490000605583191,
224
+ "tpp_threshold_500_unintended_diff_only": 0.012249991297721863
225
+ },
226
+ "2": {
227
+ "tpp_threshold_2_total_metric": -0.0017500221729278564,
228
+ "tpp_threshold_2_intended_diff_only": 0.0009999871253967285,
229
+ "tpp_threshold_2_unintended_diff_only": 0.002750009298324585,
230
+ "tpp_threshold_5_total_metric": 0.004749983549118042,
231
+ "tpp_threshold_5_intended_diff_only": 0.009000003337860107,
232
+ "tpp_threshold_5_unintended_diff_only": 0.004250019788742065,
233
+ "tpp_threshold_10_total_metric": 0.032250016927719116,
234
+ "tpp_threshold_10_intended_diff_only": 0.0350000262260437,
235
+ "tpp_threshold_10_unintended_diff_only": 0.002750009298324585,
236
+ "tpp_threshold_20_total_metric": 0.05924998223781586,
237
+ "tpp_threshold_20_intended_diff_only": 0.06099998950958252,
238
+ "tpp_threshold_20_unintended_diff_only": 0.0017500072717666626,
239
+ "tpp_threshold_50_total_metric": 0.10799999535083771,
240
+ "tpp_threshold_50_intended_diff_only": 0.11000001430511475,
241
+ "tpp_threshold_50_unintended_diff_only": 0.0020000189542770386,
242
+ "tpp_threshold_100_total_metric": 0.2057500034570694,
243
+ "tpp_threshold_100_intended_diff_only": 0.2070000171661377,
244
+ "tpp_threshold_100_unintended_diff_only": 0.0012500137090682983,
245
+ "tpp_threshold_500_total_metric": 0.43125003576278687,
246
+ "tpp_threshold_500_intended_diff_only": 0.437000036239624,
247
+ "tpp_threshold_500_unintended_diff_only": 0.005750000476837158
248
+ },
249
+ "6": {
250
+ "tpp_threshold_2_total_metric": 0.0002500265836715698,
251
+ "tpp_threshold_2_intended_diff_only": 0.0020000338554382324,
252
+ "tpp_threshold_2_unintended_diff_only": 0.0017500072717666626,
253
+ "tpp_threshold_5_total_metric": 0.002249985933303833,
254
+ "tpp_threshold_5_intended_diff_only": 0.0009999871253967285,
255
+ "tpp_threshold_5_unintended_diff_only": -0.0012499988079071045,
256
+ "tpp_threshold_10_total_metric": 0.0037499815225601196,
257
+ "tpp_threshold_10_intended_diff_only": 0.004999995231628418,
258
+ "tpp_threshold_10_unintended_diff_only": 0.0012500137090682983,
259
+ "tpp_threshold_20_total_metric": 0.005749985575675964,
260
+ "tpp_threshold_20_intended_diff_only": 0.004999995231628418,
261
+ "tpp_threshold_20_unintended_diff_only": -0.0007499903440475464,
262
+ "tpp_threshold_50_total_metric": 0.01199999451637268,
263
+ "tpp_threshold_50_intended_diff_only": 0.013000011444091797,
264
+ "tpp_threshold_50_unintended_diff_only": 0.0010000169277191162,
265
+ "tpp_threshold_100_total_metric": 0.109499990940094,
266
+ "tpp_threshold_100_intended_diff_only": 0.1119999885559082,
267
+ "tpp_threshold_100_unintended_diff_only": 0.002499997615814209,
268
+ "tpp_threshold_500_total_metric": 0.3985000401735306,
269
+ "tpp_threshold_500_intended_diff_only": 0.40800005197525024,
270
+ "tpp_threshold_500_unintended_diff_only": 0.009500011801719666
271
+ },
272
+ "9": {
273
+ "tpp_threshold_2_total_metric": 0.007249966263771057,
274
+ "tpp_threshold_2_intended_diff_only": 0.010999977588653564,
275
+ "tpp_threshold_2_unintended_diff_only": 0.0037500113248825073,
276
+ "tpp_threshold_5_total_metric": 0.008000001311302185,
277
+ "tpp_threshold_5_intended_diff_only": 0.013000011444091797,
278
+ "tpp_threshold_5_unintended_diff_only": 0.005000010132789612,
279
+ "tpp_threshold_10_total_metric": 0.016500011086463928,
280
+ "tpp_threshold_10_intended_diff_only": 0.022000014781951904,
281
+ "tpp_threshold_10_unintended_diff_only": 0.005500003695487976,
282
+ "tpp_threshold_20_total_metric": 0.06174995005130768,
283
+ "tpp_threshold_20_intended_diff_only": 0.0679999589920044,
284
+ "tpp_threshold_20_unintended_diff_only": 0.006250008940696716,
285
+ "tpp_threshold_50_total_metric": 0.15949998795986176,
286
+ "tpp_threshold_50_intended_diff_only": 0.16699999570846558,
287
+ "tpp_threshold_50_unintended_diff_only": 0.007500007748603821,
288
+ "tpp_threshold_100_total_metric": 0.28699997067451477,
289
+ "tpp_threshold_100_intended_diff_only": 0.2979999780654907,
290
+ "tpp_threshold_100_unintended_diff_only": 0.011000007390975952,
291
+ "tpp_threshold_500_total_metric": 0.4584999978542328,
292
+ "tpp_threshold_500_intended_diff_only": 0.47600001096725464,
293
+ "tpp_threshold_500_unintended_diff_only": 0.01750001311302185
294
+ }
295
+ },
296
+ "canrager/amazon_reviews_mcauley_1and5": {
297
+ "1": {
298
+ "tpp_threshold_2_total_metric": 0.00475001335144043,
299
+ "tpp_threshold_2_intended_diff_only": 0.009000003337860107,
300
+ "tpp_threshold_2_unintended_diff_only": 0.004249989986419678,
301
+ "tpp_threshold_5_total_metric": 0.001249954104423523,
302
+ "tpp_threshold_5_intended_diff_only": 0.003999948501586914,
303
+ "tpp_threshold_5_unintended_diff_only": 0.002749994397163391,
304
+ "tpp_threshold_10_total_metric": 0.004249975085258484,
305
+ "tpp_threshold_10_intended_diff_only": 0.011999964714050293,
306
+ "tpp_threshold_10_unintended_diff_only": 0.007749989628791809,
307
+ "tpp_threshold_20_total_metric": 0.0037500113248825073,
308
+ "tpp_threshold_20_intended_diff_only": 0.009999990463256836,
309
+ "tpp_threshold_20_unintended_diff_only": 0.006249979138374329,
310
+ "tpp_threshold_50_total_metric": 0.016249999403953552,
311
+ "tpp_threshold_50_intended_diff_only": 0.018999993801116943,
312
+ "tpp_threshold_50_unintended_diff_only": 0.002749994397163391,
313
+ "tpp_threshold_100_total_metric": 0.0034999698400497437,
314
+ "tpp_threshold_100_intended_diff_only": 0.05399996042251587,
315
+ "tpp_threshold_100_unintended_diff_only": 0.050499990582466125,
316
+ "tpp_threshold_500_total_metric": 0.22074998915195465,
317
+ "tpp_threshold_500_intended_diff_only": 0.2849999666213989,
318
+ "tpp_threshold_500_unintended_diff_only": 0.06424997746944427
319
+ },
320
+ "2": {
321
+ "tpp_threshold_2_total_metric": 0.007499992847442627,
322
+ "tpp_threshold_2_intended_diff_only": 0.009999990463256836,
323
+ "tpp_threshold_2_unintended_diff_only": 0.002499997615814209,
324
+ "tpp_threshold_5_total_metric": 0.0010000020265579224,
325
+ "tpp_threshold_5_intended_diff_only": 0.009999990463256836,
326
+ "tpp_threshold_5_unintended_diff_only": 0.008999988436698914,
327
+ "tpp_threshold_10_total_metric": 0.0222499817609787,
328
+ "tpp_threshold_10_intended_diff_only": 0.029999971389770508,
329
+ "tpp_threshold_10_unintended_diff_only": 0.007749989628791809,
330
+ "tpp_threshold_20_total_metric": 0.052000001072883606,
331
+ "tpp_threshold_20_intended_diff_only": 0.05699998140335083,
332
+ "tpp_threshold_20_unintended_diff_only": 0.004999980330467224,
333
+ "tpp_threshold_50_total_metric": 0.11399997770786285,
334
+ "tpp_threshold_50_intended_diff_only": 0.12699997425079346,
335
+ "tpp_threshold_50_unintended_diff_only": 0.012999996542930603,
336
+ "tpp_threshold_100_total_metric": 0.17999999225139618,
337
+ "tpp_threshold_100_intended_diff_only": 0.19999998807907104,
338
+ "tpp_threshold_100_unintended_diff_only": 0.019999995827674866,
339
+ "tpp_threshold_500_total_metric": 0.39000001549720764,
340
+ "tpp_threshold_500_intended_diff_only": 0.42000001668930054,
341
+ "tpp_threshold_500_unintended_diff_only": 0.030000001192092896
342
+ },
343
+ "3": {
344
+ "tpp_threshold_2_total_metric": -0.009999975562095642,
345
+ "tpp_threshold_2_intended_diff_only": -0.0059999823570251465,
346
+ "tpp_threshold_2_unintended_diff_only": 0.003999993205070496,
347
+ "tpp_threshold_5_total_metric": -0.0009999722242355347,
348
+ "tpp_threshold_5_intended_diff_only": -0.0009999871253967285,
349
+ "tpp_threshold_5_unintended_diff_only": -1.4901161193847656e-08,
350
+ "tpp_threshold_10_total_metric": 0.010000035166740417,
351
+ "tpp_threshold_10_intended_diff_only": 0.013000011444091797,
352
+ "tpp_threshold_10_unintended_diff_only": 0.0029999762773513794,
353
+ "tpp_threshold_20_total_metric": 0.004249989986419678,
354
+ "tpp_threshold_20_intended_diff_only": 0.018999993801116943,
355
+ "tpp_threshold_20_unintended_diff_only": 0.014750003814697266,
356
+ "tpp_threshold_50_total_metric": 0.04099997878074646,
357
+ "tpp_threshold_50_intended_diff_only": 0.05299997329711914,
358
+ "tpp_threshold_50_unintended_diff_only": 0.01199999451637268,
359
+ "tpp_threshold_100_total_metric": 0.07449999451637268,
360
+ "tpp_threshold_100_intended_diff_only": 0.0899999737739563,
361
+ "tpp_threshold_100_unintended_diff_only": 0.015499979257583618,
362
+ "tpp_threshold_500_total_metric": 0.29624997079372406,
363
+ "tpp_threshold_500_intended_diff_only": 0.33899998664855957,
364
+ "tpp_threshold_500_unintended_diff_only": 0.04275001585483551
365
+ },
366
+ "5": {
367
+ "tpp_threshold_2_total_metric": 0.0045000165700912476,
368
+ "tpp_threshold_2_intended_diff_only": 0.009000003337860107,
369
+ "tpp_threshold_2_unintended_diff_only": 0.00449998676776886,
370
+ "tpp_threshold_5_total_metric": 0.012500062584877014,
371
+ "tpp_threshold_5_intended_diff_only": 0.020000040531158447,
372
+ "tpp_threshold_5_unintended_diff_only": 0.007499977946281433,
373
+ "tpp_threshold_10_total_metric": 0.012500032782554626,
374
+ "tpp_threshold_10_intended_diff_only": 0.021000027656555176,
375
+ "tpp_threshold_10_unintended_diff_only": 0.00849999487400055,
376
+ "tpp_threshold_20_total_metric": 0.028750047087669373,
377
+ "tpp_threshold_20_intended_diff_only": 0.03900003433227539,
378
+ "tpp_threshold_20_unintended_diff_only": 0.010249987244606018,
379
+ "tpp_threshold_50_total_metric": 0.10075005888938904,
380
+ "tpp_threshold_50_intended_diff_only": 0.1170000433921814,
381
+ "tpp_threshold_50_unintended_diff_only": 0.01624998450279236,
382
+ "tpp_threshold_100_total_metric": 0.27750006318092346,
383
+ "tpp_threshold_100_intended_diff_only": 0.3060000538825989,
384
+ "tpp_threshold_100_unintended_diff_only": 0.028499990701675415,
385
+ "tpp_threshold_500_total_metric": 0.37825003266334534,
386
+ "tpp_threshold_500_intended_diff_only": 0.4150000214576721,
387
+ "tpp_threshold_500_unintended_diff_only": 0.03674998879432678
388
+ },
389
+ "6": {
390
+ "tpp_threshold_2_total_metric": 0.01724998652935028,
391
+ "tpp_threshold_2_intended_diff_only": 0.019999980926513672,
392
+ "tpp_threshold_2_unintended_diff_only": 0.002749994397163391,
393
+ "tpp_threshold_5_total_metric": 0.018750011920928955,
394
+ "tpp_threshold_5_intended_diff_only": 0.018000006675720215,
395
+ "tpp_threshold_5_unintended_diff_only": -0.0007500052452087402,
396
+ "tpp_threshold_10_total_metric": 0.03099997341632843,
397
+ "tpp_threshold_10_intended_diff_only": 0.038999974727630615,
398
+ "tpp_threshold_10_unintended_diff_only": 0.008000001311302185,
399
+ "tpp_threshold_20_total_metric": 0.057749971747398376,
400
+ "tpp_threshold_20_intended_diff_only": 0.0679999589920044,
401
+ "tpp_threshold_20_unintended_diff_only": 0.010249987244606018,
402
+ "tpp_threshold_50_total_metric": 0.11649996042251587,
403
+ "tpp_threshold_50_intended_diff_only": 0.12799996137619019,
404
+ "tpp_threshold_50_unintended_diff_only": 0.011500000953674316,
405
+ "tpp_threshold_100_total_metric": 0.24524997174739838,
406
+ "tpp_threshold_100_intended_diff_only": 0.25599998235702515,
407
+ "tpp_threshold_100_unintended_diff_only": 0.01075001060962677,
408
+ "tpp_threshold_500_total_metric": 0.3554999828338623,
409
+ "tpp_threshold_500_intended_diff_only": 0.37599998712539673,
410
+ "tpp_threshold_500_unintended_diff_only": 0.020500004291534424
411
+ }
412
+ }
413
+ }
414
+ }
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,414 @@
1
+ {
2
+ "eval_type_id": "tpp",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": false,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "88010eee-5f36-4f31-947e-cec777cc359e",
73
+ "datetime_epoch_millis": 1740202787337,
74
+ "eval_result_metrics": {
75
+ "tpp_metrics": {
76
+ "tpp_threshold_2_total_metric": 0.0037499994039535524,
77
+ "tpp_threshold_2_intended_diff_only": 0.006599998474121094,
78
+ "tpp_threshold_2_unintended_diff_only": 0.0028499990701675417,
79
+ "tpp_threshold_5_total_metric": 0.005775000154972077,
80
+ "tpp_threshold_5_intended_diff_only": 0.008799999952316284,
81
+ "tpp_threshold_5_unintended_diff_only": 0.0030249997973442076,
82
+ "tpp_threshold_10_total_metric": 0.009449997544288635,
83
+ "tpp_threshold_10_intended_diff_only": 0.013499993085861205,
84
+ "tpp_threshold_10_unintended_diff_only": 0.0040499955415725705,
85
+ "tpp_threshold_20_total_metric": 0.016199994087219238,
86
+ "tpp_threshold_20_intended_diff_only": 0.02059999108314514,
87
+ "tpp_threshold_20_unintended_diff_only": 0.004399996995925903,
88
+ "tpp_threshold_50_total_metric": 0.031475001573562617,
89
+ "tpp_threshold_50_intended_diff_only": 0.03600000143051148,
90
+ "tpp_threshold_50_unintended_diff_only": 0.004524999856948852,
91
+ "tpp_threshold_100_total_metric": 0.04169999808073044,
92
+ "tpp_threshold_100_intended_diff_only": 0.04989999532699585,
93
+ "tpp_threshold_100_unintended_diff_only": 0.008199997246265411,
94
+ "tpp_threshold_500_total_metric": 0.12555001527071,
95
+ "tpp_threshold_500_intended_diff_only": 0.1348000168800354,
96
+ "tpp_threshold_500_unintended_diff_only": 0.00925000160932541
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
102
+ "tpp_threshold_2_total_metric": 0.005249989032745361,
103
+ "tpp_threshold_2_intended_diff_only": 0.0075999975204467775,
104
+ "tpp_threshold_2_unintended_diff_only": 0.002350008487701416,
105
+ "tpp_threshold_5_total_metric": 0.0073000043630599976,
106
+ "tpp_threshold_5_intended_diff_only": 0.010000014305114746,
107
+ "tpp_threshold_5_unintended_diff_only": 0.0027000099420547486,
108
+ "tpp_threshold_10_total_metric": 0.010399997234344482,
109
+ "tpp_threshold_10_intended_diff_only": 0.013199996948242188,
110
+ "tpp_threshold_10_unintended_diff_only": 0.002799999713897705,
111
+ "tpp_threshold_20_total_metric": 0.01979999840259552,
112
+ "tpp_threshold_20_intended_diff_only": 0.022600007057189942,
113
+ "tpp_threshold_20_unintended_diff_only": 0.0028000086545944213,
114
+ "tpp_threshold_50_total_metric": 0.03664999902248382,
115
+ "tpp_threshold_50_intended_diff_only": 0.04000000953674317,
116
+ "tpp_threshold_50_unintended_diff_only": 0.0033500105142593383,
117
+ "tpp_threshold_100_total_metric": 0.050449994206428525,
118
+ "tpp_threshold_100_intended_diff_only": 0.05640000104904175,
119
+ "tpp_threshold_100_unintended_diff_only": 0.00595000684261322,
120
+ "tpp_threshold_500_total_metric": 0.14450002312660218,
121
+ "tpp_threshold_500_intended_diff_only": 0.15240002870559693,
122
+ "tpp_threshold_500_unintended_diff_only": 0.007900005578994751
123
+ },
124
+ {
125
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
126
+ "tpp_threshold_2_total_metric": 0.0022500097751617433,
127
+ "tpp_threshold_2_intended_diff_only": 0.00559999942779541,
128
+ "tpp_threshold_2_unintended_diff_only": 0.003349989652633667,
129
+ "tpp_threshold_5_total_metric": 0.004249995946884156,
130
+ "tpp_threshold_5_intended_diff_only": 0.007599985599517823,
131
+ "tpp_threshold_5_unintended_diff_only": 0.003349989652633667,
132
+ "tpp_threshold_10_total_metric": 0.008499997854232787,
133
+ "tpp_threshold_10_intended_diff_only": 0.013799989223480224,
134
+ "tpp_threshold_10_unintended_diff_only": 0.005299991369247437,
135
+ "tpp_threshold_20_total_metric": 0.012599989771842957,
136
+ "tpp_threshold_20_intended_diff_only": 0.01859997510910034,
137
+ "tpp_threshold_20_unintended_diff_only": 0.005999985337257385,
138
+ "tpp_threshold_50_total_metric": 0.026300004124641417,
139
+ "tpp_threshold_50_intended_diff_only": 0.03199999332427979,
140
+ "tpp_threshold_50_unintended_diff_only": 0.0056999891996383665,
141
+ "tpp_threshold_100_total_metric": 0.03295000195503235,
142
+ "tpp_threshold_100_intended_diff_only": 0.04339998960494995,
143
+ "tpp_threshold_100_unintended_diff_only": 0.010449987649917603,
144
+ "tpp_threshold_500_total_metric": 0.10660000741481782,
145
+ "tpp_threshold_500_intended_diff_only": 0.11720000505447388,
146
+ "tpp_threshold_500_unintended_diff_only": 0.010599997639656068
147
+ }
148
+ ],
149
+ "sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
150
+ "sae_lens_id": "custom_sae",
151
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000",
152
+ "sae_lens_version": "5.5.0",
153
+ "sae_cfg_dict": {
154
+ "model_name": "gemma-2-2b",
155
+ "d_in": 2304,
156
+ "d_sae": 65536,
157
+ "hook_layer": 12,
158
+ "hook_name": "blocks.12.hook_resid_post",
159
+ "context_size": null,
160
+ "hook_head_index": null,
161
+ "architecture": "topk",
162
+ "apply_b_dec_to_input": null,
163
+ "finetuning_scaling_factor": null,
164
+ "activation_fn_str": "",
165
+ "prepend_bos": true,
166
+ "normalize_activations": "none",
167
+ "dtype": "bfloat16",
168
+ "device": "",
169
+ "dataset_path": "",
170
+ "dataset_trust_remote_code": true,
171
+ "seqpos_slice": [
172
+ null
173
+ ],
174
+ "training_tokens": -100000,
175
+ "sae_lens_training_version": null,
176
+ "neuronpedia_id": null
177
+ },
178
+ "eval_result_unstructured": {
179
+ "LabHC/bias_in_bios_class_set1": {
180
+ "0": {
181
+ "tpp_threshold_2_total_metric": 0.008250012993812561,
182
+ "tpp_threshold_2_intended_diff_only": 0.012000024318695068,
183
+ "tpp_threshold_2_unintended_diff_only": 0.0037500113248825073,
184
+ "tpp_threshold_5_total_metric": 0.011250033974647522,
185
+ "tpp_threshold_5_intended_diff_only": 0.01500004529953003,
186
+ "tpp_threshold_5_unintended_diff_only": 0.0037500113248825073,
187
+ "tpp_threshold_10_total_metric": 0.006249979138374329,
188
+ "tpp_threshold_10_intended_diff_only": 0.009999990463256836,
189
+ "tpp_threshold_10_unintended_diff_only": 0.0037500113248825073,
190
+ "tpp_threshold_20_total_metric": 0.01975002884864807,
191
+ "tpp_threshold_20_intended_diff_only": 0.021000027656555176,
192
+ "tpp_threshold_20_unintended_diff_only": 0.0012499988079071045,
193
+ "tpp_threshold_50_total_metric": 0.036500006914138794,
194
+ "tpp_threshold_50_intended_diff_only": 0.04000002145767212,
195
+ "tpp_threshold_50_unintended_diff_only": 0.003500014543533325,
196
+ "tpp_threshold_100_total_metric": 0.04475003480911255,
197
+ "tpp_threshold_100_intended_diff_only": 0.058000028133392334,
198
+ "tpp_threshold_100_unintended_diff_only": 0.013249993324279785,
199
+ "tpp_threshold_500_total_metric": 0.14125002920627594,
200
+ "tpp_threshold_500_intended_diff_only": 0.15500003099441528,
201
+ "tpp_threshold_500_unintended_diff_only": 0.013750001788139343
202
+ },
203
+ "1": {
204
+ "tpp_threshold_2_total_metric": 0.009250015020370483,
205
+ "tpp_threshold_2_intended_diff_only": 0.009000003337860107,
206
+ "tpp_threshold_2_unintended_diff_only": -0.000250011682510376,
207
+ "tpp_threshold_5_total_metric": 0.004000052809715271,
208
+ "tpp_threshold_5_intended_diff_only": 0.005000054836273193,
209
+ "tpp_threshold_5_unintended_diff_only": 0.0010000020265579224,
210
+ "tpp_threshold_10_total_metric": 0.006250038743019104,
211
+ "tpp_threshold_10_intended_diff_only": 0.00700002908706665,
212
+ "tpp_threshold_10_unintended_diff_only": 0.0007499903440475464,
213
+ "tpp_threshold_20_total_metric": 0.0045000165700912476,
214
+ "tpp_threshold_20_intended_diff_only": 0.008000016212463379,
215
+ "tpp_threshold_20_unintended_diff_only": 0.0034999996423721313,
216
+ "tpp_threshold_50_total_metric": 0.015500038862228394,
217
+ "tpp_threshold_50_intended_diff_only": 0.020000040531158447,
218
+ "tpp_threshold_50_unintended_diff_only": 0.004500001668930054,
219
+ "tpp_threshold_100_total_metric": 0.02699999511241913,
220
+ "tpp_threshold_100_intended_diff_only": 0.03200000524520874,
221
+ "tpp_threshold_100_unintended_diff_only": 0.005000010132789612,
222
+ "tpp_threshold_500_total_metric": 0.09950004518032074,
223
+ "tpp_threshold_500_intended_diff_only": 0.10700005292892456,
224
+ "tpp_threshold_500_unintended_diff_only": 0.007500007748603821
225
+ },
226
+ "2": {
227
+ "tpp_threshold_2_total_metric": 0.008249998092651367,
228
+ "tpp_threshold_2_intended_diff_only": 0.013000011444091797,
229
+ "tpp_threshold_2_unintended_diff_only": 0.00475001335144043,
230
+ "tpp_threshold_5_total_metric": 0.017000004649162292,
231
+ "tpp_threshold_5_intended_diff_only": 0.022000014781951904,
232
+ "tpp_threshold_5_unintended_diff_only": 0.005000010132789612,
233
+ "tpp_threshold_10_total_metric": 0.03350000083446503,
234
+ "tpp_threshold_10_intended_diff_only": 0.03700000047683716,
235
+ "tpp_threshold_10_unintended_diff_only": 0.0034999996423721313,
236
+ "tpp_threshold_20_total_metric": 0.04799997806549072,
237
+ "tpp_threshold_20_intended_diff_only": 0.050999999046325684,
238
+ "tpp_threshold_20_unintended_diff_only": 0.003000020980834961,
239
+ "tpp_threshold_50_total_metric": 0.06849998235702515,
240
+ "tpp_threshold_50_intended_diff_only": 0.06999999284744263,
241
+ "tpp_threshold_50_unintended_diff_only": 0.0015000104904174805,
242
+ "tpp_threshold_100_total_metric": 0.09125001728534698,
243
+ "tpp_threshold_100_intended_diff_only": 0.09500002861022949,
244
+ "tpp_threshold_100_unintended_diff_only": 0.0037500113248825073,
245
+ "tpp_threshold_500_total_metric": 0.2682500183582306,
246
+ "tpp_threshold_500_intended_diff_only": 0.2720000147819519,
247
+ "tpp_threshold_500_unintended_diff_only": 0.0037499964237213135
248
+ },
249
+ "6": {
250
+ "tpp_threshold_2_total_metric": -0.000500023365020752,
251
+ "tpp_threshold_2_intended_diff_only": 0.0009999871253967285,
252
+ "tpp_threshold_2_unintended_diff_only": 0.0015000104904174805,
253
+ "tpp_threshold_5_total_metric": 0.002249985933303833,
254
+ "tpp_threshold_5_intended_diff_only": 0.0009999871253967285,
255
+ "tpp_threshold_5_unintended_diff_only": -0.0012499988079071045,
256
+ "tpp_threshold_10_total_metric": 0.003000006079673767,
257
+ "tpp_threshold_10_intended_diff_only": 0.0040000081062316895,
258
+ "tpp_threshold_10_unintended_diff_only": 0.0010000020265579224,
259
+ "tpp_threshold_20_total_metric": 0.002750024199485779,
260
+ "tpp_threshold_20_intended_diff_only": 0.0020000338554382324,
261
+ "tpp_threshold_20_unintended_diff_only": -0.0007499903440475464,
262
+ "tpp_threshold_50_total_metric": 0.004249989986419678,
263
+ "tpp_threshold_50_intended_diff_only": 0.004999995231628418,
264
+ "tpp_threshold_50_unintended_diff_only": 0.0007500052452087402,
265
+ "tpp_threshold_100_total_metric": 0.006249964237213135,
266
+ "tpp_threshold_100_intended_diff_only": 0.0059999823570251465,
267
+ "tpp_threshold_100_unintended_diff_only": -0.0002499818801879883,
268
+ "tpp_threshold_500_total_metric": 0.012250006198883057,
269
+ "tpp_threshold_500_intended_diff_only": 0.017000019550323486,
270
+ "tpp_threshold_500_unintended_diff_only": 0.00475001335144043
271
+ },
272
+ "9": {
273
+ "tpp_threshold_2_total_metric": 0.000999942421913147,
274
+ "tpp_threshold_2_intended_diff_only": 0.0029999613761901855,
275
+ "tpp_threshold_2_unintended_diff_only": 0.0020000189542770386,
276
+ "tpp_threshold_5_total_metric": 0.0019999444484710693,
277
+ "tpp_threshold_5_intended_diff_only": 0.006999969482421875,
278
+ "tpp_threshold_5_unintended_diff_only": 0.005000025033950806,
279
+ "tpp_threshold_10_total_metric": 0.0029999613761901855,
280
+ "tpp_threshold_10_intended_diff_only": 0.007999956607818604,
281
+ "tpp_threshold_10_unintended_diff_only": 0.004999995231628418,
282
+ "tpp_threshold_20_total_metric": 0.02399994432926178,
283
+ "tpp_threshold_20_intended_diff_only": 0.030999958515167236,
284
+ "tpp_threshold_20_unintended_diff_only": 0.0070000141859054565,
285
+ "tpp_threshold_50_total_metric": 0.05849997699260712,
286
+ "tpp_threshold_50_intended_diff_only": 0.06499999761581421,
287
+ "tpp_threshold_50_unintended_diff_only": 0.006500020623207092,
288
+ "tpp_threshold_100_total_metric": 0.08299995958805084,
289
+ "tpp_threshold_100_intended_diff_only": 0.09099996089935303,
290
+ "tpp_threshold_100_unintended_diff_only": 0.008000001311302185,
291
+ "tpp_threshold_500_total_metric": 0.20125001668930054,
292
+ "tpp_threshold_500_intended_diff_only": 0.21100002527236938,
293
+ "tpp_threshold_500_unintended_diff_only": 0.009750008583068848
294
+ }
295
+ },
296
+ "canrager/amazon_reviews_mcauley_1and5": {
297
+ "1": {
298
+ "tpp_threshold_2_total_metric": 0.005000010132789612,
299
+ "tpp_threshold_2_intended_diff_only": 0.009000003337860107,
300
+ "tpp_threshold_2_unintended_diff_only": 0.003999993205070496,
301
+ "tpp_threshold_5_total_metric": 0.002249985933303833,
302
+ "tpp_threshold_5_intended_diff_only": 0.0059999823570251465,
303
+ "tpp_threshold_5_unintended_diff_only": 0.0037499964237213135,
304
+ "tpp_threshold_10_total_metric": 0.0004999637603759766,
305
+ "tpp_threshold_10_intended_diff_only": 0.007999956607818604,
306
+ "tpp_threshold_10_unintended_diff_only": 0.007499992847442627,
307
+ "tpp_threshold_20_total_metric": -2.9802322387695312e-08,
308
+ "tpp_threshold_20_intended_diff_only": 0.003999948501586914,
309
+ "tpp_threshold_20_unintended_diff_only": 0.003999978303909302,
310
+ "tpp_threshold_50_total_metric": 0.0027499645948410034,
311
+ "tpp_threshold_50_intended_diff_only": 0.0029999613761901855,
312
+ "tpp_threshold_50_unintended_diff_only": 0.00024999678134918213,
313
+ "tpp_threshold_100_total_metric": 0.00024996697902679443,
314
+ "tpp_threshold_100_intended_diff_only": 0.006999969482421875,
315
+ "tpp_threshold_100_unintended_diff_only": 0.006750002503395081,
316
+ "tpp_threshold_500_total_metric": 0.02699999511241913,
317
+ "tpp_threshold_500_intended_diff_only": 0.02799999713897705,
318
+ "tpp_threshold_500_unintended_diff_only": 0.0010000020265579224
319
+ },
320
+ "2": {
321
+ "tpp_threshold_2_total_metric": 0.009250015020370483,
322
+ "tpp_threshold_2_intended_diff_only": 0.009000003337860107,
323
+ "tpp_threshold_2_unintended_diff_only": -0.000250011682510376,
324
+ "tpp_threshold_5_total_metric": 0.002500012516975403,
325
+ "tpp_threshold_5_intended_diff_only": 0.009000003337860107,
326
+ "tpp_threshold_5_unintended_diff_only": 0.006499990820884705,
327
+ "tpp_threshold_10_total_metric": 0.009500011801719666,
328
+ "tpp_threshold_10_intended_diff_only": 0.014999985694885254,
329
+ "tpp_threshold_10_unintended_diff_only": 0.005499973893165588,
330
+ "tpp_threshold_20_total_metric": 0.012999996542930603,
331
+ "tpp_threshold_20_intended_diff_only": 0.015999972820281982,
332
+ "tpp_threshold_20_unintended_diff_only": 0.0029999762773513794,
333
+ "tpp_threshold_50_total_metric": 0.02899998426437378,
334
+ "tpp_threshold_50_intended_diff_only": 0.0339999794960022,
335
+ "tpp_threshold_50_unintended_diff_only": 0.004999995231628418,
336
+ "tpp_threshold_100_total_metric": 0.03225000202655792,
337
+ "tpp_threshold_100_intended_diff_only": 0.042999982833862305,
338
+ "tpp_threshold_100_unintended_diff_only": 0.010749980807304382,
339
+ "tpp_threshold_500_total_metric": 0.09800000488758087,
340
+ "tpp_threshold_500_intended_diff_only": 0.10600000619888306,
341
+ "tpp_threshold_500_unintended_diff_only": 0.008000001311302185
342
+ },
343
+ "3": {
344
+ "tpp_threshold_2_total_metric": -0.008000016212463379,
345
+ "tpp_threshold_2_intended_diff_only": -0.003000020980834961,
346
+ "tpp_threshold_2_unintended_diff_only": 0.004999995231628418,
347
+ "tpp_threshold_5_total_metric": 0.005500003695487976,
348
+ "tpp_threshold_5_intended_diff_only": 0.004999995231628418,
349
+ "tpp_threshold_5_unintended_diff_only": -0.0005000084638595581,
350
+ "tpp_threshold_10_total_metric": 0.00875002145767212,
351
+ "tpp_threshold_10_intended_diff_only": 0.013000011444091797,
352
+ "tpp_threshold_10_unintended_diff_only": 0.004249989986419678,
353
+ "tpp_threshold_20_total_metric": 0.0020000189542770386,
354
+ "tpp_threshold_20_intended_diff_only": 0.009000003337860107,
355
+ "tpp_threshold_20_unintended_diff_only": 0.006999984383583069,
356
+ "tpp_threshold_50_total_metric": 0.011499986052513123,
357
+ "tpp_threshold_50_intended_diff_only": 0.015999972820281982,
358
+ "tpp_threshold_50_unintended_diff_only": 0.00449998676776886,
359
+ "tpp_threshold_100_total_metric": 0.020749986171722412,
360
+ "tpp_threshold_100_intended_diff_only": 0.029999971389770508,
361
+ "tpp_threshold_100_unintended_diff_only": 0.009249985218048096,
362
+ "tpp_threshold_500_total_metric": 0.07975000143051147,
363
+ "tpp_threshold_500_intended_diff_only": 0.09299999475479126,
364
+ "tpp_threshold_500_unintended_diff_only": 0.013249993324279785
365
+ },
366
+ "5": {
367
+ "tpp_threshold_2_total_metric": -0.004749953746795654,
368
+ "tpp_threshold_2_intended_diff_only": -0.001999974250793457,
369
+ "tpp_threshold_2_unintended_diff_only": 0.0027499794960021973,
370
+ "tpp_threshold_5_total_metric": -0.0009999871253967285,
371
+ "tpp_threshold_5_intended_diff_only": 0.004999995231628418,
372
+ "tpp_threshold_5_unintended_diff_only": 0.0059999823570251465,
373
+ "tpp_threshold_10_total_metric": -0.0037499815225601196,
374
+ "tpp_threshold_10_intended_diff_only": 0.003000020980834961,
375
+ "tpp_threshold_10_unintended_diff_only": 0.006750002503395081,
376
+ "tpp_threshold_20_total_metric": 0.005500003695487976,
377
+ "tpp_threshold_20_intended_diff_only": 0.013999998569488525,
378
+ "tpp_threshold_20_unintended_diff_only": 0.00849999487400055,
379
+ "tpp_threshold_50_total_metric": 0.03400006890296936,
380
+ "tpp_threshold_50_intended_diff_only": 0.04200005531311035,
381
+ "tpp_threshold_50_unintended_diff_only": 0.007999986410140991,
382
+ "tpp_threshold_100_total_metric": 0.042000025510787964,
383
+ "tpp_threshold_100_intended_diff_only": 0.05900001525878906,
384
+ "tpp_threshold_100_unintended_diff_only": 0.0169999897480011,
385
+ "tpp_threshold_500_total_metric": 0.11125005781650543,
386
+ "tpp_threshold_500_intended_diff_only": 0.13100004196166992,
387
+ "tpp_threshold_500_unintended_diff_only": 0.01974998414516449
388
+ },
389
+ "6": {
390
+ "tpp_threshold_2_total_metric": 0.009749993681907654,
391
+ "tpp_threshold_2_intended_diff_only": 0.014999985694885254,
392
+ "tpp_threshold_2_unintended_diff_only": 0.0052499920129776,
393
+ "tpp_threshold_5_total_metric": 0.011999964714050293,
394
+ "tpp_threshold_5_intended_diff_only": 0.012999951839447021,
395
+ "tpp_threshold_5_unintended_diff_only": 0.0009999871253967285,
396
+ "tpp_threshold_10_total_metric": 0.0274999737739563,
397
+ "tpp_threshold_10_intended_diff_only": 0.029999971389770508,
398
+ "tpp_threshold_10_unintended_diff_only": 0.002499997615814209,
399
+ "tpp_threshold_20_total_metric": 0.04249995946884155,
400
+ "tpp_threshold_20_intended_diff_only": 0.04999995231628418,
401
+ "tpp_threshold_20_unintended_diff_only": 0.007499992847442627,
402
+ "tpp_threshold_50_total_metric": 0.05425001680850983,
403
+ "tpp_threshold_50_intended_diff_only": 0.06499999761581421,
404
+ "tpp_threshold_50_unintended_diff_only": 0.010749980807304382,
405
+ "tpp_threshold_100_total_metric": 0.06950002908706665,
406
+ "tpp_threshold_100_intended_diff_only": 0.078000009059906,
407
+ "tpp_threshold_100_unintended_diff_only": 0.008499979972839355,
408
+ "tpp_threshold_500_total_metric": 0.21699997782707214,
409
+ "tpp_threshold_500_intended_diff_only": 0.2279999852180481,
410
+ "tpp_threshold_500_unintended_diff_only": 0.011000007390975952
411
+ }
412
+ }
413
+ }
414
+ }
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,414 @@
1
+ {
2
+ "eval_type_id": "tpp",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": false,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "a63c9e71-5b6c-46d2-b489-d01f4d9597c7",
73
+ "datetime_epoch_millis": 1740202630595,
74
+ "eval_result_metrics": {
75
+ "tpp_metrics": {
76
+ "tpp_threshold_2_total_metric": 0.004799993336200714,
77
+ "tpp_threshold_2_intended_diff_only": 0.0075999975204467775,
78
+ "tpp_threshold_2_unintended_diff_only": 0.0028000041842460633,
79
+ "tpp_threshold_5_total_metric": 0.007149985432624817,
80
+ "tpp_threshold_5_intended_diff_only": 0.010399985313415527,
81
+ "tpp_threshold_5_unintended_diff_only": 0.0032499998807907103,
82
+ "tpp_threshold_10_total_metric": 0.014450006186962128,
83
+ "tpp_threshold_10_intended_diff_only": 0.018300002813339232,
84
+ "tpp_threshold_10_unintended_diff_only": 0.003849996626377106,
85
+ "tpp_threshold_20_total_metric": 0.022200000286102296,
86
+ "tpp_threshold_20_intended_diff_only": 0.02680000066757202,
87
+ "tpp_threshold_20_unintended_diff_only": 0.004600000381469727,
88
+ "tpp_threshold_50_total_metric": 0.04717499911785126,
89
+ "tpp_threshold_50_intended_diff_only": 0.051999998092651364,
90
+ "tpp_threshold_50_unintended_diff_only": 0.004824998974800109,
91
+ "tpp_threshold_100_total_metric": 0.07345000356435775,
92
+ "tpp_threshold_100_intended_diff_only": 0.08180000185966492,
93
+ "tpp_threshold_100_unintended_diff_only": 0.00834999829530716,
94
+ "tpp_threshold_500_total_metric": 0.22270000725984573,
95
+ "tpp_threshold_500_intended_diff_only": 0.23360000848770143,
96
+ "tpp_threshold_500_unintended_diff_only": 0.010900001227855682
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
102
+ "tpp_threshold_2_total_metric": 0.004999995231628418,
103
+ "tpp_threshold_2_intended_diff_only": 0.007600009441375732,
104
+ "tpp_threshold_2_unintended_diff_only": 0.0026000142097473145,
105
+ "tpp_threshold_5_total_metric": 0.01009998917579651,
106
+ "tpp_threshold_5_intended_diff_only": 0.013199996948242188,
107
+ "tpp_threshold_5_unintended_diff_only": 0.0031000077724456787,
108
+ "tpp_threshold_10_total_metric": 0.017699992656707762,
109
+ "tpp_threshold_10_intended_diff_only": 0.020399999618530274,
110
+ "tpp_threshold_10_unintended_diff_only": 0.0027000069618225097,
111
+ "tpp_threshold_20_total_metric": 0.03189999759197235,
112
+ "tpp_threshold_20_intended_diff_only": 0.03480000495910644,
113
+ "tpp_threshold_20_unintended_diff_only": 0.0029000073671340944,
114
+ "tpp_threshold_50_total_metric": 0.05539998412132263,
115
+ "tpp_threshold_50_intended_diff_only": 0.058799993991851804,
116
+ "tpp_threshold_50_unintended_diff_only": 0.0034000098705291746,
117
+ "tpp_threshold_100_total_metric": 0.08500000834465027,
118
+ "tpp_threshold_100_intended_diff_only": 0.08960001468658448,
119
+ "tpp_threshold_100_unintended_diff_only": 0.004600006341934204,
120
+ "tpp_threshold_500_total_metric": 0.2710500031709671,
121
+ "tpp_threshold_500_intended_diff_only": 0.27760001420974734,
122
+ "tpp_threshold_500_unintended_diff_only": 0.006550011038780212
123
+ },
124
+ {
125
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
126
+ "tpp_threshold_2_total_metric": 0.00459999144077301,
127
+ "tpp_threshold_2_intended_diff_only": 0.007599985599517823,
128
+ "tpp_threshold_2_unintended_diff_only": 0.002999994158744812,
129
+ "tpp_threshold_5_total_metric": 0.004199981689453125,
130
+ "tpp_threshold_5_intended_diff_only": 0.007599973678588867,
131
+ "tpp_threshold_5_unintended_diff_only": 0.0033999919891357423,
132
+ "tpp_threshold_10_total_metric": 0.011200019717216491,
133
+ "tpp_threshold_10_intended_diff_only": 0.016200006008148193,
134
+ "tpp_threshold_10_unintended_diff_only": 0.004999986290931702,
135
+ "tpp_threshold_20_total_metric": 0.012500002980232239,
136
+ "tpp_threshold_20_intended_diff_only": 0.018799996376037596,
137
+ "tpp_threshold_20_unintended_diff_only": 0.006299993395805359,
138
+ "tpp_threshold_50_total_metric": 0.038950014114379886,
139
+ "tpp_threshold_50_intended_diff_only": 0.045200002193450925,
140
+ "tpp_threshold_50_unintended_diff_only": 0.006249988079071045,
141
+ "tpp_threshold_100_total_metric": 0.06189999878406525,
142
+ "tpp_threshold_100_intended_diff_only": 0.07399998903274536,
143
+ "tpp_threshold_100_unintended_diff_only": 0.012099990248680114,
144
+ "tpp_threshold_500_total_metric": 0.17435001134872435,
145
+ "tpp_threshold_500_intended_diff_only": 0.18960000276565553,
146
+ "tpp_threshold_500_unintended_diff_only": 0.015249991416931152
147
+ }
148
+ ],
149
+ "sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
150
+ "sae_lens_id": "custom_sae",
151
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000",
152
+ "sae_lens_version": "5.5.0",
153
+ "sae_cfg_dict": {
154
+ "model_name": "gemma-2-2b",
155
+ "d_in": 2304,
156
+ "d_sae": 65536,
157
+ "hook_layer": 12,
158
+ "hook_name": "blocks.12.hook_resid_post",
159
+ "context_size": null,
160
+ "hook_head_index": null,
161
+ "architecture": "topk",
162
+ "apply_b_dec_to_input": null,
163
+ "finetuning_scaling_factor": null,
164
+ "activation_fn_str": "",
165
+ "prepend_bos": true,
166
+ "normalize_activations": "none",
167
+ "dtype": "bfloat16",
168
+ "device": "",
169
+ "dataset_path": "",
170
+ "dataset_trust_remote_code": true,
171
+ "seqpos_slice": [
172
+ null
173
+ ],
174
+ "training_tokens": -100000,
175
+ "sae_lens_training_version": null,
176
+ "neuronpedia_id": null
177
+ },
178
+ "eval_result_unstructured": {
179
+ "LabHC/bias_in_bios_class_set1": {
180
+ "0": {
181
+ "tpp_threshold_2_total_metric": 0.0072500258684158325,
182
+ "tpp_threshold_2_intended_diff_only": 0.01100003719329834,
183
+ "tpp_threshold_2_unintended_diff_only": 0.0037500113248825073,
184
+ "tpp_threshold_5_total_metric": 0.014999985694885254,
185
+ "tpp_threshold_5_intended_diff_only": 0.018999993801116943,
186
+ "tpp_threshold_5_unintended_diff_only": 0.0040000081062316895,
187
+ "tpp_threshold_10_total_metric": 0.00974997878074646,
188
+ "tpp_threshold_10_intended_diff_only": 0.013999998569488525,
189
+ "tpp_threshold_10_unintended_diff_only": 0.004250019788742065,
190
+ "tpp_threshold_20_total_metric": 0.025000005960464478,
191
+ "tpp_threshold_20_intended_diff_only": 0.027000010013580322,
192
+ "tpp_threshold_20_unintended_diff_only": 0.0020000040531158447,
193
+ "tpp_threshold_50_total_metric": 0.03924998641014099,
194
+ "tpp_threshold_50_intended_diff_only": 0.041999995708465576,
195
+ "tpp_threshold_50_unintended_diff_only": 0.002750009298324585,
196
+ "tpp_threshold_100_total_metric": 0.06700004637241364,
197
+ "tpp_threshold_100_intended_diff_only": 0.07100003957748413,
198
+ "tpp_threshold_100_unintended_diff_only": 0.003999993205070496,
199
+ "tpp_threshold_500_total_metric": 0.27824999392032623,
200
+ "tpp_threshold_500_intended_diff_only": 0.281000018119812,
201
+ "tpp_threshold_500_unintended_diff_only": 0.002750024199485779
202
+ },
203
+ "1": {
204
+ "tpp_threshold_2_total_metric": 0.012750014662742615,
205
+ "tpp_threshold_2_intended_diff_only": 0.013000011444091797,
206
+ "tpp_threshold_2_unintended_diff_only": 0.00024999678134918213,
207
+ "tpp_threshold_5_total_metric": 0.010500013828277588,
208
+ "tpp_threshold_5_intended_diff_only": 0.013000011444091797,
209
+ "tpp_threshold_5_unintended_diff_only": 0.002499997615814209,
210
+ "tpp_threshold_10_total_metric": 0.013500005006790161,
211
+ "tpp_threshold_10_intended_diff_only": 0.013000011444091797,
212
+ "tpp_threshold_10_unintended_diff_only": -0.0004999935626983643,
213
+ "tpp_threshold_20_total_metric": 0.012750014662742615,
214
+ "tpp_threshold_20_intended_diff_only": 0.017000019550323486,
215
+ "tpp_threshold_20_unintended_diff_only": 0.004250004887580872,
216
+ "tpp_threshold_50_total_metric": 0.026250004768371582,
217
+ "tpp_threshold_50_intended_diff_only": 0.03100001811981201,
218
+ "tpp_threshold_50_unintended_diff_only": 0.00475001335144043,
219
+ "tpp_threshold_100_total_metric": 0.053500011563301086,
220
+ "tpp_threshold_100_intended_diff_only": 0.05900001525878906,
221
+ "tpp_threshold_100_unintended_diff_only": 0.005500003695487976,
222
+ "tpp_threshold_500_total_metric": 0.22600004076957703,
223
+ "tpp_threshold_500_intended_diff_only": 0.2330000400543213,
224
+ "tpp_threshold_500_unintended_diff_only": 0.006999999284744263
225
+ },
226
+ "2": {
227
+ "tpp_threshold_2_total_metric": 0.0004999786615371704,
228
+ "tpp_threshold_2_intended_diff_only": 0.004999995231628418,
229
+ "tpp_threshold_2_unintended_diff_only": 0.0045000165700912476,
230
+ "tpp_threshold_5_total_metric": 0.019749954342842102,
231
+ "tpp_threshold_5_intended_diff_only": 0.02499997615814209,
232
+ "tpp_threshold_5_unintended_diff_only": 0.005250021815299988,
233
+ "tpp_threshold_10_total_metric": 0.05399997532367706,
234
+ "tpp_threshold_10_intended_diff_only": 0.05699998140335083,
235
+ "tpp_threshold_10_unintended_diff_only": 0.003000006079673767,
236
+ "tpp_threshold_20_total_metric": 0.07900001108646393,
237
+ "tpp_threshold_20_intended_diff_only": 0.0820000171661377,
238
+ "tpp_threshold_20_unintended_diff_only": 0.003000006079673767,
239
+ "tpp_threshold_50_total_metric": 0.12099999189376831,
240
+ "tpp_threshold_50_intended_diff_only": 0.125,
241
+ "tpp_threshold_50_unintended_diff_only": 0.0040000081062316895,
242
+ "tpp_threshold_100_total_metric": 0.17149999737739563,
243
+ "tpp_threshold_100_intended_diff_only": 0.17500001192092896,
244
+ "tpp_threshold_100_unintended_diff_only": 0.003500014543533325,
245
+ "tpp_threshold_500_total_metric": 0.3410000056028366,
246
+ "tpp_threshold_500_intended_diff_only": 0.34700000286102295,
247
+ "tpp_threshold_500_unintended_diff_only": 0.00599999725818634
248
+ },
249
+ "6": {
250
+ "tpp_threshold_2_total_metric": 0.0012499988079071045,
251
+ "tpp_threshold_2_intended_diff_only": 0.003000020980834961,
252
+ "tpp_threshold_2_unintended_diff_only": 0.0017500221729278564,
253
+ "tpp_threshold_5_total_metric": 0.003000035881996155,
254
+ "tpp_threshold_5_intended_diff_only": 0.0020000338554382324,
255
+ "tpp_threshold_5_unintended_diff_only": -0.0010000020265579224,
256
+ "tpp_threshold_10_total_metric": 0.0037499964237213135,
257
+ "tpp_threshold_10_intended_diff_only": 0.004999995231628418,
258
+ "tpp_threshold_10_unintended_diff_only": 0.0012499988079071045,
259
+ "tpp_threshold_20_total_metric": 0.006749972701072693,
260
+ "tpp_threshold_20_intended_diff_only": 0.0059999823570251465,
261
+ "tpp_threshold_20_unintended_diff_only": -0.0007499903440475464,
262
+ "tpp_threshold_50_total_metric": 0.009249985218048096,
263
+ "tpp_threshold_50_intended_diff_only": 0.009999990463256836,
264
+ "tpp_threshold_50_unintended_diff_only": 0.0007500052452087402,
265
+ "tpp_threshold_100_total_metric": 0.02025000751018524,
266
+ "tpp_threshold_100_intended_diff_only": 0.022000014781951904,
267
+ "tpp_threshold_100_unintended_diff_only": 0.0017500072717666626,
268
+ "tpp_threshold_500_total_metric": 0.117249995470047,
269
+ "tpp_threshold_500_intended_diff_only": 0.12400001287460327,
270
+ "tpp_threshold_500_unintended_diff_only": 0.006750017404556274
271
+ },
272
+ "9": {
273
+ "tpp_threshold_2_total_metric": 0.0032499581575393677,
274
+ "tpp_threshold_2_intended_diff_only": 0.0059999823570251465,
275
+ "tpp_threshold_2_unintended_diff_only": 0.002750024199485779,
276
+ "tpp_threshold_5_total_metric": 0.0022499561309814453,
277
+ "tpp_threshold_5_intended_diff_only": 0.006999969482421875,
278
+ "tpp_threshold_5_unintended_diff_only": 0.00475001335144043,
279
+ "tpp_threshold_10_total_metric": 0.007500007748603821,
280
+ "tpp_threshold_10_intended_diff_only": 0.013000011444091797,
281
+ "tpp_threshold_10_unintended_diff_only": 0.005500003695487976,
282
+ "tpp_threshold_20_total_metric": 0.03599998354911804,
283
+ "tpp_threshold_20_intended_diff_only": 0.041999995708465576,
284
+ "tpp_threshold_20_unintended_diff_only": 0.006000012159347534,
285
+ "tpp_threshold_50_total_metric": 0.08124995231628418,
286
+ "tpp_threshold_50_intended_diff_only": 0.08599996566772461,
287
+ "tpp_threshold_50_unintended_diff_only": 0.00475001335144043,
288
+ "tpp_threshold_100_total_metric": 0.11274997889995575,
289
+ "tpp_threshold_100_intended_diff_only": 0.12099999189376831,
290
+ "tpp_threshold_100_unintended_diff_only": 0.008250012993812561,
291
+ "tpp_threshold_500_total_metric": 0.39274998009204865,
292
+ "tpp_threshold_500_intended_diff_only": 0.40299999713897705,
293
+ "tpp_threshold_500_unintended_diff_only": 0.010250017046928406
294
+ }
295
+ },
296
+ "canrager/amazon_reviews_mcauley_1and5": {
297
+ "1": {
298
+ "tpp_threshold_2_total_metric": 0.003999978303909302,
299
+ "tpp_threshold_2_intended_diff_only": 0.007999956607818604,
300
+ "tpp_threshold_2_unintended_diff_only": 0.003999978303909302,
301
+ "tpp_threshold_5_total_metric": 0.00024996697902679443,
302
+ "tpp_threshold_5_intended_diff_only": 0.003999948501586914,
303
+ "tpp_threshold_5_unintended_diff_only": 0.0037499815225601196,
304
+ "tpp_threshold_10_total_metric": 0.0034999996423721313,
305
+ "tpp_threshold_10_intended_diff_only": 0.009999990463256836,
306
+ "tpp_threshold_10_unintended_diff_only": 0.006499990820884705,
307
+ "tpp_threshold_20_total_metric": -0.005250021815299988,
308
+ "tpp_threshold_20_intended_diff_only": 0.001999974250793457,
309
+ "tpp_threshold_20_unintended_diff_only": 0.007249996066093445,
310
+ "tpp_threshold_50_total_metric": 0.007249996066093445,
311
+ "tpp_threshold_50_intended_diff_only": 0.009999990463256836,
312
+ "tpp_threshold_50_unintended_diff_only": 0.002749994397163391,
313
+ "tpp_threshold_100_total_metric": 0.006999969482421875,
314
+ "tpp_threshold_100_intended_diff_only": 0.015999972820281982,
315
+ "tpp_threshold_100_unintended_diff_only": 0.009000003337860107,
316
+ "tpp_threshold_500_total_metric": 0.0702500194311142,
317
+ "tpp_threshold_500_intended_diff_only": 0.078000009059906,
318
+ "tpp_threshold_500_unintended_diff_only": 0.007749989628791809
319
+ },
320
+ "2": {
321
+ "tpp_threshold_2_total_metric": 0.001749977469444275,
322
+ "tpp_threshold_2_intended_diff_only": 0.001999974250793457,
323
+ "tpp_threshold_2_unintended_diff_only": 0.00024999678134918213,
324
+ "tpp_threshold_5_total_metric": -0.0015000104904174805,
325
+ "tpp_threshold_5_intended_diff_only": 0.0059999823570251465,
326
+ "tpp_threshold_5_unintended_diff_only": 0.007499992847442627,
327
+ "tpp_threshold_10_total_metric": 0.004999995231628418,
328
+ "tpp_threshold_10_intended_diff_only": 0.010999977588653564,
329
+ "tpp_threshold_10_unintended_diff_only": 0.0059999823570251465,
330
+ "tpp_threshold_20_total_metric": 0.008500024676322937,
331
+ "tpp_threshold_20_intended_diff_only": 0.013000011444091797,
332
+ "tpp_threshold_20_unintended_diff_only": 0.00449998676776886,
333
+ "tpp_threshold_50_total_metric": 0.025749996304512024,
334
+ "tpp_threshold_50_intended_diff_only": 0.03200000524520874,
335
+ "tpp_threshold_50_unintended_diff_only": 0.006250008940696716,
336
+ "tpp_threshold_100_total_metric": 0.03049999475479126,
337
+ "tpp_threshold_100_intended_diff_only": 0.046999990940093994,
338
+ "tpp_threshold_100_unintended_diff_only": 0.016499996185302734,
339
+ "tpp_threshold_500_total_metric": 0.12775003910064697,
340
+ "tpp_threshold_500_intended_diff_only": 0.1420000195503235,
341
+ "tpp_threshold_500_unintended_diff_only": 0.014249980449676514
342
+ },
343
+ "3": {
344
+ "tpp_threshold_2_total_metric": -0.00625002384185791,
345
+ "tpp_threshold_2_intended_diff_only": -0.003000020980834961,
346
+ "tpp_threshold_2_unintended_diff_only": 0.0032500028610229492,
347
+ "tpp_threshold_5_total_metric": 0.002249985933303833,
348
+ "tpp_threshold_5_intended_diff_only": 0.001999974250793457,
349
+ "tpp_threshold_5_unintended_diff_only": -0.000250011682510376,
350
+ "tpp_threshold_10_total_metric": 0.010500013828277588,
351
+ "tpp_threshold_10_intended_diff_only": 0.013000011444091797,
352
+ "tpp_threshold_10_unintended_diff_only": 0.002499997615814209,
353
+ "tpp_threshold_20_total_metric": 0.00024996697902679443,
354
+ "tpp_threshold_20_intended_diff_only": 0.006999969482421875,
355
+ "tpp_threshold_20_unintended_diff_only": 0.006750002503395081,
356
+ "tpp_threshold_50_total_metric": 0.02550002932548523,
357
+ "tpp_threshold_50_intended_diff_only": 0.027000010013580322,
358
+ "tpp_threshold_50_unintended_diff_only": 0.0014999806880950928,
359
+ "tpp_threshold_100_total_metric": 0.040249988436698914,
360
+ "tpp_threshold_100_intended_diff_only": 0.04899996519088745,
361
+ "tpp_threshold_100_unintended_diff_only": 0.008749976754188538,
362
+ "tpp_threshold_500_total_metric": 0.14424999058246613,
363
+ "tpp_threshold_500_intended_diff_only": 0.16399997472763062,
364
+ "tpp_threshold_500_unintended_diff_only": 0.01974998414516449
365
+ },
366
+ "5": {
367
+ "tpp_threshold_2_total_metric": 0.006250068545341492,
368
+ "tpp_threshold_2_intended_diff_only": 0.010000050067901611,
369
+ "tpp_threshold_2_unintended_diff_only": 0.0037499815225601196,
370
+ "tpp_threshold_5_total_metric": 0.007249996066093445,
371
+ "tpp_threshold_5_intended_diff_only": 0.013999998569488525,
372
+ "tpp_threshold_5_unintended_diff_only": 0.006750002503395081,
373
+ "tpp_threshold_10_total_metric": 0.00200006365776062,
374
+ "tpp_threshold_10_intended_diff_only": 0.010000050067901611,
375
+ "tpp_threshold_10_unintended_diff_only": 0.007999986410140991,
376
+ "tpp_threshold_20_total_metric": 0.014250069856643677,
377
+ "tpp_threshold_20_intended_diff_only": 0.024000048637390137,
378
+ "tpp_threshold_20_unintended_diff_only": 0.00974997878074646,
379
+ "tpp_threshold_50_total_metric": 0.07175002992153168,
380
+ "tpp_threshold_50_intended_diff_only": 0.08300000429153442,
381
+ "tpp_threshold_50_unintended_diff_only": 0.011249974370002747,
382
+ "tpp_threshold_100_total_metric": 0.1260000616312027,
383
+ "tpp_threshold_100_intended_diff_only": 0.14500004053115845,
384
+ "tpp_threshold_100_unintended_diff_only": 0.01899997889995575,
385
+ "tpp_threshold_500_total_metric": 0.247250035405159,
386
+ "tpp_threshold_500_intended_diff_only": 0.27000004053115845,
387
+ "tpp_threshold_500_unintended_diff_only": 0.02275000512599945
388
+ },
389
+ "6": {
390
+ "tpp_threshold_2_total_metric": 0.017249956727027893,
391
+ "tpp_threshold_2_intended_diff_only": 0.0209999680519104,
392
+ "tpp_threshold_2_unintended_diff_only": 0.0037500113248825073,
393
+ "tpp_threshold_5_total_metric": 0.012749969959259033,
394
+ "tpp_threshold_5_intended_diff_only": 0.011999964714050293,
395
+ "tpp_threshold_5_unintended_diff_only": -0.0007500052452087402,
396
+ "tpp_threshold_10_total_metric": 0.0350000262260437,
397
+ "tpp_threshold_10_intended_diff_only": 0.03700000047683716,
398
+ "tpp_threshold_10_unintended_diff_only": 0.001999974250793457,
399
+ "tpp_threshold_20_total_metric": 0.04474997520446777,
400
+ "tpp_threshold_20_intended_diff_only": 0.04799997806549072,
401
+ "tpp_threshold_20_unintended_diff_only": 0.0032500028610229492,
402
+ "tpp_threshold_50_total_metric": 0.06450001895427704,
403
+ "tpp_threshold_50_intended_diff_only": 0.07400000095367432,
404
+ "tpp_threshold_50_unintended_diff_only": 0.009499981999397278,
405
+ "tpp_threshold_100_total_metric": 0.10574997961521149,
406
+ "tpp_threshold_100_intended_diff_only": 0.11299997568130493,
407
+ "tpp_threshold_100_unintended_diff_only": 0.007249996066093445,
408
+ "tpp_threshold_500_total_metric": 0.28224997222423553,
409
+ "tpp_threshold_500_intended_diff_only": 0.29399996995925903,
410
+ "tpp_threshold_500_unintended_diff_only": 0.011749997735023499
411
+ }
412
+ }
413
+ }
414
+ }
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,414 @@
1
+ {
2
+ "eval_type_id": "tpp",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": false,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "c5404fea-7e48-43fd-8a52-df4372acae0b",
73
+ "datetime_epoch_millis": 1740202316416,
74
+ "eval_result_metrics": {
75
+ "tpp_metrics": {
76
+ "tpp_threshold_2_total_metric": 0.0044249996542930605,
77
+ "tpp_threshold_2_intended_diff_only": 0.007800000905990601,
78
+ "tpp_threshold_2_unintended_diff_only": 0.0033750012516975403,
79
+ "tpp_threshold_5_total_metric": 0.008849999308586121,
80
+ "tpp_threshold_5_intended_diff_only": 0.011799997091293334,
81
+ "tpp_threshold_5_unintended_diff_only": 0.0029499977827072144,
82
+ "tpp_threshold_10_total_metric": 0.015325011312961578,
83
+ "tpp_threshold_10_intended_diff_only": 0.018900007009506226,
84
+ "tpp_threshold_10_unintended_diff_only": 0.003574995696544647,
85
+ "tpp_threshold_20_total_metric": 0.028574997186660768,
86
+ "tpp_threshold_20_intended_diff_only": 0.033899998664855956,
87
+ "tpp_threshold_20_unintended_diff_only": 0.00532500147819519,
88
+ "tpp_threshold_50_total_metric": 0.06917501091957093,
89
+ "tpp_threshold_50_intended_diff_only": 0.07640000581741332,
90
+ "tpp_threshold_50_unintended_diff_only": 0.007224994897842407,
91
+ "tpp_threshold_100_total_metric": 0.1251750007271767,
92
+ "tpp_threshold_100_intended_diff_only": 0.13420000672340393,
93
+ "tpp_threshold_100_unintended_diff_only": 0.009025005996227263,
94
+ "tpp_threshold_500_total_metric": 0.313950015604496,
95
+ "tpp_threshold_500_intended_diff_only": 0.32780001163482664,
96
+ "tpp_threshold_500_unintended_diff_only": 0.013849996030330658
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
102
+ "tpp_threshold_2_total_metric": 0.0048499912023544315,
103
+ "tpp_threshold_2_intended_diff_only": 0.007400000095367431,
104
+ "tpp_threshold_2_unintended_diff_only": 0.0025500088930130007,
105
+ "tpp_threshold_5_total_metric": 0.0064999938011169435,
106
+ "tpp_threshold_5_intended_diff_only": 0.009399998188018798,
107
+ "tpp_threshold_5_unintended_diff_only": 0.0029000043869018555,
108
+ "tpp_threshold_10_total_metric": 0.016850015521049498,
109
+ "tpp_threshold_10_intended_diff_only": 0.019600021839141845,
110
+ "tpp_threshold_10_unintended_diff_only": 0.002750006318092346,
111
+ "tpp_threshold_20_total_metric": 0.03354999721050263,
112
+ "tpp_threshold_20_intended_diff_only": 0.037400007247924805,
113
+ "tpp_threshold_20_unintended_diff_only": 0.00385001003742218,
114
+ "tpp_threshold_50_total_metric": 0.0748000055551529,
115
+ "tpp_threshold_50_intended_diff_only": 0.07900000810623169,
116
+ "tpp_threshold_50_unintended_diff_only": 0.004200002551078797,
117
+ "tpp_threshold_100_total_metric": 0.12809998989105226,
118
+ "tpp_threshold_100_intended_diff_only": 0.13380000591278077,
119
+ "tpp_threshold_100_unintended_diff_only": 0.005700016021728515,
120
+ "tpp_threshold_500_total_metric": 0.3692500054836273,
121
+ "tpp_threshold_500_intended_diff_only": 0.376800012588501,
122
+ "tpp_threshold_500_unintended_diff_only": 0.007550007104873658
123
+ },
124
+ {
125
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
126
+ "tpp_threshold_2_total_metric": 0.0040000081062316895,
127
+ "tpp_threshold_2_intended_diff_only": 0.00820000171661377,
128
+ "tpp_threshold_2_unintended_diff_only": 0.00419999361038208,
129
+ "tpp_threshold_5_total_metric": 0.011200004816055298,
130
+ "tpp_threshold_5_intended_diff_only": 0.01419999599456787,
131
+ "tpp_threshold_5_unintended_diff_only": 0.0029999911785125732,
132
+ "tpp_threshold_10_total_metric": 0.013800007104873658,
133
+ "tpp_threshold_10_intended_diff_only": 0.018199992179870606,
134
+ "tpp_threshold_10_unintended_diff_only": 0.004399985074996948,
135
+ "tpp_threshold_20_total_metric": 0.02359999716281891,
136
+ "tpp_threshold_20_intended_diff_only": 0.03039999008178711,
137
+ "tpp_threshold_20_unintended_diff_only": 0.0067999929189682005,
138
+ "tpp_threshold_50_total_metric": 0.06355001628398896,
139
+ "tpp_threshold_50_intended_diff_only": 0.07380000352859498,
140
+ "tpp_threshold_50_unintended_diff_only": 0.010249987244606018,
141
+ "tpp_threshold_100_total_metric": 0.12225001156330109,
142
+ "tpp_threshold_100_intended_diff_only": 0.1346000075340271,
143
+ "tpp_threshold_100_unintended_diff_only": 0.012349995970726012,
144
+ "tpp_threshold_500_total_metric": 0.25865002572536466,
145
+ "tpp_threshold_500_intended_diff_only": 0.27880001068115234,
146
+ "tpp_threshold_500_unintended_diff_only": 0.02014998495578766
147
+ }
148
+ ],
149
+ "sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
150
+ "sae_lens_id": "custom_sae",
151
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000",
152
+ "sae_lens_version": "5.5.0",
153
+ "sae_cfg_dict": {
154
+ "model_name": "gemma-2-2b",
155
+ "d_in": 2304,
156
+ "d_sae": 65536,
157
+ "hook_layer": 12,
158
+ "hook_name": "blocks.12.hook_resid_post",
159
+ "context_size": null,
160
+ "hook_head_index": null,
161
+ "architecture": "topk",
162
+ "apply_b_dec_to_input": null,
163
+ "finetuning_scaling_factor": null,
164
+ "activation_fn_str": "",
165
+ "prepend_bos": true,
166
+ "normalize_activations": "none",
167
+ "dtype": "bfloat16",
168
+ "device": "",
169
+ "dataset_path": "",
170
+ "dataset_trust_remote_code": true,
171
+ "seqpos_slice": [
172
+ null
173
+ ],
174
+ "training_tokens": -100000,
175
+ "sae_lens_training_version": null,
176
+ "neuronpedia_id": null
177
+ },
178
+ "eval_result_unstructured": {
179
+ "LabHC/bias_in_bios_class_set1": {
180
+ "0": {
181
+ "tpp_threshold_2_total_metric": 0.0020000338554382324,
182
+ "tpp_threshold_2_intended_diff_only": 0.006000041961669922,
183
+ "tpp_threshold_2_unintended_diff_only": 0.0040000081062316895,
184
+ "tpp_threshold_5_total_metric": 0.005250021815299988,
185
+ "tpp_threshold_5_intended_diff_only": 0.00700002908706665,
186
+ "tpp_threshold_5_unintended_diff_only": 0.0017500072717666626,
187
+ "tpp_threshold_10_total_metric": 0.014250040054321289,
188
+ "tpp_threshold_10_intended_diff_only": 0.01500004529953003,
189
+ "tpp_threshold_10_unintended_diff_only": 0.0007500052452087402,
190
+ "tpp_threshold_20_total_metric": 0.01675000786781311,
191
+ "tpp_threshold_20_intended_diff_only": 0.021000027656555176,
192
+ "tpp_threshold_20_unintended_diff_only": 0.004250019788742065,
193
+ "tpp_threshold_50_total_metric": 0.0417499840259552,
194
+ "tpp_threshold_50_intended_diff_only": 0.046999990940093994,
195
+ "tpp_threshold_50_unintended_diff_only": 0.005250006914138794,
196
+ "tpp_threshold_100_total_metric": 0.11149998009204865,
197
+ "tpp_threshold_100_intended_diff_only": 0.11599999666213989,
198
+ "tpp_threshold_100_unintended_diff_only": 0.0045000165700912476,
199
+ "tpp_threshold_500_total_metric": 0.37950001657009125,
200
+ "tpp_threshold_500_intended_diff_only": 0.3830000162124634,
201
+ "tpp_threshold_500_unintended_diff_only": 0.0034999996423721313
202
+ },
203
+ "1": {
204
+ "tpp_threshold_2_total_metric": 0.005749985575675964,
205
+ "tpp_threshold_2_intended_diff_only": 0.009000003337860107,
206
+ "tpp_threshold_2_unintended_diff_only": 0.003250017762184143,
207
+ "tpp_threshold_5_total_metric": -0.0010000020265579224,
208
+ "tpp_threshold_5_intended_diff_only": 0.0040000081062316895,
209
+ "tpp_threshold_5_unintended_diff_only": 0.005000010132789612,
210
+ "tpp_threshold_10_total_metric": 0.008250042796134949,
211
+ "tpp_threshold_10_intended_diff_only": 0.01100003719329834,
212
+ "tpp_threshold_10_unintended_diff_only": 0.002749994397163391,
213
+ "tpp_threshold_20_total_metric": 0.017250031232833862,
214
+ "tpp_threshold_20_intended_diff_only": 0.021000027656555176,
215
+ "tpp_threshold_20_unintended_diff_only": 0.0037499964237213135,
216
+ "tpp_threshold_50_total_metric": 0.03675006330013275,
217
+ "tpp_threshold_50_intended_diff_only": 0.04200005531311035,
218
+ "tpp_threshold_50_unintended_diff_only": 0.0052499920129776,
219
+ "tpp_threshold_100_total_metric": 0.08974997699260712,
220
+ "tpp_threshold_100_intended_diff_only": 0.09700000286102295,
221
+ "tpp_threshold_100_unintended_diff_only": 0.0072500258684158325,
222
+ "tpp_threshold_500_total_metric": 0.3217500150203705,
223
+ "tpp_threshold_500_intended_diff_only": 0.3360000252723694,
224
+ "tpp_threshold_500_unintended_diff_only": 0.014250010251998901
225
+ },
226
+ "2": {
227
+ "tpp_threshold_2_total_metric": 0.006749987602233887,
228
+ "tpp_threshold_2_intended_diff_only": 0.009999990463256836,
229
+ "tpp_threshold_2_unintended_diff_only": 0.0032500028610229492,
230
+ "tpp_threshold_5_total_metric": 0.021249979734420776,
231
+ "tpp_threshold_5_intended_diff_only": 0.02399998903274536,
232
+ "tpp_threshold_5_unintended_diff_only": 0.002750009298324585,
233
+ "tpp_threshold_10_total_metric": 0.04600001871585846,
234
+ "tpp_threshold_10_intended_diff_only": 0.04900002479553223,
235
+ "tpp_threshold_10_unintended_diff_only": 0.003000006079673767,
236
+ "tpp_threshold_20_total_metric": 0.07674998044967651,
237
+ "tpp_threshold_20_intended_diff_only": 0.07999998331069946,
238
+ "tpp_threshold_20_unintended_diff_only": 0.0032500028610229492,
239
+ "tpp_threshold_50_total_metric": 0.14174997806549072,
240
+ "tpp_threshold_50_intended_diff_only": 0.14499998092651367,
241
+ "tpp_threshold_50_unintended_diff_only": 0.0032500028610229492,
242
+ "tpp_threshold_100_total_metric": 0.1912500113248825,
243
+ "tpp_threshold_100_intended_diff_only": 0.1980000138282776,
244
+ "tpp_threshold_100_unintended_diff_only": 0.006750002503395081,
245
+ "tpp_threshold_500_total_metric": 0.38600003719329834,
246
+ "tpp_threshold_500_intended_diff_only": 0.39500004053115845,
247
+ "tpp_threshold_500_unintended_diff_only": 0.009000003337860107
248
+ },
249
+ "6": {
250
+ "tpp_threshold_2_total_metric": 0.0017499923706054688,
251
+ "tpp_threshold_2_intended_diff_only": 0.0009999871253967285,
252
+ "tpp_threshold_2_unintended_diff_only": -0.0007500052452087402,
253
+ "tpp_threshold_5_total_metric": 0.0010000169277191162,
254
+ "tpp_threshold_5_intended_diff_only": 0.0040000081062316895,
255
+ "tpp_threshold_5_unintended_diff_only": 0.0029999911785125732,
256
+ "tpp_threshold_10_total_metric": 0.005250021815299988,
257
+ "tpp_threshold_10_intended_diff_only": 0.00700002908706665,
258
+ "tpp_threshold_10_unintended_diff_only": 0.0017500072717666626,
259
+ "tpp_threshold_20_total_metric": 0.0022499561309814453,
260
+ "tpp_threshold_20_intended_diff_only": 0.0059999823570251465,
261
+ "tpp_threshold_20_unintended_diff_only": 0.003750026226043701,
262
+ "tpp_threshold_50_total_metric": 0.010250017046928406,
263
+ "tpp_threshold_50_intended_diff_only": 0.012000024318695068,
264
+ "tpp_threshold_50_unintended_diff_only": 0.0017500072717666626,
265
+ "tpp_threshold_100_total_metric": 0.017249971628189087,
266
+ "tpp_threshold_100_intended_diff_only": 0.018999993801116943,
267
+ "tpp_threshold_100_unintended_diff_only": 0.0017500221729278564,
268
+ "tpp_threshold_500_total_metric": 0.30149997770786285,
269
+ "tpp_threshold_500_intended_diff_only": 0.3059999942779541,
270
+ "tpp_threshold_500_unintended_diff_only": 0.0045000165700912476
271
+ },
272
+ "9": {
273
+ "tpp_threshold_2_total_metric": 0.007999956607818604,
274
+ "tpp_threshold_2_intended_diff_only": 0.010999977588653564,
275
+ "tpp_threshold_2_unintended_diff_only": 0.003000020980834961,
276
+ "tpp_threshold_5_total_metric": 0.005999952554702759,
277
+ "tpp_threshold_5_intended_diff_only": 0.007999956607818604,
278
+ "tpp_threshold_5_unintended_diff_only": 0.0020000040531158447,
279
+ "tpp_threshold_10_total_metric": 0.010499954223632812,
280
+ "tpp_threshold_10_intended_diff_only": 0.015999972820281982,
281
+ "tpp_threshold_10_unintended_diff_only": 0.00550001859664917,
282
+ "tpp_threshold_20_total_metric": 0.05475001037120819,
283
+ "tpp_threshold_20_intended_diff_only": 0.05900001525878906,
284
+ "tpp_threshold_20_unintended_diff_only": 0.004250004887580872,
285
+ "tpp_threshold_50_total_metric": 0.14349998533725739,
286
+ "tpp_threshold_50_intended_diff_only": 0.14899998903274536,
287
+ "tpp_threshold_50_unintended_diff_only": 0.005500003695487976,
288
+ "tpp_threshold_100_total_metric": 0.23075000941753387,
289
+ "tpp_threshold_100_intended_diff_only": 0.23900002241134644,
290
+ "tpp_threshold_100_unintended_diff_only": 0.008250012993812561,
291
+ "tpp_threshold_500_total_metric": 0.45749998092651367,
292
+ "tpp_threshold_500_intended_diff_only": 0.46399998664855957,
293
+ "tpp_threshold_500_unintended_diff_only": 0.0065000057220458984
294
+ }
295
+ },
296
+ "canrager/amazon_reviews_mcauley_1and5": {
297
+ "1": {
298
+ "tpp_threshold_2_total_metric": 0.0037500113248825073,
299
+ "tpp_threshold_2_intended_diff_only": 0.009000003337860107,
300
+ "tpp_threshold_2_unintended_diff_only": 0.0052499920129776,
301
+ "tpp_threshold_5_total_metric": 0.005499988794326782,
302
+ "tpp_threshold_5_intended_diff_only": 0.009999990463256836,
303
+ "tpp_threshold_5_unintended_diff_only": 0.004500001668930054,
304
+ "tpp_threshold_10_total_metric": 0.003250017762184143,
305
+ "tpp_threshold_10_intended_diff_only": 0.009000003337860107,
306
+ "tpp_threshold_10_unintended_diff_only": 0.005749985575675964,
307
+ "tpp_threshold_20_total_metric": -0.0022500157356262207,
308
+ "tpp_threshold_20_intended_diff_only": 0.0059999823570251465,
309
+ "tpp_threshold_20_unintended_diff_only": 0.008249998092651367,
310
+ "tpp_threshold_50_total_metric": 0.006999999284744263,
311
+ "tpp_threshold_50_intended_diff_only": 0.014999985694885254,
312
+ "tpp_threshold_50_unintended_diff_only": 0.007999986410140991,
313
+ "tpp_threshold_100_total_metric": 0.019749969244003296,
314
+ "tpp_threshold_100_intended_diff_only": 0.02899998426437378,
315
+ "tpp_threshold_100_unintended_diff_only": 0.009250015020370483,
316
+ "tpp_threshold_500_total_metric": 0.13574999570846558,
317
+ "tpp_threshold_500_intended_diff_only": 0.14899998903274536,
318
+ "tpp_threshold_500_unintended_diff_only": 0.013249993324279785
319
+ },
320
+ "2": {
321
+ "tpp_threshold_2_total_metric": -0.003500014543533325,
322
+ "tpp_threshold_2_intended_diff_only": 0.001999974250793457,
323
+ "tpp_threshold_2_unintended_diff_only": 0.005499988794326782,
324
+ "tpp_threshold_5_total_metric": 0.009999975562095642,
325
+ "tpp_threshold_5_intended_diff_only": 0.010999977588653564,
326
+ "tpp_threshold_5_unintended_diff_only": 0.0010000020265579224,
327
+ "tpp_threshold_10_total_metric": 0.008749976754188538,
328
+ "tpp_threshold_10_intended_diff_only": 0.011999964714050293,
329
+ "tpp_threshold_10_unintended_diff_only": 0.0032499879598617554,
330
+ "tpp_threshold_20_total_metric": 0.019249960780143738,
331
+ "tpp_threshold_20_intended_diff_only": 0.02599996328353882,
332
+ "tpp_threshold_20_unintended_diff_only": 0.006750002503395081,
333
+ "tpp_threshold_50_total_metric": 0.039250001311302185,
334
+ "tpp_threshold_50_intended_diff_only": 0.046999990940093994,
335
+ "tpp_threshold_50_unintended_diff_only": 0.007749989628791809,
336
+ "tpp_threshold_100_total_metric": 0.085999995470047,
337
+ "tpp_threshold_100_intended_diff_only": 0.09700000286102295,
338
+ "tpp_threshold_100_unintended_diff_only": 0.011000007390975952,
339
+ "tpp_threshold_500_total_metric": 0.2435000240802765,
340
+ "tpp_threshold_500_intended_diff_only": 0.2580000162124634,
341
+ "tpp_threshold_500_unintended_diff_only": 0.01449999213218689
342
+ },
343
+ "3": {
344
+ "tpp_threshold_2_total_metric": 0.004999995231628418,
345
+ "tpp_threshold_2_intended_diff_only": 0.0040000081062316895,
346
+ "tpp_threshold_2_unintended_diff_only": -0.0009999871253967285,
347
+ "tpp_threshold_5_total_metric": -0.0012499988079071045,
348
+ "tpp_threshold_5_intended_diff_only": 0.0009999871253967285,
349
+ "tpp_threshold_5_unintended_diff_only": 0.002249985933303833,
350
+ "tpp_threshold_10_total_metric": -0.003000006079673767,
351
+ "tpp_threshold_10_intended_diff_only": 0.001999974250793457,
352
+ "tpp_threshold_10_unintended_diff_only": 0.004999980330467224,
353
+ "tpp_threshold_20_total_metric": 0.010749995708465576,
354
+ "tpp_threshold_20_intended_diff_only": 0.015999972820281982,
355
+ "tpp_threshold_20_unintended_diff_only": 0.005249977111816406,
356
+ "tpp_threshold_50_total_metric": 0.02850000560283661,
357
+ "tpp_threshold_50_intended_diff_only": 0.03700000047683716,
358
+ "tpp_threshold_50_unintended_diff_only": 0.00849999487400055,
359
+ "tpp_threshold_100_total_metric": 0.056500017642974854,
360
+ "tpp_threshold_100_intended_diff_only": 0.06499999761581421,
361
+ "tpp_threshold_100_unintended_diff_only": 0.008499979972839355,
362
+ "tpp_threshold_500_total_metric": 0.19200001657009125,
363
+ "tpp_threshold_500_intended_diff_only": 0.22299998998641968,
364
+ "tpp_threshold_500_unintended_diff_only": 0.03099997341632843
365
+ },
366
+ "5": {
367
+ "tpp_threshold_2_total_metric": 0.0005000680685043335,
368
+ "tpp_threshold_2_intended_diff_only": 0.010000050067901611,
369
+ "tpp_threshold_2_unintended_diff_only": 0.009499981999397278,
370
+ "tpp_threshold_5_total_metric": 0.008750051259994507,
371
+ "tpp_threshold_5_intended_diff_only": 0.016000032424926758,
372
+ "tpp_threshold_5_unintended_diff_only": 0.007249981164932251,
373
+ "tpp_threshold_10_total_metric": 0.0297500342130661,
374
+ "tpp_threshold_10_intended_diff_only": 0.03600001335144043,
375
+ "tpp_threshold_10_unintended_diff_only": 0.006249979138374329,
376
+ "tpp_threshold_20_total_metric": 0.04075004160404205,
377
+ "tpp_threshold_20_intended_diff_only": 0.053000032901763916,
378
+ "tpp_threshold_20_unintended_diff_only": 0.012249991297721863,
379
+ "tpp_threshold_50_total_metric": 0.14325006306171417,
380
+ "tpp_threshold_50_intended_diff_only": 0.1640000343322754,
381
+ "tpp_threshold_50_unintended_diff_only": 0.020749971270561218,
382
+ "tpp_threshold_100_total_metric": 0.24500006437301636,
383
+ "tpp_threshold_100_intended_diff_only": 0.2690000534057617,
384
+ "tpp_threshold_100_unintended_diff_only": 0.02399998903274536,
385
+ "tpp_threshold_500_total_metric": 0.37525005638599396,
386
+ "tpp_threshold_500_intended_diff_only": 0.40000003576278687,
387
+ "tpp_threshold_500_unintended_diff_only": 0.024749979376792908
388
+ },
389
+ "6": {
390
+ "tpp_threshold_2_total_metric": 0.014249980449676514,
391
+ "tpp_threshold_2_intended_diff_only": 0.015999972820281982,
392
+ "tpp_threshold_2_unintended_diff_only": 0.0017499923706054688,
393
+ "tpp_threshold_5_total_metric": 0.03300000727176666,
394
+ "tpp_threshold_5_intended_diff_only": 0.03299999237060547,
395
+ "tpp_threshold_5_unintended_diff_only": -1.4901161193847656e-08,
396
+ "tpp_threshold_10_total_metric": 0.03025001287460327,
397
+ "tpp_threshold_10_intended_diff_only": 0.03200000524520874,
398
+ "tpp_threshold_10_unintended_diff_only": 0.0017499923706054688,
399
+ "tpp_threshold_20_total_metric": 0.0495000034570694,
400
+ "tpp_threshold_20_intended_diff_only": 0.050999999046325684,
401
+ "tpp_threshold_20_unintended_diff_only": 0.0014999955892562866,
402
+ "tpp_threshold_50_total_metric": 0.09975001215934753,
403
+ "tpp_threshold_50_intended_diff_only": 0.10600000619888306,
404
+ "tpp_threshold_50_unintended_diff_only": 0.0062499940395355225,
405
+ "tpp_threshold_100_total_metric": 0.20400001108646393,
406
+ "tpp_threshold_100_intended_diff_only": 0.21299999952316284,
407
+ "tpp_threshold_100_unintended_diff_only": 0.008999988436698914,
408
+ "tpp_threshold_500_total_metric": 0.34675003588199615,
409
+ "tpp_threshold_500_intended_diff_only": 0.36400002241134644,
410
+ "tpp_threshold_500_unintended_diff_only": 0.01724998652935028
411
+ }
412
+ }
413
+ }
414
+ }
eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,74 @@
1
+ {
2
+ "eval_type_id": "unlearning",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "wmdp-bio",
7
+ "high_school_us_history",
8
+ "college_computer_science",
9
+ "high_school_geography",
10
+ "human_aging"
11
+ ],
12
+ "intervention_method": "clamp_feature_activation",
13
+ "retain_thresholds": [
14
+ 0.001,
15
+ 0.01
16
+ ],
17
+ "n_features_list": [
18
+ 10,
19
+ 20
20
+ ],
21
+ "multipliers": [
22
+ 25,
23
+ 50,
24
+ 100,
25
+ 200
26
+ ],
27
+ "dataset_size": 1024,
28
+ "seq_len": 1024,
29
+ "n_batch_loss_added": 50,
30
+ "target_metric": "correct",
31
+ "save_metrics": true,
32
+ "model_name": "gemma-2-2b-it",
33
+ "llm_batch_size": 4,
34
+ "llm_dtype": "bfloat16"
35
+ },
36
+ "eval_id": "20465b2c-8176-41ec-9e49-9f00775b929b",
37
+ "datetime_epoch_millis": 1740206428701,
38
+ "eval_result_metrics": {
39
+ "unlearning": {
40
+ "unlearning_score": 0.058161377906799316
41
+ }
42
+ },
43
+ "eval_result_details": [],
44
+ "sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
45
+ "sae_lens_id": "custom_sae",
46
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000",
47
+ "sae_lens_version": "5.5.0",
48
+ "sae_cfg_dict": {
49
+ "model_name": "gemma-2-2b",
50
+ "d_in": 2304,
51
+ "d_sae": 65536,
52
+ "hook_layer": 12,
53
+ "hook_name": "blocks.12.hook_resid_post",
54
+ "context_size": null,
55
+ "hook_head_index": null,
56
+ "architecture": "topk",
57
+ "apply_b_dec_to_input": null,
58
+ "finetuning_scaling_factor": null,
59
+ "activation_fn_str": "",
60
+ "prepend_bos": true,
61
+ "normalize_activations": "none",
62
+ "dtype": "bfloat16",
63
+ "device": "",
64
+ "dataset_path": "",
65
+ "dataset_trust_remote_code": true,
66
+ "seqpos_slice": [
67
+ null
68
+ ],
69
+ "training_tokens": -100000,
70
+ "sae_lens_training_version": null,
71
+ "neuronpedia_id": null
72
+ },
73
+ "eval_result_unstructured": null
74
+ }
eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,74 @@
1
+ {
2
+ "eval_type_id": "unlearning",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "wmdp-bio",
7
+ "high_school_us_history",
8
+ "college_computer_science",
9
+ "high_school_geography",
10
+ "human_aging"
11
+ ],
12
+ "intervention_method": "clamp_feature_activation",
13
+ "retain_thresholds": [
14
+ 0.001,
15
+ 0.01
16
+ ],
17
+ "n_features_list": [
18
+ 10,
19
+ 20
20
+ ],
21
+ "multipliers": [
22
+ 25,
23
+ 50,
24
+ 100,
25
+ 200
26
+ ],
27
+ "dataset_size": 1024,
28
+ "seq_len": 1024,
29
+ "n_batch_loss_added": 50,
30
+ "target_metric": "correct",
31
+ "save_metrics": true,
32
+ "model_name": "gemma-2-2b-it",
33
+ "llm_batch_size": 4,
34
+ "llm_dtype": "bfloat16"
35
+ },
36
+ "eval_id": "11f1352e-ddcf-441e-97b5-61b2393a7154",
37
+ "datetime_epoch_millis": 1740207423992,
38
+ "eval_result_metrics": {
39
+ "unlearning": {
40
+ "unlearning_score": 0.028142571449279785
41
+ }
42
+ },
43
+ "eval_result_details": [],
44
+ "sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
45
+ "sae_lens_id": "custom_sae",
46
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000",
47
+ "sae_lens_version": "5.5.0",
48
+ "sae_cfg_dict": {
49
+ "model_name": "gemma-2-2b",
50
+ "d_in": 2304,
51
+ "d_sae": 65536,
52
+ "hook_layer": 12,
53
+ "hook_name": "blocks.12.hook_resid_post",
54
+ "context_size": null,
55
+ "hook_head_index": null,
56
+ "architecture": "topk",
57
+ "apply_b_dec_to_input": null,
58
+ "finetuning_scaling_factor": null,
59
+ "activation_fn_str": "",
60
+ "prepend_bos": true,
61
+ "normalize_activations": "none",
62
+ "dtype": "bfloat16",
63
+ "device": "",
64
+ "dataset_path": "",
65
+ "dataset_trust_remote_code": true,
66
+ "seqpos_slice": [
67
+ null
68
+ ],
69
+ "training_tokens": -100000,
70
+ "sae_lens_training_version": null,
71
+ "neuronpedia_id": null
72
+ },
73
+ "eval_result_unstructured": null
74
+ }
eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,74 @@
1
+ {
2
+ "eval_type_id": "unlearning",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "wmdp-bio",
7
+ "high_school_us_history",
8
+ "college_computer_science",
9
+ "high_school_geography",
10
+ "human_aging"
11
+ ],
12
+ "intervention_method": "clamp_feature_activation",
13
+ "retain_thresholds": [
14
+ 0.001,
15
+ 0.01
16
+ ],
17
+ "n_features_list": [
18
+ 10,
19
+ 20
20
+ ],
21
+ "multipliers": [
22
+ 25,
23
+ 50,
24
+ 100,
25
+ 200
26
+ ],
27
+ "dataset_size": 1024,
28
+ "seq_len": 1024,
29
+ "n_batch_loss_added": 50,
30
+ "target_metric": "correct",
31
+ "save_metrics": true,
32
+ "model_name": "gemma-2-2b-it",
33
+ "llm_batch_size": 4,
34
+ "llm_dtype": "bfloat16"
35
+ },
36
+ "eval_id": "725f3662-bdf4-4419-8888-9f841bac76ad",
37
+ "datetime_epoch_millis": 1740206898872,
38
+ "eval_result_metrics": {
39
+ "unlearning": {
40
+ "unlearning_score": 0.09380865097045898
41
+ }
42
+ },
43
+ "eval_result_details": [],
44
+ "sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
45
+ "sae_lens_id": "custom_sae",
46
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000",
47
+ "sae_lens_version": "5.5.0",
48
+ "sae_cfg_dict": {
49
+ "model_name": "gemma-2-2b",
50
+ "d_in": 2304,
51
+ "d_sae": 65536,
52
+ "hook_layer": 12,
53
+ "hook_name": "blocks.12.hook_resid_post",
54
+ "context_size": null,
55
+ "hook_head_index": null,
56
+ "architecture": "topk",
57
+ "apply_b_dec_to_input": null,
58
+ "finetuning_scaling_factor": null,
59
+ "activation_fn_str": "",
60
+ "prepend_bos": true,
61
+ "normalize_activations": "none",
62
+ "dtype": "bfloat16",
63
+ "device": "",
64
+ "dataset_path": "",
65
+ "dataset_trust_remote_code": true,
66
+ "seqpos_slice": [
67
+ null
68
+ ],
69
+ "training_tokens": -100000,
70
+ "sae_lens_training_version": null,
71
+ "neuronpedia_id": null
72
+ },
73
+ "eval_result_unstructured": null
74
+ }
eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json ADDED
@@ -0,0 +1,74 @@
1
+ {
2
+ "eval_type_id": "unlearning",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "wmdp-bio",
7
+ "high_school_us_history",
8
+ "college_computer_science",
9
+ "high_school_geography",
10
+ "human_aging"
11
+ ],
12
+ "intervention_method": "clamp_feature_activation",
13
+ "retain_thresholds": [
14
+ 0.001,
15
+ 0.01
16
+ ],
17
+ "n_features_list": [
18
+ 10,
19
+ 20
20
+ ],
21
+ "multipliers": [
22
+ 25,
23
+ 50,
24
+ 100,
25
+ 200
26
+ ],
27
+ "dataset_size": 1024,
28
+ "seq_len": 1024,
29
+ "n_batch_loss_added": 50,
30
+ "target_metric": "correct",
31
+ "save_metrics": true,
32
+ "model_name": "gemma-2-2b-it",
33
+ "llm_batch_size": 4,
34
+ "llm_dtype": "bfloat16"
35
+ },
36
+ "eval_id": "7c04fe70-57d3-4455-875d-f1b5132102b7",
37
+ "datetime_epoch_millis": 1740205938198,
38
+ "eval_result_metrics": {
39
+ "unlearning": {
40
+ "unlearning_score": 0.03752344846725464
41
+ }
42
+ },
43
+ "eval_result_details": [],
44
+ "sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
45
+ "sae_lens_id": "custom_sae",
46
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000",
47
+ "sae_lens_version": "5.5.0",
48
+ "sae_cfg_dict": {
49
+ "model_name": "gemma-2-2b",
50
+ "d_in": 2304,
51
+ "d_sae": 65536,
52
+ "hook_layer": 12,
53
+ "hook_name": "blocks.12.hook_resid_post",
54
+ "context_size": null,
55
+ "hook_head_index": null,
56
+ "architecture": "topk",
57
+ "apply_b_dec_to_input": null,
58
+ "finetuning_scaling_factor": null,
59
+ "activation_fn_str": "",
60
+ "prepend_bos": true,
61
+ "normalize_activations": "none",
62
+ "dtype": "bfloat16",
63
+ "device": "",
64
+ "dataset_path": "",
65
+ "dataset_trust_remote_code": true,
66
+ "seqpos_slice": [
67
+ null
68
+ ],
69
+ "training_tokens": -100000,
70
+ "sae_lens_training_version": null,
71
+ "neuronpedia_id": null
72
+ },
73
+ "eval_result_unstructured": null
74
+ }
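Each of the unlearning result files added above reduces to a single headline number at `eval_result_metrics.unlearning.unlearning_score`. As a quick way to compare the four trainer variants, a minimal sketch like the one below could pull that field out of a local checkout; the directory path and glob pattern are assumptions based on the file names in this commit, not part of the uploaded data.

```python
# Minimal sketch (not part of the upload): read the headline unlearning metric
# from the result files added in this commit. The local path and glob pattern
# below are assumptions based on the file names shown in the diff.
import json
from pathlib import Path

results_dir = Path("eval_results_from_scratch/unlearning")  # assumed checkout location

for path in sorted(results_dir.glob("*_custom_sae_eval_results.json")):
    with path.open() as f:
        result = json.load(f)
    release = result["sae_lens_release_id"]
    score = result["eval_result_metrics"]["unlearning"]["unlearning_score"]
    print(f"{release}: unlearning_score={score:.4f}")
```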