Add files using upload-large-folder tool
Browse files- .gitattributes +8 -0
- eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json +268 -0
- eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json +268 -0
- eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json +268 -0
- eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json +268 -0
- eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json +3 -0
- eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json +323 -0
- eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json +323 -0
- eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json +323 -0
- eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json +323 -0
- eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json +670 -0
- eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json +670 -0
- eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json +670 -0
- eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json +670 -0
- eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json +414 -0
- eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json +414 -0
- eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json +414 -0
- eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json +414 -0
- eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json +74 -0
- eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json +74 -0
- eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json +74 -0
- eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json +74 -0
.gitattributes
CHANGED
@@ -85,3 +85,11 @@ eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_16
|
|
85 |
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
86 |
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
87 |
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
86 |
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
87 |
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
88 |
+
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
89 |
+
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
90 |
+
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
91 |
+
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
92 |
+
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
93 |
+
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
94 |
+
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
95 |
+
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "24efad56-2464-4fc7-a72f-d8ee75af8889",
|
17 |
+
"datetime_epoch_millis": 1740196513100,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.07986090984327109,
|
21 |
+
"mean_full_absorption_score": 0.05125248877852254,
|
22 |
+
"mean_num_split_features": 1.1153846153846154,
|
23 |
+
"std_dev_absorption_fraction_score": 0.09178111702879053,
|
24 |
+
"std_dev_full_absorption_score": 0.06575471363588886,
|
25 |
+
"std_dev_num_split_features": 0.3258125936084211
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.028337216815245637,
|
32 |
+
"full_absorption_rate": 0.02044753086419753,
|
33 |
+
"num_full_absorption": 53,
|
34 |
+
"num_probe_true_positives": 2592,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.0018002834587106894,
|
40 |
+
"full_absorption_rate": 0.0031308703819661866,
|
41 |
+
"num_full_absorption": 5,
|
42 |
+
"num_probe_true_positives": 1597,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.25239018623716264,
|
48 |
+
"full_absorption_rate": 0.16895803183791605,
|
49 |
+
"num_full_absorption": 467,
|
50 |
+
"num_probe_true_positives": 2764,
|
51 |
+
"num_split_features": 1
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.21202494352131684,
|
56 |
+
"full_absorption_rate": 0.1261904761904762,
|
57 |
+
"num_full_absorption": 212,
|
58 |
+
"num_probe_true_positives": 1680,
|
59 |
+
"num_split_features": 1
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.1599800731731651,
|
64 |
+
"full_absorption_rate": 0.09768009768009768,
|
65 |
+
"num_full_absorption": 160,
|
66 |
+
"num_probe_true_positives": 1638,
|
67 |
+
"num_split_features": 1
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.08533490557098641,
|
72 |
+
"full_absorption_rate": 0.030032467532467532,
|
73 |
+
"num_full_absorption": 37,
|
74 |
+
"num_probe_true_positives": 1232,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.01762098136284656,
|
80 |
+
"full_absorption_rate": 0.011583011583011582,
|
81 |
+
"num_full_absorption": 12,
|
82 |
+
"num_probe_true_positives": 1036,
|
83 |
+
"num_split_features": 2
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.017284095093548002,
|
88 |
+
"full_absorption_rate": 0.004945598417408506,
|
89 |
+
"num_full_absorption": 5,
|
90 |
+
"num_probe_true_positives": 1011,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.21843708576706966,
|
96 |
+
"full_absorption_rate": 0.1754278728606357,
|
97 |
+
"num_full_absorption": 287,
|
98 |
+
"num_probe_true_positives": 1636,
|
99 |
+
"num_split_features": 2
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.002670673585986552,
|
104 |
+
"full_absorption_rate": 0.004576659038901602,
|
105 |
+
"num_full_absorption": 2,
|
106 |
+
"num_probe_true_positives": 437,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.0009236539068285315,
|
112 |
+
"full_absorption_rate": 0.004379562043795621,
|
113 |
+
"num_full_absorption": 3,
|
114 |
+
"num_probe_true_positives": 685,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.09799425185837078,
|
120 |
+
"full_absorption_rate": 0.05155482815057283,
|
121 |
+
"num_full_absorption": 63,
|
122 |
+
"num_probe_true_positives": 1222,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.00829845786933557,
|
128 |
+
"full_absorption_rate": 0.011791128579449747,
|
129 |
+
"num_full_absorption": 21,
|
130 |
+
"num_probe_true_positives": 1781,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.07769420525556889,
|
136 |
+
"full_absorption_rate": 0.03904282115869018,
|
137 |
+
"num_full_absorption": 31,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.09555476160586396,
|
144 |
+
"full_absorption_rate": 0.0508637236084453,
|
145 |
+
"num_full_absorption": 53,
|
146 |
+
"num_probe_true_positives": 1042,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.31828712961790506,
|
152 |
+
"full_absorption_rate": 0.23675357443229605,
|
153 |
+
"num_full_absorption": 563,
|
154 |
+
"num_probe_true_positives": 2378,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.0,
|
160 |
+
"full_absorption_rate": 0.0,
|
161 |
+
"num_full_absorption": 0,
|
162 |
+
"num_probe_true_positives": 180,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.1849465735936363,
|
168 |
+
"full_absorption_rate": 0.14627994955863807,
|
169 |
+
"num_full_absorption": 232,
|
170 |
+
"num_probe_true_positives": 1586,
|
171 |
+
"num_split_features": 1
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.13957874484003893,
|
176 |
+
"full_absorption_rate": 0.05545927209705372,
|
177 |
+
"num_full_absorption": 160,
|
178 |
+
"num_probe_true_positives": 2885,
|
179 |
+
"num_split_features": 1
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.016794345867815363,
|
184 |
+
"full_absorption_rate": 0.0029222676797194622,
|
185 |
+
"num_full_absorption": 5,
|
186 |
+
"num_probe_true_positives": 1711,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.016594745549382752,
|
192 |
+
"full_absorption_rate": 0.011811023622047244,
|
193 |
+
"num_full_absorption": 9,
|
194 |
+
"num_probe_true_positives": 762,
|
195 |
+
"num_split_features": 2
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.0009117410273845578,
|
200 |
+
"full_absorption_rate": 0.0012642225031605564,
|
201 |
+
"num_full_absorption": 1,
|
202 |
+
"num_probe_true_positives": 791,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.05020130296913499,
|
208 |
+
"full_absorption_rate": 0.038461538461538464,
|
209 |
+
"num_full_absorption": 26,
|
210 |
+
"num_probe_true_positives": 676,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.025096508373739165,
|
216 |
+
"full_absorption_rate": 0.0,
|
217 |
+
"num_full_absorption": 0,
|
218 |
+
"num_probe_true_positives": 85,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.04695990283036635,
|
224 |
+
"full_absorption_rate": 0.03067484662576687,
|
225 |
+
"num_full_absorption": 5,
|
226 |
+
"num_probe_true_positives": 163,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.0006668861736390532,
|
232 |
+
"full_absorption_rate": 0.008333333333333333,
|
233 |
+
"num_full_absorption": 2,
|
234 |
+
"num_probe_true_positives": 240,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000",
|
241 |
+
"sae_lens_version": "5.5.0",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "2075299c-bf70-47cc-9514-e6ff4f19bb19",
|
17 |
+
"datetime_epoch_millis": 1740198096538,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.3657679681798776,
|
21 |
+
"mean_full_absorption_score": 0.4750635142269141,
|
22 |
+
"mean_num_split_features": 4.884615384615385,
|
23 |
+
"std_dev_absorption_fraction_score": 0.15876472157027252,
|
24 |
+
"std_dev_full_absorption_score": 0.17905931379618015,
|
25 |
+
"std_dev_num_split_features": 2.303509028884811
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.42724656970095964,
|
32 |
+
"full_absorption_rate": 0.46103395061728397,
|
33 |
+
"num_full_absorption": 1195,
|
34 |
+
"num_probe_true_positives": 2592,
|
35 |
+
"num_split_features": 9
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.6019079325716311,
|
40 |
+
"full_absorption_rate": 0.7013149655604258,
|
41 |
+
"num_full_absorption": 1120,
|
42 |
+
"num_probe_true_positives": 1597,
|
43 |
+
"num_split_features": 4
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.4751708724747572,
|
48 |
+
"full_absorption_rate": 0.6425470332850941,
|
49 |
+
"num_full_absorption": 1776,
|
50 |
+
"num_probe_true_positives": 2764,
|
51 |
+
"num_split_features": 8
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.45846605418570807,
|
56 |
+
"full_absorption_rate": 0.5779761904761904,
|
57 |
+
"num_full_absorption": 971,
|
58 |
+
"num_probe_true_positives": 1680,
|
59 |
+
"num_split_features": 6
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.46395411641440426,
|
64 |
+
"full_absorption_rate": 0.5500610500610501,
|
65 |
+
"num_full_absorption": 901,
|
66 |
+
"num_probe_true_positives": 1638,
|
67 |
+
"num_split_features": 6
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.4687472478503038,
|
72 |
+
"full_absorption_rate": 0.5787337662337663,
|
73 |
+
"num_full_absorption": 713,
|
74 |
+
"num_probe_true_positives": 1232,
|
75 |
+
"num_split_features": 7
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.5120442444837915,
|
80 |
+
"full_absorption_rate": 0.6621621621621622,
|
81 |
+
"num_full_absorption": 686,
|
82 |
+
"num_probe_true_positives": 1036,
|
83 |
+
"num_split_features": 4
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.3676333184285843,
|
88 |
+
"full_absorption_rate": 0.4540059347181009,
|
89 |
+
"num_full_absorption": 459,
|
90 |
+
"num_probe_true_positives": 1011,
|
91 |
+
"num_split_features": 5
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.354487705235415,
|
96 |
+
"full_absorption_rate": 0.597799511002445,
|
97 |
+
"num_full_absorption": 978,
|
98 |
+
"num_probe_true_positives": 1636,
|
99 |
+
"num_split_features": 4
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.2174656084226122,
|
104 |
+
"full_absorption_rate": 0.30663615560640733,
|
105 |
+
"num_full_absorption": 134,
|
106 |
+
"num_probe_true_positives": 437,
|
107 |
+
"num_split_features": 4
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.10844747417238029,
|
112 |
+
"full_absorption_rate": 0.21021897810218979,
|
113 |
+
"num_full_absorption": 144,
|
114 |
+
"num_probe_true_positives": 685,
|
115 |
+
"num_split_features": 3
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.3507451077733594,
|
120 |
+
"full_absorption_rate": 0.41080196399345337,
|
121 |
+
"num_full_absorption": 502,
|
122 |
+
"num_probe_true_positives": 1222,
|
123 |
+
"num_split_features": 6
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.4872792444667222,
|
128 |
+
"full_absorption_rate": 0.6254912970241437,
|
129 |
+
"num_full_absorption": 1114,
|
130 |
+
"num_probe_true_positives": 1781,
|
131 |
+
"num_split_features": 6
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.35957856873086463,
|
136 |
+
"full_absorption_rate": 0.44962216624685136,
|
137 |
+
"num_full_absorption": 357,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 4
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.3379671565466484,
|
144 |
+
"full_absorption_rate": 0.5930902111324377,
|
145 |
+
"num_full_absorption": 618,
|
146 |
+
"num_probe_true_positives": 1042,
|
147 |
+
"num_split_features": 4
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.6400315149674903,
|
152 |
+
"full_absorption_rate": 0.6703111858704794,
|
153 |
+
"num_full_absorption": 1594,
|
154 |
+
"num_probe_true_positives": 2378,
|
155 |
+
"num_split_features": 8
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.14011099917784295,
|
160 |
+
"full_absorption_rate": 0.21666666666666667,
|
161 |
+
"num_full_absorption": 39,
|
162 |
+
"num_probe_true_positives": 180,
|
163 |
+
"num_split_features": 2
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.5426072959801443,
|
168 |
+
"full_absorption_rate": 0.6204287515762925,
|
169 |
+
"num_full_absorption": 984,
|
170 |
+
"num_probe_true_positives": 1586,
|
171 |
+
"num_split_features": 4
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.5787110174312542,
|
176 |
+
"full_absorption_rate": 0.638474870017331,
|
177 |
+
"num_full_absorption": 1842,
|
178 |
+
"num_probe_true_positives": 2885,
|
179 |
+
"num_split_features": 9
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.4268379794908569,
|
184 |
+
"full_absorption_rate": 0.47808299240210406,
|
185 |
+
"num_full_absorption": 818,
|
186 |
+
"num_probe_true_positives": 1711,
|
187 |
+
"num_split_features": 7
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.23241182613430145,
|
192 |
+
"full_absorption_rate": 0.5183727034120735,
|
193 |
+
"num_full_absorption": 395,
|
194 |
+
"num_probe_true_positives": 762,
|
195 |
+
"num_split_features": 4
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.16568475161369217,
|
200 |
+
"full_absorption_rate": 0.31352718078381797,
|
201 |
+
"num_full_absorption": 248,
|
202 |
+
"num_probe_true_positives": 791,
|
203 |
+
"num_split_features": 6
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.33654925106797057,
|
208 |
+
"full_absorption_rate": 0.5976331360946746,
|
209 |
+
"num_full_absorption": 404,
|
210 |
+
"num_probe_true_positives": 676,
|
211 |
+
"num_split_features": 4
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.18780014519391397,
|
216 |
+
"full_absorption_rate": 0.09411764705882353,
|
217 |
+
"num_full_absorption": 8,
|
218 |
+
"num_probe_true_positives": 85,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.13871370413905518,
|
224 |
+
"full_absorption_rate": 0.15337423312883436,
|
225 |
+
"num_full_absorption": 25,
|
226 |
+
"num_probe_true_positives": 163,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.12936746602215327,
|
232 |
+
"full_absorption_rate": 0.22916666666666666,
|
233 |
+
"num_full_absorption": 55,
|
234 |
+
"num_probe_true_positives": 240,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000",
|
241 |
+
"sae_lens_version": "5.5.0",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "fdaa2ac8-02d7-4f25-8e15-9292e5b6ab72",
|
17 |
+
"datetime_epoch_millis": 1740197304227,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.4712541829224532,
|
21 |
+
"mean_full_absorption_score": 0.5038687495232235,
|
22 |
+
"mean_num_split_features": 3.423076923076923,
|
23 |
+
"std_dev_absorption_fraction_score": 0.2176478202450696,
|
24 |
+
"std_dev_full_absorption_score": 0.23505900853406136,
|
25 |
+
"std_dev_num_split_features": 1.747525723371806
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.5003749369137354,
|
32 |
+
"full_absorption_rate": 0.42746913580246915,
|
33 |
+
"num_full_absorption": 1108,
|
34 |
+
"num_probe_true_positives": 2592,
|
35 |
+
"num_split_features": 6
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.6772489720427498,
|
40 |
+
"full_absorption_rate": 0.7025673137132122,
|
41 |
+
"num_full_absorption": 1122,
|
42 |
+
"num_probe_true_positives": 1597,
|
43 |
+
"num_split_features": 5
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.6902377294696528,
|
48 |
+
"full_absorption_rate": 0.7373371924746743,
|
49 |
+
"num_full_absorption": 2038,
|
50 |
+
"num_probe_true_positives": 2764,
|
51 |
+
"num_split_features": 6
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.5995046475493223,
|
56 |
+
"full_absorption_rate": 0.7053571428571429,
|
57 |
+
"num_full_absorption": 1185,
|
58 |
+
"num_probe_true_positives": 1680,
|
59 |
+
"num_split_features": 5
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.5325509916043534,
|
64 |
+
"full_absorption_rate": 0.5738705738705738,
|
65 |
+
"num_full_absorption": 940,
|
66 |
+
"num_probe_true_positives": 1638,
|
67 |
+
"num_split_features": 4
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.6960053818306784,
|
72 |
+
"full_absorption_rate": 0.698051948051948,
|
73 |
+
"num_full_absorption": 860,
|
74 |
+
"num_probe_true_positives": 1232,
|
75 |
+
"num_split_features": 6
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.6161939126364128,
|
80 |
+
"full_absorption_rate": 0.6998069498069498,
|
81 |
+
"num_full_absorption": 725,
|
82 |
+
"num_probe_true_positives": 1036,
|
83 |
+
"num_split_features": 4
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.522152641756954,
|
88 |
+
"full_absorption_rate": 0.5588526211671613,
|
89 |
+
"num_full_absorption": 565,
|
90 |
+
"num_probe_true_positives": 1011,
|
91 |
+
"num_split_features": 4
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.5450258516855058,
|
96 |
+
"full_absorption_rate": 0.7377750611246944,
|
97 |
+
"num_full_absorption": 1207,
|
98 |
+
"num_probe_true_positives": 1636,
|
99 |
+
"num_split_features": 2
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.34805945001583916,
|
104 |
+
"full_absorption_rate": 0.30892448512585813,
|
105 |
+
"num_full_absorption": 135,
|
106 |
+
"num_probe_true_positives": 437,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.12442012974468852,
|
112 |
+
"full_absorption_rate": 0.16496350364963502,
|
113 |
+
"num_full_absorption": 113,
|
114 |
+
"num_probe_true_positives": 685,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.5697497391831701,
|
120 |
+
"full_absorption_rate": 0.6039279869067103,
|
121 |
+
"num_full_absorption": 738,
|
122 |
+
"num_probe_true_positives": 1222,
|
123 |
+
"num_split_features": 4
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.6777915542597618,
|
128 |
+
"full_absorption_rate": 0.7422796181920269,
|
129 |
+
"num_full_absorption": 1322,
|
130 |
+
"num_probe_true_positives": 1781,
|
131 |
+
"num_split_features": 3
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.535242959534133,
|
136 |
+
"full_absorption_rate": 0.5289672544080605,
|
137 |
+
"num_full_absorption": 420,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 4
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.36678618140490626,
|
144 |
+
"full_absorption_rate": 0.45681381957773515,
|
145 |
+
"num_full_absorption": 476,
|
146 |
+
"num_probe_true_positives": 1042,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.7778720671159048,
|
152 |
+
"full_absorption_rate": 0.7653490328006728,
|
153 |
+
"num_full_absorption": 1820,
|
154 |
+
"num_probe_true_positives": 2378,
|
155 |
+
"num_split_features": 4
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.30341799266122393,
|
160 |
+
"full_absorption_rate": 0.25,
|
161 |
+
"num_full_absorption": 45,
|
162 |
+
"num_probe_true_positives": 180,
|
163 |
+
"num_split_features": 2
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.5465185579059652,
|
168 |
+
"full_absorption_rate": 0.6141235813366961,
|
169 |
+
"num_full_absorption": 974,
|
170 |
+
"num_probe_true_positives": 1586,
|
171 |
+
"num_split_features": 5
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.7579124661305765,
|
176 |
+
"full_absorption_rate": 0.7632582322357019,
|
177 |
+
"num_full_absorption": 2202,
|
178 |
+
"num_probe_true_positives": 2885,
|
179 |
+
"num_split_features": 4
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.6113684686981942,
|
184 |
+
"full_absorption_rate": 0.6060783167738165,
|
185 |
+
"num_full_absorption": 1037,
|
186 |
+
"num_probe_true_positives": 1711,
|
187 |
+
"num_split_features": 5
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.15134342914968058,
|
192 |
+
"full_absorption_rate": 0.29396325459317585,
|
193 |
+
"num_full_absorption": 224,
|
194 |
+
"num_probe_true_positives": 762,
|
195 |
+
"num_split_features": 2
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.3218639830617784,
|
200 |
+
"full_absorption_rate": 0.37926675094816686,
|
201 |
+
"num_full_absorption": 300,
|
202 |
+
"num_probe_true_positives": 791,
|
203 |
+
"num_split_features": 5
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.4937193345242905,
|
208 |
+
"full_absorption_rate": 0.5872781065088757,
|
209 |
+
"num_full_absorption": 397,
|
210 |
+
"num_probe_true_positives": 676,
|
211 |
+
"num_split_features": 3
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.08372842617309774,
|
216 |
+
"full_absorption_rate": 0.011764705882352941,
|
217 |
+
"num_full_absorption": 1,
|
218 |
+
"num_probe_true_positives": 85,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.18045047824415694,
|
224 |
+
"full_absorption_rate": 0.15337423312883436,
|
225 |
+
"num_full_absorption": 25,
|
226 |
+
"num_probe_true_positives": 163,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.02306847268705111,
|
232 |
+
"full_absorption_rate": 0.029166666666666667,
|
233 |
+
"num_full_absorption": 7,
|
234 |
+
"num_probe_true_positives": 240,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000",
|
241 |
+
"sae_lens_version": "5.5.0",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "0acc62cf-3e4f-4222-bb14-633ee7a3b344",
|
17 |
+
"datetime_epoch_millis": 1740195695924,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.2956776053959005,
|
21 |
+
"mean_full_absorption_score": 0.30409644176184997,
|
22 |
+
"mean_num_split_features": 1.5384615384615385,
|
23 |
+
"std_dev_absorption_fraction_score": 0.20040407999198337,
|
24 |
+
"std_dev_full_absorption_score": 0.2151226601623177,
|
25 |
+
"std_dev_num_split_features": 1.2076678096486377
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.37592651585433173,
|
32 |
+
"full_absorption_rate": 0.41435185185185186,
|
33 |
+
"num_full_absorption": 1074,
|
34 |
+
"num_probe_true_positives": 2592,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.3279063865305123,
|
40 |
+
"full_absorption_rate": 0.2686286787726988,
|
41 |
+
"num_full_absorption": 429,
|
42 |
+
"num_probe_true_positives": 1597,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.6351888347806534,
|
48 |
+
"full_absorption_rate": 0.6772793053545586,
|
49 |
+
"num_full_absorption": 1872,
|
50 |
+
"num_probe_true_positives": 2764,
|
51 |
+
"num_split_features": 2
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.35041653145091295,
|
56 |
+
"full_absorption_rate": 0.3755952380952381,
|
57 |
+
"num_full_absorption": 631,
|
58 |
+
"num_probe_true_positives": 1680,
|
59 |
+
"num_split_features": 2
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.38590072216293675,
|
64 |
+
"full_absorption_rate": 0.43223443223443225,
|
65 |
+
"num_full_absorption": 708,
|
66 |
+
"num_probe_true_positives": 1638,
|
67 |
+
"num_split_features": 3
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.4643418922512006,
|
72 |
+
"full_absorption_rate": 0.450487012987013,
|
73 |
+
"num_full_absorption": 555,
|
74 |
+
"num_probe_true_positives": 1232,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.18654193040142905,
|
80 |
+
"full_absorption_rate": 0.1805019305019305,
|
81 |
+
"num_full_absorption": 187,
|
82 |
+
"num_probe_true_positives": 1036,
|
83 |
+
"num_split_features": 1
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.16308714809331934,
|
88 |
+
"full_absorption_rate": 0.1552917903066271,
|
89 |
+
"num_full_absorption": 157,
|
90 |
+
"num_probe_true_positives": 1011,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.4303391938395123,
|
96 |
+
"full_absorption_rate": 0.5623471882640587,
|
97 |
+
"num_full_absorption": 920,
|
98 |
+
"num_probe_true_positives": 1636,
|
99 |
+
"num_split_features": 3
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.011237078817775636,
|
104 |
+
"full_absorption_rate": 0.016018306636155607,
|
105 |
+
"num_full_absorption": 7,
|
106 |
+
"num_probe_true_positives": 437,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.011207543180270212,
|
112 |
+
"full_absorption_rate": 0.027737226277372264,
|
113 |
+
"num_full_absorption": 19,
|
114 |
+
"num_probe_true_positives": 685,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.37143542853909217,
|
120 |
+
"full_absorption_rate": 0.37561374795417346,
|
121 |
+
"num_full_absorption": 459,
|
122 |
+
"num_probe_true_positives": 1222,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.5060310945441595,
|
128 |
+
"full_absorption_rate": 0.5384615384615384,
|
129 |
+
"num_full_absorption": 959,
|
130 |
+
"num_probe_true_positives": 1781,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.3325042979830935,
|
136 |
+
"full_absorption_rate": 0.29345088161209065,
|
137 |
+
"num_full_absorption": 233,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 4
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.225787069641184,
|
144 |
+
"full_absorption_rate": 0.2744721689059501,
|
145 |
+
"num_full_absorption": 286,
|
146 |
+
"num_probe_true_positives": 1042,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.7353015458594555,
|
152 |
+
"full_absorption_rate": 0.7283431455004206,
|
153 |
+
"num_full_absorption": 1732,
|
154 |
+
"num_probe_true_positives": 2378,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.019577668010399153,
|
160 |
+
"full_absorption_rate": 0.022222222222222223,
|
161 |
+
"num_full_absorption": 4,
|
162 |
+
"num_probe_true_positives": 180,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.40605372278006896,
|
168 |
+
"full_absorption_rate": 0.45081967213114754,
|
169 |
+
"num_full_absorption": 715,
|
170 |
+
"num_probe_true_positives": 1586,
|
171 |
+
"num_split_features": 1
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.5598209126630196,
|
176 |
+
"full_absorption_rate": 0.49740034662045063,
|
177 |
+
"num_full_absorption": 1435,
|
178 |
+
"num_probe_true_positives": 2885,
|
179 |
+
"num_split_features": 6
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.35543202018081804,
|
184 |
+
"full_absorption_rate": 0.31209818819403856,
|
185 |
+
"num_full_absorption": 534,
|
186 |
+
"num_probe_true_positives": 1711,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.3328107394590775,
|
192 |
+
"full_absorption_rate": 0.4671916010498688,
|
193 |
+
"num_full_absorption": 356,
|
194 |
+
"num_probe_true_positives": 762,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.054853668853618774,
|
200 |
+
"full_absorption_rate": 0.061946902654867256,
|
201 |
+
"num_full_absorption": 49,
|
202 |
+
"num_probe_true_positives": 791,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.21344579189584933,
|
208 |
+
"full_absorption_rate": 0.23372781065088757,
|
209 |
+
"num_full_absorption": 158,
|
210 |
+
"num_probe_true_positives": 676,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.1554455330699017,
|
216 |
+
"full_absorption_rate": 0.0,
|
217 |
+
"num_full_absorption": 0,
|
218 |
+
"num_probe_true_positives": 85,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.07682376512280702,
|
224 |
+
"full_absorption_rate": 0.0736196319018405,
|
225 |
+
"num_full_absorption": 12,
|
226 |
+
"num_probe_true_positives": 163,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.0002007043280144281,
|
232 |
+
"full_absorption_rate": 0.016666666666666666,
|
233 |
+
"num_full_absorption": 4,
|
234 |
+
"num_probe_true_positives": 240,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000",
|
241 |
+
"sae_lens_version": "5.5.0",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bf17d005ae5204272b4c662cb93db6a0bcde577a14e57310a7e2109e39b69107
|
3 |
+
size 25991318
|
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3b6ea85cd6c507c1763e9dd60c7e44b2df4bd2f6b1de54e785d1000c4b498bcb
|
3 |
+
size 25835979
|
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d7478f4580ec833ae1ad9c530373287986c4503a95360ca57bd6e5890fc9fd24
|
3 |
+
size 25528227
|
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7a9a8db4efe22a9bf8a3e7aa395e2f22f7e993697e7ede40fb5172370ee56c4e
|
3 |
+
size 25820187
|
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3df2b9e39a5d68fedd246832bc864c25d7818f47ab55f9d3bf510a45d50ff462
|
3 |
+
size 21796194
|
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bd7f12343cbfa1d8071df229a205f103952c4afa96d5375f935381e52a4c8ba7
|
3 |
+
size 21209969
|
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:741a051b33165047cd4820f6d6fe7053639e0ba7554cb65b3064828cfbeb18c5
|
3 |
+
size 21598398
|
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:564b11414d3b95135bf4757715dd45bb98953d953656ebb5e9b6b62baf2d6231
|
3 |
+
size 21713941
|
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "scr",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": true,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "5a1c4654-7650-404c-b3f2-2c2e726acd7b",
|
73 |
+
"datetime_epoch_millis": 1740201072988,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"scr_metrics": {
|
76 |
+
"scr_dir1_threshold_2": 0.20733915350751708,
|
77 |
+
"scr_metric_threshold_2": 0.11930950078221424,
|
78 |
+
"scr_dir2_threshold_2": 0.12406375737178194,
|
79 |
+
"scr_dir1_threshold_5": 0.22061339364104127,
|
80 |
+
"scr_metric_threshold_5": 0.17187754396157023,
|
81 |
+
"scr_dir2_threshold_5": 0.18389215216748134,
|
82 |
+
"scr_dir1_threshold_10": 0.27419770961645995,
|
83 |
+
"scr_metric_threshold_10": 0.21920456810879457,
|
84 |
+
"scr_dir2_threshold_10": 0.23032246687146835,
|
85 |
+
"scr_dir1_threshold_20": 0.3095246723854987,
|
86 |
+
"scr_metric_threshold_20": 0.2684238692342566,
|
87 |
+
"scr_dir2_threshold_20": 0.27399307227914804,
|
88 |
+
"scr_dir1_threshold_50": 0.2617380192076716,
|
89 |
+
"scr_metric_threshold_50": 0.37010005307771054,
|
90 |
+
"scr_dir2_threshold_50": 0.37541888018379954,
|
91 |
+
"scr_dir1_threshold_100": 0.12272683889285801,
|
92 |
+
"scr_metric_threshold_100": 0.42974841039466505,
|
93 |
+
"scr_dir2_threshold_100": 0.4320246013307986,
|
94 |
+
"scr_dir1_threshold_500": -0.0015497384482677581,
|
95 |
+
"scr_metric_threshold_500": 0.35600559749255273,
|
96 |
+
"scr_dir2_threshold_500": 0.36566734575390525
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
|
102 |
+
"scr_dir1_threshold_2": 0.46031747533501616,
|
103 |
+
"scr_metric_threshold_2": 0.009828028179079856,
|
104 |
+
"scr_dir2_threshold_2": 0.009828028179079856,
|
105 |
+
"scr_dir1_threshold_5": 0.5238097040201937,
|
106 |
+
"scr_metric_threshold_5": 0.024570143672067307,
|
107 |
+
"scr_dir2_threshold_5": 0.024570143672067307,
|
108 |
+
"scr_dir1_threshold_10": 0.555555345309774,
|
109 |
+
"scr_metric_threshold_10": 0.0319410549698257,
|
110 |
+
"scr_dir2_threshold_10": 0.0319410549698257,
|
111 |
+
"scr_dir1_threshold_20": 0.5873019327053712,
|
112 |
+
"scr_metric_threshold_20": 0.061425139507065275,
|
113 |
+
"scr_dir2_threshold_20": 0.061425139507065275,
|
114 |
+
"scr_dir1_threshold_50": 0.5396825246649839,
|
115 |
+
"scr_metric_threshold_50": 0.16953315657947304,
|
116 |
+
"scr_dir2_threshold_50": 0.16953315657947304,
|
117 |
+
"scr_dir1_threshold_100": 0.3650786592542414,
|
118 |
+
"scr_metric_threshold_100": 0.21621618059355086,
|
119 |
+
"scr_dir2_threshold_100": 0.21621618059355086,
|
120 |
+
"scr_dir1_threshold_500": 0.3968252466498386,
|
121 |
+
"scr_metric_threshold_500": 0.26535632148895016,
|
122 |
+
"scr_dir2_threshold_500": 0.26535632148895016
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
|
126 |
+
"scr_dir1_threshold_2": 0.19191917367472955,
|
127 |
+
"scr_metric_threshold_2": 0.09915005123804593,
|
128 |
+
"scr_dir2_threshold_2": 0.09915005123804593,
|
129 |
+
"scr_dir1_threshold_5": 0.19191917367472955,
|
130 |
+
"scr_metric_threshold_5": 0.18130315345731438,
|
131 |
+
"scr_dir2_threshold_5": 0.18130315345731438,
|
132 |
+
"scr_dir1_threshold_10": 0.2626259524704024,
|
133 |
+
"scr_metric_threshold_10": 0.21246456436179043,
|
134 |
+
"scr_dir2_threshold_10": 0.21246456436179043,
|
135 |
+
"scr_dir1_threshold_20": 0.34343421572310684,
|
136 |
+
"scr_metric_threshold_20": 0.2294616822322599,
|
137 |
+
"scr_dir2_threshold_20": 0.2294616822322599,
|
138 |
+
"scr_dir1_threshold_50": 0.3232324509435598,
|
139 |
+
"scr_metric_threshold_50": 0.3456090201924673,
|
140 |
+
"scr_dir2_threshold_50": 0.3456090201924673,
|
141 |
+
"scr_dir1_threshold_100": 0.09090914564247801,
|
142 |
+
"scr_metric_threshold_100": 0.43909342175758737,
|
143 |
+
"scr_dir2_threshold_100": 0.43909342175758737,
|
144 |
+
"scr_dir1_threshold_500": -0.48484897744896877,
|
145 |
+
"scr_metric_threshold_500": 0.16997168525977072,
|
146 |
+
"scr_dir2_threshold_500": 0.16997168525977072
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
|
150 |
+
"scr_dir1_threshold_2": 0.532258142045613,
|
151 |
+
"scr_metric_threshold_2": 0.020202059731676766,
|
152 |
+
"scr_dir2_threshold_2": 0.020202059731676766,
|
153 |
+
"scr_dir1_threshold_5": 0.5483869727270193,
|
154 |
+
"scr_metric_threshold_5": 0.03282830943478244,
|
155 |
+
"scr_dir2_threshold_5": 0.03282830943478244,
|
156 |
+
"scr_dir1_threshold_10": 0.5645158034084256,
|
157 |
+
"scr_metric_threshold_10": 0.07070705854409946,
|
158 |
+
"scr_dir2_threshold_10": 0.07070705854409946,
|
159 |
+
"scr_dir1_threshold_20": 0.5806446340898319,
|
160 |
+
"scr_metric_threshold_20": 0.1010101481416146,
|
161 |
+
"scr_dir2_threshold_20": 0.1010101481416146,
|
162 |
+
"scr_dir1_threshold_50": 0.3870967431817548,
|
163 |
+
"scr_metric_threshold_50": 0.22474757585217325,
|
164 |
+
"scr_dir2_threshold_50": 0.22474757585217325,
|
165 |
+
"scr_dir1_threshold_100": 0.35483812045334157,
|
166 |
+
"scr_metric_threshold_100": 0.3055555137453419,
|
167 |
+
"scr_dir2_threshold_100": 0.3055555137453419,
|
168 |
+
"scr_dir1_threshold_500": 0.2741930056807093,
|
169 |
+
"scr_metric_threshold_500": 0.12373742771055866,
|
170 |
+
"scr_dir2_threshold_500": 0.12373742771055866
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
|
174 |
+
"scr_dir1_threshold_2": 0.2845529991791037,
|
175 |
+
"scr_metric_threshold_2": 0.07624623538069193,
|
176 |
+
"scr_dir2_threshold_2": 0.07624623538069193,
|
177 |
+
"scr_dir1_threshold_5": 0.25203261487944845,
|
178 |
+
"scr_metric_threshold_5": 0.17008790375159719,
|
179 |
+
"scr_dir2_threshold_5": 0.17008790375159719,
|
180 |
+
"scr_dir1_threshold_10": 0.3333333333333333,
|
181 |
+
"scr_metric_threshold_10": 0.23460408353041323,
|
182 |
+
"scr_dir2_threshold_10": 0.23460408353041323,
|
183 |
+
"scr_dir1_threshold_20": 0.1707318964255635,
|
184 |
+
"scr_metric_threshold_20": 0.32551323800084947,
|
185 |
+
"scr_dir2_threshold_20": 0.32551323800084947,
|
186 |
+
"scr_dir1_threshold_50": -0.05691030908151685,
|
187 |
+
"scr_metric_threshold_50": 0.4193549063717547,
|
188 |
+
"scr_dir2_threshold_50": 0.4193549063717547,
|
189 |
+
"scr_dir1_threshold_100": -0.2682925647340228,
|
190 |
+
"scr_metric_threshold_100": 0.5073313721480223,
|
191 |
+
"scr_dir2_threshold_100": 0.5073313721480223,
|
192 |
+
"scr_dir1_threshold_500": -1.0243899247818615,
|
193 |
+
"scr_metric_threshold_500": 0.1260996708634634,
|
194 |
+
"scr_dir2_threshold_500": 0.1260996708634634
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
|
198 |
+
"scr_dir1_threshold_2": 0.016393554752069144,
|
199 |
+
"scr_metric_threshold_2": 0.417968888243172,
|
200 |
+
"scr_dir2_threshold_2": 0.417968888243172,
|
201 |
+
"scr_dir1_threshold_5": 0.021857964433295084,
|
202 |
+
"scr_metric_threshold_5": 0.496093800931695,
|
203 |
+
"scr_dir2_threshold_5": 0.496093800931695,
|
204 |
+
"scr_dir1_threshold_10": 0.06557389329988525,
|
205 |
+
"scr_metric_threshold_10": 0.5742187136202179,
|
206 |
+
"scr_dir2_threshold_10": 0.5742187136202179,
|
207 |
+
"scr_dir1_threshold_20": 0.18579220912042124,
|
208 |
+
"scr_metric_threshold_20": 0.6406250291038257,
|
209 |
+
"scr_dir2_threshold_20": 0.6406250291038257,
|
210 |
+
"scr_dir1_threshold_50": 0.14754101564344832,
|
211 |
+
"scr_metric_threshold_50": 0.7148437427240436,
|
212 |
+
"scr_dir2_threshold_50": 0.7148437427240436,
|
213 |
+
"scr_dir1_threshold_100": -0.36612033426800783,
|
214 |
+
"scr_metric_threshold_100": 0.621093800931695,
|
215 |
+
"scr_dir2_threshold_100": 0.621093800931695,
|
216 |
+
"scr_dir1_threshold_500": -0.3060108506493485,
|
217 |
+
"scr_metric_threshold_500": 0.703124912688523,
|
218 |
+
"scr_dir2_threshold_500": 0.703124912688523
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
|
222 |
+
"scr_dir1_threshold_2": 0.07179486709233554,
|
223 |
+
"scr_metric_threshold_2": 0.060483807005948444,
|
224 |
+
"scr_dir2_threshold_2": 0.060483807005948444,
|
225 |
+
"scr_dir1_threshold_5": 0.07179486709233554,
|
226 |
+
"scr_metric_threshold_5": 0.0927419335456348,
|
227 |
+
"scr_dir2_threshold_5": 0.0927419335456348,
|
228 |
+
"scr_dir1_threshold_10": 0.19487173844135988,
|
229 |
+
"scr_metric_threshold_10": 0.1370969176230247,
|
230 |
+
"scr_dir2_threshold_10": 0.1370969176230247,
|
231 |
+
"scr_dir1_threshold_20": 0.24615374269804866,
|
232 |
+
"scr_metric_threshold_20": 0.16129039235714715,
|
233 |
+
"scr_dir2_threshold_20": 0.16129039235714715,
|
234 |
+
"scr_dir1_threshold_50": 0.3179486097903842,
|
235 |
+
"scr_metric_threshold_50": 0.31854833864087,
|
236 |
+
"scr_dir2_threshold_50": 0.31854833864087,
|
237 |
+
"scr_dir1_threshold_100": 0.3179486097903842,
|
238 |
+
"scr_metric_threshold_100": 0.4838709367301568,
|
239 |
+
"scr_dir2_threshold_100": 0.4838709367301568,
|
240 |
+
"scr_dir1_threshold_500": 0.4358973418467109,
|
241 |
+
"scr_metric_threshold_500": 0.5403225380039657,
|
242 |
+
"scr_dir2_threshold_500": 0.5403225380039657
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
|
246 |
+
"scr_dir1_threshold_2": 0.05855860935384807,
|
247 |
+
"scr_metric_threshold_2": 0.22767852985167758,
|
248 |
+
"scr_dir2_threshold_2": 0.22767852985167758,
|
249 |
+
"scr_dir1_threshold_5": 0.10360361086007354,
|
250 |
+
"scr_metric_threshold_5": 0.3258928654582359,
|
251 |
+
"scr_dir2_threshold_5": 0.3258928654582359,
|
252 |
+
"scr_dir1_threshold_10": 0.14414416591355395,
|
253 |
+
"scr_metric_threshold_10": 0.4196427074660393,
|
254 |
+
"scr_dir2_threshold_10": 0.4196427074660393,
|
255 |
+
"scr_dir1_threshold_20": 0.2162161146256372,
|
256 |
+
"scr_metric_threshold_20": 0.4821426908352818,
|
257 |
+
"scr_dir2_threshold_20": 0.4821426908352818,
|
258 |
+
"scr_dir1_threshold_50": 0.22072082956776967,
|
259 |
+
"scr_metric_threshold_50": 0.5535713953099135,
|
260 |
+
"scr_dir2_threshold_50": 0.5535713953099135,
|
261 |
+
"scr_dir1_threshold_100": 0.23423416892600485,
|
262 |
+
"scr_metric_threshold_100": 0.6116071511725216,
|
263 |
+
"scr_dir2_threshold_100": 0.6116071511725216,
|
264 |
+
"scr_dir1_threshold_500": 0.4684683378520097,
|
265 |
+
"scr_metric_threshold_500": 0.691964310660422,
|
266 |
+
"scr_dir2_threshold_500": 0.691964310660422
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
|
270 |
+
"scr_dir1_threshold_2": 0.042918406627421406,
|
271 |
+
"scr_metric_threshold_2": 0.042918406627421406,
|
272 |
+
"scr_dir2_threshold_2": 0.08095245934396302,
|
273 |
+
"scr_dir1_threshold_5": 0.05150224144123495,
|
274 |
+
"scr_metric_threshold_5": 0.05150224144123495,
|
275 |
+
"scr_dir2_threshold_5": 0.14761910708852366,
|
276 |
+
"scr_dir1_threshold_10": 0.07296144475494565,
|
277 |
+
"scr_metric_threshold_10": 0.07296144475494565,
|
278 |
+
"scr_dir2_threshold_10": 0.1619046348563358,
|
279 |
+
"scr_dir1_threshold_20": 0.1459226336960092,
|
280 |
+
"scr_metric_threshold_20": 0.1459226336960092,
|
281 |
+
"scr_dir2_threshold_20": 0.19047625805514054,
|
282 |
+
"scr_dir1_threshold_50": 0.21459228895098914,
|
283 |
+
"scr_metric_threshold_50": 0.21459228895098914,
|
284 |
+
"scr_dir2_threshold_50": 0.25714290579970117,
|
285 |
+
"scr_dir1_threshold_100": 0.2532189060784448,
|
286 |
+
"scr_metric_threshold_100": 0.2532189060784448,
|
287 |
+
"scr_dir2_threshold_100": 0.27142843356751334,
|
288 |
+
"scr_dir1_threshold_500": 0.2274679132647684,
|
289 |
+
"scr_metric_threshold_500": 0.2274679132647684,
|
290 |
+
"scr_dir2_threshold_500": 0.3047618993555888
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
|
294 |
+
"sae_lens_id": "custom_sae",
|
295 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000",
|
296 |
+
"sae_lens_version": "5.5.0",
|
297 |
+
"sae_cfg_dict": {
|
298 |
+
"model_name": "gemma-2-2b",
|
299 |
+
"d_in": 2304,
|
300 |
+
"d_sae": 65536,
|
301 |
+
"hook_layer": 12,
|
302 |
+
"hook_name": "blocks.12.hook_resid_post",
|
303 |
+
"context_size": null,
|
304 |
+
"hook_head_index": null,
|
305 |
+
"architecture": "topk",
|
306 |
+
"apply_b_dec_to_input": null,
|
307 |
+
"finetuning_scaling_factor": null,
|
308 |
+
"activation_fn_str": "",
|
309 |
+
"prepend_bos": true,
|
310 |
+
"normalize_activations": "none",
|
311 |
+
"dtype": "bfloat16",
|
312 |
+
"device": "",
|
313 |
+
"dataset_path": "",
|
314 |
+
"dataset_trust_remote_code": true,
|
315 |
+
"seqpos_slice": [
|
316 |
+
null
|
317 |
+
],
|
318 |
+
"training_tokens": -100000,
|
319 |
+
"sae_lens_training_version": null,
|
320 |
+
"neuronpedia_id": null
|
321 |
+
},
|
322 |
+
"eval_result_unstructured": null
|
323 |
+
}
|
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "scr",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": true,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "755bb8ea-c7ff-4fa3-97a2-31b4cab1f07b",
|
73 |
+
"datetime_epoch_millis": 1740201946199,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"scr_metrics": {
|
76 |
+
"scr_dir1_threshold_2": 0.12652894537606835,
|
77 |
+
"scr_metric_threshold_2": 0.0907272315324803,
|
78 |
+
"scr_dir2_threshold_2": 0.09369576167212271,
|
79 |
+
"scr_dir1_threshold_5": 0.17040759173008782,
|
80 |
+
"scr_metric_threshold_5": 0.15632193837867778,
|
81 |
+
"scr_dir2_threshold_5": 0.1618476987026072,
|
82 |
+
"scr_dir1_threshold_10": 0.16350941954437448,
|
83 |
+
"scr_metric_threshold_10": 0.21441616120887247,
|
84 |
+
"scr_dir2_threshold_10": 0.21743575905034218,
|
85 |
+
"scr_dir1_threshold_20": 0.13848990984582848,
|
86 |
+
"scr_metric_threshold_20": 0.25758062356316186,
|
87 |
+
"scr_dir2_threshold_20": 0.2642891875768084,
|
88 |
+
"scr_dir1_threshold_50": 0.04031027019135072,
|
89 |
+
"scr_metric_threshold_50": 0.31428981658344224,
|
90 |
+
"scr_dir2_threshold_50": 0.32129216728011795,
|
91 |
+
"scr_dir1_threshold_100": -0.05118550304462503,
|
92 |
+
"scr_metric_threshold_100": 0.2917722375027481,
|
93 |
+
"scr_dir2_threshold_100": 0.3017507752966668,
|
94 |
+
"scr_dir1_threshold_500": -0.173869850419616,
|
95 |
+
"scr_metric_threshold_500": 0.27581343492090155,
|
96 |
+
"scr_dir2_threshold_500": 0.28674743879422065
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
|
102 |
+
"scr_dir1_threshold_2": 0.2857136099242737,
|
103 |
+
"scr_metric_threshold_2": 0.004914087313907592,
|
104 |
+
"scr_dir2_threshold_2": 0.004914087313907592,
|
105 |
+
"scr_dir1_threshold_5": 0.333333017964661,
|
106 |
+
"scr_metric_threshold_5": 0.0319410549698257,
|
107 |
+
"scr_dir2_threshold_5": 0.0319410549698257,
|
108 |
+
"scr_dir1_threshold_10": 0.3492058386094512,
|
109 |
+
"scr_metric_threshold_10": 0.04914014089539928,
|
110 |
+
"scr_dir2_threshold_10": 0.04914014089539928,
|
111 |
+
"scr_dir1_threshold_20": 0.2698407892794835,
|
112 |
+
"scr_metric_threshold_20": 0.11793619170022296,
|
113 |
+
"scr_dir2_threshold_20": 0.11793619170022296,
|
114 |
+
"scr_dir1_threshold_50": 0.23809514798990317,
|
115 |
+
"scr_metric_threshold_50": 0.15724815796780706,
|
116 |
+
"scr_dir2_threshold_50": 0.15724815796780706,
|
117 |
+
"scr_dir1_threshold_100": 0.11111069061954806,
|
118 |
+
"scr_metric_threshold_100": 0.17444724389338062,
|
119 |
+
"scr_dir2_threshold_100": 0.17444724389338062,
|
120 |
+
"scr_dir1_threshold_500": 0.04761846193437052,
|
121 |
+
"scr_metric_threshold_500": 0.11547922126763682,
|
122 |
+
"scr_dir2_threshold_500": 0.11547922126763682
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
|
126 |
+
"scr_dir1_threshold_2": 0.07070677879567282,
|
127 |
+
"scr_metric_threshold_2": 0.11898016279667031,
|
128 |
+
"scr_dir2_threshold_2": 0.11898016279667031,
|
129 |
+
"scr_dir1_threshold_5": 0.15151504204837732,
|
130 |
+
"scr_metric_threshold_5": 0.18130315345731438,
|
131 |
+
"scr_dir2_threshold_5": 0.18130315345731438,
|
132 |
+
"scr_dir1_threshold_10": 0.1313126752015721,
|
133 |
+
"scr_metric_threshold_10": 0.23796032559334063,
|
134 |
+
"scr_dir2_threshold_10": 0.23796032559334063,
|
135 |
+
"scr_dir1_threshold_20": 0.06060589640589929,
|
136 |
+
"scr_metric_threshold_20": 0.26062326198842795,
|
137 |
+
"scr_dir2_threshold_20": 0.26062326198842795,
|
138 |
+
"scr_dir1_threshold_50": -0.3333339354005915,
|
139 |
+
"scr_metric_threshold_50": 0.32861190232199783,
|
140 |
+
"scr_dir2_threshold_50": 0.32861190232199783,
|
141 |
+
"scr_dir1_threshold_100": -0.3333339354005915,
|
142 |
+
"scr_metric_threshold_100": 0.1869688031302402,
|
143 |
+
"scr_dir2_threshold_100": 0.1869688031302402,
|
144 |
+
"scr_dir1_threshold_500": -0.41414159658603783,
|
145 |
+
"scr_metric_threshold_500": 0.26062326198842795,
|
146 |
+
"scr_dir2_threshold_500": 0.26062326198842795
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
|
150 |
+
"scr_dir1_threshold_2": 0.3870967431817548,
|
151 |
+
"scr_metric_threshold_2": 0.010101029865838383,
|
152 |
+
"scr_dir2_threshold_2": 0.010101029865838383,
|
153 |
+
"scr_dir1_threshold_5": 0.40322557386316116,
|
154 |
+
"scr_metric_threshold_5": 0.042929339300620824,
|
155 |
+
"scr_dir2_threshold_5": 0.042929339300620824,
|
156 |
+
"scr_dir1_threshold_10": 0.40322557386316116,
|
157 |
+
"scr_metric_threshold_10": 0.07828286857267056,
|
158 |
+
"scr_dir2_threshold_10": 0.07828286857267056,
|
159 |
+
"scr_dir1_threshold_20": 0.37096695113474787,
|
160 |
+
"scr_metric_threshold_20": 0.1085859581701857,
|
161 |
+
"scr_dir2_threshold_20": 0.1085859581701857,
|
162 |
+
"scr_dir1_threshold_50": 0.03225766136281265,
|
163 |
+
"scr_metric_threshold_50": 0.17424242652298136,
|
164 |
+
"scr_dir2_threshold_50": 0.17424242652298136,
|
165 |
+
"scr_dir1_threshold_100": 0.016128830681406324,
|
166 |
+
"scr_metric_threshold_100": 0.2045455161204965,
|
167 |
+
"scr_dir2_threshold_100": 0.2045455161204965,
|
168 |
+
"scr_dir1_threshold_500": -0.20967768295508404,
|
169 |
+
"scr_metric_threshold_500": 0.0959595579503108,
|
170 |
+
"scr_dir2_threshold_500": 0.0959595579503108
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
|
174 |
+
"scr_dir1_threshold_2": 0.10569112782625298,
|
175 |
+
"scr_metric_threshold_2": 0.17008790375159719,
|
176 |
+
"scr_dir2_threshold_2": 0.17008790375159719,
|
177 |
+
"scr_dir1_threshold_5": 0.22764220550708036,
|
178 |
+
"scr_metric_threshold_5": 0.208211108838793,
|
179 |
+
"scr_dir2_threshold_5": 0.208211108838793,
|
180 |
+
"scr_dir1_threshold_10": 0.08130071845388491,
|
181 |
+
"scr_metric_threshold_10": 0.27859231641854704,
|
182 |
+
"scr_dir2_threshold_10": 0.27859231641854704,
|
183 |
+
"scr_dir1_threshold_20": 0.12195107768082737,
|
184 |
+
"scr_metric_threshold_20": 0.34310849619736306,
|
185 |
+
"scr_dir2_threshold_20": 0.34310849619736306,
|
186 |
+
"scr_dir1_threshold_50": 0.06504076859931053,
|
187 |
+
"scr_metric_threshold_50": 0.39882695948107244,
|
188 |
+
"scr_dir2_threshold_50": 0.39882695948107244,
|
189 |
+
"scr_dir1_threshold_100": -0.5121947200956776,
|
190 |
+
"scr_metric_threshold_100": 0.4134897037771171,
|
191 |
+
"scr_dir2_threshold_100": 0.4134897037771171,
|
192 |
+
"scr_dir1_threshold_500": -0.7154470008208963,
|
193 |
+
"scr_metric_threshold_500": 0.23167156962994426,
|
194 |
+
"scr_dir2_threshold_500": 0.23167156962994426
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
|
198 |
+
"scr_dir1_threshold_2": 0.03278678379574697,
|
199 |
+
"scr_metric_threshold_2": 0.14062502910382568,
|
200 |
+
"scr_dir2_threshold_2": 0.14062502910382568,
|
201 |
+
"scr_dir1_threshold_5": 0.03825119347697291,
|
202 |
+
"scr_metric_threshold_5": 0.33984374272404355,
|
203 |
+
"scr_dir2_threshold_5": 0.33984374272404355,
|
204 |
+
"scr_dir1_threshold_10": 0.06010915791026799,
|
205 |
+
"scr_metric_threshold_10": 0.4570313445874335,
|
206 |
+
"scr_dir2_threshold_10": 0.4570313445874335,
|
207 |
+
"scr_dir1_threshold_20": 0.01092881936245188,
|
208 |
+
"scr_metric_threshold_20": 0.5117188300355207,
|
209 |
+
"scr_dir2_threshold_20": 0.5117188300355207,
|
210 |
+
"scr_dir1_threshold_50": -0.06010915791026799,
|
211 |
+
"scr_metric_threshold_50": 0.5859375436557386,
|
212 |
+
"scr_dir2_threshold_50": 0.5859375436557386,
|
213 |
+
"scr_dir1_threshold_100": -0.14754101564344832,
|
214 |
+
"scr_metric_threshold_100": 0.61718760186339,
|
215 |
+
"scr_dir2_threshold_100": 0.61718760186339,
|
216 |
+
"scr_dir1_threshold_500": -0.35519118919716464,
|
217 |
+
"scr_metric_threshold_500": 0.6562500582076514,
|
218 |
+
"scr_dir2_threshold_500": 0.6562500582076514
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
|
222 |
+
"scr_dir1_threshold_2": 0.05128200425668879,
|
223 |
+
"scr_metric_threshold_2": 0.06854845881151235,
|
224 |
+
"scr_dir2_threshold_2": 0.06854845881151235,
|
225 |
+
"scr_dir1_threshold_5": 0.07692300638503319,
|
226 |
+
"scr_metric_threshold_5": 0.10483879108333834,
|
227 |
+
"scr_dir2_threshold_5": 0.10483879108333834,
|
228 |
+
"scr_dir1_threshold_10": 0.06666642213478169,
|
229 |
+
"scr_metric_threshold_10": 0.1491935348194436,
|
230 |
+
"scr_dir2_threshold_10": 0.1491935348194436,
|
231 |
+
"scr_dir1_threshold_20": 0.03076914142104203,
|
232 |
+
"scr_metric_threshold_20": 0.2137097878988163,
|
233 |
+
"scr_dir2_threshold_20": 0.2137097878988163,
|
234 |
+
"scr_dir1_threshold_50": 0.06666642213478169,
|
235 |
+
"scr_metric_threshold_50": 0.2983870696388872,
|
236 |
+
"scr_dir2_threshold_50": 0.2983870696388872,
|
237 |
+
"scr_dir1_threshold_100": 0.09743586922067994,
|
238 |
+
"scr_metric_threshold_100": 0.23790326263293876,
|
239 |
+
"scr_dir2_threshold_100": 0.23790326263293876,
|
240 |
+
"scr_dir1_threshold_500": 0.07179486709233554,
|
241 |
+
"scr_metric_threshold_500": 0.31048392717659073,
|
242 |
+
"scr_dir2_threshold_500": 0.31048392717659073
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
|
246 |
+
"scr_dir1_threshold_2": 0.03603610860073534,
|
247 |
+
"scr_metric_threshold_2": 0.16964277398906946,
|
248 |
+
"scr_dir2_threshold_2": 0.16964277398906946,
|
249 |
+
"scr_dir1_threshold_5": 0.07657666365421574,
|
250 |
+
"scr_metric_threshold_5": 0.2857142857142857,
|
251 |
+
"scr_dir2_threshold_5": 0.2857142857142857,
|
252 |
+
"scr_dir1_threshold_10": 0.12612611161318626,
|
253 |
+
"scr_metric_threshold_10": 0.37499990021545476,
|
254 |
+
"scr_dir2_threshold_10": 0.37499990021545476,
|
255 |
+
"scr_dir1_threshold_20": 0.14414416591355395,
|
256 |
+
"scr_metric_threshold_20": 0.4062500249461363,
|
257 |
+
"scr_dir2_threshold_20": 0.4062500249461363,
|
258 |
+
"scr_dir1_threshold_50": 0.19369361387252446,
|
259 |
+
"scr_metric_threshold_50": 0.4508928321967208,
|
260 |
+
"scr_dir2_threshold_50": 0.4508928321967208,
|
261 |
+
"scr_dir1_threshold_100": 0.23873861537874994,
|
262 |
+
"scr_metric_threshold_100": 0.37946412772208915,
|
263 |
+
"scr_dir2_threshold_100": 0.37946412772208915,
|
264 |
+
"scr_dir1_threshold_500": 0.0810811101069608,
|
265 |
+
"scr_metric_threshold_500": 0.43303565607806294,
|
266 |
+
"scr_dir2_threshold_500": 0.43303565607806294
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
|
270 |
+
"scr_dir1_threshold_2": 0.042918406627421406,
|
271 |
+
"scr_metric_threshold_2": 0.042918406627421406,
|
272 |
+
"scr_dir2_threshold_2": 0.06666664774456064,
|
273 |
+
"scr_dir1_threshold_5": 0.05579403094120067,
|
274 |
+
"scr_metric_threshold_5": 0.05579403094120067,
|
275 |
+
"scr_dir2_threshold_5": 0.10000011353263609,
|
276 |
+
"scr_dir1_threshold_10": 0.09012885856869063,
|
277 |
+
"scr_metric_threshold_10": 0.09012885856869063,
|
278 |
+
"scr_dir2_threshold_10": 0.11428564130044823,
|
279 |
+
"scr_dir1_threshold_20": 0.09871243756862208,
|
280 |
+
"scr_metric_threshold_20": 0.09871243756862208,
|
281 |
+
"scr_dir2_threshold_20": 0.1523809496777944,
|
282 |
+
"scr_dir1_threshold_50": 0.12017164088233277,
|
283 |
+
"scr_metric_threshold_50": 0.12017164088233277,
|
284 |
+
"scr_dir2_threshold_50": 0.17619044645573817,
|
285 |
+
"scr_dir1_threshold_100": 0.12017164088233277,
|
286 |
+
"scr_metric_threshold_100": 0.12017164088233277,
|
287 |
+
"scr_dir2_threshold_100": 0.19999994323368195,
|
288 |
+
"scr_dir1_threshold_500": 0.10300422706858779,
|
289 |
+
"scr_metric_threshold_500": 0.10300422706858779,
|
290 |
+
"scr_dir2_threshold_500": 0.19047625805514054
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
|
294 |
+
"sae_lens_id": "custom_sae",
|
295 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000",
|
296 |
+
"sae_lens_version": "5.5.0",
|
297 |
+
"sae_cfg_dict": {
|
298 |
+
"model_name": "gemma-2-2b",
|
299 |
+
"d_in": 2304,
|
300 |
+
"d_sae": 65536,
|
301 |
+
"hook_layer": 12,
|
302 |
+
"hook_name": "blocks.12.hook_resid_post",
|
303 |
+
"context_size": null,
|
304 |
+
"hook_head_index": null,
|
305 |
+
"architecture": "topk",
|
306 |
+
"apply_b_dec_to_input": null,
|
307 |
+
"finetuning_scaling_factor": null,
|
308 |
+
"activation_fn_str": "",
|
309 |
+
"prepend_bos": true,
|
310 |
+
"normalize_activations": "none",
|
311 |
+
"dtype": "bfloat16",
|
312 |
+
"device": "",
|
313 |
+
"dataset_path": "",
|
314 |
+
"dataset_trust_remote_code": true,
|
315 |
+
"seqpos_slice": [
|
316 |
+
null
|
317 |
+
],
|
318 |
+
"training_tokens": -100000,
|
319 |
+
"sae_lens_training_version": null,
|
320 |
+
"neuronpedia_id": null
|
321 |
+
},
|
322 |
+
"eval_result_unstructured": null
|
323 |
+
}
|
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "scr",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": true,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "abc92c56-c28d-443e-a67d-598723b6587c",
|
73 |
+
"datetime_epoch_millis": 1740201509460,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"scr_metrics": {
|
76 |
+
"scr_dir1_threshold_2": 0.18683727638317182,
|
77 |
+
"scr_metric_threshold_2": 0.08404231131018772,
|
78 |
+
"scr_dir2_threshold_2": 0.08224127094506128,
|
79 |
+
"scr_dir1_threshold_5": 0.2253563520666828,
|
80 |
+
"scr_metric_threshold_5": 0.13959451018246774,
|
81 |
+
"scr_dir2_threshold_5": 0.1419013964046876,
|
82 |
+
"scr_dir1_threshold_10": 0.2258952276339641,
|
83 |
+
"scr_metric_threshold_10": 0.1895312922582373,
|
84 |
+
"scr_dir2_threshold_10": 0.19236695973140355,
|
85 |
+
"scr_dir1_threshold_20": 0.24793378644576078,
|
86 |
+
"scr_metric_threshold_20": 0.25111043795982735,
|
87 |
+
"scr_dir2_threshold_20": 0.2575763149690073,
|
88 |
+
"scr_dir1_threshold_50": 0.1565240815441055,
|
89 |
+
"scr_metric_threshold_50": 0.3408563171460069,
|
90 |
+
"scr_dir2_threshold_50": 0.34439707475977116,
|
91 |
+
"scr_dir1_threshold_100": 0.0584345012425294,
|
92 |
+
"scr_metric_threshold_100": 0.3621166850027227,
|
93 |
+
"scr_dir2_threshold_100": 0.36213712487414773,
|
94 |
+
"scr_dir1_threshold_500": 0.00039541249184461835,
|
95 |
+
"scr_metric_threshold_500": 0.31633581888474427,
|
96 |
+
"scr_dir2_threshold_500": 0.31587085277050087
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
|
102 |
+
"scr_dir1_threshold_2": 0.3968252466498386,
|
103 |
+
"scr_metric_threshold_2": 0.009828028179079856,
|
104 |
+
"scr_dir2_threshold_2": 0.009828028179079856,
|
105 |
+
"scr_dir1_threshold_5": 0.42857088793941894,
|
106 |
+
"scr_metric_threshold_5": 0.03439802540241183,
|
107 |
+
"scr_dir2_threshold_5": 0.03439802540241183,
|
108 |
+
"scr_dir1_threshold_10": 0.3968252466498386,
|
109 |
+
"scr_metric_threshold_10": 0.05405408176057155,
|
110 |
+
"scr_dir2_threshold_10": 0.05405408176057155,
|
111 |
+
"scr_dir1_threshold_20": 0.41269806729462877,
|
112 |
+
"scr_metric_threshold_20": 0.11302210438631537,
|
113 |
+
"scr_dir2_threshold_20": 0.11302210438631537,
|
114 |
+
"scr_dir1_threshold_50": 0.3809524260050484,
|
115 |
+
"scr_metric_threshold_50": 0.13759224805838266,
|
116 |
+
"scr_dir2_threshold_50": 0.13759224805838266,
|
117 |
+
"scr_dir1_threshold_100": 0.333333017964661,
|
118 |
+
"scr_metric_threshold_100": 0.16216224528171463,
|
119 |
+
"scr_dir2_threshold_100": 0.16216224528171463,
|
120 |
+
"scr_dir1_threshold_500": -0.1269844573703551,
|
121 |
+
"scr_metric_threshold_500": 0.1031940762072355,
|
122 |
+
"scr_dir2_threshold_500": 0.1031940762072355
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
|
126 |
+
"scr_dir1_threshold_2": 0.10101002803225154,
|
127 |
+
"scr_metric_threshold_2": 0.1416430991917576,
|
128 |
+
"scr_dir2_threshold_2": 0.1416430991917576,
|
129 |
+
"scr_dir1_threshold_5": 0.16161592443815084,
|
130 |
+
"scr_metric_threshold_5": 0.21813038288640824,
|
131 |
+
"scr_dir2_threshold_5": 0.21813038288640824,
|
132 |
+
"scr_dir1_threshold_10": 0.15151504204837732,
|
133 |
+
"scr_metric_threshold_10": 0.27478755502243446,
|
134 |
+
"scr_dir2_threshold_10": 0.27478755502243446,
|
135 |
+
"scr_dir1_threshold_20": 0.1414141596586038,
|
136 |
+
"scr_metric_threshold_20": 0.2832861983835152,
|
137 |
+
"scr_dir2_threshold_20": 0.2832861983835152,
|
138 |
+
"scr_dir1_threshold_50": 0.09090914564247801,
|
139 |
+
"scr_metric_threshold_50": 0.43059494724819863,
|
140 |
+
"scr_dir2_threshold_50": 0.43059494724819863,
|
141 |
+
"scr_dir1_threshold_100": -0.38383894941671726,
|
142 |
+
"scr_metric_threshold_100": 0.24645896895442135,
|
143 |
+
"scr_dir2_threshold_100": 0.24645896895442135,
|
144 |
+
"scr_dir1_threshold_500": -0.2828283193172076,
|
145 |
+
"scr_metric_threshold_500": 0.3371105456830785,
|
146 |
+
"scr_dir2_threshold_500": 0.3371105456830785
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
|
150 |
+
"scr_dir1_threshold_2": 0.4999995193171997,
|
151 |
+
"scr_metric_threshold_2": 0.022727279568944055,
|
152 |
+
"scr_dir2_threshold_2": 0.022727279568944055,
|
153 |
+
"scr_dir1_threshold_5": 0.5483869727270193,
|
154 |
+
"scr_metric_threshold_5": 0.0530303691664592,
|
155 |
+
"scr_dir2_threshold_5": 0.0530303691664592,
|
156 |
+
"scr_dir1_threshold_10": 0.5483869727270193,
|
157 |
+
"scr_metric_threshold_10": 0.08080808840993783,
|
158 |
+
"scr_dir2_threshold_10": 0.08080808840993783,
|
159 |
+
"scr_dir1_threshold_20": 0.5645158034084256,
|
160 |
+
"scr_metric_threshold_20": 0.16919198684844677,
|
161 |
+
"scr_dir2_threshold_20": 0.16919198684844677,
|
162 |
+
"scr_dir1_threshold_50": 0.2741930056807093,
|
163 |
+
"scr_metric_threshold_50": 0.23989904539254622,
|
164 |
+
"scr_dir2_threshold_50": 0.23989904539254622,
|
165 |
+
"scr_dir1_threshold_100": 0.0645153227256253,
|
166 |
+
"scr_metric_threshold_100": 0.25000007525838464,
|
167 |
+
"scr_dir2_threshold_100": 0.25000007525838464,
|
168 |
+
"scr_dir1_threshold_500": 0.016128830681406324,
|
169 |
+
"scr_metric_threshold_500": 0.07575764873540326,
|
170 |
+
"scr_dir2_threshold_500": 0.07575764873540326
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
|
174 |
+
"scr_dir1_threshold_2": 0.14634148705319544,
|
175 |
+
"scr_metric_threshold_2": 0.12023446826882572,
|
176 |
+
"scr_dir2_threshold_2": 0.12023446826882572,
|
177 |
+
"scr_dir1_threshold_5": 0.21951223057979316,
|
178 |
+
"scr_metric_threshold_5": 0.2375365974308822,
|
179 |
+
"scr_dir2_threshold_5": 0.2375365974308822,
|
180 |
+
"scr_dir1_threshold_10": 0.10569112782625298,
|
181 |
+
"scr_metric_threshold_10": 0.28739003291365367,
|
182 |
+
"scr_dir2_threshold_10": 0.28739003291365367,
|
183 |
+
"scr_dir1_threshold_20": 0.1869918462801379,
|
184 |
+
"scr_metric_threshold_20": 0.3665689569885143,
|
185 |
+
"scr_dir2_threshold_20": 0.3665689569885143,
|
186 |
+
"scr_dir1_threshold_50": -0.34146330826062055,
|
187 |
+
"scr_metric_threshold_50": 0.4398826784687373,
|
188 |
+
"scr_dir2_threshold_50": 0.4398826784687373,
|
189 |
+
"scr_dir1_threshold_100": -0.4390244611595863,
|
190 |
+
"scr_metric_threshold_100": 0.5395893746405804,
|
191 |
+
"scr_dir2_threshold_100": 0.5395893746405804,
|
192 |
+
"scr_dir1_threshold_500": -0.5284551545407584,
|
193 |
+
"scr_metric_threshold_500": 0.26099705822203345,
|
194 |
+
"scr_dir2_threshold_500": 0.26099705822203345
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
|
198 |
+
"scr_dir1_threshold_2": 0.021857964433295084,
|
199 |
+
"scr_metric_threshold_2": 0.12890643189891055,
|
200 |
+
"scr_dir2_threshold_2": 0.12890643189891055,
|
201 |
+
"scr_dir1_threshold_5": 0.027322374114521025,
|
202 |
+
"scr_metric_threshold_5": 0.2265625727595642,
|
203 |
+
"scr_dir2_threshold_5": 0.2265625727595642,
|
204 |
+
"scr_dir1_threshold_10": 0.06010915791026799,
|
205 |
+
"scr_metric_threshold_10": 0.3164063154836078,
|
206 |
+
"scr_dir2_threshold_10": 0.3164063154836078,
|
207 |
+
"scr_dir1_threshold_20": 0.06010915791026799,
|
208 |
+
"scr_metric_threshold_20": 0.41015625727595645,
|
209 |
+
"scr_dir2_threshold_20": 0.41015625727595645,
|
210 |
+
"scr_dir1_threshold_50": 0.08743153202478901,
|
211 |
+
"scr_metric_threshold_50": 0.5703125145519129,
|
212 |
+
"scr_dir2_threshold_50": 0.5703125145519129,
|
213 |
+
"scr_dir1_threshold_100": -0.00546440968122594,
|
214 |
+
"scr_metric_threshold_100": 0.6562500582076514,
|
215 |
+
"scr_dir2_threshold_100": 0.6562500582076514,
|
216 |
+
"scr_dir1_threshold_500": -0.016393554752069144,
|
217 |
+
"scr_metric_threshold_500": 0.6757812863797821,
|
218 |
+
"scr_dir2_threshold_500": 0.6757812863797821
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
|
222 |
+
"scr_dir1_threshold_2": 0.10769214780607521,
|
223 |
+
"scr_metric_threshold_2": 0.060483807005948444,
|
224 |
+
"scr_dir2_threshold_2": 0.060483807005948444,
|
225 |
+
"scr_dir1_threshold_5": 0.1692307363130155,
|
226 |
+
"scr_metric_threshold_5": 0.10483879108333834,
|
227 |
+
"scr_dir2_threshold_5": 0.10483879108333834,
|
228 |
+
"scr_dir1_threshold_10": 0.23076901915509954,
|
229 |
+
"scr_metric_threshold_10": 0.1370969176230247,
|
230 |
+
"scr_dir2_threshold_10": 0.1370969176230247,
|
231 |
+
"scr_dir1_threshold_20": 0.24102560340535104,
|
232 |
+
"scr_metric_threshold_20": 0.18951607282340924,
|
233 |
+
"scr_dir2_threshold_20": 0.18951607282340924,
|
234 |
+
"scr_dir1_threshold_50": 0.24615374269804866,
|
235 |
+
"scr_metric_threshold_50": 0.3306451961785736,
|
236 |
+
"scr_dir2_threshold_50": 0.3306451961785736,
|
237 |
+
"scr_dir1_threshold_100": 0.28717946836934216,
|
238 |
+
"scr_metric_threshold_100": 0.35483867091269605,
|
239 |
+
"scr_dir2_threshold_100": 0.35483867091269605,
|
240 |
+
"scr_dir1_threshold_500": 0.29743574695473746,
|
241 |
+
"scr_metric_threshold_500": 0.33870960764285285,
|
242 |
+
"scr_dir2_threshold_500": 0.33870960764285285
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
|
246 |
+
"scr_dir1_threshold_2": 0.13513500451867638,
|
247 |
+
"scr_metric_threshold_2": 0.10267856311319266,
|
248 |
+
"scr_dir2_threshold_2": 0.10267856311319266,
|
249 |
+
"scr_dir1_threshold_5": 0.16666666666666666,
|
250 |
+
"scr_metric_threshold_5": 0.1607143189758008,
|
251 |
+
"scr_dir2_threshold_5": 0.1607143189758008,
|
252 |
+
"scr_dir1_threshold_10": 0.19369361387252446,
|
253 |
+
"scr_metric_threshold_10": 0.24553570597033553,
|
254 |
+
"scr_dir2_threshold_10": 0.24553570597033553,
|
255 |
+
"scr_dir1_threshold_20": 0.25225222322637253,
|
256 |
+
"scr_metric_threshold_20": 0.35267849659016254,
|
257 |
+
"scr_dir2_threshold_20": 0.35267849659016254,
|
258 |
+
"scr_dir1_threshold_50": 0.34234222623882343,
|
259 |
+
"scr_metric_threshold_50": 0.4062500249461363,
|
260 |
+
"scr_dir2_threshold_50": 0.4062500249461363,
|
261 |
+
"scr_dir1_threshold_100": 0.3918919426871814,
|
262 |
+
"scr_metric_threshold_100": 0.4687500083153788,
|
263 |
+
"scr_dir2_threshold_100": 0.4687500083153788,
|
264 |
+
"scr_dir1_threshold_500": 0.37387388838681374,
|
265 |
+
"scr_metric_threshold_500": 0.4687500083153788,
|
266 |
+
"scr_dir2_threshold_500": 0.4687500083153788
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
|
270 |
+
"scr_dir1_threshold_2": 0.08583681325484281,
|
271 |
+
"scr_metric_threshold_2": 0.08583681325484281,
|
272 |
+
"scr_dir2_threshold_2": 0.07142849033383136,
|
273 |
+
"scr_dir1_threshold_5": 0.08154502375487709,
|
274 |
+
"scr_metric_threshold_5": 0.08154502375487709,
|
275 |
+
"scr_dir2_threshold_5": 0.10000011353263609,
|
276 |
+
"scr_dir1_threshold_10": 0.12017164088233277,
|
277 |
+
"scr_metric_threshold_10": 0.12017164088233277,
|
278 |
+
"scr_dir2_threshold_10": 0.14285698066766273,
|
279 |
+
"scr_dir1_threshold_20": 0.1244634303822985,
|
280 |
+
"scr_metric_threshold_20": 0.1244634303822985,
|
281 |
+
"scr_dir2_threshold_20": 0.17619044645573817,
|
282 |
+
"scr_dir1_threshold_50": 0.17167388232356773,
|
283 |
+
"scr_metric_threshold_50": 0.17167388232356773,
|
284 |
+
"scr_dir2_threshold_50": 0.19999994323368195,
|
285 |
+
"scr_dir1_threshold_100": 0.21888407845095484,
|
286 |
+
"scr_metric_threshold_100": 0.21888407845095484,
|
287 |
+
"scr_dir2_threshold_100": 0.21904759742235502,
|
288 |
+
"scr_dir1_threshold_500": 0.2703863198921898,
|
289 |
+
"scr_metric_threshold_500": 0.2703863198921898,
|
290 |
+
"scr_dir2_threshold_500": 0.2666665909782426
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
|
294 |
+
"sae_lens_id": "custom_sae",
|
295 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000",
|
296 |
+
"sae_lens_version": "5.5.0",
|
297 |
+
"sae_cfg_dict": {
|
298 |
+
"model_name": "gemma-2-2b",
|
299 |
+
"d_in": 2304,
|
300 |
+
"d_sae": 65536,
|
301 |
+
"hook_layer": 12,
|
302 |
+
"hook_name": "blocks.12.hook_resid_post",
|
303 |
+
"context_size": null,
|
304 |
+
"hook_head_index": null,
|
305 |
+
"architecture": "topk",
|
306 |
+
"apply_b_dec_to_input": null,
|
307 |
+
"finetuning_scaling_factor": null,
|
308 |
+
"activation_fn_str": "",
|
309 |
+
"prepend_bos": true,
|
310 |
+
"normalize_activations": "none",
|
311 |
+
"dtype": "bfloat16",
|
312 |
+
"device": "",
|
313 |
+
"dataset_path": "",
|
314 |
+
"dataset_trust_remote_code": true,
|
315 |
+
"seqpos_slice": [
|
316 |
+
null
|
317 |
+
],
|
318 |
+
"training_tokens": -100000,
|
319 |
+
"sae_lens_training_version": null,
|
320 |
+
"neuronpedia_id": null
|
321 |
+
},
|
322 |
+
"eval_result_unstructured": null
|
323 |
+
}
|
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "scr",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": true,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "7def7f32-ddef-4567-ac07-ad6074134cf7",
|
73 |
+
"datetime_epoch_millis": 1740200633241,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"scr_metrics": {
|
76 |
+
"scr_dir1_threshold_2": 0.20793359576572354,
|
77 |
+
"scr_metric_threshold_2": 0.08837145336971095,
|
78 |
+
"scr_dir2_threshold_2": 0.08633535098098324,
|
79 |
+
"scr_dir1_threshold_5": 0.23885799540887315,
|
80 |
+
"scr_metric_threshold_5": 0.14267344527111614,
|
81 |
+
"scr_dir2_threshold_5": 0.1444438578058403,
|
82 |
+
"scr_dir1_threshold_10": 0.2422475369969285,
|
83 |
+
"scr_metric_threshold_10": 0.20760578487732154,
|
84 |
+
"scr_dir2_threshold_10": 0.21431434889096806,
|
85 |
+
"scr_dir1_threshold_20": 0.25614441733043103,
|
86 |
+
"scr_metric_threshold_20": 0.2715601857041677,
|
87 |
+
"scr_dir2_threshold_20": 0.28106097447001843,
|
88 |
+
"scr_dir1_threshold_50": 0.1546741739649193,
|
89 |
+
"scr_metric_threshold_50": 0.35566136240159857,
|
90 |
+
"scr_dir2_threshold_50": 0.36247209379563494,
|
91 |
+
"scr_dir1_threshold_100": 0.1450762302274518,
|
92 |
+
"scr_metric_threshold_100": 0.34184354784903986,
|
93 |
+
"scr_dir2_threshold_100": 0.3450164482284107,
|
94 |
+
"scr_dir1_threshold_500": -0.007035319664922636,
|
95 |
+
"scr_metric_threshold_500": 0.3299446478569424,
|
96 |
+
"scr_dir2_threshold_500": 0.3331686479148758
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
|
102 |
+
"scr_dir1_threshold_2": 0.444444654690226,
|
103 |
+
"scr_metric_threshold_2": 0.004914087313907592,
|
104 |
+
"scr_dir2_threshold_2": 0.004914087313907592,
|
105 |
+
"scr_dir1_threshold_5": 0.47619029597980633,
|
106 |
+
"scr_metric_threshold_5": 0.017199085925573582,
|
107 |
+
"scr_dir2_threshold_5": 0.017199085925573582,
|
108 |
+
"scr_dir1_threshold_10": 0.4920631166245965,
|
109 |
+
"scr_metric_threshold_10": 0.04914014089539928,
|
110 |
+
"scr_dir2_threshold_10": 0.04914014089539928,
|
111 |
+
"scr_dir1_threshold_20": 0.4920631166245965,
|
112 |
+
"scr_metric_threshold_20": 0.12039316213280908,
|
113 |
+
"scr_dir2_threshold_20": 0.12039316213280908,
|
114 |
+
"scr_dir1_threshold_50": 0.3492058386094512,
|
115 |
+
"scr_metric_threshold_50": 0.14250618892355493,
|
116 |
+
"scr_dir2_threshold_50": 0.14250618892355493,
|
117 |
+
"scr_dir1_threshold_100": 0.42857088793941894,
|
118 |
+
"scr_metric_threshold_100": 0.20147421154929873,
|
119 |
+
"scr_dir2_threshold_100": 0.20147421154929873,
|
120 |
+
"scr_dir1_threshold_500": 0.19047573994951578,
|
121 |
+
"scr_metric_threshold_500": 0.09828013534206324,
|
122 |
+
"scr_dir2_threshold_500": 0.09828013534206324
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
|
126 |
+
"scr_dir1_threshold_2": 0.17171680682792437,
|
127 |
+
"scr_metric_threshold_2": 0.14447592402822051,
|
128 |
+
"scr_dir2_threshold_2": 0.14447592402822051,
|
129 |
+
"scr_dir1_threshold_5": 0.24242418769085533,
|
130 |
+
"scr_metric_threshold_5": 0.22096320772287115,
|
131 |
+
"scr_dir2_threshold_5": 0.22096320772287115,
|
132 |
+
"scr_dir1_threshold_10": 0.2323233053010818,
|
133 |
+
"scr_metric_threshold_10": 0.28895184805644103,
|
134 |
+
"scr_dir2_threshold_10": 0.28895184805644103,
|
135 |
+
"scr_dir1_threshold_20": 0.2121209384542766,
|
136 |
+
"scr_metric_threshold_20": 0.3031161410904476,
|
137 |
+
"scr_dir2_threshold_20": 0.3031161410904476,
|
138 |
+
"scr_dir1_threshold_50": 0.11111091042202506,
|
139 |
+
"scr_metric_threshold_50": 0.419263479050655,
|
140 |
+
"scr_dir2_threshold_50": 0.419263479050655,
|
141 |
+
"scr_dir1_threshold_100": 0.010100882389773526,
|
142 |
+
"scr_metric_threshold_100": 0.18130315345731438,
|
143 |
+
"scr_dir2_threshold_100": 0.18130315345731438,
|
144 |
+
"scr_dir1_threshold_500": -0.5555557562446416,
|
145 |
+
"scr_metric_threshold_500": 0.24929179379088426,
|
146 |
+
"scr_dir2_threshold_500": 0.24929179379088426
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
|
150 |
+
"scr_dir1_threshold_2": 0.5806446340898319,
|
151 |
+
"scr_metric_threshold_2": 0.03030308959751515,
|
152 |
+
"scr_dir2_threshold_2": 0.03030308959751515,
|
153 |
+
"scr_dir1_threshold_5": 0.5645158034084256,
|
154 |
+
"scr_metric_threshold_5": 0.0479797789751554,
|
155 |
+
"scr_dir2_threshold_5": 0.0479797789751554,
|
156 |
+
"scr_dir1_threshold_10": 0.5483869727270193,
|
157 |
+
"scr_metric_threshold_10": 0.06060602867826107,
|
158 |
+
"scr_dir2_threshold_10": 0.06060602867826107,
|
159 |
+
"scr_dir1_threshold_20": 0.5967744261368388,
|
160 |
+
"scr_metric_threshold_20": 0.14141411708819893,
|
161 |
+
"scr_dir2_threshold_20": 0.14141411708819893,
|
162 |
+
"scr_dir1_threshold_50": 0.40322557386316116,
|
163 |
+
"scr_metric_threshold_50": 0.19949492592919268,
|
164 |
+
"scr_dir2_threshold_50": 0.19949492592919268,
|
165 |
+
"scr_dir1_threshold_100": 0.258064174999303,
|
166 |
+
"scr_metric_threshold_100": 0.2904040442049689,
|
167 |
+
"scr_dir2_threshold_100": 0.2904040442049689,
|
168 |
+
"scr_dir1_threshold_500": 0.016128830681406324,
|
169 |
+
"scr_metric_threshold_500": 0.09848492830434731,
|
170 |
+
"scr_dir2_threshold_500": 0.09848492830434731
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
|
174 |
+
"scr_dir1_threshold_2": 0.23577218043436754,
|
175 |
+
"scr_metric_threshold_2": 0.10850441266694984,
|
176 |
+
"scr_dir2_threshold_2": 0.10850441266694984,
|
177 |
+
"scr_dir1_threshold_5": 0.2682925647340228,
|
178 |
+
"scr_metric_threshold_5": 0.21994133923436865,
|
179 |
+
"scr_dir2_threshold_5": 0.21994133923436865,
|
180 |
+
"scr_dir1_threshold_10": 0.15447146198048264,
|
181 |
+
"scr_metric_threshold_10": 0.28445751901318467,
|
182 |
+
"scr_dir2_threshold_10": 0.28445751901318467,
|
183 |
+
"scr_dir1_threshold_20": 0.0894311779716786,
|
184 |
+
"scr_metric_threshold_20": 0.3724339847894522,
|
185 |
+
"scr_dir2_threshold_20": 0.3724339847894522,
|
186 |
+
"scr_dir1_threshold_50": -0.32520287381553964,
|
187 |
+
"scr_metric_threshold_50": 0.4545454227647819,
|
188 |
+
"scr_dir2_threshold_50": 0.4545454227647819,
|
189 |
+
"scr_dir1_threshold_100": -0.39024364241485016,
|
190 |
+
"scr_metric_threshold_100": 0.14662744296044594,
|
191 |
+
"scr_dir2_threshold_100": 0.14662744296044594,
|
192 |
+
"scr_dir1_threshold_500": -0.764227334975126,
|
193 |
+
"scr_metric_threshold_500": 0.19354836454274843,
|
194 |
+
"scr_dir2_threshold_500": 0.19354836454274843
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
|
198 |
+
"scr_dir1_threshold_2": 0.00546440968122594,
|
199 |
+
"scr_metric_threshold_2": 0.1914063154836078,
|
200 |
+
"scr_dir2_threshold_2": 0.1914063154836078,
|
201 |
+
"scr_dir1_threshold_5": 0.03278678379574697,
|
202 |
+
"scr_metric_threshold_5": 0.2656250291038257,
|
203 |
+
"scr_dir2_threshold_5": 0.2656250291038257,
|
204 |
+
"scr_dir1_threshold_10": 0.09289626741440628,
|
205 |
+
"scr_metric_threshold_10": 0.42578128637978213,
|
206 |
+
"scr_dir2_threshold_10": 0.42578128637978213,
|
207 |
+
"scr_dir1_threshold_20": 0.1092894964580841,
|
208 |
+
"scr_metric_threshold_20": 0.5195312281721307,
|
209 |
+
"scr_dir2_threshold_20": 0.5195312281721307,
|
210 |
+
"scr_dir1_threshold_50": -0.016393554752069144,
|
211 |
+
"scr_metric_threshold_50": 0.6562500582076514,
|
212 |
+
"scr_dir2_threshold_50": 0.6562500582076514,
|
213 |
+
"scr_dir1_threshold_100": -0.06010915791026799,
|
214 |
+
"scr_metric_threshold_100": 0.7187499417923486,
|
215 |
+
"scr_dir2_threshold_100": 0.7187499417923486,
|
216 |
+
"scr_dir1_threshold_500": -0.05464474822904205,
|
217 |
+
"scr_metric_threshold_500": 0.7343749708961743,
|
218 |
+
"scr_dir2_threshold_500": 0.7343749708961743
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
|
222 |
+
"scr_dir1_threshold_2": 0.06666642213478169,
|
223 |
+
"scr_metric_threshold_2": 0.060483807005948444,
|
224 |
+
"scr_dir2_threshold_2": 0.060483807005948444,
|
225 |
+
"scr_dir1_threshold_5": 0.12820501064172196,
|
226 |
+
"scr_metric_threshold_5": 0.1008065853511987,
|
227 |
+
"scr_dir2_threshold_5": 0.1008065853511987,
|
228 |
+
"scr_dir1_threshold_10": 0.17948701489841076,
|
229 |
+
"scr_metric_threshold_10": 0.14516132908730398,
|
230 |
+
"scr_dir2_threshold_10": 0.14516132908730398,
|
231 |
+
"scr_dir1_threshold_20": 0.1999998777340575,
|
232 |
+
"scr_metric_threshold_20": 0.18951607282340924,
|
233 |
+
"scr_dir2_threshold_20": 0.18951607282340924,
|
234 |
+
"scr_dir1_threshold_50": 0.22051274056970427,
|
235 |
+
"scr_metric_threshold_50": 0.30241927537102686,
|
236 |
+
"scr_dir2_threshold_50": 0.30241927537102686,
|
237 |
+
"scr_dir1_threshold_100": 0.24102560340535104,
|
238 |
+
"scr_metric_threshold_100": 0.4153227182599291,
|
239 |
+
"scr_dir2_threshold_100": 0.4153227182599291,
|
240 |
+
"scr_dir1_threshold_500": 0.36410247475437535,
|
241 |
+
"scr_metric_threshold_500": 0.37499993991467884,
|
242 |
+
"scr_dir2_threshold_500": 0.37499993991467884
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
|
246 |
+
"scr_dir1_threshold_2": 0.09009000301245093,
|
247 |
+
"scr_metric_threshold_2": 0.09821433560655832,
|
248 |
+
"scr_dir2_threshold_2": 0.09821433560655832,
|
249 |
+
"scr_dir1_threshold_5": 0.11261250376556366,
|
250 |
+
"scr_metric_threshold_5": 0.18303572260109305,
|
251 |
+
"scr_dir2_threshold_5": 0.18303572260109305,
|
252 |
+
"scr_dir1_threshold_10": 0.13963971946080886,
|
253 |
+
"scr_metric_threshold_10": 0.308035689339578,
|
254 |
+
"scr_dir2_threshold_10": 0.308035689339578,
|
255 |
+
"scr_dir1_threshold_20": 0.22072082956776967,
|
256 |
+
"scr_metric_threshold_20": 0.39732130384074704,
|
257 |
+
"scr_dir2_threshold_20": 0.39732130384074704,
|
258 |
+
"scr_dir1_threshold_50": 0.30180167118534307,
|
259 |
+
"scr_metric_threshold_50": 0.47767846332864744,
|
260 |
+
"scr_dir2_threshold_50": 0.47767846332864744,
|
261 |
+
"scr_dir1_threshold_100": 0.44144139064615195,
|
262 |
+
"scr_metric_threshold_100": 0.5491071678032792,
|
263 |
+
"scr_dir2_threshold_100": 0.5491071678032792,
|
264 |
+
"scr_dir1_threshold_500": 0.4684683378520097,
|
265 |
+
"scr_metric_threshold_500": 0.6116071511725216,
|
266 |
+
"scr_dir2_threshold_500": 0.6116071511725216
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
|
270 |
+
"scr_dir1_threshold_2": 0.06866965525497992,
|
271 |
+
"scr_metric_threshold_2": 0.06866965525497992,
|
272 |
+
"scr_dir2_threshold_2": 0.05238083614515829,
|
273 |
+
"scr_dir1_threshold_5": 0.08583681325484281,
|
274 |
+
"scr_metric_threshold_5": 0.08583681325484281,
|
275 |
+
"scr_dir2_threshold_5": 0.10000011353263609,
|
276 |
+
"scr_dir1_threshold_10": 0.09871243756862208,
|
277 |
+
"scr_metric_threshold_10": 0.09871243756862208,
|
278 |
+
"scr_dir2_threshold_10": 0.1523809496777944,
|
279 |
+
"scr_dir1_threshold_20": 0.12875547569614632,
|
280 |
+
"scr_metric_threshold_20": 0.12875547569614632,
|
281 |
+
"scr_dir2_threshold_20": 0.20476178582295265,
|
282 |
+
"scr_dir1_threshold_50": 0.19313308563727843,
|
283 |
+
"scr_metric_threshold_50": 0.19313308563727843,
|
284 |
+
"scr_dir2_threshold_50": 0.24761893678956953,
|
285 |
+
"scr_dir1_threshold_100": 0.23175970276473412,
|
286 |
+
"scr_metric_threshold_100": 0.23175970276473412,
|
287 |
+
"scr_dir2_threshold_100": 0.25714290579970117,
|
288 |
+
"scr_dir1_threshold_500": 0.27896989889212126,
|
289 |
+
"scr_metric_threshold_500": 0.27896989889212126,
|
290 |
+
"scr_dir2_threshold_500": 0.3047618993555888
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
|
294 |
+
"sae_lens_id": "custom_sae",
|
295 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000",
|
296 |
+
"sae_lens_version": "5.5.0",
|
297 |
+
"sae_cfg_dict": {
|
298 |
+
"model_name": "gemma-2-2b",
|
299 |
+
"d_in": 2304,
|
300 |
+
"d_sae": 65536,
|
301 |
+
"hook_layer": 12,
|
302 |
+
"hook_name": "blocks.12.hook_resid_post",
|
303 |
+
"context_size": null,
|
304 |
+
"hook_head_index": null,
|
305 |
+
"architecture": "topk",
|
306 |
+
"apply_b_dec_to_input": null,
|
307 |
+
"finetuning_scaling_factor": null,
|
308 |
+
"activation_fn_str": "",
|
309 |
+
"prepend_bos": true,
|
310 |
+
"normalize_activations": "none",
|
311 |
+
"dtype": "bfloat16",
|
312 |
+
"device": "",
|
313 |
+
"dataset_path": "",
|
314 |
+
"dataset_trust_remote_code": true,
|
315 |
+
"seqpos_slice": [
|
316 |
+
null
|
317 |
+
],
|
318 |
+
"training_tokens": -100000,
|
319 |
+
"sae_lens_training_version": null,
|
320 |
+
"neuronpedia_id": null
|
321 |
+
},
|
322 |
+
"eval_result_unstructured": null
|
323 |
+
}
|
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "sparse_probing",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"LabHC/bias_in_bios_class_set2",
|
8 |
+
"LabHC/bias_in_bios_class_set3",
|
9 |
+
"canrager/amazon_reviews_mcauley_1and5",
|
10 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment",
|
11 |
+
"codeparrot/github-code",
|
12 |
+
"fancyzhx/ag_news",
|
13 |
+
"Helsinki-NLP/europarl"
|
14 |
+
],
|
15 |
+
"probe_train_set_size": 4000,
|
16 |
+
"probe_test_set_size": 1000,
|
17 |
+
"context_length": 128,
|
18 |
+
"sae_batch_size": 125,
|
19 |
+
"llm_batch_size": 32,
|
20 |
+
"llm_dtype": "bfloat16",
|
21 |
+
"model_name": "gemma-2-2b",
|
22 |
+
"k_values": [
|
23 |
+
1,
|
24 |
+
2,
|
25 |
+
5
|
26 |
+
],
|
27 |
+
"lower_vram_usage": false
|
28 |
+
},
|
29 |
+
"eval_id": "0eeb0d78-c565-4180-bea5-6e755bb53289",
|
30 |
+
"datetime_epoch_millis": 1740203520129,
|
31 |
+
"eval_result_metrics": {
|
32 |
+
"llm": {
|
33 |
+
"llm_test_accuracy": 0.9587437950074672,
|
34 |
+
"llm_top_1_test_accuracy": 0.6508312500000001,
|
35 |
+
"llm_top_2_test_accuracy": 0.7267250000000001,
|
36 |
+
"llm_top_5_test_accuracy": 0.77896875,
|
37 |
+
"llm_top_10_test_accuracy": null,
|
38 |
+
"llm_top_20_test_accuracy": null,
|
39 |
+
"llm_top_50_test_accuracy": null,
|
40 |
+
"llm_top_100_test_accuracy": null
|
41 |
+
},
|
42 |
+
"sae": {
|
43 |
+
"sae_test_accuracy": 0.9576437868177892,
|
44 |
+
"sae_top_1_test_accuracy": 0.6868375000000001,
|
45 |
+
"sae_top_2_test_accuracy": 0.7738562499999999,
|
46 |
+
"sae_top_5_test_accuracy": 0.86161875,
|
47 |
+
"sae_top_10_test_accuracy": null,
|
48 |
+
"sae_top_20_test_accuracy": null,
|
49 |
+
"sae_top_50_test_accuracy": null,
|
50 |
+
"sae_top_100_test_accuracy": null
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"eval_result_details": [
|
54 |
+
{
|
55 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
|
56 |
+
"llm_test_accuracy": 0.966800057888031,
|
57 |
+
"llm_top_1_test_accuracy": 0.6397999999999999,
|
58 |
+
"llm_top_2_test_accuracy": 0.6954,
|
59 |
+
"llm_top_5_test_accuracy": 0.7869999999999999,
|
60 |
+
"llm_top_10_test_accuracy": null,
|
61 |
+
"llm_top_20_test_accuracy": null,
|
62 |
+
"llm_top_50_test_accuracy": null,
|
63 |
+
"llm_top_100_test_accuracy": null,
|
64 |
+
"sae_test_accuracy": 0.9654000401496887,
|
65 |
+
"sae_top_1_test_accuracy": 0.6992,
|
66 |
+
"sae_top_2_test_accuracy": 0.7709999999999999,
|
67 |
+
"sae_top_5_test_accuracy": 0.8725999999999999,
|
68 |
+
"sae_top_10_test_accuracy": null,
|
69 |
+
"sae_top_20_test_accuracy": null,
|
70 |
+
"sae_top_50_test_accuracy": null,
|
71 |
+
"sae_top_100_test_accuracy": null
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
|
75 |
+
"llm_test_accuracy": 0.9544000387191772,
|
76 |
+
"llm_top_1_test_accuracy": 0.6744000000000001,
|
77 |
+
"llm_top_2_test_accuracy": 0.7334,
|
78 |
+
"llm_top_5_test_accuracy": 0.763,
|
79 |
+
"llm_top_10_test_accuracy": null,
|
80 |
+
"llm_top_20_test_accuracy": null,
|
81 |
+
"llm_top_50_test_accuracy": null,
|
82 |
+
"llm_top_100_test_accuracy": null,
|
83 |
+
"sae_test_accuracy": 0.9496000289916993,
|
84 |
+
"sae_top_1_test_accuracy": 0.6658,
|
85 |
+
"sae_top_2_test_accuracy": 0.7876,
|
86 |
+
"sae_top_5_test_accuracy": 0.8301999999999999,
|
87 |
+
"sae_top_10_test_accuracy": null,
|
88 |
+
"sae_top_20_test_accuracy": null,
|
89 |
+
"sae_top_50_test_accuracy": null,
|
90 |
+
"sae_top_100_test_accuracy": null
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
|
94 |
+
"llm_test_accuracy": 0.9310000538825989,
|
95 |
+
"llm_top_1_test_accuracy": 0.6864000000000001,
|
96 |
+
"llm_top_2_test_accuracy": 0.7436,
|
97 |
+
"llm_top_5_test_accuracy": 0.763,
|
98 |
+
"llm_top_10_test_accuracy": null,
|
99 |
+
"llm_top_20_test_accuracy": null,
|
100 |
+
"llm_top_50_test_accuracy": null,
|
101 |
+
"llm_top_100_test_accuracy": null,
|
102 |
+
"sae_test_accuracy": 0.9298000454902648,
|
103 |
+
"sae_top_1_test_accuracy": 0.7092,
|
104 |
+
"sae_top_2_test_accuracy": 0.8158,
|
105 |
+
"sae_top_5_test_accuracy": 0.8566,
|
106 |
+
"sae_top_10_test_accuracy": null,
|
107 |
+
"sae_top_20_test_accuracy": null,
|
108 |
+
"sae_top_50_test_accuracy": null,
|
109 |
+
"sae_top_100_test_accuracy": null
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
|
113 |
+
"llm_test_accuracy": 0.915600061416626,
|
114 |
+
"llm_top_1_test_accuracy": 0.599,
|
115 |
+
"llm_top_2_test_accuracy": 0.6476000000000001,
|
116 |
+
"llm_top_5_test_accuracy": 0.6708000000000001,
|
117 |
+
"llm_top_10_test_accuracy": null,
|
118 |
+
"llm_top_20_test_accuracy": null,
|
119 |
+
"llm_top_50_test_accuracy": null,
|
120 |
+
"llm_top_100_test_accuracy": null,
|
121 |
+
"sae_test_accuracy": 0.9200000286102294,
|
122 |
+
"sae_top_1_test_accuracy": 0.7190000000000001,
|
123 |
+
"sae_top_2_test_accuracy": 0.7762,
|
124 |
+
"sae_top_5_test_accuracy": 0.8204,
|
125 |
+
"sae_top_10_test_accuracy": null,
|
126 |
+
"sae_top_20_test_accuracy": null,
|
127 |
+
"sae_top_50_test_accuracy": null,
|
128 |
+
"sae_top_100_test_accuracy": null
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
|
132 |
+
"llm_test_accuracy": 0.9820000529289246,
|
133 |
+
"llm_top_1_test_accuracy": 0.673,
|
134 |
+
"llm_top_2_test_accuracy": 0.724,
|
135 |
+
"llm_top_5_test_accuracy": 0.766,
|
136 |
+
"llm_top_10_test_accuracy": null,
|
137 |
+
"llm_top_20_test_accuracy": null,
|
138 |
+
"llm_top_50_test_accuracy": null,
|
139 |
+
"llm_top_100_test_accuracy": null,
|
140 |
+
"sae_test_accuracy": 0.9730000495910645,
|
141 |
+
"sae_top_1_test_accuracy": 0.591,
|
142 |
+
"sae_top_2_test_accuracy": 0.664,
|
143 |
+
"sae_top_5_test_accuracy": 0.904,
|
144 |
+
"sae_top_10_test_accuracy": null,
|
145 |
+
"sae_top_20_test_accuracy": null,
|
146 |
+
"sae_top_50_test_accuracy": null,
|
147 |
+
"sae_top_100_test_accuracy": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"dataset_name": "codeparrot/github-code_results",
|
151 |
+
"llm_test_accuracy": 0.9708000302314759,
|
152 |
+
"llm_top_1_test_accuracy": 0.649,
|
153 |
+
"llm_top_2_test_accuracy": 0.6958,
|
154 |
+
"llm_top_5_test_accuracy": 0.7556,
|
155 |
+
"llm_top_10_test_accuracy": null,
|
156 |
+
"llm_top_20_test_accuracy": null,
|
157 |
+
"llm_top_50_test_accuracy": null,
|
158 |
+
"llm_top_100_test_accuracy": null,
|
159 |
+
"sae_test_accuracy": 0.9706000328063965,
|
160 |
+
"sae_top_1_test_accuracy": 0.6342000000000001,
|
161 |
+
"sae_top_2_test_accuracy": 0.7213999999999999,
|
162 |
+
"sae_top_5_test_accuracy": 0.8103999999999999,
|
163 |
+
"sae_top_10_test_accuracy": null,
|
164 |
+
"sae_top_20_test_accuracy": null,
|
165 |
+
"sae_top_50_test_accuracy": null,
|
166 |
+
"sae_top_100_test_accuracy": null
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"dataset_name": "fancyzhx/ag_news_results",
|
170 |
+
"llm_test_accuracy": 0.9497500360012054,
|
171 |
+
"llm_top_1_test_accuracy": 0.63425,
|
172 |
+
"llm_top_2_test_accuracy": 0.782,
|
173 |
+
"llm_top_5_test_accuracy": 0.8247499999999999,
|
174 |
+
"llm_top_10_test_accuracy": null,
|
175 |
+
"llm_top_20_test_accuracy": null,
|
176 |
+
"llm_top_50_test_accuracy": null,
|
177 |
+
"llm_top_100_test_accuracy": null,
|
178 |
+
"sae_test_accuracy": 0.9537500441074371,
|
179 |
+
"sae_top_1_test_accuracy": 0.6445000000000001,
|
180 |
+
"sae_top_2_test_accuracy": 0.74825,
|
181 |
+
"sae_top_5_test_accuracy": 0.81875,
|
182 |
+
"sae_top_10_test_accuracy": null,
|
183 |
+
"sae_top_20_test_accuracy": null,
|
184 |
+
"sae_top_50_test_accuracy": null,
|
185 |
+
"sae_top_100_test_accuracy": null
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"dataset_name": "Helsinki-NLP/europarl_results",
|
189 |
+
"llm_test_accuracy": 0.9996000289916992,
|
190 |
+
"llm_top_1_test_accuracy": 0.6508,
|
191 |
+
"llm_top_2_test_accuracy": 0.792,
|
192 |
+
"llm_top_5_test_accuracy": 0.9016,
|
193 |
+
"llm_top_10_test_accuracy": null,
|
194 |
+
"llm_top_20_test_accuracy": null,
|
195 |
+
"llm_top_50_test_accuracy": null,
|
196 |
+
"llm_top_100_test_accuracy": null,
|
197 |
+
"sae_test_accuracy": 0.9990000247955322,
|
198 |
+
"sae_top_1_test_accuracy": 0.8318,
|
199 |
+
"sae_top_2_test_accuracy": 0.9065999999999999,
|
200 |
+
"sae_top_5_test_accuracy": 0.9800000000000001,
|
201 |
+
"sae_top_10_test_accuracy": null,
|
202 |
+
"sae_top_20_test_accuracy": null,
|
203 |
+
"sae_top_50_test_accuracy": null,
|
204 |
+
"sae_top_100_test_accuracy": null
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
|
208 |
+
"sae_lens_id": "custom_sae",
|
209 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000",
|
210 |
+
"sae_lens_version": "5.5.0",
|
211 |
+
"sae_cfg_dict": {
|
212 |
+
"model_name": "gemma-2-2b",
|
213 |
+
"d_in": 2304,
|
214 |
+
"d_sae": 65536,
|
215 |
+
"hook_layer": 12,
|
216 |
+
"hook_name": "blocks.12.hook_resid_post",
|
217 |
+
"context_size": null,
|
218 |
+
"hook_head_index": null,
|
219 |
+
"architecture": "topk",
|
220 |
+
"apply_b_dec_to_input": null,
|
221 |
+
"finetuning_scaling_factor": null,
|
222 |
+
"activation_fn_str": "",
|
223 |
+
"prepend_bos": true,
|
224 |
+
"normalize_activations": "none",
|
225 |
+
"dtype": "bfloat16",
|
226 |
+
"device": "",
|
227 |
+
"dataset_path": "",
|
228 |
+
"dataset_trust_remote_code": true,
|
229 |
+
"seqpos_slice": [
|
230 |
+
null
|
231 |
+
],
|
232 |
+
"training_tokens": -100000,
|
233 |
+
"sae_lens_training_version": null,
|
234 |
+
"neuronpedia_id": null
|
235 |
+
},
|
236 |
+
"eval_result_unstructured": {
|
237 |
+
"LabHC/bias_in_bios_class_set1_results": {
|
238 |
+
"sae_test_accuracy": {
|
239 |
+
"0": 0.9430000185966492,
|
240 |
+
"1": 0.9650000333786011,
|
241 |
+
"2": 0.9550000429153442,
|
242 |
+
"6": 0.987000048160553,
|
243 |
+
"9": 0.9770000576972961
|
244 |
+
},
|
245 |
+
"llm_test_accuracy": {
|
246 |
+
"0": 0.9510000348091125,
|
247 |
+
"1": 0.9670000672340393,
|
248 |
+
"2": 0.9530000686645508,
|
249 |
+
"6": 0.987000048160553,
|
250 |
+
"9": 0.9760000705718994
|
251 |
+
},
|
252 |
+
"llm_top_1_test_accuracy": {
|
253 |
+
"0": 0.577,
|
254 |
+
"1": 0.613,
|
255 |
+
"2": 0.662,
|
256 |
+
"6": 0.787,
|
257 |
+
"9": 0.56
|
258 |
+
},
|
259 |
+
"llm_top_2_test_accuracy": {
|
260 |
+
"0": 0.574,
|
261 |
+
"1": 0.66,
|
262 |
+
"2": 0.718,
|
263 |
+
"6": 0.811,
|
264 |
+
"9": 0.714
|
265 |
+
},
|
266 |
+
"llm_top_5_test_accuracy": {
|
267 |
+
"0": 0.713,
|
268 |
+
"1": 0.711,
|
269 |
+
"2": 0.755,
|
270 |
+
"6": 0.895,
|
271 |
+
"9": 0.861
|
272 |
+
},
|
273 |
+
"sae_top_1_test_accuracy": {
|
274 |
+
"0": 0.555,
|
275 |
+
"1": 0.659,
|
276 |
+
"2": 0.847,
|
277 |
+
"6": 0.817,
|
278 |
+
"9": 0.618
|
279 |
+
},
|
280 |
+
"sae_top_2_test_accuracy": {
|
281 |
+
"0": 0.57,
|
282 |
+
"1": 0.658,
|
283 |
+
"2": 0.859,
|
284 |
+
"6": 0.828,
|
285 |
+
"9": 0.94
|
286 |
+
},
|
287 |
+
"sae_top_5_test_accuracy": {
|
288 |
+
"0": 0.689,
|
289 |
+
"1": 0.85,
|
290 |
+
"2": 0.909,
|
291 |
+
"6": 0.972,
|
292 |
+
"9": 0.943
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"LabHC/bias_in_bios_class_set2_results": {
|
296 |
+
"sae_test_accuracy": {
|
297 |
+
"11": 0.9650000333786011,
|
298 |
+
"13": 0.9470000267028809,
|
299 |
+
"14": 0.9470000267028809,
|
300 |
+
"18": 0.9290000200271606,
|
301 |
+
"19": 0.9600000381469727
|
302 |
+
},
|
303 |
+
"llm_test_accuracy": {
|
304 |
+
"11": 0.9660000205039978,
|
305 |
+
"13": 0.9510000348091125,
|
306 |
+
"14": 0.9540000557899475,
|
307 |
+
"18": 0.940000057220459,
|
308 |
+
"19": 0.9610000252723694
|
309 |
+
},
|
310 |
+
"llm_top_1_test_accuracy": {
|
311 |
+
"11": 0.553,
|
312 |
+
"13": 0.673,
|
313 |
+
"14": 0.651,
|
314 |
+
"18": 0.706,
|
315 |
+
"19": 0.789
|
316 |
+
},
|
317 |
+
"llm_top_2_test_accuracy": {
|
318 |
+
"11": 0.77,
|
319 |
+
"13": 0.719,
|
320 |
+
"14": 0.672,
|
321 |
+
"18": 0.717,
|
322 |
+
"19": 0.789
|
323 |
+
},
|
324 |
+
"llm_top_5_test_accuracy": {
|
325 |
+
"11": 0.793,
|
326 |
+
"13": 0.739,
|
327 |
+
"14": 0.732,
|
328 |
+
"18": 0.723,
|
329 |
+
"19": 0.828
|
330 |
+
},
|
331 |
+
"sae_top_1_test_accuracy": {
|
332 |
+
"11": 0.573,
|
333 |
+
"13": 0.657,
|
334 |
+
"14": 0.625,
|
335 |
+
"18": 0.662,
|
336 |
+
"19": 0.812
|
337 |
+
},
|
338 |
+
"sae_top_2_test_accuracy": {
|
339 |
+
"11": 0.86,
|
340 |
+
"13": 0.681,
|
341 |
+
"14": 0.872,
|
342 |
+
"18": 0.695,
|
343 |
+
"19": 0.83
|
344 |
+
},
|
345 |
+
"sae_top_5_test_accuracy": {
|
346 |
+
"11": 0.945,
|
347 |
+
"13": 0.755,
|
348 |
+
"14": 0.889,
|
349 |
+
"18": 0.74,
|
350 |
+
"19": 0.822
|
351 |
+
}
|
352 |
+
},
|
353 |
+
"LabHC/bias_in_bios_class_set3_results": {
|
354 |
+
"sae_test_accuracy": {
|
355 |
+
"20": 0.9570000171661377,
|
356 |
+
"21": 0.9220000505447388,
|
357 |
+
"22": 0.9170000553131104,
|
358 |
+
"25": 0.9540000557899475,
|
359 |
+
"26": 0.8990000486373901
|
360 |
+
},
|
361 |
+
"llm_test_accuracy": {
|
362 |
+
"20": 0.9650000333786011,
|
363 |
+
"21": 0.9250000715255737,
|
364 |
+
"22": 0.9140000343322754,
|
365 |
+
"25": 0.9670000672340393,
|
366 |
+
"26": 0.8840000629425049
|
367 |
+
},
|
368 |
+
"llm_top_1_test_accuracy": {
|
369 |
+
"20": 0.709,
|
370 |
+
"21": 0.762,
|
371 |
+
"22": 0.653,
|
372 |
+
"25": 0.683,
|
373 |
+
"26": 0.625
|
374 |
+
},
|
375 |
+
"llm_top_2_test_accuracy": {
|
376 |
+
"20": 0.811,
|
377 |
+
"21": 0.769,
|
378 |
+
"22": 0.688,
|
379 |
+
"25": 0.766,
|
380 |
+
"26": 0.684
|
381 |
+
},
|
382 |
+
"llm_top_5_test_accuracy": {
|
383 |
+
"20": 0.815,
|
384 |
+
"21": 0.794,
|
385 |
+
"22": 0.706,
|
386 |
+
"25": 0.803,
|
387 |
+
"26": 0.697
|
388 |
+
},
|
389 |
+
"sae_top_1_test_accuracy": {
|
390 |
+
"20": 0.573,
|
391 |
+
"21": 0.769,
|
392 |
+
"22": 0.867,
|
393 |
+
"25": 0.729,
|
394 |
+
"26": 0.608
|
395 |
+
},
|
396 |
+
"sae_top_2_test_accuracy": {
|
397 |
+
"20": 0.875,
|
398 |
+
"21": 0.84,
|
399 |
+
"22": 0.888,
|
400 |
+
"25": 0.861,
|
401 |
+
"26": 0.615
|
402 |
+
},
|
403 |
+
"sae_top_5_test_accuracy": {
|
404 |
+
"20": 0.897,
|
405 |
+
"21": 0.844,
|
406 |
+
"22": 0.891,
|
407 |
+
"25": 0.888,
|
408 |
+
"26": 0.763
|
409 |
+
}
|
410 |
+
},
|
411 |
+
"canrager/amazon_reviews_mcauley_1and5_results": {
|
412 |
+
"sae_test_accuracy": {
|
413 |
+
"1": 0.9380000233650208,
|
414 |
+
"2": 0.9420000314712524,
|
415 |
+
"3": 0.9200000166893005,
|
416 |
+
"5": 0.9290000200271606,
|
417 |
+
"6": 0.8710000514984131
|
418 |
+
},
|
419 |
+
"llm_test_accuracy": {
|
420 |
+
"1": 0.9490000605583191,
|
421 |
+
"2": 0.9300000667572021,
|
422 |
+
"3": 0.9120000600814819,
|
423 |
+
"5": 0.9250000715255737,
|
424 |
+
"6": 0.862000048160553
|
425 |
+
},
|
426 |
+
"llm_top_1_test_accuracy": {
|
427 |
+
"1": 0.684,
|
428 |
+
"2": 0.592,
|
429 |
+
"3": 0.583,
|
430 |
+
"5": 0.551,
|
431 |
+
"6": 0.585
|
432 |
+
},
|
433 |
+
"llm_top_2_test_accuracy": {
|
434 |
+
"1": 0.739,
|
435 |
+
"2": 0.635,
|
436 |
+
"3": 0.609,
|
437 |
+
"5": 0.635,
|
438 |
+
"6": 0.62
|
439 |
+
},
|
440 |
+
"llm_top_5_test_accuracy": {
|
441 |
+
"1": 0.757,
|
442 |
+
"2": 0.636,
|
443 |
+
"3": 0.622,
|
444 |
+
"5": 0.657,
|
445 |
+
"6": 0.682
|
446 |
+
},
|
447 |
+
"sae_top_1_test_accuracy": {
|
448 |
+
"1": 0.908,
|
449 |
+
"2": 0.844,
|
450 |
+
"3": 0.539,
|
451 |
+
"5": 0.769,
|
452 |
+
"6": 0.535
|
453 |
+
},
|
454 |
+
"sae_top_2_test_accuracy": {
|
455 |
+
"1": 0.907,
|
456 |
+
"2": 0.853,
|
457 |
+
"3": 0.703,
|
458 |
+
"5": 0.846,
|
459 |
+
"6": 0.572
|
460 |
+
},
|
461 |
+
"sae_top_5_test_accuracy": {
|
462 |
+
"1": 0.92,
|
463 |
+
"2": 0.881,
|
464 |
+
"3": 0.764,
|
465 |
+
"5": 0.878,
|
466 |
+
"6": 0.659
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
|
470 |
+
"sae_test_accuracy": {
|
471 |
+
"1.0": 0.9730000495910645,
|
472 |
+
"5.0": 0.9730000495910645
|
473 |
+
},
|
474 |
+
"llm_test_accuracy": {
|
475 |
+
"1.0": 0.9820000529289246,
|
476 |
+
"5.0": 0.9820000529289246
|
477 |
+
},
|
478 |
+
"llm_top_1_test_accuracy": {
|
479 |
+
"1.0": 0.673,
|
480 |
+
"5.0": 0.673
|
481 |
+
},
|
482 |
+
"llm_top_2_test_accuracy": {
|
483 |
+
"1.0": 0.724,
|
484 |
+
"5.0": 0.724
|
485 |
+
},
|
486 |
+
"llm_top_5_test_accuracy": {
|
487 |
+
"1.0": 0.766,
|
488 |
+
"5.0": 0.766
|
489 |
+
},
|
490 |
+
"sae_top_1_test_accuracy": {
|
491 |
+
"1.0": 0.591,
|
492 |
+
"5.0": 0.591
|
493 |
+
},
|
494 |
+
"sae_top_2_test_accuracy": {
|
495 |
+
"1.0": 0.664,
|
496 |
+
"5.0": 0.664
|
497 |
+
},
|
498 |
+
"sae_top_5_test_accuracy": {
|
499 |
+
"1.0": 0.904,
|
500 |
+
"5.0": 0.904
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"codeparrot/github-code_results": {
|
504 |
+
"sae_test_accuracy": {
|
505 |
+
"C": 0.9610000252723694,
|
506 |
+
"Python": 0.9830000400543213,
|
507 |
+
"HTML": 0.9890000224113464,
|
508 |
+
"Java": 0.9660000205039978,
|
509 |
+
"PHP": 0.9540000557899475
|
510 |
+
},
|
511 |
+
"llm_test_accuracy": {
|
512 |
+
"C": 0.956000030040741,
|
513 |
+
"Python": 0.9830000400543213,
|
514 |
+
"HTML": 0.984000027179718,
|
515 |
+
"Java": 0.9700000286102295,
|
516 |
+
"PHP": 0.9610000252723694
|
517 |
+
},
|
518 |
+
"llm_top_1_test_accuracy": {
|
519 |
+
"C": 0.663,
|
520 |
+
"Python": 0.655,
|
521 |
+
"HTML": 0.72,
|
522 |
+
"Java": 0.612,
|
523 |
+
"PHP": 0.595
|
524 |
+
},
|
525 |
+
"llm_top_2_test_accuracy": {
|
526 |
+
"C": 0.68,
|
527 |
+
"Python": 0.683,
|
528 |
+
"HTML": 0.798,
|
529 |
+
"Java": 0.681,
|
530 |
+
"PHP": 0.637
|
531 |
+
},
|
532 |
+
"llm_top_5_test_accuracy": {
|
533 |
+
"C": 0.74,
|
534 |
+
"Python": 0.724,
|
535 |
+
"HTML": 0.902,
|
536 |
+
"Java": 0.739,
|
537 |
+
"PHP": 0.673
|
538 |
+
},
|
539 |
+
"sae_top_1_test_accuracy": {
|
540 |
+
"C": 0.623,
|
541 |
+
"Python": 0.641,
|
542 |
+
"HTML": 0.687,
|
543 |
+
"Java": 0.613,
|
544 |
+
"PHP": 0.607
|
545 |
+
},
|
546 |
+
"sae_top_2_test_accuracy": {
|
547 |
+
"C": 0.627,
|
548 |
+
"Python": 0.661,
|
549 |
+
"HTML": 0.813,
|
550 |
+
"Java": 0.598,
|
551 |
+
"PHP": 0.908
|
552 |
+
},
|
553 |
+
"sae_top_5_test_accuracy": {
|
554 |
+
"C": 0.691,
|
555 |
+
"Python": 0.939,
|
556 |
+
"HTML": 0.835,
|
557 |
+
"Java": 0.662,
|
558 |
+
"PHP": 0.925
|
559 |
+
}
|
560 |
+
},
|
561 |
+
"fancyzhx/ag_news_results": {
|
562 |
+
"sae_test_accuracy": {
|
563 |
+
"0": 0.9450000524520874,
|
564 |
+
"1": 0.984000027179718,
|
565 |
+
"2": 0.940000057220459,
|
566 |
+
"3": 0.9460000395774841
|
567 |
+
},
|
568 |
+
"llm_test_accuracy": {
|
569 |
+
"0": 0.9450000524520874,
|
570 |
+
"1": 0.9890000224113464,
|
571 |
+
"2": 0.9200000166893005,
|
572 |
+
"3": 0.9450000524520874
|
573 |
+
},
|
574 |
+
"llm_top_1_test_accuracy": {
|
575 |
+
"0": 0.559,
|
576 |
+
"1": 0.66,
|
577 |
+
"2": 0.668,
|
578 |
+
"3": 0.65
|
579 |
+
},
|
580 |
+
"llm_top_2_test_accuracy": {
|
581 |
+
"0": 0.807,
|
582 |
+
"1": 0.799,
|
583 |
+
"2": 0.699,
|
584 |
+
"3": 0.823
|
585 |
+
},
|
586 |
+
"llm_top_5_test_accuracy": {
|
587 |
+
"0": 0.815,
|
588 |
+
"1": 0.88,
|
589 |
+
"2": 0.756,
|
590 |
+
"3": 0.848
|
591 |
+
},
|
592 |
+
"sae_top_1_test_accuracy": {
|
593 |
+
"0": 0.673,
|
594 |
+
"1": 0.637,
|
595 |
+
"2": 0.558,
|
596 |
+
"3": 0.71
|
597 |
+
},
|
598 |
+
"sae_top_2_test_accuracy": {
|
599 |
+
"0": 0.837,
|
600 |
+
"1": 0.657,
|
601 |
+
"2": 0.777,
|
602 |
+
"3": 0.722
|
603 |
+
},
|
604 |
+
"sae_top_5_test_accuracy": {
|
605 |
+
"0": 0.84,
|
606 |
+
"1": 0.872,
|
607 |
+
"2": 0.798,
|
608 |
+
"3": 0.765
|
609 |
+
}
|
610 |
+
},
|
611 |
+
"Helsinki-NLP/europarl_results": {
|
612 |
+
"sae_test_accuracy": {
|
613 |
+
"en": 0.9980000257492065,
|
614 |
+
"fr": 1.0,
|
615 |
+
"de": 0.999000072479248,
|
616 |
+
"es": 1.0,
|
617 |
+
"nl": 0.9980000257492065
|
618 |
+
},
|
619 |
+
"llm_test_accuracy": {
|
620 |
+
"en": 0.999000072479248,
|
621 |
+
"fr": 1.0,
|
622 |
+
"de": 1.0,
|
623 |
+
"es": 1.0,
|
624 |
+
"nl": 0.999000072479248
|
625 |
+
},
|
626 |
+
"llm_top_1_test_accuracy": {
|
627 |
+
"en": 0.741,
|
628 |
+
"fr": 0.609,
|
629 |
+
"de": 0.754,
|
630 |
+
"es": 0.494,
|
631 |
+
"nl": 0.656
|
632 |
+
},
|
633 |
+
"llm_top_2_test_accuracy": {
|
634 |
+
"en": 0.832,
|
635 |
+
"fr": 0.597,
|
636 |
+
"de": 0.826,
|
637 |
+
"es": 0.964,
|
638 |
+
"nl": 0.741
|
639 |
+
},
|
640 |
+
"llm_top_5_test_accuracy": {
|
641 |
+
"en": 0.879,
|
642 |
+
"fr": 0.909,
|
643 |
+
"de": 0.874,
|
644 |
+
"es": 0.979,
|
645 |
+
"nl": 0.867
|
646 |
+
},
|
647 |
+
"sae_top_1_test_accuracy": {
|
648 |
+
"en": 0.718,
|
649 |
+
"fr": 0.993,
|
650 |
+
"de": 0.891,
|
651 |
+
"es": 0.897,
|
652 |
+
"nl": 0.66
|
653 |
+
},
|
654 |
+
"sae_top_2_test_accuracy": {
|
655 |
+
"en": 0.722,
|
656 |
+
"fr": 0.997,
|
657 |
+
"de": 0.905,
|
658 |
+
"es": 0.911,
|
659 |
+
"nl": 0.998
|
660 |
+
},
|
661 |
+
"sae_top_5_test_accuracy": {
|
662 |
+
"en": 0.999,
|
663 |
+
"fr": 0.995,
|
664 |
+
"de": 0.98,
|
665 |
+
"es": 0.931,
|
666 |
+
"nl": 0.995
|
667 |
+
}
|
668 |
+
}
|
669 |
+
}
|
670 |
+
}
|
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "sparse_probing",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"LabHC/bias_in_bios_class_set2",
|
8 |
+
"LabHC/bias_in_bios_class_set3",
|
9 |
+
"canrager/amazon_reviews_mcauley_1and5",
|
10 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment",
|
11 |
+
"codeparrot/github-code",
|
12 |
+
"fancyzhx/ag_news",
|
13 |
+
"Helsinki-NLP/europarl"
|
14 |
+
],
|
15 |
+
"probe_train_set_size": 4000,
|
16 |
+
"probe_test_set_size": 1000,
|
17 |
+
"context_length": 128,
|
18 |
+
"sae_batch_size": 125,
|
19 |
+
"llm_batch_size": 32,
|
20 |
+
"llm_dtype": "bfloat16",
|
21 |
+
"model_name": "gemma-2-2b",
|
22 |
+
"k_values": [
|
23 |
+
1,
|
24 |
+
2,
|
25 |
+
5
|
26 |
+
],
|
27 |
+
"lower_vram_usage": false
|
28 |
+
},
|
29 |
+
"eval_id": "efe2ab1e-0e1b-47f8-a56e-d7f2f99e4ac0",
|
30 |
+
"datetime_epoch_millis": 1740203784246,
|
31 |
+
"eval_result_metrics": {
|
32 |
+
"llm": {
|
33 |
+
"llm_test_accuracy": 0.9587437950074672,
|
34 |
+
"llm_top_1_test_accuracy": 0.6508312500000001,
|
35 |
+
"llm_top_2_test_accuracy": 0.7267250000000001,
|
36 |
+
"llm_top_5_test_accuracy": 0.77896875,
|
37 |
+
"llm_top_10_test_accuracy": null,
|
38 |
+
"llm_top_20_test_accuracy": null,
|
39 |
+
"llm_top_50_test_accuracy": null,
|
40 |
+
"llm_top_100_test_accuracy": null
|
41 |
+
},
|
42 |
+
"sae": {
|
43 |
+
"sae_test_accuracy": 0.9502250470221043,
|
44 |
+
"sae_top_1_test_accuracy": 0.69245,
|
45 |
+
"sae_top_2_test_accuracy": 0.77648125,
|
46 |
+
"sae_top_5_test_accuracy": 0.85739375,
|
47 |
+
"sae_top_10_test_accuracy": null,
|
48 |
+
"sae_top_20_test_accuracy": null,
|
49 |
+
"sae_top_50_test_accuracy": null,
|
50 |
+
"sae_top_100_test_accuracy": null
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"eval_result_details": [
|
54 |
+
{
|
55 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
|
56 |
+
"llm_test_accuracy": 0.966800057888031,
|
57 |
+
"llm_top_1_test_accuracy": 0.6397999999999999,
|
58 |
+
"llm_top_2_test_accuracy": 0.6954,
|
59 |
+
"llm_top_5_test_accuracy": 0.7869999999999999,
|
60 |
+
"llm_top_10_test_accuracy": null,
|
61 |
+
"llm_top_20_test_accuracy": null,
|
62 |
+
"llm_top_50_test_accuracy": null,
|
63 |
+
"llm_top_100_test_accuracy": null,
|
64 |
+
"sae_test_accuracy": 0.955400037765503,
|
65 |
+
"sae_top_1_test_accuracy": 0.6858,
|
66 |
+
"sae_top_2_test_accuracy": 0.8606,
|
67 |
+
"sae_top_5_test_accuracy": 0.8960000000000001,
|
68 |
+
"sae_top_10_test_accuracy": null,
|
69 |
+
"sae_top_20_test_accuracy": null,
|
70 |
+
"sae_top_50_test_accuracy": null,
|
71 |
+
"sae_top_100_test_accuracy": null
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
|
75 |
+
"llm_test_accuracy": 0.9544000387191772,
|
76 |
+
"llm_top_1_test_accuracy": 0.6744000000000001,
|
77 |
+
"llm_top_2_test_accuracy": 0.7334,
|
78 |
+
"llm_top_5_test_accuracy": 0.763,
|
79 |
+
"llm_top_10_test_accuracy": null,
|
80 |
+
"llm_top_20_test_accuracy": null,
|
81 |
+
"llm_top_50_test_accuracy": null,
|
82 |
+
"llm_top_100_test_accuracy": null,
|
83 |
+
"sae_test_accuracy": 0.9468000531196594,
|
84 |
+
"sae_top_1_test_accuracy": 0.6689999999999999,
|
85 |
+
"sae_top_2_test_accuracy": 0.8225999999999999,
|
86 |
+
"sae_top_5_test_accuracy": 0.8782,
|
87 |
+
"sae_top_10_test_accuracy": null,
|
88 |
+
"sae_top_20_test_accuracy": null,
|
89 |
+
"sae_top_50_test_accuracy": null,
|
90 |
+
"sae_top_100_test_accuracy": null
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
|
94 |
+
"llm_test_accuracy": 0.9310000538825989,
|
95 |
+
"llm_top_1_test_accuracy": 0.6864000000000001,
|
96 |
+
"llm_top_2_test_accuracy": 0.7436,
|
97 |
+
"llm_top_5_test_accuracy": 0.763,
|
98 |
+
"llm_top_10_test_accuracy": null,
|
99 |
+
"llm_top_20_test_accuracy": null,
|
100 |
+
"llm_top_50_test_accuracy": null,
|
101 |
+
"llm_top_100_test_accuracy": null,
|
102 |
+
"sae_test_accuracy": 0.92160005569458,
|
103 |
+
"sae_top_1_test_accuracy": 0.7544,
|
104 |
+
"sae_top_2_test_accuracy": 0.7952,
|
105 |
+
"sae_top_5_test_accuracy": 0.8654,
|
106 |
+
"sae_top_10_test_accuracy": null,
|
107 |
+
"sae_top_20_test_accuracy": null,
|
108 |
+
"sae_top_50_test_accuracy": null,
|
109 |
+
"sae_top_100_test_accuracy": null
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
|
113 |
+
"llm_test_accuracy": 0.915600061416626,
|
114 |
+
"llm_top_1_test_accuracy": 0.599,
|
115 |
+
"llm_top_2_test_accuracy": 0.6476000000000001,
|
116 |
+
"llm_top_5_test_accuracy": 0.6708000000000001,
|
117 |
+
"llm_top_10_test_accuracy": null,
|
118 |
+
"llm_top_20_test_accuracy": null,
|
119 |
+
"llm_top_50_test_accuracy": null,
|
120 |
+
"llm_top_100_test_accuracy": null,
|
121 |
+
"sae_test_accuracy": 0.9052000403404236,
|
122 |
+
"sae_top_1_test_accuracy": 0.713,
|
123 |
+
"sae_top_2_test_accuracy": 0.7246,
|
124 |
+
"sae_top_5_test_accuracy": 0.8096,
|
125 |
+
"sae_top_10_test_accuracy": null,
|
126 |
+
"sae_top_20_test_accuracy": null,
|
127 |
+
"sae_top_50_test_accuracy": null,
|
128 |
+
"sae_top_100_test_accuracy": null
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
|
132 |
+
"llm_test_accuracy": 0.9820000529289246,
|
133 |
+
"llm_top_1_test_accuracy": 0.673,
|
134 |
+
"llm_top_2_test_accuracy": 0.724,
|
135 |
+
"llm_top_5_test_accuracy": 0.766,
|
136 |
+
"llm_top_10_test_accuracy": null,
|
137 |
+
"llm_top_20_test_accuracy": null,
|
138 |
+
"llm_top_50_test_accuracy": null,
|
139 |
+
"llm_top_100_test_accuracy": null,
|
140 |
+
"sae_test_accuracy": 0.9630000591278076,
|
141 |
+
"sae_top_1_test_accuracy": 0.742,
|
142 |
+
"sae_top_2_test_accuracy": 0.752,
|
143 |
+
"sae_top_5_test_accuracy": 0.797,
|
144 |
+
"sae_top_10_test_accuracy": null,
|
145 |
+
"sae_top_20_test_accuracy": null,
|
146 |
+
"sae_top_50_test_accuracy": null,
|
147 |
+
"sae_top_100_test_accuracy": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"dataset_name": "codeparrot/github-code_results",
|
151 |
+
"llm_test_accuracy": 0.9708000302314759,
|
152 |
+
"llm_top_1_test_accuracy": 0.649,
|
153 |
+
"llm_top_2_test_accuracy": 0.6958,
|
154 |
+
"llm_top_5_test_accuracy": 0.7556,
|
155 |
+
"llm_top_10_test_accuracy": null,
|
156 |
+
"llm_top_20_test_accuracy": null,
|
157 |
+
"llm_top_50_test_accuracy": null,
|
158 |
+
"llm_top_100_test_accuracy": null,
|
159 |
+
"sae_test_accuracy": 0.9660000443458557,
|
160 |
+
"sae_top_1_test_accuracy": 0.5998,
|
161 |
+
"sae_top_2_test_accuracy": 0.6456000000000001,
|
162 |
+
"sae_top_5_test_accuracy": 0.797,
|
163 |
+
"sae_top_10_test_accuracy": null,
|
164 |
+
"sae_top_20_test_accuracy": null,
|
165 |
+
"sae_top_50_test_accuracy": null,
|
166 |
+
"sae_top_100_test_accuracy": null
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"dataset_name": "fancyzhx/ag_news_results",
|
170 |
+
"llm_test_accuracy": 0.9497500360012054,
|
171 |
+
"llm_top_1_test_accuracy": 0.63425,
|
172 |
+
"llm_top_2_test_accuracy": 0.782,
|
173 |
+
"llm_top_5_test_accuracy": 0.8247499999999999,
|
174 |
+
"llm_top_10_test_accuracy": null,
|
175 |
+
"llm_top_20_test_accuracy": null,
|
176 |
+
"llm_top_50_test_accuracy": null,
|
177 |
+
"llm_top_100_test_accuracy": null,
|
178 |
+
"sae_test_accuracy": 0.9480000436306,
|
179 |
+
"sae_top_1_test_accuracy": 0.605,
|
180 |
+
"sae_top_2_test_accuracy": 0.69525,
|
181 |
+
"sae_top_5_test_accuracy": 0.83375,
|
182 |
+
"sae_top_10_test_accuracy": null,
|
183 |
+
"sae_top_20_test_accuracy": null,
|
184 |
+
"sae_top_50_test_accuracy": null,
|
185 |
+
"sae_top_100_test_accuracy": null
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"dataset_name": "Helsinki-NLP/europarl_results",
|
189 |
+
"llm_test_accuracy": 0.9996000289916992,
|
190 |
+
"llm_top_1_test_accuracy": 0.6508,
|
191 |
+
"llm_top_2_test_accuracy": 0.792,
|
192 |
+
"llm_top_5_test_accuracy": 0.9016,
|
193 |
+
"llm_top_10_test_accuracy": null,
|
194 |
+
"llm_top_20_test_accuracy": null,
|
195 |
+
"llm_top_50_test_accuracy": null,
|
196 |
+
"llm_top_100_test_accuracy": null,
|
197 |
+
"sae_test_accuracy": 0.9958000421524048,
|
198 |
+
"sae_top_1_test_accuracy": 0.7706000000000001,
|
199 |
+
"sae_top_2_test_accuracy": 0.916,
|
200 |
+
"sae_top_5_test_accuracy": 0.9822000000000001,
|
201 |
+
"sae_top_10_test_accuracy": null,
|
202 |
+
"sae_top_20_test_accuracy": null,
|
203 |
+
"sae_top_50_test_accuracy": null,
|
204 |
+
"sae_top_100_test_accuracy": null
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
|
208 |
+
"sae_lens_id": "custom_sae",
|
209 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000",
|
210 |
+
"sae_lens_version": "5.5.0",
|
211 |
+
"sae_cfg_dict": {
|
212 |
+
"model_name": "gemma-2-2b",
|
213 |
+
"d_in": 2304,
|
214 |
+
"d_sae": 65536,
|
215 |
+
"hook_layer": 12,
|
216 |
+
"hook_name": "blocks.12.hook_resid_post",
|
217 |
+
"context_size": null,
|
218 |
+
"hook_head_index": null,
|
219 |
+
"architecture": "topk",
|
220 |
+
"apply_b_dec_to_input": null,
|
221 |
+
"finetuning_scaling_factor": null,
|
222 |
+
"activation_fn_str": "",
|
223 |
+
"prepend_bos": true,
|
224 |
+
"normalize_activations": "none",
|
225 |
+
"dtype": "bfloat16",
|
226 |
+
"device": "",
|
227 |
+
"dataset_path": "",
|
228 |
+
"dataset_trust_remote_code": true,
|
229 |
+
"seqpos_slice": [
|
230 |
+
null
|
231 |
+
],
|
232 |
+
"training_tokens": -100000,
|
233 |
+
"sae_lens_training_version": null,
|
234 |
+
"neuronpedia_id": null
|
235 |
+
},
|
236 |
+
"eval_result_unstructured": {
|
237 |
+
"LabHC/bias_in_bios_class_set1_results": {
|
238 |
+
"sae_test_accuracy": {
|
239 |
+
"0": 0.9330000281333923,
|
240 |
+
"1": 0.9500000476837158,
|
241 |
+
"2": 0.9430000185966492,
|
242 |
+
"6": 0.9830000400543213,
|
243 |
+
"9": 0.968000054359436
|
244 |
+
},
|
245 |
+
"llm_test_accuracy": {
|
246 |
+
"0": 0.9510000348091125,
|
247 |
+
"1": 0.9670000672340393,
|
248 |
+
"2": 0.9530000686645508,
|
249 |
+
"6": 0.987000048160553,
|
250 |
+
"9": 0.9760000705718994
|
251 |
+
},
|
252 |
+
"llm_top_1_test_accuracy": {
|
253 |
+
"0": 0.577,
|
254 |
+
"1": 0.613,
|
255 |
+
"2": 0.662,
|
256 |
+
"6": 0.787,
|
257 |
+
"9": 0.56
|
258 |
+
},
|
259 |
+
"llm_top_2_test_accuracy": {
|
260 |
+
"0": 0.574,
|
261 |
+
"1": 0.66,
|
262 |
+
"2": 0.718,
|
263 |
+
"6": 0.811,
|
264 |
+
"9": 0.714
|
265 |
+
},
|
266 |
+
"llm_top_5_test_accuracy": {
|
267 |
+
"0": 0.713,
|
268 |
+
"1": 0.711,
|
269 |
+
"2": 0.755,
|
270 |
+
"6": 0.895,
|
271 |
+
"9": 0.861
|
272 |
+
},
|
273 |
+
"sae_top_1_test_accuracy": {
|
274 |
+
"0": 0.569,
|
275 |
+
"1": 0.639,
|
276 |
+
"2": 0.886,
|
277 |
+
"6": 0.787,
|
278 |
+
"9": 0.548
|
279 |
+
},
|
280 |
+
"sae_top_2_test_accuracy": {
|
281 |
+
"0": 0.878,
|
282 |
+
"1": 0.744,
|
283 |
+
"2": 0.884,
|
284 |
+
"6": 0.953,
|
285 |
+
"9": 0.844
|
286 |
+
},
|
287 |
+
"sae_top_5_test_accuracy": {
|
288 |
+
"0": 0.866,
|
289 |
+
"1": 0.853,
|
290 |
+
"2": 0.901,
|
291 |
+
"6": 0.983,
|
292 |
+
"9": 0.877
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"LabHC/bias_in_bios_class_set2_results": {
|
296 |
+
"sae_test_accuracy": {
|
297 |
+
"11": 0.9600000381469727,
|
298 |
+
"13": 0.9550000429153442,
|
299 |
+
"14": 0.9480000734329224,
|
300 |
+
"18": 0.9130000472068787,
|
301 |
+
"19": 0.9580000638961792
|
302 |
+
},
|
303 |
+
"llm_test_accuracy": {
|
304 |
+
"11": 0.9660000205039978,
|
305 |
+
"13": 0.9510000348091125,
|
306 |
+
"14": 0.9540000557899475,
|
307 |
+
"18": 0.940000057220459,
|
308 |
+
"19": 0.9610000252723694
|
309 |
+
},
|
310 |
+
"llm_top_1_test_accuracy": {
|
311 |
+
"11": 0.553,
|
312 |
+
"13": 0.673,
|
313 |
+
"14": 0.651,
|
314 |
+
"18": 0.706,
|
315 |
+
"19": 0.789
|
316 |
+
},
|
317 |
+
"llm_top_2_test_accuracy": {
|
318 |
+
"11": 0.77,
|
319 |
+
"13": 0.719,
|
320 |
+
"14": 0.672,
|
321 |
+
"18": 0.717,
|
322 |
+
"19": 0.789
|
323 |
+
},
|
324 |
+
"llm_top_5_test_accuracy": {
|
325 |
+
"11": 0.793,
|
326 |
+
"13": 0.739,
|
327 |
+
"14": 0.732,
|
328 |
+
"18": 0.723,
|
329 |
+
"19": 0.828
|
330 |
+
},
|
331 |
+
"sae_top_1_test_accuracy": {
|
332 |
+
"11": 0.552,
|
333 |
+
"13": 0.674,
|
334 |
+
"14": 0.646,
|
335 |
+
"18": 0.689,
|
336 |
+
"19": 0.784
|
337 |
+
},
|
338 |
+
"sae_top_2_test_accuracy": {
|
339 |
+
"11": 0.848,
|
340 |
+
"13": 0.69,
|
341 |
+
"14": 0.878,
|
342 |
+
"18": 0.915,
|
343 |
+
"19": 0.782
|
344 |
+
},
|
345 |
+
"sae_top_5_test_accuracy": {
|
346 |
+
"11": 0.925,
|
347 |
+
"13": 0.77,
|
348 |
+
"14": 0.881,
|
349 |
+
"18": 0.916,
|
350 |
+
"19": 0.899
|
351 |
+
}
|
352 |
+
},
|
353 |
+
"LabHC/bias_in_bios_class_set3_results": {
|
354 |
+
"sae_test_accuracy": {
|
355 |
+
"20": 0.9480000734329224,
|
356 |
+
"21": 0.921000063419342,
|
357 |
+
"22": 0.906000018119812,
|
358 |
+
"25": 0.9480000734329224,
|
359 |
+
"26": 0.8850000500679016
|
360 |
+
},
|
361 |
+
"llm_test_accuracy": {
|
362 |
+
"20": 0.9650000333786011,
|
363 |
+
"21": 0.9250000715255737,
|
364 |
+
"22": 0.9140000343322754,
|
365 |
+
"25": 0.9670000672340393,
|
366 |
+
"26": 0.8840000629425049
|
367 |
+
},
|
368 |
+
"llm_top_1_test_accuracy": {
|
369 |
+
"20": 0.709,
|
370 |
+
"21": 0.762,
|
371 |
+
"22": 0.653,
|
372 |
+
"25": 0.683,
|
373 |
+
"26": 0.625
|
374 |
+
},
|
375 |
+
"llm_top_2_test_accuracy": {
|
376 |
+
"20": 0.811,
|
377 |
+
"21": 0.769,
|
378 |
+
"22": 0.688,
|
379 |
+
"25": 0.766,
|
380 |
+
"26": 0.684
|
381 |
+
},
|
382 |
+
"llm_top_5_test_accuracy": {
|
383 |
+
"20": 0.815,
|
384 |
+
"21": 0.794,
|
385 |
+
"22": 0.706,
|
386 |
+
"25": 0.803,
|
387 |
+
"26": 0.697
|
388 |
+
},
|
389 |
+
"sae_top_1_test_accuracy": {
|
390 |
+
"20": 0.851,
|
391 |
+
"21": 0.738,
|
392 |
+
"22": 0.835,
|
393 |
+
"25": 0.703,
|
394 |
+
"26": 0.645
|
395 |
+
},
|
396 |
+
"sae_top_2_test_accuracy": {
|
397 |
+
"20": 0.883,
|
398 |
+
"21": 0.75,
|
399 |
+
"22": 0.859,
|
400 |
+
"25": 0.852,
|
401 |
+
"26": 0.632
|
402 |
+
},
|
403 |
+
"sae_top_5_test_accuracy": {
|
404 |
+
"20": 0.941,
|
405 |
+
"21": 0.843,
|
406 |
+
"22": 0.869,
|
407 |
+
"25": 0.873,
|
408 |
+
"26": 0.801
|
409 |
+
}
|
410 |
+
},
|
411 |
+
"canrager/amazon_reviews_mcauley_1and5_results": {
|
412 |
+
"sae_test_accuracy": {
|
413 |
+
"1": 0.9300000667572021,
|
414 |
+
"2": 0.9230000376701355,
|
415 |
+
"3": 0.906000018119812,
|
416 |
+
"5": 0.9180000424385071,
|
417 |
+
"6": 0.8490000367164612
|
418 |
+
},
|
419 |
+
"llm_test_accuracy": {
|
420 |
+
"1": 0.9490000605583191,
|
421 |
+
"2": 0.9300000667572021,
|
422 |
+
"3": 0.9120000600814819,
|
423 |
+
"5": 0.9250000715255737,
|
424 |
+
"6": 0.862000048160553
|
425 |
+
},
|
426 |
+
"llm_top_1_test_accuracy": {
|
427 |
+
"1": 0.684,
|
428 |
+
"2": 0.592,
|
429 |
+
"3": 0.583,
|
430 |
+
"5": 0.551,
|
431 |
+
"6": 0.585
|
432 |
+
},
|
433 |
+
"llm_top_2_test_accuracy": {
|
434 |
+
"1": 0.739,
|
435 |
+
"2": 0.635,
|
436 |
+
"3": 0.609,
|
437 |
+
"5": 0.635,
|
438 |
+
"6": 0.62
|
439 |
+
},
|
440 |
+
"llm_top_5_test_accuracy": {
|
441 |
+
"1": 0.757,
|
442 |
+
"2": 0.636,
|
443 |
+
"3": 0.622,
|
444 |
+
"5": 0.657,
|
445 |
+
"6": 0.682
|
446 |
+
},
|
447 |
+
"sae_top_1_test_accuracy": {
|
448 |
+
"1": 0.862,
|
449 |
+
"2": 0.864,
|
450 |
+
"3": 0.686,
|
451 |
+
"5": 0.527,
|
452 |
+
"6": 0.626
|
453 |
+
},
|
454 |
+
"sae_top_2_test_accuracy": {
|
455 |
+
"1": 0.861,
|
456 |
+
"2": 0.865,
|
457 |
+
"3": 0.67,
|
458 |
+
"5": 0.522,
|
459 |
+
"6": 0.705
|
460 |
+
},
|
461 |
+
"sae_top_5_test_accuracy": {
|
462 |
+
"1": 0.885,
|
463 |
+
"2": 0.874,
|
464 |
+
"3": 0.688,
|
465 |
+
"5": 0.87,
|
466 |
+
"6": 0.731
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
|
470 |
+
"sae_test_accuracy": {
|
471 |
+
"1.0": 0.9640000462532043,
|
472 |
+
"5.0": 0.9620000720024109
|
473 |
+
},
|
474 |
+
"llm_test_accuracy": {
|
475 |
+
"1.0": 0.9820000529289246,
|
476 |
+
"5.0": 0.9820000529289246
|
477 |
+
},
|
478 |
+
"llm_top_1_test_accuracy": {
|
479 |
+
"1.0": 0.673,
|
480 |
+
"5.0": 0.673
|
481 |
+
},
|
482 |
+
"llm_top_2_test_accuracy": {
|
483 |
+
"1.0": 0.724,
|
484 |
+
"5.0": 0.724
|
485 |
+
},
|
486 |
+
"llm_top_5_test_accuracy": {
|
487 |
+
"1.0": 0.766,
|
488 |
+
"5.0": 0.766
|
489 |
+
},
|
490 |
+
"sae_top_1_test_accuracy": {
|
491 |
+
"1.0": 0.742,
|
492 |
+
"5.0": 0.742
|
493 |
+
},
|
494 |
+
"sae_top_2_test_accuracy": {
|
495 |
+
"1.0": 0.752,
|
496 |
+
"5.0": 0.752
|
497 |
+
},
|
498 |
+
"sae_top_5_test_accuracy": {
|
499 |
+
"1.0": 0.797,
|
500 |
+
"5.0": 0.797
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"codeparrot/github-code_results": {
|
504 |
+
"sae_test_accuracy": {
|
505 |
+
"C": 0.9490000605583191,
|
506 |
+
"Python": 0.9790000319480896,
|
507 |
+
"HTML": 0.984000027179718,
|
508 |
+
"Java": 0.971000075340271,
|
509 |
+
"PHP": 0.9470000267028809
|
510 |
+
},
|
511 |
+
"llm_test_accuracy": {
|
512 |
+
"C": 0.956000030040741,
|
513 |
+
"Python": 0.9830000400543213,
|
514 |
+
"HTML": 0.984000027179718,
|
515 |
+
"Java": 0.9700000286102295,
|
516 |
+
"PHP": 0.9610000252723694
|
517 |
+
},
|
518 |
+
"llm_top_1_test_accuracy": {
|
519 |
+
"C": 0.663,
|
520 |
+
"Python": 0.655,
|
521 |
+
"HTML": 0.72,
|
522 |
+
"Java": 0.612,
|
523 |
+
"PHP": 0.595
|
524 |
+
},
|
525 |
+
"llm_top_2_test_accuracy": {
|
526 |
+
"C": 0.68,
|
527 |
+
"Python": 0.683,
|
528 |
+
"HTML": 0.798,
|
529 |
+
"Java": 0.681,
|
530 |
+
"PHP": 0.637
|
531 |
+
},
|
532 |
+
"llm_top_5_test_accuracy": {
|
533 |
+
"C": 0.74,
|
534 |
+
"Python": 0.724,
|
535 |
+
"HTML": 0.902,
|
536 |
+
"Java": 0.739,
|
537 |
+
"PHP": 0.673
|
538 |
+
},
|
539 |
+
"sae_top_1_test_accuracy": {
|
540 |
+
"C": 0.545,
|
541 |
+
"Python": 0.646,
|
542 |
+
"HTML": 0.602,
|
543 |
+
"Java": 0.616,
|
544 |
+
"PHP": 0.59
|
545 |
+
},
|
546 |
+
"sae_top_2_test_accuracy": {
|
547 |
+
"C": 0.573,
|
548 |
+
"Python": 0.68,
|
549 |
+
"HTML": 0.748,
|
550 |
+
"Java": 0.646,
|
551 |
+
"PHP": 0.581
|
552 |
+
},
|
553 |
+
"sae_top_5_test_accuracy": {
|
554 |
+
"C": 0.656,
|
555 |
+
"Python": 0.855,
|
556 |
+
"HTML": 0.896,
|
557 |
+
"Java": 0.669,
|
558 |
+
"PHP": 0.909
|
559 |
+
}
|
560 |
+
},
|
561 |
+
"fancyzhx/ag_news_results": {
|
562 |
+
"sae_test_accuracy": {
|
563 |
+
"0": 0.9420000314712524,
|
564 |
+
"1": 0.9820000529289246,
|
565 |
+
"2": 0.9270000457763672,
|
566 |
+
"3": 0.9410000443458557
|
567 |
+
},
|
568 |
+
"llm_test_accuracy": {
|
569 |
+
"0": 0.9450000524520874,
|
570 |
+
"1": 0.9890000224113464,
|
571 |
+
"2": 0.9200000166893005,
|
572 |
+
"3": 0.9450000524520874
|
573 |
+
},
|
574 |
+
"llm_top_1_test_accuracy": {
|
575 |
+
"0": 0.559,
|
576 |
+
"1": 0.66,
|
577 |
+
"2": 0.668,
|
578 |
+
"3": 0.65
|
579 |
+
},
|
580 |
+
"llm_top_2_test_accuracy": {
|
581 |
+
"0": 0.807,
|
582 |
+
"1": 0.799,
|
583 |
+
"2": 0.699,
|
584 |
+
"3": 0.823
|
585 |
+
},
|
586 |
+
"llm_top_5_test_accuracy": {
|
587 |
+
"0": 0.815,
|
588 |
+
"1": 0.88,
|
589 |
+
"2": 0.756,
|
590 |
+
"3": 0.848
|
591 |
+
},
|
592 |
+
"sae_top_1_test_accuracy": {
|
593 |
+
"0": 0.582,
|
594 |
+
"1": 0.656,
|
595 |
+
"2": 0.544,
|
596 |
+
"3": 0.638
|
597 |
+
},
|
598 |
+
"sae_top_2_test_accuracy": {
|
599 |
+
"0": 0.669,
|
600 |
+
"1": 0.693,
|
601 |
+
"2": 0.754,
|
602 |
+
"3": 0.665
|
603 |
+
},
|
604 |
+
"sae_top_5_test_accuracy": {
|
605 |
+
"0": 0.796,
|
606 |
+
"1": 0.934,
|
607 |
+
"2": 0.802,
|
608 |
+
"3": 0.803
|
609 |
+
}
|
610 |
+
},
|
611 |
+
"Helsinki-NLP/europarl_results": {
|
612 |
+
"sae_test_accuracy": {
|
613 |
+
"en": 0.9970000386238098,
|
614 |
+
"fr": 0.9980000257492065,
|
615 |
+
"de": 0.9930000305175781,
|
616 |
+
"es": 0.9950000643730164,
|
617 |
+
"nl": 0.9960000514984131
|
618 |
+
},
|
619 |
+
"llm_test_accuracy": {
|
620 |
+
"en": 0.999000072479248,
|
621 |
+
"fr": 1.0,
|
622 |
+
"de": 1.0,
|
623 |
+
"es": 1.0,
|
624 |
+
"nl": 0.999000072479248
|
625 |
+
},
|
626 |
+
"llm_top_1_test_accuracy": {
|
627 |
+
"en": 0.741,
|
628 |
+
"fr": 0.609,
|
629 |
+
"de": 0.754,
|
630 |
+
"es": 0.494,
|
631 |
+
"nl": 0.656
|
632 |
+
},
|
633 |
+
"llm_top_2_test_accuracy": {
|
634 |
+
"en": 0.832,
|
635 |
+
"fr": 0.597,
|
636 |
+
"de": 0.826,
|
637 |
+
"es": 0.964,
|
638 |
+
"nl": 0.741
|
639 |
+
},
|
640 |
+
"llm_top_5_test_accuracy": {
|
641 |
+
"en": 0.879,
|
642 |
+
"fr": 0.909,
|
643 |
+
"de": 0.874,
|
644 |
+
"es": 0.979,
|
645 |
+
"nl": 0.867
|
646 |
+
},
|
647 |
+
"sae_top_1_test_accuracy": {
|
648 |
+
"en": 0.741,
|
649 |
+
"fr": 0.595,
|
650 |
+
"de": 0.936,
|
651 |
+
"es": 0.925,
|
652 |
+
"nl": 0.656
|
653 |
+
},
|
654 |
+
"sae_top_2_test_accuracy": {
|
655 |
+
"en": 0.974,
|
656 |
+
"fr": 0.978,
|
657 |
+
"de": 0.936,
|
658 |
+
"es": 0.919,
|
659 |
+
"nl": 0.773
|
660 |
+
},
|
661 |
+
"sae_top_5_test_accuracy": {
|
662 |
+
"en": 0.998,
|
663 |
+
"fr": 0.98,
|
664 |
+
"de": 0.95,
|
665 |
+
"es": 0.99,
|
666 |
+
"nl": 0.993
|
667 |
+
}
|
668 |
+
}
|
669 |
+
}
|
670 |
+
}
|
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "sparse_probing",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"LabHC/bias_in_bios_class_set2",
|
8 |
+
"LabHC/bias_in_bios_class_set3",
|
9 |
+
"canrager/amazon_reviews_mcauley_1and5",
|
10 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment",
|
11 |
+
"codeparrot/github-code",
|
12 |
+
"fancyzhx/ag_news",
|
13 |
+
"Helsinki-NLP/europarl"
|
14 |
+
],
|
15 |
+
"probe_train_set_size": 4000,
|
16 |
+
"probe_test_set_size": 1000,
|
17 |
+
"context_length": 128,
|
18 |
+
"sae_batch_size": 125,
|
19 |
+
"llm_batch_size": 32,
|
20 |
+
"llm_dtype": "bfloat16",
|
21 |
+
"model_name": "gemma-2-2b",
|
22 |
+
"k_values": [
|
23 |
+
1,
|
24 |
+
2,
|
25 |
+
5
|
26 |
+
],
|
27 |
+
"lower_vram_usage": false
|
28 |
+
},
|
29 |
+
"eval_id": "46c18a2b-109a-432c-a208-824f33f6831a",
|
30 |
+
"datetime_epoch_millis": 1740203656830,
|
31 |
+
"eval_result_metrics": {
|
32 |
+
"llm": {
|
33 |
+
"llm_test_accuracy": 0.9587437950074672,
|
34 |
+
"llm_top_1_test_accuracy": 0.6508312500000001,
|
35 |
+
"llm_top_2_test_accuracy": 0.7267250000000001,
|
36 |
+
"llm_top_5_test_accuracy": 0.77896875,
|
37 |
+
"llm_top_10_test_accuracy": null,
|
38 |
+
"llm_top_20_test_accuracy": null,
|
39 |
+
"llm_top_50_test_accuracy": null,
|
40 |
+
"llm_top_100_test_accuracy": null
|
41 |
+
},
|
42 |
+
"sae": {
|
43 |
+
"sae_test_accuracy": 0.9546437960118056,
|
44 |
+
"sae_top_1_test_accuracy": 0.7031999999999999,
|
45 |
+
"sae_top_2_test_accuracy": 0.80226875,
|
46 |
+
"sae_top_5_test_accuracy": 0.8621687499999999,
|
47 |
+
"sae_top_10_test_accuracy": null,
|
48 |
+
"sae_top_20_test_accuracy": null,
|
49 |
+
"sae_top_50_test_accuracy": null,
|
50 |
+
"sae_top_100_test_accuracy": null
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"eval_result_details": [
|
54 |
+
{
|
55 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
|
56 |
+
"llm_test_accuracy": 0.966800057888031,
|
57 |
+
"llm_top_1_test_accuracy": 0.6397999999999999,
|
58 |
+
"llm_top_2_test_accuracy": 0.6954,
|
59 |
+
"llm_top_5_test_accuracy": 0.7869999999999999,
|
60 |
+
"llm_top_10_test_accuracy": null,
|
61 |
+
"llm_top_20_test_accuracy": null,
|
62 |
+
"llm_top_50_test_accuracy": null,
|
63 |
+
"llm_top_100_test_accuracy": null,
|
64 |
+
"sae_test_accuracy": 0.9562000513076783,
|
65 |
+
"sae_top_1_test_accuracy": 0.6838,
|
66 |
+
"sae_top_2_test_accuracy": 0.8674,
|
67 |
+
"sae_top_5_test_accuracy": 0.8904,
|
68 |
+
"sae_top_10_test_accuracy": null,
|
69 |
+
"sae_top_20_test_accuracy": null,
|
70 |
+
"sae_top_50_test_accuracy": null,
|
71 |
+
"sae_top_100_test_accuracy": null
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
|
75 |
+
"llm_test_accuracy": 0.9544000387191772,
|
76 |
+
"llm_top_1_test_accuracy": 0.6744000000000001,
|
77 |
+
"llm_top_2_test_accuracy": 0.7334,
|
78 |
+
"llm_top_5_test_accuracy": 0.763,
|
79 |
+
"llm_top_10_test_accuracy": null,
|
80 |
+
"llm_top_20_test_accuracy": null,
|
81 |
+
"llm_top_50_test_accuracy": null,
|
82 |
+
"llm_top_100_test_accuracy": null,
|
83 |
+
"sae_test_accuracy": 0.9466000437736511,
|
84 |
+
"sae_top_1_test_accuracy": 0.6722,
|
85 |
+
"sae_top_2_test_accuracy": 0.7876000000000001,
|
86 |
+
"sae_top_5_test_accuracy": 0.8619999999999999,
|
87 |
+
"sae_top_10_test_accuracy": null,
|
88 |
+
"sae_top_20_test_accuracy": null,
|
89 |
+
"sae_top_50_test_accuracy": null,
|
90 |
+
"sae_top_100_test_accuracy": null
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
|
94 |
+
"llm_test_accuracy": 0.9310000538825989,
|
95 |
+
"llm_top_1_test_accuracy": 0.6864000000000001,
|
96 |
+
"llm_top_2_test_accuracy": 0.7436,
|
97 |
+
"llm_top_5_test_accuracy": 0.763,
|
98 |
+
"llm_top_10_test_accuracy": null,
|
99 |
+
"llm_top_20_test_accuracy": null,
|
100 |
+
"llm_top_50_test_accuracy": null,
|
101 |
+
"llm_top_100_test_accuracy": null,
|
102 |
+
"sae_test_accuracy": 0.9256000518798828,
|
103 |
+
"sae_top_1_test_accuracy": 0.7188,
|
104 |
+
"sae_top_2_test_accuracy": 0.7922,
|
105 |
+
"sae_top_5_test_accuracy": 0.8664,
|
106 |
+
"sae_top_10_test_accuracy": null,
|
107 |
+
"sae_top_20_test_accuracy": null,
|
108 |
+
"sae_top_50_test_accuracy": null,
|
109 |
+
"sae_top_100_test_accuracy": null
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
|
113 |
+
"llm_test_accuracy": 0.915600061416626,
|
114 |
+
"llm_top_1_test_accuracy": 0.599,
|
115 |
+
"llm_top_2_test_accuracy": 0.6476000000000001,
|
116 |
+
"llm_top_5_test_accuracy": 0.6708000000000001,
|
117 |
+
"llm_top_10_test_accuracy": null,
|
118 |
+
"llm_top_20_test_accuracy": null,
|
119 |
+
"llm_top_50_test_accuracy": null,
|
120 |
+
"llm_top_100_test_accuracy": null,
|
121 |
+
"sae_test_accuracy": 0.9176000356674194,
|
122 |
+
"sae_top_1_test_accuracy": 0.7352000000000001,
|
123 |
+
"sae_top_2_test_accuracy": 0.772,
|
124 |
+
"sae_top_5_test_accuracy": 0.7998,
|
125 |
+
"sae_top_10_test_accuracy": null,
|
126 |
+
"sae_top_20_test_accuracy": null,
|
127 |
+
"sae_top_50_test_accuracy": null,
|
128 |
+
"sae_top_100_test_accuracy": null
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
|
132 |
+
"llm_test_accuracy": 0.9820000529289246,
|
133 |
+
"llm_top_1_test_accuracy": 0.673,
|
134 |
+
"llm_top_2_test_accuracy": 0.724,
|
135 |
+
"llm_top_5_test_accuracy": 0.766,
|
136 |
+
"llm_top_10_test_accuracy": null,
|
137 |
+
"llm_top_20_test_accuracy": null,
|
138 |
+
"llm_top_50_test_accuracy": null,
|
139 |
+
"llm_top_100_test_accuracy": null,
|
140 |
+
"sae_test_accuracy": 0.9705000519752502,
|
141 |
+
"sae_top_1_test_accuracy": 0.832,
|
142 |
+
"sae_top_2_test_accuracy": 0.83,
|
143 |
+
"sae_top_5_test_accuracy": 0.919,
|
144 |
+
"sae_top_10_test_accuracy": null,
|
145 |
+
"sae_top_20_test_accuracy": null,
|
146 |
+
"sae_top_50_test_accuracy": null,
|
147 |
+
"sae_top_100_test_accuracy": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"dataset_name": "codeparrot/github-code_results",
|
151 |
+
"llm_test_accuracy": 0.9708000302314759,
|
152 |
+
"llm_top_1_test_accuracy": 0.649,
|
153 |
+
"llm_top_2_test_accuracy": 0.6958,
|
154 |
+
"llm_top_5_test_accuracy": 0.7556,
|
155 |
+
"llm_top_10_test_accuracy": null,
|
156 |
+
"llm_top_20_test_accuracy": null,
|
157 |
+
"llm_top_50_test_accuracy": null,
|
158 |
+
"llm_top_100_test_accuracy": null,
|
159 |
+
"sae_test_accuracy": 0.9708000421524048,
|
160 |
+
"sae_top_1_test_accuracy": 0.614,
|
161 |
+
"sae_top_2_test_accuracy": 0.6594,
|
162 |
+
"sae_top_5_test_accuracy": 0.7447999999999999,
|
163 |
+
"sae_top_10_test_accuracy": null,
|
164 |
+
"sae_top_20_test_accuracy": null,
|
165 |
+
"sae_top_50_test_accuracy": null,
|
166 |
+
"sae_top_100_test_accuracy": null
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"dataset_name": "fancyzhx/ag_news_results",
|
170 |
+
"llm_test_accuracy": 0.9497500360012054,
|
171 |
+
"llm_top_1_test_accuracy": 0.63425,
|
172 |
+
"llm_top_2_test_accuracy": 0.782,
|
173 |
+
"llm_top_5_test_accuracy": 0.8247499999999999,
|
174 |
+
"llm_top_10_test_accuracy": null,
|
175 |
+
"llm_top_20_test_accuracy": null,
|
176 |
+
"llm_top_50_test_accuracy": null,
|
177 |
+
"llm_top_100_test_accuracy": null,
|
178 |
+
"sae_test_accuracy": 0.9522500485181808,
|
179 |
+
"sae_top_1_test_accuracy": 0.6060000000000001,
|
180 |
+
"sae_top_2_test_accuracy": 0.71875,
|
181 |
+
"sae_top_5_test_accuracy": 0.81875,
|
182 |
+
"sae_top_10_test_accuracy": null,
|
183 |
+
"sae_top_20_test_accuracy": null,
|
184 |
+
"sae_top_50_test_accuracy": null,
|
185 |
+
"sae_top_100_test_accuracy": null
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"dataset_name": "Helsinki-NLP/europarl_results",
|
189 |
+
"llm_test_accuracy": 0.9996000289916992,
|
190 |
+
"llm_top_1_test_accuracy": 0.6508,
|
191 |
+
"llm_top_2_test_accuracy": 0.792,
|
192 |
+
"llm_top_5_test_accuracy": 0.9016,
|
193 |
+
"llm_top_10_test_accuracy": null,
|
194 |
+
"llm_top_20_test_accuracy": null,
|
195 |
+
"llm_top_50_test_accuracy": null,
|
196 |
+
"llm_top_100_test_accuracy": null,
|
197 |
+
"sae_test_accuracy": 0.9976000428199768,
|
198 |
+
"sae_top_1_test_accuracy": 0.7636000000000001,
|
199 |
+
"sae_top_2_test_accuracy": 0.9907999999999999,
|
200 |
+
"sae_top_5_test_accuracy": 0.9962,
|
201 |
+
"sae_top_10_test_accuracy": null,
|
202 |
+
"sae_top_20_test_accuracy": null,
|
203 |
+
"sae_top_50_test_accuracy": null,
|
204 |
+
"sae_top_100_test_accuracy": null
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
|
208 |
+
"sae_lens_id": "custom_sae",
|
209 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000",
|
210 |
+
"sae_lens_version": "5.5.0",
|
211 |
+
"sae_cfg_dict": {
|
212 |
+
"model_name": "gemma-2-2b",
|
213 |
+
"d_in": 2304,
|
214 |
+
"d_sae": 65536,
|
215 |
+
"hook_layer": 12,
|
216 |
+
"hook_name": "blocks.12.hook_resid_post",
|
217 |
+
"context_size": null,
|
218 |
+
"hook_head_index": null,
|
219 |
+
"architecture": "topk",
|
220 |
+
"apply_b_dec_to_input": null,
|
221 |
+
"finetuning_scaling_factor": null,
|
222 |
+
"activation_fn_str": "",
|
223 |
+
"prepend_bos": true,
|
224 |
+
"normalize_activations": "none",
|
225 |
+
"dtype": "bfloat16",
|
226 |
+
"device": "",
|
227 |
+
"dataset_path": "",
|
228 |
+
"dataset_trust_remote_code": true,
|
229 |
+
"seqpos_slice": [
|
230 |
+
null
|
231 |
+
],
|
232 |
+
"training_tokens": -100000,
|
233 |
+
"sae_lens_training_version": null,
|
234 |
+
"neuronpedia_id": null
|
235 |
+
},
|
236 |
+
"eval_result_unstructured": {
|
237 |
+
"LabHC/bias_in_bios_class_set1_results": {
|
238 |
+
"sae_test_accuracy": {
|
239 |
+
"0": 0.940000057220459,
|
240 |
+
"1": 0.9530000686645508,
|
241 |
+
"2": 0.9430000185966492,
|
242 |
+
"6": 0.9830000400543213,
|
243 |
+
"9": 0.9620000720024109
|
244 |
+
},
|
245 |
+
"llm_test_accuracy": {
|
246 |
+
"0": 0.9510000348091125,
|
247 |
+
"1": 0.9670000672340393,
|
248 |
+
"2": 0.9530000686645508,
|
249 |
+
"6": 0.987000048160553,
|
250 |
+
"9": 0.9760000705718994
|
251 |
+
},
|
252 |
+
"llm_top_1_test_accuracy": {
|
253 |
+
"0": 0.577,
|
254 |
+
"1": 0.613,
|
255 |
+
"2": 0.662,
|
256 |
+
"6": 0.787,
|
257 |
+
"9": 0.56
|
258 |
+
},
|
259 |
+
"llm_top_2_test_accuracy": {
|
260 |
+
"0": 0.574,
|
261 |
+
"1": 0.66,
|
262 |
+
"2": 0.718,
|
263 |
+
"6": 0.811,
|
264 |
+
"9": 0.714
|
265 |
+
},
|
266 |
+
"llm_top_5_test_accuracy": {
|
267 |
+
"0": 0.713,
|
268 |
+
"1": 0.711,
|
269 |
+
"2": 0.755,
|
270 |
+
"6": 0.895,
|
271 |
+
"9": 0.861
|
272 |
+
},
|
273 |
+
"sae_top_1_test_accuracy": {
|
274 |
+
"0": 0.563,
|
275 |
+
"1": 0.64,
|
276 |
+
"2": 0.84,
|
277 |
+
"6": 0.814,
|
278 |
+
"9": 0.562
|
279 |
+
},
|
280 |
+
"sae_top_2_test_accuracy": {
|
281 |
+
"0": 0.867,
|
282 |
+
"1": 0.819,
|
283 |
+
"2": 0.84,
|
284 |
+
"6": 0.976,
|
285 |
+
"9": 0.835
|
286 |
+
},
|
287 |
+
"sae_top_5_test_accuracy": {
|
288 |
+
"0": 0.874,
|
289 |
+
"1": 0.861,
|
290 |
+
"2": 0.859,
|
291 |
+
"6": 0.988,
|
292 |
+
"9": 0.87
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"LabHC/bias_in_bios_class_set2_results": {
|
296 |
+
"sae_test_accuracy": {
|
297 |
+
"11": 0.9550000429153442,
|
298 |
+
"13": 0.9530000686645508,
|
299 |
+
"14": 0.9460000395774841,
|
300 |
+
"18": 0.9220000505447388,
|
301 |
+
"19": 0.9570000171661377
|
302 |
+
},
|
303 |
+
"llm_test_accuracy": {
|
304 |
+
"11": 0.9660000205039978,
|
305 |
+
"13": 0.9510000348091125,
|
306 |
+
"14": 0.9540000557899475,
|
307 |
+
"18": 0.940000057220459,
|
308 |
+
"19": 0.9610000252723694
|
309 |
+
},
|
310 |
+
"llm_top_1_test_accuracy": {
|
311 |
+
"11": 0.553,
|
312 |
+
"13": 0.673,
|
313 |
+
"14": 0.651,
|
314 |
+
"18": 0.706,
|
315 |
+
"19": 0.789
|
316 |
+
},
|
317 |
+
"llm_top_2_test_accuracy": {
|
318 |
+
"11": 0.77,
|
319 |
+
"13": 0.719,
|
320 |
+
"14": 0.672,
|
321 |
+
"18": 0.717,
|
322 |
+
"19": 0.789
|
323 |
+
},
|
324 |
+
"llm_top_5_test_accuracy": {
|
325 |
+
"11": 0.793,
|
326 |
+
"13": 0.739,
|
327 |
+
"14": 0.732,
|
328 |
+
"18": 0.723,
|
329 |
+
"19": 0.828
|
330 |
+
},
|
331 |
+
"sae_top_1_test_accuracy": {
|
332 |
+
"11": 0.562,
|
333 |
+
"13": 0.667,
|
334 |
+
"14": 0.635,
|
335 |
+
"18": 0.704,
|
336 |
+
"19": 0.793
|
337 |
+
},
|
338 |
+
"sae_top_2_test_accuracy": {
|
339 |
+
"11": 0.855,
|
340 |
+
"13": 0.668,
|
341 |
+
"14": 0.865,
|
342 |
+
"18": 0.732,
|
343 |
+
"19": 0.818
|
344 |
+
},
|
345 |
+
"sae_top_5_test_accuracy": {
|
346 |
+
"11": 0.947,
|
347 |
+
"13": 0.718,
|
348 |
+
"14": 0.884,
|
349 |
+
"18": 0.896,
|
350 |
+
"19": 0.865
|
351 |
+
}
|
352 |
+
},
|
353 |
+
"LabHC/bias_in_bios_class_set3_results": {
|
354 |
+
"sae_test_accuracy": {
|
355 |
+
"20": 0.9470000267028809,
|
356 |
+
"21": 0.9270000457763672,
|
357 |
+
"22": 0.9120000600814819,
|
358 |
+
"25": 0.9580000638961792,
|
359 |
+
"26": 0.8840000629425049
|
360 |
+
},
|
361 |
+
"llm_test_accuracy": {
|
362 |
+
"20": 0.9650000333786011,
|
363 |
+
"21": 0.9250000715255737,
|
364 |
+
"22": 0.9140000343322754,
|
365 |
+
"25": 0.9670000672340393,
|
366 |
+
"26": 0.8840000629425049
|
367 |
+
},
|
368 |
+
"llm_top_1_test_accuracy": {
|
369 |
+
"20": 0.709,
|
370 |
+
"21": 0.762,
|
371 |
+
"22": 0.653,
|
372 |
+
"25": 0.683,
|
373 |
+
"26": 0.625
|
374 |
+
},
|
375 |
+
"llm_top_2_test_accuracy": {
|
376 |
+
"20": 0.811,
|
377 |
+
"21": 0.769,
|
378 |
+
"22": 0.688,
|
379 |
+
"25": 0.766,
|
380 |
+
"26": 0.684
|
381 |
+
},
|
382 |
+
"llm_top_5_test_accuracy": {
|
383 |
+
"20": 0.815,
|
384 |
+
"21": 0.794,
|
385 |
+
"22": 0.706,
|
386 |
+
"25": 0.803,
|
387 |
+
"26": 0.697
|
388 |
+
},
|
389 |
+
"sae_top_1_test_accuracy": {
|
390 |
+
"20": 0.841,
|
391 |
+
"21": 0.504,
|
392 |
+
"22": 0.887,
|
393 |
+
"25": 0.712,
|
394 |
+
"26": 0.65
|
395 |
+
},
|
396 |
+
"sae_top_2_test_accuracy": {
|
397 |
+
"20": 0.855,
|
398 |
+
"21": 0.737,
|
399 |
+
"22": 0.89,
|
400 |
+
"25": 0.848,
|
401 |
+
"26": 0.631
|
402 |
+
},
|
403 |
+
"sae_top_5_test_accuracy": {
|
404 |
+
"20": 0.917,
|
405 |
+
"21": 0.85,
|
406 |
+
"22": 0.886,
|
407 |
+
"25": 0.889,
|
408 |
+
"26": 0.79
|
409 |
+
}
|
410 |
+
},
|
411 |
+
"canrager/amazon_reviews_mcauley_1and5_results": {
|
412 |
+
"sae_test_accuracy": {
|
413 |
+
"1": 0.9450000524520874,
|
414 |
+
"2": 0.9350000619888306,
|
415 |
+
"3": 0.9200000166893005,
|
416 |
+
"5": 0.9200000166893005,
|
417 |
+
"6": 0.8680000305175781
|
418 |
+
},
|
419 |
+
"llm_test_accuracy": {
|
420 |
+
"1": 0.9490000605583191,
|
421 |
+
"2": 0.9300000667572021,
|
422 |
+
"3": 0.9120000600814819,
|
423 |
+
"5": 0.9250000715255737,
|
424 |
+
"6": 0.862000048160553
|
425 |
+
},
|
426 |
+
"llm_top_1_test_accuracy": {
|
427 |
+
"1": 0.684,
|
428 |
+
"2": 0.592,
|
429 |
+
"3": 0.583,
|
430 |
+
"5": 0.551,
|
431 |
+
"6": 0.585
|
432 |
+
},
|
433 |
+
"llm_top_2_test_accuracy": {
|
434 |
+
"1": 0.739,
|
435 |
+
"2": 0.635,
|
436 |
+
"3": 0.609,
|
437 |
+
"5": 0.635,
|
438 |
+
"6": 0.62
|
439 |
+
},
|
440 |
+
"llm_top_5_test_accuracy": {
|
441 |
+
"1": 0.757,
|
442 |
+
"2": 0.636,
|
443 |
+
"3": 0.622,
|
444 |
+
"5": 0.657,
|
445 |
+
"6": 0.682
|
446 |
+
},
|
447 |
+
"sae_top_1_test_accuracy": {
|
448 |
+
"1": 0.861,
|
449 |
+
"2": 0.843,
|
450 |
+
"3": 0.542,
|
451 |
+
"5": 0.83,
|
452 |
+
"6": 0.6
|
453 |
+
},
|
454 |
+
"sae_top_2_test_accuracy": {
|
455 |
+
"1": 0.885,
|
456 |
+
"2": 0.848,
|
457 |
+
"3": 0.603,
|
458 |
+
"5": 0.884,
|
459 |
+
"6": 0.64
|
460 |
+
},
|
461 |
+
"sae_top_5_test_accuracy": {
|
462 |
+
"1": 0.904,
|
463 |
+
"2": 0.846,
|
464 |
+
"3": 0.632,
|
465 |
+
"5": 0.881,
|
466 |
+
"6": 0.736
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
|
470 |
+
"sae_test_accuracy": {
|
471 |
+
"1.0": 0.9700000286102295,
|
472 |
+
"5.0": 0.971000075340271
|
473 |
+
},
|
474 |
+
"llm_test_accuracy": {
|
475 |
+
"1.0": 0.9820000529289246,
|
476 |
+
"5.0": 0.9820000529289246
|
477 |
+
},
|
478 |
+
"llm_top_1_test_accuracy": {
|
479 |
+
"1.0": 0.673,
|
480 |
+
"5.0": 0.673
|
481 |
+
},
|
482 |
+
"llm_top_2_test_accuracy": {
|
483 |
+
"1.0": 0.724,
|
484 |
+
"5.0": 0.724
|
485 |
+
},
|
486 |
+
"llm_top_5_test_accuracy": {
|
487 |
+
"1.0": 0.766,
|
488 |
+
"5.0": 0.766
|
489 |
+
},
|
490 |
+
"sae_top_1_test_accuracy": {
|
491 |
+
"1.0": 0.832,
|
492 |
+
"5.0": 0.832
|
493 |
+
},
|
494 |
+
"sae_top_2_test_accuracy": {
|
495 |
+
"1.0": 0.83,
|
496 |
+
"5.0": 0.83
|
497 |
+
},
|
498 |
+
"sae_top_5_test_accuracy": {
|
499 |
+
"1.0": 0.919,
|
500 |
+
"5.0": 0.919
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"codeparrot/github-code_results": {
|
504 |
+
"sae_test_accuracy": {
|
505 |
+
"C": 0.9580000638961792,
|
506 |
+
"Python": 0.9860000610351562,
|
507 |
+
"HTML": 0.984000027179718,
|
508 |
+
"Java": 0.9650000333786011,
|
509 |
+
"PHP": 0.9610000252723694
|
510 |
+
},
|
511 |
+
"llm_test_accuracy": {
|
512 |
+
"C": 0.956000030040741,
|
513 |
+
"Python": 0.9830000400543213,
|
514 |
+
"HTML": 0.984000027179718,
|
515 |
+
"Java": 0.9700000286102295,
|
516 |
+
"PHP": 0.9610000252723694
|
517 |
+
},
|
518 |
+
"llm_top_1_test_accuracy": {
|
519 |
+
"C": 0.663,
|
520 |
+
"Python": 0.655,
|
521 |
+
"HTML": 0.72,
|
522 |
+
"Java": 0.612,
|
523 |
+
"PHP": 0.595
|
524 |
+
},
|
525 |
+
"llm_top_2_test_accuracy": {
|
526 |
+
"C": 0.68,
|
527 |
+
"Python": 0.683,
|
528 |
+
"HTML": 0.798,
|
529 |
+
"Java": 0.681,
|
530 |
+
"PHP": 0.637
|
531 |
+
},
|
532 |
+
"llm_top_5_test_accuracy": {
|
533 |
+
"C": 0.74,
|
534 |
+
"Python": 0.724,
|
535 |
+
"HTML": 0.902,
|
536 |
+
"Java": 0.739,
|
537 |
+
"PHP": 0.673
|
538 |
+
},
|
539 |
+
"sae_top_1_test_accuracy": {
|
540 |
+
"C": 0.624,
|
541 |
+
"Python": 0.653,
|
542 |
+
"HTML": 0.578,
|
543 |
+
"Java": 0.623,
|
544 |
+
"PHP": 0.592
|
545 |
+
},
|
546 |
+
"sae_top_2_test_accuracy": {
|
547 |
+
"C": 0.591,
|
548 |
+
"Python": 0.652,
|
549 |
+
"HTML": 0.81,
|
550 |
+
"Java": 0.663,
|
551 |
+
"PHP": 0.581
|
552 |
+
},
|
553 |
+
"sae_top_5_test_accuracy": {
|
554 |
+
"C": 0.628,
|
555 |
+
"Python": 0.66,
|
556 |
+
"HTML": 0.826,
|
557 |
+
"Java": 0.702,
|
558 |
+
"PHP": 0.908
|
559 |
+
}
|
560 |
+
},
|
561 |
+
"fancyzhx/ag_news_results": {
|
562 |
+
"sae_test_accuracy": {
|
563 |
+
"0": 0.9430000185966492,
|
564 |
+
"1": 0.9810000658035278,
|
565 |
+
"2": 0.9320000410079956,
|
566 |
+
"3": 0.9530000686645508
|
567 |
+
},
|
568 |
+
"llm_test_accuracy": {
|
569 |
+
"0": 0.9450000524520874,
|
570 |
+
"1": 0.9890000224113464,
|
571 |
+
"2": 0.9200000166893005,
|
572 |
+
"3": 0.9450000524520874
|
573 |
+
},
|
574 |
+
"llm_top_1_test_accuracy": {
|
575 |
+
"0": 0.559,
|
576 |
+
"1": 0.66,
|
577 |
+
"2": 0.668,
|
578 |
+
"3": 0.65
|
579 |
+
},
|
580 |
+
"llm_top_2_test_accuracy": {
|
581 |
+
"0": 0.807,
|
582 |
+
"1": 0.799,
|
583 |
+
"2": 0.699,
|
584 |
+
"3": 0.823
|
585 |
+
},
|
586 |
+
"llm_top_5_test_accuracy": {
|
587 |
+
"0": 0.815,
|
588 |
+
"1": 0.88,
|
589 |
+
"2": 0.756,
|
590 |
+
"3": 0.848
|
591 |
+
},
|
592 |
+
"sae_top_1_test_accuracy": {
|
593 |
+
"0": 0.585,
|
594 |
+
"1": 0.656,
|
595 |
+
"2": 0.543,
|
596 |
+
"3": 0.64
|
597 |
+
},
|
598 |
+
"sae_top_2_test_accuracy": {
|
599 |
+
"0": 0.805,
|
600 |
+
"1": 0.716,
|
601 |
+
"2": 0.618,
|
602 |
+
"3": 0.736
|
603 |
+
},
|
604 |
+
"sae_top_5_test_accuracy": {
|
605 |
+
"0": 0.834,
|
606 |
+
"1": 0.802,
|
607 |
+
"2": 0.823,
|
608 |
+
"3": 0.816
|
609 |
+
}
|
610 |
+
},
|
611 |
+
"Helsinki-NLP/europarl_results": {
|
612 |
+
"sae_test_accuracy": {
|
613 |
+
"en": 0.9980000257492065,
|
614 |
+
"fr": 0.9950000643730164,
|
615 |
+
"de": 0.9980000257492065,
|
616 |
+
"es": 0.9980000257492065,
|
617 |
+
"nl": 0.999000072479248
|
618 |
+
},
|
619 |
+
"llm_test_accuracy": {
|
620 |
+
"en": 0.999000072479248,
|
621 |
+
"fr": 1.0,
|
622 |
+
"de": 1.0,
|
623 |
+
"es": 1.0,
|
624 |
+
"nl": 0.999000072479248
|
625 |
+
},
|
626 |
+
"llm_top_1_test_accuracy": {
|
627 |
+
"en": 0.741,
|
628 |
+
"fr": 0.609,
|
629 |
+
"de": 0.754,
|
630 |
+
"es": 0.494,
|
631 |
+
"nl": 0.656
|
632 |
+
},
|
633 |
+
"llm_top_2_test_accuracy": {
|
634 |
+
"en": 0.832,
|
635 |
+
"fr": 0.597,
|
636 |
+
"de": 0.826,
|
637 |
+
"es": 0.964,
|
638 |
+
"nl": 0.741
|
639 |
+
},
|
640 |
+
"llm_top_5_test_accuracy": {
|
641 |
+
"en": 0.879,
|
642 |
+
"fr": 0.909,
|
643 |
+
"de": 0.874,
|
644 |
+
"es": 0.979,
|
645 |
+
"nl": 0.867
|
646 |
+
},
|
647 |
+
"sae_top_1_test_accuracy": {
|
648 |
+
"en": 0.747,
|
649 |
+
"fr": 0.598,
|
650 |
+
"de": 0.909,
|
651 |
+
"es": 0.91,
|
652 |
+
"nl": 0.654
|
653 |
+
},
|
654 |
+
"sae_top_2_test_accuracy": {
|
655 |
+
"en": 0.995,
|
656 |
+
"fr": 0.984,
|
657 |
+
"de": 0.988,
|
658 |
+
"es": 0.989,
|
659 |
+
"nl": 0.998
|
660 |
+
},
|
661 |
+
"sae_top_5_test_accuracy": {
|
662 |
+
"en": 0.999,
|
663 |
+
"fr": 0.996,
|
664 |
+
"de": 0.993,
|
665 |
+
"es": 0.993,
|
666 |
+
"nl": 1.0
|
667 |
+
}
|
668 |
+
}
|
669 |
+
}
|
670 |
+
}
|
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "sparse_probing",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"LabHC/bias_in_bios_class_set2",
|
8 |
+
"LabHC/bias_in_bios_class_set3",
|
9 |
+
"canrager/amazon_reviews_mcauley_1and5",
|
10 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment",
|
11 |
+
"codeparrot/github-code",
|
12 |
+
"fancyzhx/ag_news",
|
13 |
+
"Helsinki-NLP/europarl"
|
14 |
+
],
|
15 |
+
"probe_train_set_size": 4000,
|
16 |
+
"probe_test_set_size": 1000,
|
17 |
+
"context_length": 128,
|
18 |
+
"sae_batch_size": 125,
|
19 |
+
"llm_batch_size": 32,
|
20 |
+
"llm_dtype": "bfloat16",
|
21 |
+
"model_name": "gemma-2-2b",
|
22 |
+
"k_values": [
|
23 |
+
1,
|
24 |
+
2,
|
25 |
+
5
|
26 |
+
],
|
27 |
+
"lower_vram_usage": false
|
28 |
+
},
|
29 |
+
"eval_id": "fccca889-102f-47b0-9e93-59ed0335697a",
|
30 |
+
"datetime_epoch_millis": 1740203387785,
|
31 |
+
"eval_result_metrics": {
|
32 |
+
"llm": {
|
33 |
+
"llm_test_accuracy": 0.9587437950074672,
|
34 |
+
"llm_top_1_test_accuracy": 0.6508312500000001,
|
35 |
+
"llm_top_2_test_accuracy": 0.7267250000000001,
|
36 |
+
"llm_top_5_test_accuracy": 0.77896875,
|
37 |
+
"llm_top_10_test_accuracy": null,
|
38 |
+
"llm_top_20_test_accuracy": null,
|
39 |
+
"llm_top_50_test_accuracy": null,
|
40 |
+
"llm_top_100_test_accuracy": null
|
41 |
+
},
|
42 |
+
"sae": {
|
43 |
+
"sae_test_accuracy": 0.9568687990307807,
|
44 |
+
"sae_top_1_test_accuracy": 0.7255,
|
45 |
+
"sae_top_2_test_accuracy": 0.8132124999999998,
|
46 |
+
"sae_top_5_test_accuracy": 0.869575,
|
47 |
+
"sae_top_10_test_accuracy": null,
|
48 |
+
"sae_top_20_test_accuracy": null,
|
49 |
+
"sae_top_50_test_accuracy": null,
|
50 |
+
"sae_top_100_test_accuracy": null
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"eval_result_details": [
|
54 |
+
{
|
55 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
|
56 |
+
"llm_test_accuracy": 0.966800057888031,
|
57 |
+
"llm_top_1_test_accuracy": 0.6397999999999999,
|
58 |
+
"llm_top_2_test_accuracy": 0.6954,
|
59 |
+
"llm_top_5_test_accuracy": 0.7869999999999999,
|
60 |
+
"llm_top_10_test_accuracy": null,
|
61 |
+
"llm_top_20_test_accuracy": null,
|
62 |
+
"llm_top_50_test_accuracy": null,
|
63 |
+
"llm_top_100_test_accuracy": null,
|
64 |
+
"sae_test_accuracy": 0.9588000655174256,
|
65 |
+
"sae_top_1_test_accuracy": 0.6734,
|
66 |
+
"sae_top_2_test_accuracy": 0.8904,
|
67 |
+
"sae_top_5_test_accuracy": 0.9034000000000001,
|
68 |
+
"sae_top_10_test_accuracy": null,
|
69 |
+
"sae_top_20_test_accuracy": null,
|
70 |
+
"sae_top_50_test_accuracy": null,
|
71 |
+
"sae_top_100_test_accuracy": null
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
|
75 |
+
"llm_test_accuracy": 0.9544000387191772,
|
76 |
+
"llm_top_1_test_accuracy": 0.6744000000000001,
|
77 |
+
"llm_top_2_test_accuracy": 0.7334,
|
78 |
+
"llm_top_5_test_accuracy": 0.763,
|
79 |
+
"llm_top_10_test_accuracy": null,
|
80 |
+
"llm_top_20_test_accuracy": null,
|
81 |
+
"llm_top_50_test_accuracy": null,
|
82 |
+
"llm_top_100_test_accuracy": null,
|
83 |
+
"sae_test_accuracy": 0.9468000531196594,
|
84 |
+
"sae_top_1_test_accuracy": 0.6684,
|
85 |
+
"sae_top_2_test_accuracy": 0.7619999999999999,
|
86 |
+
"sae_top_5_test_accuracy": 0.8596,
|
87 |
+
"sae_top_10_test_accuracy": null,
|
88 |
+
"sae_top_20_test_accuracy": null,
|
89 |
+
"sae_top_50_test_accuracy": null,
|
90 |
+
"sae_top_100_test_accuracy": null
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
|
94 |
+
"llm_test_accuracy": 0.9310000538825989,
|
95 |
+
"llm_top_1_test_accuracy": 0.6864000000000001,
|
96 |
+
"llm_top_2_test_accuracy": 0.7436,
|
97 |
+
"llm_top_5_test_accuracy": 0.763,
|
98 |
+
"llm_top_10_test_accuracy": null,
|
99 |
+
"llm_top_20_test_accuracy": null,
|
100 |
+
"llm_top_50_test_accuracy": null,
|
101 |
+
"llm_top_100_test_accuracy": null,
|
102 |
+
"sae_test_accuracy": 0.9338000416755676,
|
103 |
+
"sae_top_1_test_accuracy": 0.7182000000000001,
|
104 |
+
"sae_top_2_test_accuracy": 0.805,
|
105 |
+
"sae_top_5_test_accuracy": 0.8632000000000002,
|
106 |
+
"sae_top_10_test_accuracy": null,
|
107 |
+
"sae_top_20_test_accuracy": null,
|
108 |
+
"sae_top_50_test_accuracy": null,
|
109 |
+
"sae_top_100_test_accuracy": null
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
|
113 |
+
"llm_test_accuracy": 0.915600061416626,
|
114 |
+
"llm_top_1_test_accuracy": 0.599,
|
115 |
+
"llm_top_2_test_accuracy": 0.6476000000000001,
|
116 |
+
"llm_top_5_test_accuracy": 0.6708000000000001,
|
117 |
+
"llm_top_10_test_accuracy": null,
|
118 |
+
"llm_top_20_test_accuracy": null,
|
119 |
+
"llm_top_50_test_accuracy": null,
|
120 |
+
"llm_top_100_test_accuracy": null,
|
121 |
+
"sae_test_accuracy": 0.9242000460624695,
|
122 |
+
"sae_top_1_test_accuracy": 0.7218000000000001,
|
123 |
+
"sae_top_2_test_accuracy": 0.7634000000000001,
|
124 |
+
"sae_top_5_test_accuracy": 0.8140000000000001,
|
125 |
+
"sae_top_10_test_accuracy": null,
|
126 |
+
"sae_top_20_test_accuracy": null,
|
127 |
+
"sae_top_50_test_accuracy": null,
|
128 |
+
"sae_top_100_test_accuracy": null
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
|
132 |
+
"llm_test_accuracy": 0.9820000529289246,
|
133 |
+
"llm_top_1_test_accuracy": 0.673,
|
134 |
+
"llm_top_2_test_accuracy": 0.724,
|
135 |
+
"llm_top_5_test_accuracy": 0.766,
|
136 |
+
"llm_top_10_test_accuracy": null,
|
137 |
+
"llm_top_20_test_accuracy": null,
|
138 |
+
"llm_top_50_test_accuracy": null,
|
139 |
+
"llm_top_100_test_accuracy": null,
|
140 |
+
"sae_test_accuracy": 0.9735000431537628,
|
141 |
+
"sae_top_1_test_accuracy": 0.92,
|
142 |
+
"sae_top_2_test_accuracy": 0.919,
|
143 |
+
"sae_top_5_test_accuracy": 0.944,
|
144 |
+
"sae_top_10_test_accuracy": null,
|
145 |
+
"sae_top_20_test_accuracy": null,
|
146 |
+
"sae_top_50_test_accuracy": null,
|
147 |
+
"sae_top_100_test_accuracy": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"dataset_name": "codeparrot/github-code_results",
|
151 |
+
"llm_test_accuracy": 0.9708000302314759,
|
152 |
+
"llm_top_1_test_accuracy": 0.649,
|
153 |
+
"llm_top_2_test_accuracy": 0.6958,
|
154 |
+
"llm_top_5_test_accuracy": 0.7556,
|
155 |
+
"llm_top_10_test_accuracy": null,
|
156 |
+
"llm_top_20_test_accuracy": null,
|
157 |
+
"llm_top_50_test_accuracy": null,
|
158 |
+
"llm_top_100_test_accuracy": null,
|
159 |
+
"sae_test_accuracy": 0.9690000414848328,
|
160 |
+
"sae_top_1_test_accuracy": 0.6166,
|
161 |
+
"sae_top_2_test_accuracy": 0.6898,
|
162 |
+
"sae_top_5_test_accuracy": 0.7966,
|
163 |
+
"sae_top_10_test_accuracy": null,
|
164 |
+
"sae_top_20_test_accuracy": null,
|
165 |
+
"sae_top_50_test_accuracy": null,
|
166 |
+
"sae_top_100_test_accuracy": null
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"dataset_name": "fancyzhx/ag_news_results",
|
170 |
+
"llm_test_accuracy": 0.9497500360012054,
|
171 |
+
"llm_top_1_test_accuracy": 0.63425,
|
172 |
+
"llm_top_2_test_accuracy": 0.782,
|
173 |
+
"llm_top_5_test_accuracy": 0.8247499999999999,
|
174 |
+
"llm_top_10_test_accuracy": null,
|
175 |
+
"llm_top_20_test_accuracy": null,
|
176 |
+
"llm_top_50_test_accuracy": null,
|
177 |
+
"llm_top_100_test_accuracy": null,
|
178 |
+
"sae_test_accuracy": 0.9502500593662262,
|
179 |
+
"sae_top_1_test_accuracy": 0.649,
|
180 |
+
"sae_top_2_test_accuracy": 0.7204999999999999,
|
181 |
+
"sae_top_5_test_accuracy": 0.795,
|
182 |
+
"sae_top_10_test_accuracy": null,
|
183 |
+
"sae_top_20_test_accuracy": null,
|
184 |
+
"sae_top_50_test_accuracy": null,
|
185 |
+
"sae_top_100_test_accuracy": null
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"dataset_name": "Helsinki-NLP/europarl_results",
|
189 |
+
"llm_test_accuracy": 0.9996000289916992,
|
190 |
+
"llm_top_1_test_accuracy": 0.6508,
|
191 |
+
"llm_top_2_test_accuracy": 0.792,
|
192 |
+
"llm_top_5_test_accuracy": 0.9016,
|
193 |
+
"llm_top_10_test_accuracy": null,
|
194 |
+
"llm_top_20_test_accuracy": null,
|
195 |
+
"llm_top_50_test_accuracy": null,
|
196 |
+
"llm_top_100_test_accuracy": null,
|
197 |
+
"sae_test_accuracy": 0.9986000418663025,
|
198 |
+
"sae_top_1_test_accuracy": 0.8366,
|
199 |
+
"sae_top_2_test_accuracy": 0.9555999999999999,
|
200 |
+
"sae_top_5_test_accuracy": 0.9808,
|
201 |
+
"sae_top_10_test_accuracy": null,
|
202 |
+
"sae_top_20_test_accuracy": null,
|
203 |
+
"sae_top_50_test_accuracy": null,
|
204 |
+
"sae_top_100_test_accuracy": null
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
|
208 |
+
"sae_lens_id": "custom_sae",
|
209 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000",
|
210 |
+
"sae_lens_version": "5.5.0",
|
211 |
+
"sae_cfg_dict": {
|
212 |
+
"model_name": "gemma-2-2b",
|
213 |
+
"d_in": 2304,
|
214 |
+
"d_sae": 65536,
|
215 |
+
"hook_layer": 12,
|
216 |
+
"hook_name": "blocks.12.hook_resid_post",
|
217 |
+
"context_size": null,
|
218 |
+
"hook_head_index": null,
|
219 |
+
"architecture": "topk",
|
220 |
+
"apply_b_dec_to_input": null,
|
221 |
+
"finetuning_scaling_factor": null,
|
222 |
+
"activation_fn_str": "",
|
223 |
+
"prepend_bos": true,
|
224 |
+
"normalize_activations": "none",
|
225 |
+
"dtype": "bfloat16",
|
226 |
+
"device": "",
|
227 |
+
"dataset_path": "",
|
228 |
+
"dataset_trust_remote_code": true,
|
229 |
+
"seqpos_slice": [
|
230 |
+
null
|
231 |
+
],
|
232 |
+
"training_tokens": -100000,
|
233 |
+
"sae_lens_training_version": null,
|
234 |
+
"neuronpedia_id": null
|
235 |
+
},
|
236 |
+
"eval_result_unstructured": {
|
237 |
+
"LabHC/bias_in_bios_class_set1_results": {
|
238 |
+
"sae_test_accuracy": {
|
239 |
+
"0": 0.9390000700950623,
|
240 |
+
"1": 0.9600000381469727,
|
241 |
+
"2": 0.9390000700950623,
|
242 |
+
"6": 0.9850000739097595,
|
243 |
+
"9": 0.971000075340271
|
244 |
+
},
|
245 |
+
"llm_test_accuracy": {
|
246 |
+
"0": 0.9510000348091125,
|
247 |
+
"1": 0.9670000672340393,
|
248 |
+
"2": 0.9530000686645508,
|
249 |
+
"6": 0.987000048160553,
|
250 |
+
"9": 0.9760000705718994
|
251 |
+
},
|
252 |
+
"llm_top_1_test_accuracy": {
|
253 |
+
"0": 0.577,
|
254 |
+
"1": 0.613,
|
255 |
+
"2": 0.662,
|
256 |
+
"6": 0.787,
|
257 |
+
"9": 0.56
|
258 |
+
},
|
259 |
+
"llm_top_2_test_accuracy": {
|
260 |
+
"0": 0.574,
|
261 |
+
"1": 0.66,
|
262 |
+
"2": 0.718,
|
263 |
+
"6": 0.811,
|
264 |
+
"9": 0.714
|
265 |
+
},
|
266 |
+
"llm_top_5_test_accuracy": {
|
267 |
+
"0": 0.713,
|
268 |
+
"1": 0.711,
|
269 |
+
"2": 0.755,
|
270 |
+
"6": 0.895,
|
271 |
+
"9": 0.861
|
272 |
+
},
|
273 |
+
"sae_top_1_test_accuracy": {
|
274 |
+
"0": 0.577,
|
275 |
+
"1": 0.62,
|
276 |
+
"2": 0.834,
|
277 |
+
"6": 0.81,
|
278 |
+
"9": 0.526
|
279 |
+
},
|
280 |
+
"sae_top_2_test_accuracy": {
|
281 |
+
"0": 0.876,
|
282 |
+
"1": 0.815,
|
283 |
+
"2": 0.84,
|
284 |
+
"6": 0.976,
|
285 |
+
"9": 0.945
|
286 |
+
},
|
287 |
+
"sae_top_5_test_accuracy": {
|
288 |
+
"0": 0.88,
|
289 |
+
"1": 0.864,
|
290 |
+
"2": 0.839,
|
291 |
+
"6": 0.987,
|
292 |
+
"9": 0.947
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"LabHC/bias_in_bios_class_set2_results": {
|
296 |
+
"sae_test_accuracy": {
|
297 |
+
"11": 0.9470000267028809,
|
298 |
+
"13": 0.9580000638961792,
|
299 |
+
"14": 0.9450000524520874,
|
300 |
+
"18": 0.9220000505447388,
|
301 |
+
"19": 0.9620000720024109
|
302 |
+
},
|
303 |
+
"llm_test_accuracy": {
|
304 |
+
"11": 0.9660000205039978,
|
305 |
+
"13": 0.9510000348091125,
|
306 |
+
"14": 0.9540000557899475,
|
307 |
+
"18": 0.940000057220459,
|
308 |
+
"19": 0.9610000252723694
|
309 |
+
},
|
310 |
+
"llm_top_1_test_accuracy": {
|
311 |
+
"11": 0.553,
|
312 |
+
"13": 0.673,
|
313 |
+
"14": 0.651,
|
314 |
+
"18": 0.706,
|
315 |
+
"19": 0.789
|
316 |
+
},
|
317 |
+
"llm_top_2_test_accuracy": {
|
318 |
+
"11": 0.77,
|
319 |
+
"13": 0.719,
|
320 |
+
"14": 0.672,
|
321 |
+
"18": 0.717,
|
322 |
+
"19": 0.789
|
323 |
+
},
|
324 |
+
"llm_top_5_test_accuracy": {
|
325 |
+
"11": 0.793,
|
326 |
+
"13": 0.739,
|
327 |
+
"14": 0.732,
|
328 |
+
"18": 0.723,
|
329 |
+
"19": 0.828
|
330 |
+
},
|
331 |
+
"sae_top_1_test_accuracy": {
|
332 |
+
"11": 0.552,
|
333 |
+
"13": 0.673,
|
334 |
+
"14": 0.627,
|
335 |
+
"18": 0.7,
|
336 |
+
"19": 0.79
|
337 |
+
},
|
338 |
+
"sae_top_2_test_accuracy": {
|
339 |
+
"11": 0.749,
|
340 |
+
"13": 0.7,
|
341 |
+
"14": 0.79,
|
342 |
+
"18": 0.743,
|
343 |
+
"19": 0.828
|
344 |
+
},
|
345 |
+
"sae_top_5_test_accuracy": {
|
346 |
+
"11": 0.906,
|
347 |
+
"13": 0.742,
|
348 |
+
"14": 0.908,
|
349 |
+
"18": 0.894,
|
350 |
+
"19": 0.848
|
351 |
+
}
|
352 |
+
},
|
353 |
+
"LabHC/bias_in_bios_class_set3_results": {
|
354 |
+
"sae_test_accuracy": {
|
355 |
+
"20": 0.9630000591278076,
|
356 |
+
"21": 0.9280000329017639,
|
357 |
+
"22": 0.9250000715255737,
|
358 |
+
"25": 0.9610000252723694,
|
359 |
+
"26": 0.8920000195503235
|
360 |
+
},
|
361 |
+
"llm_test_accuracy": {
|
362 |
+
"20": 0.9650000333786011,
|
363 |
+
"21": 0.9250000715255737,
|
364 |
+
"22": 0.9140000343322754,
|
365 |
+
"25": 0.9670000672340393,
|
366 |
+
"26": 0.8840000629425049
|
367 |
+
},
|
368 |
+
"llm_top_1_test_accuracy": {
|
369 |
+
"20": 0.709,
|
370 |
+
"21": 0.762,
|
371 |
+
"22": 0.653,
|
372 |
+
"25": 0.683,
|
373 |
+
"26": 0.625
|
374 |
+
},
|
375 |
+
"llm_top_2_test_accuracy": {
|
376 |
+
"20": 0.811,
|
377 |
+
"21": 0.769,
|
378 |
+
"22": 0.688,
|
379 |
+
"25": 0.766,
|
380 |
+
"26": 0.684
|
381 |
+
},
|
382 |
+
"llm_top_5_test_accuracy": {
|
383 |
+
"20": 0.815,
|
384 |
+
"21": 0.794,
|
385 |
+
"22": 0.706,
|
386 |
+
"25": 0.803,
|
387 |
+
"26": 0.697
|
388 |
+
},
|
389 |
+
"sae_top_1_test_accuracy": {
|
390 |
+
"20": 0.866,
|
391 |
+
"21": 0.499,
|
392 |
+
"22": 0.877,
|
393 |
+
"25": 0.705,
|
394 |
+
"26": 0.644
|
395 |
+
},
|
396 |
+
"sae_top_2_test_accuracy": {
|
397 |
+
"20": 0.882,
|
398 |
+
"21": 0.735,
|
399 |
+
"22": 0.885,
|
400 |
+
"25": 0.878,
|
401 |
+
"26": 0.645
|
402 |
+
},
|
403 |
+
"sae_top_5_test_accuracy": {
|
404 |
+
"20": 0.904,
|
405 |
+
"21": 0.853,
|
406 |
+
"22": 0.901,
|
407 |
+
"25": 0.894,
|
408 |
+
"26": 0.764
|
409 |
+
}
|
410 |
+
},
|
411 |
+
"canrager/amazon_reviews_mcauley_1and5_results": {
|
412 |
+
"sae_test_accuracy": {
|
413 |
+
"1": 0.9480000734329224,
|
414 |
+
"2": 0.9420000314712524,
|
415 |
+
"3": 0.9220000505447388,
|
416 |
+
"5": 0.9270000457763672,
|
417 |
+
"6": 0.8820000290870667
|
418 |
+
},
|
419 |
+
"llm_test_accuracy": {
|
420 |
+
"1": 0.9490000605583191,
|
421 |
+
"2": 0.9300000667572021,
|
422 |
+
"3": 0.9120000600814819,
|
423 |
+
"5": 0.9250000715255737,
|
424 |
+
"6": 0.862000048160553
|
425 |
+
},
|
426 |
+
"llm_top_1_test_accuracy": {
|
427 |
+
"1": 0.684,
|
428 |
+
"2": 0.592,
|
429 |
+
"3": 0.583,
|
430 |
+
"5": 0.551,
|
431 |
+
"6": 0.585
|
432 |
+
},
|
433 |
+
"llm_top_2_test_accuracy": {
|
434 |
+
"1": 0.739,
|
435 |
+
"2": 0.635,
|
436 |
+
"3": 0.609,
|
437 |
+
"5": 0.635,
|
438 |
+
"6": 0.62
|
439 |
+
},
|
440 |
+
"llm_top_5_test_accuracy": {
|
441 |
+
"1": 0.757,
|
442 |
+
"2": 0.636,
|
443 |
+
"3": 0.622,
|
444 |
+
"5": 0.657,
|
445 |
+
"6": 0.682
|
446 |
+
},
|
447 |
+
"sae_top_1_test_accuracy": {
|
448 |
+
"1": 0.867,
|
449 |
+
"2": 0.765,
|
450 |
+
"3": 0.547,
|
451 |
+
"5": 0.814,
|
452 |
+
"6": 0.616
|
453 |
+
},
|
454 |
+
"sae_top_2_test_accuracy": {
|
455 |
+
"1": 0.9,
|
456 |
+
"2": 0.785,
|
457 |
+
"3": 0.608,
|
458 |
+
"5": 0.812,
|
459 |
+
"6": 0.712
|
460 |
+
},
|
461 |
+
"sae_top_5_test_accuracy": {
|
462 |
+
"1": 0.914,
|
463 |
+
"2": 0.869,
|
464 |
+
"3": 0.68,
|
465 |
+
"5": 0.874,
|
466 |
+
"6": 0.733
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
|
470 |
+
"sae_test_accuracy": {
|
471 |
+
"1.0": 0.9730000495910645,
|
472 |
+
"5.0": 0.9740000367164612
|
473 |
+
},
|
474 |
+
"llm_test_accuracy": {
|
475 |
+
"1.0": 0.9820000529289246,
|
476 |
+
"5.0": 0.9820000529289246
|
477 |
+
},
|
478 |
+
"llm_top_1_test_accuracy": {
|
479 |
+
"1.0": 0.673,
|
480 |
+
"5.0": 0.673
|
481 |
+
},
|
482 |
+
"llm_top_2_test_accuracy": {
|
483 |
+
"1.0": 0.724,
|
484 |
+
"5.0": 0.724
|
485 |
+
},
|
486 |
+
"llm_top_5_test_accuracy": {
|
487 |
+
"1.0": 0.766,
|
488 |
+
"5.0": 0.766
|
489 |
+
},
|
490 |
+
"sae_top_1_test_accuracy": {
|
491 |
+
"1.0": 0.92,
|
492 |
+
"5.0": 0.92
|
493 |
+
},
|
494 |
+
"sae_top_2_test_accuracy": {
|
495 |
+
"1.0": 0.919,
|
496 |
+
"5.0": 0.919
|
497 |
+
},
|
498 |
+
"sae_top_5_test_accuracy": {
|
499 |
+
"1.0": 0.944,
|
500 |
+
"5.0": 0.944
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"codeparrot/github-code_results": {
|
504 |
+
"sae_test_accuracy": {
|
505 |
+
"C": 0.9520000219345093,
|
506 |
+
"Python": 0.9890000224113464,
|
507 |
+
"HTML": 0.9830000400543213,
|
508 |
+
"Java": 0.9670000672340393,
|
509 |
+
"PHP": 0.9540000557899475
|
510 |
+
},
|
511 |
+
"llm_test_accuracy": {
|
512 |
+
"C": 0.956000030040741,
|
513 |
+
"Python": 0.9830000400543213,
|
514 |
+
"HTML": 0.984000027179718,
|
515 |
+
"Java": 0.9700000286102295,
|
516 |
+
"PHP": 0.9610000252723694
|
517 |
+
},
|
518 |
+
"llm_top_1_test_accuracy": {
|
519 |
+
"C": 0.663,
|
520 |
+
"Python": 0.655,
|
521 |
+
"HTML": 0.72,
|
522 |
+
"Java": 0.612,
|
523 |
+
"PHP": 0.595
|
524 |
+
},
|
525 |
+
"llm_top_2_test_accuracy": {
|
526 |
+
"C": 0.68,
|
527 |
+
"Python": 0.683,
|
528 |
+
"HTML": 0.798,
|
529 |
+
"Java": 0.681,
|
530 |
+
"PHP": 0.637
|
531 |
+
},
|
532 |
+
"llm_top_5_test_accuracy": {
|
533 |
+
"C": 0.74,
|
534 |
+
"Python": 0.724,
|
535 |
+
"HTML": 0.902,
|
536 |
+
"Java": 0.739,
|
537 |
+
"PHP": 0.673
|
538 |
+
},
|
539 |
+
"sae_top_1_test_accuracy": {
|
540 |
+
"C": 0.635,
|
541 |
+
"Python": 0.65,
|
542 |
+
"HTML": 0.583,
|
543 |
+
"Java": 0.627,
|
544 |
+
"PHP": 0.588
|
545 |
+
},
|
546 |
+
"sae_top_2_test_accuracy": {
|
547 |
+
"C": 0.583,
|
548 |
+
"Python": 0.639,
|
549 |
+
"HTML": 0.701,
|
550 |
+
"Java": 0.622,
|
551 |
+
"PHP": 0.904
|
552 |
+
},
|
553 |
+
"sae_top_5_test_accuracy": {
|
554 |
+
"C": 0.657,
|
555 |
+
"Python": 0.921,
|
556 |
+
"HTML": 0.815,
|
557 |
+
"Java": 0.681,
|
558 |
+
"PHP": 0.909
|
559 |
+
}
|
560 |
+
},
|
561 |
+
"fancyzhx/ag_news_results": {
|
562 |
+
"sae_test_accuracy": {
|
563 |
+
"0": 0.9390000700950623,
|
564 |
+
"1": 0.9830000400543213,
|
565 |
+
"2": 0.9300000667572021,
|
566 |
+
"3": 0.9490000605583191
|
567 |
+
},
|
568 |
+
"llm_test_accuracy": {
|
569 |
+
"0": 0.9450000524520874,
|
570 |
+
"1": 0.9890000224113464,
|
571 |
+
"2": 0.9200000166893005,
|
572 |
+
"3": 0.9450000524520874
|
573 |
+
},
|
574 |
+
"llm_top_1_test_accuracy": {
|
575 |
+
"0": 0.559,
|
576 |
+
"1": 0.66,
|
577 |
+
"2": 0.668,
|
578 |
+
"3": 0.65
|
579 |
+
},
|
580 |
+
"llm_top_2_test_accuracy": {
|
581 |
+
"0": 0.807,
|
582 |
+
"1": 0.799,
|
583 |
+
"2": 0.699,
|
584 |
+
"3": 0.823
|
585 |
+
},
|
586 |
+
"llm_top_5_test_accuracy": {
|
587 |
+
"0": 0.815,
|
588 |
+
"1": 0.88,
|
589 |
+
"2": 0.756,
|
590 |
+
"3": 0.848
|
591 |
+
},
|
592 |
+
"sae_top_1_test_accuracy": {
|
593 |
+
"0": 0.754,
|
594 |
+
"1": 0.668,
|
595 |
+
"2": 0.532,
|
596 |
+
"3": 0.642
|
597 |
+
},
|
598 |
+
"sae_top_2_test_accuracy": {
|
599 |
+
"0": 0.754,
|
600 |
+
"1": 0.709,
|
601 |
+
"2": 0.715,
|
602 |
+
"3": 0.704
|
603 |
+
},
|
604 |
+
"sae_top_5_test_accuracy": {
|
605 |
+
"0": 0.797,
|
606 |
+
"1": 0.806,
|
607 |
+
"2": 0.819,
|
608 |
+
"3": 0.758
|
609 |
+
}
|
610 |
+
},
|
611 |
+
"Helsinki-NLP/europarl_results": {
|
612 |
+
"sae_test_accuracy": {
|
613 |
+
"en": 0.9970000386238098,
|
614 |
+
"fr": 0.999000072479248,
|
615 |
+
"de": 0.9980000257492065,
|
616 |
+
"es": 1.0,
|
617 |
+
"nl": 0.999000072479248
|
618 |
+
},
|
619 |
+
"llm_test_accuracy": {
|
620 |
+
"en": 0.999000072479248,
|
621 |
+
"fr": 1.0,
|
622 |
+
"de": 1.0,
|
623 |
+
"es": 1.0,
|
624 |
+
"nl": 0.999000072479248
|
625 |
+
},
|
626 |
+
"llm_top_1_test_accuracy": {
|
627 |
+
"en": 0.741,
|
628 |
+
"fr": 0.609,
|
629 |
+
"de": 0.754,
|
630 |
+
"es": 0.494,
|
631 |
+
"nl": 0.656
|
632 |
+
},
|
633 |
+
"llm_top_2_test_accuracy": {
|
634 |
+
"en": 0.832,
|
635 |
+
"fr": 0.597,
|
636 |
+
"de": 0.826,
|
637 |
+
"es": 0.964,
|
638 |
+
"nl": 0.741
|
639 |
+
},
|
640 |
+
"llm_top_5_test_accuracy": {
|
641 |
+
"en": 0.879,
|
642 |
+
"fr": 0.909,
|
643 |
+
"de": 0.874,
|
644 |
+
"es": 0.979,
|
645 |
+
"nl": 0.867
|
646 |
+
},
|
647 |
+
"sae_top_1_test_accuracy": {
|
648 |
+
"en": 0.751,
|
649 |
+
"fr": 0.988,
|
650 |
+
"de": 0.923,
|
651 |
+
"es": 0.877,
|
652 |
+
"nl": 0.644
|
653 |
+
},
|
654 |
+
"sae_top_2_test_accuracy": {
|
655 |
+
"en": 0.999,
|
656 |
+
"fr": 0.992,
|
657 |
+
"de": 0.913,
|
658 |
+
"es": 0.875,
|
659 |
+
"nl": 0.999
|
660 |
+
},
|
661 |
+
"sae_top_5_test_accuracy": {
|
662 |
+
"en": 0.999,
|
663 |
+
"fr": 0.995,
|
664 |
+
"de": 0.912,
|
665 |
+
"es": 0.998,
|
666 |
+
"nl": 1.0
|
667 |
+
}
|
668 |
+
}
|
669 |
+
}
|
670 |
+
}
|
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "tpp",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": false,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "b9a9adb1-7a85-4d3d-bb84-63c1a6ab3b28",
|
73 |
+
"datetime_epoch_millis": 1740202474022,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"tpp_metrics": {
|
76 |
+
"tpp_threshold_2_total_metric": 0.004200004041194916,
|
77 |
+
"tpp_threshold_2_intended_diff_only": 0.007200002670288086,
|
78 |
+
"tpp_threshold_2_unintended_diff_only": 0.00299999862909317,
|
79 |
+
"tpp_threshold_5_total_metric": 0.006300006806850434,
|
80 |
+
"tpp_threshold_5_intended_diff_only": 0.009600007534027101,
|
81 |
+
"tpp_threshold_5_unintended_diff_only": 0.003300000727176666,
|
82 |
+
"tpp_threshold_10_total_metric": 0.015000005066394807,
|
83 |
+
"tpp_threshold_10_intended_diff_only": 0.020000004768371583,
|
84 |
+
"tpp_threshold_10_unintended_diff_only": 0.004999999701976777,
|
85 |
+
"tpp_threshold_20_total_metric": 0.03174999803304672,
|
86 |
+
"tpp_threshold_20_intended_diff_only": 0.03789999485015869,
|
87 |
+
"tpp_threshold_20_unintended_diff_only": 0.006149996817111969,
|
88 |
+
"tpp_threshold_50_total_metric": 0.08190000057220459,
|
89 |
+
"tpp_threshold_50_intended_diff_only": 0.08960000276565552,
|
90 |
+
"tpp_threshold_50_unintended_diff_only": 0.007700002193450928,
|
91 |
+
"tpp_threshold_100_total_metric": 0.1676749989390373,
|
92 |
+
"tpp_threshold_100_intended_diff_only": 0.18299999833106995,
|
93 |
+
"tpp_threshold_100_unintended_diff_only": 0.015324999392032624,
|
94 |
+
"tpp_threshold_500_total_metric": 0.3794000178575516,
|
95 |
+
"tpp_threshold_500_intended_diff_only": 0.4046000182628632,
|
96 |
+
"tpp_threshold_500_unintended_diff_only": 0.025200000405311587
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
|
102 |
+
"tpp_threshold_2_total_metric": 0.003600001335144043,
|
103 |
+
"tpp_threshold_2_intended_diff_only": 0.006000006198883056,
|
104 |
+
"tpp_threshold_2_unintended_diff_only": 0.0024000048637390138,
|
105 |
+
"tpp_threshold_5_total_metric": 0.006100001931190491,
|
106 |
+
"tpp_threshold_5_intended_diff_only": 0.009000015258789063,
|
107 |
+
"tpp_threshold_5_unintended_diff_only": 0.002900013327598572,
|
108 |
+
"tpp_threshold_10_total_metric": 0.014000010490417481,
|
109 |
+
"tpp_threshold_10_intended_diff_only": 0.017000019550323486,
|
110 |
+
"tpp_threshold_10_unintended_diff_only": 0.003000009059906006,
|
111 |
+
"tpp_threshold_20_total_metric": 0.03419999182224274,
|
112 |
+
"tpp_threshold_20_intended_diff_only": 0.037199997901916505,
|
113 |
+
"tpp_threshold_20_unintended_diff_only": 0.003000006079673767,
|
114 |
+
"tpp_threshold_50_total_metric": 0.08610000610351562,
|
115 |
+
"tpp_threshold_50_intended_diff_only": 0.09040001630783082,
|
116 |
+
"tpp_threshold_50_unintended_diff_only": 0.004300010204315185,
|
117 |
+
"tpp_threshold_100_total_metric": 0.17919999957084656,
|
118 |
+
"tpp_threshold_100_intended_diff_only": 0.18480000495910645,
|
119 |
+
"tpp_threshold_100_unintended_diff_only": 0.005600005388259888,
|
120 |
+
"tpp_threshold_500_total_metric": 0.43065003752708436,
|
121 |
+
"tpp_threshold_500_intended_diff_only": 0.4422000408172607,
|
122 |
+
"tpp_threshold_500_unintended_diff_only": 0.011550003290176391
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
|
126 |
+
"tpp_threshold_2_total_metric": 0.004800006747245789,
|
127 |
+
"tpp_threshold_2_intended_diff_only": 0.008399999141693116,
|
128 |
+
"tpp_threshold_2_unintended_diff_only": 0.0035999923944473266,
|
129 |
+
"tpp_threshold_5_total_metric": 0.006500011682510376,
|
130 |
+
"tpp_threshold_5_intended_diff_only": 0.010199999809265137,
|
131 |
+
"tpp_threshold_5_unintended_diff_only": 0.0036999881267547607,
|
132 |
+
"tpp_threshold_10_total_metric": 0.015999999642372132,
|
133 |
+
"tpp_threshold_10_intended_diff_only": 0.022999989986419677,
|
134 |
+
"tpp_threshold_10_unintended_diff_only": 0.006999990344047547,
|
135 |
+
"tpp_threshold_20_total_metric": 0.029300004243850708,
|
136 |
+
"tpp_threshold_20_intended_diff_only": 0.03859999179840088,
|
137 |
+
"tpp_threshold_20_unintended_diff_only": 0.00929998755455017,
|
138 |
+
"tpp_threshold_50_total_metric": 0.07769999504089356,
|
139 |
+
"tpp_threshold_50_intended_diff_only": 0.08879998922348023,
|
140 |
+
"tpp_threshold_50_unintended_diff_only": 0.01109999418258667,
|
141 |
+
"tpp_threshold_100_total_metric": 0.1561499983072281,
|
142 |
+
"tpp_threshold_100_intended_diff_only": 0.18119999170303344,
|
143 |
+
"tpp_threshold_100_unintended_diff_only": 0.02504999339580536,
|
144 |
+
"tpp_threshold_500_total_metric": 0.3281499981880188,
|
145 |
+
"tpp_threshold_500_intended_diff_only": 0.3669999957084656,
|
146 |
+
"tpp_threshold_500_unintended_diff_only": 0.03884999752044678
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
|
150 |
+
"sae_lens_id": "custom_sae",
|
151 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000",
|
152 |
+
"sae_lens_version": "5.5.0",
|
153 |
+
"sae_cfg_dict": {
|
154 |
+
"model_name": "gemma-2-2b",
|
155 |
+
"d_in": 2304,
|
156 |
+
"d_sae": 65536,
|
157 |
+
"hook_layer": 12,
|
158 |
+
"hook_name": "blocks.12.hook_resid_post",
|
159 |
+
"context_size": null,
|
160 |
+
"hook_head_index": null,
|
161 |
+
"architecture": "topk",
|
162 |
+
"apply_b_dec_to_input": null,
|
163 |
+
"finetuning_scaling_factor": null,
|
164 |
+
"activation_fn_str": "",
|
165 |
+
"prepend_bos": true,
|
166 |
+
"normalize_activations": "none",
|
167 |
+
"dtype": "bfloat16",
|
168 |
+
"device": "",
|
169 |
+
"dataset_path": "",
|
170 |
+
"dataset_trust_remote_code": true,
|
171 |
+
"seqpos_slice": [
|
172 |
+
null
|
173 |
+
],
|
174 |
+
"training_tokens": -100000,
|
175 |
+
"sae_lens_training_version": null,
|
176 |
+
"neuronpedia_id": null
|
177 |
+
},
|
178 |
+
"eval_result_unstructured": {
|
179 |
+
"LabHC/bias_in_bios_class_set1": {
|
180 |
+
"0": {
|
181 |
+
"tpp_threshold_2_total_metric": 0.0059999823570251465,
|
182 |
+
"tpp_threshold_2_intended_diff_only": 0.009999990463256836,
|
183 |
+
"tpp_threshold_2_unintended_diff_only": 0.0040000081062316895,
|
184 |
+
"tpp_threshold_5_total_metric": 0.012749999761581421,
|
185 |
+
"tpp_threshold_5_intended_diff_only": 0.017000019550323486,
|
186 |
+
"tpp_threshold_5_unintended_diff_only": 0.004250019788742065,
|
187 |
+
"tpp_threshold_10_total_metric": 0.011500000953674316,
|
188 |
+
"tpp_threshold_10_intended_diff_only": 0.017000019550323486,
|
189 |
+
"tpp_threshold_10_unintended_diff_only": 0.00550001859664917,
|
190 |
+
"tpp_threshold_20_total_metric": 0.029499992728233337,
|
191 |
+
"tpp_threshold_20_intended_diff_only": 0.03299999237060547,
|
192 |
+
"tpp_threshold_20_unintended_diff_only": 0.0034999996423721313,
|
193 |
+
"tpp_threshold_50_total_metric": 0.0765000432729721,
|
194 |
+
"tpp_threshold_50_intended_diff_only": 0.08000004291534424,
|
195 |
+
"tpp_threshold_50_unintended_diff_only": 0.0034999996423721313,
|
196 |
+
"tpp_threshold_100_total_metric": 0.1547500193119049,
|
197 |
+
"tpp_threshold_100_intended_diff_only": 0.1600000262260437,
|
198 |
+
"tpp_threshold_100_unintended_diff_only": 0.005250006914138794,
|
199 |
+
"tpp_threshold_500_total_metric": 0.4282500445842743,
|
200 |
+
"tpp_threshold_500_intended_diff_only": 0.4410000443458557,
|
201 |
+
"tpp_threshold_500_unintended_diff_only": 0.012749999761581421
|
202 |
+
},
|
203 |
+
"1": {
|
204 |
+
"tpp_threshold_2_total_metric": 0.006250053644180298,
|
205 |
+
"tpp_threshold_2_intended_diff_only": 0.006000041961669922,
|
206 |
+
"tpp_threshold_2_unintended_diff_only": -0.000250011682510376,
|
207 |
+
"tpp_threshold_5_total_metric": 0.0027500391006469727,
|
208 |
+
"tpp_threshold_5_intended_diff_only": 0.005000054836273193,
|
209 |
+
"tpp_threshold_5_unintended_diff_only": 0.0022500157356262207,
|
210 |
+
"tpp_threshold_10_total_metric": 0.006000041961669922,
|
211 |
+
"tpp_threshold_10_intended_diff_only": 0.006000041961669922,
|
212 |
+
"tpp_threshold_10_unintended_diff_only": 0.0,
|
213 |
+
"tpp_threshold_20_total_metric": 0.014750048518180847,
|
214 |
+
"tpp_threshold_20_intended_diff_only": 0.01900005340576172,
|
215 |
+
"tpp_threshold_20_unintended_diff_only": 0.004250004887580872,
|
216 |
+
"tpp_threshold_50_total_metric": 0.07450000941753387,
|
217 |
+
"tpp_threshold_50_intended_diff_only": 0.0820000171661377,
|
218 |
+
"tpp_threshold_50_unintended_diff_only": 0.007500007748603821,
|
219 |
+
"tpp_threshold_100_total_metric": 0.13900001347064972,
|
220 |
+
"tpp_threshold_100_intended_diff_only": 0.1470000147819519,
|
221 |
+
"tpp_threshold_100_unintended_diff_only": 0.008000001311302185,
|
222 |
+
"tpp_threshold_500_total_metric": 0.43675006926059723,
|
223 |
+
"tpp_threshold_500_intended_diff_only": 0.4490000605583191,
|
224 |
+
"tpp_threshold_500_unintended_diff_only": 0.012249991297721863
|
225 |
+
},
|
226 |
+
"2": {
|
227 |
+
"tpp_threshold_2_total_metric": -0.0017500221729278564,
|
228 |
+
"tpp_threshold_2_intended_diff_only": 0.0009999871253967285,
|
229 |
+
"tpp_threshold_2_unintended_diff_only": 0.002750009298324585,
|
230 |
+
"tpp_threshold_5_total_metric": 0.004749983549118042,
|
231 |
+
"tpp_threshold_5_intended_diff_only": 0.009000003337860107,
|
232 |
+
"tpp_threshold_5_unintended_diff_only": 0.004250019788742065,
|
233 |
+
"tpp_threshold_10_total_metric": 0.032250016927719116,
|
234 |
+
"tpp_threshold_10_intended_diff_only": 0.0350000262260437,
|
235 |
+
"tpp_threshold_10_unintended_diff_only": 0.002750009298324585,
|
236 |
+
"tpp_threshold_20_total_metric": 0.05924998223781586,
|
237 |
+
"tpp_threshold_20_intended_diff_only": 0.06099998950958252,
|
238 |
+
"tpp_threshold_20_unintended_diff_only": 0.0017500072717666626,
|
239 |
+
"tpp_threshold_50_total_metric": 0.10799999535083771,
|
240 |
+
"tpp_threshold_50_intended_diff_only": 0.11000001430511475,
|
241 |
+
"tpp_threshold_50_unintended_diff_only": 0.0020000189542770386,
|
242 |
+
"tpp_threshold_100_total_metric": 0.2057500034570694,
|
243 |
+
"tpp_threshold_100_intended_diff_only": 0.2070000171661377,
|
244 |
+
"tpp_threshold_100_unintended_diff_only": 0.0012500137090682983,
|
245 |
+
"tpp_threshold_500_total_metric": 0.43125003576278687,
|
246 |
+
"tpp_threshold_500_intended_diff_only": 0.437000036239624,
|
247 |
+
"tpp_threshold_500_unintended_diff_only": 0.005750000476837158
|
248 |
+
},
|
249 |
+
"6": {
|
250 |
+
"tpp_threshold_2_total_metric": 0.0002500265836715698,
|
251 |
+
"tpp_threshold_2_intended_diff_only": 0.0020000338554382324,
|
252 |
+
"tpp_threshold_2_unintended_diff_only": 0.0017500072717666626,
|
253 |
+
"tpp_threshold_5_total_metric": 0.002249985933303833,
|
254 |
+
"tpp_threshold_5_intended_diff_only": 0.0009999871253967285,
|
255 |
+
"tpp_threshold_5_unintended_diff_only": -0.0012499988079071045,
|
256 |
+
"tpp_threshold_10_total_metric": 0.0037499815225601196,
|
257 |
+
"tpp_threshold_10_intended_diff_only": 0.004999995231628418,
|
258 |
+
"tpp_threshold_10_unintended_diff_only": 0.0012500137090682983,
|
259 |
+
"tpp_threshold_20_total_metric": 0.005749985575675964,
|
260 |
+
"tpp_threshold_20_intended_diff_only": 0.004999995231628418,
|
261 |
+
"tpp_threshold_20_unintended_diff_only": -0.0007499903440475464,
|
262 |
+
"tpp_threshold_50_total_metric": 0.01199999451637268,
|
263 |
+
"tpp_threshold_50_intended_diff_only": 0.013000011444091797,
|
264 |
+
"tpp_threshold_50_unintended_diff_only": 0.0010000169277191162,
|
265 |
+
"tpp_threshold_100_total_metric": 0.109499990940094,
|
266 |
+
"tpp_threshold_100_intended_diff_only": 0.1119999885559082,
|
267 |
+
"tpp_threshold_100_unintended_diff_only": 0.002499997615814209,
|
268 |
+
"tpp_threshold_500_total_metric": 0.3985000401735306,
|
269 |
+
"tpp_threshold_500_intended_diff_only": 0.40800005197525024,
|
270 |
+
"tpp_threshold_500_unintended_diff_only": 0.009500011801719666
|
271 |
+
},
|
272 |
+
"9": {
|
273 |
+
"tpp_threshold_2_total_metric": 0.007249966263771057,
|
274 |
+
"tpp_threshold_2_intended_diff_only": 0.010999977588653564,
|
275 |
+
"tpp_threshold_2_unintended_diff_only": 0.0037500113248825073,
|
276 |
+
"tpp_threshold_5_total_metric": 0.008000001311302185,
|
277 |
+
"tpp_threshold_5_intended_diff_only": 0.013000011444091797,
|
278 |
+
"tpp_threshold_5_unintended_diff_only": 0.005000010132789612,
|
279 |
+
"tpp_threshold_10_total_metric": 0.016500011086463928,
|
280 |
+
"tpp_threshold_10_intended_diff_only": 0.022000014781951904,
|
281 |
+
"tpp_threshold_10_unintended_diff_only": 0.005500003695487976,
|
282 |
+
"tpp_threshold_20_total_metric": 0.06174995005130768,
|
283 |
+
"tpp_threshold_20_intended_diff_only": 0.0679999589920044,
|
284 |
+
"tpp_threshold_20_unintended_diff_only": 0.006250008940696716,
|
285 |
+
"tpp_threshold_50_total_metric": 0.15949998795986176,
|
286 |
+
"tpp_threshold_50_intended_diff_only": 0.16699999570846558,
|
287 |
+
"tpp_threshold_50_unintended_diff_only": 0.007500007748603821,
|
288 |
+
"tpp_threshold_100_total_metric": 0.28699997067451477,
|
289 |
+
"tpp_threshold_100_intended_diff_only": 0.2979999780654907,
|
290 |
+
"tpp_threshold_100_unintended_diff_only": 0.011000007390975952,
|
291 |
+
"tpp_threshold_500_total_metric": 0.4584999978542328,
|
292 |
+
"tpp_threshold_500_intended_diff_only": 0.47600001096725464,
|
293 |
+
"tpp_threshold_500_unintended_diff_only": 0.01750001311302185
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"canrager/amazon_reviews_mcauley_1and5": {
|
297 |
+
"1": {
|
298 |
+
"tpp_threshold_2_total_metric": 0.00475001335144043,
|
299 |
+
"tpp_threshold_2_intended_diff_only": 0.009000003337860107,
|
300 |
+
"tpp_threshold_2_unintended_diff_only": 0.004249989986419678,
|
301 |
+
"tpp_threshold_5_total_metric": 0.001249954104423523,
|
302 |
+
"tpp_threshold_5_intended_diff_only": 0.003999948501586914,
|
303 |
+
"tpp_threshold_5_unintended_diff_only": 0.002749994397163391,
|
304 |
+
"tpp_threshold_10_total_metric": 0.004249975085258484,
|
305 |
+
"tpp_threshold_10_intended_diff_only": 0.011999964714050293,
|
306 |
+
"tpp_threshold_10_unintended_diff_only": 0.007749989628791809,
|
307 |
+
"tpp_threshold_20_total_metric": 0.0037500113248825073,
|
308 |
+
"tpp_threshold_20_intended_diff_only": 0.009999990463256836,
|
309 |
+
"tpp_threshold_20_unintended_diff_only": 0.006249979138374329,
|
310 |
+
"tpp_threshold_50_total_metric": 0.016249999403953552,
|
311 |
+
"tpp_threshold_50_intended_diff_only": 0.018999993801116943,
|
312 |
+
"tpp_threshold_50_unintended_diff_only": 0.002749994397163391,
|
313 |
+
"tpp_threshold_100_total_metric": 0.0034999698400497437,
|
314 |
+
"tpp_threshold_100_intended_diff_only": 0.05399996042251587,
|
315 |
+
"tpp_threshold_100_unintended_diff_only": 0.050499990582466125,
|
316 |
+
"tpp_threshold_500_total_metric": 0.22074998915195465,
|
317 |
+
"tpp_threshold_500_intended_diff_only": 0.2849999666213989,
|
318 |
+
"tpp_threshold_500_unintended_diff_only": 0.06424997746944427
|
319 |
+
},
|
320 |
+
"2": {
|
321 |
+
"tpp_threshold_2_total_metric": 0.007499992847442627,
|
322 |
+
"tpp_threshold_2_intended_diff_only": 0.009999990463256836,
|
323 |
+
"tpp_threshold_2_unintended_diff_only": 0.002499997615814209,
|
324 |
+
"tpp_threshold_5_total_metric": 0.0010000020265579224,
|
325 |
+
"tpp_threshold_5_intended_diff_only": 0.009999990463256836,
|
326 |
+
"tpp_threshold_5_unintended_diff_only": 0.008999988436698914,
|
327 |
+
"tpp_threshold_10_total_metric": 0.0222499817609787,
|
328 |
+
"tpp_threshold_10_intended_diff_only": 0.029999971389770508,
|
329 |
+
"tpp_threshold_10_unintended_diff_only": 0.007749989628791809,
|
330 |
+
"tpp_threshold_20_total_metric": 0.052000001072883606,
|
331 |
+
"tpp_threshold_20_intended_diff_only": 0.05699998140335083,
|
332 |
+
"tpp_threshold_20_unintended_diff_only": 0.004999980330467224,
|
333 |
+
"tpp_threshold_50_total_metric": 0.11399997770786285,
|
334 |
+
"tpp_threshold_50_intended_diff_only": 0.12699997425079346,
|
335 |
+
"tpp_threshold_50_unintended_diff_only": 0.012999996542930603,
|
336 |
+
"tpp_threshold_100_total_metric": 0.17999999225139618,
|
337 |
+
"tpp_threshold_100_intended_diff_only": 0.19999998807907104,
|
338 |
+
"tpp_threshold_100_unintended_diff_only": 0.019999995827674866,
|
339 |
+
"tpp_threshold_500_total_metric": 0.39000001549720764,
|
340 |
+
"tpp_threshold_500_intended_diff_only": 0.42000001668930054,
|
341 |
+
"tpp_threshold_500_unintended_diff_only": 0.030000001192092896
|
342 |
+
},
|
343 |
+
"3": {
|
344 |
+
"tpp_threshold_2_total_metric": -0.009999975562095642,
|
345 |
+
"tpp_threshold_2_intended_diff_only": -0.0059999823570251465,
|
346 |
+
"tpp_threshold_2_unintended_diff_only": 0.003999993205070496,
|
347 |
+
"tpp_threshold_5_total_metric": -0.0009999722242355347,
|
348 |
+
"tpp_threshold_5_intended_diff_only": -0.0009999871253967285,
|
349 |
+
"tpp_threshold_5_unintended_diff_only": -1.4901161193847656e-08,
|
350 |
+
"tpp_threshold_10_total_metric": 0.010000035166740417,
|
351 |
+
"tpp_threshold_10_intended_diff_only": 0.013000011444091797,
|
352 |
+
"tpp_threshold_10_unintended_diff_only": 0.0029999762773513794,
|
353 |
+
"tpp_threshold_20_total_metric": 0.004249989986419678,
|
354 |
+
"tpp_threshold_20_intended_diff_only": 0.018999993801116943,
|
355 |
+
"tpp_threshold_20_unintended_diff_only": 0.014750003814697266,
|
356 |
+
"tpp_threshold_50_total_metric": 0.04099997878074646,
|
357 |
+
"tpp_threshold_50_intended_diff_only": 0.05299997329711914,
|
358 |
+
"tpp_threshold_50_unintended_diff_only": 0.01199999451637268,
|
359 |
+
"tpp_threshold_100_total_metric": 0.07449999451637268,
|
360 |
+
"tpp_threshold_100_intended_diff_only": 0.0899999737739563,
|
361 |
+
"tpp_threshold_100_unintended_diff_only": 0.015499979257583618,
|
362 |
+
"tpp_threshold_500_total_metric": 0.29624997079372406,
|
363 |
+
"tpp_threshold_500_intended_diff_only": 0.33899998664855957,
|
364 |
+
"tpp_threshold_500_unintended_diff_only": 0.04275001585483551
|
365 |
+
},
|
366 |
+
"5": {
|
367 |
+
"tpp_threshold_2_total_metric": 0.0045000165700912476,
|
368 |
+
"tpp_threshold_2_intended_diff_only": 0.009000003337860107,
|
369 |
+
"tpp_threshold_2_unintended_diff_only": 0.00449998676776886,
|
370 |
+
"tpp_threshold_5_total_metric": 0.012500062584877014,
|
371 |
+
"tpp_threshold_5_intended_diff_only": 0.020000040531158447,
|
372 |
+
"tpp_threshold_5_unintended_diff_only": 0.007499977946281433,
|
373 |
+
"tpp_threshold_10_total_metric": 0.012500032782554626,
|
374 |
+
"tpp_threshold_10_intended_diff_only": 0.021000027656555176,
|
375 |
+
"tpp_threshold_10_unintended_diff_only": 0.00849999487400055,
|
376 |
+
"tpp_threshold_20_total_metric": 0.028750047087669373,
|
377 |
+
"tpp_threshold_20_intended_diff_only": 0.03900003433227539,
|
378 |
+
"tpp_threshold_20_unintended_diff_only": 0.010249987244606018,
|
379 |
+
"tpp_threshold_50_total_metric": 0.10075005888938904,
|
380 |
+
"tpp_threshold_50_intended_diff_only": 0.1170000433921814,
|
381 |
+
"tpp_threshold_50_unintended_diff_only": 0.01624998450279236,
|
382 |
+
"tpp_threshold_100_total_metric": 0.27750006318092346,
|
383 |
+
"tpp_threshold_100_intended_diff_only": 0.3060000538825989,
|
384 |
+
"tpp_threshold_100_unintended_diff_only": 0.028499990701675415,
|
385 |
+
"tpp_threshold_500_total_metric": 0.37825003266334534,
|
386 |
+
"tpp_threshold_500_intended_diff_only": 0.4150000214576721,
|
387 |
+
"tpp_threshold_500_unintended_diff_only": 0.03674998879432678
|
388 |
+
},
|
389 |
+
"6": {
|
390 |
+
"tpp_threshold_2_total_metric": 0.01724998652935028,
|
391 |
+
"tpp_threshold_2_intended_diff_only": 0.019999980926513672,
|
392 |
+
"tpp_threshold_2_unintended_diff_only": 0.002749994397163391,
|
393 |
+
"tpp_threshold_5_total_metric": 0.018750011920928955,
|
394 |
+
"tpp_threshold_5_intended_diff_only": 0.018000006675720215,
|
395 |
+
"tpp_threshold_5_unintended_diff_only": -0.0007500052452087402,
|
396 |
+
"tpp_threshold_10_total_metric": 0.03099997341632843,
|
397 |
+
"tpp_threshold_10_intended_diff_only": 0.038999974727630615,
|
398 |
+
"tpp_threshold_10_unintended_diff_only": 0.008000001311302185,
|
399 |
+
"tpp_threshold_20_total_metric": 0.057749971747398376,
|
400 |
+
"tpp_threshold_20_intended_diff_only": 0.0679999589920044,
|
401 |
+
"tpp_threshold_20_unintended_diff_only": 0.010249987244606018,
|
402 |
+
"tpp_threshold_50_total_metric": 0.11649996042251587,
|
403 |
+
"tpp_threshold_50_intended_diff_only": 0.12799996137619019,
|
404 |
+
"tpp_threshold_50_unintended_diff_only": 0.011500000953674316,
|
405 |
+
"tpp_threshold_100_total_metric": 0.24524997174739838,
|
406 |
+
"tpp_threshold_100_intended_diff_only": 0.25599998235702515,
|
407 |
+
"tpp_threshold_100_unintended_diff_only": 0.01075001060962677,
|
408 |
+
"tpp_threshold_500_total_metric": 0.3554999828338623,
|
409 |
+
"tpp_threshold_500_intended_diff_only": 0.37599998712539673,
|
410 |
+
"tpp_threshold_500_unintended_diff_only": 0.020500004291534424
|
411 |
+
}
|
412 |
+
}
|
413 |
+
}
|
414 |
+
}
|
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "tpp",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": false,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "88010eee-5f36-4f31-947e-cec777cc359e",
|
73 |
+
"datetime_epoch_millis": 1740202787337,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"tpp_metrics": {
|
76 |
+
"tpp_threshold_2_total_metric": 0.0037499994039535524,
|
77 |
+
"tpp_threshold_2_intended_diff_only": 0.006599998474121094,
|
78 |
+
"tpp_threshold_2_unintended_diff_only": 0.0028499990701675417,
|
79 |
+
"tpp_threshold_5_total_metric": 0.005775000154972077,
|
80 |
+
"tpp_threshold_5_intended_diff_only": 0.008799999952316284,
|
81 |
+
"tpp_threshold_5_unintended_diff_only": 0.0030249997973442076,
|
82 |
+
"tpp_threshold_10_total_metric": 0.009449997544288635,
|
83 |
+
"tpp_threshold_10_intended_diff_only": 0.013499993085861205,
|
84 |
+
"tpp_threshold_10_unintended_diff_only": 0.0040499955415725705,
|
85 |
+
"tpp_threshold_20_total_metric": 0.016199994087219238,
|
86 |
+
"tpp_threshold_20_intended_diff_only": 0.02059999108314514,
|
87 |
+
"tpp_threshold_20_unintended_diff_only": 0.004399996995925903,
|
88 |
+
"tpp_threshold_50_total_metric": 0.031475001573562617,
|
89 |
+
"tpp_threshold_50_intended_diff_only": 0.03600000143051148,
|
90 |
+
"tpp_threshold_50_unintended_diff_only": 0.004524999856948852,
|
91 |
+
"tpp_threshold_100_total_metric": 0.04169999808073044,
|
92 |
+
"tpp_threshold_100_intended_diff_only": 0.04989999532699585,
|
93 |
+
"tpp_threshold_100_unintended_diff_only": 0.008199997246265411,
|
94 |
+
"tpp_threshold_500_total_metric": 0.12555001527071,
|
95 |
+
"tpp_threshold_500_intended_diff_only": 0.1348000168800354,
|
96 |
+
"tpp_threshold_500_unintended_diff_only": 0.00925000160932541
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
|
102 |
+
"tpp_threshold_2_total_metric": 0.005249989032745361,
|
103 |
+
"tpp_threshold_2_intended_diff_only": 0.0075999975204467775,
|
104 |
+
"tpp_threshold_2_unintended_diff_only": 0.002350008487701416,
|
105 |
+
"tpp_threshold_5_total_metric": 0.0073000043630599976,
|
106 |
+
"tpp_threshold_5_intended_diff_only": 0.010000014305114746,
|
107 |
+
"tpp_threshold_5_unintended_diff_only": 0.0027000099420547486,
|
108 |
+
"tpp_threshold_10_total_metric": 0.010399997234344482,
|
109 |
+
"tpp_threshold_10_intended_diff_only": 0.013199996948242188,
|
110 |
+
"tpp_threshold_10_unintended_diff_only": 0.002799999713897705,
|
111 |
+
"tpp_threshold_20_total_metric": 0.01979999840259552,
|
112 |
+
"tpp_threshold_20_intended_diff_only": 0.022600007057189942,
|
113 |
+
"tpp_threshold_20_unintended_diff_only": 0.0028000086545944213,
|
114 |
+
"tpp_threshold_50_total_metric": 0.03664999902248382,
|
115 |
+
"tpp_threshold_50_intended_diff_only": 0.04000000953674317,
|
116 |
+
"tpp_threshold_50_unintended_diff_only": 0.0033500105142593383,
|
117 |
+
"tpp_threshold_100_total_metric": 0.050449994206428525,
|
118 |
+
"tpp_threshold_100_intended_diff_only": 0.05640000104904175,
|
119 |
+
"tpp_threshold_100_unintended_diff_only": 0.00595000684261322,
|
120 |
+
"tpp_threshold_500_total_metric": 0.14450002312660218,
|
121 |
+
"tpp_threshold_500_intended_diff_only": 0.15240002870559693,
|
122 |
+
"tpp_threshold_500_unintended_diff_only": 0.007900005578994751
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
|
126 |
+
"tpp_threshold_2_total_metric": 0.0022500097751617433,
|
127 |
+
"tpp_threshold_2_intended_diff_only": 0.00559999942779541,
|
128 |
+
"tpp_threshold_2_unintended_diff_only": 0.003349989652633667,
|
129 |
+
"tpp_threshold_5_total_metric": 0.004249995946884156,
|
130 |
+
"tpp_threshold_5_intended_diff_only": 0.007599985599517823,
|
131 |
+
"tpp_threshold_5_unintended_diff_only": 0.003349989652633667,
|
132 |
+
"tpp_threshold_10_total_metric": 0.008499997854232787,
|
133 |
+
"tpp_threshold_10_intended_diff_only": 0.013799989223480224,
|
134 |
+
"tpp_threshold_10_unintended_diff_only": 0.005299991369247437,
|
135 |
+
"tpp_threshold_20_total_metric": 0.012599989771842957,
|
136 |
+
"tpp_threshold_20_intended_diff_only": 0.01859997510910034,
|
137 |
+
"tpp_threshold_20_unintended_diff_only": 0.005999985337257385,
|
138 |
+
"tpp_threshold_50_total_metric": 0.026300004124641417,
|
139 |
+
"tpp_threshold_50_intended_diff_only": 0.03199999332427979,
|
140 |
+
"tpp_threshold_50_unintended_diff_only": 0.0056999891996383665,
|
141 |
+
"tpp_threshold_100_total_metric": 0.03295000195503235,
|
142 |
+
"tpp_threshold_100_intended_diff_only": 0.04339998960494995,
|
143 |
+
"tpp_threshold_100_unintended_diff_only": 0.010449987649917603,
|
144 |
+
"tpp_threshold_500_total_metric": 0.10660000741481782,
|
145 |
+
"tpp_threshold_500_intended_diff_only": 0.11720000505447388,
|
146 |
+
"tpp_threshold_500_unintended_diff_only": 0.010599997639656068
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
|
150 |
+
"sae_lens_id": "custom_sae",
|
151 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000",
|
152 |
+
"sae_lens_version": "5.5.0",
|
153 |
+
"sae_cfg_dict": {
|
154 |
+
"model_name": "gemma-2-2b",
|
155 |
+
"d_in": 2304,
|
156 |
+
"d_sae": 65536,
|
157 |
+
"hook_layer": 12,
|
158 |
+
"hook_name": "blocks.12.hook_resid_post",
|
159 |
+
"context_size": null,
|
160 |
+
"hook_head_index": null,
|
161 |
+
"architecture": "topk",
|
162 |
+
"apply_b_dec_to_input": null,
|
163 |
+
"finetuning_scaling_factor": null,
|
164 |
+
"activation_fn_str": "",
|
165 |
+
"prepend_bos": true,
|
166 |
+
"normalize_activations": "none",
|
167 |
+
"dtype": "bfloat16",
|
168 |
+
"device": "",
|
169 |
+
"dataset_path": "",
|
170 |
+
"dataset_trust_remote_code": true,
|
171 |
+
"seqpos_slice": [
|
172 |
+
null
|
173 |
+
],
|
174 |
+
"training_tokens": -100000,
|
175 |
+
"sae_lens_training_version": null,
|
176 |
+
"neuronpedia_id": null
|
177 |
+
},
|
178 |
+
"eval_result_unstructured": {
|
179 |
+
"LabHC/bias_in_bios_class_set1": {
|
180 |
+
"0": {
|
181 |
+
"tpp_threshold_2_total_metric": 0.008250012993812561,
|
182 |
+
"tpp_threshold_2_intended_diff_only": 0.012000024318695068,
|
183 |
+
"tpp_threshold_2_unintended_diff_only": 0.0037500113248825073,
|
184 |
+
"tpp_threshold_5_total_metric": 0.011250033974647522,
|
185 |
+
"tpp_threshold_5_intended_diff_only": 0.01500004529953003,
|
186 |
+
"tpp_threshold_5_unintended_diff_only": 0.0037500113248825073,
|
187 |
+
"tpp_threshold_10_total_metric": 0.006249979138374329,
|
188 |
+
"tpp_threshold_10_intended_diff_only": 0.009999990463256836,
|
189 |
+
"tpp_threshold_10_unintended_diff_only": 0.0037500113248825073,
|
190 |
+
"tpp_threshold_20_total_metric": 0.01975002884864807,
|
191 |
+
"tpp_threshold_20_intended_diff_only": 0.021000027656555176,
|
192 |
+
"tpp_threshold_20_unintended_diff_only": 0.0012499988079071045,
|
193 |
+
"tpp_threshold_50_total_metric": 0.036500006914138794,
|
194 |
+
"tpp_threshold_50_intended_diff_only": 0.04000002145767212,
|
195 |
+
"tpp_threshold_50_unintended_diff_only": 0.003500014543533325,
|
196 |
+
"tpp_threshold_100_total_metric": 0.04475003480911255,
|
197 |
+
"tpp_threshold_100_intended_diff_only": 0.058000028133392334,
|
198 |
+
"tpp_threshold_100_unintended_diff_only": 0.013249993324279785,
|
199 |
+
"tpp_threshold_500_total_metric": 0.14125002920627594,
|
200 |
+
"tpp_threshold_500_intended_diff_only": 0.15500003099441528,
|
201 |
+
"tpp_threshold_500_unintended_diff_only": 0.013750001788139343
|
202 |
+
},
|
203 |
+
"1": {
|
204 |
+
"tpp_threshold_2_total_metric": 0.009250015020370483,
|
205 |
+
"tpp_threshold_2_intended_diff_only": 0.009000003337860107,
|
206 |
+
"tpp_threshold_2_unintended_diff_only": -0.000250011682510376,
|
207 |
+
"tpp_threshold_5_total_metric": 0.004000052809715271,
|
208 |
+
"tpp_threshold_5_intended_diff_only": 0.005000054836273193,
|
209 |
+
"tpp_threshold_5_unintended_diff_only": 0.0010000020265579224,
|
210 |
+
"tpp_threshold_10_total_metric": 0.006250038743019104,
|
211 |
+
"tpp_threshold_10_intended_diff_only": 0.00700002908706665,
|
212 |
+
"tpp_threshold_10_unintended_diff_only": 0.0007499903440475464,
|
213 |
+
"tpp_threshold_20_total_metric": 0.0045000165700912476,
|
214 |
+
"tpp_threshold_20_intended_diff_only": 0.008000016212463379,
|
215 |
+
"tpp_threshold_20_unintended_diff_only": 0.0034999996423721313,
|
216 |
+
"tpp_threshold_50_total_metric": 0.015500038862228394,
|
217 |
+
"tpp_threshold_50_intended_diff_only": 0.020000040531158447,
|
218 |
+
"tpp_threshold_50_unintended_diff_only": 0.004500001668930054,
|
219 |
+
"tpp_threshold_100_total_metric": 0.02699999511241913,
|
220 |
+
"tpp_threshold_100_intended_diff_only": 0.03200000524520874,
|
221 |
+
"tpp_threshold_100_unintended_diff_only": 0.005000010132789612,
|
222 |
+
"tpp_threshold_500_total_metric": 0.09950004518032074,
|
223 |
+
"tpp_threshold_500_intended_diff_only": 0.10700005292892456,
|
224 |
+
"tpp_threshold_500_unintended_diff_only": 0.007500007748603821
|
225 |
+
},
|
226 |
+
"2": {
|
227 |
+
"tpp_threshold_2_total_metric": 0.008249998092651367,
|
228 |
+
"tpp_threshold_2_intended_diff_only": 0.013000011444091797,
|
229 |
+
"tpp_threshold_2_unintended_diff_only": 0.00475001335144043,
|
230 |
+
"tpp_threshold_5_total_metric": 0.017000004649162292,
|
231 |
+
"tpp_threshold_5_intended_diff_only": 0.022000014781951904,
|
232 |
+
"tpp_threshold_5_unintended_diff_only": 0.005000010132789612,
|
233 |
+
"tpp_threshold_10_total_metric": 0.03350000083446503,
|
234 |
+
"tpp_threshold_10_intended_diff_only": 0.03700000047683716,
|
235 |
+
"tpp_threshold_10_unintended_diff_only": 0.0034999996423721313,
|
236 |
+
"tpp_threshold_20_total_metric": 0.04799997806549072,
|
237 |
+
"tpp_threshold_20_intended_diff_only": 0.050999999046325684,
|
238 |
+
"tpp_threshold_20_unintended_diff_only": 0.003000020980834961,
|
239 |
+
"tpp_threshold_50_total_metric": 0.06849998235702515,
|
240 |
+
"tpp_threshold_50_intended_diff_only": 0.06999999284744263,
|
241 |
+
"tpp_threshold_50_unintended_diff_only": 0.0015000104904174805,
|
242 |
+
"tpp_threshold_100_total_metric": 0.09125001728534698,
|
243 |
+
"tpp_threshold_100_intended_diff_only": 0.09500002861022949,
|
244 |
+
"tpp_threshold_100_unintended_diff_only": 0.0037500113248825073,
|
245 |
+
"tpp_threshold_500_total_metric": 0.2682500183582306,
|
246 |
+
"tpp_threshold_500_intended_diff_only": 0.2720000147819519,
|
247 |
+
"tpp_threshold_500_unintended_diff_only": 0.0037499964237213135
|
248 |
+
},
|
249 |
+
"6": {
|
250 |
+
"tpp_threshold_2_total_metric": -0.000500023365020752,
|
251 |
+
"tpp_threshold_2_intended_diff_only": 0.0009999871253967285,
|
252 |
+
"tpp_threshold_2_unintended_diff_only": 0.0015000104904174805,
|
253 |
+
"tpp_threshold_5_total_metric": 0.002249985933303833,
|
254 |
+
"tpp_threshold_5_intended_diff_only": 0.0009999871253967285,
|
255 |
+
"tpp_threshold_5_unintended_diff_only": -0.0012499988079071045,
|
256 |
+
"tpp_threshold_10_total_metric": 0.003000006079673767,
|
257 |
+
"tpp_threshold_10_intended_diff_only": 0.0040000081062316895,
|
258 |
+
"tpp_threshold_10_unintended_diff_only": 0.0010000020265579224,
|
259 |
+
"tpp_threshold_20_total_metric": 0.002750024199485779,
|
260 |
+
"tpp_threshold_20_intended_diff_only": 0.0020000338554382324,
|
261 |
+
"tpp_threshold_20_unintended_diff_only": -0.0007499903440475464,
|
262 |
+
"tpp_threshold_50_total_metric": 0.004249989986419678,
|
263 |
+
"tpp_threshold_50_intended_diff_only": 0.004999995231628418,
|
264 |
+
"tpp_threshold_50_unintended_diff_only": 0.0007500052452087402,
|
265 |
+
"tpp_threshold_100_total_metric": 0.006249964237213135,
|
266 |
+
"tpp_threshold_100_intended_diff_only": 0.0059999823570251465,
|
267 |
+
"tpp_threshold_100_unintended_diff_only": -0.0002499818801879883,
|
268 |
+
"tpp_threshold_500_total_metric": 0.012250006198883057,
|
269 |
+
"tpp_threshold_500_intended_diff_only": 0.017000019550323486,
|
270 |
+
"tpp_threshold_500_unintended_diff_only": 0.00475001335144043
|
271 |
+
},
|
272 |
+
"9": {
|
273 |
+
"tpp_threshold_2_total_metric": 0.000999942421913147,
|
274 |
+
"tpp_threshold_2_intended_diff_only": 0.0029999613761901855,
|
275 |
+
"tpp_threshold_2_unintended_diff_only": 0.0020000189542770386,
|
276 |
+
"tpp_threshold_5_total_metric": 0.0019999444484710693,
|
277 |
+
"tpp_threshold_5_intended_diff_only": 0.006999969482421875,
|
278 |
+
"tpp_threshold_5_unintended_diff_only": 0.005000025033950806,
|
279 |
+
"tpp_threshold_10_total_metric": 0.0029999613761901855,
|
280 |
+
"tpp_threshold_10_intended_diff_only": 0.007999956607818604,
|
281 |
+
"tpp_threshold_10_unintended_diff_only": 0.004999995231628418,
|
282 |
+
"tpp_threshold_20_total_metric": 0.02399994432926178,
|
283 |
+
"tpp_threshold_20_intended_diff_only": 0.030999958515167236,
|
284 |
+
"tpp_threshold_20_unintended_diff_only": 0.0070000141859054565,
|
285 |
+
"tpp_threshold_50_total_metric": 0.05849997699260712,
|
286 |
+
"tpp_threshold_50_intended_diff_only": 0.06499999761581421,
|
287 |
+
"tpp_threshold_50_unintended_diff_only": 0.006500020623207092,
|
288 |
+
"tpp_threshold_100_total_metric": 0.08299995958805084,
|
289 |
+
"tpp_threshold_100_intended_diff_only": 0.09099996089935303,
|
290 |
+
"tpp_threshold_100_unintended_diff_only": 0.008000001311302185,
|
291 |
+
"tpp_threshold_500_total_metric": 0.20125001668930054,
|
292 |
+
"tpp_threshold_500_intended_diff_only": 0.21100002527236938,
|
293 |
+
"tpp_threshold_500_unintended_diff_only": 0.009750008583068848
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"canrager/amazon_reviews_mcauley_1and5": {
|
297 |
+
"1": {
|
298 |
+
"tpp_threshold_2_total_metric": 0.005000010132789612,
|
299 |
+
"tpp_threshold_2_intended_diff_only": 0.009000003337860107,
|
300 |
+
"tpp_threshold_2_unintended_diff_only": 0.003999993205070496,
|
301 |
+
"tpp_threshold_5_total_metric": 0.002249985933303833,
|
302 |
+
"tpp_threshold_5_intended_diff_only": 0.0059999823570251465,
|
303 |
+
"tpp_threshold_5_unintended_diff_only": 0.0037499964237213135,
|
304 |
+
"tpp_threshold_10_total_metric": 0.0004999637603759766,
|
305 |
+
"tpp_threshold_10_intended_diff_only": 0.007999956607818604,
|
306 |
+
"tpp_threshold_10_unintended_diff_only": 0.007499992847442627,
|
307 |
+
"tpp_threshold_20_total_metric": -2.9802322387695312e-08,
|
308 |
+
"tpp_threshold_20_intended_diff_only": 0.003999948501586914,
|
309 |
+
"tpp_threshold_20_unintended_diff_only": 0.003999978303909302,
|
310 |
+
"tpp_threshold_50_total_metric": 0.0027499645948410034,
|
311 |
+
"tpp_threshold_50_intended_diff_only": 0.0029999613761901855,
|
312 |
+
"tpp_threshold_50_unintended_diff_only": 0.00024999678134918213,
|
313 |
+
"tpp_threshold_100_total_metric": 0.00024996697902679443,
|
314 |
+
"tpp_threshold_100_intended_diff_only": 0.006999969482421875,
|
315 |
+
"tpp_threshold_100_unintended_diff_only": 0.006750002503395081,
|
316 |
+
"tpp_threshold_500_total_metric": 0.02699999511241913,
|
317 |
+
"tpp_threshold_500_intended_diff_only": 0.02799999713897705,
|
318 |
+
"tpp_threshold_500_unintended_diff_only": 0.0010000020265579224
|
319 |
+
},
|
320 |
+
"2": {
|
321 |
+
"tpp_threshold_2_total_metric": 0.009250015020370483,
|
322 |
+
"tpp_threshold_2_intended_diff_only": 0.009000003337860107,
|
323 |
+
"tpp_threshold_2_unintended_diff_only": -0.000250011682510376,
|
324 |
+
"tpp_threshold_5_total_metric": 0.002500012516975403,
|
325 |
+
"tpp_threshold_5_intended_diff_only": 0.009000003337860107,
|
326 |
+
"tpp_threshold_5_unintended_diff_only": 0.006499990820884705,
|
327 |
+
"tpp_threshold_10_total_metric": 0.009500011801719666,
|
328 |
+
"tpp_threshold_10_intended_diff_only": 0.014999985694885254,
|
329 |
+
"tpp_threshold_10_unintended_diff_only": 0.005499973893165588,
|
330 |
+
"tpp_threshold_20_total_metric": 0.012999996542930603,
|
331 |
+
"tpp_threshold_20_intended_diff_only": 0.015999972820281982,
|
332 |
+
"tpp_threshold_20_unintended_diff_only": 0.0029999762773513794,
|
333 |
+
"tpp_threshold_50_total_metric": 0.02899998426437378,
|
334 |
+
"tpp_threshold_50_intended_diff_only": 0.0339999794960022,
|
335 |
+
"tpp_threshold_50_unintended_diff_only": 0.004999995231628418,
|
336 |
+
"tpp_threshold_100_total_metric": 0.03225000202655792,
|
337 |
+
"tpp_threshold_100_intended_diff_only": 0.042999982833862305,
|
338 |
+
"tpp_threshold_100_unintended_diff_only": 0.010749980807304382,
|
339 |
+
"tpp_threshold_500_total_metric": 0.09800000488758087,
|
340 |
+
"tpp_threshold_500_intended_diff_only": 0.10600000619888306,
|
341 |
+
"tpp_threshold_500_unintended_diff_only": 0.008000001311302185
|
342 |
+
},
|
343 |
+
"3": {
|
344 |
+
"tpp_threshold_2_total_metric": -0.008000016212463379,
|
345 |
+
"tpp_threshold_2_intended_diff_only": -0.003000020980834961,
|
346 |
+
"tpp_threshold_2_unintended_diff_only": 0.004999995231628418,
|
347 |
+
"tpp_threshold_5_total_metric": 0.005500003695487976,
|
348 |
+
"tpp_threshold_5_intended_diff_only": 0.004999995231628418,
|
349 |
+
"tpp_threshold_5_unintended_diff_only": -0.0005000084638595581,
|
350 |
+
"tpp_threshold_10_total_metric": 0.00875002145767212,
|
351 |
+
"tpp_threshold_10_intended_diff_only": 0.013000011444091797,
|
352 |
+
"tpp_threshold_10_unintended_diff_only": 0.004249989986419678,
|
353 |
+
"tpp_threshold_20_total_metric": 0.0020000189542770386,
|
354 |
+
"tpp_threshold_20_intended_diff_only": 0.009000003337860107,
|
355 |
+
"tpp_threshold_20_unintended_diff_only": 0.006999984383583069,
|
356 |
+
"tpp_threshold_50_total_metric": 0.011499986052513123,
|
357 |
+
"tpp_threshold_50_intended_diff_only": 0.015999972820281982,
|
358 |
+
"tpp_threshold_50_unintended_diff_only": 0.00449998676776886,
|
359 |
+
"tpp_threshold_100_total_metric": 0.020749986171722412,
|
360 |
+
"tpp_threshold_100_intended_diff_only": 0.029999971389770508,
|
361 |
+
"tpp_threshold_100_unintended_diff_only": 0.009249985218048096,
|
362 |
+
"tpp_threshold_500_total_metric": 0.07975000143051147,
|
363 |
+
"tpp_threshold_500_intended_diff_only": 0.09299999475479126,
|
364 |
+
"tpp_threshold_500_unintended_diff_only": 0.013249993324279785
|
365 |
+
},
|
366 |
+
"5": {
|
367 |
+
"tpp_threshold_2_total_metric": -0.004749953746795654,
|
368 |
+
"tpp_threshold_2_intended_diff_only": -0.001999974250793457,
|
369 |
+
"tpp_threshold_2_unintended_diff_only": 0.0027499794960021973,
|
370 |
+
"tpp_threshold_5_total_metric": -0.0009999871253967285,
|
371 |
+
"tpp_threshold_5_intended_diff_only": 0.004999995231628418,
|
372 |
+
"tpp_threshold_5_unintended_diff_only": 0.0059999823570251465,
|
373 |
+
"tpp_threshold_10_total_metric": -0.0037499815225601196,
|
374 |
+
"tpp_threshold_10_intended_diff_only": 0.003000020980834961,
|
375 |
+
"tpp_threshold_10_unintended_diff_only": 0.006750002503395081,
|
376 |
+
"tpp_threshold_20_total_metric": 0.005500003695487976,
|
377 |
+
"tpp_threshold_20_intended_diff_only": 0.013999998569488525,
|
378 |
+
"tpp_threshold_20_unintended_diff_only": 0.00849999487400055,
|
379 |
+
"tpp_threshold_50_total_metric": 0.03400006890296936,
|
380 |
+
"tpp_threshold_50_intended_diff_only": 0.04200005531311035,
|
381 |
+
"tpp_threshold_50_unintended_diff_only": 0.007999986410140991,
|
382 |
+
"tpp_threshold_100_total_metric": 0.042000025510787964,
|
383 |
+
"tpp_threshold_100_intended_diff_only": 0.05900001525878906,
|
384 |
+
"tpp_threshold_100_unintended_diff_only": 0.0169999897480011,
|
385 |
+
"tpp_threshold_500_total_metric": 0.11125005781650543,
|
386 |
+
"tpp_threshold_500_intended_diff_only": 0.13100004196166992,
|
387 |
+
"tpp_threshold_500_unintended_diff_only": 0.01974998414516449
|
388 |
+
},
|
389 |
+
"6": {
|
390 |
+
"tpp_threshold_2_total_metric": 0.009749993681907654,
|
391 |
+
"tpp_threshold_2_intended_diff_only": 0.014999985694885254,
|
392 |
+
"tpp_threshold_2_unintended_diff_only": 0.0052499920129776,
|
393 |
+
"tpp_threshold_5_total_metric": 0.011999964714050293,
|
394 |
+
"tpp_threshold_5_intended_diff_only": 0.012999951839447021,
|
395 |
+
"tpp_threshold_5_unintended_diff_only": 0.0009999871253967285,
|
396 |
+
"tpp_threshold_10_total_metric": 0.0274999737739563,
|
397 |
+
"tpp_threshold_10_intended_diff_only": 0.029999971389770508,
|
398 |
+
"tpp_threshold_10_unintended_diff_only": 0.002499997615814209,
|
399 |
+
"tpp_threshold_20_total_metric": 0.04249995946884155,
|
400 |
+
"tpp_threshold_20_intended_diff_only": 0.04999995231628418,
|
401 |
+
"tpp_threshold_20_unintended_diff_only": 0.007499992847442627,
|
402 |
+
"tpp_threshold_50_total_metric": 0.05425001680850983,
|
403 |
+
"tpp_threshold_50_intended_diff_only": 0.06499999761581421,
|
404 |
+
"tpp_threshold_50_unintended_diff_only": 0.010749980807304382,
|
405 |
+
"tpp_threshold_100_total_metric": 0.06950002908706665,
|
406 |
+
"tpp_threshold_100_intended_diff_only": 0.078000009059906,
|
407 |
+
"tpp_threshold_100_unintended_diff_only": 0.008499979972839355,
|
408 |
+
"tpp_threshold_500_total_metric": 0.21699997782707214,
|
409 |
+
"tpp_threshold_500_intended_diff_only": 0.2279999852180481,
|
410 |
+
"tpp_threshold_500_unintended_diff_only": 0.011000007390975952
|
411 |
+
}
|
412 |
+
}
|
413 |
+
}
|
414 |
+
}
|
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "tpp",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": false,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "a63c9e71-5b6c-46d2-b489-d01f4d9597c7",
|
73 |
+
"datetime_epoch_millis": 1740202630595,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"tpp_metrics": {
|
76 |
+
"tpp_threshold_2_total_metric": 0.004799993336200714,
|
77 |
+
"tpp_threshold_2_intended_diff_only": 0.0075999975204467775,
|
78 |
+
"tpp_threshold_2_unintended_diff_only": 0.0028000041842460633,
|
79 |
+
"tpp_threshold_5_total_metric": 0.007149985432624817,
|
80 |
+
"tpp_threshold_5_intended_diff_only": 0.010399985313415527,
|
81 |
+
"tpp_threshold_5_unintended_diff_only": 0.0032499998807907103,
|
82 |
+
"tpp_threshold_10_total_metric": 0.014450006186962128,
|
83 |
+
"tpp_threshold_10_intended_diff_only": 0.018300002813339232,
|
84 |
+
"tpp_threshold_10_unintended_diff_only": 0.003849996626377106,
|
85 |
+
"tpp_threshold_20_total_metric": 0.022200000286102296,
|
86 |
+
"tpp_threshold_20_intended_diff_only": 0.02680000066757202,
|
87 |
+
"tpp_threshold_20_unintended_diff_only": 0.004600000381469727,
|
88 |
+
"tpp_threshold_50_total_metric": 0.04717499911785126,
|
89 |
+
"tpp_threshold_50_intended_diff_only": 0.051999998092651364,
|
90 |
+
"tpp_threshold_50_unintended_diff_only": 0.004824998974800109,
|
91 |
+
"tpp_threshold_100_total_metric": 0.07345000356435775,
|
92 |
+
"tpp_threshold_100_intended_diff_only": 0.08180000185966492,
|
93 |
+
"tpp_threshold_100_unintended_diff_only": 0.00834999829530716,
|
94 |
+
"tpp_threshold_500_total_metric": 0.22270000725984573,
|
95 |
+
"tpp_threshold_500_intended_diff_only": 0.23360000848770143,
|
96 |
+
"tpp_threshold_500_unintended_diff_only": 0.010900001227855682
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
|
102 |
+
"tpp_threshold_2_total_metric": 0.004999995231628418,
|
103 |
+
"tpp_threshold_2_intended_diff_only": 0.007600009441375732,
|
104 |
+
"tpp_threshold_2_unintended_diff_only": 0.0026000142097473145,
|
105 |
+
"tpp_threshold_5_total_metric": 0.01009998917579651,
|
106 |
+
"tpp_threshold_5_intended_diff_only": 0.013199996948242188,
|
107 |
+
"tpp_threshold_5_unintended_diff_only": 0.0031000077724456787,
|
108 |
+
"tpp_threshold_10_total_metric": 0.017699992656707762,
|
109 |
+
"tpp_threshold_10_intended_diff_only": 0.020399999618530274,
|
110 |
+
"tpp_threshold_10_unintended_diff_only": 0.0027000069618225097,
|
111 |
+
"tpp_threshold_20_total_metric": 0.03189999759197235,
|
112 |
+
"tpp_threshold_20_intended_diff_only": 0.03480000495910644,
|
113 |
+
"tpp_threshold_20_unintended_diff_only": 0.0029000073671340944,
|
114 |
+
"tpp_threshold_50_total_metric": 0.05539998412132263,
|
115 |
+
"tpp_threshold_50_intended_diff_only": 0.058799993991851804,
|
116 |
+
"tpp_threshold_50_unintended_diff_only": 0.0034000098705291746,
|
117 |
+
"tpp_threshold_100_total_metric": 0.08500000834465027,
|
118 |
+
"tpp_threshold_100_intended_diff_only": 0.08960001468658448,
|
119 |
+
"tpp_threshold_100_unintended_diff_only": 0.004600006341934204,
|
120 |
+
"tpp_threshold_500_total_metric": 0.2710500031709671,
|
121 |
+
"tpp_threshold_500_intended_diff_only": 0.27760001420974734,
|
122 |
+
"tpp_threshold_500_unintended_diff_only": 0.006550011038780212
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
|
126 |
+
"tpp_threshold_2_total_metric": 0.00459999144077301,
|
127 |
+
"tpp_threshold_2_intended_diff_only": 0.007599985599517823,
|
128 |
+
"tpp_threshold_2_unintended_diff_only": 0.002999994158744812,
|
129 |
+
"tpp_threshold_5_total_metric": 0.004199981689453125,
|
130 |
+
"tpp_threshold_5_intended_diff_only": 0.007599973678588867,
|
131 |
+
"tpp_threshold_5_unintended_diff_only": 0.0033999919891357423,
|
132 |
+
"tpp_threshold_10_total_metric": 0.011200019717216491,
|
133 |
+
"tpp_threshold_10_intended_diff_only": 0.016200006008148193,
|
134 |
+
"tpp_threshold_10_unintended_diff_only": 0.004999986290931702,
|
135 |
+
"tpp_threshold_20_total_metric": 0.012500002980232239,
|
136 |
+
"tpp_threshold_20_intended_diff_only": 0.018799996376037596,
|
137 |
+
"tpp_threshold_20_unintended_diff_only": 0.006299993395805359,
|
138 |
+
"tpp_threshold_50_total_metric": 0.038950014114379886,
|
139 |
+
"tpp_threshold_50_intended_diff_only": 0.045200002193450925,
|
140 |
+
"tpp_threshold_50_unintended_diff_only": 0.006249988079071045,
|
141 |
+
"tpp_threshold_100_total_metric": 0.06189999878406525,
|
142 |
+
"tpp_threshold_100_intended_diff_only": 0.07399998903274536,
|
143 |
+
"tpp_threshold_100_unintended_diff_only": 0.012099990248680114,
|
144 |
+
"tpp_threshold_500_total_metric": 0.17435001134872435,
|
145 |
+
"tpp_threshold_500_intended_diff_only": 0.18960000276565553,
|
146 |
+
"tpp_threshold_500_unintended_diff_only": 0.015249991416931152
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
|
150 |
+
"sae_lens_id": "custom_sae",
|
151 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000",
|
152 |
+
"sae_lens_version": "5.5.0",
|
153 |
+
"sae_cfg_dict": {
|
154 |
+
"model_name": "gemma-2-2b",
|
155 |
+
"d_in": 2304,
|
156 |
+
"d_sae": 65536,
|
157 |
+
"hook_layer": 12,
|
158 |
+
"hook_name": "blocks.12.hook_resid_post",
|
159 |
+
"context_size": null,
|
160 |
+
"hook_head_index": null,
|
161 |
+
"architecture": "topk",
|
162 |
+
"apply_b_dec_to_input": null,
|
163 |
+
"finetuning_scaling_factor": null,
|
164 |
+
"activation_fn_str": "",
|
165 |
+
"prepend_bos": true,
|
166 |
+
"normalize_activations": "none",
|
167 |
+
"dtype": "bfloat16",
|
168 |
+
"device": "",
|
169 |
+
"dataset_path": "",
|
170 |
+
"dataset_trust_remote_code": true,
|
171 |
+
"seqpos_slice": [
|
172 |
+
null
|
173 |
+
],
|
174 |
+
"training_tokens": -100000,
|
175 |
+
"sae_lens_training_version": null,
|
176 |
+
"neuronpedia_id": null
|
177 |
+
},
|
178 |
+
"eval_result_unstructured": {
|
179 |
+
"LabHC/bias_in_bios_class_set1": {
|
180 |
+
"0": {
|
181 |
+
"tpp_threshold_2_total_metric": 0.0072500258684158325,
|
182 |
+
"tpp_threshold_2_intended_diff_only": 0.01100003719329834,
|
183 |
+
"tpp_threshold_2_unintended_diff_only": 0.0037500113248825073,
|
184 |
+
"tpp_threshold_5_total_metric": 0.014999985694885254,
|
185 |
+
"tpp_threshold_5_intended_diff_only": 0.018999993801116943,
|
186 |
+
"tpp_threshold_5_unintended_diff_only": 0.0040000081062316895,
|
187 |
+
"tpp_threshold_10_total_metric": 0.00974997878074646,
|
188 |
+
"tpp_threshold_10_intended_diff_only": 0.013999998569488525,
|
189 |
+
"tpp_threshold_10_unintended_diff_only": 0.004250019788742065,
|
190 |
+
"tpp_threshold_20_total_metric": 0.025000005960464478,
|
191 |
+
"tpp_threshold_20_intended_diff_only": 0.027000010013580322,
|
192 |
+
"tpp_threshold_20_unintended_diff_only": 0.0020000040531158447,
|
193 |
+
"tpp_threshold_50_total_metric": 0.03924998641014099,
|
194 |
+
"tpp_threshold_50_intended_diff_only": 0.041999995708465576,
|
195 |
+
"tpp_threshold_50_unintended_diff_only": 0.002750009298324585,
|
196 |
+
"tpp_threshold_100_total_metric": 0.06700004637241364,
|
197 |
+
"tpp_threshold_100_intended_diff_only": 0.07100003957748413,
|
198 |
+
"tpp_threshold_100_unintended_diff_only": 0.003999993205070496,
|
199 |
+
"tpp_threshold_500_total_metric": 0.27824999392032623,
|
200 |
+
"tpp_threshold_500_intended_diff_only": 0.281000018119812,
|
201 |
+
"tpp_threshold_500_unintended_diff_only": 0.002750024199485779
|
202 |
+
},
|
203 |
+
"1": {
|
204 |
+
"tpp_threshold_2_total_metric": 0.012750014662742615,
|
205 |
+
"tpp_threshold_2_intended_diff_only": 0.013000011444091797,
|
206 |
+
"tpp_threshold_2_unintended_diff_only": 0.00024999678134918213,
|
207 |
+
"tpp_threshold_5_total_metric": 0.010500013828277588,
|
208 |
+
"tpp_threshold_5_intended_diff_only": 0.013000011444091797,
|
209 |
+
"tpp_threshold_5_unintended_diff_only": 0.002499997615814209,
|
210 |
+
"tpp_threshold_10_total_metric": 0.013500005006790161,
|
211 |
+
"tpp_threshold_10_intended_diff_only": 0.013000011444091797,
|
212 |
+
"tpp_threshold_10_unintended_diff_only": -0.0004999935626983643,
|
213 |
+
"tpp_threshold_20_total_metric": 0.012750014662742615,
|
214 |
+
"tpp_threshold_20_intended_diff_only": 0.017000019550323486,
|
215 |
+
"tpp_threshold_20_unintended_diff_only": 0.004250004887580872,
|
216 |
+
"tpp_threshold_50_total_metric": 0.026250004768371582,
|
217 |
+
"tpp_threshold_50_intended_diff_only": 0.03100001811981201,
|
218 |
+
"tpp_threshold_50_unintended_diff_only": 0.00475001335144043,
|
219 |
+
"tpp_threshold_100_total_metric": 0.053500011563301086,
|
220 |
+
"tpp_threshold_100_intended_diff_only": 0.05900001525878906,
|
221 |
+
"tpp_threshold_100_unintended_diff_only": 0.005500003695487976,
|
222 |
+
"tpp_threshold_500_total_metric": 0.22600004076957703,
|
223 |
+
"tpp_threshold_500_intended_diff_only": 0.2330000400543213,
|
224 |
+
"tpp_threshold_500_unintended_diff_only": 0.006999999284744263
|
225 |
+
},
|
226 |
+
"2": {
|
227 |
+
"tpp_threshold_2_total_metric": 0.0004999786615371704,
|
228 |
+
"tpp_threshold_2_intended_diff_only": 0.004999995231628418,
|
229 |
+
"tpp_threshold_2_unintended_diff_only": 0.0045000165700912476,
|
230 |
+
"tpp_threshold_5_total_metric": 0.019749954342842102,
|
231 |
+
"tpp_threshold_5_intended_diff_only": 0.02499997615814209,
|
232 |
+
"tpp_threshold_5_unintended_diff_only": 0.005250021815299988,
|
233 |
+
"tpp_threshold_10_total_metric": 0.05399997532367706,
|
234 |
+
"tpp_threshold_10_intended_diff_only": 0.05699998140335083,
|
235 |
+
"tpp_threshold_10_unintended_diff_only": 0.003000006079673767,
|
236 |
+
"tpp_threshold_20_total_metric": 0.07900001108646393,
|
237 |
+
"tpp_threshold_20_intended_diff_only": 0.0820000171661377,
|
238 |
+
"tpp_threshold_20_unintended_diff_only": 0.003000006079673767,
|
239 |
+
"tpp_threshold_50_total_metric": 0.12099999189376831,
|
240 |
+
"tpp_threshold_50_intended_diff_only": 0.125,
|
241 |
+
"tpp_threshold_50_unintended_diff_only": 0.0040000081062316895,
|
242 |
+
"tpp_threshold_100_total_metric": 0.17149999737739563,
|
243 |
+
"tpp_threshold_100_intended_diff_only": 0.17500001192092896,
|
244 |
+
"tpp_threshold_100_unintended_diff_only": 0.003500014543533325,
|
245 |
+
"tpp_threshold_500_total_metric": 0.3410000056028366,
|
246 |
+
"tpp_threshold_500_intended_diff_only": 0.34700000286102295,
|
247 |
+
"tpp_threshold_500_unintended_diff_only": 0.00599999725818634
|
248 |
+
},
|
249 |
+
"6": {
|
250 |
+
"tpp_threshold_2_total_metric": 0.0012499988079071045,
|
251 |
+
"tpp_threshold_2_intended_diff_only": 0.003000020980834961,
|
252 |
+
"tpp_threshold_2_unintended_diff_only": 0.0017500221729278564,
|
253 |
+
"tpp_threshold_5_total_metric": 0.003000035881996155,
|
254 |
+
"tpp_threshold_5_intended_diff_only": 0.0020000338554382324,
|
255 |
+
"tpp_threshold_5_unintended_diff_only": -0.0010000020265579224,
|
256 |
+
"tpp_threshold_10_total_metric": 0.0037499964237213135,
|
257 |
+
"tpp_threshold_10_intended_diff_only": 0.004999995231628418,
|
258 |
+
"tpp_threshold_10_unintended_diff_only": 0.0012499988079071045,
|
259 |
+
"tpp_threshold_20_total_metric": 0.006749972701072693,
|
260 |
+
"tpp_threshold_20_intended_diff_only": 0.0059999823570251465,
|
261 |
+
"tpp_threshold_20_unintended_diff_only": -0.0007499903440475464,
|
262 |
+
"tpp_threshold_50_total_metric": 0.009249985218048096,
|
263 |
+
"tpp_threshold_50_intended_diff_only": 0.009999990463256836,
|
264 |
+
"tpp_threshold_50_unintended_diff_only": 0.0007500052452087402,
|
265 |
+
"tpp_threshold_100_total_metric": 0.02025000751018524,
|
266 |
+
"tpp_threshold_100_intended_diff_only": 0.022000014781951904,
|
267 |
+
"tpp_threshold_100_unintended_diff_only": 0.0017500072717666626,
|
268 |
+
"tpp_threshold_500_total_metric": 0.117249995470047,
|
269 |
+
"tpp_threshold_500_intended_diff_only": 0.12400001287460327,
|
270 |
+
"tpp_threshold_500_unintended_diff_only": 0.006750017404556274
|
271 |
+
},
|
272 |
+
"9": {
|
273 |
+
"tpp_threshold_2_total_metric": 0.0032499581575393677,
|
274 |
+
"tpp_threshold_2_intended_diff_only": 0.0059999823570251465,
|
275 |
+
"tpp_threshold_2_unintended_diff_only": 0.002750024199485779,
|
276 |
+
"tpp_threshold_5_total_metric": 0.0022499561309814453,
|
277 |
+
"tpp_threshold_5_intended_diff_only": 0.006999969482421875,
|
278 |
+
"tpp_threshold_5_unintended_diff_only": 0.00475001335144043,
|
279 |
+
"tpp_threshold_10_total_metric": 0.007500007748603821,
|
280 |
+
"tpp_threshold_10_intended_diff_only": 0.013000011444091797,
|
281 |
+
"tpp_threshold_10_unintended_diff_only": 0.005500003695487976,
|
282 |
+
"tpp_threshold_20_total_metric": 0.03599998354911804,
|
283 |
+
"tpp_threshold_20_intended_diff_only": 0.041999995708465576,
|
284 |
+
"tpp_threshold_20_unintended_diff_only": 0.006000012159347534,
|
285 |
+
"tpp_threshold_50_total_metric": 0.08124995231628418,
|
286 |
+
"tpp_threshold_50_intended_diff_only": 0.08599996566772461,
|
287 |
+
"tpp_threshold_50_unintended_diff_only": 0.00475001335144043,
|
288 |
+
"tpp_threshold_100_total_metric": 0.11274997889995575,
|
289 |
+
"tpp_threshold_100_intended_diff_only": 0.12099999189376831,
|
290 |
+
"tpp_threshold_100_unintended_diff_only": 0.008250012993812561,
|
291 |
+
"tpp_threshold_500_total_metric": 0.39274998009204865,
|
292 |
+
"tpp_threshold_500_intended_diff_only": 0.40299999713897705,
|
293 |
+
"tpp_threshold_500_unintended_diff_only": 0.010250017046928406
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"canrager/amazon_reviews_mcauley_1and5": {
|
297 |
+
"1": {
|
298 |
+
"tpp_threshold_2_total_metric": 0.003999978303909302,
|
299 |
+
"tpp_threshold_2_intended_diff_only": 0.007999956607818604,
|
300 |
+
"tpp_threshold_2_unintended_diff_only": 0.003999978303909302,
|
301 |
+
"tpp_threshold_5_total_metric": 0.00024996697902679443,
|
302 |
+
"tpp_threshold_5_intended_diff_only": 0.003999948501586914,
|
303 |
+
"tpp_threshold_5_unintended_diff_only": 0.0037499815225601196,
|
304 |
+
"tpp_threshold_10_total_metric": 0.0034999996423721313,
|
305 |
+
"tpp_threshold_10_intended_diff_only": 0.009999990463256836,
|
306 |
+
"tpp_threshold_10_unintended_diff_only": 0.006499990820884705,
|
307 |
+
"tpp_threshold_20_total_metric": -0.005250021815299988,
|
308 |
+
"tpp_threshold_20_intended_diff_only": 0.001999974250793457,
|
309 |
+
"tpp_threshold_20_unintended_diff_only": 0.007249996066093445,
|
310 |
+
"tpp_threshold_50_total_metric": 0.007249996066093445,
|
311 |
+
"tpp_threshold_50_intended_diff_only": 0.009999990463256836,
|
312 |
+
"tpp_threshold_50_unintended_diff_only": 0.002749994397163391,
|
313 |
+
"tpp_threshold_100_total_metric": 0.006999969482421875,
|
314 |
+
"tpp_threshold_100_intended_diff_only": 0.015999972820281982,
|
315 |
+
"tpp_threshold_100_unintended_diff_only": 0.009000003337860107,
|
316 |
+
"tpp_threshold_500_total_metric": 0.0702500194311142,
|
317 |
+
"tpp_threshold_500_intended_diff_only": 0.078000009059906,
|
318 |
+
"tpp_threshold_500_unintended_diff_only": 0.007749989628791809
|
319 |
+
},
|
320 |
+
"2": {
|
321 |
+
"tpp_threshold_2_total_metric": 0.001749977469444275,
|
322 |
+
"tpp_threshold_2_intended_diff_only": 0.001999974250793457,
|
323 |
+
"tpp_threshold_2_unintended_diff_only": 0.00024999678134918213,
|
324 |
+
"tpp_threshold_5_total_metric": -0.0015000104904174805,
|
325 |
+
"tpp_threshold_5_intended_diff_only": 0.0059999823570251465,
|
326 |
+
"tpp_threshold_5_unintended_diff_only": 0.007499992847442627,
|
327 |
+
"tpp_threshold_10_total_metric": 0.004999995231628418,
|
328 |
+
"tpp_threshold_10_intended_diff_only": 0.010999977588653564,
|
329 |
+
"tpp_threshold_10_unintended_diff_only": 0.0059999823570251465,
|
330 |
+
"tpp_threshold_20_total_metric": 0.008500024676322937,
|
331 |
+
"tpp_threshold_20_intended_diff_only": 0.013000011444091797,
|
332 |
+
"tpp_threshold_20_unintended_diff_only": 0.00449998676776886,
|
333 |
+
"tpp_threshold_50_total_metric": 0.025749996304512024,
|
334 |
+
"tpp_threshold_50_intended_diff_only": 0.03200000524520874,
|
335 |
+
"tpp_threshold_50_unintended_diff_only": 0.006250008940696716,
|
336 |
+
"tpp_threshold_100_total_metric": 0.03049999475479126,
|
337 |
+
"tpp_threshold_100_intended_diff_only": 0.046999990940093994,
|
338 |
+
"tpp_threshold_100_unintended_diff_only": 0.016499996185302734,
|
339 |
+
"tpp_threshold_500_total_metric": 0.12775003910064697,
|
340 |
+
"tpp_threshold_500_intended_diff_only": 0.1420000195503235,
|
341 |
+
"tpp_threshold_500_unintended_diff_only": 0.014249980449676514
|
342 |
+
},
|
343 |
+
"3": {
|
344 |
+
"tpp_threshold_2_total_metric": -0.00625002384185791,
|
345 |
+
"tpp_threshold_2_intended_diff_only": -0.003000020980834961,
|
346 |
+
"tpp_threshold_2_unintended_diff_only": 0.0032500028610229492,
|
347 |
+
"tpp_threshold_5_total_metric": 0.002249985933303833,
|
348 |
+
"tpp_threshold_5_intended_diff_only": 0.001999974250793457,
|
349 |
+
"tpp_threshold_5_unintended_diff_only": -0.000250011682510376,
|
350 |
+
"tpp_threshold_10_total_metric": 0.010500013828277588,
|
351 |
+
"tpp_threshold_10_intended_diff_only": 0.013000011444091797,
|
352 |
+
"tpp_threshold_10_unintended_diff_only": 0.002499997615814209,
|
353 |
+
"tpp_threshold_20_total_metric": 0.00024996697902679443,
|
354 |
+
"tpp_threshold_20_intended_diff_only": 0.006999969482421875,
|
355 |
+
"tpp_threshold_20_unintended_diff_only": 0.006750002503395081,
|
356 |
+
"tpp_threshold_50_total_metric": 0.02550002932548523,
|
357 |
+
"tpp_threshold_50_intended_diff_only": 0.027000010013580322,
|
358 |
+
"tpp_threshold_50_unintended_diff_only": 0.0014999806880950928,
|
359 |
+
"tpp_threshold_100_total_metric": 0.040249988436698914,
|
360 |
+
"tpp_threshold_100_intended_diff_only": 0.04899996519088745,
|
361 |
+
"tpp_threshold_100_unintended_diff_only": 0.008749976754188538,
|
362 |
+
"tpp_threshold_500_total_metric": 0.14424999058246613,
|
363 |
+
"tpp_threshold_500_intended_diff_only": 0.16399997472763062,
|
364 |
+
"tpp_threshold_500_unintended_diff_only": 0.01974998414516449
|
365 |
+
},
|
366 |
+
"5": {
|
367 |
+
"tpp_threshold_2_total_metric": 0.006250068545341492,
|
368 |
+
"tpp_threshold_2_intended_diff_only": 0.010000050067901611,
|
369 |
+
"tpp_threshold_2_unintended_diff_only": 0.0037499815225601196,
|
370 |
+
"tpp_threshold_5_total_metric": 0.007249996066093445,
|
371 |
+
"tpp_threshold_5_intended_diff_only": 0.013999998569488525,
|
372 |
+
"tpp_threshold_5_unintended_diff_only": 0.006750002503395081,
|
373 |
+
"tpp_threshold_10_total_metric": 0.00200006365776062,
|
374 |
+
"tpp_threshold_10_intended_diff_only": 0.010000050067901611,
|
375 |
+
"tpp_threshold_10_unintended_diff_only": 0.007999986410140991,
|
376 |
+
"tpp_threshold_20_total_metric": 0.014250069856643677,
|
377 |
+
"tpp_threshold_20_intended_diff_only": 0.024000048637390137,
|
378 |
+
"tpp_threshold_20_unintended_diff_only": 0.00974997878074646,
|
379 |
+
"tpp_threshold_50_total_metric": 0.07175002992153168,
|
380 |
+
"tpp_threshold_50_intended_diff_only": 0.08300000429153442,
|
381 |
+
"tpp_threshold_50_unintended_diff_only": 0.011249974370002747,
|
382 |
+
"tpp_threshold_100_total_metric": 0.1260000616312027,
|
383 |
+
"tpp_threshold_100_intended_diff_only": 0.14500004053115845,
|
384 |
+
"tpp_threshold_100_unintended_diff_only": 0.01899997889995575,
|
385 |
+
"tpp_threshold_500_total_metric": 0.247250035405159,
|
386 |
+
"tpp_threshold_500_intended_diff_only": 0.27000004053115845,
|
387 |
+
"tpp_threshold_500_unintended_diff_only": 0.02275000512599945
|
388 |
+
},
|
389 |
+
"6": {
|
390 |
+
"tpp_threshold_2_total_metric": 0.017249956727027893,
|
391 |
+
"tpp_threshold_2_intended_diff_only": 0.0209999680519104,
|
392 |
+
"tpp_threshold_2_unintended_diff_only": 0.0037500113248825073,
|
393 |
+
"tpp_threshold_5_total_metric": 0.012749969959259033,
|
394 |
+
"tpp_threshold_5_intended_diff_only": 0.011999964714050293,
|
395 |
+
"tpp_threshold_5_unintended_diff_only": -0.0007500052452087402,
|
396 |
+
"tpp_threshold_10_total_metric": 0.0350000262260437,
|
397 |
+
"tpp_threshold_10_intended_diff_only": 0.03700000047683716,
|
398 |
+
"tpp_threshold_10_unintended_diff_only": 0.001999974250793457,
|
399 |
+
"tpp_threshold_20_total_metric": 0.04474997520446777,
|
400 |
+
"tpp_threshold_20_intended_diff_only": 0.04799997806549072,
|
401 |
+
"tpp_threshold_20_unintended_diff_only": 0.0032500028610229492,
|
402 |
+
"tpp_threshold_50_total_metric": 0.06450001895427704,
|
403 |
+
"tpp_threshold_50_intended_diff_only": 0.07400000095367432,
|
404 |
+
"tpp_threshold_50_unintended_diff_only": 0.009499981999397278,
|
405 |
+
"tpp_threshold_100_total_metric": 0.10574997961521149,
|
406 |
+
"tpp_threshold_100_intended_diff_only": 0.11299997568130493,
|
407 |
+
"tpp_threshold_100_unintended_diff_only": 0.007249996066093445,
|
408 |
+
"tpp_threshold_500_total_metric": 0.28224997222423553,
|
409 |
+
"tpp_threshold_500_intended_diff_only": 0.29399996995925903,
|
410 |
+
"tpp_threshold_500_unintended_diff_only": 0.011749997735023499
|
411 |
+
}
|
412 |
+
}
|
413 |
+
}
|
414 |
+
}
|
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "tpp",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": false,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "c5404fea-7e48-43fd-8a52-df4372acae0b",
|
73 |
+
"datetime_epoch_millis": 1740202316416,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"tpp_metrics": {
|
76 |
+
"tpp_threshold_2_total_metric": 0.0044249996542930605,
|
77 |
+
"tpp_threshold_2_intended_diff_only": 0.007800000905990601,
|
78 |
+
"tpp_threshold_2_unintended_diff_only": 0.0033750012516975403,
|
79 |
+
"tpp_threshold_5_total_metric": 0.008849999308586121,
|
80 |
+
"tpp_threshold_5_intended_diff_only": 0.011799997091293334,
|
81 |
+
"tpp_threshold_5_unintended_diff_only": 0.0029499977827072144,
|
82 |
+
"tpp_threshold_10_total_metric": 0.015325011312961578,
|
83 |
+
"tpp_threshold_10_intended_diff_only": 0.018900007009506226,
|
84 |
+
"tpp_threshold_10_unintended_diff_only": 0.003574995696544647,
|
85 |
+
"tpp_threshold_20_total_metric": 0.028574997186660768,
|
86 |
+
"tpp_threshold_20_intended_diff_only": 0.033899998664855956,
|
87 |
+
"tpp_threshold_20_unintended_diff_only": 0.00532500147819519,
|
88 |
+
"tpp_threshold_50_total_metric": 0.06917501091957093,
|
89 |
+
"tpp_threshold_50_intended_diff_only": 0.07640000581741332,
|
90 |
+
"tpp_threshold_50_unintended_diff_only": 0.007224994897842407,
|
91 |
+
"tpp_threshold_100_total_metric": 0.1251750007271767,
|
92 |
+
"tpp_threshold_100_intended_diff_only": 0.13420000672340393,
|
93 |
+
"tpp_threshold_100_unintended_diff_only": 0.009025005996227263,
|
94 |
+
"tpp_threshold_500_total_metric": 0.313950015604496,
|
95 |
+
"tpp_threshold_500_intended_diff_only": 0.32780001163482664,
|
96 |
+
"tpp_threshold_500_unintended_diff_only": 0.013849996030330658
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
|
102 |
+
"tpp_threshold_2_total_metric": 0.0048499912023544315,
|
103 |
+
"tpp_threshold_2_intended_diff_only": 0.007400000095367431,
|
104 |
+
"tpp_threshold_2_unintended_diff_only": 0.0025500088930130007,
|
105 |
+
"tpp_threshold_5_total_metric": 0.0064999938011169435,
|
106 |
+
"tpp_threshold_5_intended_diff_only": 0.009399998188018798,
|
107 |
+
"tpp_threshold_5_unintended_diff_only": 0.0029000043869018555,
|
108 |
+
"tpp_threshold_10_total_metric": 0.016850015521049498,
|
109 |
+
"tpp_threshold_10_intended_diff_only": 0.019600021839141845,
|
110 |
+
"tpp_threshold_10_unintended_diff_only": 0.002750006318092346,
|
111 |
+
"tpp_threshold_20_total_metric": 0.03354999721050263,
|
112 |
+
"tpp_threshold_20_intended_diff_only": 0.037400007247924805,
|
113 |
+
"tpp_threshold_20_unintended_diff_only": 0.00385001003742218,
|
114 |
+
"tpp_threshold_50_total_metric": 0.0748000055551529,
|
115 |
+
"tpp_threshold_50_intended_diff_only": 0.07900000810623169,
|
116 |
+
"tpp_threshold_50_unintended_diff_only": 0.004200002551078797,
|
117 |
+
"tpp_threshold_100_total_metric": 0.12809998989105226,
|
118 |
+
"tpp_threshold_100_intended_diff_only": 0.13380000591278077,
|
119 |
+
"tpp_threshold_100_unintended_diff_only": 0.005700016021728515,
|
120 |
+
"tpp_threshold_500_total_metric": 0.3692500054836273,
|
121 |
+
"tpp_threshold_500_intended_diff_only": 0.376800012588501,
|
122 |
+
"tpp_threshold_500_unintended_diff_only": 0.007550007104873658
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
|
126 |
+
"tpp_threshold_2_total_metric": 0.0040000081062316895,
|
127 |
+
"tpp_threshold_2_intended_diff_only": 0.00820000171661377,
|
128 |
+
"tpp_threshold_2_unintended_diff_only": 0.00419999361038208,
|
129 |
+
"tpp_threshold_5_total_metric": 0.011200004816055298,
|
130 |
+
"tpp_threshold_5_intended_diff_only": 0.01419999599456787,
|
131 |
+
"tpp_threshold_5_unintended_diff_only": 0.0029999911785125732,
|
132 |
+
"tpp_threshold_10_total_metric": 0.013800007104873658,
|
133 |
+
"tpp_threshold_10_intended_diff_only": 0.018199992179870606,
|
134 |
+
"tpp_threshold_10_unintended_diff_only": 0.004399985074996948,
|
135 |
+
"tpp_threshold_20_total_metric": 0.02359999716281891,
|
136 |
+
"tpp_threshold_20_intended_diff_only": 0.03039999008178711,
|
137 |
+
"tpp_threshold_20_unintended_diff_only": 0.0067999929189682005,
|
138 |
+
"tpp_threshold_50_total_metric": 0.06355001628398896,
|
139 |
+
"tpp_threshold_50_intended_diff_only": 0.07380000352859498,
|
140 |
+
"tpp_threshold_50_unintended_diff_only": 0.010249987244606018,
|
141 |
+
"tpp_threshold_100_total_metric": 0.12225001156330109,
|
142 |
+
"tpp_threshold_100_intended_diff_only": 0.1346000075340271,
|
143 |
+
"tpp_threshold_100_unintended_diff_only": 0.012349995970726012,
|
144 |
+
"tpp_threshold_500_total_metric": 0.25865002572536466,
|
145 |
+
"tpp_threshold_500_intended_diff_only": 0.27880001068115234,
|
146 |
+
"tpp_threshold_500_unintended_diff_only": 0.02014998495578766
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
|
150 |
+
"sae_lens_id": "custom_sae",
|
151 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000",
|
152 |
+
"sae_lens_version": "5.5.0",
|
153 |
+
"sae_cfg_dict": {
|
154 |
+
"model_name": "gemma-2-2b",
|
155 |
+
"d_in": 2304,
|
156 |
+
"d_sae": 65536,
|
157 |
+
"hook_layer": 12,
|
158 |
+
"hook_name": "blocks.12.hook_resid_post",
|
159 |
+
"context_size": null,
|
160 |
+
"hook_head_index": null,
|
161 |
+
"architecture": "topk",
|
162 |
+
"apply_b_dec_to_input": null,
|
163 |
+
"finetuning_scaling_factor": null,
|
164 |
+
"activation_fn_str": "",
|
165 |
+
"prepend_bos": true,
|
166 |
+
"normalize_activations": "none",
|
167 |
+
"dtype": "bfloat16",
|
168 |
+
"device": "",
|
169 |
+
"dataset_path": "",
|
170 |
+
"dataset_trust_remote_code": true,
|
171 |
+
"seqpos_slice": [
|
172 |
+
null
|
173 |
+
],
|
174 |
+
"training_tokens": -100000,
|
175 |
+
"sae_lens_training_version": null,
|
176 |
+
"neuronpedia_id": null
|
177 |
+
},
|
178 |
+
"eval_result_unstructured": {
|
179 |
+
"LabHC/bias_in_bios_class_set1": {
|
180 |
+
"0": {
|
181 |
+
"tpp_threshold_2_total_metric": 0.0020000338554382324,
|
182 |
+
"tpp_threshold_2_intended_diff_only": 0.006000041961669922,
|
183 |
+
"tpp_threshold_2_unintended_diff_only": 0.0040000081062316895,
|
184 |
+
"tpp_threshold_5_total_metric": 0.005250021815299988,
|
185 |
+
"tpp_threshold_5_intended_diff_only": 0.00700002908706665,
|
186 |
+
"tpp_threshold_5_unintended_diff_only": 0.0017500072717666626,
|
187 |
+
"tpp_threshold_10_total_metric": 0.014250040054321289,
|
188 |
+
"tpp_threshold_10_intended_diff_only": 0.01500004529953003,
|
189 |
+
"tpp_threshold_10_unintended_diff_only": 0.0007500052452087402,
|
190 |
+
"tpp_threshold_20_total_metric": 0.01675000786781311,
|
191 |
+
"tpp_threshold_20_intended_diff_only": 0.021000027656555176,
|
192 |
+
"tpp_threshold_20_unintended_diff_only": 0.004250019788742065,
|
193 |
+
"tpp_threshold_50_total_metric": 0.0417499840259552,
|
194 |
+
"tpp_threshold_50_intended_diff_only": 0.046999990940093994,
|
195 |
+
"tpp_threshold_50_unintended_diff_only": 0.005250006914138794,
|
196 |
+
"tpp_threshold_100_total_metric": 0.11149998009204865,
|
197 |
+
"tpp_threshold_100_intended_diff_only": 0.11599999666213989,
|
198 |
+
"tpp_threshold_100_unintended_diff_only": 0.0045000165700912476,
|
199 |
+
"tpp_threshold_500_total_metric": 0.37950001657009125,
|
200 |
+
"tpp_threshold_500_intended_diff_only": 0.3830000162124634,
|
201 |
+
"tpp_threshold_500_unintended_diff_only": 0.0034999996423721313
|
202 |
+
},
|
203 |
+
"1": {
|
204 |
+
"tpp_threshold_2_total_metric": 0.005749985575675964,
|
205 |
+
"tpp_threshold_2_intended_diff_only": 0.009000003337860107,
|
206 |
+
"tpp_threshold_2_unintended_diff_only": 0.003250017762184143,
|
207 |
+
"tpp_threshold_5_total_metric": -0.0010000020265579224,
|
208 |
+
"tpp_threshold_5_intended_diff_only": 0.0040000081062316895,
|
209 |
+
"tpp_threshold_5_unintended_diff_only": 0.005000010132789612,
|
210 |
+
"tpp_threshold_10_total_metric": 0.008250042796134949,
|
211 |
+
"tpp_threshold_10_intended_diff_only": 0.01100003719329834,
|
212 |
+
"tpp_threshold_10_unintended_diff_only": 0.002749994397163391,
|
213 |
+
"tpp_threshold_20_total_metric": 0.017250031232833862,
|
214 |
+
"tpp_threshold_20_intended_diff_only": 0.021000027656555176,
|
215 |
+
"tpp_threshold_20_unintended_diff_only": 0.0037499964237213135,
|
216 |
+
"tpp_threshold_50_total_metric": 0.03675006330013275,
|
217 |
+
"tpp_threshold_50_intended_diff_only": 0.04200005531311035,
|
218 |
+
"tpp_threshold_50_unintended_diff_only": 0.0052499920129776,
|
219 |
+
"tpp_threshold_100_total_metric": 0.08974997699260712,
|
220 |
+
"tpp_threshold_100_intended_diff_only": 0.09700000286102295,
|
221 |
+
"tpp_threshold_100_unintended_diff_only": 0.0072500258684158325,
|
222 |
+
"tpp_threshold_500_total_metric": 0.3217500150203705,
|
223 |
+
"tpp_threshold_500_intended_diff_only": 0.3360000252723694,
|
224 |
+
"tpp_threshold_500_unintended_diff_only": 0.014250010251998901
|
225 |
+
},
|
226 |
+
"2": {
|
227 |
+
"tpp_threshold_2_total_metric": 0.006749987602233887,
|
228 |
+
"tpp_threshold_2_intended_diff_only": 0.009999990463256836,
|
229 |
+
"tpp_threshold_2_unintended_diff_only": 0.0032500028610229492,
|
230 |
+
"tpp_threshold_5_total_metric": 0.021249979734420776,
|
231 |
+
"tpp_threshold_5_intended_diff_only": 0.02399998903274536,
|
232 |
+
"tpp_threshold_5_unintended_diff_only": 0.002750009298324585,
|
233 |
+
"tpp_threshold_10_total_metric": 0.04600001871585846,
|
234 |
+
"tpp_threshold_10_intended_diff_only": 0.04900002479553223,
|
235 |
+
"tpp_threshold_10_unintended_diff_only": 0.003000006079673767,
|
236 |
+
"tpp_threshold_20_total_metric": 0.07674998044967651,
|
237 |
+
"tpp_threshold_20_intended_diff_only": 0.07999998331069946,
|
238 |
+
"tpp_threshold_20_unintended_diff_only": 0.0032500028610229492,
|
239 |
+
"tpp_threshold_50_total_metric": 0.14174997806549072,
|
240 |
+
"tpp_threshold_50_intended_diff_only": 0.14499998092651367,
|
241 |
+
"tpp_threshold_50_unintended_diff_only": 0.0032500028610229492,
|
242 |
+
"tpp_threshold_100_total_metric": 0.1912500113248825,
|
243 |
+
"tpp_threshold_100_intended_diff_only": 0.1980000138282776,
|
244 |
+
"tpp_threshold_100_unintended_diff_only": 0.006750002503395081,
|
245 |
+
"tpp_threshold_500_total_metric": 0.38600003719329834,
|
246 |
+
"tpp_threshold_500_intended_diff_only": 0.39500004053115845,
|
247 |
+
"tpp_threshold_500_unintended_diff_only": 0.009000003337860107
|
248 |
+
},
|
249 |
+
"6": {
|
250 |
+
"tpp_threshold_2_total_metric": 0.0017499923706054688,
|
251 |
+
"tpp_threshold_2_intended_diff_only": 0.0009999871253967285,
|
252 |
+
"tpp_threshold_2_unintended_diff_only": -0.0007500052452087402,
|
253 |
+
"tpp_threshold_5_total_metric": 0.0010000169277191162,
|
254 |
+
"tpp_threshold_5_intended_diff_only": 0.0040000081062316895,
|
255 |
+
"tpp_threshold_5_unintended_diff_only": 0.0029999911785125732,
|
256 |
+
"tpp_threshold_10_total_metric": 0.005250021815299988,
|
257 |
+
"tpp_threshold_10_intended_diff_only": 0.00700002908706665,
|
258 |
+
"tpp_threshold_10_unintended_diff_only": 0.0017500072717666626,
|
259 |
+
"tpp_threshold_20_total_metric": 0.0022499561309814453,
|
260 |
+
"tpp_threshold_20_intended_diff_only": 0.0059999823570251465,
|
261 |
+
"tpp_threshold_20_unintended_diff_only": 0.003750026226043701,
|
262 |
+
"tpp_threshold_50_total_metric": 0.010250017046928406,
|
263 |
+
"tpp_threshold_50_intended_diff_only": 0.012000024318695068,
|
264 |
+
"tpp_threshold_50_unintended_diff_only": 0.0017500072717666626,
|
265 |
+
"tpp_threshold_100_total_metric": 0.017249971628189087,
|
266 |
+
"tpp_threshold_100_intended_diff_only": 0.018999993801116943,
|
267 |
+
"tpp_threshold_100_unintended_diff_only": 0.0017500221729278564,
|
268 |
+
"tpp_threshold_500_total_metric": 0.30149997770786285,
|
269 |
+
"tpp_threshold_500_intended_diff_only": 0.3059999942779541,
|
270 |
+
"tpp_threshold_500_unintended_diff_only": 0.0045000165700912476
|
271 |
+
},
|
272 |
+
"9": {
|
273 |
+
"tpp_threshold_2_total_metric": 0.007999956607818604,
|
274 |
+
"tpp_threshold_2_intended_diff_only": 0.010999977588653564,
|
275 |
+
"tpp_threshold_2_unintended_diff_only": 0.003000020980834961,
|
276 |
+
"tpp_threshold_5_total_metric": 0.005999952554702759,
|
277 |
+
"tpp_threshold_5_intended_diff_only": 0.007999956607818604,
|
278 |
+
"tpp_threshold_5_unintended_diff_only": 0.0020000040531158447,
|
279 |
+
"tpp_threshold_10_total_metric": 0.010499954223632812,
|
280 |
+
"tpp_threshold_10_intended_diff_only": 0.015999972820281982,
|
281 |
+
"tpp_threshold_10_unintended_diff_only": 0.00550001859664917,
|
282 |
+
"tpp_threshold_20_total_metric": 0.05475001037120819,
|
283 |
+
"tpp_threshold_20_intended_diff_only": 0.05900001525878906,
|
284 |
+
"tpp_threshold_20_unintended_diff_only": 0.004250004887580872,
|
285 |
+
"tpp_threshold_50_total_metric": 0.14349998533725739,
|
286 |
+
"tpp_threshold_50_intended_diff_only": 0.14899998903274536,
|
287 |
+
"tpp_threshold_50_unintended_diff_only": 0.005500003695487976,
|
288 |
+
"tpp_threshold_100_total_metric": 0.23075000941753387,
|
289 |
+
"tpp_threshold_100_intended_diff_only": 0.23900002241134644,
|
290 |
+
"tpp_threshold_100_unintended_diff_only": 0.008250012993812561,
|
291 |
+
"tpp_threshold_500_total_metric": 0.45749998092651367,
|
292 |
+
"tpp_threshold_500_intended_diff_only": 0.46399998664855957,
|
293 |
+
"tpp_threshold_500_unintended_diff_only": 0.0065000057220458984
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"canrager/amazon_reviews_mcauley_1and5": {
|
297 |
+
"1": {
|
298 |
+
"tpp_threshold_2_total_metric": 0.0037500113248825073,
|
299 |
+
"tpp_threshold_2_intended_diff_only": 0.009000003337860107,
|
300 |
+
"tpp_threshold_2_unintended_diff_only": 0.0052499920129776,
|
301 |
+
"tpp_threshold_5_total_metric": 0.005499988794326782,
|
302 |
+
"tpp_threshold_5_intended_diff_only": 0.009999990463256836,
|
303 |
+
"tpp_threshold_5_unintended_diff_only": 0.004500001668930054,
|
304 |
+
"tpp_threshold_10_total_metric": 0.003250017762184143,
|
305 |
+
"tpp_threshold_10_intended_diff_only": 0.009000003337860107,
|
306 |
+
"tpp_threshold_10_unintended_diff_only": 0.005749985575675964,
|
307 |
+
"tpp_threshold_20_total_metric": -0.0022500157356262207,
|
308 |
+
"tpp_threshold_20_intended_diff_only": 0.0059999823570251465,
|
309 |
+
"tpp_threshold_20_unintended_diff_only": 0.008249998092651367,
|
310 |
+
"tpp_threshold_50_total_metric": 0.006999999284744263,
|
311 |
+
"tpp_threshold_50_intended_diff_only": 0.014999985694885254,
|
312 |
+
"tpp_threshold_50_unintended_diff_only": 0.007999986410140991,
|
313 |
+
"tpp_threshold_100_total_metric": 0.019749969244003296,
|
314 |
+
"tpp_threshold_100_intended_diff_only": 0.02899998426437378,
|
315 |
+
"tpp_threshold_100_unintended_diff_only": 0.009250015020370483,
|
316 |
+
"tpp_threshold_500_total_metric": 0.13574999570846558,
|
317 |
+
"tpp_threshold_500_intended_diff_only": 0.14899998903274536,
|
318 |
+
"tpp_threshold_500_unintended_diff_only": 0.013249993324279785
|
319 |
+
},
|
320 |
+
"2": {
|
321 |
+
"tpp_threshold_2_total_metric": -0.003500014543533325,
|
322 |
+
"tpp_threshold_2_intended_diff_only": 0.001999974250793457,
|
323 |
+
"tpp_threshold_2_unintended_diff_only": 0.005499988794326782,
|
324 |
+
"tpp_threshold_5_total_metric": 0.009999975562095642,
|
325 |
+
"tpp_threshold_5_intended_diff_only": 0.010999977588653564,
|
326 |
+
"tpp_threshold_5_unintended_diff_only": 0.0010000020265579224,
|
327 |
+
"tpp_threshold_10_total_metric": 0.008749976754188538,
|
328 |
+
"tpp_threshold_10_intended_diff_only": 0.011999964714050293,
|
329 |
+
"tpp_threshold_10_unintended_diff_only": 0.0032499879598617554,
|
330 |
+
"tpp_threshold_20_total_metric": 0.019249960780143738,
|
331 |
+
"tpp_threshold_20_intended_diff_only": 0.02599996328353882,
|
332 |
+
"tpp_threshold_20_unintended_diff_only": 0.006750002503395081,
|
333 |
+
"tpp_threshold_50_total_metric": 0.039250001311302185,
|
334 |
+
"tpp_threshold_50_intended_diff_only": 0.046999990940093994,
|
335 |
+
"tpp_threshold_50_unintended_diff_only": 0.007749989628791809,
|
336 |
+
"tpp_threshold_100_total_metric": 0.085999995470047,
|
337 |
+
"tpp_threshold_100_intended_diff_only": 0.09700000286102295,
|
338 |
+
"tpp_threshold_100_unintended_diff_only": 0.011000007390975952,
|
339 |
+
"tpp_threshold_500_total_metric": 0.2435000240802765,
|
340 |
+
"tpp_threshold_500_intended_diff_only": 0.2580000162124634,
|
341 |
+
"tpp_threshold_500_unintended_diff_only": 0.01449999213218689
|
342 |
+
},
|
343 |
+
"3": {
|
344 |
+
"tpp_threshold_2_total_metric": 0.004999995231628418,
|
345 |
+
"tpp_threshold_2_intended_diff_only": 0.0040000081062316895,
|
346 |
+
"tpp_threshold_2_unintended_diff_only": -0.0009999871253967285,
|
347 |
+
"tpp_threshold_5_total_metric": -0.0012499988079071045,
|
348 |
+
"tpp_threshold_5_intended_diff_only": 0.0009999871253967285,
|
349 |
+
"tpp_threshold_5_unintended_diff_only": 0.002249985933303833,
|
350 |
+
"tpp_threshold_10_total_metric": -0.003000006079673767,
|
351 |
+
"tpp_threshold_10_intended_diff_only": 0.001999974250793457,
|
352 |
+
"tpp_threshold_10_unintended_diff_only": 0.004999980330467224,
|
353 |
+
"tpp_threshold_20_total_metric": 0.010749995708465576,
|
354 |
+
"tpp_threshold_20_intended_diff_only": 0.015999972820281982,
|
355 |
+
"tpp_threshold_20_unintended_diff_only": 0.005249977111816406,
|
356 |
+
"tpp_threshold_50_total_metric": 0.02850000560283661,
|
357 |
+
"tpp_threshold_50_intended_diff_only": 0.03700000047683716,
|
358 |
+
"tpp_threshold_50_unintended_diff_only": 0.00849999487400055,
|
359 |
+
"tpp_threshold_100_total_metric": 0.056500017642974854,
|
360 |
+
"tpp_threshold_100_intended_diff_only": 0.06499999761581421,
|
361 |
+
"tpp_threshold_100_unintended_diff_only": 0.008499979972839355,
|
362 |
+
"tpp_threshold_500_total_metric": 0.19200001657009125,
|
363 |
+
"tpp_threshold_500_intended_diff_only": 0.22299998998641968,
|
364 |
+
"tpp_threshold_500_unintended_diff_only": 0.03099997341632843
|
365 |
+
},
|
366 |
+
"5": {
|
367 |
+
"tpp_threshold_2_total_metric": 0.0005000680685043335,
|
368 |
+
"tpp_threshold_2_intended_diff_only": 0.010000050067901611,
|
369 |
+
"tpp_threshold_2_unintended_diff_only": 0.009499981999397278,
|
370 |
+
"tpp_threshold_5_total_metric": 0.008750051259994507,
|
371 |
+
"tpp_threshold_5_intended_diff_only": 0.016000032424926758,
|
372 |
+
"tpp_threshold_5_unintended_diff_only": 0.007249981164932251,
|
373 |
+
"tpp_threshold_10_total_metric": 0.0297500342130661,
|
374 |
+
"tpp_threshold_10_intended_diff_only": 0.03600001335144043,
|
375 |
+
"tpp_threshold_10_unintended_diff_only": 0.006249979138374329,
|
376 |
+
"tpp_threshold_20_total_metric": 0.04075004160404205,
|
377 |
+
"tpp_threshold_20_intended_diff_only": 0.053000032901763916,
|
378 |
+
"tpp_threshold_20_unintended_diff_only": 0.012249991297721863,
|
379 |
+
"tpp_threshold_50_total_metric": 0.14325006306171417,
|
380 |
+
"tpp_threshold_50_intended_diff_only": 0.1640000343322754,
|
381 |
+
"tpp_threshold_50_unintended_diff_only": 0.020749971270561218,
|
382 |
+
"tpp_threshold_100_total_metric": 0.24500006437301636,
|
383 |
+
"tpp_threshold_100_intended_diff_only": 0.2690000534057617,
|
384 |
+
"tpp_threshold_100_unintended_diff_only": 0.02399998903274536,
|
385 |
+
"tpp_threshold_500_total_metric": 0.37525005638599396,
|
386 |
+
"tpp_threshold_500_intended_diff_only": 0.40000003576278687,
|
387 |
+
"tpp_threshold_500_unintended_diff_only": 0.024749979376792908
|
388 |
+
},
|
389 |
+
"6": {
|
390 |
+
"tpp_threshold_2_total_metric": 0.014249980449676514,
|
391 |
+
"tpp_threshold_2_intended_diff_only": 0.015999972820281982,
|
392 |
+
"tpp_threshold_2_unintended_diff_only": 0.0017499923706054688,
|
393 |
+
"tpp_threshold_5_total_metric": 0.03300000727176666,
|
394 |
+
"tpp_threshold_5_intended_diff_only": 0.03299999237060547,
|
395 |
+
"tpp_threshold_5_unintended_diff_only": -1.4901161193847656e-08,
|
396 |
+
"tpp_threshold_10_total_metric": 0.03025001287460327,
|
397 |
+
"tpp_threshold_10_intended_diff_only": 0.03200000524520874,
|
398 |
+
"tpp_threshold_10_unintended_diff_only": 0.0017499923706054688,
|
399 |
+
"tpp_threshold_20_total_metric": 0.0495000034570694,
|
400 |
+
"tpp_threshold_20_intended_diff_only": 0.050999999046325684,
|
401 |
+
"tpp_threshold_20_unintended_diff_only": 0.0014999955892562866,
|
402 |
+
"tpp_threshold_50_total_metric": 0.09975001215934753,
|
403 |
+
"tpp_threshold_50_intended_diff_only": 0.10600000619888306,
|
404 |
+
"tpp_threshold_50_unintended_diff_only": 0.0062499940395355225,
|
405 |
+
"tpp_threshold_100_total_metric": 0.20400001108646393,
|
406 |
+
"tpp_threshold_100_intended_diff_only": 0.21299999952316284,
|
407 |
+
"tpp_threshold_100_unintended_diff_only": 0.008999988436698914,
|
408 |
+
"tpp_threshold_500_total_metric": 0.34675003588199615,
|
409 |
+
"tpp_threshold_500_intended_diff_only": 0.36400002241134644,
|
410 |
+
"tpp_threshold_500_unintended_diff_only": 0.01724998652935028
|
411 |
+
}
|
412 |
+
}
|
413 |
+
}
|
414 |
+
}
|
eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "unlearning",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"wmdp-bio",
|
7 |
+
"high_school_us_history",
|
8 |
+
"college_computer_science",
|
9 |
+
"high_school_geography",
|
10 |
+
"human_aging"
|
11 |
+
],
|
12 |
+
"intervention_method": "clamp_feature_activation",
|
13 |
+
"retain_thresholds": [
|
14 |
+
0.001,
|
15 |
+
0.01
|
16 |
+
],
|
17 |
+
"n_features_list": [
|
18 |
+
10,
|
19 |
+
20
|
20 |
+
],
|
21 |
+
"multipliers": [
|
22 |
+
25,
|
23 |
+
50,
|
24 |
+
100,
|
25 |
+
200
|
26 |
+
],
|
27 |
+
"dataset_size": 1024,
|
28 |
+
"seq_len": 1024,
|
29 |
+
"n_batch_loss_added": 50,
|
30 |
+
"target_metric": "correct",
|
31 |
+
"save_metrics": true,
|
32 |
+
"model_name": "gemma-2-2b-it",
|
33 |
+
"llm_batch_size": 4,
|
34 |
+
"llm_dtype": "bfloat16"
|
35 |
+
},
|
36 |
+
"eval_id": "20465b2c-8176-41ec-9e49-9f00775b929b",
|
37 |
+
"datetime_epoch_millis": 1740206428701,
|
38 |
+
"eval_result_metrics": {
|
39 |
+
"unlearning": {
|
40 |
+
"unlearning_score": 0.058161377906799316
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"eval_result_details": [],
|
44 |
+
"sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
|
45 |
+
"sae_lens_id": "custom_sae",
|
46 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_checkpoints_475000",
|
47 |
+
"sae_lens_version": "5.5.0",
|
48 |
+
"sae_cfg_dict": {
|
49 |
+
"model_name": "gemma-2-2b",
|
50 |
+
"d_in": 2304,
|
51 |
+
"d_sae": 65536,
|
52 |
+
"hook_layer": 12,
|
53 |
+
"hook_name": "blocks.12.hook_resid_post",
|
54 |
+
"context_size": null,
|
55 |
+
"hook_head_index": null,
|
56 |
+
"architecture": "topk",
|
57 |
+
"apply_b_dec_to_input": null,
|
58 |
+
"finetuning_scaling_factor": null,
|
59 |
+
"activation_fn_str": "",
|
60 |
+
"prepend_bos": true,
|
61 |
+
"normalize_activations": "none",
|
62 |
+
"dtype": "bfloat16",
|
63 |
+
"device": "",
|
64 |
+
"dataset_path": "",
|
65 |
+
"dataset_trust_remote_code": true,
|
66 |
+
"seqpos_slice": [
|
67 |
+
null
|
68 |
+
],
|
69 |
+
"training_tokens": -100000,
|
70 |
+
"sae_lens_training_version": null,
|
71 |
+
"neuronpedia_id": null
|
72 |
+
},
|
73 |
+
"eval_result_unstructured": null
|
74 |
+
}
|
eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "unlearning",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"wmdp-bio",
|
7 |
+
"high_school_us_history",
|
8 |
+
"college_computer_science",
|
9 |
+
"high_school_geography",
|
10 |
+
"human_aging"
|
11 |
+
],
|
12 |
+
"intervention_method": "clamp_feature_activation",
|
13 |
+
"retain_thresholds": [
|
14 |
+
0.001,
|
15 |
+
0.01
|
16 |
+
],
|
17 |
+
"n_features_list": [
|
18 |
+
10,
|
19 |
+
20
|
20 |
+
],
|
21 |
+
"multipliers": [
|
22 |
+
25,
|
23 |
+
50,
|
24 |
+
100,
|
25 |
+
200
|
26 |
+
],
|
27 |
+
"dataset_size": 1024,
|
28 |
+
"seq_len": 1024,
|
29 |
+
"n_batch_loss_added": 50,
|
30 |
+
"target_metric": "correct",
|
31 |
+
"save_metrics": true,
|
32 |
+
"model_name": "gemma-2-2b-it",
|
33 |
+
"llm_batch_size": 4,
|
34 |
+
"llm_dtype": "bfloat16"
|
35 |
+
},
|
36 |
+
"eval_id": "11f1352e-ddcf-441e-97b5-61b2393a7154",
|
37 |
+
"datetime_epoch_millis": 1740207423992,
|
38 |
+
"eval_result_metrics": {
|
39 |
+
"unlearning": {
|
40 |
+
"unlearning_score": 0.028142571449279785
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"eval_result_details": [],
|
44 |
+
"sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
|
45 |
+
"sae_lens_id": "custom_sae",
|
46 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_checkpoints_475000",
|
47 |
+
"sae_lens_version": "5.5.0",
|
48 |
+
"sae_cfg_dict": {
|
49 |
+
"model_name": "gemma-2-2b",
|
50 |
+
"d_in": 2304,
|
51 |
+
"d_sae": 65536,
|
52 |
+
"hook_layer": 12,
|
53 |
+
"hook_name": "blocks.12.hook_resid_post",
|
54 |
+
"context_size": null,
|
55 |
+
"hook_head_index": null,
|
56 |
+
"architecture": "topk",
|
57 |
+
"apply_b_dec_to_input": null,
|
58 |
+
"finetuning_scaling_factor": null,
|
59 |
+
"activation_fn_str": "",
|
60 |
+
"prepend_bos": true,
|
61 |
+
"normalize_activations": "none",
|
62 |
+
"dtype": "bfloat16",
|
63 |
+
"device": "",
|
64 |
+
"dataset_path": "",
|
65 |
+
"dataset_trust_remote_code": true,
|
66 |
+
"seqpos_slice": [
|
67 |
+
null
|
68 |
+
],
|
69 |
+
"training_tokens": -100000,
|
70 |
+
"sae_lens_training_version": null,
|
71 |
+
"neuronpedia_id": null
|
72 |
+
},
|
73 |
+
"eval_result_unstructured": null
|
74 |
+
}
|
eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "unlearning",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"wmdp-bio",
|
7 |
+
"high_school_us_history",
|
8 |
+
"college_computer_science",
|
9 |
+
"high_school_geography",
|
10 |
+
"human_aging"
|
11 |
+
],
|
12 |
+
"intervention_method": "clamp_feature_activation",
|
13 |
+
"retain_thresholds": [
|
14 |
+
0.001,
|
15 |
+
0.01
|
16 |
+
],
|
17 |
+
"n_features_list": [
|
18 |
+
10,
|
19 |
+
20
|
20 |
+
],
|
21 |
+
"multipliers": [
|
22 |
+
25,
|
23 |
+
50,
|
24 |
+
100,
|
25 |
+
200
|
26 |
+
],
|
27 |
+
"dataset_size": 1024,
|
28 |
+
"seq_len": 1024,
|
29 |
+
"n_batch_loss_added": 50,
|
30 |
+
"target_metric": "correct",
|
31 |
+
"save_metrics": true,
|
32 |
+
"model_name": "gemma-2-2b-it",
|
33 |
+
"llm_batch_size": 4,
|
34 |
+
"llm_dtype": "bfloat16"
|
35 |
+
},
|
36 |
+
"eval_id": "725f3662-bdf4-4419-8888-9f841bac76ad",
|
37 |
+
"datetime_epoch_millis": 1740206898872,
|
38 |
+
"eval_result_metrics": {
|
39 |
+
"unlearning": {
|
40 |
+
"unlearning_score": 0.09380865097045898
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"eval_result_details": [],
|
44 |
+
"sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
|
45 |
+
"sae_lens_id": "custom_sae",
|
46 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_checkpoints_475000",
|
47 |
+
"sae_lens_version": "5.5.0",
|
48 |
+
"sae_cfg_dict": {
|
49 |
+
"model_name": "gemma-2-2b",
|
50 |
+
"d_in": 2304,
|
51 |
+
"d_sae": 65536,
|
52 |
+
"hook_layer": 12,
|
53 |
+
"hook_name": "blocks.12.hook_resid_post",
|
54 |
+
"context_size": null,
|
55 |
+
"hook_head_index": null,
|
56 |
+
"architecture": "topk",
|
57 |
+
"apply_b_dec_to_input": null,
|
58 |
+
"finetuning_scaling_factor": null,
|
59 |
+
"activation_fn_str": "",
|
60 |
+
"prepend_bos": true,
|
61 |
+
"normalize_activations": "none",
|
62 |
+
"dtype": "bfloat16",
|
63 |
+
"device": "",
|
64 |
+
"dataset_path": "",
|
65 |
+
"dataset_trust_remote_code": true,
|
66 |
+
"seqpos_slice": [
|
67 |
+
null
|
68 |
+
],
|
69 |
+
"training_tokens": -100000,
|
70 |
+
"sae_lens_training_version": null,
|
71 |
+
"neuronpedia_id": null
|
72 |
+
},
|
73 |
+
"eval_result_unstructured": null
|
74 |
+
}
|
eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "unlearning",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"wmdp-bio",
|
7 |
+
"high_school_us_history",
|
8 |
+
"college_computer_science",
|
9 |
+
"high_school_geography",
|
10 |
+
"human_aging"
|
11 |
+
],
|
12 |
+
"intervention_method": "clamp_feature_activation",
|
13 |
+
"retain_thresholds": [
|
14 |
+
0.001,
|
15 |
+
0.01
|
16 |
+
],
|
17 |
+
"n_features_list": [
|
18 |
+
10,
|
19 |
+
20
|
20 |
+
],
|
21 |
+
"multipliers": [
|
22 |
+
25,
|
23 |
+
50,
|
24 |
+
100,
|
25 |
+
200
|
26 |
+
],
|
27 |
+
"dataset_size": 1024,
|
28 |
+
"seq_len": 1024,
|
29 |
+
"n_batch_loss_added": 50,
|
30 |
+
"target_metric": "correct",
|
31 |
+
"save_metrics": true,
|
32 |
+
"model_name": "gemma-2-2b-it",
|
33 |
+
"llm_batch_size": 4,
|
34 |
+
"llm_dtype": "bfloat16"
|
35 |
+
},
|
36 |
+
"eval_id": "7c04fe70-57d3-4455-875d-f1b5132102b7",
|
37 |
+
"datetime_epoch_millis": 1740205938198,
|
38 |
+
"eval_result_metrics": {
|
39 |
+
"unlearning": {
|
40 |
+
"unlearning_score": 0.03752344846725464
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"eval_result_details": [],
|
44 |
+
"sae_bench_commit_hash": "c0f54314bc8a8eba515c056e4d1175902f0f7f95",
|
45 |
+
"sae_lens_id": "custom_sae",
|
46 |
+
"sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_checkpoints_475000",
|
47 |
+
"sae_lens_version": "5.5.0",
|
48 |
+
"sae_cfg_dict": {
|
49 |
+
"model_name": "gemma-2-2b",
|
50 |
+
"d_in": 2304,
|
51 |
+
"d_sae": 65536,
|
52 |
+
"hook_layer": 12,
|
53 |
+
"hook_name": "blocks.12.hook_resid_post",
|
54 |
+
"context_size": null,
|
55 |
+
"hook_head_index": null,
|
56 |
+
"architecture": "topk",
|
57 |
+
"apply_b_dec_to_input": null,
|
58 |
+
"finetuning_scaling_factor": null,
|
59 |
+
"activation_fn_str": "",
|
60 |
+
"prepend_bos": true,
|
61 |
+
"normalize_activations": "none",
|
62 |
+
"dtype": "bfloat16",
|
63 |
+
"device": "",
|
64 |
+
"dataset_path": "",
|
65 |
+
"dataset_trust_remote_code": true,
|
66 |
+
"seqpos_slice": [
|
67 |
+
null
|
68 |
+
],
|
69 |
+
"training_tokens": -100000,
|
70 |
+
"sae_lens_training_version": null,
|
71 |
+
"neuronpedia_id": null
|
72 |
+
},
|
73 |
+
"eval_result_unstructured": null
|
74 |
+
}
|