adamkarvonen commited on
Commit
6355eb3
·
verified ·
1 Parent(s): 14ef0de

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +16 -0
  2. eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +268 -0
  3. eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +268 -0
  4. eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +268 -0
  5. eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +268 -0
  6. eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +268 -0
  7. eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +268 -0
  8. eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +268 -0
  9. eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +268 -0
  10. eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +3 -0
  11. eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +3 -0
  12. eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +3 -0
  13. eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +3 -0
  14. eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +3 -0
  15. eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +3 -0
  16. eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +3 -0
  17. eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +3 -0
  18. eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +3 -0
  19. eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +3 -0
  20. eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +3 -0
  21. eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +3 -0
  22. eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +3 -0
  23. eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +3 -0
  24. eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +3 -0
  25. eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +3 -0
  26. eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +323 -0
  27. eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +323 -0
  28. eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +323 -0
  29. eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +323 -0
  30. eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +323 -0
  31. eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +323 -0
  32. eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +323 -0
  33. eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +323 -0
  34. eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +670 -0
  35. eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +670 -0
  36. eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +670 -0
  37. eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +670 -0
  38. eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +670 -0
  39. eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +670 -0
  40. eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +670 -0
  41. eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +670 -0
  42. eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +414 -0
  43. eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +414 -0
  44. eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +414 -0
  45. eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +414 -0
  46. eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +414 -0
  47. eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +414 -0
  48. eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +414 -0
  49. eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json +414 -0
  50. eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json +74 -0
.gitattributes CHANGED
@@ -69,3 +69,19 @@ eval_results_finetunes/core/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_da
69
  eval_results_finetunes/core/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
70
  eval_results_finetunes/core/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
71
  eval_results_finetunes/core/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  eval_results_finetunes/core/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
70
  eval_results_finetunes/core/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
71
  eval_results_finetunes/core/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
72
+ eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
73
+ eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
74
+ eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
75
+ eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
76
+ eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
77
+ eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
78
+ eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
79
+ eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
80
+ eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
81
+ eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
82
+ eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
83
+ eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
84
+ eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
85
+ eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
86
+ eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
87
+ eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "absorption_first_letter",
3
+ "eval_config": {
4
+ "model_name": "gemma-2-2b",
5
+ "random_seed": 42,
6
+ "f1_jump_threshold": 0.03,
7
+ "max_k_value": 10,
8
+ "prompt_template": "{word} has the first letter:",
9
+ "prompt_token_pos": -6,
10
+ "llm_batch_size": 32,
11
+ "llm_dtype": "bfloat16",
12
+ "k_sparse_probe_l1_decay": 0.01,
13
+ "k_sparse_probe_batch_size": 4096,
14
+ "k_sparse_probe_num_epochs": 50
15
+ },
16
+ "eval_id": "ef0c46fe-510f-4912-be90-57aeae2de794",
17
+ "datetime_epoch_millis": 1740152315652,
18
+ "eval_result_metrics": {
19
+ "mean": {
20
+ "mean_absorption_fraction_score": 0.5196529448711236,
21
+ "mean_full_absorption_score": 0.3042496024774553,
22
+ "mean_num_split_features": 2.8461538461538463,
23
+ "std_dev_absorption_fraction_score": 0.2663894021350886,
24
+ "std_dev_full_absorption_score": 0.17641031485178768,
25
+ "std_dev_num_split_features": 1.6417626550097355
26
+ }
27
+ },
28
+ "eval_result_details": [
29
+ {
30
+ "first_letter": "a",
31
+ "mean_absorption_fraction": 0.6889635895777194,
32
+ "full_absorption_rate": 0.3417065390749601,
33
+ "num_full_absorption": 857,
34
+ "num_probe_true_positives": 2508,
35
+ "num_split_features": 4
36
+ },
37
+ {
38
+ "first_letter": "b",
39
+ "mean_absorption_fraction": 0.679161422720634,
40
+ "full_absorption_rate": 0.3430609597924773,
41
+ "num_full_absorption": 529,
42
+ "num_probe_true_positives": 1542,
43
+ "num_split_features": 6
44
+ },
45
+ {
46
+ "first_letter": "c",
47
+ "mean_absorption_fraction": 0.8155963696018853,
48
+ "full_absorption_rate": 0.48698752228163994,
49
+ "num_full_absorption": 1366,
50
+ "num_probe_true_positives": 2805,
51
+ "num_split_features": 3
52
+ },
53
+ {
54
+ "first_letter": "d",
55
+ "mean_absorption_fraction": 0.7004096658239145,
56
+ "full_absorption_rate": 0.41807228915662653,
57
+ "num_full_absorption": 694,
58
+ "num_probe_true_positives": 1660,
59
+ "num_split_features": 3
60
+ },
61
+ {
62
+ "first_letter": "e",
63
+ "mean_absorption_fraction": 0.5585561782067426,
64
+ "full_absorption_rate": 0.3681930693069307,
65
+ "num_full_absorption": 595,
66
+ "num_probe_true_positives": 1616,
67
+ "num_split_features": 4
68
+ },
69
+ {
70
+ "first_letter": "f",
71
+ "mean_absorption_fraction": 0.8006965508885888,
72
+ "full_absorption_rate": 0.5872374798061389,
73
+ "num_full_absorption": 727,
74
+ "num_probe_true_positives": 1238,
75
+ "num_split_features": 3
76
+ },
77
+ {
78
+ "first_letter": "g",
79
+ "mean_absorption_fraction": 0.5635756145458141,
80
+ "full_absorption_rate": 0.34323144104803494,
81
+ "num_full_absorption": 393,
82
+ "num_probe_true_positives": 1145,
83
+ "num_split_features": 3
84
+ },
85
+ {
86
+ "first_letter": "h",
87
+ "mean_absorption_fraction": 0.7999027114878418,
88
+ "full_absorption_rate": 0.45893719806763283,
89
+ "num_full_absorption": 475,
90
+ "num_probe_true_positives": 1035,
91
+ "num_split_features": 3
92
+ },
93
+ {
94
+ "first_letter": "i",
95
+ "mean_absorption_fraction": 0.6117770347749325,
96
+ "full_absorption_rate": 0.358974358974359,
97
+ "num_full_absorption": 588,
98
+ "num_probe_true_positives": 1638,
99
+ "num_split_features": 2
100
+ },
101
+ {
102
+ "first_letter": "j",
103
+ "mean_absorption_fraction": 0.20771700655851907,
104
+ "full_absorption_rate": 0.06310679611650485,
105
+ "num_full_absorption": 26,
106
+ "num_probe_true_positives": 412,
107
+ "num_split_features": 1
108
+ },
109
+ {
110
+ "first_letter": "k",
111
+ "mean_absorption_fraction": 0.4378382360943496,
112
+ "full_absorption_rate": 0.16592592592592592,
113
+ "num_full_absorption": 112,
114
+ "num_probe_true_positives": 675,
115
+ "num_split_features": 3
116
+ },
117
+ {
118
+ "first_letter": "l",
119
+ "mean_absorption_fraction": 0.6610036837302684,
120
+ "full_absorption_rate": 0.3856041131105398,
121
+ "num_full_absorption": 450,
122
+ "num_probe_true_positives": 1167,
123
+ "num_split_features": 6
124
+ },
125
+ {
126
+ "first_letter": "m",
127
+ "mean_absorption_fraction": 0.7708970748978563,
128
+ "full_absorption_rate": 0.5172981878088962,
129
+ "num_full_absorption": 942,
130
+ "num_probe_true_positives": 1821,
131
+ "num_split_features": 2
132
+ },
133
+ {
134
+ "first_letter": "n",
135
+ "mean_absorption_fraction": 0.7095649014624135,
136
+ "full_absorption_rate": 0.3765743073047859,
137
+ "num_full_absorption": 299,
138
+ "num_probe_true_positives": 794,
139
+ "num_split_features": 3
140
+ },
141
+ {
142
+ "first_letter": "o",
143
+ "mean_absorption_fraction": 0.6244493390182382,
144
+ "full_absorption_rate": 0.3964386129334583,
145
+ "num_full_absorption": 423,
146
+ "num_probe_true_positives": 1067,
147
+ "num_split_features": 4
148
+ },
149
+ {
150
+ "first_letter": "p",
151
+ "mean_absorption_fraction": 0.6332302852454678,
152
+ "full_absorption_rate": 0.38650306748466257,
153
+ "num_full_absorption": 882,
154
+ "num_probe_true_positives": 2282,
155
+ "num_split_features": 7
156
+ },
157
+ {
158
+ "first_letter": "q",
159
+ "mean_absorption_fraction": 0.009843952976727736,
160
+ "full_absorption_rate": 0.010526315789473684,
161
+ "num_full_absorption": 2,
162
+ "num_probe_true_positives": 190,
163
+ "num_split_features": 1
164
+ },
165
+ {
166
+ "first_letter": "r",
167
+ "mean_absorption_fraction": 0.7078808696018212,
168
+ "full_absorption_rate": 0.4720752498530276,
169
+ "num_full_absorption": 803,
170
+ "num_probe_true_positives": 1701,
171
+ "num_split_features": 3
172
+ },
173
+ {
174
+ "first_letter": "s",
175
+ "mean_absorption_fraction": 0.7539738349846717,
176
+ "full_absorption_rate": 0.4912718204488778,
177
+ "num_full_absorption": 1379,
178
+ "num_probe_true_positives": 2807,
179
+ "num_split_features": 3
180
+ },
181
+ {
182
+ "first_letter": "t",
183
+ "mean_absorption_fraction": 0.655318360480264,
184
+ "full_absorption_rate": 0.3734513274336283,
185
+ "num_full_absorption": 633,
186
+ "num_probe_true_positives": 1695,
187
+ "num_split_features": 3
188
+ },
189
+ {
190
+ "first_letter": "u",
191
+ "mean_absorption_fraction": 0.3289562491496899,
192
+ "full_absorption_rate": 0.1986754966887417,
193
+ "num_full_absorption": 150,
194
+ "num_probe_true_positives": 755,
195
+ "num_split_features": 2
196
+ },
197
+ {
198
+ "first_letter": "v",
199
+ "mean_absorption_fraction": 0.3259209126950176,
200
+ "full_absorption_rate": 0.22813688212927757,
201
+ "num_full_absorption": 180,
202
+ "num_probe_true_positives": 789,
203
+ "num_split_features": 1
204
+ },
205
+ {
206
+ "first_letter": "w",
207
+ "mean_absorption_fraction": 0.06180711593258981,
208
+ "full_absorption_rate": 0.03581267217630854,
209
+ "num_full_absorption": 26,
210
+ "num_probe_true_positives": 726,
211
+ "num_split_features": 1
212
+ },
213
+ {
214
+ "first_letter": "x",
215
+ "mean_absorption_fraction": 0.37442147942939097,
216
+ "full_absorption_rate": 0.08849557522123894,
217
+ "num_full_absorption": 10,
218
+ "num_probe_true_positives": 113,
219
+ "num_split_features": 1
220
+ },
221
+ {
222
+ "first_letter": "y",
223
+ "mean_absorption_fraction": 0.022340281665225267,
224
+ "full_absorption_rate": 0.005681818181818182,
225
+ "num_full_absorption": 1,
226
+ "num_probe_true_positives": 176,
227
+ "num_split_features": 1
228
+ },
229
+ {
230
+ "first_letter": "z",
231
+ "mean_absorption_fraction": 0.007173845098628674,
232
+ "full_absorption_rate": 0.00851063829787234,
233
+ "num_full_absorption": 2,
234
+ "num_probe_true_positives": 235,
235
+ "num_split_features": 1
236
+ }
237
+ ],
238
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
239
+ "sae_lens_id": "custom_sae",
240
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0",
241
+ "sae_lens_version": "5.4.2",
242
+ "sae_cfg_dict": {
243
+ "model_name": "gemma-2-2b",
244
+ "d_in": 2304,
245
+ "d_sae": 65536,
246
+ "hook_layer": 12,
247
+ "hook_name": "blocks.12.hook_resid_post",
248
+ "context_size": null,
249
+ "hook_head_index": null,
250
+ "architecture": "topk",
251
+ "apply_b_dec_to_input": null,
252
+ "finetuning_scaling_factor": null,
253
+ "activation_fn_str": "",
254
+ "prepend_bos": true,
255
+ "normalize_activations": "none",
256
+ "dtype": "bfloat16",
257
+ "device": "",
258
+ "dataset_path": "",
259
+ "dataset_trust_remote_code": true,
260
+ "seqpos_slice": [
261
+ null
262
+ ],
263
+ "training_tokens": -100000,
264
+ "sae_lens_training_version": null,
265
+ "neuronpedia_id": null
266
+ },
267
+ "eval_result_unstructured": null
268
+ }
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "absorption_first_letter",
3
+ "eval_config": {
4
+ "model_name": "gemma-2-2b",
5
+ "random_seed": 42,
6
+ "f1_jump_threshold": 0.03,
7
+ "max_k_value": 10,
8
+ "prompt_template": "{word} has the first letter:",
9
+ "prompt_token_pos": -6,
10
+ "llm_batch_size": 32,
11
+ "llm_dtype": "bfloat16",
12
+ "k_sparse_probe_l1_decay": 0.01,
13
+ "k_sparse_probe_batch_size": 4096,
14
+ "k_sparse_probe_num_epochs": 50
15
+ },
16
+ "eval_id": "a794207a-cbc8-4c1d-8ea8-36a54549f1c6",
17
+ "datetime_epoch_millis": 1740150709168,
18
+ "eval_result_metrics": {
19
+ "mean": {
20
+ "mean_absorption_fraction_score": 0.0834770439283162,
21
+ "mean_full_absorption_score": 0.06013611847978803,
22
+ "mean_num_split_features": 1.1538461538461537,
23
+ "std_dev_absorption_fraction_score": 0.09241618138074081,
24
+ "std_dev_full_absorption_score": 0.08150335423500922,
25
+ "std_dev_num_split_features": 0.36794648440311994
26
+ }
27
+ },
28
+ "eval_result_details": [
29
+ {
30
+ "first_letter": "a",
31
+ "mean_absorption_fraction": 0.057028034407490984,
32
+ "full_absorption_rate": 0.017543859649122806,
33
+ "num_full_absorption": 44,
34
+ "num_probe_true_positives": 2508,
35
+ "num_split_features": 1
36
+ },
37
+ {
38
+ "first_letter": "b",
39
+ "mean_absorption_fraction": 0.0017792171984287038,
40
+ "full_absorption_rate": 0.0025940337224383916,
41
+ "num_full_absorption": 4,
42
+ "num_probe_true_positives": 1542,
43
+ "num_split_features": 1
44
+ },
45
+ {
46
+ "first_letter": "c",
47
+ "mean_absorption_fraction": 0.25592667322599083,
48
+ "full_absorption_rate": 0.1857397504456328,
49
+ "num_full_absorption": 521,
50
+ "num_probe_true_positives": 2805,
51
+ "num_split_features": 1
52
+ },
53
+ {
54
+ "first_letter": "d",
55
+ "mean_absorption_fraction": 0.24011997118028888,
56
+ "full_absorption_rate": 0.12590361445783133,
57
+ "num_full_absorption": 209,
58
+ "num_probe_true_positives": 1660,
59
+ "num_split_features": 1
60
+ },
61
+ {
62
+ "first_letter": "e",
63
+ "mean_absorption_fraction": 0.1110917028199224,
64
+ "full_absorption_rate": 0.15346534653465346,
65
+ "num_full_absorption": 248,
66
+ "num_probe_true_positives": 1616,
67
+ "num_split_features": 1
68
+ },
69
+ {
70
+ "first_letter": "f",
71
+ "mean_absorption_fraction": 0.0545493485524353,
72
+ "full_absorption_rate": 0.02665589660743134,
73
+ "num_full_absorption": 33,
74
+ "num_probe_true_positives": 1238,
75
+ "num_split_features": 1
76
+ },
77
+ {
78
+ "first_letter": "g",
79
+ "mean_absorption_fraction": 0.015559784132844725,
80
+ "full_absorption_rate": 0.0026200873362445414,
81
+ "num_full_absorption": 3,
82
+ "num_probe_true_positives": 1145,
83
+ "num_split_features": 2
84
+ },
85
+ {
86
+ "first_letter": "h",
87
+ "mean_absorption_fraction": 0.020966493314900192,
88
+ "full_absorption_rate": 0.005797101449275362,
89
+ "num_full_absorption": 6,
90
+ "num_probe_true_positives": 1035,
91
+ "num_split_features": 1
92
+ },
93
+ {
94
+ "first_letter": "i",
95
+ "mean_absorption_fraction": 0.22232954346440803,
96
+ "full_absorption_rate": 0.23321123321123322,
97
+ "num_full_absorption": 382,
98
+ "num_probe_true_positives": 1638,
99
+ "num_split_features": 2
100
+ },
101
+ {
102
+ "first_letter": "j",
103
+ "mean_absorption_fraction": 0.0033853270927236977,
104
+ "full_absorption_rate": 0.0024271844660194173,
105
+ "num_full_absorption": 1,
106
+ "num_probe_true_positives": 412,
107
+ "num_split_features": 1
108
+ },
109
+ {
110
+ "first_letter": "k",
111
+ "mean_absorption_fraction": 0.00362522533780823,
112
+ "full_absorption_rate": 0.005925925925925926,
113
+ "num_full_absorption": 4,
114
+ "num_probe_true_positives": 675,
115
+ "num_split_features": 1
116
+ },
117
+ {
118
+ "first_letter": "l",
119
+ "mean_absorption_fraction": 0.12461768062488289,
120
+ "full_absorption_rate": 0.06512425021422451,
121
+ "num_full_absorption": 76,
122
+ "num_probe_true_positives": 1167,
123
+ "num_split_features": 1
124
+ },
125
+ {
126
+ "first_letter": "m",
127
+ "mean_absorption_fraction": 0.0051840957147463755,
128
+ "full_absorption_rate": 0.008237232289950576,
129
+ "num_full_absorption": 15,
130
+ "num_probe_true_positives": 1821,
131
+ "num_split_features": 1
132
+ },
133
+ {
134
+ "first_letter": "n",
135
+ "mean_absorption_fraction": 0.05486755467875741,
136
+ "full_absorption_rate": 0.021410579345088162,
137
+ "num_full_absorption": 17,
138
+ "num_probe_true_positives": 794,
139
+ "num_split_features": 1
140
+ },
141
+ {
142
+ "first_letter": "o",
143
+ "mean_absorption_fraction": 0.11608643361066023,
144
+ "full_absorption_rate": 0.06560449859418932,
145
+ "num_full_absorption": 70,
146
+ "num_probe_true_positives": 1067,
147
+ "num_split_features": 1
148
+ },
149
+ {
150
+ "first_letter": "p",
151
+ "mean_absorption_fraction": 0.286828957721577,
152
+ "full_absorption_rate": 0.28702892199824714,
153
+ "num_full_absorption": 655,
154
+ "num_probe_true_positives": 2282,
155
+ "num_split_features": 1
156
+ },
157
+ {
158
+ "first_letter": "q",
159
+ "mean_absorption_fraction": 0.0012196642214486685,
160
+ "full_absorption_rate": 0.0,
161
+ "num_full_absorption": 0,
162
+ "num_probe_true_positives": 190,
163
+ "num_split_features": 2
164
+ },
165
+ {
166
+ "first_letter": "r",
167
+ "mean_absorption_fraction": 0.23541147654145936,
168
+ "full_absorption_rate": 0.1781305114638448,
169
+ "num_full_absorption": 303,
170
+ "num_probe_true_positives": 1701,
171
+ "num_split_features": 1
172
+ },
173
+ {
174
+ "first_letter": "s",
175
+ "mean_absorption_fraction": 0.15717383958699338,
176
+ "full_absorption_rate": 0.06840042750267189,
177
+ "num_full_absorption": 192,
178
+ "num_probe_true_positives": 2807,
179
+ "num_split_features": 1
180
+ },
181
+ {
182
+ "first_letter": "t",
183
+ "mean_absorption_fraction": 0.019660784963942177,
184
+ "full_absorption_rate": 0.008849557522123894,
185
+ "num_full_absorption": 15,
186
+ "num_probe_true_positives": 1695,
187
+ "num_split_features": 1
188
+ },
189
+ {
190
+ "first_letter": "u",
191
+ "mean_absorption_fraction": 0.02239216664758688,
192
+ "full_absorption_rate": 0.017218543046357615,
193
+ "num_full_absorption": 13,
194
+ "num_probe_true_positives": 755,
195
+ "num_split_features": 2
196
+ },
197
+ {
198
+ "first_letter": "v",
199
+ "mean_absorption_fraction": 0.000307858004094181,
200
+ "full_absorption_rate": 0.0,
201
+ "num_full_absorption": 0,
202
+ "num_probe_true_positives": 789,
203
+ "num_split_features": 1
204
+ },
205
+ {
206
+ "first_letter": "w",
207
+ "mean_absorption_fraction": 0.058073269829955455,
208
+ "full_absorption_rate": 0.0440771349862259,
209
+ "num_full_absorption": 32,
210
+ "num_probe_true_positives": 726,
211
+ "num_split_features": 1
212
+ },
213
+ {
214
+ "first_letter": "x",
215
+ "mean_absorption_fraction": 0.04631967164684324,
216
+ "full_absorption_rate": 0.017699115044247787,
217
+ "num_full_absorption": 2,
218
+ "num_probe_true_positives": 113,
219
+ "num_split_features": 1
220
+ },
221
+ {
222
+ "first_letter": "y",
223
+ "mean_absorption_fraction": 0.04255402129395246,
224
+ "full_absorption_rate": 0.011363636363636364,
225
+ "num_full_absorption": 2,
226
+ "num_probe_true_positives": 176,
227
+ "num_split_features": 1
228
+ },
229
+ {
230
+ "first_letter": "z",
231
+ "mean_absorption_fraction": 0.013344346322079504,
232
+ "full_absorption_rate": 0.00851063829787234,
233
+ "num_full_absorption": 2,
234
+ "num_probe_true_positives": 235,
235
+ "num_split_features": 1
236
+ }
237
+ ],
238
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
239
+ "sae_lens_id": "custom_sae",
240
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0",
241
+ "sae_lens_version": "5.4.2",
242
+ "sae_cfg_dict": {
243
+ "model_name": "gemma-2-2b",
244
+ "d_in": 2304,
245
+ "d_sae": 65536,
246
+ "hook_layer": 12,
247
+ "hook_name": "blocks.12.hook_resid_post",
248
+ "context_size": null,
249
+ "hook_head_index": null,
250
+ "architecture": "topk",
251
+ "apply_b_dec_to_input": null,
252
+ "finetuning_scaling_factor": null,
253
+ "activation_fn_str": "",
254
+ "prepend_bos": true,
255
+ "normalize_activations": "none",
256
+ "dtype": "bfloat16",
257
+ "device": "",
258
+ "dataset_path": "",
259
+ "dataset_trust_remote_code": true,
260
+ "seqpos_slice": [
261
+ null
262
+ ],
263
+ "training_tokens": -100000,
264
+ "sae_lens_training_version": null,
265
+ "neuronpedia_id": null
266
+ },
267
+ "eval_result_unstructured": null
268
+ }
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "absorption_first_letter",
3
+ "eval_config": {
4
+ "model_name": "gemma-2-2b",
5
+ "random_seed": 42,
6
+ "f1_jump_threshold": 0.03,
7
+ "max_k_value": 10,
8
+ "prompt_template": "{word} has the first letter:",
9
+ "prompt_token_pos": -6,
10
+ "llm_batch_size": 32,
11
+ "llm_dtype": "bfloat16",
12
+ "k_sparse_probe_l1_decay": 0.01,
13
+ "k_sparse_probe_batch_size": 4096,
14
+ "k_sparse_probe_num_epochs": 50
15
+ },
16
+ "eval_id": "3fd72f62-0cc2-4495-be28-9e81ace44644",
17
+ "datetime_epoch_millis": 1740149898408,
18
+ "eval_result_metrics": {
19
+ "mean": {
20
+ "mean_absorption_fraction_score": 0.6847120849800752,
21
+ "mean_full_absorption_score": 0.6661390205611261,
22
+ "mean_num_split_features": 2.6538461538461537,
23
+ "std_dev_absorption_fraction_score": 0.2003883916940885,
24
+ "std_dev_full_absorption_score": 0.2090506101541011,
25
+ "std_dev_num_split_features": 1.3249092857190696
26
+ }
27
+ },
28
+ "eval_result_details": [
29
+ {
30
+ "first_letter": "a",
31
+ "mean_absorption_fraction": 0.7233867462553377,
32
+ "full_absorption_rate": 0.64792663476874,
33
+ "num_full_absorption": 1625,
34
+ "num_probe_true_positives": 2508,
35
+ "num_split_features": 4
36
+ },
37
+ {
38
+ "first_letter": "b",
39
+ "mean_absorption_fraction": 0.7168509009331099,
40
+ "full_absorption_rate": 0.7159533073929961,
41
+ "num_full_absorption": 1104,
42
+ "num_probe_true_positives": 1542,
43
+ "num_split_features": 4
44
+ },
45
+ {
46
+ "first_letter": "c",
47
+ "mean_absorption_fraction": 0.9156825137832028,
48
+ "full_absorption_rate": 0.8762923351158646,
49
+ "num_full_absorption": 2458,
50
+ "num_probe_true_positives": 2805,
51
+ "num_split_features": 1
52
+ },
53
+ {
54
+ "first_letter": "d",
55
+ "mean_absorption_fraction": 0.7605077790256731,
56
+ "full_absorption_rate": 0.7590361445783133,
57
+ "num_full_absorption": 1260,
58
+ "num_probe_true_positives": 1660,
59
+ "num_split_features": 2
60
+ },
61
+ {
62
+ "first_letter": "e",
63
+ "mean_absorption_fraction": 0.5875146098112013,
64
+ "full_absorption_rate": 0.7271039603960396,
65
+ "num_full_absorption": 1175,
66
+ "num_probe_true_positives": 1616,
67
+ "num_split_features": 3
68
+ },
69
+ {
70
+ "first_letter": "f",
71
+ "mean_absorption_fraction": 0.7127841577147391,
72
+ "full_absorption_rate": 0.7059773828756059,
73
+ "num_full_absorption": 874,
74
+ "num_probe_true_positives": 1238,
75
+ "num_split_features": 5
76
+ },
77
+ {
78
+ "first_letter": "g",
79
+ "mean_absorption_fraction": 0.7512413415830805,
80
+ "full_absorption_rate": 0.7362445414847162,
81
+ "num_full_absorption": 843,
82
+ "num_probe_true_positives": 1145,
83
+ "num_split_features": 2
84
+ },
85
+ {
86
+ "first_letter": "h",
87
+ "mean_absorption_fraction": 0.7663615485897274,
88
+ "full_absorption_rate": 0.7661835748792271,
89
+ "num_full_absorption": 793,
90
+ "num_probe_true_positives": 1035,
91
+ "num_split_features": 4
92
+ },
93
+ {
94
+ "first_letter": "i",
95
+ "mean_absorption_fraction": 0.7978799456118504,
96
+ "full_absorption_rate": 0.8028083028083028,
97
+ "num_full_absorption": 1315,
98
+ "num_probe_true_positives": 1638,
99
+ "num_split_features": 2
100
+ },
101
+ {
102
+ "first_letter": "j",
103
+ "mean_absorption_fraction": 0.7148716538061125,
104
+ "full_absorption_rate": 0.6820388349514563,
105
+ "num_full_absorption": 281,
106
+ "num_probe_true_positives": 412,
107
+ "num_split_features": 2
108
+ },
109
+ {
110
+ "first_letter": "k",
111
+ "mean_absorption_fraction": 0.5989374406326367,
112
+ "full_absorption_rate": 0.5555555555555556,
113
+ "num_full_absorption": 375,
114
+ "num_probe_true_positives": 675,
115
+ "num_split_features": 4
116
+ },
117
+ {
118
+ "first_letter": "l",
119
+ "mean_absorption_fraction": 0.7327207331271454,
120
+ "full_absorption_rate": 0.7446443873179092,
121
+ "num_full_absorption": 869,
122
+ "num_probe_true_positives": 1167,
123
+ "num_split_features": 3
124
+ },
125
+ {
126
+ "first_letter": "m",
127
+ "mean_absorption_fraction": 0.7807434725648233,
128
+ "full_absorption_rate": 0.8160351455244371,
129
+ "num_full_absorption": 1486,
130
+ "num_probe_true_positives": 1821,
131
+ "num_split_features": 2
132
+ },
133
+ {
134
+ "first_letter": "n",
135
+ "mean_absorption_fraction": 0.7722688362643463,
136
+ "full_absorption_rate": 0.7657430730478589,
137
+ "num_full_absorption": 608,
138
+ "num_probe_true_positives": 794,
139
+ "num_split_features": 3
140
+ },
141
+ {
142
+ "first_letter": "o",
143
+ "mean_absorption_fraction": 0.8168333418476779,
144
+ "full_absorption_rate": 0.7769447047797563,
145
+ "num_full_absorption": 829,
146
+ "num_probe_true_positives": 1067,
147
+ "num_split_features": 2
148
+ },
149
+ {
150
+ "first_letter": "p",
151
+ "mean_absorption_fraction": 0.8399943933692386,
152
+ "full_absorption_rate": 0.8273444347063978,
153
+ "num_full_absorption": 1888,
154
+ "num_probe_true_positives": 2282,
155
+ "num_split_features": 2
156
+ },
157
+ {
158
+ "first_letter": "q",
159
+ "mean_absorption_fraction": 0.7839560158065032,
160
+ "full_absorption_rate": 0.7157894736842105,
161
+ "num_full_absorption": 136,
162
+ "num_probe_true_positives": 190,
163
+ "num_split_features": 2
164
+ },
165
+ {
166
+ "first_letter": "r",
167
+ "mean_absorption_fraction": 0.8207604001227187,
168
+ "full_absorption_rate": 0.8300999412110524,
169
+ "num_full_absorption": 1412,
170
+ "num_probe_true_positives": 1701,
171
+ "num_split_features": 3
172
+ },
173
+ {
174
+ "first_letter": "s",
175
+ "mean_absorption_fraction": 0.9192547073005679,
176
+ "full_absorption_rate": 0.8696116850730317,
177
+ "num_full_absorption": 2441,
178
+ "num_probe_true_positives": 2807,
179
+ "num_split_features": 1
180
+ },
181
+ {
182
+ "first_letter": "t",
183
+ "mean_absorption_fraction": 0.7832159717277276,
184
+ "full_absorption_rate": 0.736283185840708,
185
+ "num_full_absorption": 1248,
186
+ "num_probe_true_positives": 1695,
187
+ "num_split_features": 2
188
+ },
189
+ {
190
+ "first_letter": "u",
191
+ "mean_absorption_fraction": 0.31044767910219884,
192
+ "full_absorption_rate": 0.3880794701986755,
193
+ "num_full_absorption": 293,
194
+ "num_probe_true_positives": 755,
195
+ "num_split_features": 6
196
+ },
197
+ {
198
+ "first_letter": "v",
199
+ "mean_absorption_fraction": 0.6288526333046588,
200
+ "full_absorption_rate": 0.5779467680608364,
201
+ "num_full_absorption": 456,
202
+ "num_probe_true_positives": 789,
203
+ "num_split_features": 4
204
+ },
205
+ {
206
+ "first_letter": "w",
207
+ "mean_absorption_fraction": 0.820589593812622,
208
+ "full_absorption_rate": 0.7630853994490359,
209
+ "num_full_absorption": 554,
210
+ "num_probe_true_positives": 726,
211
+ "num_split_features": 3
212
+ },
213
+ {
214
+ "first_letter": "x",
215
+ "mean_absorption_fraction": 0.2525570176189269,
216
+ "full_absorption_rate": 0.08849557522123894,
217
+ "num_full_absorption": 10,
218
+ "num_probe_true_positives": 113,
219
+ "num_split_features": 1
220
+ },
221
+ {
222
+ "first_letter": "y",
223
+ "mean_absorption_fraction": 0.3328708151340147,
224
+ "full_absorption_rate": 0.29545454545454547,
225
+ "num_full_absorption": 52,
226
+ "num_probe_true_positives": 176,
227
+ "num_split_features": 1
228
+ },
229
+ {
230
+ "first_letter": "z",
231
+ "mean_absorption_fraction": 0.16142996063211232,
232
+ "full_absorption_rate": 0.14893617021276595,
233
+ "num_full_absorption": 35,
234
+ "num_probe_true_positives": 235,
235
+ "num_split_features": 1
236
+ }
237
+ ],
238
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
239
+ "sae_lens_id": "custom_sae",
240
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0",
241
+ "sae_lens_version": "5.4.2",
242
+ "sae_cfg_dict": {
243
+ "model_name": "gemma-2-2b",
244
+ "d_in": 2304,
245
+ "d_sae": 65536,
246
+ "hook_layer": 12,
247
+ "hook_name": "blocks.12.hook_resid_post",
248
+ "context_size": null,
249
+ "hook_head_index": null,
250
+ "architecture": "topk",
251
+ "apply_b_dec_to_input": null,
252
+ "finetuning_scaling_factor": null,
253
+ "activation_fn_str": "",
254
+ "prepend_bos": true,
255
+ "normalize_activations": "none",
256
+ "dtype": "bfloat16",
257
+ "device": "",
258
+ "dataset_path": "",
259
+ "dataset_trust_remote_code": true,
260
+ "seqpos_slice": [
261
+ null
262
+ ],
263
+ "training_tokens": -100000,
264
+ "sae_lens_training_version": null,
265
+ "neuronpedia_id": null
266
+ },
267
+ "eval_result_unstructured": null
268
+ }
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "absorption_first_letter",
3
+ "eval_config": {
4
+ "model_name": "gemma-2-2b",
5
+ "random_seed": 42,
6
+ "f1_jump_threshold": 0.03,
7
+ "max_k_value": 10,
8
+ "prompt_template": "{word} has the first letter:",
9
+ "prompt_token_pos": -6,
10
+ "llm_batch_size": 32,
11
+ "llm_dtype": "bfloat16",
12
+ "k_sparse_probe_l1_decay": 0.01,
13
+ "k_sparse_probe_batch_size": 4096,
14
+ "k_sparse_probe_num_epochs": 50
15
+ },
16
+ "eval_id": "e4f2dfbd-7bca-41a0-b7cc-7b89d14b8d8f",
17
+ "datetime_epoch_millis": 1740151507106,
18
+ "eval_result_metrics": {
19
+ "mean": {
20
+ "mean_absorption_fraction_score": 0.4524983861704342,
21
+ "mean_full_absorption_score": 0.5240608368125174,
22
+ "mean_num_split_features": 4.3076923076923075,
23
+ "std_dev_absorption_fraction_score": 0.17312136051231236,
24
+ "std_dev_full_absorption_score": 0.19044461427366993,
25
+ "std_dev_num_split_features": 2.412786451706504
26
+ }
27
+ },
28
+ "eval_result_details": [
29
+ {
30
+ "first_letter": "a",
31
+ "mean_absorption_fraction": 0.5244886983482652,
32
+ "full_absorption_rate": 0.5231259968102073,
33
+ "num_full_absorption": 1312,
34
+ "num_probe_true_positives": 2508,
35
+ "num_split_features": 9
36
+ },
37
+ {
38
+ "first_letter": "b",
39
+ "mean_absorption_fraction": 0.5030903415851329,
40
+ "full_absorption_rate": 0.6848249027237354,
41
+ "num_full_absorption": 1056,
42
+ "num_probe_true_positives": 1542,
43
+ "num_split_features": 4
44
+ },
45
+ {
46
+ "first_letter": "c",
47
+ "mean_absorption_fraction": 0.6621675316077551,
48
+ "full_absorption_rate": 0.685204991087344,
49
+ "num_full_absorption": 1922,
50
+ "num_probe_true_positives": 2805,
51
+ "num_split_features": 7
52
+ },
53
+ {
54
+ "first_letter": "d",
55
+ "mean_absorption_fraction": 0.5686391650350938,
56
+ "full_absorption_rate": 0.6313253012048192,
57
+ "num_full_absorption": 1048,
58
+ "num_probe_true_positives": 1660,
59
+ "num_split_features": 5
60
+ },
61
+ {
62
+ "first_letter": "e",
63
+ "mean_absorption_fraction": 0.3860805878256704,
64
+ "full_absorption_rate": 0.5006188118811881,
65
+ "num_full_absorption": 809,
66
+ "num_probe_true_positives": 1616,
67
+ "num_split_features": 8
68
+ },
69
+ {
70
+ "first_letter": "f",
71
+ "mean_absorption_fraction": 0.4846739037643228,
72
+ "full_absorption_rate": 0.6058158319870759,
73
+ "num_full_absorption": 750,
74
+ "num_probe_true_positives": 1238,
75
+ "num_split_features": 6
76
+ },
77
+ {
78
+ "first_letter": "g",
79
+ "mean_absorption_fraction": 0.4674741077161477,
80
+ "full_absorption_rate": 0.5545851528384279,
81
+ "num_full_absorption": 635,
82
+ "num_probe_true_positives": 1145,
83
+ "num_split_features": 7
84
+ },
85
+ {
86
+ "first_letter": "h",
87
+ "mean_absorption_fraction": 0.4141624927147996,
88
+ "full_absorption_rate": 0.46956521739130436,
89
+ "num_full_absorption": 486,
90
+ "num_probe_true_positives": 1035,
91
+ "num_split_features": 5
92
+ },
93
+ {
94
+ "first_letter": "i",
95
+ "mean_absorption_fraction": 0.5685041971113255,
96
+ "full_absorption_rate": 0.6868131868131868,
97
+ "num_full_absorption": 1125,
98
+ "num_probe_true_positives": 1638,
99
+ "num_split_features": 1
100
+ },
101
+ {
102
+ "first_letter": "j",
103
+ "mean_absorption_fraction": 0.3294766484902135,
104
+ "full_absorption_rate": 0.34951456310679613,
105
+ "num_full_absorption": 144,
106
+ "num_probe_true_positives": 412,
107
+ "num_split_features": 3
108
+ },
109
+ {
110
+ "first_letter": "k",
111
+ "mean_absorption_fraction": 0.19406311445367022,
112
+ "full_absorption_rate": 0.2740740740740741,
113
+ "num_full_absorption": 185,
114
+ "num_probe_true_positives": 675,
115
+ "num_split_features": 3
116
+ },
117
+ {
118
+ "first_letter": "l",
119
+ "mean_absorption_fraction": 0.43236308517151945,
120
+ "full_absorption_rate": 0.48586118251928023,
121
+ "num_full_absorption": 567,
122
+ "num_probe_true_positives": 1167,
123
+ "num_split_features": 6
124
+ },
125
+ {
126
+ "first_letter": "m",
127
+ "mean_absorption_fraction": 0.6774860216572675,
128
+ "full_absorption_rate": 0.7957166392092258,
129
+ "num_full_absorption": 1449,
130
+ "num_probe_true_positives": 1821,
131
+ "num_split_features": 3
132
+ },
133
+ {
134
+ "first_letter": "n",
135
+ "mean_absorption_fraction": 0.48846356092785836,
136
+ "full_absorption_rate": 0.5604534005037783,
137
+ "num_full_absorption": 445,
138
+ "num_probe_true_positives": 794,
139
+ "num_split_features": 3
140
+ },
141
+ {
142
+ "first_letter": "o",
143
+ "mean_absorption_fraction": 0.591983877600817,
144
+ "full_absorption_rate": 0.6588566073102156,
145
+ "num_full_absorption": 703,
146
+ "num_probe_true_positives": 1067,
147
+ "num_split_features": 3
148
+ },
149
+ {
150
+ "first_letter": "p",
151
+ "mean_absorption_fraction": 0.692209094169467,
152
+ "full_absorption_rate": 0.7116564417177914,
153
+ "num_full_absorption": 1624,
154
+ "num_probe_true_positives": 2282,
155
+ "num_split_features": 8
156
+ },
157
+ {
158
+ "first_letter": "q",
159
+ "mean_absorption_fraction": 0.22241427485109083,
160
+ "full_absorption_rate": 0.23157894736842105,
161
+ "num_full_absorption": 44,
162
+ "num_probe_true_positives": 190,
163
+ "num_split_features": 2
164
+ },
165
+ {
166
+ "first_letter": "r",
167
+ "mean_absorption_fraction": 0.5417187972127966,
168
+ "full_absorption_rate": 0.5631981187536743,
169
+ "num_full_absorption": 958,
170
+ "num_probe_true_positives": 1701,
171
+ "num_split_features": 6
172
+ },
173
+ {
174
+ "first_letter": "s",
175
+ "mean_absorption_fraction": 0.7569173413620608,
176
+ "full_absorption_rate": 0.8065550409690061,
177
+ "num_full_absorption": 2264,
178
+ "num_probe_true_positives": 2807,
179
+ "num_split_features": 4
180
+ },
181
+ {
182
+ "first_letter": "t",
183
+ "mean_absorption_fraction": 0.5078413641592,
184
+ "full_absorption_rate": 0.5150442477876106,
185
+ "num_full_absorption": 873,
186
+ "num_probe_true_positives": 1695,
187
+ "num_split_features": 7
188
+ },
189
+ {
190
+ "first_letter": "u",
191
+ "mean_absorption_fraction": 0.36593236176612903,
192
+ "full_absorption_rate": 0.6079470198675496,
193
+ "num_full_absorption": 459,
194
+ "num_probe_true_positives": 755,
195
+ "num_split_features": 2
196
+ },
197
+ {
198
+ "first_letter": "v",
199
+ "mean_absorption_fraction": 0.33729577176136777,
200
+ "full_absorption_rate": 0.4664131812420786,
201
+ "num_full_absorption": 368,
202
+ "num_probe_true_positives": 789,
203
+ "num_split_features": 4
204
+ },
205
+ {
206
+ "first_letter": "w",
207
+ "mean_absorption_fraction": 0.5653991608245975,
208
+ "full_absorption_rate": 0.7052341597796143,
209
+ "num_full_absorption": 512,
210
+ "num_probe_true_positives": 726,
211
+ "num_split_features": 3
212
+ },
213
+ {
214
+ "first_letter": "x",
215
+ "mean_absorption_fraction": 0.09658828784853217,
216
+ "full_absorption_rate": 0.061946902654867256,
217
+ "num_full_absorption": 7,
218
+ "num_probe_true_positives": 113,
219
+ "num_split_features": 1
220
+ },
221
+ {
222
+ "first_letter": "y",
223
+ "mean_absorption_fraction": 0.1756755097631157,
224
+ "full_absorption_rate": 0.20454545454545456,
225
+ "num_full_absorption": 36,
226
+ "num_probe_true_positives": 176,
227
+ "num_split_features": 1
228
+ },
229
+ {
230
+ "first_letter": "z",
231
+ "mean_absorption_fraction": 0.20984874270307283,
232
+ "full_absorption_rate": 0.2851063829787234,
233
+ "num_full_absorption": 67,
234
+ "num_probe_true_positives": 235,
235
+ "num_split_features": 1
236
+ }
237
+ ],
238
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
239
+ "sae_lens_id": "custom_sae",
240
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0",
241
+ "sae_lens_version": "5.4.2",
242
+ "sae_cfg_dict": {
243
+ "model_name": "gemma-2-2b",
244
+ "d_in": 2304,
245
+ "d_sae": 65536,
246
+ "hook_layer": 12,
247
+ "hook_name": "blocks.12.hook_resid_post",
248
+ "context_size": null,
249
+ "hook_head_index": null,
250
+ "architecture": "topk",
251
+ "apply_b_dec_to_input": null,
252
+ "finetuning_scaling_factor": null,
253
+ "activation_fn_str": "",
254
+ "prepend_bos": true,
255
+ "normalize_activations": "none",
256
+ "dtype": "bfloat16",
257
+ "device": "",
258
+ "dataset_path": "",
259
+ "dataset_trust_remote_code": true,
260
+ "seqpos_slice": [
261
+ null
262
+ ],
263
+ "training_tokens": -100000,
264
+ "sae_lens_training_version": null,
265
+ "neuronpedia_id": null
266
+ },
267
+ "eval_result_unstructured": null
268
+ }
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "absorption_first_letter",
3
+ "eval_config": {
4
+ "model_name": "gemma-2-2b",
5
+ "random_seed": 42,
6
+ "f1_jump_threshold": 0.03,
7
+ "max_k_value": 10,
8
+ "prompt_template": "{word} has the first letter:",
9
+ "prompt_token_pos": -6,
10
+ "llm_batch_size": 32,
11
+ "llm_dtype": "bfloat16",
12
+ "k_sparse_probe_l1_decay": 0.01,
13
+ "k_sparse_probe_batch_size": 4096,
14
+ "k_sparse_probe_num_epochs": 50
15
+ },
16
+ "eval_id": "4a874c85-b7b8-4548-be1b-9967bf32571a",
17
+ "datetime_epoch_millis": 1740154752792,
18
+ "eval_result_metrics": {
19
+ "mean": {
20
+ "mean_absorption_fraction_score": 0.6616852261361333,
21
+ "mean_full_absorption_score": 0.615820361178369,
22
+ "mean_num_split_features": 2.6923076923076925,
23
+ "std_dev_absorption_fraction_score": 0.19938859527080388,
24
+ "std_dev_full_absorption_score": 0.20106605664354318,
25
+ "std_dev_num_split_features": 1.7382573059068274
26
+ }
27
+ },
28
+ "eval_result_details": [
29
+ {
30
+ "first_letter": "a",
31
+ "mean_absorption_fraction": 0.8075326747620873,
32
+ "full_absorption_rate": 0.6698564593301436,
33
+ "num_full_absorption": 1680,
34
+ "num_probe_true_positives": 2508,
35
+ "num_split_features": 2
36
+ },
37
+ {
38
+ "first_letter": "b",
39
+ "mean_absorption_fraction": 0.649433379164975,
40
+ "full_absorption_rate": 0.5719844357976653,
41
+ "num_full_absorption": 882,
42
+ "num_probe_true_positives": 1542,
43
+ "num_split_features": 6
44
+ },
45
+ {
46
+ "first_letter": "c",
47
+ "mean_absorption_fraction": 0.843160509366003,
48
+ "full_absorption_rate": 0.774331550802139,
49
+ "num_full_absorption": 2172,
50
+ "num_probe_true_positives": 2805,
51
+ "num_split_features": 3
52
+ },
53
+ {
54
+ "first_letter": "d",
55
+ "mean_absorption_fraction": 0.8053629676295623,
56
+ "full_absorption_rate": 0.7536144578313253,
57
+ "num_full_absorption": 1251,
58
+ "num_probe_true_positives": 1660,
59
+ "num_split_features": 2
60
+ },
61
+ {
62
+ "first_letter": "e",
63
+ "mean_absorption_fraction": 0.6165512751061573,
64
+ "full_absorption_rate": 0.6961633663366337,
65
+ "num_full_absorption": 1125,
66
+ "num_probe_true_positives": 1616,
67
+ "num_split_features": 2
68
+ },
69
+ {
70
+ "first_letter": "f",
71
+ "mean_absorption_fraction": 0.7104825885299954,
72
+ "full_absorption_rate": 0.6922455573505655,
73
+ "num_full_absorption": 857,
74
+ "num_probe_true_positives": 1238,
75
+ "num_split_features": 5
76
+ },
77
+ {
78
+ "first_letter": "g",
79
+ "mean_absorption_fraction": 0.7387823401404363,
80
+ "full_absorption_rate": 0.7397379912663755,
81
+ "num_full_absorption": 847,
82
+ "num_probe_true_positives": 1145,
83
+ "num_split_features": 1
84
+ },
85
+ {
86
+ "first_letter": "h",
87
+ "mean_absorption_fraction": 0.7596010190875191,
88
+ "full_absorption_rate": 0.702415458937198,
89
+ "num_full_absorption": 727,
90
+ "num_probe_true_positives": 1035,
91
+ "num_split_features": 4
92
+ },
93
+ {
94
+ "first_letter": "i",
95
+ "mean_absorption_fraction": 0.6661271457750678,
96
+ "full_absorption_rate": 0.6306471306471306,
97
+ "num_full_absorption": 1033,
98
+ "num_probe_true_positives": 1638,
99
+ "num_split_features": 4
100
+ },
101
+ {
102
+ "first_letter": "j",
103
+ "mean_absorption_fraction": 0.6232648925391066,
104
+ "full_absorption_rate": 0.5728155339805825,
105
+ "num_full_absorption": 236,
106
+ "num_probe_true_positives": 412,
107
+ "num_split_features": 2
108
+ },
109
+ {
110
+ "first_letter": "k",
111
+ "mean_absorption_fraction": 0.567028277479737,
112
+ "full_absorption_rate": 0.522962962962963,
113
+ "num_full_absorption": 353,
114
+ "num_probe_true_positives": 675,
115
+ "num_split_features": 1
116
+ },
117
+ {
118
+ "first_letter": "l",
119
+ "mean_absorption_fraction": 0.754960114510044,
120
+ "full_absorption_rate": 0.7343616109682948,
121
+ "num_full_absorption": 857,
122
+ "num_probe_true_positives": 1167,
123
+ "num_split_features": 3
124
+ },
125
+ {
126
+ "first_letter": "m",
127
+ "mean_absorption_fraction": 0.8203947792860443,
128
+ "full_absorption_rate": 0.7940691927512356,
129
+ "num_full_absorption": 1446,
130
+ "num_probe_true_positives": 1821,
131
+ "num_split_features": 1
132
+ },
133
+ {
134
+ "first_letter": "n",
135
+ "mean_absorption_fraction": 0.7950898206311865,
136
+ "full_absorption_rate": 0.7670025188916877,
137
+ "num_full_absorption": 609,
138
+ "num_probe_true_positives": 794,
139
+ "num_split_features": 3
140
+ },
141
+ {
142
+ "first_letter": "o",
143
+ "mean_absorption_fraction": 0.6647762399813312,
144
+ "full_absorption_rate": 0.521087160262418,
145
+ "num_full_absorption": 556,
146
+ "num_probe_true_positives": 1067,
147
+ "num_split_features": 6
148
+ },
149
+ {
150
+ "first_letter": "p",
151
+ "mean_absorption_fraction": 0.8176700167677401,
152
+ "full_absorption_rate": 0.7725679228746714,
153
+ "num_full_absorption": 1763,
154
+ "num_probe_true_positives": 2282,
155
+ "num_split_features": 5
156
+ },
157
+ {
158
+ "first_letter": "q",
159
+ "mean_absorption_fraction": 0.45423763895890557,
160
+ "full_absorption_rate": 0.3,
161
+ "num_full_absorption": 57,
162
+ "num_probe_true_positives": 190,
163
+ "num_split_features": 1
164
+ },
165
+ {
166
+ "first_letter": "r",
167
+ "mean_absorption_fraction": 0.7393716764914745,
168
+ "full_absorption_rate": 0.6843033509700176,
169
+ "num_full_absorption": 1164,
170
+ "num_probe_true_positives": 1701,
171
+ "num_split_features": 3
172
+ },
173
+ {
174
+ "first_letter": "s",
175
+ "mean_absorption_fraction": 0.8758378878071444,
176
+ "full_absorption_rate": 0.791948699679373,
177
+ "num_full_absorption": 2223,
178
+ "num_probe_true_positives": 2807,
179
+ "num_split_features": 1
180
+ },
181
+ {
182
+ "first_letter": "t",
183
+ "mean_absorption_fraction": 0.8617655260809893,
184
+ "full_absorption_rate": 0.743952802359882,
185
+ "num_full_absorption": 1261,
186
+ "num_probe_true_positives": 1695,
187
+ "num_split_features": 2
188
+ },
189
+ {
190
+ "first_letter": "u",
191
+ "mean_absorption_fraction": 0.7107051211722657,
192
+ "full_absorption_rate": 0.7841059602649006,
193
+ "num_full_absorption": 592,
194
+ "num_probe_true_positives": 755,
195
+ "num_split_features": 1
196
+ },
197
+ {
198
+ "first_letter": "v",
199
+ "mean_absorption_fraction": 0.5079324747158148,
200
+ "full_absorption_rate": 0.5057034220532319,
201
+ "num_full_absorption": 399,
202
+ "num_probe_true_positives": 789,
203
+ "num_split_features": 6
204
+ },
205
+ {
206
+ "first_letter": "w",
207
+ "mean_absorption_fraction": 0.7893559891999093,
208
+ "full_absorption_rate": 0.7741046831955923,
209
+ "num_full_absorption": 562,
210
+ "num_probe_true_positives": 726,
211
+ "num_split_features": 3
212
+ },
213
+ {
214
+ "first_letter": "x",
215
+ "mean_absorption_fraction": 0.2505387445283525,
216
+ "full_absorption_rate": 0.1592920353982301,
217
+ "num_full_absorption": 18,
218
+ "num_probe_true_positives": 113,
219
+ "num_split_features": 1
220
+ },
221
+ {
222
+ "first_letter": "y",
223
+ "mean_absorption_fraction": 0.2744647016381297,
224
+ "full_absorption_rate": 0.19886363636363635,
225
+ "num_full_absorption": 35,
226
+ "num_probe_true_positives": 176,
227
+ "num_split_features": 1
228
+ },
229
+ {
230
+ "first_letter": "z",
231
+ "mean_absorption_fraction": 0.09938807818948822,
232
+ "full_absorption_rate": 0.15319148936170213,
233
+ "num_full_absorption": 36,
234
+ "num_probe_true_positives": 235,
235
+ "num_split_features": 1
236
+ }
237
+ ],
238
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
239
+ "sae_lens_id": "custom_sae",
240
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0",
241
+ "sae_lens_version": "5.4.2",
242
+ "sae_cfg_dict": {
243
+ "model_name": "gemma-2-2b",
244
+ "d_in": 2304,
245
+ "d_sae": 65536,
246
+ "hook_layer": 12,
247
+ "hook_name": "blocks.12.hook_resid_post",
248
+ "context_size": null,
249
+ "hook_head_index": null,
250
+ "architecture": "topk",
251
+ "apply_b_dec_to_input": null,
252
+ "finetuning_scaling_factor": null,
253
+ "activation_fn_str": "",
254
+ "prepend_bos": true,
255
+ "normalize_activations": "none",
256
+ "dtype": "bfloat16",
257
+ "device": "",
258
+ "dataset_path": "",
259
+ "dataset_trust_remote_code": true,
260
+ "seqpos_slice": [
261
+ null
262
+ ],
263
+ "training_tokens": -100000,
264
+ "sae_lens_training_version": null,
265
+ "neuronpedia_id": null
266
+ },
267
+ "eval_result_unstructured": null
268
+ }
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "absorption_first_letter",
3
+ "eval_config": {
4
+ "model_name": "gemma-2-2b",
5
+ "random_seed": 42,
6
+ "f1_jump_threshold": 0.03,
7
+ "max_k_value": 10,
8
+ "prompt_template": "{word} has the first letter:",
9
+ "prompt_token_pos": -6,
10
+ "llm_batch_size": 32,
11
+ "llm_dtype": "bfloat16",
12
+ "k_sparse_probe_l1_decay": 0.01,
13
+ "k_sparse_probe_batch_size": 4096,
14
+ "k_sparse_probe_num_epochs": 50
15
+ },
16
+ "eval_id": "2be8745f-1a57-4404-ac46-dce11e8b68ff",
17
+ "datetime_epoch_millis": 1740153947708,
18
+ "eval_result_metrics": {
19
+ "mean": {
20
+ "mean_absorption_fraction_score": 0.5128081449664995,
21
+ "mean_full_absorption_score": 0.5412368653055878,
22
+ "mean_num_split_features": 3.6538461538461537,
23
+ "std_dev_absorption_fraction_score": 0.22140493761969532,
24
+ "std_dev_full_absorption_score": 0.2331752415340005,
25
+ "std_dev_num_split_features": 2.077350383393378
26
+ }
27
+ },
28
+ "eval_result_details": [
29
+ {
30
+ "first_letter": "a",
31
+ "mean_absorption_fraction": 0.7110525620728301,
32
+ "full_absorption_rate": 0.7256778309409888,
33
+ "num_full_absorption": 1820,
34
+ "num_probe_true_positives": 2508,
35
+ "num_split_features": 4
36
+ },
37
+ {
38
+ "first_letter": "b",
39
+ "mean_absorption_fraction": 0.5900107080679658,
40
+ "full_absorption_rate": 0.6504539559014267,
41
+ "num_full_absorption": 1003,
42
+ "num_probe_true_positives": 1542,
43
+ "num_split_features": 8
44
+ },
45
+ {
46
+ "first_letter": "c",
47
+ "mean_absorption_fraction": 0.7889150540988363,
48
+ "full_absorption_rate": 0.7992869875222817,
49
+ "num_full_absorption": 2242,
50
+ "num_probe_true_positives": 2805,
51
+ "num_split_features": 5
52
+ },
53
+ {
54
+ "first_letter": "d",
55
+ "mean_absorption_fraction": 0.7213593093818161,
56
+ "full_absorption_rate": 0.7777108433734939,
57
+ "num_full_absorption": 1291,
58
+ "num_probe_true_positives": 1660,
59
+ "num_split_features": 4
60
+ },
61
+ {
62
+ "first_letter": "e",
63
+ "mean_absorption_fraction": 0.41357739362504964,
64
+ "full_absorption_rate": 0.5705445544554455,
65
+ "num_full_absorption": 922,
66
+ "num_probe_true_positives": 1616,
67
+ "num_split_features": 5
68
+ },
69
+ {
70
+ "first_letter": "f",
71
+ "mean_absorption_fraction": 0.6904496997463666,
72
+ "full_absorption_rate": 0.7164781906300485,
73
+ "num_full_absorption": 887,
74
+ "num_probe_true_positives": 1238,
75
+ "num_split_features": 7
76
+ },
77
+ {
78
+ "first_letter": "g",
79
+ "mean_absorption_fraction": 0.6122705763206383,
80
+ "full_absorption_rate": 0.6462882096069869,
81
+ "num_full_absorption": 740,
82
+ "num_probe_true_positives": 1145,
83
+ "num_split_features": 6
84
+ },
85
+ {
86
+ "first_letter": "h",
87
+ "mean_absorption_fraction": 0.5132230230308134,
88
+ "full_absorption_rate": 0.5207729468599034,
89
+ "num_full_absorption": 539,
90
+ "num_probe_true_positives": 1035,
91
+ "num_split_features": 5
92
+ },
93
+ {
94
+ "first_letter": "i",
95
+ "mean_absorption_fraction": 0.5740640698732722,
96
+ "full_absorption_rate": 0.7197802197802198,
97
+ "num_full_absorption": 1179,
98
+ "num_probe_true_positives": 1638,
99
+ "num_split_features": 3
100
+ },
101
+ {
102
+ "first_letter": "j",
103
+ "mean_absorption_fraction": 0.366316889496265,
104
+ "full_absorption_rate": 0.3567961165048544,
105
+ "num_full_absorption": 147,
106
+ "num_probe_true_positives": 412,
107
+ "num_split_features": 1
108
+ },
109
+ {
110
+ "first_letter": "k",
111
+ "mean_absorption_fraction": 0.23633773281065845,
112
+ "full_absorption_rate": 0.23851851851851852,
113
+ "num_full_absorption": 161,
114
+ "num_probe_true_positives": 675,
115
+ "num_split_features": 1
116
+ },
117
+ {
118
+ "first_letter": "l",
119
+ "mean_absorption_fraction": 0.6116279835790608,
120
+ "full_absorption_rate": 0.6392459297343616,
121
+ "num_full_absorption": 746,
122
+ "num_probe_true_positives": 1167,
123
+ "num_split_features": 4
124
+ },
125
+ {
126
+ "first_letter": "m",
127
+ "mean_absorption_fraction": 0.6636309057144126,
128
+ "full_absorption_rate": 0.7177375068643602,
129
+ "num_full_absorption": 1307,
130
+ "num_probe_true_positives": 1821,
131
+ "num_split_features": 6
132
+ },
133
+ {
134
+ "first_letter": "n",
135
+ "mean_absorption_fraction": 0.577436439906812,
136
+ "full_absorption_rate": 0.5629722921914357,
137
+ "num_full_absorption": 447,
138
+ "num_probe_true_positives": 794,
139
+ "num_split_features": 4
140
+ },
141
+ {
142
+ "first_letter": "o",
143
+ "mean_absorption_fraction": 0.4891657688844448,
144
+ "full_absorption_rate": 0.5014058106841612,
145
+ "num_full_absorption": 535,
146
+ "num_probe_true_positives": 1067,
147
+ "num_split_features": 1
148
+ },
149
+ {
150
+ "first_letter": "p",
151
+ "mean_absorption_fraction": 0.8397761400133482,
152
+ "full_absorption_rate": 0.8347940403155127,
153
+ "num_full_absorption": 1905,
154
+ "num_probe_true_positives": 2282,
155
+ "num_split_features": 3
156
+ },
157
+ {
158
+ "first_letter": "q",
159
+ "mean_absorption_fraction": 0.33764789379308424,
160
+ "full_absorption_rate": 0.3105263157894737,
161
+ "num_full_absorption": 59,
162
+ "num_probe_true_positives": 190,
163
+ "num_split_features": 2
164
+ },
165
+ {
166
+ "first_letter": "r",
167
+ "mean_absorption_fraction": 0.6411389015355013,
168
+ "full_absorption_rate": 0.6431510875955321,
169
+ "num_full_absorption": 1094,
170
+ "num_probe_true_positives": 1701,
171
+ "num_split_features": 5
172
+ },
173
+ {
174
+ "first_letter": "s",
175
+ "mean_absorption_fraction": 0.7622223123066498,
176
+ "full_absorption_rate": 0.7716423227645173,
177
+ "num_full_absorption": 2166,
178
+ "num_probe_true_positives": 2807,
179
+ "num_split_features": 5
180
+ },
181
+ {
182
+ "first_letter": "t",
183
+ "mean_absorption_fraction": 0.7386404958442533,
184
+ "full_absorption_rate": 0.7480825958702065,
185
+ "num_full_absorption": 1268,
186
+ "num_probe_true_positives": 1695,
187
+ "num_split_features": 2
188
+ },
189
+ {
190
+ "first_letter": "u",
191
+ "mean_absorption_fraction": 0.19444390279769325,
192
+ "full_absorption_rate": 0.3258278145695364,
193
+ "num_full_absorption": 246,
194
+ "num_probe_true_positives": 755,
195
+ "num_split_features": 2
196
+ },
197
+ {
198
+ "first_letter": "v",
199
+ "mean_absorption_fraction": 0.36413111590891156,
200
+ "full_absorption_rate": 0.4435994930291508,
201
+ "num_full_absorption": 350,
202
+ "num_probe_true_positives": 789,
203
+ "num_split_features": 6
204
+ },
205
+ {
206
+ "first_letter": "w",
207
+ "mean_absorption_fraction": 0.5408512211581553,
208
+ "full_absorption_rate": 0.6060606060606061,
209
+ "num_full_absorption": 440,
210
+ "num_probe_true_positives": 726,
211
+ "num_split_features": 3
212
+ },
213
+ {
214
+ "first_letter": "x",
215
+ "mean_absorption_fraction": 0.09970442692753816,
216
+ "full_absorption_rate": 0.05309734513274336,
217
+ "num_full_absorption": 6,
218
+ "num_probe_true_positives": 113,
219
+ "num_split_features": 1
220
+ },
221
+ {
222
+ "first_letter": "y",
223
+ "mean_absorption_fraction": 0.20248330206807633,
224
+ "full_absorption_rate": 0.1534090909090909,
225
+ "num_full_absorption": 27,
226
+ "num_probe_true_positives": 176,
227
+ "num_split_features": 1
228
+ },
229
+ {
230
+ "first_letter": "z",
231
+ "mean_absorption_fraction": 0.05253394016653227,
232
+ "full_absorption_rate": 0.03829787234042553,
233
+ "num_full_absorption": 9,
234
+ "num_probe_true_positives": 235,
235
+ "num_split_features": 1
236
+ }
237
+ ],
238
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
239
+ "sae_lens_id": "custom_sae",
240
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0",
241
+ "sae_lens_version": "5.4.2",
242
+ "sae_cfg_dict": {
243
+ "model_name": "gemma-2-2b",
244
+ "d_in": 2304,
245
+ "d_sae": 65536,
246
+ "hook_layer": 12,
247
+ "hook_name": "blocks.12.hook_resid_post",
248
+ "context_size": null,
249
+ "hook_head_index": null,
250
+ "architecture": "topk",
251
+ "apply_b_dec_to_input": null,
252
+ "finetuning_scaling_factor": null,
253
+ "activation_fn_str": "",
254
+ "prepend_bos": true,
255
+ "normalize_activations": "none",
256
+ "dtype": "bfloat16",
257
+ "device": "",
258
+ "dataset_path": "",
259
+ "dataset_trust_remote_code": true,
260
+ "seqpos_slice": [
261
+ null
262
+ ],
263
+ "training_tokens": -100000,
264
+ "sae_lens_training_version": null,
265
+ "neuronpedia_id": null
266
+ },
267
+ "eval_result_unstructured": null
268
+ }
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "absorption_first_letter",
3
+ "eval_config": {
4
+ "model_name": "gemma-2-2b",
5
+ "random_seed": 42,
6
+ "f1_jump_threshold": 0.03,
7
+ "max_k_value": 10,
8
+ "prompt_template": "{word} has the first letter:",
9
+ "prompt_token_pos": -6,
10
+ "llm_batch_size": 32,
11
+ "llm_dtype": "bfloat16",
12
+ "k_sparse_probe_l1_decay": 0.01,
13
+ "k_sparse_probe_batch_size": 4096,
14
+ "k_sparse_probe_num_epochs": 50
15
+ },
16
+ "eval_id": "301cd6f2-5394-4131-b827-9d8a5bbeac04",
17
+ "datetime_epoch_millis": 1740153129410,
18
+ "eval_result_metrics": {
19
+ "mean": {
20
+ "mean_absorption_fraction_score": 0.6363149713432454,
21
+ "mean_full_absorption_score": 0.4939130321779123,
22
+ "mean_num_split_features": 3.3846153846153846,
23
+ "std_dev_absorption_fraction_score": 0.19938409617149097,
24
+ "std_dev_full_absorption_score": 0.17387178972743858,
25
+ "std_dev_num_split_features": 1.6751578570850707
26
+ }
27
+ },
28
+ "eval_result_details": [
29
+ {
30
+ "first_letter": "a",
31
+ "mean_absorption_fraction": 0.685226183958439,
32
+ "full_absorption_rate": 0.4597288676236045,
33
+ "num_full_absorption": 1153,
34
+ "num_probe_true_positives": 2508,
35
+ "num_split_features": 5
36
+ },
37
+ {
38
+ "first_letter": "b",
39
+ "mean_absorption_fraction": 0.6798069224264082,
40
+ "full_absorption_rate": 0.5012970168612192,
41
+ "num_full_absorption": 773,
42
+ "num_probe_true_positives": 1542,
43
+ "num_split_features": 4
44
+ },
45
+ {
46
+ "first_letter": "c",
47
+ "mean_absorption_fraction": 0.7703485846895846,
48
+ "full_absorption_rate": 0.6217468805704099,
49
+ "num_full_absorption": 1744,
50
+ "num_probe_true_positives": 2805,
51
+ "num_split_features": 2
52
+ },
53
+ {
54
+ "first_letter": "d",
55
+ "mean_absorption_fraction": 0.8082173823988077,
56
+ "full_absorption_rate": 0.6445783132530121,
57
+ "num_full_absorption": 1070,
58
+ "num_probe_true_positives": 1660,
59
+ "num_split_features": 3
60
+ },
61
+ {
62
+ "first_letter": "e",
63
+ "mean_absorption_fraction": 0.5316796151182803,
64
+ "full_absorption_rate": 0.49876237623762376,
65
+ "num_full_absorption": 806,
66
+ "num_probe_true_positives": 1616,
67
+ "num_split_features": 3
68
+ },
69
+ {
70
+ "first_letter": "f",
71
+ "mean_absorption_fraction": 0.6921068442218885,
72
+ "full_absorption_rate": 0.6106623586429726,
73
+ "num_full_absorption": 756,
74
+ "num_probe_true_positives": 1238,
75
+ "num_split_features": 5
76
+ },
77
+ {
78
+ "first_letter": "g",
79
+ "mean_absorption_fraction": 0.6652568101195918,
80
+ "full_absorption_rate": 0.537117903930131,
81
+ "num_full_absorption": 615,
82
+ "num_probe_true_positives": 1145,
83
+ "num_split_features": 5
84
+ },
85
+ {
86
+ "first_letter": "h",
87
+ "mean_absorption_fraction": 0.7274664367939123,
88
+ "full_absorption_rate": 0.5217391304347826,
89
+ "num_full_absorption": 540,
90
+ "num_probe_true_positives": 1035,
91
+ "num_split_features": 4
92
+ },
93
+ {
94
+ "first_letter": "i",
95
+ "mean_absorption_fraction": 0.5973165908932144,
96
+ "full_absorption_rate": 0.4896214896214896,
97
+ "num_full_absorption": 802,
98
+ "num_probe_true_positives": 1638,
99
+ "num_split_features": 3
100
+ },
101
+ {
102
+ "first_letter": "j",
103
+ "mean_absorption_fraction": 0.749237725468953,
104
+ "full_absorption_rate": 0.5072815533980582,
105
+ "num_full_absorption": 209,
106
+ "num_probe_true_positives": 412,
107
+ "num_split_features": 2
108
+ },
109
+ {
110
+ "first_letter": "k",
111
+ "mean_absorption_fraction": 0.7065567821922324,
112
+ "full_absorption_rate": 0.5955555555555555,
113
+ "num_full_absorption": 402,
114
+ "num_probe_true_positives": 675,
115
+ "num_split_features": 1
116
+ },
117
+ {
118
+ "first_letter": "l",
119
+ "mean_absorption_fraction": 0.7196233455554913,
120
+ "full_absorption_rate": 0.5355612682090831,
121
+ "num_full_absorption": 625,
122
+ "num_probe_true_positives": 1167,
123
+ "num_split_features": 4
124
+ },
125
+ {
126
+ "first_letter": "m",
127
+ "mean_absorption_fraction": 0.7175474373170088,
128
+ "full_absorption_rate": 0.4876441515650741,
129
+ "num_full_absorption": 888,
130
+ "num_probe_true_positives": 1821,
131
+ "num_split_features": 7
132
+ },
133
+ {
134
+ "first_letter": "n",
135
+ "mean_absorption_fraction": 0.7491123746163217,
136
+ "full_absorption_rate": 0.5654911838790933,
137
+ "num_full_absorption": 449,
138
+ "num_probe_true_positives": 794,
139
+ "num_split_features": 2
140
+ },
141
+ {
142
+ "first_letter": "o",
143
+ "mean_absorption_fraction": 0.5657368175863791,
144
+ "full_absorption_rate": 0.4217432052483599,
145
+ "num_full_absorption": 450,
146
+ "num_probe_true_positives": 1067,
147
+ "num_split_features": 7
148
+ },
149
+ {
150
+ "first_letter": "p",
151
+ "mean_absorption_fraction": 0.7904435804397044,
152
+ "full_absorption_rate": 0.6897458369851008,
153
+ "num_full_absorption": 1574,
154
+ "num_probe_true_positives": 2282,
155
+ "num_split_features": 3
156
+ },
157
+ {
158
+ "first_letter": "q",
159
+ "mean_absorption_fraction": 0.697611173748312,
160
+ "full_absorption_rate": 0.46842105263157896,
161
+ "num_full_absorption": 89,
162
+ "num_probe_true_positives": 190,
163
+ "num_split_features": 3
164
+ },
165
+ {
166
+ "first_letter": "r",
167
+ "mean_absorption_fraction": 0.6868794374337754,
168
+ "full_absorption_rate": 0.5955320399764844,
169
+ "num_full_absorption": 1013,
170
+ "num_probe_true_positives": 1701,
171
+ "num_split_features": 5
172
+ },
173
+ {
174
+ "first_letter": "s",
175
+ "mean_absorption_fraction": 0.8751669277159965,
176
+ "full_absorption_rate": 0.7064481653010332,
177
+ "num_full_absorption": 1983,
178
+ "num_probe_true_positives": 2807,
179
+ "num_split_features": 1
180
+ },
181
+ {
182
+ "first_letter": "t",
183
+ "mean_absorption_fraction": 0.7865680163892015,
184
+ "full_absorption_rate": 0.6070796460176991,
185
+ "num_full_absorption": 1029,
186
+ "num_probe_true_positives": 1695,
187
+ "num_split_features": 4
188
+ },
189
+ {
190
+ "first_letter": "u",
191
+ "mean_absorption_fraction": 0.5045876719987833,
192
+ "full_absorption_rate": 0.42251655629139073,
193
+ "num_full_absorption": 319,
194
+ "num_probe_true_positives": 755,
195
+ "num_split_features": 2
196
+ },
197
+ {
198
+ "first_letter": "v",
199
+ "mean_absorption_fraction": 0.573872745990794,
200
+ "full_absorption_rate": 0.47782002534854245,
201
+ "num_full_absorption": 377,
202
+ "num_probe_true_positives": 789,
203
+ "num_split_features": 4
204
+ },
205
+ {
206
+ "first_letter": "w",
207
+ "mean_absorption_fraction": 0.7610011483914709,
208
+ "full_absorption_rate": 0.6556473829201102,
209
+ "num_full_absorption": 476,
210
+ "num_probe_true_positives": 726,
211
+ "num_split_features": 4
212
+ },
213
+ {
214
+ "first_letter": "x",
215
+ "mean_absorption_fraction": 0.10157088518458829,
216
+ "full_absorption_rate": 0.035398230088495575,
217
+ "num_full_absorption": 4,
218
+ "num_probe_true_positives": 113,
219
+ "num_split_features": 1
220
+ },
221
+ {
222
+ "first_letter": "y",
223
+ "mean_absorption_fraction": 0.36839684040497356,
224
+ "full_absorption_rate": 0.14204545454545456,
225
+ "num_full_absorption": 25,
226
+ "num_probe_true_positives": 176,
227
+ "num_split_features": 3
228
+ },
229
+ {
230
+ "first_letter": "z",
231
+ "mean_absorption_fraction": 0.03285097387026813,
232
+ "full_absorption_rate": 0.0425531914893617,
233
+ "num_full_absorption": 10,
234
+ "num_probe_true_positives": 235,
235
+ "num_split_features": 1
236
+ }
237
+ ],
238
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
239
+ "sae_lens_id": "custom_sae",
240
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0",
241
+ "sae_lens_version": "5.4.2",
242
+ "sae_cfg_dict": {
243
+ "model_name": "gemma-2-2b",
244
+ "d_in": 2304,
245
+ "d_sae": 65536,
246
+ "hook_layer": 12,
247
+ "hook_name": "blocks.12.hook_resid_post",
248
+ "context_size": null,
249
+ "hook_head_index": null,
250
+ "architecture": "topk",
251
+ "apply_b_dec_to_input": null,
252
+ "finetuning_scaling_factor": null,
253
+ "activation_fn_str": "",
254
+ "prepend_bos": true,
255
+ "normalize_activations": "none",
256
+ "dtype": "bfloat16",
257
+ "device": "",
258
+ "dataset_path": "",
259
+ "dataset_trust_remote_code": true,
260
+ "seqpos_slice": [
261
+ null
262
+ ],
263
+ "training_tokens": -100000,
264
+ "sae_lens_training_version": null,
265
+ "neuronpedia_id": null
266
+ },
267
+ "eval_result_unstructured": null
268
+ }
eval_results_from_scratch/absorption/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "absorption_first_letter",
3
+ "eval_config": {
4
+ "model_name": "gemma-2-2b",
5
+ "random_seed": 42,
6
+ "f1_jump_threshold": 0.03,
7
+ "max_k_value": 10,
8
+ "prompt_template": "{word} has the first letter:",
9
+ "prompt_token_pos": -6,
10
+ "llm_batch_size": 32,
11
+ "llm_dtype": "bfloat16",
12
+ "k_sparse_probe_l1_decay": 0.01,
13
+ "k_sparse_probe_batch_size": 4096,
14
+ "k_sparse_probe_num_epochs": 50
15
+ },
16
+ "eval_id": "748a0895-c8af-4e84-ae36-2f3eb01ee78c",
17
+ "datetime_epoch_millis": 1740155568982,
18
+ "eval_result_metrics": {
19
+ "mean": {
20
+ "mean_absorption_fraction_score": 0.32544343256195074,
21
+ "mean_full_absorption_score": 0.32888806627437994,
22
+ "mean_num_split_features": 1.6538461538461537,
23
+ "std_dev_absorption_fraction_score": 0.20824169704299758,
24
+ "std_dev_full_absorption_score": 0.22060249667862672,
25
+ "std_dev_num_split_features": 1.0933364602832083
26
+ }
27
+ },
28
+ "eval_result_details": [
29
+ {
30
+ "first_letter": "a",
31
+ "mean_absorption_fraction": 0.6007277347131041,
32
+ "full_absorption_rate": 0.5251196172248804,
33
+ "num_full_absorption": 1317,
34
+ "num_probe_true_positives": 2508,
35
+ "num_split_features": 1
36
+ },
37
+ {
38
+ "first_letter": "b",
39
+ "mean_absorption_fraction": 0.29305024763176835,
40
+ "full_absorption_rate": 0.30804150453955903,
41
+ "num_full_absorption": 475,
42
+ "num_probe_true_positives": 1542,
43
+ "num_split_features": 1
44
+ },
45
+ {
46
+ "first_letter": "c",
47
+ "mean_absorption_fraction": 0.630569504589696,
48
+ "full_absorption_rate": 0.6559714795008913,
49
+ "num_full_absorption": 1840,
50
+ "num_probe_true_positives": 2805,
51
+ "num_split_features": 3
52
+ },
53
+ {
54
+ "first_letter": "d",
55
+ "mean_absorption_fraction": 0.398668071195782,
56
+ "full_absorption_rate": 0.4066265060240964,
57
+ "num_full_absorption": 675,
58
+ "num_probe_true_positives": 1660,
59
+ "num_split_features": 2
60
+ },
61
+ {
62
+ "first_letter": "e",
63
+ "mean_absorption_fraction": 0.3653745156003204,
64
+ "full_absorption_rate": 0.46410891089108913,
65
+ "num_full_absorption": 750,
66
+ "num_probe_true_positives": 1616,
67
+ "num_split_features": 3
68
+ },
69
+ {
70
+ "first_letter": "f",
71
+ "mean_absorption_fraction": 0.4766671026609379,
72
+ "full_absorption_rate": 0.5218093699515347,
73
+ "num_full_absorption": 646,
74
+ "num_probe_true_positives": 1238,
75
+ "num_split_features": 1
76
+ },
77
+ {
78
+ "first_letter": "g",
79
+ "mean_absorption_fraction": 0.20841239327824035,
80
+ "full_absorption_rate": 0.20611353711790392,
81
+ "num_full_absorption": 236,
82
+ "num_probe_true_positives": 1145,
83
+ "num_split_features": 1
84
+ },
85
+ {
86
+ "first_letter": "h",
87
+ "mean_absorption_fraction": 0.194335181953026,
88
+ "full_absorption_rate": 0.19033816425120773,
89
+ "num_full_absorption": 197,
90
+ "num_probe_true_positives": 1035,
91
+ "num_split_features": 1
92
+ },
93
+ {
94
+ "first_letter": "i",
95
+ "mean_absorption_fraction": 0.498854509106931,
96
+ "full_absorption_rate": 0.5384615384615384,
97
+ "num_full_absorption": 882,
98
+ "num_probe_true_positives": 1638,
99
+ "num_split_features": 3
100
+ },
101
+ {
102
+ "first_letter": "j",
103
+ "mean_absorption_fraction": 0.00809270254817472,
104
+ "full_absorption_rate": 0.012135922330097087,
105
+ "num_full_absorption": 5,
106
+ "num_probe_true_positives": 412,
107
+ "num_split_features": 1
108
+ },
109
+ {
110
+ "first_letter": "k",
111
+ "mean_absorption_fraction": 0.027364220505435494,
112
+ "full_absorption_rate": 0.022222222222222223,
113
+ "num_full_absorption": 15,
114
+ "num_probe_true_positives": 675,
115
+ "num_split_features": 1
116
+ },
117
+ {
118
+ "first_letter": "l",
119
+ "mean_absorption_fraction": 0.46655821897414346,
120
+ "full_absorption_rate": 0.46786632390745503,
121
+ "num_full_absorption": 546,
122
+ "num_probe_true_positives": 1167,
123
+ "num_split_features": 1
124
+ },
125
+ {
126
+ "first_letter": "m",
127
+ "mean_absorption_fraction": 0.48562595745684123,
128
+ "full_absorption_rate": 0.5332235035694673,
129
+ "num_full_absorption": 971,
130
+ "num_probe_true_positives": 1821,
131
+ "num_split_features": 2
132
+ },
133
+ {
134
+ "first_letter": "n",
135
+ "mean_absorption_fraction": 0.36360093234914864,
136
+ "full_absorption_rate": 0.3211586901763224,
137
+ "num_full_absorption": 255,
138
+ "num_probe_true_positives": 794,
139
+ "num_split_features": 4
140
+ },
141
+ {
142
+ "first_letter": "o",
143
+ "mean_absorption_fraction": 0.3188066412402324,
144
+ "full_absorption_rate": 0.3786316776007498,
145
+ "num_full_absorption": 404,
146
+ "num_probe_true_positives": 1067,
147
+ "num_split_features": 1
148
+ },
149
+ {
150
+ "first_letter": "p",
151
+ "mean_absorption_fraction": 0.6813615753502903,
152
+ "full_absorption_rate": 0.6919368974583698,
153
+ "num_full_absorption": 1579,
154
+ "num_probe_true_positives": 2282,
155
+ "num_split_features": 2
156
+ },
157
+ {
158
+ "first_letter": "q",
159
+ "mean_absorption_fraction": 0.030652585242806108,
160
+ "full_absorption_rate": 0.03684210526315789,
161
+ "num_full_absorption": 7,
162
+ "num_probe_true_positives": 190,
163
+ "num_split_features": 1
164
+ },
165
+ {
166
+ "first_letter": "r",
167
+ "mean_absorption_fraction": 0.4850781646941056,
168
+ "full_absorption_rate": 0.5202821869488536,
169
+ "num_full_absorption": 885,
170
+ "num_probe_true_positives": 1701,
171
+ "num_split_features": 1
172
+ },
173
+ {
174
+ "first_letter": "s",
175
+ "mean_absorption_fraction": 0.6328351318421925,
176
+ "full_absorption_rate": 0.6074100463127895,
177
+ "num_full_absorption": 1705,
178
+ "num_probe_true_positives": 2807,
179
+ "num_split_features": 5
180
+ },
181
+ {
182
+ "first_letter": "t",
183
+ "mean_absorption_fraction": 0.3754464221045167,
184
+ "full_absorption_rate": 0.31504424778761064,
185
+ "num_full_absorption": 534,
186
+ "num_probe_true_positives": 1695,
187
+ "num_split_features": 1
188
+ },
189
+ {
190
+ "first_letter": "u",
191
+ "mean_absorption_fraction": 0.33574647694082693,
192
+ "full_absorption_rate": 0.41456953642384103,
193
+ "num_full_absorption": 313,
194
+ "num_probe_true_positives": 755,
195
+ "num_split_features": 2
196
+ },
197
+ {
198
+ "first_letter": "v",
199
+ "mean_absorption_fraction": 0.07488270366700864,
200
+ "full_absorption_rate": 0.08745247148288973,
201
+ "num_full_absorption": 69,
202
+ "num_probe_true_positives": 789,
203
+ "num_split_features": 1
204
+ },
205
+ {
206
+ "first_letter": "w",
207
+ "mean_absorption_fraction": 0.24404390408289825,
208
+ "full_absorption_rate": 0.2327823691460055,
209
+ "num_full_absorption": 169,
210
+ "num_probe_true_positives": 726,
211
+ "num_split_features": 1
212
+ },
213
+ {
214
+ "first_letter": "x",
215
+ "mean_absorption_fraction": 0.17461273099505284,
216
+ "full_absorption_rate": 0.017699115044247787,
217
+ "num_full_absorption": 2,
218
+ "num_probe_true_positives": 113,
219
+ "num_split_features": 1
220
+ },
221
+ {
222
+ "first_letter": "y",
223
+ "mean_absorption_fraction": 0.07002365476192687,
224
+ "full_absorption_rate": 0.045454545454545456,
225
+ "num_full_absorption": 8,
226
+ "num_probe_true_positives": 176,
227
+ "num_split_features": 1
228
+ },
229
+ {
230
+ "first_letter": "z",
231
+ "mean_absorption_fraction": 0.020137963125312984,
232
+ "full_absorption_rate": 0.029787234042553193,
233
+ "num_full_absorption": 7,
234
+ "num_probe_true_positives": 235,
235
+ "num_split_features": 1
236
+ }
237
+ ],
238
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
239
+ "sae_lens_id": "custom_sae",
240
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0",
241
+ "sae_lens_version": "5.4.2",
242
+ "sae_cfg_dict": {
243
+ "model_name": "gemma-2-2b",
244
+ "d_in": 2304,
245
+ "d_sae": 65536,
246
+ "hook_layer": 12,
247
+ "hook_name": "blocks.12.hook_resid_post",
248
+ "context_size": null,
249
+ "hook_head_index": null,
250
+ "architecture": "topk",
251
+ "apply_b_dec_to_input": null,
252
+ "finetuning_scaling_factor": null,
253
+ "activation_fn_str": "",
254
+ "prepend_bos": true,
255
+ "normalize_activations": "none",
256
+ "dtype": "bfloat16",
257
+ "device": "",
258
+ "dataset_path": "",
259
+ "dataset_trust_remote_code": true,
260
+ "seqpos_slice": [
261
+ null
262
+ ],
263
+ "training_tokens": -100000,
264
+ "sae_lens_training_version": null,
265
+ "neuronpedia_id": null
266
+ },
267
+ "eval_result_unstructured": null
268
+ }
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ade8835091139a702090a82b583edafc17952df258ec2094ccae8475ffb0edd3
3
+ size 26038784
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ed9ba6ea6fbc0311812466993b7423a455115141b96aa55d92b5ffd60a716e5
3
+ size 26004823
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0e5eeaf5313765b9d51b5b018a3a14d86d0f405351fba34e5b131f893c59124
3
+ size 25526733
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:488c950c80e13463424f371370dba26e2461c059d86c19fbca50395bdf88341f
3
+ size 25600864
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:536ed4bc61cb843c976114f9a3c8bbe2767dc9126eff5af03e4263d82fbbdb63
3
+ size 25576485
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b5c2e8a088ce126cb3328b7b53055a573c73a6e3025fdbfa16eadebb694b70a
3
+ size 25376156
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91176fdc720c6a207b11313f9efbd110ecbe9b42e1d31f237d5c567195c4ca73
3
+ size 25873513
eval_results_from_scratch/autointerp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e4b4b5355f82658f66abf51ca1610565246c09085f9666cf6ba0497eaf78ed8
3
+ size 25938312
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:655ab0b13ade7b4ab554c5c3eec0134aa39dd1a462045bf320a8ae2d72d47c6e
3
+ size 21672734
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e4fa8826a42d156a4678073845c4b6c449760fa1a0e0b66b2d370935f2417fa
3
+ size 21773774
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b16f7aadd97950eb3f710d2dd1293998ff117f83c436ab19489a9f063abec1fe
3
+ size 21266212
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e1ee90bbe960ff9c633099c66a1c22fcbed7fe1ced711815c6035b4615010a2
3
+ size 21208670
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ea6375fb3f724b9d9f35c8f9da86a56cb07ffe09b252eebd68063e5af0549f9
3
+ size 21505205
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45fc340a10fb487df31d251f7b80d4d56629b38ccc606ce074441c8e2993478a
3
+ size 21585654
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1da2c27c4aed20429fa10b59ce05e61b689a887f52c352891e4ee7fa112da3a5
3
+ size 21630563
eval_results_from_scratch/core/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bcac9aa9a10fc5e629bd04219f06be94fc721bec2c03f09d8f48ffc4cbeb469c
3
+ size 21697035
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "scr",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": true,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "76e1c3b5-c26c-4837-91c8-6d45919b460c",
73
+ "datetime_epoch_millis": 1740160660875,
74
+ "eval_result_metrics": {
75
+ "scr_metrics": {
76
+ "scr_dir1_threshold_2": 0.17937598651986972,
77
+ "scr_metric_threshold_2": 0.06750879575859692,
78
+ "scr_dir2_threshold_2": 0.07494545276237846,
79
+ "scr_dir1_threshold_5": 0.22402296524662565,
80
+ "scr_metric_threshold_5": 0.11204012359707471,
81
+ "scr_dir2_threshold_5": 0.11697061811839651,
82
+ "scr_dir1_threshold_10": 0.2336172994227871,
83
+ "scr_metric_threshold_10": 0.13293663214601245,
84
+ "scr_dir2_threshold_10": 0.1427465534868288,
85
+ "scr_dir1_threshold_20": 0.2505843441914149,
86
+ "scr_metric_threshold_20": 0.16341666656794324,
87
+ "scr_dir2_threshold_20": 0.17208719496326916,
88
+ "scr_dir1_threshold_50": 0.27174068905788434,
89
+ "scr_metric_threshold_50": 0.21225175803186222,
90
+ "scr_dir2_threshold_50": 0.2175195174361239,
91
+ "scr_dir1_threshold_100": 0.2464209622824807,
92
+ "scr_metric_threshold_100": 0.22702989060166226,
93
+ "scr_dir2_threshold_100": 0.23717704484868324,
94
+ "scr_dir1_threshold_500": 0.2697303489398643,
95
+ "scr_metric_threshold_500": 0.22331210595096943,
96
+ "scr_dir2_threshold_500": 0.23536249992024194
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
102
+ "scr_dir1_threshold_2": 0.47619029597980633,
103
+ "scr_metric_threshold_2": 0.004914087313907592,
104
+ "scr_dir2_threshold_2": 0.004914087313907592,
105
+ "scr_dir1_threshold_5": 0.555555345309774,
106
+ "scr_metric_threshold_5": 0.012284998611665989,
107
+ "scr_dir2_threshold_5": 0.012284998611665989,
108
+ "scr_dir1_threshold_10": 0.555555345309774,
109
+ "scr_metric_threshold_10": 0.027027114104653437,
110
+ "scr_dir2_threshold_10": 0.027027114104653437,
111
+ "scr_dir1_threshold_20": 0.5238097040201937,
112
+ "scr_metric_threshold_20": 0.036855142283733294,
113
+ "scr_dir2_threshold_20": 0.036855142283733294,
114
+ "scr_dir1_threshold_50": 0.333333017964661,
115
+ "scr_metric_threshold_50": 0.06633908037223754,
116
+ "scr_dir2_threshold_50": 0.06633908037223754,
117
+ "scr_dir1_threshold_100": 0.31746019731987085,
118
+ "scr_metric_threshold_100": 0.10565119308855696,
119
+ "scr_dir2_threshold_100": 0.10565119308855696,
120
+ "scr_dir1_threshold_500": 0.2698407892794835,
121
+ "scr_metric_threshold_500": 0.23587223695171058,
122
+ "scr_dir2_threshold_500": 0.23587223695171058
123
+ },
124
+ {
125
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
126
+ "scr_dir1_threshold_2": 0.1313126752015721,
127
+ "scr_metric_threshold_2": 0.08498575820403938,
128
+ "scr_dir2_threshold_2": 0.08498575820403938,
129
+ "scr_dir1_threshold_5": 0.16161592443815084,
130
+ "scr_metric_threshold_5": 0.1161473379602074,
131
+ "scr_dir2_threshold_5": 0.1161473379602074,
132
+ "scr_dir1_threshold_10": 0.17171680682792437,
133
+ "scr_metric_threshold_10": 0.15014157370114636,
134
+ "scr_dir2_threshold_10": 0.15014157370114636,
135
+ "scr_dir1_threshold_20": 0.17171680682792437,
136
+ "scr_metric_threshold_20": 0.17847032862085146,
137
+ "scr_dir2_threshold_20": 0.17847032862085146,
138
+ "scr_dir1_threshold_50": 0.3737374649596856,
139
+ "scr_metric_threshold_50": 0.2096317395253275,
140
+ "scr_dir2_threshold_50": 0.2096317395253275,
141
+ "scr_dir1_threshold_100": 0.11111091042202506,
142
+ "scr_metric_threshold_100": 0.24362614411795844,
143
+ "scr_dir2_threshold_100": 0.24362614411795844,
144
+ "scr_dir1_threshold_500": 0.03030264716932058,
145
+ "scr_metric_threshold_500": 0.07082146517003281,
146
+ "scr_dir2_threshold_500": 0.07082146517003281
147
+ },
148
+ {
149
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
150
+ "scr_dir1_threshold_2": 0.516128349998606,
151
+ "scr_metric_threshold_2": 0.007575810028571093,
152
+ "scr_dir2_threshold_2": 0.007575810028571093,
153
+ "scr_dir1_threshold_5": 0.5483869727270193,
154
+ "scr_metric_threshold_5": 0.022727279568944055,
155
+ "scr_dir2_threshold_5": 0.022727279568944055,
156
+ "scr_dir1_threshold_10": 0.516128349998606,
157
+ "scr_metric_threshold_10": 0.03535352927204973,
158
+ "scr_dir2_threshold_10": 0.03535352927204973,
159
+ "scr_dir1_threshold_20": 0.48387068863579336,
160
+ "scr_metric_threshold_20": 0.07070705854409946,
161
+ "scr_dir2_threshold_20": 0.07070705854409946,
162
+ "scr_dir1_threshold_50": 0.46774185795438705,
163
+ "scr_metric_threshold_50": 0.1010101481416146,
164
+ "scr_dir2_threshold_50": 0.1010101481416146,
165
+ "scr_dir1_threshold_100": 0.46774185795438705,
166
+ "scr_metric_threshold_100": 0.17171720668571405,
167
+ "scr_dir2_threshold_100": 0.17171720668571405,
168
+ "scr_dir1_threshold_500": 0.40322557386316116,
169
+ "scr_metric_threshold_500": -0.012626249703105673,
170
+ "scr_dir2_threshold_500": -0.012626249703105673
171
+ },
172
+ {
173
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
174
+ "scr_dir1_threshold_2": 0.1707318964255635,
175
+ "scr_metric_threshold_2": 0.03519051639302713,
176
+ "scr_dir2_threshold_2": 0.03519051639302713,
177
+ "scr_dir1_threshold_5": 0.2845529991791037,
178
+ "scr_metric_threshold_5": 0.07331372148022297,
179
+ "scr_dir2_threshold_5": 0.07331372148022297,
180
+ "scr_dir1_threshold_10": 0.15447146198048264,
181
+ "scr_metric_threshold_10": 0.10263921007231215,
182
+ "scr_dir2_threshold_10": 0.10263921007231215,
183
+ "scr_dir1_threshold_20": 0.16260192149827632,
184
+ "scr_metric_threshold_20": 0.11730195436835675,
185
+ "scr_dir2_threshold_20": 0.11730195436835675,
186
+ "scr_dir1_threshold_50": 0.12195107768082737,
187
+ "scr_metric_threshold_50": 0.17302041765206616,
188
+ "scr_dir2_threshold_50": 0.17302041765206616,
189
+ "scr_dir1_threshold_100": 0.1707318964255635,
190
+ "scr_metric_threshold_100": -0.005865202594637679,
191
+ "scr_dir2_threshold_100": -0.005865202594637679,
192
+ "scr_dir1_threshold_500": 0.5447155889858393,
193
+ "scr_metric_threshold_500": -0.07624641017439167,
194
+ "scr_dir2_threshold_500": -0.07624641017439167
195
+ },
196
+ {
197
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
198
+ "scr_dir1_threshold_2": 0.00546440968122594,
199
+ "scr_metric_threshold_2": 0.21875017462295412,
200
+ "scr_dir2_threshold_2": 0.21875017462295412,
201
+ "scr_dir1_threshold_5": 0.0,
202
+ "scr_metric_threshold_5": 0.371093800931695,
203
+ "scr_dir2_threshold_5": 0.371093800931695,
204
+ "scr_dir1_threshold_10": 0.05464474822904205,
205
+ "scr_metric_threshold_10": 0.417968888243172,
206
+ "scr_dir2_threshold_10": 0.417968888243172,
207
+ "scr_dir1_threshold_20": 0.10382508677685816,
208
+ "scr_metric_threshold_20": 0.4531251455191284,
209
+ "scr_dir2_threshold_20": 0.4531251455191284,
210
+ "scr_dir1_threshold_50": 0.1803277994391953,
211
+ "scr_metric_threshold_50": 0.5156250291038257,
212
+ "scr_dir2_threshold_50": 0.5156250291038257,
213
+ "scr_dir1_threshold_100": 0.10382508677685816,
214
+ "scr_metric_threshold_100": 0.5625001164153027,
215
+ "scr_dir2_threshold_100": 0.5625001164153027,
216
+ "scr_dir1_threshold_500": -0.06557389329988525,
217
+ "scr_metric_threshold_500": 0.6054687718278693,
218
+ "scr_dir2_threshold_500": 0.6054687718278693
219
+ },
220
+ {
221
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
222
+ "scr_dir1_threshold_2": 0.046153559299134936,
223
+ "scr_metric_threshold_2": 0.06451625307937271,
224
+ "scr_dir2_threshold_2": 0.06451625307937271,
225
+ "scr_dir1_threshold_5": 0.08717928497042846,
226
+ "scr_metric_threshold_5": 0.08870972781349516,
227
+ "scr_dir2_threshold_5": 0.08870972781349516,
228
+ "scr_dir1_threshold_10": 0.15384601277006638,
229
+ "scr_metric_threshold_10": 0.1008065853511987,
230
+ "scr_dir2_threshold_10": 0.1008065853511987,
231
+ "scr_dir1_threshold_20": 0.158974152062764,
232
+ "scr_metric_threshold_20": 0.12500006008532116,
233
+ "scr_dir2_threshold_20": 0.12500006008532116,
234
+ "scr_dir1_threshold_50": 0.20512801702675515,
235
+ "scr_metric_threshold_50": 0.20161293036111277,
236
+ "scr_dir2_threshold_50": 0.20161293036111277,
237
+ "scr_dir1_threshold_100": 0.24615374269804866,
238
+ "scr_metric_threshold_100": 0.29032265817460795,
239
+ "scr_dir2_threshold_100": 0.29032265817460795,
240
+ "scr_dir1_threshold_500": 0.35384589050412385,
241
+ "scr_metric_threshold_500": 0.3991936549900859,
242
+ "scr_dir2_threshold_500": 0.3991936549900859
243
+ },
244
+ {
245
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
246
+ "scr_dir1_threshold_2": 0.06756750225933819,
247
+ "scr_metric_threshold_2": 0.10267856311319266,
248
+ "scr_dir2_threshold_2": 0.10267856311319266,
249
+ "scr_dir1_threshold_5": 0.09909916440732847,
250
+ "scr_metric_threshold_5": 0.15625009146916644,
251
+ "scr_dir2_threshold_5": 0.15625009146916644,
252
+ "scr_dir1_threshold_10": 0.19819806032526952,
253
+ "scr_metric_threshold_10": 0.16517854648243513,
254
+ "scr_dir2_threshold_10": 0.16517854648243513,
255
+ "scr_dir1_threshold_20": 0.2882883318271079,
256
+ "scr_metric_threshold_20": 0.214285581239654,
257
+ "scr_dir2_threshold_20": 0.214285581239654,
258
+ "scr_dir1_threshold_50": 0.3243244404278432,
259
+ "scr_metric_threshold_50": 0.2633928820889934,
260
+ "scr_dir2_threshold_50": 0.2633928820889934,
261
+ "scr_dir1_threshold_100": 0.3783783348395588,
262
+ "scr_metric_threshold_100": 0.27232133710226214,
263
+ "scr_dir2_threshold_100": 0.27232133710226214,
264
+ "scr_dir1_threshold_500": 0.43693694419340684,
265
+ "scr_metric_threshold_500": 0.37946412772208915,
266
+ "scr_dir2_threshold_500": 0.37946412772208915
267
+ },
268
+ {
269
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
270
+ "scr_dir1_threshold_2": 0.021459203313710703,
271
+ "scr_metric_threshold_2": 0.021459203313710703,
272
+ "scr_dir2_threshold_2": 0.08095245934396302,
273
+ "scr_dir1_threshold_5": 0.05579403094120067,
274
+ "scr_metric_threshold_5": 0.05579403094120067,
275
+ "scr_dir2_threshold_5": 0.09523798711177515,
276
+ "scr_dir1_threshold_10": 0.0643776099411321,
277
+ "scr_metric_threshold_10": 0.0643776099411321,
278
+ "scr_dir2_threshold_10": 0.14285698066766273,
279
+ "scr_dir1_threshold_20": 0.11158806188240133,
280
+ "scr_metric_threshold_20": 0.11158806188240133,
281
+ "scr_dir2_threshold_20": 0.18095228904500887,
282
+ "scr_dir1_threshold_50": 0.1673818370097199,
283
+ "scr_metric_threshold_50": 0.1673818370097199,
284
+ "scr_dir2_threshold_50": 0.20952391224381361,
285
+ "scr_dir1_threshold_100": 0.17596567182353345,
286
+ "scr_metric_threshold_100": 0.17596567182353345,
287
+ "scr_dir2_threshold_100": 0.25714290579970117,
288
+ "scr_dir1_threshold_500": 0.18454925082346488,
289
+ "scr_metric_threshold_500": 0.18454925082346488,
290
+ "scr_dir2_threshold_500": 0.280952402577645
291
+ }
292
+ ],
293
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
294
+ "sae_lens_id": "custom_sae",
295
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0",
296
+ "sae_lens_version": "5.4.2",
297
+ "sae_cfg_dict": {
298
+ "model_name": "gemma-2-2b",
299
+ "d_in": 2304,
300
+ "d_sae": 65536,
301
+ "hook_layer": 12,
302
+ "hook_name": "blocks.12.hook_resid_post",
303
+ "context_size": null,
304
+ "hook_head_index": null,
305
+ "architecture": "topk",
306
+ "apply_b_dec_to_input": null,
307
+ "finetuning_scaling_factor": null,
308
+ "activation_fn_str": "",
309
+ "prepend_bos": true,
310
+ "normalize_activations": "none",
311
+ "dtype": "bfloat16",
312
+ "device": "",
313
+ "dataset_path": "",
314
+ "dataset_trust_remote_code": true,
315
+ "seqpos_slice": [
316
+ null
317
+ ],
318
+ "training_tokens": -100000,
319
+ "sae_lens_training_version": null,
320
+ "neuronpedia_id": null
321
+ },
322
+ "eval_result_unstructured": null
323
+ }
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "scr",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": true,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "e41564f8-711c-402f-be1a-c2c1e1a26d9b",
73
+ "datetime_epoch_millis": 1740159778191,
74
+ "eval_result_metrics": {
75
+ "scr_metrics": {
76
+ "scr_dir1_threshold_2": 0.21646779635345523,
77
+ "scr_metric_threshold_2": 0.10981927993643228,
78
+ "scr_dir2_threshold_2": 0.11159734942875701,
79
+ "scr_dir1_threshold_5": 0.2386560380069923,
80
+ "scr_metric_threshold_5": 0.15915722723436423,
81
+ "scr_dir2_threshold_5": 0.1647417441944568,
82
+ "scr_dir1_threshold_10": 0.2763340942473081,
83
+ "scr_metric_threshold_10": 0.20696168095936596,
84
+ "scr_dir2_threshold_10": 0.20938604847717682,
85
+ "scr_dir1_threshold_20": 0.29245964775791145,
86
+ "scr_metric_threshold_20": 0.2572680056753961,
87
+ "scr_dir2_threshold_20": 0.2562308120870308,
88
+ "scr_dir1_threshold_50": 0.2917372572864733,
89
+ "scr_metric_threshold_50": 0.35979093988002914,
90
+ "scr_dir2_threshold_50": 0.3474876389705775,
91
+ "scr_dir1_threshold_100": 0.22457255710535257,
92
+ "scr_metric_threshold_100": 0.4050724113957374,
93
+ "scr_dir2_threshold_100": 0.40371842827480603,
94
+ "scr_dir1_threshold_500": 0.06777817985891049,
95
+ "scr_metric_threshold_500": 0.31473923877780835,
96
+ "scr_dir2_threshold_500": 0.30271441063618476
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
102
+ "scr_dir1_threshold_2": 0.5079359372693867,
103
+ "scr_metric_threshold_2": 0.007371057746493725,
104
+ "scr_dir2_threshold_2": 0.007371057746493725,
105
+ "scr_dir1_threshold_5": 0.5396825246649839,
106
+ "scr_metric_threshold_5": 0.022113026790745845,
107
+ "scr_dir2_threshold_5": 0.022113026790745845,
108
+ "scr_dir1_threshold_10": 0.6031747533501614,
109
+ "scr_metric_threshold_10": 0.022113026790745845,
110
+ "scr_dir2_threshold_10": 0.022113026790745845,
111
+ "scr_dir1_threshold_20": 0.5873019327053712,
112
+ "scr_metric_threshold_20": 0.05651105219315768,
113
+ "scr_dir2_threshold_20": 0.05651105219315768,
114
+ "scr_dir1_threshold_50": 0.5714281659545642,
115
+ "scr_metric_threshold_50": 0.13513513117706122,
116
+ "scr_dir2_threshold_50": 0.13513513117706122,
117
+ "scr_dir1_threshold_100": 0.5238097040201937,
118
+ "scr_metric_threshold_100": 0.15479118753522092,
119
+ "scr_dir2_threshold_100": 0.15479118753522092,
120
+ "scr_dir1_threshold_500": 0.42857088793941894,
121
+ "scr_metric_threshold_500": -0.004913940865172265,
122
+ "scr_dir2_threshold_500": -0.004913940865172265
123
+ },
124
+ {
125
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
126
+ "scr_dir1_threshold_2": 0.2121209384542766,
127
+ "scr_metric_threshold_2": 0.0934844015651201,
128
+ "scr_dir2_threshold_2": 0.0934844015651201,
129
+ "scr_dir1_threshold_5": 0.25252507008062886,
130
+ "scr_metric_threshold_5": 0.15014157370114636,
131
+ "scr_dir2_threshold_5": 0.15014157370114636,
132
+ "scr_dir1_threshold_10": 0.31313096648652816,
133
+ "scr_metric_threshold_10": 0.20679891468886458,
134
+ "scr_dir2_threshold_10": 0.20679891468886458,
135
+ "scr_dir1_threshold_20": 0.3333333333333333,
136
+ "scr_metric_threshold_20": 0.23796032559334063,
137
+ "scr_dir2_threshold_20": 0.23796032559334063,
138
+ "scr_dir1_threshold_50": 0.15151504204837732,
139
+ "scr_metric_threshold_50": 0.3597733132264739,
140
+ "scr_dir2_threshold_50": 0.3597733132264739,
141
+ "scr_dir1_threshold_100": 0.11111091042202506,
142
+ "scr_metric_threshold_100": 0.43909342175758737,
143
+ "scr_dir2_threshold_100": 0.43909342175758737,
144
+ "scr_dir1_threshold_500": -0.8484855600188809,
145
+ "scr_metric_threshold_500": 0.10481586976266374,
146
+ "scr_dir2_threshold_500": 0.10481586976266374
147
+ },
148
+ {
149
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
150
+ "scr_dir1_threshold_2": 0.532258142045613,
151
+ "scr_metric_threshold_2": 0.017676839894409477,
152
+ "scr_dir2_threshold_2": 0.017676839894409477,
153
+ "scr_dir1_threshold_5": 0.5483869727270193,
154
+ "scr_metric_threshold_5": 0.03535352927204973,
155
+ "scr_dir2_threshold_5": 0.03535352927204973,
156
+ "scr_dir1_threshold_10": 0.5806446340898319,
157
+ "scr_metric_threshold_10": 0.06565661886956488,
158
+ "scr_dir2_threshold_10": 0.06565661886956488,
159
+ "scr_dir1_threshold_20": 0.48387068863579336,
160
+ "scr_metric_threshold_20": 0.12121220787329137,
161
+ "scr_dir2_threshold_20": 0.12121220787329137,
162
+ "scr_dir1_threshold_50": 0.532258142045613,
163
+ "scr_metric_threshold_50": 0.24242426522981353,
164
+ "scr_dir2_threshold_50": 0.24242426522981353,
165
+ "scr_dir1_threshold_100": 0.46774185795438705,
166
+ "scr_metric_threshold_100": 0.31060610393664567,
167
+ "scr_dir2_threshold_100": 0.31060610393664567,
168
+ "scr_dir1_threshold_500": 0.03225766136281265,
169
+ "scr_metric_threshold_500": 0.09090911827577622,
170
+ "scr_dir2_threshold_500": 0.09090911827577622
171
+ },
172
+ {
173
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
174
+ "scr_dir1_threshold_2": 0.2926829741063909,
175
+ "scr_metric_threshold_2": 0.0791787492811609,
176
+ "scr_dir2_threshold_2": 0.0791787492811609,
177
+ "scr_dir1_threshold_5": 0.2764230242518165,
178
+ "scr_metric_threshold_5": 0.15835767335602155,
179
+ "scr_dir2_threshold_5": 0.15835767335602155,
180
+ "scr_dir1_threshold_10": 0.2764230242518165,
181
+ "scr_metric_threshold_10": 0.21407613663973096,
182
+ "scr_dir2_threshold_10": 0.21407613663973096,
183
+ "scr_dir1_threshold_20": 0.1707318964255635,
184
+ "scr_metric_threshold_20": 0.28739003291365367,
185
+ "scr_dir2_threshold_20": 0.28739003291365367,
186
+ "scr_dir1_threshold_50": -0.06504076859931053,
187
+ "scr_metric_threshold_50": 0.39296193168013455,
188
+ "scr_dir2_threshold_50": 0.39296193168013455,
189
+ "scr_dir1_threshold_100": 0.032520384299655265,
190
+ "scr_metric_threshold_100": 0.4516129088643129,
191
+ "scr_dir2_threshold_100": 0.4516129088643129,
192
+ "scr_dir1_threshold_500": -0.22764220550708036,
193
+ "scr_metric_threshold_500": 0.09677418227137422,
194
+ "scr_dir2_threshold_500": 0.09677418227137422
195
+ },
196
+ {
197
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
198
+ "scr_dir1_threshold_2": 0.021857964433295084,
199
+ "scr_metric_threshold_2": 0.3671876018633899,
200
+ "scr_dir2_threshold_2": 0.3671876018633899,
201
+ "scr_dir1_threshold_5": 0.00546440968122594,
202
+ "scr_metric_threshold_5": 0.43750011641530273,
203
+ "scr_dir2_threshold_5": 0.43750011641530273,
204
+ "scr_dir1_threshold_10": 0.04918033854781611,
205
+ "scr_metric_threshold_10": 0.5078126309672156,
206
+ "scr_dir2_threshold_10": 0.5078126309672156,
207
+ "scr_dir1_threshold_20": 0.1803277994391953,
208
+ "scr_metric_threshold_20": 0.5664063154836078,
209
+ "scr_dir2_threshold_20": 0.5664063154836078,
210
+ "scr_dir1_threshold_50": 0.1912569445100385,
211
+ "scr_metric_threshold_50": 0.6562500582076514,
212
+ "scr_dir2_threshold_50": 0.6562500582076514,
213
+ "scr_dir1_threshold_100": -0.1092894964580841,
214
+ "scr_metric_threshold_100": 0.63281239813661,
215
+ "scr_dir2_threshold_100": 0.63281239813661,
216
+ "scr_dir1_threshold_500": -0.09289626741440628,
217
+ "scr_metric_threshold_500": 0.7812500582076514,
218
+ "scr_dir2_threshold_500": 0.7812500582076514
219
+ },
220
+ {
221
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
222
+ "scr_dir1_threshold_2": 0.07692300638503319,
223
+ "scr_metric_threshold_2": 0.056451601273808806,
224
+ "scr_dir2_threshold_2": 0.056451601273808806,
225
+ "scr_dir1_threshold_5": 0.12307687134902433,
226
+ "scr_metric_threshold_5": 0.0927419335456348,
227
+ "scr_dir2_threshold_5": 0.0927419335456348,
228
+ "scr_dir1_threshold_10": 0.15384601277006638,
229
+ "scr_metric_threshold_10": 0.12500006008532116,
230
+ "scr_dir2_threshold_10": 0.12500006008532116,
231
+ "scr_dir1_threshold_20": 0.23076901915509954,
232
+ "scr_metric_threshold_20": 0.16935480382142643,
233
+ "scr_dir2_threshold_20": 0.16935480382142643,
234
+ "scr_dir1_threshold_50": 0.3333333333333333,
235
+ "scr_metric_threshold_50": 0.3064517214444511,
236
+ "scr_dir2_threshold_50": 0.3064517214444511,
237
+ "scr_dir1_threshold_100": 0.36410247475437535,
238
+ "scr_metric_threshold_100": 0.39112900318452204,
239
+ "scr_dir2_threshold_100": 0.39112900318452204,
240
+ "scr_dir1_threshold_500": 0.45641020468235766,
241
+ "scr_metric_threshold_500": 0.43145154118848766,
242
+ "scr_dir2_threshold_500": 0.43145154118848766
243
+ },
244
+ {
245
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
246
+ "scr_dir1_threshold_2": 0.045045001506225466,
247
+ "scr_metric_threshold_2": 0.214285581239654,
248
+ "scr_dir2_threshold_2": 0.214285581239654,
249
+ "scr_dir1_threshold_5": 0.10360361086007354,
250
+ "scr_metric_threshold_5": 0.31696414435284664,
251
+ "scr_dir2_threshold_5": 0.31696414435284664,
252
+ "scr_dir1_threshold_10": 0.14414416591355395,
253
+ "scr_metric_threshold_10": 0.4241072010647942,
254
+ "scr_dir2_threshold_10": 0.4241072010647942,
255
+ "scr_dir1_threshold_20": 0.21171166817289214,
256
+ "scr_metric_threshold_20": 0.47767846332864744,
257
+ "scr_dir2_threshold_20": 0.47767846332864744,
258
+ "scr_dir1_threshold_50": 0.38738749623443636,
259
+ "scr_metric_threshold_50": 0.5535713953099135,
260
+ "scr_dir2_threshold_50": 0.5535713953099135,
261
+ "scr_dir1_threshold_100": 0.15765750527178912,
262
+ "scr_metric_threshold_100": 0.6116071511725216,
263
+ "scr_dir2_threshold_100": 0.6116071511725216,
264
+ "scr_dir1_threshold_500": 0.4549549984937745,
265
+ "scr_metric_threshold_500": 0.6785713620483984,
266
+ "scr_dir2_threshold_500": 0.6785713620483984
267
+ },
268
+ {
269
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
270
+ "scr_dir1_threshold_2": 0.042918406627421406,
271
+ "scr_metric_threshold_2": 0.042918406627421406,
272
+ "scr_dir2_threshold_2": 0.05714296256601923,
273
+ "scr_dir1_threshold_5": 0.060085820441166386,
274
+ "scr_metric_threshold_5": 0.060085820441166386,
275
+ "scr_dir2_threshold_5": 0.10476195612190681,
276
+ "scr_dir1_threshold_10": 0.09012885856869063,
277
+ "scr_metric_threshold_10": 0.09012885856869063,
278
+ "scr_dir2_threshold_10": 0.10952379871117751,
279
+ "scr_dir1_threshold_20": 0.1416308441960435,
280
+ "scr_metric_threshold_20": 0.1416308441960435,
281
+ "scr_dir2_threshold_20": 0.1333332954891213,
282
+ "scr_dir1_threshold_50": 0.23175970276473412,
283
+ "scr_metric_threshold_50": 0.23175970276473412,
284
+ "scr_dir2_threshold_50": 0.1333332954891213,
285
+ "scr_dir1_threshold_100": 0.2489271165784791,
286
+ "scr_metric_threshold_100": 0.2489271165784791,
287
+ "scr_dir2_threshold_100": 0.2380952516110281,
288
+ "scr_dir1_threshold_500": 0.33905571933328765,
289
+ "scr_metric_threshold_500": 0.33905571933328765,
290
+ "scr_dir2_threshold_500": 0.24285709420029883
291
+ }
292
+ ],
293
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
294
+ "sae_lens_id": "custom_sae",
295
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0",
296
+ "sae_lens_version": "5.4.2",
297
+ "sae_cfg_dict": {
298
+ "model_name": "gemma-2-2b",
299
+ "d_in": 2304,
300
+ "d_sae": 65536,
301
+ "hook_layer": 12,
302
+ "hook_name": "blocks.12.hook_resid_post",
303
+ "context_size": null,
304
+ "hook_head_index": null,
305
+ "architecture": "topk",
306
+ "apply_b_dec_to_input": null,
307
+ "finetuning_scaling_factor": null,
308
+ "activation_fn_str": "",
309
+ "prepend_bos": true,
310
+ "normalize_activations": "none",
311
+ "dtype": "bfloat16",
312
+ "device": "",
313
+ "dataset_path": "",
314
+ "dataset_trust_remote_code": true,
315
+ "seqpos_slice": [
316
+ null
317
+ ],
318
+ "training_tokens": -100000,
319
+ "sae_lens_training_version": null,
320
+ "neuronpedia_id": null
321
+ },
322
+ "eval_result_unstructured": null
323
+ }
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "scr",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": true,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "e84e2919-45d8-4739-935c-c4c49e6f01bd",
73
+ "datetime_epoch_millis": 1740159333994,
74
+ "eval_result_metrics": {
75
+ "scr_metrics": {
76
+ "scr_dir1_threshold_2": 0.13902672439841735,
77
+ "scr_metric_threshold_2": 0.052679827874983993,
78
+ "scr_dir2_threshold_2": 0.05023245750763616,
79
+ "scr_dir1_threshold_5": 0.15256902306733214,
80
+ "scr_metric_threshold_5": 0.08124373333242407,
81
+ "scr_dir2_threshold_5": 0.08373451444399863,
82
+ "scr_dir1_threshold_10": 0.18434644922627724,
83
+ "scr_metric_threshold_10": 0.11066224908587924,
84
+ "scr_dir2_threshold_10": 0.11285158655682409,
85
+ "scr_dir1_threshold_20": 0.1730640453138616,
86
+ "scr_metric_threshold_20": 0.13485719572436072,
87
+ "scr_dir2_threshold_20": 0.13591482918415104,
88
+ "scr_dir1_threshold_50": 0.16112645779738913,
89
+ "scr_metric_threshold_50": 0.17958703268862003,
90
+ "scr_dir2_threshold_50": 0.18450987025234128,
91
+ "scr_dir1_threshold_100": 0.16878289626201945,
92
+ "scr_metric_threshold_100": 0.17936453816540665,
93
+ "scr_dir2_threshold_100": 0.1792240185413299,
94
+ "scr_dir1_threshold_500": 0.0930010293225678,
95
+ "scr_metric_threshold_500": 0.1838392653323654,
96
+ "scr_dir2_threshold_500": 0.18834314598313068
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
102
+ "scr_dir1_threshold_2": 0.2857136099242737,
103
+ "scr_metric_threshold_2": 0.01474211549298745,
104
+ "scr_dir2_threshold_2": 0.01474211549298745,
105
+ "scr_dir1_threshold_5": 0.31746019731987085,
106
+ "scr_metric_threshold_5": 0.017199085925573582,
107
+ "scr_dir2_threshold_5": 0.017199085925573582,
108
+ "scr_dir1_threshold_10": 0.3492058386094512,
109
+ "scr_metric_threshold_10": 0.024570143672067307,
110
+ "scr_dir2_threshold_10": 0.024570143672067307,
111
+ "scr_dir1_threshold_20": 0.222222327345113,
112
+ "scr_metric_threshold_20": 0.036855142283733294,
113
+ "scr_dir2_threshold_20": 0.036855142283733294,
114
+ "scr_dir1_threshold_50": 0.23809514798990317,
115
+ "scr_metric_threshold_50": 0.05651105219315768,
116
+ "scr_dir2_threshold_50": 0.05651105219315768,
117
+ "scr_dir1_threshold_100": 0.23809514798990317,
118
+ "scr_metric_threshold_100": 0.06879605080482366,
119
+ "scr_dir2_threshold_100": 0.06879605080482366,
120
+ "scr_dir1_threshold_500": 0.15873009865993543,
121
+ "scr_metric_threshold_500": 0.13022119031188895,
122
+ "scr_dir2_threshold_500": 0.13022119031188895
123
+ },
124
+ {
125
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
126
+ "scr_dir1_threshold_2": 0.2121209384542766,
127
+ "scr_metric_threshold_2": 0.05099152246310042,
128
+ "scr_dir2_threshold_2": 0.05099152246310042,
129
+ "scr_dir1_threshold_5": 0.19191917367472955,
130
+ "scr_metric_threshold_5": 0.07648728369465063,
131
+ "scr_dir2_threshold_5": 0.07648728369465063,
132
+ "scr_dir1_threshold_10": 0.2323233053010818,
133
+ "scr_metric_threshold_10": 0.10764869459912667,
134
+ "scr_dir2_threshold_10": 0.10764869459912667,
135
+ "scr_dir1_threshold_20": 0.2828283193172076,
136
+ "scr_metric_threshold_20": 0.15014157370114636,
137
+ "scr_dir2_threshold_20": 0.15014157370114636,
138
+ "scr_dir1_threshold_50": 0.20202005606450307,
139
+ "scr_metric_threshold_50": 0.18130315345731438,
140
+ "scr_dir2_threshold_50": 0.18130315345731438,
141
+ "scr_dir1_threshold_100": 0.16161592443815084,
142
+ "scr_metric_threshold_100": 0.21246456436179043,
143
+ "scr_dir2_threshold_100": 0.21246456436179043,
144
+ "scr_dir1_threshold_500": -0.07070738086293096,
145
+ "scr_metric_threshold_500": 0.07932010853111354,
146
+ "scr_dir2_threshold_500": 0.07932010853111354
147
+ },
148
+ {
149
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
150
+ "scr_dir1_threshold_2": 0.258064174999303,
151
+ "scr_metric_threshold_2": 0.017676839894409477,
152
+ "scr_dir2_threshold_2": 0.017676839894409477,
153
+ "scr_dir1_threshold_5": 0.29032183636211567,
154
+ "scr_metric_threshold_5": 0.03535352927204973,
155
+ "scr_dir2_threshold_5": 0.03535352927204973,
156
+ "scr_dir1_threshold_10": 0.33870928977193526,
157
+ "scr_metric_threshold_10": 0.06565661886956488,
158
+ "scr_dir2_threshold_10": 0.06565661886956488,
159
+ "scr_dir1_threshold_20": 0.37096695113474787,
160
+ "scr_metric_threshold_20": 0.06565661886956488,
161
+ "scr_dir2_threshold_20": 0.06565661886956488,
162
+ "scr_dir1_threshold_50": 0.20967672158948342,
163
+ "scr_metric_threshold_50": 0.08838389843850893,
164
+ "scr_dir2_threshold_50": 0.08838389843850893,
165
+ "scr_dir1_threshold_100": 0.22580651363649037,
166
+ "scr_metric_threshold_100": 0.0959595579503108,
167
+ "scr_dir2_threshold_100": 0.0959595579503108,
168
+ "scr_dir1_threshold_500": 0.11290277613544487,
169
+ "scr_metric_threshold_500": 0.04545455913788811,
170
+ "scr_dir2_threshold_500": 0.04545455913788811
171
+ },
172
+ {
173
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
174
+ "scr_dir1_threshold_2": 0.13821151212590824,
175
+ "scr_metric_threshold_2": 0.052785774589540695,
176
+ "scr_dir2_threshold_2": 0.052785774589540695,
177
+ "scr_dir1_threshold_5": 0.1869918462801379,
178
+ "scr_metric_threshold_5": 0.0791787492811609,
179
+ "scr_dir2_threshold_5": 0.0791787492811609,
180
+ "scr_dir1_threshold_10": 0.21951223057979316,
181
+ "scr_metric_threshold_10": 0.1114369265674188,
182
+ "scr_dir2_threshold_10": 0.1114369265674188,
183
+ "scr_dir1_threshold_20": 0.23577218043436754,
184
+ "scr_metric_threshold_20": 0.13782990125903902,
185
+ "scr_dir2_threshold_20": 0.13782990125903902,
186
+ "scr_dir1_threshold_50": 0.23577218043436754,
187
+ "scr_metric_threshold_50": 0.17008790375159719,
188
+ "scr_dir2_threshold_50": 0.17008790375159719,
189
+ "scr_dir1_threshold_100": 0.21951223057979316,
190
+ "scr_metric_threshold_100": 0.052785774589540695,
191
+ "scr_dir2_threshold_100": 0.052785774589540695,
192
+ "scr_dir1_threshold_500": 0.11382110275354018,
193
+ "scr_metric_threshold_500": 0.10850441266694984,
194
+ "scr_dir2_threshold_500": 0.10850441266694984
195
+ },
196
+ {
197
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
198
+ "scr_dir1_threshold_2": 0.00546440968122594,
199
+ "scr_metric_threshold_2": 0.10937497089617432,
200
+ "scr_dir2_threshold_2": 0.10937497089617432,
201
+ "scr_dir1_threshold_5": 0.021857964433295084,
202
+ "scr_metric_threshold_5": 0.21093754365573852,
203
+ "scr_dir2_threshold_5": 0.21093754365573852,
204
+ "scr_dir1_threshold_10": 0.05464474822904205,
205
+ "scr_metric_threshold_10": 0.28125005820765137,
206
+ "scr_dir2_threshold_10": 0.28125005820765137,
207
+ "scr_dir1_threshold_20": -0.06010915791026799,
208
+ "scr_metric_threshold_20": 0.3320313445874335,
209
+ "scr_dir2_threshold_20": 0.3320313445874335,
210
+ "scr_dir1_threshold_50": -0.04918033854781611,
211
+ "scr_metric_threshold_50": 0.3984374272404358,
212
+ "scr_dir2_threshold_50": 0.3984374272404358,
213
+ "scr_dir1_threshold_100": -0.016393554752069144,
214
+ "scr_metric_threshold_100": 0.42578128637978213,
215
+ "scr_dir2_threshold_100": 0.42578128637978213,
216
+ "scr_dir1_threshold_500": -0.03825151918536423,
217
+ "scr_metric_threshold_500": 0.48828116996447934,
218
+ "scr_dir2_threshold_500": 0.48828116996447934
219
+ },
220
+ {
221
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
222
+ "scr_dir1_threshold_2": 0.09743586922067994,
223
+ "scr_metric_threshold_2": 0.05241939554166917,
224
+ "scr_dir2_threshold_2": 0.05241939554166917,
225
+ "scr_dir1_threshold_5": 0.09743586922067994,
226
+ "scr_metric_threshold_5": 0.07661287027579163,
227
+ "scr_dir2_threshold_5": 0.07661287027579163,
228
+ "scr_dir1_threshold_10": 0.11282028709877284,
229
+ "scr_metric_threshold_10": 0.09677413927777444,
230
+ "scr_dir2_threshold_10": 0.09677413927777444,
231
+ "scr_dir1_threshold_20": 0.13846128922711723,
232
+ "scr_metric_threshold_20": 0.14516132908730398,
233
+ "scr_dir2_threshold_20": 0.14516132908730398,
234
+ "scr_dir1_threshold_50": 0.1692307363130155,
235
+ "scr_metric_threshold_50": 0.23790326263293876,
236
+ "scr_dir2_threshold_50": 0.23790326263293876,
237
+ "scr_dir1_threshold_100": 0.18974359914866223,
238
+ "scr_metric_threshold_100": 0.2580645316349216,
239
+ "scr_dir2_threshold_100": 0.2580645316349216,
240
+ "scr_dir1_threshold_500": 0.14871787347736873,
241
+ "scr_metric_threshold_500": 0.3064517214444511,
242
+ "scr_dir2_threshold_500": 0.3064517214444511
243
+ },
244
+ {
245
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
246
+ "scr_dir1_threshold_2": 0.07657666365421574,
247
+ "scr_metric_threshold_2": 0.08482138699453473,
248
+ "scr_dir2_threshold_2": 0.08482138699453473,
249
+ "scr_dir1_threshold_5": 0.06306305580659313,
250
+ "scr_metric_threshold_5": 0.10267856311319266,
251
+ "scr_dir2_threshold_5": 0.10267856311319266,
252
+ "scr_dir1_threshold_10": 0.09459444946519599,
253
+ "scr_metric_threshold_10": 0.12499996673848493,
254
+ "scr_dir2_threshold_10": 0.12499996673848493,
255
+ "scr_dir1_threshold_20": 0.11711721870769615,
256
+ "scr_metric_threshold_20": 0.13392842175175362,
257
+ "scr_dir2_threshold_20": 0.13392842175175362,
258
+ "scr_dir1_threshold_50": 0.18468472096703434,
259
+ "scr_metric_threshold_50": 0.20535712622638533,
260
+ "scr_dir2_threshold_50": 0.20535712622638533,
261
+ "scr_dir1_threshold_100": 0.21171166817289214,
262
+ "scr_metric_threshold_100": 0.20089289871975097,
263
+ "scr_dir2_threshold_100": 0.20089289871975097,
264
+ "scr_dir1_threshold_500": 0.20720722172014708,
265
+ "scr_metric_threshold_500": 0.20089289871975097,
266
+ "scr_dir2_threshold_500": 0.20089289871975097
267
+ },
268
+ {
269
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
270
+ "scr_dir1_threshold_2": 0.03862661712745569,
271
+ "scr_metric_threshold_2": 0.03862661712745569,
272
+ "scr_dir2_threshold_2": 0.019047654188673074,
273
+ "scr_dir1_threshold_5": 0.05150224144123495,
274
+ "scr_metric_threshold_5": 0.05150224144123495,
275
+ "scr_dir2_threshold_5": 0.07142849033383136,
276
+ "scr_dir1_threshold_10": 0.07296144475494565,
277
+ "scr_metric_threshold_10": 0.07296144475494565,
278
+ "scr_dir2_threshold_10": 0.09047614452250444,
279
+ "scr_dir1_threshold_20": 0.07725323425491137,
280
+ "scr_metric_threshold_20": 0.07725323425491137,
281
+ "scr_dir2_threshold_20": 0.08571430193323373,
282
+ "scr_dir1_threshold_50": 0.09871243756862208,
283
+ "scr_metric_threshold_50": 0.09871243756862208,
284
+ "scr_dir2_threshold_50": 0.13809513807839202,
285
+ "scr_dir1_threshold_100": 0.12017164088233277,
286
+ "scr_metric_threshold_100": 0.12017164088233277,
287
+ "scr_dir2_threshold_100": 0.11904748388971893,
288
+ "scr_dir1_threshold_500": 0.11158806188240133,
289
+ "scr_metric_threshold_500": 0.11158806188240133,
290
+ "scr_dir2_threshold_500": 0.14761910708852366
291
+ }
292
+ ],
293
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
294
+ "sae_lens_id": "custom_sae",
295
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0",
296
+ "sae_lens_version": "5.4.2",
297
+ "sae_cfg_dict": {
298
+ "model_name": "gemma-2-2b",
299
+ "d_in": 2304,
300
+ "d_sae": 65536,
301
+ "hook_layer": 12,
302
+ "hook_name": "blocks.12.hook_resid_post",
303
+ "context_size": null,
304
+ "hook_head_index": null,
305
+ "architecture": "topk",
306
+ "apply_b_dec_to_input": null,
307
+ "finetuning_scaling_factor": null,
308
+ "activation_fn_str": "",
309
+ "prepend_bos": true,
310
+ "normalize_activations": "none",
311
+ "dtype": "bfloat16",
312
+ "device": "",
313
+ "dataset_path": "",
314
+ "dataset_trust_remote_code": true,
315
+ "seqpos_slice": [
316
+ null
317
+ ],
318
+ "training_tokens": -100000,
319
+ "sae_lens_training_version": null,
320
+ "neuronpedia_id": null
321
+ },
322
+ "eval_result_unstructured": null
323
+ }
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "scr",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": true,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "744efb4f-9f9c-4a65-bd36-c9b19a4ffd71",
73
+ "datetime_epoch_millis": 1740160219317,
74
+ "eval_result_metrics": {
75
+ "scr_metrics": {
76
+ "scr_dir1_threshold_2": 0.1334585580077991,
77
+ "scr_metric_threshold_2": 0.07676722148137892,
78
+ "scr_dir2_threshold_2": 0.07675956452377836,
79
+ "scr_dir1_threshold_5": 0.1798632781490608,
80
+ "scr_metric_threshold_5": 0.12875666458236273,
81
+ "scr_dir2_threshold_5": 0.13481889859378787,
82
+ "scr_dir1_threshold_10": 0.18356951723363868,
83
+ "scr_metric_threshold_10": 0.18907230490584973,
84
+ "scr_dir2_threshold_10": 0.1949506085489714,
85
+ "scr_dir1_threshold_20": 0.17099327770246797,
86
+ "scr_metric_threshold_20": 0.237307483183272,
87
+ "scr_dir2_threshold_20": 0.24175260369566065,
88
+ "scr_dir1_threshold_50": 0.07798598324296178,
89
+ "scr_metric_threshold_50": 0.2928094354498369,
90
+ "scr_dir2_threshold_50": 0.30373578236555543,
91
+ "scr_dir1_threshold_100": 0.0037336381161294126,
92
+ "scr_metric_threshold_100": 0.2911327859028171,
93
+ "scr_dir2_threshold_100": 0.3050353199157786,
94
+ "scr_dir1_threshold_500": -0.09587675427885789,
95
+ "scr_metric_threshold_500": 0.2517752817013341,
96
+ "scr_dir2_threshold_500": 0.2716966035025452
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
102
+ "scr_dir1_threshold_2": 0.333333017964661,
103
+ "scr_metric_threshold_2": 0.009828028179079856,
104
+ "scr_dir2_threshold_2": 0.009828028179079856,
105
+ "scr_dir1_threshold_5": 0.3809524260050484,
106
+ "scr_metric_threshold_5": 0.019656056358159712,
107
+ "scr_dir2_threshold_5": 0.019656056358159712,
108
+ "scr_dir1_threshold_10": 0.3650786592542414,
109
+ "scr_metric_threshold_10": 0.05651105219315768,
110
+ "scr_dir2_threshold_10": 0.05651105219315768,
111
+ "scr_dir1_threshold_20": 0.31746019731987085,
112
+ "scr_metric_threshold_20": 0.09336619447689097,
113
+ "scr_dir2_threshold_20": 0.09336619447689097,
114
+ "scr_dir1_threshold_50": 0.1746029193047256,
115
+ "scr_metric_threshold_50": 0.12285013256539522,
116
+ "scr_dir2_threshold_50": 0.12285013256539522,
117
+ "scr_dir1_threshold_100": 0.20634856059430595,
118
+ "scr_metric_threshold_100": 0.1474201297887272,
119
+ "scr_dir2_threshold_100": 0.1474201297887272,
120
+ "scr_dir1_threshold_500": 0.14285633190912841,
121
+ "scr_metric_threshold_500": 0.05896816907447914,
122
+ "scr_dir2_threshold_500": 0.05896816907447914
123
+ },
124
+ {
125
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
126
+ "scr_dir1_threshold_2": 0.08080766118544634,
127
+ "scr_metric_threshold_2": 0.11048151943558958,
128
+ "scr_dir2_threshold_2": 0.11048151943558958,
129
+ "scr_dir1_threshold_5": 0.16161592443815084,
130
+ "scr_metric_threshold_5": 0.1671388604233078,
131
+ "scr_dir2_threshold_5": 0.1671388604233078,
132
+ "scr_dir1_threshold_10": 0.12121179281179859,
133
+ "scr_metric_threshold_10": 0.24079315042980354,
134
+ "scr_dir2_threshold_10": 0.24079315042980354,
135
+ "scr_dir1_threshold_20": 0.12121179281179859,
136
+ "scr_metric_threshold_20": 0.2861190232199781,
137
+ "scr_dir2_threshold_20": 0.2861190232199781,
138
+ "scr_dir1_threshold_50": -0.2828283193172076,
139
+ "scr_metric_threshold_50": 0.3456090201924673,
140
+ "scr_dir2_threshold_50": 0.3456090201924673,
141
+ "scr_dir1_threshold_100": -0.31313156855378627,
142
+ "scr_metric_threshold_100": 0.20679891468886458,
143
+ "scr_dir2_threshold_100": 0.20679891468886458,
144
+ "scr_dir1_threshold_500": -0.7373740475295977,
145
+ "scr_metric_threshold_500": 0.28895184805644103,
146
+ "scr_dir2_threshold_500": 0.28895184805644103
147
+ },
148
+ {
149
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
150
+ "scr_dir1_threshold_2": 0.37096695113474787,
151
+ "scr_metric_threshold_2": 0.010101029865838383,
152
+ "scr_dir2_threshold_2": 0.010101029865838383,
153
+ "scr_dir1_threshold_5": 0.41935440454456746,
154
+ "scr_metric_threshold_5": 0.042929339300620824,
155
+ "scr_dir2_threshold_5": 0.042929339300620824,
156
+ "scr_dir1_threshold_10": 0.45161302727298075,
157
+ "scr_metric_threshold_10": 0.09343433811304351,
158
+ "scr_dir2_threshold_10": 0.09343433811304351,
159
+ "scr_dir1_threshold_20": 0.37096695113474787,
160
+ "scr_metric_threshold_20": 0.11868683751925485,
161
+ "scr_dir2_threshold_20": 0.11868683751925485,
162
+ "scr_dir1_threshold_50": 0.16129022954526445,
163
+ "scr_metric_threshold_50": 0.16919198684844677,
164
+ "scr_dir2_threshold_50": 0.16919198684844677,
165
+ "scr_dir1_threshold_100": 0.09677394545403856,
166
+ "scr_metric_threshold_100": 0.2045455161204965,
167
+ "scr_dir2_threshold_100": 0.2045455161204965,
168
+ "scr_dir1_threshold_500": -0.016129792047006934,
169
+ "scr_metric_threshold_500": 0.08838389843850893,
170
+ "scr_dir2_threshold_500": 0.08838389843850893
171
+ },
172
+ {
173
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
174
+ "scr_dir1_threshold_2": 0.13008153719862106,
175
+ "scr_metric_threshold_2": 0.1671553898511282,
176
+ "scr_dir2_threshold_2": 0.1671553898511282,
177
+ "scr_dir1_threshold_5": 0.22764220550708036,
178
+ "scr_metric_threshold_5": 0.23167156962994426,
179
+ "scr_dir2_threshold_5": 0.23167156962994426,
180
+ "scr_dir1_threshold_10": 0.13821151212590824,
181
+ "scr_metric_threshold_10": 0.27565980251807803,
182
+ "scr_dir2_threshold_10": 0.27565980251807803,
183
+ "scr_dir1_threshold_20": 0.13821151212590824,
184
+ "scr_metric_threshold_20": 0.34604101009783206,
185
+ "scr_dir2_threshold_20": 0.34604101009783206,
186
+ "scr_dir1_threshold_50": 0.08130071845388491,
187
+ "scr_metric_threshold_50": 0.38416421518502786,
188
+ "scr_dir2_threshold_50": 0.38416421518502786,
189
+ "scr_dir1_threshold_100": -0.5365851294680456,
190
+ "scr_metric_threshold_100": 0.4105571898766481,
191
+ "scr_dir2_threshold_100": 0.4105571898766481,
192
+ "scr_dir1_threshold_500": -0.658536207148873,
193
+ "scr_metric_threshold_500": 0.1964808784432174,
194
+ "scr_dir2_threshold_500": 0.1964808784432174
195
+ },
196
+ {
197
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
198
+ "scr_dir1_threshold_2": 0.03825119347697291,
199
+ "scr_metric_threshold_2": 0.10156257275956422,
200
+ "scr_dir2_threshold_2": 0.10156257275956422,
201
+ "scr_dir1_threshold_5": 0.04371592886659017,
202
+ "scr_metric_threshold_5": 0.22265637369125918,
203
+ "scr_dir2_threshold_5": 0.22265637369125918,
204
+ "scr_dir1_threshold_10": 0.06557389329988525,
205
+ "scr_metric_threshold_10": 0.3320313445874335,
206
+ "scr_dir2_threshold_10": 0.3320313445874335,
207
+ "scr_dir1_threshold_20": 0.01092881936245188,
208
+ "scr_metric_threshold_20": 0.42578128637978213,
209
+ "scr_dir2_threshold_20": 0.42578128637978213,
210
+ "scr_dir1_threshold_50": -0.00546440968122594,
211
+ "scr_metric_threshold_50": 0.542968888243172,
212
+ "scr_dir2_threshold_50": 0.542968888243172,
213
+ "scr_dir1_threshold_100": 0.01092881936245188,
214
+ "scr_metric_threshold_100": 0.5859375436557386,
215
+ "scr_dir2_threshold_100": 0.5859375436557386,
216
+ "scr_dir1_threshold_500": -0.03825151918536423,
217
+ "scr_metric_threshold_500": 0.578124912688523,
218
+ "scr_dir2_threshold_500": 0.578124912688523
219
+ },
220
+ {
221
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
222
+ "scr_dir1_threshold_2": 0.03076914142104203,
223
+ "scr_metric_threshold_2": 0.060483807005948444,
224
+ "scr_dir2_threshold_2": 0.060483807005948444,
225
+ "scr_dir1_threshold_5": 0.08205114567773082,
226
+ "scr_metric_threshold_5": 0.08467752208135552,
227
+ "scr_dir2_threshold_5": 0.08467752208135552,
228
+ "scr_dir1_threshold_10": 0.12820501064172196,
229
+ "scr_metric_threshold_10": 0.1290322658174608,
230
+ "scr_dir2_threshold_10": 0.1290322658174608,
231
+ "scr_dir1_threshold_20": 0.14871787347736873,
232
+ "scr_metric_threshold_20": 0.17741945562699032,
233
+ "scr_dir2_threshold_20": 0.17741945562699032,
234
+ "scr_dir1_threshold_50": 0.09230742426312609,
235
+ "scr_metric_threshold_50": 0.2701613891726251,
236
+ "scr_dir2_threshold_50": 0.2701613891726251,
237
+ "scr_dir1_threshold_100": 0.15384601277006638,
238
+ "scr_metric_threshold_100": 0.3064517214444511,
239
+ "scr_dir2_threshold_100": 0.3064517214444511,
240
+ "scr_dir1_threshold_500": 0.15384601277006638,
241
+ "scr_metric_threshold_500": 0.3790323859881031,
242
+ "scr_dir2_threshold_500": 0.3790323859881031
243
+ },
244
+ {
245
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
246
+ "scr_dir1_threshold_2": 0.0405405550534804,
247
+ "scr_metric_threshold_2": 0.11160701812646134,
248
+ "scr_dir2_threshold_2": 0.11160701812646134,
249
+ "scr_dir1_threshold_5": 0.07207194871208326,
250
+ "scr_metric_threshold_5": 0.20982135373301966,
251
+ "scr_dir2_threshold_5": 0.20982135373301966,
252
+ "scr_dir1_threshold_10": 0.11711721870769615,
253
+ "scr_metric_threshold_10": 0.30357146183294365,
254
+ "scr_dir2_threshold_10": 0.30357146183294365,
255
+ "scr_dir1_threshold_20": 0.15315305881904406,
256
+ "scr_metric_threshold_20": 0.3437500415768938,
257
+ "scr_dir2_threshold_20": 0.3437500415768938,
258
+ "scr_dir1_threshold_50": 0.2567566696791176,
259
+ "scr_metric_threshold_50": 0.36160721769555176,
260
+ "scr_dir2_threshold_50": 0.36160721769555176,
261
+ "scr_dir1_threshold_100": 0.26576583107399515,
262
+ "scr_metric_threshold_100": 0.3214286379516016,
263
+ "scr_dir2_threshold_100": 0.3214286379516016,
264
+ "scr_dir1_threshold_500": 0.27927917043223033,
265
+ "scr_metric_threshold_500": 0.31696414435284664,
266
+ "scr_dir2_threshold_500": 0.31696414435284664
267
+ },
268
+ {
269
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
270
+ "scr_dir1_threshold_2": 0.042918406627421406,
271
+ "scr_metric_threshold_2": 0.042918406627421406,
272
+ "scr_dir2_threshold_2": 0.042857150966616867,
273
+ "scr_dir1_threshold_5": 0.05150224144123495,
274
+ "scr_metric_threshold_5": 0.05150224144123495,
275
+ "scr_dir2_threshold_5": 0.10000011353263609,
276
+ "scr_dir1_threshold_10": 0.08154502375487709,
277
+ "scr_metric_threshold_10": 0.08154502375487709,
278
+ "scr_dir2_threshold_10": 0.12857145289985059,
279
+ "scr_dir1_threshold_20": 0.10729601656855352,
280
+ "scr_metric_threshold_20": 0.10729601656855352,
281
+ "scr_dir2_threshold_20": 0.14285698066766273,
282
+ "scr_dir1_threshold_50": 0.1459226336960092,
283
+ "scr_metric_threshold_50": 0.1459226336960092,
284
+ "scr_dir2_threshold_50": 0.2333334090217574,
285
+ "scr_dir1_threshold_100": 0.1459226336960092,
286
+ "scr_metric_threshold_100": 0.1459226336960092,
287
+ "scr_dir2_threshold_100": 0.25714290579970117,
288
+ "scr_dir1_threshold_500": 0.10729601656855352,
289
+ "scr_metric_threshold_500": 0.10729601656855352,
290
+ "scr_dir2_threshold_500": 0.2666665909782426
291
+ }
292
+ ],
293
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
294
+ "sae_lens_id": "custom_sae",
295
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0",
296
+ "sae_lens_version": "5.4.2",
297
+ "sae_cfg_dict": {
298
+ "model_name": "gemma-2-2b",
299
+ "d_in": 2304,
300
+ "d_sae": 65536,
301
+ "hook_layer": 12,
302
+ "hook_name": "blocks.12.hook_resid_post",
303
+ "context_size": null,
304
+ "hook_head_index": null,
305
+ "architecture": "topk",
306
+ "apply_b_dec_to_input": null,
307
+ "finetuning_scaling_factor": null,
308
+ "activation_fn_str": "",
309
+ "prepend_bos": true,
310
+ "normalize_activations": "none",
311
+ "dtype": "bfloat16",
312
+ "device": "",
313
+ "dataset_path": "",
314
+ "dataset_trust_remote_code": true,
315
+ "seqpos_slice": [
316
+ null
317
+ ],
318
+ "training_tokens": -100000,
319
+ "sae_lens_training_version": null,
320
+ "neuronpedia_id": null
321
+ },
322
+ "eval_result_unstructured": null
323
+ }
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "scr",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": true,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "4945c79f-5ec8-4cf1-817b-560bfcc8ec29",
73
+ "datetime_epoch_millis": 1740161984895,
74
+ "eval_result_metrics": {
75
+ "scr_metrics": {
76
+ "scr_dir1_threshold_2": 0.16161854897121994,
77
+ "scr_metric_threshold_2": 0.06784648898938839,
78
+ "scr_dir2_threshold_2": 0.0659943489456994,
79
+ "scr_dir1_threshold_5": 0.20247854428196788,
80
+ "scr_metric_threshold_5": 0.09403415996971003,
81
+ "scr_dir2_threshold_5": 0.0946217333357683,
82
+ "scr_dir1_threshold_10": 0.1762122452594689,
83
+ "scr_metric_threshold_10": 0.13623078368990285,
84
+ "scr_dir2_threshold_10": 0.14062480452372897,
85
+ "scr_dir1_threshold_20": 0.17380918952629806,
86
+ "scr_metric_threshold_20": 0.1723823540256287,
87
+ "scr_dir2_threshold_20": 0.17415276656035286,
88
+ "scr_dir1_threshold_50": 0.1849406260352424,
89
+ "scr_metric_threshold_50": 0.20896025265643603,
90
+ "scr_dir2_threshold_50": 0.21573523026384628,
91
+ "scr_dir1_threshold_100": 0.14974715894436086,
92
+ "scr_metric_threshold_100": 0.18597532300507813,
93
+ "scr_dir2_threshold_100": 0.1965567480802562,
94
+ "scr_dir1_threshold_500": 0.0778989845782214,
95
+ "scr_metric_threshold_500": 0.21980959861610377,
96
+ "scr_dir2_threshold_500": 0.23241178018807312
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
102
+ "scr_dir1_threshold_2": 0.3650786592542414,
103
+ "scr_metric_threshold_2": -0.0024569704325861324,
104
+ "scr_dir2_threshold_2": -0.0024569704325861324,
105
+ "scr_dir1_threshold_5": 0.3968252466498386,
106
+ "scr_metric_threshold_5": 0.009828028179079856,
107
+ "scr_dir2_threshold_5": 0.009828028179079856,
108
+ "scr_dir1_threshold_10": 0.3809524260050484,
109
+ "scr_metric_threshold_10": 0.01474211549298745,
110
+ "scr_dir2_threshold_10": 0.01474211549298745,
111
+ "scr_dir1_threshold_20": 0.444444654690226,
112
+ "scr_metric_threshold_20": 0.027027114104653437,
113
+ "scr_dir2_threshold_20": 0.027027114104653437,
114
+ "scr_dir1_threshold_50": 0.3015873766750807,
115
+ "scr_metric_threshold_50": 0.05896816907447914,
116
+ "scr_dir2_threshold_50": 0.05896816907447914,
117
+ "scr_dir1_threshold_100": 0.2857136099242737,
118
+ "scr_metric_threshold_100": 0.1031940762072355,
119
+ "scr_dir2_threshold_100": 0.1031940762072355,
120
+ "scr_dir1_threshold_500": 0.19047573994951578,
121
+ "scr_metric_threshold_500": 0.16216224528171463,
122
+ "scr_dir2_threshold_500": 0.16216224528171463
123
+ },
124
+ {
125
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
126
+ "scr_dir1_threshold_2": 0.15151504204837732,
127
+ "scr_metric_threshold_2": 0.1161473379602074,
128
+ "scr_dir2_threshold_2": 0.1161473379602074,
129
+ "scr_dir1_threshold_5": 0.22222182084405012,
130
+ "scr_metric_threshold_5": 0.1359772806671398,
131
+ "scr_dir2_threshold_5": 0.1359772806671398,
132
+ "scr_dir1_threshold_10": 0.17171680682792437,
133
+ "scr_metric_threshold_10": 0.18413597829377729,
134
+ "scr_dir2_threshold_10": 0.18413597829377729,
135
+ "scr_dir1_threshold_20": 0.15151504204837732,
136
+ "scr_metric_threshold_20": 0.18980162796670313,
137
+ "scr_dir2_threshold_20": 0.18980162796670313,
138
+ "scr_dir1_threshold_50": 0.1818176892176979,
139
+ "scr_metric_threshold_50": 0.2549574434638101,
140
+ "scr_dir2_threshold_50": 0.2549574434638101,
141
+ "scr_dir1_threshold_100": 0.03030264716932058,
142
+ "scr_metric_threshold_100": 0.10764869459912667,
143
+ "scr_dir2_threshold_100": 0.10764869459912667,
144
+ "scr_dir1_threshold_500": -0.09090914564247801,
145
+ "scr_metric_threshold_500": 0.1161473379602074,
146
+ "scr_dir2_threshold_500": 0.1161473379602074
147
+ },
148
+ {
149
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
150
+ "scr_dir1_threshold_2": 0.41935440454456746,
151
+ "scr_metric_threshold_2": 0.020202059731676766,
152
+ "scr_dir2_threshold_2": 0.020202059731676766,
153
+ "scr_dir1_threshold_5": 0.45161302727298075,
154
+ "scr_metric_threshold_5": 0.03535352927204973,
155
+ "scr_dir2_threshold_5": 0.03535352927204973,
156
+ "scr_dir1_threshold_10": 0.33870928977193526,
157
+ "scr_metric_threshold_10": 0.04040411946335353,
158
+ "scr_dir2_threshold_10": 0.04040411946335353,
159
+ "scr_dir1_threshold_20": 0.2741930056807093,
160
+ "scr_metric_threshold_20": 0.08333330824720513,
161
+ "scr_dir2_threshold_20": 0.08333330824720513,
162
+ "scr_dir1_threshold_50": 0.2741930056807093,
163
+ "scr_metric_threshold_50": 0.11616161768198757,
164
+ "scr_dir2_threshold_50": 0.11616161768198757,
165
+ "scr_dir1_threshold_100": 0.3064516284091226,
166
+ "scr_metric_threshold_100": 0.13383845757639704,
167
+ "scr_dir2_threshold_100": 0.13383845757639704,
168
+ "scr_dir1_threshold_500": 0.04838649204421897,
169
+ "scr_metric_threshold_500": 0.06818183870683217,
170
+ "scr_dir2_threshold_500": 0.06818183870683217
171
+ },
172
+ {
173
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
174
+ "scr_dir1_threshold_2": 0.15447146198048264,
175
+ "scr_metric_threshold_2": 0.0557184632837094,
176
+ "scr_dir2_threshold_2": 0.0557184632837094,
177
+ "scr_dir1_threshold_5": 0.24390263995216122,
178
+ "scr_metric_threshold_5": 0.11730195436835675,
179
+ "scr_dir2_threshold_5": 0.11730195436835675,
180
+ "scr_dir1_threshold_10": 0.12195107768082737,
181
+ "scr_metric_threshold_10": 0.20527859493832404,
182
+ "scr_dir2_threshold_10": 0.20527859493832404,
183
+ "scr_dir1_threshold_20": 0.0894311779716786,
184
+ "scr_metric_threshold_20": 0.2434018000255199,
185
+ "scr_dir2_threshold_20": 0.2434018000255199,
186
+ "scr_dir1_threshold_50": 0.24390263995216122,
187
+ "scr_metric_threshold_50": 0.28445751901318467,
188
+ "scr_dir2_threshold_50": 0.28445751901318467,
189
+ "scr_dir1_threshold_100": -0.06504076859931053,
190
+ "scr_metric_threshold_100": 0.09384166837090524,
191
+ "scr_dir2_threshold_100": 0.09384166837090524,
192
+ "scr_dir1_threshold_500": -0.008129974927287195,
193
+ "scr_metric_threshold_500": 0.11730195436835675,
194
+ "scr_dir2_threshold_500": 0.11730195436835675
195
+ },
196
+ {
197
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
198
+ "scr_dir1_threshold_2": 0.03278678379574697,
199
+ "scr_metric_threshold_2": 0.12890643189891055,
200
+ "scr_dir2_threshold_2": 0.12890643189891055,
201
+ "scr_dir1_threshold_5": 0.04918033854781611,
202
+ "scr_metric_threshold_5": 0.16406245634426148,
203
+ "scr_dir2_threshold_5": 0.16406245634426148,
204
+ "scr_dir1_threshold_10": 0.016393554752069144,
205
+ "scr_metric_threshold_10": 0.2695312281721307,
206
+ "scr_dir2_threshold_10": 0.2695312281721307,
207
+ "scr_dir1_threshold_20": 0.03278678379574697,
208
+ "scr_metric_threshold_20": 0.36328140279508486,
209
+ "scr_dir2_threshold_20": 0.36328140279508486,
210
+ "scr_dir1_threshold_50": 0.00546440968122594,
211
+ "scr_metric_threshold_50": 0.41015625727595645,
212
+ "scr_dir2_threshold_50": 0.41015625727595645,
213
+ "scr_dir1_threshold_100": 0.06010915791026799,
214
+ "scr_metric_threshold_100": 0.4609375436557385,
215
+ "scr_dir2_threshold_100": 0.4609375436557385,
216
+ "scr_dir1_threshold_500": 0.05464474822904205,
217
+ "scr_metric_threshold_500": 0.5195312281721307,
218
+ "scr_dir2_threshold_500": 0.5195312281721307
219
+ },
220
+ {
221
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
222
+ "scr_dir1_threshold_2": 0.0410254200064373,
223
+ "scr_metric_threshold_2": 0.1008065853511987,
224
+ "scr_dir2_threshold_2": 0.1008065853511987,
225
+ "scr_dir1_threshold_5": 0.12307687134902433,
226
+ "scr_metric_threshold_5": 0.11290320254761761,
227
+ "scr_dir2_threshold_5": 0.11290320254761761,
228
+ "scr_dir1_threshold_10": 0.14871787347736873,
229
+ "scr_metric_threshold_10": 0.1370969176230247,
230
+ "scr_dir2_threshold_10": 0.1370969176230247,
231
+ "scr_dir1_threshold_20": 0.12307687134902433,
232
+ "scr_metric_threshold_20": 0.1854838670912696,
233
+ "scr_dir2_threshold_20": 0.1854838670912696,
234
+ "scr_dir1_threshold_50": 0.1692307363130155,
235
+ "scr_metric_threshold_50": 0.2056451360932524,
236
+ "scr_dir2_threshold_50": 0.2056451360932524,
237
+ "scr_dir1_threshold_100": 0.17435887560571312,
238
+ "scr_metric_threshold_100": 0.22983885116865949,
239
+ "scr_dir2_threshold_100": 0.22983885116865949,
240
+ "scr_dir1_threshold_500": 0.08205114567773082,
241
+ "scr_metric_threshold_500": 0.35483867091269605,
242
+ "scr_dir2_threshold_500": 0.35483867091269605
243
+ },
244
+ {
245
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
246
+ "scr_dir1_threshold_2": 0.09009000301245093,
247
+ "scr_metric_threshold_2": 0.08482138699453473,
248
+ "scr_dir2_threshold_2": 0.08482138699453473,
249
+ "scr_dir1_threshold_5": 0.09009000301245093,
250
+ "scr_metric_threshold_5": 0.13392842175175362,
251
+ "scr_dir2_threshold_5": 0.13392842175175362,
252
+ "scr_dir1_threshold_10": 0.17117111311941172,
253
+ "scr_metric_threshold_10": 0.17857149509445872,
254
+ "scr_dir2_threshold_10": 0.17857149509445872,
255
+ "scr_dir1_threshold_20": 0.1891891674197794,
256
+ "scr_metric_threshold_20": 0.20089289871975097,
257
+ "scr_dir2_threshold_20": 0.20089289871975097,
258
+ "scr_dir1_threshold_50": 0.2432433303208824,
259
+ "scr_metric_threshold_50": 0.28125005820765137,
260
+ "scr_dir2_threshold_50": 0.28125005820765137,
261
+ "scr_dir1_threshold_100": 0.32882888688058826,
262
+ "scr_metric_threshold_100": 0.28125005820765137,
263
+ "scr_dir2_threshold_100": 0.28125005820765137,
264
+ "scr_dir1_threshold_500": 0.25225222322637253,
265
+ "scr_metric_threshold_500": 0.3258928654582359,
266
+ "scr_dir2_threshold_500": 0.3258928654582359
267
+ },
268
+ {
269
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
270
+ "scr_dir1_threshold_2": 0.03862661712745569,
271
+ "scr_metric_threshold_2": 0.03862661712745569,
272
+ "scr_dir2_threshold_2": 0.02380949677794379,
273
+ "scr_dir1_threshold_5": 0.042918406627421406,
274
+ "scr_metric_threshold_5": 0.042918406627421406,
275
+ "scr_dir2_threshold_5": 0.04761899355588758,
276
+ "scr_dir1_threshold_10": 0.060085820441166386,
277
+ "scr_metric_threshold_10": 0.060085820441166386,
278
+ "scr_dir2_threshold_10": 0.09523798711177515,
279
+ "scr_dir1_threshold_20": 0.08583681325484281,
280
+ "scr_metric_threshold_20": 0.08583681325484281,
281
+ "scr_dir2_threshold_20": 0.10000011353263609,
282
+ "scr_dir1_threshold_50": 0.060085820441166386,
283
+ "scr_metric_threshold_50": 0.060085820441166386,
284
+ "scr_dir2_threshold_50": 0.11428564130044823,
285
+ "scr_dir1_threshold_100": 0.07725323425491137,
286
+ "scr_metric_threshold_100": 0.07725323425491137,
287
+ "scr_dir2_threshold_100": 0.1619046348563358,
288
+ "scr_dir1_threshold_500": 0.09442064806865635,
289
+ "scr_metric_threshold_500": 0.09442064806865635,
290
+ "scr_dir2_threshold_500": 0.19523810064441124
291
+ }
292
+ ],
293
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
294
+ "sae_lens_id": "custom_sae",
295
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0",
296
+ "sae_lens_version": "5.4.2",
297
+ "sae_cfg_dict": {
298
+ "model_name": "gemma-2-2b",
299
+ "d_in": 2304,
300
+ "d_sae": 65536,
301
+ "hook_layer": 12,
302
+ "hook_name": "blocks.12.hook_resid_post",
303
+ "context_size": null,
304
+ "hook_head_index": null,
305
+ "architecture": "topk",
306
+ "apply_b_dec_to_input": null,
307
+ "finetuning_scaling_factor": null,
308
+ "activation_fn_str": "",
309
+ "prepend_bos": true,
310
+ "normalize_activations": "none",
311
+ "dtype": "bfloat16",
312
+ "device": "",
313
+ "dataset_path": "",
314
+ "dataset_trust_remote_code": true,
315
+ "seqpos_slice": [
316
+ null
317
+ ],
318
+ "training_tokens": -100000,
319
+ "sae_lens_training_version": null,
320
+ "neuronpedia_id": null
321
+ },
322
+ "eval_result_unstructured": null
323
+ }
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "scr",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": true,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "b9f069ac-8088-45a3-9274-9b1e86e4771e",
73
+ "datetime_epoch_millis": 1740161543241,
74
+ "eval_result_metrics": {
75
+ "scr_metrics": {
76
+ "scr_dir1_threshold_2": 0.18865540875749043,
77
+ "scr_metric_threshold_2": 0.073516336917133,
78
+ "scr_dir2_threshold_2": 0.07225177023950229,
79
+ "scr_dir1_threshold_5": 0.23303036096673593,
80
+ "scr_metric_threshold_5": 0.1302813962622398,
81
+ "scr_dir2_threshold_5": 0.1305598690300678,
82
+ "scr_dir1_threshold_10": 0.23417802766807275,
83
+ "scr_metric_threshold_10": 0.1620880555443666,
84
+ "scr_dir2_threshold_10": 0.16742988549999255,
85
+ "scr_dir1_threshold_20": 0.2599068683780712,
86
+ "scr_metric_threshold_20": 0.21384163954000984,
87
+ "scr_dir2_threshold_20": 0.22096147153227652,
88
+ "scr_dir1_threshold_50": 0.1619087251521188,
89
+ "scr_metric_threshold_50": 0.2891392857052675,
90
+ "scr_dir2_threshold_50": 0.29613397944434255,
91
+ "scr_dir1_threshold_100": 0.08543218428340123,
92
+ "scr_metric_threshold_100": 0.31203469249820925,
93
+ "scr_dir2_threshold_100": 0.3178389255899667,
94
+ "scr_dir1_threshold_500": 0.07042284538823904,
95
+ "scr_metric_threshold_500": 0.2821800078466572,
96
+ "scr_dir2_threshold_500": 0.2820164853730435
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
102
+ "scr_dir1_threshold_2": 0.3809524260050484,
103
+ "scr_metric_threshold_2": 0.012284998611665989,
104
+ "scr_dir2_threshold_2": 0.012284998611665989,
105
+ "scr_dir1_threshold_5": 0.41269806729462877,
106
+ "scr_metric_threshold_5": 0.019656056358159712,
107
+ "scr_dir2_threshold_5": 0.019656056358159712,
108
+ "scr_dir1_threshold_10": 0.41269806729462877,
109
+ "scr_metric_threshold_10": 0.061425139507065275,
110
+ "scr_dir2_threshold_10": 0.061425139507065275,
111
+ "scr_dir1_threshold_20": 0.444444654690226,
112
+ "scr_metric_threshold_20": 0.09336619447689097,
113
+ "scr_dir2_threshold_20": 0.09336619447689097,
114
+ "scr_dir1_threshold_50": 0.3809524260050484,
115
+ "scr_metric_threshold_50": 0.12039316213280908,
116
+ "scr_dir2_threshold_50": 0.12039316213280908,
117
+ "scr_dir1_threshold_100": 0.3650786592542414,
118
+ "scr_metric_threshold_100": 0.13759224805838266,
119
+ "scr_dir2_threshold_100": 0.13759224805838266,
120
+ "scr_dir1_threshold_500": 0.3015873766750807,
121
+ "scr_metric_threshold_500": 0.05896816907447914,
122
+ "scr_dir2_threshold_500": 0.05896816907447914
123
+ },
124
+ {
125
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
126
+ "scr_dir1_threshold_2": 0.16161592443815084,
127
+ "scr_metric_threshold_2": 0.14447592402822051,
128
+ "scr_dir2_threshold_2": 0.14447592402822051,
129
+ "scr_dir1_threshold_5": 0.19191917367472955,
130
+ "scr_metric_threshold_5": 0.2294616822322599,
131
+ "scr_dir2_threshold_5": 0.2294616822322599,
132
+ "scr_dir1_threshold_10": 0.20202005606450307,
133
+ "scr_metric_threshold_10": 0.2691217364978167,
134
+ "scr_dir2_threshold_10": 0.2691217364978167,
135
+ "scr_dir1_threshold_20": 0.2121209384542766,
136
+ "scr_metric_threshold_20": 0.31161478445152835,
137
+ "scr_dir2_threshold_20": 0.31161478445152835,
138
+ "scr_dir1_threshold_50": -0.343434817790365,
139
+ "scr_metric_threshold_50": 0.38243624962156114,
140
+ "scr_dir2_threshold_50": 0.38243624962156114,
141
+ "scr_dir1_threshold_100": -0.3737374649596856,
142
+ "scr_metric_threshold_100": 0.21813038288640824,
143
+ "scr_dir2_threshold_100": 0.21813038288640824,
144
+ "scr_dir1_threshold_500": -0.5151516246182893,
145
+ "scr_metric_threshold_500": 0.28895184805644103,
146
+ "scr_dir2_threshold_500": 0.28895184805644103
147
+ },
148
+ {
149
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
150
+ "scr_dir1_threshold_2": 0.4999995193171997,
151
+ "scr_metric_threshold_2": 0.022727279568944055,
152
+ "scr_dir2_threshold_2": 0.022727279568944055,
153
+ "scr_dir1_threshold_5": 0.5483869727270193,
154
+ "scr_metric_threshold_5": 0.0479797789751554,
155
+ "scr_dir2_threshold_5": 0.0479797789751554,
156
+ "scr_dir1_threshold_10": 0.5483869727270193,
157
+ "scr_metric_threshold_10": 0.07323242889813597,
158
+ "scr_dir2_threshold_10": 0.07323242889813597,
159
+ "scr_dir1_threshold_20": 0.46774185795438705,
160
+ "scr_metric_threshold_20": 0.15151514695403728,
161
+ "scr_dir2_threshold_20": 0.15151514695403728,
162
+ "scr_dir1_threshold_50": 0.29032183636211567,
163
+ "scr_metric_threshold_50": 0.21717176582360218,
164
+ "scr_dir2_threshold_50": 0.21717176582360218,
165
+ "scr_dir1_threshold_100": 0.2741930056807093,
166
+ "scr_metric_threshold_100": 0.21212117563229838,
167
+ "scr_dir2_threshold_100": 0.21212117563229838,
168
+ "scr_dir1_threshold_500": 0.16129022954526445,
169
+ "scr_metric_threshold_500": 0.06565661886956488,
170
+ "scr_dir2_threshold_500": 0.06565661886956488
171
+ },
172
+ {
173
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
174
+ "scr_dir1_threshold_2": 0.15447146198048264,
175
+ "scr_metric_threshold_2": 0.09970669617184318,
176
+ "scr_dir2_threshold_2": 0.09970669617184318,
177
+ "scr_dir1_threshold_5": 0.2764230242518165,
178
+ "scr_metric_threshold_5": 0.18475064804764177,
179
+ "scr_dir2_threshold_5": 0.18475064804764177,
180
+ "scr_dir1_threshold_10": 0.19512182120742508,
181
+ "scr_metric_threshold_10": 0.24633431392598884,
182
+ "scr_dir2_threshold_10": 0.24633431392598884,
183
+ "scr_dir1_threshold_20": 0.2682925647340228,
184
+ "scr_metric_threshold_20": 0.2727272886176091,
185
+ "scr_dir2_threshold_20": 0.2727272886176091,
186
+ "scr_dir1_threshold_50": 0.15447146198048264,
187
+ "scr_metric_threshold_50": 0.3665689569885143,
188
+ "scr_dir2_threshold_50": 0.3665689569885143,
189
+ "scr_dir1_threshold_100": -0.382113667487563,
190
+ "scr_metric_threshold_100": 0.40762467597617913,
191
+ "scr_dir2_threshold_100": 0.40762467597617913,
192
+ "scr_dir1_threshold_500": -0.487804795313816,
193
+ "scr_metric_threshold_500": 0.13196469866440133,
194
+ "scr_dir2_threshold_500": 0.13196469866440133
195
+ },
196
+ {
197
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
198
+ "scr_dir1_threshold_2": 0.021857964433295084,
199
+ "scr_metric_threshold_2": 0.09375017462295412,
200
+ "scr_dir2_threshold_2": 0.09375017462295412,
201
+ "scr_dir1_threshold_5": 0.03278678379574697,
202
+ "scr_metric_threshold_5": 0.1992187136202179,
203
+ "scr_dir2_threshold_5": 0.1992187136202179,
204
+ "scr_dir1_threshold_10": 0.04918033854781611,
205
+ "scr_metric_threshold_10": 0.21484374272404358,
206
+ "scr_dir2_threshold_10": 0.21484374272404358,
207
+ "scr_dir1_threshold_20": 0.09289626741440628,
208
+ "scr_metric_threshold_20": 0.28515625727595645,
209
+ "scr_dir2_threshold_20": 0.28515625727595645,
210
+ "scr_dir1_threshold_50": 0.12568305121015325,
211
+ "scr_metric_threshold_50": 0.4570313445874335,
212
+ "scr_dir2_threshold_50": 0.4570313445874335,
213
+ "scr_dir1_threshold_100": -0.00546440968122594,
214
+ "scr_metric_threshold_100": 0.5625001164153027,
215
+ "scr_dir2_threshold_100": 0.5625001164153027,
216
+ "scr_dir1_threshold_500": 0.07650271266233713,
217
+ "scr_metric_threshold_500": 0.6523438591393463,
218
+ "scr_dir2_threshold_500": 0.6523438591393463
219
+ },
220
+ {
221
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
222
+ "scr_dir1_threshold_2": 0.08717928497042846,
223
+ "scr_metric_threshold_2": 0.04435498407738989,
224
+ "scr_dir2_threshold_2": 0.04435498407738989,
225
+ "scr_dir1_threshold_5": 0.16410229135546164,
226
+ "scr_metric_threshold_5": 0.08870972781349516,
227
+ "scr_dir2_threshold_5": 0.08870972781349516,
228
+ "scr_dir1_threshold_10": 0.1999998777340575,
229
+ "scr_metric_threshold_10": 0.10483879108333834,
230
+ "scr_dir2_threshold_10": 0.10483879108333834,
231
+ "scr_dir1_threshold_20": 0.23076901915509954,
232
+ "scr_metric_threshold_20": 0.17741945562699032,
233
+ "scr_dir2_threshold_20": 0.17741945562699032,
234
+ "scr_dir1_threshold_50": 0.2358974641126534,
235
+ "scr_metric_threshold_50": 0.2983870696388872,
236
+ "scr_dir2_threshold_50": 0.2983870696388872,
237
+ "scr_dir1_threshold_100": 0.28205102341178834,
238
+ "scr_metric_threshold_100": 0.4112902721865048,
239
+ "scr_dir2_threshold_100": 0.4112902721865048,
240
+ "scr_dir1_threshold_500": 0.32307674908308187,
241
+ "scr_metric_threshold_500": 0.3508064651805564,
242
+ "scr_dir2_threshold_500": 0.3508064651805564
243
+ },
244
+ {
245
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
246
+ "scr_dir1_threshold_2": 0.1216216651604412,
247
+ "scr_metric_threshold_2": 0.08928561450116908,
248
+ "scr_dir2_threshold_2": 0.08928561450116908,
249
+ "scr_dir1_threshold_5": 0.13063055806593132,
250
+ "scr_metric_threshold_5": 0.16517854648243513,
251
+ "scr_dir2_threshold_5": 0.16517854648243513,
252
+ "scr_dir1_threshold_10": 0.18018027451428928,
253
+ "scr_metric_threshold_10": 0.24107147846370117,
254
+ "scr_dir2_threshold_10": 0.24107147846370117,
255
+ "scr_dir1_threshold_20": 0.23423416892600485,
256
+ "scr_metric_threshold_20": 0.29017851322092003,
257
+ "scr_dir2_threshold_20": 0.29017851322092003,
258
+ "scr_dir1_threshold_50": 0.2882883318271079,
259
+ "scr_metric_threshold_50": 0.308035689339578,
260
+ "scr_dir2_threshold_50": 0.308035689339578,
261
+ "scr_dir1_threshold_100": 0.3603602805391911,
262
+ "scr_metric_threshold_100": 0.38392862132084404,
263
+ "scr_dir2_threshold_100": 0.38392862132084404,
264
+ "scr_dir1_threshold_500": 0.4549549984937745,
265
+ "scr_metric_threshold_500": 0.45982128720998955,
266
+ "scr_dir2_threshold_500": 0.45982128720998955
267
+ },
268
+ {
269
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
270
+ "scr_dir1_threshold_2": 0.08154502375487709,
271
+ "scr_metric_threshold_2": 0.08154502375487709,
272
+ "scr_dir2_threshold_2": 0.07142849033383136,
273
+ "scr_dir1_threshold_5": 0.10729601656855352,
274
+ "scr_metric_threshold_5": 0.10729601656855352,
275
+ "scr_dir2_threshold_5": 0.10952379871117751,
276
+ "scr_dir1_threshold_10": 0.08583681325484281,
277
+ "scr_metric_threshold_10": 0.08583681325484281,
278
+ "scr_dir2_threshold_10": 0.12857145289985059,
279
+ "scr_dir1_threshold_20": 0.12875547569614632,
280
+ "scr_metric_threshold_20": 0.12875547569614632,
281
+ "scr_dir2_threshold_20": 0.18571413163427958,
282
+ "scr_dir1_threshold_50": 0.16309004750975417,
283
+ "scr_metric_threshold_50": 0.16309004750975417,
284
+ "scr_dir2_threshold_50": 0.21904759742235502,
285
+ "scr_dir1_threshold_100": 0.16309004750975417,
286
+ "scr_metric_threshold_100": 0.16309004750975417,
287
+ "scr_dir2_threshold_100": 0.20952391224381361,
288
+ "scr_dir1_threshold_500": 0.2489271165784791,
289
+ "scr_metric_threshold_500": 0.2489271165784791,
290
+ "scr_dir2_threshold_500": 0.24761893678956953
291
+ }
292
+ ],
293
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
294
+ "sae_lens_id": "custom_sae",
295
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0",
296
+ "sae_lens_version": "5.4.2",
297
+ "sae_cfg_dict": {
298
+ "model_name": "gemma-2-2b",
299
+ "d_in": 2304,
300
+ "d_sae": 65536,
301
+ "hook_layer": 12,
302
+ "hook_name": "blocks.12.hook_resid_post",
303
+ "context_size": null,
304
+ "hook_head_index": null,
305
+ "architecture": "topk",
306
+ "apply_b_dec_to_input": null,
307
+ "finetuning_scaling_factor": null,
308
+ "activation_fn_str": "",
309
+ "prepend_bos": true,
310
+ "normalize_activations": "none",
311
+ "dtype": "bfloat16",
312
+ "device": "",
313
+ "dataset_path": "",
314
+ "dataset_trust_remote_code": true,
315
+ "seqpos_slice": [
316
+ null
317
+ ],
318
+ "training_tokens": -100000,
319
+ "sae_lens_training_version": null,
320
+ "neuronpedia_id": null
321
+ },
322
+ "eval_result_unstructured": null
323
+ }
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "scr",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": true,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "21df4103-067e-4b1e-992d-4ee5b7c51b7b",
73
+ "datetime_epoch_millis": 1740161101577,
74
+ "eval_result_metrics": {
75
+ "scr_metrics": {
76
+ "scr_dir1_threshold_2": 0.2009693612201276,
77
+ "scr_metric_threshold_2": 0.08333806598832513,
78
+ "scr_dir2_threshold_2": 0.08053045986523572,
79
+ "scr_dir1_threshold_5": 0.23069728357870245,
80
+ "scr_metric_threshold_5": 0.12271415632570293,
81
+ "scr_dir2_threshold_5": 0.12228750697619764,
82
+ "scr_dir1_threshold_10": 0.24536139627163156,
83
+ "scr_metric_threshold_10": 0.14820108338134885,
84
+ "scr_dir2_threshold_10": 0.14854593776620528,
85
+ "scr_dir1_threshold_20": 0.26804181768674956,
86
+ "scr_metric_threshold_20": 0.16917277755556245,
87
+ "scr_dir2_threshold_20": 0.1705906112921456,
88
+ "scr_dir1_threshold_50": 0.2558397963519864,
89
+ "scr_metric_threshold_50": 0.2246913468132941,
90
+ "scr_dir2_threshold_50": 0.22294137414999496,
91
+ "scr_dir1_threshold_100": 0.2694834525923413,
92
+ "scr_metric_threshold_100": 0.22275277451374648,
93
+ "scr_dir2_threshold_100": 0.22045867120535106,
94
+ "scr_dir1_threshold_500": 0.23308094185248288,
95
+ "scr_metric_threshold_500": 0.23783672950733417,
96
+ "scr_dir2_threshold_500": 0.23189713822667268
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
102
+ "scr_dir1_threshold_2": 0.42857088793941894,
103
+ "scr_metric_threshold_2": 0.0,
104
+ "scr_dir2_threshold_2": 0.0,
105
+ "scr_dir1_threshold_5": 0.47619029597980633,
106
+ "scr_metric_threshold_5": 0.0,
107
+ "scr_dir2_threshold_5": 0.0,
108
+ "scr_dir1_threshold_10": 0.46031747533501616,
109
+ "scr_metric_threshold_10": 0.0024571168813214595,
110
+ "scr_dir2_threshold_10": 0.0024571168813214595,
111
+ "scr_dir1_threshold_20": 0.47619029597980633,
112
+ "scr_metric_threshold_20": 0.039312112716319424,
113
+ "scr_dir2_threshold_20": 0.039312112716319424,
114
+ "scr_dir1_threshold_50": 0.42857088793941894,
115
+ "scr_metric_threshold_50": 0.05896816907447914,
116
+ "scr_dir2_threshold_50": 0.05896816907447914,
117
+ "scr_dir1_threshold_100": 0.3968252466498386,
118
+ "scr_metric_threshold_100": 0.09336619447689097,
119
+ "scr_dir2_threshold_100": 0.09336619447689097,
120
+ "scr_dir1_threshold_500": 0.3968252466498386,
121
+ "scr_metric_threshold_500": 0.022113026790745845,
122
+ "scr_dir2_threshold_500": 0.022113026790745845
123
+ },
124
+ {
125
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
126
+ "scr_dir1_threshold_2": 0.2121209384542766,
127
+ "scr_metric_threshold_2": 0.09631722640158302,
128
+ "scr_dir2_threshold_2": 0.09631722640158302,
129
+ "scr_dir1_threshold_5": 0.2121209384542766,
130
+ "scr_metric_threshold_5": 0.13031163099421397,
131
+ "scr_dir2_threshold_5": 0.13031163099421397,
132
+ "scr_dir1_threshold_10": 0.2121209384542766,
133
+ "scr_metric_threshold_10": 0.1671388604233078,
134
+ "scr_dir2_threshold_10": 0.1671388604233078,
135
+ "scr_dir1_threshold_20": 0.2828283193172076,
136
+ "scr_metric_threshold_20": 0.18130315345731438,
137
+ "scr_dir2_threshold_20": 0.18130315345731438,
138
+ "scr_dir1_threshold_50": 0.040403529559094105,
139
+ "scr_metric_threshold_50": 0.24362614411795844,
140
+ "scr_dir2_threshold_50": 0.24362614411795844,
141
+ "scr_dir1_threshold_100": 0.040403529559094105,
142
+ "scr_metric_threshold_100": 0.26345608682489086,
143
+ "scr_dir2_threshold_100": 0.26345608682489086,
144
+ "scr_dir1_threshold_500": -0.2828283193172076,
145
+ "scr_metric_threshold_500": 0.16147304189869,
146
+ "scr_dir2_threshold_500": 0.16147304189869
147
+ },
148
+ {
149
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
150
+ "scr_dir1_threshold_2": 0.5483869727270193,
151
+ "scr_metric_threshold_2": 0.015151620057142186,
152
+ "scr_dir2_threshold_2": 0.015151620057142186,
153
+ "scr_dir1_threshold_5": 0.5483869727270193,
154
+ "scr_metric_threshold_5": 0.012626249703105673,
155
+ "scr_dir2_threshold_5": 0.012626249703105673,
156
+ "scr_dir1_threshold_10": 0.4999995193171997,
157
+ "scr_metric_threshold_10": 0.04040411946335353,
158
+ "scr_dir2_threshold_10": 0.04040411946335353,
159
+ "scr_dir1_threshold_20": 0.532258142045613,
160
+ "scr_metric_threshold_20": 0.050505149329191916,
161
+ "scr_dir2_threshold_20": 0.050505149329191916,
162
+ "scr_dir1_threshold_50": 0.516128349998606,
163
+ "scr_metric_threshold_50": 0.0530303691664592,
164
+ "scr_dir2_threshold_50": 0.0530303691664592,
165
+ "scr_dir1_threshold_100": 0.48387068863579336,
166
+ "scr_metric_threshold_100": 0.08080808840993783,
167
+ "scr_dir2_threshold_100": 0.08080808840993783,
168
+ "scr_dir1_threshold_500": 0.45161302727298075,
169
+ "scr_metric_threshold_500": 0.03030308959751515,
170
+ "scr_dir2_threshold_500": 0.03030308959751515
171
+ },
172
+ {
173
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
174
+ "scr_dir1_threshold_2": 0.25203261487944845,
175
+ "scr_metric_threshold_2": 0.06451600498511631,
176
+ "scr_dir2_threshold_2": 0.06451600498511631,
177
+ "scr_dir1_threshold_5": 0.2845529991791037,
178
+ "scr_metric_threshold_5": 0.09090897967673653,
179
+ "scr_dir2_threshold_5": 0.09090897967673653,
180
+ "scr_dir1_threshold_10": 0.2601625898067356,
181
+ "scr_metric_threshold_10": 0.1524926455550836,
182
+ "scr_dir2_threshold_10": 0.1524926455550836,
183
+ "scr_dir1_threshold_20": 0.25203261487944845,
184
+ "scr_metric_threshold_20": 0.1964808784432174,
185
+ "scr_dir2_threshold_20": 0.1964808784432174,
186
+ "scr_dir1_threshold_50": 0.3089429239609653,
187
+ "scr_metric_threshold_50": 0.23460408353041323,
188
+ "scr_dir2_threshold_50": 0.23460408353041323,
189
+ "scr_dir1_threshold_100": 0.2682925647340228,
190
+ "scr_metric_threshold_100": 0.032258002492558155,
191
+ "scr_dir2_threshold_100": 0.032258002492558155,
192
+ "scr_dir1_threshold_500": 0.39024412700535666,
193
+ "scr_metric_threshold_500": 0.070381207579754,
194
+ "scr_dir2_threshold_500": 0.070381207579754
195
+ },
196
+ {
197
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
198
+ "scr_dir1_threshold_2": 0.016393554752069144,
199
+ "scr_metric_threshold_2": 0.16015625727595642,
200
+ "scr_dir2_threshold_2": 0.16015625727595642,
201
+ "scr_dir1_threshold_5": 0.07650271266233713,
202
+ "scr_metric_threshold_5": 0.3359375436557385,
203
+ "scr_dir2_threshold_5": 0.3359375436557385,
204
+ "scr_dir1_threshold_10": 0.10382508677685816,
205
+ "scr_metric_threshold_10": 0.371093800931695,
206
+ "scr_dir2_threshold_10": 0.371093800931695,
207
+ "scr_dir1_threshold_20": 0.1202186415289273,
208
+ "scr_metric_threshold_20": 0.42578128637978213,
209
+ "scr_dir2_threshold_20": 0.42578128637978213,
210
+ "scr_dir1_threshold_50": 0.10382508677685816,
211
+ "scr_metric_threshold_50": 0.48828116996447934,
212
+ "scr_dir2_threshold_50": 0.48828116996447934,
213
+ "scr_dir1_threshold_100": 0.1803277994391953,
214
+ "scr_metric_threshold_100": 0.5234374272404357,
215
+ "scr_dir2_threshold_100": 0.5234374272404357,
216
+ "scr_dir1_threshold_500": 0.07103830298111119,
217
+ "scr_metric_threshold_500": 0.6054687718278693,
218
+ "scr_dir2_threshold_500": 0.6054687718278693
219
+ },
220
+ {
221
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
222
+ "scr_dir1_threshold_2": 0.03589728071373967,
223
+ "scr_metric_threshold_2": 0.060483807005948444,
224
+ "scr_dir2_threshold_2": 0.060483807005948444,
225
+ "scr_dir1_threshold_5": 0.09743586922067994,
226
+ "scr_metric_threshold_5": 0.0927419335456348,
227
+ "scr_dir2_threshold_5": 0.0927419335456348,
228
+ "scr_dir1_threshold_10": 0.16410229135546164,
229
+ "scr_metric_threshold_10": 0.08870972781349516,
230
+ "scr_dir2_threshold_10": 0.08870972781349516,
231
+ "scr_dir1_threshold_20": 0.1999998777340575,
232
+ "scr_metric_threshold_20": 0.1008065853511987,
233
+ "scr_dir2_threshold_20": 0.1008065853511987,
234
+ "scr_dir1_threshold_50": 0.24102560340535104,
235
+ "scr_metric_threshold_50": 0.19758072462897314,
236
+ "scr_dir2_threshold_50": 0.19758072462897314,
237
+ "scr_dir1_threshold_100": 0.24102560340535104,
238
+ "scr_metric_threshold_100": 0.23790326263293876,
239
+ "scr_dir2_threshold_100": 0.23790326263293876,
240
+ "scr_dir1_threshold_500": 0.3282048883757795,
241
+ "scr_metric_threshold_500": 0.3225807847142943,
242
+ "scr_dir2_threshold_500": 0.3225807847142943
243
+ },
244
+ {
245
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
246
+ "scr_dir1_threshold_2": 0.05855860935384807,
247
+ "scr_metric_threshold_2": 0.214285581239654,
248
+ "scr_dir2_threshold_2": 0.214285581239654,
249
+ "scr_dir1_threshold_5": 0.09459444946519599,
250
+ "scr_metric_threshold_5": 0.2633928820889934,
251
+ "scr_dir2_threshold_5": 0.2633928820889934,
252
+ "scr_dir1_threshold_10": 0.19369361387252446,
253
+ "scr_metric_threshold_10": 0.2946427407275544,
254
+ "scr_dir2_threshold_10": 0.2946427407275544,
255
+ "scr_dir1_threshold_20": 0.22072082956776967,
256
+ "scr_metric_threshold_20": 0.2991072343263093,
257
+ "scr_dir2_threshold_20": 0.2991072343263093,
258
+ "scr_dir1_threshold_50": 0.27477472397948527,
259
+ "scr_metric_threshold_50": 0.3883928488274784,
260
+ "scr_dir2_threshold_50": 0.3883928488274784,
261
+ "scr_dir1_threshold_100": 0.3648647269919362,
262
+ "scr_metric_threshold_100": 0.37053567270882043,
263
+ "scr_dir2_threshold_100": 0.37053567270882043,
264
+ "scr_dir1_threshold_500": 0.24774777677362747,
265
+ "scr_metric_threshold_500": 0.42857142857142855,
266
+ "scr_dir2_threshold_500": 0.42857142857142855
267
+ },
268
+ {
269
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
270
+ "scr_dir1_threshold_2": 0.05579403094120067,
271
+ "scr_metric_threshold_2": 0.05579403094120067,
272
+ "scr_dir2_threshold_2": 0.033333181956485214,
273
+ "scr_dir1_threshold_5": 0.05579403094120067,
274
+ "scr_metric_threshold_5": 0.05579403094120067,
275
+ "scr_dir2_threshold_5": 0.05238083614515829,
276
+ "scr_dir1_threshold_10": 0.06866965525497992,
277
+ "scr_metric_threshold_10": 0.06866965525497992,
278
+ "scr_dir2_threshold_10": 0.07142849033383136,
279
+ "scr_dir1_threshold_20": 0.060085820441166386,
280
+ "scr_metric_threshold_20": 0.060085820441166386,
281
+ "scr_dir2_threshold_20": 0.07142849033383136,
282
+ "scr_dir1_threshold_50": 0.13304726519611204,
283
+ "scr_metric_threshold_50": 0.13304726519611204,
284
+ "scr_dir2_threshold_50": 0.11904748388971893,
285
+ "scr_dir1_threshold_100": 0.18025746132349915,
286
+ "scr_metric_threshold_100": 0.18025746132349915,
287
+ "scr_dir2_threshold_100": 0.1619046348563358,
288
+ "scr_dir1_threshold_500": 0.2618024850783763,
289
+ "scr_metric_threshold_500": 0.2618024850783763,
290
+ "scr_dir2_threshold_500": 0.21428575483308432
291
+ }
292
+ ],
293
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
294
+ "sae_lens_id": "custom_sae",
295
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0",
296
+ "sae_lens_version": "5.4.2",
297
+ "sae_cfg_dict": {
298
+ "model_name": "gemma-2-2b",
299
+ "d_in": 2304,
300
+ "d_sae": 65536,
301
+ "hook_layer": 12,
302
+ "hook_name": "blocks.12.hook_resid_post",
303
+ "context_size": null,
304
+ "hook_head_index": null,
305
+ "architecture": "topk",
306
+ "apply_b_dec_to_input": null,
307
+ "finetuning_scaling_factor": null,
308
+ "activation_fn_str": "",
309
+ "prepend_bos": true,
310
+ "normalize_activations": "none",
311
+ "dtype": "bfloat16",
312
+ "device": "",
313
+ "dataset_path": "",
314
+ "dataset_trust_remote_code": true,
315
+ "seqpos_slice": [
316
+ null
317
+ ],
318
+ "training_tokens": -100000,
319
+ "sae_lens_training_version": null,
320
+ "neuronpedia_id": null
321
+ },
322
+ "eval_result_unstructured": null
323
+ }
eval_results_from_scratch/scr/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "scr",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": true,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "2f284c7a-10d3-4f19-8264-97e91f4bb2f2",
73
+ "datetime_epoch_millis": 1740162428754,
74
+ "eval_result_metrics": {
75
+ "scr_metrics": {
76
+ "scr_dir1_threshold_2": 0.19644145298086652,
77
+ "scr_metric_threshold_2": 0.07734320571232689,
78
+ "scr_dir2_threshold_2": 0.0759023691262068,
79
+ "scr_dir1_threshold_5": 0.2317746907633326,
80
+ "scr_metric_threshold_5": 0.13025074424245692,
81
+ "scr_dir2_threshold_5": 0.13351306106512845,
82
+ "scr_dir1_threshold_10": 0.237756725215616,
83
+ "scr_metric_threshold_10": 0.1905854353466263,
84
+ "scr_dir2_threshold_10": 0.19497179922285185,
85
+ "scr_dir1_threshold_20": 0.24107600003630494,
86
+ "scr_metric_threshold_20": 0.24773476074996353,
87
+ "scr_dir2_threshold_20": 0.257654541907719,
88
+ "scr_dir1_threshold_50": 0.19159755118507713,
89
+ "scr_metric_threshold_50": 0.32827118012785683,
90
+ "scr_dir2_threshold_50": 0.3363388212419233,
91
+ "scr_dir1_threshold_100": 0.0755746565868114,
92
+ "scr_metric_threshold_100": 0.3171029788525398,
93
+ "scr_dir2_threshold_100": 0.3217090588604302,
94
+ "scr_dir1_threshold_500": 0.00475685136491143,
95
+ "scr_metric_threshold_500": 0.2958876177808421,
96
+ "scr_dir2_threshold_500": 0.3115094291710164
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
102
+ "scr_dir1_threshold_2": 0.42857088793941894,
103
+ "scr_metric_threshold_2": 0.009828028179079856,
104
+ "scr_dir2_threshold_2": 0.009828028179079856,
105
+ "scr_dir1_threshold_5": 0.444444654690226,
106
+ "scr_metric_threshold_5": 0.017199085925573582,
107
+ "scr_dir2_threshold_5": 0.017199085925573582,
108
+ "scr_dir1_threshold_10": 0.47619029597980633,
109
+ "scr_metric_threshold_10": 0.04668317046281315,
110
+ "scr_dir2_threshold_10": 0.04668317046281315,
111
+ "scr_dir1_threshold_20": 0.5079359372693867,
112
+ "scr_metric_threshold_20": 0.08845210716298338,
113
+ "scr_dir2_threshold_20": 0.08845210716298338,
114
+ "scr_dir1_threshold_50": 0.47619029597980633,
115
+ "scr_metric_threshold_50": 0.10565119308855696,
116
+ "scr_dir2_threshold_50": 0.10565119308855696,
117
+ "scr_dir1_threshold_100": 0.3809524260050484,
118
+ "scr_metric_threshold_100": 0.14496315935614107,
119
+ "scr_dir2_threshold_100": 0.14496315935614107,
120
+ "scr_dir1_threshold_500": 0.31746019731987085,
121
+ "scr_metric_threshold_500": 0.02948408453723957,
122
+ "scr_dir2_threshold_500": 0.02948408453723957
123
+ },
124
+ {
125
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
126
+ "scr_dir1_threshold_2": 0.19191917367472955,
127
+ "scr_metric_threshold_2": 0.1359772806671398,
128
+ "scr_dir2_threshold_2": 0.1359772806671398,
129
+ "scr_dir1_threshold_5": 0.2828283193172076,
130
+ "scr_metric_threshold_5": 0.21529738919825334,
131
+ "scr_dir2_threshold_5": 0.21529738919825334,
132
+ "scr_dir1_threshold_10": 0.2828283193172076,
133
+ "scr_metric_threshold_10": 0.2832861983835152,
134
+ "scr_dir2_threshold_10": 0.2832861983835152,
135
+ "scr_dir1_threshold_20": 0.2929292017069811,
136
+ "scr_metric_threshold_20": 0.3371105456830785,
137
+ "scr_dir2_threshold_20": 0.3371105456830785,
138
+ "scr_dir1_threshold_50": -0.16161652650540898,
139
+ "scr_metric_threshold_50": 0.4277621224117357,
140
+ "scr_dir2_threshold_50": 0.4277621224117357,
141
+ "scr_dir1_threshold_100": -0.42424308104306946,
142
+ "scr_metric_threshold_100": 0.2096317395253275,
143
+ "scr_dir2_threshold_100": 0.2096317395253275,
144
+ "scr_dir1_threshold_500": -0.6767681511236984,
145
+ "scr_metric_threshold_500": 0.26345608682489086,
146
+ "scr_dir2_threshold_500": 0.26345608682489086
147
+ },
148
+ {
149
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
150
+ "scr_dir1_threshold_2": 0.532258142045613,
151
+ "scr_metric_threshold_2": 0.03030308959751515,
152
+ "scr_dir2_threshold_2": 0.03030308959751515,
153
+ "scr_dir1_threshold_5": 0.532258142045613,
154
+ "scr_metric_threshold_5": 0.0530303691664592,
155
+ "scr_dir2_threshold_5": 0.0530303691664592,
156
+ "scr_dir1_threshold_10": 0.516128349998606,
157
+ "scr_metric_threshold_10": 0.07828286857267056,
158
+ "scr_dir2_threshold_10": 0.07828286857267056,
159
+ "scr_dir1_threshold_20": 0.48387068863579336,
160
+ "scr_metric_threshold_20": 0.13383845757639704,
161
+ "scr_dir2_threshold_20": 0.13383845757639704,
162
+ "scr_dir1_threshold_50": 0.35483812045334157,
163
+ "scr_metric_threshold_50": 0.22979801552670784,
164
+ "scr_dir2_threshold_50": 0.22979801552670784,
165
+ "scr_dir1_threshold_100": 0.3064516284091226,
166
+ "scr_metric_threshold_100": 0.29797985423354,
167
+ "scr_dir2_threshold_100": 0.29797985423354,
168
+ "scr_dir1_threshold_500": 0.14516043749825752,
169
+ "scr_metric_threshold_500": 0.10606058781614919,
170
+ "scr_dir2_threshold_500": 0.10606058781614919
171
+ },
172
+ {
173
+ "dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
174
+ "scr_dir1_threshold_2": 0.2601625898067356,
175
+ "scr_metric_threshold_2": 0.10850441266694984,
176
+ "scr_dir2_threshold_2": 0.10850441266694984,
177
+ "scr_dir1_threshold_5": 0.31707338347875896,
178
+ "scr_metric_threshold_5": 0.17302041765206616,
179
+ "scr_dir2_threshold_5": 0.17302041765206616,
180
+ "scr_dir1_threshold_10": 0.21138225565250596,
181
+ "scr_metric_threshold_10": 0.26099705822203345,
182
+ "scr_dir2_threshold_10": 0.26099705822203345,
183
+ "scr_dir1_threshold_20": 0.13008153719862106,
184
+ "scr_metric_threshold_20": 0.31964803540621184,
185
+ "scr_dir2_threshold_20": 0.31964803540621184,
186
+ "scr_dir1_threshold_50": 0.06504076859931053,
187
+ "scr_metric_threshold_50": 0.3900292429859658,
188
+ "scr_dir2_threshold_50": 0.3900292429859658,
189
+ "scr_dir1_threshold_100": -0.39024364241485016,
190
+ "scr_metric_threshold_100": 0.140762415159508,
191
+ "scr_dir2_threshold_100": 0.140762415159508,
192
+ "scr_dir1_threshold_500": -0.6341462823670114,
193
+ "scr_metric_threshold_500": 0.13489738735857004,
194
+ "scr_dir2_threshold_500": 0.13489738735857004
195
+ },
196
+ {
197
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
198
+ "scr_dir1_threshold_2": 0.0,
199
+ "scr_metric_threshold_2": 0.13671883003552063,
200
+ "scr_dir2_threshold_2": 0.13671883003552063,
201
+ "scr_dir1_threshold_5": 0.01092881936245188,
202
+ "scr_metric_threshold_5": 0.23437497089617432,
203
+ "scr_dir2_threshold_5": 0.23437497089617432,
204
+ "scr_dir1_threshold_10": 0.04918033854781611,
205
+ "scr_metric_threshold_10": 0.3085936845163922,
206
+ "scr_dir2_threshold_10": 0.3085936845163922,
207
+ "scr_dir1_threshold_20": 0.00546440968122594,
208
+ "scr_metric_threshold_20": 0.417968888243172,
209
+ "scr_dir2_threshold_20": 0.417968888243172,
210
+ "scr_dir1_threshold_50": 0.09836067709563222,
211
+ "scr_metric_threshold_50": 0.5351562572759564,
212
+ "scr_dir2_threshold_50": 0.5351562572759564,
213
+ "scr_dir1_threshold_100": -0.016393554752069144,
214
+ "scr_metric_threshold_100": 0.6015625727595643,
215
+ "scr_dir2_threshold_100": 0.6015625727595643,
216
+ "scr_dir1_threshold_500": 0.03825119347697291,
217
+ "scr_metric_threshold_500": 0.6953125145519129,
218
+ "scr_dir2_threshold_500": 0.6953125145519129
219
+ },
220
+ {
221
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
222
+ "scr_dir1_threshold_2": 0.03589728071373967,
223
+ "scr_metric_threshold_2": 0.04838718980952953,
224
+ "scr_dir2_threshold_2": 0.04838718980952953,
225
+ "scr_dir1_threshold_5": 0.10769214780607521,
226
+ "scr_metric_threshold_5": 0.0927419335456348,
227
+ "scr_dir2_threshold_5": 0.0927419335456348,
228
+ "scr_dir1_threshold_10": 0.12820501064172196,
229
+ "scr_metric_threshold_10": 0.1491935348194436,
230
+ "scr_dir2_threshold_10": 0.1491935348194436,
231
+ "scr_dir1_threshold_20": 0.18974359914866223,
232
+ "scr_metric_threshold_20": 0.18951607282340924,
233
+ "scr_dir2_threshold_20": 0.18951607282340924,
234
+ "scr_dir1_threshold_50": 0.22564087986240192,
235
+ "scr_metric_threshold_50": 0.3145161329087304,
236
+ "scr_dir2_threshold_50": 0.3145161329087304,
237
+ "scr_dir1_threshold_100": 0.27179474482639304,
238
+ "scr_metric_threshold_100": 0.43145154118848766,
239
+ "scr_dir2_threshold_100": 0.43145154118848766,
240
+ "scr_dir1_threshold_500": 0.30769233120498896,
241
+ "scr_metric_threshold_500": 0.4354839872619119,
242
+ "scr_dir2_threshold_500": 0.4354839872619119
243
+ },
244
+ {
245
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
246
+ "scr_dir1_threshold_2": 0.05405389441171559,
247
+ "scr_metric_threshold_2": 0.0803571594879004,
248
+ "scr_dir2_threshold_2": 0.0803571594879004,
249
+ "scr_dir1_threshold_5": 0.09459444946519599,
250
+ "scr_metric_threshold_5": 0.19196417761436174,
251
+ "scr_dir2_threshold_5": 0.19196417761436174,
252
+ "scr_dir1_threshold_10": 0.13513500451867638,
253
+ "scr_metric_threshold_10": 0.2946427407275544,
254
+ "scr_dir2_threshold_10": 0.2946427407275544,
255
+ "scr_dir1_threshold_20": 0.20270277526740202,
256
+ "scr_metric_threshold_20": 0.37946412772208915,
257
+ "scr_dir2_threshold_20": 0.37946412772208915,
258
+ "scr_dir1_threshold_50": 0.31981972548571075,
259
+ "scr_metric_threshold_50": 0.4687500083153788,
260
+ "scr_dir2_threshold_50": 0.4687500083153788,
261
+ "scr_dir1_threshold_100": 0.2702702775267402,
262
+ "scr_metric_threshold_100": 0.504464094460574,
263
+ "scr_dir2_threshold_100": 0.504464094460574,
264
+ "scr_dir1_threshold_500": 0.35585583408644605,
265
+ "scr_metric_threshold_500": 0.5178570430725976,
266
+ "scr_dir2_threshold_500": 0.5178570430725976
267
+ },
268
+ {
269
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
270
+ "scr_dir1_threshold_2": 0.06866965525497992,
271
+ "scr_metric_threshold_2": 0.06866965525497992,
272
+ "scr_dir2_threshold_2": 0.05714296256601923,
273
+ "scr_dir1_threshold_5": 0.0643776099411321,
274
+ "scr_metric_threshold_5": 0.0643776099411321,
275
+ "scr_dir2_threshold_5": 0.09047614452250444,
276
+ "scr_dir1_threshold_10": 0.10300422706858779,
277
+ "scr_metric_threshold_10": 0.10300422706858779,
278
+ "scr_dir2_threshold_10": 0.13809513807839202,
279
+ "scr_dir1_threshold_20": 0.11587985138236706,
280
+ "scr_metric_threshold_20": 0.11587985138236706,
281
+ "scr_dir2_threshold_20": 0.19523810064441124,
282
+ "scr_dir1_threshold_50": 0.15450646850982275,
283
+ "scr_metric_threshold_50": 0.15450646850982275,
284
+ "scr_dir2_threshold_50": 0.21904759742235502,
285
+ "scr_dir1_threshold_100": 0.20600845413717558,
286
+ "scr_metric_threshold_100": 0.20600845413717558,
287
+ "scr_dir2_threshold_100": 0.24285709420029883,
288
+ "scr_dir1_threshold_500": 0.18454925082346488,
289
+ "scr_metric_threshold_500": 0.18454925082346488,
290
+ "scr_dir2_threshold_500": 0.3095237419448595
291
+ }
292
+ ],
293
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
294
+ "sae_lens_id": "custom_sae",
295
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0",
296
+ "sae_lens_version": "5.4.2",
297
+ "sae_cfg_dict": {
298
+ "model_name": "gemma-2-2b",
299
+ "d_in": 2304,
300
+ "d_sae": 65536,
301
+ "hook_layer": 12,
302
+ "hook_name": "blocks.12.hook_resid_post",
303
+ "context_size": null,
304
+ "hook_head_index": null,
305
+ "architecture": "topk",
306
+ "apply_b_dec_to_input": null,
307
+ "finetuning_scaling_factor": null,
308
+ "activation_fn_str": "",
309
+ "prepend_bos": true,
310
+ "normalize_activations": "none",
311
+ "dtype": "bfloat16",
312
+ "device": "",
313
+ "dataset_path": "",
314
+ "dataset_trust_remote_code": true,
315
+ "seqpos_slice": [
316
+ null
317
+ ],
318
+ "training_tokens": -100000,
319
+ "sae_lens_training_version": null,
320
+ "neuronpedia_id": null
321
+ },
322
+ "eval_result_unstructured": null
323
+ }
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,670 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "sparse_probing",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "LabHC/bias_in_bios_class_set2",
8
+ "LabHC/bias_in_bios_class_set3",
9
+ "canrager/amazon_reviews_mcauley_1and5",
10
+ "canrager/amazon_reviews_mcauley_1and5_sentiment",
11
+ "codeparrot/github-code",
12
+ "fancyzhx/ag_news",
13
+ "Helsinki-NLP/europarl"
14
+ ],
15
+ "probe_train_set_size": 4000,
16
+ "probe_test_set_size": 1000,
17
+ "context_length": 128,
18
+ "sae_batch_size": 125,
19
+ "llm_batch_size": 32,
20
+ "llm_dtype": "bfloat16",
21
+ "model_name": "gemma-2-2b",
22
+ "k_values": [
23
+ 1,
24
+ 2,
25
+ 5
26
+ ],
27
+ "lower_vram_usage": false
28
+ },
29
+ "eval_id": "fe2f63b4-1d22-4879-9af8-4e98fff7d830",
30
+ "datetime_epoch_millis": 1740164930002,
31
+ "eval_result_metrics": {
32
+ "llm": {
33
+ "llm_test_accuracy": 0.9595375448465346,
34
+ "llm_top_1_test_accuracy": 0.64956875,
35
+ "llm_top_2_test_accuracy": 0.72589375,
36
+ "llm_top_5_test_accuracy": 0.78265625,
37
+ "llm_top_10_test_accuracy": null,
38
+ "llm_top_20_test_accuracy": null,
39
+ "llm_top_50_test_accuracy": null,
40
+ "llm_top_100_test_accuracy": null
41
+ },
42
+ "sae": {
43
+ "sae_test_accuracy": 0.9598625473678112,
44
+ "sae_top_1_test_accuracy": 0.7501000000000001,
45
+ "sae_top_2_test_accuracy": 0.7998875,
46
+ "sae_top_5_test_accuracy": 0.8547375,
47
+ "sae_top_10_test_accuracy": null,
48
+ "sae_top_20_test_accuracy": null,
49
+ "sae_top_50_test_accuracy": null,
50
+ "sae_top_100_test_accuracy": null
51
+ }
52
+ },
53
+ "eval_result_details": [
54
+ {
55
+ "dataset_name": "LabHC/bias_in_bios_class_set1_results",
56
+ "llm_test_accuracy": 0.966800057888031,
57
+ "llm_top_1_test_accuracy": 0.6397999999999999,
58
+ "llm_top_2_test_accuracy": 0.6954,
59
+ "llm_top_5_test_accuracy": 0.7869999999999999,
60
+ "llm_top_10_test_accuracy": null,
61
+ "llm_top_20_test_accuracy": null,
62
+ "llm_top_50_test_accuracy": null,
63
+ "llm_top_100_test_accuracy": null,
64
+ "sae_test_accuracy": 0.9674000382423401,
65
+ "sae_top_1_test_accuracy": 0.7148000000000001,
66
+ "sae_top_2_test_accuracy": 0.7748,
67
+ "sae_top_5_test_accuracy": 0.8404,
68
+ "sae_top_10_test_accuracy": null,
69
+ "sae_top_20_test_accuracy": null,
70
+ "sae_top_50_test_accuracy": null,
71
+ "sae_top_100_test_accuracy": null
72
+ },
73
+ {
74
+ "dataset_name": "LabHC/bias_in_bios_class_set2_results",
75
+ "llm_test_accuracy": 0.9578000426292419,
76
+ "llm_top_1_test_accuracy": 0.6694000000000001,
77
+ "llm_top_2_test_accuracy": 0.725,
78
+ "llm_top_5_test_accuracy": 0.7654,
79
+ "llm_top_10_test_accuracy": null,
80
+ "llm_top_20_test_accuracy": null,
81
+ "llm_top_50_test_accuracy": null,
82
+ "llm_top_100_test_accuracy": null,
83
+ "sae_test_accuracy": 0.9522000312805176,
84
+ "sae_top_1_test_accuracy": 0.712,
85
+ "sae_top_2_test_accuracy": 0.7488,
86
+ "sae_top_5_test_accuracy": 0.8006,
87
+ "sae_top_10_test_accuracy": null,
88
+ "sae_top_20_test_accuracy": null,
89
+ "sae_top_50_test_accuracy": null,
90
+ "sae_top_100_test_accuracy": null
91
+ },
92
+ {
93
+ "dataset_name": "LabHC/bias_in_bios_class_set3_results",
94
+ "llm_test_accuracy": 0.9316000461578369,
95
+ "llm_top_1_test_accuracy": 0.687,
96
+ "llm_top_2_test_accuracy": 0.7492,
97
+ "llm_top_5_test_accuracy": 0.7704000000000001,
98
+ "llm_top_10_test_accuracy": null,
99
+ "llm_top_20_test_accuracy": null,
100
+ "llm_top_50_test_accuracy": null,
101
+ "llm_top_100_test_accuracy": null,
102
+ "sae_test_accuracy": 0.935800039768219,
103
+ "sae_top_1_test_accuracy": 0.6752,
104
+ "sae_top_2_test_accuracy": 0.7123999999999999,
105
+ "sae_top_5_test_accuracy": 0.797,
106
+ "sae_top_10_test_accuracy": null,
107
+ "sae_top_20_test_accuracy": null,
108
+ "sae_top_50_test_accuracy": null,
109
+ "sae_top_100_test_accuracy": null
110
+ },
111
+ {
112
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
113
+ "llm_test_accuracy": 0.9202000379562378,
114
+ "llm_top_1_test_accuracy": 0.599,
115
+ "llm_top_2_test_accuracy": 0.6474,
116
+ "llm_top_5_test_accuracy": 0.6734,
117
+ "llm_top_10_test_accuracy": null,
118
+ "llm_top_20_test_accuracy": null,
119
+ "llm_top_50_test_accuracy": null,
120
+ "llm_top_100_test_accuracy": null,
121
+ "sae_test_accuracy": 0.9280000567436218,
122
+ "sae_top_1_test_accuracy": 0.7084,
123
+ "sae_top_2_test_accuracy": 0.7418000000000001,
124
+ "sae_top_5_test_accuracy": 0.7746,
125
+ "sae_top_10_test_accuracy": null,
126
+ "sae_top_20_test_accuracy": null,
127
+ "sae_top_50_test_accuracy": null,
128
+ "sae_top_100_test_accuracy": null
129
+ },
130
+ {
131
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
132
+ "llm_test_accuracy": 0.9795000553131104,
133
+ "llm_top_1_test_accuracy": 0.673,
134
+ "llm_top_2_test_accuracy": 0.724,
135
+ "llm_top_5_test_accuracy": 0.766,
136
+ "llm_top_10_test_accuracy": null,
137
+ "llm_top_20_test_accuracy": null,
138
+ "llm_top_50_test_accuracy": null,
139
+ "llm_top_100_test_accuracy": null,
140
+ "sae_test_accuracy": 0.9715000689029694,
141
+ "sae_top_1_test_accuracy": 0.764,
142
+ "sae_top_2_test_accuracy": 0.898,
143
+ "sae_top_5_test_accuracy": 0.921,
144
+ "sae_top_10_test_accuracy": null,
145
+ "sae_top_20_test_accuracy": null,
146
+ "sae_top_50_test_accuracy": null,
147
+ "sae_top_100_test_accuracy": null
148
+ },
149
+ {
150
+ "dataset_name": "codeparrot/github-code_results",
151
+ "llm_test_accuracy": 0.9708000421524048,
152
+ "llm_top_1_test_accuracy": 0.6451999999999999,
153
+ "llm_top_2_test_accuracy": 0.6960000000000001,
154
+ "llm_top_5_test_accuracy": 0.7766,
155
+ "llm_top_10_test_accuracy": null,
156
+ "llm_top_20_test_accuracy": null,
157
+ "llm_top_50_test_accuracy": null,
158
+ "llm_top_100_test_accuracy": null,
159
+ "sae_test_accuracy": 0.9712000489234924,
160
+ "sae_top_1_test_accuracy": 0.6984,
161
+ "sae_top_2_test_accuracy": 0.7142000000000001,
162
+ "sae_top_5_test_accuracy": 0.8362,
163
+ "sae_top_10_test_accuracy": null,
164
+ "sae_top_20_test_accuracy": null,
165
+ "sae_top_50_test_accuracy": null,
166
+ "sae_top_100_test_accuracy": null
167
+ },
168
+ {
169
+ "dataset_name": "fancyzhx/ag_news_results",
170
+ "llm_test_accuracy": 0.9500000476837158,
171
+ "llm_top_1_test_accuracy": 0.63775,
172
+ "llm_top_2_test_accuracy": 0.78175,
173
+ "llm_top_5_test_accuracy": 0.82125,
174
+ "llm_top_10_test_accuracy": null,
175
+ "llm_top_20_test_accuracy": null,
176
+ "llm_top_50_test_accuracy": null,
177
+ "llm_top_100_test_accuracy": null,
178
+ "sae_test_accuracy": 0.9540000557899475,
179
+ "sae_top_1_test_accuracy": 0.813,
180
+ "sae_top_2_test_accuracy": 0.8325,
181
+ "sae_top_5_test_accuracy": 0.8765000000000001,
182
+ "sae_top_10_test_accuracy": null,
183
+ "sae_top_20_test_accuracy": null,
184
+ "sae_top_50_test_accuracy": null,
185
+ "sae_top_100_test_accuracy": null
186
+ },
187
+ {
188
+ "dataset_name": "Helsinki-NLP/europarl_results",
189
+ "llm_test_accuracy": 0.9996000289916992,
190
+ "llm_top_1_test_accuracy": 0.6454,
191
+ "llm_top_2_test_accuracy": 0.7884,
192
+ "llm_top_5_test_accuracy": 0.9012,
193
+ "llm_top_10_test_accuracy": null,
194
+ "llm_top_20_test_accuracy": null,
195
+ "llm_top_50_test_accuracy": null,
196
+ "llm_top_100_test_accuracy": null,
197
+ "sae_test_accuracy": 0.9988000392913818,
198
+ "sae_top_1_test_accuracy": 0.915,
199
+ "sae_top_2_test_accuracy": 0.9766,
200
+ "sae_top_5_test_accuracy": 0.9916,
201
+ "sae_top_10_test_accuracy": null,
202
+ "sae_top_20_test_accuracy": null,
203
+ "sae_top_50_test_accuracy": null,
204
+ "sae_top_100_test_accuracy": null
205
+ }
206
+ ],
207
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
208
+ "sae_lens_id": "custom_sae",
209
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0",
210
+ "sae_lens_version": "5.4.2",
211
+ "sae_cfg_dict": {
212
+ "model_name": "gemma-2-2b",
213
+ "d_in": 2304,
214
+ "d_sae": 65536,
215
+ "hook_layer": 12,
216
+ "hook_name": "blocks.12.hook_resid_post",
217
+ "context_size": null,
218
+ "hook_head_index": null,
219
+ "architecture": "topk",
220
+ "apply_b_dec_to_input": null,
221
+ "finetuning_scaling_factor": null,
222
+ "activation_fn_str": "",
223
+ "prepend_bos": true,
224
+ "normalize_activations": "none",
225
+ "dtype": "bfloat16",
226
+ "device": "",
227
+ "dataset_path": "",
228
+ "dataset_trust_remote_code": true,
229
+ "seqpos_slice": [
230
+ null
231
+ ],
232
+ "training_tokens": -100000,
233
+ "sae_lens_training_version": null,
234
+ "neuronpedia_id": null
235
+ },
236
+ "eval_result_unstructured": {
237
+ "LabHC/bias_in_bios_class_set1_results": {
238
+ "sae_test_accuracy": {
239
+ "0": 0.9500000476837158,
240
+ "1": 0.9700000286102295,
241
+ "2": 0.9520000219345093,
242
+ "6": 0.9880000352859497,
243
+ "9": 0.9770000576972961
244
+ },
245
+ "llm_test_accuracy": {
246
+ "0": 0.9510000348091125,
247
+ "1": 0.9670000672340393,
248
+ "2": 0.9530000686645508,
249
+ "6": 0.987000048160553,
250
+ "9": 0.9760000705718994
251
+ },
252
+ "llm_top_1_test_accuracy": {
253
+ "0": 0.577,
254
+ "1": 0.613,
255
+ "2": 0.662,
256
+ "6": 0.787,
257
+ "9": 0.56
258
+ },
259
+ "llm_top_2_test_accuracy": {
260
+ "0": 0.574,
261
+ "1": 0.66,
262
+ "2": 0.718,
263
+ "6": 0.811,
264
+ "9": 0.714
265
+ },
266
+ "llm_top_5_test_accuracy": {
267
+ "0": 0.713,
268
+ "1": 0.711,
269
+ "2": 0.755,
270
+ "6": 0.895,
271
+ "9": 0.861
272
+ },
273
+ "sae_top_1_test_accuracy": {
274
+ "0": 0.679,
275
+ "1": 0.57,
276
+ "2": 0.622,
277
+ "6": 0.76,
278
+ "9": 0.943
279
+ },
280
+ "sae_top_2_test_accuracy": {
281
+ "0": 0.676,
282
+ "1": 0.565,
283
+ "2": 0.924,
284
+ "6": 0.762,
285
+ "9": 0.947
286
+ },
287
+ "sae_top_5_test_accuracy": {
288
+ "0": 0.828,
289
+ "1": 0.63,
290
+ "2": 0.924,
291
+ "6": 0.878,
292
+ "9": 0.942
293
+ }
294
+ },
295
+ "LabHC/bias_in_bios_class_set2_results": {
296
+ "sae_test_accuracy": {
297
+ "11": 0.9570000171661377,
298
+ "13": 0.9520000219345093,
299
+ "14": 0.9520000219345093,
300
+ "18": 0.9320000410079956,
301
+ "19": 0.968000054359436
302
+ },
303
+ "llm_test_accuracy": {
304
+ "11": 0.9690000414848328,
305
+ "13": 0.9600000381469727,
306
+ "14": 0.9600000381469727,
307
+ "18": 0.9390000700950623,
308
+ "19": 0.9610000252723694
309
+ },
310
+ "llm_top_1_test_accuracy": {
311
+ "11": 0.555,
312
+ "13": 0.668,
313
+ "14": 0.638,
314
+ "18": 0.69,
315
+ "19": 0.796
316
+ },
317
+ "llm_top_2_test_accuracy": {
318
+ "11": 0.756,
319
+ "13": 0.714,
320
+ "14": 0.67,
321
+ "18": 0.717,
322
+ "19": 0.768
323
+ },
324
+ "llm_top_5_test_accuracy": {
325
+ "11": 0.794,
326
+ "13": 0.749,
327
+ "14": 0.723,
328
+ "18": 0.73,
329
+ "19": 0.831
330
+ },
331
+ "sae_top_1_test_accuracy": {
332
+ "11": 0.86,
333
+ "13": 0.646,
334
+ "14": 0.73,
335
+ "18": 0.628,
336
+ "19": 0.696
337
+ },
338
+ "sae_top_2_test_accuracy": {
339
+ "11": 0.854,
340
+ "13": 0.708,
341
+ "14": 0.728,
342
+ "18": 0.669,
343
+ "19": 0.785
344
+ },
345
+ "sae_top_5_test_accuracy": {
346
+ "11": 0.861,
347
+ "13": 0.728,
348
+ "14": 0.862,
349
+ "18": 0.714,
350
+ "19": 0.838
351
+ }
352
+ },
353
+ "LabHC/bias_in_bios_class_set3_results": {
354
+ "sae_test_accuracy": {
355
+ "20": 0.9550000429153442,
356
+ "21": 0.9260000586509705,
357
+ "22": 0.9290000200271606,
358
+ "25": 0.9750000238418579,
359
+ "26": 0.8940000534057617
360
+ },
361
+ "llm_test_accuracy": {
362
+ "20": 0.956000030040741,
363
+ "21": 0.9350000619888306,
364
+ "22": 0.9180000424385071,
365
+ "25": 0.9640000462532043,
366
+ "26": 0.8850000500679016
367
+ },
368
+ "llm_top_1_test_accuracy": {
369
+ "20": 0.693,
370
+ "21": 0.775,
371
+ "22": 0.645,
372
+ "25": 0.706,
373
+ "26": 0.616
374
+ },
375
+ "llm_top_2_test_accuracy": {
376
+ "20": 0.827,
377
+ "21": 0.761,
378
+ "22": 0.694,
379
+ "25": 0.778,
380
+ "26": 0.686
381
+ },
382
+ "llm_top_5_test_accuracy": {
383
+ "20": 0.855,
384
+ "21": 0.791,
385
+ "22": 0.725,
386
+ "25": 0.809,
387
+ "26": 0.672
388
+ },
389
+ "sae_top_1_test_accuracy": {
390
+ "20": 0.869,
391
+ "21": 0.59,
392
+ "22": 0.661,
393
+ "25": 0.633,
394
+ "26": 0.623
395
+ },
396
+ "sae_top_2_test_accuracy": {
397
+ "20": 0.861,
398
+ "21": 0.616,
399
+ "22": 0.689,
400
+ "25": 0.665,
401
+ "26": 0.731
402
+ },
403
+ "sae_top_5_test_accuracy": {
404
+ "20": 0.914,
405
+ "21": 0.831,
406
+ "22": 0.719,
407
+ "25": 0.765,
408
+ "26": 0.756
409
+ }
410
+ },
411
+ "canrager/amazon_reviews_mcauley_1and5_results": {
412
+ "sae_test_accuracy": {
413
+ "1": 0.9530000686645508,
414
+ "2": 0.940000057220459,
415
+ "3": 0.9160000681877136,
416
+ "5": 0.9390000700950623,
417
+ "6": 0.8920000195503235
418
+ },
419
+ "llm_test_accuracy": {
420
+ "1": 0.9580000638961792,
421
+ "2": 0.9330000281333923,
422
+ "3": 0.9280000329017639,
423
+ "5": 0.9200000166893005,
424
+ "6": 0.862000048160553
425
+ },
426
+ "llm_top_1_test_accuracy": {
427
+ "1": 0.647,
428
+ "2": 0.603,
429
+ "3": 0.598,
430
+ "5": 0.555,
431
+ "6": 0.592
432
+ },
433
+ "llm_top_2_test_accuracy": {
434
+ "1": 0.75,
435
+ "2": 0.648,
436
+ "3": 0.607,
437
+ "5": 0.606,
438
+ "6": 0.626
439
+ },
440
+ "llm_top_5_test_accuracy": {
441
+ "1": 0.767,
442
+ "2": 0.641,
443
+ "3": 0.645,
444
+ "5": 0.638,
445
+ "6": 0.676
446
+ },
447
+ "sae_top_1_test_accuracy": {
448
+ "1": 0.854,
449
+ "2": 0.806,
450
+ "3": 0.667,
451
+ "5": 0.543,
452
+ "6": 0.672
453
+ },
454
+ "sae_top_2_test_accuracy": {
455
+ "1": 0.891,
456
+ "2": 0.793,
457
+ "3": 0.753,
458
+ "5": 0.592,
459
+ "6": 0.68
460
+ },
461
+ "sae_top_5_test_accuracy": {
462
+ "1": 0.885,
463
+ "2": 0.809,
464
+ "3": 0.758,
465
+ "5": 0.703,
466
+ "6": 0.718
467
+ }
468
+ },
469
+ "canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
470
+ "sae_test_accuracy": {
471
+ "1.0": 0.971000075340271,
472
+ "5.0": 0.9720000624656677
473
+ },
474
+ "llm_test_accuracy": {
475
+ "1.0": 0.9780000448226929,
476
+ "5.0": 0.9810000658035278
477
+ },
478
+ "llm_top_1_test_accuracy": {
479
+ "1.0": 0.673,
480
+ "5.0": 0.673
481
+ },
482
+ "llm_top_2_test_accuracy": {
483
+ "1.0": 0.724,
484
+ "5.0": 0.724
485
+ },
486
+ "llm_top_5_test_accuracy": {
487
+ "1.0": 0.766,
488
+ "5.0": 0.766
489
+ },
490
+ "sae_top_1_test_accuracy": {
491
+ "1.0": 0.764,
492
+ "5.0": 0.764
493
+ },
494
+ "sae_top_2_test_accuracy": {
495
+ "1.0": 0.898,
496
+ "5.0": 0.898
497
+ },
498
+ "sae_top_5_test_accuracy": {
499
+ "1.0": 0.921,
500
+ "5.0": 0.921
501
+ }
502
+ },
503
+ "codeparrot/github-code_results": {
504
+ "sae_test_accuracy": {
505
+ "C": 0.9530000686645508,
506
+ "Python": 0.984000027179718,
507
+ "HTML": 0.9860000610351562,
508
+ "Java": 0.9750000238418579,
509
+ "PHP": 0.9580000638961792
510
+ },
511
+ "llm_test_accuracy": {
512
+ "C": 0.956000030040741,
513
+ "Python": 0.984000027179718,
514
+ "HTML": 0.9900000691413879,
515
+ "Java": 0.9670000672340393,
516
+ "PHP": 0.9570000171661377
517
+ },
518
+ "llm_top_1_test_accuracy": {
519
+ "C": 0.666,
520
+ "Python": 0.626,
521
+ "HTML": 0.721,
522
+ "Java": 0.619,
523
+ "PHP": 0.594
524
+ },
525
+ "llm_top_2_test_accuracy": {
526
+ "C": 0.679,
527
+ "Python": 0.674,
528
+ "HTML": 0.8,
529
+ "Java": 0.676,
530
+ "PHP": 0.651
531
+ },
532
+ "llm_top_5_test_accuracy": {
533
+ "C": 0.783,
534
+ "Python": 0.717,
535
+ "HTML": 0.935,
536
+ "Java": 0.733,
537
+ "PHP": 0.715
538
+ },
539
+ "sae_top_1_test_accuracy": {
540
+ "C": 0.598,
541
+ "Python": 0.566,
542
+ "HTML": 0.742,
543
+ "Java": 0.656,
544
+ "PHP": 0.93
545
+ },
546
+ "sae_top_2_test_accuracy": {
547
+ "C": 0.654,
548
+ "Python": 0.612,
549
+ "HTML": 0.738,
550
+ "Java": 0.637,
551
+ "PHP": 0.93
552
+ },
553
+ "sae_top_5_test_accuracy": {
554
+ "C": 0.704,
555
+ "Python": 0.917,
556
+ "HTML": 0.938,
557
+ "Java": 0.688,
558
+ "PHP": 0.934
559
+ }
560
+ },
561
+ "fancyzhx/ag_news_results": {
562
+ "sae_test_accuracy": {
563
+ "0": 0.9500000476837158,
564
+ "1": 0.987000048160553,
565
+ "2": 0.9350000619888306,
566
+ "3": 0.9440000653266907
567
+ },
568
+ "llm_test_accuracy": {
569
+ "0": 0.940000057220459,
570
+ "1": 0.9860000610351562,
571
+ "2": 0.9200000166893005,
572
+ "3": 0.9540000557899475
573
+ },
574
+ "llm_top_1_test_accuracy": {
575
+ "0": 0.573,
576
+ "1": 0.671,
577
+ "2": 0.672,
578
+ "3": 0.635
579
+ },
580
+ "llm_top_2_test_accuracy": {
581
+ "0": 0.802,
582
+ "1": 0.808,
583
+ "2": 0.701,
584
+ "3": 0.816
585
+ },
586
+ "llm_top_5_test_accuracy": {
587
+ "0": 0.81,
588
+ "1": 0.891,
589
+ "2": 0.752,
590
+ "3": 0.832
591
+ },
592
+ "sae_top_1_test_accuracy": {
593
+ "0": 0.793,
594
+ "1": 0.972,
595
+ "2": 0.817,
596
+ "3": 0.67
597
+ },
598
+ "sae_top_2_test_accuracy": {
599
+ "0": 0.838,
600
+ "1": 0.972,
601
+ "2": 0.818,
602
+ "3": 0.702
603
+ },
604
+ "sae_top_5_test_accuracy": {
605
+ "0": 0.871,
606
+ "1": 0.976,
607
+ "2": 0.821,
608
+ "3": 0.838
609
+ }
610
+ },
611
+ "Helsinki-NLP/europarl_results": {
612
+ "sae_test_accuracy": {
613
+ "en": 0.999000072479248,
614
+ "fr": 1.0,
615
+ "de": 1.0,
616
+ "es": 0.999000072479248,
617
+ "nl": 0.9960000514984131
618
+ },
619
+ "llm_test_accuracy": {
620
+ "en": 1.0,
621
+ "fr": 0.999000072479248,
622
+ "de": 1.0,
623
+ "es": 0.999000072479248,
624
+ "nl": 1.0
625
+ },
626
+ "llm_top_1_test_accuracy": {
627
+ "en": 0.739,
628
+ "fr": 0.585,
629
+ "de": 0.758,
630
+ "es": 0.496,
631
+ "nl": 0.649
632
+ },
633
+ "llm_top_2_test_accuracy": {
634
+ "en": 0.829,
635
+ "fr": 0.582,
636
+ "de": 0.82,
637
+ "es": 0.958,
638
+ "nl": 0.753
639
+ },
640
+ "llm_top_5_test_accuracy": {
641
+ "en": 0.892,
642
+ "fr": 0.888,
643
+ "de": 0.894,
644
+ "es": 0.98,
645
+ "nl": 0.852
646
+ },
647
+ "sae_top_1_test_accuracy": {
648
+ "en": 0.996,
649
+ "fr": 0.994,
650
+ "de": 0.926,
651
+ "es": 0.839,
652
+ "nl": 0.82
653
+ },
654
+ "sae_top_2_test_accuracy": {
655
+ "en": 0.996,
656
+ "fr": 0.995,
657
+ "de": 0.959,
658
+ "es": 0.938,
659
+ "nl": 0.995
660
+ },
661
+ "sae_top_5_test_accuracy": {
662
+ "en": 0.996,
663
+ "fr": 0.997,
664
+ "de": 0.975,
665
+ "es": 0.995,
666
+ "nl": 0.995
667
+ }
668
+ }
669
+ }
670
+ }
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,670 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "sparse_probing",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "LabHC/bias_in_bios_class_set2",
8
+ "LabHC/bias_in_bios_class_set3",
9
+ "canrager/amazon_reviews_mcauley_1and5",
10
+ "canrager/amazon_reviews_mcauley_1and5_sentiment",
11
+ "codeparrot/github-code",
12
+ "fancyzhx/ag_news",
13
+ "Helsinki-NLP/europarl"
14
+ ],
15
+ "probe_train_set_size": 4000,
16
+ "probe_test_set_size": 1000,
17
+ "context_length": 128,
18
+ "sae_batch_size": 125,
19
+ "llm_batch_size": 32,
20
+ "llm_dtype": "bfloat16",
21
+ "model_name": "gemma-2-2b",
22
+ "k_values": [
23
+ 1,
24
+ 2,
25
+ 5
26
+ ],
27
+ "lower_vram_usage": false
28
+ },
29
+ "eval_id": "5fcf2fd9-d1b6-4284-8363-b605ba094c6c",
30
+ "datetime_epoch_millis": 1740164658660,
31
+ "eval_result_metrics": {
32
+ "llm": {
33
+ "llm_test_accuracy": 0.9595375448465346,
34
+ "llm_top_1_test_accuracy": 0.64956875,
35
+ "llm_top_2_test_accuracy": 0.72589375,
36
+ "llm_top_5_test_accuracy": 0.78265625,
37
+ "llm_top_10_test_accuracy": null,
38
+ "llm_top_20_test_accuracy": null,
39
+ "llm_top_50_test_accuracy": null,
40
+ "llm_top_100_test_accuracy": null
41
+ },
42
+ "sae": {
43
+ "sae_test_accuracy": 0.9564437940716745,
44
+ "sae_top_1_test_accuracy": 0.68616875,
45
+ "sae_top_2_test_accuracy": 0.7763499999999999,
46
+ "sae_top_5_test_accuracy": 0.87125,
47
+ "sae_top_10_test_accuracy": null,
48
+ "sae_top_20_test_accuracy": null,
49
+ "sae_top_50_test_accuracy": null,
50
+ "sae_top_100_test_accuracy": null
51
+ }
52
+ },
53
+ "eval_result_details": [
54
+ {
55
+ "dataset_name": "LabHC/bias_in_bios_class_set1_results",
56
+ "llm_test_accuracy": 0.966800057888031,
57
+ "llm_top_1_test_accuracy": 0.6397999999999999,
58
+ "llm_top_2_test_accuracy": 0.6954,
59
+ "llm_top_5_test_accuracy": 0.7869999999999999,
60
+ "llm_top_10_test_accuracy": null,
61
+ "llm_top_20_test_accuracy": null,
62
+ "llm_top_50_test_accuracy": null,
63
+ "llm_top_100_test_accuracy": null,
64
+ "sae_test_accuracy": 0.9612000465393067,
65
+ "sae_top_1_test_accuracy": 0.6982,
66
+ "sae_top_2_test_accuracy": 0.8256,
67
+ "sae_top_5_test_accuracy": 0.8992000000000001,
68
+ "sae_top_10_test_accuracy": null,
69
+ "sae_top_20_test_accuracy": null,
70
+ "sae_top_50_test_accuracy": null,
71
+ "sae_top_100_test_accuracy": null
72
+ },
73
+ {
74
+ "dataset_name": "LabHC/bias_in_bios_class_set2_results",
75
+ "llm_test_accuracy": 0.9578000426292419,
76
+ "llm_top_1_test_accuracy": 0.6694000000000001,
77
+ "llm_top_2_test_accuracy": 0.725,
78
+ "llm_top_5_test_accuracy": 0.7654,
79
+ "llm_top_10_test_accuracy": null,
80
+ "llm_top_20_test_accuracy": null,
81
+ "llm_top_50_test_accuracy": null,
82
+ "llm_top_100_test_accuracy": null,
83
+ "sae_test_accuracy": 0.9466000437736511,
84
+ "sae_top_1_test_accuracy": 0.6604,
85
+ "sae_top_2_test_accuracy": 0.7849999999999999,
86
+ "sae_top_5_test_accuracy": 0.8326,
87
+ "sae_top_10_test_accuracy": null,
88
+ "sae_top_20_test_accuracy": null,
89
+ "sae_top_50_test_accuracy": null,
90
+ "sae_top_100_test_accuracy": null
91
+ },
92
+ {
93
+ "dataset_name": "LabHC/bias_in_bios_class_set3_results",
94
+ "llm_test_accuracy": 0.9316000461578369,
95
+ "llm_top_1_test_accuracy": 0.687,
96
+ "llm_top_2_test_accuracy": 0.7492,
97
+ "llm_top_5_test_accuracy": 0.7704000000000001,
98
+ "llm_top_10_test_accuracy": null,
99
+ "llm_top_20_test_accuracy": null,
100
+ "llm_top_50_test_accuracy": null,
101
+ "llm_top_100_test_accuracy": null,
102
+ "sae_test_accuracy": 0.9314000368118286,
103
+ "sae_top_1_test_accuracy": 0.6974,
104
+ "sae_top_2_test_accuracy": 0.7984,
105
+ "sae_top_5_test_accuracy": 0.853,
106
+ "sae_top_10_test_accuracy": null,
107
+ "sae_top_20_test_accuracy": null,
108
+ "sae_top_50_test_accuracy": null,
109
+ "sae_top_100_test_accuracy": null
110
+ },
111
+ {
112
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
113
+ "llm_test_accuracy": 0.9202000379562378,
114
+ "llm_top_1_test_accuracy": 0.599,
115
+ "llm_top_2_test_accuracy": 0.6474,
116
+ "llm_top_5_test_accuracy": 0.6734,
117
+ "llm_top_10_test_accuracy": null,
118
+ "llm_top_20_test_accuracy": null,
119
+ "llm_top_50_test_accuracy": null,
120
+ "llm_top_100_test_accuracy": null,
121
+ "sae_test_accuracy": 0.9216000437736511,
122
+ "sae_top_1_test_accuracy": 0.6776000000000001,
123
+ "sae_top_2_test_accuracy": 0.7224,
124
+ "sae_top_5_test_accuracy": 0.8231999999999999,
125
+ "sae_top_10_test_accuracy": null,
126
+ "sae_top_20_test_accuracy": null,
127
+ "sae_top_50_test_accuracy": null,
128
+ "sae_top_100_test_accuracy": null
129
+ },
130
+ {
131
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
132
+ "llm_test_accuracy": 0.9795000553131104,
133
+ "llm_top_1_test_accuracy": 0.673,
134
+ "llm_top_2_test_accuracy": 0.724,
135
+ "llm_top_5_test_accuracy": 0.766,
136
+ "llm_top_10_test_accuracy": null,
137
+ "llm_top_20_test_accuracy": null,
138
+ "llm_top_50_test_accuracy": null,
139
+ "llm_top_100_test_accuracy": null,
140
+ "sae_test_accuracy": 0.9725000560283661,
141
+ "sae_top_1_test_accuracy": 0.644,
142
+ "sae_top_2_test_accuracy": 0.661,
143
+ "sae_top_5_test_accuracy": 0.909,
144
+ "sae_top_10_test_accuracy": null,
145
+ "sae_top_20_test_accuracy": null,
146
+ "sae_top_50_test_accuracy": null,
147
+ "sae_top_100_test_accuracy": null
148
+ },
149
+ {
150
+ "dataset_name": "codeparrot/github-code_results",
151
+ "llm_test_accuracy": 0.9708000421524048,
152
+ "llm_top_1_test_accuracy": 0.6451999999999999,
153
+ "llm_top_2_test_accuracy": 0.6960000000000001,
154
+ "llm_top_5_test_accuracy": 0.7766,
155
+ "llm_top_10_test_accuracy": null,
156
+ "llm_top_20_test_accuracy": null,
157
+ "llm_top_50_test_accuracy": null,
158
+ "llm_top_100_test_accuracy": null,
159
+ "sae_test_accuracy": 0.9682000517845154,
160
+ "sae_top_1_test_accuracy": 0.6364000000000001,
161
+ "sae_top_2_test_accuracy": 0.7253999999999999,
162
+ "sae_top_5_test_accuracy": 0.8042,
163
+ "sae_top_10_test_accuracy": null,
164
+ "sae_top_20_test_accuracy": null,
165
+ "sae_top_50_test_accuracy": null,
166
+ "sae_top_100_test_accuracy": null
167
+ },
168
+ {
169
+ "dataset_name": "fancyzhx/ag_news_results",
170
+ "llm_test_accuracy": 0.9500000476837158,
171
+ "llm_top_1_test_accuracy": 0.63775,
172
+ "llm_top_2_test_accuracy": 0.78175,
173
+ "llm_top_5_test_accuracy": 0.82125,
174
+ "llm_top_10_test_accuracy": null,
175
+ "llm_top_20_test_accuracy": null,
176
+ "llm_top_50_test_accuracy": null,
177
+ "llm_top_100_test_accuracy": null,
178
+ "sae_test_accuracy": 0.9502500593662262,
179
+ "sae_top_1_test_accuracy": 0.6407499999999999,
180
+ "sae_top_2_test_accuracy": 0.7829999999999999,
181
+ "sae_top_5_test_accuracy": 0.8530000000000001,
182
+ "sae_top_10_test_accuracy": null,
183
+ "sae_top_20_test_accuracy": null,
184
+ "sae_top_50_test_accuracy": null,
185
+ "sae_top_100_test_accuracy": null
186
+ },
187
+ {
188
+ "dataset_name": "Helsinki-NLP/europarl_results",
189
+ "llm_test_accuracy": 0.9996000289916992,
190
+ "llm_top_1_test_accuracy": 0.6454,
191
+ "llm_top_2_test_accuracy": 0.7884,
192
+ "llm_top_5_test_accuracy": 0.9012,
193
+ "llm_top_10_test_accuracy": null,
194
+ "llm_top_20_test_accuracy": null,
195
+ "llm_top_50_test_accuracy": null,
196
+ "llm_top_100_test_accuracy": null,
197
+ "sae_test_accuracy": 0.9998000144958497,
198
+ "sae_top_1_test_accuracy": 0.8346,
199
+ "sae_top_2_test_accuracy": 0.9099999999999999,
200
+ "sae_top_5_test_accuracy": 0.9957999999999998,
201
+ "sae_top_10_test_accuracy": null,
202
+ "sae_top_20_test_accuracy": null,
203
+ "sae_top_50_test_accuracy": null,
204
+ "sae_top_100_test_accuracy": null
205
+ }
206
+ ],
207
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
208
+ "sae_lens_id": "custom_sae",
209
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0",
210
+ "sae_lens_version": "5.4.2",
211
+ "sae_cfg_dict": {
212
+ "model_name": "gemma-2-2b",
213
+ "d_in": 2304,
214
+ "d_sae": 65536,
215
+ "hook_layer": 12,
216
+ "hook_name": "blocks.12.hook_resid_post",
217
+ "context_size": null,
218
+ "hook_head_index": null,
219
+ "architecture": "topk",
220
+ "apply_b_dec_to_input": null,
221
+ "finetuning_scaling_factor": null,
222
+ "activation_fn_str": "",
223
+ "prepend_bos": true,
224
+ "normalize_activations": "none",
225
+ "dtype": "bfloat16",
226
+ "device": "",
227
+ "dataset_path": "",
228
+ "dataset_trust_remote_code": true,
229
+ "seqpos_slice": [
230
+ null
231
+ ],
232
+ "training_tokens": -100000,
233
+ "sae_lens_training_version": null,
234
+ "neuronpedia_id": null
235
+ },
236
+ "eval_result_unstructured": {
237
+ "LabHC/bias_in_bios_class_set1_results": {
238
+ "sae_test_accuracy": {
239
+ "0": 0.9450000524520874,
240
+ "1": 0.9540000557899475,
241
+ "2": 0.9440000653266907,
242
+ "6": 0.984000027179718,
243
+ "9": 0.9790000319480896
244
+ },
245
+ "llm_test_accuracy": {
246
+ "0": 0.9510000348091125,
247
+ "1": 0.9670000672340393,
248
+ "2": 0.9530000686645508,
249
+ "6": 0.987000048160553,
250
+ "9": 0.9760000705718994
251
+ },
252
+ "llm_top_1_test_accuracy": {
253
+ "0": 0.577,
254
+ "1": 0.613,
255
+ "2": 0.662,
256
+ "6": 0.787,
257
+ "9": 0.56
258
+ },
259
+ "llm_top_2_test_accuracy": {
260
+ "0": 0.574,
261
+ "1": 0.66,
262
+ "2": 0.718,
263
+ "6": 0.811,
264
+ "9": 0.714
265
+ },
266
+ "llm_top_5_test_accuracy": {
267
+ "0": 0.713,
268
+ "1": 0.711,
269
+ "2": 0.755,
270
+ "6": 0.895,
271
+ "9": 0.861
272
+ },
273
+ "sae_top_1_test_accuracy": {
274
+ "0": 0.547,
275
+ "1": 0.666,
276
+ "2": 0.849,
277
+ "6": 0.82,
278
+ "9": 0.609
279
+ },
280
+ "sae_top_2_test_accuracy": {
281
+ "0": 0.806,
282
+ "1": 0.662,
283
+ "2": 0.89,
284
+ "6": 0.832,
285
+ "9": 0.938
286
+ },
287
+ "sae_top_5_test_accuracy": {
288
+ "0": 0.811,
289
+ "1": 0.849,
290
+ "2": 0.9,
291
+ "6": 0.991,
292
+ "9": 0.945
293
+ }
294
+ },
295
+ "LabHC/bias_in_bios_class_set2_results": {
296
+ "sae_test_accuracy": {
297
+ "11": 0.9540000557899475,
298
+ "13": 0.9420000314712524,
299
+ "14": 0.9500000476837158,
300
+ "18": 0.9240000247955322,
301
+ "19": 0.9630000591278076
302
+ },
303
+ "llm_test_accuracy": {
304
+ "11": 0.9690000414848328,
305
+ "13": 0.9600000381469727,
306
+ "14": 0.9600000381469727,
307
+ "18": 0.9390000700950623,
308
+ "19": 0.9610000252723694
309
+ },
310
+ "llm_top_1_test_accuracy": {
311
+ "11": 0.555,
312
+ "13": 0.668,
313
+ "14": 0.638,
314
+ "18": 0.69,
315
+ "19": 0.796
316
+ },
317
+ "llm_top_2_test_accuracy": {
318
+ "11": 0.756,
319
+ "13": 0.714,
320
+ "14": 0.67,
321
+ "18": 0.717,
322
+ "19": 0.768
323
+ },
324
+ "llm_top_5_test_accuracy": {
325
+ "11": 0.794,
326
+ "13": 0.749,
327
+ "14": 0.723,
328
+ "18": 0.73,
329
+ "19": 0.831
330
+ },
331
+ "sae_top_1_test_accuracy": {
332
+ "11": 0.565,
333
+ "13": 0.656,
334
+ "14": 0.628,
335
+ "18": 0.646,
336
+ "19": 0.807
337
+ },
338
+ "sae_top_2_test_accuracy": {
339
+ "11": 0.854,
340
+ "13": 0.679,
341
+ "14": 0.869,
342
+ "18": 0.699,
343
+ "19": 0.824
344
+ },
345
+ "sae_top_5_test_accuracy": {
346
+ "11": 0.953,
347
+ "13": 0.758,
348
+ "14": 0.891,
349
+ "18": 0.73,
350
+ "19": 0.831
351
+ }
352
+ },
353
+ "LabHC/bias_in_bios_class_set3_results": {
354
+ "sae_test_accuracy": {
355
+ "20": 0.9600000381469727,
356
+ "21": 0.9290000200271606,
357
+ "22": 0.9120000600814819,
358
+ "25": 0.9700000286102295,
359
+ "26": 0.8860000371932983
360
+ },
361
+ "llm_test_accuracy": {
362
+ "20": 0.956000030040741,
363
+ "21": 0.9350000619888306,
364
+ "22": 0.9180000424385071,
365
+ "25": 0.9640000462532043,
366
+ "26": 0.8850000500679016
367
+ },
368
+ "llm_top_1_test_accuracy": {
369
+ "20": 0.693,
370
+ "21": 0.775,
371
+ "22": 0.645,
372
+ "25": 0.706,
373
+ "26": 0.616
374
+ },
375
+ "llm_top_2_test_accuracy": {
376
+ "20": 0.827,
377
+ "21": 0.761,
378
+ "22": 0.694,
379
+ "25": 0.778,
380
+ "26": 0.686
381
+ },
382
+ "llm_top_5_test_accuracy": {
383
+ "20": 0.855,
384
+ "21": 0.791,
385
+ "22": 0.725,
386
+ "25": 0.809,
387
+ "26": 0.672
388
+ },
389
+ "sae_top_1_test_accuracy": {
390
+ "20": 0.568,
391
+ "21": 0.75,
392
+ "22": 0.858,
393
+ "25": 0.715,
394
+ "26": 0.596
395
+ },
396
+ "sae_top_2_test_accuracy": {
397
+ "20": 0.841,
398
+ "21": 0.782,
399
+ "22": 0.887,
400
+ "25": 0.865,
401
+ "26": 0.617
402
+ },
403
+ "sae_top_5_test_accuracy": {
404
+ "20": 0.913,
405
+ "21": 0.842,
406
+ "22": 0.883,
407
+ "25": 0.882,
408
+ "26": 0.745
409
+ }
410
+ },
411
+ "canrager/amazon_reviews_mcauley_1and5_results": {
412
+ "sae_test_accuracy": {
413
+ "1": 0.9500000476837158,
414
+ "2": 0.9350000619888306,
415
+ "3": 0.9240000247955322,
416
+ "5": 0.9180000424385071,
417
+ "6": 0.8810000419616699
418
+ },
419
+ "llm_test_accuracy": {
420
+ "1": 0.9580000638961792,
421
+ "2": 0.9330000281333923,
422
+ "3": 0.9280000329017639,
423
+ "5": 0.9200000166893005,
424
+ "6": 0.862000048160553
425
+ },
426
+ "llm_top_1_test_accuracy": {
427
+ "1": 0.647,
428
+ "2": 0.603,
429
+ "3": 0.598,
430
+ "5": 0.555,
431
+ "6": 0.592
432
+ },
433
+ "llm_top_2_test_accuracy": {
434
+ "1": 0.75,
435
+ "2": 0.648,
436
+ "3": 0.607,
437
+ "5": 0.606,
438
+ "6": 0.626
439
+ },
440
+ "llm_top_5_test_accuracy": {
441
+ "1": 0.767,
442
+ "2": 0.641,
443
+ "3": 0.645,
444
+ "5": 0.638,
445
+ "6": 0.676
446
+ },
447
+ "sae_top_1_test_accuracy": {
448
+ "1": 0.902,
449
+ "2": 0.852,
450
+ "3": 0.52,
451
+ "5": 0.555,
452
+ "6": 0.559
453
+ },
454
+ "sae_top_2_test_accuracy": {
455
+ "1": 0.907,
456
+ "2": 0.852,
457
+ "3": 0.553,
458
+ "5": 0.749,
459
+ "6": 0.551
460
+ },
461
+ "sae_top_5_test_accuracy": {
462
+ "1": 0.925,
463
+ "2": 0.882,
464
+ "3": 0.73,
465
+ "5": 0.852,
466
+ "6": 0.727
467
+ }
468
+ },
469
+ "canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
470
+ "sae_test_accuracy": {
471
+ "1.0": 0.9720000624656677,
472
+ "5.0": 0.9730000495910645
473
+ },
474
+ "llm_test_accuracy": {
475
+ "1.0": 0.9780000448226929,
476
+ "5.0": 0.9810000658035278
477
+ },
478
+ "llm_top_1_test_accuracy": {
479
+ "1.0": 0.673,
480
+ "5.0": 0.673
481
+ },
482
+ "llm_top_2_test_accuracy": {
483
+ "1.0": 0.724,
484
+ "5.0": 0.724
485
+ },
486
+ "llm_top_5_test_accuracy": {
487
+ "1.0": 0.766,
488
+ "5.0": 0.766
489
+ },
490
+ "sae_top_1_test_accuracy": {
491
+ "1.0": 0.644,
492
+ "5.0": 0.644
493
+ },
494
+ "sae_top_2_test_accuracy": {
495
+ "1.0": 0.661,
496
+ "5.0": 0.661
497
+ },
498
+ "sae_top_5_test_accuracy": {
499
+ "1.0": 0.909,
500
+ "5.0": 0.909
501
+ }
502
+ },
503
+ "codeparrot/github-code_results": {
504
+ "sae_test_accuracy": {
505
+ "C": 0.9540000557899475,
506
+ "Python": 0.9860000610351562,
507
+ "HTML": 0.9820000529289246,
508
+ "Java": 0.9620000720024109,
509
+ "PHP": 0.9570000171661377
510
+ },
511
+ "llm_test_accuracy": {
512
+ "C": 0.956000030040741,
513
+ "Python": 0.984000027179718,
514
+ "HTML": 0.9900000691413879,
515
+ "Java": 0.9670000672340393,
516
+ "PHP": 0.9570000171661377
517
+ },
518
+ "llm_top_1_test_accuracy": {
519
+ "C": 0.666,
520
+ "Python": 0.626,
521
+ "HTML": 0.721,
522
+ "Java": 0.619,
523
+ "PHP": 0.594
524
+ },
525
+ "llm_top_2_test_accuracy": {
526
+ "C": 0.679,
527
+ "Python": 0.674,
528
+ "HTML": 0.8,
529
+ "Java": 0.676,
530
+ "PHP": 0.651
531
+ },
532
+ "llm_top_5_test_accuracy": {
533
+ "C": 0.783,
534
+ "Python": 0.717,
535
+ "HTML": 0.935,
536
+ "Java": 0.733,
537
+ "PHP": 0.715
538
+ },
539
+ "sae_top_1_test_accuracy": {
540
+ "C": 0.623,
541
+ "Python": 0.623,
542
+ "HTML": 0.689,
543
+ "Java": 0.632,
544
+ "PHP": 0.615
545
+ },
546
+ "sae_top_2_test_accuracy": {
547
+ "C": 0.617,
548
+ "Python": 0.657,
549
+ "HTML": 0.811,
550
+ "Java": 0.629,
551
+ "PHP": 0.913
552
+ },
553
+ "sae_top_5_test_accuracy": {
554
+ "C": 0.681,
555
+ "Python": 0.94,
556
+ "HTML": 0.833,
557
+ "Java": 0.645,
558
+ "PHP": 0.922
559
+ }
560
+ },
561
+ "fancyzhx/ag_news_results": {
562
+ "sae_test_accuracy": {
563
+ "0": 0.9390000700950623,
564
+ "1": 0.9790000319480896,
565
+ "2": 0.9390000700950623,
566
+ "3": 0.9440000653266907
567
+ },
568
+ "llm_test_accuracy": {
569
+ "0": 0.940000057220459,
570
+ "1": 0.9860000610351562,
571
+ "2": 0.9200000166893005,
572
+ "3": 0.9540000557899475
573
+ },
574
+ "llm_top_1_test_accuracy": {
575
+ "0": 0.573,
576
+ "1": 0.671,
577
+ "2": 0.672,
578
+ "3": 0.635
579
+ },
580
+ "llm_top_2_test_accuracy": {
581
+ "0": 0.802,
582
+ "1": 0.808,
583
+ "2": 0.701,
584
+ "3": 0.816
585
+ },
586
+ "llm_top_5_test_accuracy": {
587
+ "0": 0.81,
588
+ "1": 0.891,
589
+ "2": 0.752,
590
+ "3": 0.832
591
+ },
592
+ "sae_top_1_test_accuracy": {
593
+ "0": 0.67,
594
+ "1": 0.641,
595
+ "2": 0.546,
596
+ "3": 0.706
597
+ },
598
+ "sae_top_2_test_accuracy": {
599
+ "0": 0.695,
600
+ "1": 0.94,
601
+ "2": 0.76,
602
+ "3": 0.737
603
+ },
604
+ "sae_top_5_test_accuracy": {
605
+ "0": 0.842,
606
+ "1": 0.96,
607
+ "2": 0.809,
608
+ "3": 0.801
609
+ }
610
+ },
611
+ "Helsinki-NLP/europarl_results": {
612
+ "sae_test_accuracy": {
613
+ "en": 1.0,
614
+ "fr": 1.0,
615
+ "de": 1.0,
616
+ "es": 1.0,
617
+ "nl": 0.999000072479248
618
+ },
619
+ "llm_test_accuracy": {
620
+ "en": 1.0,
621
+ "fr": 0.999000072479248,
622
+ "de": 1.0,
623
+ "es": 0.999000072479248,
624
+ "nl": 1.0
625
+ },
626
+ "llm_top_1_test_accuracy": {
627
+ "en": 0.739,
628
+ "fr": 0.585,
629
+ "de": 0.758,
630
+ "es": 0.496,
631
+ "nl": 0.649
632
+ },
633
+ "llm_top_2_test_accuracy": {
634
+ "en": 0.829,
635
+ "fr": 0.582,
636
+ "de": 0.82,
637
+ "es": 0.958,
638
+ "nl": 0.753
639
+ },
640
+ "llm_top_5_test_accuracy": {
641
+ "en": 0.892,
642
+ "fr": 0.888,
643
+ "de": 0.894,
644
+ "es": 0.98,
645
+ "nl": 0.852
646
+ },
647
+ "sae_top_1_test_accuracy": {
648
+ "en": 0.718,
649
+ "fr": 0.993,
650
+ "de": 0.9,
651
+ "es": 0.902,
652
+ "nl": 0.66
653
+ },
654
+ "sae_top_2_test_accuracy": {
655
+ "en": 0.739,
656
+ "fr": 0.994,
657
+ "de": 0.907,
658
+ "es": 0.913,
659
+ "nl": 0.997
660
+ },
661
+ "sae_top_5_test_accuracy": {
662
+ "en": 0.998,
663
+ "fr": 0.996,
664
+ "de": 0.992,
665
+ "es": 0.993,
666
+ "nl": 1.0
667
+ }
668
+ }
669
+ }
670
+ }
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,670 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "sparse_probing",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "LabHC/bias_in_bios_class_set2",
8
+ "LabHC/bias_in_bios_class_set3",
9
+ "canrager/amazon_reviews_mcauley_1and5",
10
+ "canrager/amazon_reviews_mcauley_1and5_sentiment",
11
+ "codeparrot/github-code",
12
+ "fancyzhx/ag_news",
13
+ "Helsinki-NLP/europarl"
14
+ ],
15
+ "probe_train_set_size": 4000,
16
+ "probe_test_set_size": 1000,
17
+ "context_length": 128,
18
+ "sae_batch_size": 125,
19
+ "llm_batch_size": 32,
20
+ "llm_dtype": "bfloat16",
21
+ "model_name": "gemma-2-2b",
22
+ "k_values": [
23
+ 1,
24
+ 2,
25
+ 5
26
+ ],
27
+ "lower_vram_usage": false
28
+ },
29
+ "eval_id": "82b9980d-82e4-4ea8-9e22-8e6990b9b64e",
30
+ "datetime_epoch_millis": 1740164519938,
31
+ "eval_result_metrics": {
32
+ "llm": {
33
+ "llm_test_accuracy": 0.9595375448465346,
34
+ "llm_top_1_test_accuracy": 0.64956875,
35
+ "llm_top_2_test_accuracy": 0.72589375,
36
+ "llm_top_5_test_accuracy": 0.78265625,
37
+ "llm_top_10_test_accuracy": null,
38
+ "llm_top_20_test_accuracy": null,
39
+ "llm_top_50_test_accuracy": null,
40
+ "llm_top_100_test_accuracy": null
41
+ },
42
+ "sae": {
43
+ "sae_test_accuracy": 0.9534375388175249,
44
+ "sae_top_1_test_accuracy": 0.7455499999999999,
45
+ "sae_top_2_test_accuracy": 0.81490625,
46
+ "sae_top_5_test_accuracy": 0.8733625,
47
+ "sae_top_10_test_accuracy": null,
48
+ "sae_top_20_test_accuracy": null,
49
+ "sae_top_50_test_accuracy": null,
50
+ "sae_top_100_test_accuracy": null
51
+ }
52
+ },
53
+ "eval_result_details": [
54
+ {
55
+ "dataset_name": "LabHC/bias_in_bios_class_set1_results",
56
+ "llm_test_accuracy": 0.966800057888031,
57
+ "llm_top_1_test_accuracy": 0.6397999999999999,
58
+ "llm_top_2_test_accuracy": 0.6954,
59
+ "llm_top_5_test_accuracy": 0.7869999999999999,
60
+ "llm_top_10_test_accuracy": null,
61
+ "llm_top_20_test_accuracy": null,
62
+ "llm_top_50_test_accuracy": null,
63
+ "llm_top_100_test_accuracy": null,
64
+ "sae_test_accuracy": 0.9592000484466553,
65
+ "sae_top_1_test_accuracy": 0.7891999999999999,
66
+ "sae_top_2_test_accuracy": 0.8113999999999999,
67
+ "sae_top_5_test_accuracy": 0.9046,
68
+ "sae_top_10_test_accuracy": null,
69
+ "sae_top_20_test_accuracy": null,
70
+ "sae_top_50_test_accuracy": null,
71
+ "sae_top_100_test_accuracy": null
72
+ },
73
+ {
74
+ "dataset_name": "LabHC/bias_in_bios_class_set2_results",
75
+ "llm_test_accuracy": 0.9578000426292419,
76
+ "llm_top_1_test_accuracy": 0.6694000000000001,
77
+ "llm_top_2_test_accuracy": 0.725,
78
+ "llm_top_5_test_accuracy": 0.7654,
79
+ "llm_top_10_test_accuracy": null,
80
+ "llm_top_20_test_accuracy": null,
81
+ "llm_top_50_test_accuracy": null,
82
+ "llm_top_100_test_accuracy": null,
83
+ "sae_test_accuracy": 0.9442000389099121,
84
+ "sae_top_1_test_accuracy": 0.6958,
85
+ "sae_top_2_test_accuracy": 0.8019999999999999,
86
+ "sae_top_5_test_accuracy": 0.8480000000000001,
87
+ "sae_top_10_test_accuracy": null,
88
+ "sae_top_20_test_accuracy": null,
89
+ "sae_top_50_test_accuracy": null,
90
+ "sae_top_100_test_accuracy": null
91
+ },
92
+ {
93
+ "dataset_name": "LabHC/bias_in_bios_class_set3_results",
94
+ "llm_test_accuracy": 0.9316000461578369,
95
+ "llm_top_1_test_accuracy": 0.687,
96
+ "llm_top_2_test_accuracy": 0.7492,
97
+ "llm_top_5_test_accuracy": 0.7704000000000001,
98
+ "llm_top_10_test_accuracy": null,
99
+ "llm_top_20_test_accuracy": null,
100
+ "llm_top_50_test_accuracy": null,
101
+ "llm_top_100_test_accuracy": null,
102
+ "sae_test_accuracy": 0.9224000334739685,
103
+ "sae_top_1_test_accuracy": 0.7988,
104
+ "sae_top_2_test_accuracy": 0.836,
105
+ "sae_top_5_test_accuracy": 0.8666,
106
+ "sae_top_10_test_accuracy": null,
107
+ "sae_top_20_test_accuracy": null,
108
+ "sae_top_50_test_accuracy": null,
109
+ "sae_top_100_test_accuracy": null
110
+ },
111
+ {
112
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
113
+ "llm_test_accuracy": 0.9202000379562378,
114
+ "llm_top_1_test_accuracy": 0.599,
115
+ "llm_top_2_test_accuracy": 0.6474,
116
+ "llm_top_5_test_accuracy": 0.6734,
117
+ "llm_top_10_test_accuracy": null,
118
+ "llm_top_20_test_accuracy": null,
119
+ "llm_top_50_test_accuracy": null,
120
+ "llm_top_100_test_accuracy": null,
121
+ "sae_test_accuracy": 0.9188000440597535,
122
+ "sae_top_1_test_accuracy": 0.6394,
123
+ "sae_top_2_test_accuracy": 0.7183999999999999,
124
+ "sae_top_5_test_accuracy": 0.798,
125
+ "sae_top_10_test_accuracy": null,
126
+ "sae_top_20_test_accuracy": null,
127
+ "sae_top_50_test_accuracy": null,
128
+ "sae_top_100_test_accuracy": null
129
+ },
130
+ {
131
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
132
+ "llm_test_accuracy": 0.9795000553131104,
133
+ "llm_top_1_test_accuracy": 0.673,
134
+ "llm_top_2_test_accuracy": 0.724,
135
+ "llm_top_5_test_accuracy": 0.766,
136
+ "llm_top_10_test_accuracy": null,
137
+ "llm_top_20_test_accuracy": null,
138
+ "llm_top_50_test_accuracy": null,
139
+ "llm_top_100_test_accuracy": null,
140
+ "sae_test_accuracy": 0.9685000479221344,
141
+ "sae_top_1_test_accuracy": 0.76,
142
+ "sae_top_2_test_accuracy": 0.931,
143
+ "sae_top_5_test_accuracy": 0.939,
144
+ "sae_top_10_test_accuracy": null,
145
+ "sae_top_20_test_accuracy": null,
146
+ "sae_top_50_test_accuracy": null,
147
+ "sae_top_100_test_accuracy": null
148
+ },
149
+ {
150
+ "dataset_name": "codeparrot/github-code_results",
151
+ "llm_test_accuracy": 0.9708000421524048,
152
+ "llm_top_1_test_accuracy": 0.6451999999999999,
153
+ "llm_top_2_test_accuracy": 0.6960000000000001,
154
+ "llm_top_5_test_accuracy": 0.7766,
155
+ "llm_top_10_test_accuracy": null,
156
+ "llm_top_20_test_accuracy": null,
157
+ "llm_top_50_test_accuracy": null,
158
+ "llm_top_100_test_accuracy": null,
159
+ "sae_test_accuracy": 0.9700000286102295,
160
+ "sae_top_1_test_accuracy": 0.6275999999999999,
161
+ "sae_top_2_test_accuracy": 0.7262000000000001,
162
+ "sae_top_5_test_accuracy": 0.7979999999999999,
163
+ "sae_top_10_test_accuracy": null,
164
+ "sae_top_20_test_accuracy": null,
165
+ "sae_top_50_test_accuracy": null,
166
+ "sae_top_100_test_accuracy": null
167
+ },
168
+ {
169
+ "dataset_name": "fancyzhx/ag_news_results",
170
+ "llm_test_accuracy": 0.9500000476837158,
171
+ "llm_top_1_test_accuracy": 0.63775,
172
+ "llm_top_2_test_accuracy": 0.78175,
173
+ "llm_top_5_test_accuracy": 0.82125,
174
+ "llm_top_10_test_accuracy": null,
175
+ "llm_top_20_test_accuracy": null,
176
+ "llm_top_50_test_accuracy": null,
177
+ "llm_top_100_test_accuracy": null,
178
+ "sae_test_accuracy": 0.9460000246763229,
179
+ "sae_top_1_test_accuracy": 0.713,
180
+ "sae_top_2_test_accuracy": 0.75225,
181
+ "sae_top_5_test_accuracy": 0.8434999999999999,
182
+ "sae_top_10_test_accuracy": null,
183
+ "sae_top_20_test_accuracy": null,
184
+ "sae_top_50_test_accuracy": null,
185
+ "sae_top_100_test_accuracy": null
186
+ },
187
+ {
188
+ "dataset_name": "Helsinki-NLP/europarl_results",
189
+ "llm_test_accuracy": 0.9996000289916992,
190
+ "llm_top_1_test_accuracy": 0.6454,
191
+ "llm_top_2_test_accuracy": 0.7884,
192
+ "llm_top_5_test_accuracy": 0.9012,
193
+ "llm_top_10_test_accuracy": null,
194
+ "llm_top_20_test_accuracy": null,
195
+ "llm_top_50_test_accuracy": null,
196
+ "llm_top_100_test_accuracy": null,
197
+ "sae_test_accuracy": 0.9984000444412231,
198
+ "sae_top_1_test_accuracy": 0.9406000000000001,
199
+ "sae_top_2_test_accuracy": 0.942,
200
+ "sae_top_5_test_accuracy": 0.9892,
201
+ "sae_top_10_test_accuracy": null,
202
+ "sae_top_20_test_accuracy": null,
203
+ "sae_top_50_test_accuracy": null,
204
+ "sae_top_100_test_accuracy": null
205
+ }
206
+ ],
207
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
208
+ "sae_lens_id": "custom_sae",
209
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0",
210
+ "sae_lens_version": "5.4.2",
211
+ "sae_cfg_dict": {
212
+ "model_name": "gemma-2-2b",
213
+ "d_in": 2304,
214
+ "d_sae": 65536,
215
+ "hook_layer": 12,
216
+ "hook_name": "blocks.12.hook_resid_post",
217
+ "context_size": null,
218
+ "hook_head_index": null,
219
+ "architecture": "topk",
220
+ "apply_b_dec_to_input": null,
221
+ "finetuning_scaling_factor": null,
222
+ "activation_fn_str": "",
223
+ "prepend_bos": true,
224
+ "normalize_activations": "none",
225
+ "dtype": "bfloat16",
226
+ "device": "",
227
+ "dataset_path": "",
228
+ "dataset_trust_remote_code": true,
229
+ "seqpos_slice": [
230
+ null
231
+ ],
232
+ "training_tokens": -100000,
233
+ "sae_lens_training_version": null,
234
+ "neuronpedia_id": null
235
+ },
236
+ "eval_result_unstructured": {
237
+ "LabHC/bias_in_bios_class_set1_results": {
238
+ "sae_test_accuracy": {
239
+ "0": 0.9390000700950623,
240
+ "1": 0.9580000638961792,
241
+ "2": 0.9500000476837158,
242
+ "6": 0.984000027179718,
243
+ "9": 0.9650000333786011
244
+ },
245
+ "llm_test_accuracy": {
246
+ "0": 0.9510000348091125,
247
+ "1": 0.9670000672340393,
248
+ "2": 0.9530000686645508,
249
+ "6": 0.987000048160553,
250
+ "9": 0.9760000705718994
251
+ },
252
+ "llm_top_1_test_accuracy": {
253
+ "0": 0.577,
254
+ "1": 0.613,
255
+ "2": 0.662,
256
+ "6": 0.787,
257
+ "9": 0.56
258
+ },
259
+ "llm_top_2_test_accuracy": {
260
+ "0": 0.574,
261
+ "1": 0.66,
262
+ "2": 0.718,
263
+ "6": 0.811,
264
+ "9": 0.714
265
+ },
266
+ "llm_top_5_test_accuracy": {
267
+ "0": 0.713,
268
+ "1": 0.711,
269
+ "2": 0.755,
270
+ "6": 0.895,
271
+ "9": 0.861
272
+ },
273
+ "sae_top_1_test_accuracy": {
274
+ "0": 0.573,
275
+ "1": 0.579,
276
+ "2": 0.896,
277
+ "6": 0.961,
278
+ "9": 0.937
279
+ },
280
+ "sae_top_2_test_accuracy": {
281
+ "0": 0.624,
282
+ "1": 0.633,
283
+ "2": 0.905,
284
+ "6": 0.961,
285
+ "9": 0.934
286
+ },
287
+ "sae_top_5_test_accuracy": {
288
+ "0": 0.856,
289
+ "1": 0.83,
290
+ "2": 0.91,
291
+ "6": 0.984,
292
+ "9": 0.943
293
+ }
294
+ },
295
+ "LabHC/bias_in_bios_class_set2_results": {
296
+ "sae_test_accuracy": {
297
+ "11": 0.9520000219345093,
298
+ "13": 0.9470000267028809,
299
+ "14": 0.940000057220459,
300
+ "18": 0.9190000295639038,
301
+ "19": 0.9630000591278076
302
+ },
303
+ "llm_test_accuracy": {
304
+ "11": 0.9690000414848328,
305
+ "13": 0.9600000381469727,
306
+ "14": 0.9600000381469727,
307
+ "18": 0.9390000700950623,
308
+ "19": 0.9610000252723694
309
+ },
310
+ "llm_top_1_test_accuracy": {
311
+ "11": 0.555,
312
+ "13": 0.668,
313
+ "14": 0.638,
314
+ "18": 0.69,
315
+ "19": 0.796
316
+ },
317
+ "llm_top_2_test_accuracy": {
318
+ "11": 0.756,
319
+ "13": 0.714,
320
+ "14": 0.67,
321
+ "18": 0.717,
322
+ "19": 0.768
323
+ },
324
+ "llm_top_5_test_accuracy": {
325
+ "11": 0.794,
326
+ "13": 0.749,
327
+ "14": 0.723,
328
+ "18": 0.73,
329
+ "19": 0.831
330
+ },
331
+ "sae_top_1_test_accuracy": {
332
+ "11": 0.618,
333
+ "13": 0.646,
334
+ "14": 0.731,
335
+ "18": 0.641,
336
+ "19": 0.843
337
+ },
338
+ "sae_top_2_test_accuracy": {
339
+ "11": 0.723,
340
+ "13": 0.689,
341
+ "14": 0.854,
342
+ "18": 0.882,
343
+ "19": 0.862
344
+ },
345
+ "sae_top_5_test_accuracy": {
346
+ "11": 0.835,
347
+ "13": 0.746,
348
+ "14": 0.858,
349
+ "18": 0.899,
350
+ "19": 0.902
351
+ }
352
+ },
353
+ "LabHC/bias_in_bios_class_set3_results": {
354
+ "sae_test_accuracy": {
355
+ "20": 0.9520000219345093,
356
+ "21": 0.9160000681877136,
357
+ "22": 0.9100000262260437,
358
+ "25": 0.9470000267028809,
359
+ "26": 0.8870000243186951
360
+ },
361
+ "llm_test_accuracy": {
362
+ "20": 0.956000030040741,
363
+ "21": 0.9350000619888306,
364
+ "22": 0.9180000424385071,
365
+ "25": 0.9640000462532043,
366
+ "26": 0.8850000500679016
367
+ },
368
+ "llm_top_1_test_accuracy": {
369
+ "20": 0.693,
370
+ "21": 0.775,
371
+ "22": 0.645,
372
+ "25": 0.706,
373
+ "26": 0.616
374
+ },
375
+ "llm_top_2_test_accuracy": {
376
+ "20": 0.827,
377
+ "21": 0.761,
378
+ "22": 0.694,
379
+ "25": 0.778,
380
+ "26": 0.686
381
+ },
382
+ "llm_top_5_test_accuracy": {
383
+ "20": 0.855,
384
+ "21": 0.791,
385
+ "22": 0.725,
386
+ "25": 0.809,
387
+ "26": 0.672
388
+ },
389
+ "sae_top_1_test_accuracy": {
390
+ "20": 0.909,
391
+ "21": 0.795,
392
+ "22": 0.799,
393
+ "25": 0.876,
394
+ "26": 0.615
395
+ },
396
+ "sae_top_2_test_accuracy": {
397
+ "20": 0.914,
398
+ "21": 0.812,
399
+ "22": 0.872,
400
+ "25": 0.887,
401
+ "26": 0.695
402
+ },
403
+ "sae_top_5_test_accuracy": {
404
+ "20": 0.921,
405
+ "21": 0.817,
406
+ "22": 0.86,
407
+ "25": 0.918,
408
+ "26": 0.817
409
+ }
410
+ },
411
+ "canrager/amazon_reviews_mcauley_1and5_results": {
412
+ "sae_test_accuracy": {
413
+ "1": 0.9510000348091125,
414
+ "2": 0.937000036239624,
415
+ "3": 0.9130000472068787,
416
+ "5": 0.921000063419342,
417
+ "6": 0.8720000386238098
418
+ },
419
+ "llm_test_accuracy": {
420
+ "1": 0.9580000638961792,
421
+ "2": 0.9330000281333923,
422
+ "3": 0.9280000329017639,
423
+ "5": 0.9200000166893005,
424
+ "6": 0.862000048160553
425
+ },
426
+ "llm_top_1_test_accuracy": {
427
+ "1": 0.647,
428
+ "2": 0.603,
429
+ "3": 0.598,
430
+ "5": 0.555,
431
+ "6": 0.592
432
+ },
433
+ "llm_top_2_test_accuracy": {
434
+ "1": 0.75,
435
+ "2": 0.648,
436
+ "3": 0.607,
437
+ "5": 0.606,
438
+ "6": 0.626
439
+ },
440
+ "llm_top_5_test_accuracy": {
441
+ "1": 0.767,
442
+ "2": 0.641,
443
+ "3": 0.645,
444
+ "5": 0.638,
445
+ "6": 0.676
446
+ },
447
+ "sae_top_1_test_accuracy": {
448
+ "1": 0.724,
449
+ "2": 0.626,
450
+ "3": 0.577,
451
+ "5": 0.553,
452
+ "6": 0.717
453
+ },
454
+ "sae_top_2_test_accuracy": {
455
+ "1": 0.733,
456
+ "2": 0.859,
457
+ "3": 0.615,
458
+ "5": 0.666,
459
+ "6": 0.719
460
+ },
461
+ "sae_top_5_test_accuracy": {
462
+ "1": 0.817,
463
+ "2": 0.887,
464
+ "3": 0.704,
465
+ "5": 0.826,
466
+ "6": 0.756
467
+ }
468
+ },
469
+ "canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
470
+ "sae_test_accuracy": {
471
+ "1.0": 0.9690000414848328,
472
+ "5.0": 0.968000054359436
473
+ },
474
+ "llm_test_accuracy": {
475
+ "1.0": 0.9780000448226929,
476
+ "5.0": 0.9810000658035278
477
+ },
478
+ "llm_top_1_test_accuracy": {
479
+ "1.0": 0.673,
480
+ "5.0": 0.673
481
+ },
482
+ "llm_top_2_test_accuracy": {
483
+ "1.0": 0.724,
484
+ "5.0": 0.724
485
+ },
486
+ "llm_top_5_test_accuracy": {
487
+ "1.0": 0.766,
488
+ "5.0": 0.766
489
+ },
490
+ "sae_top_1_test_accuracy": {
491
+ "1.0": 0.76,
492
+ "5.0": 0.76
493
+ },
494
+ "sae_top_2_test_accuracy": {
495
+ "1.0": 0.931,
496
+ "5.0": 0.931
497
+ },
498
+ "sae_top_5_test_accuracy": {
499
+ "1.0": 0.939,
500
+ "5.0": 0.939
501
+ }
502
+ },
503
+ "codeparrot/github-code_results": {
504
+ "sae_test_accuracy": {
505
+ "C": 0.956000030040741,
506
+ "Python": 0.9890000224113464,
507
+ "HTML": 0.984000027179718,
508
+ "Java": 0.9650000333786011,
509
+ "PHP": 0.956000030040741
510
+ },
511
+ "llm_test_accuracy": {
512
+ "C": 0.956000030040741,
513
+ "Python": 0.984000027179718,
514
+ "HTML": 0.9900000691413879,
515
+ "Java": 0.9670000672340393,
516
+ "PHP": 0.9570000171661377
517
+ },
518
+ "llm_top_1_test_accuracy": {
519
+ "C": 0.666,
520
+ "Python": 0.626,
521
+ "HTML": 0.721,
522
+ "Java": 0.619,
523
+ "PHP": 0.594
524
+ },
525
+ "llm_top_2_test_accuracy": {
526
+ "C": 0.679,
527
+ "Python": 0.674,
528
+ "HTML": 0.8,
529
+ "Java": 0.676,
530
+ "PHP": 0.651
531
+ },
532
+ "llm_top_5_test_accuracy": {
533
+ "C": 0.783,
534
+ "Python": 0.717,
535
+ "HTML": 0.935,
536
+ "Java": 0.733,
537
+ "PHP": 0.715
538
+ },
539
+ "sae_top_1_test_accuracy": {
540
+ "C": 0.576,
541
+ "Python": 0.547,
542
+ "HTML": 0.768,
543
+ "Java": 0.652,
544
+ "PHP": 0.595
545
+ },
546
+ "sae_top_2_test_accuracy": {
547
+ "C": 0.595,
548
+ "Python": 0.699,
549
+ "HTML": 0.777,
550
+ "Java": 0.638,
551
+ "PHP": 0.922
552
+ },
553
+ "sae_top_5_test_accuracy": {
554
+ "C": 0.705,
555
+ "Python": 0.715,
556
+ "HTML": 0.936,
557
+ "Java": 0.714,
558
+ "PHP": 0.92
559
+ }
560
+ },
561
+ "fancyzhx/ag_news_results": {
562
+ "sae_test_accuracy": {
563
+ "0": 0.9340000152587891,
564
+ "1": 0.9800000190734863,
565
+ "2": 0.9200000166893005,
566
+ "3": 0.9500000476837158
567
+ },
568
+ "llm_test_accuracy": {
569
+ "0": 0.940000057220459,
570
+ "1": 0.9860000610351562,
571
+ "2": 0.9200000166893005,
572
+ "3": 0.9540000557899475
573
+ },
574
+ "llm_top_1_test_accuracy": {
575
+ "0": 0.573,
576
+ "1": 0.671,
577
+ "2": 0.672,
578
+ "3": 0.635
579
+ },
580
+ "llm_top_2_test_accuracy": {
581
+ "0": 0.802,
582
+ "1": 0.808,
583
+ "2": 0.701,
584
+ "3": 0.816
585
+ },
586
+ "llm_top_5_test_accuracy": {
587
+ "0": 0.81,
588
+ "1": 0.891,
589
+ "2": 0.752,
590
+ "3": 0.832
591
+ },
592
+ "sae_top_1_test_accuracy": {
593
+ "0": 0.67,
594
+ "1": 0.811,
595
+ "2": 0.741,
596
+ "3": 0.63
597
+ },
598
+ "sae_top_2_test_accuracy": {
599
+ "0": 0.745,
600
+ "1": 0.909,
601
+ "2": 0.734,
602
+ "3": 0.621
603
+ },
604
+ "sae_top_5_test_accuracy": {
605
+ "0": 0.847,
606
+ "1": 0.927,
607
+ "2": 0.813,
608
+ "3": 0.787
609
+ }
610
+ },
611
+ "Helsinki-NLP/europarl_results": {
612
+ "sae_test_accuracy": {
613
+ "en": 0.9980000257492065,
614
+ "fr": 0.999000072479248,
615
+ "de": 0.9980000257492065,
616
+ "es": 0.999000072479248,
617
+ "nl": 0.9980000257492065
618
+ },
619
+ "llm_test_accuracy": {
620
+ "en": 1.0,
621
+ "fr": 0.999000072479248,
622
+ "de": 1.0,
623
+ "es": 0.999000072479248,
624
+ "nl": 1.0
625
+ },
626
+ "llm_top_1_test_accuracy": {
627
+ "en": 0.739,
628
+ "fr": 0.585,
629
+ "de": 0.758,
630
+ "es": 0.496,
631
+ "nl": 0.649
632
+ },
633
+ "llm_top_2_test_accuracy": {
634
+ "en": 0.829,
635
+ "fr": 0.582,
636
+ "de": 0.82,
637
+ "es": 0.958,
638
+ "nl": 0.753
639
+ },
640
+ "llm_top_5_test_accuracy": {
641
+ "en": 0.892,
642
+ "fr": 0.888,
643
+ "de": 0.894,
644
+ "es": 0.98,
645
+ "nl": 0.852
646
+ },
647
+ "sae_top_1_test_accuracy": {
648
+ "en": 0.739,
649
+ "fr": 0.991,
650
+ "de": 0.98,
651
+ "es": 0.995,
652
+ "nl": 0.998
653
+ },
654
+ "sae_top_2_test_accuracy": {
655
+ "en": 0.747,
656
+ "fr": 0.99,
657
+ "de": 0.98,
658
+ "es": 0.995,
659
+ "nl": 0.998
660
+ },
661
+ "sae_top_5_test_accuracy": {
662
+ "en": 0.97,
663
+ "fr": 0.996,
664
+ "de": 0.988,
665
+ "es": 0.994,
666
+ "nl": 0.998
667
+ }
668
+ }
669
+ }
670
+ }
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,670 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "sparse_probing",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "LabHC/bias_in_bios_class_set2",
8
+ "LabHC/bias_in_bios_class_set3",
9
+ "canrager/amazon_reviews_mcauley_1and5",
10
+ "canrager/amazon_reviews_mcauley_1and5_sentiment",
11
+ "codeparrot/github-code",
12
+ "fancyzhx/ag_news",
13
+ "Helsinki-NLP/europarl"
14
+ ],
15
+ "probe_train_set_size": 4000,
16
+ "probe_test_set_size": 1000,
17
+ "context_length": 128,
18
+ "sae_batch_size": 125,
19
+ "llm_batch_size": 32,
20
+ "llm_dtype": "bfloat16",
21
+ "model_name": "gemma-2-2b",
22
+ "k_values": [
23
+ 1,
24
+ 2,
25
+ 5
26
+ ],
27
+ "lower_vram_usage": false
28
+ },
29
+ "eval_id": "ead5eae1-ac4a-495a-aa9d-6980c16e8482",
30
+ "datetime_epoch_millis": 1740164809112,
31
+ "eval_result_metrics": {
32
+ "llm": {
33
+ "llm_test_accuracy": 0.9595375448465346,
34
+ "llm_top_1_test_accuracy": 0.64956875,
35
+ "llm_top_2_test_accuracy": 0.72589375,
36
+ "llm_top_5_test_accuracy": 0.78265625,
37
+ "llm_top_10_test_accuracy": null,
38
+ "llm_top_20_test_accuracy": null,
39
+ "llm_top_50_test_accuracy": null,
40
+ "llm_top_100_test_accuracy": null
41
+ },
42
+ "sae": {
43
+ "sae_test_accuracy": 0.9516125384718179,
44
+ "sae_top_1_test_accuracy": 0.7007125,
45
+ "sae_top_2_test_accuracy": 0.808825,
46
+ "sae_top_5_test_accuracy": 0.8754875000000001,
47
+ "sae_top_10_test_accuracy": null,
48
+ "sae_top_20_test_accuracy": null,
49
+ "sae_top_50_test_accuracy": null,
50
+ "sae_top_100_test_accuracy": null
51
+ }
52
+ },
53
+ "eval_result_details": [
54
+ {
55
+ "dataset_name": "LabHC/bias_in_bios_class_set1_results",
56
+ "llm_test_accuracy": 0.966800057888031,
57
+ "llm_top_1_test_accuracy": 0.6397999999999999,
58
+ "llm_top_2_test_accuracy": 0.6954,
59
+ "llm_top_5_test_accuracy": 0.7869999999999999,
60
+ "llm_top_10_test_accuracy": null,
61
+ "llm_top_20_test_accuracy": null,
62
+ "llm_top_50_test_accuracy": null,
63
+ "llm_top_100_test_accuracy": null,
64
+ "sae_test_accuracy": 0.9568000435829163,
65
+ "sae_top_1_test_accuracy": 0.6923999999999999,
66
+ "sae_top_2_test_accuracy": 0.8591999999999999,
67
+ "sae_top_5_test_accuracy": 0.9028,
68
+ "sae_top_10_test_accuracy": null,
69
+ "sae_top_20_test_accuracy": null,
70
+ "sae_top_50_test_accuracy": null,
71
+ "sae_top_100_test_accuracy": null
72
+ },
73
+ {
74
+ "dataset_name": "LabHC/bias_in_bios_class_set2_results",
75
+ "llm_test_accuracy": 0.9578000426292419,
76
+ "llm_top_1_test_accuracy": 0.6694000000000001,
77
+ "llm_top_2_test_accuracy": 0.725,
78
+ "llm_top_5_test_accuracy": 0.7654,
79
+ "llm_top_10_test_accuracy": null,
80
+ "llm_top_20_test_accuracy": null,
81
+ "llm_top_50_test_accuracy": null,
82
+ "llm_top_100_test_accuracy": null,
83
+ "sae_test_accuracy": 0.9420000433921814,
84
+ "sae_top_1_test_accuracy": 0.671,
85
+ "sae_top_2_test_accuracy": 0.7924,
86
+ "sae_top_5_test_accuracy": 0.8718,
87
+ "sae_top_10_test_accuracy": null,
88
+ "sae_top_20_test_accuracy": null,
89
+ "sae_top_50_test_accuracy": null,
90
+ "sae_top_100_test_accuracy": null
91
+ },
92
+ {
93
+ "dataset_name": "LabHC/bias_in_bios_class_set3_results",
94
+ "llm_test_accuracy": 0.9316000461578369,
95
+ "llm_top_1_test_accuracy": 0.687,
96
+ "llm_top_2_test_accuracy": 0.7492,
97
+ "llm_top_5_test_accuracy": 0.7704000000000001,
98
+ "llm_top_10_test_accuracy": null,
99
+ "llm_top_20_test_accuracy": null,
100
+ "llm_top_50_test_accuracy": null,
101
+ "llm_top_100_test_accuracy": null,
102
+ "sae_test_accuracy": 0.9232000350952149,
103
+ "sae_top_1_test_accuracy": 0.7499999999999999,
104
+ "sae_top_2_test_accuracy": 0.7924,
105
+ "sae_top_5_test_accuracy": 0.8560000000000001,
106
+ "sae_top_10_test_accuracy": null,
107
+ "sae_top_20_test_accuracy": null,
108
+ "sae_top_50_test_accuracy": null,
109
+ "sae_top_100_test_accuracy": null
110
+ },
111
+ {
112
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
113
+ "llm_test_accuracy": 0.9202000379562378,
114
+ "llm_top_1_test_accuracy": 0.599,
115
+ "llm_top_2_test_accuracy": 0.6474,
116
+ "llm_top_5_test_accuracy": 0.6734,
117
+ "llm_top_10_test_accuracy": null,
118
+ "llm_top_20_test_accuracy": null,
119
+ "llm_top_50_test_accuracy": null,
120
+ "llm_top_100_test_accuracy": null,
121
+ "sae_test_accuracy": 0.9094000339508057,
122
+ "sae_top_1_test_accuracy": 0.6928,
123
+ "sae_top_2_test_accuracy": 0.744,
124
+ "sae_top_5_test_accuracy": 0.8098000000000001,
125
+ "sae_top_10_test_accuracy": null,
126
+ "sae_top_20_test_accuracy": null,
127
+ "sae_top_50_test_accuracy": null,
128
+ "sae_top_100_test_accuracy": null
129
+ },
130
+ {
131
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
132
+ "llm_test_accuracy": 0.9795000553131104,
133
+ "llm_top_1_test_accuracy": 0.673,
134
+ "llm_top_2_test_accuracy": 0.724,
135
+ "llm_top_5_test_accuracy": 0.766,
136
+ "llm_top_10_test_accuracy": null,
137
+ "llm_top_20_test_accuracy": null,
138
+ "llm_top_50_test_accuracy": null,
139
+ "llm_top_100_test_accuracy": null,
140
+ "sae_test_accuracy": 0.9650000333786011,
141
+ "sae_top_1_test_accuracy": 0.745,
142
+ "sae_top_2_test_accuracy": 0.895,
143
+ "sae_top_5_test_accuracy": 0.936,
144
+ "sae_top_10_test_accuracy": null,
145
+ "sae_top_20_test_accuracy": null,
146
+ "sae_top_50_test_accuracy": null,
147
+ "sae_top_100_test_accuracy": null
148
+ },
149
+ {
150
+ "dataset_name": "codeparrot/github-code_results",
151
+ "llm_test_accuracy": 0.9708000421524048,
152
+ "llm_top_1_test_accuracy": 0.6451999999999999,
153
+ "llm_top_2_test_accuracy": 0.6960000000000001,
154
+ "llm_top_5_test_accuracy": 0.7766,
155
+ "llm_top_10_test_accuracy": null,
156
+ "llm_top_20_test_accuracy": null,
157
+ "llm_top_50_test_accuracy": null,
158
+ "llm_top_100_test_accuracy": null,
159
+ "sae_test_accuracy": 0.9684000372886657,
160
+ "sae_top_1_test_accuracy": 0.5955999999999999,
161
+ "sae_top_2_test_accuracy": 0.7612,
162
+ "sae_top_5_test_accuracy": 0.7986000000000001,
163
+ "sae_top_10_test_accuracy": null,
164
+ "sae_top_20_test_accuracy": null,
165
+ "sae_top_50_test_accuracy": null,
166
+ "sae_top_100_test_accuracy": null
167
+ },
168
+ {
169
+ "dataset_name": "fancyzhx/ag_news_results",
170
+ "llm_test_accuracy": 0.9500000476837158,
171
+ "llm_top_1_test_accuracy": 0.63775,
172
+ "llm_top_2_test_accuracy": 0.78175,
173
+ "llm_top_5_test_accuracy": 0.82125,
174
+ "llm_top_10_test_accuracy": null,
175
+ "llm_top_20_test_accuracy": null,
176
+ "llm_top_50_test_accuracy": null,
177
+ "llm_top_100_test_accuracy": null,
178
+ "sae_test_accuracy": 0.9495000392198563,
179
+ "sae_top_1_test_accuracy": 0.6034999999999999,
180
+ "sae_top_2_test_accuracy": 0.6970000000000001,
181
+ "sae_top_5_test_accuracy": 0.8434999999999999,
182
+ "sae_top_10_test_accuracy": null,
183
+ "sae_top_20_test_accuracy": null,
184
+ "sae_top_50_test_accuracy": null,
185
+ "sae_top_100_test_accuracy": null
186
+ },
187
+ {
188
+ "dataset_name": "Helsinki-NLP/europarl_results",
189
+ "llm_test_accuracy": 0.9996000289916992,
190
+ "llm_top_1_test_accuracy": 0.6454,
191
+ "llm_top_2_test_accuracy": 0.7884,
192
+ "llm_top_5_test_accuracy": 0.9012,
193
+ "llm_top_10_test_accuracy": null,
194
+ "llm_top_20_test_accuracy": null,
195
+ "llm_top_50_test_accuracy": null,
196
+ "llm_top_100_test_accuracy": null,
197
+ "sae_test_accuracy": 0.9986000418663025,
198
+ "sae_top_1_test_accuracy": 0.8554,
199
+ "sae_top_2_test_accuracy": 0.9294,
200
+ "sae_top_5_test_accuracy": 0.9853999999999999,
201
+ "sae_top_10_test_accuracy": null,
202
+ "sae_top_20_test_accuracy": null,
203
+ "sae_top_50_test_accuracy": null,
204
+ "sae_top_100_test_accuracy": null
205
+ }
206
+ ],
207
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
208
+ "sae_lens_id": "custom_sae",
209
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0",
210
+ "sae_lens_version": "5.4.2",
211
+ "sae_cfg_dict": {
212
+ "model_name": "gemma-2-2b",
213
+ "d_in": 2304,
214
+ "d_sae": 65536,
215
+ "hook_layer": 12,
216
+ "hook_name": "blocks.12.hook_resid_post",
217
+ "context_size": null,
218
+ "hook_head_index": null,
219
+ "architecture": "topk",
220
+ "apply_b_dec_to_input": null,
221
+ "finetuning_scaling_factor": null,
222
+ "activation_fn_str": "",
223
+ "prepend_bos": true,
224
+ "normalize_activations": "none",
225
+ "dtype": "bfloat16",
226
+ "device": "",
227
+ "dataset_path": "",
228
+ "dataset_trust_remote_code": true,
229
+ "seqpos_slice": [
230
+ null
231
+ ],
232
+ "training_tokens": -100000,
233
+ "sae_lens_training_version": null,
234
+ "neuronpedia_id": null
235
+ },
236
+ "eval_result_unstructured": {
237
+ "LabHC/bias_in_bios_class_set1_results": {
238
+ "sae_test_accuracy": {
239
+ "0": 0.9350000619888306,
240
+ "1": 0.9600000381469727,
241
+ "2": 0.937000036239624,
242
+ "6": 0.987000048160553,
243
+ "9": 0.9650000333786011
244
+ },
245
+ "llm_test_accuracy": {
246
+ "0": 0.9510000348091125,
247
+ "1": 0.9670000672340393,
248
+ "2": 0.9530000686645508,
249
+ "6": 0.987000048160553,
250
+ "9": 0.9760000705718994
251
+ },
252
+ "llm_top_1_test_accuracy": {
253
+ "0": 0.577,
254
+ "1": 0.613,
255
+ "2": 0.662,
256
+ "6": 0.787,
257
+ "9": 0.56
258
+ },
259
+ "llm_top_2_test_accuracy": {
260
+ "0": 0.574,
261
+ "1": 0.66,
262
+ "2": 0.718,
263
+ "6": 0.811,
264
+ "9": 0.714
265
+ },
266
+ "llm_top_5_test_accuracy": {
267
+ "0": 0.713,
268
+ "1": 0.711,
269
+ "2": 0.755,
270
+ "6": 0.895,
271
+ "9": 0.861
272
+ },
273
+ "sae_top_1_test_accuracy": {
274
+ "0": 0.574,
275
+ "1": 0.632,
276
+ "2": 0.891,
277
+ "6": 0.803,
278
+ "9": 0.562
279
+ },
280
+ "sae_top_2_test_accuracy": {
281
+ "0": 0.874,
282
+ "1": 0.741,
283
+ "2": 0.898,
284
+ "6": 0.94,
285
+ "9": 0.843
286
+ },
287
+ "sae_top_5_test_accuracy": {
288
+ "0": 0.879,
289
+ "1": 0.872,
290
+ "2": 0.895,
291
+ "6": 0.967,
292
+ "9": 0.901
293
+ }
294
+ },
295
+ "LabHC/bias_in_bios_class_set2_results": {
296
+ "sae_test_accuracy": {
297
+ "11": 0.9510000348091125,
298
+ "13": 0.9440000653266907,
299
+ "14": 0.9460000395774841,
300
+ "18": 0.9120000600814819,
301
+ "19": 0.9570000171661377
302
+ },
303
+ "llm_test_accuracy": {
304
+ "11": 0.9690000414848328,
305
+ "13": 0.9600000381469727,
306
+ "14": 0.9600000381469727,
307
+ "18": 0.9390000700950623,
308
+ "19": 0.9610000252723694
309
+ },
310
+ "llm_top_1_test_accuracy": {
311
+ "11": 0.555,
312
+ "13": 0.668,
313
+ "14": 0.638,
314
+ "18": 0.69,
315
+ "19": 0.796
316
+ },
317
+ "llm_top_2_test_accuracy": {
318
+ "11": 0.756,
319
+ "13": 0.714,
320
+ "14": 0.67,
321
+ "18": 0.717,
322
+ "19": 0.768
323
+ },
324
+ "llm_top_5_test_accuracy": {
325
+ "11": 0.794,
326
+ "13": 0.749,
327
+ "14": 0.723,
328
+ "18": 0.73,
329
+ "19": 0.831
330
+ },
331
+ "sae_top_1_test_accuracy": {
332
+ "11": 0.55,
333
+ "13": 0.667,
334
+ "14": 0.652,
335
+ "18": 0.697,
336
+ "19": 0.789
337
+ },
338
+ "sae_top_2_test_accuracy": {
339
+ "11": 0.853,
340
+ "13": 0.705,
341
+ "14": 0.88,
342
+ "18": 0.732,
343
+ "19": 0.792
344
+ },
345
+ "sae_top_5_test_accuracy": {
346
+ "11": 0.916,
347
+ "13": 0.749,
348
+ "14": 0.883,
349
+ "18": 0.917,
350
+ "19": 0.894
351
+ }
352
+ },
353
+ "LabHC/bias_in_bios_class_set3_results": {
354
+ "sae_test_accuracy": {
355
+ "20": 0.9530000686645508,
356
+ "21": 0.9240000247955322,
357
+ "22": 0.9010000228881836,
358
+ "25": 0.9520000219345093,
359
+ "26": 0.8860000371932983
360
+ },
361
+ "llm_test_accuracy": {
362
+ "20": 0.956000030040741,
363
+ "21": 0.9350000619888306,
364
+ "22": 0.9180000424385071,
365
+ "25": 0.9640000462532043,
366
+ "26": 0.8850000500679016
367
+ },
368
+ "llm_top_1_test_accuracy": {
369
+ "20": 0.693,
370
+ "21": 0.775,
371
+ "22": 0.645,
372
+ "25": 0.706,
373
+ "26": 0.616
374
+ },
375
+ "llm_top_2_test_accuracy": {
376
+ "20": 0.827,
377
+ "21": 0.761,
378
+ "22": 0.694,
379
+ "25": 0.778,
380
+ "26": 0.686
381
+ },
382
+ "llm_top_5_test_accuracy": {
383
+ "20": 0.855,
384
+ "21": 0.791,
385
+ "22": 0.725,
386
+ "25": 0.809,
387
+ "26": 0.672
388
+ },
389
+ "sae_top_1_test_accuracy": {
390
+ "20": 0.852,
391
+ "21": 0.736,
392
+ "22": 0.829,
393
+ "25": 0.699,
394
+ "26": 0.634
395
+ },
396
+ "sae_top_2_test_accuracy": {
397
+ "20": 0.86,
398
+ "21": 0.78,
399
+ "22": 0.852,
400
+ "25": 0.837,
401
+ "26": 0.633
402
+ },
403
+ "sae_top_5_test_accuracy": {
404
+ "20": 0.93,
405
+ "21": 0.833,
406
+ "22": 0.851,
407
+ "25": 0.875,
408
+ "26": 0.791
409
+ }
410
+ },
411
+ "canrager/amazon_reviews_mcauley_1and5_results": {
412
+ "sae_test_accuracy": {
413
+ "1": 0.9360000491142273,
414
+ "2": 0.9330000281333923,
415
+ "3": 0.9150000214576721,
416
+ "5": 0.8990000486373901,
417
+ "6": 0.8640000224113464
418
+ },
419
+ "llm_test_accuracy": {
420
+ "1": 0.9580000638961792,
421
+ "2": 0.9330000281333923,
422
+ "3": 0.9280000329017639,
423
+ "5": 0.9200000166893005,
424
+ "6": 0.862000048160553
425
+ },
426
+ "llm_top_1_test_accuracy": {
427
+ "1": 0.647,
428
+ "2": 0.603,
429
+ "3": 0.598,
430
+ "5": 0.555,
431
+ "6": 0.592
432
+ },
433
+ "llm_top_2_test_accuracy": {
434
+ "1": 0.75,
435
+ "2": 0.648,
436
+ "3": 0.607,
437
+ "5": 0.606,
438
+ "6": 0.626
439
+ },
440
+ "llm_top_5_test_accuracy": {
441
+ "1": 0.767,
442
+ "2": 0.641,
443
+ "3": 0.645,
444
+ "5": 0.638,
445
+ "6": 0.676
446
+ },
447
+ "sae_top_1_test_accuracy": {
448
+ "1": 0.882,
449
+ "2": 0.869,
450
+ "3": 0.551,
451
+ "5": 0.537,
452
+ "6": 0.625
453
+ },
454
+ "sae_top_2_test_accuracy": {
455
+ "1": 0.885,
456
+ "2": 0.866,
457
+ "3": 0.657,
458
+ "5": 0.606,
459
+ "6": 0.706
460
+ },
461
+ "sae_top_5_test_accuracy": {
462
+ "1": 0.908,
463
+ "2": 0.887,
464
+ "3": 0.656,
465
+ "5": 0.869,
466
+ "6": 0.729
467
+ }
468
+ },
469
+ "canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
470
+ "sae_test_accuracy": {
471
+ "1.0": 0.9640000462532043,
472
+ "5.0": 0.9660000205039978
473
+ },
474
+ "llm_test_accuracy": {
475
+ "1.0": 0.9780000448226929,
476
+ "5.0": 0.9810000658035278
477
+ },
478
+ "llm_top_1_test_accuracy": {
479
+ "1.0": 0.673,
480
+ "5.0": 0.673
481
+ },
482
+ "llm_top_2_test_accuracy": {
483
+ "1.0": 0.724,
484
+ "5.0": 0.724
485
+ },
486
+ "llm_top_5_test_accuracy": {
487
+ "1.0": 0.766,
488
+ "5.0": 0.766
489
+ },
490
+ "sae_top_1_test_accuracy": {
491
+ "1.0": 0.745,
492
+ "5.0": 0.745
493
+ },
494
+ "sae_top_2_test_accuracy": {
495
+ "1.0": 0.895,
496
+ "5.0": 0.895
497
+ },
498
+ "sae_top_5_test_accuracy": {
499
+ "1.0": 0.936,
500
+ "5.0": 0.936
501
+ }
502
+ },
503
+ "codeparrot/github-code_results": {
504
+ "sae_test_accuracy": {
505
+ "C": 0.9570000171661377,
506
+ "Python": 0.9830000400543213,
507
+ "HTML": 0.9810000658035278,
508
+ "Java": 0.9640000462532043,
509
+ "PHP": 0.9570000171661377
510
+ },
511
+ "llm_test_accuracy": {
512
+ "C": 0.956000030040741,
513
+ "Python": 0.984000027179718,
514
+ "HTML": 0.9900000691413879,
515
+ "Java": 0.9670000672340393,
516
+ "PHP": 0.9570000171661377
517
+ },
518
+ "llm_top_1_test_accuracy": {
519
+ "C": 0.666,
520
+ "Python": 0.626,
521
+ "HTML": 0.721,
522
+ "Java": 0.619,
523
+ "PHP": 0.594
524
+ },
525
+ "llm_top_2_test_accuracy": {
526
+ "C": 0.679,
527
+ "Python": 0.674,
528
+ "HTML": 0.8,
529
+ "Java": 0.676,
530
+ "PHP": 0.651
531
+ },
532
+ "llm_top_5_test_accuracy": {
533
+ "C": 0.783,
534
+ "Python": 0.717,
535
+ "HTML": 0.935,
536
+ "Java": 0.733,
537
+ "PHP": 0.715
538
+ },
539
+ "sae_top_1_test_accuracy": {
540
+ "C": 0.523,
541
+ "Python": 0.641,
542
+ "HTML": 0.596,
543
+ "Java": 0.623,
544
+ "PHP": 0.595
545
+ },
546
+ "sae_top_2_test_accuracy": {
547
+ "C": 0.61,
548
+ "Python": 0.881,
549
+ "HTML": 0.74,
550
+ "Java": 0.66,
551
+ "PHP": 0.915
552
+ },
553
+ "sae_top_5_test_accuracy": {
554
+ "C": 0.643,
555
+ "Python": 0.883,
556
+ "HTML": 0.897,
557
+ "Java": 0.657,
558
+ "PHP": 0.913
559
+ }
560
+ },
561
+ "fancyzhx/ag_news_results": {
562
+ "sae_test_accuracy": {
563
+ "0": 0.9320000410079956,
564
+ "1": 0.9830000400543213,
565
+ "2": 0.9290000200271606,
566
+ "3": 0.9540000557899475
567
+ },
568
+ "llm_test_accuracy": {
569
+ "0": 0.940000057220459,
570
+ "1": 0.9860000610351562,
571
+ "2": 0.9200000166893005,
572
+ "3": 0.9540000557899475
573
+ },
574
+ "llm_top_1_test_accuracy": {
575
+ "0": 0.573,
576
+ "1": 0.671,
577
+ "2": 0.672,
578
+ "3": 0.635
579
+ },
580
+ "llm_top_2_test_accuracy": {
581
+ "0": 0.802,
582
+ "1": 0.808,
583
+ "2": 0.701,
584
+ "3": 0.816
585
+ },
586
+ "llm_top_5_test_accuracy": {
587
+ "0": 0.81,
588
+ "1": 0.891,
589
+ "2": 0.752,
590
+ "3": 0.832
591
+ },
592
+ "sae_top_1_test_accuracy": {
593
+ "0": 0.583,
594
+ "1": 0.652,
595
+ "2": 0.554,
596
+ "3": 0.625
597
+ },
598
+ "sae_top_2_test_accuracy": {
599
+ "0": 0.667,
600
+ "1": 0.689,
601
+ "2": 0.748,
602
+ "3": 0.684
603
+ },
604
+ "sae_top_5_test_accuracy": {
605
+ "0": 0.81,
606
+ "1": 0.915,
607
+ "2": 0.841,
608
+ "3": 0.808
609
+ }
610
+ },
611
+ "Helsinki-NLP/europarl_results": {
612
+ "sae_test_accuracy": {
613
+ "en": 0.9980000257492065,
614
+ "fr": 0.999000072479248,
615
+ "de": 1.0,
616
+ "es": 0.9970000386238098,
617
+ "nl": 0.999000072479248
618
+ },
619
+ "llm_test_accuracy": {
620
+ "en": 1.0,
621
+ "fr": 0.999000072479248,
622
+ "de": 1.0,
623
+ "es": 0.999000072479248,
624
+ "nl": 1.0
625
+ },
626
+ "llm_top_1_test_accuracy": {
627
+ "en": 0.739,
628
+ "fr": 0.585,
629
+ "de": 0.758,
630
+ "es": 0.496,
631
+ "nl": 0.649
632
+ },
633
+ "llm_top_2_test_accuracy": {
634
+ "en": 0.829,
635
+ "fr": 0.582,
636
+ "de": 0.82,
637
+ "es": 0.958,
638
+ "nl": 0.753
639
+ },
640
+ "llm_top_5_test_accuracy": {
641
+ "en": 0.892,
642
+ "fr": 0.888,
643
+ "de": 0.894,
644
+ "es": 0.98,
645
+ "nl": 0.852
646
+ },
647
+ "sae_top_1_test_accuracy": {
648
+ "en": 0.737,
649
+ "fr": 0.985,
650
+ "de": 0.93,
651
+ "es": 0.986,
652
+ "nl": 0.639
653
+ },
654
+ "sae_top_2_test_accuracy": {
655
+ "en": 0.972,
656
+ "fr": 0.991,
657
+ "de": 0.937,
658
+ "es": 0.989,
659
+ "nl": 0.758
660
+ },
661
+ "sae_top_5_test_accuracy": {
662
+ "en": 0.998,
663
+ "fr": 0.996,
664
+ "de": 0.944,
665
+ "es": 0.993,
666
+ "nl": 0.996
667
+ }
668
+ }
669
+ }
670
+ }
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,670 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "sparse_probing",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "LabHC/bias_in_bios_class_set2",
8
+ "LabHC/bias_in_bios_class_set3",
9
+ "canrager/amazon_reviews_mcauley_1and5",
10
+ "canrager/amazon_reviews_mcauley_1and5_sentiment",
11
+ "codeparrot/github-code",
12
+ "fancyzhx/ag_news",
13
+ "Helsinki-NLP/europarl"
14
+ ],
15
+ "probe_train_set_size": 4000,
16
+ "probe_test_set_size": 1000,
17
+ "context_length": 128,
18
+ "sae_batch_size": 125,
19
+ "llm_batch_size": 32,
20
+ "llm_dtype": "bfloat16",
21
+ "model_name": "gemma-2-2b",
22
+ "k_values": [
23
+ 1,
24
+ 2,
25
+ 5
26
+ ],
27
+ "lower_vram_usage": false
28
+ },
29
+ "eval_id": "01df6242-51fa-47d4-af93-9c80a172184d",
30
+ "datetime_epoch_millis": 1740165335112,
31
+ "eval_result_metrics": {
32
+ "llm": {
33
+ "llm_test_accuracy": 0.9595375448465346,
34
+ "llm_top_1_test_accuracy": 0.64956875,
35
+ "llm_top_2_test_accuracy": 0.72589375,
36
+ "llm_top_5_test_accuracy": 0.78265625,
37
+ "llm_top_10_test_accuracy": null,
38
+ "llm_top_20_test_accuracy": null,
39
+ "llm_top_50_test_accuracy": null,
40
+ "llm_top_100_test_accuracy": null
41
+ },
42
+ "sae": {
43
+ "sae_test_accuracy": 0.9563000392168761,
44
+ "sae_top_1_test_accuracy": 0.7563249999999999,
45
+ "sae_top_2_test_accuracy": 0.80766875,
46
+ "sae_top_5_test_accuracy": 0.861175,
47
+ "sae_top_10_test_accuracy": null,
48
+ "sae_top_20_test_accuracy": null,
49
+ "sae_top_50_test_accuracy": null,
50
+ "sae_top_100_test_accuracy": null
51
+ }
52
+ },
53
+ "eval_result_details": [
54
+ {
55
+ "dataset_name": "LabHC/bias_in_bios_class_set1_results",
56
+ "llm_test_accuracy": 0.966800057888031,
57
+ "llm_top_1_test_accuracy": 0.6397999999999999,
58
+ "llm_top_2_test_accuracy": 0.6954,
59
+ "llm_top_5_test_accuracy": 0.7869999999999999,
60
+ "llm_top_10_test_accuracy": null,
61
+ "llm_top_20_test_accuracy": null,
62
+ "llm_top_50_test_accuracy": null,
63
+ "llm_top_100_test_accuracy": null,
64
+ "sae_test_accuracy": 0.9606000423431397,
65
+ "sae_top_1_test_accuracy": 0.735,
66
+ "sae_top_2_test_accuracy": 0.8328,
67
+ "sae_top_5_test_accuracy": 0.8700000000000001,
68
+ "sae_top_10_test_accuracy": null,
69
+ "sae_top_20_test_accuracy": null,
70
+ "sae_top_50_test_accuracy": null,
71
+ "sae_top_100_test_accuracy": null
72
+ },
73
+ {
74
+ "dataset_name": "LabHC/bias_in_bios_class_set2_results",
75
+ "llm_test_accuracy": 0.9578000426292419,
76
+ "llm_top_1_test_accuracy": 0.6694000000000001,
77
+ "llm_top_2_test_accuracy": 0.725,
78
+ "llm_top_5_test_accuracy": 0.7654,
79
+ "llm_top_10_test_accuracy": null,
80
+ "llm_top_20_test_accuracy": null,
81
+ "llm_top_50_test_accuracy": null,
82
+ "llm_top_100_test_accuracy": null,
83
+ "sae_test_accuracy": 0.9454000473022461,
84
+ "sae_top_1_test_accuracy": 0.7318,
85
+ "sae_top_2_test_accuracy": 0.7548,
86
+ "sae_top_5_test_accuracy": 0.8513999999999999,
87
+ "sae_top_10_test_accuracy": null,
88
+ "sae_top_20_test_accuracy": null,
89
+ "sae_top_50_test_accuracy": null,
90
+ "sae_top_100_test_accuracy": null
91
+ },
92
+ {
93
+ "dataset_name": "LabHC/bias_in_bios_class_set3_results",
94
+ "llm_test_accuracy": 0.9316000461578369,
95
+ "llm_top_1_test_accuracy": 0.687,
96
+ "llm_top_2_test_accuracy": 0.7492,
97
+ "llm_top_5_test_accuracy": 0.7704000000000001,
98
+ "llm_top_10_test_accuracy": null,
99
+ "llm_top_20_test_accuracy": null,
100
+ "llm_top_50_test_accuracy": null,
101
+ "llm_top_100_test_accuracy": null,
102
+ "sae_test_accuracy": 0.931600034236908,
103
+ "sae_top_1_test_accuracy": 0.7748000000000002,
104
+ "sae_top_2_test_accuracy": 0.8088000000000001,
105
+ "sae_top_5_test_accuracy": 0.8513999999999999,
106
+ "sae_top_10_test_accuracy": null,
107
+ "sae_top_20_test_accuracy": null,
108
+ "sae_top_50_test_accuracy": null,
109
+ "sae_top_100_test_accuracy": null
110
+ },
111
+ {
112
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
113
+ "llm_test_accuracy": 0.9202000379562378,
114
+ "llm_top_1_test_accuracy": 0.599,
115
+ "llm_top_2_test_accuracy": 0.6474,
116
+ "llm_top_5_test_accuracy": 0.6734,
117
+ "llm_top_10_test_accuracy": null,
118
+ "llm_top_20_test_accuracy": null,
119
+ "llm_top_50_test_accuracy": null,
120
+ "llm_top_100_test_accuracy": null,
121
+ "sae_test_accuracy": 0.9192000389099121,
122
+ "sae_top_1_test_accuracy": 0.683,
123
+ "sae_top_2_test_accuracy": 0.6950000000000001,
124
+ "sae_top_5_test_accuracy": 0.7598,
125
+ "sae_top_10_test_accuracy": null,
126
+ "sae_top_20_test_accuracy": null,
127
+ "sae_top_50_test_accuracy": null,
128
+ "sae_top_100_test_accuracy": null
129
+ },
130
+ {
131
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
132
+ "llm_test_accuracy": 0.9795000553131104,
133
+ "llm_top_1_test_accuracy": 0.673,
134
+ "llm_top_2_test_accuracy": 0.724,
135
+ "llm_top_5_test_accuracy": 0.766,
136
+ "llm_top_10_test_accuracy": null,
137
+ "llm_top_20_test_accuracy": null,
138
+ "llm_top_50_test_accuracy": null,
139
+ "llm_top_100_test_accuracy": null,
140
+ "sae_test_accuracy": 0.9755000472068787,
141
+ "sae_top_1_test_accuracy": 0.94,
142
+ "sae_top_2_test_accuracy": 0.941,
143
+ "sae_top_5_test_accuracy": 0.95,
144
+ "sae_top_10_test_accuracy": null,
145
+ "sae_top_20_test_accuracy": null,
146
+ "sae_top_50_test_accuracy": null,
147
+ "sae_top_100_test_accuracy": null
148
+ },
149
+ {
150
+ "dataset_name": "codeparrot/github-code_results",
151
+ "llm_test_accuracy": 0.9708000421524048,
152
+ "llm_top_1_test_accuracy": 0.6451999999999999,
153
+ "llm_top_2_test_accuracy": 0.6960000000000001,
154
+ "llm_top_5_test_accuracy": 0.7766,
155
+ "llm_top_10_test_accuracy": null,
156
+ "llm_top_20_test_accuracy": null,
157
+ "llm_top_50_test_accuracy": null,
158
+ "llm_top_100_test_accuracy": null,
159
+ "sae_test_accuracy": 0.9714000463485718,
160
+ "sae_top_1_test_accuracy": 0.6077999999999999,
161
+ "sae_top_2_test_accuracy": 0.7322,
162
+ "sae_top_5_test_accuracy": 0.7504,
163
+ "sae_top_10_test_accuracy": null,
164
+ "sae_top_20_test_accuracy": null,
165
+ "sae_top_50_test_accuracy": null,
166
+ "sae_top_100_test_accuracy": null
167
+ },
168
+ {
169
+ "dataset_name": "fancyzhx/ag_news_results",
170
+ "llm_test_accuracy": 0.9500000476837158,
171
+ "llm_top_1_test_accuracy": 0.63775,
172
+ "llm_top_2_test_accuracy": 0.78175,
173
+ "llm_top_5_test_accuracy": 0.82125,
174
+ "llm_top_10_test_accuracy": null,
175
+ "llm_top_20_test_accuracy": null,
176
+ "llm_top_50_test_accuracy": null,
177
+ "llm_top_100_test_accuracy": null,
178
+ "sae_test_accuracy": 0.9485000222921371,
179
+ "sae_top_1_test_accuracy": 0.6849999999999999,
180
+ "sae_top_2_test_accuracy": 0.74675,
181
+ "sae_top_5_test_accuracy": 0.859,
182
+ "sae_top_10_test_accuracy": null,
183
+ "sae_top_20_test_accuracy": null,
184
+ "sae_top_50_test_accuracy": null,
185
+ "sae_top_100_test_accuracy": null
186
+ },
187
+ {
188
+ "dataset_name": "Helsinki-NLP/europarl_results",
189
+ "llm_test_accuracy": 0.9996000289916992,
190
+ "llm_top_1_test_accuracy": 0.6454,
191
+ "llm_top_2_test_accuracy": 0.7884,
192
+ "llm_top_5_test_accuracy": 0.9012,
193
+ "llm_top_10_test_accuracy": null,
194
+ "llm_top_20_test_accuracy": null,
195
+ "llm_top_50_test_accuracy": null,
196
+ "llm_top_100_test_accuracy": null,
197
+ "sae_test_accuracy": 0.9982000350952148,
198
+ "sae_top_1_test_accuracy": 0.8932,
199
+ "sae_top_2_test_accuracy": 0.95,
200
+ "sae_top_5_test_accuracy": 0.9974000000000001,
201
+ "sae_top_10_test_accuracy": null,
202
+ "sae_top_20_test_accuracy": null,
203
+ "sae_top_50_test_accuracy": null,
204
+ "sae_top_100_test_accuracy": null
205
+ }
206
+ ],
207
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
208
+ "sae_lens_id": "custom_sae",
209
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0",
210
+ "sae_lens_version": "5.4.2",
211
+ "sae_cfg_dict": {
212
+ "model_name": "gemma-2-2b",
213
+ "d_in": 2304,
214
+ "d_sae": 65536,
215
+ "hook_layer": 12,
216
+ "hook_name": "blocks.12.hook_resid_post",
217
+ "context_size": null,
218
+ "hook_head_index": null,
219
+ "architecture": "topk",
220
+ "apply_b_dec_to_input": null,
221
+ "finetuning_scaling_factor": null,
222
+ "activation_fn_str": "",
223
+ "prepend_bos": true,
224
+ "normalize_activations": "none",
225
+ "dtype": "bfloat16",
226
+ "device": "",
227
+ "dataset_path": "",
228
+ "dataset_trust_remote_code": true,
229
+ "seqpos_slice": [
230
+ null
231
+ ],
232
+ "training_tokens": -100000,
233
+ "sae_lens_training_version": null,
234
+ "neuronpedia_id": null
235
+ },
236
+ "eval_result_unstructured": {
237
+ "LabHC/bias_in_bios_class_set1_results": {
238
+ "sae_test_accuracy": {
239
+ "0": 0.9460000395774841,
240
+ "1": 0.9550000429153442,
241
+ "2": 0.9440000653266907,
242
+ "6": 0.9830000400543213,
243
+ "9": 0.9750000238418579
244
+ },
245
+ "llm_test_accuracy": {
246
+ "0": 0.9510000348091125,
247
+ "1": 0.9670000672340393,
248
+ "2": 0.9530000686645508,
249
+ "6": 0.987000048160553,
250
+ "9": 0.9760000705718994
251
+ },
252
+ "llm_top_1_test_accuracy": {
253
+ "0": 0.577,
254
+ "1": 0.613,
255
+ "2": 0.662,
256
+ "6": 0.787,
257
+ "9": 0.56
258
+ },
259
+ "llm_top_2_test_accuracy": {
260
+ "0": 0.574,
261
+ "1": 0.66,
262
+ "2": 0.718,
263
+ "6": 0.811,
264
+ "9": 0.714
265
+ },
266
+ "llm_top_5_test_accuracy": {
267
+ "0": 0.713,
268
+ "1": 0.711,
269
+ "2": 0.755,
270
+ "6": 0.895,
271
+ "9": 0.861
272
+ },
273
+ "sae_top_1_test_accuracy": {
274
+ "0": 0.876,
275
+ "1": 0.586,
276
+ "2": 0.852,
277
+ "6": 0.751,
278
+ "9": 0.61
279
+ },
280
+ "sae_top_2_test_accuracy": {
281
+ "0": 0.878,
282
+ "1": 0.607,
283
+ "2": 0.848,
284
+ "6": 0.976,
285
+ "9": 0.855
286
+ },
287
+ "sae_top_5_test_accuracy": {
288
+ "0": 0.884,
289
+ "1": 0.696,
290
+ "2": 0.864,
291
+ "6": 0.982,
292
+ "9": 0.924
293
+ }
294
+ },
295
+ "LabHC/bias_in_bios_class_set2_results": {
296
+ "sae_test_accuracy": {
297
+ "11": 0.9500000476837158,
298
+ "13": 0.9490000605583191,
299
+ "14": 0.9550000429153442,
300
+ "18": 0.9130000472068787,
301
+ "19": 0.9600000381469727
302
+ },
303
+ "llm_test_accuracy": {
304
+ "11": 0.9690000414848328,
305
+ "13": 0.9600000381469727,
306
+ "14": 0.9600000381469727,
307
+ "18": 0.9390000700950623,
308
+ "19": 0.9610000252723694
309
+ },
310
+ "llm_top_1_test_accuracy": {
311
+ "11": 0.555,
312
+ "13": 0.668,
313
+ "14": 0.638,
314
+ "18": 0.69,
315
+ "19": 0.796
316
+ },
317
+ "llm_top_2_test_accuracy": {
318
+ "11": 0.756,
319
+ "13": 0.714,
320
+ "14": 0.67,
321
+ "18": 0.717,
322
+ "19": 0.768
323
+ },
324
+ "llm_top_5_test_accuracy": {
325
+ "11": 0.794,
326
+ "13": 0.749,
327
+ "14": 0.723,
328
+ "18": 0.73,
329
+ "19": 0.831
330
+ },
331
+ "sae_top_1_test_accuracy": {
332
+ "11": 0.855,
333
+ "13": 0.662,
334
+ "14": 0.618,
335
+ "18": 0.672,
336
+ "19": 0.852
337
+ },
338
+ "sae_top_2_test_accuracy": {
339
+ "11": 0.853,
340
+ "13": 0.641,
341
+ "14": 0.736,
342
+ "18": 0.69,
343
+ "19": 0.854
344
+ },
345
+ "sae_top_5_test_accuracy": {
346
+ "11": 0.847,
347
+ "13": 0.769,
348
+ "14": 0.872,
349
+ "18": 0.907,
350
+ "19": 0.862
351
+ }
352
+ },
353
+ "LabHC/bias_in_bios_class_set3_results": {
354
+ "sae_test_accuracy": {
355
+ "20": 0.9580000638961792,
356
+ "21": 0.9240000247955322,
357
+ "22": 0.9190000295639038,
358
+ "25": 0.9570000171661377,
359
+ "26": 0.9000000357627869
360
+ },
361
+ "llm_test_accuracy": {
362
+ "20": 0.956000030040741,
363
+ "21": 0.9350000619888306,
364
+ "22": 0.9180000424385071,
365
+ "25": 0.9640000462532043,
366
+ "26": 0.8850000500679016
367
+ },
368
+ "llm_top_1_test_accuracy": {
369
+ "20": 0.693,
370
+ "21": 0.775,
371
+ "22": 0.645,
372
+ "25": 0.706,
373
+ "26": 0.616
374
+ },
375
+ "llm_top_2_test_accuracy": {
376
+ "20": 0.827,
377
+ "21": 0.761,
378
+ "22": 0.694,
379
+ "25": 0.778,
380
+ "26": 0.686
381
+ },
382
+ "llm_top_5_test_accuracy": {
383
+ "20": 0.855,
384
+ "21": 0.791,
385
+ "22": 0.725,
386
+ "25": 0.809,
387
+ "26": 0.672
388
+ },
389
+ "sae_top_1_test_accuracy": {
390
+ "20": 0.889,
391
+ "21": 0.61,
392
+ "22": 0.877,
393
+ "25": 0.881,
394
+ "26": 0.617
395
+ },
396
+ "sae_top_2_test_accuracy": {
397
+ "20": 0.866,
398
+ "21": 0.782,
399
+ "22": 0.873,
400
+ "25": 0.898,
401
+ "26": 0.625
402
+ },
403
+ "sae_top_5_test_accuracy": {
404
+ "20": 0.905,
405
+ "21": 0.789,
406
+ "22": 0.885,
407
+ "25": 0.899,
408
+ "26": 0.779
409
+ }
410
+ },
411
+ "canrager/amazon_reviews_mcauley_1and5_results": {
412
+ "sae_test_accuracy": {
413
+ "1": 0.9550000429153442,
414
+ "2": 0.9360000491142273,
415
+ "3": 0.9180000424385071,
416
+ "5": 0.9240000247955322,
417
+ "6": 0.8630000352859497
418
+ },
419
+ "llm_test_accuracy": {
420
+ "1": 0.9580000638961792,
421
+ "2": 0.9330000281333923,
422
+ "3": 0.9280000329017639,
423
+ "5": 0.9200000166893005,
424
+ "6": 0.862000048160553
425
+ },
426
+ "llm_top_1_test_accuracy": {
427
+ "1": 0.647,
428
+ "2": 0.603,
429
+ "3": 0.598,
430
+ "5": 0.555,
431
+ "6": 0.592
432
+ },
433
+ "llm_top_2_test_accuracy": {
434
+ "1": 0.75,
435
+ "2": 0.648,
436
+ "3": 0.607,
437
+ "5": 0.606,
438
+ "6": 0.626
439
+ },
440
+ "llm_top_5_test_accuracy": {
441
+ "1": 0.767,
442
+ "2": 0.641,
443
+ "3": 0.645,
444
+ "5": 0.638,
445
+ "6": 0.676
446
+ },
447
+ "sae_top_1_test_accuracy": {
448
+ "1": 0.792,
449
+ "2": 0.591,
450
+ "3": 0.568,
451
+ "5": 0.829,
452
+ "6": 0.635
453
+ },
454
+ "sae_top_2_test_accuracy": {
455
+ "1": 0.822,
456
+ "2": 0.621,
457
+ "3": 0.571,
458
+ "5": 0.823,
459
+ "6": 0.638
460
+ },
461
+ "sae_top_5_test_accuracy": {
462
+ "1": 0.838,
463
+ "2": 0.862,
464
+ "3": 0.609,
465
+ "5": 0.825,
466
+ "6": 0.665
467
+ }
468
+ },
469
+ "canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
470
+ "sae_test_accuracy": {
471
+ "1.0": 0.9770000576972961,
472
+ "5.0": 0.9740000367164612
473
+ },
474
+ "llm_test_accuracy": {
475
+ "1.0": 0.9780000448226929,
476
+ "5.0": 0.9810000658035278
477
+ },
478
+ "llm_top_1_test_accuracy": {
479
+ "1.0": 0.673,
480
+ "5.0": 0.673
481
+ },
482
+ "llm_top_2_test_accuracy": {
483
+ "1.0": 0.724,
484
+ "5.0": 0.724
485
+ },
486
+ "llm_top_5_test_accuracy": {
487
+ "1.0": 0.766,
488
+ "5.0": 0.766
489
+ },
490
+ "sae_top_1_test_accuracy": {
491
+ "1.0": 0.94,
492
+ "5.0": 0.94
493
+ },
494
+ "sae_top_2_test_accuracy": {
495
+ "1.0": 0.941,
496
+ "5.0": 0.941
497
+ },
498
+ "sae_top_5_test_accuracy": {
499
+ "1.0": 0.95,
500
+ "5.0": 0.95
501
+ }
502
+ },
503
+ "codeparrot/github-code_results": {
504
+ "sae_test_accuracy": {
505
+ "C": 0.9600000381469727,
506
+ "Python": 0.984000027179718,
507
+ "HTML": 0.9920000433921814,
508
+ "Java": 0.9670000672340393,
509
+ "PHP": 0.9540000557899475
510
+ },
511
+ "llm_test_accuracy": {
512
+ "C": 0.956000030040741,
513
+ "Python": 0.984000027179718,
514
+ "HTML": 0.9900000691413879,
515
+ "Java": 0.9670000672340393,
516
+ "PHP": 0.9570000171661377
517
+ },
518
+ "llm_top_1_test_accuracy": {
519
+ "C": 0.666,
520
+ "Python": 0.626,
521
+ "HTML": 0.721,
522
+ "Java": 0.619,
523
+ "PHP": 0.594
524
+ },
525
+ "llm_top_2_test_accuracy": {
526
+ "C": 0.679,
527
+ "Python": 0.674,
528
+ "HTML": 0.8,
529
+ "Java": 0.676,
530
+ "PHP": 0.651
531
+ },
532
+ "llm_top_5_test_accuracy": {
533
+ "C": 0.783,
534
+ "Python": 0.717,
535
+ "HTML": 0.935,
536
+ "Java": 0.733,
537
+ "PHP": 0.715
538
+ },
539
+ "sae_top_1_test_accuracy": {
540
+ "C": 0.595,
541
+ "Python": 0.629,
542
+ "HTML": 0.565,
543
+ "Java": 0.647,
544
+ "PHP": 0.603
545
+ },
546
+ "sae_top_2_test_accuracy": {
547
+ "C": 0.622,
548
+ "Python": 0.654,
549
+ "HTML": 0.824,
550
+ "Java": 0.649,
551
+ "PHP": 0.912
552
+ },
553
+ "sae_top_5_test_accuracy": {
554
+ "C": 0.654,
555
+ "Python": 0.7,
556
+ "HTML": 0.807,
557
+ "Java": 0.674,
558
+ "PHP": 0.917
559
+ }
560
+ },
561
+ "fancyzhx/ag_news_results": {
562
+ "sae_test_accuracy": {
563
+ "0": 0.9380000233650208,
564
+ "1": 0.9800000190734863,
565
+ "2": 0.9330000281333923,
566
+ "3": 0.9430000185966492
567
+ },
568
+ "llm_test_accuracy": {
569
+ "0": 0.940000057220459,
570
+ "1": 0.9860000610351562,
571
+ "2": 0.9200000166893005,
572
+ "3": 0.9540000557899475
573
+ },
574
+ "llm_top_1_test_accuracy": {
575
+ "0": 0.573,
576
+ "1": 0.671,
577
+ "2": 0.672,
578
+ "3": 0.635
579
+ },
580
+ "llm_top_2_test_accuracy": {
581
+ "0": 0.802,
582
+ "1": 0.808,
583
+ "2": 0.701,
584
+ "3": 0.816
585
+ },
586
+ "llm_top_5_test_accuracy": {
587
+ "0": 0.81,
588
+ "1": 0.891,
589
+ "2": 0.752,
590
+ "3": 0.832
591
+ },
592
+ "sae_top_1_test_accuracy": {
593
+ "0": 0.591,
594
+ "1": 0.934,
595
+ "2": 0.562,
596
+ "3": 0.653
597
+ },
598
+ "sae_top_2_test_accuracy": {
599
+ "0": 0.704,
600
+ "1": 0.937,
601
+ "2": 0.69,
602
+ "3": 0.656
603
+ },
604
+ "sae_top_5_test_accuracy": {
605
+ "0": 0.835,
606
+ "1": 0.954,
607
+ "2": 0.834,
608
+ "3": 0.813
609
+ }
610
+ },
611
+ "Helsinki-NLP/europarl_results": {
612
+ "sae_test_accuracy": {
613
+ "en": 0.999000072479248,
614
+ "fr": 0.9970000386238098,
615
+ "de": 0.9980000257492065,
616
+ "es": 1.0,
617
+ "nl": 0.9970000386238098
618
+ },
619
+ "llm_test_accuracy": {
620
+ "en": 1.0,
621
+ "fr": 0.999000072479248,
622
+ "de": 1.0,
623
+ "es": 0.999000072479248,
624
+ "nl": 1.0
625
+ },
626
+ "llm_top_1_test_accuracy": {
627
+ "en": 0.739,
628
+ "fr": 0.585,
629
+ "de": 0.758,
630
+ "es": 0.496,
631
+ "nl": 0.649
632
+ },
633
+ "llm_top_2_test_accuracy": {
634
+ "en": 0.829,
635
+ "fr": 0.582,
636
+ "de": 0.82,
637
+ "es": 0.958,
638
+ "nl": 0.753
639
+ },
640
+ "llm_top_5_test_accuracy": {
641
+ "en": 0.892,
642
+ "fr": 0.888,
643
+ "de": 0.894,
644
+ "es": 0.98,
645
+ "nl": 0.852
646
+ },
647
+ "sae_top_1_test_accuracy": {
648
+ "en": 0.651,
649
+ "fr": 0.996,
650
+ "de": 0.925,
651
+ "es": 0.897,
652
+ "nl": 0.997
653
+ },
654
+ "sae_top_2_test_accuracy": {
655
+ "en": 0.776,
656
+ "fr": 0.997,
657
+ "de": 0.988,
658
+ "es": 0.992,
659
+ "nl": 0.997
660
+ },
661
+ "sae_top_5_test_accuracy": {
662
+ "en": 0.998,
663
+ "fr": 0.997,
664
+ "de": 0.997,
665
+ "es": 0.996,
666
+ "nl": 0.999
667
+ }
668
+ }
669
+ }
670
+ }
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,670 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "sparse_probing",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "LabHC/bias_in_bios_class_set2",
8
+ "LabHC/bias_in_bios_class_set3",
9
+ "canrager/amazon_reviews_mcauley_1and5",
10
+ "canrager/amazon_reviews_mcauley_1and5_sentiment",
11
+ "codeparrot/github-code",
12
+ "fancyzhx/ag_news",
13
+ "Helsinki-NLP/europarl"
14
+ ],
15
+ "probe_train_set_size": 4000,
16
+ "probe_test_set_size": 1000,
17
+ "context_length": 128,
18
+ "sae_batch_size": 125,
19
+ "llm_batch_size": 32,
20
+ "llm_dtype": "bfloat16",
21
+ "model_name": "gemma-2-2b",
22
+ "k_values": [
23
+ 1,
24
+ 2,
25
+ 5
26
+ ],
27
+ "lower_vram_usage": false
28
+ },
29
+ "eval_id": "c9b1c992-08d4-4128-85b3-7725ac95e5a6",
30
+ "datetime_epoch_millis": 1740165200108,
31
+ "eval_result_metrics": {
32
+ "llm": {
33
+ "llm_test_accuracy": 0.9595375448465346,
34
+ "llm_top_1_test_accuracy": 0.64956875,
35
+ "llm_top_2_test_accuracy": 0.72589375,
36
+ "llm_top_5_test_accuracy": 0.78265625,
37
+ "llm_top_10_test_accuracy": null,
38
+ "llm_top_20_test_accuracy": null,
39
+ "llm_top_50_test_accuracy": null,
40
+ "llm_top_100_test_accuracy": null
41
+ },
42
+ "sae": {
43
+ "sae_test_accuracy": 0.9555687937885523,
44
+ "sae_top_1_test_accuracy": 0.71979375,
45
+ "sae_top_2_test_accuracy": 0.8020937500000002,
46
+ "sae_top_5_test_accuracy": 0.86435625,
47
+ "sae_top_10_test_accuracy": null,
48
+ "sae_top_20_test_accuracy": null,
49
+ "sae_top_50_test_accuracy": null,
50
+ "sae_top_100_test_accuracy": null
51
+ }
52
+ },
53
+ "eval_result_details": [
54
+ {
55
+ "dataset_name": "LabHC/bias_in_bios_class_set1_results",
56
+ "llm_test_accuracy": 0.966800057888031,
57
+ "llm_top_1_test_accuracy": 0.6397999999999999,
58
+ "llm_top_2_test_accuracy": 0.6954,
59
+ "llm_top_5_test_accuracy": 0.7869999999999999,
60
+ "llm_top_10_test_accuracy": null,
61
+ "llm_top_20_test_accuracy": null,
62
+ "llm_top_50_test_accuracy": null,
63
+ "llm_top_100_test_accuracy": null,
64
+ "sae_test_accuracy": 0.962000048160553,
65
+ "sae_top_1_test_accuracy": 0.6808,
66
+ "sae_top_2_test_accuracy": 0.8480000000000001,
67
+ "sae_top_5_test_accuracy": 0.9032,
68
+ "sae_top_10_test_accuracy": null,
69
+ "sae_top_20_test_accuracy": null,
70
+ "sae_top_50_test_accuracy": null,
71
+ "sae_top_100_test_accuracy": null
72
+ },
73
+ {
74
+ "dataset_name": "LabHC/bias_in_bios_class_set2_results",
75
+ "llm_test_accuracy": 0.9578000426292419,
76
+ "llm_top_1_test_accuracy": 0.6694000000000001,
77
+ "llm_top_2_test_accuracy": 0.725,
78
+ "llm_top_5_test_accuracy": 0.7654,
79
+ "llm_top_10_test_accuracy": null,
80
+ "llm_top_20_test_accuracy": null,
81
+ "llm_top_50_test_accuracy": null,
82
+ "llm_top_100_test_accuracy": null,
83
+ "sae_test_accuracy": 0.9456000447273254,
84
+ "sae_top_1_test_accuracy": 0.669,
85
+ "sae_top_2_test_accuracy": 0.7938000000000001,
86
+ "sae_top_5_test_accuracy": 0.8597999999999999,
87
+ "sae_top_10_test_accuracy": null,
88
+ "sae_top_20_test_accuracy": null,
89
+ "sae_top_50_test_accuracy": null,
90
+ "sae_top_100_test_accuracy": null
91
+ },
92
+ {
93
+ "dataset_name": "LabHC/bias_in_bios_class_set3_results",
94
+ "llm_test_accuracy": 0.9316000461578369,
95
+ "llm_top_1_test_accuracy": 0.687,
96
+ "llm_top_2_test_accuracy": 0.7492,
97
+ "llm_top_5_test_accuracy": 0.7704000000000001,
98
+ "llm_top_10_test_accuracy": null,
99
+ "llm_top_20_test_accuracy": null,
100
+ "llm_top_50_test_accuracy": null,
101
+ "llm_top_100_test_accuracy": null,
102
+ "sae_test_accuracy": 0.9292000293731689,
103
+ "sae_top_1_test_accuracy": 0.704,
104
+ "sae_top_2_test_accuracy": 0.8064,
105
+ "sae_top_5_test_accuracy": 0.8610000000000001,
106
+ "sae_top_10_test_accuracy": null,
107
+ "sae_top_20_test_accuracy": null,
108
+ "sae_top_50_test_accuracy": null,
109
+ "sae_top_100_test_accuracy": null
110
+ },
111
+ {
112
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
113
+ "llm_test_accuracy": 0.9202000379562378,
114
+ "llm_top_1_test_accuracy": 0.599,
115
+ "llm_top_2_test_accuracy": 0.6474,
116
+ "llm_top_5_test_accuracy": 0.6734,
117
+ "llm_top_10_test_accuracy": null,
118
+ "llm_top_20_test_accuracy": null,
119
+ "llm_top_50_test_accuracy": null,
120
+ "llm_top_100_test_accuracy": null,
121
+ "sae_test_accuracy": 0.9170000433921814,
122
+ "sae_top_1_test_accuracy": 0.7496,
123
+ "sae_top_2_test_accuracy": 0.774,
124
+ "sae_top_5_test_accuracy": 0.8013999999999999,
125
+ "sae_top_10_test_accuracy": null,
126
+ "sae_top_20_test_accuracy": null,
127
+ "sae_top_50_test_accuracy": null,
128
+ "sae_top_100_test_accuracy": null
129
+ },
130
+ {
131
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
132
+ "llm_test_accuracy": 0.9795000553131104,
133
+ "llm_top_1_test_accuracy": 0.673,
134
+ "llm_top_2_test_accuracy": 0.724,
135
+ "llm_top_5_test_accuracy": 0.766,
136
+ "llm_top_10_test_accuracy": null,
137
+ "llm_top_20_test_accuracy": null,
138
+ "llm_top_50_test_accuracy": null,
139
+ "llm_top_100_test_accuracy": null,
140
+ "sae_test_accuracy": 0.9750000536441803,
141
+ "sae_top_1_test_accuracy": 0.847,
142
+ "sae_top_2_test_accuracy": 0.847,
143
+ "sae_top_5_test_accuracy": 0.929,
144
+ "sae_top_10_test_accuracy": null,
145
+ "sae_top_20_test_accuracy": null,
146
+ "sae_top_50_test_accuracy": null,
147
+ "sae_top_100_test_accuracy": null
148
+ },
149
+ {
150
+ "dataset_name": "codeparrot/github-code_results",
151
+ "llm_test_accuracy": 0.9708000421524048,
152
+ "llm_top_1_test_accuracy": 0.6451999999999999,
153
+ "llm_top_2_test_accuracy": 0.6960000000000001,
154
+ "llm_top_5_test_accuracy": 0.7766,
155
+ "llm_top_10_test_accuracy": null,
156
+ "llm_top_20_test_accuracy": null,
157
+ "llm_top_50_test_accuracy": null,
158
+ "llm_top_100_test_accuracy": null,
159
+ "sae_test_accuracy": 0.9676000475883484,
160
+ "sae_top_1_test_accuracy": 0.64,
161
+ "sae_top_2_test_accuracy": 0.6508,
162
+ "sae_top_5_test_accuracy": 0.7672,
163
+ "sae_top_10_test_accuracy": null,
164
+ "sae_top_20_test_accuracy": null,
165
+ "sae_top_50_test_accuracy": null,
166
+ "sae_top_100_test_accuracy": null
167
+ },
168
+ {
169
+ "dataset_name": "fancyzhx/ag_news_results",
170
+ "llm_test_accuracy": 0.9500000476837158,
171
+ "llm_top_1_test_accuracy": 0.63775,
172
+ "llm_top_2_test_accuracy": 0.78175,
173
+ "llm_top_5_test_accuracy": 0.82125,
174
+ "llm_top_10_test_accuracy": null,
175
+ "llm_top_20_test_accuracy": null,
176
+ "llm_top_50_test_accuracy": null,
177
+ "llm_top_100_test_accuracy": null,
178
+ "sae_test_accuracy": 0.9497500509023666,
179
+ "sae_top_1_test_accuracy": 0.60775,
180
+ "sae_top_2_test_accuracy": 0.70375,
181
+ "sae_top_5_test_accuracy": 0.7982499999999999,
182
+ "sae_top_10_test_accuracy": null,
183
+ "sae_top_20_test_accuracy": null,
184
+ "sae_top_50_test_accuracy": null,
185
+ "sae_top_100_test_accuracy": null
186
+ },
187
+ {
188
+ "dataset_name": "Helsinki-NLP/europarl_results",
189
+ "llm_test_accuracy": 0.9996000289916992,
190
+ "llm_top_1_test_accuracy": 0.6454,
191
+ "llm_top_2_test_accuracy": 0.7884,
192
+ "llm_top_5_test_accuracy": 0.9012,
193
+ "llm_top_10_test_accuracy": null,
194
+ "llm_top_20_test_accuracy": null,
195
+ "llm_top_50_test_accuracy": null,
196
+ "llm_top_100_test_accuracy": null,
197
+ "sae_test_accuracy": 0.9984000325202942,
198
+ "sae_top_1_test_accuracy": 0.8602000000000001,
199
+ "sae_top_2_test_accuracy": 0.993,
200
+ "sae_top_5_test_accuracy": 0.9950000000000001,
201
+ "sae_top_10_test_accuracy": null,
202
+ "sae_top_20_test_accuracy": null,
203
+ "sae_top_50_test_accuracy": null,
204
+ "sae_top_100_test_accuracy": null
205
+ }
206
+ ],
207
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
208
+ "sae_lens_id": "custom_sae",
209
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0",
210
+ "sae_lens_version": "5.4.2",
211
+ "sae_cfg_dict": {
212
+ "model_name": "gemma-2-2b",
213
+ "d_in": 2304,
214
+ "d_sae": 65536,
215
+ "hook_layer": 12,
216
+ "hook_name": "blocks.12.hook_resid_post",
217
+ "context_size": null,
218
+ "hook_head_index": null,
219
+ "architecture": "topk",
220
+ "apply_b_dec_to_input": null,
221
+ "finetuning_scaling_factor": null,
222
+ "activation_fn_str": "",
223
+ "prepend_bos": true,
224
+ "normalize_activations": "none",
225
+ "dtype": "bfloat16",
226
+ "device": "",
227
+ "dataset_path": "",
228
+ "dataset_trust_remote_code": true,
229
+ "seqpos_slice": [
230
+ null
231
+ ],
232
+ "training_tokens": -100000,
233
+ "sae_lens_training_version": null,
234
+ "neuronpedia_id": null
235
+ },
236
+ "eval_result_unstructured": {
237
+ "LabHC/bias_in_bios_class_set1_results": {
238
+ "sae_test_accuracy": {
239
+ "0": 0.9450000524520874,
240
+ "1": 0.9600000381469727,
241
+ "2": 0.9470000267028809,
242
+ "6": 0.9860000610351562,
243
+ "9": 0.9720000624656677
244
+ },
245
+ "llm_test_accuracy": {
246
+ "0": 0.9510000348091125,
247
+ "1": 0.9670000672340393,
248
+ "2": 0.9530000686645508,
249
+ "6": 0.987000048160553,
250
+ "9": 0.9760000705718994
251
+ },
252
+ "llm_top_1_test_accuracy": {
253
+ "0": 0.577,
254
+ "1": 0.613,
255
+ "2": 0.662,
256
+ "6": 0.787,
257
+ "9": 0.56
258
+ },
259
+ "llm_top_2_test_accuracy": {
260
+ "0": 0.574,
261
+ "1": 0.66,
262
+ "2": 0.718,
263
+ "6": 0.811,
264
+ "9": 0.714
265
+ },
266
+ "llm_top_5_test_accuracy": {
267
+ "0": 0.713,
268
+ "1": 0.711,
269
+ "2": 0.755,
270
+ "6": 0.895,
271
+ "9": 0.861
272
+ },
273
+ "sae_top_1_test_accuracy": {
274
+ "0": 0.571,
275
+ "1": 0.631,
276
+ "2": 0.835,
277
+ "6": 0.808,
278
+ "9": 0.559
279
+ },
280
+ "sae_top_2_test_accuracy": {
281
+ "0": 0.856,
282
+ "1": 0.809,
283
+ "2": 0.843,
284
+ "6": 0.976,
285
+ "9": 0.756
286
+ },
287
+ "sae_top_5_test_accuracy": {
288
+ "0": 0.869,
289
+ "1": 0.851,
290
+ "2": 0.864,
291
+ "6": 0.99,
292
+ "9": 0.942
293
+ }
294
+ },
295
+ "LabHC/bias_in_bios_class_set2_results": {
296
+ "sae_test_accuracy": {
297
+ "11": 0.9580000638961792,
298
+ "13": 0.9470000267028809,
299
+ "14": 0.9460000395774841,
300
+ "18": 0.9220000505447388,
301
+ "19": 0.9550000429153442
302
+ },
303
+ "llm_test_accuracy": {
304
+ "11": 0.9690000414848328,
305
+ "13": 0.9600000381469727,
306
+ "14": 0.9600000381469727,
307
+ "18": 0.9390000700950623,
308
+ "19": 0.9610000252723694
309
+ },
310
+ "llm_top_1_test_accuracy": {
311
+ "11": 0.555,
312
+ "13": 0.668,
313
+ "14": 0.638,
314
+ "18": 0.69,
315
+ "19": 0.796
316
+ },
317
+ "llm_top_2_test_accuracy": {
318
+ "11": 0.756,
319
+ "13": 0.714,
320
+ "14": 0.67,
321
+ "18": 0.717,
322
+ "19": 0.768
323
+ },
324
+ "llm_top_5_test_accuracy": {
325
+ "11": 0.794,
326
+ "13": 0.749,
327
+ "14": 0.723,
328
+ "18": 0.73,
329
+ "19": 0.831
330
+ },
331
+ "sae_top_1_test_accuracy": {
332
+ "11": 0.539,
333
+ "13": 0.658,
334
+ "14": 0.648,
335
+ "18": 0.703,
336
+ "19": 0.797
337
+ },
338
+ "sae_top_2_test_accuracy": {
339
+ "11": 0.858,
340
+ "13": 0.675,
341
+ "14": 0.878,
342
+ "18": 0.729,
343
+ "19": 0.829
344
+ },
345
+ "sae_top_5_test_accuracy": {
346
+ "11": 0.871,
347
+ "13": 0.792,
348
+ "14": 0.876,
349
+ "18": 0.897,
350
+ "19": 0.863
351
+ }
352
+ },
353
+ "LabHC/bias_in_bios_class_set3_results": {
354
+ "sae_test_accuracy": {
355
+ "20": 0.9520000219345093,
356
+ "21": 0.9220000505447388,
357
+ "22": 0.9240000247955322,
358
+ "25": 0.956000030040741,
359
+ "26": 0.8920000195503235
360
+ },
361
+ "llm_test_accuracy": {
362
+ "20": 0.956000030040741,
363
+ "21": 0.9350000619888306,
364
+ "22": 0.9180000424385071,
365
+ "25": 0.9640000462532043,
366
+ "26": 0.8850000500679016
367
+ },
368
+ "llm_top_1_test_accuracy": {
369
+ "20": 0.693,
370
+ "21": 0.775,
371
+ "22": 0.645,
372
+ "25": 0.706,
373
+ "26": 0.616
374
+ },
375
+ "llm_top_2_test_accuracy": {
376
+ "20": 0.827,
377
+ "21": 0.761,
378
+ "22": 0.694,
379
+ "25": 0.778,
380
+ "26": 0.686
381
+ },
382
+ "llm_top_5_test_accuracy": {
383
+ "20": 0.855,
384
+ "21": 0.791,
385
+ "22": 0.725,
386
+ "25": 0.809,
387
+ "26": 0.672
388
+ },
389
+ "sae_top_1_test_accuracy": {
390
+ "20": 0.84,
391
+ "21": 0.476,
392
+ "22": 0.88,
393
+ "25": 0.693,
394
+ "26": 0.631
395
+ },
396
+ "sae_top_2_test_accuracy": {
397
+ "20": 0.849,
398
+ "21": 0.747,
399
+ "22": 0.881,
400
+ "25": 0.849,
401
+ "26": 0.706
402
+ },
403
+ "sae_top_5_test_accuracy": {
404
+ "20": 0.911,
405
+ "21": 0.844,
406
+ "22": 0.874,
407
+ "25": 0.893,
408
+ "26": 0.783
409
+ }
410
+ },
411
+ "canrager/amazon_reviews_mcauley_1and5_results": {
412
+ "sae_test_accuracy": {
413
+ "1": 0.9510000348091125,
414
+ "2": 0.9460000395774841,
415
+ "3": 0.9110000729560852,
416
+ "5": 0.9200000166893005,
417
+ "6": 0.8570000529289246
418
+ },
419
+ "llm_test_accuracy": {
420
+ "1": 0.9580000638961792,
421
+ "2": 0.9330000281333923,
422
+ "3": 0.9280000329017639,
423
+ "5": 0.9200000166893005,
424
+ "6": 0.862000048160553
425
+ },
426
+ "llm_top_1_test_accuracy": {
427
+ "1": 0.647,
428
+ "2": 0.603,
429
+ "3": 0.598,
430
+ "5": 0.555,
431
+ "6": 0.592
432
+ },
433
+ "llm_top_2_test_accuracy": {
434
+ "1": 0.75,
435
+ "2": 0.648,
436
+ "3": 0.607,
437
+ "5": 0.606,
438
+ "6": 0.626
439
+ },
440
+ "llm_top_5_test_accuracy": {
441
+ "1": 0.767,
442
+ "2": 0.641,
443
+ "3": 0.645,
444
+ "5": 0.638,
445
+ "6": 0.676
446
+ },
447
+ "sae_top_1_test_accuracy": {
448
+ "1": 0.856,
449
+ "2": 0.862,
450
+ "3": 0.592,
451
+ "5": 0.819,
452
+ "6": 0.619
453
+ },
454
+ "sae_top_2_test_accuracy": {
455
+ "1": 0.905,
456
+ "2": 0.867,
457
+ "3": 0.612,
458
+ "5": 0.818,
459
+ "6": 0.668
460
+ },
461
+ "sae_top_5_test_accuracy": {
462
+ "1": 0.908,
463
+ "2": 0.862,
464
+ "3": 0.63,
465
+ "5": 0.873,
466
+ "6": 0.734
467
+ }
468
+ },
469
+ "canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
470
+ "sae_test_accuracy": {
471
+ "1.0": 0.9760000705718994,
472
+ "5.0": 0.9740000367164612
473
+ },
474
+ "llm_test_accuracy": {
475
+ "1.0": 0.9780000448226929,
476
+ "5.0": 0.9810000658035278
477
+ },
478
+ "llm_top_1_test_accuracy": {
479
+ "1.0": 0.673,
480
+ "5.0": 0.673
481
+ },
482
+ "llm_top_2_test_accuracy": {
483
+ "1.0": 0.724,
484
+ "5.0": 0.724
485
+ },
486
+ "llm_top_5_test_accuracy": {
487
+ "1.0": 0.766,
488
+ "5.0": 0.766
489
+ },
490
+ "sae_top_1_test_accuracy": {
491
+ "1.0": 0.847,
492
+ "5.0": 0.847
493
+ },
494
+ "sae_top_2_test_accuracy": {
495
+ "1.0": 0.847,
496
+ "5.0": 0.847
497
+ },
498
+ "sae_top_5_test_accuracy": {
499
+ "1.0": 0.929,
500
+ "5.0": 0.929
501
+ }
502
+ },
503
+ "codeparrot/github-code_results": {
504
+ "sae_test_accuracy": {
505
+ "C": 0.9540000557899475,
506
+ "Python": 0.984000027179718,
507
+ "HTML": 0.9810000658035278,
508
+ "Java": 0.9640000462532043,
509
+ "PHP": 0.9550000429153442
510
+ },
511
+ "llm_test_accuracy": {
512
+ "C": 0.956000030040741,
513
+ "Python": 0.984000027179718,
514
+ "HTML": 0.9900000691413879,
515
+ "Java": 0.9670000672340393,
516
+ "PHP": 0.9570000171661377
517
+ },
518
+ "llm_top_1_test_accuracy": {
519
+ "C": 0.666,
520
+ "Python": 0.626,
521
+ "HTML": 0.721,
522
+ "Java": 0.619,
523
+ "PHP": 0.594
524
+ },
525
+ "llm_top_2_test_accuracy": {
526
+ "C": 0.679,
527
+ "Python": 0.674,
528
+ "HTML": 0.8,
529
+ "Java": 0.676,
530
+ "PHP": 0.651
531
+ },
532
+ "llm_top_5_test_accuracy": {
533
+ "C": 0.783,
534
+ "Python": 0.717,
535
+ "HTML": 0.935,
536
+ "Java": 0.733,
537
+ "PHP": 0.715
538
+ },
539
+ "sae_top_1_test_accuracy": {
540
+ "C": 0.622,
541
+ "Python": 0.661,
542
+ "HTML": 0.692,
543
+ "Java": 0.629,
544
+ "PHP": 0.596
545
+ },
546
+ "sae_top_2_test_accuracy": {
547
+ "C": 0.599,
548
+ "Python": 0.65,
549
+ "HTML": 0.795,
550
+ "Java": 0.628,
551
+ "PHP": 0.582
552
+ },
553
+ "sae_top_5_test_accuracy": {
554
+ "C": 0.671,
555
+ "Python": 0.684,
556
+ "HTML": 0.865,
557
+ "Java": 0.705,
558
+ "PHP": 0.911
559
+ }
560
+ },
561
+ "fancyzhx/ag_news_results": {
562
+ "sae_test_accuracy": {
563
+ "0": 0.9360000491142273,
564
+ "1": 0.9850000739097595,
565
+ "2": 0.9330000281333923,
566
+ "3": 0.9450000524520874
567
+ },
568
+ "llm_test_accuracy": {
569
+ "0": 0.940000057220459,
570
+ "1": 0.9860000610351562,
571
+ "2": 0.9200000166893005,
572
+ "3": 0.9540000557899475
573
+ },
574
+ "llm_top_1_test_accuracy": {
575
+ "0": 0.573,
576
+ "1": 0.671,
577
+ "2": 0.672,
578
+ "3": 0.635
579
+ },
580
+ "llm_top_2_test_accuracy": {
581
+ "0": 0.802,
582
+ "1": 0.808,
583
+ "2": 0.701,
584
+ "3": 0.816
585
+ },
586
+ "llm_top_5_test_accuracy": {
587
+ "0": 0.81,
588
+ "1": 0.891,
589
+ "2": 0.752,
590
+ "3": 0.832
591
+ },
592
+ "sae_top_1_test_accuracy": {
593
+ "0": 0.578,
594
+ "1": 0.664,
595
+ "2": 0.552,
596
+ "3": 0.637
597
+ },
598
+ "sae_top_2_test_accuracy": {
599
+ "0": 0.795,
600
+ "1": 0.697,
601
+ "2": 0.673,
602
+ "3": 0.65
603
+ },
604
+ "sae_top_5_test_accuracy": {
605
+ "0": 0.822,
606
+ "1": 0.869,
607
+ "2": 0.691,
608
+ "3": 0.811
609
+ }
610
+ },
611
+ "Helsinki-NLP/europarl_results": {
612
+ "sae_test_accuracy": {
613
+ "en": 0.999000072479248,
614
+ "fr": 0.9980000257492065,
615
+ "de": 0.9970000386238098,
616
+ "es": 1.0,
617
+ "nl": 0.9980000257492065
618
+ },
619
+ "llm_test_accuracy": {
620
+ "en": 1.0,
621
+ "fr": 0.999000072479248,
622
+ "de": 1.0,
623
+ "es": 0.999000072479248,
624
+ "nl": 1.0
625
+ },
626
+ "llm_top_1_test_accuracy": {
627
+ "en": 0.739,
628
+ "fr": 0.585,
629
+ "de": 0.758,
630
+ "es": 0.496,
631
+ "nl": 0.649
632
+ },
633
+ "llm_top_2_test_accuracy": {
634
+ "en": 0.829,
635
+ "fr": 0.582,
636
+ "de": 0.82,
637
+ "es": 0.958,
638
+ "nl": 0.753
639
+ },
640
+ "llm_top_5_test_accuracy": {
641
+ "en": 0.892,
642
+ "fr": 0.888,
643
+ "de": 0.894,
644
+ "es": 0.98,
645
+ "nl": 0.852
646
+ },
647
+ "sae_top_1_test_accuracy": {
648
+ "en": 0.749,
649
+ "fr": 0.991,
650
+ "de": 0.932,
651
+ "es": 0.99,
652
+ "nl": 0.639
653
+ },
654
+ "sae_top_2_test_accuracy": {
655
+ "en": 0.995,
656
+ "fr": 0.994,
657
+ "de": 0.988,
658
+ "es": 0.99,
659
+ "nl": 0.998
660
+ },
661
+ "sae_top_5_test_accuracy": {
662
+ "en": 0.998,
663
+ "fr": 0.997,
664
+ "de": 0.988,
665
+ "es": 0.994,
666
+ "nl": 0.998
667
+ }
668
+ }
669
+ }
670
+ }
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,670 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "sparse_probing",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "LabHC/bias_in_bios_class_set2",
8
+ "LabHC/bias_in_bios_class_set3",
9
+ "canrager/amazon_reviews_mcauley_1and5",
10
+ "canrager/amazon_reviews_mcauley_1and5_sentiment",
11
+ "codeparrot/github-code",
12
+ "fancyzhx/ag_news",
13
+ "Helsinki-NLP/europarl"
14
+ ],
15
+ "probe_train_set_size": 4000,
16
+ "probe_test_set_size": 1000,
17
+ "context_length": 128,
18
+ "sae_batch_size": 125,
19
+ "llm_batch_size": 32,
20
+ "llm_dtype": "bfloat16",
21
+ "model_name": "gemma-2-2b",
22
+ "k_values": [
23
+ 1,
24
+ 2,
25
+ 5
26
+ ],
27
+ "lower_vram_usage": false
28
+ },
29
+ "eval_id": "d3e0c3ec-e2e5-4d60-ae48-22cfd7d5fba2",
30
+ "datetime_epoch_millis": 1740165066678,
31
+ "eval_result_metrics": {
32
+ "llm": {
33
+ "llm_test_accuracy": 0.9595375448465346,
34
+ "llm_top_1_test_accuracy": 0.64956875,
35
+ "llm_top_2_test_accuracy": 0.72589375,
36
+ "llm_top_5_test_accuracy": 0.78265625,
37
+ "llm_top_10_test_accuracy": null,
38
+ "llm_top_20_test_accuracy": null,
39
+ "llm_top_50_test_accuracy": null,
40
+ "llm_top_100_test_accuracy": null
41
+ },
42
+ "sae": {
43
+ "sae_test_accuracy": 0.9570625454187394,
44
+ "sae_top_1_test_accuracy": 0.7210749999999999,
45
+ "sae_top_2_test_accuracy": 0.7653625,
46
+ "sae_top_5_test_accuracy": 0.8490062500000001,
47
+ "sae_top_10_test_accuracy": null,
48
+ "sae_top_20_test_accuracy": null,
49
+ "sae_top_50_test_accuracy": null,
50
+ "sae_top_100_test_accuracy": null
51
+ }
52
+ },
53
+ "eval_result_details": [
54
+ {
55
+ "dataset_name": "LabHC/bias_in_bios_class_set1_results",
56
+ "llm_test_accuracy": 0.966800057888031,
57
+ "llm_top_1_test_accuracy": 0.6397999999999999,
58
+ "llm_top_2_test_accuracy": 0.6954,
59
+ "llm_top_5_test_accuracy": 0.7869999999999999,
60
+ "llm_top_10_test_accuracy": null,
61
+ "llm_top_20_test_accuracy": null,
62
+ "llm_top_50_test_accuracy": null,
63
+ "llm_top_100_test_accuracy": null,
64
+ "sae_test_accuracy": 0.9642000436782837,
65
+ "sae_top_1_test_accuracy": 0.7083999999999999,
66
+ "sae_top_2_test_accuracy": 0.7636000000000001,
67
+ "sae_top_5_test_accuracy": 0.8354000000000001,
68
+ "sae_top_10_test_accuracy": null,
69
+ "sae_top_20_test_accuracy": null,
70
+ "sae_top_50_test_accuracy": null,
71
+ "sae_top_100_test_accuracy": null
72
+ },
73
+ {
74
+ "dataset_name": "LabHC/bias_in_bios_class_set2_results",
75
+ "llm_test_accuracy": 0.9578000426292419,
76
+ "llm_top_1_test_accuracy": 0.6694000000000001,
77
+ "llm_top_2_test_accuracy": 0.725,
78
+ "llm_top_5_test_accuracy": 0.7654,
79
+ "llm_top_10_test_accuracy": null,
80
+ "llm_top_20_test_accuracy": null,
81
+ "llm_top_50_test_accuracy": null,
82
+ "llm_top_100_test_accuracy": null,
83
+ "sae_test_accuracy": 0.9512000441551208,
84
+ "sae_top_1_test_accuracy": 0.7248000000000001,
85
+ "sae_top_2_test_accuracy": 0.7267999999999999,
86
+ "sae_top_5_test_accuracy": 0.828,
87
+ "sae_top_10_test_accuracy": null,
88
+ "sae_top_20_test_accuracy": null,
89
+ "sae_top_50_test_accuracy": null,
90
+ "sae_top_100_test_accuracy": null
91
+ },
92
+ {
93
+ "dataset_name": "LabHC/bias_in_bios_class_set3_results",
94
+ "llm_test_accuracy": 0.9316000461578369,
95
+ "llm_top_1_test_accuracy": 0.687,
96
+ "llm_top_2_test_accuracy": 0.7492,
97
+ "llm_top_5_test_accuracy": 0.7704000000000001,
98
+ "llm_top_10_test_accuracy": null,
99
+ "llm_top_20_test_accuracy": null,
100
+ "llm_top_50_test_accuracy": null,
101
+ "llm_top_100_test_accuracy": null,
102
+ "sae_test_accuracy": 0.9294000387191772,
103
+ "sae_top_1_test_accuracy": 0.7074,
104
+ "sae_top_2_test_accuracy": 0.784,
105
+ "sae_top_5_test_accuracy": 0.825,
106
+ "sae_top_10_test_accuracy": null,
107
+ "sae_top_20_test_accuracy": null,
108
+ "sae_top_50_test_accuracy": null,
109
+ "sae_top_100_test_accuracy": null
110
+ },
111
+ {
112
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
113
+ "llm_test_accuracy": 0.9202000379562378,
114
+ "llm_top_1_test_accuracy": 0.599,
115
+ "llm_top_2_test_accuracy": 0.6474,
116
+ "llm_top_5_test_accuracy": 0.6734,
117
+ "llm_top_10_test_accuracy": null,
118
+ "llm_top_20_test_accuracy": null,
119
+ "llm_top_50_test_accuracy": null,
120
+ "llm_top_100_test_accuracy": null,
121
+ "sae_test_accuracy": 0.9192000508308411,
122
+ "sae_top_1_test_accuracy": 0.6622,
123
+ "sae_top_2_test_accuracy": 0.6678000000000001,
124
+ "sae_top_5_test_accuracy": 0.7878000000000001,
125
+ "sae_top_10_test_accuracy": null,
126
+ "sae_top_20_test_accuracy": null,
127
+ "sae_top_50_test_accuracy": null,
128
+ "sae_top_100_test_accuracy": null
129
+ },
130
+ {
131
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
132
+ "llm_test_accuracy": 0.9795000553131104,
133
+ "llm_top_1_test_accuracy": 0.673,
134
+ "llm_top_2_test_accuracy": 0.724,
135
+ "llm_top_5_test_accuracy": 0.766,
136
+ "llm_top_10_test_accuracy": null,
137
+ "llm_top_20_test_accuracy": null,
138
+ "llm_top_50_test_accuracy": null,
139
+ "llm_top_100_test_accuracy": null,
140
+ "sae_test_accuracy": 0.9725000560283661,
141
+ "sae_top_1_test_accuracy": 0.6,
142
+ "sae_top_2_test_accuracy": 0.764,
143
+ "sae_top_5_test_accuracy": 0.942,
144
+ "sae_top_10_test_accuracy": null,
145
+ "sae_top_20_test_accuracy": null,
146
+ "sae_top_50_test_accuracy": null,
147
+ "sae_top_100_test_accuracy": null
148
+ },
149
+ {
150
+ "dataset_name": "codeparrot/github-code_results",
151
+ "llm_test_accuracy": 0.9708000421524048,
152
+ "llm_top_1_test_accuracy": 0.6451999999999999,
153
+ "llm_top_2_test_accuracy": 0.6960000000000001,
154
+ "llm_top_5_test_accuracy": 0.7766,
155
+ "llm_top_10_test_accuracy": null,
156
+ "llm_top_20_test_accuracy": null,
157
+ "llm_top_50_test_accuracy": null,
158
+ "llm_top_100_test_accuracy": null,
159
+ "sae_test_accuracy": 0.968600058555603,
160
+ "sae_top_1_test_accuracy": 0.6384000000000001,
161
+ "sae_top_2_test_accuracy": 0.643,
162
+ "sae_top_5_test_accuracy": 0.7448,
163
+ "sae_top_10_test_accuracy": null,
164
+ "sae_top_20_test_accuracy": null,
165
+ "sae_top_50_test_accuracy": null,
166
+ "sae_top_100_test_accuracy": null
167
+ },
168
+ {
169
+ "dataset_name": "fancyzhx/ag_news_results",
170
+ "llm_test_accuracy": 0.9500000476837158,
171
+ "llm_top_1_test_accuracy": 0.63775,
172
+ "llm_top_2_test_accuracy": 0.78175,
173
+ "llm_top_5_test_accuracy": 0.82125,
174
+ "llm_top_10_test_accuracy": null,
175
+ "llm_top_20_test_accuracy": null,
176
+ "llm_top_50_test_accuracy": null,
177
+ "llm_top_100_test_accuracy": null,
178
+ "sae_test_accuracy": 0.9520000517368317,
179
+ "sae_top_1_test_accuracy": 0.77,
180
+ "sae_top_2_test_accuracy": 0.7875000000000001,
181
+ "sae_top_5_test_accuracy": 0.8322499999999999,
182
+ "sae_top_10_test_accuracy": null,
183
+ "sae_top_20_test_accuracy": null,
184
+ "sae_top_50_test_accuracy": null,
185
+ "sae_top_100_test_accuracy": null
186
+ },
187
+ {
188
+ "dataset_name": "Helsinki-NLP/europarl_results",
189
+ "llm_test_accuracy": 0.9996000289916992,
190
+ "llm_top_1_test_accuracy": 0.6454,
191
+ "llm_top_2_test_accuracy": 0.7884,
192
+ "llm_top_5_test_accuracy": 0.9012,
193
+ "llm_top_10_test_accuracy": null,
194
+ "llm_top_20_test_accuracy": null,
195
+ "llm_top_50_test_accuracy": null,
196
+ "llm_top_100_test_accuracy": null,
197
+ "sae_test_accuracy": 0.9994000196456909,
198
+ "sae_top_1_test_accuracy": 0.9574,
199
+ "sae_top_2_test_accuracy": 0.9862,
200
+ "sae_top_5_test_accuracy": 0.9968,
201
+ "sae_top_10_test_accuracy": null,
202
+ "sae_top_20_test_accuracy": null,
203
+ "sae_top_50_test_accuracy": null,
204
+ "sae_top_100_test_accuracy": null
205
+ }
206
+ ],
207
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
208
+ "sae_lens_id": "custom_sae",
209
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0",
210
+ "sae_lens_version": "5.4.2",
211
+ "sae_cfg_dict": {
212
+ "model_name": "gemma-2-2b",
213
+ "d_in": 2304,
214
+ "d_sae": 65536,
215
+ "hook_layer": 12,
216
+ "hook_name": "blocks.12.hook_resid_post",
217
+ "context_size": null,
218
+ "hook_head_index": null,
219
+ "architecture": "topk",
220
+ "apply_b_dec_to_input": null,
221
+ "finetuning_scaling_factor": null,
222
+ "activation_fn_str": "",
223
+ "prepend_bos": true,
224
+ "normalize_activations": "none",
225
+ "dtype": "bfloat16",
226
+ "device": "",
227
+ "dataset_path": "",
228
+ "dataset_trust_remote_code": true,
229
+ "seqpos_slice": [
230
+ null
231
+ ],
232
+ "training_tokens": -100000,
233
+ "sae_lens_training_version": null,
234
+ "neuronpedia_id": null
235
+ },
236
+ "eval_result_unstructured": {
237
+ "LabHC/bias_in_bios_class_set1_results": {
238
+ "sae_test_accuracy": {
239
+ "0": 0.9480000734329224,
240
+ "1": 0.9660000205039978,
241
+ "2": 0.9480000734329224,
242
+ "6": 0.984000027179718,
243
+ "9": 0.9750000238418579
244
+ },
245
+ "llm_test_accuracy": {
246
+ "0": 0.9510000348091125,
247
+ "1": 0.9670000672340393,
248
+ "2": 0.9530000686645508,
249
+ "6": 0.987000048160553,
250
+ "9": 0.9760000705718994
251
+ },
252
+ "llm_top_1_test_accuracy": {
253
+ "0": 0.577,
254
+ "1": 0.613,
255
+ "2": 0.662,
256
+ "6": 0.787,
257
+ "9": 0.56
258
+ },
259
+ "llm_top_2_test_accuracy": {
260
+ "0": 0.574,
261
+ "1": 0.66,
262
+ "2": 0.718,
263
+ "6": 0.811,
264
+ "9": 0.714
265
+ },
266
+ "llm_top_5_test_accuracy": {
267
+ "0": 0.713,
268
+ "1": 0.711,
269
+ "2": 0.755,
270
+ "6": 0.895,
271
+ "9": 0.861
272
+ },
273
+ "sae_top_1_test_accuracy": {
274
+ "0": 0.588,
275
+ "1": 0.612,
276
+ "2": 0.887,
277
+ "6": 0.751,
278
+ "9": 0.704
279
+ },
280
+ "sae_top_2_test_accuracy": {
281
+ "0": 0.627,
282
+ "1": 0.621,
283
+ "2": 0.881,
284
+ "6": 0.766,
285
+ "9": 0.923
286
+ },
287
+ "sae_top_5_test_accuracy": {
288
+ "0": 0.703,
289
+ "1": 0.7,
290
+ "2": 0.878,
291
+ "6": 0.971,
292
+ "9": 0.925
293
+ }
294
+ },
295
+ "LabHC/bias_in_bios_class_set2_results": {
296
+ "sae_test_accuracy": {
297
+ "11": 0.9580000638961792,
298
+ "13": 0.9510000348091125,
299
+ "14": 0.9530000686645508,
300
+ "18": 0.9290000200271606,
301
+ "19": 0.9650000333786011
302
+ },
303
+ "llm_test_accuracy": {
304
+ "11": 0.9690000414848328,
305
+ "13": 0.9600000381469727,
306
+ "14": 0.9600000381469727,
307
+ "18": 0.9390000700950623,
308
+ "19": 0.9610000252723694
309
+ },
310
+ "llm_top_1_test_accuracy": {
311
+ "11": 0.555,
312
+ "13": 0.668,
313
+ "14": 0.638,
314
+ "18": 0.69,
315
+ "19": 0.796
316
+ },
317
+ "llm_top_2_test_accuracy": {
318
+ "11": 0.756,
319
+ "13": 0.714,
320
+ "14": 0.67,
321
+ "18": 0.717,
322
+ "19": 0.768
323
+ },
324
+ "llm_top_5_test_accuracy": {
325
+ "11": 0.794,
326
+ "13": 0.749,
327
+ "14": 0.723,
328
+ "18": 0.73,
329
+ "19": 0.831
330
+ },
331
+ "sae_top_1_test_accuracy": {
332
+ "11": 0.849,
333
+ "13": 0.677,
334
+ "14": 0.638,
335
+ "18": 0.627,
336
+ "19": 0.833
337
+ },
338
+ "sae_top_2_test_accuracy": {
339
+ "11": 0.849,
340
+ "13": 0.684,
341
+ "14": 0.614,
342
+ "18": 0.666,
343
+ "19": 0.821
344
+ },
345
+ "sae_top_5_test_accuracy": {
346
+ "11": 0.921,
347
+ "13": 0.762,
348
+ "14": 0.874,
349
+ "18": 0.761,
350
+ "19": 0.822
351
+ }
352
+ },
353
+ "LabHC/bias_in_bios_class_set3_results": {
354
+ "sae_test_accuracy": {
355
+ "20": 0.9520000219345093,
356
+ "21": 0.9220000505447388,
357
+ "22": 0.9120000600814819,
358
+ "25": 0.9610000252723694,
359
+ "26": 0.9000000357627869
360
+ },
361
+ "llm_test_accuracy": {
362
+ "20": 0.956000030040741,
363
+ "21": 0.9350000619888306,
364
+ "22": 0.9180000424385071,
365
+ "25": 0.9640000462532043,
366
+ "26": 0.8850000500679016
367
+ },
368
+ "llm_top_1_test_accuracy": {
369
+ "20": 0.693,
370
+ "21": 0.775,
371
+ "22": 0.645,
372
+ "25": 0.706,
373
+ "26": 0.616
374
+ },
375
+ "llm_top_2_test_accuracy": {
376
+ "20": 0.827,
377
+ "21": 0.761,
378
+ "22": 0.694,
379
+ "25": 0.778,
380
+ "26": 0.686
381
+ },
382
+ "llm_top_5_test_accuracy": {
383
+ "20": 0.855,
384
+ "21": 0.791,
385
+ "22": 0.725,
386
+ "25": 0.809,
387
+ "26": 0.672
388
+ },
389
+ "sae_top_1_test_accuracy": {
390
+ "20": 0.814,
391
+ "21": 0.618,
392
+ "22": 0.82,
393
+ "25": 0.664,
394
+ "26": 0.621
395
+ },
396
+ "sae_top_2_test_accuracy": {
397
+ "20": 0.841,
398
+ "21": 0.779,
399
+ "22": 0.817,
400
+ "25": 0.86,
401
+ "26": 0.623
402
+ },
403
+ "sae_top_5_test_accuracy": {
404
+ "20": 0.932,
405
+ "21": 0.8,
406
+ "22": 0.859,
407
+ "25": 0.86,
408
+ "26": 0.674
409
+ }
410
+ },
411
+ "canrager/amazon_reviews_mcauley_1and5_results": {
412
+ "sae_test_accuracy": {
413
+ "1": 0.9480000734329224,
414
+ "2": 0.9380000233650208,
415
+ "3": 0.9190000295639038,
416
+ "5": 0.921000063419342,
417
+ "6": 0.8700000643730164
418
+ },
419
+ "llm_test_accuracy": {
420
+ "1": 0.9580000638961792,
421
+ "2": 0.9330000281333923,
422
+ "3": 0.9280000329017639,
423
+ "5": 0.9200000166893005,
424
+ "6": 0.862000048160553
425
+ },
426
+ "llm_top_1_test_accuracy": {
427
+ "1": 0.647,
428
+ "2": 0.603,
429
+ "3": 0.598,
430
+ "5": 0.555,
431
+ "6": 0.592
432
+ },
433
+ "llm_top_2_test_accuracy": {
434
+ "1": 0.75,
435
+ "2": 0.648,
436
+ "3": 0.607,
437
+ "5": 0.606,
438
+ "6": 0.626
439
+ },
440
+ "llm_top_5_test_accuracy": {
441
+ "1": 0.767,
442
+ "2": 0.641,
443
+ "3": 0.645,
444
+ "5": 0.638,
445
+ "6": 0.676
446
+ },
447
+ "sae_top_1_test_accuracy": {
448
+ "1": 0.82,
449
+ "2": 0.647,
450
+ "3": 0.557,
451
+ "5": 0.547,
452
+ "6": 0.74
453
+ },
454
+ "sae_top_2_test_accuracy": {
455
+ "1": 0.817,
456
+ "2": 0.639,
457
+ "3": 0.584,
458
+ "5": 0.555,
459
+ "6": 0.744
460
+ },
461
+ "sae_top_5_test_accuracy": {
462
+ "1": 0.87,
463
+ "2": 0.872,
464
+ "3": 0.641,
465
+ "5": 0.8,
466
+ "6": 0.756
467
+ }
468
+ },
469
+ "canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
470
+ "sae_test_accuracy": {
471
+ "1.0": 0.9730000495910645,
472
+ "5.0": 0.9720000624656677
473
+ },
474
+ "llm_test_accuracy": {
475
+ "1.0": 0.9780000448226929,
476
+ "5.0": 0.9810000658035278
477
+ },
478
+ "llm_top_1_test_accuracy": {
479
+ "1.0": 0.673,
480
+ "5.0": 0.673
481
+ },
482
+ "llm_top_2_test_accuracy": {
483
+ "1.0": 0.724,
484
+ "5.0": 0.724
485
+ },
486
+ "llm_top_5_test_accuracy": {
487
+ "1.0": 0.766,
488
+ "5.0": 0.766
489
+ },
490
+ "sae_top_1_test_accuracy": {
491
+ "1.0": 0.6,
492
+ "5.0": 0.6
493
+ },
494
+ "sae_top_2_test_accuracy": {
495
+ "1.0": 0.764,
496
+ "5.0": 0.764
497
+ },
498
+ "sae_top_5_test_accuracy": {
499
+ "1.0": 0.942,
500
+ "5.0": 0.942
501
+ }
502
+ },
503
+ "codeparrot/github-code_results": {
504
+ "sae_test_accuracy": {
505
+ "C": 0.9590000510215759,
506
+ "Python": 0.9860000610351562,
507
+ "HTML": 0.9820000529289246,
508
+ "Java": 0.9630000591278076,
509
+ "PHP": 0.9530000686645508
510
+ },
511
+ "llm_test_accuracy": {
512
+ "C": 0.956000030040741,
513
+ "Python": 0.984000027179718,
514
+ "HTML": 0.9900000691413879,
515
+ "Java": 0.9670000672340393,
516
+ "PHP": 0.9570000171661377
517
+ },
518
+ "llm_top_1_test_accuracy": {
519
+ "C": 0.666,
520
+ "Python": 0.626,
521
+ "HTML": 0.721,
522
+ "Java": 0.619,
523
+ "PHP": 0.594
524
+ },
525
+ "llm_top_2_test_accuracy": {
526
+ "C": 0.679,
527
+ "Python": 0.674,
528
+ "HTML": 0.8,
529
+ "Java": 0.676,
530
+ "PHP": 0.651
531
+ },
532
+ "llm_top_5_test_accuracy": {
533
+ "C": 0.783,
534
+ "Python": 0.717,
535
+ "HTML": 0.935,
536
+ "Java": 0.733,
537
+ "PHP": 0.715
538
+ },
539
+ "sae_top_1_test_accuracy": {
540
+ "C": 0.607,
541
+ "Python": 0.648,
542
+ "HTML": 0.689,
543
+ "Java": 0.633,
544
+ "PHP": 0.615
545
+ },
546
+ "sae_top_2_test_accuracy": {
547
+ "C": 0.601,
548
+ "Python": 0.685,
549
+ "HTML": 0.674,
550
+ "Java": 0.638,
551
+ "PHP": 0.617
552
+ },
553
+ "sae_top_5_test_accuracy": {
554
+ "C": 0.607,
555
+ "Python": 0.696,
556
+ "HTML": 0.879,
557
+ "Java": 0.656,
558
+ "PHP": 0.886
559
+ }
560
+ },
561
+ "fancyzhx/ag_news_results": {
562
+ "sae_test_accuracy": {
563
+ "0": 0.940000057220459,
564
+ "1": 0.9850000739097595,
565
+ "2": 0.9310000538825989,
566
+ "3": 0.9520000219345093
567
+ },
568
+ "llm_test_accuracy": {
569
+ "0": 0.940000057220459,
570
+ "1": 0.9860000610351562,
571
+ "2": 0.9200000166893005,
572
+ "3": 0.9540000557899475
573
+ },
574
+ "llm_top_1_test_accuracy": {
575
+ "0": 0.573,
576
+ "1": 0.671,
577
+ "2": 0.672,
578
+ "3": 0.635
579
+ },
580
+ "llm_top_2_test_accuracy": {
581
+ "0": 0.802,
582
+ "1": 0.808,
583
+ "2": 0.701,
584
+ "3": 0.816
585
+ },
586
+ "llm_top_5_test_accuracy": {
587
+ "0": 0.81,
588
+ "1": 0.891,
589
+ "2": 0.752,
590
+ "3": 0.832
591
+ },
592
+ "sae_top_1_test_accuracy": {
593
+ "0": 0.661,
594
+ "1": 0.948,
595
+ "2": 0.815,
596
+ "3": 0.656
597
+ },
598
+ "sae_top_2_test_accuracy": {
599
+ "0": 0.658,
600
+ "1": 0.958,
601
+ "2": 0.83,
602
+ "3": 0.704
603
+ },
604
+ "sae_top_5_test_accuracy": {
605
+ "0": 0.764,
606
+ "1": 0.96,
607
+ "2": 0.83,
608
+ "3": 0.775
609
+ }
610
+ },
611
+ "Helsinki-NLP/europarl_results": {
612
+ "sae_test_accuracy": {
613
+ "en": 0.9980000257492065,
614
+ "fr": 1.0,
615
+ "de": 1.0,
616
+ "es": 1.0,
617
+ "nl": 0.999000072479248
618
+ },
619
+ "llm_test_accuracy": {
620
+ "en": 1.0,
621
+ "fr": 0.999000072479248,
622
+ "de": 1.0,
623
+ "es": 0.999000072479248,
624
+ "nl": 1.0
625
+ },
626
+ "llm_top_1_test_accuracy": {
627
+ "en": 0.739,
628
+ "fr": 0.585,
629
+ "de": 0.758,
630
+ "es": 0.496,
631
+ "nl": 0.649
632
+ },
633
+ "llm_top_2_test_accuracy": {
634
+ "en": 0.829,
635
+ "fr": 0.582,
636
+ "de": 0.82,
637
+ "es": 0.958,
638
+ "nl": 0.753
639
+ },
640
+ "llm_top_5_test_accuracy": {
641
+ "en": 0.892,
642
+ "fr": 0.888,
643
+ "de": 0.894,
644
+ "es": 0.98,
645
+ "nl": 0.852
646
+ },
647
+ "sae_top_1_test_accuracy": {
648
+ "en": 0.998,
649
+ "fr": 0.84,
650
+ "de": 0.96,
651
+ "es": 0.992,
652
+ "nl": 0.997
653
+ },
654
+ "sae_top_2_test_accuracy": {
655
+ "en": 0.997,
656
+ "fr": 0.989,
657
+ "de": 0.956,
658
+ "es": 0.99,
659
+ "nl": 0.999
660
+ },
661
+ "sae_top_5_test_accuracy": {
662
+ "en": 0.999,
663
+ "fr": 0.993,
664
+ "de": 0.998,
665
+ "es": 0.995,
666
+ "nl": 0.999
667
+ }
668
+ }
669
+ }
670
+ }
eval_results_from_scratch/sparse_probing/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,670 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "sparse_probing",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "LabHC/bias_in_bios_class_set2",
8
+ "LabHC/bias_in_bios_class_set3",
9
+ "canrager/amazon_reviews_mcauley_1and5",
10
+ "canrager/amazon_reviews_mcauley_1and5_sentiment",
11
+ "codeparrot/github-code",
12
+ "fancyzhx/ag_news",
13
+ "Helsinki-NLP/europarl"
14
+ ],
15
+ "probe_train_set_size": 4000,
16
+ "probe_test_set_size": 1000,
17
+ "context_length": 128,
18
+ "sae_batch_size": 125,
19
+ "llm_batch_size": 32,
20
+ "llm_dtype": "bfloat16",
21
+ "model_name": "gemma-2-2b",
22
+ "k_values": [
23
+ 1,
24
+ 2,
25
+ 5
26
+ ],
27
+ "lower_vram_usage": false
28
+ },
29
+ "eval_id": "5982f843-09ae-423f-897e-88fdc5e9b765",
30
+ "datetime_epoch_millis": 1740165484719,
31
+ "eval_result_metrics": {
32
+ "llm": {
33
+ "llm_test_accuracy": 0.9595375448465346,
34
+ "llm_top_1_test_accuracy": 0.64956875,
35
+ "llm_top_2_test_accuracy": 0.72589375,
36
+ "llm_top_5_test_accuracy": 0.78265625,
37
+ "llm_top_10_test_accuracy": null,
38
+ "llm_top_20_test_accuracy": null,
39
+ "llm_top_50_test_accuracy": null,
40
+ "llm_top_100_test_accuracy": null
41
+ },
42
+ "sae": {
43
+ "sae_test_accuracy": 0.9565250385552645,
44
+ "sae_top_1_test_accuracy": 0.70693125,
45
+ "sae_top_2_test_accuracy": 0.8177062500000001,
46
+ "sae_top_5_test_accuracy": 0.8643937500000001,
47
+ "sae_top_10_test_accuracy": null,
48
+ "sae_top_20_test_accuracy": null,
49
+ "sae_top_50_test_accuracy": null,
50
+ "sae_top_100_test_accuracy": null
51
+ }
52
+ },
53
+ "eval_result_details": [
54
+ {
55
+ "dataset_name": "LabHC/bias_in_bios_class_set1_results",
56
+ "llm_test_accuracy": 0.966800057888031,
57
+ "llm_top_1_test_accuracy": 0.6397999999999999,
58
+ "llm_top_2_test_accuracy": 0.6954,
59
+ "llm_top_5_test_accuracy": 0.7869999999999999,
60
+ "llm_top_10_test_accuracy": null,
61
+ "llm_top_20_test_accuracy": null,
62
+ "llm_top_50_test_accuracy": null,
63
+ "llm_top_100_test_accuracy": null,
64
+ "sae_test_accuracy": 0.9606000423431397,
65
+ "sae_top_1_test_accuracy": 0.677,
66
+ "sae_top_2_test_accuracy": 0.8896000000000001,
67
+ "sae_top_5_test_accuracy": 0.9054,
68
+ "sae_top_10_test_accuracy": null,
69
+ "sae_top_20_test_accuracy": null,
70
+ "sae_top_50_test_accuracy": null,
71
+ "sae_top_100_test_accuracy": null
72
+ },
73
+ {
74
+ "dataset_name": "LabHC/bias_in_bios_class_set2_results",
75
+ "llm_test_accuracy": 0.9578000426292419,
76
+ "llm_top_1_test_accuracy": 0.6694000000000001,
77
+ "llm_top_2_test_accuracy": 0.725,
78
+ "llm_top_5_test_accuracy": 0.7654,
79
+ "llm_top_10_test_accuracy": null,
80
+ "llm_top_20_test_accuracy": null,
81
+ "llm_top_50_test_accuracy": null,
82
+ "llm_top_100_test_accuracy": null,
83
+ "sae_test_accuracy": 0.9488000273704529,
84
+ "sae_top_1_test_accuracy": 0.679,
85
+ "sae_top_2_test_accuracy": 0.758,
86
+ "sae_top_5_test_accuracy": 0.8614,
87
+ "sae_top_10_test_accuracy": null,
88
+ "sae_top_20_test_accuracy": null,
89
+ "sae_top_50_test_accuracy": null,
90
+ "sae_top_100_test_accuracy": null
91
+ },
92
+ {
93
+ "dataset_name": "LabHC/bias_in_bios_class_set3_results",
94
+ "llm_test_accuracy": 0.9316000461578369,
95
+ "llm_top_1_test_accuracy": 0.687,
96
+ "llm_top_2_test_accuracy": 0.7492,
97
+ "llm_top_5_test_accuracy": 0.7704000000000001,
98
+ "llm_top_10_test_accuracy": null,
99
+ "llm_top_20_test_accuracy": null,
100
+ "llm_top_50_test_accuracy": null,
101
+ "llm_top_100_test_accuracy": null,
102
+ "sae_test_accuracy": 0.929200041294098,
103
+ "sae_top_1_test_accuracy": 0.7186,
104
+ "sae_top_2_test_accuracy": 0.8170000000000002,
105
+ "sae_top_5_test_accuracy": 0.8568,
106
+ "sae_top_10_test_accuracy": null,
107
+ "sae_top_20_test_accuracy": null,
108
+ "sae_top_50_test_accuracy": null,
109
+ "sae_top_100_test_accuracy": null
110
+ },
111
+ {
112
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
113
+ "llm_test_accuracy": 0.9202000379562378,
114
+ "llm_top_1_test_accuracy": 0.599,
115
+ "llm_top_2_test_accuracy": 0.6474,
116
+ "llm_top_5_test_accuracy": 0.6734,
117
+ "llm_top_10_test_accuracy": null,
118
+ "llm_top_20_test_accuracy": null,
119
+ "llm_top_50_test_accuracy": null,
120
+ "llm_top_100_test_accuracy": null,
121
+ "sae_test_accuracy": 0.9192000389099121,
122
+ "sae_top_1_test_accuracy": 0.6487999999999999,
123
+ "sae_top_2_test_accuracy": 0.7392,
124
+ "sae_top_5_test_accuracy": 0.7898,
125
+ "sae_top_10_test_accuracy": null,
126
+ "sae_top_20_test_accuracy": null,
127
+ "sae_top_50_test_accuracy": null,
128
+ "sae_top_100_test_accuracy": null
129
+ },
130
+ {
131
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
132
+ "llm_test_accuracy": 0.9795000553131104,
133
+ "llm_top_1_test_accuracy": 0.673,
134
+ "llm_top_2_test_accuracy": 0.724,
135
+ "llm_top_5_test_accuracy": 0.766,
136
+ "llm_top_10_test_accuracy": null,
137
+ "llm_top_20_test_accuracy": null,
138
+ "llm_top_50_test_accuracy": null,
139
+ "llm_top_100_test_accuracy": null,
140
+ "sae_test_accuracy": 0.9725000560283661,
141
+ "sae_top_1_test_accuracy": 0.885,
142
+ "sae_top_2_test_accuracy": 0.889,
143
+ "sae_top_5_test_accuracy": 0.931,
144
+ "sae_top_10_test_accuracy": null,
145
+ "sae_top_20_test_accuracy": null,
146
+ "sae_top_50_test_accuracy": null,
147
+ "sae_top_100_test_accuracy": null
148
+ },
149
+ {
150
+ "dataset_name": "codeparrot/github-code_results",
151
+ "llm_test_accuracy": 0.9708000421524048,
152
+ "llm_top_1_test_accuracy": 0.6451999999999999,
153
+ "llm_top_2_test_accuracy": 0.6960000000000001,
154
+ "llm_top_5_test_accuracy": 0.7766,
155
+ "llm_top_10_test_accuracy": null,
156
+ "llm_top_20_test_accuracy": null,
157
+ "llm_top_50_test_accuracy": null,
158
+ "llm_top_100_test_accuracy": null,
159
+ "sae_test_accuracy": 0.9698000431060791,
160
+ "sae_top_1_test_accuracy": 0.5916,
161
+ "sae_top_2_test_accuracy": 0.756,
162
+ "sae_top_5_test_accuracy": 0.8038000000000001,
163
+ "sae_top_10_test_accuracy": null,
164
+ "sae_top_20_test_accuracy": null,
165
+ "sae_top_50_test_accuracy": null,
166
+ "sae_top_100_test_accuracy": null
167
+ },
168
+ {
169
+ "dataset_name": "fancyzhx/ag_news_results",
170
+ "llm_test_accuracy": 0.9500000476837158,
171
+ "llm_top_1_test_accuracy": 0.63775,
172
+ "llm_top_2_test_accuracy": 0.78175,
173
+ "llm_top_5_test_accuracy": 0.82125,
174
+ "llm_top_10_test_accuracy": null,
175
+ "llm_top_20_test_accuracy": null,
176
+ "llm_top_50_test_accuracy": null,
177
+ "llm_top_100_test_accuracy": null,
178
+ "sae_test_accuracy": 0.9525000303983688,
179
+ "sae_top_1_test_accuracy": 0.61025,
180
+ "sae_top_2_test_accuracy": 0.7112499999999999,
181
+ "sae_top_5_test_accuracy": 0.7747499999999999,
182
+ "sae_top_10_test_accuracy": null,
183
+ "sae_top_20_test_accuracy": null,
184
+ "sae_top_50_test_accuracy": null,
185
+ "sae_top_100_test_accuracy": null
186
+ },
187
+ {
188
+ "dataset_name": "Helsinki-NLP/europarl_results",
189
+ "llm_test_accuracy": 0.9996000289916992,
190
+ "llm_top_1_test_accuracy": 0.6454,
191
+ "llm_top_2_test_accuracy": 0.7884,
192
+ "llm_top_5_test_accuracy": 0.9012,
193
+ "llm_top_10_test_accuracy": null,
194
+ "llm_top_20_test_accuracy": null,
195
+ "llm_top_50_test_accuracy": null,
196
+ "llm_top_100_test_accuracy": null,
197
+ "sae_test_accuracy": 0.9996000289916992,
198
+ "sae_top_1_test_accuracy": 0.8452,
199
+ "sae_top_2_test_accuracy": 0.9816,
200
+ "sae_top_5_test_accuracy": 0.9922000000000001,
201
+ "sae_top_10_test_accuracy": null,
202
+ "sae_top_20_test_accuracy": null,
203
+ "sae_top_50_test_accuracy": null,
204
+ "sae_top_100_test_accuracy": null
205
+ }
206
+ ],
207
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
208
+ "sae_lens_id": "custom_sae",
209
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0",
210
+ "sae_lens_version": "5.4.2",
211
+ "sae_cfg_dict": {
212
+ "model_name": "gemma-2-2b",
213
+ "d_in": 2304,
214
+ "d_sae": 65536,
215
+ "hook_layer": 12,
216
+ "hook_name": "blocks.12.hook_resid_post",
217
+ "context_size": null,
218
+ "hook_head_index": null,
219
+ "architecture": "topk",
220
+ "apply_b_dec_to_input": null,
221
+ "finetuning_scaling_factor": null,
222
+ "activation_fn_str": "",
223
+ "prepend_bos": true,
224
+ "normalize_activations": "none",
225
+ "dtype": "bfloat16",
226
+ "device": "",
227
+ "dataset_path": "",
228
+ "dataset_trust_remote_code": true,
229
+ "seqpos_slice": [
230
+ null
231
+ ],
232
+ "training_tokens": -100000,
233
+ "sae_lens_training_version": null,
234
+ "neuronpedia_id": null
235
+ },
236
+ "eval_result_unstructured": {
237
+ "LabHC/bias_in_bios_class_set1_results": {
238
+ "sae_test_accuracy": {
239
+ "0": 0.9420000314712524,
240
+ "1": 0.9530000686645508,
241
+ "2": 0.9470000267028809,
242
+ "6": 0.984000027179718,
243
+ "9": 0.9770000576972961
244
+ },
245
+ "llm_test_accuracy": {
246
+ "0": 0.9510000348091125,
247
+ "1": 0.9670000672340393,
248
+ "2": 0.9530000686645508,
249
+ "6": 0.987000048160553,
250
+ "9": 0.9760000705718994
251
+ },
252
+ "llm_top_1_test_accuracy": {
253
+ "0": 0.577,
254
+ "1": 0.613,
255
+ "2": 0.662,
256
+ "6": 0.787,
257
+ "9": 0.56
258
+ },
259
+ "llm_top_2_test_accuracy": {
260
+ "0": 0.574,
261
+ "1": 0.66,
262
+ "2": 0.718,
263
+ "6": 0.811,
264
+ "9": 0.714
265
+ },
266
+ "llm_top_5_test_accuracy": {
267
+ "0": 0.713,
268
+ "1": 0.711,
269
+ "2": 0.755,
270
+ "6": 0.895,
271
+ "9": 0.861
272
+ },
273
+ "sae_top_1_test_accuracy": {
274
+ "0": 0.566,
275
+ "1": 0.643,
276
+ "2": 0.826,
277
+ "6": 0.797,
278
+ "9": 0.553
279
+ },
280
+ "sae_top_2_test_accuracy": {
281
+ "0": 0.868,
282
+ "1": 0.806,
283
+ "2": 0.853,
284
+ "6": 0.981,
285
+ "9": 0.94
286
+ },
287
+ "sae_top_5_test_accuracy": {
288
+ "0": 0.881,
289
+ "1": 0.853,
290
+ "2": 0.858,
291
+ "6": 0.989,
292
+ "9": 0.946
293
+ }
294
+ },
295
+ "LabHC/bias_in_bios_class_set2_results": {
296
+ "sae_test_accuracy": {
297
+ "11": 0.9660000205039978,
298
+ "13": 0.9520000219345093,
299
+ "14": 0.9430000185966492,
300
+ "18": 0.9230000376701355,
301
+ "19": 0.9600000381469727
302
+ },
303
+ "llm_test_accuracy": {
304
+ "11": 0.9690000414848328,
305
+ "13": 0.9600000381469727,
306
+ "14": 0.9600000381469727,
307
+ "18": 0.9390000700950623,
308
+ "19": 0.9610000252723694
309
+ },
310
+ "llm_top_1_test_accuracy": {
311
+ "11": 0.555,
312
+ "13": 0.668,
313
+ "14": 0.638,
314
+ "18": 0.69,
315
+ "19": 0.796
316
+ },
317
+ "llm_top_2_test_accuracy": {
318
+ "11": 0.756,
319
+ "13": 0.714,
320
+ "14": 0.67,
321
+ "18": 0.717,
322
+ "19": 0.768
323
+ },
324
+ "llm_top_5_test_accuracy": {
325
+ "11": 0.794,
326
+ "13": 0.749,
327
+ "14": 0.723,
328
+ "18": 0.73,
329
+ "19": 0.831
330
+ },
331
+ "sae_top_1_test_accuracy": {
332
+ "11": 0.555,
333
+ "13": 0.666,
334
+ "14": 0.661,
335
+ "18": 0.712,
336
+ "19": 0.801
337
+ },
338
+ "sae_top_2_test_accuracy": {
339
+ "11": 0.736,
340
+ "13": 0.697,
341
+ "14": 0.795,
342
+ "18": 0.737,
343
+ "19": 0.825
344
+ },
345
+ "sae_top_5_test_accuracy": {
346
+ "11": 0.909,
347
+ "13": 0.745,
348
+ "14": 0.904,
349
+ "18": 0.897,
350
+ "19": 0.852
351
+ }
352
+ },
353
+ "LabHC/bias_in_bios_class_set3_results": {
354
+ "sae_test_accuracy": {
355
+ "20": 0.9510000348091125,
356
+ "21": 0.9240000247955322,
357
+ "22": 0.9180000424385071,
358
+ "25": 0.9510000348091125,
359
+ "26": 0.9020000696182251
360
+ },
361
+ "llm_test_accuracy": {
362
+ "20": 0.956000030040741,
363
+ "21": 0.9350000619888306,
364
+ "22": 0.9180000424385071,
365
+ "25": 0.9640000462532043,
366
+ "26": 0.8850000500679016
367
+ },
368
+ "llm_top_1_test_accuracy": {
369
+ "20": 0.693,
370
+ "21": 0.775,
371
+ "22": 0.645,
372
+ "25": 0.706,
373
+ "26": 0.616
374
+ },
375
+ "llm_top_2_test_accuracy": {
376
+ "20": 0.827,
377
+ "21": 0.761,
378
+ "22": 0.694,
379
+ "25": 0.778,
380
+ "26": 0.686
381
+ },
382
+ "llm_top_5_test_accuracy": {
383
+ "20": 0.855,
384
+ "21": 0.791,
385
+ "22": 0.725,
386
+ "25": 0.809,
387
+ "26": 0.672
388
+ },
389
+ "sae_top_1_test_accuracy": {
390
+ "20": 0.856,
391
+ "21": 0.501,
392
+ "22": 0.893,
393
+ "25": 0.695,
394
+ "26": 0.648
395
+ },
396
+ "sae_top_2_test_accuracy": {
397
+ "20": 0.863,
398
+ "21": 0.737,
399
+ "22": 0.893,
400
+ "25": 0.874,
401
+ "26": 0.718
402
+ },
403
+ "sae_top_5_test_accuracy": {
404
+ "20": 0.897,
405
+ "21": 0.841,
406
+ "22": 0.886,
407
+ "25": 0.899,
408
+ "26": 0.761
409
+ }
410
+ },
411
+ "canrager/amazon_reviews_mcauley_1and5_results": {
412
+ "sae_test_accuracy": {
413
+ "1": 0.9510000348091125,
414
+ "2": 0.9300000667572021,
415
+ "3": 0.9200000166893005,
416
+ "5": 0.9270000457763672,
417
+ "6": 0.8680000305175781
418
+ },
419
+ "llm_test_accuracy": {
420
+ "1": 0.9580000638961792,
421
+ "2": 0.9330000281333923,
422
+ "3": 0.9280000329017639,
423
+ "5": 0.9200000166893005,
424
+ "6": 0.862000048160553
425
+ },
426
+ "llm_top_1_test_accuracy": {
427
+ "1": 0.647,
428
+ "2": 0.603,
429
+ "3": 0.598,
430
+ "5": 0.555,
431
+ "6": 0.592
432
+ },
433
+ "llm_top_2_test_accuracy": {
434
+ "1": 0.75,
435
+ "2": 0.648,
436
+ "3": 0.607,
437
+ "5": 0.606,
438
+ "6": 0.626
439
+ },
440
+ "llm_top_5_test_accuracy": {
441
+ "1": 0.767,
442
+ "2": 0.641,
443
+ "3": 0.645,
444
+ "5": 0.638,
445
+ "6": 0.676
446
+ },
447
+ "sae_top_1_test_accuracy": {
448
+ "1": 0.842,
449
+ "2": 0.729,
450
+ "3": 0.54,
451
+ "5": 0.536,
452
+ "6": 0.597
453
+ },
454
+ "sae_top_2_test_accuracy": {
455
+ "1": 0.906,
456
+ "2": 0.84,
457
+ "3": 0.569,
458
+ "5": 0.791,
459
+ "6": 0.59
460
+ },
461
+ "sae_top_5_test_accuracy": {
462
+ "1": 0.918,
463
+ "2": 0.844,
464
+ "3": 0.6,
465
+ "5": 0.877,
466
+ "6": 0.71
467
+ }
468
+ },
469
+ "canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
470
+ "sae_test_accuracy": {
471
+ "1.0": 0.9720000624656677,
472
+ "5.0": 0.9730000495910645
473
+ },
474
+ "llm_test_accuracy": {
475
+ "1.0": 0.9780000448226929,
476
+ "5.0": 0.9810000658035278
477
+ },
478
+ "llm_top_1_test_accuracy": {
479
+ "1.0": 0.673,
480
+ "5.0": 0.673
481
+ },
482
+ "llm_top_2_test_accuracy": {
483
+ "1.0": 0.724,
484
+ "5.0": 0.724
485
+ },
486
+ "llm_top_5_test_accuracy": {
487
+ "1.0": 0.766,
488
+ "5.0": 0.766
489
+ },
490
+ "sae_top_1_test_accuracy": {
491
+ "1.0": 0.885,
492
+ "5.0": 0.885
493
+ },
494
+ "sae_top_2_test_accuracy": {
495
+ "1.0": 0.889,
496
+ "5.0": 0.889
497
+ },
498
+ "sae_top_5_test_accuracy": {
499
+ "1.0": 0.931,
500
+ "5.0": 0.931
501
+ }
502
+ },
503
+ "codeparrot/github-code_results": {
504
+ "sae_test_accuracy": {
505
+ "C": 0.9530000686645508,
506
+ "Python": 0.9930000305175781,
507
+ "HTML": 0.984000027179718,
508
+ "Java": 0.9630000591278076,
509
+ "PHP": 0.956000030040741
510
+ },
511
+ "llm_test_accuracy": {
512
+ "C": 0.956000030040741,
513
+ "Python": 0.984000027179718,
514
+ "HTML": 0.9900000691413879,
515
+ "Java": 0.9670000672340393,
516
+ "PHP": 0.9570000171661377
517
+ },
518
+ "llm_top_1_test_accuracy": {
519
+ "C": 0.666,
520
+ "Python": 0.626,
521
+ "HTML": 0.721,
522
+ "Java": 0.619,
523
+ "PHP": 0.594
524
+ },
525
+ "llm_top_2_test_accuracy": {
526
+ "C": 0.679,
527
+ "Python": 0.674,
528
+ "HTML": 0.8,
529
+ "Java": 0.676,
530
+ "PHP": 0.651
531
+ },
532
+ "llm_top_5_test_accuracy": {
533
+ "C": 0.783,
534
+ "Python": 0.717,
535
+ "HTML": 0.935,
536
+ "Java": 0.733,
537
+ "PHP": 0.715
538
+ },
539
+ "sae_top_1_test_accuracy": {
540
+ "C": 0.536,
541
+ "Python": 0.645,
542
+ "HTML": 0.579,
543
+ "Java": 0.613,
544
+ "PHP": 0.585
545
+ },
546
+ "sae_top_2_test_accuracy": {
547
+ "C": 0.634,
548
+ "Python": 0.94,
549
+ "HTML": 0.691,
550
+ "Java": 0.617,
551
+ "PHP": 0.898
552
+ },
553
+ "sae_top_5_test_accuracy": {
554
+ "C": 0.687,
555
+ "Python": 0.943,
556
+ "HTML": 0.804,
557
+ "Java": 0.675,
558
+ "PHP": 0.91
559
+ }
560
+ },
561
+ "fancyzhx/ag_news_results": {
562
+ "sae_test_accuracy": {
563
+ "0": 0.9380000233650208,
564
+ "1": 0.984000027179718,
565
+ "2": 0.9330000281333923,
566
+ "3": 0.9550000429153442
567
+ },
568
+ "llm_test_accuracy": {
569
+ "0": 0.940000057220459,
570
+ "1": 0.9860000610351562,
571
+ "2": 0.9200000166893005,
572
+ "3": 0.9540000557899475
573
+ },
574
+ "llm_top_1_test_accuracy": {
575
+ "0": 0.573,
576
+ "1": 0.671,
577
+ "2": 0.672,
578
+ "3": 0.635
579
+ },
580
+ "llm_top_2_test_accuracy": {
581
+ "0": 0.802,
582
+ "1": 0.808,
583
+ "2": 0.701,
584
+ "3": 0.816
585
+ },
586
+ "llm_top_5_test_accuracy": {
587
+ "0": 0.81,
588
+ "1": 0.891,
589
+ "2": 0.752,
590
+ "3": 0.832
591
+ },
592
+ "sae_top_1_test_accuracy": {
593
+ "0": 0.575,
594
+ "1": 0.667,
595
+ "2": 0.566,
596
+ "3": 0.633
597
+ },
598
+ "sae_top_2_test_accuracy": {
599
+ "0": 0.737,
600
+ "1": 0.706,
601
+ "2": 0.708,
602
+ "3": 0.694
603
+ },
604
+ "sae_top_5_test_accuracy": {
605
+ "0": 0.799,
606
+ "1": 0.749,
607
+ "2": 0.821,
608
+ "3": 0.73
609
+ }
610
+ },
611
+ "Helsinki-NLP/europarl_results": {
612
+ "sae_test_accuracy": {
613
+ "en": 1.0,
614
+ "fr": 0.999000072479248,
615
+ "de": 1.0,
616
+ "es": 1.0,
617
+ "nl": 0.999000072479248
618
+ },
619
+ "llm_test_accuracy": {
620
+ "en": 1.0,
621
+ "fr": 0.999000072479248,
622
+ "de": 1.0,
623
+ "es": 0.999000072479248,
624
+ "nl": 1.0
625
+ },
626
+ "llm_top_1_test_accuracy": {
627
+ "en": 0.739,
628
+ "fr": 0.585,
629
+ "de": 0.758,
630
+ "es": 0.496,
631
+ "nl": 0.649
632
+ },
633
+ "llm_top_2_test_accuracy": {
634
+ "en": 0.829,
635
+ "fr": 0.582,
636
+ "de": 0.82,
637
+ "es": 0.958,
638
+ "nl": 0.753
639
+ },
640
+ "llm_top_5_test_accuracy": {
641
+ "en": 0.892,
642
+ "fr": 0.888,
643
+ "de": 0.894,
644
+ "es": 0.98,
645
+ "nl": 0.852
646
+ },
647
+ "sae_top_1_test_accuracy": {
648
+ "en": 0.76,
649
+ "fr": 0.994,
650
+ "de": 0.923,
651
+ "es": 0.884,
652
+ "nl": 0.665
653
+ },
654
+ "sae_top_2_test_accuracy": {
655
+ "en": 0.997,
656
+ "fr": 0.994,
657
+ "de": 0.924,
658
+ "es": 0.996,
659
+ "nl": 0.997
660
+ },
661
+ "sae_top_5_test_accuracy": {
662
+ "en": 0.998,
663
+ "fr": 0.997,
664
+ "de": 0.975,
665
+ "es": 0.995,
666
+ "nl": 0.996
667
+ }
668
+ }
669
+ }
670
+ }
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "tpp",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": false,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "8a95660d-45b7-41a4-a525-961bf9e6596a",
73
+ "datetime_epoch_millis": 1740163272675,
74
+ "eval_result_metrics": {
75
+ "tpp_metrics": {
76
+ "tpp_threshold_2_total_metric": 0.002374991774559021,
77
+ "tpp_threshold_2_intended_diff_only": 0.004799991846084595,
78
+ "tpp_threshold_2_unintended_diff_only": 0.0024250000715255737,
79
+ "tpp_threshold_5_total_metric": 0.0023500144481658934,
80
+ "tpp_threshold_5_intended_diff_only": 0.0051000118255615234,
81
+ "tpp_threshold_5_unintended_diff_only": 0.00274999737739563,
82
+ "tpp_threshold_10_total_metric": 0.007099992036819458,
83
+ "tpp_threshold_10_intended_diff_only": 0.010999995470046996,
84
+ "tpp_threshold_10_unintended_diff_only": 0.003900003433227539,
85
+ "tpp_threshold_20_total_metric": 0.018574997782707214,
86
+ "tpp_threshold_20_intended_diff_only": 0.025499999523162842,
87
+ "tpp_threshold_20_unintended_diff_only": 0.0069250017404556274,
88
+ "tpp_threshold_50_total_metric": 0.04237500578165054,
89
+ "tpp_threshold_50_intended_diff_only": 0.05090000629425048,
90
+ "tpp_threshold_50_unintended_diff_only": 0.008525000512599945,
91
+ "tpp_threshold_100_total_metric": 0.08157499581575393,
92
+ "tpp_threshold_100_intended_diff_only": 0.0940999984741211,
93
+ "tpp_threshold_100_unintended_diff_only": 0.012525002658367156,
94
+ "tpp_threshold_500_total_metric": 0.2862000107765198,
95
+ "tpp_threshold_500_intended_diff_only": 0.30840001106262205,
96
+ "tpp_threshold_500_unintended_diff_only": 0.022200000286102296
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
102
+ "tpp_threshold_2_total_metric": 0.0029499828815460205,
103
+ "tpp_threshold_2_intended_diff_only": 0.0045999884605407715,
104
+ "tpp_threshold_2_unintended_diff_only": 0.001650005578994751,
105
+ "tpp_threshold_5_total_metric": 0.0015500158071517945,
106
+ "tpp_threshold_5_intended_diff_only": 0.004000020027160644,
107
+ "tpp_threshold_5_unintended_diff_only": 0.00245000422000885,
108
+ "tpp_threshold_10_total_metric": 0.0034999847412109375,
109
+ "tpp_threshold_10_intended_diff_only": 0.0054000020027160645,
110
+ "tpp_threshold_10_unintended_diff_only": 0.001900017261505127,
111
+ "tpp_threshold_20_total_metric": 0.008800002932548522,
112
+ "tpp_threshold_20_intended_diff_only": 0.01100001335144043,
113
+ "tpp_threshold_20_unintended_diff_only": 0.002200010418891907,
114
+ "tpp_threshold_50_total_metric": 0.020700007677078247,
115
+ "tpp_threshold_50_intended_diff_only": 0.022600018978118898,
116
+ "tpp_threshold_50_unintended_diff_only": 0.0019000113010406495,
117
+ "tpp_threshold_100_total_metric": 0.05239999294281006,
118
+ "tpp_threshold_100_intended_diff_only": 0.05540000200271607,
119
+ "tpp_threshold_100_unintended_diff_only": 0.003000009059906006,
120
+ "tpp_threshold_500_total_metric": 0.2711500138044357,
121
+ "tpp_threshold_500_intended_diff_only": 0.27920001745224,
122
+ "tpp_threshold_500_unintended_diff_only": 0.00805000364780426
123
+ },
124
+ {
125
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
126
+ "tpp_threshold_2_total_metric": 0.0018000006675720215,
127
+ "tpp_threshold_2_intended_diff_only": 0.004999995231628418,
128
+ "tpp_threshold_2_unintended_diff_only": 0.0031999945640563965,
129
+ "tpp_threshold_5_total_metric": 0.0031500130891799925,
130
+ "tpp_threshold_5_intended_diff_only": 0.0062000036239624025,
131
+ "tpp_threshold_5_unintended_diff_only": 0.0030499905347824096,
132
+ "tpp_threshold_10_total_metric": 0.01069999933242798,
133
+ "tpp_threshold_10_intended_diff_only": 0.01659998893737793,
134
+ "tpp_threshold_10_unintended_diff_only": 0.005899989604949951,
135
+ "tpp_threshold_20_total_metric": 0.028349992632865906,
136
+ "tpp_threshold_20_intended_diff_only": 0.039999985694885255,
137
+ "tpp_threshold_20_unintended_diff_only": 0.011649993062019349,
138
+ "tpp_threshold_50_total_metric": 0.06405000388622284,
139
+ "tpp_threshold_50_intended_diff_only": 0.07919999361038207,
140
+ "tpp_threshold_50_unintended_diff_only": 0.01514998972415924,
141
+ "tpp_threshold_100_total_metric": 0.11074999868869781,
142
+ "tpp_threshold_100_intended_diff_only": 0.13279999494552613,
143
+ "tpp_threshold_100_unintended_diff_only": 0.022049996256828307,
144
+ "tpp_threshold_500_total_metric": 0.30125000774860383,
145
+ "tpp_threshold_500_intended_diff_only": 0.33760000467300416,
146
+ "tpp_threshold_500_unintended_diff_only": 0.03634999692440033
147
+ }
148
+ ],
149
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
150
+ "sae_lens_id": "custom_sae",
151
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0",
152
+ "sae_lens_version": "5.4.2",
153
+ "sae_cfg_dict": {
154
+ "model_name": "gemma-2-2b",
155
+ "d_in": 2304,
156
+ "d_sae": 65536,
157
+ "hook_layer": 12,
158
+ "hook_name": "blocks.12.hook_resid_post",
159
+ "context_size": null,
160
+ "hook_head_index": null,
161
+ "architecture": "topk",
162
+ "apply_b_dec_to_input": null,
163
+ "finetuning_scaling_factor": null,
164
+ "activation_fn_str": "",
165
+ "prepend_bos": true,
166
+ "normalize_activations": "none",
167
+ "dtype": "bfloat16",
168
+ "device": "",
169
+ "dataset_path": "",
170
+ "dataset_trust_remote_code": true,
171
+ "seqpos_slice": [
172
+ null
173
+ ],
174
+ "training_tokens": -100000,
175
+ "sae_lens_training_version": null,
176
+ "neuronpedia_id": null
177
+ },
178
+ "eval_result_unstructured": {
179
+ "LabHC/bias_in_bios_class_set1": {
180
+ "0": {
181
+ "tpp_threshold_2_total_metric": 0.006750002503395081,
182
+ "tpp_threshold_2_intended_diff_only": 0.009000003337860107,
183
+ "tpp_threshold_2_unintended_diff_only": 0.002250000834465027,
184
+ "tpp_threshold_5_total_metric": 0.011500045657157898,
185
+ "tpp_threshold_5_intended_diff_only": 0.01500004529953003,
186
+ "tpp_threshold_5_unintended_diff_only": 0.0034999996423721313,
187
+ "tpp_threshold_10_total_metric": 0.004999995231628418,
188
+ "tpp_threshold_10_intended_diff_only": 0.008000016212463379,
189
+ "tpp_threshold_10_unintended_diff_only": 0.003000020980834961,
190
+ "tpp_threshold_20_total_metric": 0.01800002157688141,
191
+ "tpp_threshold_20_intended_diff_only": 0.020000040531158447,
192
+ "tpp_threshold_20_unintended_diff_only": 0.0020000189542770386,
193
+ "tpp_threshold_50_total_metric": 0.0350000262260437,
194
+ "tpp_threshold_50_intended_diff_only": 0.03800004720687866,
195
+ "tpp_threshold_50_unintended_diff_only": 0.003000020980834961,
196
+ "tpp_threshold_100_total_metric": 0.11850003898143768,
197
+ "tpp_threshold_100_intended_diff_only": 0.12200003862380981,
198
+ "tpp_threshold_100_unintended_diff_only": 0.0034999996423721313,
199
+ "tpp_threshold_500_total_metric": 0.39775002002716064,
200
+ "tpp_threshold_500_intended_diff_only": 0.406000018119812,
201
+ "tpp_threshold_500_unintended_diff_only": 0.008249998092651367
202
+ },
203
+ "1": {
204
+ "tpp_threshold_2_total_metric": 0.0045000165700912476,
205
+ "tpp_threshold_2_intended_diff_only": 0.0040000081062316895,
206
+ "tpp_threshold_2_unintended_diff_only": -0.0005000084638595581,
207
+ "tpp_threshold_5_total_metric": -0.0002499520778656006,
208
+ "tpp_threshold_5_intended_diff_only": 0.001000046730041504,
209
+ "tpp_threshold_5_unintended_diff_only": 0.0012499988079071045,
210
+ "tpp_threshold_10_total_metric": 0.003750026226043701,
211
+ "tpp_threshold_10_intended_diff_only": 0.0020000338554382324,
212
+ "tpp_threshold_10_unintended_diff_only": -0.0017499923706054688,
213
+ "tpp_threshold_20_total_metric": -0.0012499690055847168,
214
+ "tpp_threshold_20_intended_diff_only": 0.0020000338554382324,
215
+ "tpp_threshold_20_unintended_diff_only": 0.0032500028610229492,
216
+ "tpp_threshold_50_total_metric": 0.0025000572204589844,
217
+ "tpp_threshold_50_intended_diff_only": 0.005000054836273193,
218
+ "tpp_threshold_50_unintended_diff_only": 0.002499997615814209,
219
+ "tpp_threshold_100_total_metric": 0.009000018239021301,
220
+ "tpp_threshold_100_intended_diff_only": 0.012000024318695068,
221
+ "tpp_threshold_100_unintended_diff_only": 0.003000006079673767,
222
+ "tpp_threshold_500_total_metric": 0.1807500571012497,
223
+ "tpp_threshold_500_intended_diff_only": 0.18500006198883057,
224
+ "tpp_threshold_500_unintended_diff_only": 0.004250004887580872
225
+ },
226
+ "2": {
227
+ "tpp_threshold_2_total_metric": -0.00025004148483276367,
228
+ "tpp_threshold_2_intended_diff_only": 0.001999974250793457,
229
+ "tpp_threshold_2_unintended_diff_only": 0.0022500157356262207,
230
+ "tpp_threshold_5_total_metric": -0.004250004887580872,
231
+ "tpp_threshold_5_intended_diff_only": 0.0,
232
+ "tpp_threshold_5_unintended_diff_only": 0.004250004887580872,
233
+ "tpp_threshold_10_total_metric": 0.009249955415725708,
234
+ "tpp_threshold_10_intended_diff_only": 0.010999977588653564,
235
+ "tpp_threshold_10_unintended_diff_only": 0.0017500221729278564,
236
+ "tpp_threshold_20_total_metric": 0.01299998164176941,
237
+ "tpp_threshold_20_intended_diff_only": 0.013999998569488525,
238
+ "tpp_threshold_20_unintended_diff_only": 0.0010000169277191162,
239
+ "tpp_threshold_50_total_metric": 0.03224998712539673,
240
+ "tpp_threshold_50_intended_diff_only": 0.03200000524520874,
241
+ "tpp_threshold_50_unintended_diff_only": -0.0002499818801879883,
242
+ "tpp_threshold_100_total_metric": 0.05349995195865631,
243
+ "tpp_threshold_100_intended_diff_only": 0.05299997329711914,
244
+ "tpp_threshold_100_unintended_diff_only": -0.0004999786615371704,
245
+ "tpp_threshold_500_total_metric": 0.3799999952316284,
246
+ "tpp_threshold_500_intended_diff_only": 0.3889999985694885,
247
+ "tpp_threshold_500_unintended_diff_only": 0.009000003337860107
248
+ },
249
+ "6": {
250
+ "tpp_threshold_2_total_metric": -0.000500023365020752,
251
+ "tpp_threshold_2_intended_diff_only": 0.0009999871253967285,
252
+ "tpp_threshold_2_unintended_diff_only": 0.0015000104904174805,
253
+ "tpp_threshold_5_total_metric": 0.000250011682510376,
254
+ "tpp_threshold_5_intended_diff_only": -0.0009999871253967285,
255
+ "tpp_threshold_5_unintended_diff_only": -0.0012499988079071045,
256
+ "tpp_threshold_10_total_metric": 0.0012499988079071045,
257
+ "tpp_threshold_10_intended_diff_only": 0.003000020980834961,
258
+ "tpp_threshold_10_unintended_diff_only": 0.0017500221729278564,
259
+ "tpp_threshold_20_total_metric": 0.001999974250793457,
260
+ "tpp_threshold_20_intended_diff_only": 0.0009999871253967285,
261
+ "tpp_threshold_20_unintended_diff_only": -0.0009999871253967285,
262
+ "tpp_threshold_50_total_metric": 0.002750009298324585,
263
+ "tpp_threshold_50_intended_diff_only": 0.003000020980834961,
264
+ "tpp_threshold_50_unintended_diff_only": 0.000250011682510376,
265
+ "tpp_threshold_100_total_metric": 0.002749994397163391,
266
+ "tpp_threshold_100_intended_diff_only": 0.004999995231628418,
267
+ "tpp_threshold_100_unintended_diff_only": 0.002250000834465027,
268
+ "tpp_threshold_500_total_metric": 0.016249999403953552,
269
+ "tpp_threshold_500_intended_diff_only": 0.023000001907348633,
270
+ "tpp_threshold_500_unintended_diff_only": 0.006750002503395081
271
+ },
272
+ "9": {
273
+ "tpp_threshold_2_total_metric": 0.00424996018409729,
274
+ "tpp_threshold_2_intended_diff_only": 0.006999969482421875,
275
+ "tpp_threshold_2_unintended_diff_only": 0.002750009298324585,
276
+ "tpp_threshold_5_total_metric": 0.0004999786615371704,
277
+ "tpp_threshold_5_intended_diff_only": 0.004999995231628418,
278
+ "tpp_threshold_5_unintended_diff_only": 0.0045000165700912476,
279
+ "tpp_threshold_10_total_metric": -0.0017500519752502441,
280
+ "tpp_threshold_10_intended_diff_only": 0.0029999613761901855,
281
+ "tpp_threshold_10_unintended_diff_only": 0.00475001335144043,
282
+ "tpp_threshold_20_total_metric": 0.012250006198883057,
283
+ "tpp_threshold_20_intended_diff_only": 0.018000006675720215,
284
+ "tpp_threshold_20_unintended_diff_only": 0.005750000476837158,
285
+ "tpp_threshold_50_total_metric": 0.030999958515167236,
286
+ "tpp_threshold_50_intended_diff_only": 0.034999966621398926,
287
+ "tpp_threshold_50_unintended_diff_only": 0.0040000081062316895,
288
+ "tpp_threshold_100_total_metric": 0.0782499611377716,
289
+ "tpp_threshold_100_intended_diff_only": 0.08499997854232788,
290
+ "tpp_threshold_100_unintended_diff_only": 0.006750017404556274,
291
+ "tpp_threshold_500_total_metric": 0.38099999725818634,
292
+ "tpp_threshold_500_intended_diff_only": 0.3930000066757202,
293
+ "tpp_threshold_500_unintended_diff_only": 0.012000009417533875
294
+ }
295
+ },
296
+ "canrager/amazon_reviews_mcauley_1and5": {
297
+ "1": {
298
+ "tpp_threshold_2_total_metric": 0.009749948978424072,
299
+ "tpp_threshold_2_intended_diff_only": 0.012999951839447021,
300
+ "tpp_threshold_2_unintended_diff_only": 0.0032500028610229492,
301
+ "tpp_threshold_5_total_metric": 0.004499971866607666,
302
+ "tpp_threshold_5_intended_diff_only": 0.006999969482421875,
303
+ "tpp_threshold_5_unintended_diff_only": 0.002499997615814209,
304
+ "tpp_threshold_10_total_metric": -0.0015000104904174805,
305
+ "tpp_threshold_10_intended_diff_only": 0.006999969482421875,
306
+ "tpp_threshold_10_unintended_diff_only": 0.008499979972839355,
307
+ "tpp_threshold_20_total_metric": 0.0014999806880950928,
308
+ "tpp_threshold_20_intended_diff_only": 0.006999969482421875,
309
+ "tpp_threshold_20_unintended_diff_only": 0.005499988794326782,
310
+ "tpp_threshold_50_total_metric": 0.014499977231025696,
311
+ "tpp_threshold_50_intended_diff_only": 0.01699995994567871,
312
+ "tpp_threshold_50_unintended_diff_only": 0.002499982714653015,
313
+ "tpp_threshold_100_total_metric": 0.024999961256980896,
314
+ "tpp_threshold_100_intended_diff_only": 0.034999966621398926,
315
+ "tpp_threshold_100_unintended_diff_only": 0.01000000536441803,
316
+ "tpp_threshold_500_total_metric": 0.1757499873638153,
317
+ "tpp_threshold_500_intended_diff_only": 0.18599998950958252,
318
+ "tpp_threshold_500_unintended_diff_only": 0.010250002145767212
319
+ },
320
+ "2": {
321
+ "tpp_threshold_2_total_metric": 0.001249954104423523,
322
+ "tpp_threshold_2_intended_diff_only": 0.0029999613761901855,
323
+ "tpp_threshold_2_unintended_diff_only": 0.0017500072717666626,
324
+ "tpp_threshold_5_total_metric": -0.004749983549118042,
325
+ "tpp_threshold_5_intended_diff_only": 0.0040000081062316895,
326
+ "tpp_threshold_5_unintended_diff_only": 0.008749991655349731,
327
+ "tpp_threshold_10_total_metric": 0.011499986052513123,
328
+ "tpp_threshold_10_intended_diff_only": 0.015999972820281982,
329
+ "tpp_threshold_10_unintended_diff_only": 0.00449998676776886,
330
+ "tpp_threshold_20_total_metric": 0.020249977707862854,
331
+ "tpp_threshold_20_intended_diff_only": 0.02499997615814209,
332
+ "tpp_threshold_20_unintended_diff_only": 0.004749998450279236,
333
+ "tpp_threshold_50_total_metric": 0.07249999046325684,
334
+ "tpp_threshold_50_intended_diff_only": 0.08499997854232788,
335
+ "tpp_threshold_50_unintended_diff_only": 0.012499988079071045,
336
+ "tpp_threshold_100_total_metric": 0.12825001776218414,
337
+ "tpp_threshold_100_intended_diff_only": 0.15200001001358032,
338
+ "tpp_threshold_100_unintended_diff_only": 0.02374999225139618,
339
+ "tpp_threshold_500_total_metric": 0.3604999780654907,
340
+ "tpp_threshold_500_intended_diff_only": 0.3999999761581421,
341
+ "tpp_threshold_500_unintended_diff_only": 0.03949999809265137
342
+ },
343
+ "3": {
344
+ "tpp_threshold_2_total_metric": -0.008499979972839355,
345
+ "tpp_threshold_2_intended_diff_only": -0.0059999823570251465,
346
+ "tpp_threshold_2_unintended_diff_only": 0.002499997615814209,
347
+ "tpp_threshold_5_total_metric": 0.0005000084638595581,
348
+ "tpp_threshold_5_intended_diff_only": 0.0,
349
+ "tpp_threshold_5_unintended_diff_only": -0.0005000084638595581,
350
+ "tpp_threshold_10_total_metric": 0.01424996554851532,
351
+ "tpp_threshold_10_intended_diff_only": 0.01699995994567871,
352
+ "tpp_threshold_10_unintended_diff_only": 0.002749994397163391,
353
+ "tpp_threshold_20_total_metric": 0.008999988436698914,
354
+ "tpp_threshold_20_intended_diff_only": 0.014999985694885254,
355
+ "tpp_threshold_20_unintended_diff_only": 0.00599999725818634,
356
+ "tpp_threshold_50_total_metric": 0.03624999523162842,
357
+ "tpp_threshold_50_intended_diff_only": 0.041999995708465576,
358
+ "tpp_threshold_50_unintended_diff_only": 0.005750000476837158,
359
+ "tpp_threshold_100_total_metric": 0.07375001907348633,
360
+ "tpp_threshold_100_intended_diff_only": 0.08700001239776611,
361
+ "tpp_threshold_100_unintended_diff_only": 0.013249993324279785,
362
+ "tpp_threshold_500_total_metric": 0.29850004613399506,
363
+ "tpp_threshold_500_intended_diff_only": 0.3400000333786011,
364
+ "tpp_threshold_500_unintended_diff_only": 0.04149998724460602
365
+ },
366
+ "5": {
367
+ "tpp_threshold_2_total_metric": -0.006499916315078735,
368
+ "tpp_threshold_2_intended_diff_only": -0.003999948501586914,
369
+ "tpp_threshold_2_unintended_diff_only": 0.0024999678134918213,
370
+ "tpp_threshold_5_total_metric": -0.007999926805496216,
371
+ "tpp_threshold_5_intended_diff_only": -0.003999948501586914,
372
+ "tpp_threshold_5_unintended_diff_only": 0.003999978303909302,
373
+ "tpp_threshold_10_total_metric": -0.014499947428703308,
374
+ "tpp_threshold_10_intended_diff_only": -0.007999956607818604,
375
+ "tpp_threshold_10_unintended_diff_only": 0.006499990820884705,
376
+ "tpp_threshold_20_total_metric": 0.03825005888938904,
377
+ "tpp_threshold_20_intended_diff_only": 0.06700003147125244,
378
+ "tpp_threshold_20_unintended_diff_only": 0.028749972581863403,
379
+ "tpp_threshold_50_total_metric": 0.07950006425380707,
380
+ "tpp_threshold_50_intended_diff_only": 0.12200003862380981,
381
+ "tpp_threshold_50_unintended_diff_only": 0.04249997437000275,
382
+ "tpp_threshold_100_total_metric": 0.13325001299381256,
383
+ "tpp_threshold_100_intended_diff_only": 0.18000000715255737,
384
+ "tpp_threshold_100_unintended_diff_only": 0.04674999415874481,
385
+ "tpp_threshold_500_total_metric": 0.3255000412464142,
386
+ "tpp_threshold_500_intended_diff_only": 0.39100003242492676,
387
+ "tpp_threshold_500_unintended_diff_only": 0.06549999117851257
388
+ },
389
+ "6": {
390
+ "tpp_threshold_2_total_metric": 0.012999996542930603,
391
+ "tpp_threshold_2_intended_diff_only": 0.018999993801116943,
392
+ "tpp_threshold_2_unintended_diff_only": 0.00599999725818634,
393
+ "tpp_threshold_5_total_metric": 0.023499995470046997,
394
+ "tpp_threshold_5_intended_diff_only": 0.02399998903274536,
395
+ "tpp_threshold_5_unintended_diff_only": 0.0004999935626983643,
396
+ "tpp_threshold_10_total_metric": 0.04375000298023224,
397
+ "tpp_threshold_10_intended_diff_only": 0.050999999046325684,
398
+ "tpp_threshold_10_unintended_diff_only": 0.007249996066093445,
399
+ "tpp_threshold_20_total_metric": 0.07274995744228363,
400
+ "tpp_threshold_20_intended_diff_only": 0.08599996566772461,
401
+ "tpp_threshold_20_unintended_diff_only": 0.013250008225440979,
402
+ "tpp_threshold_50_total_metric": 0.11749999225139618,
403
+ "tpp_threshold_50_intended_diff_only": 0.12999999523162842,
404
+ "tpp_threshold_50_unintended_diff_only": 0.012500002980232239,
405
+ "tpp_threshold_100_total_metric": 0.19349998235702515,
406
+ "tpp_threshold_100_intended_diff_only": 0.20999997854232788,
407
+ "tpp_threshold_100_unintended_diff_only": 0.016499996185302734,
408
+ "tpp_threshold_500_total_metric": 0.34599998593330383,
409
+ "tpp_threshold_500_intended_diff_only": 0.3709999918937683,
410
+ "tpp_threshold_500_unintended_diff_only": 0.025000005960464478
411
+ }
412
+ }
413
+ }
414
+ }
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "tpp",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": false,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "2072bfe8-9d3d-4573-8939-241c618278fe",
73
+ "datetime_epoch_millis": 1740162955007,
74
+ "eval_result_metrics": {
75
+ "tpp_metrics": {
76
+ "tpp_threshold_2_total_metric": 0.0027249947190284727,
77
+ "tpp_threshold_2_intended_diff_only": 0.005399996042251587,
78
+ "tpp_threshold_2_unintended_diff_only": 0.002675001323223114,
79
+ "tpp_threshold_5_total_metric": 0.004999993741512299,
80
+ "tpp_threshold_5_intended_diff_only": 0.007799994945526124,
81
+ "tpp_threshold_5_unintended_diff_only": 0.002800001204013825,
82
+ "tpp_threshold_10_total_metric": 0.011049999296665192,
83
+ "tpp_threshold_10_intended_diff_only": 0.014899998903274536,
84
+ "tpp_threshold_10_unintended_diff_only": 0.0038499996066093446,
85
+ "tpp_threshold_20_total_metric": 0.023100003600120544,
86
+ "tpp_threshold_20_intended_diff_only": 0.0281000018119812,
87
+ "tpp_threshold_20_unintended_diff_only": 0.004999998211860657,
88
+ "tpp_threshold_50_total_metric": 0.0674250066280365,
89
+ "tpp_threshold_50_intended_diff_only": 0.07430000305175781,
90
+ "tpp_threshold_50_unintended_diff_only": 0.006874996423721313,
91
+ "tpp_threshold_100_total_metric": 0.14430001527071,
92
+ "tpp_threshold_100_intended_diff_only": 0.15460001230239867,
93
+ "tpp_threshold_100_unintended_diff_only": 0.01029999703168869,
94
+ "tpp_threshold_500_total_metric": 0.3798500135540962,
95
+ "tpp_threshold_500_intended_diff_only": 0.4061000108718872,
96
+ "tpp_threshold_500_unintended_diff_only": 0.026249997317790985
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
102
+ "tpp_threshold_2_total_metric": 0.003049987554550171,
103
+ "tpp_threshold_2_intended_diff_only": 0.00559999942779541,
104
+ "tpp_threshold_2_unintended_diff_only": 0.002550011873245239,
105
+ "tpp_threshold_5_total_metric": 0.00589999258518219,
106
+ "tpp_threshold_5_intended_diff_only": 0.008800005912780762,
107
+ "tpp_threshold_5_unintended_diff_only": 0.002900013327598572,
108
+ "tpp_threshold_10_total_metric": 0.011350002884864808,
109
+ "tpp_threshold_10_intended_diff_only": 0.014000010490417481,
110
+ "tpp_threshold_10_unintended_diff_only": 0.0026500076055526733,
111
+ "tpp_threshold_20_total_metric": 0.026500004529953002,
112
+ "tpp_threshold_20_intended_diff_only": 0.02960001230239868,
113
+ "tpp_threshold_20_unintended_diff_only": 0.0031000077724456787,
114
+ "tpp_threshold_50_total_metric": 0.0689500093460083,
115
+ "tpp_threshold_50_intended_diff_only": 0.07320001125335693,
116
+ "tpp_threshold_50_unintended_diff_only": 0.004250001907348633,
117
+ "tpp_threshold_100_total_metric": 0.15995001494884492,
118
+ "tpp_threshold_100_intended_diff_only": 0.16520001888275146,
119
+ "tpp_threshold_100_unintended_diff_only": 0.005250003933906555,
120
+ "tpp_threshold_500_total_metric": 0.4329000234603882,
121
+ "tpp_threshold_500_intended_diff_only": 0.44420002698898314,
122
+ "tpp_threshold_500_unintended_diff_only": 0.011300003528594971
123
+ },
124
+ {
125
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
126
+ "tpp_threshold_2_total_metric": 0.002400001883506775,
127
+ "tpp_threshold_2_intended_diff_only": 0.005199992656707763,
128
+ "tpp_threshold_2_unintended_diff_only": 0.002799990773200989,
129
+ "tpp_threshold_5_total_metric": 0.004099994897842407,
130
+ "tpp_threshold_5_intended_diff_only": 0.0067999839782714845,
131
+ "tpp_threshold_5_unintended_diff_only": 0.0026999890804290773,
132
+ "tpp_threshold_10_total_metric": 0.010749995708465576,
133
+ "tpp_threshold_10_intended_diff_only": 0.01579998731613159,
134
+ "tpp_threshold_10_unintended_diff_only": 0.005049991607666016,
135
+ "tpp_threshold_20_total_metric": 0.019700002670288087,
136
+ "tpp_threshold_20_intended_diff_only": 0.02659999132156372,
137
+ "tpp_threshold_20_unintended_diff_only": 0.006899988651275635,
138
+ "tpp_threshold_50_total_metric": 0.0659000039100647,
139
+ "tpp_threshold_50_intended_diff_only": 0.07539999485015869,
140
+ "tpp_threshold_50_unintended_diff_only": 0.009499990940093994,
141
+ "tpp_threshold_100_total_metric": 0.12865001559257508,
142
+ "tpp_threshold_100_intended_diff_only": 0.1440000057220459,
143
+ "tpp_threshold_100_unintended_diff_only": 0.015349990129470826,
144
+ "tpp_threshold_500_total_metric": 0.32680000364780426,
145
+ "tpp_threshold_500_intended_diff_only": 0.3679999947547913,
146
+ "tpp_threshold_500_unintended_diff_only": 0.041199991106987
147
+ }
148
+ ],
149
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
150
+ "sae_lens_id": "custom_sae",
151
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_95.0",
152
+ "sae_lens_version": "5.4.2",
153
+ "sae_cfg_dict": {
154
+ "model_name": "gemma-2-2b",
155
+ "d_in": 2304,
156
+ "d_sae": 65536,
157
+ "hook_layer": 12,
158
+ "hook_name": "blocks.12.hook_resid_post",
159
+ "context_size": null,
160
+ "hook_head_index": null,
161
+ "architecture": "topk",
162
+ "apply_b_dec_to_input": null,
163
+ "finetuning_scaling_factor": null,
164
+ "activation_fn_str": "",
165
+ "prepend_bos": true,
166
+ "normalize_activations": "none",
167
+ "dtype": "bfloat16",
168
+ "device": "",
169
+ "dataset_path": "",
170
+ "dataset_trust_remote_code": true,
171
+ "seqpos_slice": [
172
+ null
173
+ ],
174
+ "training_tokens": -100000,
175
+ "sae_lens_training_version": null,
176
+ "neuronpedia_id": null
177
+ },
178
+ "eval_result_unstructured": {
179
+ "LabHC/bias_in_bios_class_set1": {
180
+ "0": {
181
+ "tpp_threshold_2_total_metric": 0.0037500113248825073,
182
+ "tpp_threshold_2_intended_diff_only": 0.008000016212463379,
183
+ "tpp_threshold_2_unintended_diff_only": 0.004250004887580872,
184
+ "tpp_threshold_5_total_metric": 0.011250019073486328,
185
+ "tpp_threshold_5_intended_diff_only": 0.016000032424926758,
186
+ "tpp_threshold_5_unintended_diff_only": 0.00475001335144043,
187
+ "tpp_threshold_10_total_metric": 0.010500013828277588,
188
+ "tpp_threshold_10_intended_diff_only": 0.016000032424926758,
189
+ "tpp_threshold_10_unintended_diff_only": 0.00550001859664917,
190
+ "tpp_threshold_20_total_metric": 0.02700003981590271,
191
+ "tpp_threshold_20_intended_diff_only": 0.030000030994415283,
192
+ "tpp_threshold_20_unintended_diff_only": 0.0029999911785125732,
193
+ "tpp_threshold_50_total_metric": 0.06300003826618195,
194
+ "tpp_threshold_50_intended_diff_only": 0.06700003147125244,
195
+ "tpp_threshold_50_unintended_diff_only": 0.003999993205070496,
196
+ "tpp_threshold_100_total_metric": 0.1420000195503235,
197
+ "tpp_threshold_100_intended_diff_only": 0.1470000147819519,
198
+ "tpp_threshold_100_unintended_diff_only": 0.004999995231628418,
199
+ "tpp_threshold_500_total_metric": 0.427000030875206,
200
+ "tpp_threshold_500_intended_diff_only": 0.44200003147125244,
201
+ "tpp_threshold_500_unintended_diff_only": 0.015000000596046448
202
+ },
203
+ "1": {
204
+ "tpp_threshold_2_total_metric": 0.00700002908706665,
205
+ "tpp_threshold_2_intended_diff_only": 0.00700002908706665,
206
+ "tpp_threshold_2_unintended_diff_only": 0.0,
207
+ "tpp_threshold_5_total_metric": 0.001999989151954651,
208
+ "tpp_threshold_5_intended_diff_only": 0.0040000081062316895,
209
+ "tpp_threshold_5_unintended_diff_only": 0.0020000189542770386,
210
+ "tpp_threshold_10_total_metric": 0.005500048398971558,
211
+ "tpp_threshold_10_intended_diff_only": 0.005000054836273193,
212
+ "tpp_threshold_10_unintended_diff_only": -0.0004999935626983643,
213
+ "tpp_threshold_20_total_metric": 0.0072500258684158325,
214
+ "tpp_threshold_20_intended_diff_only": 0.012000024318695068,
215
+ "tpp_threshold_20_unintended_diff_only": 0.004749998450279236,
216
+ "tpp_threshold_50_total_metric": 0.061750054359436035,
217
+ "tpp_threshold_50_intended_diff_only": 0.0700000524520874,
218
+ "tpp_threshold_50_unintended_diff_only": 0.008249998092651367,
219
+ "tpp_threshold_100_total_metric": 0.11425001919269562,
220
+ "tpp_threshold_100_intended_diff_only": 0.12200003862380981,
221
+ "tpp_threshold_100_unintended_diff_only": 0.007750019431114197,
222
+ "tpp_threshold_500_total_metric": 0.4257500469684601,
223
+ "tpp_threshold_500_intended_diff_only": 0.4360000491142273,
224
+ "tpp_threshold_500_unintended_diff_only": 0.010250002145767212
225
+ },
226
+ "2": {
227
+ "tpp_threshold_2_total_metric": -0.0020000338554382324,
228
+ "tpp_threshold_2_intended_diff_only": 0.0009999871253967285,
229
+ "tpp_threshold_2_unintended_diff_only": 0.003000020980834961,
230
+ "tpp_threshold_5_total_metric": 0.0057499706745147705,
231
+ "tpp_threshold_5_intended_diff_only": 0.009999990463256836,
232
+ "tpp_threshold_5_unintended_diff_only": 0.004250019788742065,
233
+ "tpp_threshold_10_total_metric": 0.02674996852874756,
234
+ "tpp_threshold_10_intended_diff_only": 0.02899998426437378,
235
+ "tpp_threshold_10_unintended_diff_only": 0.0022500157356262207,
236
+ "tpp_threshold_20_total_metric": 0.04725000262260437,
237
+ "tpp_threshold_20_intended_diff_only": 0.04900002479553223,
238
+ "tpp_threshold_20_unintended_diff_only": 0.0017500221729278564,
239
+ "tpp_threshold_50_total_metric": 0.08799996972084045,
240
+ "tpp_threshold_50_intended_diff_only": 0.08899998664855957,
241
+ "tpp_threshold_50_unintended_diff_only": 0.0010000169277191162,
242
+ "tpp_threshold_100_total_metric": 0.19200001657009125,
243
+ "tpp_threshold_100_intended_diff_only": 0.19300001859664917,
244
+ "tpp_threshold_100_unintended_diff_only": 0.0010000020265579224,
245
+ "tpp_threshold_500_total_metric": 0.43150001764297485,
246
+ "tpp_threshold_500_intended_diff_only": 0.43800002336502075,
247
+ "tpp_threshold_500_unintended_diff_only": 0.0065000057220458984
248
+ },
249
+ "6": {
250
+ "tpp_threshold_2_total_metric": 0.001999989151954651,
251
+ "tpp_threshold_2_intended_diff_only": 0.0040000081062316895,
252
+ "tpp_threshold_2_unintended_diff_only": 0.0020000189542770386,
253
+ "tpp_threshold_5_total_metric": 0.003500029444694519,
254
+ "tpp_threshold_5_intended_diff_only": 0.0020000338554382324,
255
+ "tpp_threshold_5_unintended_diff_only": -0.0014999955892562866,
256
+ "tpp_threshold_10_total_metric": 0.00449998676776886,
257
+ "tpp_threshold_10_intended_diff_only": 0.004999995231628418,
258
+ "tpp_threshold_10_unintended_diff_only": 0.0005000084638595581,
259
+ "tpp_threshold_20_total_metric": 0.0052499920129776,
260
+ "tpp_threshold_20_intended_diff_only": 0.0040000081062316895,
261
+ "tpp_threshold_20_unintended_diff_only": -0.0012499839067459106,
262
+ "tpp_threshold_50_total_metric": 0.016000032424926758,
263
+ "tpp_threshold_50_intended_diff_only": 0.017000019550323486,
264
+ "tpp_threshold_50_unintended_diff_only": 0.0009999871253967285,
265
+ "tpp_threshold_100_total_metric": 0.1157500147819519,
266
+ "tpp_threshold_100_intended_diff_only": 0.11900001764297485,
267
+ "tpp_threshold_100_unintended_diff_only": 0.0032500028610229492,
268
+ "tpp_threshold_500_total_metric": 0.42000001668930054,
269
+ "tpp_threshold_500_intended_diff_only": 0.42900002002716064,
270
+ "tpp_threshold_500_unintended_diff_only": 0.009000003337860107
271
+ },
272
+ "9": {
273
+ "tpp_threshold_2_total_metric": 0.004499942064285278,
274
+ "tpp_threshold_2_intended_diff_only": 0.007999956607818604,
275
+ "tpp_threshold_2_unintended_diff_only": 0.003500014543533325,
276
+ "tpp_threshold_5_total_metric": 0.006999954581260681,
277
+ "tpp_threshold_5_intended_diff_only": 0.011999964714050293,
278
+ "tpp_threshold_5_unintended_diff_only": 0.005000010132789612,
279
+ "tpp_threshold_10_total_metric": 0.009499996900558472,
280
+ "tpp_threshold_10_intended_diff_only": 0.014999985694885254,
281
+ "tpp_threshold_10_unintended_diff_only": 0.005499988794326782,
282
+ "tpp_threshold_20_total_metric": 0.0457499623298645,
283
+ "tpp_threshold_20_intended_diff_only": 0.05299997329711914,
284
+ "tpp_threshold_20_unintended_diff_only": 0.007250010967254639,
285
+ "tpp_threshold_50_total_metric": 0.11599995195865631,
286
+ "tpp_threshold_50_intended_diff_only": 0.12299996614456177,
287
+ "tpp_threshold_50_unintended_diff_only": 0.0070000141859054565,
288
+ "tpp_threshold_100_total_metric": 0.2357500046491623,
289
+ "tpp_threshold_100_intended_diff_only": 0.24500000476837158,
290
+ "tpp_threshold_100_unintended_diff_only": 0.00925000011920929,
291
+ "tpp_threshold_500_total_metric": 0.46025000512599945,
292
+ "tpp_threshold_500_intended_diff_only": 0.47600001096725464,
293
+ "tpp_threshold_500_unintended_diff_only": 0.015750005841255188
294
+ }
295
+ },
296
+ "canrager/amazon_reviews_mcauley_1and5": {
297
+ "1": {
298
+ "tpp_threshold_2_total_metric": 0.00475001335144043,
299
+ "tpp_threshold_2_intended_diff_only": 0.009000003337860107,
300
+ "tpp_threshold_2_unintended_diff_only": 0.004249989986419678,
301
+ "tpp_threshold_5_total_metric": 0.0020000040531158447,
302
+ "tpp_threshold_5_intended_diff_only": 0.004999995231628418,
303
+ "tpp_threshold_5_unintended_diff_only": 0.0029999911785125732,
304
+ "tpp_threshold_10_total_metric": 0.004999950528144836,
305
+ "tpp_threshold_10_intended_diff_only": 0.012999951839447021,
306
+ "tpp_threshold_10_unintended_diff_only": 0.008000001311302185,
307
+ "tpp_threshold_20_total_metric": 0.005249962210655212,
308
+ "tpp_threshold_20_intended_diff_only": 0.011999964714050293,
309
+ "tpp_threshold_20_unintended_diff_only": 0.006750002503395081,
310
+ "tpp_threshold_50_total_metric": 0.01299998164176941,
311
+ "tpp_threshold_50_intended_diff_only": 0.014999985694885254,
312
+ "tpp_threshold_50_unintended_diff_only": 0.0020000040531158447,
313
+ "tpp_threshold_100_total_metric": 0.022250011563301086,
314
+ "tpp_threshold_100_intended_diff_only": 0.03700000047683716,
315
+ "tpp_threshold_100_unintended_diff_only": 0.014749988913536072,
316
+ "tpp_threshold_500_total_metric": 0.2149999886751175,
317
+ "tpp_threshold_500_intended_diff_only": 0.2749999761581421,
318
+ "tpp_threshold_500_unintended_diff_only": 0.0599999874830246
319
+ },
320
+ "2": {
321
+ "tpp_threshold_2_total_metric": 0.0037499964237213135,
322
+ "tpp_threshold_2_intended_diff_only": 0.004999995231628418,
323
+ "tpp_threshold_2_unintended_diff_only": 0.0012499988079071045,
324
+ "tpp_threshold_5_total_metric": 0.0007499754428863525,
325
+ "tpp_threshold_5_intended_diff_only": 0.007999956607818604,
326
+ "tpp_threshold_5_unintended_diff_only": 0.007249981164932251,
327
+ "tpp_threshold_10_total_metric": 0.014500007033348083,
328
+ "tpp_threshold_10_intended_diff_only": 0.018999993801116943,
329
+ "tpp_threshold_10_unintended_diff_only": 0.00449998676776886,
330
+ "tpp_threshold_20_total_metric": 0.032000020146369934,
331
+ "tpp_threshold_20_intended_diff_only": 0.03600001335144043,
332
+ "tpp_threshold_20_unintended_diff_only": 0.003999993205070496,
333
+ "tpp_threshold_50_total_metric": 0.0845000296831131,
334
+ "tpp_threshold_50_intended_diff_only": 0.09600001573562622,
335
+ "tpp_threshold_50_unintended_diff_only": 0.011499986052513123,
336
+ "tpp_threshold_100_total_metric": 0.17149998247623444,
337
+ "tpp_threshold_100_intended_diff_only": 0.19099998474121094,
338
+ "tpp_threshold_100_unintended_diff_only": 0.0195000022649765,
339
+ "tpp_threshold_500_total_metric": 0.3877500146627426,
340
+ "tpp_threshold_500_intended_diff_only": 0.41600000858306885,
341
+ "tpp_threshold_500_unintended_diff_only": 0.028249993920326233
342
+ },
343
+ "3": {
344
+ "tpp_threshold_2_total_metric": -0.01000000536441803,
345
+ "tpp_threshold_2_intended_diff_only": -0.008000016212463379,
346
+ "tpp_threshold_2_unintended_diff_only": 0.001999989151954651,
347
+ "tpp_threshold_5_total_metric": -2.9802322387695312e-08,
348
+ "tpp_threshold_5_intended_diff_only": -0.0020000338554382324,
349
+ "tpp_threshold_5_unintended_diff_only": -0.0020000040531158447,
350
+ "tpp_threshold_10_total_metric": 0.009499981999397278,
351
+ "tpp_threshold_10_intended_diff_only": 0.010999977588653564,
352
+ "tpp_threshold_10_unintended_diff_only": 0.0014999955892562866,
353
+ "tpp_threshold_20_total_metric": 0.002249985933303833,
354
+ "tpp_threshold_20_intended_diff_only": 0.011999964714050293,
355
+ "tpp_threshold_20_unintended_diff_only": 0.00974997878074646,
356
+ "tpp_threshold_50_total_metric": 0.0455000102519989,
357
+ "tpp_threshold_50_intended_diff_only": 0.0559999942779541,
358
+ "tpp_threshold_50_unintended_diff_only": 0.0104999840259552,
359
+ "tpp_threshold_100_total_metric": 0.0767500251531601,
360
+ "tpp_threshold_100_intended_diff_only": 0.0910000205039978,
361
+ "tpp_threshold_100_unintended_diff_only": 0.014249995350837708,
362
+ "tpp_threshold_500_total_metric": 0.3192499876022339,
363
+ "tpp_threshold_500_intended_diff_only": 0.3579999804496765,
364
+ "tpp_threshold_500_unintended_diff_only": 0.03874999284744263
365
+ },
366
+ "5": {
367
+ "tpp_threshold_2_total_metric": 2.9802322387695312e-08,
368
+ "tpp_threshold_2_intended_diff_only": 0.003000020980834961,
369
+ "tpp_threshold_2_unintended_diff_only": 0.0029999911785125732,
370
+ "tpp_threshold_5_total_metric": 0.006750032305717468,
371
+ "tpp_threshold_5_intended_diff_only": 0.013000011444091797,
372
+ "tpp_threshold_5_unintended_diff_only": 0.006249979138374329,
373
+ "tpp_threshold_10_total_metric": 0.004500046372413635,
374
+ "tpp_threshold_10_intended_diff_only": 0.012000024318695068,
375
+ "tpp_threshold_10_unintended_diff_only": 0.007499977946281433,
376
+ "tpp_threshold_20_total_metric": 0.01575005054473877,
377
+ "tpp_threshold_20_intended_diff_only": 0.025000035762786865,
378
+ "tpp_threshold_20_unintended_diff_only": 0.009249985218048096,
379
+ "tpp_threshold_50_total_metric": 0.08250001072883606,
380
+ "tpp_threshold_50_intended_diff_only": 0.09700000286102295,
381
+ "tpp_threshold_50_unintended_diff_only": 0.01449999213218689,
382
+ "tpp_threshold_100_total_metric": 0.15550003945827484,
383
+ "tpp_threshold_100_intended_diff_only": 0.17500001192092896,
384
+ "tpp_threshold_100_unintended_diff_only": 0.019499972462654114,
385
+ "tpp_threshold_500_total_metric": 0.35050003230571747,
386
+ "tpp_threshold_500_intended_diff_only": 0.4150000214576721,
387
+ "tpp_threshold_500_unintended_diff_only": 0.06449998915195465
388
+ },
389
+ "6": {
390
+ "tpp_threshold_2_total_metric": 0.013499975204467773,
391
+ "tpp_threshold_2_intended_diff_only": 0.01699995994567871,
392
+ "tpp_threshold_2_unintended_diff_only": 0.0034999847412109375,
393
+ "tpp_threshold_5_total_metric": 0.010999992489814758,
394
+ "tpp_threshold_5_intended_diff_only": 0.009999990463256836,
395
+ "tpp_threshold_5_unintended_diff_only": -0.0010000020265579224,
396
+ "tpp_threshold_10_total_metric": 0.020249992609024048,
397
+ "tpp_threshold_10_intended_diff_only": 0.02399998903274536,
398
+ "tpp_threshold_10_unintended_diff_only": 0.0037499964237213135,
399
+ "tpp_threshold_20_total_metric": 0.04324999451637268,
400
+ "tpp_threshold_20_intended_diff_only": 0.04799997806549072,
401
+ "tpp_threshold_20_unintended_diff_only": 0.004749983549118042,
402
+ "tpp_threshold_50_total_metric": 0.10399998724460602,
403
+ "tpp_threshold_50_intended_diff_only": 0.11299997568130493,
404
+ "tpp_threshold_50_unintended_diff_only": 0.008999988436698914,
405
+ "tpp_threshold_100_total_metric": 0.2172500193119049,
406
+ "tpp_threshold_100_intended_diff_only": 0.22600001096725464,
407
+ "tpp_threshold_100_unintended_diff_only": 0.008749991655349731,
408
+ "tpp_threshold_500_total_metric": 0.36149999499320984,
409
+ "tpp_threshold_500_intended_diff_only": 0.37599998712539673,
410
+ "tpp_threshold_500_unintended_diff_only": 0.01449999213218689
411
+ }
412
+ }
413
+ }
414
+ }
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "tpp",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": false,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "f2bf09ac-6740-414f-aa61-a62e38a23b92",
73
+ "datetime_epoch_millis": 1740162794981,
74
+ "eval_result_metrics": {
75
+ "tpp_metrics": {
76
+ "tpp_threshold_2_total_metric": 0.002074998617172241,
77
+ "tpp_threshold_2_intended_diff_only": 0.004600000381469726,
78
+ "tpp_threshold_2_unintended_diff_only": 0.0025250017642974854,
79
+ "tpp_threshold_5_total_metric": 0.003899991512298584,
80
+ "tpp_threshold_5_intended_diff_only": 0.0056999921798706055,
81
+ "tpp_threshold_5_unintended_diff_only": 0.0018000006675720215,
82
+ "tpp_threshold_10_total_metric": 2.5008618831634565e-05,
83
+ "tpp_threshold_10_intended_diff_only": 0.002800005674362183,
84
+ "tpp_threshold_10_unintended_diff_only": 0.002774997055530548,
85
+ "tpp_threshold_20_total_metric": 0.003749997913837433,
86
+ "tpp_threshold_20_intended_diff_only": 0.007200002670288086,
87
+ "tpp_threshold_20_unintended_diff_only": 0.0034500047564506534,
88
+ "tpp_threshold_50_total_metric": 0.012599988281726836,
89
+ "tpp_threshold_50_intended_diff_only": 0.016999995708465575,
90
+ "tpp_threshold_50_unintended_diff_only": 0.004400007426738739,
91
+ "tpp_threshold_100_total_metric": 0.02380000501871109,
92
+ "tpp_threshold_100_intended_diff_only": 0.02910000681877136,
93
+ "tpp_threshold_100_unintended_diff_only": 0.005300001800060272,
94
+ "tpp_threshold_500_total_metric": 0.07820001393556594,
95
+ "tpp_threshold_500_intended_diff_only": 0.08660001158714295,
96
+ "tpp_threshold_500_unintended_diff_only": 0.008399997651576997
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
102
+ "tpp_threshold_2_total_metric": 0.005099990963935852,
103
+ "tpp_threshold_2_intended_diff_only": 0.007200002670288086,
104
+ "tpp_threshold_2_unintended_diff_only": 0.002100011706352234,
105
+ "tpp_threshold_5_total_metric": 0.002500000596046448,
106
+ "tpp_threshold_5_intended_diff_only": 0.0048000097274780275,
107
+ "tpp_threshold_5_unintended_diff_only": 0.0023000091314315796,
108
+ "tpp_threshold_10_total_metric": 0.0022500097751617433,
109
+ "tpp_threshold_10_intended_diff_only": 0.004400014877319336,
110
+ "tpp_threshold_10_unintended_diff_only": 0.0021500051021575927,
111
+ "tpp_threshold_20_total_metric": 0.0045499980449676515,
112
+ "tpp_threshold_20_intended_diff_only": 0.007200014591217041,
113
+ "tpp_threshold_20_unintended_diff_only": 0.0026500165462493897,
114
+ "tpp_threshold_50_total_metric": 0.014249974489212036,
115
+ "tpp_threshold_50_intended_diff_only": 0.016999995708465575,
116
+ "tpp_threshold_50_unintended_diff_only": 0.00275002121925354,
117
+ "tpp_threshold_100_total_metric": 0.021900007128715517,
118
+ "tpp_threshold_100_intended_diff_only": 0.0254000186920166,
119
+ "tpp_threshold_100_unintended_diff_only": 0.0035000115633010863,
120
+ "tpp_threshold_500_total_metric": 0.07305001318454743,
121
+ "tpp_threshold_500_intended_diff_only": 0.07700002193450928,
122
+ "tpp_threshold_500_unintended_diff_only": 0.003950008749961853
123
+ },
124
+ {
125
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
126
+ "tpp_threshold_2_total_metric": -0.0009499937295913696,
127
+ "tpp_threshold_2_intended_diff_only": 0.0019999980926513673,
128
+ "tpp_threshold_2_unintended_diff_only": 0.002949991822242737,
129
+ "tpp_threshold_5_total_metric": 0.00529998242855072,
130
+ "tpp_threshold_5_intended_diff_only": 0.006599974632263183,
131
+ "tpp_threshold_5_unintended_diff_only": 0.0012999922037124634,
132
+ "tpp_threshold_10_total_metric": -0.002199992537498474,
133
+ "tpp_threshold_10_intended_diff_only": 0.0011999964714050292,
134
+ "tpp_threshold_10_unintended_diff_only": 0.0033999890089035033,
135
+ "tpp_threshold_20_total_metric": 0.0029499977827072144,
136
+ "tpp_threshold_20_intended_diff_only": 0.007199990749359131,
137
+ "tpp_threshold_20_unintended_diff_only": 0.004249992966651917,
138
+ "tpp_threshold_50_total_metric": 0.010950002074241637,
139
+ "tpp_threshold_50_intended_diff_only": 0.016999995708465575,
140
+ "tpp_threshold_50_unintended_diff_only": 0.006049993634223938,
141
+ "tpp_threshold_100_total_metric": 0.025700002908706665,
142
+ "tpp_threshold_100_intended_diff_only": 0.03279999494552612,
143
+ "tpp_threshold_100_unintended_diff_only": 0.007099992036819458,
144
+ "tpp_threshold_500_total_metric": 0.08335001468658447,
145
+ "tpp_threshold_500_intended_diff_only": 0.09620000123977661,
146
+ "tpp_threshold_500_unintended_diff_only": 0.012849986553192139
147
+ }
148
+ ],
149
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
150
+ "sae_lens_id": "custom_sae",
151
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_0.0",
152
+ "sae_lens_version": "5.4.2",
153
+ "sae_cfg_dict": {
154
+ "model_name": "gemma-2-2b",
155
+ "d_in": 2304,
156
+ "d_sae": 65536,
157
+ "hook_layer": 12,
158
+ "hook_name": "blocks.12.hook_resid_post",
159
+ "context_size": null,
160
+ "hook_head_index": null,
161
+ "architecture": "topk",
162
+ "apply_b_dec_to_input": null,
163
+ "finetuning_scaling_factor": null,
164
+ "activation_fn_str": "",
165
+ "prepend_bos": true,
166
+ "normalize_activations": "none",
167
+ "dtype": "bfloat16",
168
+ "device": "",
169
+ "dataset_path": "",
170
+ "dataset_trust_remote_code": true,
171
+ "seqpos_slice": [
172
+ null
173
+ ],
174
+ "training_tokens": -100000,
175
+ "sae_lens_training_version": null,
176
+ "neuronpedia_id": null
177
+ },
178
+ "eval_result_unstructured": {
179
+ "LabHC/bias_in_bios_class_set1": {
180
+ "0": {
181
+ "tpp_threshold_2_total_metric": 0.005000010132789612,
182
+ "tpp_threshold_2_intended_diff_only": 0.00700002908706665,
183
+ "tpp_threshold_2_unintended_diff_only": 0.0020000189542770386,
184
+ "tpp_threshold_5_total_metric": 0.004249975085258484,
185
+ "tpp_threshold_5_intended_diff_only": 0.004999995231628418,
186
+ "tpp_threshold_5_unintended_diff_only": 0.0007500201463699341,
187
+ "tpp_threshold_10_total_metric": 0.0072500258684158325,
188
+ "tpp_threshold_10_intended_diff_only": 0.00700002908706665,
189
+ "tpp_threshold_10_unintended_diff_only": -0.00024999678134918213,
190
+ "tpp_threshold_20_total_metric": 0.004749998450279236,
191
+ "tpp_threshold_20_intended_diff_only": 0.008000016212463379,
192
+ "tpp_threshold_20_unintended_diff_only": 0.003250017762184143,
193
+ "tpp_threshold_50_total_metric": 0.022749990224838257,
194
+ "tpp_threshold_50_intended_diff_only": 0.02799999713897705,
195
+ "tpp_threshold_50_unintended_diff_only": 0.005250006914138794,
196
+ "tpp_threshold_100_total_metric": 0.038000017404556274,
197
+ "tpp_threshold_100_intended_diff_only": 0.04400002956390381,
198
+ "tpp_threshold_100_unintended_diff_only": 0.006000012159347534,
199
+ "tpp_threshold_500_total_metric": 0.14850004017353058,
200
+ "tpp_threshold_500_intended_diff_only": 0.15400004386901855,
201
+ "tpp_threshold_500_unintended_diff_only": 0.005500003695487976
202
+ },
203
+ "1": {
204
+ "tpp_threshold_2_total_metric": 0.00875002145767212,
205
+ "tpp_threshold_2_intended_diff_only": 0.012000024318695068,
206
+ "tpp_threshold_2_unintended_diff_only": 0.0032500028610229492,
207
+ "tpp_threshold_5_total_metric": -0.0012499988079071045,
208
+ "tpp_threshold_5_intended_diff_only": 0.0040000081062316895,
209
+ "tpp_threshold_5_unintended_diff_only": 0.005250006914138794,
210
+ "tpp_threshold_10_total_metric": -0.0009999573230743408,
211
+ "tpp_threshold_10_intended_diff_only": 0.001000046730041504,
212
+ "tpp_threshold_10_unintended_diff_only": 0.0020000040531158447,
213
+ "tpp_threshold_20_total_metric": -0.00024996697902679443,
214
+ "tpp_threshold_20_intended_diff_only": 0.001000046730041504,
215
+ "tpp_threshold_20_unintended_diff_only": 0.0012500137090682983,
216
+ "tpp_threshold_50_total_metric": -0.004250004887580872,
217
+ "tpp_threshold_50_intended_diff_only": -0.0009999871253967285,
218
+ "tpp_threshold_50_unintended_diff_only": 0.003250017762184143,
219
+ "tpp_threshold_100_total_metric": 0.007249981164932251,
220
+ "tpp_threshold_100_intended_diff_only": 0.009000003337860107,
221
+ "tpp_threshold_100_unintended_diff_only": 0.0017500221729278564,
222
+ "tpp_threshold_500_total_metric": 0.03200000524520874,
223
+ "tpp_threshold_500_intended_diff_only": 0.03700000047683716,
224
+ "tpp_threshold_500_unintended_diff_only": 0.004999995231628418
225
+ },
226
+ "2": {
227
+ "tpp_threshold_2_total_metric": 0.0062499940395355225,
228
+ "tpp_threshold_2_intended_diff_only": 0.009000003337860107,
229
+ "tpp_threshold_2_unintended_diff_only": 0.002750009298324585,
230
+ "tpp_threshold_5_total_metric": 0.0065000057220458984,
231
+ "tpp_threshold_5_intended_diff_only": 0.008000016212463379,
232
+ "tpp_threshold_5_unintended_diff_only": 0.0015000104904174805,
233
+ "tpp_threshold_10_total_metric": 0.0007500052452087402,
234
+ "tpp_threshold_10_intended_diff_only": 0.0040000081062316895,
235
+ "tpp_threshold_10_unintended_diff_only": 0.0032500028610229492,
236
+ "tpp_threshold_20_total_metric": 0.007999971508979797,
237
+ "tpp_threshold_20_intended_diff_only": 0.009999990463256836,
238
+ "tpp_threshold_20_unintended_diff_only": 0.0020000189542770386,
239
+ "tpp_threshold_50_total_metric": 0.014999955892562866,
240
+ "tpp_threshold_50_intended_diff_only": 0.015999972820281982,
241
+ "tpp_threshold_50_unintended_diff_only": 0.0010000169277191162,
242
+ "tpp_threshold_100_total_metric": 0.008500009775161743,
243
+ "tpp_threshold_100_intended_diff_only": 0.013000011444091797,
244
+ "tpp_threshold_100_unintended_diff_only": 0.004500001668930054,
245
+ "tpp_threshold_500_total_metric": 0.0455000102519989,
246
+ "tpp_threshold_500_intended_diff_only": 0.04900002479553223,
247
+ "tpp_threshold_500_unintended_diff_only": 0.003500014543533325
248
+ },
249
+ "6": {
250
+ "tpp_threshold_2_total_metric": 0.0014999806880950928,
251
+ "tpp_threshold_2_intended_diff_only": 0.0009999871253967285,
252
+ "tpp_threshold_2_unintended_diff_only": -0.0004999935626983643,
253
+ "tpp_threshold_5_total_metric": -0.0004999637603759766,
254
+ "tpp_threshold_5_intended_diff_only": 0.0020000338554382324,
255
+ "tpp_threshold_5_unintended_diff_only": 0.002499997615814209,
256
+ "tpp_threshold_10_total_metric": 0.0032500028610229492,
257
+ "tpp_threshold_10_intended_diff_only": 0.0040000081062316895,
258
+ "tpp_threshold_10_unintended_diff_only": 0.0007500052452087402,
259
+ "tpp_threshold_20_total_metric": -0.0004999935626983643,
260
+ "tpp_threshold_20_intended_diff_only": 0.003000020980834961,
261
+ "tpp_threshold_20_unintended_diff_only": 0.003500014543533325,
262
+ "tpp_threshold_50_total_metric": 0.0027499794960021973,
263
+ "tpp_threshold_50_intended_diff_only": 0.0040000081062316895,
264
+ "tpp_threshold_50_unintended_diff_only": 0.0012500286102294922,
265
+ "tpp_threshold_100_total_metric": 0.0015000253915786743,
266
+ "tpp_threshold_100_intended_diff_only": 0.0020000338554382324,
267
+ "tpp_threshold_100_unintended_diff_only": 0.0005000084638595581,
268
+ "tpp_threshold_500_total_metric": 0.004250019788742065,
269
+ "tpp_threshold_500_intended_diff_only": 0.00700002908706665,
270
+ "tpp_threshold_500_unintended_diff_only": 0.002750009298324585
271
+ },
272
+ "9": {
273
+ "tpp_threshold_2_total_metric": 0.003999948501586914,
274
+ "tpp_threshold_2_intended_diff_only": 0.006999969482421875,
275
+ "tpp_threshold_2_unintended_diff_only": 0.003000020980834961,
276
+ "tpp_threshold_5_total_metric": 0.0034999847412109375,
277
+ "tpp_threshold_5_intended_diff_only": 0.004999995231628418,
278
+ "tpp_threshold_5_unintended_diff_only": 0.0015000104904174805,
279
+ "tpp_threshold_10_total_metric": 0.0009999722242355347,
280
+ "tpp_threshold_10_intended_diff_only": 0.0059999823570251465,
281
+ "tpp_threshold_10_unintended_diff_only": 0.005000010132789612,
282
+ "tpp_threshold_20_total_metric": 0.010749980807304382,
283
+ "tpp_threshold_20_intended_diff_only": 0.013999998569488525,
284
+ "tpp_threshold_20_unintended_diff_only": 0.003250017762184143,
285
+ "tpp_threshold_50_total_metric": 0.03499995172023773,
286
+ "tpp_threshold_50_intended_diff_only": 0.03799998760223389,
287
+ "tpp_threshold_50_unintended_diff_only": 0.003000035881996155,
288
+ "tpp_threshold_100_total_metric": 0.05425000190734863,
289
+ "tpp_threshold_100_intended_diff_only": 0.05900001525878906,
290
+ "tpp_threshold_100_unintended_diff_only": 0.00475001335144043,
291
+ "tpp_threshold_500_total_metric": 0.13499999046325684,
292
+ "tpp_threshold_500_intended_diff_only": 0.1380000114440918,
293
+ "tpp_threshold_500_unintended_diff_only": 0.003000020980834961
294
+ }
295
+ },
296
+ "canrager/amazon_reviews_mcauley_1and5": {
297
+ "1": {
298
+ "tpp_threshold_2_total_metric": 0.0007500052452087402,
299
+ "tpp_threshold_2_intended_diff_only": 0.004999995231628418,
300
+ "tpp_threshold_2_unintended_diff_only": 0.004249989986419678,
301
+ "tpp_threshold_5_total_metric": 0.0037499815225601196,
302
+ "tpp_threshold_5_intended_diff_only": 0.006999969482421875,
303
+ "tpp_threshold_5_unintended_diff_only": 0.0032499879598617554,
304
+ "tpp_threshold_10_total_metric": -0.0002499818801879883,
305
+ "tpp_threshold_10_intended_diff_only": 0.004999995231628418,
306
+ "tpp_threshold_10_unintended_diff_only": 0.005249977111816406,
307
+ "tpp_threshold_20_total_metric": -0.003250032663345337,
308
+ "tpp_threshold_20_intended_diff_only": 0.0029999613761901855,
309
+ "tpp_threshold_20_unintended_diff_only": 0.0062499940395355225,
310
+ "tpp_threshold_50_total_metric": 0.0009999871253967285,
311
+ "tpp_threshold_50_intended_diff_only": 0.006999969482421875,
312
+ "tpp_threshold_50_unintended_diff_only": 0.0059999823570251465,
313
+ "tpp_threshold_100_total_metric": 0.002499997615814209,
314
+ "tpp_threshold_100_intended_diff_only": 0.0059999823570251465,
315
+ "tpp_threshold_100_unintended_diff_only": 0.0034999847412109375,
316
+ "tpp_threshold_500_total_metric": 0.015999972820281982,
317
+ "tpp_threshold_500_intended_diff_only": 0.02199995517730713,
318
+ "tpp_threshold_500_unintended_diff_only": 0.0059999823570251465
319
+ },
320
+ "2": {
321
+ "tpp_threshold_2_total_metric": 0.0005000084638595581,
322
+ "tpp_threshold_2_intended_diff_only": 0.004999995231628418,
323
+ "tpp_threshold_2_unintended_diff_only": 0.00449998676776886,
324
+ "tpp_threshold_5_total_metric": 0.012999966740608215,
325
+ "tpp_threshold_5_intended_diff_only": 0.011999964714050293,
326
+ "tpp_threshold_5_unintended_diff_only": -0.0010000020265579224,
327
+ "tpp_threshold_10_total_metric": 0.004749983549118042,
328
+ "tpp_threshold_10_intended_diff_only": 0.006999969482421875,
329
+ "tpp_threshold_10_unintended_diff_only": 0.002249985933303833,
330
+ "tpp_threshold_20_total_metric": 0.013750016689300537,
331
+ "tpp_threshold_20_intended_diff_only": 0.018000006675720215,
332
+ "tpp_threshold_20_unintended_diff_only": 0.004249989986419678,
333
+ "tpp_threshold_50_total_metric": 0.0062499940395355225,
334
+ "tpp_threshold_50_intended_diff_only": 0.013999998569488525,
335
+ "tpp_threshold_50_unintended_diff_only": 0.007750004529953003,
336
+ "tpp_threshold_100_total_metric": 0.023999974131584167,
337
+ "tpp_threshold_100_intended_diff_only": 0.029999971389770508,
338
+ "tpp_threshold_100_unintended_diff_only": 0.00599999725818634,
339
+ "tpp_threshold_500_total_metric": 0.07625000178813934,
340
+ "tpp_threshold_500_intended_diff_only": 0.08899998664855957,
341
+ "tpp_threshold_500_unintended_diff_only": 0.012749984860420227
342
+ },
343
+ "3": {
344
+ "tpp_threshold_2_total_metric": 0.0004999637603759766,
345
+ "tpp_threshold_2_intended_diff_only": -0.0020000338554382324,
346
+ "tpp_threshold_2_unintended_diff_only": -0.002499997615814209,
347
+ "tpp_threshold_5_total_metric": -0.003500029444694519,
348
+ "tpp_threshold_5_intended_diff_only": -0.0020000338554382324,
349
+ "tpp_threshold_5_unintended_diff_only": 0.0014999955892562866,
350
+ "tpp_threshold_10_total_metric": -0.008749991655349731,
351
+ "tpp_threshold_10_intended_diff_only": -0.004999995231628418,
352
+ "tpp_threshold_10_unintended_diff_only": 0.0037499964237213135,
353
+ "tpp_threshold_20_total_metric": -0.000500023365020752,
354
+ "tpp_threshold_20_intended_diff_only": 0.001999974250793457,
355
+ "tpp_threshold_20_unintended_diff_only": 0.002499997615814209,
356
+ "tpp_threshold_50_total_metric": 0.011249974370002747,
357
+ "tpp_threshold_50_intended_diff_only": 0.011999964714050293,
358
+ "tpp_threshold_50_unintended_diff_only": 0.0007499903440475464,
359
+ "tpp_threshold_100_total_metric": 0.024500012397766113,
360
+ "tpp_threshold_100_intended_diff_only": 0.027000010013580322,
361
+ "tpp_threshold_100_unintended_diff_only": 0.002499997615814209,
362
+ "tpp_threshold_500_total_metric": 0.07074999809265137,
363
+ "tpp_threshold_500_intended_diff_only": 0.08300000429153442,
364
+ "tpp_threshold_500_unintended_diff_only": 0.012250006198883057
365
+ },
366
+ "5": {
367
+ "tpp_threshold_2_total_metric": -0.014499962329864502,
368
+ "tpp_threshold_2_intended_diff_only": -0.006999969482421875,
369
+ "tpp_threshold_2_unintended_diff_only": 0.007499992847442627,
370
+ "tpp_threshold_5_total_metric": -0.008249983191490173,
371
+ "tpp_threshold_5_intended_diff_only": -0.004999995231628418,
372
+ "tpp_threshold_5_unintended_diff_only": 0.0032499879598617554,
373
+ "tpp_threshold_10_total_metric": -0.012499943375587463,
374
+ "tpp_threshold_10_intended_diff_only": -0.007999956607818604,
375
+ "tpp_threshold_10_unintended_diff_only": 0.00449998676776886,
376
+ "tpp_threshold_20_total_metric": -0.009499967098236084,
377
+ "tpp_threshold_20_intended_diff_only": -0.0009999871253967285,
378
+ "tpp_threshold_20_unintended_diff_only": 0.008499979972839355,
379
+ "tpp_threshold_50_total_metric": 0.004000052809715271,
380
+ "tpp_threshold_50_intended_diff_only": 0.01500004529953003,
381
+ "tpp_threshold_50_unintended_diff_only": 0.010999992489814758,
382
+ "tpp_threshold_100_total_metric": 0.01875002682209015,
383
+ "tpp_threshold_100_intended_diff_only": 0.03600001335144043,
384
+ "tpp_threshold_100_unintended_diff_only": 0.01724998652935028,
385
+ "tpp_threshold_500_total_metric": 0.09225007891654968,
386
+ "tpp_threshold_500_intended_diff_only": 0.11200004816055298,
387
+ "tpp_threshold_500_unintended_diff_only": 0.019749969244003296
388
+ },
389
+ "6": {
390
+ "tpp_threshold_2_total_metric": 0.008000016212463379,
391
+ "tpp_threshold_2_intended_diff_only": 0.009000003337860107,
392
+ "tpp_threshold_2_unintended_diff_only": 0.0009999871253967285,
393
+ "tpp_threshold_5_total_metric": 0.02149997651576996,
394
+ "tpp_threshold_5_intended_diff_only": 0.0209999680519104,
395
+ "tpp_threshold_5_unintended_diff_only": -0.0005000084638595581,
396
+ "tpp_threshold_10_total_metric": 0.0057499706745147705,
397
+ "tpp_threshold_10_intended_diff_only": 0.006999969482421875,
398
+ "tpp_threshold_10_unintended_diff_only": 0.0012499988079071045,
399
+ "tpp_threshold_20_total_metric": 0.014249995350837708,
400
+ "tpp_threshold_20_intended_diff_only": 0.013999998569488525,
401
+ "tpp_threshold_20_unintended_diff_only": -0.00024999678134918213,
402
+ "tpp_threshold_50_total_metric": 0.03225000202655792,
403
+ "tpp_threshold_50_intended_diff_only": 0.03700000047683716,
404
+ "tpp_threshold_50_unintended_diff_only": 0.004749998450279236,
405
+ "tpp_threshold_100_total_metric": 0.058750003576278687,
406
+ "tpp_threshold_100_intended_diff_only": 0.06499999761581421,
407
+ "tpp_threshold_100_unintended_diff_only": 0.0062499940395355225,
408
+ "tpp_threshold_500_total_metric": 0.1615000218153,
409
+ "tpp_threshold_500_intended_diff_only": 0.17500001192092896,
410
+ "tpp_threshold_500_unintended_diff_only": 0.013499990105628967
411
+ }
412
+ }
413
+ }
414
+ }
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "tpp",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": false,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "072d027b-019d-4c30-82dd-a58126cb07ee",
73
+ "datetime_epoch_millis": 1740163113677,
74
+ "eval_result_metrics": {
75
+ "tpp_metrics": {
76
+ "tpp_threshold_2_total_metric": 0.0036749929189682003,
77
+ "tpp_threshold_2_intended_diff_only": 0.00629999041557312,
78
+ "tpp_threshold_2_unintended_diff_only": 0.002624997496604919,
79
+ "tpp_threshold_5_total_metric": 0.005574998259544372,
80
+ "tpp_threshold_5_intended_diff_only": 0.008499997854232787,
81
+ "tpp_threshold_5_unintended_diff_only": 0.0029249995946884154,
82
+ "tpp_threshold_10_total_metric": 0.009825007617473602,
83
+ "tpp_threshold_10_intended_diff_only": 0.013400006294250488,
84
+ "tpp_threshold_10_unintended_diff_only": 0.003574998676776886,
85
+ "tpp_threshold_20_total_metric": 0.014200010895729066,
86
+ "tpp_threshold_20_intended_diff_only": 0.01830000877380371,
87
+ "tpp_threshold_20_unintended_diff_only": 0.004099997878074646,
88
+ "tpp_threshold_50_total_metric": 0.027800002694129942,
89
+ "tpp_threshold_50_intended_diff_only": 0.033399999141693115,
90
+ "tpp_threshold_50_unintended_diff_only": 0.005599996447563172,
91
+ "tpp_threshold_100_total_metric": 0.04107500612735748,
92
+ "tpp_threshold_100_intended_diff_only": 0.04900000095367432,
93
+ "tpp_threshold_100_unintended_diff_only": 0.007924994826316834,
94
+ "tpp_threshold_500_total_metric": 0.1252000018954277,
95
+ "tpp_threshold_500_intended_diff_only": 0.13830000162124634,
96
+ "tpp_threshold_500_unintended_diff_only": 0.013099999725818634
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
102
+ "tpp_threshold_2_total_metric": 0.006199979782104492,
103
+ "tpp_threshold_2_intended_diff_only": 0.008199989795684814,
104
+ "tpp_threshold_2_unintended_diff_only": 0.002000010013580322,
105
+ "tpp_threshold_5_total_metric": 0.007949993014335632,
106
+ "tpp_threshold_5_intended_diff_only": 0.010800004005432129,
107
+ "tpp_threshold_5_unintended_diff_only": 0.0028500109910964966,
108
+ "tpp_threshold_10_total_metric": 0.012450012564659118,
109
+ "tpp_threshold_10_intended_diff_only": 0.014600014686584473,
110
+ "tpp_threshold_10_unintended_diff_only": 0.002150002121925354,
111
+ "tpp_threshold_20_total_metric": 0.018750008940696717,
112
+ "tpp_threshold_20_intended_diff_only": 0.02100001573562622,
113
+ "tpp_threshold_20_unintended_diff_only": 0.0022500067949295043,
114
+ "tpp_threshold_50_total_metric": 0.03235000073909759,
115
+ "tpp_threshold_50_intended_diff_only": 0.03760000467300415,
116
+ "tpp_threshold_50_unintended_diff_only": 0.005250003933906555,
117
+ "tpp_threshold_100_total_metric": 0.05085000991821289,
118
+ "tpp_threshold_100_intended_diff_only": 0.05620001554489136,
119
+ "tpp_threshold_100_unintended_diff_only": 0.005350005626678467,
120
+ "tpp_threshold_500_total_metric": 0.1507999986410141,
121
+ "tpp_threshold_500_intended_diff_only": 0.15760000944137573,
122
+ "tpp_threshold_500_unintended_diff_only": 0.006800010800361633
123
+ },
124
+ {
125
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
126
+ "tpp_threshold_2_total_metric": 0.0011500060558319093,
127
+ "tpp_threshold_2_intended_diff_only": 0.004399991035461426,
128
+ "tpp_threshold_2_unintended_diff_only": 0.0032499849796295164,
129
+ "tpp_threshold_5_total_metric": 0.003200003504753113,
130
+ "tpp_threshold_5_intended_diff_only": 0.006199991703033448,
131
+ "tpp_threshold_5_unintended_diff_only": 0.0029999881982803343,
132
+ "tpp_threshold_10_total_metric": 0.007200002670288086,
133
+ "tpp_threshold_10_intended_diff_only": 0.012199997901916504,
134
+ "tpp_threshold_10_unintended_diff_only": 0.004999995231628418,
135
+ "tpp_threshold_20_total_metric": 0.009650012850761414,
136
+ "tpp_threshold_20_intended_diff_only": 0.015600001811981202,
137
+ "tpp_threshold_20_unintended_diff_only": 0.005949988961219788,
138
+ "tpp_threshold_50_total_metric": 0.02325000464916229,
139
+ "tpp_threshold_50_intended_diff_only": 0.02919999361038208,
140
+ "tpp_threshold_50_unintended_diff_only": 0.005949988961219788,
141
+ "tpp_threshold_100_total_metric": 0.03130000233650208,
142
+ "tpp_threshold_100_intended_diff_only": 0.04179998636245728,
143
+ "tpp_threshold_100_unintended_diff_only": 0.0104999840259552,
144
+ "tpp_threshold_500_total_metric": 0.09960000514984131,
145
+ "tpp_threshold_500_intended_diff_only": 0.11899999380111695,
146
+ "tpp_threshold_500_unintended_diff_only": 0.019399988651275634
147
+ }
148
+ ],
149
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
150
+ "sae_lens_id": "custom_sae",
151
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_20_peft_sae_from_scratch_95.0",
152
+ "sae_lens_version": "5.4.2",
153
+ "sae_cfg_dict": {
154
+ "model_name": "gemma-2-2b",
155
+ "d_in": 2304,
156
+ "d_sae": 65536,
157
+ "hook_layer": 12,
158
+ "hook_name": "blocks.12.hook_resid_post",
159
+ "context_size": null,
160
+ "hook_head_index": null,
161
+ "architecture": "topk",
162
+ "apply_b_dec_to_input": null,
163
+ "finetuning_scaling_factor": null,
164
+ "activation_fn_str": "",
165
+ "prepend_bos": true,
166
+ "normalize_activations": "none",
167
+ "dtype": "bfloat16",
168
+ "device": "",
169
+ "dataset_path": "",
170
+ "dataset_trust_remote_code": true,
171
+ "seqpos_slice": [
172
+ null
173
+ ],
174
+ "training_tokens": -100000,
175
+ "sae_lens_training_version": null,
176
+ "neuronpedia_id": null
177
+ },
178
+ "eval_result_unstructured": {
179
+ "LabHC/bias_in_bios_class_set1": {
180
+ "0": {
181
+ "tpp_threshold_2_total_metric": 0.01124998927116394,
182
+ "tpp_threshold_2_intended_diff_only": 0.013999998569488525,
183
+ "tpp_threshold_2_unintended_diff_only": 0.002750009298324585,
184
+ "tpp_threshold_5_total_metric": 0.014999985694885254,
185
+ "tpp_threshold_5_intended_diff_only": 0.018999993801116943,
186
+ "tpp_threshold_5_unintended_diff_only": 0.0040000081062316895,
187
+ "tpp_threshold_10_total_metric": 0.010249987244606018,
188
+ "tpp_threshold_10_intended_diff_only": 0.013999998569488525,
189
+ "tpp_threshold_10_unintended_diff_only": 0.0037500113248825073,
190
+ "tpp_threshold_20_total_metric": 0.021500006318092346,
191
+ "tpp_threshold_20_intended_diff_only": 0.023000001907348633,
192
+ "tpp_threshold_20_unintended_diff_only": 0.0014999955892562866,
193
+ "tpp_threshold_50_total_metric": 0.04150000214576721,
194
+ "tpp_threshold_50_intended_diff_only": 0.05500000715255737,
195
+ "tpp_threshold_50_unintended_diff_only": 0.013500005006790161,
196
+ "tpp_threshold_100_total_metric": 0.06099998950958252,
197
+ "tpp_threshold_100_intended_diff_only": 0.07400000095367432,
198
+ "tpp_threshold_100_unintended_diff_only": 0.013000011444091797,
199
+ "tpp_threshold_500_total_metric": 0.19425000250339508,
200
+ "tpp_threshold_500_intended_diff_only": 0.2070000171661377,
201
+ "tpp_threshold_500_unintended_diff_only": 0.012750014662742615
202
+ },
203
+ "1": {
204
+ "tpp_threshold_2_total_metric": 0.009250015020370483,
205
+ "tpp_threshold_2_intended_diff_only": 0.009000003337860107,
206
+ "tpp_threshold_2_unintended_diff_only": -0.000250011682510376,
207
+ "tpp_threshold_5_total_metric": 0.008250042796134949,
208
+ "tpp_threshold_5_intended_diff_only": 0.010000050067901611,
209
+ "tpp_threshold_5_unintended_diff_only": 0.0017500072717666626,
210
+ "tpp_threshold_10_total_metric": 0.006750047206878662,
211
+ "tpp_threshold_10_intended_diff_only": 0.006000041961669922,
212
+ "tpp_threshold_10_unintended_diff_only": -0.0007500052452087402,
213
+ "tpp_threshold_20_total_metric": 0.004000052809715271,
214
+ "tpp_threshold_20_intended_diff_only": 0.006000041961669922,
215
+ "tpp_threshold_20_unintended_diff_only": 0.001999989151954651,
216
+ "tpp_threshold_50_total_metric": 0.006000041961669922,
217
+ "tpp_threshold_50_intended_diff_only": 0.01100003719329834,
218
+ "tpp_threshold_50_unintended_diff_only": 0.004999995231628418,
219
+ "tpp_threshold_100_total_metric": 0.018000051379203796,
220
+ "tpp_threshold_100_intended_diff_only": 0.024000048637390137,
221
+ "tpp_threshold_100_unintended_diff_only": 0.00599999725818634,
222
+ "tpp_threshold_500_total_metric": 0.09325002133846283,
223
+ "tpp_threshold_500_intended_diff_only": 0.10000002384185791,
224
+ "tpp_threshold_500_unintended_diff_only": 0.006750002503395081
225
+ },
226
+ "2": {
227
+ "tpp_threshold_2_total_metric": 0.006749957799911499,
228
+ "tpp_threshold_2_intended_diff_only": 0.010999977588653564,
229
+ "tpp_threshold_2_unintended_diff_only": 0.004250019788742065,
230
+ "tpp_threshold_5_total_metric": 0.014499962329864502,
231
+ "tpp_threshold_5_intended_diff_only": 0.019999980926513672,
232
+ "tpp_threshold_5_unintended_diff_only": 0.00550001859664917,
233
+ "tpp_threshold_10_total_metric": 0.038000017404556274,
234
+ "tpp_threshold_10_intended_diff_only": 0.04000002145767212,
235
+ "tpp_threshold_10_unintended_diff_only": 0.0020000040531158447,
236
+ "tpp_threshold_20_total_metric": 0.047749996185302734,
237
+ "tpp_threshold_20_intended_diff_only": 0.050000011920928955,
238
+ "tpp_threshold_20_unintended_diff_only": 0.0022500157356262207,
239
+ "tpp_threshold_50_total_metric": 0.06199999153614044,
240
+ "tpp_threshold_50_intended_diff_only": 0.06400001049041748,
241
+ "tpp_threshold_50_unintended_diff_only": 0.0020000189542770386,
242
+ "tpp_threshold_100_total_metric": 0.08500000834465027,
243
+ "tpp_threshold_100_intended_diff_only": 0.08700001239776611,
244
+ "tpp_threshold_100_unintended_diff_only": 0.0020000040531158447,
245
+ "tpp_threshold_500_total_metric": 0.26524999737739563,
246
+ "tpp_threshold_500_intended_diff_only": 0.2670000195503235,
247
+ "tpp_threshold_500_unintended_diff_only": 0.0017500221729278564
248
+ },
249
+ "6": {
250
+ "tpp_threshold_2_total_metric": -0.0002500265836715698,
251
+ "tpp_threshold_2_intended_diff_only": 0.0009999871253967285,
252
+ "tpp_threshold_2_unintended_diff_only": 0.0012500137090682983,
253
+ "tpp_threshold_5_total_metric": 0.000250011682510376,
254
+ "tpp_threshold_5_intended_diff_only": -0.0009999871253967285,
255
+ "tpp_threshold_5_unintended_diff_only": -0.0012499988079071045,
256
+ "tpp_threshold_10_total_metric": 0.003000006079673767,
257
+ "tpp_threshold_10_intended_diff_only": 0.0040000081062316895,
258
+ "tpp_threshold_10_unintended_diff_only": 0.0010000020265579224,
259
+ "tpp_threshold_20_total_metric": 0.003250017762184143,
260
+ "tpp_threshold_20_intended_diff_only": 0.0020000338554382324,
261
+ "tpp_threshold_20_unintended_diff_only": -0.0012499839067459106,
262
+ "tpp_threshold_50_total_metric": 0.004250004887580872,
263
+ "tpp_threshold_50_intended_diff_only": 0.004999995231628418,
264
+ "tpp_threshold_50_unintended_diff_only": 0.0007499903440475464,
265
+ "tpp_threshold_100_total_metric": 0.011750027537345886,
266
+ "tpp_threshold_100_intended_diff_only": 0.01100003719329834,
267
+ "tpp_threshold_100_unintended_diff_only": -0.0007499903440475464,
268
+ "tpp_threshold_500_total_metric": 0.015999987721443176,
269
+ "tpp_threshold_500_intended_diff_only": 0.019999980926513672,
270
+ "tpp_threshold_500_unintended_diff_only": 0.003999993205070496
271
+ },
272
+ "9": {
273
+ "tpp_threshold_2_total_metric": 0.003999963402748108,
274
+ "tpp_threshold_2_intended_diff_only": 0.0059999823570251465,
275
+ "tpp_threshold_2_unintended_diff_only": 0.0020000189542770386,
276
+ "tpp_threshold_5_total_metric": 0.001749962568283081,
277
+ "tpp_threshold_5_intended_diff_only": 0.0059999823570251465,
278
+ "tpp_threshold_5_unintended_diff_only": 0.004250019788742065,
279
+ "tpp_threshold_10_total_metric": 0.004250004887580872,
280
+ "tpp_threshold_10_intended_diff_only": 0.009000003337860107,
281
+ "tpp_threshold_10_unintended_diff_only": 0.004749998450279236,
282
+ "tpp_threshold_20_total_metric": 0.017249971628189087,
283
+ "tpp_threshold_20_intended_diff_only": 0.02399998903274536,
284
+ "tpp_threshold_20_unintended_diff_only": 0.006750017404556274,
285
+ "tpp_threshold_50_total_metric": 0.04799996316432953,
286
+ "tpp_threshold_50_intended_diff_only": 0.05299997329711914,
287
+ "tpp_threshold_50_unintended_diff_only": 0.005000010132789612,
288
+ "tpp_threshold_100_total_metric": 0.07849997282028198,
289
+ "tpp_threshold_100_intended_diff_only": 0.08499997854232788,
290
+ "tpp_threshold_100_unintended_diff_only": 0.0065000057220458984,
291
+ "tpp_threshold_500_total_metric": 0.18524998426437378,
292
+ "tpp_threshold_500_intended_diff_only": 0.1940000057220459,
293
+ "tpp_threshold_500_unintended_diff_only": 0.00875002145767212
294
+ }
295
+ },
296
+ "canrager/amazon_reviews_mcauley_1and5": {
297
+ "1": {
298
+ "tpp_threshold_2_total_metric": 0.0062499940395355225,
299
+ "tpp_threshold_2_intended_diff_only": 0.010999977588653564,
300
+ "tpp_threshold_2_unintended_diff_only": 0.004749983549118042,
301
+ "tpp_threshold_5_total_metric": 0.0015000104904174805,
302
+ "tpp_threshold_5_intended_diff_only": 0.004999995231628418,
303
+ "tpp_threshold_5_unintended_diff_only": 0.0034999847412109375,
304
+ "tpp_threshold_10_total_metric": 0.0032499730587005615,
305
+ "tpp_threshold_10_intended_diff_only": 0.010999977588653564,
306
+ "tpp_threshold_10_unintended_diff_only": 0.007750004529953003,
307
+ "tpp_threshold_20_total_metric": 0.001749977469444275,
308
+ "tpp_threshold_20_intended_diff_only": 0.006999969482421875,
309
+ "tpp_threshold_20_unintended_diff_only": 0.0052499920129776,
310
+ "tpp_threshold_50_total_metric": 0.001499965786933899,
311
+ "tpp_threshold_50_intended_diff_only": 0.003999948501586914,
312
+ "tpp_threshold_50_unintended_diff_only": 0.002499982714653015,
313
+ "tpp_threshold_100_total_metric": 0.001749962568283081,
314
+ "tpp_threshold_100_intended_diff_only": 0.007999956607818604,
315
+ "tpp_threshold_100_unintended_diff_only": 0.0062499940395355225,
316
+ "tpp_threshold_500_total_metric": 0.027499958872795105,
317
+ "tpp_threshold_500_intended_diff_only": 0.030999958515167236,
318
+ "tpp_threshold_500_unintended_diff_only": 0.0034999996423721313
319
+ },
320
+ "2": {
321
+ "tpp_threshold_2_total_metric": 0.00475001335144043,
322
+ "tpp_threshold_2_intended_diff_only": 0.004999995231628418,
323
+ "tpp_threshold_2_unintended_diff_only": 0.0002499818801879883,
324
+ "tpp_threshold_5_total_metric": 0.001499965786933899,
325
+ "tpp_threshold_5_intended_diff_only": 0.007999956607818604,
326
+ "tpp_threshold_5_unintended_diff_only": 0.006499990820884705,
327
+ "tpp_threshold_10_total_metric": 0.008749961853027344,
328
+ "tpp_threshold_10_intended_diff_only": 0.011999964714050293,
329
+ "tpp_threshold_10_unintended_diff_only": 0.0032500028610229492,
330
+ "tpp_threshold_20_total_metric": 0.005750015377998352,
331
+ "tpp_threshold_20_intended_diff_only": 0.009000003337860107,
332
+ "tpp_threshold_20_unintended_diff_only": 0.0032499879598617554,
333
+ "tpp_threshold_50_total_metric": 0.012500002980232239,
334
+ "tpp_threshold_50_intended_diff_only": 0.018000006675720215,
335
+ "tpp_threshold_50_unintended_diff_only": 0.005500003695487976,
336
+ "tpp_threshold_100_total_metric": 0.019000008702278137,
337
+ "tpp_threshold_100_intended_diff_only": 0.02899998426437378,
338
+ "tpp_threshold_100_unintended_diff_only": 0.009999975562095642,
339
+ "tpp_threshold_500_total_metric": 0.08375000953674316,
340
+ "tpp_threshold_500_intended_diff_only": 0.09299999475479126,
341
+ "tpp_threshold_500_unintended_diff_only": 0.009249985218048096
342
+ },
343
+ "3": {
344
+ "tpp_threshold_2_total_metric": -0.00849999487400055,
345
+ "tpp_threshold_2_intended_diff_only": -0.004999995231628418,
346
+ "tpp_threshold_2_unintended_diff_only": 0.0034999996423721313,
347
+ "tpp_threshold_5_total_metric": 0.005000025033950806,
348
+ "tpp_threshold_5_intended_diff_only": 0.0040000081062316895,
349
+ "tpp_threshold_5_unintended_diff_only": -0.0010000169277191162,
350
+ "tpp_threshold_10_total_metric": 0.006750002503395081,
351
+ "tpp_threshold_10_intended_diff_only": 0.009999990463256836,
352
+ "tpp_threshold_10_unintended_diff_only": 0.0032499879598617554,
353
+ "tpp_threshold_20_total_metric": 0.003250017762184143,
354
+ "tpp_threshold_20_intended_diff_only": 0.009000003337860107,
355
+ "tpp_threshold_20_unintended_diff_only": 0.005749985575675964,
356
+ "tpp_threshold_50_total_metric": 0.016000017523765564,
357
+ "tpp_threshold_50_intended_diff_only": 0.018000006675720215,
358
+ "tpp_threshold_50_unintended_diff_only": 0.001999989151954651,
359
+ "tpp_threshold_100_total_metric": 0.023000001907348633,
360
+ "tpp_threshold_100_intended_diff_only": 0.03299999237060547,
361
+ "tpp_threshold_100_unintended_diff_only": 0.009999990463256836,
362
+ "tpp_threshold_500_total_metric": 0.0832500159740448,
363
+ "tpp_threshold_500_intended_diff_only": 0.09700000286102295,
364
+ "tpp_threshold_500_unintended_diff_only": 0.01374998688697815
365
+ },
366
+ "5": {
367
+ "tpp_threshold_2_total_metric": -0.004999950528144836,
368
+ "tpp_threshold_2_intended_diff_only": -0.001999974250793457,
369
+ "tpp_threshold_2_unintended_diff_only": 0.0029999762773513794,
370
+ "tpp_threshold_5_total_metric": -0.0009999871253967285,
371
+ "tpp_threshold_5_intended_diff_only": 0.004999995231628418,
372
+ "tpp_threshold_5_unintended_diff_only": 0.0059999823570251465,
373
+ "tpp_threshold_10_total_metric": -0.0074999332427978516,
374
+ "tpp_threshold_10_intended_diff_only": 0.001000046730041504,
375
+ "tpp_threshold_10_unintended_diff_only": 0.008499979972839355,
376
+ "tpp_threshold_20_total_metric": 0.0055000633001327515,
377
+ "tpp_threshold_20_intended_diff_only": 0.01500004529953003,
378
+ "tpp_threshold_20_unintended_diff_only": 0.009499981999397278,
379
+ "tpp_threshold_50_total_metric": 0.03925004601478577,
380
+ "tpp_threshold_50_intended_diff_only": 0.04900002479553223,
381
+ "tpp_threshold_50_unintended_diff_only": 0.00974997878074646,
382
+ "tpp_threshold_100_total_metric": 0.04725003242492676,
383
+ "tpp_threshold_100_intended_diff_only": 0.06400001049041748,
384
+ "tpp_threshold_100_unintended_diff_only": 0.016749978065490723,
385
+ "tpp_threshold_500_total_metric": 0.08625003695487976,
386
+ "tpp_threshold_500_intended_diff_only": 0.14600002765655518,
387
+ "tpp_threshold_500_unintended_diff_only": 0.059749990701675415
388
+ },
389
+ "6": {
390
+ "tpp_threshold_2_total_metric": 0.00824996829032898,
391
+ "tpp_threshold_2_intended_diff_only": 0.012999951839447021,
392
+ "tpp_threshold_2_unintended_diff_only": 0.004749983549118042,
393
+ "tpp_threshold_5_total_metric": 0.009000003337860107,
394
+ "tpp_threshold_5_intended_diff_only": 0.009000003337860107,
395
+ "tpp_threshold_5_unintended_diff_only": 0.0,
396
+ "tpp_threshold_10_total_metric": 0.024750009179115295,
397
+ "tpp_threshold_10_intended_diff_only": 0.027000010013580322,
398
+ "tpp_threshold_10_unintended_diff_only": 0.002250000834465027,
399
+ "tpp_threshold_20_total_metric": 0.031999990344047546,
400
+ "tpp_threshold_20_intended_diff_only": 0.03799998760223389,
401
+ "tpp_threshold_20_unintended_diff_only": 0.00599999725818634,
402
+ "tpp_threshold_50_total_metric": 0.046999990940093994,
403
+ "tpp_threshold_50_intended_diff_only": 0.05699998140335083,
404
+ "tpp_threshold_50_unintended_diff_only": 0.009999990463256836,
405
+ "tpp_threshold_100_total_metric": 0.06550000607967377,
406
+ "tpp_threshold_100_intended_diff_only": 0.07499998807907104,
407
+ "tpp_threshold_100_unintended_diff_only": 0.009499981999397278,
408
+ "tpp_threshold_500_total_metric": 0.2172500044107437,
409
+ "tpp_threshold_500_intended_diff_only": 0.2279999852180481,
410
+ "tpp_threshold_500_unintended_diff_only": 0.010749980807304382
411
+ }
412
+ }
413
+ }
414
+ }
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "tpp",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": false,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "6d523acd-6837-4bf6-8169-4ebea2aedf9e",
73
+ "datetime_epoch_millis": 1740163747509,
74
+ "eval_result_metrics": {
75
+ "tpp_metrics": {
76
+ "tpp_threshold_2_total_metric": 0.001275007426738739,
77
+ "tpp_threshold_2_intended_diff_only": 0.0037000060081481935,
78
+ "tpp_threshold_2_unintended_diff_only": 0.0024249985814094547,
79
+ "tpp_threshold_5_total_metric": 0.0007249891757965087,
80
+ "tpp_threshold_5_intended_diff_only": 0.0034999907016754154,
81
+ "tpp_threshold_5_unintended_diff_only": 0.0027750015258789064,
82
+ "tpp_threshold_10_total_metric": 0.004174999892711639,
83
+ "tpp_threshold_10_intended_diff_only": 0.0078000009059906,
84
+ "tpp_threshold_10_unintended_diff_only": 0.0036250010132789614,
85
+ "tpp_threshold_20_total_metric": 0.0060749977827072145,
86
+ "tpp_threshold_20_intended_diff_only": 0.009899997711181642,
87
+ "tpp_threshold_20_unintended_diff_only": 0.003824999928474426,
88
+ "tpp_threshold_50_total_metric": 0.016099993884563447,
89
+ "tpp_threshold_50_intended_diff_only": 0.022099995613098146,
90
+ "tpp_threshold_50_unintended_diff_only": 0.006000001728534699,
91
+ "tpp_threshold_100_total_metric": 0.02952500283718109,
92
+ "tpp_threshold_100_intended_diff_only": 0.039000004529953,
93
+ "tpp_threshold_100_unintended_diff_only": 0.009475001692771911,
94
+ "tpp_threshold_500_total_metric": 0.12557500153779982,
95
+ "tpp_threshold_500_intended_diff_only": 0.1384999990463257,
96
+ "tpp_threshold_500_unintended_diff_only": 0.012924997508525847
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
102
+ "tpp_threshold_2_total_metric": 0.0031000018119812013,
103
+ "tpp_threshold_2_intended_diff_only": 0.0048000097274780275,
104
+ "tpp_threshold_2_unintended_diff_only": 0.0017000079154968263,
105
+ "tpp_threshold_5_total_metric": 0.0022999852895736693,
106
+ "tpp_threshold_5_intended_diff_only": 0.004799997806549073,
107
+ "tpp_threshold_5_unintended_diff_only": 0.002500012516975403,
108
+ "tpp_threshold_10_total_metric": 0.006099998950958252,
109
+ "tpp_threshold_10_intended_diff_only": 0.008200013637542724,
110
+ "tpp_threshold_10_unintended_diff_only": 0.0021000146865844727,
111
+ "tpp_threshold_20_total_metric": 0.01099998652935028,
112
+ "tpp_threshold_20_intended_diff_only": 0.013199996948242188,
113
+ "tpp_threshold_20_unintended_diff_only": 0.002200010418891907,
114
+ "tpp_threshold_50_total_metric": 0.0169999897480011,
115
+ "tpp_threshold_50_intended_diff_only": 0.020399999618530274,
116
+ "tpp_threshold_50_unintended_diff_only": 0.0034000098705291746,
117
+ "tpp_threshold_100_total_metric": 0.03155000209808349,
118
+ "tpp_threshold_100_intended_diff_only": 0.035600018501281736,
119
+ "tpp_threshold_100_unintended_diff_only": 0.004050016403198242,
120
+ "tpp_threshold_500_total_metric": 0.12450000941753388,
121
+ "tpp_threshold_500_intended_diff_only": 0.13040001392364503,
122
+ "tpp_threshold_500_unintended_diff_only": 0.005900004506111145
123
+ },
124
+ {
125
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
126
+ "tpp_threshold_2_total_metric": -0.0005499869585037231,
127
+ "tpp_threshold_2_intended_diff_only": 0.0026000022888183595,
128
+ "tpp_threshold_2_unintended_diff_only": 0.0031499892473220827,
129
+ "tpp_threshold_5_total_metric": -0.0008500069379806519,
130
+ "tpp_threshold_5_intended_diff_only": 0.0021999835968017577,
131
+ "tpp_threshold_5_unintended_diff_only": 0.0030499905347824096,
132
+ "tpp_threshold_10_total_metric": 0.002250000834465027,
133
+ "tpp_threshold_10_intended_diff_only": 0.007399988174438476,
134
+ "tpp_threshold_10_unintended_diff_only": 0.0051499873399734495,
135
+ "tpp_threshold_20_total_metric": 0.001150009036064148,
136
+ "tpp_threshold_20_intended_diff_only": 0.006599998474121094,
137
+ "tpp_threshold_20_unintended_diff_only": 0.0054499894380569455,
138
+ "tpp_threshold_50_total_metric": 0.015199998021125793,
139
+ "tpp_threshold_50_intended_diff_only": 0.023799991607666014,
140
+ "tpp_threshold_50_unintended_diff_only": 0.008599993586540223,
141
+ "tpp_threshold_100_total_metric": 0.027500003576278687,
142
+ "tpp_threshold_100_intended_diff_only": 0.04239999055862427,
143
+ "tpp_threshold_100_unintended_diff_only": 0.01489998698234558,
144
+ "tpp_threshold_500_total_metric": 0.12664999365806578,
145
+ "tpp_threshold_500_intended_diff_only": 0.14659998416900635,
146
+ "tpp_threshold_500_unintended_diff_only": 0.01994999051094055
147
+ }
148
+ ],
149
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
150
+ "sae_lens_id": "custom_sae",
151
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_0.0",
152
+ "sae_lens_version": "5.4.2",
153
+ "sae_cfg_dict": {
154
+ "model_name": "gemma-2-2b",
155
+ "d_in": 2304,
156
+ "d_sae": 65536,
157
+ "hook_layer": 12,
158
+ "hook_name": "blocks.12.hook_resid_post",
159
+ "context_size": null,
160
+ "hook_head_index": null,
161
+ "architecture": "topk",
162
+ "apply_b_dec_to_input": null,
163
+ "finetuning_scaling_factor": null,
164
+ "activation_fn_str": "",
165
+ "prepend_bos": true,
166
+ "normalize_activations": "none",
167
+ "dtype": "bfloat16",
168
+ "device": "",
169
+ "dataset_path": "",
170
+ "dataset_trust_remote_code": true,
171
+ "seqpos_slice": [
172
+ null
173
+ ],
174
+ "training_tokens": -100000,
175
+ "sae_lens_training_version": null,
176
+ "neuronpedia_id": null
177
+ },
178
+ "eval_result_unstructured": {
179
+ "LabHC/bias_in_bios_class_set1": {
180
+ "0": {
181
+ "tpp_threshold_2_total_metric": 0.007249996066093445,
182
+ "tpp_threshold_2_intended_diff_only": 0.009000003337860107,
183
+ "tpp_threshold_2_unintended_diff_only": 0.0017500072717666626,
184
+ "tpp_threshold_5_total_metric": 0.00925000011920929,
185
+ "tpp_threshold_5_intended_diff_only": 0.012000024318695068,
186
+ "tpp_threshold_5_unintended_diff_only": 0.002750024199485779,
187
+ "tpp_threshold_10_total_metric": 0.003000006079673767,
188
+ "tpp_threshold_10_intended_diff_only": 0.00700002908706665,
189
+ "tpp_threshold_10_unintended_diff_only": 0.004000023007392883,
190
+ "tpp_threshold_20_total_metric": 0.0169999897480011,
191
+ "tpp_threshold_20_intended_diff_only": 0.018999993801116943,
192
+ "tpp_threshold_20_unintended_diff_only": 0.0020000040531158447,
193
+ "tpp_threshold_50_total_metric": 0.028999969363212585,
194
+ "tpp_threshold_50_intended_diff_only": 0.03299999237060547,
195
+ "tpp_threshold_50_unintended_diff_only": 0.004000023007392883,
196
+ "tpp_threshold_100_total_metric": 0.05250002443790436,
197
+ "tpp_threshold_100_intended_diff_only": 0.058000028133392334,
198
+ "tpp_threshold_100_unintended_diff_only": 0.005500003695487976,
199
+ "tpp_threshold_500_total_metric": 0.23200003802776337,
200
+ "tpp_threshold_500_intended_diff_only": 0.23700004816055298,
201
+ "tpp_threshold_500_unintended_diff_only": 0.005000010132789612
202
+ },
203
+ "1": {
204
+ "tpp_threshold_2_total_metric": 0.007750034332275391,
205
+ "tpp_threshold_2_intended_diff_only": 0.00700002908706665,
206
+ "tpp_threshold_2_unintended_diff_only": -0.0007500052452087402,
207
+ "tpp_threshold_5_total_metric": 0.0015000104904174805,
208
+ "tpp_threshold_5_intended_diff_only": 0.003000020980834961,
209
+ "tpp_threshold_5_unintended_diff_only": 0.0015000104904174805,
210
+ "tpp_threshold_10_total_metric": 0.006750032305717468,
211
+ "tpp_threshold_10_intended_diff_only": 0.006000041961669922,
212
+ "tpp_threshold_10_unintended_diff_only": -0.0007499903440475464,
213
+ "tpp_threshold_20_total_metric": -0.0002499818801879883,
214
+ "tpp_threshold_20_intended_diff_only": 0.003000020980834961,
215
+ "tpp_threshold_20_unintended_diff_only": 0.0032500028610229492,
216
+ "tpp_threshold_50_total_metric": 0.002750024199485779,
217
+ "tpp_threshold_50_intended_diff_only": 0.00700002908706665,
218
+ "tpp_threshold_50_unintended_diff_only": 0.004250004887580872,
219
+ "tpp_threshold_100_total_metric": 0.0017500221729278564,
220
+ "tpp_threshold_100_intended_diff_only": 0.006000041961669922,
221
+ "tpp_threshold_100_unintended_diff_only": 0.004250019788742065,
222
+ "tpp_threshold_500_total_metric": 0.059500038623809814,
223
+ "tpp_threshold_500_intended_diff_only": 0.06700003147125244,
224
+ "tpp_threshold_500_unintended_diff_only": 0.007499992847442627
225
+ },
226
+ "2": {
227
+ "tpp_threshold_2_total_metric": -0.002750009298324585,
228
+ "tpp_threshold_2_intended_diff_only": 0.0,
229
+ "tpp_threshold_2_unintended_diff_only": 0.002750009298324585,
230
+ "tpp_threshold_5_total_metric": -0.003250032663345337,
231
+ "tpp_threshold_5_intended_diff_only": 0.0009999871253967285,
232
+ "tpp_threshold_5_unintended_diff_only": 0.004250019788742065,
233
+ "tpp_threshold_10_total_metric": 0.013499975204467773,
234
+ "tpp_threshold_10_intended_diff_only": 0.014999985694885254,
235
+ "tpp_threshold_10_unintended_diff_only": 0.0015000104904174805,
236
+ "tpp_threshold_20_total_metric": 0.02374996244907379,
237
+ "tpp_threshold_20_intended_diff_only": 0.02499997615814209,
238
+ "tpp_threshold_20_unintended_diff_only": 0.0012500137090682983,
239
+ "tpp_threshold_50_total_metric": 0.022499963641166687,
240
+ "tpp_threshold_50_intended_diff_only": 0.02499997615814209,
241
+ "tpp_threshold_50_unintended_diff_only": 0.002500012516975403,
242
+ "tpp_threshold_100_total_metric": 0.03275001049041748,
243
+ "tpp_threshold_100_intended_diff_only": 0.0350000262260437,
244
+ "tpp_threshold_100_unintended_diff_only": 0.0022500157356262207,
245
+ "tpp_threshold_500_total_metric": 0.148250013589859,
246
+ "tpp_threshold_500_intended_diff_only": 0.15200001001358032,
247
+ "tpp_threshold_500_unintended_diff_only": 0.0037499964237213135
248
+ },
249
+ "6": {
250
+ "tpp_threshold_2_total_metric": 1.4901161193847656e-08,
251
+ "tpp_threshold_2_intended_diff_only": 0.0020000338554382324,
252
+ "tpp_threshold_2_unintended_diff_only": 0.0020000189542770386,
253
+ "tpp_threshold_5_total_metric": 0.001999989151954651,
254
+ "tpp_threshold_5_intended_diff_only": 0.0009999871253967285,
255
+ "tpp_threshold_5_unintended_diff_only": -0.0010000020265579224,
256
+ "tpp_threshold_10_total_metric": 0.002749994397163391,
257
+ "tpp_threshold_10_intended_diff_only": 0.0040000081062316895,
258
+ "tpp_threshold_10_unintended_diff_only": 0.0012500137090682983,
259
+ "tpp_threshold_20_total_metric": 0.003250017762184143,
260
+ "tpp_threshold_20_intended_diff_only": 0.0020000338554382324,
261
+ "tpp_threshold_20_unintended_diff_only": -0.0012499839067459106,
262
+ "tpp_threshold_50_total_metric": 0.003000006079673767,
263
+ "tpp_threshold_50_intended_diff_only": 0.0040000081062316895,
264
+ "tpp_threshold_50_unintended_diff_only": 0.0010000020265579224,
265
+ "tpp_threshold_100_total_metric": 0.0034999847412109375,
266
+ "tpp_threshold_100_intended_diff_only": 0.0040000081062316895,
267
+ "tpp_threshold_100_unintended_diff_only": 0.000500023365020752,
268
+ "tpp_threshold_500_total_metric": 0.012749999761581421,
269
+ "tpp_threshold_500_intended_diff_only": 0.018000006675720215,
270
+ "tpp_threshold_500_unintended_diff_only": 0.005250006914138794
271
+ },
272
+ "9": {
273
+ "tpp_threshold_2_total_metric": 0.0032499730587005615,
274
+ "tpp_threshold_2_intended_diff_only": 0.0059999823570251465,
275
+ "tpp_threshold_2_unintended_diff_only": 0.002750009298324585,
276
+ "tpp_threshold_5_total_metric": 0.001999959349632263,
277
+ "tpp_threshold_5_intended_diff_only": 0.006999969482421875,
278
+ "tpp_threshold_5_unintended_diff_only": 0.005000010132789612,
279
+ "tpp_threshold_10_total_metric": 0.00449998676776886,
280
+ "tpp_threshold_10_intended_diff_only": 0.009000003337860107,
281
+ "tpp_threshold_10_unintended_diff_only": 0.0045000165700912476,
282
+ "tpp_threshold_20_total_metric": 0.011249944567680359,
283
+ "tpp_threshold_20_intended_diff_only": 0.01699995994567871,
284
+ "tpp_threshold_20_unintended_diff_only": 0.005750015377998352,
285
+ "tpp_threshold_50_total_metric": 0.027749985456466675,
286
+ "tpp_threshold_50_intended_diff_only": 0.03299999237060547,
287
+ "tpp_threshold_50_unintended_diff_only": 0.005250006914138794,
288
+ "tpp_threshold_100_total_metric": 0.06724996864795685,
289
+ "tpp_threshold_100_intended_diff_only": 0.07499998807907104,
290
+ "tpp_threshold_100_unintended_diff_only": 0.007750019431114197,
291
+ "tpp_threshold_500_total_metric": 0.16999995708465576,
292
+ "tpp_threshold_500_intended_diff_only": 0.17799997329711914,
293
+ "tpp_threshold_500_unintended_diff_only": 0.008000016212463379
294
+ }
295
+ },
296
+ "canrager/amazon_reviews_mcauley_1and5": {
297
+ "1": {
298
+ "tpp_threshold_2_total_metric": 0.00599999725818634,
299
+ "tpp_threshold_2_intended_diff_only": 0.009999990463256836,
300
+ "tpp_threshold_2_unintended_diff_only": 0.003999993205070496,
301
+ "tpp_threshold_5_total_metric": 0.003999963402748108,
302
+ "tpp_threshold_5_intended_diff_only": 0.006999969482421875,
303
+ "tpp_threshold_5_unintended_diff_only": 0.003000006079673767,
304
+ "tpp_threshold_10_total_metric": 0.003000006079673767,
305
+ "tpp_threshold_10_intended_diff_only": 0.009999990463256836,
306
+ "tpp_threshold_10_unintended_diff_only": 0.006999984383583069,
307
+ "tpp_threshold_20_total_metric": -0.0025000274181365967,
308
+ "tpp_threshold_20_intended_diff_only": 0.0029999613761901855,
309
+ "tpp_threshold_20_unintended_diff_only": 0.005499988794326782,
310
+ "tpp_threshold_50_total_metric": 0.00349995493888855,
311
+ "tpp_threshold_50_intended_diff_only": 0.003999948501586914,
312
+ "tpp_threshold_50_unintended_diff_only": 0.0004999935626983643,
313
+ "tpp_threshold_100_total_metric": 0.0020000189542770386,
314
+ "tpp_threshold_100_intended_diff_only": 0.009000003337860107,
315
+ "tpp_threshold_100_unintended_diff_only": 0.006999984383583069,
316
+ "tpp_threshold_500_total_metric": 0.022749975323677063,
317
+ "tpp_threshold_500_intended_diff_only": 0.042999982833862305,
318
+ "tpp_threshold_500_unintended_diff_only": 0.02025000751018524
319
+ },
320
+ "2": {
321
+ "tpp_threshold_2_total_metric": -0.0004999935626983643,
322
+ "tpp_threshold_2_intended_diff_only": 0.0,
323
+ "tpp_threshold_2_unintended_diff_only": 0.0004999935626983643,
324
+ "tpp_threshold_5_total_metric": -0.005500033497810364,
325
+ "tpp_threshold_5_intended_diff_only": 0.0029999613761901855,
326
+ "tpp_threshold_5_unintended_diff_only": 0.00849999487400055,
327
+ "tpp_threshold_10_total_metric": 0.0014999955892562866,
328
+ "tpp_threshold_10_intended_diff_only": 0.006999969482421875,
329
+ "tpp_threshold_10_unintended_diff_only": 0.005499973893165588,
330
+ "tpp_threshold_20_total_metric": -0.0009999722242355347,
331
+ "tpp_threshold_20_intended_diff_only": 0.0040000081062316895,
332
+ "tpp_threshold_20_unintended_diff_only": 0.004999980330467224,
333
+ "tpp_threshold_50_total_metric": 0.008999988436698914,
334
+ "tpp_threshold_50_intended_diff_only": 0.014999985694885254,
335
+ "tpp_threshold_50_unintended_diff_only": 0.00599999725818634,
336
+ "tpp_threshold_100_total_metric": 0.01150001585483551,
337
+ "tpp_threshold_100_intended_diff_only": 0.023000001907348633,
338
+ "tpp_threshold_100_unintended_diff_only": 0.011499986052513123,
339
+ "tpp_threshold_500_total_metric": 0.08924996852874756,
340
+ "tpp_threshold_500_intended_diff_only": 0.10499995946884155,
341
+ "tpp_threshold_500_unintended_diff_only": 0.015749990940093994
342
+ },
343
+ "3": {
344
+ "tpp_threshold_2_total_metric": -0.009249970316886902,
345
+ "tpp_threshold_2_intended_diff_only": -0.0059999823570251465,
346
+ "tpp_threshold_2_unintended_diff_only": 0.0032499879598617554,
347
+ "tpp_threshold_5_total_metric": -0.004499971866607666,
348
+ "tpp_threshold_5_intended_diff_only": -0.004999995231628418,
349
+ "tpp_threshold_5_unintended_diff_only": -0.000500023365020752,
350
+ "tpp_threshold_10_total_metric": 0.004249989986419678,
351
+ "tpp_threshold_10_intended_diff_only": 0.0059999823570251465,
352
+ "tpp_threshold_10_unintended_diff_only": 0.0017499923706054688,
353
+ "tpp_threshold_20_total_metric": -0.005499973893165588,
354
+ "tpp_threshold_20_intended_diff_only": -0.0009999871253967285,
355
+ "tpp_threshold_20_unintended_diff_only": 0.00449998676776886,
356
+ "tpp_threshold_50_total_metric": 0.013749971985816956,
357
+ "tpp_threshold_50_intended_diff_only": 0.034999966621398926,
358
+ "tpp_threshold_50_unintended_diff_only": 0.02124999463558197,
359
+ "tpp_threshold_100_total_metric": 0.029999971389770508,
360
+ "tpp_threshold_100_intended_diff_only": 0.05799996852874756,
361
+ "tpp_threshold_100_unintended_diff_only": 0.02799999713897705,
362
+ "tpp_threshold_500_total_metric": 0.10924999415874481,
363
+ "tpp_threshold_500_intended_diff_only": 0.13599997758865356,
364
+ "tpp_threshold_500_unintended_diff_only": 0.026749983429908752
365
+ },
366
+ "5": {
367
+ "tpp_threshold_2_total_metric": -0.005499929189682007,
368
+ "tpp_threshold_2_intended_diff_only": -0.0029999613761901855,
369
+ "tpp_threshold_2_unintended_diff_only": 0.0024999678134918213,
370
+ "tpp_threshold_5_total_metric": -0.006249964237213135,
371
+ "tpp_threshold_5_intended_diff_only": -0.0009999871253967285,
372
+ "tpp_threshold_5_unintended_diff_only": 0.005249977111816406,
373
+ "tpp_threshold_10_total_metric": -0.009999975562095642,
374
+ "tpp_threshold_10_intended_diff_only": -0.0009999871253967285,
375
+ "tpp_threshold_10_unintended_diff_only": 0.008999988436698914,
376
+ "tpp_threshold_20_total_metric": -0.006999969482421875,
377
+ "tpp_threshold_20_intended_diff_only": 0.003000020980834961,
378
+ "tpp_threshold_20_unintended_diff_only": 0.009999990463256836,
379
+ "tpp_threshold_50_total_metric": 0.01650005578994751,
380
+ "tpp_threshold_50_intended_diff_only": 0.024000048637390137,
381
+ "tpp_threshold_50_unintended_diff_only": 0.007499992847442627,
382
+ "tpp_threshold_100_total_metric": 0.04000003635883331,
383
+ "tpp_threshold_100_intended_diff_only": 0.06300002336502075,
384
+ "tpp_threshold_100_unintended_diff_only": 0.02299998700618744,
385
+ "tpp_threshold_500_total_metric": 0.16475005447864532,
386
+ "tpp_threshold_500_intended_diff_only": 0.19200003147125244,
387
+ "tpp_threshold_500_unintended_diff_only": 0.027249976992607117
388
+ },
389
+ "6": {
390
+ "tpp_threshold_2_total_metric": 0.006499961018562317,
391
+ "tpp_threshold_2_intended_diff_only": 0.011999964714050293,
392
+ "tpp_threshold_2_unintended_diff_only": 0.005500003695487976,
393
+ "tpp_threshold_5_total_metric": 0.007999971508979797,
394
+ "tpp_threshold_5_intended_diff_only": 0.006999969482421875,
395
+ "tpp_threshold_5_unintended_diff_only": -0.0010000020265579224,
396
+ "tpp_threshold_10_total_metric": 0.012499988079071045,
397
+ "tpp_threshold_10_intended_diff_only": 0.014999985694885254,
398
+ "tpp_threshold_10_unintended_diff_only": 0.002499997615814209,
399
+ "tpp_threshold_20_total_metric": 0.021749988198280334,
400
+ "tpp_threshold_20_intended_diff_only": 0.02399998903274536,
401
+ "tpp_threshold_20_unintended_diff_only": 0.002250000834465027,
402
+ "tpp_threshold_50_total_metric": 0.03325001895427704,
403
+ "tpp_threshold_50_intended_diff_only": 0.04100000858306885,
404
+ "tpp_threshold_50_unintended_diff_only": 0.007749989628791809,
405
+ "tpp_threshold_100_total_metric": 0.05399997532367706,
406
+ "tpp_threshold_100_intended_diff_only": 0.05899995565414429,
407
+ "tpp_threshold_100_unintended_diff_only": 0.004999980330467224,
408
+ "tpp_threshold_500_total_metric": 0.24724997580051422,
409
+ "tpp_threshold_500_intended_diff_only": 0.2569999694824219,
410
+ "tpp_threshold_500_unintended_diff_only": 0.009749993681907654
411
+ }
412
+ }
413
+ }
414
+ }
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "tpp",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": false,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "16072d61-c7c6-4047-91dd-5fff05bf32c4",
73
+ "datetime_epoch_millis": 1740163589499,
74
+ "eval_result_metrics": {
75
+ "tpp_metrics": {
76
+ "tpp_threshold_2_total_metric": 0.0028499945998191833,
77
+ "tpp_threshold_2_intended_diff_only": 0.00549999475479126,
78
+ "tpp_threshold_2_unintended_diff_only": 0.0026500001549720764,
79
+ "tpp_threshold_5_total_metric": 0.004700003564357758,
80
+ "tpp_threshold_5_intended_diff_only": 0.0076000034809112545,
81
+ "tpp_threshold_5_unintended_diff_only": 0.0028999999165534975,
82
+ "tpp_threshold_10_total_metric": 0.01082499623298645,
83
+ "tpp_threshold_10_intended_diff_only": 0.014699995517730713,
84
+ "tpp_threshold_10_unintended_diff_only": 0.0038749992847442625,
85
+ "tpp_threshold_20_total_metric": 0.017949993908405303,
86
+ "tpp_threshold_20_intended_diff_only": 0.02199999690055847,
87
+ "tpp_threshold_20_unintended_diff_only": 0.004050002992153167,
88
+ "tpp_threshold_50_total_metric": 0.03577501326799393,
89
+ "tpp_threshold_50_intended_diff_only": 0.04050000905990601,
90
+ "tpp_threshold_50_unintended_diff_only": 0.0047249957919120785,
91
+ "tpp_threshold_100_total_metric": 0.06557500511407852,
92
+ "tpp_threshold_100_intended_diff_only": 0.07450000643730165,
93
+ "tpp_threshold_100_unintended_diff_only": 0.008925001323223113,
94
+ "tpp_threshold_500_total_metric": 0.21700000911951065,
95
+ "tpp_threshold_500_intended_diff_only": 0.23130001425743102,
96
+ "tpp_threshold_500_unintended_diff_only": 0.01430000513792038
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
102
+ "tpp_threshold_2_total_metric": 0.003350001573562622,
103
+ "tpp_threshold_2_intended_diff_only": 0.005600011348724366,
104
+ "tpp_threshold_2_unintended_diff_only": 0.0022500097751617433,
105
+ "tpp_threshold_5_total_metric": 0.006550008058547973,
106
+ "tpp_threshold_5_intended_diff_only": 0.009600019454956055,
107
+ "tpp_threshold_5_unintended_diff_only": 0.003050011396408081,
108
+ "tpp_threshold_10_total_metric": 0.01145000457763672,
109
+ "tpp_threshold_10_intended_diff_only": 0.014200007915496827,
110
+ "tpp_threshold_10_unintended_diff_only": 0.0027500033378601075,
111
+ "tpp_threshold_20_total_metric": 0.01974998414516449,
112
+ "tpp_threshold_20_intended_diff_only": 0.02239999771118164,
113
+ "tpp_threshold_20_unintended_diff_only": 0.002650013566017151,
114
+ "tpp_threshold_50_total_metric": 0.03780001997947693,
115
+ "tpp_threshold_50_intended_diff_only": 0.04080002307891846,
116
+ "tpp_threshold_50_unintended_diff_only": 0.003000003099441528,
117
+ "tpp_threshold_100_total_metric": 0.06720000505447388,
118
+ "tpp_threshold_100_intended_diff_only": 0.07420001029968262,
119
+ "tpp_threshold_100_unintended_diff_only": 0.007000005245208741,
120
+ "tpp_threshold_500_total_metric": 0.24940000772476195,
121
+ "tpp_threshold_500_intended_diff_only": 0.26020002365112305,
122
+ "tpp_threshold_500_unintended_diff_only": 0.010800015926361085
123
+ },
124
+ {
125
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
126
+ "tpp_threshold_2_total_metric": 0.0023499876260757446,
127
+ "tpp_threshold_2_intended_diff_only": 0.005399978160858155,
128
+ "tpp_threshold_2_unintended_diff_only": 0.0030499905347824096,
129
+ "tpp_threshold_5_total_metric": 0.0028499990701675417,
130
+ "tpp_threshold_5_intended_diff_only": 0.005599987506866455,
131
+ "tpp_threshold_5_unintended_diff_only": 0.0027499884366989137,
132
+ "tpp_threshold_10_total_metric": 0.010199987888336181,
133
+ "tpp_threshold_10_intended_diff_only": 0.0151999831199646,
134
+ "tpp_threshold_10_unintended_diff_only": 0.004999995231628418,
135
+ "tpp_threshold_20_total_metric": 0.016150003671646117,
136
+ "tpp_threshold_20_intended_diff_only": 0.021599996089935302,
137
+ "tpp_threshold_20_unintended_diff_only": 0.005449992418289184,
138
+ "tpp_threshold_50_total_metric": 0.03375000655651093,
139
+ "tpp_threshold_50_intended_diff_only": 0.040199995040893555,
140
+ "tpp_threshold_50_unintended_diff_only": 0.00644998848438263,
141
+ "tpp_threshold_100_total_metric": 0.06395000517368317,
142
+ "tpp_threshold_100_intended_diff_only": 0.07480000257492066,
143
+ "tpp_threshold_100_unintended_diff_only": 0.010849997401237488,
144
+ "tpp_threshold_500_total_metric": 0.18460001051425934,
145
+ "tpp_threshold_500_intended_diff_only": 0.20240000486373902,
146
+ "tpp_threshold_500_unintended_diff_only": 0.017799994349479674
147
+ }
148
+ ],
149
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
150
+ "sae_lens_id": "custom_sae",
151
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_40_peft_sae_from_scratch_95.0",
152
+ "sae_lens_version": "5.4.2",
153
+ "sae_cfg_dict": {
154
+ "model_name": "gemma-2-2b",
155
+ "d_in": 2304,
156
+ "d_sae": 65536,
157
+ "hook_layer": 12,
158
+ "hook_name": "blocks.12.hook_resid_post",
159
+ "context_size": null,
160
+ "hook_head_index": null,
161
+ "architecture": "topk",
162
+ "apply_b_dec_to_input": null,
163
+ "finetuning_scaling_factor": null,
164
+ "activation_fn_str": "",
165
+ "prepend_bos": true,
166
+ "normalize_activations": "none",
167
+ "dtype": "bfloat16",
168
+ "device": "",
169
+ "dataset_path": "",
170
+ "dataset_trust_remote_code": true,
171
+ "seqpos_slice": [
172
+ null
173
+ ],
174
+ "training_tokens": -100000,
175
+ "sae_lens_training_version": null,
176
+ "neuronpedia_id": null
177
+ },
178
+ "eval_result_unstructured": {
179
+ "LabHC/bias_in_bios_class_set1": {
180
+ "0": {
181
+ "tpp_threshold_2_total_metric": 0.0072500258684158325,
182
+ "tpp_threshold_2_intended_diff_only": 0.01100003719329834,
183
+ "tpp_threshold_2_unintended_diff_only": 0.0037500113248825073,
184
+ "tpp_threshold_5_total_metric": 0.016750022768974304,
185
+ "tpp_threshold_5_intended_diff_only": 0.021000027656555176,
186
+ "tpp_threshold_5_unintended_diff_only": 0.004250004887580872,
187
+ "tpp_threshold_10_total_metric": 0.012500002980232239,
188
+ "tpp_threshold_10_intended_diff_only": 0.017000019550323486,
189
+ "tpp_threshold_10_unintended_diff_only": 0.0045000165700912476,
190
+ "tpp_threshold_20_total_metric": 0.030749991536140442,
191
+ "tpp_threshold_20_intended_diff_only": 0.03299999237060547,
192
+ "tpp_threshold_20_unintended_diff_only": 0.002250000834465027,
193
+ "tpp_threshold_50_total_metric": 0.04200001060962677,
194
+ "tpp_threshold_50_intended_diff_only": 0.04500001668930054,
195
+ "tpp_threshold_50_unintended_diff_only": 0.003000006079673767,
196
+ "tpp_threshold_100_total_metric": 0.08250001072883606,
197
+ "tpp_threshold_100_intended_diff_only": 0.10199999809265137,
198
+ "tpp_threshold_100_unintended_diff_only": 0.019499987363815308,
199
+ "tpp_threshold_500_total_metric": 0.3044999986886978,
200
+ "tpp_threshold_500_intended_diff_only": 0.328000009059906,
201
+ "tpp_threshold_500_unintended_diff_only": 0.02350001037120819
202
+ },
203
+ "1": {
204
+ "tpp_threshold_2_total_metric": 0.006250038743019104,
205
+ "tpp_threshold_2_intended_diff_only": 0.006000041961669922,
206
+ "tpp_threshold_2_unintended_diff_only": -0.00024999678134918213,
207
+ "tpp_threshold_5_total_metric": 0.0077500492334365845,
208
+ "tpp_threshold_5_intended_diff_only": 0.010000050067901611,
209
+ "tpp_threshold_5_unintended_diff_only": 0.002250000834465027,
210
+ "tpp_threshold_10_total_metric": 0.012000024318695068,
211
+ "tpp_threshold_10_intended_diff_only": 0.012000024318695068,
212
+ "tpp_threshold_10_unintended_diff_only": 0.0,
213
+ "tpp_threshold_20_total_metric": 0.005749985575675964,
214
+ "tpp_threshold_20_intended_diff_only": 0.009000003337860107,
215
+ "tpp_threshold_20_unintended_diff_only": 0.003250017762184143,
216
+ "tpp_threshold_50_total_metric": 0.015250056982040405,
217
+ "tpp_threshold_50_intended_diff_only": 0.01900005340576172,
218
+ "tpp_threshold_50_unintended_diff_only": 0.0037499964237213135,
219
+ "tpp_threshold_100_total_metric": 0.026749998331069946,
220
+ "tpp_threshold_100_intended_diff_only": 0.03100001811981201,
221
+ "tpp_threshold_100_unintended_diff_only": 0.004250019788742065,
222
+ "tpp_threshold_500_total_metric": 0.14800003170967102,
223
+ "tpp_threshold_500_intended_diff_only": 0.15400004386901855,
224
+ "tpp_threshold_500_unintended_diff_only": 0.006000012159347534
225
+ },
226
+ "2": {
227
+ "tpp_threshold_2_total_metric": -0.0032500028610229492,
228
+ "tpp_threshold_2_intended_diff_only": 0.0,
229
+ "tpp_threshold_2_unintended_diff_only": 0.0032500028610229492,
230
+ "tpp_threshold_5_total_metric": 0.0007499754428863525,
231
+ "tpp_threshold_5_intended_diff_only": 0.004999995231628418,
232
+ "tpp_threshold_5_unintended_diff_only": 0.004250019788742065,
233
+ "tpp_threshold_10_total_metric": 0.02400001883506775,
234
+ "tpp_threshold_10_intended_diff_only": 0.026000022888183594,
235
+ "tpp_threshold_10_unintended_diff_only": 0.0020000040531158447,
236
+ "tpp_threshold_20_total_metric": 0.03149998188018799,
237
+ "tpp_threshold_20_intended_diff_only": 0.03299999237060547,
238
+ "tpp_threshold_20_unintended_diff_only": 0.0015000104904174805,
239
+ "tpp_threshold_50_total_metric": 0.05425000190734863,
240
+ "tpp_threshold_50_intended_diff_only": 0.0559999942779541,
241
+ "tpp_threshold_50_unintended_diff_only": 0.0017499923706054688,
242
+ "tpp_threshold_100_total_metric": 0.07874995470046997,
243
+ "tpp_threshold_100_intended_diff_only": 0.07999998331069946,
244
+ "tpp_threshold_100_unintended_diff_only": 0.0012500286102294922,
245
+ "tpp_threshold_500_total_metric": 0.2527500092983246,
246
+ "tpp_threshold_500_intended_diff_only": 0.25700002908706665,
247
+ "tpp_threshold_500_unintended_diff_only": 0.004250019788742065
248
+ },
249
+ "6": {
250
+ "tpp_threshold_2_total_metric": 0.001999989151954651,
251
+ "tpp_threshold_2_intended_diff_only": 0.0040000081062316895,
252
+ "tpp_threshold_2_unintended_diff_only": 0.0020000189542770386,
253
+ "tpp_threshold_5_total_metric": 0.002750024199485779,
254
+ "tpp_threshold_5_intended_diff_only": 0.0020000338554382324,
255
+ "tpp_threshold_5_unintended_diff_only": -0.0007499903440475464,
256
+ "tpp_threshold_10_total_metric": 0.004749983549118042,
257
+ "tpp_threshold_10_intended_diff_only": 0.0059999823570251465,
258
+ "tpp_threshold_10_unintended_diff_only": 0.0012499988079071045,
259
+ "tpp_threshold_20_total_metric": 0.004999995231628418,
260
+ "tpp_threshold_20_intended_diff_only": 0.0040000081062316895,
261
+ "tpp_threshold_20_unintended_diff_only": -0.0009999871253967285,
262
+ "tpp_threshold_50_total_metric": 0.010000035166740417,
263
+ "tpp_threshold_50_intended_diff_only": 0.01100003719329834,
264
+ "tpp_threshold_50_unintended_diff_only": 0.0010000020265579224,
265
+ "tpp_threshold_100_total_metric": 0.03125004470348358,
266
+ "tpp_threshold_100_intended_diff_only": 0.03400003910064697,
267
+ "tpp_threshold_100_unintended_diff_only": 0.002749994397163391,
268
+ "tpp_threshold_500_total_metric": 0.19425003230571747,
269
+ "tpp_threshold_500_intended_diff_only": 0.20100003480911255,
270
+ "tpp_threshold_500_unintended_diff_only": 0.006750002503395081
271
+ },
272
+ "9": {
273
+ "tpp_threshold_2_total_metric": 0.004499956965446472,
274
+ "tpp_threshold_2_intended_diff_only": 0.006999969482421875,
275
+ "tpp_threshold_2_unintended_diff_only": 0.002500012516975403,
276
+ "tpp_threshold_5_total_metric": 0.004749968647956848,
277
+ "tpp_threshold_5_intended_diff_only": 0.009999990463256836,
278
+ "tpp_threshold_5_unintended_diff_only": 0.005250021815299988,
279
+ "tpp_threshold_10_total_metric": 0.003999993205070496,
280
+ "tpp_threshold_10_intended_diff_only": 0.009999990463256836,
281
+ "tpp_threshold_10_unintended_diff_only": 0.00599999725818634,
282
+ "tpp_threshold_20_total_metric": 0.025749966502189636,
283
+ "tpp_threshold_20_intended_diff_only": 0.03299999237060547,
284
+ "tpp_threshold_20_unintended_diff_only": 0.0072500258684158325,
285
+ "tpp_threshold_50_total_metric": 0.06749999523162842,
286
+ "tpp_threshold_50_intended_diff_only": 0.07300001382827759,
287
+ "tpp_threshold_50_unintended_diff_only": 0.00550001859664917,
288
+ "tpp_threshold_100_total_metric": 0.11675001680850983,
289
+ "tpp_threshold_100_intended_diff_only": 0.12400001287460327,
290
+ "tpp_threshold_100_unintended_diff_only": 0.007249996066093445,
291
+ "tpp_threshold_500_total_metric": 0.3474999666213989,
292
+ "tpp_threshold_500_intended_diff_only": 0.3610000014305115,
293
+ "tpp_threshold_500_unintended_diff_only": 0.013500034809112549
294
+ }
295
+ },
296
+ "canrager/amazon_reviews_mcauley_1and5": {
297
+ "1": {
298
+ "tpp_threshold_2_total_metric": 0.0037499666213989258,
299
+ "tpp_threshold_2_intended_diff_only": 0.007999956607818604,
300
+ "tpp_threshold_2_unintended_diff_only": 0.004249989986419678,
301
+ "tpp_threshold_5_total_metric": -0.00025004148483276367,
302
+ "tpp_threshold_5_intended_diff_only": 0.003999948501586914,
303
+ "tpp_threshold_5_unintended_diff_only": 0.004249989986419678,
304
+ "tpp_threshold_10_total_metric": 0.0017500072717666626,
305
+ "tpp_threshold_10_intended_diff_only": 0.009000003337860107,
306
+ "tpp_threshold_10_unintended_diff_only": 0.007249996066093445,
307
+ "tpp_threshold_20_total_metric": -0.000250011682510376,
308
+ "tpp_threshold_20_intended_diff_only": 0.0059999823570251465,
309
+ "tpp_threshold_20_unintended_diff_only": 0.0062499940395355225,
310
+ "tpp_threshold_50_total_metric": 0.010500013828277588,
311
+ "tpp_threshold_50_intended_diff_only": 0.013999998569488525,
312
+ "tpp_threshold_50_unintended_diff_only": 0.0034999847412109375,
313
+ "tpp_threshold_100_total_metric": 0.004749983549118042,
314
+ "tpp_threshold_100_intended_diff_only": 0.014999985694885254,
315
+ "tpp_threshold_100_unintended_diff_only": 0.010250002145767212,
316
+ "tpp_threshold_500_total_metric": 0.08899998664855957,
317
+ "tpp_threshold_500_intended_diff_only": 0.09799998998641968,
318
+ "tpp_threshold_500_unintended_diff_only": 0.009000003337860107
319
+ },
320
+ "2": {
321
+ "tpp_threshold_2_total_metric": 0.0014999806880950928,
322
+ "tpp_threshold_2_intended_diff_only": 0.001999974250793457,
323
+ "tpp_threshold_2_unintended_diff_only": 0.0004999935626983643,
324
+ "tpp_threshold_5_total_metric": -0.002499997615814209,
325
+ "tpp_threshold_5_intended_diff_only": 0.004999995231628418,
326
+ "tpp_threshold_5_unintended_diff_only": 0.007499992847442627,
327
+ "tpp_threshold_10_total_metric": 0.01099996268749237,
328
+ "tpp_threshold_10_intended_diff_only": 0.01699995994567871,
329
+ "tpp_threshold_10_unintended_diff_only": 0.00599999725818634,
330
+ "tpp_threshold_20_total_metric": 0.011499986052513123,
331
+ "tpp_threshold_20_intended_diff_only": 0.014999985694885254,
332
+ "tpp_threshold_20_unintended_diff_only": 0.0034999996423721313,
333
+ "tpp_threshold_50_total_metric": 0.03499998152256012,
334
+ "tpp_threshold_50_intended_diff_only": 0.042999982833862305,
335
+ "tpp_threshold_50_unintended_diff_only": 0.008000001311302185,
336
+ "tpp_threshold_100_total_metric": 0.04949997365474701,
337
+ "tpp_threshold_100_intended_diff_only": 0.06299996376037598,
338
+ "tpp_threshold_100_unintended_diff_only": 0.013499990105628967,
339
+ "tpp_threshold_500_total_metric": 0.1442500203847885,
340
+ "tpp_threshold_500_intended_diff_only": 0.16100001335144043,
341
+ "tpp_threshold_500_unintended_diff_only": 0.016749992966651917
342
+ },
343
+ "3": {
344
+ "tpp_threshold_2_total_metric": -0.010000020265579224,
345
+ "tpp_threshold_2_intended_diff_only": -0.00700002908706665,
346
+ "tpp_threshold_2_unintended_diff_only": 0.0029999911785125732,
347
+ "tpp_threshold_5_total_metric": 0.006999999284744263,
348
+ "tpp_threshold_5_intended_diff_only": 0.004999995231628418,
349
+ "tpp_threshold_5_unintended_diff_only": -0.0020000040531158447,
350
+ "tpp_threshold_10_total_metric": 0.013249978423118591,
351
+ "tpp_threshold_10_intended_diff_only": 0.014999985694885254,
352
+ "tpp_threshold_10_unintended_diff_only": 0.0017500072717666626,
353
+ "tpp_threshold_20_total_metric": 0.0052499920129776,
354
+ "tpp_threshold_20_intended_diff_only": 0.009999990463256836,
355
+ "tpp_threshold_20_unintended_diff_only": 0.004749998450279236,
356
+ "tpp_threshold_50_total_metric": 0.02750001847743988,
357
+ "tpp_threshold_50_intended_diff_only": 0.03100001811981201,
358
+ "tpp_threshold_50_unintended_diff_only": 0.0034999996423721313,
359
+ "tpp_threshold_100_total_metric": 0.04750002920627594,
360
+ "tpp_threshold_100_intended_diff_only": 0.054000020027160645,
361
+ "tpp_threshold_100_unintended_diff_only": 0.006499990820884705,
362
+ "tpp_threshold_500_total_metric": 0.13974998891353607,
363
+ "tpp_threshold_500_intended_diff_only": 0.1629999876022339,
364
+ "tpp_threshold_500_unintended_diff_only": 0.023249998688697815
365
+ },
366
+ "5": {
367
+ "tpp_threshold_2_total_metric": 0.00025004148483276367,
368
+ "tpp_threshold_2_intended_diff_only": 0.003000020980834961,
369
+ "tpp_threshold_2_unintended_diff_only": 0.0027499794960021973,
370
+ "tpp_threshold_5_total_metric": 0.0012500584125518799,
371
+ "tpp_threshold_5_intended_diff_only": 0.00700002908706665,
372
+ "tpp_threshold_5_unintended_diff_only": 0.0057499706745147705,
373
+ "tpp_threshold_10_total_metric": -0.004249989986419678,
374
+ "tpp_threshold_10_intended_diff_only": 0.0040000081062316895,
375
+ "tpp_threshold_10_unintended_diff_only": 0.008249998092651367,
376
+ "tpp_threshold_20_total_metric": 0.020750045776367188,
377
+ "tpp_threshold_20_intended_diff_only": 0.030000030994415283,
378
+ "tpp_threshold_20_unintended_diff_only": 0.009249985218048096,
379
+ "tpp_threshold_50_total_metric": 0.031500041484832764,
380
+ "tpp_threshold_50_intended_diff_only": 0.04000002145767212,
381
+ "tpp_threshold_50_unintended_diff_only": 0.008499979972839355,
382
+ "tpp_threshold_100_total_metric": 0.11225005984306335,
383
+ "tpp_threshold_100_intended_diff_only": 0.1300000548362732,
384
+ "tpp_threshold_100_unintended_diff_only": 0.01774999499320984,
385
+ "tpp_threshold_500_total_metric": 0.24350006878376007,
386
+ "tpp_threshold_500_intended_diff_only": 0.2690000534057617,
387
+ "tpp_threshold_500_unintended_diff_only": 0.025499984622001648
388
+ },
389
+ "6": {
390
+ "tpp_threshold_2_total_metric": 0.016249969601631165,
391
+ "tpp_threshold_2_intended_diff_only": 0.0209999680519104,
392
+ "tpp_threshold_2_unintended_diff_only": 0.004749998450279236,
393
+ "tpp_threshold_5_total_metric": 0.008749976754188538,
394
+ "tpp_threshold_5_intended_diff_only": 0.006999969482421875,
395
+ "tpp_threshold_5_unintended_diff_only": -0.0017500072717666626,
396
+ "tpp_threshold_10_total_metric": 0.02924998104572296,
397
+ "tpp_threshold_10_intended_diff_only": 0.030999958515167236,
398
+ "tpp_threshold_10_unintended_diff_only": 0.001749977469444275,
399
+ "tpp_threshold_20_total_metric": 0.04350000619888306,
400
+ "tpp_threshold_20_intended_diff_only": 0.046999990940093994,
401
+ "tpp_threshold_20_unintended_diff_only": 0.0034999847412109375,
402
+ "tpp_threshold_50_total_metric": 0.06424997746944427,
403
+ "tpp_threshold_50_intended_diff_only": 0.07299995422363281,
404
+ "tpp_threshold_50_unintended_diff_only": 0.008749976754188538,
405
+ "tpp_threshold_100_total_metric": 0.10574997961521149,
406
+ "tpp_threshold_100_intended_diff_only": 0.1119999885559082,
407
+ "tpp_threshold_100_unintended_diff_only": 0.006250008940696716,
408
+ "tpp_threshold_500_total_metric": 0.30649998784065247,
409
+ "tpp_threshold_500_intended_diff_only": 0.32099997997283936,
410
+ "tpp_threshold_500_unintended_diff_only": 0.01449999213218689
411
+ }
412
+ }
413
+ }
414
+ }
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "tpp",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": false,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "0cf78e95-df49-4332-8aa4-30ee2712619b",
73
+ "datetime_epoch_millis": 1740163430683,
74
+ "eval_result_metrics": {
75
+ "tpp_metrics": {
76
+ "tpp_threshold_2_total_metric": 0.001475006341934204,
77
+ "tpp_threshold_2_intended_diff_only": 0.003700006008148193,
78
+ "tpp_threshold_2_unintended_diff_only": 0.0022249996662139894,
79
+ "tpp_threshold_5_total_metric": 0.0010999992489814758,
80
+ "tpp_threshold_5_intended_diff_only": 0.003600001335144043,
81
+ "tpp_threshold_5_unintended_diff_only": 0.002500002086162567,
82
+ "tpp_threshold_10_total_metric": 0.004625001549720764,
83
+ "tpp_threshold_10_intended_diff_only": 0.00790000557899475,
84
+ "tpp_threshold_10_unintended_diff_only": 0.0032750040292739866,
85
+ "tpp_threshold_20_total_metric": 0.008000005781650544,
86
+ "tpp_threshold_20_intended_diff_only": 0.01170000433921814,
87
+ "tpp_threshold_20_unintended_diff_only": 0.003699998557567596,
88
+ "tpp_threshold_50_total_metric": 0.024850000441074372,
89
+ "tpp_threshold_50_intended_diff_only": 0.03130000233650208,
90
+ "tpp_threshold_50_unintended_diff_only": 0.0064500018954277046,
91
+ "tpp_threshold_100_total_metric": 0.042700006067752844,
92
+ "tpp_threshold_100_intended_diff_only": 0.05190000534057617,
93
+ "tpp_threshold_100_unintended_diff_only": 0.009199999272823334,
94
+ "tpp_threshold_500_total_metric": 0.19910001307725905,
95
+ "tpp_threshold_500_intended_diff_only": 0.21640000939369203,
96
+ "tpp_threshold_500_unintended_diff_only": 0.017299996316432954
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
102
+ "tpp_threshold_2_total_metric": 0.003000003099441528,
103
+ "tpp_threshold_2_intended_diff_only": 0.004600012302398681,
104
+ "tpp_threshold_2_unintended_diff_only": 0.0016000092029571534,
105
+ "tpp_threshold_5_total_metric": 0.0025999963283538817,
106
+ "tpp_threshold_5_intended_diff_only": 0.005000007152557373,
107
+ "tpp_threshold_5_unintended_diff_only": 0.002400010824203491,
108
+ "tpp_threshold_10_total_metric": 0.005449992418289184,
109
+ "tpp_threshold_10_intended_diff_only": 0.007800006866455078,
110
+ "tpp_threshold_10_unintended_diff_only": 0.0023500144481658934,
111
+ "tpp_threshold_20_total_metric": 0.010450014472007751,
112
+ "tpp_threshold_20_intended_diff_only": 0.012800014019012452,
113
+ "tpp_threshold_20_unintended_diff_only": 0.0023499995470046995,
114
+ "tpp_threshold_50_total_metric": 0.01789999008178711,
115
+ "tpp_threshold_50_intended_diff_only": 0.02080000638961792,
116
+ "tpp_threshold_50_unintended_diff_only": 0.0029000163078308104,
117
+ "tpp_threshold_100_total_metric": 0.0331000030040741,
118
+ "tpp_threshold_100_intended_diff_only": 0.036400008201599124,
119
+ "tpp_threshold_100_unintended_diff_only": 0.0033000051975250245,
120
+ "tpp_threshold_500_total_metric": 0.19535001814365388,
121
+ "tpp_threshold_500_intended_diff_only": 0.20200002193450928,
122
+ "tpp_threshold_500_unintended_diff_only": 0.006650003790855408
123
+ },
124
+ {
125
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
126
+ "tpp_threshold_2_total_metric": -4.999041557312012e-05,
127
+ "tpp_threshold_2_intended_diff_only": 0.002799999713897705,
128
+ "tpp_threshold_2_unintended_diff_only": 0.0028499901294708253,
129
+ "tpp_threshold_5_total_metric": -0.0003999978303909302,
130
+ "tpp_threshold_5_intended_diff_only": 0.002199995517730713,
131
+ "tpp_threshold_5_unintended_diff_only": 0.002599993348121643,
132
+ "tpp_threshold_10_total_metric": 0.0038000106811523437,
133
+ "tpp_threshold_10_intended_diff_only": 0.008000004291534423,
134
+ "tpp_threshold_10_unintended_diff_only": 0.00419999361038208,
135
+ "tpp_threshold_20_total_metric": 0.005549997091293335,
136
+ "tpp_threshold_20_intended_diff_only": 0.010599994659423828,
137
+ "tpp_threshold_20_unintended_diff_only": 0.005049997568130493,
138
+ "tpp_threshold_50_total_metric": 0.031800010800361635,
139
+ "tpp_threshold_50_intended_diff_only": 0.04179999828338623,
140
+ "tpp_threshold_50_unintended_diff_only": 0.009999987483024598,
141
+ "tpp_threshold_100_total_metric": 0.05230000913143158,
142
+ "tpp_threshold_100_intended_diff_only": 0.06740000247955322,
143
+ "tpp_threshold_100_unintended_diff_only": 0.015099993348121643,
144
+ "tpp_threshold_500_total_metric": 0.20285000801086425,
145
+ "tpp_threshold_500_intended_diff_only": 0.23079999685287475,
146
+ "tpp_threshold_500_unintended_diff_only": 0.027949988842010498
147
+ }
148
+ ],
149
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
150
+ "sae_lens_id": "custom_sae",
151
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_0.0",
152
+ "sae_lens_version": "5.4.2",
153
+ "sae_cfg_dict": {
154
+ "model_name": "gemma-2-2b",
155
+ "d_in": 2304,
156
+ "d_sae": 65536,
157
+ "hook_layer": 12,
158
+ "hook_name": "blocks.12.hook_resid_post",
159
+ "context_size": null,
160
+ "hook_head_index": null,
161
+ "architecture": "topk",
162
+ "apply_b_dec_to_input": null,
163
+ "finetuning_scaling_factor": null,
164
+ "activation_fn_str": "",
165
+ "prepend_bos": true,
166
+ "normalize_activations": "none",
167
+ "dtype": "bfloat16",
168
+ "device": "",
169
+ "dataset_path": "",
170
+ "dataset_trust_remote_code": true,
171
+ "seqpos_slice": [
172
+ null
173
+ ],
174
+ "training_tokens": -100000,
175
+ "sae_lens_training_version": null,
176
+ "neuronpedia_id": null
177
+ },
178
+ "eval_result_unstructured": {
179
+ "LabHC/bias_in_bios_class_set1": {
180
+ "0": {
181
+ "tpp_threshold_2_total_metric": 0.008750036358833313,
182
+ "tpp_threshold_2_intended_diff_only": 0.01100003719329834,
183
+ "tpp_threshold_2_unintended_diff_only": 0.002250000834465027,
184
+ "tpp_threshold_5_total_metric": 0.012000024318695068,
185
+ "tpp_threshold_5_intended_diff_only": 0.01500004529953003,
186
+ "tpp_threshold_5_unintended_diff_only": 0.003000020980834961,
187
+ "tpp_threshold_10_total_metric": 0.006749972701072693,
188
+ "tpp_threshold_10_intended_diff_only": 0.009999990463256836,
189
+ "tpp_threshold_10_unintended_diff_only": 0.003250017762184143,
190
+ "tpp_threshold_20_total_metric": 0.01975002884864807,
191
+ "tpp_threshold_20_intended_diff_only": 0.022000014781951904,
192
+ "tpp_threshold_20_unintended_diff_only": 0.002249985933303833,
193
+ "tpp_threshold_50_total_metric": 0.03349998593330383,
194
+ "tpp_threshold_50_intended_diff_only": 0.03700000047683716,
195
+ "tpp_threshold_50_unintended_diff_only": 0.003500014543533325,
196
+ "tpp_threshold_100_total_metric": 0.06024999916553497,
197
+ "tpp_threshold_100_intended_diff_only": 0.06400001049041748,
198
+ "tpp_threshold_100_unintended_diff_only": 0.0037500113248825073,
199
+ "tpp_threshold_500_total_metric": 0.3145000487565994,
200
+ "tpp_threshold_500_intended_diff_only": 0.3200000524520874,
201
+ "tpp_threshold_500_unintended_diff_only": 0.005500003695487976
202
+ },
203
+ "1": {
204
+ "tpp_threshold_2_total_metric": 0.0055000633001327515,
205
+ "tpp_threshold_2_intended_diff_only": 0.005000054836273193,
206
+ "tpp_threshold_2_unintended_diff_only": -0.0005000084638595581,
207
+ "tpp_threshold_5_total_metric": -0.0010000020265579224,
208
+ "tpp_threshold_5_intended_diff_only": 0.0,
209
+ "tpp_threshold_5_unintended_diff_only": 0.0010000020265579224,
210
+ "tpp_threshold_10_total_metric": 0.0022500455379486084,
211
+ "tpp_threshold_10_intended_diff_only": 0.001000046730041504,
212
+ "tpp_threshold_10_unintended_diff_only": -0.0012499988079071045,
213
+ "tpp_threshold_20_total_metric": 0.0002500265836715698,
214
+ "tpp_threshold_20_intended_diff_only": 0.003000020980834961,
215
+ "tpp_threshold_20_unintended_diff_only": 0.002749994397163391,
216
+ "tpp_threshold_50_total_metric": 0.004500031471252441,
217
+ "tpp_threshold_50_intended_diff_only": 0.00700002908706665,
218
+ "tpp_threshold_50_unintended_diff_only": 0.002499997615814209,
219
+ "tpp_threshold_100_total_metric": 0.014500007033348083,
220
+ "tpp_threshold_100_intended_diff_only": 0.018000006675720215,
221
+ "tpp_threshold_100_unintended_diff_only": 0.0034999996423721313,
222
+ "tpp_threshold_500_total_metric": 0.13175006210803986,
223
+ "tpp_threshold_500_intended_diff_only": 0.1350000500679016,
224
+ "tpp_threshold_500_unintended_diff_only": 0.0032499879598617554
225
+ },
226
+ "2": {
227
+ "tpp_threshold_2_total_metric": -0.0005000531673431396,
228
+ "tpp_threshold_2_intended_diff_only": 0.001999974250793457,
229
+ "tpp_threshold_2_unintended_diff_only": 0.0025000274181365967,
230
+ "tpp_threshold_5_total_metric": -1.4901161193847656e-08,
231
+ "tpp_threshold_5_intended_diff_only": 0.004999995231628418,
232
+ "tpp_threshold_5_unintended_diff_only": 0.005000010132789612,
233
+ "tpp_threshold_10_total_metric": 0.015000000596046448,
234
+ "tpp_threshold_10_intended_diff_only": 0.017000019550323486,
235
+ "tpp_threshold_10_unintended_diff_only": 0.0020000189542770386,
236
+ "tpp_threshold_20_total_metric": 0.02025000751018524,
237
+ "tpp_threshold_20_intended_diff_only": 0.022000014781951904,
238
+ "tpp_threshold_20_unintended_diff_only": 0.0017500072717666626,
239
+ "tpp_threshold_50_total_metric": 0.025249987840652466,
240
+ "tpp_threshold_50_intended_diff_only": 0.027000010013580322,
241
+ "tpp_threshold_50_unintended_diff_only": 0.0017500221729278564,
242
+ "tpp_threshold_100_total_metric": 0.04474999010562897,
243
+ "tpp_threshold_100_intended_diff_only": 0.046999990940093994,
244
+ "tpp_threshold_100_unintended_diff_only": 0.002250000834465027,
245
+ "tpp_threshold_500_total_metric": 0.30299998819828033,
246
+ "tpp_threshold_500_intended_diff_only": 0.3100000023841858,
247
+ "tpp_threshold_500_unintended_diff_only": 0.0070000141859054565
248
+ },
249
+ "6": {
250
+ "tpp_threshold_2_total_metric": -0.0015000104904174805,
251
+ "tpp_threshold_2_intended_diff_only": 0.0,
252
+ "tpp_threshold_2_unintended_diff_only": 0.0015000104904174805,
253
+ "tpp_threshold_5_total_metric": 0.000250011682510376,
254
+ "tpp_threshold_5_intended_diff_only": -0.0009999871253967285,
255
+ "tpp_threshold_5_unintended_diff_only": -0.0012499988079071045,
256
+ "tpp_threshold_10_total_metric": 0.002249985933303833,
257
+ "tpp_threshold_10_intended_diff_only": 0.0040000081062316895,
258
+ "tpp_threshold_10_unintended_diff_only": 0.0017500221729278564,
259
+ "tpp_threshold_20_total_metric": 0.00475001335144043,
260
+ "tpp_threshold_20_intended_diff_only": 0.003000020980834961,
261
+ "tpp_threshold_20_unintended_diff_only": -0.0017499923706054688,
262
+ "tpp_threshold_50_total_metric": 0.004249989986419678,
263
+ "tpp_threshold_50_intended_diff_only": 0.0040000081062316895,
264
+ "tpp_threshold_50_unintended_diff_only": -0.0002499818801879883,
265
+ "tpp_threshold_100_total_metric": 0.0037500113248825073,
266
+ "tpp_threshold_100_intended_diff_only": 0.003000020980834961,
267
+ "tpp_threshold_100_unintended_diff_only": -0.0007499903440475464,
268
+ "tpp_threshold_500_total_metric": 0.030749976634979248,
269
+ "tpp_threshold_500_intended_diff_only": 0.03799998760223389,
270
+ "tpp_threshold_500_unintended_diff_only": 0.007250010967254639
271
+ },
272
+ "9": {
273
+ "tpp_threshold_2_total_metric": 0.0027499794960021973,
274
+ "tpp_threshold_2_intended_diff_only": 0.004999995231628418,
275
+ "tpp_threshold_2_unintended_diff_only": 0.0022500157356262207,
276
+ "tpp_threshold_5_total_metric": 0.001749962568283081,
277
+ "tpp_threshold_5_intended_diff_only": 0.0059999823570251465,
278
+ "tpp_threshold_5_unintended_diff_only": 0.004250019788742065,
279
+ "tpp_threshold_10_total_metric": 0.0009999573230743408,
280
+ "tpp_threshold_10_intended_diff_only": 0.006999969482421875,
281
+ "tpp_threshold_10_unintended_diff_only": 0.006000012159347534,
282
+ "tpp_threshold_20_total_metric": 0.007249996066093445,
283
+ "tpp_threshold_20_intended_diff_only": 0.013999998569488525,
284
+ "tpp_threshold_20_unintended_diff_only": 0.006750002503395081,
285
+ "tpp_threshold_50_total_metric": 0.02199995517730713,
286
+ "tpp_threshold_50_intended_diff_only": 0.02899998426437378,
287
+ "tpp_threshold_50_unintended_diff_only": 0.00700002908706665,
288
+ "tpp_threshold_100_total_metric": 0.04225000739097595,
289
+ "tpp_threshold_100_intended_diff_only": 0.050000011920928955,
290
+ "tpp_threshold_100_unintended_diff_only": 0.007750004529953003,
291
+ "tpp_threshold_500_total_metric": 0.19675001502037048,
292
+ "tpp_threshold_500_intended_diff_only": 0.2070000171661377,
293
+ "tpp_threshold_500_unintended_diff_only": 0.010250002145767212
294
+ }
295
+ },
296
+ "canrager/amazon_reviews_mcauley_1and5": {
297
+ "1": {
298
+ "tpp_threshold_2_total_metric": 0.007499992847442627,
299
+ "tpp_threshold_2_intended_diff_only": 0.010999977588653564,
300
+ "tpp_threshold_2_unintended_diff_only": 0.0034999847412109375,
301
+ "tpp_threshold_5_total_metric": 0.004249975085258484,
302
+ "tpp_threshold_5_intended_diff_only": 0.0059999823570251465,
303
+ "tpp_threshold_5_unintended_diff_only": 0.0017500072717666626,
304
+ "tpp_threshold_10_total_metric": 0.0022500157356262207,
305
+ "tpp_threshold_10_intended_diff_only": 0.009000003337860107,
306
+ "tpp_threshold_10_unintended_diff_only": 0.006749987602233887,
307
+ "tpp_threshold_20_total_metric": 0.00024999678134918213,
308
+ "tpp_threshold_20_intended_diff_only": 0.004999995231628418,
309
+ "tpp_threshold_20_unintended_diff_only": 0.004749998450279236,
310
+ "tpp_threshold_50_total_metric": 0.010250002145767212,
311
+ "tpp_threshold_50_intended_diff_only": 0.013999998569488525,
312
+ "tpp_threshold_50_unintended_diff_only": 0.0037499964237213135,
313
+ "tpp_threshold_100_total_metric": 0.008000016212463379,
314
+ "tpp_threshold_100_intended_diff_only": 0.018000006675720215,
315
+ "tpp_threshold_100_unintended_diff_only": 0.009999990463256836,
316
+ "tpp_threshold_500_total_metric": 0.08074997365474701,
317
+ "tpp_threshold_500_intended_diff_only": 0.09099996089935303,
318
+ "tpp_threshold_500_unintended_diff_only": 0.010249987244606018
319
+ },
320
+ "2": {
321
+ "tpp_threshold_2_total_metric": 0.0004999935626983643,
322
+ "tpp_threshold_2_intended_diff_only": 0.0009999871253967285,
323
+ "tpp_threshold_2_unintended_diff_only": 0.0004999935626983643,
324
+ "tpp_threshold_5_total_metric": -0.0070000141859054565,
325
+ "tpp_threshold_5_intended_diff_only": 0.0009999871253967285,
326
+ "tpp_threshold_5_unintended_diff_only": 0.008000001311302185,
327
+ "tpp_threshold_10_total_metric": 0.002249971032142639,
328
+ "tpp_threshold_10_intended_diff_only": 0.007999956607818604,
329
+ "tpp_threshold_10_unintended_diff_only": 0.005749985575675964,
330
+ "tpp_threshold_20_total_metric": 0.0029999762773513794,
331
+ "tpp_threshold_20_intended_diff_only": 0.0059999823570251465,
332
+ "tpp_threshold_20_unintended_diff_only": 0.003000006079673767,
333
+ "tpp_threshold_50_total_metric": 0.016999974846839905,
334
+ "tpp_threshold_50_intended_diff_only": 0.02599996328353882,
335
+ "tpp_threshold_50_unintended_diff_only": 0.008999988436698914,
336
+ "tpp_threshold_100_total_metric": 0.028250008821487427,
337
+ "tpp_threshold_100_intended_diff_only": 0.041999995708465576,
338
+ "tpp_threshold_100_unintended_diff_only": 0.01374998688697815,
339
+ "tpp_threshold_500_total_metric": 0.19774997234344482,
340
+ "tpp_threshold_500_intended_diff_only": 0.21999996900558472,
341
+ "tpp_threshold_500_unintended_diff_only": 0.022249996662139893
342
+ },
343
+ "3": {
344
+ "tpp_threshold_2_total_metric": -0.008249998092651367,
345
+ "tpp_threshold_2_intended_diff_only": -0.004999995231628418,
346
+ "tpp_threshold_2_unintended_diff_only": 0.0032500028610229492,
347
+ "tpp_threshold_5_total_metric": 0.000250011682510376,
348
+ "tpp_threshold_5_intended_diff_only": 0.0,
349
+ "tpp_threshold_5_unintended_diff_only": -0.000250011682510376,
350
+ "tpp_threshold_10_total_metric": 0.0072500258684158325,
351
+ "tpp_threshold_10_intended_diff_only": 0.008000016212463379,
352
+ "tpp_threshold_10_unintended_diff_only": 0.0007499903440475464,
353
+ "tpp_threshold_20_total_metric": -0.0015000253915786743,
354
+ "tpp_threshold_20_intended_diff_only": 0.001999974250793457,
355
+ "tpp_threshold_20_unintended_diff_only": 0.0034999996423721313,
356
+ "tpp_threshold_50_total_metric": 0.012000039219856262,
357
+ "tpp_threshold_50_intended_diff_only": 0.013000011444091797,
358
+ "tpp_threshold_50_unintended_diff_only": 0.0009999722242355347,
359
+ "tpp_threshold_100_total_metric": 0.02925001084804535,
360
+ "tpp_threshold_100_intended_diff_only": 0.04100000858306885,
361
+ "tpp_threshold_100_unintended_diff_only": 0.011749997735023499,
362
+ "tpp_threshold_500_total_metric": 0.1612500250339508,
363
+ "tpp_threshold_500_intended_diff_only": 0.2160000205039978,
364
+ "tpp_threshold_500_unintended_diff_only": 0.054749995470047
365
+ },
366
+ "5": {
367
+ "tpp_threshold_2_total_metric": -0.006749927997589111,
368
+ "tpp_threshold_2_intended_diff_only": -0.003999948501586914,
369
+ "tpp_threshold_2_unintended_diff_only": 0.0027499794960021973,
370
+ "tpp_threshold_5_total_metric": -0.006249949336051941,
371
+ "tpp_threshold_5_intended_diff_only": -0.001999974250793457,
372
+ "tpp_threshold_5_unintended_diff_only": 0.004249975085258484,
373
+ "tpp_threshold_10_total_metric": -0.008999958634376526,
374
+ "tpp_threshold_10_intended_diff_only": -0.0029999613761901855,
375
+ "tpp_threshold_10_unintended_diff_only": 0.00599999725818634,
376
+ "tpp_threshold_20_total_metric": -0.009499937295913696,
377
+ "tpp_threshold_20_intended_diff_only": 0.001000046730041504,
378
+ "tpp_threshold_20_unintended_diff_only": 0.0104999840259552,
379
+ "tpp_threshold_50_total_metric": 0.05525003373622894,
380
+ "tpp_threshold_50_intended_diff_only": 0.0820000171661377,
381
+ "tpp_threshold_50_unintended_diff_only": 0.026749983429908752,
382
+ "tpp_threshold_100_total_metric": 0.08600002527236938,
383
+ "tpp_threshold_100_intended_diff_only": 0.11800003051757812,
384
+ "tpp_threshold_100_unintended_diff_only": 0.03200000524520874,
385
+ "tpp_threshold_500_total_metric": 0.2552500516176224,
386
+ "tpp_threshold_500_intended_diff_only": 0.2940000295639038,
387
+ "tpp_threshold_500_unintended_diff_only": 0.03874997794628143
388
+ },
389
+ "6": {
390
+ "tpp_threshold_2_total_metric": 0.006749987602233887,
391
+ "tpp_threshold_2_intended_diff_only": 0.010999977588653564,
392
+ "tpp_threshold_2_unintended_diff_only": 0.004249989986419678,
393
+ "tpp_threshold_5_total_metric": 0.006749987602233887,
394
+ "tpp_threshold_5_intended_diff_only": 0.0059999823570251465,
395
+ "tpp_threshold_5_unintended_diff_only": -0.0007500052452087402,
396
+ "tpp_threshold_10_total_metric": 0.016249999403953552,
397
+ "tpp_threshold_10_intended_diff_only": 0.018000006675720215,
398
+ "tpp_threshold_10_unintended_diff_only": 0.0017500072717666626,
399
+ "tpp_threshold_20_total_metric": 0.035499975085258484,
400
+ "tpp_threshold_20_intended_diff_only": 0.038999974727630615,
401
+ "tpp_threshold_20_unintended_diff_only": 0.0034999996423721313,
402
+ "tpp_threshold_50_total_metric": 0.06450000405311584,
403
+ "tpp_threshold_50_intended_diff_only": 0.07400000095367432,
404
+ "tpp_threshold_50_unintended_diff_only": 0.009499996900558472,
405
+ "tpp_threshold_100_total_metric": 0.10999998450279236,
406
+ "tpp_threshold_100_intended_diff_only": 0.11799997091293335,
407
+ "tpp_threshold_100_unintended_diff_only": 0.007999986410140991,
408
+ "tpp_threshold_500_total_metric": 0.3192500174045563,
409
+ "tpp_threshold_500_intended_diff_only": 0.3330000042915344,
410
+ "tpp_threshold_500_unintended_diff_only": 0.01374998688697815
411
+ }
412
+ }
413
+ }
414
+ }
eval_results_from_scratch/tpp/kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "tpp",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "LabHC/bias_in_bios_class_set1",
7
+ "canrager/amazon_reviews_mcauley_1and5"
8
+ ],
9
+ "perform_scr": false,
10
+ "early_stopping_patience": 20,
11
+ "train_set_size": 4000,
12
+ "test_set_size": 1000,
13
+ "context_length": 128,
14
+ "probe_train_batch_size": 16,
15
+ "probe_test_batch_size": 500,
16
+ "probe_epochs": 20,
17
+ "probe_lr": 0.001,
18
+ "probe_l1_penalty": 0.001,
19
+ "sae_batch_size": 125,
20
+ "llm_batch_size": 32,
21
+ "llm_dtype": "bfloat16",
22
+ "lower_vram_usage": false,
23
+ "model_name": "gemma-2-2b",
24
+ "n_values": [
25
+ 2,
26
+ 5,
27
+ 10,
28
+ 20,
29
+ 50,
30
+ 100,
31
+ 500
32
+ ],
33
+ "column1_vals_lookup": {
34
+ "LabHC/bias_in_bios_class_set1": [
35
+ [
36
+ "professor",
37
+ "nurse"
38
+ ],
39
+ [
40
+ "architect",
41
+ "journalist"
42
+ ],
43
+ [
44
+ "surgeon",
45
+ "psychologist"
46
+ ],
47
+ [
48
+ "attorney",
49
+ "teacher"
50
+ ]
51
+ ],
52
+ "canrager/amazon_reviews_mcauley_1and5": [
53
+ [
54
+ "Books",
55
+ "CDs_and_Vinyl"
56
+ ],
57
+ [
58
+ "Software",
59
+ "Electronics"
60
+ ],
61
+ [
62
+ "Pet_Supplies",
63
+ "Office_Products"
64
+ ],
65
+ [
66
+ "Industrial_and_Scientific",
67
+ "Toys_and_Games"
68
+ ]
69
+ ]
70
+ }
71
+ },
72
+ "eval_id": "496eec06-563d-45a6-8ad8-e68c3fad1008",
73
+ "datetime_epoch_millis": 1740163906732,
74
+ "eval_result_metrics": {
75
+ "tpp_metrics": {
76
+ "tpp_threshold_2_total_metric": 0.003649994730949402,
77
+ "tpp_threshold_2_intended_diff_only": 0.006199997663497925,
78
+ "tpp_threshold_2_unintended_diff_only": 0.0025500029325485228,
79
+ "tpp_threshold_5_total_metric": 0.0050999939441680915,
80
+ "tpp_threshold_5_intended_diff_only": 0.007999992370605467,
81
+ "tpp_threshold_5_unintended_diff_only": 0.0028999984264373776,
82
+ "tpp_threshold_10_total_metric": 0.013499999046325683,
83
+ "tpp_threshold_10_intended_diff_only": 0.017399996519088745,
84
+ "tpp_threshold_10_unintended_diff_only": 0.0038999974727630614,
85
+ "tpp_threshold_20_total_metric": 0.02162499576807022,
86
+ "tpp_threshold_20_intended_diff_only": 0.02619999647140503,
87
+ "tpp_threshold_20_unintended_diff_only": 0.004575000703334808,
88
+ "tpp_threshold_50_total_metric": 0.051074995100498205,
89
+ "tpp_threshold_50_intended_diff_only": 0.057199996709823606,
90
+ "tpp_threshold_50_unintended_diff_only": 0.006125001609325409,
91
+ "tpp_threshold_100_total_metric": 0.09944999665021896,
92
+ "tpp_threshold_100_intended_diff_only": 0.10920000076293945,
93
+ "tpp_threshold_100_unintended_diff_only": 0.009750004112720489,
94
+ "tpp_threshold_500_total_metric": 0.310000017285347,
95
+ "tpp_threshold_500_intended_diff_only": 0.3243000149726868,
96
+ "tpp_threshold_500_unintended_diff_only": 0.014299997687339782
97
+ }
98
+ },
99
+ "eval_result_details": [
100
+ {
101
+ "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
102
+ "tpp_threshold_2_total_metric": 0.004200002551078797,
103
+ "tpp_threshold_2_intended_diff_only": 0.006200015544891357,
104
+ "tpp_threshold_2_unintended_diff_only": 0.002000012993812561,
105
+ "tpp_threshold_5_total_metric": 0.0056500047445297245,
106
+ "tpp_threshold_5_intended_diff_only": 0.00840001106262207,
107
+ "tpp_threshold_5_unintended_diff_only": 0.002750006318092346,
108
+ "tpp_threshold_10_total_metric": 0.01315000057220459,
109
+ "tpp_threshold_10_intended_diff_only": 0.015400004386901856,
110
+ "tpp_threshold_10_unintended_diff_only": 0.002250003814697266,
111
+ "tpp_threshold_20_total_metric": 0.02469998598098755,
112
+ "tpp_threshold_20_intended_diff_only": 0.02799999713897705,
113
+ "tpp_threshold_20_unintended_diff_only": 0.003300011157989502,
114
+ "tpp_threshold_50_total_metric": 0.05049999952316284,
115
+ "tpp_threshold_50_intended_diff_only": 0.053800010681152345,
116
+ "tpp_threshold_50_unintended_diff_only": 0.003300011157989502,
117
+ "tpp_threshold_100_total_metric": 0.09890000522136688,
118
+ "tpp_threshold_100_intended_diff_only": 0.103600013256073,
119
+ "tpp_threshold_100_unintended_diff_only": 0.004700008034706116,
120
+ "tpp_threshold_500_total_metric": 0.3669000148773193,
121
+ "tpp_threshold_500_intended_diff_only": 0.37420002222061155,
122
+ "tpp_threshold_500_unintended_diff_only": 0.0073000073432922365
123
+ },
124
+ {
125
+ "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
126
+ "tpp_threshold_2_total_metric": 0.0030999869108200074,
127
+ "tpp_threshold_2_intended_diff_only": 0.006199979782104492,
128
+ "tpp_threshold_2_unintended_diff_only": 0.003099992871284485,
129
+ "tpp_threshold_5_total_metric": 0.004549983143806458,
130
+ "tpp_threshold_5_intended_diff_only": 0.007599973678588867,
131
+ "tpp_threshold_5_unintended_diff_only": 0.0030499905347824096,
132
+ "tpp_threshold_10_total_metric": 0.013849997520446777,
133
+ "tpp_threshold_10_intended_diff_only": 0.019399988651275634,
134
+ "tpp_threshold_10_unintended_diff_only": 0.005549991130828857,
135
+ "tpp_threshold_20_total_metric": 0.018550005555152894,
136
+ "tpp_threshold_20_intended_diff_only": 0.024399995803833008,
137
+ "tpp_threshold_20_unintended_diff_only": 0.005849990248680115,
138
+ "tpp_threshold_50_total_metric": 0.05164999067783356,
139
+ "tpp_threshold_50_intended_diff_only": 0.06059998273849487,
140
+ "tpp_threshold_50_unintended_diff_only": 0.008949992060661317,
141
+ "tpp_threshold_100_total_metric": 0.09999998807907104,
142
+ "tpp_threshold_100_intended_diff_only": 0.1147999882698059,
143
+ "tpp_threshold_100_unintended_diff_only": 0.014800000190734863,
144
+ "tpp_threshold_500_total_metric": 0.25310001969337464,
145
+ "tpp_threshold_500_intended_diff_only": 0.274400007724762,
146
+ "tpp_threshold_500_unintended_diff_only": 0.02129998803138733
147
+ }
148
+ ],
149
+ "sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
150
+ "sae_lens_id": "custom_sae",
151
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_80_peft_sae_from_scratch_95.0",
152
+ "sae_lens_version": "5.4.2",
153
+ "sae_cfg_dict": {
154
+ "model_name": "gemma-2-2b",
155
+ "d_in": 2304,
156
+ "d_sae": 65536,
157
+ "hook_layer": 12,
158
+ "hook_name": "blocks.12.hook_resid_post",
159
+ "context_size": null,
160
+ "hook_head_index": null,
161
+ "architecture": "topk",
162
+ "apply_b_dec_to_input": null,
163
+ "finetuning_scaling_factor": null,
164
+ "activation_fn_str": "",
165
+ "prepend_bos": true,
166
+ "normalize_activations": "none",
167
+ "dtype": "bfloat16",
168
+ "device": "",
169
+ "dataset_path": "",
170
+ "dataset_trust_remote_code": true,
171
+ "seqpos_slice": [
172
+ null
173
+ ],
174
+ "training_tokens": -100000,
175
+ "sae_lens_training_version": null,
176
+ "neuronpedia_id": null
177
+ },
178
+ "eval_result_unstructured": {
179
+ "LabHC/bias_in_bios_class_set1": {
180
+ "0": {
181
+ "tpp_threshold_2_total_metric": 0.006749987602233887,
182
+ "tpp_threshold_2_intended_diff_only": 0.009999990463256836,
183
+ "tpp_threshold_2_unintended_diff_only": 0.0032500028610229492,
184
+ "tpp_threshold_5_total_metric": 0.017000019550323486,
185
+ "tpp_threshold_5_intended_diff_only": 0.021000027656555176,
186
+ "tpp_threshold_5_unintended_diff_only": 0.0040000081062316895,
187
+ "tpp_threshold_10_total_metric": 0.009750038385391235,
188
+ "tpp_threshold_10_intended_diff_only": 0.01500004529953003,
189
+ "tpp_threshold_10_unintended_diff_only": 0.005250006914138794,
190
+ "tpp_threshold_20_total_metric": 0.029749974608421326,
191
+ "tpp_threshold_20_intended_diff_only": 0.03299999237060547,
192
+ "tpp_threshold_20_unintended_diff_only": 0.003250017762184143,
193
+ "tpp_threshold_50_total_metric": 0.046999990940093994,
194
+ "tpp_threshold_50_intended_diff_only": 0.050999999046325684,
195
+ "tpp_threshold_50_unintended_diff_only": 0.0040000081062316895,
196
+ "tpp_threshold_100_total_metric": 0.12275001406669617,
197
+ "tpp_threshold_100_intended_diff_only": 0.12800002098083496,
198
+ "tpp_threshold_100_unintended_diff_only": 0.005250006914138794,
199
+ "tpp_threshold_500_total_metric": 0.398250013589859,
200
+ "tpp_threshold_500_intended_diff_only": 0.4020000100135803,
201
+ "tpp_threshold_500_unintended_diff_only": 0.0037499964237213135
202
+ },
203
+ "1": {
204
+ "tpp_threshold_2_total_metric": 0.0050000399351119995,
205
+ "tpp_threshold_2_intended_diff_only": 0.005000054836273193,
206
+ "tpp_threshold_2_unintended_diff_only": 1.4901161193847656e-08,
207
+ "tpp_threshold_5_total_metric": 0.0025000274181365967,
208
+ "tpp_threshold_5_intended_diff_only": 0.003000020980834961,
209
+ "tpp_threshold_5_unintended_diff_only": 0.0004999935626983643,
210
+ "tpp_threshold_10_total_metric": 0.010499998927116394,
211
+ "tpp_threshold_10_intended_diff_only": 0.009000003337860107,
212
+ "tpp_threshold_10_unintended_diff_only": -0.0014999955892562866,
213
+ "tpp_threshold_20_total_metric": 0.0104999840259552,
214
+ "tpp_threshold_20_intended_diff_only": 0.013999998569488525,
215
+ "tpp_threshold_20_unintended_diff_only": 0.003500014543533325,
216
+ "tpp_threshold_50_total_metric": 0.04275001585483551,
217
+ "tpp_threshold_50_intended_diff_only": 0.0480000376701355,
218
+ "tpp_threshold_50_unintended_diff_only": 0.005250021815299988,
219
+ "tpp_threshold_100_total_metric": 0.0625,
220
+ "tpp_threshold_100_intended_diff_only": 0.0690000057220459,
221
+ "tpp_threshold_100_unintended_diff_only": 0.0065000057220458984,
222
+ "tpp_threshold_500_total_metric": 0.31550003588199615,
223
+ "tpp_threshold_500_intended_diff_only": 0.32600003480911255,
224
+ "tpp_threshold_500_unintended_diff_only": 0.010499998927116394
225
+ },
226
+ "2": {
227
+ "tpp_threshold_2_total_metric": 0.0007500052452087402,
228
+ "tpp_threshold_2_intended_diff_only": 0.003000020980834961,
229
+ "tpp_threshold_2_unintended_diff_only": 0.0022500157356262207,
230
+ "tpp_threshold_5_total_metric": 0.001499950885772705,
231
+ "tpp_threshold_5_intended_diff_only": 0.006999969482421875,
232
+ "tpp_threshold_5_unintended_diff_only": 0.00550001859664917,
233
+ "tpp_threshold_10_total_metric": 0.0337500125169754,
234
+ "tpp_threshold_10_intended_diff_only": 0.03600001335144043,
235
+ "tpp_threshold_10_unintended_diff_only": 0.002250000834465027,
236
+ "tpp_threshold_20_total_metric": 0.047749996185302734,
237
+ "tpp_threshold_20_intended_diff_only": 0.050999999046325684,
238
+ "tpp_threshold_20_unintended_diff_only": 0.0032500028610229492,
239
+ "tpp_threshold_50_total_metric": 0.07749998569488525,
240
+ "tpp_threshold_50_intended_diff_only": 0.07899999618530273,
241
+ "tpp_threshold_50_unintended_diff_only": 0.0015000104904174805,
242
+ "tpp_threshold_100_total_metric": 0.13199999928474426,
243
+ "tpp_threshold_100_intended_diff_only": 0.1340000033378601,
244
+ "tpp_threshold_100_unintended_diff_only": 0.0020000040531158447,
245
+ "tpp_threshold_500_total_metric": 0.37299999594688416,
246
+ "tpp_threshold_500_intended_diff_only": 0.3790000081062317,
247
+ "tpp_threshold_500_unintended_diff_only": 0.006000012159347534
248
+ },
249
+ "6": {
250
+ "tpp_threshold_2_total_metric": 0.0015000104904174805,
251
+ "tpp_threshold_2_intended_diff_only": 0.003000020980834961,
252
+ "tpp_threshold_2_unintended_diff_only": 0.0015000104904174805,
253
+ "tpp_threshold_5_total_metric": 0.003000035881996155,
254
+ "tpp_threshold_5_intended_diff_only": 0.0020000338554382324,
255
+ "tpp_threshold_5_unintended_diff_only": -0.0010000020265579224,
256
+ "tpp_threshold_10_total_metric": 0.005249977111816406,
257
+ "tpp_threshold_10_intended_diff_only": 0.0059999823570251465,
258
+ "tpp_threshold_10_unintended_diff_only": 0.0007500052452087402,
259
+ "tpp_threshold_20_total_metric": 0.004999995231628418,
260
+ "tpp_threshold_20_intended_diff_only": 0.004999995231628418,
261
+ "tpp_threshold_20_unintended_diff_only": 0.0,
262
+ "tpp_threshold_50_total_metric": 0.008500009775161743,
263
+ "tpp_threshold_50_intended_diff_only": 0.009000003337860107,
264
+ "tpp_threshold_50_unintended_diff_only": 0.0004999935626983643,
265
+ "tpp_threshold_100_total_metric": 0.02300003170967102,
266
+ "tpp_threshold_100_intended_diff_only": 0.025000035762786865,
267
+ "tpp_threshold_100_unintended_diff_only": 0.0020000040531158447,
268
+ "tpp_threshold_500_total_metric": 0.320000022649765,
269
+ "tpp_threshold_500_intended_diff_only": 0.32600003480911255,
270
+ "tpp_threshold_500_unintended_diff_only": 0.006000012159347534
271
+ },
272
+ "9": {
273
+ "tpp_threshold_2_total_metric": 0.006999969482421875,
274
+ "tpp_threshold_2_intended_diff_only": 0.009999990463256836,
275
+ "tpp_threshold_2_unintended_diff_only": 0.003000020980834961,
276
+ "tpp_threshold_5_total_metric": 0.004249989986419678,
277
+ "tpp_threshold_5_intended_diff_only": 0.009000003337860107,
278
+ "tpp_threshold_5_unintended_diff_only": 0.00475001335144043,
279
+ "tpp_threshold_10_total_metric": 0.006499975919723511,
280
+ "tpp_threshold_10_intended_diff_only": 0.010999977588653564,
281
+ "tpp_threshold_10_unintended_diff_only": 0.004500001668930054,
282
+ "tpp_threshold_20_total_metric": 0.030499979853630066,
283
+ "tpp_threshold_20_intended_diff_only": 0.03700000047683716,
284
+ "tpp_threshold_20_unintended_diff_only": 0.006500020623207092,
285
+ "tpp_threshold_50_total_metric": 0.07674999535083771,
286
+ "tpp_threshold_50_intended_diff_only": 0.0820000171661377,
287
+ "tpp_threshold_50_unintended_diff_only": 0.005250021815299988,
288
+ "tpp_threshold_100_total_metric": 0.15424998104572296,
289
+ "tpp_threshold_100_intended_diff_only": 0.16200000047683716,
290
+ "tpp_threshold_100_unintended_diff_only": 0.007750019431114197,
291
+ "tpp_threshold_500_total_metric": 0.42775000631809235,
292
+ "tpp_threshold_500_intended_diff_only": 0.43800002336502075,
293
+ "tpp_threshold_500_unintended_diff_only": 0.010250017046928406
294
+ }
295
+ },
296
+ "canrager/amazon_reviews_mcauley_1and5": {
297
+ "1": {
298
+ "tpp_threshold_2_total_metric": 0.00424996018409729,
299
+ "tpp_threshold_2_intended_diff_only": 0.007999956607818604,
300
+ "tpp_threshold_2_unintended_diff_only": 0.0037499964237213135,
301
+ "tpp_threshold_5_total_metric": 0.0015000104904174805,
302
+ "tpp_threshold_5_intended_diff_only": 0.004999995231628418,
303
+ "tpp_threshold_5_unintended_diff_only": 0.0034999847412109375,
304
+ "tpp_threshold_10_total_metric": 0.0007499605417251587,
305
+ "tpp_threshold_10_intended_diff_only": 0.007999956607818604,
306
+ "tpp_threshold_10_unintended_diff_only": 0.007249996066093445,
307
+ "tpp_threshold_20_total_metric": 0.001749962568283081,
308
+ "tpp_threshold_20_intended_diff_only": 0.006999969482421875,
309
+ "tpp_threshold_20_unintended_diff_only": 0.005250006914138794,
310
+ "tpp_threshold_50_total_metric": 0.011249944567680359,
311
+ "tpp_threshold_50_intended_diff_only": 0.012999951839447021,
312
+ "tpp_threshold_50_unintended_diff_only": 0.0017500072717666626,
313
+ "tpp_threshold_100_total_metric": 0.011499956250190735,
314
+ "tpp_threshold_100_intended_diff_only": 0.0209999680519104,
315
+ "tpp_threshold_100_unintended_diff_only": 0.009500011801719666,
316
+ "tpp_threshold_500_total_metric": 0.1262499988079071,
317
+ "tpp_threshold_500_intended_diff_only": 0.13899999856948853,
318
+ "tpp_threshold_500_unintended_diff_only": 0.012749999761581421
319
+ },
320
+ "2": {
321
+ "tpp_threshold_2_total_metric": 0.0040000081062316895,
322
+ "tpp_threshold_2_intended_diff_only": 0.004999995231628418,
323
+ "tpp_threshold_2_unintended_diff_only": 0.0009999871253967285,
324
+ "tpp_threshold_5_total_metric": -0.004500031471252441,
325
+ "tpp_threshold_5_intended_diff_only": 0.0029999613761901855,
326
+ "tpp_threshold_5_unintended_diff_only": 0.007499992847442627,
327
+ "tpp_threshold_10_total_metric": 0.00449998676776886,
328
+ "tpp_threshold_10_intended_diff_only": 0.010999977588653564,
329
+ "tpp_threshold_10_unintended_diff_only": 0.006499990820884705,
330
+ "tpp_threshold_20_total_metric": 0.009250015020370483,
331
+ "tpp_threshold_20_intended_diff_only": 0.013999998569488525,
332
+ "tpp_threshold_20_unintended_diff_only": 0.004749983549118042,
333
+ "tpp_threshold_50_total_metric": 0.03349998593330383,
334
+ "tpp_threshold_50_intended_diff_only": 0.04399996995925903,
335
+ "tpp_threshold_50_unintended_diff_only": 0.0104999840259552,
336
+ "tpp_threshold_100_total_metric": 0.06824997067451477,
337
+ "tpp_threshold_100_intended_diff_only": 0.08499997854232788,
338
+ "tpp_threshold_100_unintended_diff_only": 0.01675000786781311,
339
+ "tpp_threshold_500_total_metric": 0.25550003349781036,
340
+ "tpp_threshold_500_intended_diff_only": 0.2720000147819519,
341
+ "tpp_threshold_500_unintended_diff_only": 0.01649998128414154
342
+ },
343
+ "3": {
344
+ "tpp_threshold_2_total_metric": -0.006250008940696716,
345
+ "tpp_threshold_2_intended_diff_only": -0.003000020980834961,
346
+ "tpp_threshold_2_unintended_diff_only": 0.0032499879598617554,
347
+ "tpp_threshold_5_total_metric": 0.0027499645948410034,
348
+ "tpp_threshold_5_intended_diff_only": 0.0029999613761901855,
349
+ "tpp_threshold_5_unintended_diff_only": 0.00024999678134918213,
350
+ "tpp_threshold_10_total_metric": 0.0065000057220458984,
351
+ "tpp_threshold_10_intended_diff_only": 0.009999990463256836,
352
+ "tpp_threshold_10_unintended_diff_only": 0.0034999847412109375,
353
+ "tpp_threshold_20_total_metric": 0.006000012159347534,
354
+ "tpp_threshold_20_intended_diff_only": 0.013000011444091797,
355
+ "tpp_threshold_20_unintended_diff_only": 0.006999999284744263,
356
+ "tpp_threshold_50_total_metric": 0.02499997615814209,
357
+ "tpp_threshold_50_intended_diff_only": 0.029999971389770508,
358
+ "tpp_threshold_50_unintended_diff_only": 0.004999995231628418,
359
+ "tpp_threshold_100_total_metric": 0.06025001406669617,
360
+ "tpp_threshold_100_intended_diff_only": 0.07300001382827759,
361
+ "tpp_threshold_100_unintended_diff_only": 0.012749999761581421,
362
+ "tpp_threshold_500_total_metric": 0.19625000655651093,
363
+ "tpp_threshold_500_intended_diff_only": 0.22699999809265137,
364
+ "tpp_threshold_500_unintended_diff_only": 0.030749991536140442
365
+ },
366
+ "5": {
367
+ "tpp_threshold_2_total_metric": 0.006000012159347534,
368
+ "tpp_threshold_2_intended_diff_only": 0.009000003337860107,
369
+ "tpp_threshold_2_unintended_diff_only": 0.0029999911785125732,
370
+ "tpp_threshold_5_total_metric": 0.008500009775161743,
371
+ "tpp_threshold_5_intended_diff_only": 0.013999998569488525,
372
+ "tpp_threshold_5_unintended_diff_only": 0.005499988794326782,
373
+ "tpp_threshold_10_total_metric": 0.018500030040740967,
374
+ "tpp_threshold_10_intended_diff_only": 0.026000022888183594,
375
+ "tpp_threshold_10_unintended_diff_only": 0.007499992847442627,
376
+ "tpp_threshold_20_total_metric": 0.027250036597251892,
377
+ "tpp_threshold_20_intended_diff_only": 0.03600001335144043,
378
+ "tpp_threshold_20_unintended_diff_only": 0.008749976754188538,
379
+ "tpp_threshold_50_total_metric": 0.09700007736682892,
380
+ "tpp_threshold_50_intended_diff_only": 0.11600005626678467,
381
+ "tpp_threshold_50_unintended_diff_only": 0.01899997889995575,
382
+ "tpp_threshold_100_total_metric": 0.17525003850460052,
383
+ "tpp_threshold_100_intended_diff_only": 0.20200002193450928,
384
+ "tpp_threshold_100_unintended_diff_only": 0.026749983429908752,
385
+ "tpp_threshold_500_total_metric": 0.34125006198883057,
386
+ "tpp_threshold_500_intended_diff_only": 0.3720000386238098,
387
+ "tpp_threshold_500_unintended_diff_only": 0.030749976634979248
388
+ },
389
+ "6": {
390
+ "tpp_threshold_2_total_metric": 0.007499963045120239,
391
+ "tpp_threshold_2_intended_diff_only": 0.011999964714050293,
392
+ "tpp_threshold_2_unintended_diff_only": 0.004500001668930054,
393
+ "tpp_threshold_5_total_metric": 0.014499962329864502,
394
+ "tpp_threshold_5_intended_diff_only": 0.012999951839447021,
395
+ "tpp_threshold_5_unintended_diff_only": -0.0015000104904174805,
396
+ "tpp_threshold_10_total_metric": 0.039000004529953,
397
+ "tpp_threshold_10_intended_diff_only": 0.041999995708465576,
398
+ "tpp_threshold_10_unintended_diff_only": 0.0029999911785125732,
399
+ "tpp_threshold_20_total_metric": 0.048500001430511475,
400
+ "tpp_threshold_20_intended_diff_only": 0.05199998617172241,
401
+ "tpp_threshold_20_unintended_diff_only": 0.0034999847412109375,
402
+ "tpp_threshold_50_total_metric": 0.09149996936321259,
403
+ "tpp_threshold_50_intended_diff_only": 0.09999996423721313,
404
+ "tpp_threshold_50_unintended_diff_only": 0.00849999487400055,
405
+ "tpp_threshold_100_total_metric": 0.18474996089935303,
406
+ "tpp_threshold_100_intended_diff_only": 0.1929999589920044,
407
+ "tpp_threshold_100_unintended_diff_only": 0.008249998092651367,
408
+ "tpp_threshold_500_total_metric": 0.3462499976158142,
409
+ "tpp_threshold_500_intended_diff_only": 0.3619999885559082,
410
+ "tpp_threshold_500_unintended_diff_only": 0.015749990940093994
411
+ }
412
+ }
413
+ }
414
+ }
eval_results_from_scratch/unlearning/kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0_custom_sae_eval_results.json ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_type_id": "unlearning",
3
+ "eval_config": {
4
+ "random_seed": 42,
5
+ "dataset_names": [
6
+ "wmdp-bio",
7
+ "high_school_us_history",
8
+ "college_computer_science",
9
+ "high_school_geography",
10
+ "human_aging"
11
+ ],
12
+ "intervention_method": "clamp_feature_activation",
13
+ "retain_thresholds": [
14
+ 0.001,
15
+ 0.01
16
+ ],
17
+ "n_features_list": [
18
+ 10,
19
+ 20
20
+ ],
21
+ "multipliers": [
22
+ 25,
23
+ 50,
24
+ 100,
25
+ 200
26
+ ],
27
+ "dataset_size": 1024,
28
+ "seq_len": 1024,
29
+ "n_batch_loss_added": 50,
30
+ "target_metric": "correct",
31
+ "save_metrics": true,
32
+ "model_name": "gemma-2-2b-it",
33
+ "llm_batch_size": 4,
34
+ "llm_dtype": "bfloat16"
35
+ },
36
+ "eval_id": "9c4fef1a-ae28-4280-b511-c8d59c94496f",
37
+ "datetime_epoch_millis": 1740170757553,
38
+ "eval_result_metrics": {
39
+ "unlearning": {
40
+ "unlearning_score": 0.022514045238494873
41
+ }
42
+ },
43
+ "eval_result_details": [],
44
+ "sae_bench_commit_hash": "d91a218b4cc4ac6c164d0e1b739c8437901c7acd",
45
+ "sae_lens_id": "custom_sae",
46
+ "sae_lens_release_id": "kl_finetunes_from_scratch_2pow16_trainer_160_peft_sae_from_scratch_0.0",
47
+ "sae_lens_version": "5.4.2",
48
+ "sae_cfg_dict": {
49
+ "model_name": "gemma-2-2b",
50
+ "d_in": 2304,
51
+ "d_sae": 65536,
52
+ "hook_layer": 12,
53
+ "hook_name": "blocks.12.hook_resid_post",
54
+ "context_size": null,
55
+ "hook_head_index": null,
56
+ "architecture": "topk",
57
+ "apply_b_dec_to_input": null,
58
+ "finetuning_scaling_factor": null,
59
+ "activation_fn_str": "",
60
+ "prepend_bos": true,
61
+ "normalize_activations": "none",
62
+ "dtype": "bfloat16",
63
+ "device": "",
64
+ "dataset_path": "",
65
+ "dataset_trust_remote_code": true,
66
+ "seqpos_slice": [
67
+ null
68
+ ],
69
+ "training_tokens": -100000,
70
+ "sae_lens_training_version": null,
71
+ "neuronpedia_id": null
72
+ },
73
+ "eval_result_unstructured": null
74
+ }