Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +36 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_3_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_5_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_3_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_5_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_3_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_5_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_3_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_5_custom_sae_eval_results.json +268 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_3_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_5_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_3_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_5_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_3_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_5_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_3_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_5_custom_sae_eval_results.json +3 -0
- eval_results_finetunes/core/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,39 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
37 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
38 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_5_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
39 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_5_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
40 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
41 |
+
eval_results_finetunes/core/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_3_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
42 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
43 |
+
eval_results_finetunes/core/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
44 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_3_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
45 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
46 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
47 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
48 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
49 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_5_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
50 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
51 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
52 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
53 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
54 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
55 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_3_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
56 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
57 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
58 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_5_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
59 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
60 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_3_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
61 |
+
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_3_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
62 |
+
eval_results_finetunes/core/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
63 |
+
eval_results_finetunes/core/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_3_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
64 |
+
eval_results_finetunes/core/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
65 |
+
eval_results_finetunes/core/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
66 |
+
eval_results_finetunes/core/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_5_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
67 |
+
eval_results_finetunes/core/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
68 |
+
eval_results_finetunes/core/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_5_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
69 |
+
eval_results_finetunes/core/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
70 |
+
eval_results_finetunes/core/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
71 |
+
eval_results_finetunes/core/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "26cf7785-b764-4b2f-9cd4-b388819364f4",
|
17 |
+
"datetime_epoch_millis": 1740072497386,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.08756063184284377,
|
21 |
+
"mean_full_absorption_score": 0.0026708784824093696,
|
22 |
+
"mean_num_split_features": 1.1153846153846154,
|
23 |
+
"std_dev_absorption_fraction_score": 0.10310418368085361,
|
24 |
+
"std_dev_full_absorption_score": 0.004668474177881613,
|
25 |
+
"std_dev_num_split_features": 0.3258125936084211
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.08140821007145073,
|
32 |
+
"full_absorption_rate": 0.0,
|
33 |
+
"num_full_absorption": 0,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.01681192611231011,
|
40 |
+
"full_absorption_rate": 0.0006485084306095979,
|
41 |
+
"num_full_absorption": 1,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.35926263607439873,
|
48 |
+
"full_absorption_rate": 0.012121212121212121,
|
49 |
+
"num_full_absorption": 34,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 1
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.028519339166013014,
|
56 |
+
"full_absorption_rate": 0.0006024096385542169,
|
57 |
+
"num_full_absorption": 1,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 2
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.08212543463683289,
|
64 |
+
"full_absorption_rate": 0.0024752475247524753,
|
65 |
+
"num_full_absorption": 4,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 2
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.015080013168122549,
|
72 |
+
"full_absorption_rate": 0.0,
|
73 |
+
"num_full_absorption": 0,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.030999187614493637,
|
80 |
+
"full_absorption_rate": 0.0,
|
81 |
+
"num_full_absorption": 0,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 2
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.011943369017875526,
|
88 |
+
"full_absorption_rate": 0.000966183574879227,
|
89 |
+
"num_full_absorption": 1,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.1279149210080362,
|
96 |
+
"full_absorption_rate": 0.0,
|
97 |
+
"num_full_absorption": 0,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 1
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.015672704411350203,
|
104 |
+
"full_absorption_rate": 0.0,
|
105 |
+
"num_full_absorption": 0,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.020683345579076196,
|
112 |
+
"full_absorption_rate": 0.0,
|
113 |
+
"num_full_absorption": 0,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.038069472005201645,
|
120 |
+
"full_absorption_rate": 0.0,
|
121 |
+
"num_full_absorption": 0,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.017109632297467238,
|
128 |
+
"full_absorption_rate": 0.001098297638660077,
|
129 |
+
"num_full_absorption": 2,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.0499469606653892,
|
136 |
+
"full_absorption_rate": 0.0,
|
137 |
+
"num_full_absorption": 0,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.19349678964164968,
|
144 |
+
"full_absorption_rate": 0.005623242736644799,
|
145 |
+
"num_full_absorption": 6,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.18385323993434824,
|
152 |
+
"full_absorption_rate": 0.0,
|
153 |
+
"num_full_absorption": 0,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.027994167957066447,
|
160 |
+
"full_absorption_rate": 0.0,
|
161 |
+
"num_full_absorption": 0,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.05001762156507236,
|
168 |
+
"full_absorption_rate": 0.0,
|
169 |
+
"num_full_absorption": 0,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 1
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.24798331778260943,
|
176 |
+
"full_absorption_rate": 0.008906305664410403,
|
177 |
+
"num_full_absorption": 25,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 1
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.030864278273271966,
|
184 |
+
"full_absorption_rate": 0.0,
|
185 |
+
"num_full_absorption": 0,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.058126416439307944,
|
192 |
+
"full_absorption_rate": 0.0026490066225165563,
|
193 |
+
"num_full_absorption": 2,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.02767831174987224,
|
200 |
+
"full_absorption_rate": 0.0025348542458808617,
|
201 |
+
"num_full_absorption": 2,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.04302083964242241,
|
208 |
+
"full_absorption_rate": 0.0027548209366391185,
|
209 |
+
"num_full_absorption": 2,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.3811482605125039,
|
216 |
+
"full_absorption_rate": 0.017699115044247787,
|
217 |
+
"num_full_absorption": 2,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.09440012354552016,
|
224 |
+
"full_absorption_rate": 0.011363636363636364,
|
225 |
+
"num_full_absorption": 2,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.04244590904227524,
|
232 |
+
"full_absorption_rate": 0.0,
|
233 |
+
"num_full_absorption": 0,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_0",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 16384,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "standard_april_update",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "a406d96b-4764-4870-87ad-ffb6f39febf6",
|
17 |
+
"datetime_epoch_millis": 1740070455827,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.10957305234695539,
|
21 |
+
"mean_full_absorption_score": 0.009237782749944001,
|
22 |
+
"mean_num_split_features": 1.2692307692307692,
|
23 |
+
"std_dev_absorption_fraction_score": 0.13197935301164276,
|
24 |
+
"std_dev_full_absorption_score": 0.02373718118524806,
|
25 |
+
"std_dev_num_split_features": 0.533493565673837
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.13154150609047785,
|
32 |
+
"full_absorption_rate": 0.00039872408293460925,
|
33 |
+
"num_full_absorption": 1,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.02933484749706098,
|
40 |
+
"full_absorption_rate": 0.0006485084306095979,
|
41 |
+
"num_full_absorption": 1,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.5181347299930564,
|
48 |
+
"full_absorption_rate": 0.057754010695187166,
|
49 |
+
"num_full_absorption": 162,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 1
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.06592090170151613,
|
56 |
+
"full_absorption_rate": 0.0018072289156626507,
|
57 |
+
"num_full_absorption": 3,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 3
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.1328337800923092,
|
64 |
+
"full_absorption_rate": 0.003094059405940594,
|
65 |
+
"num_full_absorption": 5,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 2
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.02871150615595956,
|
72 |
+
"full_absorption_rate": 0.0,
|
73 |
+
"num_full_absorption": 0,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.03038662265781444,
|
80 |
+
"full_absorption_rate": 0.0008733624454148472,
|
81 |
+
"num_full_absorption": 1,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 2
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.012250889606148977,
|
88 |
+
"full_absorption_rate": 0.001932367149758454,
|
89 |
+
"num_full_absorption": 2,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.18872682839441868,
|
96 |
+
"full_absorption_rate": 0.008547008547008548,
|
97 |
+
"num_full_absorption": 14,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 2
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.006006265772572784,
|
104 |
+
"full_absorption_rate": 0.0,
|
105 |
+
"num_full_absorption": 0,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.0139049237721171,
|
112 |
+
"full_absorption_rate": 0.0,
|
113 |
+
"num_full_absorption": 0,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.08373047207179418,
|
120 |
+
"full_absorption_rate": 0.000856898029134533,
|
121 |
+
"num_full_absorption": 1,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.051899966232708414,
|
128 |
+
"full_absorption_rate": 0.003844041735310269,
|
129 |
+
"num_full_absorption": 7,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.06348820232865938,
|
136 |
+
"full_absorption_rate": 0.0,
|
137 |
+
"num_full_absorption": 0,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.20622782125334888,
|
144 |
+
"full_absorption_rate": 0.015932521087160263,
|
145 |
+
"num_full_absorption": 17,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.28848948459991025,
|
152 |
+
"full_absorption_rate": 0.004820333041191937,
|
153 |
+
"num_full_absorption": 11,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.015337079699260047,
|
160 |
+
"full_absorption_rate": 0.0,
|
161 |
+
"num_full_absorption": 0,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.07880539011960401,
|
168 |
+
"full_absorption_rate": 0.0011757789535567313,
|
169 |
+
"num_full_absorption": 2,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 2
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.47082284356769066,
|
176 |
+
"full_absorption_rate": 0.11115069469184183,
|
177 |
+
"num_full_absorption": 312,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 1
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.05863498588324972,
|
184 |
+
"full_absorption_rate": 0.0,
|
185 |
+
"num_full_absorption": 0,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.08018276488751244,
|
192 |
+
"full_absorption_rate": 0.0026490066225165563,
|
193 |
+
"num_full_absorption": 2,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.014063794282346212,
|
200 |
+
"full_absorption_rate": 0.005069708491761723,
|
201 |
+
"num_full_absorption": 4,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.05768679269514189,
|
208 |
+
"full_absorption_rate": 0.008264462809917356,
|
209 |
+
"num_full_absorption": 6,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.12267083087401717,
|
216 |
+
"full_absorption_rate": 0.0,
|
217 |
+
"num_full_absorption": 0,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 2
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.06181458936192774,
|
224 |
+
"full_absorption_rate": 0.011363636363636364,
|
225 |
+
"num_full_absorption": 2,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.0372915414302168,
|
232 |
+
"full_absorption_rate": 0.0,
|
233 |
+
"num_full_absorption": 0,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_1",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 16384,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "standard_april_update",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "0a428c0f-b6b1-48a7-bb19-d7086f4270d6",
|
17 |
+
"datetime_epoch_millis": 1740073177210,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.2006084021429049,
|
21 |
+
"mean_full_absorption_score": 0.04185121640812415,
|
22 |
+
"mean_num_split_features": 1.1923076923076923,
|
23 |
+
"std_dev_absorption_fraction_score": 0.1932477405174448,
|
24 |
+
"std_dev_full_absorption_score": 0.07870423678832646,
|
25 |
+
"std_dev_num_split_features": 0.4019184762342502
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.3269017199955651,
|
32 |
+
"full_absorption_rate": 0.006778309409888357,
|
33 |
+
"num_full_absorption": 17,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.3153361615591299,
|
40 |
+
"full_absorption_rate": 0.01621271076523995,
|
41 |
+
"num_full_absorption": 25,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.7681073287240997,
|
48 |
+
"full_absorption_rate": 0.3679144385026738,
|
49 |
+
"num_full_absorption": 1032,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 1
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.4844772034986525,
|
56 |
+
"full_absorption_rate": 0.15963855421686746,
|
57 |
+
"num_full_absorption": 265,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 1
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.4189963358401263,
|
64 |
+
"full_absorption_rate": 0.08787128712871287,
|
65 |
+
"num_full_absorption": 142,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 1
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.0906370665145521,
|
72 |
+
"full_absorption_rate": 0.0016155088852988692,
|
73 |
+
"num_full_absorption": 2,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.05344715412809589,
|
80 |
+
"full_absorption_rate": 0.0017467248908296944,
|
81 |
+
"num_full_absorption": 2,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 2
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.017331113075409323,
|
88 |
+
"full_absorption_rate": 0.001932367149758454,
|
89 |
+
"num_full_absorption": 2,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.27869038889672915,
|
96 |
+
"full_absorption_rate": 0.036019536019536016,
|
97 |
+
"num_full_absorption": 59,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 2
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.02339215944422728,
|
104 |
+
"full_absorption_rate": 0.0048543689320388345,
|
105 |
+
"num_full_absorption": 2,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.010137989127819028,
|
112 |
+
"full_absorption_rate": 0.0014814814814814814,
|
113 |
+
"num_full_absorption": 1,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.16146520974781348,
|
120 |
+
"full_absorption_rate": 0.017994858611825194,
|
121 |
+
"num_full_absorption": 21,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.12957510375875406,
|
128 |
+
"full_absorption_rate": 0.0060406370126304225,
|
129 |
+
"num_full_absorption": 11,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 2
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.067508810290958,
|
136 |
+
"full_absorption_rate": 0.0,
|
137 |
+
"num_full_absorption": 0,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.30204240974545166,
|
144 |
+
"full_absorption_rate": 0.061855670103092786,
|
145 |
+
"num_full_absorption": 66,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.5004589950119995,
|
152 |
+
"full_absorption_rate": 0.09640666082383874,
|
153 |
+
"num_full_absorption": 220,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.02447369883434256,
|
160 |
+
"full_absorption_rate": 0.0,
|
161 |
+
"num_full_absorption": 0,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.13838726937556015,
|
168 |
+
"full_absorption_rate": 0.011757789535567314,
|
169 |
+
"num_full_absorption": 20,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 2
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.4160579727849388,
|
176 |
+
"full_absorption_rate": 0.11364446027787674,
|
177 |
+
"num_full_absorption": 319,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 1
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.1625731588674795,
|
184 |
+
"full_absorption_rate": 0.00471976401179941,
|
185 |
+
"num_full_absorption": 8,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.1731853313821131,
|
192 |
+
"full_absorption_rate": 0.05695364238410596,
|
193 |
+
"num_full_absorption": 43,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 2
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.041442721515423414,
|
200 |
+
"full_absorption_rate": 0.0038022813688212928,
|
201 |
+
"num_full_absorption": 3,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.06787815048597441,
|
208 |
+
"full_absorption_rate": 0.005509641873278237,
|
209 |
+
"num_full_absorption": 4,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.19401323494686296,
|
216 |
+
"full_absorption_rate": 0.017699115044247787,
|
217 |
+
"num_full_absorption": 2,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.03275891889001838,
|
224 |
+
"full_absorption_rate": 0.005681818181818182,
|
225 |
+
"num_full_absorption": 1,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.016542849273431318,
|
232 |
+
"full_absorption_rate": 0.0,
|
233 |
+
"num_full_absorption": 0,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_2",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 16384,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "standard_april_update",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_3_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "cbe48292-6c0e-450f-a703-383be5405b0b",
|
17 |
+
"datetime_epoch_millis": 1740073855524,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.3073508786945687,
|
21 |
+
"mean_full_absorption_score": 0.12017129252511788,
|
22 |
+
"mean_num_split_features": 1.1923076923076923,
|
23 |
+
"std_dev_absorption_fraction_score": 0.23919102364176312,
|
24 |
+
"std_dev_full_absorption_score": 0.14148906885538548,
|
25 |
+
"std_dev_num_split_features": 0.6336706254344299
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.7535526101510682,
|
32 |
+
"full_absorption_rate": 0.3616427432216906,
|
33 |
+
"num_full_absorption": 907,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.443139983018839,
|
40 |
+
"full_absorption_rate": 0.1556420233463035,
|
41 |
+
"num_full_absorption": 240,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.8014968498975634,
|
48 |
+
"full_absorption_rate": 0.4859180035650624,
|
49 |
+
"num_full_absorption": 1363,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 1
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.5862511734750558,
|
56 |
+
"full_absorption_rate": 0.27951807228915665,
|
57 |
+
"num_full_absorption": 464,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 1
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.5525257217578151,
|
64 |
+
"full_absorption_rate": 0.27722772277227725,
|
65 |
+
"num_full_absorption": 448,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 1
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.25453275102142786,
|
72 |
+
"full_absorption_rate": 0.04361873990306947,
|
73 |
+
"num_full_absorption": 54,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.28917468092769205,
|
80 |
+
"full_absorption_rate": 0.10393013100436681,
|
81 |
+
"num_full_absorption": 119,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 1
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.08357516271680715,
|
88 |
+
"full_absorption_rate": 0.004830917874396135,
|
89 |
+
"num_full_absorption": 5,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.2977907134368573,
|
96 |
+
"full_absorption_rate": 0.09584859584859586,
|
97 |
+
"num_full_absorption": 157,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 1
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.059716156278724446,
|
104 |
+
"full_absorption_rate": 0.0024271844660194173,
|
105 |
+
"num_full_absorption": 1,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.030590905131696624,
|
112 |
+
"full_absorption_rate": 0.005925925925925926,
|
113 |
+
"num_full_absorption": 4,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.291295481380707,
|
120 |
+
"full_absorption_rate": 0.0702656383890317,
|
121 |
+
"num_full_absorption": 82,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.41191895007535845,
|
128 |
+
"full_absorption_rate": 0.09115870400878638,
|
129 |
+
"num_full_absorption": 166,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 4
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.15726987522678162,
|
136 |
+
"full_absorption_rate": 0.02392947103274559,
|
137 |
+
"num_full_absorption": 19,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.4208217252218873,
|
144 |
+
"full_absorption_rate": 0.14245548266166824,
|
145 |
+
"num_full_absorption": 152,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.6289941240335383,
|
152 |
+
"full_absorption_rate": 0.27256792287467135,
|
153 |
+
"num_full_absorption": 622,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.071798199256888,
|
160 |
+
"full_absorption_rate": 0.010526315789473684,
|
161 |
+
"num_full_absorption": 2,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.19783672547023598,
|
168 |
+
"full_absorption_rate": 0.021164021164021163,
|
169 |
+
"num_full_absorption": 36,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 2
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.6855947838560892,
|
176 |
+
"full_absorption_rate": 0.40969006056287854,
|
177 |
+
"num_full_absorption": 1150,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 2
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.28160576517739044,
|
184 |
+
"full_absorption_rate": 0.061946902654867256,
|
185 |
+
"num_full_absorption": 105,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.3022976453913539,
|
192 |
+
"full_absorption_rate": 0.1629139072847682,
|
193 |
+
"num_full_absorption": 123,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.06957494713769145,
|
200 |
+
"full_absorption_rate": 0.022813688212927757,
|
201 |
+
"num_full_absorption": 18,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.13656791006773783,
|
208 |
+
"full_absorption_rate": 0.009641873278236915,
|
209 |
+
"num_full_absorption": 7,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.1448744395789566,
|
216 |
+
"full_absorption_rate": 0.008849557522123894,
|
217 |
+
"num_full_absorption": 1,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.01812094705888303,
|
224 |
+
"full_absorption_rate": 0.0,
|
225 |
+
"num_full_absorption": 0,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.020204619311739412,
|
232 |
+
"full_absorption_rate": 0.0,
|
233 |
+
"num_full_absorption": 0,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_3",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 16384,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "standard_april_update",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "825c5398-a5f2-4bdc-aed3-be71f8948c48",
|
17 |
+
"datetime_epoch_millis": 1740071810704,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.31437811121228376,
|
21 |
+
"mean_full_absorption_score": 0.14400194476050063,
|
22 |
+
"mean_num_split_features": 1.3076923076923077,
|
23 |
+
"std_dev_absorption_fraction_score": 0.21692006101845815,
|
24 |
+
"std_dev_full_absorption_score": 0.14764320616941257,
|
25 |
+
"std_dev_num_split_features": 0.7358929688062399
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.6700946207748417,
|
32 |
+
"full_absorption_rate": 0.27910685805422647,
|
33 |
+
"num_full_absorption": 700,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.24812878681697556,
|
40 |
+
"full_absorption_rate": 0.05966277561608301,
|
41 |
+
"num_full_absorption": 92,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 2
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.6216311352605427,
|
48 |
+
"full_absorption_rate": 0.3479500891265597,
|
49 |
+
"num_full_absorption": 976,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 4
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.48471932315961974,
|
56 |
+
"full_absorption_rate": 0.2246987951807229,
|
57 |
+
"num_full_absorption": 373,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 1
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.5598971921091018,
|
64 |
+
"full_absorption_rate": 0.3100247524752475,
|
65 |
+
"num_full_absorption": 501,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 1
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.37532803679140847,
|
72 |
+
"full_absorption_rate": 0.11550888529886914,
|
73 |
+
"num_full_absorption": 143,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.238732932480143,
|
80 |
+
"full_absorption_rate": 0.08122270742358079,
|
81 |
+
"num_full_absorption": 93,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 1
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.14743501683060636,
|
88 |
+
"full_absorption_rate": 0.02318840579710145,
|
89 |
+
"num_full_absorption": 24,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.35531076845345727,
|
96 |
+
"full_absorption_rate": 0.14224664224664224,
|
97 |
+
"num_full_absorption": 233,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 2
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.06722788757390594,
|
104 |
+
"full_absorption_rate": 0.0048543689320388345,
|
105 |
+
"num_full_absorption": 2,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.025713322600569993,
|
112 |
+
"full_absorption_rate": 0.0044444444444444444,
|
113 |
+
"num_full_absorption": 3,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.2608917646713939,
|
120 |
+
"full_absorption_rate": 0.10025706940874037,
|
121 |
+
"num_full_absorption": 117,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.5039165322808883,
|
128 |
+
"full_absorption_rate": 0.2311916529379462,
|
129 |
+
"num_full_absorption": 421,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.16836587854963553,
|
136 |
+
"full_absorption_rate": 0.031486146095717885,
|
137 |
+
"num_full_absorption": 25,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.26251439529076037,
|
144 |
+
"full_absorption_rate": 0.08809746954076851,
|
145 |
+
"num_full_absorption": 94,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.7250538832113292,
|
152 |
+
"full_absorption_rate": 0.42375109553023665,
|
153 |
+
"num_full_absorption": 967,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.08274907313028433,
|
160 |
+
"full_absorption_rate": 0.005263157894736842,
|
161 |
+
"num_full_absorption": 1,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.35246887286225786,
|
168 |
+
"full_absorption_rate": 0.0993533215755438,
|
169 |
+
"num_full_absorption": 169,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 2
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.6186526433047977,
|
176 |
+
"full_absorption_rate": 0.4324902030637692,
|
177 |
+
"num_full_absorption": 1214,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 3
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.35549654950125187,
|
184 |
+
"full_absorption_rate": 0.13274336283185842,
|
185 |
+
"num_full_absorption": 225,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.548820273967667,
|
192 |
+
"full_absorption_rate": 0.4728476821192053,
|
193 |
+
"num_full_absorption": 357,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.09142207483594575,
|
200 |
+
"full_absorption_rate": 0.032953105196451206,
|
201 |
+
"num_full_absorption": 26,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.17764068045934867,
|
208 |
+
"full_absorption_rate": 0.03581267217630854,
|
209 |
+
"num_full_absorption": 26,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.12268279551514197,
|
216 |
+
"full_absorption_rate": 0.02654867256637168,
|
217 |
+
"num_full_absorption": 3,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.08832088254864336,
|
224 |
+
"full_absorption_rate": 0.03409090909090909,
|
225 |
+
"num_full_absorption": 6,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.020615568538858722,
|
232 |
+
"full_absorption_rate": 0.00425531914893617,
|
233 |
+
"num_full_absorption": 1,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_4",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 16384,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "standard_april_update",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_5_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "1a765d01-f368-4f32-ac3f-112701f7a594",
|
17 |
+
"datetime_epoch_millis": 1740071130337,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.3576698915735376,
|
21 |
+
"mean_full_absorption_score": 0.19249329486644487,
|
22 |
+
"mean_num_split_features": 1.6153846153846154,
|
23 |
+
"std_dev_absorption_fraction_score": 0.2065944618933617,
|
24 |
+
"std_dev_full_absorption_score": 0.1505229817991518,
|
25 |
+
"std_dev_num_split_features": 1.061203960675725
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.6340099406185813,
|
32 |
+
"full_absorption_rate": 0.3141945773524721,
|
33 |
+
"num_full_absorption": 788,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 2
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.3637546041985076,
|
40 |
+
"full_absorption_rate": 0.17380025940337224,
|
41 |
+
"num_full_absorption": 268,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 2
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.5699543239021706,
|
48 |
+
"full_absorption_rate": 0.33475935828877007,
|
49 |
+
"num_full_absorption": 939,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 5
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.42378677187369035,
|
56 |
+
"full_absorption_rate": 0.23132530120481928,
|
57 |
+
"num_full_absorption": 384,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 2
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.37917700106716457,
|
64 |
+
"full_absorption_rate": 0.1806930693069307,
|
65 |
+
"num_full_absorption": 292,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 3
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.2694528890929361,
|
72 |
+
"full_absorption_rate": 0.09773828756058159,
|
73 |
+
"num_full_absorption": 121,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.33762684174409513,
|
80 |
+
"full_absorption_rate": 0.2034934497816594,
|
81 |
+
"num_full_absorption": 233,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 1
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.21553467858627987,
|
88 |
+
"full_absorption_rate": 0.05990338164251208,
|
89 |
+
"num_full_absorption": 62,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.3133461994941835,
|
96 |
+
"full_absorption_rate": 0.1221001221001221,
|
97 |
+
"num_full_absorption": 200,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 2
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.09022134602155767,
|
104 |
+
"full_absorption_rate": 0.014563106796116505,
|
105 |
+
"num_full_absorption": 6,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.05866300627458284,
|
112 |
+
"full_absorption_rate": 0.008888888888888889,
|
113 |
+
"num_full_absorption": 6,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.4974789370506606,
|
120 |
+
"full_absorption_rate": 0.3273350471293916,
|
121 |
+
"num_full_absorption": 382,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.5123970181859486,
|
128 |
+
"full_absorption_rate": 0.2904997254255903,
|
129 |
+
"num_full_absorption": 529,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 3
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.26753510683966114,
|
136 |
+
"full_absorption_rate": 0.0982367758186398,
|
137 |
+
"num_full_absorption": 78,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.28532511297821034,
|
144 |
+
"full_absorption_rate": 0.1246485473289597,
|
145 |
+
"num_full_absorption": 133,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.7526527841045251,
|
152 |
+
"full_absorption_rate": 0.46757230499561786,
|
153 |
+
"num_full_absorption": 1067,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 2
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.2352011783959387,
|
160 |
+
"full_absorption_rate": 0.06842105263157895,
|
161 |
+
"num_full_absorption": 13,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.5935578512060531,
|
168 |
+
"full_absorption_rate": 0.38741916519694297,
|
169 |
+
"num_full_absorption": 659,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 1
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.6336584581511494,
|
176 |
+
"full_absorption_rate": 0.4720342002137513,
|
177 |
+
"num_full_absorption": 1325,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 4
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.6346142754558641,
|
184 |
+
"full_absorption_rate": 0.35988200589970504,
|
185 |
+
"num_full_absorption": 610,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.5560926658622606,
|
192 |
+
"full_absorption_rate": 0.41324503311258276,
|
193 |
+
"num_full_absorption": 312,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.13204581985673397,
|
200 |
+
"full_absorption_rate": 0.03929024081115336,
|
201 |
+
"num_full_absorption": 31,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.2592258683246165,
|
208 |
+
"full_absorption_rate": 0.12258953168044077,
|
209 |
+
"num_full_absorption": 89,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.14300134539704432,
|
216 |
+
"full_absorption_rate": 0.035398230088495575,
|
217 |
+
"num_full_absorption": 4,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.10338290562531721,
|
224 |
+
"full_absorption_rate": 0.03977272727272727,
|
225 |
+
"num_full_absorption": 7,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.03772025060424493,
|
232 |
+
"full_absorption_rate": 0.01702127659574468,
|
233 |
+
"num_full_absorption": 4,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_5",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 16384,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "standard_april_update",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "ac7d6a5d-0d00-4a5b-abd1-3aeae988bb97",
|
17 |
+
"datetime_epoch_millis": 1740106423402,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.3354590245907481,
|
21 |
+
"mean_full_absorption_score": 0.031849435853675334,
|
22 |
+
"mean_num_split_features": 1.0,
|
23 |
+
"std_dev_absorption_fraction_score": 0.1793908795821037,
|
24 |
+
"std_dev_full_absorption_score": 0.07822514491503704,
|
25 |
+
"std_dev_num_split_features": 0.0
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.6705902831278018,
|
32 |
+
"full_absorption_rate": 0.04824561403508772,
|
33 |
+
"num_full_absorption": 121,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.3728156359136282,
|
40 |
+
"full_absorption_rate": 0.00648508430609598,
|
41 |
+
"num_full_absorption": 10,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.5348743583874349,
|
48 |
+
"full_absorption_rate": 0.020320855614973262,
|
49 |
+
"num_full_absorption": 57,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 1
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.39798907235407177,
|
56 |
+
"full_absorption_rate": 0.02469879518072289,
|
57 |
+
"num_full_absorption": 41,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 1
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.533886743466549,
|
64 |
+
"full_absorption_rate": 0.01485148514851485,
|
65 |
+
"num_full_absorption": 24,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 1
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.27790151084705284,
|
72 |
+
"full_absorption_rate": 0.0024232633279483036,
|
73 |
+
"num_full_absorption": 3,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.27757546015014967,
|
80 |
+
"full_absorption_rate": 0.03668122270742358,
|
81 |
+
"num_full_absorption": 42,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 1
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.2768691562378332,
|
88 |
+
"full_absorption_rate": 0.007729468599033816,
|
89 |
+
"num_full_absorption": 8,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.82576509541867,
|
96 |
+
"full_absorption_rate": 0.4084249084249084,
|
97 |
+
"num_full_absorption": 669,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 1
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.17559629722958997,
|
104 |
+
"full_absorption_rate": 0.012135922330097087,
|
105 |
+
"num_full_absorption": 5,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.09880944880998055,
|
112 |
+
"full_absorption_rate": 0.005925925925925926,
|
113 |
+
"num_full_absorption": 4,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.2382625256082734,
|
120 |
+
"full_absorption_rate": 0.005998286203941731,
|
121 |
+
"num_full_absorption": 7,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.31249057733024366,
|
128 |
+
"full_absorption_rate": 0.007138934651290499,
|
129 |
+
"num_full_absorption": 13,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.28150837185990335,
|
136 |
+
"full_absorption_rate": 0.012594458438287154,
|
137 |
+
"num_full_absorption": 10,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.480505577799689,
|
144 |
+
"full_absorption_rate": 0.05248359887535145,
|
145 |
+
"num_full_absorption": 56,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.47784453048938713,
|
152 |
+
"full_absorption_rate": 0.018843120070113933,
|
153 |
+
"num_full_absorption": 43,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.14627500048163214,
|
160 |
+
"full_absorption_rate": 0.0,
|
161 |
+
"num_full_absorption": 0,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.42047647042346825,
|
168 |
+
"full_absorption_rate": 0.02821869488536155,
|
169 |
+
"num_full_absorption": 48,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 1
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.46815558803513757,
|
176 |
+
"full_absorption_rate": 0.032418952618453865,
|
177 |
+
"num_full_absorption": 91,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 1
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.31041426393272636,
|
184 |
+
"full_absorption_rate": 0.008259587020648967,
|
185 |
+
"num_full_absorption": 14,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.3376080066325288,
|
192 |
+
"full_absorption_rate": 0.04105960264900662,
|
193 |
+
"num_full_absorption": 31,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.20292887001215182,
|
200 |
+
"full_absorption_rate": 0.0063371356147021544,
|
201 |
+
"num_full_absorption": 5,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.17713092282140977,
|
208 |
+
"full_absorption_rate": 0.005509641873278237,
|
209 |
+
"num_full_absorption": 4,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.15669316359265703,
|
216 |
+
"full_absorption_rate": 0.0,
|
217 |
+
"num_full_absorption": 0,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.183909327242874,
|
224 |
+
"full_absorption_rate": 0.017045454545454544,
|
225 |
+
"num_full_absorption": 3,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.08505838115460661,
|
232 |
+
"full_absorption_rate": 0.00425531914893617,
|
233 |
+
"num_full_absorption": 1,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_0",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "standard_april_update",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "44014436-2277-4932-8035-15dfefe49292",
|
17 |
+
"datetime_epoch_millis": 1740104119403,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.347605051511975,
|
21 |
+
"mean_full_absorption_score": 0.03857317859306944,
|
22 |
+
"mean_num_split_features": 1.0384615384615385,
|
23 |
+
"std_dev_absorption_fraction_score": 0.17133887402196588,
|
24 |
+
"std_dev_full_absorption_score": 0.046258397055203106,
|
25 |
+
"std_dev_num_split_features": 0.19611613513818404
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.7399088796725409,
|
32 |
+
"full_absorption_rate": 0.12081339712918661,
|
33 |
+
"num_full_absorption": 303,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.41786808469850445,
|
40 |
+
"full_absorption_rate": 0.03631647211413749,
|
41 |
+
"num_full_absorption": 56,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.5593407106875781,
|
48 |
+
"full_absorption_rate": 0.037076648841354726,
|
49 |
+
"num_full_absorption": 104,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 1
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.45953918169479785,
|
56 |
+
"full_absorption_rate": 0.04216867469879518,
|
57 |
+
"num_full_absorption": 70,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 1
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.5119789949378576,
|
64 |
+
"full_absorption_rate": 0.0297029702970297,
|
65 |
+
"num_full_absorption": 48,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 1
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.37566264645277836,
|
72 |
+
"full_absorption_rate": 0.016962843295638127,
|
73 |
+
"num_full_absorption": 21,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.22625273738522275,
|
80 |
+
"full_absorption_rate": 0.023580786026200874,
|
81 |
+
"num_full_absorption": 27,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 1
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.37416087626156275,
|
88 |
+
"full_absorption_rate": 0.021256038647342997,
|
89 |
+
"num_full_absorption": 22,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.6517323393277149,
|
96 |
+
"full_absorption_rate": 0.22466422466422467,
|
97 |
+
"num_full_absorption": 368,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 1
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.15898688303273234,
|
104 |
+
"full_absorption_rate": 0.007281553398058253,
|
105 |
+
"num_full_absorption": 3,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.05484682779040442,
|
112 |
+
"full_absorption_rate": 0.0044444444444444444,
|
113 |
+
"num_full_absorption": 3,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.35469782028287083,
|
120 |
+
"full_absorption_rate": 0.027420736932305057,
|
121 |
+
"num_full_absorption": 32,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.28887345983707763,
|
128 |
+
"full_absorption_rate": 0.0060406370126304225,
|
129 |
+
"num_full_absorption": 11,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.30536576036448354,
|
136 |
+
"full_absorption_rate": 0.02518891687657431,
|
137 |
+
"num_full_absorption": 20,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.5108695858931614,
|
144 |
+
"full_absorption_rate": 0.08528584817244611,
|
145 |
+
"num_full_absorption": 91,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.41906548997444415,
|
152 |
+
"full_absorption_rate": 0.007887817703768623,
|
153 |
+
"num_full_absorption": 18,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.18782440951973378,
|
160 |
+
"full_absorption_rate": 0.02631578947368421,
|
161 |
+
"num_full_absorption": 5,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.4157013866326478,
|
168 |
+
"full_absorption_rate": 0.05584950029394474,
|
169 |
+
"num_full_absorption": 95,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 2
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.46749630339725584,
|
176 |
+
"full_absorption_rate": 0.0441752760954756,
|
177 |
+
"num_full_absorption": 124,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 1
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.39811444693801784,
|
184 |
+
"full_absorption_rate": 0.024778761061946902,
|
185 |
+
"num_full_absorption": 42,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.3173131816990261,
|
192 |
+
"full_absorption_rate": 0.05033112582781457,
|
193 |
+
"num_full_absorption": 38,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.1644279320997771,
|
200 |
+
"full_absorption_rate": 0.017743979721166033,
|
201 |
+
"num_full_absorption": 14,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.2168683380240976,
|
208 |
+
"full_absorption_rate": 0.01790633608815427,
|
209 |
+
"num_full_absorption": 13,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.1809079054261136,
|
216 |
+
"full_absorption_rate": 0.0,
|
217 |
+
"num_full_absorption": 0,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.21409290660015243,
|
224 |
+
"full_absorption_rate": 0.045454545454545456,
|
225 |
+
"num_full_absorption": 8,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.0658342506807965,
|
232 |
+
"full_absorption_rate": 0.00425531914893617,
|
233 |
+
"num_full_absorption": 1,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_1",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "standard_april_update",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "a25add70-d8fc-4e77-a5ac-f9b20ed47a90",
|
17 |
+
"datetime_epoch_millis": 1740107190845,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.35606277259778024,
|
21 |
+
"mean_full_absorption_score": 0.058416307732367695,
|
22 |
+
"mean_num_split_features": 1.1538461538461537,
|
23 |
+
"std_dev_absorption_fraction_score": 0.20313388269593746,
|
24 |
+
"std_dev_full_absorption_score": 0.06309863260264571,
|
25 |
+
"std_dev_num_split_features": 0.36794648440311994
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.6914944003282536,
|
32 |
+
"full_absorption_rate": 0.09928229665071771,
|
33 |
+
"num_full_absorption": 249,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.5346865856027213,
|
40 |
+
"full_absorption_rate": 0.12062256809338522,
|
41 |
+
"num_full_absorption": 186,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.6022704850600882,
|
48 |
+
"full_absorption_rate": 0.10338680926916222,
|
49 |
+
"num_full_absorption": 290,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 1
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.4691487366121355,
|
56 |
+
"full_absorption_rate": 0.05783132530120482,
|
57 |
+
"num_full_absorption": 96,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 1
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.5822165186131862,
|
64 |
+
"full_absorption_rate": 0.09777227722772278,
|
65 |
+
"num_full_absorption": 158,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 1
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.34109857549209605,
|
72 |
+
"full_absorption_rate": 0.027463651050080775,
|
73 |
+
"num_full_absorption": 34,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.2975657369760735,
|
80 |
+
"full_absorption_rate": 0.0462882096069869,
|
81 |
+
"num_full_absorption": 53,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 1
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.35129528088332335,
|
88 |
+
"full_absorption_rate": 0.035748792270531404,
|
89 |
+
"num_full_absorption": 37,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.6614939578585769,
|
96 |
+
"full_absorption_rate": 0.2869352869352869,
|
97 |
+
"num_full_absorption": 470,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 1
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.1501980866745374,
|
104 |
+
"full_absorption_rate": 0.007281553398058253,
|
105 |
+
"num_full_absorption": 3,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.04632368976363547,
|
112 |
+
"full_absorption_rate": 0.0044444444444444444,
|
113 |
+
"num_full_absorption": 3,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.3433001642791041,
|
120 |
+
"full_absorption_rate": 0.02570694087403599,
|
121 |
+
"num_full_absorption": 30,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 2
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.3545827375756091,
|
128 |
+
"full_absorption_rate": 0.022515101592531575,
|
129 |
+
"num_full_absorption": 41,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.28102067711277673,
|
136 |
+
"full_absorption_rate": 0.022670025188916875,
|
137 |
+
"num_full_absorption": 18,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.3869171083857235,
|
144 |
+
"full_absorption_rate": 0.05435801312089972,
|
145 |
+
"num_full_absorption": 58,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.6588443437832601,
|
152 |
+
"full_absorption_rate": 0.15030674846625766,
|
153 |
+
"num_full_absorption": 343,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.1851751902884397,
|
160 |
+
"full_absorption_rate": 0.02631578947368421,
|
161 |
+
"num_full_absorption": 5,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.5037574452444985,
|
168 |
+
"full_absorption_rate": 0.08465608465608465,
|
169 |
+
"num_full_absorption": 144,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 2
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.5589132911026052,
|
176 |
+
"full_absorption_rate": 0.11400071250445315,
|
177 |
+
"num_full_absorption": 320,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 1
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.4614050815777875,
|
184 |
+
"full_absorption_rate": 0.06371681415929203,
|
185 |
+
"num_full_absorption": 108,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.12407794887818784,
|
192 |
+
"full_absorption_rate": 0.006622516556291391,
|
193 |
+
"num_full_absorption": 5,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 2
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.16250522051692146,
|
200 |
+
"full_absorption_rate": 0.016476552598225603,
|
201 |
+
"num_full_absorption": 13,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.25975629765923597,
|
208 |
+
"full_absorption_rate": 0.03305785123966942,
|
209 |
+
"num_full_absorption": 24,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.10259413193221943,
|
216 |
+
"full_absorption_rate": 0.0,
|
217 |
+
"num_full_absorption": 0,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.09755416501625133,
|
224 |
+
"full_absorption_rate": 0.011363636363636364,
|
225 |
+
"num_full_absorption": 2,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 2
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.04943623032503822,
|
232 |
+
"full_absorption_rate": 0.0,
|
233 |
+
"num_full_absorption": 0,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_2",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "standard_april_update",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_3_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "67975d88-3e1e-45d1-9c8f-dfb138e9f448",
|
17 |
+
"datetime_epoch_millis": 1740107954355,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.44380362556359265,
|
21 |
+
"mean_full_absorption_score": 0.16020534428445912,
|
22 |
+
"mean_num_split_features": 1.0769230769230769,
|
23 |
+
"std_dev_absorption_fraction_score": 0.19712103165471032,
|
24 |
+
"std_dev_full_absorption_score": 0.0950978390878325,
|
25 |
+
"std_dev_num_split_features": 0.271746488194703
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.6839817030406431,
|
32 |
+
"full_absorption_rate": 0.19577352472089316,
|
33 |
+
"num_full_absorption": 491,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.6290547365203768,
|
40 |
+
"full_absorption_rate": 0.2821011673151751,
|
41 |
+
"num_full_absorption": 435,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.6674229089281046,
|
48 |
+
"full_absorption_rate": 0.2103386809269162,
|
49 |
+
"num_full_absorption": 590,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 1
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.5200649016554985,
|
56 |
+
"full_absorption_rate": 0.12048192771084337,
|
57 |
+
"num_full_absorption": 200,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 1
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.5727495906700607,
|
64 |
+
"full_absorption_rate": 0.20606435643564355,
|
65 |
+
"num_full_absorption": 333,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 1
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.5293144892122851,
|
72 |
+
"full_absorption_rate": 0.15751211631663975,
|
73 |
+
"num_full_absorption": 195,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.4212924218838792,
|
80 |
+
"full_absorption_rate": 0.15021834061135372,
|
81 |
+
"num_full_absorption": 172,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 1
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.5288618679423563,
|
88 |
+
"full_absorption_rate": 0.1468599033816425,
|
89 |
+
"num_full_absorption": 152,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.48231909319996086,
|
96 |
+
"full_absorption_rate": 0.26495726495726496,
|
97 |
+
"num_full_absorption": 434,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 2
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.31945510015807715,
|
104 |
+
"full_absorption_rate": 0.09951456310679611,
|
105 |
+
"num_full_absorption": 41,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.10917495607823761,
|
112 |
+
"full_absorption_rate": 0.014814814814814815,
|
113 |
+
"num_full_absorption": 10,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.5170836823568926,
|
120 |
+
"full_absorption_rate": 0.1868037703513282,
|
121 |
+
"num_full_absorption": 218,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 2
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.47433744153500806,
|
128 |
+
"full_absorption_rate": 0.10049423393739704,
|
129 |
+
"num_full_absorption": 183,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.456688962722974,
|
136 |
+
"full_absorption_rate": 0.13350125944584382,
|
137 |
+
"num_full_absorption": 106,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.429842671281088,
|
144 |
+
"full_absorption_rate": 0.14620431115276475,
|
145 |
+
"num_full_absorption": 156,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.7336453968814676,
|
152 |
+
"full_absorption_rate": 0.3347940403155127,
|
153 |
+
"num_full_absorption": 764,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.21841852715273,
|
160 |
+
"full_absorption_rate": 0.05789473684210526,
|
161 |
+
"num_full_absorption": 11,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.664077195798576,
|
168 |
+
"full_absorption_rate": 0.3550852439741329,
|
169 |
+
"num_full_absorption": 604,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 1
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.6076130947697026,
|
176 |
+
"full_absorption_rate": 0.23619522622016387,
|
177 |
+
"num_full_absorption": 663,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 1
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.6065091469749866,
|
184 |
+
"full_absorption_rate": 0.19469026548672566,
|
185 |
+
"num_full_absorption": 330,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.3811763776255798,
|
192 |
+
"full_absorption_rate": 0.25298013245033113,
|
193 |
+
"num_full_absorption": 191,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.2316869655239408,
|
200 |
+
"full_absorption_rate": 0.06337135614702155,
|
201 |
+
"num_full_absorption": 50,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.4702754754474782,
|
208 |
+
"full_absorption_rate": 0.1887052341597796,
|
209 |
+
"num_full_absorption": 137,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.13971775461100902,
|
216 |
+
"full_absorption_rate": 0.017699115044247787,
|
217 |
+
"num_full_absorption": 2,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.09684929978197893,
|
224 |
+
"full_absorption_rate": 0.03977272727272727,
|
225 |
+
"num_full_absorption": 7,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.04728050290051698,
|
232 |
+
"full_absorption_rate": 0.00851063829787234,
|
233 |
+
"num_full_absorption": 2,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_3",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "standard_april_update",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "c676f0a0-53b5-4849-9ef8-53b1afa1bb75",
|
17 |
+
"datetime_epoch_millis": 1740105664938,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.49673005917063284,
|
21 |
+
"mean_full_absorption_score": 0.26388583248007075,
|
22 |
+
"mean_num_split_features": 1.2692307692307692,
|
23 |
+
"std_dev_absorption_fraction_score": 0.2113161375367725,
|
24 |
+
"std_dev_full_absorption_score": 0.14871901551206065,
|
25 |
+
"std_dev_num_split_features": 0.7243033788512826
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.7138046192508011,
|
32 |
+
"full_absorption_rate": 0.310207336523126,
|
33 |
+
"num_full_absorption": 778,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.6674758017576328,
|
40 |
+
"full_absorption_rate": 0.4280155642023346,
|
41 |
+
"num_full_absorption": 660,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.7629196965454134,
|
48 |
+
"full_absorption_rate": 0.46096256684491976,
|
49 |
+
"num_full_absorption": 1293,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 1
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.5509837850383651,
|
56 |
+
"full_absorption_rate": 0.26385542168674697,
|
57 |
+
"num_full_absorption": 438,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 1
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.6106759626477273,
|
64 |
+
"full_absorption_rate": 0.3193069306930693,
|
65 |
+
"num_full_absorption": 516,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 1
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.6850841851784408,
|
72 |
+
"full_absorption_rate": 0.4079159935379645,
|
73 |
+
"num_full_absorption": 505,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.5521196093178781,
|
80 |
+
"full_absorption_rate": 0.2925764192139738,
|
81 |
+
"num_full_absorption": 335,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 1
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.5727047223894772,
|
88 |
+
"full_absorption_rate": 0.22995169082125605,
|
89 |
+
"num_full_absorption": 238,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.5935284275560726,
|
96 |
+
"full_absorption_rate": 0.37484737484737485,
|
97 |
+
"num_full_absorption": 614,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 2
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.36914701836970715,
|
104 |
+
"full_absorption_rate": 0.18446601941747573,
|
105 |
+
"num_full_absorption": 76,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.14630322313929153,
|
112 |
+
"full_absorption_rate": 0.028148148148148148,
|
113 |
+
"num_full_absorption": 19,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.5786165566811846,
|
120 |
+
"full_absorption_rate": 0.31362467866323906,
|
121 |
+
"num_full_absorption": 366,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.6447573755425703,
|
128 |
+
"full_absorption_rate": 0.28610653487095,
|
129 |
+
"num_full_absorption": 521,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.4502983371728396,
|
136 |
+
"full_absorption_rate": 0.16624685138539042,
|
137 |
+
"num_full_absorption": 132,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.4333315336903838,
|
144 |
+
"full_absorption_rate": 0.1583880037488285,
|
145 |
+
"num_full_absorption": 169,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.76016403003186,
|
152 |
+
"full_absorption_rate": 0.45267309377738824,
|
153 |
+
"num_full_absorption": 1033,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 2
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.4128471718435671,
|
160 |
+
"full_absorption_rate": 0.19473684210526315,
|
161 |
+
"num_full_absorption": 37,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.6314906541291778,
|
168 |
+
"full_absorption_rate": 0.3985890652557319,
|
169 |
+
"num_full_absorption": 678,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 3
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.7506050472722071,
|
176 |
+
"full_absorption_rate": 0.6034912718204489,
|
177 |
+
"num_full_absorption": 1694,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 4
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.6150172352711047,
|
184 |
+
"full_absorption_rate": 0.2902654867256637,
|
185 |
+
"num_full_absorption": 492,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.36322090311483735,
|
192 |
+
"full_absorption_rate": 0.24370860927152319,
|
193 |
+
"num_full_absorption": 184,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.40699263233588356,
|
200 |
+
"full_absorption_rate": 0.1761723700887199,
|
201 |
+
"num_full_absorption": 139,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.39281073164811625,
|
208 |
+
"full_absorption_rate": 0.209366391184573,
|
209 |
+
"num_full_absorption": 152,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.1217508800357222,
|
216 |
+
"full_absorption_rate": 0.017699115044247787,
|
217 |
+
"num_full_absorption": 2,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.0886418130654087,
|
224 |
+
"full_absorption_rate": 0.045454545454545456,
|
225 |
+
"num_full_absorption": 8,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.03968958541078437,
|
232 |
+
"full_absorption_rate": 0.00425531914893617,
|
233 |
+
"num_full_absorption": 1,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_4",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "standard_april_update",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_5_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "7a23eda6-43f2-42d3-b41f-82aa14ab0bcd",
|
17 |
+
"datetime_epoch_millis": 1740104875458,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.5282848269512149,
|
21 |
+
"mean_full_absorption_score": 0.35722963768366117,
|
22 |
+
"mean_num_split_features": 1.9230769230769231,
|
23 |
+
"std_dev_absorption_fraction_score": 0.20756776737189886,
|
24 |
+
"std_dev_full_absorption_score": 0.17409113695625567,
|
25 |
+
"std_dev_num_split_features": 1.3834182859302366
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.7070839083555905,
|
32 |
+
"full_absorption_rate": 0.36762360446570974,
|
33 |
+
"num_full_absorption": 922,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.6648609736653764,
|
40 |
+
"full_absorption_rate": 0.4961089494163424,
|
41 |
+
"num_full_absorption": 765,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 4
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.8005478702573291,
|
48 |
+
"full_absorption_rate": 0.6081996434937611,
|
49 |
+
"num_full_absorption": 1706,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 1
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.6749493296742596,
|
56 |
+
"full_absorption_rate": 0.45542168674698796,
|
57 |
+
"num_full_absorption": 756,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 3
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.6332021290003637,
|
64 |
+
"full_absorption_rate": 0.4350247524752475,
|
65 |
+
"num_full_absorption": 703,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 2
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.6800208255796995,
|
72 |
+
"full_absorption_rate": 0.4806138933764136,
|
73 |
+
"num_full_absorption": 595,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.6579759079289633,
|
80 |
+
"full_absorption_rate": 0.5414847161572053,
|
81 |
+
"num_full_absorption": 620,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 2
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.6481991163632241,
|
88 |
+
"full_absorption_rate": 0.41642512077294686,
|
89 |
+
"num_full_absorption": 431,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.5566601715573842,
|
96 |
+
"full_absorption_rate": 0.3882783882783883,
|
97 |
+
"num_full_absorption": 636,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 1
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.41230771985239695,
|
104 |
+
"full_absorption_rate": 0.27184466019417475,
|
105 |
+
"num_full_absorption": 112,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.22952222086566945,
|
112 |
+
"full_absorption_rate": 0.09481481481481481,
|
113 |
+
"num_full_absorption": 64,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.5737482843930954,
|
120 |
+
"full_absorption_rate": 0.4087403598971722,
|
121 |
+
"num_full_absorption": 477,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 2
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.6814245446786736,
|
128 |
+
"full_absorption_rate": 0.500274574409665,
|
129 |
+
"num_full_absorption": 911,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 3
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.5440130265045751,
|
136 |
+
"full_absorption_rate": 0.3136020151133501,
|
137 |
+
"num_full_absorption": 249,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.4040905603924752,
|
144 |
+
"full_absorption_rate": 0.19400187441424555,
|
145 |
+
"num_full_absorption": 207,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.7797710366668161,
|
152 |
+
"full_absorption_rate": 0.5473269062226117,
|
153 |
+
"num_full_absorption": 1249,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 3
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.3222575918031264,
|
160 |
+
"full_absorption_rate": 0.16842105263157894,
|
161 |
+
"num_full_absorption": 32,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 2
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.6852024934359321,
|
168 |
+
"full_absorption_rate": 0.5038212815990594,
|
169 |
+
"num_full_absorption": 857,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 3
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.6981286454286791,
|
176 |
+
"full_absorption_rate": 0.5575347345920912,
|
177 |
+
"num_full_absorption": 1565,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 7
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.7110423532653868,
|
184 |
+
"full_absorption_rate": 0.5079646017699115,
|
185 |
+
"num_full_absorption": 861,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 3
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.3995663241148787,
|
192 |
+
"full_absorption_rate": 0.31920529801324504,
|
193 |
+
"num_full_absorption": 241,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 2
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.3717835736899836,
|
200 |
+
"full_absorption_rate": 0.20785804816223066,
|
201 |
+
"num_full_absorption": 164,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.5231441138039113,
|
208 |
+
"full_absorption_rate": 0.3760330578512397,
|
209 |
+
"num_full_absorption": 273,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.11941588125068914,
|
216 |
+
"full_absorption_rate": 0.02654867256637168,
|
217 |
+
"num_full_absorption": 3,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.13165534351732078,
|
224 |
+
"full_absorption_rate": 0.0625,
|
225 |
+
"num_full_absorption": 11,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.12483155468578631,
|
232 |
+
"full_absorption_rate": 0.03829787234042553,
|
233 |
+
"num_full_absorption": 9,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_5",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "standard_april_update",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "f9074aa1-0b4f-4b68-9e3c-3bfabf71bc13",
|
17 |
+
"datetime_epoch_millis": 1740076749888,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.37747633872113084,
|
21 |
+
"mean_full_absorption_score": 0.4139575465645789,
|
22 |
+
"mean_num_split_features": 3.269230769230769,
|
23 |
+
"std_dev_absorption_fraction_score": 0.19789018654007454,
|
24 |
+
"std_dev_full_absorption_score": 0.18954624360986377,
|
25 |
+
"std_dev_num_split_features": 2.42582262018792
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.5971456051033638,
|
32 |
+
"full_absorption_rate": 0.4800637958532695,
|
33 |
+
"num_full_absorption": 1204,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 8
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.3813263071869703,
|
40 |
+
"full_absorption_rate": 0.4701686121919585,
|
41 |
+
"num_full_absorption": 725,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 6
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.7258237506928806,
|
48 |
+
"full_absorption_rate": 0.718716577540107,
|
49 |
+
"num_full_absorption": 2016,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 5
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.48482520471847396,
|
56 |
+
"full_absorption_rate": 0.5078313253012048,
|
57 |
+
"num_full_absorption": 843,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 4
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.5224461002137466,
|
64 |
+
"full_absorption_rate": 0.47339108910891087,
|
65 |
+
"num_full_absorption": 765,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 3
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.5491052342123783,
|
72 |
+
"full_absorption_rate": 0.5823909531502424,
|
73 |
+
"num_full_absorption": 721,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 3
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.27565515017431547,
|
80 |
+
"full_absorption_rate": 0.3467248908296943,
|
81 |
+
"num_full_absorption": 397,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 3
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.3776238034975393,
|
88 |
+
"full_absorption_rate": 0.4,
|
89 |
+
"num_full_absorption": 414,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 2
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.4793555693867835,
|
96 |
+
"full_absorption_rate": 0.49633699633699635,
|
97 |
+
"num_full_absorption": 813,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 6
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.12459906400823838,
|
104 |
+
"full_absorption_rate": 0.1553398058252427,
|
105 |
+
"num_full_absorption": 64,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.07497296105837896,
|
112 |
+
"full_absorption_rate": 0.1037037037037037,
|
113 |
+
"num_full_absorption": 70,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.3189773190709259,
|
120 |
+
"full_absorption_rate": 0.38046272493573263,
|
121 |
+
"num_full_absorption": 444,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 4
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.42581561984285576,
|
128 |
+
"full_absorption_rate": 0.4728171334431631,
|
129 |
+
"num_full_absorption": 861,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 8
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.45881668609062815,
|
136 |
+
"full_absorption_rate": 0.45591939546599497,
|
137 |
+
"num_full_absorption": 362,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 3
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.3059020128826346,
|
144 |
+
"full_absorption_rate": 0.45454545454545453,
|
145 |
+
"num_full_absorption": 485,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.6640428801113221,
|
152 |
+
"full_absorption_rate": 0.6305872042068361,
|
153 |
+
"num_full_absorption": 1439,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 7
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.23857383539688107,
|
160 |
+
"full_absorption_rate": 0.3105263157894737,
|
161 |
+
"num_full_absorption": 59,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.4896607321852771,
|
168 |
+
"full_absorption_rate": 0.5537918871252204,
|
169 |
+
"num_full_absorption": 942,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 4
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.47620660054740455,
|
176 |
+
"full_absorption_rate": 0.6294976843605272,
|
177 |
+
"num_full_absorption": 1767,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 7
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.629838138645014,
|
184 |
+
"full_absorption_rate": 0.6005899705014749,
|
185 |
+
"num_full_absorption": 1018,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.5066990836428441,
|
192 |
+
"full_absorption_rate": 0.6410596026490066,
|
193 |
+
"num_full_absorption": 484,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 2
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.12582986991851342,
|
200 |
+
"full_absorption_rate": 0.21166032953105196,
|
201 |
+
"num_full_absorption": 167,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.3654424833313806,
|
208 |
+
"full_absorption_rate": 0.4228650137741047,
|
209 |
+
"num_full_absorption": 307,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.051502797875670484,
|
216 |
+
"full_absorption_rate": 0.07079646017699115,
|
217 |
+
"num_full_absorption": 8,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.1337897577016078,
|
224 |
+
"full_absorption_rate": 0.14204545454545456,
|
225 |
+
"num_full_absorption": 25,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.030408239253373628,
|
232 |
+
"full_absorption_rate": 0.05106382978723404,
|
233 |
+
"num_full_absorption": 12,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_0",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 16384,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "8afc9485-7c23-46a6-a571-8dfcd88b74a0",
|
17 |
+
"datetime_epoch_millis": 1740074545743,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.31839490776468177,
|
21 |
+
"mean_full_absorption_score": 0.3314631805182705,
|
22 |
+
"mean_num_split_features": 2.730769230769231,
|
23 |
+
"std_dev_absorption_fraction_score": 0.18921047426421872,
|
24 |
+
"std_dev_full_absorption_score": 0.18421106793113218,
|
25 |
+
"std_dev_num_split_features": 2.0505158825562373
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.6049809639558894,
|
32 |
+
"full_absorption_rate": 0.49800637958532695,
|
33 |
+
"num_full_absorption": 1249,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 5
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.26007062348535115,
|
40 |
+
"full_absorption_rate": 0.3125810635538262,
|
41 |
+
"num_full_absorption": 482,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 5
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.5776837402582738,
|
48 |
+
"full_absorption_rate": 0.5440285204991088,
|
49 |
+
"num_full_absorption": 1526,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 7
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.3839260656787951,
|
56 |
+
"full_absorption_rate": 0.3493975903614458,
|
57 |
+
"num_full_absorption": 580,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 4
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.5491400236273614,
|
64 |
+
"full_absorption_rate": 0.4975247524752475,
|
65 |
+
"num_full_absorption": 804,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 3
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.41603058308666035,
|
72 |
+
"full_absorption_rate": 0.4087237479806139,
|
73 |
+
"num_full_absorption": 506,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 2
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.3562791807345914,
|
80 |
+
"full_absorption_rate": 0.3868995633187773,
|
81 |
+
"num_full_absorption": 443,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 1
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.22981263470261395,
|
88 |
+
"full_absorption_rate": 0.2289855072463768,
|
89 |
+
"num_full_absorption": 237,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.3856963045568308,
|
96 |
+
"full_absorption_rate": 0.42918192918192916,
|
97 |
+
"num_full_absorption": 703,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 5
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.01846723298578015,
|
104 |
+
"full_absorption_rate": 0.038834951456310676,
|
105 |
+
"num_full_absorption": 16,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.04136762125390637,
|
112 |
+
"full_absorption_rate": 0.05037037037037037,
|
113 |
+
"num_full_absorption": 34,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.30929793561492136,
|
120 |
+
"full_absorption_rate": 0.3401885175664096,
|
121 |
+
"num_full_absorption": 397,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 3
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.4736418912102383,
|
128 |
+
"full_absorption_rate": 0.5172981878088962,
|
129 |
+
"num_full_absorption": 942,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 6
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.28225493966228277,
|
136 |
+
"full_absorption_rate": 0.26448362720403024,
|
137 |
+
"num_full_absorption": 210,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 2
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.37481096091469307,
|
144 |
+
"full_absorption_rate": 0.46485473289597,
|
145 |
+
"num_full_absorption": 496,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.6552642696406819,
|
152 |
+
"full_absorption_rate": 0.6156879929886064,
|
153 |
+
"num_full_absorption": 1405,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 5
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.14073819277923014,
|
160 |
+
"full_absorption_rate": 0.17894736842105263,
|
161 |
+
"num_full_absorption": 34,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.45016305961443226,
|
168 |
+
"full_absorption_rate": 0.47677836566725457,
|
169 |
+
"num_full_absorption": 811,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 2
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.45928419865327924,
|
176 |
+
"full_absorption_rate": 0.5529034556465978,
|
177 |
+
"num_full_absorption": 1552,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 7
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.4171686934889313,
|
184 |
+
"full_absorption_rate": 0.34926253687315634,
|
185 |
+
"num_full_absorption": 592,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 2
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.3753684140171086,
|
192 |
+
"full_absorption_rate": 0.5311258278145695,
|
193 |
+
"num_full_absorption": 401,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 2
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.14320502373748673,
|
200 |
+
"full_absorption_rate": 0.2002534854245881,
|
201 |
+
"num_full_absorption": 158,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.25757599356476546,
|
208 |
+
"full_absorption_rate": 0.2699724517906336,
|
209 |
+
"num_full_absorption": 196,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.03557787974406646,
|
216 |
+
"full_absorption_rate": 0.02654867256637168,
|
217 |
+
"num_full_absorption": 3,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.06861018177768921,
|
224 |
+
"full_absorption_rate": 0.06818181818181818,
|
225 |
+
"num_full_absorption": 12,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.011850993135864673,
|
232 |
+
"full_absorption_rate": 0.01702127659574468,
|
233 |
+
"num_full_absorption": 4,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_1",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 16384,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "7192a0e1-a7da-41bd-a84d-9026148fcf82",
|
17 |
+
"datetime_epoch_millis": 1740077437804,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.17942797907165925,
|
21 |
+
"mean_full_absorption_score": 0.15978218783987327,
|
22 |
+
"mean_num_split_features": 1.5,
|
23 |
+
"std_dev_absorption_fraction_score": 0.15226072446759698,
|
24 |
+
"std_dev_full_absorption_score": 0.13533222488741237,
|
25 |
+
"std_dev_num_split_features": 0.8602325267042626
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.47564895197099144,
|
32 |
+
"full_absorption_rate": 0.31259968102073366,
|
33 |
+
"num_full_absorption": 784,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.02365185253295002,
|
40 |
+
"full_absorption_rate": 0.02594033722438392,
|
41 |
+
"num_full_absorption": 40,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.4598973519414802,
|
48 |
+
"full_absorption_rate": 0.37219251336898396,
|
49 |
+
"num_full_absorption": 1044,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 3
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.2749116720207556,
|
56 |
+
"full_absorption_rate": 0.20662650602409638,
|
57 |
+
"num_full_absorption": 343,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 3
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.2707575688664681,
|
64 |
+
"full_absorption_rate": 0.2004950495049505,
|
65 |
+
"num_full_absorption": 324,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 3
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.1519379314336452,
|
72 |
+
"full_absorption_rate": 0.12762520193861066,
|
73 |
+
"num_full_absorption": 158,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.08931513277897936,
|
80 |
+
"full_absorption_rate": 0.09344978165938865,
|
81 |
+
"num_full_absorption": 107,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 1
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.04798358268604414,
|
88 |
+
"full_absorption_rate": 0.033816425120772944,
|
89 |
+
"num_full_absorption": 35,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.41216370077647757,
|
96 |
+
"full_absorption_rate": 0.4297924297924298,
|
97 |
+
"num_full_absorption": 704,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 2
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.0,
|
104 |
+
"full_absorption_rate": 0.009708737864077669,
|
105 |
+
"num_full_absorption": 4,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.010591802895094848,
|
112 |
+
"full_absorption_rate": 0.013333333333333334,
|
113 |
+
"num_full_absorption": 9,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.18751227466897116,
|
120 |
+
"full_absorption_rate": 0.20051413881748073,
|
121 |
+
"num_full_absorption": 234,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.10548850032097844,
|
128 |
+
"full_absorption_rate": 0.10049423393739704,
|
129 |
+
"num_full_absorption": 183,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.11818217053319924,
|
136 |
+
"full_absorption_rate": 0.08564231738035265,
|
137 |
+
"num_full_absorption": 68,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.27228559065459534,
|
144 |
+
"full_absorption_rate": 0.3064667291471415,
|
145 |
+
"num_full_absorption": 327,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.42231676025499576,
|
152 |
+
"full_absorption_rate": 0.3299737072743208,
|
153 |
+
"num_full_absorption": 753,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 2
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.08127272823941951,
|
160 |
+
"full_absorption_rate": 0.06842105263157895,
|
161 |
+
"num_full_absorption": 13,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.2961293111494162,
|
168 |
+
"full_absorption_rate": 0.2569077013521458,
|
169 |
+
"num_full_absorption": 437,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 2
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.3695603008892001,
|
176 |
+
"full_absorption_rate": 0.4324902030637692,
|
177 |
+
"num_full_absorption": 1214,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 4
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.18786499442114749,
|
184 |
+
"full_absorption_rate": 0.11209439528023599,
|
185 |
+
"num_full_absorption": 190,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.1637863224612186,
|
192 |
+
"full_absorption_rate": 0.21589403973509932,
|
193 |
+
"num_full_absorption": 163,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 2
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.02925016974477176,
|
200 |
+
"full_absorption_rate": 0.043092522179974654,
|
201 |
+
"num_full_absorption": 34,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.1275726157048305,
|
208 |
+
"full_absorption_rate": 0.09090909090909091,
|
209 |
+
"num_full_absorption": 66,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.028161295653006697,
|
216 |
+
"full_absorption_rate": 0.017699115044247787,
|
217 |
+
"num_full_absorption": 2,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.048059342038807434,
|
224 |
+
"full_absorption_rate": 0.05113636363636364,
|
225 |
+
"num_full_absorption": 9,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.010825531225695472,
|
232 |
+
"full_absorption_rate": 0.01702127659574468,
|
233 |
+
"num_full_absorption": 4,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_2",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 16384,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_3_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "2ede38b6-345d-47f1-b639-8da6aa368fbe",
|
17 |
+
"datetime_epoch_millis": 1740078140625,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.04801302443670268,
|
21 |
+
"mean_full_absorption_score": 0.03591255684680933,
|
22 |
+
"mean_num_split_features": 1.0384615384615385,
|
23 |
+
"std_dev_absorption_fraction_score": 0.07947032446059547,
|
24 |
+
"std_dev_full_absorption_score": 0.08062070797631418,
|
25 |
+
"std_dev_num_split_features": 0.19611613513818404
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.02934165591566395,
|
32 |
+
"full_absorption_rate": 0.009170653907496013,
|
33 |
+
"num_full_absorption": 23,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.0029935259579446072,
|
40 |
+
"full_absorption_rate": 0.004539559014267186,
|
41 |
+
"num_full_absorption": 7,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.005569506519538388,
|
48 |
+
"full_absorption_rate": 0.0017825311942959,
|
49 |
+
"num_full_absorption": 5,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 1
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.00922397878267749,
|
56 |
+
"full_absorption_rate": 0.005421686746987952,
|
57 |
+
"num_full_absorption": 9,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 1
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.10168121881529071,
|
64 |
+
"full_absorption_rate": 0.03589108910891089,
|
65 |
+
"num_full_absorption": 58,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 1
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.008827253461870332,
|
72 |
+
"full_absorption_rate": 0.0032310177705977385,
|
73 |
+
"num_full_absorption": 4,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.012313091462540756,
|
80 |
+
"full_absorption_rate": 0.00611353711790393,
|
81 |
+
"num_full_absorption": 7,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 2
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.012699860300682255,
|
88 |
+
"full_absorption_rate": 0.00676328502415459,
|
89 |
+
"num_full_absorption": 7,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.09975263121604687,
|
96 |
+
"full_absorption_rate": 0.06898656898656899,
|
97 |
+
"num_full_absorption": 113,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 1
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.0037571585606403575,
|
104 |
+
"full_absorption_rate": 0.009708737864077669,
|
105 |
+
"num_full_absorption": 4,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.0051757245171220635,
|
112 |
+
"full_absorption_rate": 0.0044444444444444444,
|
113 |
+
"num_full_absorption": 3,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.058294956943712124,
|
120 |
+
"full_absorption_rate": 0.023993144815766924,
|
121 |
+
"num_full_absorption": 28,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.001762598898069777,
|
128 |
+
"full_absorption_rate": 0.004942339373970346,
|
129 |
+
"num_full_absorption": 9,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.0417621897858296,
|
136 |
+
"full_absorption_rate": 0.006297229219143577,
|
137 |
+
"num_full_absorption": 5,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.1499792308079282,
|
144 |
+
"full_absorption_rate": 0.16119962511715089,
|
145 |
+
"num_full_absorption": 172,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.07528312050104714,
|
152 |
+
"full_absorption_rate": 0.01621384750219106,
|
153 |
+
"num_full_absorption": 37,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.011651399259752978,
|
160 |
+
"full_absorption_rate": 0.021052631578947368,
|
161 |
+
"num_full_absorption": 4,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.05169549625768419,
|
168 |
+
"full_absorption_rate": 0.026455026455026454,
|
169 |
+
"num_full_absorption": 45,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 1
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.018073780196159923,
|
176 |
+
"full_absorption_rate": 0.0110438190238689,
|
177 |
+
"num_full_absorption": 31,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 1
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.015837549421101646,
|
184 |
+
"full_absorption_rate": 0.0029498525073746312,
|
185 |
+
"num_full_absorption": 5,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.39172802729272294,
|
192 |
+
"full_absorption_rate": 0.3973509933774834,
|
193 |
+
"num_full_absorption": 300,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.0007882202694839606,
|
200 |
+
"full_absorption_rate": 0.0025348542458808617,
|
201 |
+
"num_full_absorption": 2,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.03700330112932826,
|
208 |
+
"full_absorption_rate": 0.02203856749311295,
|
209 |
+
"num_full_absorption": 16,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.05503942363659827,
|
216 |
+
"full_absorption_rate": 0.017699115044247787,
|
217 |
+
"num_full_absorption": 2,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.03850760997536622,
|
224 |
+
"full_absorption_rate": 0.05113636363636364,
|
225 |
+
"num_full_absorption": 9,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.009596125469466737,
|
232 |
+
"full_absorption_rate": 0.01276595744680851,
|
233 |
+
"num_full_absorption": 3,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_3",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 16384,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "fca0882d-facd-4d4a-829b-88d40a83269a",
|
17 |
+
"datetime_epoch_millis": 1740076039093,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.04725406186343194,
|
21 |
+
"mean_full_absorption_score": 0.035376663712700394,
|
22 |
+
"mean_num_split_features": 1.1538461538461537,
|
23 |
+
"std_dev_absorption_fraction_score": 0.12702547722858154,
|
24 |
+
"std_dev_full_absorption_score": 0.1273809249096068,
|
25 |
+
"std_dev_num_split_features": 0.36794648440311994
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.009415356574908502,
|
32 |
+
"full_absorption_rate": 0.001993620414673046,
|
33 |
+
"num_full_absorption": 5,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.0010598118823535877,
|
40 |
+
"full_absorption_rate": 0.0006485084306095979,
|
41 |
+
"num_full_absorption": 1,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 2.681482172260622e-05,
|
48 |
+
"full_absorption_rate": 0.0,
|
49 |
+
"num_full_absorption": 0,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 1
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.0,
|
56 |
+
"full_absorption_rate": 0.0030120481927710845,
|
57 |
+
"num_full_absorption": 5,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 1
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.005805138800008344,
|
64 |
+
"full_absorption_rate": 0.0,
|
65 |
+
"num_full_absorption": 0,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 1
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.0002805702876438644,
|
72 |
+
"full_absorption_rate": 0.0,
|
73 |
+
"num_full_absorption": 0,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.006430103531010465,
|
80 |
+
"full_absorption_rate": 0.0017467248908296944,
|
81 |
+
"num_full_absorption": 2,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 2
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.005265016174444728,
|
88 |
+
"full_absorption_rate": 0.001932367149758454,
|
89 |
+
"num_full_absorption": 2,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.07734533383832189,
|
96 |
+
"full_absorption_rate": 0.014652014652014652,
|
97 |
+
"num_full_absorption": 24,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 2
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.00013682976302465064,
|
104 |
+
"full_absorption_rate": 0.0024271844660194173,
|
105 |
+
"num_full_absorption": 1,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.09316016842349063,
|
112 |
+
"full_absorption_rate": 0.013333333333333334,
|
113 |
+
"num_full_absorption": 9,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.0027611929011460462,
|
120 |
+
"full_absorption_rate": 0.0,
|
121 |
+
"num_full_absorption": 0,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.0,
|
128 |
+
"full_absorption_rate": 0.007138934651290499,
|
129 |
+
"num_full_absorption": 13,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.0018469457675600843,
|
136 |
+
"full_absorption_rate": 0.0,
|
137 |
+
"num_full_absorption": 0,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.018485424239876518,
|
144 |
+
"full_absorption_rate": 0.0018744142455482662,
|
145 |
+
"num_full_absorption": 2,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 2
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.00015561067287864704,
|
152 |
+
"full_absorption_rate": 0.0,
|
153 |
+
"num_full_absorption": 0,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.014953244171397297,
|
160 |
+
"full_absorption_rate": 0.010526315789473684,
|
161 |
+
"num_full_absorption": 2,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.0004312995824857611,
|
168 |
+
"full_absorption_rate": 0.0,
|
169 |
+
"num_full_absorption": 0,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 1
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.00032280375537369337,
|
176 |
+
"full_absorption_rate": 0.0003562522265764161,
|
177 |
+
"num_full_absorption": 1,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 1
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.008179936000677979,
|
184 |
+
"full_absorption_rate": 0.0,
|
185 |
+
"num_full_absorption": 0,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.5644845103885536,
|
192 |
+
"full_absorption_rate": 0.6251655629139072,
|
193 |
+
"num_full_absorption": 472,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.001315398452670934,
|
200 |
+
"full_absorption_rate": 0.0038022813688212928,
|
201 |
+
"num_full_absorption": 3,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.0167619192257724,
|
208 |
+
"full_absorption_rate": 0.011019283746556474,
|
209 |
+
"num_full_absorption": 8,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.01788328707276735,
|
216 |
+
"full_absorption_rate": 0.0,
|
217 |
+
"num_full_absorption": 0,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 2
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.3549107104349042,
|
224 |
+
"full_absorption_rate": 0.2159090909090909,
|
225 |
+
"num_full_absorption": 38,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.0271881816862366,
|
232 |
+
"full_absorption_rate": 0.00425531914893617,
|
233 |
+
"num_full_absorption": 1,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_4",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 16384,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_5_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "4b1b8fa0-4e18-48e5-8a0b-ce7bb5b4208b",
|
17 |
+
"datetime_epoch_millis": 1740075302374,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.10691279716460964,
|
21 |
+
"mean_full_absorption_score": 0.04496650031498632,
|
22 |
+
"mean_num_split_features": 1.3076923076923077,
|
23 |
+
"std_dev_absorption_fraction_score": 0.2646622409455047,
|
24 |
+
"std_dev_full_absorption_score": 0.15377861098967333,
|
25 |
+
"std_dev_num_split_features": 0.5491251783869153
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.0019515094681566653,
|
32 |
+
"full_absorption_rate": 0.0,
|
33 |
+
"num_full_absorption": 0,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 2
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.0015650618455625462,
|
40 |
+
"full_absorption_rate": 0.0,
|
41 |
+
"num_full_absorption": 0,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.0,
|
48 |
+
"full_absorption_rate": 0.0,
|
49 |
+
"num_full_absorption": 0,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 1
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.0002489021118178152,
|
56 |
+
"full_absorption_rate": 0.0,
|
57 |
+
"num_full_absorption": 0,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 1
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.0017399766884406232,
|
64 |
+
"full_absorption_rate": 0.0,
|
65 |
+
"num_full_absorption": 0,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 2
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.002086390044012097,
|
72 |
+
"full_absorption_rate": 0.0,
|
73 |
+
"num_full_absorption": 0,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.019592569798226284,
|
80 |
+
"full_absorption_rate": 0.0,
|
81 |
+
"num_full_absorption": 0,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 1
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.010286585437156805,
|
88 |
+
"full_absorption_rate": 0.0,
|
89 |
+
"num_full_absorption": 0,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.025307686740506496,
|
96 |
+
"full_absorption_rate": 0.0,
|
97 |
+
"num_full_absorption": 0,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 2
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.001778383083826356,
|
104 |
+
"full_absorption_rate": 0.0,
|
105 |
+
"num_full_absorption": 0,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 2
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.02774139524886622,
|
112 |
+
"full_absorption_rate": 0.0,
|
113 |
+
"num_full_absorption": 0,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.02970714069454916,
|
120 |
+
"full_absorption_rate": 0.0,
|
121 |
+
"num_full_absorption": 0,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.005959470315798746,
|
128 |
+
"full_absorption_rate": 0.004393190554640308,
|
129 |
+
"num_full_absorption": 8,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.0036818374231465958,
|
136 |
+
"full_absorption_rate": 0.0,
|
137 |
+
"num_full_absorption": 0,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.0025381797908818992,
|
144 |
+
"full_absorption_rate": 0.0,
|
145 |
+
"num_full_absorption": 0,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 2
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.002444093867357725,
|
152 |
+
"full_absorption_rate": 0.0,
|
153 |
+
"num_full_absorption": 0,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.8720913195010103,
|
160 |
+
"full_absorption_rate": 0.5157894736842106,
|
161 |
+
"num_full_absorption": 98,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 3
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.0,
|
168 |
+
"full_absorption_rate": 0.0,
|
169 |
+
"num_full_absorption": 0,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 1
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.0001310025196193003,
|
176 |
+
"full_absorption_rate": 0.0,
|
177 |
+
"num_full_absorption": 0,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 1
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.004794285383742862,
|
184 |
+
"full_absorption_rate": 0.0,
|
185 |
+
"num_full_absorption": 0,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.0082685023894774,
|
192 |
+
"full_absorption_rate": 0.0,
|
193 |
+
"num_full_absorption": 0,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 2
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.008778392470257317,
|
200 |
+
"full_absorption_rate": 0.0012674271229404308,
|
201 |
+
"num_full_absorption": 1,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.004039336538578031,
|
208 |
+
"full_absorption_rate": 0.0,
|
209 |
+
"num_full_absorption": 0,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.16255359383165666,
|
216 |
+
"full_absorption_rate": 0.0,
|
217 |
+
"num_full_absorption": 0,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.8471691439845103,
|
224 |
+
"full_absorption_rate": 0.6136363636363636,
|
225 |
+
"num_full_absorption": 108,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.7352779671026924,
|
232 |
+
"full_absorption_rate": 0.03404255319148936,
|
233 |
+
"num_full_absorption": 8,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_5",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 16384,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "9c26caea-7ab6-40f7-b9ff-92a9e95bc77b",
|
17 |
+
"datetime_epoch_millis": 1740111330754,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.5571745900551895,
|
21 |
+
"mean_full_absorption_score": 0.6205963032975313,
|
22 |
+
"mean_num_split_features": 3.3461538461538463,
|
23 |
+
"std_dev_absorption_fraction_score": 0.21989389200707546,
|
24 |
+
"std_dev_full_absorption_score": 0.22468883182683483,
|
25 |
+
"std_dev_num_split_features": 2.115510485765697
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.8328389947558087,
|
32 |
+
"full_absorption_rate": 0.7372408293460925,
|
33 |
+
"num_full_absorption": 1849,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 5
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.5479417963209664,
|
40 |
+
"full_absorption_rate": 0.6582360570687419,
|
41 |
+
"num_full_absorption": 1015,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 7
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.86249459792848,
|
48 |
+
"full_absorption_rate": 0.9094474153297682,
|
49 |
+
"num_full_absorption": 2551,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 2
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.7076732984318926,
|
56 |
+
"full_absorption_rate": 0.7518072289156627,
|
57 |
+
"num_full_absorption": 1248,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 4
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.6826856855395601,
|
64 |
+
"full_absorption_rate": 0.7048267326732673,
|
65 |
+
"num_full_absorption": 1139,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 4
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.6202403786327095,
|
72 |
+
"full_absorption_rate": 0.6672051696284329,
|
73 |
+
"num_full_absorption": 826,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 9
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.6125840391486669,
|
80 |
+
"full_absorption_rate": 0.7406113537117904,
|
81 |
+
"num_full_absorption": 848,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 4
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.5458358390619358,
|
88 |
+
"full_absorption_rate": 0.561352657004831,
|
89 |
+
"num_full_absorption": 581,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 6
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.6988443248524734,
|
96 |
+
"full_absorption_rate": 0.7576312576312576,
|
97 |
+
"num_full_absorption": 1241,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 2
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.3962094458509169,
|
104 |
+
"full_absorption_rate": 0.4441747572815534,
|
105 |
+
"num_full_absorption": 183,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 4
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.24225838471380645,
|
112 |
+
"full_absorption_rate": 0.2548148148148148,
|
113 |
+
"num_full_absorption": 172,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.4866409619756762,
|
120 |
+
"full_absorption_rate": 0.6101113967437874,
|
121 |
+
"num_full_absorption": 712,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 4
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.6178430577948258,
|
128 |
+
"full_absorption_rate": 0.7040087863811093,
|
129 |
+
"num_full_absorption": 1282,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 6
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.6032541710977196,
|
136 |
+
"full_absorption_rate": 0.6259445843828715,
|
137 |
+
"num_full_absorption": 497,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 3
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.6203477114750064,
|
144 |
+
"full_absorption_rate": 0.7806935332708529,
|
145 |
+
"num_full_absorption": 833,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 2
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.9127210191414626,
|
152 |
+
"full_absorption_rate": 0.9189307624890447,
|
153 |
+
"num_full_absorption": 2097,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.4269980273815769,
|
160 |
+
"full_absorption_rate": 0.45263157894736844,
|
161 |
+
"num_full_absorption": 86,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.6736719884689889,
|
168 |
+
"full_absorption_rate": 0.746031746031746,
|
169 |
+
"num_full_absorption": 1269,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 4
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.7408915278900652,
|
176 |
+
"full_absorption_rate": 0.8589241182757392,
|
177 |
+
"num_full_absorption": 2411,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 3
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.7782384749711955,
|
184 |
+
"full_absorption_rate": 0.7787610619469026,
|
185 |
+
"num_full_absorption": 1320,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 3
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.44644447032672796,
|
192 |
+
"full_absorption_rate": 0.7311258278145696,
|
193 |
+
"num_full_absorption": 552,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.4525626767954696,
|
200 |
+
"full_absorption_rate": 0.5690747782002535,
|
201 |
+
"num_full_absorption": 449,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 5
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.6077924133789215,
|
208 |
+
"full_absorption_rate": 0.7107438016528925,
|
209 |
+
"num_full_absorption": 516,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 3
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.07823997577754759,
|
216 |
+
"full_absorption_rate": 0.07079646017699115,
|
217 |
+
"num_full_absorption": 8,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.16327547847243773,
|
224 |
+
"full_absorption_rate": 0.2159090909090909,
|
225 |
+
"num_full_absorption": 38,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.12801060125008876,
|
232 |
+
"full_absorption_rate": 0.17446808510638298,
|
233 |
+
"num_full_absorption": 41,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_0",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "efd023b3-0586-4aba-8ca4-647050643711",
|
17 |
+
"datetime_epoch_millis": 1740108755984,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.5428689946157796,
|
21 |
+
"mean_full_absorption_score": 0.5340971732661811,
|
22 |
+
"mean_num_split_features": 3.5384615384615383,
|
23 |
+
"std_dev_absorption_fraction_score": 0.24710052044084302,
|
24 |
+
"std_dev_full_absorption_score": 0.24258449258739415,
|
25 |
+
"std_dev_num_split_features": 1.964296703265965
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.7381686628198274,
|
32 |
+
"full_absorption_rate": 0.32137161084529503,
|
33 |
+
"num_full_absorption": 806,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 6
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.5984969751995518,
|
40 |
+
"full_absorption_rate": 0.6368352788586251,
|
41 |
+
"num_full_absorption": 982,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 7
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.8509632004082076,
|
48 |
+
"full_absorption_rate": 0.8281639928698752,
|
49 |
+
"num_full_absorption": 2323,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 3
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.7385909936977806,
|
56 |
+
"full_absorption_rate": 0.6879518072289157,
|
57 |
+
"num_full_absorption": 1142,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 6
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.7064702427486708,
|
64 |
+
"full_absorption_rate": 0.713490099009901,
|
65 |
+
"num_full_absorption": 1153,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 4
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.8272320803216332,
|
72 |
+
"full_absorption_rate": 0.8206785137318255,
|
73 |
+
"num_full_absorption": 1016,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 3
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.48885130442705776,
|
80 |
+
"full_absorption_rate": 0.5048034934497817,
|
81 |
+
"num_full_absorption": 578,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 7
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.5506681226653267,
|
88 |
+
"full_absorption_rate": 0.523671497584541,
|
89 |
+
"num_full_absorption": 542,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 6
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.5955470934241096,
|
96 |
+
"full_absorption_rate": 0.6172161172161172,
|
97 |
+
"num_full_absorption": 1011,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 4
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.4661751002159926,
|
104 |
+
"full_absorption_rate": 0.47572815533980584,
|
105 |
+
"num_full_absorption": 196,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.08811882770229734,
|
112 |
+
"full_absorption_rate": 0.08296296296296296,
|
113 |
+
"num_full_absorption": 56,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.48978935395668166,
|
120 |
+
"full_absorption_rate": 0.5029991431019709,
|
121 |
+
"num_full_absorption": 587,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 6
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.7243955590490241,
|
128 |
+
"full_absorption_rate": 0.7742998352553542,
|
129 |
+
"num_full_absorption": 1410,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 4
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.5111219581814586,
|
136 |
+
"full_absorption_rate": 0.4622166246851385,
|
137 |
+
"num_full_absorption": 367,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 5
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.5677139221288461,
|
144 |
+
"full_absorption_rate": 0.5951265229615745,
|
145 |
+
"num_full_absorption": 635,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 3
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.9270114805234171,
|
152 |
+
"full_absorption_rate": 0.8943908851884312,
|
153 |
+
"num_full_absorption": 2041,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 2
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.3246987177931468,
|
160 |
+
"full_absorption_rate": 0.3157894736842105,
|
161 |
+
"num_full_absorption": 60,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 2
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.6958312111104011,
|
168 |
+
"full_absorption_rate": 0.7184009406231628,
|
169 |
+
"num_full_absorption": 1222,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 4
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.7227235278168044,
|
176 |
+
"full_absorption_rate": 0.8229426433915212,
|
177 |
+
"num_full_absorption": 2310,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 5
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.8121763803507712,
|
184 |
+
"full_absorption_rate": 0.7457227138643068,
|
185 |
+
"num_full_absorption": 1264,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 3
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.2571658200652339,
|
192 |
+
"full_absorption_rate": 0.36821192052980134,
|
193 |
+
"num_full_absorption": 278,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 2
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.447052814587088,
|
200 |
+
"full_absorption_rate": 0.47021546261089986,
|
201 |
+
"num_full_absorption": 371,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 3
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.6457433298421121,
|
208 |
+
"full_absorption_rate": 0.6487603305785123,
|
209 |
+
"num_full_absorption": 471,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 2
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.17761250387893723,
|
216 |
+
"full_absorption_rate": 0.1415929203539823,
|
217 |
+
"num_full_absorption": 16,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.12538379998253557,
|
224 |
+
"full_absorption_rate": 0.1534090909090909,
|
225 |
+
"num_full_absorption": 27,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.03689087711335874,
|
232 |
+
"full_absorption_rate": 0.059574468085106386,
|
233 |
+
"num_full_absorption": 14,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_1",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "15798b98-1844-4735-aaa8-f77bf7522cd9",
|
17 |
+
"datetime_epoch_millis": 1740112140202,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.352325354017403,
|
21 |
+
"mean_full_absorption_score": 0.30593847749055475,
|
22 |
+
"mean_num_split_features": 1.2307692307692308,
|
23 |
+
"std_dev_absorption_fraction_score": 0.22898626550383655,
|
24 |
+
"std_dev_full_absorption_score": 0.22145601085815653,
|
25 |
+
"std_dev_num_split_features": 0.4296689244236597
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.6216706060419831,
|
32 |
+
"full_absorption_rate": 0.4409888357256778,
|
33 |
+
"num_full_absorption": 1106,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.3974572887100187,
|
40 |
+
"full_absorption_rate": 0.3521400778210117,
|
41 |
+
"num_full_absorption": 543,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.6927646146693088,
|
48 |
+
"full_absorption_rate": 0.6413547237076649,
|
49 |
+
"num_full_absorption": 1799,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 1
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.47744418964989493,
|
56 |
+
"full_absorption_rate": 0.36927710843373496,
|
57 |
+
"num_full_absorption": 613,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 2
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.5870694404248965,
|
64 |
+
"full_absorption_rate": 0.573019801980198,
|
65 |
+
"num_full_absorption": 926,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 2
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.5830104351629766,
|
72 |
+
"full_absorption_rate": 0.5201938610662359,
|
73 |
+
"num_full_absorption": 644,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.34745234590018287,
|
80 |
+
"full_absorption_rate": 0.25851528384279476,
|
81 |
+
"num_full_absorption": 296,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 2
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.22967960561507234,
|
88 |
+
"full_absorption_rate": 0.13429951690821257,
|
89 |
+
"num_full_absorption": 139,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.619077199673277,
|
96 |
+
"full_absorption_rate": 0.6501831501831502,
|
97 |
+
"num_full_absorption": 1065,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 1
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.00171975497397602,
|
104 |
+
"full_absorption_rate": 0.0048543689320388345,
|
105 |
+
"num_full_absorption": 2,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.018045039728896824,
|
112 |
+
"full_absorption_rate": 0.01925925925925926,
|
113 |
+
"num_full_absorption": 13,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.45296058549713225,
|
120 |
+
"full_absorption_rate": 0.4670094258783205,
|
121 |
+
"num_full_absorption": 545,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.5811030809298412,
|
128 |
+
"full_absorption_rate": 0.5612300933552993,
|
129 |
+
"num_full_absorption": 1022,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.22312187000875386,
|
136 |
+
"full_absorption_rate": 0.13476070528967254,
|
137 |
+
"num_full_absorption": 107,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.2300412012388341,
|
144 |
+
"full_absorption_rate": 0.1846298031865042,
|
145 |
+
"num_full_absorption": 197,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.6751946688382788,
|
152 |
+
"full_absorption_rate": 0.5753724802804557,
|
153 |
+
"num_full_absorption": 1313,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.05781321882890537,
|
160 |
+
"full_absorption_rate": 0.042105263157894736,
|
161 |
+
"num_full_absorption": 8,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.488805156034051,
|
168 |
+
"full_absorption_rate": 0.4262198706643151,
|
169 |
+
"num_full_absorption": 725,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 2
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.5873766958549872,
|
176 |
+
"full_absorption_rate": 0.6127538297114357,
|
177 |
+
"num_full_absorption": 1720,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 2
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.3832592488071156,
|
184 |
+
"full_absorption_rate": 0.25309734513274335,
|
185 |
+
"num_full_absorption": 429,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.13962747521360538,
|
192 |
+
"full_absorption_rate": 0.09933774834437085,
|
193 |
+
"num_full_absorption": 75,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 2
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.1366646018147327,
|
200 |
+
"full_absorption_rate": 0.13688212927756654,
|
201 |
+
"num_full_absorption": 108,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.3388312082364926,
|
208 |
+
"full_absorption_rate": 0.31129476584022037,
|
209 |
+
"num_full_absorption": 226,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.23041273003221366,
|
216 |
+
"full_absorption_rate": 0.1415929203539823,
|
217 |
+
"num_full_absorption": 16,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.05134630426917764,
|
224 |
+
"full_absorption_rate": 0.03977272727272727,
|
225 |
+
"num_full_absorption": 7,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.00851063829787234,
|
232 |
+
"full_absorption_rate": 0.00425531914893617,
|
233 |
+
"num_full_absorption": 1,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_2",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_3_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "26084e69-c5d9-40cd-b4e5-696f309d64fc",
|
17 |
+
"datetime_epoch_millis": 1740112965166,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.15174524643532927,
|
21 |
+
"mean_full_absorption_score": 0.08589398459398576,
|
22 |
+
"mean_num_split_features": 1.0384615384615385,
|
23 |
+
"std_dev_absorption_fraction_score": 0.16384394369953284,
|
24 |
+
"std_dev_full_absorption_score": 0.14338353890417252,
|
25 |
+
"std_dev_num_split_features": 0.19611613513818404
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.13921711721546576,
|
32 |
+
"full_absorption_rate": 0.03708133971291866,
|
33 |
+
"num_full_absorption": 93,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.04310752832952938,
|
40 |
+
"full_absorption_rate": 0.014267185473411154,
|
41 |
+
"num_full_absorption": 22,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.26743136506353554,
|
48 |
+
"full_absorption_rate": 0.1319073083778966,
|
49 |
+
"num_full_absorption": 370,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 1
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.3161296378836916,
|
56 |
+
"full_absorption_rate": 0.16686746987951806,
|
57 |
+
"num_full_absorption": 277,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 1
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.2612935127831887,
|
64 |
+
"full_absorption_rate": 0.11448019801980198,
|
65 |
+
"num_full_absorption": 185,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 1
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.2231852171454896,
|
72 |
+
"full_absorption_rate": 0.09289176090468497,
|
73 |
+
"num_full_absorption": 115,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.030756473849768082,
|
80 |
+
"full_absorption_rate": 0.00960698689956332,
|
81 |
+
"num_full_absorption": 11,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 2
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.07832209477769293,
|
88 |
+
"full_absorption_rate": 0.025120772946859903,
|
89 |
+
"num_full_absorption": 26,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.34438719358668407,
|
96 |
+
"full_absorption_rate": 0.27106227106227104,
|
97 |
+
"num_full_absorption": 444,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 1
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.004930270721614366,
|
104 |
+
"full_absorption_rate": 0.0048543689320388345,
|
105 |
+
"num_full_absorption": 2,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.0038580199123774677,
|
112 |
+
"full_absorption_rate": 0.002962962962962963,
|
113 |
+
"num_full_absorption": 2,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.14294435771033875,
|
120 |
+
"full_absorption_rate": 0.06255355612682091,
|
121 |
+
"num_full_absorption": 73,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.02202077616866355,
|
128 |
+
"full_absorption_rate": 0.011532125205930808,
|
129 |
+
"num_full_absorption": 21,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.07854167753708283,
|
136 |
+
"full_absorption_rate": 0.02644836272040302,
|
137 |
+
"num_full_absorption": 21,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.22802217761961677,
|
144 |
+
"full_absorption_rate": 0.08341143392689784,
|
145 |
+
"num_full_absorption": 89,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.30941729908873966,
|
152 |
+
"full_absorption_rate": 0.15731814198071867,
|
153 |
+
"num_full_absorption": 359,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.005634211490395127,
|
160 |
+
"full_absorption_rate": 0.0,
|
161 |
+
"num_full_absorption": 0,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.17034694561101904,
|
168 |
+
"full_absorption_rate": 0.07818930041152264,
|
169 |
+
"num_full_absorption": 133,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 1
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.17729512097612715,
|
176 |
+
"full_absorption_rate": 0.10972568578553615,
|
177 |
+
"num_full_absorption": 308,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 1
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.03769830907309684,
|
184 |
+
"full_absorption_rate": 0.00943952802359882,
|
185 |
+
"num_full_absorption": 16,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.750288739748417,
|
192 |
+
"full_absorption_rate": 0.7099337748344371,
|
193 |
+
"num_full_absorption": 536,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.014844840313461553,
|
200 |
+
"full_absorption_rate": 0.012674271229404309,
|
201 |
+
"num_full_absorption": 10,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.1397234584743241,
|
208 |
+
"full_absorption_rate": 0.05371900826446281,
|
209 |
+
"num_full_absorption": 39,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.013887928657641017,
|
216 |
+
"full_absorption_rate": 0.008849557522123894,
|
217 |
+
"num_full_absorption": 1,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.028506459333482823,
|
224 |
+
"full_absorption_rate": 0.03409090909090909,
|
225 |
+
"num_full_absorption": 6,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.11358567424711703,
|
232 |
+
"full_absorption_rate": 0.00425531914893617,
|
233 |
+
"num_full_absorption": 1,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_3",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "1a8188af-1e8d-4efc-b36d-0955803fc7b4",
|
17 |
+
"datetime_epoch_millis": 1740110494886,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.03942344512163808,
|
21 |
+
"mean_full_absorption_score": 0.0140364892361101,
|
22 |
+
"mean_num_split_features": 1.1923076923076923,
|
23 |
+
"std_dev_absorption_fraction_score": 0.09934591318238874,
|
24 |
+
"std_dev_full_absorption_score": 0.05756098788114222,
|
25 |
+
"std_dev_num_split_features": 0.4914656259988704
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.028346535429880127,
|
32 |
+
"full_absorption_rate": 0.0007974481658692185,
|
33 |
+
"num_full_absorption": 2,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.0022608089680371196,
|
40 |
+
"full_absorption_rate": 0.0,
|
41 |
+
"num_full_absorption": 0,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.0025546027061933625,
|
48 |
+
"full_absorption_rate": 0.00071301247771836,
|
49 |
+
"num_full_absorption": 2,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 1
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.0019355578730626043,
|
56 |
+
"full_absorption_rate": 0.0018072289156626507,
|
57 |
+
"num_full_absorption": 3,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 1
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.07498623830651696,
|
64 |
+
"full_absorption_rate": 0.003094059405940594,
|
65 |
+
"num_full_absorption": 5,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 1
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.0050345150551234065,
|
72 |
+
"full_absorption_rate": 0.0,
|
73 |
+
"num_full_absorption": 0,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.010460701096032003,
|
80 |
+
"full_absorption_rate": 0.0008733624454148472,
|
81 |
+
"num_full_absorption": 1,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 2
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.01170635135150821,
|
88 |
+
"full_absorption_rate": 0.001932367149758454,
|
89 |
+
"num_full_absorption": 2,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.1089880317607272,
|
96 |
+
"full_absorption_rate": 0.017704517704517704,
|
97 |
+
"num_full_absorption": 29,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 1
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.0003029988879400231,
|
104 |
+
"full_absorption_rate": 0.0,
|
105 |
+
"num_full_absorption": 0,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.0029912743470123966,
|
112 |
+
"full_absorption_rate": 0.0,
|
113 |
+
"num_full_absorption": 0,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.024211372038585182,
|
120 |
+
"full_absorption_rate": 0.000856898029134533,
|
121 |
+
"num_full_absorption": 1,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.006949418441528633,
|
128 |
+
"full_absorption_rate": 0.006589785831960461,
|
129 |
+
"num_full_absorption": 12,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.025325264607950822,
|
136 |
+
"full_absorption_rate": 0.0025188916876574307,
|
137 |
+
"num_full_absorption": 2,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.06964801703096594,
|
144 |
+
"full_absorption_rate": 0.011246485473289597,
|
145 |
+
"num_full_absorption": 12,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.0016421080000302358,
|
152 |
+
"full_absorption_rate": 0.0,
|
153 |
+
"num_full_absorption": 0,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.002114695349444831,
|
160 |
+
"full_absorption_rate": 0.0,
|
161 |
+
"num_full_absorption": 0,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.0031820979727350375,
|
168 |
+
"full_absorption_rate": 0.0,
|
169 |
+
"num_full_absorption": 0,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 1
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.004449262686803511,
|
176 |
+
"full_absorption_rate": 0.0003562522265764161,
|
177 |
+
"num_full_absorption": 1,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 1
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.012846054983407984,
|
184 |
+
"full_absorption_rate": 0.0005899705014749262,
|
185 |
+
"num_full_absorption": 1,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.04887027767059182,
|
192 |
+
"full_absorption_rate": 0.009271523178807948,
|
193 |
+
"num_full_absorption": 7,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 2
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.0008588606945117642,
|
200 |
+
"full_absorption_rate": 0.0,
|
201 |
+
"num_full_absorption": 0,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.04016210362798525,
|
208 |
+
"full_absorption_rate": 0.006887052341597796,
|
209 |
+
"num_full_absorption": 5,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.010852614455368956,
|
216 |
+
"full_absorption_rate": 0.0,
|
217 |
+
"num_full_absorption": 0,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 3
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.5077652724254798,
|
224 |
+
"full_absorption_rate": 0.29545454545454547,
|
225 |
+
"num_full_absorption": 52,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.01656453739516708,
|
232 |
+
"full_absorption_rate": 0.00425531914893617,
|
233 |
+
"num_full_absorption": 1,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 2
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_4",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/absorption/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_5_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "b39a2bbf-6894-4756-837a-7757325a495d",
|
17 |
+
"datetime_epoch_millis": 1740109628891,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.06555661212074526,
|
21 |
+
"mean_full_absorption_score": 0.028694289243238084,
|
22 |
+
"mean_num_split_features": 1.1153846153846154,
|
23 |
+
"std_dev_absorption_fraction_score": 0.17874387812611048,
|
24 |
+
"std_dev_full_absorption_score": 0.1282236268349608,
|
25 |
+
"std_dev_num_split_features": 0.3258125936084211
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.009520576342205286,
|
32 |
+
"full_absorption_rate": 0.0,
|
33 |
+
"num_full_absorption": 0,
|
34 |
+
"num_probe_true_positives": 2508,
|
35 |
+
"num_split_features": 2
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.0025275048111106477,
|
40 |
+
"full_absorption_rate": 0.0006485084306095979,
|
41 |
+
"num_full_absorption": 1,
|
42 |
+
"num_probe_true_positives": 1542,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.0,
|
48 |
+
"full_absorption_rate": 0.0,
|
49 |
+
"num_full_absorption": 0,
|
50 |
+
"num_probe_true_positives": 2805,
|
51 |
+
"num_split_features": 1
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.0019839170132145943,
|
56 |
+
"full_absorption_rate": 0.0006024096385542169,
|
57 |
+
"num_full_absorption": 1,
|
58 |
+
"num_probe_true_positives": 1660,
|
59 |
+
"num_split_features": 1
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.04418599759645858,
|
64 |
+
"full_absorption_rate": 0.0012376237623762376,
|
65 |
+
"num_full_absorption": 2,
|
66 |
+
"num_probe_true_positives": 1616,
|
67 |
+
"num_split_features": 1
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.003045558610690488,
|
72 |
+
"full_absorption_rate": 0.0,
|
73 |
+
"num_full_absorption": 0,
|
74 |
+
"num_probe_true_positives": 1238,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.003541220671457806,
|
80 |
+
"full_absorption_rate": 0.0,
|
81 |
+
"num_full_absorption": 0,
|
82 |
+
"num_probe_true_positives": 1145,
|
83 |
+
"num_split_features": 2
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.004566561548265196,
|
88 |
+
"full_absorption_rate": 0.0,
|
89 |
+
"num_full_absorption": 0,
|
90 |
+
"num_probe_true_positives": 1035,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.03134665350983779,
|
96 |
+
"full_absorption_rate": 0.0018315018315018315,
|
97 |
+
"num_full_absorption": 3,
|
98 |
+
"num_probe_true_positives": 1638,
|
99 |
+
"num_split_features": 1
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.0007627951868082768,
|
104 |
+
"full_absorption_rate": 0.0,
|
105 |
+
"num_full_absorption": 0,
|
106 |
+
"num_probe_true_positives": 412,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.01549266614777519,
|
112 |
+
"full_absorption_rate": 0.0,
|
113 |
+
"num_full_absorption": 0,
|
114 |
+
"num_probe_true_positives": 675,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.0005719143487719158,
|
120 |
+
"full_absorption_rate": 0.0,
|
121 |
+
"num_full_absorption": 0,
|
122 |
+
"num_probe_true_positives": 1167,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.008494294204347433,
|
128 |
+
"full_absorption_rate": 0.006589785831960461,
|
129 |
+
"num_full_absorption": 12,
|
130 |
+
"num_probe_true_positives": 1821,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.05672687383143593,
|
136 |
+
"full_absorption_rate": 0.0,
|
137 |
+
"num_full_absorption": 0,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.05122861661179144,
|
144 |
+
"full_absorption_rate": 0.0028116213683223993,
|
145 |
+
"num_full_absorption": 3,
|
146 |
+
"num_probe_true_positives": 1067,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.0019615119799893904,
|
152 |
+
"full_absorption_rate": 0.0,
|
153 |
+
"num_full_absorption": 0,
|
154 |
+
"num_probe_true_positives": 2282,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.39355620461923463,
|
160 |
+
"full_absorption_rate": 0.07368421052631578,
|
161 |
+
"num_full_absorption": 14,
|
162 |
+
"num_probe_true_positives": 190,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.0,
|
168 |
+
"full_absorption_rate": 0.0,
|
169 |
+
"num_full_absorption": 0,
|
170 |
+
"num_probe_true_positives": 1701,
|
171 |
+
"num_split_features": 1
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.00032212707935616407,
|
176 |
+
"full_absorption_rate": 0.0,
|
177 |
+
"num_full_absorption": 0,
|
178 |
+
"num_probe_true_positives": 2807,
|
179 |
+
"num_split_features": 1
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.005862875355780282,
|
184 |
+
"full_absorption_rate": 0.0,
|
185 |
+
"num_full_absorption": 0,
|
186 |
+
"num_probe_true_positives": 1695,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.04422480500553416,
|
192 |
+
"full_absorption_rate": 0.0013245033112582781,
|
193 |
+
"num_full_absorption": 1,
|
194 |
+
"num_probe_true_positives": 755,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.004069543132561543,
|
200 |
+
"full_absorption_rate": 0.0025348542458808617,
|
201 |
+
"num_full_absorption": 2,
|
202 |
+
"num_probe_true_positives": 789,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.019203869068399586,
|
208 |
+
"full_absorption_rate": 0.0013774104683195593,
|
209 |
+
"num_full_absorption": 1,
|
210 |
+
"num_probe_true_positives": 726,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.06325101886841776,
|
216 |
+
"full_absorption_rate": 0.0,
|
217 |
+
"num_full_absorption": 0,
|
218 |
+
"num_probe_true_positives": 113,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.8563286531653201,
|
224 |
+
"full_absorption_rate": 0.6534090909090909,
|
225 |
+
"num_full_absorption": 115,
|
226 |
+
"num_probe_true_positives": 176,
|
227 |
+
"num_split_features": 2
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.08169615643061263,
|
232 |
+
"full_absorption_rate": 0.0,
|
233 |
+
"num_full_absorption": 0,
|
234 |
+
"num_probe_true_positives": 235,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "60579ed19a6281956621283bada7be2e76a7b583",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_5",
|
241 |
+
"sae_lens_version": "5.4.2",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 65536,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b0d19fc459e1f9c514d0097fae277f0831a61abd03d9e726ff6f4b7a652dea9f
|
3 |
+
size 27623781
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9fca9a6b43a6677b507b976053fb9e656720e0b39cae2f6626dda6c0882bb58a
|
3 |
+
size 27478722
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1887153b4660a98b73df19ca3d6929e0a628dbd019daa0767626cc8a3174e668
|
3 |
+
size 27326392
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_3_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4639d5faecfe47d93388cd85219ec503ec42c2c3efce501ac7cb4c96ba201587
|
3 |
+
size 26972592
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:733a1150854b8898c75ecaba2481408d4d03eedf0d9a176c1a04af2761eb92ce
|
3 |
+
size 26685823
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_5_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9320040cfecad57406c4aee36f6ee3100cab3f8a7fce767da1f159d90109005e
|
3 |
+
size 26334864
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:df3613f337e54b63e0feec7c52b072ff8d02ca4e8b2cefccb7ebc7ebb35aeada
|
3 |
+
size 26846162
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4ba2ffbe45d5a2a16ec7ba867974934e88a66209b036561ece8edc07b56580ec
|
3 |
+
size 26732924
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:72a689e8599a850a9a8beb0f9c6414ff2403e613b7ccff4c8928b2468e9279e0
|
3 |
+
size 26438355
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_3_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5b04aad0de34d397bd245a06c6ba7c5eb821485360e9cae47bd1ae3b10fbdae5
|
3 |
+
size 26285690
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:544cdad19a6c89a8374f44af5f98f1dfb2418b5f9350bc9c02f11d822ce680e3
|
3 |
+
size 26155343
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_standard_new_width-2pow16_date-0107_resid_post_layer_12_trainer_5_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b5304e4005dc5d04c891d20903db02b54c689b76921aa9db3aec1991fa83844f
|
3 |
+
size 25984532
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a3082c208b6c18c843fc43ee0221547e1a97273f9ec78d78b9290f2030b44453
|
3 |
+
size 26444963
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2f389b4cbe12e5e559fa5126e6e3ab171d1c979fbf358400600784e750e52c82
|
3 |
+
size 26356253
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6c276830a7613665611e50bb694a0ea2389c34ce5d52b443daa5d03e60f169b4
|
3 |
+
size 26610835
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_3_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ae1c49b0452d85f33ec36c07e2649801741441c4fa30fc49e1cfa60e6a219ae9
|
3 |
+
size 26712914
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:667104ce70a5ec0ef03908c41894b7d592f51fabb11feea7f1119195a2840a4c
|
3 |
+
size 26824711
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow14_date-0107_resid_post_layer_12_trainer_5_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b49951403d7d345b98f4104f56dbf932ed2bb5285ef62da752faaf5f28e32ff0
|
3 |
+
size 26393676
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:33c4b6c6bc4a00b7cab365f25798d5059f5b1a8eb73395ea3c8e5ad8d9ae3204
|
3 |
+
size 25068488
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_1_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c39909390312c51b98e9b9b812343319a32b3ac403dd5f9e39752df62370d828
|
3 |
+
size 25597832
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_2_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9b895a1eb166c2e94c2d10b32032a7b51dac853c542244972590c6ec9889adf5
|
3 |
+
size 25403761
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_3_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0f7f89e595e738eadb4099bcc96a3408f36eb04346a57b53ff0ab8db448785d7
|
3 |
+
size 25537327
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_4_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b2463cff82576d2ff2fe1bb866256d1b6f77401d54ee95611881fa9399cb6bba
|
3 |
+
size 25536425
|
eval_results_finetunes/autointerp/kl_finetunes_gemma-2-2b_top_k_width-2pow16_date-0107_resid_post_layer_12_trainer_5_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d6c13849849e6e2277b075568d296d43d091a547356ff229777c9a4d8228986a
|
3 |
+
size 25181253
|
eval_results_finetunes/core/kl_finetunes_gemma-2-2b_standard_new_width-2pow14_date-0107_resid_post_layer_12_trainer_0_custom_sae_eval_results.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|