HakHan committed
Commit c6a41ac · verified · 1 Parent(s): 07caa33

Upload folder using huggingface_hub

Files changed (42)
  1. .gitattributes +0 -34
  2. README.md +11 -3
  3. SafeSwitch_Llama-3.1-8B-Instruct/direct_prober/args.json +25 -0
  4. SafeSwitch_Llama-3.1-8B-Instruct/direct_prober/model_weights.pth +3 -0
  5. SafeSwitch_Llama-3.1-8B-Instruct/direct_prober/result.json +7 -0
  6. SafeSwitch_Llama-3.1-8B-Instruct/refusal_head.pth +3 -0
  7. SafeSwitch_Llama-3.1-8B-Instruct/stage1_prober/args.json +25 -0
  8. SafeSwitch_Llama-3.1-8B-Instruct/stage1_prober/model_weights.pth +3 -0
  9. SafeSwitch_Llama-3.1-8B-Instruct/stage1_prober/result.json +7 -0
  10. SafeSwitch_Llama-3.1-8B-Instruct/stage2_prober/args.json +25 -0
  11. SafeSwitch_Llama-3.1-8B-Instruct/stage2_prober/model_weights.pth +3 -0
  12. SafeSwitch_Llama-3.1-8B-Instruct/stage2_prober/result.json +7 -0
  13. SafeSwitch_Ministral-8B-Instruct-2410/direct_prober/args.json +23 -0
  14. SafeSwitch_Ministral-8B-Instruct-2410/direct_prober/model_weights.pth +3 -0
  15. SafeSwitch_Ministral-8B-Instruct-2410/direct_prober/result.json +7 -0
  16. SafeSwitch_Ministral-8B-Instruct-2410/refusal_head.pth +3 -0
  17. SafeSwitch_Ministral-8B-Instruct-2410/stage1_prober/args.json +23 -0
  18. SafeSwitch_Ministral-8B-Instruct-2410/stage1_prober/model_weights.pth +3 -0
  19. SafeSwitch_Ministral-8B-Instruct-2410/stage1_prober/result.json +7 -0
  20. SafeSwitch_Ministral-8B-Instruct-2410/stage2_prober/args.json +23 -0
  21. SafeSwitch_Ministral-8B-Instruct-2410/stage2_prober/model_weights.pth +3 -0
  22. SafeSwitch_Ministral-8B-Instruct-2410/stage2_prober/result.json +7 -0
  23. SafeSwitch_Yi-1.5-9B-Chat/direct_prober/args.json +25 -0
  24. SafeSwitch_Yi-1.5-9B-Chat/direct_prober/model_weights.pth +3 -0
  25. SafeSwitch_Yi-1.5-9B-Chat/direct_prober/result.json +7 -0
  26. SafeSwitch_Yi-1.5-9B-Chat/refusal_head.pth +3 -0
  27. SafeSwitch_Yi-1.5-9B-Chat/stage1_prober/args.json +25 -0
  28. SafeSwitch_Yi-1.5-9B-Chat/stage1_prober/model_weights.pth +3 -0
  29. SafeSwitch_Yi-1.5-9B-Chat/stage1_prober/result.json +7 -0
  30. SafeSwitch_Yi-1.5-9B-Chat/stage2_prober/args.json +25 -0
  31. SafeSwitch_Yi-1.5-9B-Chat/stage2_prober/model_weights.pth +3 -0
  32. SafeSwitch_Yi-1.5-9B-Chat/stage2_prober/result.json +7 -0
  33. SwfeSwitch_Qwen2.5-7B-Instruct/direct_prober/args.json +25 -0
  34. SwfeSwitch_Qwen2.5-7B-Instruct/direct_prober/model_weights.pth +3 -0
  35. SwfeSwitch_Qwen2.5-7B-Instruct/direct_prober/result.json +7 -0
  36. SwfeSwitch_Qwen2.5-7B-Instruct/refusal_head.pth +3 -0
  37. SwfeSwitch_Qwen2.5-7B-Instruct/stage1_prober/args.json +25 -0
  38. SwfeSwitch_Qwen2.5-7B-Instruct/stage1_prober/model_weights.pth +3 -0
  39. SwfeSwitch_Qwen2.5-7B-Instruct/stage1_prober/result.json +7 -0
  40. SwfeSwitch_Qwen2.5-7B-Instruct/stage2_prober/args.json +25 -0
  41. SwfeSwitch_Qwen2.5-7B-Instruct/stage2_prober/model_weights.pth +3 -0
  42. SwfeSwitch_Qwen2.5-7B-Instruct/stage2_prober/result.json +7 -0
.gitattributes CHANGED
@@ -1,35 +1 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
  *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,11 @@
- ---
- license: mit
- ---
+ Refer to our [code repo](https://github.com/Hanpx20/SafeSwitch) for usage.
+
+ `refusal_head.pth`: the refusal head.
+
+ `direct_prober/`: the direct prober from the last layer.
+
+ `stage1_prober/`: the prober that predicts unsafe inputs from the last-layer tokens.
+
+ `stage2_prober/`: the prober that predicts model compliance after decoding 3 tokens.
+
+ All probers are 2-layer MLPs with an intermediate size of 64.
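For orientation, below is a minimal sketch of how one of these prober checkpoints might be inspected with PyTorch. It assumes `model_weights.pth` stores a plain state dict; the actual prober class and loading code are in the linked SafeSwitch repo.

```python
# Minimal inspection sketch (not the official SafeSwitch loader): assumes each
# model_weights.pth stores a plain PyTorch state dict. See the linked code repo
# for the actual prober class and loading code.
import torch

state = torch.load(
    "SafeSwitch_Llama-3.1-8B-Instruct/stage1_prober/model_weights.pth",
    map_location="cpu",
)
for name, tensor in state.items():  # print parameter names and shapes
    print(name, tuple(tensor.shape))
```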
SafeSwitch_Llama-3.1-8B-Instruct/direct_prober/args.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "epochs": 20,
+ "batch_size": 8,
+ "learning_rate": 1e-05,
+ "base_dir": "/shared/nas2/ph16/toxic/outputs/classifier",
+ "data_dir": "/shared/nas2/ph16/toxic/outputs/states/Llama-3.1-8B-Instruct",
+ "job_name": "Llama-3.1-8B-Instruct_both/layer32",
+ "ckpt": "",
+ "gpu": "",
+ "wandb": false,
+ "overwrite": true,
+ "hidden_sizes": [
+ "64"
+ ],
+ "random_seed": 42,
+ "token_rule": "last",
+ "label": "both",
+ "n_decode": 0,
+ "layer_id": 32,
+ "num_classification": 2,
+ "neg_weight": 1.0,
+ "lens_path": "",
+ "llm": "Llama-3.1-8B-Instruct",
+ "use_lens": false
+ }
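To make the fields above concrete, the sketch below builds the prober shape they imply (`hidden_sizes: ["64"]`, `num_classification: 2`) on top of the base model's layer-32 hidden states. `build_prober` is a hypothetical helper, not part of the SafeSwitch repo, and the ReLU activation is an assumption.

```python
# Hedged sketch: construct the MLP implied by this args.json. The helper name
# and the activation are illustrative; the real training code is in the SafeSwitch repo.
import torch.nn as nn

def build_prober(input_dim: int, hidden_sizes: list[int], num_classes: int) -> nn.Sequential:
    layers, prev = [], input_dim
    for width in hidden_sizes:  # a single hidden layer of width 64 here
        layers += [nn.Linear(prev, width), nn.ReLU()]
        prev = width
    layers.append(nn.Linear(prev, num_classes))
    return nn.Sequential(*layers)

# layer_id=32, token_rule="last": probe the final layer's last-token hidden state
prober = build_prober(input_dim=4096, hidden_sizes=[64], num_classes=2)
```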
SafeSwitch_Llama-3.1-8B-Instruct/direct_prober/model_weights.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:42f144e6ef937fab8e46fdc975576e6d9bbcff8c3bf6bd0511c8b601df20ff49
+ size 1051584
SafeSwitch_Llama-3.1-8B-Instruct/direct_prober/result.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "F1": 85.435,
+ "acc": 93.227,
+ "positive_rate": 23.955,
+ "prec": 88.105,
+ "recall": 82.922
+ }
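As a quick worked check (illustrative only), the reported F1 is consistent with the reported precision and recall via F1 = 2PR / (P + R):

```python
# Sanity check of the reported metrics above: F1 = 2*P*R / (P + R)
prec, recall = 88.105, 82.922
f1 = 2 * prec * recall / (prec + recall)
print(round(f1, 3))  # -> 85.435, matching the "F1" field above
```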
SafeSwitch_Llama-3.1-8B-Instruct/refusal_head.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d6c9ce8c3b81b2f8ce3154ab1c7170f2b5c19e2afd598e29e46cd58be54b93f7
+ size 2101347548
SafeSwitch_Llama-3.1-8B-Instruct/stage1_prober/args.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "epochs": 20,
+ "batch_size": 8,
+ "learning_rate": 1e-05,
+ "base_dir": "/shared/nas2/ph16/toxic/outputs/classifier",
+ "data_dir": "/shared/nas2/ph16/toxic/outputs/states/Llama-3.1-8B-Instruct",
+ "job_name": "Llama-3.1-8B-Instruct_multi_safety/token0",
+ "ckpt": "",
+ "gpu": "",
+ "wandb": false,
+ "overwrite": true,
+ "hidden_sizes": [
+ "64"
+ ],
+ "random_seed": 42,
+ "token_rule": "multi",
+ "label": "safety",
+ "n_decode": 0,
+ "layer_id": 32,
+ "num_classification": 2,
+ "neg_weight": 1.0,
+ "lens_path": "",
+ "llm": "Llama-3.1-8B-Instruct",
+ "use_lens": false
+ }
SafeSwitch_Llama-3.1-8B-Instruct/stage1_prober/model_weights.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6a4da13c0df8f8d46b66a0118adfd6a687a1bf68112f42184fef105bf4992d16
+ size 1051584
SafeSwitch_Llama-3.1-8B-Instruct/stage1_prober/result.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "F1": 95.022,
+ "acc": 95.909,
+ "positive_rate": 40.909,
+ "prec": 94.604,
+ "recall": 95.444
+ }
SafeSwitch_Llama-3.1-8B-Instruct/stage2_prober/args.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "epochs": 20,
+ "batch_size": 8,
+ "learning_rate": 1e-05,
+ "base_dir": "/shared/nas2/ph16/toxic/outputs/classifier",
+ "data_dir": "/shared/nas2/ph16/toxic/outputs/states/Llama-3.1-8B-Instruct",
+ "job_name": "Llama-3.1-8B-Instruct_multi_response/token3",
+ "ckpt": "",
+ "gpu": "",
+ "wandb": false,
+ "overwrite": true,
+ "hidden_sizes": [
+ "64"
+ ],
+ "random_seed": 42,
+ "token_rule": "multi",
+ "label": "response",
+ "n_decode": 3,
+ "layer_id": 32,
+ "num_classification": 2,
+ "neg_weight": 1.0,
+ "lens_path": "",
+ "llm": "Llama-3.1-8B-Instruct",
+ "use_lens": false
+ }
SafeSwitch_Llama-3.1-8B-Instruct/stage2_prober/model_weights.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:67bcd47033d8c1a75cc4221bfd9c55fc49d6f2a3867f06ada0f885cc519b330c
+ size 1051584
SafeSwitch_Llama-3.1-8B-Instruct/stage2_prober/result.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "F1": 98.369,
+ "acc": 97.409,
+ "positive_rate": 79.091,
+ "prec": 97.949,
+ "recall": 98.793
+ }
SafeSwitch_Ministral-8B-Instruct-2410/direct_prober/args.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "epochs": 20,
+ "batch_size": 8,
+ "learning_rate": 1e-05,
+ "base_dir": "/shared/nas2/ph16/toxic/outputs/classifier",
+ "data_dir": "/shared/nas2/ph16/toxic/outputs/states/Ministral-8B-Instruct-2410",
+ "job_name": "Ministral-8B-Instruct-2410_both/layer36",
+ "ckpt": "",
+ "gpu": "",
+ "wandb": false,
+ "overwrite": true,
+ "hidden_sizes": [
+ "64"
+ ],
+ "random_seed": 42,
+ "token_rule": "last",
+ "label": "both",
+ "n_decode": 0,
+ "layer_id": 36,
+ "num_classification": 2,
+ "neg_weight": 1.0,
+ "llm": "Ministral-8B-Instruct-2410"
+ }
SafeSwitch_Ministral-8B-Instruct-2410/direct_prober/model_weights.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:04d59fa05b5f560a4fc2e0ef659622b811a7dfd0af9396849899fd16c2dbc322
+ size 1051584
SafeSwitch_Ministral-8B-Instruct-2410/direct_prober/result.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "F1": 88.587,
+ "acc": 92.364,
+ "positive_rate": 32.773,
+ "prec": 86.818,
+ "recall": 90.43
+ }
SafeSwitch_Ministral-8B-Instruct-2410/refusal_head.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:118e733c3423193c1e2023ca87b3518c4bc1231a13cdaee576905bd45004ffbb
+ size 2147484892
SafeSwitch_Ministral-8B-Instruct-2410/stage1_prober/args.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "epochs": 20,
+ "batch_size": 8,
+ "learning_rate": 1e-05,
+ "base_dir": "/shared/nas2/ph16/toxic/outputs/classifier",
+ "data_dir": "/shared/nas2/ph16/toxic/outputs/states/Ministral-8B-Instruct-2410",
+ "job_name": "Ministral-8B-Instruct-2410_multi_safety/token0",
+ "ckpt": "",
+ "gpu": "",
+ "wandb": false,
+ "overwrite": true,
+ "hidden_sizes": [
+ "64"
+ ],
+ "random_seed": 42,
+ "token_rule": "multi",
+ "label": "safety",
+ "n_decode": 0,
+ "layer_id": -1,
+ "num_classification": 2,
+ "neg_weight": 1.0,
+ "llm": "Ministral-8B-Instruct-2410"
+ }
SafeSwitch_Ministral-8B-Instruct-2410/stage1_prober/model_weights.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:db2325ec00a6fd983b39aebe698690c46cee6b6c1a52c9e24a7beff060a7d264
+ size 1051584
SafeSwitch_Ministral-8B-Instruct-2410/stage1_prober/result.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "F1": 94.8,
+ "acc": 95.682,
+ "positive_rate": 40.909,
+ "prec": 93.42,
+ "recall": 96.222
+ }
SafeSwitch_Ministral-8B-Instruct-2410/stage2_prober/args.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "epochs": 20,
+ "batch_size": 8,
+ "learning_rate": 1e-05,
+ "base_dir": "/shared/nas2/ph16/toxic/outputs/classifier",
+ "data_dir": "/shared/nas2/ph16/toxic/outputs/states/Ministral-8B-Instruct-2410",
+ "job_name": "Ministral-8B-Instruct-2410_multi_response/token3",
+ "ckpt": "",
+ "gpu": "",
+ "wandb": false,
+ "overwrite": true,
+ "hidden_sizes": [
+ "64"
+ ],
+ "random_seed": 42,
+ "token_rule": "multi",
+ "label": "response",
+ "n_decode": 3,
+ "layer_id": -1,
+ "num_classification": 2,
+ "neg_weight": 1.0,
+ "llm": "Ministral-8B-Instruct-2410"
+ }
SafeSwitch_Ministral-8B-Instruct-2410/stage2_prober/model_weights.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b0926c89c23db5ed38f107b3525ee025d374ec4b28643e44f686303c88640ba3
+ size 1051584
SafeSwitch_Ministral-8B-Instruct-2410/stage2_prober/result.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "F1": 97.665,
+ "acc": 95.818,
+ "positive_rate": 89.409,
+ "prec": 97.516,
+ "recall": 97.814
+ }
SafeSwitch_Yi-1.5-9B-Chat/direct_prober/args.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "epochs": 20,
+ "batch_size": 8,
+ "learning_rate": 1e-05,
+ "base_dir": "/shared/nas2/ph16/toxic/outputs/classifier",
+ "data_dir": "/shared/nas2/ph16/toxic/outputs/states/Yi-1.5-9B-Chat",
+ "job_name": "Yi-1.5-9B-Chat_both/layer48",
+ "ckpt": "",
+ "gpu": "",
+ "wandb": false,
+ "overwrite": true,
+ "hidden_sizes": [
+ "64"
+ ],
+ "random_seed": 42,
+ "token_rule": "last",
+ "label": "both",
+ "n_decode": 0,
+ "layer_id": 48,
+ "num_classification": 2,
+ "neg_weight": 1.0,
+ "lens_path": "",
+ "llm": "Yi-1.5-9B-Chat",
+ "use_lens": false
+ }
SafeSwitch_Yi-1.5-9B-Chat/direct_prober/model_weights.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a05dda4f63e650a835fd3caff30b3647cf12f7023661083e7b10157d1beab011
+ size 1051584
SafeSwitch_Yi-1.5-9B-Chat/direct_prober/result.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "F1": 86.595,
+ "acc": 92.091,
+ "positive_rate": 29.364,
+ "prec": 86.196,
+ "recall": 86.997
+ }
SafeSwitch_Yi-1.5-9B-Chat/refusal_head.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b175f425c26d02ebce55add355e3f4bb29aa08c73488bbb74add6c222f9bb0fa
+ size 1048577244
SafeSwitch_Yi-1.5-9B-Chat/stage1_prober/args.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "epochs": 20,
+ "batch_size": 8,
+ "learning_rate": 1e-05,
+ "base_dir": "/shared/nas2/ph16/toxic/outputs/classifier",
+ "data_dir": "/shared/nas2/ph16/toxic/outputs/states/Yi-1.5-9B-Chat",
+ "job_name": "Yi-1.5-9B-Chat_multi_safety/token0",
+ "ckpt": "",
+ "gpu": "",
+ "wandb": false,
+ "overwrite": true,
+ "hidden_sizes": [
+ "64"
+ ],
+ "random_seed": 42,
+ "token_rule": "multi",
+ "label": "safety",
+ "n_decode": 0,
+ "layer_id": 48,
+ "num_classification": 2,
+ "neg_weight": 1.0,
+ "lens_path": "",
+ "llm": "Yi-1.5-9B-Chat",
+ "use_lens": false
+ }
SafeSwitch_Yi-1.5-9B-Chat/stage1_prober/model_weights.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:efdbbc95fc9425a92eaf683e0eeb97bd75d683b2a1a4d77a361b89451e313411
+ size 1051584
SafeSwitch_Yi-1.5-9B-Chat/stage1_prober/result.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "F1": 92.536,
+ "acc": 93.818,
+ "positive_rate": 40.909,
+ "prec": 91.432,
+ "recall": 93.667
+ }
SafeSwitch_Yi-1.5-9B-Chat/stage2_prober/args.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "epochs": 20,
+ "batch_size": 8,
+ "learning_rate": 1e-05,
+ "base_dir": "/shared/nas2/ph16/toxic/outputs/classifier",
+ "data_dir": "/shared/nas2/ph16/toxic/outputs/states/Yi-1.5-9B-Chat",
+ "job_name": "Yi-1.5-9B-Chat_multi_response/token3",
+ "ckpt": "",
+ "gpu": "",
+ "wandb": false,
+ "overwrite": true,
+ "hidden_sizes": [
+ "64"
+ ],
+ "random_seed": 42,
+ "token_rule": "multi",
+ "label": "response",
+ "n_decode": 3,
+ "layer_id": 48,
+ "num_classification": 2,
+ "neg_weight": 1.0,
+ "lens_path": "",
+ "llm": "Yi-1.5-9B-Chat",
+ "use_lens": false
+ }
SafeSwitch_Yi-1.5-9B-Chat/stage2_prober/model_weights.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a9036a833d33997443441616015a0264514aeb452050cd86168587c838d8a7ca
+ size 1051584
SafeSwitch_Yi-1.5-9B-Chat/stage2_prober/result.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "F1": 98.24,
+ "acc": 96.909,
+ "positive_rate": 86.682,
+ "prec": 96.985,
+ "recall": 99.528
+ }
SwfeSwitch_Qwen2.5-7B-Instruct/direct_prober/args.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "epochs": 20,
+ "batch_size": 8,
+ "learning_rate": 1e-05,
+ "base_dir": "/shared/nas2/ph16/toxic/outputs/classifier",
+ "data_dir": "/shared/nas2/ph16/toxic/outputs/states/Qwen2.5-7B-Instruct",
+ "job_name": "Qwen2.5-7B-Instruct_both/layer28",
+ "ckpt": "",
+ "gpu": "",
+ "wandb": false,
+ "overwrite": true,
+ "hidden_sizes": [
+ "64"
+ ],
+ "random_seed": 42,
+ "token_rule": "last",
+ "label": "both",
+ "n_decode": 0,
+ "layer_id": 28,
+ "num_classification": 2,
+ "neg_weight": 1.0,
+ "lens_path": "",
+ "llm": "Qwen2.5-7B-Instruct",
+ "use_lens": false
+ }
SwfeSwitch_Qwen2.5-7B-Instruct/direct_prober/model_weights.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8d6bb732f71e5b6fe8f0aed289d73a76422c5bdafcb2f6a82b581b60a62aae38
+ size 920512
SwfeSwitch_Qwen2.5-7B-Instruct/direct_prober/result.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "F1": 86.651,
+ "acc": 92.045,
+ "positive_rate": 29.727,
+ "prec": 86.454,
+ "recall": 86.85
+ }
SwfeSwitch_Qwen2.5-7B-Instruct/refusal_head.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a4869de880199a48ce421a9a95166f9b2f295c3a864e397d4ef73ef36fe57899
+ size 2179990748
SwfeSwitch_Qwen2.5-7B-Instruct/stage1_prober/args.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "epochs": 20,
+ "batch_size": 8,
+ "learning_rate": 1e-05,
+ "base_dir": "/shared/nas2/ph16/toxic/outputs/classifier",
+ "data_dir": "/shared/nas2/ph16/toxic/outputs/states/Qwen2.5-7B-Instruct",
+ "job_name": "Qwen2.5-7B-Instruct_multi_safety/token0",
+ "ckpt": "",
+ "gpu": "",
+ "wandb": false,
+ "overwrite": true,
+ "hidden_sizes": [
+ "64"
+ ],
+ "random_seed": 42,
+ "token_rule": "multi",
+ "label": "safety",
+ "n_decode": 0,
+ "layer_id": 28,
+ "num_classification": 2,
+ "neg_weight": 1.0,
+ "lens_path": "",
+ "llm": "Qwen2.5-7B-Instruct",
+ "use_lens": false
+ }
SwfeSwitch_Qwen2.5-7B-Instruct/stage1_prober/model_weights.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:67754809618102057da44cc98d3bb98b8639e827108089f7d779e64da74a04b1
+ size 920512
SwfeSwitch_Qwen2.5-7B-Instruct/stage1_prober/result.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "F1": 93.475,
+ "acc": 94.682,
+ "positive_rate": 40.909,
+ "prec": 93.841,
+ "recall": 93.111
+ }
SwfeSwitch_Qwen2.5-7B-Instruct/stage2_prober/args.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "epochs": 20,
+ "batch_size": 8,
+ "learning_rate": 1e-05,
+ "base_dir": "/shared/nas2/ph16/toxic/outputs/classifier",
+ "data_dir": "/shared/nas2/ph16/toxic/outputs/states/Qwen2.5-7B-Instruct",
+ "job_name": "Qwen2.5-7B-Instruct_multi_response/token3",
+ "ckpt": "",
+ "gpu": "",
+ "wandb": false,
+ "overwrite": true,
+ "hidden_sizes": [
+ "64"
+ ],
+ "random_seed": 42,
+ "token_rule": "multi",
+ "label": "response",
+ "n_decode": 3,
+ "layer_id": 28,
+ "num_classification": 2,
+ "neg_weight": 1.0,
+ "lens_path": "",
+ "llm": "Qwen2.5-7B-Instruct",
+ "use_lens": false
+ }
SwfeSwitch_Qwen2.5-7B-Instruct/stage2_prober/model_weights.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f150487f92aae5d1c33188efd6ebb91fd7d1bb3cc5567d5ce1a60fd129fdbdf5
+ size 920512
SwfeSwitch_Qwen2.5-7B-Instruct/stage2_prober/result.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "F1": 96.885,
+ "acc": 94.636,
+ "positive_rate": 85.0,
+ "prec": 95.673,
+ "recall": 98.128
+ }