Merge branch 'main' of hf.co:tangledgroup/tangled-alpha-0.11-core
Browse files
- README.md +13 -13
- config-0.json +4 -4
README.md
CHANGED
@@ -93,20 +93,20 @@ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable
|
|
93 |
|
94 |
```
|
95 |
Seed set to 23
|
96 |
-
Time to instantiate model: 0.
|
97 |
-
Total parameters:
|
98 |
Verifying settings ...
|
99 |
-
Measured TFLOPs:
|
100 |
-
Epoch 1 | iter 64 step 1 | loss train: 11.
|
101 |
-
Epoch 1 | iter 128 step 2 | loss train: 11.
|
102 |
-
Epoch 1 | iter 192 step 3 | loss train: 11.
|
103 |
-
Epoch 1 | iter 256 step 4 | loss train: 11.
|
104 |
-
Epoch 1 | iter 320 step 5 | loss train: 11.
|
105 |
-
Epoch 1 | iter 384 step 6 | loss train: 11.
|
106 |
-
Epoch 1 | iter 448 step 7 | loss train: 11.
|
107 |
-
Epoch 1 | iter 512 step 8 | loss train: 11.
|
108 |
-
Epoch 1 | iter 576 step 9 | loss train: 11.
|
109 |
-
Epoch 1 | iter 640 step 10 | loss train: 11.
|
110 |
# ...
|
111 |
```
|
112 |
|
|
|
93 |
|
94 |
```
|
95 |
Seed set to 23
|
96 |
+
Time to instantiate model: 0.20 seconds.
|
97 |
+
Total parameters: 234,897,920
|
98 |
Verifying settings ...
|
99 |
+
Measured TFLOPs: 28077.03
|
100 |
+
Epoch 1 | iter 64 step 1 | loss train: 11.977, val: n/a | iter time: 350.96 ms (step) remaining time: 10 days, 14:14:05
|
101 |
+
Epoch 1 | iter 128 step 2 | loss train: 11.977, val: n/a | iter time: 280.36 ms (step) remaining time: 7 days, 8:25:44
|
102 |
+
Epoch 1 | iter 192 step 3 | loss train: 11.974, val: n/a | iter time: 280.80 ms (step) remaining time: 6 days, 6:28:36
|
103 |
+
Epoch 1 | iter 256 step 4 | loss train: 11.975, val: n/a | iter time: 281.44 ms (step) remaining time: 5 days, 17:28:43
|
104 |
+
Epoch 1 | iter 320 step 5 | loss train: 11.974, val: n/a | iter time: 280.13 ms (step) remaining time: 5 days, 9:40:25
|
105 |
+
Epoch 1 | iter 384 step 6 | loss train: 11.976, val: n/a | iter time: 281.50 ms (step) remaining time: 5 days, 4:26:59
|
106 |
+
Epoch 1 | iter 448 step 7 | loss train: 11.974, val: n/a | iter time: 280.34 ms (step) remaining time: 5 days, 0:43:34
|
107 |
+
Epoch 1 | iter 512 step 8 | loss train: 11.970, val: n/a | iter time: 280.74 ms (step) remaining time: 4 days, 21:55:15
|
108 |
+
Epoch 1 | iter 576 step 9 | loss train: 11.970, val: n/a | iter time: 279.90 ms (step) remaining time: 4 days, 19:44:24
|
109 |
+
Epoch 1 | iter 640 step 10 | loss train: 11.971, val: n/a | iter time: 279.74 ms (step) remaining time: 4 days, 17:59:44
|
110 |
# ...
|
111 |
```
|
112 |
|
config-0.json
CHANGED
@@ -8,15 +8,15 @@
|
|
8 |
"eos_token_id": 1,
|
9 |
"head_dim": 64,
|
10 |
"hidden_act": "silu",
|
11 |
-
"hidden_size":
|
12 |
"initializer_range": 0.02,
|
13 |
-
"intermediate_size":
|
14 |
"max_position_embeddings": 131072,
|
15 |
"mlp_bias": false,
|
16 |
"model_type": "llama",
|
17 |
-
"num_attention_heads":
|
18 |
"num_hidden_layers": 32,
|
19 |
-
"num_key_value_heads":
|
20 |
"pretraining_tp": 1,
|
21 |
"rms_norm_eps": 1e-05,
|
22 |
"rope_scaling": null,
|
|
|
8 |
"eos_token_id": 1,
|
9 |
"head_dim": 64,
|
10 |
"hidden_act": "silu",
|
11 |
+
"hidden_size": 512,
|
12 |
"initializer_range": 0.02,
|
13 |
+
"intermediate_size": 1365,
|
14 |
"max_position_embeddings": 131072,
|
15 |
"mlp_bias": false,
|
16 |
"model_type": "llama",
|
17 |
+
"num_attention_heads": 8,
|
18 |
"num_hidden_layers": 32,
|
19 |
+
"num_key_value_heads": 8,
|
20 |
"pretraining_tp": 1,
|
21 |
"rms_norm_eps": 1e-05,
|
22 |
"rope_scaling": null,
|