Adil1567 commited on
Commit
68710ab
·
verified ·
1 Parent(s): efaeb13

Training in progress, epoch 0

Browse files
logs/training_log.txt CHANGED
@@ -1,58 +1,120 @@
1
- 2025-01-08 18:29:22,070 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmpw2j4jae_/test.c -o /tmp/tmpw2j4jae_/test.o
2
- 2025-01-08 18:29:22,097 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmpw2j4jae_/test.o -laio -o /tmp/tmpw2j4jae_/a.out
3
- 2025-01-08 18:29:22,252 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmp3xs2q_w0/test.c -o /tmp/tmp3xs2q_w0/test.o
4
- 2025-01-08 18:29:22,279 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmp3xs2q_w0/test.o -laio -o /tmp/tmp3xs2q_w0/a.out
5
- 2025-01-08 18:29:22,281 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmpoehuhgbl/test.c -o /tmp/tmpoehuhgbl/test.o
6
- 2025-01-08 18:29:22,307 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmpoehuhgbl/test.o -laio -o /tmp/tmpoehuhgbl/a.out
7
- 2025-01-08 18:29:22,311 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmp5eog_7fp/test.c -o /tmp/tmp5eog_7fp/test.o
8
- 2025-01-08 18:29:22,334 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmp5eog_7fp/test.o -laio -o /tmp/tmp5eog_7fp/a.out
9
- 2025-01-08 18:29:22,519 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmp7o4d197o/test.c -o /tmp/tmp7o4d197o/test.o
10
- 2025-01-08 18:29:22,545 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmp7o4d197o/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmp7o4d197o/a.out
11
- 2025-01-08 18:29:22,683 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmpskkmpgdv/test.c -o /tmp/tmpskkmpgdv/test.o
12
- 2025-01-08 18:29:22,710 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmpskkmpgdv/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpskkmpgdv/a.out
13
- 2025-01-08 18:29:22,759 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmp0i19mv7y/test.c -o /tmp/tmp0i19mv7y/test.o
14
- 2025-01-08 18:29:22,778 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmptzck4dvd/test.c -o /tmp/tmptzck4dvd/test.o
15
- 2025-01-08 18:29:22,785 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmp0i19mv7y/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmp0i19mv7y/a.out
16
- 2025-01-08 18:29:22,795 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmptzck4dvd/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmptzck4dvd/a.out
17
- 2025-01-08 18:34:07,128 - INFO - Training started
18
- 2025-01-08 18:34:07,129 - INFO - Total steps: 2
19
- 2025-01-08 18:37:14,796 - INFO - Loss improved from inf to 1.98041
20
- 2025-01-08 18:37:14,796 - INFO - Loss improved from inf to 1.98041
21
- 2025-01-08 18:37:14,797 - INFO - Loss improved from inf to 1.98041
22
- 2025-01-08 18:37:14,798 - INFO - Step 1/2 (50.0%), epoch: 1.0000, step_time: 571.32s, elapsed_time: 571.32s
23
- 2025-01-08 18:37:14,799 - INFO - Evaluation Results:
24
- eval_loss: 1.9804
25
- eval_runtime: 24.9974
26
- eval_samples_per_second: 0.3200
27
- eval_steps_per_second: 0.0800
28
- epoch: 1.0000
29
- elapsed_time: 571.32s
30
- step_time: 571.32s
31
- 2025-01-08 18:37:14,799 - INFO - Loss improved from inf to 1.98041
32
- 2025-01-08 18:40:40,756 - INFO - Saving model to mistral-sft-lora-fsdp2/checkpoint-1/pytorch_model_fsdp_0
33
- 2025-01-08 18:40:44,085 - INFO - Model saved to mistral-sft-lora-fsdp2/checkpoint-1/pytorch_model_fsdp_0
34
- 2025-01-08 18:40:50,139 - INFO - Saving Optimizer state to mistral-sft-lora-fsdp2/checkpoint-1/optimizer_0
35
- 2025-01-08 18:40:56,423 - INFO - Optimizer state saved in mistral-sft-lora-fsdp2/checkpoint-1/optimizer_0
36
- 2025-01-08 18:44:56,103 - INFO - Saving model to mistral-sft-lora-fsdp2/checkpoint-2/pytorch_model_fsdp_0
37
- 2025-01-08 18:44:59,225 - INFO - Model saved to mistral-sft-lora-fsdp2/checkpoint-2/pytorch_model_fsdp_0
38
- 2025-01-08 18:45:05,105 - INFO - Saving Optimizer state to mistral-sft-lora-fsdp2/checkpoint-2/optimizer_0
39
- 2025-01-08 18:45:11,104 - INFO - Optimizer state saved in mistral-sft-lora-fsdp2/checkpoint-2/optimizer_0
40
- 2025-01-08 18:45:36,527 - INFO - Loss improved from 1.98041 to 1.83309
41
- 2025-01-08 18:45:36,527 - INFO - Loss improved from 1.98041 to 1.83309
42
- 2025-01-08 18:45:36,527 - INFO - Loss improved from 1.98041 to 1.83309
43
- 2025-01-08 18:45:36,528 - INFO - Step 2/2 (100.0%), epoch: 2.0000, step_time: 501.73s, elapsed_time: 1073.05s
44
- 2025-01-08 18:45:36,529 - INFO - Evaluation Results:
45
- eval_loss: 1.8331
46
- eval_runtime: 25.1685
47
- eval_samples_per_second: 0.3180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  eval_steps_per_second: 0.0790
49
- epoch: 2.0000
50
- elapsed_time: 1073.05s
51
- step_time: 501.73s
52
- 2025-01-08 18:45:36,529 - INFO - Loss improved from 1.98041 to 1.83309
53
- 2025-01-08 18:48:59,163 - INFO - Saving model to mistral-sft-lora-fsdp2/checkpoint-2/pytorch_model_fsdp_0
54
- 2025-01-08 18:49:02,615 - INFO - Model saved to mistral-sft-lora-fsdp2/checkpoint-2/pytorch_model_fsdp_0
55
- 2025-01-08 18:49:08,850 - INFO - Saving Optimizer state to mistral-sft-lora-fsdp2/checkpoint-2/optimizer_0
56
- 2025-01-08 18:49:15,280 - INFO - Optimizer state saved in mistral-sft-lora-fsdp2/checkpoint-2/optimizer_0
57
- 2025-01-08 18:49:15,799 - INFO - Step 2/2 (100.0%), epoch: 2.0000, step_time: 219.27s, elapsed_time: 1292.32s
58
- 2025-01-08 18:49:15,801 - INFO - Training completed in 1292.32 seconds
 
1
+ 2025-01-08 19:17:21,692 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmp2ck6dpv_/test.c -o /tmp/tmp2ck6dpv_/test.o
2
+ 2025-01-08 19:17:21,723 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmp2ck6dpv_/test.o -laio -o /tmp/tmp2ck6dpv_/a.out
3
+ 2025-01-08 19:17:22,159 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmppfomhgow/test.c -o /tmp/tmppfomhgow/test.o
4
+ 2025-01-08 19:17:22,204 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmppfomhgow/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmppfomhgow/a.out
5
+ 2025-01-08 19:17:24,497 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmp0g0jxvh5/test.c -o /tmp/tmp0g0jxvh5/test.o
6
+ 2025-01-08 19:17:24,525 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmp0g0jxvh5/test.o -laio -o /tmp/tmp0g0jxvh5/a.out
7
+ 2025-01-08 19:17:24,555 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmp6e8myn4y/test.c -o /tmp/tmp6e8myn4y/test.o
8
+ 2025-01-08 19:17:24,557 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmp7vyq5nz_/test.c -o /tmp/tmp7vyq5nz_/test.o
9
+ 2025-01-08 19:17:24,582 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmp6e8myn4y/test.o -laio -o /tmp/tmp6e8myn4y/a.out
10
+ 2025-01-08 19:17:24,583 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmp7vyq5nz_/test.o -laio -o /tmp/tmp7vyq5nz_/a.out
11
+ 2025-01-08 19:17:24,960 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmp2rrm1y3q/test.c -o /tmp/tmp2rrm1y3q/test.o
12
+ 2025-01-08 19:17:24,983 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmpbg_v2wps/test.c -o /tmp/tmpbg_v2wps/test.o
13
+ 2025-01-08 19:17:24,986 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmp2rrm1y3q/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmp2rrm1y3q/a.out
14
+ 2025-01-08 19:17:25,007 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmpbg_v2wps/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpbg_v2wps/a.out
15
+ 2025-01-08 19:17:25,049 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmp8gt5q4f1/test.c -o /tmp/tmp8gt5q4f1/test.o
16
+ 2025-01-08 19:17:25,071 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmp8gt5q4f1/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmp8gt5q4f1/a.out
17
+ 2025-01-08 19:22:03,063 - INFO - Training started
18
+ 2025-01-08 19:22:03,063 - INFO - Total steps: 1281
19
+ 2025-01-08 19:34:20,509 - INFO - Step 5/1281 (0.4%), loss: 1.4755, learning_rate: 1.00e-04, epoch: 0.0117, step_time: 1120.55s, elapsed_time: 1120.55s, grad_norm: 0.8934
20
+ 2025-01-08 19:44:33,351 - INFO - Step 10/1281 (0.8%), loss: 1.0298, learning_rate: 1.00e-04, epoch: 0.0234, step_time: 612.84s, elapsed_time: 1733.39s, grad_norm: 0.7723
21
+ 2025-01-08 19:54:45,845 - INFO - Step 15/1281 (1.2%), loss: 0.8797, learning_rate: 1.00e-04, epoch: 0.0351, step_time: 612.49s, elapsed_time: 2345.88s, grad_norm: 1.5860
22
+ 2025-01-08 20:04:57,722 - INFO - Step 20/1281 (1.6%), loss: 0.7675, learning_rate: 9.99e-05, epoch: 0.0468, step_time: 611.88s, elapsed_time: 2957.76s, grad_norm: 1.6310
23
+ 2025-01-08 20:15:05,543 - INFO - Step 25/1281 (2.0%), loss: 0.7195, learning_rate: 9.99e-05, epoch: 0.0585, step_time: 607.82s, elapsed_time: 3565.58s, grad_norm: 1.2201
24
+ 2025-01-08 20:25:08,368 - INFO - Step 30/1281 (2.3%), loss: 0.6904, learning_rate: 9.99e-05, epoch: 0.0702, step_time: 602.82s, elapsed_time: 4168.40s, grad_norm: 0.6741
25
+ 2025-01-08 20:35:09,287 - INFO - Step 35/1281 (2.7%), loss: 0.6628, learning_rate: 9.98e-05, epoch: 0.0819, step_time: 600.92s, elapsed_time: 4769.32s, grad_norm: 0.5197
26
+ 2025-01-08 20:45:12,080 - INFO - Step 40/1281 (3.1%), loss: 0.6241, learning_rate: 9.98e-05, epoch: 0.0936, step_time: 602.79s, elapsed_time: 5372.12s, grad_norm: 0.4424
27
+ 2025-01-08 20:55:15,658 - INFO - Step 45/1281 (3.5%), loss: 0.6229, learning_rate: 9.97e-05, epoch: 0.1053, step_time: 603.58s, elapsed_time: 5975.69s, grad_norm: 0.4752
28
+ 2025-01-08 21:05:17,810 - INFO - Step 50/1281 (3.9%), loss: 0.5978, learning_rate: 9.96e-05, epoch: 0.1170, step_time: 602.15s, elapsed_time: 6577.85s, grad_norm: 0.3438
29
+ 2025-01-08 21:15:19,753 - INFO - Step 55/1281 (4.3%), loss: 0.5847, learning_rate: 9.95e-05, epoch: 0.1287, step_time: 601.94s, elapsed_time: 7179.79s, grad_norm: 0.3600
30
+ 2025-01-08 21:25:21,775 - INFO - Step 60/1281 (4.7%), loss: 0.5686, learning_rate: 9.95e-05, epoch: 0.1404, step_time: 602.02s, elapsed_time: 7781.81s, grad_norm: 0.3749
31
+ 2025-01-08 21:35:25,945 - INFO - Step 65/1281 (5.1%), loss: 0.5787, learning_rate: 9.94e-05, epoch: 0.1520, step_time: 604.17s, elapsed_time: 8385.98s, grad_norm: 0.3729
32
+ 2025-01-08 21:45:26,630 - INFO - Step 70/1281 (5.5%), loss: 0.5608, learning_rate: 9.93e-05, epoch: 0.1637, step_time: 600.69s, elapsed_time: 8986.67s, grad_norm: 0.3449
33
+ 2025-01-08 21:55:29,457 - INFO - Step 75/1281 (5.9%), loss: 0.5192, learning_rate: 9.92e-05, epoch: 0.1754, step_time: 602.83s, elapsed_time: 9589.49s, grad_norm: 0.3919
34
+ 2025-01-08 22:05:33,796 - INFO - Step 80/1281 (6.2%), loss: 0.5120, learning_rate: 9.90e-05, epoch: 0.1871, step_time: 604.34s, elapsed_time: 10193.83s, grad_norm: 0.3015
35
+ 2025-01-08 22:15:37,562 - INFO - Step 85/1281 (6.6%), loss: 0.4869, learning_rate: 9.89e-05, epoch: 0.1988, step_time: 603.77s, elapsed_time: 10797.60s, grad_norm: 0.2931
36
+ 2025-01-08 22:25:41,155 - INFO - Step 90/1281 (7.0%), loss: 0.4632, learning_rate: 9.88e-05, epoch: 0.2105, step_time: 603.59s, elapsed_time: 11401.19s, grad_norm: 0.3108
37
+ 2025-01-08 22:35:45,900 - INFO - Step 95/1281 (7.4%), loss: 0.4794, learning_rate: 9.86e-05, epoch: 0.2222, step_time: 604.74s, elapsed_time: 12005.94s, grad_norm: 0.3473
38
+ 2025-01-08 22:45:48,393 - INFO - Step 100/1281 (7.8%), loss: 0.4609, learning_rate: 9.85e-05, epoch: 0.2339, step_time: 602.49s, elapsed_time: 12608.43s, grad_norm: 0.2963
39
+ 2025-01-08 22:55:51,306 - INFO - Step 105/1281 (8.2%), loss: 0.4842, learning_rate: 9.84e-05, epoch: 0.2456, step_time: 602.91s, elapsed_time: 13211.34s, grad_norm: 0.2883
40
+ 2025-01-08 23:05:52,794 - INFO - Step 110/1281 (8.6%), loss: 0.4557, learning_rate: 9.82e-05, epoch: 0.2573, step_time: 601.49s, elapsed_time: 13812.83s, grad_norm: 0.2928
41
+ 2025-01-08 23:15:55,888 - INFO - Step 115/1281 (9.0%), loss: 0.4644, learning_rate: 9.80e-05, epoch: 0.2690, step_time: 603.09s, elapsed_time: 14415.92s, grad_norm: 0.2669
42
+ 2025-01-08 23:25:57,881 - INFO - Step 120/1281 (9.4%), loss: 0.4490, learning_rate: 9.79e-05, epoch: 0.2807, step_time: 601.99s, elapsed_time: 15017.92s, grad_norm: 0.3591
43
+ 2025-01-08 23:35:58,659 - INFO - Step 125/1281 (9.8%), loss: 0.4663, learning_rate: 9.77e-05, epoch: 0.2924, step_time: 600.78s, elapsed_time: 15618.70s, grad_norm: 0.2833
44
+ 2025-01-08 23:46:00,173 - INFO - Step 130/1281 (10.1%), loss: 0.4461, learning_rate: 9.75e-05, epoch: 0.3041, step_time: 601.51s, elapsed_time: 16220.21s, grad_norm: 0.2706
45
+ 2025-01-08 23:56:01,734 - INFO - Step 135/1281 (10.5%), loss: 0.4481, learning_rate: 9.73e-05, epoch: 0.3158, step_time: 601.56s, elapsed_time: 16821.77s, grad_norm: 0.2958
46
+ 2025-01-09 00:06:06,317 - INFO - Step 140/1281 (10.9%), loss: 0.4631, learning_rate: 9.71e-05, epoch: 0.3275, step_time: 604.58s, elapsed_time: 17426.35s, grad_norm: 0.2749
47
+ 2025-01-09 00:16:09,098 - INFO - Step 145/1281 (11.3%), loss: 0.4503, learning_rate: 9.69e-05, epoch: 0.3392, step_time: 602.78s, elapsed_time: 18029.13s, grad_norm: 0.3135
48
+ 2025-01-09 00:26:11,816 - INFO - Step 150/1281 (11.7%), loss: 0.4389, learning_rate: 9.67e-05, epoch: 0.3509, step_time: 602.72s, elapsed_time: 18631.85s, grad_norm: 0.2961
49
+ 2025-01-09 00:36:12,847 - INFO - Step 155/1281 (12.1%), loss: 0.4391, learning_rate: 9.64e-05, epoch: 0.3626, step_time: 601.03s, elapsed_time: 19232.88s, grad_norm: 0.2587
50
+ 2025-01-09 00:46:15,889 - INFO - Step 160/1281 (12.5%), loss: 0.4372, learning_rate: 9.62e-05, epoch: 0.3743, step_time: 603.04s, elapsed_time: 19835.93s, grad_norm: 0.2949
51
+ 2025-01-09 00:56:18,225 - INFO - Step 165/1281 (12.9%), loss: 0.4333, learning_rate: 9.60e-05, epoch: 0.3860, step_time: 602.34s, elapsed_time: 20438.26s, grad_norm: 0.2650
52
+ 2025-01-09 01:06:21,912 - INFO - Step 170/1281 (13.3%), loss: 0.4352, learning_rate: 9.57e-05, epoch: 0.3977, step_time: 603.69s, elapsed_time: 21041.95s, grad_norm: 0.2787
53
+ 2025-01-09 01:16:26,354 - INFO - Step 175/1281 (13.7%), loss: 0.4215, learning_rate: 9.55e-05, epoch: 0.4094, step_time: 604.44s, elapsed_time: 21646.39s, grad_norm: 0.2737
54
+ 2025-01-09 01:26:27,417 - INFO - Step 180/1281 (14.1%), loss: 0.4382, learning_rate: 9.52e-05, epoch: 0.4211, step_time: 601.06s, elapsed_time: 22247.45s, grad_norm: 0.2691
55
+ 2025-01-09 01:36:29,557 - INFO - Step 185/1281 (14.4%), loss: 0.4456, learning_rate: 9.49e-05, epoch: 0.4327, step_time: 602.14s, elapsed_time: 22849.59s, grad_norm: 0.2718
56
+ 2025-01-09 01:46:30,681 - INFO - Step 190/1281 (14.8%), loss: 0.4134, learning_rate: 9.47e-05, epoch: 0.4444, step_time: 601.12s, elapsed_time: 23450.72s, grad_norm: 0.2703
57
+ 2025-01-09 01:56:32,016 - INFO - Step 195/1281 (15.2%), loss: 0.4200, learning_rate: 9.44e-05, epoch: 0.4561, step_time: 601.33s, elapsed_time: 24052.05s, grad_norm: 0.2519
58
+ 2025-01-09 02:06:34,240 - INFO - Step 200/1281 (15.6%), loss: 0.4261, learning_rate: 9.41e-05, epoch: 0.4678, step_time: 602.22s, elapsed_time: 24654.28s, grad_norm: 0.3421
59
+ 2025-01-09 02:16:36,844 - INFO - Step 205/1281 (16.0%), loss: 0.3964, learning_rate: 9.38e-05, epoch: 0.4795, step_time: 602.60s, elapsed_time: 25256.88s, grad_norm: 0.2663
60
+ 2025-01-09 02:26:39,776 - INFO - Step 210/1281 (16.4%), loss: 0.4266, learning_rate: 9.35e-05, epoch: 0.4912, step_time: 602.93s, elapsed_time: 25859.81s, grad_norm: 0.2692
61
+ 2025-01-09 02:36:42,346 - INFO - Step 215/1281 (16.8%), loss: 0.4340, learning_rate: 9.32e-05, epoch: 0.5029, step_time: 602.57s, elapsed_time: 26462.38s, grad_norm: 0.2842
62
+ 2025-01-09 02:46:44,912 - INFO - Step 220/1281 (17.2%), loss: 0.4246, learning_rate: 9.29e-05, epoch: 0.5146, step_time: 602.57s, elapsed_time: 27064.95s, grad_norm: 0.4175
63
+ 2025-01-09 02:56:48,074 - INFO - Step 225/1281 (17.6%), loss: 0.4436, learning_rate: 9.26e-05, epoch: 0.5263, step_time: 603.16s, elapsed_time: 27668.11s, grad_norm: 0.2852
64
+ 2025-01-09 03:06:49,000 - INFO - Step 230/1281 (18.0%), loss: 0.4152, learning_rate: 9.23e-05, epoch: 0.5380, step_time: 600.93s, elapsed_time: 28269.04s, grad_norm: 0.2848
65
+ 2025-01-09 03:16:50,893 - INFO - Step 235/1281 (18.3%), loss: 0.4013, learning_rate: 9.19e-05, epoch: 0.5497, step_time: 601.89s, elapsed_time: 28870.93s, grad_norm: 0.2704
66
+ 2025-01-09 03:26:53,653 - INFO - Step 240/1281 (18.7%), loss: 0.3941, learning_rate: 9.16e-05, epoch: 0.5614, step_time: 602.76s, elapsed_time: 29473.69s, grad_norm: 0.2616
67
+ 2025-01-09 03:36:53,946 - INFO - Step 245/1281 (19.1%), loss: 0.4165, learning_rate: 9.12e-05, epoch: 0.5731, step_time: 600.29s, elapsed_time: 30073.98s, grad_norm: 0.2544
68
+ 2025-01-09 03:46:56,614 - INFO - Step 250/1281 (19.5%), loss: 0.4177, learning_rate: 9.09e-05, epoch: 0.5848, step_time: 602.67s, elapsed_time: 30676.65s, grad_norm: 0.2776
69
+ 2025-01-09 03:56:57,796 - INFO - Step 255/1281 (19.9%), loss: 0.4018, learning_rate: 9.05e-05, epoch: 0.5965, step_time: 601.18s, elapsed_time: 31277.83s, grad_norm: 0.2499
70
+ 2025-01-09 04:07:00,066 - INFO - Step 260/1281 (20.3%), loss: 0.4138, learning_rate: 9.02e-05, epoch: 0.6082, step_time: 602.27s, elapsed_time: 31880.10s, grad_norm: 0.2693
71
+ 2025-01-09 04:17:02,224 - INFO - Step 265/1281 (20.7%), loss: 0.3984, learning_rate: 8.98e-05, epoch: 0.6199, step_time: 602.16s, elapsed_time: 32482.26s, grad_norm: 0.2744
72
+ 2025-01-09 04:27:03,630 - INFO - Step 270/1281 (21.1%), loss: 0.4269, learning_rate: 8.94e-05, epoch: 0.6316, step_time: 601.41s, elapsed_time: 33083.67s, grad_norm: 0.2762
73
+ 2025-01-09 04:37:06,555 - INFO - Step 275/1281 (21.5%), loss: 0.3986, learning_rate: 8.91e-05, epoch: 0.6433, step_time: 602.93s, elapsed_time: 33686.59s, grad_norm: 0.2647
74
+ 2025-01-09 04:47:08,811 - INFO - Step 280/1281 (21.9%), loss: 0.4057, learning_rate: 8.87e-05, epoch: 0.6550, step_time: 602.26s, elapsed_time: 34288.85s, grad_norm: 0.2787
75
+ 2025-01-09 04:57:11,235 - INFO - Step 285/1281 (22.2%), loss: 0.4143, learning_rate: 8.83e-05, epoch: 0.6667, step_time: 602.42s, elapsed_time: 34891.27s, grad_norm: 0.3001
76
+ 2025-01-09 05:07:12,645 - INFO - Step 290/1281 (22.6%), loss: 0.4012, learning_rate: 8.79e-05, epoch: 0.6784, step_time: 601.41s, elapsed_time: 35492.68s, grad_norm: 0.2544
77
+ 2025-01-09 05:17:14,293 - INFO - Step 295/1281 (23.0%), loss: 0.3942, learning_rate: 8.75e-05, epoch: 0.6901, step_time: 601.65s, elapsed_time: 36094.33s, grad_norm: 0.2604
78
+ 2025-01-09 05:27:17,925 - INFO - Step 300/1281 (23.4%), loss: 0.3974, learning_rate: 8.71e-05, epoch: 0.7018, step_time: 603.63s, elapsed_time: 36697.96s, grad_norm: 0.2718
79
+ 2025-01-09 05:37:19,535 - INFO - Step 305/1281 (23.8%), loss: 0.3967, learning_rate: 8.67e-05, epoch: 0.7135, step_time: 601.61s, elapsed_time: 37299.57s, grad_norm: 0.2717
80
+ 2025-01-09 05:47:20,092 - INFO - Step 310/1281 (24.2%), loss: 0.3765, learning_rate: 8.62e-05, epoch: 0.7251, step_time: 600.56s, elapsed_time: 37900.13s, grad_norm: 0.2735
81
+ 2025-01-09 05:57:20,851 - INFO - Step 315/1281 (24.6%), loss: 0.4131, learning_rate: 8.58e-05, epoch: 0.7368, step_time: 600.76s, elapsed_time: 38500.89s, grad_norm: 0.2609
82
+ 2025-01-09 06:07:22,985 - INFO - Step 320/1281 (25.0%), loss: 0.3945, learning_rate: 8.54e-05, epoch: 0.7485, step_time: 602.13s, elapsed_time: 39103.02s, grad_norm: 0.2507
83
+ 2025-01-09 06:17:24,449 - INFO - Step 325/1281 (25.4%), loss: 0.3916, learning_rate: 8.49e-05, epoch: 0.7602, step_time: 601.46s, elapsed_time: 39704.49s, grad_norm: 0.2386
84
+ 2025-01-09 06:27:25,872 - INFO - Step 330/1281 (25.8%), loss: 0.3894, learning_rate: 8.45e-05, epoch: 0.7719, step_time: 601.42s, elapsed_time: 40305.91s, grad_norm: 0.2645
85
+ 2025-01-09 06:37:27,281 - INFO - Step 335/1281 (26.2%), loss: 0.3955, learning_rate: 8.41e-05, epoch: 0.7836, step_time: 601.41s, elapsed_time: 40907.32s, grad_norm: 0.2722
86
+ 2025-01-09 06:47:28,321 - INFO - Step 340/1281 (26.5%), loss: 0.3725, learning_rate: 8.36e-05, epoch: 0.7953, step_time: 601.04s, elapsed_time: 41508.36s, grad_norm: 0.2430
87
+ 2025-01-09 06:57:30,311 - INFO - Step 345/1281 (26.9%), loss: 0.3883, learning_rate: 8.31e-05, epoch: 0.8070, step_time: 601.99s, elapsed_time: 42110.35s, grad_norm: 0.2525
88
+ 2025-01-09 07:07:32,983 - INFO - Step 350/1281 (27.3%), loss: 0.3883, learning_rate: 8.27e-05, epoch: 0.8187, step_time: 602.67s, elapsed_time: 42713.02s, grad_norm: 0.2387
89
+ 2025-01-09 07:17:34,098 - INFO - Step 355/1281 (27.7%), loss: 0.3906, learning_rate: 8.22e-05, epoch: 0.8304, step_time: 601.12s, elapsed_time: 43314.13s, grad_norm: 0.2725
90
+ 2025-01-09 07:27:37,098 - INFO - Step 360/1281 (28.1%), loss: 0.3751, learning_rate: 8.17e-05, epoch: 0.8421, step_time: 603.00s, elapsed_time: 43917.13s, grad_norm: 0.2814
91
+ 2025-01-09 07:37:37,150 - INFO - Step 365/1281 (28.5%), loss: 0.3858, learning_rate: 8.13e-05, epoch: 0.8538, step_time: 600.05s, elapsed_time: 44517.19s, grad_norm: 0.2561
92
+ 2025-01-09 07:47:40,487 - INFO - Step 370/1281 (28.9%), loss: 0.3629, learning_rate: 8.08e-05, epoch: 0.8655, step_time: 603.34s, elapsed_time: 45120.52s, grad_norm: 0.2712
93
+ 2025-01-09 07:57:41,870 - INFO - Step 375/1281 (29.3%), loss: 0.3733, learning_rate: 8.03e-05, epoch: 0.8772, step_time: 601.38s, elapsed_time: 45721.91s, grad_norm: 0.2457
94
+ 2025-01-09 08:07:42,687 - INFO - Step 380/1281 (29.7%), loss: 0.3691, learning_rate: 7.98e-05, epoch: 0.8889, step_time: 600.82s, elapsed_time: 46322.72s, grad_norm: 0.2544
95
+ 2025-01-09 08:17:46,148 - INFO - Step 385/1281 (30.1%), loss: 0.3768, learning_rate: 7.93e-05, epoch: 0.9006, step_time: 603.46s, elapsed_time: 46926.18s, grad_norm: 0.2821
96
+ 2025-01-09 08:27:49,374 - INFO - Step 390/1281 (30.4%), loss: 0.3914, learning_rate: 7.88e-05, epoch: 0.9123, step_time: 603.23s, elapsed_time: 47529.41s, grad_norm: 0.2370
97
+ 2025-01-09 08:37:51,424 - INFO - Step 395/1281 (30.8%), loss: 0.3796, learning_rate: 7.83e-05, epoch: 0.9240, step_time: 602.05s, elapsed_time: 48131.46s, grad_norm: 0.2675
98
+ 2025-01-09 08:47:53,337 - INFO - Step 400/1281 (31.2%), loss: 0.3701, learning_rate: 7.78e-05, epoch: 0.9357, step_time: 601.91s, elapsed_time: 48733.37s, grad_norm: 0.2477
99
+ 2025-01-09 08:57:57,081 - INFO - Step 405/1281 (31.6%), loss: 0.3703, learning_rate: 7.73e-05, epoch: 0.9474, step_time: 603.74s, elapsed_time: 49337.12s, grad_norm: 0.2288
100
+ 2025-01-09 09:07:58,310 - INFO - Step 410/1281 (32.0%), loss: 0.3958, learning_rate: 7.68e-05, epoch: 0.9591, step_time: 601.23s, elapsed_time: 49938.35s, grad_norm: 0.2681
101
+ 2025-01-09 09:17:59,916 - INFO - Step 415/1281 (32.4%), loss: 0.3704, learning_rate: 7.63e-05, epoch: 0.9708, step_time: 601.61s, elapsed_time: 50539.95s, grad_norm: 0.2619
102
+ 2025-01-09 09:28:02,267 - INFO - Step 420/1281 (32.8%), loss: 0.3609, learning_rate: 7.57e-05, epoch: 0.9825, step_time: 602.35s, elapsed_time: 51142.30s, grad_norm: 0.2586
103
+ 2025-01-09 09:38:04,706 - INFO - Step 425/1281 (33.2%), loss: 0.3553, learning_rate: 7.52e-05, epoch: 0.9942, step_time: 602.44s, elapsed_time: 51744.74s, grad_norm: 0.2764
104
+ 2025-01-09 11:03:43,855 - INFO - Loss improved from inf to 0.37839
105
+ 2025-01-09 11:03:43,855 - INFO - Loss improved from inf to 0.37839
106
+ 2025-01-09 11:03:43,855 - INFO - Loss improved from inf to 0.37839
107
+ 2025-01-09 11:03:43,856 - INFO - Step 427/1281 (33.3%), epoch: 0.9988, step_time: 5139.15s, elapsed_time: 56883.89s
108
+ 2025-01-09 11:03:43,858 - INFO - Evaluation Results:
109
+ eval_loss: 0.3784
110
+ eval_runtime: 4839.9190
111
+ eval_samples_per_second: 0.3140
112
  eval_steps_per_second: 0.0790
113
+ epoch: 0.9988
114
+ elapsed_time: 56883.89s
115
+ step_time: 5139.15s
116
+ 2025-01-09 11:03:43,858 - INFO - Loss improved from inf to 0.37839
117
+ 2025-01-09 11:07:38,811 - INFO - Saving model to mistral-sft-lora-fsdp2/checkpoint-427/pytorch_model_fsdp_0
118
+ 2025-01-09 11:07:41,993 - INFO - Model saved to mistral-sft-lora-fsdp2/checkpoint-427/pytorch_model_fsdp_0
119
+ 2025-01-09 11:07:48,542 - INFO - Saving Optimizer state to mistral-sft-lora-fsdp2/checkpoint-427/optimizer_0
120
+ 2025-01-09 11:07:54,762 - INFO - Optimizer state saved in mistral-sft-lora-fsdp2/checkpoint-427/optimizer_0
 
 
runs/Jan08_19-15-39_gpu-server/events.out.tfevents.1736364121.gpu-server.1095267.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:541a840766d43f95086492b9860aca0e0b96e56395216e99c70d250b0e305019
3
+ size 23713
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c9099e841f56ef2dbca18d7a3750f478bbc35b0d6001546c8b501b0c4a1d3ff0
3
  size 5560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45369fb129012ba11d00d00653f2cc946852e10c134daf9fc8b8c017c046bdbf
3
  size 5560