Training in progress, epoch 1
Browse files
logs/training_log.txt
CHANGED
@@ -1,58 +1,35 @@
|
|
1 |
-
2025-01-08
|
2 |
-
2025-01-08
|
3 |
-
2025-01-08
|
4 |
-
2025-01-08
|
5 |
-
2025-01-08
|
6 |
-
2025-01-08
|
7 |
-
2025-01-08
|
8 |
-
2025-01-08
|
9 |
-
2025-01-08
|
10 |
-
2025-01-08
|
11 |
-
2025-01-08
|
12 |
-
2025-01-08
|
13 |
-
2025-01-08
|
14 |
-
2025-01-08
|
15 |
-
2025-01-08
|
16 |
-
2025-01-08
|
17 |
-
2025-01-08
|
18 |
-
2025-01-08
|
19 |
-
2025-01-08
|
20 |
-
2025-01-08
|
21 |
-
2025-01-08
|
22 |
-
2025-01-08
|
23 |
-
2025-01-08
|
24 |
-
eval_loss: 1.
|
25 |
-
eval_runtime:
|
26 |
-
eval_samples_per_second: 0.
|
27 |
-
eval_steps_per_second: 0.
|
28 |
epoch: 1.0000
|
29 |
-
elapsed_time:
|
30 |
-
step_time:
|
31 |
-
2025-01-08
|
32 |
-
2025-01-08
|
33 |
-
2025-01-08
|
34 |
-
2025-01-08
|
35 |
-
2025-01-08
|
36 |
-
2025-01-08 13:56:45,309 - INFO - Saving model to mistral-sft-lora-fsdp2/checkpoint-2/pytorch_model_fsdp_0
|
37 |
-
2025-01-08 13:56:48,398 - INFO - Model saved to mistral-sft-lora-fsdp2/checkpoint-2/pytorch_model_fsdp_0
|
38 |
-
2025-01-08 13:56:53,278 - INFO - Saving Optimizer state to mistral-sft-lora-fsdp2/checkpoint-2/optimizer_0
|
39 |
-
2025-01-08 13:56:59,275 - INFO - Optimizer state saved in mistral-sft-lora-fsdp2/checkpoint-2/optimizer_0
|
40 |
-
2025-01-08 13:57:23,252 - INFO - Loss improved from 1.97976 to 1.83431
|
41 |
-
2025-01-08 13:57:23,252 - INFO - Loss improved from 1.97976 to 1.83431
|
42 |
-
2025-01-08 13:57:23,252 - INFO - Loss improved from 1.97976 to 1.83431
|
43 |
-
2025-01-08 13:57:23,253 - INFO - Step 2/2 (100.0%), epoch: 2.0000, step_time: 205.04s, elapsed_time: 599.83s
|
44 |
-
2025-01-08 13:57:23,254 - INFO - Evaluation Results:
|
45 |
-
eval_loss: 1.8343
|
46 |
-
eval_runtime: 23.7205
|
47 |
-
eval_samples_per_second: 0.3370
|
48 |
-
eval_steps_per_second: 0.0840
|
49 |
-
epoch: 2.0000
|
50 |
-
elapsed_time: 599.83s
|
51 |
-
step_time: 205.04s
|
52 |
-
2025-01-08 13:57:23,255 - INFO - Loss improved from 1.97976 to 1.83431
|
53 |
-
2025-01-08 13:58:21,011 - INFO - Saving model to mistral-sft-lora-fsdp2/checkpoint-2/pytorch_model_fsdp_0
|
54 |
-
2025-01-08 13:58:24,262 - INFO - Model saved to mistral-sft-lora-fsdp2/checkpoint-2/pytorch_model_fsdp_0
|
55 |
-
2025-01-08 13:58:29,156 - INFO - Saving Optimizer state to mistral-sft-lora-fsdp2/checkpoint-2/optimizer_0
|
56 |
-
2025-01-08 13:58:35,600 - INFO - Optimizer state saved in mistral-sft-lora-fsdp2/checkpoint-2/optimizer_0
|
57 |
-
2025-01-08 13:58:36,036 - INFO - Step 2/2 (100.0%), epoch: 2.0000, step_time: 72.78s, elapsed_time: 672.61s
|
58 |
-
2025-01-08 13:58:36,037 - INFO - Training completed in 672.61 seconds
|
|
|
1 |
+
2025-01-08 16:51:57,031 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmprelfhj11/test.c -o /tmp/tmprelfhj11/test.o
|
2 |
+
2025-01-08 16:51:57,055 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmprelfhj11/test.o -laio -o /tmp/tmprelfhj11/a.out
|
3 |
+
2025-01-08 16:51:57,150 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmpq0kg_ejb/test.c -o /tmp/tmpq0kg_ejb/test.o
|
4 |
+
2025-01-08 16:51:57,152 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmpcl_df4so/test.c -o /tmp/tmpcl_df4so/test.o
|
5 |
+
2025-01-08 16:51:57,176 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmpcl_df4so/test.o -laio -o /tmp/tmpcl_df4so/a.out
|
6 |
+
2025-01-08 16:51:57,177 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmpq0kg_ejb/test.o -laio -o /tmp/tmpq0kg_ejb/a.out
|
7 |
+
2025-01-08 16:51:57,241 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmpr4qfgumu/test.c -o /tmp/tmpr4qfgumu/test.o
|
8 |
+
2025-01-08 16:51:57,270 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmpr4qfgumu/test.o -laio -o /tmp/tmpr4qfgumu/a.out
|
9 |
+
2025-01-08 16:51:57,507 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmpovnjxczz/test.c -o /tmp/tmpovnjxczz/test.o
|
10 |
+
2025-01-08 16:51:57,534 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmpovnjxczz/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpovnjxczz/a.out
|
11 |
+
2025-01-08 16:51:57,596 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmpkg32zxk_/test.c -o /tmp/tmpkg32zxk_/test.o
|
12 |
+
2025-01-08 16:51:57,611 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmphv2qqt72/test.c -o /tmp/tmphv2qqt72/test.o
|
13 |
+
2025-01-08 16:51:57,615 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmpkg32zxk_/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpkg32zxk_/a.out
|
14 |
+
2025-01-08 16:51:57,629 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmphv2qqt72/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmphv2qqt72/a.out
|
15 |
+
2025-01-08 16:51:57,706 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -O2 -isystem /root/anaconda3/envs/faiss_1.8.0/include -fPIC -c /tmp/tmpfcn3zdii/test.c -o /tmp/tmpfcn3zdii/test.o
|
16 |
+
2025-01-08 16:51:57,728 - INFO - gcc -pthread -B /root/anaconda3/envs/faiss_1.8.0/compiler_compat /tmp/tmpfcn3zdii/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpfcn3zdii/a.out
|
17 |
+
2025-01-08 16:56:02,452 - INFO - Training started
|
18 |
+
2025-01-08 16:56:02,452 - INFO - Total steps: 2
|
19 |
+
2025-01-08 16:59:03,388 - INFO - Loss improved from inf to 1.97822
|
20 |
+
2025-01-08 16:59:03,388 - INFO - Loss improved from inf to 1.97822
|
21 |
+
2025-01-08 16:59:03,388 - INFO - Loss improved from inf to 1.97822
|
22 |
+
2025-01-08 16:59:03,389 - INFO - Step 1/2 (50.0%), epoch: 1.0000, step_time: 526.05s, elapsed_time: 526.05s
|
23 |
+
2025-01-08 16:59:03,391 - INFO - Evaluation Results:
|
24 |
+
eval_loss: 1.9782
|
25 |
+
eval_runtime: 25.4206
|
26 |
+
eval_samples_per_second: 0.3150
|
27 |
+
eval_steps_per_second: 0.0790
|
28 |
epoch: 1.0000
|
29 |
+
elapsed_time: 526.05s
|
30 |
+
step_time: 526.05s
|
31 |
+
2025-01-08 16:59:03,391 - INFO - Loss improved from inf to 1.97822
|
32 |
+
2025-01-08 17:01:50,261 - INFO - Saving model to mistral-sft-lora-fsdp2/checkpoint-1/pytorch_model_fsdp_0
|
33 |
+
2025-01-08 17:01:53,657 - INFO - Model saved to mistral-sft-lora-fsdp2/checkpoint-1/pytorch_model_fsdp_0
|
34 |
+
2025-01-08 17:01:59,821 - INFO - Saving Optimizer state to mistral-sft-lora-fsdp2/checkpoint-1/optimizer_0
|
35 |
+
2025-01-08 17:02:06,458 - INFO - Optimizer state saved in mistral-sft-lora-fsdp2/checkpoint-1/optimizer_0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
runs/Jan08_16-50-16_gpu-server/events.out.tfevents.1736355360.gpu-server.922092.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2d43fc185418e889196286a648d25b64e44b0bb14959b9af5987765245a4eb50
|
3 |
+
size 5873
|
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5560
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:51fc8d24c71b3f37b8b6ee98fea4d4c8df8bae3b541e3efa59279a56399f9f0c
|
3 |
size 5560
|