wcyat commited on
Commit
e1e62e8
·
verified ·
1 Parent(s): dbac2eb

Training in progress, step 1500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0dd0a0d817dec0f13a91bbdebdab941824497147e44c1798a98fad307dc57ce7
3
  size 410636248
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91c7a1763ef70813211a3e126e99ed02a4d68911db407a86f3c01451154abfd9
3
  size 410636248
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9848c8c56692efd3776d40a5bddcd4432060212437984f62108627cdf924bf06
3
  size 821393658
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e58bcb928dd8bb1203c8e8f7525d51a0322d66d5e46c62c33a51265957ea734
3
  size 821393658
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:99077ea0f39ab9a2a73c591b9a25382a425ad11a428ab4632ccc7cbfe7bf5983
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdcef4615c497648044a6d80895c3d46b1c5f7a9c132beb33350b027c8cf3c17
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5703ca913eab162c25a0bf110cd330db4a862d1f0434dd16326b1640bd4f079c
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66e6f95527be7d742c182b1b25b1632bf2465fc58ebbe4ae2f736399e0d31f82
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 0.2427118569612503,
3
  "best_model_checkpoint": "./results/checkpoint-340",
4
- "epoch": 2.865329512893983,
5
  "eval_steps": 20,
6
- "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -807,6 +807,406 @@
807
  "eval_samples_per_second": 36.847,
808
  "eval_steps_per_second": 9.328,
809
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
810
  }
811
  ],
812
  "logging_steps": 20,
@@ -826,7 +1226,7 @@
826
  "attributes": {}
827
  }
828
  },
829
- "total_flos": 902257656041100.0,
830
  "train_batch_size": 4,
831
  "trial_name": null,
832
  "trial_params": null
 
1
  {
2
  "best_metric": 0.2427118569612503,
3
  "best_model_checkpoint": "./results/checkpoint-340",
4
+ "epoch": 4.2979942693409745,
5
  "eval_steps": 20,
6
+ "global_step": 1500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
807
  "eval_samples_per_second": 36.847,
808
  "eval_steps_per_second": 9.328,
809
  "step": 1000
810
+ },
811
+ {
812
+ "epoch": 2.9226361031518624,
813
+ "grad_norm": 0.0065177069045603275,
814
+ "learning_rate": 8.30945558739255e-06,
815
+ "loss": 0.0006,
816
+ "step": 1020
817
+ },
818
+ {
819
+ "epoch": 2.9226361031518624,
820
+ "eval_accuracy": 0.9240506329113924,
821
+ "eval_loss": 0.38359534740448,
822
+ "eval_runtime": 4.251,
823
+ "eval_samples_per_second": 37.167,
824
+ "eval_steps_per_second": 9.409,
825
+ "step": 1020
826
+ },
827
+ {
828
+ "epoch": 2.9799426934097424,
829
+ "grad_norm": 0.008996536955237389,
830
+ "learning_rate": 8.080229226361033e-06,
831
+ "loss": 0.0062,
832
+ "step": 1040
833
+ },
834
+ {
835
+ "epoch": 2.9799426934097424,
836
+ "eval_accuracy": 0.9113924050632911,
837
+ "eval_loss": 0.40920865535736084,
838
+ "eval_runtime": 4.4231,
839
+ "eval_samples_per_second": 35.722,
840
+ "eval_steps_per_second": 9.043,
841
+ "step": 1040
842
+ },
843
+ {
844
+ "epoch": 3.037249283667622,
845
+ "grad_norm": 0.008578244596719742,
846
+ "learning_rate": 7.851002865329513e-06,
847
+ "loss": 0.0018,
848
+ "step": 1060
849
+ },
850
+ {
851
+ "epoch": 3.037249283667622,
852
+ "eval_accuracy": 0.9240506329113924,
853
+ "eval_loss": 0.4326882064342499,
854
+ "eval_runtime": 4.3644,
855
+ "eval_samples_per_second": 36.202,
856
+ "eval_steps_per_second": 9.165,
857
+ "step": 1060
858
+ },
859
+ {
860
+ "epoch": 3.0945558739255015,
861
+ "grad_norm": 0.029599307104945183,
862
+ "learning_rate": 7.6217765042979954e-06,
863
+ "loss": 0.0006,
864
+ "step": 1080
865
+ },
866
+ {
867
+ "epoch": 3.0945558739255015,
868
+ "eval_accuracy": 0.9177215189873418,
869
+ "eval_loss": 0.4501863420009613,
870
+ "eval_runtime": 4.3033,
871
+ "eval_samples_per_second": 36.716,
872
+ "eval_steps_per_second": 9.295,
873
+ "step": 1080
874
+ },
875
+ {
876
+ "epoch": 3.151862464183381,
877
+ "grad_norm": 0.014250312000513077,
878
+ "learning_rate": 7.392550143266476e-06,
879
+ "loss": 0.1874,
880
+ "step": 1100
881
+ },
882
+ {
883
+ "epoch": 3.151862464183381,
884
+ "eval_accuracy": 0.9177215189873418,
885
+ "eval_loss": 0.4321536421775818,
886
+ "eval_runtime": 4.2681,
887
+ "eval_samples_per_second": 37.019,
888
+ "eval_steps_per_second": 9.372,
889
+ "step": 1100
890
+ },
891
+ {
892
+ "epoch": 3.2091690544412605,
893
+ "grad_norm": 0.026432504877448082,
894
+ "learning_rate": 7.163323782234957e-06,
895
+ "loss": 0.0676,
896
+ "step": 1120
897
+ },
898
+ {
899
+ "epoch": 3.2091690544412605,
900
+ "eval_accuracy": 0.9113924050632911,
901
+ "eval_loss": 0.4126332998275757,
902
+ "eval_runtime": 4.2309,
903
+ "eval_samples_per_second": 37.344,
904
+ "eval_steps_per_second": 9.454,
905
+ "step": 1120
906
+ },
907
+ {
908
+ "epoch": 3.2664756446991405,
909
+ "grad_norm": 0.01032521203160286,
910
+ "learning_rate": 6.934097421203439e-06,
911
+ "loss": 0.0199,
912
+ "step": 1140
913
+ },
914
+ {
915
+ "epoch": 3.2664756446991405,
916
+ "eval_accuracy": 0.9050632911392406,
917
+ "eval_loss": 0.41126754879951477,
918
+ "eval_runtime": 4.2627,
919
+ "eval_samples_per_second": 37.066,
920
+ "eval_steps_per_second": 9.384,
921
+ "step": 1140
922
+ },
923
+ {
924
+ "epoch": 3.32378223495702,
925
+ "grad_norm": 0.016674930229783058,
926
+ "learning_rate": 6.70487106017192e-06,
927
+ "loss": 0.0674,
928
+ "step": 1160
929
+ },
930
+ {
931
+ "epoch": 3.32378223495702,
932
+ "eval_accuracy": 0.9177215189873418,
933
+ "eval_loss": 0.4134314954280853,
934
+ "eval_runtime": 4.2734,
935
+ "eval_samples_per_second": 36.973,
936
+ "eval_steps_per_second": 9.36,
937
+ "step": 1160
938
+ },
939
+ {
940
+ "epoch": 3.3810888252148996,
941
+ "grad_norm": 0.0032745320349931717,
942
+ "learning_rate": 6.475644699140402e-06,
943
+ "loss": 0.0004,
944
+ "step": 1180
945
+ },
946
+ {
947
+ "epoch": 3.3810888252148996,
948
+ "eval_accuracy": 0.9177215189873418,
949
+ "eval_loss": 0.4212283790111542,
950
+ "eval_runtime": 4.2821,
951
+ "eval_samples_per_second": 36.898,
952
+ "eval_steps_per_second": 9.341,
953
+ "step": 1180
954
+ },
955
+ {
956
+ "epoch": 3.4383954154727796,
957
+ "grad_norm": 0.012243836186826229,
958
+ "learning_rate": 6.246418338108883e-06,
959
+ "loss": 0.0004,
960
+ "step": 1200
961
+ },
962
+ {
963
+ "epoch": 3.4383954154727796,
964
+ "eval_accuracy": 0.9177215189873418,
965
+ "eval_loss": 0.42768773436546326,
966
+ "eval_runtime": 4.2965,
967
+ "eval_samples_per_second": 36.774,
968
+ "eval_steps_per_second": 9.31,
969
+ "step": 1200
970
+ },
971
+ {
972
+ "epoch": 3.495702005730659,
973
+ "grad_norm": 0.09642524272203445,
974
+ "learning_rate": 6.017191977077364e-06,
975
+ "loss": 0.1097,
976
+ "step": 1220
977
+ },
978
+ {
979
+ "epoch": 3.495702005730659,
980
+ "eval_accuracy": 0.9177215189873418,
981
+ "eval_loss": 0.4246382415294647,
982
+ "eval_runtime": 4.2676,
983
+ "eval_samples_per_second": 37.023,
984
+ "eval_steps_per_second": 9.373,
985
+ "step": 1220
986
+ },
987
+ {
988
+ "epoch": 3.5530085959885387,
989
+ "grad_norm": 0.004081379622220993,
990
+ "learning_rate": 5.787965616045845e-06,
991
+ "loss": 0.0004,
992
+ "step": 1240
993
+ },
994
+ {
995
+ "epoch": 3.5530085959885387,
996
+ "eval_accuracy": 0.9177215189873418,
997
+ "eval_loss": 0.42067304253578186,
998
+ "eval_runtime": 4.2338,
999
+ "eval_samples_per_second": 37.319,
1000
+ "eval_steps_per_second": 9.448,
1001
+ "step": 1240
1002
+ },
1003
+ {
1004
+ "epoch": 3.6103151862464182,
1005
+ "grad_norm": 0.013711544685065746,
1006
+ "learning_rate": 5.558739255014327e-06,
1007
+ "loss": 0.0152,
1008
+ "step": 1260
1009
+ },
1010
+ {
1011
+ "epoch": 3.6103151862464182,
1012
+ "eval_accuracy": 0.9177215189873418,
1013
+ "eval_loss": 0.4250052869319916,
1014
+ "eval_runtime": 4.2297,
1015
+ "eval_samples_per_second": 37.355,
1016
+ "eval_steps_per_second": 9.457,
1017
+ "step": 1260
1018
+ },
1019
+ {
1020
+ "epoch": 3.6676217765042978,
1021
+ "grad_norm": 148.3441619873047,
1022
+ "learning_rate": 5.3295128939828086e-06,
1023
+ "loss": 0.0146,
1024
+ "step": 1280
1025
+ },
1026
+ {
1027
+ "epoch": 3.6676217765042978,
1028
+ "eval_accuracy": 0.9240506329113924,
1029
+ "eval_loss": 0.412005752325058,
1030
+ "eval_runtime": 4.2278,
1031
+ "eval_samples_per_second": 37.372,
1032
+ "eval_steps_per_second": 9.461,
1033
+ "step": 1280
1034
+ },
1035
+ {
1036
+ "epoch": 3.7249283667621778,
1037
+ "grad_norm": 0.0035390935372561216,
1038
+ "learning_rate": 5.10028653295129e-06,
1039
+ "loss": 0.0377,
1040
+ "step": 1300
1041
+ },
1042
+ {
1043
+ "epoch": 3.7249283667621778,
1044
+ "eval_accuracy": 0.930379746835443,
1045
+ "eval_loss": 0.40523138642311096,
1046
+ "eval_runtime": 4.2347,
1047
+ "eval_samples_per_second": 37.311,
1048
+ "eval_steps_per_second": 9.446,
1049
+ "step": 1300
1050
+ },
1051
+ {
1052
+ "epoch": 3.7822349570200573,
1053
+ "grad_norm": 9.169730186462402,
1054
+ "learning_rate": 4.871060171919771e-06,
1055
+ "loss": 0.1061,
1056
+ "step": 1320
1057
+ },
1058
+ {
1059
+ "epoch": 3.7822349570200573,
1060
+ "eval_accuracy": 0.9177215189873418,
1061
+ "eval_loss": 0.40109243988990784,
1062
+ "eval_runtime": 4.2624,
1063
+ "eval_samples_per_second": 37.069,
1064
+ "eval_steps_per_second": 9.384,
1065
+ "step": 1320
1066
+ },
1067
+ {
1068
+ "epoch": 3.839541547277937,
1069
+ "grad_norm": 0.004674045369029045,
1070
+ "learning_rate": 4.641833810888253e-06,
1071
+ "loss": 0.1026,
1072
+ "step": 1340
1073
+ },
1074
+ {
1075
+ "epoch": 3.839541547277937,
1076
+ "eval_accuracy": 0.9177215189873418,
1077
+ "eval_loss": 0.43842944502830505,
1078
+ "eval_runtime": 4.2684,
1079
+ "eval_samples_per_second": 37.016,
1080
+ "eval_steps_per_second": 9.371,
1081
+ "step": 1340
1082
+ },
1083
+ {
1084
+ "epoch": 3.896848137535817,
1085
+ "grad_norm": 0.014885048381984234,
1086
+ "learning_rate": 4.412607449856734e-06,
1087
+ "loss": 0.1264,
1088
+ "step": 1360
1089
+ },
1090
+ {
1091
+ "epoch": 3.896848137535817,
1092
+ "eval_accuracy": 0.9177215189873418,
1093
+ "eval_loss": 0.4101775884628296,
1094
+ "eval_runtime": 4.2709,
1095
+ "eval_samples_per_second": 36.995,
1096
+ "eval_steps_per_second": 9.366,
1097
+ "step": 1360
1098
+ },
1099
+ {
1100
+ "epoch": 3.9541547277936964,
1101
+ "grad_norm": 0.00861190166324377,
1102
+ "learning_rate": 4.1833810888252155e-06,
1103
+ "loss": 0.0079,
1104
+ "step": 1380
1105
+ },
1106
+ {
1107
+ "epoch": 3.9541547277936964,
1108
+ "eval_accuracy": 0.9240506329113924,
1109
+ "eval_loss": 0.40192869305610657,
1110
+ "eval_runtime": 4.2834,
1111
+ "eval_samples_per_second": 36.887,
1112
+ "eval_steps_per_second": 9.338,
1113
+ "step": 1380
1114
+ },
1115
+ {
1116
+ "epoch": 4.011461318051576,
1117
+ "grad_norm": 0.0044676773250103,
1118
+ "learning_rate": 3.954154727793696e-06,
1119
+ "loss": 0.0249,
1120
+ "step": 1400
1121
+ },
1122
+ {
1123
+ "epoch": 4.011461318051576,
1124
+ "eval_accuracy": 0.9177215189873418,
1125
+ "eval_loss": 0.3997720777988434,
1126
+ "eval_runtime": 4.2897,
1127
+ "eval_samples_per_second": 36.833,
1128
+ "eval_steps_per_second": 9.325,
1129
+ "step": 1400
1130
+ },
1131
+ {
1132
+ "epoch": 4.0687679083094554,
1133
+ "grad_norm": 0.1052209734916687,
1134
+ "learning_rate": 3.724928366762178e-06,
1135
+ "loss": 0.0115,
1136
+ "step": 1420
1137
+ },
1138
+ {
1139
+ "epoch": 4.0687679083094554,
1140
+ "eval_accuracy": 0.9240506329113924,
1141
+ "eval_loss": 0.39488697052001953,
1142
+ "eval_runtime": 4.2996,
1143
+ "eval_samples_per_second": 36.748,
1144
+ "eval_steps_per_second": 9.303,
1145
+ "step": 1420
1146
+ },
1147
+ {
1148
+ "epoch": 4.126074498567335,
1149
+ "grad_norm": 0.012624930590391159,
1150
+ "learning_rate": 3.4957020057306597e-06,
1151
+ "loss": 0.0004,
1152
+ "step": 1440
1153
+ },
1154
+ {
1155
+ "epoch": 4.126074498567335,
1156
+ "eval_accuracy": 0.9240506329113924,
1157
+ "eval_loss": 0.39705362915992737,
1158
+ "eval_runtime": 4.2873,
1159
+ "eval_samples_per_second": 36.853,
1160
+ "eval_steps_per_second": 9.33,
1161
+ "step": 1440
1162
+ },
1163
+ {
1164
+ "epoch": 4.1833810888252145,
1165
+ "grad_norm": 0.008006641641259193,
1166
+ "learning_rate": 3.2664756446991407e-06,
1167
+ "loss": 0.0847,
1168
+ "step": 1460
1169
+ },
1170
+ {
1171
+ "epoch": 4.1833810888252145,
1172
+ "eval_accuracy": 0.930379746835443,
1173
+ "eval_loss": 0.3859291076660156,
1174
+ "eval_runtime": 4.3068,
1175
+ "eval_samples_per_second": 36.686,
1176
+ "eval_steps_per_second": 9.288,
1177
+ "step": 1460
1178
+ },
1179
+ {
1180
+ "epoch": 4.240687679083095,
1181
+ "grad_norm": 0.022626299411058426,
1182
+ "learning_rate": 3.037249283667622e-06,
1183
+ "loss": 0.0004,
1184
+ "step": 1480
1185
+ },
1186
+ {
1187
+ "epoch": 4.240687679083095,
1188
+ "eval_accuracy": 0.930379746835443,
1189
+ "eval_loss": 0.38549065589904785,
1190
+ "eval_runtime": 4.2744,
1191
+ "eval_samples_per_second": 36.964,
1192
+ "eval_steps_per_second": 9.358,
1193
+ "step": 1480
1194
+ },
1195
+ {
1196
+ "epoch": 4.2979942693409745,
1197
+ "grad_norm": 0.00761532224714756,
1198
+ "learning_rate": 2.8080229226361035e-06,
1199
+ "loss": 0.002,
1200
+ "step": 1500
1201
+ },
1202
+ {
1203
+ "epoch": 4.2979942693409745,
1204
+ "eval_accuracy": 0.9367088607594937,
1205
+ "eval_loss": 0.3879244923591614,
1206
+ "eval_runtime": 4.2938,
1207
+ "eval_samples_per_second": 36.797,
1208
+ "eval_steps_per_second": 9.316,
1209
+ "step": 1500
1210
  }
1211
  ],
1212
  "logging_steps": 20,
 
1226
  "attributes": {}
1227
  }
1228
  },
1229
+ "total_flos": 1358383281613980.0,
1230
  "train_batch_size": 4,
1231
  "trial_name": null,
1232
  "trial_params": null