jasong03 commited on
Commit
4a6de25
·
verified ·
1 Parent(s): 0f0165f

Training in progress, step 150, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec96824e1ff102827892eee5f5cdc8561fa38ff68e933e5d53a064fbc07dff40
3
  size 59021064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e140581cf97c2ff49feded27279cbd08eace0495f5afd12562fa8c1906693ba4
3
  size 59021064
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eff7db8b15de2eac66a5884727b78b8976cd97a92097182396b6546808795750
3
  size 30290452
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba59ea4e8eba7f0640208573e688cfef98d93f03d724499b344550aa047c5607
3
  size 30290452
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:347b832840722999395c0ab8798861062651672803889236fbfc6f0c86443262
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8abf2c3a2d675d09f86f31a4bbaac529a18147ee9c2568ece2b0294dff92a441
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a901b2f24739241604b39afb865dbd28e3e91e20879381578494b9e0ca03a34
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe86356e82949734df1ffbf4ca1d6d4aad55d9e61b90271979f2d574e6a34d95
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.1754626456477038,
5
  "eval_steps": 500,
6
- "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -707,6 +707,356 @@
707
  "learning_rate": 1.769911504424779e-05,
708
  "loss": 2.72,
709
  "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
710
  }
711
  ],
712
  "logging_steps": 1,
@@ -726,7 +1076,7 @@
726
  "attributes": {}
727
  }
728
  },
729
- "total_flos": 2.1759429300977664e+17,
730
  "train_batch_size": 1,
731
  "trial_name": null,
732
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.2631939684715556,
5
  "eval_steps": 500,
6
+ "global_step": 150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
707
  "learning_rate": 1.769911504424779e-05,
708
  "loss": 2.72,
709
  "step": 100
710
+ },
711
+ {
712
+ "epoch": 2.197395476353667,
713
+ "grad_norm": 0.27964890003204346,
714
+ "learning_rate": 1.7876106194690265e-05,
715
+ "loss": 2.703,
716
+ "step": 101
717
+ },
718
+ {
719
+ "epoch": 2.2193283070596297,
720
+ "grad_norm": 0.23122280836105347,
721
+ "learning_rate": 1.8053097345132743e-05,
722
+ "loss": 2.6755,
723
+ "step": 102
724
+ },
725
+ {
726
+ "epoch": 2.241261137765593,
727
+ "grad_norm": 0.13872268795967102,
728
+ "learning_rate": 1.823008849557522e-05,
729
+ "loss": 2.5387,
730
+ "step": 103
731
+ },
732
+ {
733
+ "epoch": 2.2631939684715556,
734
+ "grad_norm": 0.15757662057876587,
735
+ "learning_rate": 1.8407079646017702e-05,
736
+ "loss": 2.7089,
737
+ "step": 104
738
+ },
739
+ {
740
+ "epoch": 2.285126799177519,
741
+ "grad_norm": 0.14636781811714172,
742
+ "learning_rate": 1.858407079646018e-05,
743
+ "loss": 2.7059,
744
+ "step": 105
745
+ },
746
+ {
747
+ "epoch": 2.307059629883482,
748
+ "grad_norm": 0.12666302919387817,
749
+ "learning_rate": 1.8761061946902657e-05,
750
+ "loss": 2.677,
751
+ "step": 106
752
+ },
753
+ {
754
+ "epoch": 2.3289924605894448,
755
+ "grad_norm": 0.09550761431455612,
756
+ "learning_rate": 1.8938053097345135e-05,
757
+ "loss": 2.5572,
758
+ "step": 107
759
+ },
760
+ {
761
+ "epoch": 2.350925291295408,
762
+ "grad_norm": 0.22307220101356506,
763
+ "learning_rate": 1.9115044247787613e-05,
764
+ "loss": 2.5424,
765
+ "step": 108
766
+ },
767
+ {
768
+ "epoch": 2.3728581220013707,
769
+ "grad_norm": 0.1976032704114914,
770
+ "learning_rate": 1.929203539823009e-05,
771
+ "loss": 2.608,
772
+ "step": 109
773
+ },
774
+ {
775
+ "epoch": 2.394790952707334,
776
+ "grad_norm": 0.1282164454460144,
777
+ "learning_rate": 1.946902654867257e-05,
778
+ "loss": 2.6157,
779
+ "step": 110
780
+ },
781
+ {
782
+ "epoch": 2.4167237834132966,
783
+ "grad_norm": 0.12939345836639404,
784
+ "learning_rate": 1.9646017699115046e-05,
785
+ "loss": 2.631,
786
+ "step": 111
787
+ },
788
+ {
789
+ "epoch": 2.43865661411926,
790
+ "grad_norm": 0.1813574880361557,
791
+ "learning_rate": 1.9823008849557524e-05,
792
+ "loss": 2.5653,
793
+ "step": 112
794
+ },
795
+ {
796
+ "epoch": 2.4605894448252226,
797
+ "grad_norm": 0.21558630466461182,
798
+ "learning_rate": 2e-05,
799
+ "loss": 2.6836,
800
+ "step": 113
801
+ },
802
+ {
803
+ "epoch": 2.4825222755311858,
804
+ "grad_norm": 0.1171233206987381,
805
+ "learning_rate": 1.9999989194107888e-05,
806
+ "loss": 2.6329,
807
+ "step": 114
808
+ },
809
+ {
810
+ "epoch": 2.504455106237149,
811
+ "grad_norm": 0.09073666483163834,
812
+ "learning_rate": 1.9999956776454904e-05,
813
+ "loss": 2.6365,
814
+ "step": 115
815
+ },
816
+ {
817
+ "epoch": 2.5263879369431117,
818
+ "grad_norm": 0.15764959156513214,
819
+ "learning_rate": 1.999990274711111e-05,
820
+ "loss": 2.5957,
821
+ "step": 116
822
+ },
823
+ {
824
+ "epoch": 2.5483207676490744,
825
+ "grad_norm": 0.1431242674589157,
826
+ "learning_rate": 1.9999827106193264e-05,
827
+ "loss": 2.6047,
828
+ "step": 117
829
+ },
830
+ {
831
+ "epoch": 2.5702535983550376,
832
+ "grad_norm": 0.14802202582359314,
833
+ "learning_rate": 1.9999729853864854e-05,
834
+ "loss": 2.7031,
835
+ "step": 118
836
+ },
837
+ {
838
+ "epoch": 2.592186429061001,
839
+ "grad_norm": 0.1163327619433403,
840
+ "learning_rate": 1.999961099033605e-05,
841
+ "loss": 2.5701,
842
+ "step": 119
843
+ },
844
+ {
845
+ "epoch": 2.6141192597669636,
846
+ "grad_norm": 0.12083975225687027,
847
+ "learning_rate": 1.9999470515863738e-05,
848
+ "loss": 2.5481,
849
+ "step": 120
850
+ },
851
+ {
852
+ "epoch": 2.6360520904729268,
853
+ "grad_norm": 0.11876373738050461,
854
+ "learning_rate": 1.9999308430751513e-05,
855
+ "loss": 2.6219,
856
+ "step": 121
857
+ },
858
+ {
859
+ "epoch": 2.6579849211788895,
860
+ "grad_norm": 0.12213423103094101,
861
+ "learning_rate": 1.9999124735349666e-05,
862
+ "loss": 2.5822,
863
+ "step": 122
864
+ },
865
+ {
866
+ "epoch": 2.6799177518848527,
867
+ "grad_norm": 0.1777854710817337,
868
+ "learning_rate": 1.9998919430055193e-05,
869
+ "loss": 2.6359,
870
+ "step": 123
871
+ },
872
+ {
873
+ "epoch": 2.701850582590816,
874
+ "grad_norm": 0.10676445066928864,
875
+ "learning_rate": 1.9998692515311806e-05,
876
+ "loss": 2.6015,
877
+ "step": 124
878
+ },
879
+ {
880
+ "epoch": 2.7237834132967786,
881
+ "grad_norm": 0.08957359939813614,
882
+ "learning_rate": 1.9998443991609897e-05,
883
+ "loss": 2.6041,
884
+ "step": 125
885
+ },
886
+ {
887
+ "epoch": 2.7457162440027414,
888
+ "grad_norm": 0.14414869248867035,
889
+ "learning_rate": 1.9998173859486575e-05,
890
+ "loss": 2.6832,
891
+ "step": 126
892
+ },
893
+ {
894
+ "epoch": 2.7676490747087046,
895
+ "grad_norm": 0.14377790689468384,
896
+ "learning_rate": 1.9997882119525644e-05,
897
+ "loss": 2.502,
898
+ "step": 127
899
+ },
900
+ {
901
+ "epoch": 2.7895819054146678,
902
+ "grad_norm": 0.14488162100315094,
903
+ "learning_rate": 1.9997568772357603e-05,
904
+ "loss": 2.6653,
905
+ "step": 128
906
+ },
907
+ {
908
+ "epoch": 2.8115147361206305,
909
+ "grad_norm": 0.10184569656848907,
910
+ "learning_rate": 1.999723381865965e-05,
911
+ "loss": 2.6141,
912
+ "step": 129
913
+ },
914
+ {
915
+ "epoch": 2.8334475668265937,
916
+ "grad_norm": 0.08704333007335663,
917
+ "learning_rate": 1.999687725915569e-05,
918
+ "loss": 2.7103,
919
+ "step": 130
920
+ },
921
+ {
922
+ "epoch": 2.8553803975325565,
923
+ "grad_norm": 0.09079142659902573,
924
+ "learning_rate": 1.99964990946163e-05,
925
+ "loss": 2.4572,
926
+ "step": 131
927
+ },
928
+ {
929
+ "epoch": 2.8773132282385196,
930
+ "grad_norm": 0.11046919226646423,
931
+ "learning_rate": 1.9996099325858766e-05,
932
+ "loss": 2.6408,
933
+ "step": 132
934
+ },
935
+ {
936
+ "epoch": 2.8992460589444824,
937
+ "grad_norm": 0.11394225060939789,
938
+ "learning_rate": 1.999567795374706e-05,
939
+ "loss": 2.4854,
940
+ "step": 133
941
+ },
942
+ {
943
+ "epoch": 2.9211788896504456,
944
+ "grad_norm": 0.1528523713350296,
945
+ "learning_rate": 1.9995234979191843e-05,
946
+ "loss": 2.5499,
947
+ "step": 134
948
+ },
949
+ {
950
+ "epoch": 2.9431117203564083,
951
+ "grad_norm": 0.07764479517936707,
952
+ "learning_rate": 1.999477040315046e-05,
953
+ "loss": 2.6315,
954
+ "step": 135
955
+ },
956
+ {
957
+ "epoch": 2.9650445510623715,
958
+ "grad_norm": 0.09709993004798889,
959
+ "learning_rate": 1.9994284226626944e-05,
960
+ "loss": 2.6537,
961
+ "step": 136
962
+ },
963
+ {
964
+ "epoch": 2.9869773817683347,
965
+ "grad_norm": 0.07892050594091415,
966
+ "learning_rate": 1.9993776450672007e-05,
967
+ "loss": 2.5603,
968
+ "step": 137
969
+ },
970
+ {
971
+ "epoch": 3.0,
972
+ "grad_norm": 0.09977299720048904,
973
+ "learning_rate": 1.999324707638304e-05,
974
+ "loss": 2.6216,
975
+ "step": 138
976
+ },
977
+ {
978
+ "epoch": 3.021932830705963,
979
+ "grad_norm": 0.10072707384824753,
980
+ "learning_rate": 1.999269610490413e-05,
981
+ "loss": 2.7363,
982
+ "step": 139
983
+ },
984
+ {
985
+ "epoch": 3.043865661411926,
986
+ "grad_norm": 0.11965449154376984,
987
+ "learning_rate": 1.999212353742601e-05,
988
+ "loss": 2.5138,
989
+ "step": 140
990
+ },
991
+ {
992
+ "epoch": 3.065798492117889,
993
+ "grad_norm": 0.09121505171060562,
994
+ "learning_rate": 1.9991529375186104e-05,
995
+ "loss": 2.6217,
996
+ "step": 141
997
+ },
998
+ {
999
+ "epoch": 3.087731322823852,
1000
+ "grad_norm": 0.143572136759758,
1001
+ "learning_rate": 1.9990913619468507e-05,
1002
+ "loss": 2.6007,
1003
+ "step": 142
1004
+ },
1005
+ {
1006
+ "epoch": 3.109664153529815,
1007
+ "grad_norm": 0.06798284500837326,
1008
+ "learning_rate": 1.9990276271603972e-05,
1009
+ "loss": 2.5446,
1010
+ "step": 143
1011
+ },
1012
+ {
1013
+ "epoch": 3.131596984235778,
1014
+ "grad_norm": 0.07691462337970734,
1015
+ "learning_rate": 1.9989617332969924e-05,
1016
+ "loss": 2.5537,
1017
+ "step": 144
1018
+ },
1019
+ {
1020
+ "epoch": 3.153529814941741,
1021
+ "grad_norm": 0.05225904658436775,
1022
+ "learning_rate": 1.9988936804990446e-05,
1023
+ "loss": 2.5238,
1024
+ "step": 145
1025
+ },
1026
+ {
1027
+ "epoch": 3.1754626456477038,
1028
+ "grad_norm": 0.10837385058403015,
1029
+ "learning_rate": 1.9988234689136284e-05,
1030
+ "loss": 2.6316,
1031
+ "step": 146
1032
+ },
1033
+ {
1034
+ "epoch": 3.197395476353667,
1035
+ "grad_norm": 0.08016310632228851,
1036
+ "learning_rate": 1.9987510986924828e-05,
1037
+ "loss": 2.5408,
1038
+ "step": 147
1039
+ },
1040
+ {
1041
+ "epoch": 3.2193283070596297,
1042
+ "grad_norm": 0.1278998851776123,
1043
+ "learning_rate": 1.9986765699920134e-05,
1044
+ "loss": 2.6916,
1045
+ "step": 148
1046
+ },
1047
+ {
1048
+ "epoch": 3.241261137765593,
1049
+ "grad_norm": 0.115745909512043,
1050
+ "learning_rate": 1.9985998829732898e-05,
1051
+ "loss": 2.5964,
1052
+ "step": 149
1053
+ },
1054
+ {
1055
+ "epoch": 3.2631939684715556,
1056
+ "grad_norm": 0.05191206559538841,
1057
+ "learning_rate": 1.9985210378020464e-05,
1058
+ "loss": 2.5719,
1059
+ "step": 150
1060
  }
1061
  ],
1062
  "logging_steps": 1,
 
1076
  "attributes": {}
1077
  }
1078
  },
1079
+ "total_flos": 3.2639143951466496e+17,
1080
  "train_batch_size": 1,
1081
  "trial_name": null,
1082
  "trial_params": null