shivanandmn committed · verified
Commit 7079385 · 1 Parent(s): 8a41221

Model save

README.md CHANGED
@@ -17,10 +17,10 @@ should probably proofread and complete it, then remove this comment. -->
 
 This model is a fine-tuned version of [](https://huggingface.co/) on an unknown dataset.
 It achieves the following results on the evaluation set:
-- Loss: 3.1790
 - Accuracy: 0.4217
-- Perplexity: 24.0231
 - Bleu: 0.1309
+- Loss: 3.1790
+- Perplexity: 24.0231
 
 ## Model description
 
@@ -69,10 +69,10 @@ The following hyperparameters were used during training:
 | 3.1212 | 4.2088 | 7500 | 0.4161 | 0.1325 | 3.2262 | 25.1831 |
 | 3.0816 | 4.4893 | 8000 | 0.4176 | 0.1307 | 3.2128 | 24.8480 |
 | 3.0917 | 4.7699 | 8500 | 0.4196 | 0.1339 | 3.1985 | 24.4954 |
-| 3.0562 | 5.0505 | 9000 | 3.2049 | 0.4185 | 24.6521 | 0.1326 |
-| 3.0683 | 5.3311 | 9500 | 3.1970 | 0.4195 | 24.4597 | 0.1307 |
-| 3.0502 | 5.6117 | 10000 | 3.1857 | 0.4209 | 24.1847 | 0.1331 |
-| 3.0469 | 5.8923 | 10500 | 3.1790 | 0.4217 | 24.0231 | 0.1309 |
+| 3.0562 | 5.0505 | 9000 | 0.4185 | 0.1326 | 3.2049 | 24.6521 |
+| 3.0683 | 5.3311 | 9500 | 0.4195 | 0.1307 | 3.1970 | 24.4597 |
+| 3.0502 | 5.6117 | 10000 | 0.4209 | 0.1331 | 3.1857 | 24.1847 |
+| 3.0469 | 5.8923 | 10500 | 0.4217 | 0.1309 | 3.1790 | 24.0231 |
 
 
 ### Framework versions
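
The Loss and Perplexity reported above are consistent with the usual relation perplexity = exp(cross-entropy loss) for a causal language model. A minimal sanity check in plain Python (not part of this repository):

```python
import math

# Headline eval metrics from the updated model card above.
eval_loss = 3.1790
reported_perplexity = 24.0231

# exp(3.1790) ≈ 24.02, matching the reported perplexity up to rounding.
print(math.exp(eval_loss), reported_perplexity)
```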
all_results.json CHANGED
@@ -1,17 +1,17 @@
 {
-    "epoch": 5.0,
-    "eval_accuracy": 0.41997167684877956,
-    "eval_bleu": 0.13436705184218095,
-    "eval_loss": 3.19417405128479,
-    "eval_perplexity": 24.39002046460787,
-    "eval_runtime": 12.522,
+    "epoch": 6.0,
+    "eval_accuracy": 0.4217896359198556,
+    "eval_bleu": 0.1323262570770995,
+    "eval_loss": 3.1777124404907227,
+    "eval_perplexity": 23.991808041831927,
+    "eval_runtime": 12.2078,
     "eval_samples": 1141,
-    "eval_samples_per_second": 91.119,
-    "eval_steps_per_second": 1.437,
-    "perplexity": 24.39002046460787,
-    "total_flos": 1.0587061010143642e+18,
-    "train_loss": 0.0,
-    "train_runtime": 0.6821,
-    "train_samples_per_second": 835582.505,
-    "train_steps_per_second": 13062.161
+    "eval_samples_per_second": 93.465,
+    "eval_steps_per_second": 1.474,
+    "perplexity": 23.991808041831927,
+    "total_flos": 1.270447321217237e+18,
+    "train_loss": 0.5092627385828409,
+    "train_runtime": 3159.8581,
+    "train_samples_per_second": 216.454,
+    "train_steps_per_second": 3.384
 }
eval_results.json CHANGED
@@ -1,12 +1,12 @@
 {
-    "epoch": 5.0,
-    "eval_accuracy": 0.41997167684877956,
-    "eval_bleu": 0.13436705184218095,
-    "eval_loss": 3.19417405128479,
-    "eval_perplexity": 24.39002046460787,
-    "eval_runtime": 12.522,
+    "epoch": 6.0,
+    "eval_accuracy": 0.4217896359198556,
+    "eval_bleu": 0.1323262570770995,
+    "eval_loss": 3.1777124404907227,
+    "eval_perplexity": 23.991808041831927,
+    "eval_runtime": 12.2078,
     "eval_samples": 1141,
-    "eval_samples_per_second": 91.119,
-    "eval_steps_per_second": 1.437,
-    "perplexity": 24.39002046460787
+    "eval_samples_per_second": 93.465,
+    "eval_steps_per_second": 1.474,
+    "perplexity": 23.991808041831927
 }
train_results.json CHANGED
@@ -1,8 +1,8 @@
 {
-    "epoch": 5.0,
-    "total_flos": 1.0587061010143642e+18,
-    "train_loss": 0.0,
-    "train_runtime": 0.6821,
-    "train_samples_per_second": 835582.505,
-    "train_steps_per_second": 13062.161
+    "epoch": 6.0,
+    "total_flos": 1.270447321217237e+18,
+    "train_loss": 0.5092627385828409,
+    "train_runtime": 3159.8581,
+    "train_samples_per_second": 216.454,
+    "train_steps_per_second": 3.384
 }
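
The three result files above are small flat JSON dictionaries, so the updated metrics can be cross-checked directly. A short sketch, assuming the files are read from a local clone of this repository:

```python
import json

# Load the merged and train-only result files committed above
# (paths assume the repository root as working directory).
with open("all_results.json") as f:
    all_results = json.load(f)
with open("train_results.json") as f:
    train_results = json.load(f)

# Throughput consistency: steps/s * runtime should land near the
# trainer's max_steps (10692 per trainer_state.json below).
print(train_results["train_steps_per_second"] * train_results["train_runtime"])  # ~10693

# The merged file carries both eval_* and train_* keys for epoch 6.0.
print(all_results["epoch"], all_results["eval_loss"], all_results["train_loss"])
```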
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
-  "best_metric": 3.1984846591949463,
-  "best_model_checkpoint": "./output/models/rotating-head-gp-gpt2-medium-wikitext/checkpoint-8500",
-  "epoch": 5.0,
+  "best_metric": 3.1790146827697754,
+  "best_model_checkpoint": "./output/models/rotating-head-gp-gpt2-medium-wikitext/checkpoint-10500",
+  "epoch": 6.0,
   "eval_steps": 500,
-  "global_step": 8910,
+  "global_step": 10692,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -819,19 +819,182 @@
       "step": 8900
     },
     {
-      "epoch": 5.0,
-      "step": 8910,
-      "total_flos": 1.0587061010143642e+18,
-      "train_loss": 0.0,
-      "train_runtime": 0.6821,
-      "train_samples_per_second": 835582.505,
-      "train_steps_per_second": 13062.161
+      "epoch": 5.05050505050505,
+      "grad_norm": 1.2899984121322632,
+      "learning_rate": 1.7584701725213056e-05,
+      "loss": 3.0562,
+      "step": 9000
+    },
+    {
+      "epoch": 5.05050505050505,
+      "eval_accuracy": 0.4185418117735553,
+      "eval_bleu": 0.13262877395418665,
+      "eval_loss": 3.2048630714416504,
+      "eval_perplexity": 24.652124205511342,
+      "eval_runtime": 12.0913,
+      "eval_samples_per_second": 94.365,
+      "eval_steps_per_second": 1.489,
+      "step": 9000
+    },
+    {
+      "epoch": 5.10662177328844,
+      "grad_norm": 1.1415753364562988,
+      "learning_rate": 1.654541675327375e-05,
+      "loss": 3.0602,
+      "step": 9100
+    },
+    {
+      "epoch": 5.162738496071829,
+      "grad_norm": 1.3813053369522095,
+      "learning_rate": 1.5506131781334445e-05,
+      "loss": 3.0627,
+      "step": 9200
+    },
+    {
+      "epoch": 5.218855218855219,
+      "grad_norm": 1.9901399612426758,
+      "learning_rate": 1.4466846809395137e-05,
+      "loss": 3.0647,
+      "step": 9300
+    },
+    {
+      "epoch": 5.274971941638609,
+      "grad_norm": 1.109480857849121,
+      "learning_rate": 1.3427561837455832e-05,
+      "loss": 3.0496,
+      "step": 9400
+    },
+    {
+      "epoch": 5.331088664421998,
+      "grad_norm": 1.1226487159729004,
+      "learning_rate": 1.2388276865516525e-05,
+      "loss": 3.0683,
+      "step": 9500
+    },
+    {
+      "epoch": 5.331088664421998,
+      "eval_accuracy": 0.4195261826372058,
+      "eval_bleu": 0.1307104242557084,
+      "eval_loss": 3.197028875350952,
+      "eval_perplexity": 24.459749166234648,
+      "eval_runtime": 12.1714,
+      "eval_samples_per_second": 93.745,
+      "eval_steps_per_second": 1.479,
+      "step": 9500
+    },
+    {
+      "epoch": 5.3872053872053876,
+      "grad_norm": 1.8179054260253906,
+      "learning_rate": 1.134899189357722e-05,
+      "loss": 3.0641,
+      "step": 9600
+    },
+    {
+      "epoch": 5.4433221099887765,
+      "grad_norm": 1.5777114629745483,
+      "learning_rate": 1.0309706921637914e-05,
+      "loss": 3.0545,
+      "step": 9700
+    },
+    {
+      "epoch": 5.499438832772166,
+      "grad_norm": 2.161355972290039,
+      "learning_rate": 9.270421949698607e-06,
+      "loss": 3.057,
+      "step": 9800
+    },
+    {
+      "epoch": 5.555555555555555,
+      "grad_norm": 2.167677640914917,
+      "learning_rate": 8.231136977759301e-06,
+      "loss": 3.0532,
+      "step": 9900
+    },
+    {
+      "epoch": 5.611672278338945,
+      "grad_norm": 0.995482325553894,
+      "learning_rate": 7.1918520058199965e-06,
+      "loss": 3.0502,
+      "step": 10000
+    },
+    {
+      "epoch": 5.611672278338945,
+      "eval_accuracy": 0.4209234923661997,
+      "eval_bleu": 0.13307821076833265,
+      "eval_loss": 3.1857192516326904,
+      "eval_perplexity": 24.18467702013368,
+      "eval_runtime": 12.2734,
+      "eval_samples_per_second": 92.965,
+      "eval_steps_per_second": 1.467,
+      "step": 10000
+    },
+    {
+      "epoch": 5.667789001122334,
+      "grad_norm": 1.5987473726272583,
+      "learning_rate": 6.152567033880691e-06,
+      "loss": 3.0525,
+      "step": 10100
+    },
+    {
+      "epoch": 5.723905723905724,
+      "grad_norm": 1.1128613948822021,
+      "learning_rate": 5.113282061941385e-06,
+      "loss": 3.0491,
+      "step": 10200
+    },
+    {
+      "epoch": 5.780022446689113,
+      "grad_norm": 1.4610397815704346,
+      "learning_rate": 4.073997090002079e-06,
+      "loss": 3.0498,
+      "step": 10300
+    },
+    {
+      "epoch": 5.836139169472503,
+      "grad_norm": 1.4574775695800781,
+      "learning_rate": 3.0347121180627732e-06,
+      "loss": 3.0475,
+      "step": 10400
+    },
+    {
+      "epoch": 5.892255892255893,
+      "grad_norm": 1.4116820096969604,
+      "learning_rate": 1.9954271461234673e-06,
+      "loss": 3.0469,
+      "step": 10500
+    },
+    {
+      "epoch": 5.892255892255893,
+      "eval_accuracy": 0.42165513093674584,
+      "eval_bleu": 0.13088786550672932,
+      "eval_loss": 3.1790146827697754,
+      "eval_perplexity": 24.023071540521666,
+      "eval_runtime": 12.1539,
+      "eval_samples_per_second": 93.88,
+      "eval_steps_per_second": 1.481,
+      "step": 10500
+    },
+    {
+      "epoch": 5.948372615039282,
+      "grad_norm": 1.1517071723937988,
+      "learning_rate": 9.561421741841614e-07,
+      "loss": 3.0538,
+      "step": 10600
+    },
+    {
+      "epoch": 6.0,
+      "step": 10692,
+      "total_flos": 1.270447321217237e+18,
+      "train_loss": 0.5092627385828409,
+      "train_runtime": 3159.8581,
+      "train_samples_per_second": 216.454,
+      "train_steps_per_second": 3.384
     }
   ],
   "logging_steps": 100,
-  "max_steps": 8910,
+  "max_steps": 10692,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 5,
+  "num_train_epochs": 6,
   "save_steps": 500,
   "stateful_callbacks": {
     "EarlyStoppingCallback": {
@@ -854,7 +1017,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.0587061010143642e+18,
+  "total_flos": 1.270447321217237e+18,
   "train_batch_size": 64,
   "trial_name": null,
   "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:04d6437c2ee925975d79e817dff86d018d08d52bce678fa75ae1d31d316d4699
+oid sha256:59200eb8b86a27533dadf7d6faf0a6fa80382d582aa8060a5ac3329ab756818f
 size 5560
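
Only the Git LFS pointer changes here; a training_args.bin written by the transformers Trainer is typically a torch-pickled TrainingArguments object. A hedged sketch for inspecting it after pulling the LFS object (assumes torch and transformers are installed; weights_only=False is needed on recent torch releases to unpickle a Python object):

```python
import torch

# Load the pickled TrainingArguments saved alongside the model.
args = torch.load("training_args.bin", weights_only=False)
print(args.num_train_epochs, args.per_device_train_batch_size)
```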