Ahmed Nassar [email protected] committed
Commit 95e922e
1 Parent(s): 2624dcc

pre-release (11)
added_tokens.json CHANGED
@@ -1,8 +1,60 @@
 {
+  "</caption>": 49192,
+  "</chart>": 49248,
+  "</checkbox_selected>": 49211,
+  "</checkbox_unselected>": 49213,
+  "</doctag>": 49230,
+  "</footnote>": 49195,
+  "</form>": 49215,
+  "</formula>": 49197,
+  "</group>": 49228,
+  "</key_": 49243,
+  "</key_value_region>": 49217,
+  "</list_item>": 49199,
+  "</ordered_list>": 49224,
+  "</otsl>": 49209,
+  "</page_footer>": 49201,
+  "</page_header>": 49203,
+  "</paragraph>": 49220,
+  "</picture>": 49205,
+  "</reference>": 49222,
+  "</section_header_level_": 49207,
+  "</smiles>": 49251,
+  "</unordered_list>": 49226,
+  "</value_": 49245,
+  "<caption>": 49191,
+  "<chart>": 49247,
+  "<checkbox_selected>": 49210,
+  "<checkbox_unselected>": 49212,
+  "<ched>": 49239,
+  "<doctag>": 49229,
+  "<ecel>": 49234,
   "<end_of_utterance>": 49279,
   "<fake_token_around_image>": 49189,
+  "<fcel>": 49233,
+  "<footnote>": 49193,
+  "<form>": 49214,
+  "<formula>": 49196,
   "<global-img>": 49152,
+  "<group>": 49227,
   "<image>": 49190,
+  "<key_": 49242,
+  "<key_value_region>": 49216,
+  "<lcel>": 49235,
+  "<link_": 49246,
+  "<list_item>": 49198,
+  "<loc_": 49218,
+  "<nl>": 49238,
+  "<ordered_list>": 49223,
+  "<otsl>": 49208,
+  "<page_": 49231,
+  "<page_break>": 49249,
+  "<page_footer>": 49200,
+  "<page_header>": 49202,
+  "<paragraph>": 49219,
+  "<picture>": 49204,
+  "<reference>": 49221,
+  "<rhed>": 49240,
   "<row_1_col_1>": 49153,
   "<row_1_col_2>": 49154,
   "<row_1_col_3>": 49155,
@@ -39,63 +91,15 @@
   "<row_6_col_4>": 49186,
   "<row_6_col_5>": 49187,
   "<row_6_col_6>": 49188,
-  "<|reserved_special_token_0|>": 49191,
-  "<|reserved_special_token_10|>": 49201,
-  "<|reserved_special_token_11|>": 49202,
-  "<|reserved_special_token_12|>": 49203,
-  "<|reserved_special_token_13|>": 49204,
-  "<|reserved_special_token_14|>": 49205,
-  "<|reserved_special_token_15|>": 49206,
-  "<|reserved_special_token_16|>": 49207,
-  "<|reserved_special_token_17|>": 49208,
-  "<|reserved_special_token_18|>": 49209,
-  "<|reserved_special_token_19|>": 49210,
-  "<|reserved_special_token_1|>": 49192,
-  "<|reserved_special_token_20|>": 49211,
-  "<|reserved_special_token_21|>": 49212,
-  "<|reserved_special_token_22|>": 49213,
-  "<|reserved_special_token_23|>": 49214,
-  "<|reserved_special_token_24|>": 49215,
-  "<|reserved_special_token_25|>": 49216,
-  "<|reserved_special_token_26|>": 49217,
-  "<|reserved_special_token_27|>": 49218,
-  "<|reserved_special_token_28|>": 49219,
-  "<|reserved_special_token_29|>": 49220,
-  "<|reserved_special_token_2|>": 49193,
-  "<|reserved_special_token_30|>": 49221,
-  "<|reserved_special_token_31|>": 49222,
-  "<|reserved_special_token_32|>": 49223,
-  "<|reserved_special_token_33|>": 49224,
-  "<|reserved_special_token_34|>": 49225,
-  "<|reserved_special_token_35|>": 49226,
-  "<|reserved_special_token_36|>": 49227,
-  "<|reserved_special_token_37|>": 49228,
-  "<|reserved_special_token_38|>": 49229,
-  "<|reserved_special_token_39|>": 49230,
+  "<section_header_level_": 49206,
+  "<smiles>": 49250,
+  "<text_break>": 49232,
+  "<ucel>": 49236,
+  "<unordered_list>": 49225,
+  "<value_": 49244,
+  "<xcel>": 49237,
   "<|reserved_special_token_3|>": 49194,
-  "<|reserved_special_token_40|>": 49231,
-  "<|reserved_special_token_41|>": 49232,
-  "<|reserved_special_token_42|>": 49233,
-  "<|reserved_special_token_43|>": 49234,
-  "<|reserved_special_token_44|>": 49235,
-  "<|reserved_special_token_45|>": 49236,
-  "<|reserved_special_token_46|>": 49237,
-  "<|reserved_special_token_47|>": 49238,
-  "<|reserved_special_token_48|>": 49239,
-  "<|reserved_special_token_49|>": 49240,
-  "<|reserved_special_token_4|>": 49195,
   "<|reserved_special_token_50|>": 49241,
-  "<|reserved_special_token_51|>": 49242,
-  "<|reserved_special_token_52|>": 49243,
-  "<|reserved_special_token_53|>": 49244,
-  "<|reserved_special_token_54|>": 49245,
-  "<|reserved_special_token_55|>": 49246,
-  "<|reserved_special_token_56|>": 49247,
-  "<|reserved_special_token_57|>": 49248,
-  "<|reserved_special_token_58|>": 49249,
-  "<|reserved_special_token_59|>": 49250,
-  "<|reserved_special_token_5|>": 49196,
-  "<|reserved_special_token_60|>": 49251,
   "<|reserved_special_token_61|>": 49252,
   "<|reserved_special_token_62|>": 49253,
   "<|reserved_special_token_63|>": 49254,
@@ -105,7 +109,6 @@
   "<|reserved_special_token_67|>": 49258,
   "<|reserved_special_token_68|>": 49259,
   "<|reserved_special_token_69|>": 49260,
-  "<|reserved_special_token_6|>": 49197,
   "<|reserved_special_token_70|>": 49261,
   "<|reserved_special_token_71|>": 49262,
   "<|reserved_special_token_72|>": 49263,
@@ -116,7 +119,6 @@
   "<|reserved_special_token_77|>": 49268,
   "<|reserved_special_token_78|>": 49269,
   "<|reserved_special_token_79|>": 49270,
-  "<|reserved_special_token_7|>": 49198,
   "<|reserved_special_token_80|>": 49271,
   "<|reserved_special_token_81|>": 49272,
   "<|reserved_special_token_82|>": 49273,
@@ -124,7 +126,5 @@
   "<|reserved_special_token_84|>": 49275,
   "<|reserved_special_token_85|>": 49276,
   "<|reserved_special_token_86|>": 49277,
-  "<|reserved_special_token_87|>": 49278,
-  "<|reserved_special_token_8|>": 49199,
-  "<|reserved_special_token_9|>": 49200
+  "<|reserved_special_token_87|>": 49278
 }
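
A quick sanity check for the retokenized vocabulary is to load the tokenizer at this revision and confirm that the new DocTags specials resolve to the ids listed above. A minimal sketch (the repo id is illustrative; a local checkout of this commit works the same way):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("ds4sd/SmolDocling-256M-preview", revision="95e922e")

# New specials now occupy the former reserved slots ...
assert tok.convert_tokens_to_ids("<doctag>") == 49229
assert tok.convert_tokens_to_ids("</doctag>") == 49230
assert tok.convert_tokens_to_ids("<otsl>") == 49208
# ... while untouched specials keep their ids.
assert tok.convert_tokens_to_ids("<end_of_utterance>") == 49279
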
assets/2d0fbcc50e88065a040a537b717620e964fb4453314b71d83f3ed3425addcef6.pdf DELETED
Binary file (45.6 kB)
 
assets/2d0fbcc50e88065a040a537b717620e964fb4453314b71d83f3ed3425addcef6.png DELETED

Git LFS Details

  • SHA256: 069ec77320ef4de477397e03f39e482c9f755122864894d32dd48696059182a8
  • Pointer size: 131 Bytes
  • Size of remote file: 296 kB
config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "/dccstor/ahn_cdip/checkpoints/SmolDocling_250M_1.3_stg_6/SmolDocling-256M-preview",
+  "_name_or_path": "/data1/checkpoints/SmolDocling_250M_DT_ST10/checkpoint-1758/",
   "architectures": [
     "Idefics3ForConditionalGeneration"
   ],
@@ -115,7 +115,7 @@
   },
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.48.3",
+  "transformers_version": "4.50.0.dev0",
   "use_cache": true,
   "vision_config": {
     "hidden_size": 768,
generation_config.json CHANGED
@@ -3,5 +3,5 @@
   "bos_token_id": 0,
   "eos_token_id": 49279,
   "pad_token_id": 2,
-  "transformers_version": "4.48.3"
+  "transformers_version": "4.50.0.dev0"
 }
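
The ids pinned here line up with the special tokens elsewhere in this commit: 49279 is "<end_of_utterance>" (see added_tokens.json) and 2 is "<|im_end|>" (the pad token in tokenizer_config.json). A small check, assuming a local checkout of this repo:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")  # path to a checkout of this repo
print(tok.convert_ids_to_tokens([49279, 2]))
# expected: ['<end_of_utterance>', '<|im_end|>']
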
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:aa304312ea5d92e0aadf70ff5e3f330ddd289ec9b93dd8cb1cdb438494e17709
+oid sha256:cdcdf5d823c5684029c7d8e52177cf10f9034b3aba6577549cfb1a9ce36ad0a2
 size 513028808
tokenizer.json CHANGED
@@ -1,7 +1,19 @@
 {
   "version": "1.0",
-  "truncation": null,
-  "padding": null,
+  "truncation": {
+    "direction": "Right",
+    "max_length": 8192,
+    "strategy": "LongestFirst",
+    "stride": 0
+  },
+  "padding": {
+    "strategy": "BatchLongest",
+    "direction": "Right",
+    "pad_to_multiple_of": null,
+    "pad_id": 2,
+    "pad_type_id": 0,
+    "pad_token": "<|im_end|>"
+  },
   "added_tokens": [
     {
       "id": 0,
@@ -509,7 +521,7 @@
   "id": 49191,
-  "content": "<|reserved_special_token_0|>",
+  "content": "<caption>",
@@ -518,7 +530,7 @@
   "id": 49192,
-  "content": "<|reserved_special_token_1|>",
+  "content": "</caption>",
@@ -527,7 +539,7 @@
   "id": 49193,
-  "content": "<|reserved_special_token_2|>",
+  "content": "<footnote>",
@@ -545,7 +557,7 @@
   "id": 49195,
-  "content": "<|reserved_special_token_4|>",
+  "content": "</footnote>",
@@ -554,7 +566,7 @@
   "id": 49196,
-  "content": "<|reserved_special_token_5|>",
+  "content": "<formula>",
@@ -563,7 +575,7 @@
   "id": 49197,
-  "content": "<|reserved_special_token_6|>",
+  "content": "</formula>",
@@ -572,7 +584,7 @@
   "id": 49198,
-  "content": "<|reserved_special_token_7|>",
+  "content": "<list_item>",
@@ -581,7 +593,7 @@
   "id": 49199,
-  "content": "<|reserved_special_token_8|>",
+  "content": "</list_item>",
@@ -590,7 +602,7 @@
   "id": 49200,
-  "content": "<|reserved_special_token_9|>",
+  "content": "<page_footer>",
@@ -599,7 +611,7 @@
   "id": 49201,
-  "content": "<|reserved_special_token_10|>",
+  "content": "</page_footer>",
@@ -608,7 +620,7 @@
   "id": 49202,
-  "content": "<|reserved_special_token_11|>",
+  "content": "<page_header>",
@@ -617,7 +629,7 @@
   "id": 49203,
-  "content": "<|reserved_special_token_12|>",
+  "content": "</page_header>",
@@ -626,7 +638,7 @@
   "id": 49204,
-  "content": "<|reserved_special_token_13|>",
+  "content": "<picture>",
@@ -635,7 +647,7 @@
   "id": 49205,
-  "content": "<|reserved_special_token_14|>",
+  "content": "</picture>",
@@ -644,7 +656,7 @@
   "id": 49206,
-  "content": "<|reserved_special_token_15|>",
+  "content": "<section_header_level_",
@@ -653,7 +665,7 @@
   "id": 49207,
-  "content": "<|reserved_special_token_16|>",
+  "content": "</section_header_level_",
@@ -662,7 +674,7 @@
   "id": 49208,
-  "content": "<|reserved_special_token_17|>",
+  "content": "<otsl>",
@@ -671,7 +683,7 @@
   "id": 49209,
-  "content": "<|reserved_special_token_18|>",
+  "content": "</otsl>",
@@ -680,7 +692,7 @@
   "id": 49210,
-  "content": "<|reserved_special_token_19|>",
+  "content": "<checkbox_selected>",
@@ -689,7 +701,7 @@
   "id": 49211,
-  "content": "<|reserved_special_token_20|>",
+  "content": "</checkbox_selected>",
@@ -698,7 +710,7 @@
   "id": 49212,
-  "content": "<|reserved_special_token_21|>",
+  "content": "<checkbox_unselected>",
@@ -707,7 +719,7 @@
   "id": 49213,
-  "content": "<|reserved_special_token_22|>",
+  "content": "</checkbox_unselected>",
@@ -716,7 +728,7 @@
   "id": 49214,
-  "content": "<|reserved_special_token_23|>",
+  "content": "<form>",
@@ -725,7 +737,7 @@
   "id": 49215,
-  "content": "<|reserved_special_token_24|>",
+  "content": "</form>",
@@ -734,7 +746,7 @@
   "id": 49216,
-  "content": "<|reserved_special_token_25|>",
+  "content": "<key_value_region>",
@@ -743,7 +755,7 @@
   "id": 49217,
-  "content": "<|reserved_special_token_26|>",
+  "content": "</key_value_region>",
@@ -752,7 +764,7 @@
   "id": 49218,
-  "content": "<|reserved_special_token_27|>",
+  "content": "<loc_",
@@ -761,7 +773,7 @@
   "id": 49219,
-  "content": "<|reserved_special_token_28|>",
+  "content": "<paragraph>",
@@ -770,7 +782,7 @@
   "id": 49220,
-  "content": "<|reserved_special_token_29|>",
+  "content": "</paragraph>",
@@ -779,7 +791,7 @@
   "id": 49221,
-  "content": "<|reserved_special_token_30|>",
+  "content": "<reference>",
@@ -788,7 +800,7 @@
   "id": 49222,
-  "content": "<|reserved_special_token_31|>",
+  "content": "</reference>",
@@ -797,7 +809,7 @@
   "id": 49223,
-  "content": "<|reserved_special_token_32|>",
+  "content": "<ordered_list>",
@@ -806,7 +818,7 @@
   "id": 49224,
-  "content": "<|reserved_special_token_33|>",
+  "content": "</ordered_list>",
@@ -815,7 +827,7 @@
   "id": 49225,
-  "content": "<|reserved_special_token_34|>",
+  "content": "<unordered_list>",
@@ -824,7 +836,7 @@
   "id": 49226,
-  "content": "<|reserved_special_token_35|>",
+  "content": "</unordered_list>",
@@ -833,7 +845,7 @@
   "id": 49227,
-  "content": "<|reserved_special_token_36|>",
+  "content": "<group>",
@@ -842,7 +854,7 @@
   "id": 49228,
-  "content": "<|reserved_special_token_37|>",
+  "content": "</group>",
@@ -851,7 +863,7 @@
   "id": 49229,
-  "content": "<|reserved_special_token_38|>",
+  "content": "<doctag>",
@@ -860,7 +872,7 @@
   "id": 49230,
-  "content": "<|reserved_special_token_39|>",
+  "content": "</doctag>",
@@ -869,7 +881,7 @@
   "id": 49231,
-  "content": "<|reserved_special_token_40|>",
+  "content": "<page_",
@@ -878,7 +890,7 @@
   "id": 49232,
-  "content": "<|reserved_special_token_41|>",
+  "content": "<text_break>",
@@ -887,7 +899,7 @@
   "id": 49233,
-  "content": "<|reserved_special_token_42|>",
+  "content": "<fcel>",
@@ -896,7 +908,7 @@
   "id": 49234,
-  "content": "<|reserved_special_token_43|>",
+  "content": "<ecel>",
@@ -905,7 +917,7 @@
   "id": 49235,
-  "content": "<|reserved_special_token_44|>",
+  "content": "<lcel>",
@@ -914,7 +926,7 @@
   "id": 49236,
-  "content": "<|reserved_special_token_45|>",
+  "content": "<ucel>",
@@ -923,7 +935,7 @@
   "id": 49237,
-  "content": "<|reserved_special_token_46|>",
+  "content": "<xcel>",
@@ -932,7 +944,7 @@
   "id": 49238,
-  "content": "<|reserved_special_token_47|>",
+  "content": "<nl>",
@@ -941,7 +953,7 @@
   "id": 49239,
-  "content": "<|reserved_special_token_48|>",
+  "content": "<ched>",
@@ -950,7 +962,7 @@
   "id": 49240,
-  "content": "<|reserved_special_token_49|>",
+  "content": "<rhed>",
@@ -968,7 +980,7 @@
   "id": 49242,
-  "content": "<|reserved_special_token_51|>",
+  "content": "<key_",
@@ -977,7 +989,7 @@
   "id": 49243,
-  "content": "<|reserved_special_token_52|>",
+  "content": "</key_",
@@ -986,7 +998,7 @@
   "id": 49244,
-  "content": "<|reserved_special_token_53|>",
+  "content": "<value_",
@@ -995,7 +1007,7 @@
   "id": 49245,
-  "content": "<|reserved_special_token_54|>",
+  "content": "</value_",
@@ -1004,7 +1016,7 @@
   "id": 49246,
-  "content": "<|reserved_special_token_55|>",
+  "content": "<link_",
@@ -1013,7 +1025,7 @@
   "id": 49247,
-  "content": "<|reserved_special_token_56|>",
+  "content": "<chart>",
@@ -1022,7 +1034,7 @@
   "id": 49248,
-  "content": "<|reserved_special_token_57|>",
+  "content": "</chart>",
@@ -1031,7 +1043,7 @@
   "id": 49249,
-  "content": "<|reserved_special_token_58|>",
+  "content": "<page_break>",
@@ -1040,7 +1052,7 @@
   "id": 49250,
-  "content": "<|reserved_special_token_59|>",
+  "content": "<smiles>",
@@ -1049,7 +1061,7 @@
   "id": 49251,
-  "content": "<|reserved_special_token_60|>",
+  "content": "</smiles>",
tokenizer_config.json CHANGED
@@ -450,7 +450,7 @@
   "49191": {
-    "content": "<|reserved_special_token_0|>",
+    "content": "<caption>",
@@ -458,7 +458,7 @@
   "49192": {
-    "content": "<|reserved_special_token_1|>",
+    "content": "</caption>",
@@ -466,7 +466,7 @@
   "49193": {
-    "content": "<|reserved_special_token_2|>",
+    "content": "<footnote>",
@@ -482,7 +482,7 @@
   "49195": {
-    "content": "<|reserved_special_token_4|>",
+    "content": "</footnote>",
@@ -490,7 +490,7 @@
   "49196": {
-    "content": "<|reserved_special_token_5|>",
+    "content": "<formula>",
@@ -498,7 +498,7 @@
   "49197": {
-    "content": "<|reserved_special_token_6|>",
+    "content": "</formula>",
@@ -506,7 +506,7 @@
   "49198": {
-    "content": "<|reserved_special_token_7|>",
+    "content": "<list_item>",
@@ -514,7 +514,7 @@
   "49199": {
-    "content": "<|reserved_special_token_8|>",
+    "content": "</list_item>",
@@ -522,7 +522,7 @@
   "49200": {
-    "content": "<|reserved_special_token_9|>",
+    "content": "<page_footer>",
@@ -530,7 +530,7 @@
   "49201": {
-    "content": "<|reserved_special_token_10|>",
+    "content": "</page_footer>",
@@ -538,7 +538,7 @@
   "49202": {
-    "content": "<|reserved_special_token_11|>",
+    "content": "<page_header>",
@@ -546,7 +546,7 @@
   "49203": {
-    "content": "<|reserved_special_token_12|>",
+    "content": "</page_header>",
@@ -554,7 +554,7 @@
   "49204": {
-    "content": "<|reserved_special_token_13|>",
+    "content": "<picture>",
@@ -562,7 +562,7 @@
   "49205": {
-    "content": "<|reserved_special_token_14|>",
+    "content": "</picture>",
@@ -570,7 +570,7 @@
   "49206": {
-    "content": "<|reserved_special_token_15|>",
+    "content": "<section_header_level_",
@@ -578,7 +578,7 @@
   "49207": {
-    "content": "<|reserved_special_token_16|>",
+    "content": "</section_header_level_",
@@ -586,7 +586,7 @@
   "49208": {
-    "content": "<|reserved_special_token_17|>",
+    "content": "<otsl>",
@@ -594,7 +594,7 @@
   "49209": {
-    "content": "<|reserved_special_token_18|>",
+    "content": "</otsl>",
@@ -602,7 +602,7 @@
   "49210": {
-    "content": "<|reserved_special_token_19|>",
+    "content": "<checkbox_selected>",
@@ -610,7 +610,7 @@
   "49211": {
-    "content": "<|reserved_special_token_20|>",
+    "content": "</checkbox_selected>",
@@ -618,7 +618,7 @@
   "49212": {
-    "content": "<|reserved_special_token_21|>",
+    "content": "<checkbox_unselected>",
@@ -626,7 +626,7 @@
   "49213": {
-    "content": "<|reserved_special_token_22|>",
+    "content": "</checkbox_unselected>",
@@ -634,7 +634,7 @@
   "49214": {
-    "content": "<|reserved_special_token_23|>",
+    "content": "<form>",
@@ -642,7 +642,7 @@
   "49215": {
-    "content": "<|reserved_special_token_24|>",
+    "content": "</form>",
@@ -650,7 +650,7 @@
   "49216": {
-    "content": "<|reserved_special_token_25|>",
+    "content": "<key_value_region>",
@@ -658,7 +658,7 @@
   "49217": {
-    "content": "<|reserved_special_token_26|>",
+    "content": "</key_value_region>",
@@ -666,7 +666,7 @@
   "49218": {
-    "content": "<|reserved_special_token_27|>",
+    "content": "<loc_",
@@ -674,7 +674,7 @@
   "49219": {
-    "content": "<|reserved_special_token_28|>",
+    "content": "<paragraph>",
@@ -682,7 +682,7 @@
   "49220": {
-    "content": "<|reserved_special_token_29|>",
+    "content": "</paragraph>",
@@ -690,7 +690,7 @@
   "49221": {
-    "content": "<|reserved_special_token_30|>",
+    "content": "<reference>",
@@ -698,7 +698,7 @@
   "49222": {
-    "content": "<|reserved_special_token_31|>",
+    "content": "</reference>",
@@ -706,7 +706,7 @@
   "49223": {
-    "content": "<|reserved_special_token_32|>",
+    "content": "<ordered_list>",
@@ -714,7 +714,7 @@
   "49224": {
-    "content": "<|reserved_special_token_33|>",
+    "content": "</ordered_list>",
@@ -722,7 +722,7 @@
   "49225": {
-    "content": "<|reserved_special_token_34|>",
+    "content": "<unordered_list>",
@@ -730,7 +730,7 @@
   "49226": {
-    "content": "<|reserved_special_token_35|>",
+    "content": "</unordered_list>",
@@ -738,7 +738,7 @@
   "49227": {
-    "content": "<|reserved_special_token_36|>",
+    "content": "<group>",
@@ -746,7 +746,7 @@
   "49228": {
-    "content": "<|reserved_special_token_37|>",
+    "content": "</group>",
@@ -754,7 +754,7 @@
   "49229": {
-    "content": "<|reserved_special_token_38|>",
+    "content": "<doctag>",
@@ -762,7 +762,7 @@
   "49230": {
-    "content": "<|reserved_special_token_39|>",
+    "content": "</doctag>",
@@ -770,7 +770,7 @@
   "49231": {
-    "content": "<|reserved_special_token_40|>",
+    "content": "<page_",
@@ -778,7 +778,7 @@
   "49232": {
-    "content": "<|reserved_special_token_41|>",
+    "content": "<text_break>",
@@ -786,7 +786,7 @@
   "49233": {
-    "content": "<|reserved_special_token_42|>",
+    "content": "<fcel>",
@@ -794,7 +794,7 @@
   "49234": {
-    "content": "<|reserved_special_token_43|>",
+    "content": "<ecel>",
@@ -802,7 +802,7 @@
   "49235": {
-    "content": "<|reserved_special_token_44|>",
+    "content": "<lcel>",
@@ -810,7 +810,7 @@
   "49236": {
-    "content": "<|reserved_special_token_45|>",
+    "content": "<ucel>",
@@ -818,7 +818,7 @@
   "49237": {
-    "content": "<|reserved_special_token_46|>",
+    "content": "<xcel>",
@@ -826,7 +826,7 @@
   "49238": {
-    "content": "<|reserved_special_token_47|>",
+    "content": "<nl>",
@@ -834,7 +834,7 @@
   "49239": {
-    "content": "<|reserved_special_token_48|>",
+    "content": "<ched>",
@@ -842,7 +842,7 @@
   "49240": {
-    "content": "<|reserved_special_token_49|>",
+    "content": "<rhed>",
@@ -858,7 +858,7 @@
   "49242": {
-    "content": "<|reserved_special_token_51|>",
+    "content": "<key_",
@@ -866,7 +866,7 @@
   "49243": {
-    "content": "<|reserved_special_token_52|>",
+    "content": "</key_",
@@ -874,7 +874,7 @@
   "49244": {
-    "content": "<|reserved_special_token_53|>",
+    "content": "<value_",
@@ -882,7 +882,7 @@
   "49245": {
-    "content": "<|reserved_special_token_54|>",
+    "content": "</value_",
@@ -890,7 +890,7 @@
   "49246": {
-    "content": "<|reserved_special_token_55|>",
+    "content": "<link_",
@@ -898,7 +898,7 @@
   "49247": {
-    "content": "<|reserved_special_token_56|>",
+    "content": "<chart>",
@@ -906,7 +906,7 @@
   "49248": {
-    "content": "<|reserved_special_token_57|>",
+    "content": "</chart>",
@@ -914,7 +914,7 @@
   "49249": {
-    "content": "<|reserved_special_token_58|>",
+    "content": "<page_break>",
@@ -922,7 +922,7 @@
   "49250": {
-    "content": "<|reserved_special_token_59|>",
+    "content": "<smiles>",
@@ -930,7 +930,7 @@
   "49251": {
-    "content": "<|reserved_special_token_60|>",
+    "content": "</smiles>",
@@ -1173,11 +1173,17 @@
   "eos_token": "<|im_end|>",
   "extra_special_tokens": {},
   "legacy": false,
+  "max_length": 8192,
   "model_max_length": 8192,
+  "pad_to_multiple_of": null,
   "pad_token": "<|im_end|>",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
   "processor_class": "Idefics3Processor",
+  "stride": 0,
   "tokenizer_class": "GPT2Tokenizer",
-  "truncation_side": "left",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
   "unk_token": "<|endoftext|>",
   "vocab_size": 49152
 }
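
Two of these slow-tokenizer settings change observable behavior: truncation_side flips from "left" to "right" (over-long inputs now lose their tail rather than their head), and padding is pinned to the right with model_max_length at 8192. A hedged check, assuming a local checkout of this repo:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")  # path to a checkout of this repo
print(tok.model_max_length)  # 8192
print(tok.truncation_side)   # "right" (was "left" before this commit)
print(tok.padding_side)      # "right"
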
zero_to_fp32.py ADDED
@@ -0,0 +1,760 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example:
14
+ # python zero_to_fp32.py . output_dir/
15
+ # or
16
+ # python zero_to_fp32.py . output_dir/ --safe_serialization
17
+
18
+ import argparse
19
+ import torch
20
+ import glob
21
+ import math
22
+ import os
23
+ import re
24
+ import gc
25
+ import json
26
+ import numpy as np
27
+ from tqdm import tqdm
28
+ from collections import OrderedDict
29
+ from dataclasses import dataclass
30
+
31
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
32
+ # DeepSpeed data structures it has to be available in the current python environment.
33
+ from deepspeed.utils import logger
34
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
35
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
36
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
37
+
38
+
39
+ @dataclass
40
+ class zero_model_state:
41
+ buffers: dict()
42
+ param_shapes: dict()
43
+ shared_params: list
44
+ ds_version: int
45
+ frozen_param_shapes: dict()
46
+ frozen_param_fragments: dict()
47
+
48
+
49
+ debug = 0
50
+
51
+ # load to cpu
52
+ device = torch.device('cpu')
53
+
54
+
55
+ def atoi(text):
56
+ return int(text) if text.isdigit() else text
57
+
58
+
59
+ def natural_keys(text):
60
+ '''
61
+ alist.sort(key=natural_keys) sorts in human order
62
+ http://nedbatchelder.com/blog/200712/human_sorting.html
63
+ (See Toothy's implementation in the comments)
64
+ '''
65
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
66
+
67
+
68
+ def get_model_state_file(checkpoint_dir, zero_stage):
69
+ if not os.path.isdir(checkpoint_dir):
70
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
71
+
72
+ # there should be only one file
73
+ if zero_stage <= 2:
74
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
75
+ elif zero_stage == 3:
76
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
77
+
78
+ if not os.path.exists(file):
79
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
80
+
81
+ return file
82
+
83
+
84
+ def get_checkpoint_files(checkpoint_dir, glob_pattern):
85
+ # XXX: need to test that this simple glob rule works for multi-node setup too
86
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
87
+
88
+ if len(ckpt_files) == 0:
89
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
90
+
91
+ return ckpt_files
92
+
93
+
94
+ def get_optim_files(checkpoint_dir):
95
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
96
+
97
+
98
+ def get_model_state_files(checkpoint_dir):
99
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
100
+
101
+
102
+ def parse_model_states(files):
103
+ zero_model_states = []
104
+ for file in files:
105
+ state_dict = torch.load(file, map_location=device, weights_only=False)
106
+
107
+ if BUFFER_NAMES not in state_dict:
108
+ raise ValueError(f"{file} is not a model state checkpoint")
109
+ buffer_names = state_dict[BUFFER_NAMES]
110
+ if debug:
111
+ print("Found buffers:", buffer_names)
112
+
113
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
114
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
115
+ param_shapes = state_dict[PARAM_SHAPES]
116
+
117
+ # collect parameters that are included in param_shapes
118
+ param_names = []
119
+ for s in param_shapes:
120
+ for name in s.keys():
121
+ param_names.append(name)
122
+
123
+ # update with frozen parameters
124
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
125
+ if frozen_param_shapes is not None:
126
+ if debug:
127
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
128
+ param_names += list(frozen_param_shapes.keys())
129
+
130
+ # handle shared params
131
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
132
+
133
+ ds_version = state_dict.get(DS_VERSION, None)
134
+
135
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
136
+
137
+ z_model_state = zero_model_state(buffers=buffers,
138
+ param_shapes=param_shapes,
139
+ shared_params=shared_params,
140
+ ds_version=ds_version,
141
+ frozen_param_shapes=frozen_param_shapes,
142
+ frozen_param_fragments=frozen_param_fragments)
143
+ zero_model_states.append(z_model_state)
144
+
145
+ return zero_model_states
146
+
+
+ def parse_optim_states(files, ds_checkpoint_dir):
+     total_files = len(files)
+     state_dicts = []
+     for f in tqdm(files, desc='Loading checkpoint shards'):
+         state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+         # immediately discard the two potentially huge optimizer states, since we only care about
+         # the fp32 master weights; also handle the case where they were already removed by another helper script
+         state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+         state_dicts.append(state_dict)
+
+     if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
+         raise ValueError(f"{files[0]} is not a zero checkpoint")
+     zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+     world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+     # For ZeRO-2 each param group can have a different partition_count, as data parallelism for expert
+     # parameters can differ from data parallelism for non-expert parameters. So we can just
+     # use the max of the partition_count to get the dp world_size.
+
+     if isinstance(world_size, list):
+         world_size = max(world_size)
+
+     if world_size != total_files:
+         raise ValueError(
+             f"Expected {world_size} '*_optim_states.pt' files under '{ds_checkpoint_dir}' but found {total_files}. "
+             "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+         )
+
+     # the groups are named differently in each stage
+     if zero_stage <= 2:
+         fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+     elif zero_stage == 3:
+         fp32_groups_key = FP32_FLAT_GROUPS
+     else:
+         raise ValueError(f"unknown zero stage {zero_stage}")
+
+     fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+     return zero_stage, world_size, fp32_flat_groups
+
+
+ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+     """
+     Returns fp32 state_dict reconstructed from ds checkpoint
+
+     Args:
+         - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+     """
+     print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+     optim_files = get_optim_files(ds_checkpoint_dir)
+     zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+     print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+     model_files = get_model_state_files(ds_checkpoint_dir)
+
+     zero_model_states = parse_model_states(model_files)
+     print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+     if zero_stage <= 2:
+         return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                           exclude_frozen_parameters)
+     elif zero_stage == 3:
+         return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                           exclude_frozen_parameters)
+
+
+ def _zero2_merge_frozen_params(state_dict, zero_model_states):
+     if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+         return
+
+     frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+     frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+     if debug:
+         num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+         print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+     wanted_params = len(frozen_param_shapes)
+     wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+     avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+     print(f'Frozen params: Have {avail_numel} numels to process.')
+     print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+     total_params = 0
+     total_numel = 0
+     for name, shape in frozen_param_shapes.items():
+         total_params += 1
+         unpartitioned_numel = shape.numel()
+         total_numel += unpartitioned_numel
+
+         state_dict[name] = frozen_param_fragments[name]
+
+         if debug:
+             print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+     print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+ def _has_callable(obj, fn):
+     attr = getattr(obj, fn, None)
+     return callable(attr)
+
+
+ def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+     param_shapes = zero_model_states[0].param_shapes
+
+     # Reconstruction protocol:
+     #
+     # XXX: document this
+
+     if debug:
+         for i in range(world_size):
+             for j in range(len(fp32_flat_groups[0])):
+                 print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+     # XXX: memory usage doubles here (zero2)
+     num_param_groups = len(fp32_flat_groups[0])
+     merged_single_partition_of_fp32_groups = []
+     for i in range(num_param_groups):
+         merged_partitions = [sd[i] for sd in fp32_flat_groups]
+         full_single_fp32_vector = torch.cat(merged_partitions, 0)
+         merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+     avail_numel = sum(
+         [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+     if debug:
+         wanted_params = sum([len(shapes) for shapes in param_shapes])
+         wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+         # not asserting if there is a mismatch due to possible padding
+         print(f"Have {avail_numel} numels to process.")
+         print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+     # params
+     # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+     # out-of-core computing solution
+     total_numel = 0
+     total_params = 0
+     for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+         offset = 0
+         avail_numel = full_single_fp32_vector.numel()
+         for name, shape in shapes.items():
+
+             unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+             total_numel += unpartitioned_numel
+             total_params += 1
+
+             if debug:
+                 print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+             state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+             offset += unpartitioned_numel
+
+         # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+         # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+         # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+         # live optimizer object, so we are checking that the numbers are within the right range
+         align_to = 2 * world_size
+
+         def zero2_align(x):
+             return align_to * math.ceil(x / align_to)
+
+         if debug:
+             print(f"original offset={offset}, avail_numel={avail_numel}")
+
+         offset = zero2_align(offset)
+         avail_numel = zero2_align(avail_numel)
+
+         if debug:
+             print(f"aligned offset={offset}, avail_numel={avail_numel}")
+
+         # Sanity check
+         if offset != avail_numel:
+             raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+     print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
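+ # Worked example of the alignment check above: with world_size=4, align_to is 8,
+ # so an offset of 1234 becomes zero2_align(1234) = 8 * math.ceil(1234 / 8) = 1240;
+ # offset and avail_numel only need to agree after both are rounded up to the
+ # nearest multiple of 2 * world_size.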
+
+
+ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                exclude_frozen_parameters):
+     state_dict = OrderedDict()
+
+     # buffers
+     buffers = zero_model_states[0].buffers
+     state_dict.update(buffers)
+     if debug:
+         print(f"added {len(buffers)} buffers")
+
+     if not exclude_frozen_parameters:
+         _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+     _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+     # recover shared parameters
+     for pair in zero_model_states[0].shared_params:
+         if pair[1] in state_dict:
+             state_dict[pair[0]] = state_dict[pair[1]]
+
+     return state_dict
+
+
+ def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+     remainder = unpartitioned_numel % world_size
+     padding_numel = (world_size - remainder) if remainder else 0
+     partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+     return partitioned_numel, padding_numel
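+ # Worked example: a 10-element parameter on world_size=4 is stored as
+ # math.ceil(10 / 4) = 3 elements per rank (12 slots in total), of which
+ # 4 - 10 % 4 = 2 are padding, so zero3_partitioned_param_info(10, 4) == (3, 2).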
+
+
+ def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+     if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+         return
+
+     if debug:
+         for i in range(world_size):
+             num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+             print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+     frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+     wanted_params = len(frozen_param_shapes)
+     wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+     avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+     print(f'Frozen params: Have {avail_numel} numels to process.')
+     print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+     total_params = 0
+     total_numel = 0
+     for name, shape in zero_model_states[0].frozen_param_shapes.items():
+         total_params += 1
+         unpartitioned_numel = shape.numel()
+         total_numel += unpartitioned_numel
+
+         param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+         state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+         partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+         if debug:
+             print(
+                 f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+             )
+
+     print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+ class GatheredTensor:
+     """
+     A pseudo tensor that collects partitioned weights.
+     It is more memory-efficient when there are multiple groups.
+     """
+
+     def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+         self.flat_groups = flat_groups
+         self.flat_groups_offset = flat_groups_offset
+         self.offset = offset
+         self.partitioned_numel = partitioned_numel
+         self.shape = shape
+         self.dtype = self.flat_groups[0][0].dtype
+
+     def contiguous(self):
+         """
+         Merge partitioned weights from flat_groups into a single tensor.
+         """
+         end_idx = self.offset + self.partitioned_numel
+         world_size = len(self.flat_groups)
+         pad_flat_param_chunks = []
+
+         for rank_i in range(world_size):
+             # for each rank, we need to collect weights from the related group/groups
+             flat_groups_at_rank_i = self.flat_groups[rank_i]
+             start_group_id = None
+             end_group_id = None
+             for group_id in range(len(self.flat_groups_offset)):
+                 if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                     start_group_id = group_id
+                 if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                     end_group_id = group_id
+                     break
+             # collect weights from the related group/groups
+             for group_id in range(start_group_id, end_group_id + 1):
+                 flat_tensor = flat_groups_at_rank_i[group_id]
+                 start_offset = self.offset - self.flat_groups_offset[group_id]
+                 end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                 pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+         # collect weights from all ranks
+         pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+         param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+         return param
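+ # Sketch with illustrative numbers: for two param groups holding 100 and 50
+ # fp32 elements per rank, flat_groups_offset is [0, 100, 150]. A partition at
+ # offset=120 with partitioned_numel=20 lies inside group 1, so contiguous()
+ # slices [20:40] from that group's flat tensor on every rank, concatenates the
+ # per-rank slices, and finally trims the result to shape.numel().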
+
+
+ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+     param_shapes = zero_model_states[0].param_shapes
+     avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+     # Reconstruction protocol: For zero3 we need to zip the partitions together at the boundary of each
+     # param, re-consolidating each param, while dealing with padding if any
+
+     # merge list of dicts, preserving order
+     param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+     if debug:
+         for i in range(world_size):
+             # fp32_flat_groups[i] is a list of flat tensors (one per param group)
+             print(f"{FP32_FLAT_GROUPS}[{i}].shapes={[t.shape for t in fp32_flat_groups[i]]}")
+
+     wanted_params = len(param_shapes)
+     wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+     # not asserting if there is a mismatch due to possible padding
+     print(f"Trainable params: Have {avail_numel} numels to process.")
+     print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+     # params
+     # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+     # out-of-core computing solution
+     offset = 0
+     total_numel = 0
+     total_params = 0
+     flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+     for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+         unpartitioned_numel = shape.numel()
+         total_numel += unpartitioned_numel
+         total_params += 1
+         partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+         if debug:
+             print(
+                 f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+             )
+
+         # memory efficient tensor
+         tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+         state_dict[name] = tensor
+         offset += partitioned_numel
+
+     offset *= world_size
+
+     # Sanity check
+     if offset != avail_numel:
+         raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+     print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                exclude_frozen_parameters):
+     state_dict = OrderedDict()
+
+     # buffers
+     buffers = zero_model_states[0].buffers
+     state_dict.update(buffers)
+     if debug:
+         print(f"added {len(buffers)} buffers")
+
+     if not exclude_frozen_parameters:
+         _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+     _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+     # recover shared parameters
+     for pair in zero_model_states[0].shared_params:
+         if pair[1] in state_dict:
+             state_dict[pair[0]] = state_dict[pair[1]]
+
+     return state_dict
+
+
+ def to_torch_tensor(state_dict, return_empty_tensor=False):
+     """
+     Convert a state_dict of GatheredTensor objects to torch tensors
+     """
+     torch_state_dict = {}
+     converted_tensors = {}
+     for name, tensor in state_dict.items():
+         tensor_id = id(tensor)
+         if tensor_id in converted_tensors:  # shared tensors
+             shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+             torch_state_dict[name] = shared_tensor
+         else:
+             converted_tensors[tensor_id] = name
+             if return_empty_tensor:
+                 torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+             else:
+                 torch_state_dict[name] = tensor.contiguous()
+     return torch_state_dict
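+ # For illustration: tied weights that alias the same GatheredTensor are
+ # materialized only once and shared in the output, e.g. (hypothetical names):
+ #   sd = {"embed.weight": t, "lm_head.weight": t}   # same object, same id()
+ #   out = to_torch_tensor(sd)
+ #   assert out["embed.weight"] is out["lm_head.weight"]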
+
+
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                              tag=None,
+                                              exclude_frozen_parameters=False,
+                                              lazy_mode=False):
+     """
+     Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+     ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+     via a model hub.
+
+     Args:
+         - ``checkpoint_dir``: path to the desired checkpoint folder
+         - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided, will attempt to read the tag from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+         - ``exclude_frozen_parameters``: exclude frozen parameters
+         - ``lazy_mode``: get the state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory-efficient.
+           Convert a pseudo tensor to a torch tensor with ``.contiguous()``
+
+     Returns:
+         - pytorch ``state_dict``
+
+     A typical usage might be ::
+
+         from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+         # do the training and checkpoint saving
+         state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+         model = model.cpu() # move to cpu
+         model.load_state_dict(state_dict)
+         # submit to model hub or save the model to share with others
+
+     In this example the ``model`` will no longer be usable in the deepspeed context of the same
+     application, i.e. you will need to re-initialize the deepspeed engine, since
+     ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+     If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+     Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+     You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+     the checkpoint. Or you can load the state_dict in lazy mode ::
+
+         from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+         state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+         for name, lazy_tensor in state_dict.items():
+             tensor = lazy_tensor.contiguous() # to cpu
+             print(name, tensor)
+             # del tensor to release memory if it is no longer in use
+     """
+     if tag is None:
+         latest_path = os.path.join(checkpoint_dir, 'latest')
+         if os.path.isfile(latest_path):
+             with open(latest_path, 'r') as fd:
+                 tag = fd.read().strip()
+         else:
+             raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+     ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+     if not os.path.isdir(ds_checkpoint_dir):
+         raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+     state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+     if lazy_mode:
+         return state_dict
+     else:
+         return to_torch_tensor(state_dict)
+
+
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                                output_dir,
+                                                max_shard_size="5GB",
+                                                safe_serialization=False,
+                                                tag=None,
+                                                exclude_frozen_parameters=False):
+     """
+     Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+     loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+     Args:
+         - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+         - ``output_dir``: directory for the pytorch fp32 state_dict output files
+         - ``max_shard_size``: the maximum size of a checkpoint before it is sharded, default value is 5GB
+         - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+         - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided, will attempt to read the tag from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+         - ``exclude_frozen_parameters``: exclude frozen parameters
+     """
+
+     # Dependency pre-check
+     if safe_serialization:
+         try:
+             from safetensors.torch import save_file
+         except ImportError:
+             print('If you want to use `safe_serialization`, please `pip install safetensors`')
+             raise
+     if max_shard_size is not None:
+         try:
+             from huggingface_hub import split_torch_state_dict_into_shards
+         except ImportError:
+             print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+             raise
+
+     # Convert the zero checkpoint to a state_dict
+     state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                           tag,
+                                                           exclude_frozen_parameters,
+                                                           lazy_mode=True)
+
+     # Shard the model if it is too big.
+     weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+     if max_shard_size is not None:
+         filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+         # a memory-efficient approach to sharding
+         empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+         state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                               filename_pattern=filename_pattern,
+                                                               max_shard_size=max_shard_size)
+     else:
+         from collections import namedtuple
+         StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+         state_dict_split = StateDictSplit(is_sharded=False,
+                                           filename_to_tensors={weights_name: list(state_dict.keys())})
+
+     # Save the model shard by shard
+     os.makedirs(output_dir, exist_ok=True)
+     filename_to_tensors = state_dict_split.filename_to_tensors.items()
+     for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+         shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+         shard_state_dict = to_torch_tensor(shard_state_dict)
+         output_path = os.path.join(output_dir, shard_file)
+         if safe_serialization:
+             save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+         else:
+             torch.save(shard_state_dict, output_path)
+         # release the memory of the current shard
+         for tensor_name in list(shard_state_dict.keys()):
+             del state_dict[tensor_name]
+             del shard_state_dict[tensor_name]
+         del shard_state_dict
+         gc.collect()
+
+     # Save the index if sharded
+     if state_dict_split.is_sharded:
+         index = {
+             "metadata": state_dict_split.metadata,
+             "weight_map": state_dict_split.tensor_to_filename,
+         }
+         save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+         save_index_file = os.path.join(output_dir, save_index_file)
+         with open(save_index_file, "w", encoding="utf-8") as f:
+             content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+             f.write(content)
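+ # A minimal programmatic call might look like this (paths are placeholders):
+ #   convert_zero_checkpoint_to_fp32_state_dict("path/checkpoint-12",
+ #                                              "path/checkpoint-12-output",
+ #                                              safe_serialization=True)
+ # which writes model.safetensors shards (5GB max each by default) and, if more
+ # than one shard is produced, a model.safetensors.index.json under the output dir.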
+
+
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+     """
+     1. Put the provided model on the cpu
+     2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+     3. Load it into the provided model
+
+     Args:
+         - ``model``: the model object to update
+         - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+         - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided, will attempt to read the tag from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+     Returns:
+         - ``model``: modified model
+
+     Make sure you have plenty of CPU memory available before you call this function. If you don't
+     have enough, use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+     conveniently placed for you in the checkpoint folder.
+
+     A typical usage might be ::
+
+         from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+         model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+         # submit to model hub or save the model to share with others
+
+     Note that once this is run, the ``model`` will no longer be usable in the deepspeed context
+     of the same application, i.e. you will need to re-initialize the deepspeed engine, since
+     ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+     """
+     logger.info("Extracting fp32 weights")
+     state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+     logger.info("Overwriting model with fp32 weights")
+     model = model.cpu()
+     model.load_state_dict(state_dict, strict=False)
+
+     return model
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("checkpoint_dir",
+                         type=str,
+                         help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+     parser.add_argument("output_dir",
+                         type=str,
+                         help="directory for the pytorch fp32 state_dict output files "
+                         "(e.g. path/checkpoint-12-output/)")
+     parser.add_argument(
+         "--max_shard_size",
+         type=str,
+         default="5GB",
+         help="The maximum size of a checkpoint before it is sharded. Each shard will then be smaller "
+         "than this size. If expressed as a string, it needs to be digits followed by a unit (like `5MB`). "
+         "We default it to 5GB so that models can run easily on free-tier google colab instances "
+         "without CPU OOM issues.")
+     parser.add_argument(
+         "--safe_serialization",
+         default=False,
+         action='store_true',
+         help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+     parser.add_argument("-t",
+                         "--tag",
+                         type=str,
+                         default=None,
+                         help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+     parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+     parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+     args = parser.parse_args()
+
+     debug = args.debug
+
+     convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                                args.output_dir,
+                                                max_shard_size=args.max_shard_size,
+                                                safe_serialization=args.safe_serialization,
+                                                tag=args.tag,
+                                                exclude_frozen_parameters=args.exclude_frozen_parameters)
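+ # Example invocation (paths are placeholders):
+ #   python zero_to_fp32.py path/checkpoint-12 path/checkpoint-12-output --safe_serialization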