Thacio Garcia Scandaroli
committed on
Commit
·
18ef259
1
Parent(s):
ed9deac
Upload tokenizer
Browse files
- added_tokens.json +106 -0
- merges.txt +0 -0
- special_tokens_map.json +110 -0
- tokenizer.json +0 -0
- tokenizer_config.json +14 -0
- vocab.json +0 -0
added_tokens.json
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"<|NLG|>": 50260,
|
3 |
+
"<|NLU|>": 50259,
|
4 |
+
"<|beginoftext|>": 50257,
|
5 |
+
"<|extra_id_0|>": 50261,
|
6 |
+
"<|extra_id_10|>": 50271,
|
7 |
+
"<|extra_id_11|>": 50272,
|
8 |
+
"<|extra_id_12|>": 50273,
|
9 |
+
"<|extra_id_13|>": 50274,
|
10 |
+
"<|extra_id_14|>": 50275,
|
11 |
+
"<|extra_id_15|>": 50276,
|
12 |
+
"<|extra_id_16|>": 50277,
|
13 |
+
"<|extra_id_17|>": 50278,
|
14 |
+
"<|extra_id_18|>": 50279,
|
15 |
+
"<|extra_id_19|>": 50280,
|
16 |
+
"<|extra_id_1|>": 50262,
|
17 |
+
"<|extra_id_20|>": 50281,
|
18 |
+
"<|extra_id_21|>": 50282,
|
19 |
+
"<|extra_id_22|>": 50283,
|
20 |
+
"<|extra_id_23|>": 50284,
|
21 |
+
"<|extra_id_24|>": 50285,
|
22 |
+
"<|extra_id_25|>": 50286,
|
23 |
+
"<|extra_id_26|>": 50287,
|
24 |
+
"<|extra_id_27|>": 50288,
|
25 |
+
"<|extra_id_28|>": 50289,
|
26 |
+
"<|extra_id_29|>": 50290,
|
27 |
+
"<|extra_id_2|>": 50263,
|
28 |
+
"<|extra_id_30|>": 50291,
|
29 |
+
"<|extra_id_31|>": 50292,
|
30 |
+
"<|extra_id_32|>": 50293,
|
31 |
+
"<|extra_id_33|>": 50294,
|
32 |
+
"<|extra_id_34|>": 50295,
|
33 |
+
"<|extra_id_35|>": 50296,
|
34 |
+
"<|extra_id_36|>": 50297,
|
35 |
+
"<|extra_id_37|>": 50298,
|
36 |
+
"<|extra_id_38|>": 50299,
|
37 |
+
"<|extra_id_39|>": 50300,
|
38 |
+
"<|extra_id_3|>": 50264,
|
39 |
+
"<|extra_id_40|>": 50301,
|
40 |
+
"<|extra_id_41|>": 50302,
|
41 |
+
"<|extra_id_42|>": 50303,
|
42 |
+
"<|extra_id_43|>": 50304,
|
43 |
+
"<|extra_id_44|>": 50305,
|
44 |
+
"<|extra_id_45|>": 50306,
|
45 |
+
"<|extra_id_46|>": 50307,
|
46 |
+
"<|extra_id_47|>": 50308,
|
47 |
+
"<|extra_id_48|>": 50309,
|
48 |
+
"<|extra_id_49|>": 50310,
|
49 |
+
"<|extra_id_4|>": 50265,
|
50 |
+
"<|extra_id_50|>": 50311,
|
51 |
+
"<|extra_id_51|>": 50312,
|
52 |
+
"<|extra_id_52|>": 50313,
|
53 |
+
"<|extra_id_53|>": 50314,
|
54 |
+
"<|extra_id_54|>": 50315,
|
55 |
+
"<|extra_id_55|>": 50316,
|
56 |
+
"<|extra_id_56|>": 50317,
|
57 |
+
"<|extra_id_57|>": 50318,
|
58 |
+
"<|extra_id_58|>": 50319,
|
59 |
+
"<|extra_id_59|>": 50320,
|
60 |
+
"<|extra_id_5|>": 50266,
|
61 |
+
"<|extra_id_60|>": 50321,
|
62 |
+
"<|extra_id_61|>": 50322,
|
63 |
+
"<|extra_id_62|>": 50323,
|
64 |
+
"<|extra_id_63|>": 50324,
|
65 |
+
"<|extra_id_64|>": 50325,
|
66 |
+
"<|extra_id_65|>": 50326,
|
67 |
+
"<|extra_id_66|>": 50327,
|
68 |
+
"<|extra_id_67|>": 50328,
|
69 |
+
"<|extra_id_68|>": 50329,
|
70 |
+
"<|extra_id_69|>": 50330,
|
71 |
+
"<|extra_id_6|>": 50267,
|
72 |
+
"<|extra_id_70|>": 50331,
|
73 |
+
"<|extra_id_71|>": 50332,
|
74 |
+
"<|extra_id_72|>": 50333,
|
75 |
+
"<|extra_id_73|>": 50334,
|
76 |
+
"<|extra_id_74|>": 50335,
|
77 |
+
"<|extra_id_75|>": 50336,
|
78 |
+
"<|extra_id_76|>": 50337,
|
79 |
+
"<|extra_id_77|>": 50338,
|
80 |
+
"<|extra_id_78|>": 50339,
|
81 |
+
"<|extra_id_79|>": 50340,
|
82 |
+
"<|extra_id_7|>": 50268,
|
83 |
+
"<|extra_id_80|>": 50341,
|
84 |
+
"<|extra_id_81|>": 50342,
|
85 |
+
"<|extra_id_82|>": 50343,
|
86 |
+
"<|extra_id_83|>": 50344,
|
87 |
+
"<|extra_id_84|>": 50345,
|
88 |
+
"<|extra_id_85|>": 50346,
|
89 |
+
"<|extra_id_86|>": 50347,
|
90 |
+
"<|extra_id_87|>": 50348,
|
91 |
+
"<|extra_id_88|>": 50349,
|
92 |
+
"<|extra_id_89|>": 50350,
|
93 |
+
"<|extra_id_8|>": 50269,
|
94 |
+
"<|extra_id_90|>": 50351,
|
95 |
+
"<|extra_id_91|>": 50352,
|
96 |
+
"<|extra_id_92|>": 50353,
|
97 |
+
"<|extra_id_93|>": 50354,
|
98 |
+
"<|extra_id_94|>": 50355,
|
99 |
+
"<|extra_id_95|>": 50356,
|
100 |
+
"<|extra_id_96|>": 50357,
|
101 |
+
"<|extra_id_97|>": 50358,
|
102 |
+
"<|extra_id_98|>": 50359,
|
103 |
+
"<|extra_id_99|>": 50360,
|
104 |
+
"<|extra_id_9|>": 50270,
|
105 |
+
"<|unk|>": 50258
|
106 |
+
}
|
merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
special_tokens_map.json
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"additional_special_tokens": [
|
3 |
+
"<|NLU|>",
|
4 |
+
"<|NLG|>",
|
5 |
+
"<|extra_id_0|>",
|
6 |
+
"<|extra_id_1|>",
|
7 |
+
"<|extra_id_2|>",
|
8 |
+
"<|extra_id_3|>",
|
9 |
+
"<|extra_id_4|>",
|
10 |
+
"<|extra_id_5|>",
|
11 |
+
"<|extra_id_6|>",
|
12 |
+
"<|extra_id_7|>",
|
13 |
+
"<|extra_id_8|>",
|
14 |
+
"<|extra_id_9|>",
|
15 |
+
"<|extra_id_10|>",
|
16 |
+
"<|extra_id_11|>",
|
17 |
+
"<|extra_id_12|>",
|
18 |
+
"<|extra_id_13|>",
|
19 |
+
"<|extra_id_14|>",
|
20 |
+
"<|extra_id_15|>",
|
21 |
+
"<|extra_id_16|>",
|
22 |
+
"<|extra_id_17|>",
|
23 |
+
"<|extra_id_18|>",
|
24 |
+
"<|extra_id_19|>",
|
25 |
+
"<|extra_id_20|>",
|
26 |
+
"<|extra_id_21|>",
|
27 |
+
"<|extra_id_22|>",
|
28 |
+
"<|extra_id_23|>",
|
29 |
+
"<|extra_id_24|>",
|
30 |
+
"<|extra_id_25|>",
|
31 |
+
"<|extra_id_26|>",
|
32 |
+
"<|extra_id_27|>",
|
33 |
+
"<|extra_id_28|>",
|
34 |
+
"<|extra_id_29|>",
|
35 |
+
"<|extra_id_30|>",
|
36 |
+
"<|extra_id_31|>",
|
37 |
+
"<|extra_id_32|>",
|
38 |
+
"<|extra_id_33|>",
|
39 |
+
"<|extra_id_34|>",
|
40 |
+
"<|extra_id_35|>",
|
41 |
+
"<|extra_id_36|>",
|
42 |
+
"<|extra_id_37|>",
|
43 |
+
"<|extra_id_38|>",
|
44 |
+
"<|extra_id_39|>",
|
45 |
+
"<|extra_id_40|>",
|
46 |
+
"<|extra_id_41|>",
|
47 |
+
"<|extra_id_42|>",
|
48 |
+
"<|extra_id_43|>",
|
49 |
+
"<|extra_id_44|>",
|
50 |
+
"<|extra_id_45|>",
|
51 |
+
"<|extra_id_46|>",
|
52 |
+
"<|extra_id_47|>",
|
53 |
+
"<|extra_id_48|>",
|
54 |
+
"<|extra_id_49|>",
|
55 |
+
"<|extra_id_50|>",
|
56 |
+
"<|extra_id_51|>",
|
57 |
+
"<|extra_id_52|>",
|
58 |
+
"<|extra_id_53|>",
|
59 |
+
"<|extra_id_54|>",
|
60 |
+
"<|extra_id_55|>",
|
61 |
+
"<|extra_id_56|>",
|
62 |
+
"<|extra_id_57|>",
|
63 |
+
"<|extra_id_58|>",
|
64 |
+
"<|extra_id_59|>",
|
65 |
+
"<|extra_id_60|>",
|
66 |
+
"<|extra_id_61|>",
|
67 |
+
"<|extra_id_62|>",
|
68 |
+
"<|extra_id_63|>",
|
69 |
+
"<|extra_id_64|>",
|
70 |
+
"<|extra_id_65|>",
|
71 |
+
"<|extra_id_66|>",
|
72 |
+
"<|extra_id_67|>",
|
73 |
+
"<|extra_id_68|>",
|
74 |
+
"<|extra_id_69|>",
|
75 |
+
"<|extra_id_70|>",
|
76 |
+
"<|extra_id_71|>",
|
77 |
+
"<|extra_id_72|>",
|
78 |
+
"<|extra_id_73|>",
|
79 |
+
"<|extra_id_74|>",
|
80 |
+
"<|extra_id_75|>",
|
81 |
+
"<|extra_id_76|>",
|
82 |
+
"<|extra_id_77|>",
|
83 |
+
"<|extra_id_78|>",
|
84 |
+
"<|extra_id_79|>",
|
85 |
+
"<|extra_id_80|>",
|
86 |
+
"<|extra_id_81|>",
|
87 |
+
"<|extra_id_82|>",
|
88 |
+
"<|extra_id_83|>",
|
89 |
+
"<|extra_id_84|>",
|
90 |
+
"<|extra_id_85|>",
|
91 |
+
"<|extra_id_86|>",
|
92 |
+
"<|extra_id_87|>",
|
93 |
+
"<|extra_id_88|>",
|
94 |
+
"<|extra_id_89|>",
|
95 |
+
"<|extra_id_90|>",
|
96 |
+
"<|extra_id_91|>",
|
97 |
+
"<|extra_id_92|>",
|
98 |
+
"<|extra_id_93|>",
|
99 |
+
"<|extra_id_94|>",
|
100 |
+
"<|extra_id_95|>",
|
101 |
+
"<|extra_id_96|>",
|
102 |
+
"<|extra_id_97|>",
|
103 |
+
"<|extra_id_98|>",
|
104 |
+
"<|extra_id_99|>"
|
105 |
+
],
|
106 |
+
"bos_token": "<|beginoftext|>",
|
107 |
+
"eos_token": "<|endoftext|>",
|
108 |
+
"pad_token": "<|pad|>",
|
109 |
+
"unk_token": "<|unk|>"
|
110 |
+
}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_prefix_space": false,
|
3 |
+
"additional_special_tokens": [
|
4 |
+
"<|pad|>",
|
5 |
+
"<|endoftext|>"
|
6 |
+
],
|
7 |
+
"bos_token": "<|pad|>",
|
8 |
+
"clean_up_tokenization_spaces": true,
|
9 |
+
"eos_token": "<|pad|>",
|
10 |
+
"model_max_length": 1024,
|
11 |
+
"special_tokens_map_file": null,
|
12 |
+
"tokenizer_class": "GPT2Tokenizer",
|
13 |
+
"unk_token": "<|pad|>"
|
14 |
+
}
|
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|