mohalisad committed on
Commit
5b40aa3
·
1 Parent(s): 57cffe3

fix tokenizer

Browse files
Files changed (1) hide show
  1. tokenizer.json +17 -3
tokenizer.json CHANGED
@@ -1,7 +1,14 @@
1
  {
2
  "version": "1.0",
3
  "truncation": null,
4
- "padding": null,
 
 
 
 
 
 
 
5
  "added_tokens": [
6
  {
7
  "id": 0,
@@ -45,7 +52,7 @@
45
  "single_word": false,
46
  "lstrip": false,
47
  "rstrip": false,
48
- "normalized": false,
49
  "special": true
50
  }
51
  ],
@@ -194,6 +201,13 @@
194
  "Regex": "[\u0001‪‫‬‭‎‏‮†“”•–—ž„ˆ‰˜‹Œ️⃣]"
195
  },
196
  "content": " "
 
 
 
 
 
 
 
197
  }
198
  ]
199
  },
@@ -96062,4 +96076,4 @@
96062
  "▁ent ire"
96063
  ]
96064
  }
96065
- }
 
1
  {
2
  "version": "1.0",
3
  "truncation": null,
4
+ "padding": {
5
+ "strategy": "BatchLongest",
6
+ "direction": "Right",
7
+ "pad_to_multiple_of": null,
8
+ "pad_id": 0,
9
+ "pad_type_id": 0,
10
+ "pad_token": "<pad>"
11
+ },
12
  "added_tokens": [
13
  {
14
  "id": 0,
 
52
  "single_word": false,
53
  "lstrip": false,
54
  "rstrip": false,
55
+ "normalized": true,
56
  "special": true
57
  }
58
  ],
 
201
  "Regex": "[\u0001‪‫‬‭‎‏‮†“”•–—ž„ˆ‰˜‹Œ️⃣]"
202
  },
203
  "content": " "
204
+ },
205
+ {
206
+ "type": "Replace",
207
+ "pattern": {
208
+ "Regex": " *<mask> *"
209
+ },
210
+ "content": "<mask>"
211
  }
212
  ]
213
  },
 
96076
  "▁ent ire"
96077
  ]
96078
  }
96079
+ }