michelleyunun commited on
Commit
705eb2e
·
1 Parent(s): 332a53b

Upload tokenizer

Browse files
Files changed (1) hide show
  1. tokenizer.json +494 -2
tokenizer.json CHANGED
@@ -140,7 +140,253 @@
140
  "dii": 78,
141
  "nii": 79,
142
  "ts": 80,
143
- "xwi": 81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  },
145
  "merges": [
146
  "s t",
@@ -177,7 +423,253 @@
177
  "d ii",
178
  "n ii",
179
  "t s",
180
- "x wi"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  ]
182
  }
183
  }
 
140
  "dii": 78,
141
  "nii": 79,
142
  "ts": 80,
143
+ "xwi": 81,
144
+ "Ġd": 82,
145
+ "Ġha": 83,
146
+ "uu": 84,
147
+ "Ġnee": 85,
148
+ "xs": 86,
149
+ "Ġyu": 87,
150
+ "Ġa": 88,
151
+ "ip": 89,
152
+ "kwhl": 90,
153
+ "wihl": 91,
154
+ "gi": 92,
155
+ "Ġk": 93,
156
+ "xw": 94,
157
+ "'m": 95,
158
+ "Ġxs": 96,
159
+ "Ġdim": 97,
160
+ "Ġneedii": 98,
161
+ "igi": 99,
162
+ "Ġb": 100,
163
+ "Ġligi": 101,
164
+ "Ġwili": 102,
165
+ "di": 103,
166
+ "Ġj": 104,
167
+ "Ġp": 105,
168
+ "Ġt": 106,
169
+ "Ġwihl": 107,
170
+ "sxwi": 108,
171
+ "Ġs": 109,
172
+ "Ġya": 110,
173
+ "in": 111,
174
+ "Ġhlaa": 112,
175
+ "Ġna": 113,
176
+ "Ġan": 114,
177
+ "ax": 115,
178
+ "ay": 116,
179
+ "ahl": 117,
180
+ "oot": 118,
181
+ "ni": 119,
182
+ "ol": 120,
183
+ "Ġyukwhl": 121,
184
+ "Ġnii": 122,
185
+ "Ġnaa": 123,
186
+ "Ġwilp": 124,
187
+ "ipe": 125,
188
+ "Ġpipe": 126,
189
+ "uxw": 127,
190
+ "tshl": 128,
191
+ "Ġyatshl": 129,
192
+ "ĠS": 130,
193
+ "na": 131,
194
+ "hli": 132,
195
+ "Ġaa": 133,
196
+ "Ġneediit": 134,
197
+ "Ġ\"": 135,
198
+ "̲'": 136,
199
+ "il": 137,
200
+ "Ġw": 138,
201
+ "Ġyee": 139,
202
+ "Ġloot": 140,
203
+ "at": 141,
204
+ "ck": 142,
205
+ "hol": 143,
206
+ "ka": 144,
207
+ "lhl": 145,
208
+ "ock": 146,
209
+ "tock": 147,
210
+ "ya": 148,
211
+ "wil": 149,
212
+ "Ġgya": 150,
213
+ "Ġiin": 151,
214
+ "Ġluu": 152,
215
+ "uuhl": 153,
216
+ "ĠStock": 154,
217
+ "holm": 155,
218
+ "ĠStockholm": 156,
219
+ "ad": 157,
220
+ "ls": 158,
221
+ "xu": 159,
222
+ "Ġts": 160,
223
+ "hla": 161,
224
+ "Ġwina": 162,
225
+ "Ġhlg": 163,
226
+ "Ġhahla": 164,
227
+ "uut": 165,
228
+ "Ġbag": 166,
229
+ "ayt": 167,
230
+ "Ġwag": 168,
231
+ "lsdi": 169,
232
+ "as": 170,
233
+ "ok": 171,
234
+ "Ġhe": 172,
235
+ "diit": 173,
236
+ "ain": 174,
237
+ "wit": 175,
238
+ "Ġxsa": 176,
239
+ "Ġxsi": 177,
240
+ "Ġja": 178,
241
+ "nit": 179,
242
+ "xhl": 180,
243
+ "xwhl": 181,
244
+ "iihli": 182,
245
+ "Ġgiihli": 183,
246
+ "Ġlax": 184,
247
+ "ak": 185,
248
+ "̲.": 186,
249
+ "eek": 187,
250
+ "Ġap": 188,
251
+ "Ġxseek": 189,
252
+ "Ġji": 190,
253
+ "Ġaats": 191,
254
+ "hahl": 192,
255
+ "un": 193,
256
+ "waa": 194,
257
+ "oos": 195,
258
+ "Ġanhahl": 196,
259
+ "Ġanhahla": 197,
260
+ "ĠA": 198,
261
+ "Ġneet": 199,
262
+ "Ġam": 200,
263
+ "akwhl": 201,
264
+ "Ġak": 202,
265
+ "--": 203,
266
+ "Can": 204,
267
+ "Dim": 205,
268
+ "bi": 206,
269
+ "da": 207,
270
+ "fl": 208,
271
+ "gwaa": 209,
272
+ "isxwi": 210,
273
+ "ika": 211,
274
+ "ja": 212,
275
+ "kst": 213,
276
+ "lt": 214,
277
+ "lst": 215,
278
+ "nag": 216,
279
+ "pja": 217,
280
+ "rain": 218,
281
+ "sii": 219,
282
+ "ska": 220,
283
+ "sgwaa": 221,
284
+ "upja": 222,
285
+ "yt": 223,
286
+ "Ġag": 224,
287
+ "ĠCan": 225,
288
+ "Ġfl": 226,
289
+ "Ġisxwi": 227,
290
+ "Ġupja": 228,
291
+ "ndoos": 229,
292
+ "Ġgi": 230,
293
+ "Ġgwil": 231,
294
+ "Ġguuhl": 232,
295
+ "aahli": 233,
296
+ "oodi": 234,
297
+ "Ġno": 235,
298
+ "anhl": 236,
299
+ "anwil": 237,
300
+ "anuut": 238,
301
+ "anska": 239,
302
+ "Ġlip": 240,
303
+ "imil": 241,
304
+ "niig": 242,
305
+ "niisgwaa": 243,
306
+ "Ġyuwi": 244,
307
+ "Ġandoos": 245,
308
+ "gihl": 246,
309
+ "Ġky": 247,
310
+ "dilhl": 248,
311
+ "Ġpol": 249,
312
+ "Ġtun": 250,
313
+ "Ġtrain": 251,
314
+ "Ġsgihl": 252,
315
+ "Ġsdilhl": 253,
316
+ "Ġyalt": 254,
317
+ "insxwi": 255,
318
+ "Ġnakst": 256,
319
+ "Ġant": 257,
320
+ "Ġansii": 258,
321
+ "ayoo": 259,
322
+ "uxwt": 260,
323
+ "Ġaam": 261,
324
+ "adanska": 262,
325
+ "Ġhlgu": 263,
326
+ "Ġxsawi": 264,
327
+ "Ġjabi": 265,
328
+ "nagwit": 266,
329
+ "Ġagwihl": 267,
330
+ "ĠCanadanska": 268,
331
+ "Ġflika": 269,
332
+ "Ġgwila": 270,
333
+ "aahlihl": 271,
334
+ "anwilat": 272,
335
+ "anuutxw": 273,
336
+ "Ġandoosda": 274,
337
+ "Ġpole": 275,
338
+ "Ġyaltxu": 276,
339
+ "Ġansiip": 277,
340
+ "Hl": 278,
341
+ "Nii": 279,
342
+ "Oo": 280,
343
+ "nim": 281,
344
+ "wahl": 282,
345
+ "yhl": 283,
346
+ "ĠHl": 284,
347
+ "ĠNii": 285,
348
+ "wii": 286,
349
+ "Ġguxw": 287,
350
+ "Ġguut": 288,
351
+ "aaxhl": 289,
352
+ "Ġyuxwhl": 290,
353
+ "Ġkw": 291,
354
+ "Ġbas": 292,
355
+ "inhl": 293,
356
+ "ootxwi": 294,
357
+ "nisxwi": 295,
358
+ "uxwsxwi": 296,
359
+ "ilx": 297,
360
+ "adaaxhl": 298,
361
+ "Ġhlguxwsxwi": 299,
362
+ "Ġbagu": 300,
363
+ "asinhl": 301,
364
+ "Ġamhl": 302,
365
+ "ĠHlaa": 303,
366
+ "Ġguxws": 304,
367
+ "Ġbasax": 305,
368
+ ".\"": 306,
369
+ "daa": 307,
370
+ "ix": 308,
371
+ "idaa": 309,
372
+ "loohl": 310,
373
+ "phl": 311,
374
+ "pain": 312,
375
+ "sx": 313,
376
+ "yim": 314,
377
+ "Ġxhl": 315,
378
+ "aahl": 316,
379
+ "aasx": 317,
380
+ "aayim": 318,
381
+ "ook": 319,
382
+ "Ġhlag": 320,
383
+ "Ġhlidaa": 321,
384
+ "xwit": 322,
385
+ "Ġdok": 323,
386
+ "Ġdaayim": 324,
387
+ "Ġyuxw": 325,
388
+ "Ġaloohl": 326,
389
+ "Ġbax": 327
390
  },
391
  "merges": [
392
  "s t",
 
423
  "d ii",
424
  "n ii",
425
  "t s",
426
+ "x wi",
427
+ "Ġ d",
428
+ "Ġh a",
429
+ "u u",
430
+ "Ġn ee",
431
+ "x s",
432
+ "Ġy u",
433
+ "Ġ a",
434
+ "i p",
435
+ "k whl",
436
+ "wi hl",
437
+ "g i",
438
+ "Ġ k",
439
+ "x w",
440
+ "' m",
441
+ "Ġ xs",
442
+ "Ġd im",
443
+ "Ġnee dii",
444
+ "i gi",
445
+ "Ġ b",
446
+ "Ġl igi",
447
+ "Ġwil i",
448
+ "d i",
449
+ "Ġ j",
450
+ "Ġ p",
451
+ "Ġ t",
452
+ "Ġwi hl",
453
+ "s xwi",
454
+ "Ġ s",
455
+ "Ġy a",
456
+ "i n",
457
+ "Ġhl aa",
458
+ "Ġn a",
459
+ "Ġ an",
460
+ "a x",
461
+ "a y",
462
+ "a hl",
463
+ "oo t",
464
+ "n i",
465
+ "o l",
466
+ "Ġyu kwhl",
467
+ "Ġn ii",
468
+ "Ġn aa",
469
+ "Ġwil p",
470
+ "ip e",
471
+ "Ġp ipe",
472
+ "u xw",
473
+ "ts hl",
474
+ "Ġya tshl",
475
+ "Ġ S",
476
+ "n a",
477
+ "hl i",
478
+ "Ġ aa",
479
+ "Ġneedii t",
480
+ "Ġ \"",
481
+ "̲ '",
482
+ "i l",
483
+ "Ġ w",
484
+ "Ġy ee",
485
+ "Ġl oot",
486
+ "a t",
487
+ "c k",
488
+ "h ol",
489
+ "k a",
490
+ "l hl",
491
+ "o ck",
492
+ "t ock",
493
+ "y a",
494
+ "wi l",
495
+ "Ġg ya",
496
+ "Ġii n",
497
+ "Ġl uu",
498
+ "uu hl",
499
+ "ĠS tock",
500
+ "hol m",
501
+ "ĠStock holm",
502
+ "a d",
503
+ "l s",
504
+ "x u",
505
+ "Ġ ts",
506
+ "hl a",
507
+ "Ġwi na",
508
+ "Ġhl g",
509
+ "Ġha hla",
510
+ "uu t",
511
+ "Ġb ag",
512
+ "ay t",
513
+ "Ġw ag",
514
+ "ls di",
515
+ "a s",
516
+ "o k",
517
+ "Ġh e",
518
+ "dii t",
519
+ "a in",
520
+ "wi t",
521
+ "Ġxs a",
522
+ "Ġxs i",
523
+ "Ġj a",
524
+ "ni t",
525
+ "x hl",
526
+ "x whl",
527
+ "ii hli",
528
+ "Ġg iihli",
529
+ "Ġl ax",
530
+ "a k",
531
+ "̲ .",
532
+ "ee k",
533
+ "Ġa p",
534
+ "Ġxs eek",
535
+ "Ġj i",
536
+ "Ġaa ts",
537
+ "h ahl",
538
+ "u n",
539
+ "w aa",
540
+ "oo s",
541
+ "Ġan hahl",
542
+ "Ġanhahl a",
543
+ "Ġ A",
544
+ "Ġnee t",
545
+ "Ġa m",
546
+ "a kwhl",
547
+ "Ġa k",
548
+ "- -",
549
+ "C an",
550
+ "D im",
551
+ "b i",
552
+ "d a",
553
+ "f l",
554
+ "g waa",
555
+ "i sxwi",
556
+ "i ka",
557
+ "j a",
558
+ "k st",
559
+ "l t",
560
+ "l st",
561
+ "n ag",
562
+ "p ja",
563
+ "r ain",
564
+ "s ii",
565
+ "s ka",
566
+ "s gwaa",
567
+ "u pja",
568
+ "y t",
569
+ "Ġ ag",
570
+ "Ġ Can",
571
+ "Ġ fl",
572
+ "Ġ isxwi",
573
+ "Ġ upja",
574
+ "nd oos",
575
+ "Ġg i",
576
+ "Ġg wil",
577
+ "Ġg uuhl",
578
+ "aa hli",
579
+ "oo di",
580
+ "Ġn o",
581
+ "an hl",
582
+ "an wil",
583
+ "an uut",
584
+ "an ska",
585
+ "Ġl ip",
586
+ "im il",
587
+ "nii g",
588
+ "nii sgwaa",
589
+ "Ġyu wi",
590
+ "Ġa ndoos",
591
+ "gi hl",
592
+ "Ġk y",
593
+ "di lhl",
594
+ "Ġp ol",
595
+ "Ġt un",
596
+ "Ġt rain",
597
+ "Ġs gihl",
598
+ "Ġs dilhl",
599
+ "Ġya lt",
600
+ "in sxwi",
601
+ "Ġna kst",
602
+ "Ġan t",
603
+ "Ġan sii",
604
+ "ay oo",
605
+ "uxw t",
606
+ "Ġaa m",
607
+ "ad anska",
608
+ "Ġhlg u",
609
+ "Ġxsa wi",
610
+ "Ġja bi",
611
+ "nag wit",
612
+ "Ġag wihl",
613
+ "ĠCan adanska",
614
+ "Ġfl ika",
615
+ "Ġgwil a",
616
+ "aahli hl",
617
+ "anwil at",
618
+ "anuut xw",
619
+ "Ġandoos da",
620
+ "Ġpol e",
621
+ "Ġyalt xu",
622
+ "Ġansii p",
623
+ "H l",
624
+ "N ii",
625
+ "O o",
626
+ "n im",
627
+ "w ahl",
628
+ "y hl",
629
+ "Ġ Hl",
630
+ "Ġ Nii",
631
+ "wi i",
632
+ "Ġg uxw",
633
+ "Ġg uut",
634
+ "aa xhl",
635
+ "Ġyu xwhl",
636
+ "Ġk w",
637
+ "Ġb as",
638
+ "in hl",
639
+ "oot xwi",
640
+ "ni sxwi",
641
+ "uxw sxwi",
642
+ "il x",
643
+ "ad aaxhl",
644
+ "Ġhlg uxwsxwi",
645
+ "Ġbag u",
646
+ "as inhl",
647
+ "Ġam hl",
648
+ "ĠHl aa",
649
+ "Ġguxw s",
650
+ "Ġbas ax",
651
+ ". \"",
652
+ "d aa",
653
+ "i x",
654
+ "i daa",
655
+ "l oohl",
656
+ "p hl",
657
+ "p ain",
658
+ "s x",
659
+ "y im",
660
+ "Ġ xhl",
661
+ "aa hl",
662
+ "aa sx",
663
+ "aa yim",
664
+ "oo k",
665
+ "Ġhl ag",
666
+ "Ġhl idaa",
667
+ "xwi t",
668
+ "Ġd ok",
669
+ "Ġd aayim",
670
+ "Ġyu xw",
671
+ "Ġa loohl",
672
+ "Ġb ax"
673
  ]
674
  }
675
  }