alvarodt committed
Commit 3976d9f · 1 Parent(s): a1fce5f

Upload tokenizer

merges.txt CHANGED
@@ -49735,10 +49735,3 @@ alra shidiya
 ĠBan que
 ĠJas im
 Wear house
-ĠPriv é
-97150443 0584
-Ġkal eej
-Ġproperty finder
-ĠBri ghton
-Can ary
-imm i
special_tokens_map.json CHANGED
@@ -1,5 +1,9 @@
 {
-  "bos_token": "<|endoftext|>",
-  "eos_token": "<|endoftext|>",
-  "unk_token": "<|endoftext|>"
+  "bos_token": "<|BOS|>",
+  "cls_token": "<|CLS|>",
+  "eos_token": "<|EOS|>",
+  "mask_token": "<|MASK|>",
+  "pad_token": "<|PAD|>",
+  "sep_token": "<|SEP|>",
+  "unk_token": "<|UNK|>"
 }
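
Taken together with the merges.txt hunk above, the seven special tokens added here line up with the seven trailing merges that were dropped, which suggests the vocabulary size was kept constant by retiring the last BPE entries. A minimal sketch of how a map like this could be produced (the token strings come from this diff; the base checkpoint matches `name_or_path` in tokenizer_config.json, and the output directory is a hypothetical placeholder):

```python
# Sketch, not the author's actual script: register the special tokens from
# this commit on the base gpt2 tokenizer, then save the resulting files.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({
    "bos_token": "<|BOS|>",
    "cls_token": "<|CLS|>",
    "eos_token": "<|EOS|>",
    "mask_token": "<|MASK|>",
    "pad_token": "<|PAD|>",
    "sep_token": "<|SEP|>",
    "unk_token": "<|UNK|>",
})
# save_pretrained writes special_tokens_map.json, tokenizer_config.json,
# tokenizer.json, vocab.json and merges.txt -- the files in this commit.
tokenizer.save_pretrained("./tokenizer")  # hypothetical output path
```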
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,10 +1,14 @@
 {
   "add_prefix_space": false,
-  "bos_token": "<|endoftext|>",
-  "eos_token": "<|endoftext|>",
+  "bos_token": "<|BOS|>",
+  "cls_token": "<|CLS|>",
+  "eos_token": "<|EOS|>",
+  "mask_token": "<|MASK|>",
   "model_max_length": 1024,
   "name_or_path": "gpt2",
+  "pad_token": "<|PAD|>",
+  "sep_token": "<|SEP|>",
   "special_tokens_map_file": null,
   "tokenizer_class": "GPT2Tokenizer",
-  "unk_token": "<|endoftext|>"
+  "unk_token": "<|UNK|>"
 }
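
As a quick sanity check, a reloaded tokenizer should reflect the values shown in the two JSON diffs (again a sketch; the local path is hypothetical):

```python
# Sketch: reload the saved tokenizer and confirm the values from the diffs.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./tokenizer")  # hypothetical local path
assert tok.bos_token == "<|BOS|>"
assert tok.pad_token == "<|PAD|>"
assert tok.model_max_length == 1024  # per tokenizer_config.json
```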
vocab.json CHANGED
The diff for this file is too large to render. See raw diff