nroggendorff committed on
Commit
4f9862c
·
verified ·
1 Parent(s): 53e45ad

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -2
app.py CHANGED
@@ -3,7 +3,7 @@ import os
3
  import torch
4
  import trl
5
 
6
- from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, TrainingArguments
7
  from datasets import load_dataset
8
  from tokenizers import ByteLevelBPETokenizer
9
 
@@ -28,7 +28,8 @@ def create_tokenizer(training_corpus):
28
  min_frequency=2,
29
  special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>", "<|user|>", "<|bot|>", "<|end|>"]
30
  )
31
- return tokenizer
 
32
 
33
  def get_training_corpus(dataset):
34
  for i in range(0, len(dataset), 1000):
 
3
  import torch
4
  import trl
5
 
6
+ from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, TrainingArguments, PreTrainedTokenizerFast
7
  from datasets import load_dataset
8
  from tokenizers import ByteLevelBPETokenizer
9
 
 
28
  min_frequency=2,
29
  special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>", "<|user|>", "<|bot|>", "<|end|>"]
30
  )
31
+
32
+ return PreTrainedTokenizerFast(tokenizer_object=tokenizer)
33
 
34
  def get_training_corpus(dataset):
35
  for i in range(0, len(dataset), 1000):