nroggendorff committed on
Commit
c2f601d
·
verified ·
1 Parent(s): dfffe28

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -5
app.py CHANGED
@@ -29,8 +29,9 @@ def create_tokenizer(training_corpus):
29
  min_frequency=2,
30
  special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>", "<|user|>", "<|bot|>", "<|end|>"]
31
  )
32
-
33
- return PreTrainedTokenizerFast(tokenizer_object=tokenizer)
 
34
 
35
  def get_training_corpus(dataset):
36
  for i in range(0, len(dataset), 1000):
@@ -48,9 +49,7 @@ def format_prompts(examples, tokenizer):
48
  conversation.append({"role": "assistant", "content": response})
49
  formatted_conversation = tokenizer.apply_chat_template(conversation, tokenize=False)
50
  texts.append(formatted_conversation)
51
- output = {}
52
- output['text'] = texts
53
- return output
54
 
55
  def create_model(tokenizer, factor):
56
  config = LlamaConfig(
 
29
  min_frequency=2,
30
  special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>", "<|user|>", "<|bot|>", "<|end|>"]
31
  )
32
+
33
+ fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
34
+ return fast_tokenizer
35
 
36
  def get_training_corpus(dataset):
37
  for i in range(0, len(dataset), 1000):
 
49
  conversation.append({"role": "assistant", "content": response})
50
  formatted_conversation = tokenizer.apply_chat_template(conversation, tokenize=False)
51
  texts.append(formatted_conversation)
52
+ return {"text": texts}
 
 
53
 
54
  def create_model(tokenizer, factor):
55
  config = LlamaConfig(