nroggendorff committed on
Commit a65e8e7 · verified · 1 Parent(s): b5c212c

Update app.py

Files changed (1)
  1. app.py +0 -16
app.py CHANGED
@@ -1,5 +1,3 @@
-print('Importing goodies..')
-
 import gc
 
 import numpy as np
@@ -12,12 +10,8 @@ from datasets import load_dataset
 from tokenizers import ByteLevelBPETokenizer
 import trl
 
-print("Loading dataset..")
-
 dataset = load_dataset("nroggendorff/openhermes", split="train").select(range(int(1e+4)))
 
-print("Setting up tokenizer..")
-
 def get_training_corpus():
     for i in range(0, len(dataset), 1000):
         yield dataset[i : i + 1000]["text"]
@@ -64,8 +58,6 @@ tokenizer.save_pretrained("/tmp/llama-tokenizer")
 tokenizer = AutoTokenizer.from_pretrained("/tmp/llama-tokenizer")
 print(tokenizer.apply_chat_template([{"role": "user", "content": "Why is the sky blue?"}, {"role": "assistant", "content": "Due to rayleigh scattering."}, {"role": "user", "content": "That's cool."}, {"role": "assistant", "content": "Yeah, I agree."}], tokenize=False))
 
-print("Configuring..")
-
 config = LlamaConfig(
     vocab_size=tokenizer.vocab_size,
     hidden_size=int(512 / 1),
@@ -84,8 +76,6 @@ config = LlamaConfig(
 
 model = LlamaForCausalLM(config)
 
-print("Mapping dataset..")
-
 def format_prompts(examples):
     texts = []
     for text in examples['text']:
@@ -106,8 +96,6 @@ dataset = dataset.map(format_prompts, batched=True)
 
 print(dataset['text'][2])
 
-print("Defining trainer..")
-
 args = TrainingArguments(
     output_dir="mayo",
     num_train_epochs=1,
@@ -134,12 +122,8 @@ torch.cuda.set_device(0)
 gc.collect()
 torch.cuda.empty_cache()
 
-print("Training..")
-
 trainer.train()
 
-print("Pushing to hub..")
-
 #trainer.push_to_hub()
 trained_model = trainer.model
 trained_tokenizer = trainer.tokenizer