nroggendorff committed (verified)
Commit b5c212c · 1 Parent(s): 68b1899

Update app.py

Files changed (1):
  1. app.py +18 -2
app.py CHANGED
@@ -1,3 +1,5 @@
+print('Importing goodies..')
+
 import gc
 
 import numpy as np
@@ -10,8 +12,12 @@ from datasets import load_dataset
 from tokenizers import ByteLevelBPETokenizer
 import trl
 
+print("Loading dataset..")
+
 dataset = load_dataset("nroggendorff/openhermes", split="train").select(range(int(1e+4)))
 
+print("Setting up tokenizer..")
+
 def get_training_corpus():
     for i in range(0, len(dataset), 1000):
         yield dataset[i : i + 1000]["text"]
@@ -58,6 +64,8 @@ tokenizer.save_pretrained("/tmp/llama-tokenizer")
 tokenizer = AutoTokenizer.from_pretrained("/tmp/llama-tokenizer")
 print(tokenizer.apply_chat_template([{"role": "user", "content": "Why is the sky blue?"}, {"role": "assistant", "content": "Due to rayleigh scattering."}, {"role": "user", "content": "That's cool."}, {"role": "assistant", "content": "Yeah, I agree."}], tokenize=False))
 
+print("Configuring..")
+
 config = LlamaConfig(
     vocab_size=tokenizer.vocab_size,
     hidden_size=int(512 / 1),
@@ -76,6 +84,8 @@ config = LlamaConfig(
 
 model = LlamaForCausalLM(config)
 
+print("Mapping dataset..")
+
 def format_prompts(examples):
     texts = []
     for text in examples['text']:
@@ -96,6 +106,8 @@ dataset = dataset.map(format_prompts, batched=True)
 
 print(dataset['text'][2])
 
+print("Defining trainer..")
+
 args = TrainingArguments(
     output_dir="mayo",
     num_train_epochs=1,
@@ -122,8 +134,12 @@ torch.cuda.set_device(0)
 gc.collect()
 torch.cuda.empty_cache()
 
+print("Training..")
+
 trainer.train()
-
+
+print("Pushing to hub..")
+
 #trainer.push_to_hub()
 trained_model = trainer.model
 trained_tokenizer = trainer.tokenizer
@@ -132,4 +148,4 @@ repo_id = "makeshift-mayo"
 trained_model.push_to_hub(repo_id)
 trained_tokenizer.push_to_hub(repo_id)
 
-raise RuntimeError("The script was finished.")
+raise RuntimeError("The script is finished.")
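The hunks above only show the edges of the tokenizer setup: the training of the ByteLevelBPETokenizer between the "Setting up tokenizer.." print and the tokenizer.save_pretrained("/tmp/llama-tokenizer") context line is outside the diff. A minimal sketch of what that elided step could look like, assuming get_training_corpus() is fed to train_from_iterator and the result is wrapped so AutoTokenizer can reload it; the vocabulary size, special tokens, and PreTrainedTokenizerFast wrapping below are illustrative assumptions, not code from this commit:

from tokenizers import ByteLevelBPETokenizer
from transformers import PreTrainedTokenizerFast

# Illustrative sketch only; the tokenizer-training lines of app.py are not part of this diff.
bpe = ByteLevelBPETokenizer()
bpe.train_from_iterator(
    get_training_corpus(),                    # generator shown in the hunk above
    vocab_size=32000,                         # assumed value, not taken from the commit
    special_tokens=["<s>", "</s>", "<unk>"],  # assumed special tokens
)
bpe.save("/tmp/bpe-tokenizer.json")

# Wrap the trained BPE model so AutoTokenizer.from_pretrained() can load it back,
# as the context line tokenizer.save_pretrained("/tmp/llama-tokenizer") implies.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="/tmp/bpe-tokenizer.json",
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
)
tokenizer.save_pretrained("/tmp/llama-tokenizer")

The later call to apply_chat_template suggests the elided section also attaches a chat template to the tokenizer; that part is not reconstructed here.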