mtasic85 committed
Commit d1bfa72 · 1 Parent(s): 282021b

cpt core 4

Files changed (1)
scripts/cpt_core_model_4.py  +3 -11
scripts/cpt_core_model_4.py CHANGED
@@ -55,7 +55,7 @@ model = FastLanguageModel.get_peft_model(
 
 
 #
-#
+# dataset
 #
 from datasets import Dataset
 from litdata import TokensLoader, StreamingDataset
@@ -74,11 +74,10 @@ def unlsoth_generator():
         yield {'input_ids': batch}
 
 
-# train_dataset = Dataset.from_generator(unlsoth_generator, streaming=True)
 train_dataset = Dataset.from_generator(unlsoth_generator)
 
 #
-#
+# trainer
 #
 from trl import SFTTrainer
 from transformers import TrainingArguments
@@ -97,18 +96,12 @@ trainer = UnslothTrainer(
     packing=False, # Can make training 5x faster for short sequences.
 
     args = UnslothTrainingArguments(
-        # per_device_train_batch_size=16,
-        # gradient_accumulation_steps=64,
-        # per_device_train_batch_size=16,
-        # gradient_accumulation_steps=16,
         per_device_train_batch_size=1,
         # gradient_accumulation_steps=8,
 
         warmup_ratio=0,
         num_train_epochs=1,
 
-        # learning_rate=5e-5,
-        # embedding_learning_rate=5e-6,
         learning_rate = 5e-5,
         embedding_learning_rate = 5e-5 / 10.0,
 
@@ -116,8 +109,7 @@ trainer = UnslothTrainer(
         bf16=is_bfloat16_supported(),
         logging_steps=1,
         # optim='adamw_8bit',
-        optim='adamw_torch',
-        # optim='adamw_torch_fused',
+        optim='adamw_torch_fused',
         weight_decay=0.01,
         lr_scheduler_type='cosine',
         seed=23,
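
For context on the dataset hunks: the script builds train_dataset by streaming pre-tokenized blocks out of litdata and wrapping them with Dataset.from_generator. The sketch below is a minimal reconstruction of that pattern, not the author's exact code; the input directory and block size are placeholders, and only unlsoth_generator and the imports are taken from the diff. Note that datasets.Dataset.from_generator has no streaming parameter (a streaming generator would go through IterableDataset.from_generator instead), which is presumably why the commented-out streaming=True variant was deleted in this commit.

from datasets import Dataset
from litdata import TokensLoader, StreamingDataset

# Placeholder source of pre-tokenized data; the real directory and block size
# are defined elsewhere in scripts/cpt_core_model_4.py.
streaming_dataset = StreamingDataset(
    input_dir='/path/to/tokenized-data',
    item_loader=TokensLoader(block_size=1024 + 1),  # fixed-size blocks of token ids
)

def unlsoth_generator():
    # Yield one block of token ids per example.
    for batch in streaming_dataset:
        yield {'input_ids': batch}

# Materializes (and caches) the examples as a regular map-style Dataset.
train_dataset = Dataset.from_generator(unlsoth_generator)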
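And for the trainer hunks: after this commit the training arguments settle on a per-device batch size of 1, a cosine schedule, a base learning rate of 5e-5 with the embedding learning rate at one tenth of that, and the fused AdamW optimizer (adamw_torch_fused) in place of adamw_torch. The sketch below shows roughly how those pieces fit together; model, tokenizer, max_seq_length, and output_dir are assumed placeholders, since the trainer's other constructor arguments are not visible in the diff.

from unsloth import UnslothTrainer, UnslothTrainingArguments, is_bfloat16_supported

trainer = UnslothTrainer(
    model=model,                    # assumed: the PEFT model from FastLanguageModel.get_peft_model
    tokenizer=tokenizer,            # assumed
    train_dataset=train_dataset,
    max_seq_length=max_seq_length,  # assumed: defined earlier in the script
    packing=False,  # Can make training 5x faster for short sequences.

    args=UnslothTrainingArguments(
        per_device_train_batch_size=1,
        # gradient_accumulation_steps=8,

        warmup_ratio=0,
        num_train_epochs=1,

        learning_rate=5e-5,
        embedding_learning_rate=5e-5 / 10.0,  # embeddings train at one tenth of the base rate

        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim='adamw_torch_fused',  # this commit's optimizer choice
        weight_decay=0.01,
        lr_scheduler_type='cosine',
        seed=23,
        output_dir='output',        # assumed; not shown in the diff
    ),
)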