cpt core 4

scripts/cpt_core_model_4.py  +3 -11
@@ -55,7 +55,7 @@ model = FastLanguageModel.get_peft_model(
 
 
 #
-#
+# dataset
 #
 from datasets import Dataset
 from litdata import TokensLoader, StreamingDataset
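The renamed banner marks off the dataset section, whose imports show in the context lines. For orientation, a minimal sketch of how litdata's two pieces are typically combined; the path and block size below are invented placeholders, not values from the script:

from litdata import TokensLoader, StreamingDataset

# Hypothetical token source: `input_dir` and `block_size` are placeholders.
streaming_dataset = StreamingDataset(
    input_dir='data/tokens',
    item_loader=TokensLoader(block_size=2048),  # fixed-length blocks of token ids
)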
@@ -74,11 +74,10 @@ def unlsoth_generator():
         yield {'input_ids': batch}
 
 
-# train_dataset = Dataset.from_generator(unlsoth_generator, streaming=True)
 train_dataset = Dataset.from_generator(unlsoth_generator)
 
 #
-#
+# trainer
 #
 from trl import SFTTrainer
 from transformers import TrainingArguments
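`unlsoth_generator` adapts that stream into the records `datasets.Dataset.from_generator` consumes. Dropping the commented `streaming=True` variant loses nothing: `Dataset.from_generator` takes no such parameter in current `datasets` releases (`IterableDataset.from_generator` is the lazy counterpart). A sketch with the generator's loop body assumed:

from datasets import Dataset

def unlsoth_generator():
    # Assumed body: walk the litdata stream (`streaming_dataset` from the
    # sketch above) and emit one {'input_ids': ...} record per block.
    for batch in streaming_dataset:
        yield {'input_ids': batch}

# Eagerly materializes (and caches) the generated records.
train_dataset = Dataset.from_generator(unlsoth_generator)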
@@ -97,18 +96,12 @@ trainer = UnslothTrainer(
     packing=False, # Can make training 5x faster for short sequences.
 
     args = UnslothTrainingArguments(
-        # per_device_train_batch_size=16,
-        # gradient_accumulation_steps=64,
-        # per_device_train_batch_size=16,
-        # gradient_accumulation_steps=16,
         per_device_train_batch_size=1,
         # gradient_accumulation_steps=8,
 
         warmup_ratio=0,
         num_train_epochs=1,
 
-        # learning_rate=5e-5,
-        # embedding_learning_rate=5e-6,
         learning_rate = 5e-5,
         embedding_learning_rate = 5e-5 / 10.0,
 
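The learning-rate change is smaller than it looks: 5e-5 / 10.0 is exactly the 5e-6 of the deleted `# embedding_learning_rate=5e-6,` comment, now written as a ratio so the embedding rate tracks the main rate. Unsloth's continued-pretraining notes suggest an embedding/lm_head rate 2-10x below the main rate, which this satisfies:

# Quick check that the new expression reproduces the old constant.
learning_rate = 5e-5
embedding_learning_rate = learning_rate / 10.0
assert embedding_learning_rate == 5e-6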
@@ -116,8 +109,7 @@ trainer = UnslothTrainer(
         bf16=is_bfloat16_supported(),
         logging_steps=1,
         # optim='adamw_8bit',
-        optim='
-        # optim='adamw_torch_fused',
+        optim='adamw_torch_fused',
         weight_decay=0.01,
         lr_scheduler_type='cosine',
         seed=23,
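`adamw_torch_fused` is the stock `transformers` optimizer name that selects `torch.optim.AdamW(..., fused=True)`, whose fused kernels require CUDA; a CPU-only run would need plain `adamw_torch` instead. Folding all four hunks into their context lines, the trainer block after this commit plausibly reads as below; `model`, `tokenizer`, `max_seq_length`, and anything else outside the hunks are assumptions taken from the hunk headers, not visible code:

from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

trainer = UnslothTrainer(
    model=model,                    # from FastLanguageModel.get_peft_model(...) above
    tokenizer=tokenizer,            # assumed binding
    train_dataset=train_dataset,
    max_seq_length=max_seq_length,  # assumed binding
    packing=False,                  # can make training 5x faster for short sequences
    args=UnslothTrainingArguments(
        per_device_train_batch_size=1,
        warmup_ratio=0,
        num_train_epochs=1,
        learning_rate=5e-5,
        embedding_learning_rate=5e-5 / 10.0,
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim='adamw_torch_fused',  # fused AdamW: CUDA-only
        weight_decay=0.01,
        lr_scheduler_type='cosine',
        seed=23,
    ),
)

trainer.train()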