cpt core 4
scripts/cpt_core_model_4.py
CHANGED
@@ -7,15 +7,14 @@ from transformers import AutoTokenizer
 os.environ['WANDB_PROJECT'] = 'tangled-alpha-0.9-core'
 run_name = 'cpt-core-4'
 
+dataset_input_dir = '../core-data-4-8193-16385-16385-1000/'
+dataset_block_size = 16385
 max_seq_length = 16385
 dtype = torch.bfloat16
 load_in_4bit = False
 model_name = '../out/pretrain-core-3/hf'
 output_dir = '../out/cpt-core-4'
 
-dataset_input_dir = '../core-data-4-8193-16385-16385-1000/'
-dataset_block_size = 16385
-
 #
 # model
 #
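These settings are consumed further down the script. As a minimal sketch of the usual Unsloth pattern (the loading call itself is outside this diff, so its exact form here is an assumption):

import os

import torch
from unsloth import FastLanguageModel

os.environ['WANDB_PROJECT'] = 'tangled-alpha-0.9-core'
run_name = 'cpt-core-4'

dataset_input_dir = '../core-data-4-8193-16385-16385-1000/'
dataset_block_size = 16385
max_seq_length = 16385
dtype = torch.bfloat16
load_in_4bit = False
model_name = '../out/pretrain-core-3/hf'
output_dir = '../out/cpt-core-4'

# Assumed loading step: Unsloth's FastLanguageModel.from_pretrained takes
# exactly these keyword arguments in its documented examples; the real
# script may load the model differently.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)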
@@ -44,7 +43,7 @@ model = FastLanguageModel.get_peft_model(
     # lora_alpha=32,
     lora_alpha=16,
     lora_dropout=0, # Supports any, but = 0 is optimized
-    bias=
+    bias='none', # Supports any, but = "none" is optimized
     # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
     # use_gradient_checkpointing="unsloth", # True or "unsloth" for very long context
     use_gradient_checkpointing=False,
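The hunk header places these lines inside FastLanguageModel.get_peft_model(...). A hedged reconstruction of that call follows, assuming Unsloth's stock example values for the arguments the diff does not show; r and target_modules are assumptions, and only the lora_alpha, lora_dropout, bias, and use_gradient_checkpointing lines are confirmed above.

# Hypothetical reconstruction of the enclosing call; the diff confirms only
# the keyword arguments below the target_modules line.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # assumption: common Unsloth example value, not shown in the diff
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj',
                    'gate_proj', 'up_proj', 'down_proj'],  # assumption
    # lora_alpha=32,
    lora_alpha=16,
    lora_dropout=0, # Supports any, but = 0 is optimized
    bias='none', # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    # use_gradient_checkpointing="unsloth", # True or "unsloth" for very long context
    use_gradient_checkpointing=False,
)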