from unsloth import FastLanguageModel
import torch
from transformers import AutoTokenizer
max_seq_length = 16385
dtype = torch.bfloat16
load_in_4bit = False
model_name = '../out/pretrain-core-3/hf'
output_dir = '../out/cpt-core-4'
dataset_input_dir = '../core-data-4-8193-16385-16385-1000/'
dataset_block_size = 16385
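# Note: max_seq_length and dataset_block_size are both 16385 (16384 + 1),
# presumably so that each pre-tokenized block still provides 16384 tokens of
# context after the usual one-token shift for next-token-prediction labels.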
#
# model
#
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
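# FastLanguageModel.from_pretrained returns both the Unsloth-patched model and its
# tokenizer; with load_in_4bit=False the weights are loaded unquantized in bf16 (dtype above).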
# print(f'{model=}')
# print('Ignoring the tokenizer loaded by FastLanguageModel.from_pretrained; using AutoTokenizer.from_pretrained instead')
# tokenizer = AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True)
# print(f'{tokenizer=}')
model = FastLanguageModel.get_peft_model(
    model,
    r = 256, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    # r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj",
        "up_proj", "down_proj",
        "embed_tokens", "lm_head",
    ],
    lora_alpha = 32,
    # lora_alpha = 2,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none", # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    # use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    use_gradient_checkpointing = False,
    random_state = 3407,
    use_rslora = True, # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)
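# With use_rslora=True the adapter scaling is lora_alpha / sqrt(r) rather than
# lora_alpha / r, i.e. 32 / sqrt(256) = 2 here. Including embed_tokens and lm_head
# in target_modules makes the embeddings trainable, which is presumably what the
# embedding_learning_rate in UnslothTrainingArguments below is meant to control
# (continued-pretraining setup).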
# print(f'{model=}')
#
# dataset
#
from datasets import Dataset
from litdata import TokensLoader, StreamingDataset
litgpt_streaming_dataset = StreamingDataset(
    input_dir=dataset_input_dir,
    item_loader=TokensLoader(block_size=dataset_block_size),
)
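# StreamingDataset + TokensLoader reads the litdata-optimized shards from
# dataset_input_dir and yields fixed-length blocks of dataset_block_size token ids.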
def unsloth_generator():
    # Yield each fixed-size token block as a row the HF Dataset / trainer understands.
    for batch in litgpt_streaming_dataset:
        yield {'input_ids': batch}
# train_dataset = Dataset.from_generator(unsloth_generator, streaming=True)
train_dataset = Dataset.from_generator(unsloth_generator)
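# Dataset.from_generator materializes the streamed blocks into an Arrow-backed
# dataset, so each row is just {'input_ids': [...]}; the trainer's default collator
# is expected to derive the causal-LM labels from input_ids.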
#
# trainer
#
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments
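# UnslothTrainer / UnslothTrainingArguments extend TRL's SFTTrainer with an extra
# embedding_learning_rate, so embed_tokens / lm_head can train at a lower learning
# rate than the LoRA adapters (used below).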
trainer = UnslothTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    # dataset_text_field='text',
    max_seq_length=max_seq_length,
    dataset_num_proc=32,
    max_steps=len(litgpt_streaming_dataset),
    packing=False, # Can make training 5x faster for short sequences.
    args = UnslothTrainingArguments(
        # per_device_train_batch_size=16,
        # gradient_accumulation_steps=64,
        # per_device_train_batch_size=16,
        # gradient_accumulation_steps=16,
        per_device_train_batch_size=1,
        # gradient_accumulation_steps=8,
        warmup_ratio=0,
        num_train_epochs=1,
        # learning_rate=5e-5,
        # embedding_learning_rate=5e-6,
        learning_rate = 5e-5,
        embedding_learning_rate = 5e-5 / 10.0,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        # optim='adamw_8bit',
        optim='adamw_torch',
        # optim='adamw_torch_fused',
        weight_decay=0.01,
        lr_scheduler_type='cosine',
        seed=23,
        output_dir=output_dir,
        report_to='wandb',
    ),
)
trainer_stats = trainer.train()
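# After training, the LoRA adapters (and trained embeddings) could be saved, e.g.:
# model.save_pretrained(output_dir)      # adapters only
# tokenizer.save_pretrained(output_dir)
# (left commented out; saving/merging is presumably handled in a separate step)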