File size: 3,096 Bytes
734e414
 
3bd4051
734e414
7224ded
734e414
 
2d440bc
 
734e414
c5afc4e
 
 
 
 
 
734e414
 
 
 
 
 
 
2d440bc
 
 
 
734e414
 
 
2d440bc
b394167
 
 
 
 
 
2d440bc
 
 
734e414
2d440bc
 
 
 
734e414
 
 
 
 
3bd4051
c5afc4e
 
3bd4051
 
c5afc4e
 
 
 
3bd4051
 
c5afc4e
3bd4051
 
c5afc4e
 
4f4772d
 
3bd4051
 
734e414
 
 
 
 
 
 
 
 
3bd4051
eef5b70
734e414
 
756b2ff
6ffe1e7
734e414
 
eef5b70
 
734e414
eef5b70
734e414
 
eef5b70
 
 
 
734e414
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from unsloth import FastLanguageModel
import torch
from transformers import AutoTokenizer

# --- paths ---
# Checkpoint directory given to FastLanguageModel.from_pretrained.
model_name = '../out/pretrain-core-3/hf'
# Directory handed to the training arguments as `output_dir`.
output_dir = '../out/cpt-core-4'
# Directory given to the litdata StreamingDataset as `input_dir`.
dataset_input_dir = '../core-data-4-8193-16385-16385-1000/'

# --- sequence sizes ---
# Trainer/model sequence length and the TokensLoader block size (same value).
max_seq_length = 16385
dataset_block_size = 16385

# --- model loading options ---
load_in_4bit = True
dtype = torch.bfloat16

#
# model
#
# Load the base checkpoint; the call returns the model together with its
# tokenizer. All settings come from the configuration constants above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name, max_seq_length=max_seq_length,
    dtype=dtype, load_in_4bit=load_in_4bit,
)
# Dump the loaded model's repr so the architecture shows up in the run log.
print(f'{model=}')

# print('Ignore loaded tokenizer by FastLanguageModel.from_pretrained and using AutoTokenizer.from_pretrained')
# tokenizer = AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True)
# print(f'{tokenizer=}')

# Wrap the loaded model with LoRA adapters. `model` is rebound to the wrapped
# model returned by get_peft_model.
model = FastLanguageModel.get_peft_model(
    model,
    # Adapter rank and scaling; rank-stabilized LoRA is switched on.
    r=256,
    lora_alpha=32,
    use_rslora=True,
    # Per the upstream notes, dropout 0 and bias "none" are the optimized
    # settings (other values are supported).
    lora_dropout=0,
    bias="none",
    # Modules that receive adapters, listed by module name.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj",
        "up_proj", "down_proj",
        "embed_tokens", "lm_head",
    ],
    # "unsloth" checkpointing mode (upstream: lower VRAM use, meant for very
    # long context); `True` is the other accepted value.
    use_gradient_checkpointing="unsloth",
    loftq_config=None,  # LoftQ not used
    random_state=3407,
)

# Print again so the adapter-wrapped structure is visible in the log.
print(f'{model=}')


from datasets import Dataset
from litdata import TokensLoader, StreamingDataset


# Streaming dataset over `dataset_input_dir`, read through a TokensLoader
# configured with `dataset_block_size` as its block size.
litgpt_streaming_dataset = StreamingDataset(
    item_loader=TokensLoader(block_size=dataset_block_size),
    input_dir=dataset_input_dir,
)

def unlsoth_generator():
    """Yield one ``{'input_ids': item}`` record per streaming-dataset item.

    Iterates the module-level ``litgpt_streaming_dataset`` in order and wraps
    each item, unchanged, in a single-key dict.
    """
    yield from (
        {'input_ids': token_block}
        for token_block in litgpt_streaming_dataset
    )


# Build the training Dataset from the records yielded by `unlsoth_generator`.
# (The `streaming=True` variant of this call is left disabled.)
train_dataset = Dataset.from_generator(generator=unlsoth_generator)


from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments


# Build the trainer over the `input_ids` dataset and run training.
#
# `max_steps` is deliberately NOT set. It is a training-arguments field that
# counts optimizer steps (not samples) and overrides `num_train_epochs`.
# Setting it to the number of dataset items, with 16 * 64 = 1024 sequences
# consumed per step per device, would run far beyond the single epoch
# requested below. `train_dataset` is a sized Dataset, so `num_train_epochs`
# alone determines the run length.
trainer = UnslothTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    # dataset_text_field='text',  # not set: records only carry 'input_ids'
    max_seq_length=max_seq_length,
    dataset_num_proc=32,
    packing=False,

    args=UnslothTrainingArguments(
        # 16 sequences per device x 64 accumulation steps per optimizer step.
        per_device_train_batch_size=16,
        gradient_accumulation_steps=64,

        # One pass over the dataset, no warmup.
        warmup_ratio=0,
        num_train_epochs=1,

        # Separate learning rates: 1e-4 as the main rate, 2.5e-5 for
        # `embedding_learning_rate`.
        learning_rate=5e-5 * 2,
        embedding_learning_rate=5e-5 / 2,

        # Use bf16 when the hardware supports it, otherwise fall back to fp16.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),

        optim='adamw_8bit',
        weight_decay=0.01,
        lr_scheduler_type='cosine',
        seed=23,

        logging_steps=1,
        output_dir=output_dir,
        report_to='wandb',
    ),
)

# Run training; the returned stats object is kept for later inspection.
trainer_stats = trainer.train()