from unsloth import FastLanguageModel
import torch
# from transformers import AutoTokenizer

max_seq_length = 16384
dtype = torch.bfloat16
load_in_4bit = True
model_name = '../out/pretrain-core-3/hf'
output_dir = '../out/cpt-core-4'

dataset_input_dir = '../core-data-4-8193-16385-16385-1000/'
dataset_block_size = 16385
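# Note: dataset_block_size is max_seq_length + 1, presumably so that each
# streamed block provides both the inputs and the shifted next-token targets.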

#
# model
#
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

print(f'{model=}')

# print('Ignore loaded tokenizer by FastLanguageModel.from_pretrained and using AutoTokenizer.from_pretrained')
# tokenizer = AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True)

# print(f'{tokenizer=}')

model = FastLanguageModel.get_peft_model(
    model,
    r = 256, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj",
                        "up_proj", "down_proj",
                        "embed_tokens", "lm_head",],
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)
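# Including embed_tokens and lm_head in target_modules follows Unsloth's
# continued-pretraining recipe: the embedding matrices are trained as well
# (at the smaller embedding_learning_rate set in the trainer config below),
# which helps the model absorb new domain vocabulary rather than only
# adapting the attention/MLP projections.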

print(f'{model=}')

'''
from datasets import concatenate_datasets
from cpt_base_datasets import cpt_base_datasets
from cpt_instruct_datasets import cpt_instruct_datasets
from unsloth_utils import load_text_dataset, load_chat_dataset

core_datasets = []

for dataset_config in cpt_base_datasets:
    dataset = load_text_dataset(tokenizer, **dataset_config)
    print(f'{dataset=}')
    core_datasets.append(dataset)

# for dataset_config in cpt_instruct_datasets:
#     dataset = load_chat_dataset(tokenizer, **dataset_config)
#     print(f'{dataset=}')
#     core_datasets.append(dataset)

final_dataset = concatenate_datasets(core_datasets)
print(f'{final_dataset=}')
'''
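# The HF-datasets pipeline above is kept for reference; the active path below
# streams pre-tokenized blocks with litdata instead.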

from litdata import TokensLoader, StreamingDataset

dataset = StreamingDataset(
    input_dir=dataset_input_dir,
    item_loader=TokensLoader(block_size=dataset_block_size),
)
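# Each item streamed by TokensLoader should be a 1-D tensor of
# dataset_block_size token ids, read from the pre-tokenized litdata shards
# in dataset_input_dir.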


def unsloth_generator(dataset):
    # Adapt litdata's streamed blocks to the dict format expected downstream.
    # Assumption: TokensLoader yields flat token-id tensors; if items are
    # dicts instead, use batch['input_ids'] here.
    for batch in dataset:
        yield {
            'input_ids': batch.tolist()  # Convert tensor to list
        }


# Quick sanity check: inspect a single streamed block.
for n in unsloth_generator(dataset):
    print(n)
    break

'''
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments


trainer = UnslothTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=final_dataset,
    dataset_text_field='text',
    max_seq_length=max_seq_length,
    dataset_num_proc=32,

    args = UnslothTrainingArguments(
        per_device_train_batch_size=8,
        gradient_accumulation_steps=8,

        warmup_ratio=0.1,
        num_train_epochs=1,

        learning_rate=5e-5,
        embedding_learning_rate=5e-6,

        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim='adamw_8bit',
        weight_decay=0.01,
        lr_scheduler_type='cosine',
        seed=23,
        output_dir=output_dir,
        report_to='wandb',
    ),
)

trainer_stats = trainer.train()
'''
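# Note: the commented-out trainer above still points at final_dataset from the
# HF-datasets pipeline; it would need to be wired to the litdata stream (e.g.
# via unsloth_generator) before being re-enabled. Its effective batch size
# would be per_device_train_batch_size * gradient_accumulation_steps = 8 * 8
# = 64 sequences of up to max_seq_length tokens per optimizer step (per device).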