File size: 4,514 Bytes
0f9e3cc
 
 
 
 
 
 
 
 
 
 
f6687d9
0f9e3cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1f2ceb7
0f9e3cc
1f2ceb7
0f9e3cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47875c3
0f9e3cc
47875c3
0f9e3cc
 
 
 
c26b20b
 
 
 
0f9e3cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c26b20b
0f9e3cc
b39abab
0f9e3cc
 
 
 
aa4a704
0f9e3cc
 
 
 
 
 
 
 
f970cca
0f9e3cc
 
 
 
 
 
 
315ce9c
0f9e3cc
 
 
 
 
 
 
f6687d9
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
from transformers import TrainingArguments, AutoConfig, AutoTokenizer, AutoModelForCausalLM
import numpy as np
from transformers import LlamaConfig, LlamaForCausalLM
import trl
import torch
from datasets import load_dataset
from transformers import PreTrainedTokenizerFast
import requests as rq
import gc
from tokenizers import ByteLevelBPETokenizer

# Load the "train" split, then keep only rows 0..99,999 for this run.
dataset = load_dataset("nroggendorff/openhermes", split="train")
dataset = dataset.select(range(100_000))

def get_training_corpus():
    """Lazily yield the ``"text"`` column of the module-level ``dataset``.

    The rows are walked front to back in consecutive slices of 1000, and each
    yielded item is whatever ``dataset[start:stop]["text"]`` evaluates to for
    that slice. Nothing is read from ``dataset`` until the generator is
    first advanced.
    """
    rows_per_chunk = 1000
    total_rows = len(dataset)
    start = 0
    while start < total_rows:
        stop = start + rows_per_chunk
        yield dataset[start:stop]["text"]
        start = stop

# --- Tokenizer -------------------------------------------------------------
# Train a byte-level BPE tokenizer on the dataset text, wrap it as a
# transformers fast tokenizer, attach the special tokens and the chat
# template, then round-trip it through disk.

training_corpus = get_training_corpus()

tokenizer = ByteLevelBPETokenizer()

tokenizer.train_from_iterator(
    training_corpus,
    vocab_size=3200,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>", "<|user|>", "<|bot|>", "<|end|>"],
)

tokenizer.save("/tmp/custom_tokenizer.json")

# From here on `tokenizer` is the transformers wrapper around the saved file.
tokenizer = PreTrainedTokenizerFast(tokenizer_file="/tmp/custom_tokenizer.json")

tokenizer.bos_token = "<s>"
tokenizer.eos_token = "</s>"
tokenizer.unk_token = "<unk>"
tokenizer.pad_token = "<pad>"
tokenizer.mask_token = "<mask>"

# Chat layout: BOS, then one "<|user|>\n...<|end|>\n" or "<|bot|>\n...<|end|>\n"
# section per message, then EOS. Roles must strictly alternate starting with
# "user"; any other role makes the template raise. The adjacent literals
# below concatenate into a single template string.
chat_template = (
    "{{bos_token}}"
    "{% for message in messages %}"
    "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}"
    "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}"
    "{% endif %}"
    "{% if message['role'] == 'user' %}"
    "{{ '<|user|>\n' + message['content'] + '<|end|>\n' }}"
    "{% elif message['role'] == 'assistant' %}"
    "{{ '<|bot|>\n' + message['content'] + '<|end|>\n' }}"
    "{% else %}"
    "{{ raise_exception('Only user and assistant roles are supported!') }}"
    "{% endif %}"
    "{% endfor %}"
    "{{ eos_token }}"
)

tokenizer.chat_template = chat_template

# Register the chat markers once, through the tokenizer API.
tokenizer.add_special_tokens({
    "additional_special_tokens": ["<|user|>", "<|bot|>", "<|end|>"]
})

tokenizer.save_pretrained("/tmp/llama-tokenizer")

tokenizer = AutoTokenizer.from_pretrained("/tmp/llama-tokenizer")

# Convenience ids for the chat markers. They are assigned after the reload
# so that they live on the tokenizer object the rest of the script uses.
# NOTE(review): ad-hoc attributes set before save_pretrained() are not
# expected to be restored by from_pretrained() - confirm.
tokenizer.user_token_id = tokenizer.convert_tokens_to_ids("<|user|>")
tokenizer.assistant_token_id = tokenizer.convert_tokens_to_ids("<|bot|>")

# Smoke test: render a four-message conversation with the reloaded tokenizer.
print(
    tokenizer.apply_chat_template(
        [
            {"role": "user", "content": "Why is the sky blue?"},
            {"role": "assistant", "content": "Due to rayleigh scattering."},
            {"role": "user", "content": "That's cool."},
            {"role": "assistant", "content": "Yeah, I agree."},
        ],
        tokenize=False,
    )
)

# --- Model -----------------------------------------------------------------
# A randomly initialised Llama-architecture model sized for the custom
# tokenizer: 8 layers, 8 attention heads, hidden size 512.
config = LlamaConfig(
    # len(tokenizer) counts added tokens as well as the base vocabulary, so
    # the embedding table covers every id the tokenizer can emit.
    vocab_size=len(tokenizer),
    hidden_size=512,
    intermediate_size=1024,
    num_hidden_layers=8,
    num_attention_heads=8,
    max_position_embeddings=512,
    rms_norm_eps=1e-6,
    initializer_range=0.02,
    use_cache=True,
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    tie_word_embeddings=False,
)

model = LlamaForCausalLM(config)

def format_prompts(examples):
    """Re-render a batch of raw conversation strings with the chat template.

    Each string in ``examples['text']`` is split on ``'<|end|>'``. Consecutive
    segments are paired as (user, assistant) turns; a trailing segment with no
    partner is ignored. ``"<|user|>"`` is removed from the user segment and
    ``"<|bot|>"`` from the assistant segment before the turns are passed to
    the module-level ``tokenizer``'s ``apply_chat_template`` with
    ``tokenize=False``.

    Args:
        examples: Batch mapping whose ``'text'`` entry is an iterable of
            conversation strings.

    Returns:
        A dict with a single ``'text'`` key holding the rendered
        conversations, in input order.
    """
    rendered = []
    for raw_text in examples['text']:
        segments = raw_text.split('<|end|>')
        messages = []
        # Even-indexed segments are user turns, odd-indexed ones the replies.
        for user_part, bot_part in zip(segments[0::2], segments[1::2]):
            messages.extend((
                {"role": "user", "content": user_part.replace("<|user|>", "")},
                {"role": "assistant", "content": bot_part.replace("<|bot|>", "")},
            ))
        rendered.append(tokenizer.apply_chat_template(messages, tokenize=False))
    return {'text': rendered}

# --- Data preparation ------------------------------------------------------
# Rewrite every row's "text" through the chat template, batch by batch.
dataset = dataset.map(format_prompts, batched=True)

# Show one formatted sample. Indexing the row first reads only that row
# instead of materialising the whole "text" column to pick one entry.
print(dataset[2]['text'])

# --- Training --------------------------------------------------------------
# 16 sequences per device x 4 accumulation steps = 64 sequences per
# optimizer step on each device.
args = TrainingArguments(
    output_dir="mayo",
    num_train_epochs=2,
    gradient_accumulation_steps=4,
    per_device_train_batch_size=16,
    learning_rate=1e-5,
    save_steps=100000,
    fp16=True,
    optim="sgd",
    # NOTE(review): optim_target_modules is presumably only consulted by
    # optimizers that target specific layers; confirm it has any effect
    # together with optim="sgd".
    optim_target_modules=["attn", "mlp"]
)

trainer = trl.SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=dataset,
    dataset_text_field='text',
    max_seq_length=512
)

# Select GPU 0 and release unreferenced memory before the training loop.
torch.cuda.set_device(0)

gc.collect()
torch.cuda.empty_cache()

trainer.train()

# --- Upload ----------------------------------------------------------------
# Push the trained weights and the tokenizer to the same Hub repository.
trained_model = trainer.model
trained_tokenizer = trainer.tokenizer

repo_id = "makeshift-mayo"
trained_model.push_to_hub(repo_id)
trained_tokenizer.push_to_hub(repo_id)

# The script always ends by raising once both uploads have returned.
# NOTE(review): presumably a deliberate way to halt the hosting runtime
# after a successful run - confirm before replacing with a clean exit.
raise RuntimeError("The script was finished.")