# train_llama4.py
# Script to fine-tune Llama 4 Maverick for healthcare fraud detection (text-only with CPU offloading)

from transformers import AutoTokenizer, Llama4ForConditionalGeneration, Trainer, TrainingArguments
from transformers import BitsAndBytesConfig
import datasets
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from accelerate import Accelerator
import huggingface_hub
import os

# Version and CUDA check
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Is CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

# Authenticate with Hugging Face
LLama = os.getenv("LLama")
if not LLama:
    raise ValueError("LLama token not found. Set it in Hugging Face Space secrets as 'LLama'.")
huggingface_hub.login(token=LLama)

# Load Llama 4 model and tokenizer
MODEL_ID = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Add a padding token if one isn't defined. Reusing the EOS token avoids growing
# the vocabulary, which would otherwise require resizing the model's embeddings.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# CPU offloading setup. Accelerate's device_map does not accept layer ranges such
# as "model.layers.0-15", and exact module names vary between model classes, so use
# "auto" placement and cap GPU usage with a max_memory budget; anything over the
# budget spills to CPU. Budgets below assume a single 80 GB A100; adjust as needed.
device_map = "auto"
max_memory = {0: "70GiB", "cpu": "200GiB"}

# 8-bit quantization. fp32 CPU offload must be enabled inside the BitsAndBytesConfig
# (not passed to from_pretrained) for the offloaded modules to work with bitsandbytes.
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True
)

model = Llama4ForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map=device_map,
    max_memory=max_memory,
    quantization_config=quantization_config,
    attn_implementation="flex_attention"
)
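# Optional sanity check: report the quantized model's weight footprint (via the
# standard transformers get_memory_footprint helper) before attaching LoRA adapters.
print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")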

# Prepare for LoRA
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Load dataset
dataset = datasets.load_dataset("json", data_files="Bingaman_training_data.json", field="training_pairs")
print("First example from dataset:", dataset["train"][0])

# Tokenization: format each input/output pair with the model's own chat template
# (the Llama 2-style "<s>[INST] ... [/INST]" markup does not match Llama 4's prompt
# format) and mask padding positions out of the loss with -100.
def tokenize_data(example):
    messages = [
        {"role": "user", "content": example["input"]},
        {"role": "assistant", "content": example["output"]},
    ]
    formatted_text = tokenizer.apply_chat_template(messages, tokenize=False)
    inputs = tokenizer(formatted_text, padding="max_length", truncation=True, max_length=4096, return_tensors="pt")
    input_ids = inputs["input_ids"].squeeze(0).tolist()
    attention_mask = inputs["attention_mask"].squeeze(0).tolist()
    labels = [tok if mask == 1 else -100 for tok, mask in zip(input_ids, attention_mask)]
    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": attention_mask
    }

tokenized_dataset = dataset["train"].map(tokenize_data, batched=False, remove_columns=dataset["train"].column_names)
print("First tokenized example:", {k: (type(v), len(v)) for k, v in tokenized_dataset[0].items()})

# Data collator
def custom_data_collator(features):
    input_ids = [torch.tensor(f["input_ids"]) for f in features]
    attention_mask = [torch.tensor(f["attention_mask"]) for f in features]
    labels = [torch.tensor(f["labels"]) for f in features]
    return {
        "input_ids": torch.stack(input_ids),
        "attention_mask": torch.stack(attention_mask),
        "labels": torch.stack(labels)
    }
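# Optional smoke test of the collator: each tensor in the batch should come out
# as shape (batch_size, 4096) given the fixed max_length padding above.
_preview_batch = custom_data_collator([tokenized_dataset[i] for i in range(min(2, len(tokenized_dataset)))])
print("Collated batch shapes:", {k: tuple(v.shape) for k, v in _preview_batch.items()})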

# Training setup
# Note: Trainer drives accelerate internally, so this explicit Accelerator instance
# is not required; it is kept here as a harmless no-op.
accelerator = Accelerator()
training_args = TrainingArguments(
    output_dir="./fine_tuned_llama4_healthcare",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    eval_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=20,
    save_total_limit=3,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=5,
    bf16=True,
    gradient_checkpointing=True,
    optim="adamw_torch",
    warmup_steps=50
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    # Eval reuses a small slice of the training data as a smoke test; substitute a
    # held-out split for a meaningful evaluation signal.
    eval_dataset=tokenized_dataset.select(range(min(5, len(tokenized_dataset)))),
    data_collator=custom_data_collator
)

# Start training
trainer.train()
model.save_pretrained("./fine_tuned_llama4_healthcare")  # saves the LoRA adapter weights, not the full base model
tokenizer.save_pretrained("./fine_tuned_llama4_healthcare")
print("Training complete. LoRA adapter and tokenizer saved to ./fine_tuned_llama4_healthcare")