File size: 4,239 Bytes
e77bff1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e205424
e77bff1
 
 
 
 
 
 
 
 
e205424
 
 
 
e77bff1
 
 
 
 
 
 
 
 
 
e205424
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e77bff1
 
 
e205424
e77bff1
 
 
7e93b98
 
 
 
 
 
 
 
 
 
 
 
 
e77bff1
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
{
    "notebook_title": "Supervised fine-tuning (SFT)",
    "notebook_type": "sft",
    "dataset_types": ["text"],
    "compatible_library": "datasets",
    "notebook_template": [
        {
            "cell_type": "markdown",
            "source": "---\n# **Supervised fine-tuning Notebook for {dataset_name} dataset**\n---"
        },
        {
            "cell_type": "markdown",
            "source": "## 1. Setup necessary libraries and load the dataset"
        },
        {
            "cell_type": "code",
            "source": "# Install and import necessary libraries\n%pip install trl datasets transformers bitsandbytes"
        },
        {
            "cell_type": "code",
            "source": "from datasets import load_dataset\nfrom trl import SFTTrainer\nfrom transformers import TrainingArguments"
        },
        {
            "cell_type": "code",
            "source": "# Load the dataset\ndataset = load_dataset('{dataset_name}', name='{first_config}', split='{first_split}')\ndataset"
        },
        {
            "cell_type": "code",
            "source": "# Split the dataset: 20% for evaluation, 80% for training\ntrain_test_split = dataset.train_test_split(test_size=0.2)\n\n# Get the training and evaluation datasets\ntrain_dataset = train_test_split['train']\neval_dataset = train_test_split['test']"
        },
        {
            "cell_type": "code",
            "source": "# Specify the column name that will be used for training\ndataset_text_field = '{longest_col}'"
        },
        {
            "cell_type": "markdown",
            "source": "## 2. Configure SFT trainer"
        },
        {
            "cell_type": "code",
            "source": "model_name = 'facebook/opt-350m' # Replace with your desired model\noutput_model_name = f'{model_name}-{dataset_name}'.replace('/', '-')"
        },
        {
            "cell_type": "code",
            "source": "# Initialize training arguments, adjust parameters as needed\ntraining_args = TrainingArguments(\n      per_device_train_batch_size = 1, # Batch size per GPU for training\n      gradient_accumulation_steps = 4,\n      max_steps = 100, # Total number of training steps. (Overrides num_train_epochs)\n      learning_rate = 2e-4,\n      fp16 = True,\n      logging_steps=20,\n      output_dir = output_model_name,\n      optim = 'paged_adamw_8bit' # Optimizer to use\n  )"
        },
        {
            "cell_type": "code",
            "source": "# Initialize SFTTrainer\ntrainer = SFTTrainer(\n  model = model_name,\n  train_dataset=train_dataset,\n  eval_dataset=eval_dataset,\n  dataset_text_field=dataset_text_field,\n  max_seq_length=512,\n  args=training_args\n)"
        },
        {
            "cell_type": "markdown",
            "source": "## 3. Perform fine-tuning and capture the training process"
        },
        {
            "cell_type": "code",
            "source": "eval_result_before = trainer.evaluate()\n\n# Start training\ntrainer.train()\n\neval_result_after = trainer.evaluate()"
        },
        {
            "cell_type": "code",
            "source": "print(f'Before training: {eval_result_before}')\nprint(f'After training: {eval_result_after}')"
        },
        {
            "cell_type": "markdown",
            "source": "## 4. Compare model output vs original"
        },
        {
            "cell_type": "code",
            "source": "from transformers import pipeline, AutoTokenizer\n\nmy_model = trainer.model\ntokenizer = AutoTokenizer.from_pretrained(model_name)\n\nmy_pipe = pipeline('text-generation', model=my_model, tokenizer=tokenizer)\noriginal_pipe = pipeline('text-generation', model=model_name)"
        },
        {
            "cell_type": "code",
            "source": "print(original_pipe('Hello'))\nprint(my_pipe('Hello'))"
        },
        {
            "cell_type": "markdown",
            "source": "## 5. Push model to hub (Optional)"
        },
        {
            "cell_type": "code",
            "source": "# Authenticate to the Hugging Face Hub\nfrom huggingface_hub import notebook_login\nnotebook_login()"
        },
        {
            "cell_type": "code",
            "source": "# Push the model to Hugging Face Hub\ntrainer.push_to_hub()"
        }
    ]    
}