from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
# load dataset
dataset = load_dataset('shawhin/imdb-truncated')
# display % of training data with label=1
print(np.array(dataset['train']['label']).sum()/len(dataset['train']['label']))
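# optional sanity check (not in the original script): inspect the splits and columns
# of the truncated IMDB dataset; the code below uses the 'train' and 'validation' splits
print(dataset)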
model_checkpoint = 'distilbert-base-uncased'
# model_checkpoint = 'roberta-base' # you can alternatively use roberta-base but this model is bigger thus training will take longer
# define label maps
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}
# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)
# create tokenizer (add_prefix_space only matters for the roberta-base option above; distilbert's WordPiece tokenizer ignores it)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))
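# optional check (not in the original script): confirm a pad token is set;
# distilbert-base-uncased already ships with [PAD], so the branch above is usually a no-op here
print(tokenizer.pad_token, tokenizer.pad_token_id)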
# create tokenize function
def tokenize_function(examples):
    # extract text
    text = examples["text"]
    # tokenize and truncate text (truncating from the left keeps the end of long reviews)
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )
    return tokenized_inputs
# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
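# optional check (not in the original script): the mapped dataset keeps the original columns
# and adds 'input_ids' and 'attention_mask'
print(tokenized_dataset["train"].column_names)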
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    # accuracy.compute already returns {"accuracy": value}, so return it directly
    return accuracy.compute(predictions=predictions, references=labels)
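# optional quick check (not in the original script): run compute_metrics on dummy logits/labels;
# the toy input below should report 0.5 accuracy
_dummy_logits = np.array([[0.2, 0.8], [0.6, 0.4]])  # argmax -> predicted labels [1, 0]
_dummy_labels = np.array([1, 1])
print(compute_metrics((_dummy_logits, _dummy_labels)))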
# define list of examples
text_list = ["It was good.", "Not a fan, don't recommend.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]
print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)
    print(text + " - " + id2label[predictions.tolist()])
peft_config = LoraConfig(task_type="SEQ_CLS",
                         r=4,
                         lora_alpha=32,
                         lora_dropout=0.01,
                         target_modules=['q_lin'])
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
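# optional check (not in the original script): list a couple of the injected LoRA modules to confirm
# that target_modules=['q_lin'] matched DistilBERT's attention query projections
lora_layers = [name for name, _ in model.named_modules() if "lora_A" in name]
print(f"{len(lora_layers)} LoRA A-matrices injected, e.g. {lora_layers[:2]}")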
# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 10
# define training arguments
training_args = TrainingArguments(
    output_dir=model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
# create trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)
# train model
trainer.train()
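# optional (not in the original script): save just the LoRA adapter weights locally;
# PeftModel.save_pretrained writes only the small adapter files, not the full base model
model.save_pretrained(model_checkpoint + "-lora-text-classification")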
model.to('mps') # moving to mps for Mac (can alternatively do 'cpu')
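# portability note (assumption, not in the original script): 'mps' only exists on Apple Silicon;
# a fallback such as the following could replace the hard-coded device:
# device = "mps" if torch.backends.mps.is_available() else "cpu"
# model.to(device)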
print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("mps") # moving to mps for Mac (can alternatively do 'cpu')
    logits = model(inputs).logits
    predictions = torch.max(logits, 1).indices
    print(text + " - " + id2label[predictions.tolist()[0]])
# option 1: notebook login
from huggingface_hub import notebook_login
notebook_login() # ensure token gives write access
hf_name = 'laxmisahu' # your hf username or org name
model_id = hf_name + "/" + model_checkpoint + "-lora-text-classification" # you can name the model whatever you want
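# a minimal sketch (not in the original script) of pushing the trained LoRA adapter to the Hub,
# so that the PeftConfig.from_pretrained call below can find it; assumes the token supplied
# via notebook_login() has write access to the hf_name account
model.push_to_hub(model_id)      # uploads only the adapter weights + config
tokenizer.push_to_hub(model_id)  # uploads the tokenizer files alongside them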
# how to load peft model from hub for inference
config = PeftConfig.from_pretrained(model_id)
inference_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path, num_labels=2, id2label=id2label, label2id=label2id
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(inference_model, model_id)
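# optional (not in the original script): quick inference check with the reloaded adapter,
# mirroring the prediction loop used above; runs on CPU for portability
model.eval()
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt")
    with torch.no_grad():
        logits = model(inputs).logits
    predictions = torch.argmax(logits, dim=1)
    print(text + " - " + id2label[predictions.item()])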