from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

# load dataset
dataset = load_dataset('shawhin/imdb-truncated')

# display % of training data with label=1
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])
model_checkpoint = 'distilbert-base-uncased'
# model_checkpoint = 'roberta-base'  # alternatively use roberta-base; it is a larger model, so training will take longer
# define label maps
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative": 0, "Positive": 1}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# create tokenize function
def tokenize_function(examples):
    # extract text
    text = examples["text"]

    # tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    # accuracy.compute() already returns {"accuracy": value}, so pull out the number
    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"]}
# define list of examples
text_list = ["It was good.", "Not a fan, don't recommend.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)
    print(text + " - " + id2label[predictions.tolist()])
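# optional: a quick sketch of how to see why 'q_lin' is chosen as the LoRA target
# below; this assumes the DistilBERT module naming (q_lin/k_lin/v_lin/out_lin for
# the attention projections) and simply lists the base model's Linear layers
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Linear):
        print(name)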
peft_config = LoraConfig(task_type="SEQ_CLS",
                         r=4,
                         lora_alpha=32,
                         lora_dropout=0.01,
                         target_modules=['q_lin'])

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 10

# define training arguments
training_args = TrainingArguments(
    output_dir=model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
# create trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,  # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)
# train model
trainer.train()
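# a minimal (optional) sketch: with PEFT, save_pretrained writes only the small
# LoRA adapter weights, not the full base model; the directory name here is just
# an illustrative choice
model.save_pretrained(model_checkpoint + "-lora-text-classification")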
model.to('mps')  # moving to mps for Mac (can alternatively do 'cpu')

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("mps")  # moving to mps for Mac (can alternatively do 'cpu')
    logits = model(inputs).logits
    predictions = torch.max(logits, 1).indices
    print(text + " - " + id2label[predictions.tolist()[0]])
# option 1: notebook login
from huggingface_hub import notebook_login
notebook_login()  # ensure token gives write access

hf_name = 'laxmisahu'  # your hf username or org name
model_id = hf_name + "/" + model_checkpoint + "-lora-text-classification"  # you can name the model whatever you want
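# a minimal sketch of the push step (assumption: the trained LoRA adapter is
# uploaded under model_id so it can be reloaded below); PeftModel supports
# push_to_hub via the Hugging Face Hub mixin
model.push_to_hub(model_id)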
# how to load peft model from hub for inference
config = PeftConfig.from_pretrained(model_id)
inference_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path, num_labels=2, id2label=id2label, label2id=label2id
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(inference_model, model_id)
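# a short usage sketch for the reloaded adapter, mirroring the earlier loop
# (assumes CPU inference; move model and inputs to 'mps' or 'cuda' if available)
model.eval()
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt")
    with torch.no_grad():
        logits = model(inputs).logits
    print(text + " - " + id2label[torch.argmax(logits).item()])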