from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

# load dataset
dataset = load_dataset('shawhin/imdb-truncated')

# display % of training data with label=1
np.array(dataset['train']['label']).sum() / len(dataset['train']['label'])

model_checkpoint = 'distilbert-base-uncased'
# model_checkpoint = 'roberta-base' # you can alternatively use roberta-base, but that model is bigger, so training will take longer

# define label maps
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative": 0, "Positive": 1}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# create tokenize function
def tokenize_function(examples):
    # extract text
    text = examples["text"]

    # tokenize and truncate text (keep the end of long reviews)
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")

# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    # accuracy.compute already returns {"accuracy": value}, so pass it
    # through directly rather than nesting it inside another dict
    return accuracy.compute(predictions=predictions, references=labels)

# define list of examples
text_list = ["It was good.", "Not a fan, don't recommend.",
             "Better than the first one.", "This is not worth watching even once.",
             "This one is a pass."]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

peft_config = LoraConfig(task_type="SEQ_CLS",
                         r=4,
                         lora_alpha=32,
                         lora_dropout=0.01,
                         target_modules=['q_lin'])

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 10

# define training arguments
training_args = TrainingArguments(
    output_dir=model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# create trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,  # dynamically pads examples in each batch to equal length
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

model.to('mps')  # moving to mps for Mac (can alternatively use 'cpu')
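# (Added sketch, not part of the original walkthrough.) Since LoRA only
# trains the adapter weights, they can be checkpointed separately in a few
# megabytes: save_pretrained on a PeftModel stores just the adapter, not
# the full base model. The directory name below is a hypothetical local path.
adapter_dir = model_checkpoint + "-lora-adapter"  # hypothetical local path
model.save_pretrained(adapter_dir)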
predictions:") print("--------------------------") for text in text_list: inputs = tokenizer.encode(text, return_tensors="pt").to("mps") # moving to mps for Mac (can alternatively do 'cpu') logits = model(inputs).logits predictions = torch.max(logits,1).indices print(text + " - " + id2label[predictions.tolist()[0]]) # option 1: notebook login from huggingface_hub import notebook_login notebook_login() # ensure token gives write access hf_name = 'laxmisahu' # your hf username or org name model_id = hf_name + "/" + model_checkpoint + "-lora-text-classification" # you can name the model whatever you want # how to load peft model from hub for inference config = PeftConfig.from_pretrained(model_id) inference_model = AutoModelForSequenceClassification.from_pretrained( config.base_model_name_or_path, num_labels=2, id2label=id2label, label2id=label2id ) tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path) model = PeftModel.from_pretrained(inference_model, model_id)