True Network committed
Commit bccaed2 · 1 parent: fe14332

second_commit

Files changed (1):
  1. sentiment_analysis_finetuning.py  +154 -0
sentiment_analysis_finetuning.py ADDED
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

# load dataset
dataset = load_dataset('shawhin/imdb-truncated')

# display % of training data with label=1
print(np.array(dataset['train']['label']).sum() / len(dataset['train']['label']))

model_checkpoint = 'distilbert-base-uncased'
# model_checkpoint = 'roberta-base'  # alternatively use roberta-base, but it is a bigger model, so training will take longer

# define label maps
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative": 0, "Positive": 1}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# create tokenize function
def tokenize_function(examples):
    # extract text
    text = examples["text"]

    # tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# load accuracy evaluation metric
accuracy = evaluate.load("accuracy")

# define an evaluation function to pass into the trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    # accuracy.compute already returns {"accuracy": ...}
    return accuracy.compute(predictions=predictions, references=labels)

# define list of examples
text_list = ["It was good.", "Not a fan, don't recommend.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

peft_config = LoraConfig(task_type="SEQ_CLS",
                         r=4,
                         lora_alpha=32,
                         lora_dropout=0.01,
                         target_modules=['q_lin'])  # 'q_lin' is the query projection in DistilBERT's attention blocks

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 10

# define training arguments
training_args = TrainingArguments(
    output_dir=model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# create trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,  # dynamically pads examples in each batch to equal length
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

model.to('mps')  # move to mps for Mac (can alternatively use 'cpu')

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("mps")  # move to mps for Mac (can alternatively use 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits, 1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

# option 1: notebook login
from huggingface_hub import notebook_login
notebook_login()  # ensure the token grants write access

hf_name = 'laxmisahu'  # your hf username or org name
model_id = hf_name + "/" + model_checkpoint + "-lora-text-classification"  # you can name the model whatever you want

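# optional publish step (a minimal sketch; assumes the notebook_login() token
# has write access): the reload below expects the trained adapter to already
# exist on the Hub under model_id
model.push_to_hub(model_id)      # uploads the LoRA adapter weights and config
tokenizer.push_to_hub(model_id)  # uploads the tokenizer files alongside the adapter
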
# how to load the PEFT model from the Hub for inference
config = PeftConfig.from_pretrained(model_id)
inference_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path, num_labels=2, id2label=id2label, label2id=label2id
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(inference_model, model_id)
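
# usage sketch (illustrative; assumes the steps above have run and the adapter
# exists under model_id): score the same example sentences with the Hub-loaded adapter
model.to('cpu')  # or 'mps' on Mac, matching the earlier device choice
model.eval()

print("Hub-loaded model predictions:")
print("-----------------------------")
with torch.no_grad():
    for text in text_list:
        inputs = tokenizer.encode(text, return_tensors="pt")
        logits = model(inputs).logits
        prediction = torch.argmax(logits, dim=1)
        print(text + " - " + id2label[prediction.tolist()[0]])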