"""
This mod fine-tunes a BERT model on the ACARIS dataset for comparison with ACARISMdl.
"""

import torch
from torch import nn
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from transformers.modeling_outputs import SequenceClassifierOutput
from datasets import Dataset
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import wandb
import random
import numpy as np


config = {
	"mdl": "distilbert-base-uncased",
	"epochs": 5,
	"batchSize": 14,
	"maxLen": 512,
	"warmupSteps": 0.1, # proportion of total steps, NOT absolute
	"weightDecay": 0.02,
	"outputDir": "./output",
	"earlyStopping": True,
	"earlyStoppingPatience": 2,
	"dropout": 0.1,
	"initlr": 5e-5,
	"epsilon": 1e-8
}

#wandb.init(project="MarkIII_ACARIS", entity="simtoonia", config=config)


def lockSeed(seed):
	random.seed(seed)
	np.random.seed(seed)
	torch.manual_seed(seed)
	if torch.cuda.is_available():
		torch.cuda.manual_seed_all(seed)
		torch.backends.cudnn.deterministic = True

# disabled, as determinism is not guaranteed and lowers performance
#lockSeed(69) # setting a fixed seed for *some* reproducibility

class DistilBertForMulticlassSequenceClassification(DistilBertForSequenceClassification):
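	"""DistilBERT classifier that pools the [CLS] hidden state and computes cross-entropy loss over the three sentiment labels."""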
	def __init__(self, config):
		super().__init__(config)

	def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None):
		return_dict = return_dict if return_dict is not None else self.config.use_return_dict

		outputs = self.distilbert(input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)

		hidden_state = outputs[0]
		pooled_output = hidden_state[:, 0]
		pooled_output = self.pre_classifier(pooled_output)
		pooled_output = nn.ReLU()(pooled_output)
		pooled_output = self.dropout(pooled_output)
		logits = self.classifier(pooled_output)

		loss = None
		if labels is not None:
			lossFct = nn.CrossEntropyLoss()
			loss = lossFct(logits.view(-1, self.num_labels), labels.view(-1))

		if not return_dict:
			output = (logits,) + outputs[2:]
			return ((loss,) + output) if loss is not None else output

		return SequenceClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)



class ACARISBERT:
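	"""Wraps data loading, tokenization, metric computation, and HF Trainer-based fine-tuning of the multiclass DistilBERT sentiment model."""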
	def __init__(self, trainPath, valPath):
		self.trainPath = trainPath
		self.valPath = valPath
		self.tokenizer = DistilBertTokenizerFast.from_pretrained(config["mdl"])
		self.model = DistilBertForMulticlassSequenceClassification.from_pretrained(config["mdl"], num_labels=3, id2label={0: "neg", 1: "neu", 2: "pos"}, label2id={"neg": 0, "neu": 1, "pos": 2}, dropout=config["dropout"], attention_dropout=config["dropout"])

	def read_data(self, path):
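		"""Read a pipe-separated CSV with `content` and `sentiment` columns and wrap it in a Hugging Face Dataset."""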
		df = pd.read_csv(path, sep="|", usecols=["content", "sentiment"])
		return Dataset.from_pandas(df)
	
	def tokenize_data(self, dataset):
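		"""Tokenize `content`, map sentiment strings to integer labels (neg=0, neu=1, pos=2), and set a torch-tensor output format."""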
		sentMapping = {"pos": 2, "neg": 0, "neu": 1}
		tokenized = dataset.map(
			lambda x: {
				**self.tokenizer(x["content"], truncation=True, padding="max_length", max_length=config["maxLen"]),
				"labels": torch.tensor([sentMapping[sent] for sent in x["sentiment"]])
			},
			batched=True,
			remove_columns=["content", "sentiment"]
		)
		tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
		return tokenized
	
	def get_data_loaders(self, trainDS, valDS):
		# currently unused helper; the HF Trainer below builds its own DataLoaders
		trainLoader = DataLoader(trainDS, batch_size=config["batchSize"], shuffle=True)
		valLoader = DataLoader(valDS, batch_size=config["batchSize"], shuffle=False)
		return trainLoader, valLoader
	
	def compute_metrics(self, evalPred):
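		"""Compute accuracy, one-vs-rest ROC AUC, and per-class precision/recall/F1 from eval logits and labels."""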
		logits, labels = evalPred
		preds = torch.argmax(torch.Tensor(logits), dim=1)
		probs = torch.nn.functional.softmax(torch.Tensor(logits), dim=1)
		precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=None)
		accuracy = accuracy_score(labels, preds)
		rocAUC = roc_auc_score(labels, probs, multi_class="ovr")
		metrics = {
			"accuracy": accuracy,
			"roc_auc": rocAUC
		}
		metricNames = ["precision", "recall", "f1"]
		labelNames = ["neg", "neu", "pos"]
		for metricName, metricValue in zip(metricNames, [precision, recall, f1]):
			for labelName, value in zip(labelNames, metricValue):
				metrics[f"{metricName}_{labelName}"] = float(value)
		return metrics
	
	def train(self):
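		"""Tokenize both splits, configure TrainingArguments (warmup given as a proportion of total steps), and fine-tune with early stopping."""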
		trainDS = self.tokenize_data(self.read_data(self.trainPath))
		valDS = self.tokenize_data(self.read_data(self.valPath))

		totalSteps = len(trainDS) // config["batchSize"] * config["epochs"]
		warmupSteps = int(totalSteps * config["warmupSteps"])
		
		trainingArgs = TrainingArguments(
			output_dir=config["outputDir"],
			num_train_epochs=config["epochs"],
			per_device_train_batch_size=config["batchSize"],
			per_device_eval_batch_size=config["batchSize"],
			warmup_steps=warmupSteps,
			weight_decay=config["weightDecay"],
			logging_dir="./logs",
			logging_steps=100,
			learning_rate=config["initlr"],
			evaluation_strategy="epoch",
			save_strategy="epoch",
			load_best_model_at_end=True,
			metric_for_best_model="accuracy",
			save_total_limit=5,
			adam_epsilon=config["epsilon"],
			report_to="wandb",
			fp16=True
		)
		
		trainer = Trainer(
			model=self.model,
			args=trainingArgs,
			train_dataset=trainDS,
			eval_dataset=valDS,
			compute_metrics=self.compute_metrics,
			callbacks=[EarlyStoppingCallback(early_stopping_patience=config["earlyStoppingPatience"])]
		)
		print(f"Number of parameters: {trainer.model.num_parameters()}")
		print("Running eval ...")
		trainer.evaluate()
		print("Running training ...")
		trainer.train()
		print("Saving model ...")
		trainer.save_model(config["outputDir"])
		
		
if __name__ == "__main__":
	acaris_bert = ACARISBERT("./datasets/train.csv", "./datasets/val.csv")
	acaris_bert.train()
	wandb.finish()
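
# Inference sketch (hedged): an illustrative example of loading the checkpoint saved by
# trainer.save_model for a single prediction; it is not part of the training run. The
# tokenizer is reloaded from the base checkpoint since the script does not save it alongside
# the model.
#
# tokenizer = DistilBertTokenizerFast.from_pretrained(config["mdl"])
# mdl = DistilBertForMulticlassSequenceClassification.from_pretrained(config["outputDir"])
# enc = tokenizer("thanks, this helped a lot!", return_tensors="pt", truncation=True, max_length=config["maxLen"])
# with torch.no_grad():
# 	label = mdl(**enc).logits.argmax(dim=-1).item() # 0 = neg, 1 = neu, 2 = pos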