aimlnerd committed
Commit cb09873 · 1 Parent(s): 09366c9
source/__init__.py ADDED
File without changes
source/services/__init__.py ADDED
File without changes
source/services/predicting_effective_arguments/train/model.py ADDED
@@ -0,0 +1,105 @@
+ from datasets import load_dataset
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline, DataCollatorWithPadding
+ from sklearn.metrics import accuracy_score, f1_score
+ import torch
+ import numpy as np
+ import torch.nn.functional as F
+ import matplotlib.pyplot as plt
+ from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+ class TransformersSequenceClassifier:
+     def __init__(self,
+                  model_output_dir,
+                  num_labels,
+                  tokenizer: AutoTokenizer = None,
+                  model_checkpoint="distilbert-base-uncased"
+                  ):
+         self.model_output_dir = model_output_dir
+         # Reuse the tokenizer supplied by the caller; otherwise load one from the checkpoint
+         self.tokenizer = tokenizer if tokenizer is not None else AutoTokenizer.from_pretrained(model_checkpoint)
+         self.model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels).to(device)
+
+     def tokenizer_func(self, batch):
+         # Tokenize the prepared "inputs" column; the tokenizer keyword is max_length, not max_len
+         return self.tokenizer(batch["inputs"], truncation=True, max_length=386)
+
+     def train(self, train_dataset, eval_dataset, epochs=2, batch_size=64):
+         # Tokenize both splits and drop the raw text columns so only model inputs and labels remain
+         train_tok_dataset = train_dataset.map(self.tokenizer_func, batched=True, remove_columns=['inputs', '__index_level_0__'])
+         val_tok_dataset = eval_dataset.map(self.tokenizer_func, batched=True, remove_columns=['inputs', '__index_level_0__'])
+         # Pad dynamically to the longest sequence in each batch instead of a fixed length
+         data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer, padding='longest')
+         training_args = TrainingArguments(output_dir=self.model_output_dir,
+                                           num_train_epochs=epochs,
+                                           learning_rate=2e-5,
+                                           per_device_train_batch_size=batch_size,
+                                           per_device_eval_batch_size=batch_size,
+                                           weight_decay=0.01,
+                                           evaluation_strategy="epoch",
+                                           disable_tqdm=False,
+                                           logging_steps=len(train_dataset) // batch_size,
+                                           push_to_hub=True,
+                                           log_level="error")
+         self.trainer = Trainer(
+             model=self.model,
+             args=training_args,
+             compute_metrics=self._compute_metrics,
+             train_dataset=train_tok_dataset,
+             eval_dataset=val_tok_dataset,
+             tokenizer=self.tokenizer,
+             data_collator=data_collator
+         )
+         self.trainer.train()
+
+     @staticmethod
+     def _compute_metrics(pred):
+         labels = pred.label_ids
+         preds = pred.predictions.argmax(-1)
+         f1 = f1_score(labels, preds, average="weighted")
+         acc = accuracy_score(labels, preds)
+         return {"accuracy": acc, "f1": f1}
+
+     def forward_pass_with_label(self, batch):
+         # Place all input tensors on the same device as the model
+         inputs = {k: v.to(device) for k, v in batch.items()
+                   if k in self.tokenizer.model_input_names}
+
+         with torch.no_grad():
+             output = self.model(**inputs)
+             pred_label = torch.argmax(output.logits, axis=-1)
+             # Per-example loss (reduction="none") so individual hard samples can be inspected later
+             loss = F.cross_entropy(output.logits, batch["label"].to(device),
+                                    reduction="none")
+
+         # Place outputs on CPU for compatibility with other dataset columns
+         return {"loss": loss.cpu().numpy(),
+                 "predicted_label": pred_label.cpu().numpy()}
+
+     def compute_loss_per_pred(self, valid_dataset):
+         # Convert the relevant columns to PyTorch tensors so they can be moved to the model's device,
+         # then compute the loss for every validation example
+         valid_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
+         return valid_dataset.map(self.forward_pass_with_label, batched=True, batch_size=16)
+
+     @staticmethod
+     def plot_confusion_matrix(y_preds, y_true, labels):
+         cm = confusion_matrix(y_true, y_preds, normalize="true")
+         fig, ax = plt.subplots(figsize=(6, 6))
+         disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
+         disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
+         plt.title("Normalized confusion matrix")
+         plt.show()
+
+     def predict_valid_data(self, valid_dataset):
+         # Requires train() to have been called first so self.trainer exists
+         preds_output = self.trainer.predict(valid_dataset)
+         print(preds_output.metrics)
+         y_preds = np.argmax(preds_output.predictions, axis=1)
+         return y_preds
+
+     @staticmethod
+     def predict_test_data(model_checkpoint, test_data):
+         pipe_classifier = pipeline("text-classification", model=model_checkpoint)
+         preds = pipe_classifier(test_data, return_all_scores=True)
+         return preds
+
+
+
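A rough usage sketch, not part of the committed files: the error-analysis helpers above (predict_valid_data, plot_confusion_matrix, compute_loss_per_pred) are never called by the training script below. Assuming the seq_classifier, val_dataset and tokenizer objects built in seq_classification.py, they could be chained together roughly like this:

# Sketch only: assumes seq_classifier.train(...) has already run in seq_classification.py
# Pad to a fixed length so the per-example loss pass can stack tensors directly
val_tok_dataset = val_dataset.map(
    lambda batch: tokenizer(batch["inputs"], truncation=True, padding="max_length", max_length=386),
    batched=True)

y_preds = seq_classifier.predict_valid_data(val_tok_dataset)   # argmax over the predicted logits
y_true = val_dataset["label"]
label_names = val_dataset.features["label"].names              # ClassLabel ids back to their string names
seq_classifier.plot_confusion_matrix(y_preds, y_true, labels=label_names)

# Per-example loss surfaces the hardest (or possibly mislabelled) validation rows
val_with_loss = seq_classifier.compute_loss_per_pred(val_tok_dataset)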
source/services/predicting_effective_arguments/train/seq_classification.py ADDED
@@ -0,0 +1,85 @@
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ from datasets import load_dataset
+ from transformers import AutoTokenizer
+ from datasets import Dataset, load_metric
+ from sklearn.model_selection import train_test_split
+ from source.services.predicting_effective_arguments.train.model import TransformersSequenceClassifier
+
+ TARGET = 'discourse_effectiveness'
+ TEXT = "discourse_text"
+ MODEL_CHECKPOINT = "distilbert-base-uncased"
+ MODEL_OUTPUT_DIR = 'source/services/predicting_effective_arguments/model/hf_textclassification'
+
+
+ class CFG:
+     TARGET = 'discourse_effectiveness'
+     TEXT = "discourse_text"
+     MODEL_CHECKPOINT = "distilbert-base-uncased"
+     MODEL_OUTPUT_DIR = 'source/services/predicting_effective_arguments/model/hf_textclassification'
+     model_name = "debertav3base"
+     learning_rate = 1.5e-5
+     weight_decay = 0.02
+     hidden_dropout_prob = 0.007
+     attention_probs_dropout_prob = 0.007
+     num_train_epochs = 10
+     n_splits = 4
+     batch_size = 12
+     random_seed = 42
+     save_steps = 100
+     max_length = 512
+
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
+
+
+ def seed_everything(seed: int):
+     import random, os
+     import numpy as np
+     import torch
+
+     random.seed(seed)
+     os.environ['PYTHONHASHSEED'] = str(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed(seed)
+     torch.backends.cudnn.deterministic = True
+     # benchmark must stay off for cuDNN to behave deterministically
+     torch.backends.cudnn.benchmark = False
+
+
+ def prepare_input_text(df, sep_token):
+     # Build the model input as "<discourse_type> [SEP] <discourse_text>", all lower-cased
+     df['inputs'] = df.discourse_type.str.lower() + ' ' + sep_token + ' ' + df.discourse_text.str.lower()
+     return df
+
+
+ if __name__ == '__main__':
+
+     config = CFG()
+     seq_classifier = TransformersSequenceClassifier(model_output_dir=config.MODEL_OUTPUT_DIR, tokenizer=tokenizer, model_checkpoint=MODEL_CHECKPOINT, num_labels=3)
+     data = pd.read_csv("data/raw_data/train.csv")
+     test_df = pd.read_csv("data/raw_data/test.csv")
+     train_df, valid_df = train_test_split(data, test_size=0.30, random_state=42)
+
+     train_df = prepare_input_text(train_df, sep_token=tokenizer.sep_token)
+     valid_df = prepare_input_text(valid_df, sep_token=tokenizer.sep_token)
+
+     # class_encode_column converts the string target into the integer ClassLabel ids expected by the model
+     train_dataset = Dataset.from_pandas(train_df[['inputs', TARGET]]).rename_column(TARGET, 'label').class_encode_column("label")
+     val_dataset = Dataset.from_pandas(valid_df[['inputs', TARGET]]).rename_column(TARGET, 'label').class_encode_column("label")
+     seq_classifier.train(train_dataset=train_dataset, eval_dataset=val_dataset, epochs=2, batch_size=64)
+
+     """
+     train_df[TARGET].value_counts(ascending=True).plot.barh()
+     plt.title("Frequency of Classes")
+     plt.show()
+
+     train_df['discourse_type'].value_counts(ascending=True).plot.barh()
+     plt.title("Frequency of discourse_type")
+     plt.show()
+
+     train_df["Words Per text"] = train_df[TEXT].str.split().apply(len)
+     train_df.boxplot("Words Per text", by=TARGET, grid=False, showfliers=False,
+                      color="black")
+     plt.suptitle("")
+     plt.xlabel("")
+     plt.show()
+     """
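The script reads data/raw_data/test.csv but never scores it. A rough sketch, not part of the committed files, of how the predict_test_data helper from model.py might be applied to that split, assuming the fine-tuned model and tokenizer were saved to MODEL_OUTPUT_DIR (for example via trainer.save_model):

# Sketch only: score the test split with the fine-tuned checkpoint saved under MODEL_OUTPUT_DIR
test_df = prepare_input_text(test_df, sep_token=tokenizer.sep_token)
test_preds = TransformersSequenceClassifier.predict_test_data(
    model_checkpoint=MODEL_OUTPUT_DIR,            # assumed to contain both model and tokenizer files
    test_data=test_df["inputs"].tolist(),
)
# With return_all_scores=True each entry is a list of {"label": "LABEL_i", "score": ...} dicts, one per class
print(test_preds[0])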