aimlnerd committed · Commit 4f607de · 1 Parent(s): f7abe49
requirements.txt CHANGED
@@ -9,4 +9,5 @@ seqeval==1.2.2
 pandas==2.1.4
 gradio==4.13.0
 pydantic_settings==2.1.0
-sentencepiece==0.1.99
+sentencepiece==0.1.99
+umap-learn==0.5.5
source/services/predicting_effective_arguments/train/model.py CHANGED
@@ -7,6 +7,8 @@ import torch.nn.functional as F
 import matplotlib.pyplot as plt
 from typing import List
 from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
+from umap import UMAP
+from sklearn.preprocessing import MinMaxScaler
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
@@ -17,19 +19,41 @@ class TransformersSequenceClassifier:
                  model_output_dir,
                  num_labels,
                  tokenizer : AutoTokenizer,
+                 id2label,
+                 label2id,
                  model_checkpoint="distilbert-base-uncased"
                  ):
         self.model_output_dir = model_output_dir
-        self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
-        self.model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels).to(device)
+        self.tokenizer = tokenizer
+        self.model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels, id2label=id2label, label2id=label2id).to(device)
 
     def tokenizer_batch(self, batch):
-        return self.tokenizer(batch["inputs"], truncation=True)  #, max_len=386
+        return self.tokenizer(batch["inputs"], truncation=True, padding=True, return_tensors="pt")  #, max_len=386
 
     def tokenize_dataset(self, dataset):
         return dataset.map(self.tokenizer_batch, batched=True, remove_columns=('inputs', '__index_level_0__'))
+
+    @staticmethod
+    def extract_hidden_states(batch, tokenizer, model):
+        # Place model inputs on the GPU
+        inputs = {k: v for k, v in batch.items() if k in tokenizer.model_input_names}  #.to(device)
+        # Extract last hidden states
+        with torch.no_grad():
+            last_hidden_state = model(**inputs).last_hidden_state
+        # Return vector for [CLS] token
+        return {"hidden_state": last_hidden_state[:, 0].cpu().numpy()}
+
+    @staticmethod
+    def fit_umap(df_x):
+        # Scale features to [0,1] range
+        X_scaled = MinMaxScaler().fit_transform(df_x)
+        # Initialize and fit UMAP
+        mapper = UMAP(n_components=2, metric="cosine").fit(X_scaled)
+        return mapper.embedding_
+        # Create a DataFrame of 2D embeddings
+
     def train(self, train_dataset, eval_dataset, batch_size, epochs):
-        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer, padding='longest')
+        #data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer, padding='longest')
         training_args = TrainingArguments(output_dir=self.model_output_dir,
                                           num_train_epochs=epochs,
                                           learning_rate=2e-5,
@@ -39,7 +63,7 @@ class TransformersSequenceClassifier:
                                           evaluation_strategy="epoch",
                                           save_strategy='epoch',
                                           disable_tqdm=False,
-                                          logging_steps=len(train_dataset)// batch_size,
+                                          logging_steps=len(train_dataset)//batch_size,
                                           push_to_hub=True,
                                           load_best_model_at_end=True,
                                           log_level="error")
@@ -50,7 +74,7 @@ class TransformersSequenceClassifier:
                                train_dataset=train_dataset,
                                eval_dataset=eval_dataset,
                                tokenizer=self.tokenizer,
-                               data_collator=data_collator
+                               #data_collator=data_collator
                                )
         self.trainer.train()
         self.trainer.push_to_hub(commit_message="Training completed!")
@@ -83,15 +107,15 @@ class TransformersSequenceClassifier:
         return valid_dataset.map(self.forward_pass_with_label, batched=True, batch_size=16)
 
     @staticmethod
-    def plot_confusion_matrix(y_preds, y_true, labels):
+    def plot_confusion_matrix(y_preds, y_true, label_names):
         cm = confusion_matrix(y_true, y_preds, normalize="true")
         fig, ax = plt.subplots(figsize=(6, 6))
-        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
+        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)
         disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
         plt.title("Normalized confusion matrix")
         plt.show()
 
-    def predict_valid_data(self, valid_dataset):
+    def predict_argmax_logit(self, valid_dataset):
         #trainer = Trainer(model=self.model)
         preds_output = self.trainer.predict(valid_dataset)
         print(preds_output.metrics)
@@ -99,7 +123,7 @@ class TransformersSequenceClassifier:
         return y_preds
 
     @staticmethod
-    def predict_test_data(model_checkpoint, test_list: List[str]) -> List:
+    def predict_pipeline(model_checkpoint, test_list: List[str]) -> List:
         pipe_classifier = pipeline("text-classification", model=model_checkpoint)
         preds = pipe_classifier(test_list, return_all_scores=True)
         return preds
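
A minimal sketch (not part of the commit) of how the two new helpers are meant to be chained: extract_hidden_states pulls the [CLS] vector for each example and fit_umap projects those vectors to 2D. The dataset name tok_dataset is illustrative, a plain AutoModel body is assumed (the helper reads .last_hidden_state, which the sequence-classification head does not return by default), and the model is kept on CPU because the helper's .to(device) call is commented out:

import numpy as np
import pandas as pd
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
body = AutoModel.from_pretrained("distilbert-base-uncased")  # CPU, matching the commented-out .to(device)

# tok_dataset is assumed to be the output of tokenize_dataset(); the torch format makes the
# mapped batches contain tensors (this relies on tokenizer_batch having padded the examples)
tok_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
hidden = tok_dataset.map(
    TransformersSequenceClassifier.extract_hidden_states,
    batched=True,
    fn_kwargs={"tokenizer": tokenizer, "model": body},
)

# One row per example, one column per hidden dimension -> 2D coordinates for plotting
X = pd.DataFrame(np.array(hidden["hidden_state"]))
embedding_2d = TransformersSequenceClassifier.fit_umap(X)  # array of shape (n_samples, 2)

The resulting embedding_2d array can then be scatter-plotted and coloured by the label column to see how separable the classes are in the encoder's feature space.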
source/services/predicting_effective_arguments/train/train_seq_classification.py ADDED
@@ -0,0 +1,119 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from datasets import load_dataset
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from datasets import Dataset, load_metric
+from sklearn.model_selection import train_test_split
+from source.services.predicting_effective_arguments.train.model import TransformersSequenceClassifier
+
+class CFG:
+    TARGET = 'discourse_effectiveness'
+    TEXT = "discourse_text"
+    MODEL_CHECKPOINT = "distilbert-base-uncased"
+    MODEL_OUTPUT_DIR = 'source/services/predicting_effective_arguments/model/hf_textclassification/predicting_effective_arguments_distilbert'
+    model_name="debertav3base"
+    learning_rate=1.5e-5
+    weight_decay=0.02
+    hidden_dropout_prob=0.007
+    attention_probs_dropout_prob=0.007
+    num_train_epochs=10
+    n_splits=4
+    batch_size=12
+    random_seed=42
+    save_steps=100
+    max_length=512
+
+
+def seed_everything(seed: int):
+    import random, os
+    import numpy as np
+    import torch
+
+    random.seed(seed)
+    os.environ['PYTHONHASHSEED'] = str(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = True
+
+
+def prepare_input_text(df, sep_token):
+    df['inputs'] = df.discourse_type.str.lower() + ' ' + sep_token + ' ' + df.discourse_text.str.lower()
+    return df
+
+
+if __name__ == '__main__':
+
+    config = CFG()
+    tokenizer = AutoTokenizer.from_pretrained(config.MODEL_CHECKPOINT)
+
+    data = pd.read_csv("data/raw_data/train.csv")[:100]
+    label_names = list(data[config.TARGET].unique())
+    #score_df = pd.read_csv("data/raw_data/test.csv")
+
+    """
+    data[TARGET].value_counts(ascending=True).plot.barh()
+    plt.title("Frequency of Classes")
+    plt.show()
+
+    data['discourse_type'].value_counts(ascending=True).plot.barh()
+    plt.title("Frequency of discourse_type")
+    plt.show()
+
+    data["Words Per text"] = data[TEXT].str.split().apply(len)
+    data.boxplot("Words Per text", by=TARGET, grid=False, showfliers=False,
+                 color="black")
+    plt.suptitle("")
+    plt.xlabel("")
+    plt.show()
+    """
+
+    train_size = 0.7
+    valid_size = 0.2
+    test_size = 0.1
+
+    # First split: Separate out the training set
+    train_df, temp_df = train_test_split(data, test_size=1 - train_size, random_state=5600)
+
+    # Second split: Separate out the validation and test sets
+    valid_df, test_df = train_test_split(temp_df, test_size=test_size / (test_size + valid_size), random_state=5600)
+
+
+    train_df = prepare_input_text(train_df, sep_token=tokenizer.sep_token)
+    valid_df = prepare_input_text(valid_df, sep_token=tokenizer.sep_token)
+    test_df = prepare_input_text(test_df, sep_token=tokenizer.sep_token)
+
+    train_dataset = Dataset.from_pandas(train_df[['inputs', config.TARGET]]).rename_column(config.TARGET, 'label').class_encode_column("label")
+    val_dataset = Dataset.from_pandas(valid_df[['inputs', config.TARGET]]).rename_column(config.TARGET, 'label').class_encode_column("label")
+    test_dataset = Dataset.from_pandas(test_df[['inputs', config.TARGET]]).rename_column(config.TARGET, 'label').class_encode_column("label")
+
+    id2label = {i: label for i, label in enumerate(label_names)}
+    label2id = {v: k for k, v in id2label.items()}
+    seqClassifer = TransformersSequenceClassifier(model_output_dir=config.MODEL_OUTPUT_DIR,
+                                                  tokenizer=tokenizer,
+                                                  model_checkpoint="distilbert-base-uncased",
+                                                  num_labels=3,
+                                                  id2label=id2label,
+                                                  label2id=label2id)
+
+    train_tok_dataset = seqClassifer.tokenize_dataset(dataset=train_dataset)
+    val_tok_dataset = seqClassifer.tokenize_dataset(dataset=val_dataset)
+    test_tok_dataset = seqClassifer.tokenize_dataset(dataset=test_dataset)
+
+
+
+    seqClassifer.train(train_dataset=train_tok_dataset, eval_dataset=val_tok_dataset, epochs=1, batch_size=16)
+
+    y_test_pred = seqClassifer.predict_argmax_logit(test_tok_dataset)
+    seqClassifer.plot_confusion_matrix(y_preds=y_test_pred, y_true=test_dataset['label'], label_names=label_names)
+
+    y_pred = seqClassifer.predict_pipeline(model_checkpoint=config.MODEL_OUTPUT_DIR, test_list=test_df['inputs'].tolist())
+    #hidden = train_tok_dataset.map(seqClassifer.extract_hidden_states,
+    #                               batched=True,
+    #                               fn_kwargs={'tokenizer': AutoTokenizer.from_pretrained(config.MODEL_OUTPUT_DIR),
+    #                                          'model': AutoModelForSequenceClassification.from_pretrained(config.MODEL_OUTPUT_DIR)})
+
+    pass
+
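
A small illustrative sketch (not part of the commit) of reducing the predict_pipeline output to one label per text; with return_all_scores=True the pipeline returns, for each input, a list of {'label': ..., 'score': ...} dicts:

# y_pred comes from predict_pipeline(...) above: one list of score dicts per input text
top_labels = [max(scores, key=lambda s: s["score"])["label"] for scores in y_pred]
print(top_labels[:5])

Because the model is saved with id2label/label2id, the returned labels are the original discourse_effectiveness class names rather than generic LABEL_0-style ids.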