aimlnerd committed
Commit 67d83f0 · 1 Parent(s): 440014c

add trainer

README.md CHANGED
File without changes
source/services/predicting_effective_arguments/train/model.py CHANGED
@@ -5,6 +5,7 @@ import torch
 import numpy as np
 import torch.nn.functional as F
 import matplotlib.pyplot as plt
+from typing import List
 from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -22,35 +23,37 @@ class TransformersSequenceClassifier:
         self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
         self.model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels).to(device)
 
-    def tokenizer_func(self, batch):
+    def tokenizer_batch(self, batch):
         return self.tokenizer(batch["inputs"], truncation=True) #, max_len=386
 
-    def train(self, train_dataset, eval_dataset, epochs=2, batch_size=64):
-
-        train_tok_dataset = train_dataset.map(self.tokenizer_func, batched=True, remove_columns=('inputs', '__index_level_0__'))
-        val_tok_dataset = eval_dataset.map(self.tokenizer_func, batched=True, remove_columns=('inputs', '__index_level_0__'))
+    def tokenize_dataset(self, dataset):
+        return dataset.map(self.tokenizer_batch, batched=True, remove_columns=('inputs', '__index_level_0__'))
+    def train(self, train_dataset, eval_dataset, batch_size, epochs):
         data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer, padding='longest')
-        training_args = TrainingArguments(output_dir=f"{self.model_output_dir}/predicting_effective_arguments_distilbert",
+        training_args = TrainingArguments(output_dir=self.model_output_dir,
                                           num_train_epochs=epochs,
                                           learning_rate=2e-5,
                                           per_device_train_batch_size=batch_size,
                                           per_device_eval_batch_size=batch_size,
                                           weight_decay=0.01,
                                           evaluation_strategy="epoch",
+                                          save_strategy='epoch',
                                           disable_tqdm=False,
                                           logging_steps=len(train_dataset)// batch_size,
                                           push_to_hub=True,
+                                          load_best_model_at_end=True,
                                           log_level="error")
         self.trainer = Trainer(
             model=self.model,
             args=training_args,
             compute_metrics=self._compute_metrics,
-            train_dataset=train_tok_dataset,
-            eval_dataset=val_tok_dataset,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
             tokenizer=self.tokenizer,
             data_collator=data_collator
         )
         self.trainer.train()
+        self.trainer.push_to_hub(commit_message="Training completed!")
 
     @staticmethod
     def _compute_metrics(pred):
@@ -96,9 +99,9 @@ class TransformersSequenceClassifier:
         return y_preds
 
     @staticmethod
-    def predict_test_data(model_checkpoint, test_data):
+    def predict_test_data(model_checkpoint, test_list: List[str]) -> List:
         pipe_classifier = pipeline("text-classification", model=model_checkpoint)
-        preds = pipe_classifier(test_data, return_all_scores=True)
+        preds = pipe_classifier(test_list, return_all_scores=True)
         return preds
 
 
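For orientation, the refactor above separates tokenization from training: callers tokenize each split through the new tokenize_dataset() helper and pass the already-tokenized datasets into train(), which now saves a checkpoint per epoch, reloads the best one, and pushes the trained model to the Hub. Below is a minimal usage sketch of that flow, condensed from the driver script further down; the toy DataFrames, the output path, and the example texts are illustrative and not part of the commit.

```python
# Illustrative sketch of the refactored API in this commit; paths and data are made up.
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from source.services.predicting_effective_arguments.train.model import TransformersSequenceClassifier

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
clf = TransformersSequenceClassifier(
    model_output_dir="output/predicting_effective_arguments_distilbert",  # hypothetical output dir
    tokenizer=tokenizer,
    model_checkpoint="distilbert-base-uncased",
    num_labels=3,
)

# Toy splits. Dataset.from_pandas keeps a non-default index as '__index_level_0__',
# which tokenize_dataset() removes along with the raw 'inputs' column.
train_df = pd.DataFrame({"inputs": ["claim text", "evidence text"], "label": [0, 1]}, index=[3, 7])
val_df = pd.DataFrame({"inputs": ["rebuttal text"], "label": [2]}, index=[5])
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# New flow: tokenize first, then train on the tokenized splits.
# push_to_hub=True inside train() assumes a logged-in Hugging Face account.
train_tok = clf.tokenize_dataset(dataset=train_dataset)
val_tok = clf.tokenize_dataset(dataset=val_dataset)
clf.train(train_dataset=train_tok, eval_dataset=val_tok, batch_size=2, epochs=1)
```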
 
source/services/predicting_effective_arguments/train/seq_classification.py CHANGED
@@ -7,15 +7,11 @@ from datasets import Dataset, load_metric
 from sklearn.model_selection import train_test_split
 from source.services.predicting_effective_arguments.train.model import TransformersSequenceClassifier
 
-TARGET = 'discourse_effectiveness'
-TEXT = "discourse_text"
-MODEL_CHECKPOINT = "distilbert-base-uncased"
-MODEL_OUTPUT_DIR ='source/services/predicting_effective_arguments/model/hf_textclassification'
 class CFG:
     TARGET = 'discourse_effectiveness'
     TEXT = "discourse_text"
     MODEL_CHECKPOINT = "distilbert-base-uncased"
-    MODEL_OUTPUT_DIR ='source/services/predicting_effective_arguments/model/hf_textclassification'
+    MODEL_OUTPUT_DIR ='source/services/predicting_effective_arguments/model/hf_textclassification/predicting_effective_arguments_distilbert'
     model_name="debertav3base"
     learning_rate=1.5e-5
     weight_decay=0.02
@@ -28,7 +24,6 @@ class CFG:
     save_steps=100
     max_length=512
 
-tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
 
 def seed_everything(seed: int):
     import random, os
@@ -52,17 +47,37 @@ def prepare_input_text(df, sep_token):
 if __name__ == '__main__':
 
     config = CFG()
+    tokenizer = AutoTokenizer.from_pretrained(config.MODEL_CHECKPOINT)
     seqClassifer = TransformersSequenceClassifier(model_output_dir=config.MODEL_OUTPUT_DIR, tokenizer=tokenizer, model_checkpoint="distilbert-base-uncased", num_labels=3) #distilbert-base-uncased
     data = pd.read_csv("data/raw_data/train.csv")[:100]
     test_df = pd.read_csv("data/raw_data/test.csv")
-    train_df, valid_df = train_test_split(data, test_size=0.30, random_state=42)
+    train_size = 0.7
+    valid_size = 0.2
+    test_size = 0.1
+
+    # First split: Separate out the training set
+    train_df, temp_df = train_test_split(data, test_size=1 - train_size)
+
+    # Second split: Separate out the validation and test sets
+    valid_df, test_df = train_test_split(temp_df, test_size=test_size / (test_size + valid_size))
+
 
     train_df = prepare_input_text(train_df, sep_token=tokenizer.sep_token)
     valid_df = prepare_input_text(valid_df, sep_token=tokenizer.sep_token)
+    test_df = prepare_input_text(test_df, sep_token=tokenizer.sep_token)
 
-    train_dataset = Dataset.from_pandas(train_df[['inputs', TARGET]]).rename_column(TARGET, 'label').class_encode_column("label")
-    val_dataset = Dataset.from_pandas(valid_df[['inputs', TARGET]]).rename_column(TARGET, 'label').class_encode_column("label")
-    seqClassifer.train(train_dataset=train_dataset, eval_dataset=val_dataset, epochs=2, batch_size=64)
+    train_dataset = Dataset.from_pandas(train_df[['inputs', config.TARGET]]).rename_column(config.TARGET, 'label').class_encode_column("label")
+    val_dataset = Dataset.from_pandas(valid_df[['inputs', config.TARGET]]).rename_column(config.TARGET, 'label').class_encode_column("label")
+    test_dataset = Dataset.from_pandas(test_df[['inputs', config.TARGET]]).rename_column(config.TARGET, 'label').class_encode_column("label")
+
+    train_tok_dataset = seqClassifer.tokenize_dataset(dataset=train_dataset)
+    val_tok_dataset = seqClassifer.tokenize_dataset(dataset=val_dataset)
+    test_tok_dataset = seqClassifer.tokenize_dataset(dataset=test_dataset)
+
+    seqClassifer.train(train_dataset=train_tok_dataset, eval_dataset=val_tok_dataset, epochs=1, batch_size=16)
+    y_pred = seqClassifer.predict_valid_data(val_tok_dataset)
+    seqClassifer.predict_test_data(model_checkpoint=config.MODEL_OUTPUT_DIR, test_list=test_df['inputs'].tolist())
+    pass
 
     """
     train_df[TARGET].value_counts(ascending=True).plot.barh()
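One note on the two-stage split introduced above: the first train_test_split holds out 30% of the rows, and the second divides that 30% in a 2:1 ratio, so the final proportions are roughly 70/20/10 for train/valid/test. A quick self-contained check of the arithmetic (synthetic 100-row frame; random_state is added here for reproducibility, whereas the committed code no longer sets one):

```python
# Sanity check of the nested train_test_split proportions used above (synthetic data).
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.DataFrame({"discourse_text": [f"text {i}" for i in range(100)]})
train_size, valid_size, test_size = 0.7, 0.2, 0.1

# First split: hold out 1 - 0.7 = 30% of the rows.
train_df, temp_df = train_test_split(data, test_size=1 - train_size, random_state=42)

# Second split: 0.1 / (0.1 + 0.2) = 1/3 of the remainder becomes the test set.
valid_df, test_df = train_test_split(temp_df, test_size=test_size / (test_size + valid_size), random_state=42)

print(len(train_df), len(valid_df), len(test_df))  # -> 70 20 10
```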