aimlnerd committed on
Commit 440014c · 1 Parent(s): cb09873

add trainer

source/services/predicting_effective_arguments/train/model.py CHANGED
@@ -23,14 +23,14 @@ class TransformersSequenceClassifier:
         self.model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels).to(device)
 
     def tokenizer_func(self, batch):
-        return self.tokenizer(batch["text"], truncation=True, max_len=386)
+        return self.tokenizer(batch["inputs"], truncation=True) #, max_len=386
 
     def train(self, train_dataset, eval_dataset, epochs=2, batch_size=64):
 
         train_tok_dataset = train_dataset.map(self.tokenizer_func, batched=True, remove_columns=('inputs', '__index_level_0__'))
         val_tok_dataset = eval_dataset.map(self.tokenizer_func, batched=True, remove_columns=('inputs', '__index_level_0__'))
         data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer, padding='longest')
-        training_args = TrainingArguments(output_dir=self.model_output_dir,
+        training_args = TrainingArguments(output_dir=f"{self.model_output_dir}/predicting_effective_arguments_distilbert",
                                           num_train_epochs=epochs,
                                           learning_rate=2e-5,
                                           per_device_train_batch_size=batch_size,
@@ -38,7 +38,7 @@ class TransformersSequenceClassifier:
                                           weight_decay=0.01,
                                           evaluation_strategy="epoch",
                                           disable_tqdm=False,
-                                          logging_steps=len(train_dataset) // batch_size,
+                                          logging_steps=len(train_dataset)// batch_size,
                                           push_to_hub=True,
                                           log_level="error")
         self.trainer = Trainer(
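Note on this hunk: the commit fixes two things in tokenizer_func. The batch column is now "inputs" (matching the remove_columns tuple in train), and the fixed-length cap is dropped. The removed keyword was misspelled anyway: the tokenizer __call__ API takes max_length, not max_len, so the 386 cap was most likely never applied. With truncation=True alone, sequences are cut at the model's own limit, and DataCollatorWithPadding(padding='longest') pads each batch dynamically. A minimal, self-contained sketch of the same pipeline, assuming a DataFrame with "inputs" and "label" columns; the toy data and the output_dir value are stand-ins, not from the repo:

import pandas as pd
from datasets import Dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          DataCollatorWithPadding, Trainer, TrainingArguments)

checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

def tokenizer_func(batch):
    # truncation=True caps sequences at the model's own limit (512 for
    # DistilBERT); padding is deferred to the collator, one batch at a time.
    return tokenizer(batch["inputs"], truncation=True)

# Toy stand-in for the real train/validation DataFrames.
df = pd.DataFrame({"inputs": ["first argument", "second argument"] * 4,
                   "label": [0, 1, 2, 0, 1, 2, 0, 1]})
dataset = Dataset.from_pandas(df).map(tokenizer_func, batched=True)

# padding='longest' pads each batch only up to its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

training_args = TrainingArguments(output_dir="out/demo",  # stand-in path
                                  num_train_epochs=1,
                                  per_device_train_batch_size=4,
                                  evaluation_strategy="epoch")
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=dataset,
                  eval_dataset=dataset,
                  data_collator=data_collator,
                  tokenizer=tokenizer)
trainer.train()

Deferring padding to the collator keeps short batches short, which is cheaper than padding every example to a global maximum length up front.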
source/services/predicting_effective_arguments/train/seq_classification.py CHANGED
@@ -52,8 +52,8 @@ def prepare_input_text(df, sep_token):
 if __name__ == '__main__':
 
     config = CFG()
-    seqClassifer = TransformersSequenceClassifier(model_output_dir=config.MODEL_OUTPUT_DIR, tokenizer=tokenizer, model_checkpoint="distilbert-base-uncased", num_labels=3)
-    data = pd.read_csv("data/raw_data/train.csv")
+    seqClassifer = TransformersSequenceClassifier(model_output_dir=config.MODEL_OUTPUT_DIR, tokenizer=tokenizer, model_checkpoint="distilbert-base-uncased", num_labels=3) #distilbert-base-uncased
+    data = pd.read_csv("data/raw_data/train.csv")[:100]
     test_df = pd.read_csv("data/raw_data/test.csv")
     train_df, valid_df = train_test_split(data, test_size=0.30, random_state=42)
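Note on this hunk: the [:100] slice caps the training data at 100 rows, which reads as a smoke-test setting rather than a real run. One thing the split leaves on the table: train_test_split is called without stratify, so a small sample can end up with skewed class proportions across the three effectiveness labels. A minimal sketch of a stratified version, assuming a "label" column; the toy frame stands in for the real CSV:

import pandas as pd
from sklearn.model_selection import train_test_split

# Stand-in for pd.read_csv("data/raw_data/train.csv")[:100].
data = pd.DataFrame({"inputs": [f"argument {i}" for i in range(10)],
                     "label": [i % 3 for i in range(10)]})

# Same 70/30 split as the script, plus stratify so all three label
# classes stay proportionally represented in both halves.
train_df, valid_df = train_test_split(data, test_size=0.30, random_state=42,
                                      stratify=data["label"])
print(len(train_df), len(valid_df))  # 7 3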