Spaces: Runtime error
add trainer
Browse files
source/services/predicting_effective_arguments/train/model.py
CHANGED
@@ -23,14 +23,14 @@ class TransformersSequenceClassifier:
|
|
23 |
self.model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels).to(device)
|
24 |
|
25 |
def tokenizer_func(self, batch):
|
26 |
-
return self.tokenizer(batch["… [remainder of removed line lost in diff extraction]
|
27 |
|
28 |
def train(self, train_dataset, eval_dataset, epochs=2, batch_size=64):
|
29 |
|
30 |
train_tok_dataset = train_dataset.map(self.tokenizer_func, batched=True, remove_columns=('inputs', '__index_level_0__'))
|
31 |
val_tok_dataset = eval_dataset.map(self.tokenizer_func, batched=True, remove_columns=('inputs', '__index_level_0__'))
|
32 |
data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer, padding='longest')
|
33 |
-
training_args = TrainingArguments(output_dir=self.model_output_dir,
|
34 |
num_train_epochs=epochs,
|
35 |
learning_rate=2e-5,
|
36 |
per_device_train_batch_size=batch_size,
|
@@ -38,7 +38,7 @@ class TransformersSequenceClassifier:
|
|
38 |
weight_decay=0.01,
|
39 |
evaluation_strategy="epoch",
|
40 |
disable_tqdm=False,
|
41 |
-
logging_steps=len(train_dataset)
|
42 |
push_to_hub=True,
|
43 |
log_level="error")
|
44 |
self.trainer = Trainer(
|
|
|
23 |
self.model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels).to(device)
|
24 |
|
25 |
def tokenizer_func(self, batch):
|
26 |
+
return self.tokenizer(batch["inputs"], truncation=True) #, max_len=386
|
27 |
|
28 |
def train(self, train_dataset, eval_dataset, epochs=2, batch_size=64):
|
29 |
|
30 |
train_tok_dataset = train_dataset.map(self.tokenizer_func, batched=True, remove_columns=('inputs', '__index_level_0__'))
|
31 |
val_tok_dataset = eval_dataset.map(self.tokenizer_func, batched=True, remove_columns=('inputs', '__index_level_0__'))
|
32 |
data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer, padding='longest')
|
33 |
+
training_args = TrainingArguments(output_dir=f"{self.model_output_dir}/predicting_effective_arguments_distilbert",
|
34 |
num_train_epochs=epochs,
|
35 |
learning_rate=2e-5,
|
36 |
per_device_train_batch_size=batch_size,
|
|
|
38 |
weight_decay=0.01,
|
39 |
evaluation_strategy="epoch",
|
40 |
disable_tqdm=False,
|
41 |
+
logging_steps=len(train_dataset)// batch_size,
|
42 |
push_to_hub=True,
|
43 |
log_level="error")
|
44 |
self.trainer = Trainer(
|
source/services/predicting_effective_arguments/train/seq_classification.py
CHANGED
@@ -52,8 +52,8 @@ def prepare_input_text(df, sep_token):
|
|
52 |
if __name__ == '__main__':
|
53 |
|
54 |
config = CFG()
|
55 |
-
seqClassifer = TransformersSequenceClassifier(model_output_dir=config.MODEL_OUTPUT_DIR, tokenizer=tokenizer, model_checkpoint="distilbert-base-uncased", num_labels=3)
|
56 |
-
data = pd.read_csv("data/raw_data/train.csv")
|
57 |
test_df = pd.read_csv("data/raw_data/test.csv")
|
58 |
train_df, valid_df = train_test_split(data, test_size=0.30, random_state=42)
|
59 |
|
|
|
52 |
if __name__ == '__main__':
|
53 |
|
54 |
config = CFG()
|
55 |
+
seqClassifer = TransformersSequenceClassifier(model_output_dir=config.MODEL_OUTPUT_DIR, tokenizer=tokenizer, model_checkpoint="distilbert-base-uncased", num_labels=3) #distilbert-base-uncased
|
56 |
+
data = pd.read_csv("data/raw_data/train.csv")[:100]
|
57 |
test_df = pd.read_csv("data/raw_data/test.csv")
|
58 |
train_df, valid_df = train_test_split(data, test_size=0.30, random_state=42)
|
59 |
|