GabrielSalem committed on
Commit
2ad819f
·
verified ·
1 Parent(s): f6dd965

Delete utils.py

Browse files
Files changed (1) hide show
  1. utils.py +0 -28
utils.py DELETED
@@ -1,28 +0,0 @@
1
- from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
2
- from datasets import Dataset
3
-
4
def preprocess_data(df, tokenizer, max_length=512):
    """Turn a question/answer DataFrame into a tokenized ``Dataset``.

    Each row is rendered as ``"Question: <Question> Answer: <Answer>"`` in a
    ``text`` column, the frame is converted with ``Dataset.from_pandas``, and
    the text is tokenized in batches with truncation and padding to
    ``max_length``.

    Args:
        df: DataFrame with ``Question`` and ``Answer`` columns. It is left
            untouched; the ``text`` column is added to a copy.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding="max_length",
            max_length=max_length)`` on each batch of texts.
        max_length: Length passed to the tokenizer for truncation/padding.
            Defaults to 512.

    Returns:
        The dataset produced by ``Dataset.map`` over the tokenizer.

    Raises:
        KeyError: If ``df`` lacks the ``Question`` or ``Answer`` column.
    """
    missing = [col for col in ("Question", "Answer") if col not in df.columns]
    if missing:
        raise KeyError(f"DataFrame is missing required column(s): {missing}")

    # Build the prompts column-wise instead of with a row-wise ``apply`` so an
    # empty frame simply yields an empty ``text`` column.
    texts = [
        f"Question: {question} Answer: {answer}"
        for question, answer in zip(df["Question"], df["Answer"])
    ]
    # ``assign`` returns a new frame, so the caller's DataFrame is not mutated.
    frame = df.assign(text=texts)

    def tokenize(batch):
        # NOTE(review): padding="max_length" presumably needs the tokenizer to
        # define a pad token -- confirm for the tokenizer in use.
        return tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )

    return Dataset.from_pandas(frame).map(tokenize, batched=True)
9
-
10
def train_model(
    model,
    tokenizer,
    dataset,
    output_dir,
    *,
    batch_size=4,
    num_epochs=1,
    logging_dir="./logs",
    save_steps=10,
    logging_steps=10,
):
    """Train ``model`` on ``dataset`` with ``Trainer`` and save the result.

    Builds ``TrainingArguments`` from the given settings, trains with a
    ``DataCollatorForLanguageModeling`` configured with ``mlm=False``, then
    writes both the model and the tokenizer to ``output_dir`` via their
    ``save_pretrained`` methods.

    Args:
        model: Model handed to ``Trainer`` and saved afterwards.
        tokenizer: Tokenizer used by the data collator and saved afterwards.
        dataset: Training dataset passed as ``train_dataset``.
        output_dir: Directory used for ``TrainingArguments.output_dir`` and
            for the final ``save_pretrained`` calls.
        batch_size: Value for ``per_device_train_batch_size``. Defaults to 4.
        num_epochs: Value for ``num_train_epochs``. Defaults to 1.
        logging_dir: Value for ``logging_dir``. Defaults to ``"./logs"``.
        save_steps: Value for ``save_steps``. Defaults to 10.
        logging_steps: Value for ``logging_steps``. Defaults to 10.

    Returns:
        None.
    """
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=batch_size,
        num_train_epochs=num_epochs,
        logging_dir=logging_dir,
        save_steps=save_steps,
        logging_steps=logging_steps,
    )
    # mlm=False turns off masked-language-model mode in the collator.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
    )
    trainer.train()

    # Save model and tokenizer side by side in the same output directory.
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)