GeminiFan207 committed
Commit 0ed8f16 · verified
1 Parent(s): 651dc30

Create utilis.py

Files changed (1)
  1. utilis.py +59 -0
utilis.py ADDED
@@ -0,0 +1,59 @@
import os

from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset


def load_model_and_tokenizer(model_name):
    """
    Load a causal language model and its tokenizer from the Hugging Face Hub.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    return model, tokenizer


def load_and_tokenize_dataset(dataset_name, tokenizer, max_length=512):
    """
    Load a dataset and tokenize its "text" column with padding and truncation.
    """
    dataset = load_dataset(dataset_name)

    def tokenize_function(examples):
        # Pad/truncate every example to a fixed length so batches are uniform.
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_length)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    return tokenized_datasets


def setup_training_args(output_dir="./results", per_device_train_batch_size=2, per_device_eval_batch_size=2,
                        gradient_accumulation_steps=8, num_train_epochs=3, learning_rate=5e-5, weight_decay=0.01,
                        warmup_steps=500, logging_steps=100, fp16=True):
    """
    Set up TrainingArguments for the Trainer.
    """
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",  # evaluate at the end of each epoch
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,  # effective batch = batch size x accumulation steps
        num_train_epochs=num_train_epochs,
        save_strategy="epoch",  # checkpoint at the end of each epoch
        save_total_limit=2,  # keep only the two most recent checkpoints
        logging_dir="./logs",
        logging_steps=logging_steps,
        report_to="none",  # disable W&B/TensorBoard reporting
        fp16=fp16,  # mixed precision; requires a CUDA GPU
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_steps=warmup_steps,
        dataloader_num_workers=4,
        push_to_hub=False
    )
    return training_args


def save_model_and_tokenizer(model, tokenizer, save_dir):
    """
    Save the fine-tuned model and tokenizer to a local directory.
    """
    os.makedirs(save_dir, exist_ok=True)
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)
    print(f"Model and tokenizer saved at {save_dir}")