import torch
import pandas as pd

from torch.optim import AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset
from transformers import AutoTokenizer, TrainingArguments, Trainer
from transformers import RobertaConfig, RobertaForSequenceClassification

from newhead import NewClassificationHead
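
# `newhead` is a local module that is not shown here. The assumption is that NewClassificationHead
# follows the same interface as transformers' RobertaClassificationHead: it is built from the model
# config and maps the <s> (CLS) token hidden state to `config.num_labels` logits. A minimal sketch
# of such a head (hypothetical, for reference only; the imported class is what actually runs):
#
#     class NewClassificationHead(torch.nn.Module):
#         def __init__(self, config):
#             super().__init__()
#             self.dense = torch.nn.Linear(config.hidden_size, config.hidden_size)
#             self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
#             self.out_proj = torch.nn.Linear(config.hidden_size, config.num_labels)
#
#         def forward(self, features, **kwargs):
#             x = features[:, 0, :]  # hidden state of the <s> (CLS) token
#             x = self.dropout(x)
#             x = torch.tanh(self.dense(x))
#             x = self.dropout(x)
#             return self.out_proj(x)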

def preprocess_data(df):
    """
    Preprocess the data by renaming columns, removing rows with missing values, and removing extra spaces.
    """
    df = df.rename(columns={'Comment': 'text', 'Emotion': 'label'})
    df = df.dropna()
    df['text'] = df['text'].str.replace('\t', ' ').str.replace(' +', ' ', regex=True).str.strip()
    df['label'] = df['label'].str.replace('\t', ' ').str.replace(' +', ' ', regex=True).str.strip()
    return df

def encode_label(df):
    """
    Encode the labels using LabelEncoder.
    """
    label_encoder = LabelEncoder()
    df['label'] = label_encoder.fit_transform(df['label'])
    return df

def generate_dataset(df, test_size=0.2):
    """
    Convert the DataFrame into train/test Datasets that can be used with transformers.
    """
    return Dataset.from_pandas(df, preserve_index=False).train_test_split(test_size=test_size)

def tokenize(batch):
    """Tokenize a batch of texts, padding and truncating to the model's maximum length."""
    return tokenizer(batch['text'], padding='max_length', truncation=True)


def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 from the Trainer's predictions."""
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Define model and training arguments
model_name = "cardiffnlp/twitter-roberta-base-emotion"
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = RobertaConfig.from_pretrained(model_name, num_labels=3)  # 3 emotion classes in this dataset
model = RobertaForSequenceClassification.from_pretrained(model_name, config=config, ignore_mismatched_sizes=True)  # the checkpoint's original head has a different label count, so its weights are discarded
model.classifier = NewClassificationHead(config)  # swap in the custom classification head

df = pd.read_csv('Emotion_classify_Data.csv')
df = preprocess_data(df)
df = encode_label(df)
ds = generate_dataset(df)
ds = ds.map(tokenize, batched=True)
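
# Optional sanity check: inspect the tokenized splits to verify the columns the Trainer will
# receive (expected here: 'text', 'label', 'input_ids', 'attention_mask').
print(ds)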


### Transfer learning, stage 1: train only the new head
# Freeze all layers first
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the classifier layer
for param in model.classifier.parameters():
    param.requires_grad = True
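
# Optional sanity check: confirm that only the classification head will be updated in this
# first stage by counting trainable vs. total parameters.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {trainable_params:,} / {total_params:,}")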


# Define different learning rates
head_lr = 3e-4  # higher learning rate for the new head
base_lr = head_lr / 5  # lower learning rate for the base layers (no effect while they are frozen)

# Group parameters and set learning rates
optimizer_grouped_parameters = [
    {'params': model.classifier.parameters(), 'lr': head_lr},
    {'params': [p for n, p in model.named_parameters() if 'classifier' not in n], 'lr': base_lr}
]

optimizer = AdamW(optimizer_grouped_parameters)

## Training arguments
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=10,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',
    save_strategy="no",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),  # None lets the Trainer build its default LR scheduler for this optimizer
    compute_metrics=compute_metrics
)


## Train the head of the model
trainer.train()


### Transfer learning, stage 2: fine-tune the whole model
# Unfreeze all layers
for param in model.parameters():
    param.requires_grad = True

head_lr = 1e-4  # slightly lower learning rate for the head
base_lr = 5e-6  # much lower learning rate for the base layers

optimizer_grouped_parameters = [
    {'params': model.classifier.parameters(), 'lr': head_lr},
    {'params': [p for n, p in model.named_parameters() if 'classifier' not in n], 'lr': base_lr}
]

## Train the entire model with the new discriminative learning rates
optimizer = AdamW(optimizer_grouped_parameters)
trainer.optimizer = optimizer   # hand the new optimizer to the existing Trainer
trainer.lr_scheduler = None     # force a fresh LR scheduler to be built for the new optimizer

training_args.num_train_epochs = 5  # additional epochs for full fine-tuning
trainer.train()

model.save_pretrained('transferLearningResults')
tokenizer.save_pretrained('transferLearningResults')
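
# To reuse the fine-tuned model later, it can be reloaded from the saved directory, e.g.:
#
#     reloaded_tokenizer = AutoTokenizer.from_pretrained('transferLearningResults')
#     reloaded = RobertaForSequenceClassification.from_pretrained('transferLearningResults')
#
# Note (assumption): this plain reload only restores the trained head weights if NewClassificationHead
# uses the same parameter names and shapes as the default RoBERTa classification head; otherwise the
# custom head has to be re-attached and the saved state dict loaded into it manually.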