import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import RobertaConfig, RobertaForSequenceClassification
from torch.optim import AdamW  # use torch's AdamW; transformers' AdamW is deprecated/removed in recent releases
from newhead import NewClassificationHead
def preprocess_data(df):
    """
    Preprocess the data by renaming columns, removing rows with missing values, and removing extra spaces.
    """
    df = df.rename(columns={'Comment': 'text', 'Emotion': 'label'})
    df = df.dropna()
    df['text'] = df['text'].str.replace('\t', ' ').str.replace(' +', ' ', regex=True).str.strip()
    df['label'] = df['label'].str.replace('\t', ' ').str.replace(' +', ' ', regex=True).str.strip()
    return df
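# The input CSV is expected to provide 'Comment' and 'Emotion' columns, which are
# renamed to the 'text'/'label' names that the tokenizer and Trainer use below.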
def encode_label(df):
    """
    Encode the labels using LabelEncoder.
    """
    label_encoder = LabelEncoder()
    df['label'] = label_encoder.fit_transform(df['label'])
    return df
def generate_dataset(df, test_size=0.2):
    """
    Convert the DataFrame into a train/test DatasetDict that can be used with transformers.
    """
    return Dataset.from_pandas(df).train_test_split(test_size=test_size)
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True)
def compute_metrics(pred):
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
# Define model and training arguments
model_name = "cardiffnlp/twitter-roberta-base-emotion"
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = RobertaConfig.from_pretrained(model_name, num_labels=3) # Set the number of labels to 3
model = RobertaForSequenceClassification.from_pretrained(model_name, config=config, ignore_mismatched_sizes=True)
model.classifier = NewClassificationHead(config)
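# NewClassificationHead comes from the local newhead module; it is assumed to mirror the
# interface of RobertaClassificationHead (mapping pooled hidden states to num_labels logits).
# Because the classifier is replaced, the head starts from fresh weights, which is why it is
# trained on its own before the base model is unfrozen.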
df = pd.read_csv('Emotion_classify_Data.csv')
df = preprocess_data(df)
df = encode_label(df)
ds = generate_dataset(df)
ds = ds.map(tokenize, batched=True)
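# Batched tokenization adds 'input_ids' and 'attention_mask' columns to both splits; the
# Trainer drops columns it does not need (such as 'text') since remove_unused_columns
# defaults to True.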
### Transfer learning: train the new head first
# Freeze all layers first
for param in model.parameters():
    param.requires_grad = False
# Unfreeze the classifier layer
for param in model.classifier.parameters():
    param.requires_grad = True
# Define different learning rates
head_lr = 3e-4 # Higher learning rate for the head
base_lr = head_lr/5 # Lower learning rate for the base layers
# Group parameters and set learning rates
optimizer_grouped_parameters = [
    {'params': model.classifier.parameters(), 'lr': head_lr},
    {'params': [p for n, p in model.named_parameters() if 'classifier' not in n], 'lr': base_lr}
]
optimizer = AdamW(optimizer_grouped_parameters)
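# With the base layers frozen, the second parameter group receives no gradients in this
# phase, so only the classifier group is actually updated; the group is included here so
# the layout matches the full fine-tuning phase below.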
## Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    save_strategy="no",
)
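# save_strategy="no" disables intermediate checkpoints; the final model and tokenizer are
# saved explicitly with save_pretrained at the end of the script.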
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),  # pass the custom optimizer; with None the Trainer builds its default scheduler over the per-group learning rates
    compute_metrics=compute_metrics
)
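# Note: no evaluation strategy is set, so evaluation does not run automatically during
# training; compute_metrics is only used when trainer.evaluate() is called explicitly.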
## Train the head of the model
trainer.train()
## Unfreeze all layers
for param in model.parameters():
    param.requires_grad = True
head_lr = 1e-4 # Slightly lower learning rate for the head
base_lr = 5e-6 # Much lower learning rate for the base layers
optimizer_grouped_parameters = [
    {'params': model.classifier.parameters(), 'lr': head_lr},
    {'params': [p for n, p in model.named_parameters() if 'classifier' not in n], 'lr': base_lr}
]
## Train the entire model
optimizer = AdamW(optimizer_grouped_parameters)
trainer.optimizer = optimizer  # hand the new optimizer to the existing Trainer so the second run uses it
training_args.num_train_epochs = 5  # Set the number of additional epochs
trainer.train()
model.save_pretrained('transferLearningResults')
tokenizer.save_pretrained('transferLearningResults')