import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import RobertaConfig, RobertaForSequenceClassification
from torch.optim import AdamW  # transformers.AdamW is deprecated; use the PyTorch implementation
from newhead import NewClassificationHead  # local module providing the custom classification head (not shown here)
def preprocess_data(df):
    """
    Preprocess the data by renaming columns, removing rows with missing values, and removing extra spaces.
    """
    df = df.rename(columns={'Comment': 'text', 'Emotion': 'label'})
    df = df.dropna()
    df['text'] = df['text'].str.replace('\t', ' ').str.replace(' +', ' ', regex=True).str.strip()
    df['label'] = df['label'].str.replace('\t', ' ').str.replace(' +', ' ', regex=True).str.strip()
    return df
def encode_label(df):
    """
    Encode the labels using LabelEncoder.
    """
    label_encoder = LabelEncoder()
    df['label'] = label_encoder.fit_transform(df['label'])
    return df
def generate_dataset(df, test_size=0.2):
    """
    Convert the DataFrame into a train/test DatasetDict that can be used with transformers.
    """
    return Dataset.from_pandas(df).train_test_split(test_size=test_size)
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True)
def compute_metrics(pred):
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
# Define model and training arguments
model_name = "cardiffnlp/twitter-roberta-base-emotion"
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = RobertaConfig.from_pretrained(model_name, num_labels=3)  # Set the number of labels to 3
model = RobertaForSequenceClassification.from_pretrained(model_name, config=config, ignore_mismatched_sizes=True)
model.classifier = NewClassificationHead(config)  # Swap in the custom classification head
df = pd.read_csv('Emotion_classify_Data.csv')
df = preprocess_data(df)
df = encode_label(df)
ds = generate_dataset(df)  # DatasetDict with 'train' and 'test' splits
ds = ds.map(tokenize, batched=True)
### Transfer Learning First
# Freeze all layers first
for param in model.parameters():
    param.requires_grad = False
# Unfreeze the classifier layer
for param in model.classifier.parameters():
    param.requires_grad = True
# Define different learning rates
head_lr = 3e-4  # Higher learning rate for the head
base_lr = head_lr / 5  # Lower learning rate for the base layers
# Group parameters and set learning rates
# (the base parameters are still frozen at this stage, so only the classifier group is actually updated)
optimizer_grouped_parameters = [
    {'params': model.classifier.parameters(), 'lr': head_lr},
    {'params': [p for n, p in model.named_parameters() if 'classifier' not in n], 'lr': base_lr}
]
optimizer = AdamW(optimizer_grouped_parameters)
## Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    save_strategy="no",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),  # Passing None lets the Trainer build its default scheduler for this optimizer
    compute_metrics=compute_metrics
)
## Train the head of the model
trainer.train()
## Unfreeze all layers
for param in model.parameters():
    param.requires_grad = True
head_lr = 1e-4  # Slightly lower learning rate for the head
base_lr = 5e-6  # Much lower learning rate for the base layers
optimizer_grouped_parameters = [
    {'params': model.classifier.parameters(), 'lr': head_lr},
    {'params': [p for n, p in model.named_parameters() if 'classifier' not in n], 'lr': base_lr}
]
## Train the entire model
optimizer = AdamW(optimizer_grouped_parameters)
trainer.optimizer = optimizer  # Hand the new optimizer to the existing Trainer (otherwise it would reuse the old one)
trainer.lr_scheduler = None  # Force the Trainer to build a fresh scheduler for the second run
training_args.num_train_epochs = 5  # Set the number of additional epochs
trainer.train()
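
# Not part of the original script: an optional held-out evaluation after full fine-tuning.
# This is a sketch that only uses the public Trainer.evaluate() API with the eval_dataset
# and compute_metrics already configured above; treat it as a sanity check, not a required step.
eval_metrics = trainer.evaluate()
print(eval_metrics)
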
model.save_pretrained('transferLearningResults')
tokenizer.save_pretrained('transferLearningResults')
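
# Usage sketch (not in the original script): classify a new comment with the fine-tuned
# model still held in memory. The sample text is made up for illustration, and the
# predicted class id corresponds to the integer encoding produced by the LabelEncoder above.
model.eval()
sample = "I can't believe how well this turned out!"
inputs = tokenizer(sample, return_tensors='pt', truncation=True, padding=True).to(model.device)
with torch.no_grad():
    logits = model(**inputs).logits
pred_id = logits.argmax(dim=-1).item()
print(f"Predicted label id: {pred_id}")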