In [1]:
! pip install pandas
! pip install scikit-learn
! pip install datasets
! pip install transformers
! pip install transformers[torch]
! pip install accelerate -U

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu:
Collecting pandas
  Downloading pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
     |████████████████████████████████| 12.4 MB 9.3 MB/s            
Collecting tzdata>=2022.1
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
     |████████████████████████████████| 341 kB 89.1 MB/s            
Installing collected packages: tzdata, pandas
Successfully installed pandas-2.0.3 tzdata-2023.3
Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu:
Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.1 MB)
     |████████████████████████████████| 11.1 MB 9.1 MB/s            
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.2.0-py3-none-any.whl (15 kB)
Collecting joblib>=1.1.1
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
     |██████████████████████████████

In [2]:
import torch
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import RobertaConfig, RobertaForSequenceClassification
from transformers import AdamW

# Define a new classification head
class NewClassificationHead(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = torch.nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = torch.nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.nn.functional.relu(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

def preprocess_data(df):
    ## rename columns
    df = df.rename(columns={'Comment': 'text', 'Emotion': 'label'})

    ## remove rows with missing values
    df = df.dropna()
    df['text'] = df['text'].str.replace('\t', ' ') # Remove extra spaces - this line replaces any occurrence of two or more spaces with a single spac
    df['text'] = df['text'].str.replace(' +', ' ', regex=True) # Remove extra spaces - this line replaces any occurrence of two or more spaces with a single space
    df['text'] = df['text'].str.strip() # Remove extra spaces - this line replaces any occurrence of two or more spaces with a single space

    df['label'] = df['label'].str.replace('\t', ' ') # Remove extra spaces - this line replaces any occurrence of two or more spaces with a single spac
    df['label'] = df['label'].str.replace(' +', ' ', regex=True) # Remove extra spaces - this line replaces any occurrence of two or more spaces with a single space
    df['label'] = df['label'].str.strip() # Remove extra spaces - this line replaces any occurrence of two or more spaces with a single space 

    return df

def encode_label(df):
    le = LabelEncoder()
    df['label'] = le.fit_transform(df['label'])
    label_mapping = {label: index for index, label in enumerate(le.classes_)}
    df['label'].map(label_mapping)
    return df

def generate_dataset(df, test_size=0.2):
    """
    Convert to transformers dataset and split into train and test
    """
    dataset = Dataset.from_pandas(df)
    ds = dataset.train_test_split(test_size=test_size)
    return ds

def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True)


def compute_metrics(pred):
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Define model and training arguments
model_name = "cardiffnlp/twitter-roberta-base-emotion"
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = RobertaConfig.from_pretrained(model_name, num_labels=3)  # Set the number of labels to 3
model = RobertaForSequenceClassification.from_pretrained(model_name, config=config, ignore_mismatched_sizes=True)
model.classifier = NewClassificationHead(config)

df = pd.read_csv('Emotion_classify_Data.csv')
df = preprocess_data(df)
df = encode_label(df)
ds = generate_dataset(df)
ds = ds.map(tokenize, batched=True)



config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4749 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/1188 [00:00<?, ? examples/s]

In [3]:
# Freeze all layers first
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the classifier layer
for param in model.classifier.parameters():
    param.requires_grad = True


# Define different learning rates
head_lr = 3e-4  # Higher learning rate for the head
base_lr = head_lr/5 # Lower learning rate for the base layers

# Group parameters and set learning rates
optimizer_grouped_parameters = [
    {'params': model.classifier.parameters(), 'lr': head_lr},
    {'params': [p for n, p in model.named_parameters() if 'classifier' not in n], 'lr': base_lr}
]

optimizer = AdamW(optimizer_grouped_parameters)



In [4]:
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=10,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',
    save_strategy="no",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),  # No need to pass a learning rate scheduler if you're managing learning rates manually,
    compute_metrics=compute_metrics
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [5]:
 trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.6781
1000,0.5377
1500,0.5149
2000,0.4745
2500,0.4505


TrainOutput(global_step=2970, training_loss=0.516797270598235, metrics={'train_runtime': 517.0884, 'train_samples_per_second': 91.841, 'train_steps_per_second': 5.744, 'total_flos': 1128914327325078.0, 'train_loss': 0.516797270598235, 'epoch': 10.0})

In [6]:
trainer.evaluate()

{'eval_loss': 0.4612630307674408,
 'eval_accuracy': 0.8181818181818182,
 'eval_f1': 0.8180812962482343,
 'eval_precision': 0.8186808374254468,
 'eval_recall': 0.8181818181818182,
 'eval_runtime': 13.0807,
 'eval_samples_per_second': 90.821,
 'eval_steps_per_second': 1.453,
 'epoch': 10.0}

In [7]:
for param in model.parameters():
    param.requires_grad = True

    
head_lr = 1e-4  # Slightly lower learning rate for the head
base_lr = 5e-6  # Much lower learning rate for the base layers

optimizer_grouped_parameters = [
    {'params': model.classifier.parameters(), 'lr': head_lr},
    {'params': [p for n, p in model.named_parameters() if 'classifier' not in n], 'lr': base_lr}
]

optimizer = AdamW(optimizer_grouped_parameters)

training_args.num_train_epochs = 5  # Set the number of additional epochs
trainer.train()



Step,Training Loss
500,0.2532
1000,0.105


TrainOutput(global_step=1485, training_loss=0.13263646263867515, metrics={'train_runtime': 1037.0165, 'train_samples_per_second': 22.897, 'train_steps_per_second': 1.432, 'total_flos': 563885457261714.0, 'train_loss': 0.13263646263867515, 'epoch': 5.0})

In [10]:
trainer.evaluate()

{'eval_loss': 0.2423660308122635,
 'eval_accuracy': 0.9671717171717171,
 'eval_f1': 0.9671861840444216,
 'eval_precision': 0.9672086987568536,
 'eval_recall': 0.9671717171717171,
 'eval_runtime': 12.2384,
 'eval_samples_per_second': 97.071,
 'eval_steps_per_second': 1.552,
 'epoch': 5.0}

In [13]:
model.save_pretrained('transferLearningResults')
tokenizer.save_pretrained('transferLearningResults')

('transferLearningResults/tokenizer_config.json',
 'transferLearningResults/special_tokens_map.json',
 'transferLearningResults/vocab.json',
 'transferLearningResults/merges.txt',
 'transferLearningResults/added_tokens.json',
 'transferLearningResults/tokenizer.json')