import torch
import pandas as pd

from torch.optim import AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset
from transformers import AutoTokenizer, TrainingArguments, Trainer
from transformers import RobertaConfig, RobertaForSequenceClassification

from newhead import NewClassificationHead
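
# `newhead` is a local module that is not shown here. The assumption is that NewClassificationHead
# follows the same interface as transformers' RobertaClassificationHead: it is built from the model
# config and maps the <s> (CLS) token hidden state to `config.num_labels` logits. A minimal sketch
# of such a head (hypothetical, for reference only; the imported class is what actually runs):
#
#     class NewClassificationHead(torch.nn.Module):
#         def __init__(self, config):
#             super().__init__()
#             self.dense = torch.nn.Linear(config.hidden_size, config.hidden_size)
#             self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
#             self.out_proj = torch.nn.Linear(config.hidden_size, config.num_labels)
#
#         def forward(self, features, **kwargs):
#             x = features[:, 0, :]  # hidden state of the <s> (CLS) token
#             x = self.dropout(x)
#             x = torch.tanh(self.dense(x))
#             x = self.dropout(x)
#             return self.out_proj(x)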

def preprocess_data(df):
    """
    Preprocess the data by renaming columns, removing rows with missing values, and removing extra spaces.
    """
    df = df.rename(columns={'Comment': 'text', 'Emotion': 'label'})
    df = df.dropna()
    df['text'] = df['text'].str.replace('\t', ' ').str.replace(' +', ' ', regex=True).str.strip()
    df['label'] = df['label'].str.replace('\t', ' ').str.replace(' +', ' ', regex=True).str.strip()
    return df

def encode_label(df):
    """
    Encode the labels using LabelEncoder.
    """
    label_encoder = LabelEncoder()
    df['label'] = label_encoder.fit_transform(df['label'])
    return df

def generate_dataset(df, test_size=0.2):
    """
    Convert the DataFrame into train/test Datasets that can be used with transformers.
    """
    return Dataset.from_pandas(df, preserve_index=False).train_test_split(test_size=test_size)

def tokenize(batch):
    """Tokenize a batch of texts, padding and truncating to the model's maximum length."""
    return tokenizer(batch['text'], padding='max_length', truncation=True)


def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 from the Trainer's predictions."""
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Define model and training arguments
model_name = "cardiffnlp/twitter-roberta-base-emotion"
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = RobertaConfig.from_pretrained(model_name, num_labels=3)  # 3 emotion classes in this dataset
model = RobertaForSequenceClassification.from_pretrained(model_name, config=config, ignore_mismatched_sizes=True)  # the checkpoint's original head has a different label count, so its weights are discarded
model.classifier = NewClassificationHead(config)  # swap in the custom classification head

df = pd.read_csv('Emotion_classify_Data.csv')
df = preprocess_data(df)
df = encode_label(df)
ds = generate_dataset(df)
ds = ds.map(tokenize, batched=True)
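
# Optional sanity check: inspect the tokenized splits to verify the columns the Trainer will
# receive (expected here: 'text', 'label', 'input_ids', 'attention_mask').
print(ds)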


### Transfer learning, stage 1: train only the new head
# Freeze all layers first
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the classifier layer
for param in model.classifier.parameters():
    param.requires_grad = True
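
# Optional sanity check: confirm that only the classification head will be updated in this
# first stage by counting trainable vs. total parameters.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {trainable_params:,} / {total_params:,}")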


# Define different learning rates
head_lr = 3e-4  # higher learning rate for the new head
base_lr = head_lr / 5  # lower learning rate for the base layers (no effect while they are frozen)

# Group parameters and set learning rates
optimizer_grouped_parameters = [
    {'params': model.classifier.parameters(), 'lr': head_lr},
    {'params': [p for n, p in model.named_parameters() if 'classifier' not in n], 'lr': base_lr}
]

optimizer = AdamW(optimizer_grouped_parameters)

## Training arguments
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=10,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',
    save_strategy="no",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),  # None lets the Trainer build its default LR scheduler for this optimizer
    compute_metrics=compute_metrics
)


## Train the head of the model
trainer.train()


### Transfer learning, stage 2: fine-tune the whole model
# Unfreeze all layers
for param in model.parameters():
    param.requires_grad = True

head_lr = 1e-4  # slightly lower learning rate for the head
base_lr = 5e-6  # much lower learning rate for the base layers

optimizer_grouped_parameters = [
    {'params': model.classifier.parameters(), 'lr': head_lr},
    {'params': [p for n, p in model.named_parameters() if 'classifier' not in n], 'lr': base_lr}
]

## Train the entire model with the new discriminative learning rates
optimizer = AdamW(optimizer_grouped_parameters)
trainer.optimizer = optimizer   # hand the new optimizer to the existing Trainer
trainer.lr_scheduler = None     # force a fresh LR scheduler to be built for the new optimizer

training_args.num_train_epochs = 5  # additional epochs for full fine-tuning
trainer.train()

model.save_pretrained('transferLearningResults')
tokenizer.save_pretrained('transferLearningResults')
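
# To reuse the fine-tuned model later, it can be reloaded from the saved directory, e.g.:
#
#     reloaded_tokenizer = AutoTokenizer.from_pretrained('transferLearningResults')
#     reloaded = RobertaForSequenceClassification.from_pretrained('transferLearningResults')
#
# Note (assumption): this plain reload only restores the trained head weights if NewClassificationHead
# uses the same parameter names and shapes as the default RoBERTa classification head; otherwise the
# custom head has to be re-attached and the saved state dict loaded into it manually.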