import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import RobertaConfig, RobertaForSequenceClassification
from torch.optim import AdamW  # transformers.AdamW is deprecated; use torch's implementation
from newhead import NewClassificationHead


def preprocess_data(df):
    """
    Preprocess the data by renaming columns, removing rows with missing values,
    and removing extra whitespace.
    """
    df = df.rename(columns={'Comment': 'text', 'Emotion': 'label'})
    df = df.dropna()
    df['text'] = df['text'].str.replace('\t', ' ').str.replace(' +', ' ', regex=True).str.strip()
    df['label'] = df['label'].str.replace('\t', ' ').str.replace(' +', ' ', regex=True).str.strip()
    return df


def encode_label(df):
    """
    Encode the string labels as integers using LabelEncoder.
    """
    label_encoder = LabelEncoder()
    df['label'] = label_encoder.fit_transform(df['label'])
    return df


def generate_dataset(df, test_size=0.2):
    """
    Convert the DataFrame into a train/test DatasetDict that can be used with transformers.
    """
    return Dataset.from_pandas(df).train_test_split(test_size=test_size)


def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True)


def compute_metrics(pred):
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


# Define model and tokenizer
model_name = "cardiffnlp/twitter-roberta-base-emotion"
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = RobertaConfig.from_pretrained(model_name, num_labels=3)  # Set the number of labels to 3
model = RobertaForSequenceClassification.from_pretrained(model_name, config=config, ignore_mismatched_sizes=True)
model.classifier = NewClassificationHead(config)  # Replace the pretrained head with the custom one

# Load and prepare the data
df = pd.read_csv('Emotion_classify_Data.csv')
df = preprocess_data(df)
df = encode_label(df)
ds = generate_dataset(df)
ds = ds.map(tokenize, batched=True)

### Transfer Learning First
# Freeze all layers first
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the classifier layer
for param in model.classifier.parameters():
    param.requires_grad = True

# Define different learning rates
head_lr = 3e-4         # Higher learning rate for the head
base_lr = head_lr / 5  # Lower learning rate for the base layers

# Group parameters and set learning rates
optimizer_grouped_parameters = [
    {'params': model.classifier.parameters(), 'lr': head_lr},
    {'params': [p for n, p in model.named_parameters() if 'classifier' not in n], 'lr': base_lr}
]
optimizer = AdamW(optimizer_grouped_parameters)

## Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    save_strategy="no",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),  # Pass the custom optimizer; None lets the Trainer build its default LR scheduler
    compute_metrics=compute_metrics
)

## Train the head of the model
trainer.train()
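## Optionally check how the head-only model performs before unfreezing.
## This evaluation call is an illustrative addition, not part of the original script;
## it reuses the ds['test'] split already passed to the Trainer.
head_only_metrics = trainer.evaluate()
print("Head-only metrics:", head_only_metrics)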
## Unfreeze all layers for full fine-tuning
for param in model.parameters():
    param.requires_grad = True

head_lr = 1e-4  # Slightly lower learning rate for the head
base_lr = 5e-6  # Much lower learning rate for the base layers

optimizer_grouped_parameters = [
    {'params': model.classifier.parameters(), 'lr': head_lr},
    {'params': [p for n, p in model.named_parameters() if 'classifier' not in n], 'lr': base_lr}
]

## Train the entire model
optimizer = AdamW(optimizer_grouped_parameters)
training_args.num_train_epochs = 5  # Number of additional epochs for full fine-tuning

# Build a fresh Trainer so the new optimizer (and a fresh LR schedule) is actually used;
# reusing the first Trainer would keep its original optimizer and scheduler state.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),
    compute_metrics=compute_metrics
)
trainer.train()

model.save_pretrained('transferLearningResults')
tokenizer.save_pretrained('transferLearningResults')
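## --- Illustrative inference sketch (an addition, not part of the original script) ---
## Classify a new comment with the fine-tuned model still in memory. The sample text is
## made up; mapping the predicted id back to an emotion name requires the LabelEncoder
## fitted inside encode_label() (e.g. its classes_ attribute), which is not persisted here.
model.eval()
sample = tokenizer("I can't believe how great this turned out!", return_tensors='pt', truncation=True)
sample = {k: v.to(model.device) for k, v in sample.items()}
with torch.no_grad():
    logits = model(**sample).logits
print("Predicted label id:", logits.argmax(dim=-1).item())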