import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import RobertaConfig, RobertaForSequenceClassification
from torch.optim import AdamW  # transformers.AdamW is deprecated; use the PyTorch implementation
from newhead import NewClassificationHead  # local module providing the custom classification head (not shown here)
def preprocess_data(df):
    """
    Preprocess the data by renaming columns, removing rows with missing values, and removing extra spaces.
    """
    df = df.rename(columns={'Comment': 'text', 'Emotion': 'label'})
    df = df.dropna()
    df['text'] = df['text'].str.replace('\t', ' ').str.replace(' +', ' ', regex=True).str.strip()
    df['label'] = df['label'].str.replace('\t', ' ').str.replace(' +', ' ', regex=True).str.strip()
    return df
def encode_label(df):
    """
    Encode the labels using LabelEncoder.
    """
    label_encoder = LabelEncoder()
    df['label'] = label_encoder.fit_transform(df['label'])
    return df
def generate_dataset(df, test_size=0.2):
    """
    Convert the DataFrame into a train/test DatasetDict that can be used with transformers.
    """
    return Dataset.from_pandas(df).train_test_split(test_size=test_size)
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True)
def compute_metrics(pred):
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
# Define model and training arguments
model_name = "cardiffnlp/twitter-roberta-base-emotion"
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = RobertaConfig.from_pretrained(model_name, num_labels=3)  # Set the number of labels to 3
model = RobertaForSequenceClassification.from_pretrained(model_name, config=config, ignore_mismatched_sizes=True)
model.classifier = NewClassificationHead(config)  # Swap in the custom classification head
df = pd.read_csv('Emotion_classify_Data.csv')
df = preprocess_data(df)
df = encode_label(df)
ds = generate_dataset(df)  # DatasetDict with 'train' and 'test' splits
ds = ds.map(tokenize, batched=True)
### Transfer Learning First
# Freeze all layers first
for param in model.parameters():
    param.requires_grad = False
# Unfreeze the classifier layer
for param in model.classifier.parameters():
    param.requires_grad = True
# Define different learning rates
head_lr = 3e-4  # Higher learning rate for the head
base_lr = head_lr / 5  # Lower learning rate for the base layers
# Group parameters and set learning rates
# (the base parameters are still frozen at this stage, so only the classifier group is actually updated)
optimizer_grouped_parameters = [
    {'params': model.classifier.parameters(), 'lr': head_lr},
    {'params': [p for n, p in model.named_parameters() if 'classifier' not in n], 'lr': base_lr}
]
optimizer = AdamW(optimizer_grouped_parameters)
## Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    save_strategy="no",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),  # Passing None lets the Trainer build its default scheduler for this optimizer
    compute_metrics=compute_metrics
)
## Train the head of the model
trainer.train()
## Unfreeze all layers
for param in model.parameters():
    param.requires_grad = True
head_lr = 1e-4  # Slightly lower learning rate for the head
base_lr = 5e-6  # Much lower learning rate for the base layers
optimizer_grouped_parameters = [
    {'params': model.classifier.parameters(), 'lr': head_lr},
    {'params': [p for n, p in model.named_parameters() if 'classifier' not in n], 'lr': base_lr}
]
## Train the entire model
optimizer = AdamW(optimizer_grouped_parameters)
trainer.optimizer = optimizer  # Hand the new optimizer to the existing Trainer (otherwise it would reuse the old one)
trainer.lr_scheduler = None  # Force the Trainer to build a fresh scheduler for the second run
training_args.num_train_epochs = 5  # Set the number of additional epochs
trainer.train()
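
# Not part of the original script: an optional held-out evaluation after full fine-tuning.
# This is a sketch that only uses the public Trainer.evaluate() API with the eval_dataset
# and compute_metrics already configured above; treat it as a sanity check, not a required step.
eval_metrics = trainer.evaluate()
print(eval_metrics)
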
model.save_pretrained('transferLearningResults')
tokenizer.save_pretrained('transferLearningResults')
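
# Usage sketch (not in the original script): classify a new comment with the fine-tuned
# model still held in memory. The sample text is made up for illustration, and the
# predicted class id corresponds to the integer encoding produced by the LabelEncoder above.
model.eval()
sample = "I can't believe how well this turned out!"
inputs = tokenizer(sample, return_tensors='pt', truncation=True, padding=True).to(model.device)
with torch.no_grad():
    logits = model(**inputs).logits
pred_id = logits.argmax(dim=-1).item()
print(f"Predicted label id: {pred_id}")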