BERT-Text-Classification-Model / train.py

Upload train.py

e082a81 verified over 1 year ago

5.64 kB

	# %% Importing the dependencies we need
	import numpy as np
	import torch
	from sklearn.datasets import fetch_20newsgroups
	from sklearn.metrics import (accuracy_score, f1_score, confusion_matrix,
	ConfusionMatrixDisplay, classification_report)
	from sklearn.model_selection import train_test_split
	from sklearn.pipeline import Pipeline
	from skops import card, hub_utils
	from skorch import NeuralNetClassifier
	from skorch.callbacks import LRScheduler, ProgressBar
	from skorch.hf import HuggingfacePretrainedTokenizer
	from torch import nn
	from torch.optim.lr_scheduler import LambdaLR
	from transformers import AutoModelForSequenceClassification
	from transformers import AutoTokenizer
	# for model hosting and requirements
	from pathlib import Path
	import transformers
	import skorch
	import sklearn
	import torch

	# %%
	# Tokenizer and pretrained checkpoint. They have to work together, so both
	# names refer to the same checkpoint identifier.
	TOKENIZER = "distilbert-base-uncased"
	PRETRAINED_MODEL = "distilbert-base-uncased"

	# Training hyper-parameters handed to the classifier defined further down.
	# NOTE: the spelling "OPTMIZER" is kept as-is because the pipeline definition
	# later in this file refers to the constant under exactly this name.
	OPTMIZER = torch.optim.AdamW
	LR = 5e-5
	MAX_EPOCHS = 3
	CRITERION = nn.CrossEntropyLoss
	BATCH_SIZE = 8

	# Use the GPU when torch reports one as available, otherwise fall back to CPU.
	if torch.cuda.is_available():
	    DEVICE = 'cuda'
	else:
	    DEVICE = 'cpu'

	# %% Load the dataset, define features & labels and split
	dataset = fetch_20newsgroups()

	# Print only the part of the dataset description that precedes the first
	# occurrence of the word "Usage".
	print(dataset.DESCR.partition('Usage')[0])

	dataset.target_names

	# Raw documents are the features, the newsgroup indices are the labels.
	X, y = dataset.data, dataset.target

	# Stratified split with a fixed seed so the partition is reproducible.
	X_train, X_test, y_train, y_test = train_test_split(
	    X, y, stratify=y, random_state=0
	)

	# Step budget for the learning-rate schedule: (full batches per epoch + 1)
	# multiplied by the number of epochs.
	num_training_steps = (len(X_train) // BATCH_SIZE + 1) * MAX_EPOCHS

	# %%
	# Defining learning rate scheduler & BERT in nn.Module

	def lr_schedule(current_step):
	    """Return the learning-rate multiplier for ``current_step``.

	    The multiplier is the share of ``num_training_steps`` (module-level) that
	    is still left after ``current_step`` steps, so it shrinks linearly as the
	    step counter grows. The function asserts that the result is strictly
	    positive before returning it.
	    """
	    steps_left = float(num_training_steps - current_step)
	    # Guard the denominator so a zero step budget cannot divide by zero.
	    step_budget = float(max(1, num_training_steps))
	    factor = steps_left / step_budget
	    assert factor > 0
	    return factor

	class BertModule(nn.Module):
	    """``nn.Module`` wrapper around a pretrained sequence-classification model.

	    The wrapped model is loaded by name through
	    ``AutoModelForSequenceClassification.from_pretrained`` and stored on
	    ``self.bert``; ``forward`` returns only the logits of its output.
	    """

	    def __init__(self, name, num_labels):
	        """Store the checkpoint ``name`` and ``num_labels``, then load the model."""
	        super().__init__()
	        self.name = name
	        self.num_labels = num_labels

	        self.reset_weights()

	    def reset_weights(self):
	        """(Re)load the pretrained model for ``self.name`` into ``self.bert``."""
	        self.bert = AutoModelForSequenceClassification.from_pretrained(
	            self.name,
	            num_labels=self.num_labels,
	        )

	    def forward(self, **kwargs):
	        """Pass ``kwargs`` through to the wrapped model and return its logits."""
	        return self.bert(**kwargs).logits

	# %% Chaining tokenizer and BERT in one pipeline
	# First step: tokenizer loaded for the chosen checkpoint name.
	tokenizer_step = HuggingfacePretrainedTokenizer(TOKENIZER)

	# Learning-rate schedule driven by ``lr_schedule``, stepped on every batch.
	lr_callback = LRScheduler(LambdaLR, lr_lambda=lr_schedule, step_every='batch')

	# Second step: skorch classifier wrapping ``BertModule``; ``module__*``
	# arguments are forwarded to the module's constructor.
	classifier_step = NeuralNetClassifier(
	    BertModule,
	    module__name=PRETRAINED_MODEL,
	    module__num_labels=len(set(y_train)),
	    optimizer=OPTMIZER,
	    lr=LR,
	    max_epochs=MAX_EPOCHS,
	    criterion=CRITERION,
	    batch_size=BATCH_SIZE,
	    iterator_train__shuffle=True,
	    device=DEVICE,
	    callbacks=[lr_callback, ProgressBar()],
	)

	pipeline = Pipeline([
	    ('tokenizer', tokenizer_step),
	    ('net', classifier_step),
	])

	# Seed torch (CPU, current CUDA device, all CUDA devices) and NumPy with 0,
	# in that order, before training starts.
	for seed_fn in (
	    torch.manual_seed,
	    torch.cuda.manual_seed,
	    torch.cuda.manual_seed_all,
	    np.random.seed,
	):
	    seed_fn(0)

	# %% Training
	# Fit the whole pipeline (tokenizer followed by the classifier) on the
	# training split. ``%time`` is an IPython line magic that reports how long the
	# statement takes, so this cell has to be run under IPython/Jupyter.
	%time pipeline.fit(X_train, y_train)

	# %% Evaluate the model
	%%time
	with torch.inference_mode():
	y_pred = pipeline.predict(X_test)

	accuracy_score(y_test, y_pred)

	# %% Save the model
	import pickle

	# Serialize the complete pipeline object into "model.pkl" in the working
	# directory; the file is closed automatically when the block exits.
	with open("model.pkl", mode="bw") as model_file:
	    pickle.dump(pipeline, model_file)

	# %% Initialize the repository for Hub
	local_repo = "model_repo"

	# One requirement string per library, each carrying the version that is
	# installed in the current environment.
	requirements = [
	    f"scikit-learn={sklearn.__version__}",
	    f"transformers={transformers.__version__}",
	    f"torch={torch.__version__}",
	    f"skorch={skorch.__version__}",
	]

	# Create the local repository from the pickled model, using the test documents
	# as example data for the text-classification task.
	hub_utils.init(
	    model="model.pkl",
	    requirements=requirements,
	    dst=local_repo,
	    task="text-classification",
	    data=X_test,
	)

	# %% Create model card
	# Build the card for the pipeline, taking its metadata from the configuration
	# stored in the local repository. The path comes from ``local_repo`` instead of
	# a repeated "model_repo" literal, so changing the repository location in one
	# place keeps this cell pointing at the right directory.
	model_card = card.Card(
	    pipeline,
	    metadata=card.metadata_from_config(Path(local_repo)),
	)

	# %% We will add information related to model
	# Free-text sections of the card: what the model is, and a caveat on its use.
	model_description = "This is a neural net classifier and distilbert model chained with sklearn Pipeline trained on 20 news groups dataset."
	limitations = (
	    "This model is trained for a tutorial and is not ready to be used in production."
	)
	model_card.add(model_description=model_description, limitations=limitations)

	# %% We can add plots, evaluation results and more!
	# Describe how the evaluation was done; the two literals are joined into a
	# single sentence.
	eval_descr = (
	    "The model is evaluated on validation data from 20 news group's test split,"
	    " using accuracy and F1-score with micro average."
	)
	model_card.add(eval_method=eval_descr)

	# Compute both scores on the test predictions and register them on the card
	# under the names "accuracy" and "f1 score".
	accuracy = accuracy_score(y_test, y_pred)
	f1 = f1_score(y_test, y_pred, average="micro")
	metrics = {"accuracy": accuracy, "f1 score": f1}
	model_card.add_metrics(**metrics)


	# Confusion matrix over the pipeline's known classes, drawn with the same
	# classes as axis labels.
	cm = confusion_matrix(y_test, y_pred, labels=pipeline.classes_)
	disp = ConfusionMatrixDisplay(
	    confusion_matrix=cm,
	    display_labels=pipeline.classes_,
	)
	disp.plot()

	# Write the figure into the local repository, then reference it from the card
	# by its file name.
	disp.figure_.savefig(Path(local_repo, "confusion_matrix.png"))
	model_card.add_plot(**{"Confusion matrix": "confusion_matrix.png"})

	# Classification report as a dictionary, with rows named after the newsgroups.
	clf_report = classification_report(
	    y_test,
	    y_pred,
	    target_names=dataset.target_names,
	    output_dict=True,
	)
	# %% We can add classification report as a table
	# The report has to become a DataFrame before it can be added as a table.
	import pandas as pd

	# Remove the "accuracy" entry, then turn the remaining entries into rows and
	# move the row labels into an ordinary column.
	clf_report.pop("accuracy")
	clf_report = pd.DataFrame(clf_report).transpose().reset_index()
	model_card.add_table(folded=True, **{"Classification Report": clf_report})

	# %% We will save our model card
	# The card is written as the README of the local repository.
	model_card.save(Path(local_repo, "README.md"))

	# %% We will add the training script to our repository
	# NOTE(review): this relies on ``__file__`` being defined in the environment
	# that runs the cell -- confirm for the notebook front-end in use.
	hub_utils.add_files(__file__, dst=local_repo)

	# %% Push to Hub! This requires us to authenticate ourselves first.
	from huggingface_hub import notebook_login

	# Log in first; the push below needs an authenticated session.
	notebook_login()

	# Upload the contents of the local repository to the named Hub repository,
	# asking for the remote to be created (``create_remote=True``).
	push_options = {
	    "repo_id": "scikit-learn/skorch-text-classification",
	    "source": local_repo,
	    "create_remote": True,
	}
	hub_utils.push(**push_options)