"""
The pre-trained models produce embeddings of size 512 to 1024 dimensions. However, when storing a large
number of embeddings, this requires quite a lot of memory / storage.

In this example, we reduce the dimensionality of the embeddings to e.g. 128 dimensions. This significantly
reduces the required memory / storage while maintaining nearly the same performance.

For dimensionality reduction, we compute embeddings for a large set of (representative) sentences. Then
we use PCA to find e.g. 128 principal components of our vector space. This allows us to retain
as much information as possible with only 128 dimensions.

PCA gives us a matrix that down-projects vectors to 128 dimensions. We use this matrix
and extend our original SentenceTransformer model with this linear down-projection. Hence,
the new SentenceTransformer model directly produces embeddings with 128 dimensions,
without any further changes needed.
"""

from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer, LoggingHandler, util, evaluation, models, InputExample
import logging
import os
import gzip
import csv
import random
import numpy as np
import torch

# Print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
logger = logging.getLogger(__name__)

# Model for which we apply dimensionality reduction
model = SentenceTransformer('all-MiniLM-L6-v2')

# New size for the embeddings
new_dimension = 128
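
# Illustrative check (an addition, not part of the original script): log the model's
# native embedding size, so the reduction to new_dimension is easy to see.
logger.info("Original embedding dimension: {}".format(model.get_sentence_embedding_dimension()))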

# We use AllNLI as a source of sentences to compute the PCA and
# the STS benchmark test set to evaluate the original and the reduced model
nli_dataset_path = 'datasets/AllNLI.tsv.gz'
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

# Download the datasets if needed
if not os.path.exists(nli_dataset_path):
    util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz', nli_dataset_path)

if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)
logger.info("Read STSbenchmark test dataset") |
|
eval_examples = [] |
|
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn: |
|
reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE) |
|
for row in reader: |
|
if row['split'] == 'test': |
|
score = float(row['score']) / 5.0 |
|
eval_examples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=score)) |
|
|
|
|
|
stsb_evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(eval_examples, name='sts-benchmark-test') |
|
|
|
logger.info("Original model performance:") |
|
stsb_evaluator(model) |
|
|
|
|
|
|
|
|

# To determine the PCA matrix, we need embeddings for a representative set of sentences.
# Here, we use sentences from the AllNLI dataset
nli_sentences = set()
with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        nli_sentences.add(row['sentence1'])
        nli_sentences.add(row['sentence2'])

nli_sentences = list(nli_sentences)
random.shuffle(nli_sentences)

# Compute embeddings for 20k random NLI sentences
pca_train_sentences = nli_sentences[0:20000]
train_embeddings = model.encode(pca_train_sentences, convert_to_numpy=True)

# Compute the PCA on the train embeddings
pca = PCA(n_components=new_dimension)
pca.fit(train_embeddings)
# pca.components_ has shape (new_dimension, original_dimension) and down-projects
# an original embedding x via pca_comp @ x
pca_comp = np.asarray(pca.components_)

# Add a Dense layer that applies the PCA matrix, so the model directly produces embeddings with the new size
dense = models.Dense(in_features=model.get_sentence_embedding_dimension(), out_features=new_dimension, bias=False, activation_function=torch.nn.Identity())
dense.linear.weight = torch.nn.Parameter(torch.tensor(pca_comp))
model.add_module('dense', dense)
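
# Optional (an addition, not part of the original script): the extended model, including
# the PCA down-projection, can be saved to disk and reloaded like any other
# SentenceTransformer model. The output path is an arbitrary illustrative choice.
model.save('models/all-MiniLM-L6-v2-128dim')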

# Evaluate the model with the reduced embedding size
logger.info("Model with {} dimensions:".format(new_dimension))
stsb_evaluator(model)
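
# Illustrative usage (an addition, not part of the original script): after adding the
# Dense module, model.encode() directly returns vectors of size new_dimension.
# The example sentences are arbitrary.
sample_embeddings = model.encode(['A man is playing a guitar.', 'Someone plays an instrument.'], convert_to_numpy=True)
logger.info("Reduced embedding shape: {}".format(sample_embeddings.shape))  # expected: (2, new_dimension)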