"""
The pre-trained models produce embeddings of size 512 - 1024. However, when storing a large
number of embeddings, this requires quite a lot of memory / storage.
In this example, we reduce the dimensionality of the embeddings to e.g. 128 dimensions. This significantly
reduces the required memory / storage while maintaining nearly the same performance.
For dimensionality reduction, we compute embeddings for a large set of (representative) sentence. Then,
we use PCA to find e.g. 128 principle components of our vector space. This allows us to maintain
us much information as possible with only 128 dimensions.
PCA gives us a matrix that down-projects vectors to 128 dimensions. We use this matrix
and extend our original SentenceTransformer model with this linear downproject. Hence,
the new SentenceTransformer model will produce directly embeddings with 128 dimensions
without further changes needed.
"""
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer, LoggingHandler, util, evaluation, models, InputExample
import logging
import os
import gzip
import csv
import random
import numpy as np
import torch
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
logger = logging.getLogger(__name__)
#### /print debug information to stdout
#Model for which we apply dimensionality reduction
model = SentenceTransformer('all-MiniLM-L6-v2')
#New size for the embeddings
new_dimension = 128
#We use AllNLI as a source of sentences to compute PCA
nli_dataset_path = 'datasets/AllNLI.tsv.gz'
#We use the STS benchmark dataset to see how much performance we lose by using the dimensionality reduction
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'
if not os.path.exists(nli_dataset_path):
    util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz', nli_dataset_path)

if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)
# We measure the performance of the original model
# and later we will measure the performance with the reduced dimension size
logger.info("Read STSbenchmark test dataset")
eval_examples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        if row['split'] == 'test':
            score = float(row['score']) / 5.0  #Normalize score to range 0 ... 1
            eval_examples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=score))
# Evaluate the original model on the STS benchmark dataset
stsb_evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(eval_examples, name='sts-benchmark-test')
logger.info("Original model performance:")
stsb_evaluator(model)
######## Reduce the embedding dimensions ########
#Read sentences from NLI dataset
nli_sentences = set()
with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        nli_sentences.add(row['sentence1'])
        nli_sentences.add(row['sentence2'])
nli_sentences = list(nli_sentences)
random.shuffle(nli_sentences)
#To determine the PCA matrix, we need some example sentence embeddings.
#Here, we compute the embeddings for 20k random sentences from the AllNLI dataset
pca_train_sentences = nli_sentences[0:20000]
train_embeddings = model.encode(pca_train_sentences, convert_to_numpy=True)
#Compute PCA on the train embeddings matrix
pca = PCA(n_components=new_dimension)
pca.fit(train_embeddings)
pca_comp = np.asarray(pca.components_)
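# Optional check (not part of the original script): inspect how much variance the chosen
# number of components retains. A value close to 1.0 suggests that little information is
# lost by the down-projection to new_dimension dimensions.
explained_variance = float(np.sum(pca.explained_variance_ratio_))
logger.info("PCA with {} components explains {:.2%} of the variance".format(new_dimension, explained_variance))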
# We add a dense layer to the model, so that it directly produces embeddings with the new size
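# Note: pca.components_ has shape (new_dimension, original_dimension), which matches the
# (out_features, in_features) weight shape expected by the linear layer below.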
dense = models.Dense(in_features=model.get_sentence_embedding_dimension(), out_features=new_dimension, bias=False, activation_function=torch.nn.Identity())
dense.linear.weight = torch.nn.Parameter(torch.tensor(pca_comp))
model.add_module('dense', dense)
# Evaluate the model with the reduced embedding size
logger.info("Model with {} dimensions:".format(new_dimension))
stsb_evaluator(model)
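# Optional sanity check (not part of the original example): the extended model should now
# output one embedding of size new_dimension per input sentence.
test_embeddings = model.encode(["This is a quick dimensionality check."])
logger.info("Embedding shape after down-projection: {}".format(test_embeddings.shape))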
# If you like, you can store the model on disc by uncommenting the following line
#model.save('models/my-128dim-model')
# You can then load the adapted model that produces 128 dimensional embeddings like this:
#model = SentenceTransformer('models/my-128dim-model')
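# As a further (hypothetical) usage sketch: the reduced model behaves like any other
# SentenceTransformer, e.g. for cosine-similarity comparisons with util.cos_sim.
# The two example sentences below are illustrative only.
#embeddings = model.encode(['A man is eating food.', 'A man is eating a piece of bread.'])
#print(util.cos_sim(embeddings[0], embeddings[1]))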