from __future__ import annotations

import math

import mteb
import numpy as np
import torch
from mteb.encoder_interface import PromptType
from sentence_transformers import SentenceTransformer
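
# Reload the saved SentenceTransformer model from the local directory.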
model_save_path = "./"
model = SentenceTransformer(model_save_path)
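

# Thin wrapper exposing the encode() interface that MTEB calls when running tasks.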
class CustomModel:
    def __init__(self, model):
        self.model = model

    def encode(
        self,
        sentences: list[str],
        task_name: str,
        prompt_type: PromptType | None = None,
        max_batch_size: int = 32,
        **kwargs,
    ) -> np.ndarray:
""" |
|
Encodes the given sentences using the model with a maximum batch size. |
|
|
|
Args: |
|
sentences (List[str]): The sentences to encode. |
|
task_name (str): The name of the task. |
|
prompt_type (Optional[PromptType]): The prompt type to use. |
|
max_batch_size (int): The maximum number of sentences to process in a single batch. |
|
**kwargs: Additional arguments to pass to the encoder. |
|
|
|
Returns: |
|
np.ndarray: Encoded sentences as a numpy array. |
|
""" |
|
|
|
        # Ensure every input is a plain string before encoding.
        sentences = [str(sentence) for sentence in sentences]
        total_sentences = len(sentences)
        num_batches = math.ceil(total_sentences / max_batch_size)
        embeddings_list = []
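
        # Encode in chunks of at most max_batch_size sentences to bound memory use.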
        for batch_idx in range(num_batches):
            start_idx = batch_idx * max_batch_size
            end_idx = min(start_idx + max_batch_size, total_sentences)
            batch_sentences = sentences[start_idx:end_idx]
            batch_embeddings = self.model.encode(batch_sentences, convert_to_tensor=True)

            # encode() normally returns a tensor when convert_to_tensor=True,
            # but guard against numpy arrays or lists just in case.
            if not isinstance(batch_embeddings, torch.Tensor):
                batch_embeddings = torch.tensor(batch_embeddings)

            # Move to CPU and collect as numpy for the final concatenation.
            embeddings_list.append(batch_embeddings.cpu().numpy())
        return np.vstack(embeddings_list)


custom_model = CustomModel(model)
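
# "MTEB(eng, classic)" selects the classic English MTEB benchmark task collection.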
tasks = mteb.get_benchmark("MTEB(eng, classic)")
evaluation = mteb.MTEB(tasks=tasks)
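
# Run every task in the benchmark; results for each task are saved under output_folder.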
results = evaluation.run(custom_model, output_folder="results/model_results")