from sentence_transformers import SentenceTransformer, InputExample, losses
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import torch
from torch.utils.data import DataLoader
import numpy as np
from pathlib import Path
import os
import sys

# Make the project root importable so sibling packages can be used.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

BASE_DIR = Path(__file__).resolve().parent.parent

def load_model(model_name):
    """Load a pre-trained SentenceTransformer by model name or local path."""
    model = SentenceTransformer(model_name)
    return model


def get_embeddings(model, texts):
    """Encode a list of texts into dense embeddings (returned as a NumPy array)."""
    embeddings = model.encode(texts)
    return embeddings

# Get embeddings from a pre-trained Hugging Face model. Encoding everything in
# one call can require a lot of memory; see the batched variant below for a
# low-memory alternative.
def get_transformers_embeddings(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():  # inference only; skip building the autograd graph
        outputs = model(**inputs)
    # Mean pooling over the token dimension yields one vector per sentence.
    embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
    return embeddings

# Batched variant: processes sentences in chunks to keep memory usage low.
def batch_process_transformers_embeddings(sentences, model, tokenizer, batch_size=16, max_length=128):
    all_embeddings = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        with torch.no_grad():  # inference only; skip building the autograd graph
            outputs = model(**inputs)
        # Mean pooling over tokens gives one vector per sentence in the batch.
        embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
        all_embeddings.append(embeddings)
    # Stack the per-batch arrays into a single (num_sentences, hidden_size) array.
    return np.vstack(all_embeddings)
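
# Minimal usage sketch for the transformers path above. The model name is an
# assumption for illustration; any encoder on the Hugging Face Hub should work.
#
#     tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
#     hf_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
#     vectors = batch_process_transformers_embeddings(
#         ["book a flight", "play some jazz"], hf_model, tokenizer, batch_size=16
#     )
#     print(vectors.shape)  # (2, hidden_size)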

def fine_tune_and_save_model(model_name, dataset):
    # Initialize the pre-trained model.
    model = SentenceTransformer(model_name)
    # Build training pairs; `dataset` is expected to be a DataFrame with
    # 'utterance' and 'intent' columns. MultipleNegativesRankingLoss treats
    # each pair as a positive and uses the other in-batch pairs as negatives,
    # so no explicit label is needed.
    train_examples = [InputExample(texts=[row['utterance'], row['intent']]) for _, row in dataset.iterrows()]
    # Define a DataLoader (model.fit attaches its own collate function).
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
    # Define the loss function.
    train_loss = losses.MultipleNegativesRankingLoss(model)
    # Fine-tune the model.
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=1,
        warmup_steps=100
    )
    # Save the fine-tuned model. Model names may contain '/', which would
    # create nested directories, so replace it in the folder name.
    path = BASE_DIR / "output" / "fine-tuned-model" / model_name.replace('/', '_')
    model.save(str(path))
    return model

def load_fine_tuned_model(model_path):
    """Load a fine-tuned model from a saved path (named distinctly so it does
    not shadow load_model above)."""
    model = SentenceTransformer(model_path)
    return model
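
# Minimal end-to-end sketch, assuming a CSV with 'utterance' and 'intent'
# columns; the file path and model name below are placeholders for illustration.
if __name__ == "__main__":
    df = pd.read_csv(BASE_DIR / "data" / "intents.csv")  # hypothetical dataset location
    tuned = fine_tune_and_save_model("sentence-transformers/all-MiniLM-L6-v2", df)
    vectors = get_embeddings(tuned, ["book a flight to Paris"])
    print(vectors.shape)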