Spaces:
Sleeping
Sleeping
import gradio as gr | |
import torch | |
import torch.nn.functional as F | |
from torch import Tensor | |
from transformers import AutoTokenizer, AutoModel | |
def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    """Pick one hidden-state vector per sequence: the one at its last attended position.

    Assumes ``last_hidden_states`` is indexed as (batch, position, ...) and
    ``attention_mask`` as (batch, position) holding 0/1 flags -- the indexing
    below relies on that layout; confirm against the caller.

    Returns:
        A tensor with one row per batch entry.
    """
    num_rows = attention_mask.shape[0]

    # If every row attends to its final position, the batch is treated as
    # left-padded and the last position is the answer for all rows.
    if attention_mask[:, -1].sum() == num_rows:
        return last_hidden_states[:, -1]

    # Otherwise the index of the last attended position of each row is the
    # number of attended positions minus one.
    last_positions = attention_mask.sum(dim=1) - 1
    row_index = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[row_index, last_positions]
def get_similarity_scores(queries: list, passages: list, model, tokenizer):
    """Score every query against every passage using embedding cosine similarity.

    Args:
        queries: Texts to embed as queries.
        passages: Texts to embed as passages.
        model: Callable that accepts the tokenizer output as keyword arguments
            and returns an object exposing ``last_hidden_state`` (presumably a
            Hugging Face ``AutoModel`` -- confirm against the caller).
        tokenizer: Callable used to batch-encode the texts. Its
            ``add_eos_token`` attribute is set to ``True`` as a side effect.

    Returns:
        A nested list with one row per query and one column per passage; each
        entry is the cosine similarity of the two embeddings multiplied by
        100. An empty list is returned when ``queries`` is empty.
    """
    print("queries", queries)
    print("passages", passages)

    # With no queries the score matrix has zero rows, so skip tokenization and
    # the forward pass entirely.
    if not queries:
        return []

    tokenizer.add_eos_token = True
    max_length = 4096
    input_texts = queries + passages
    batch_dict = tokenizer(input_texts, max_length=max_length - 1, padding=True, truncation=True, return_tensors="pt")

    # Inference only: disabling autograd avoids building a graph and keeping
    # intermediate activations alive for a backward pass that never happens.
    with torch.no_grad():
        outputs = model(**batch_dict)
        embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
        # L2-normalise so the dot product below is a cosine similarity.
        embeddings = F.normalize(embeddings, p=2, dim=1)
        num_queries = len(queries)
        scores = (embeddings[:num_queries] @ embeddings[num_queries:].T) * 100
    return scores.tolist()
def similarity_ui(keyNames, fields):
    """Gradio callback: score comma-separated key names against comma-separated fields.

    Args:
        keyNames: Comma-separated key names; each becomes one query.
        fields: Comma-separated field names; each becomes one passage.

    Returns:
        The nested score list produced by ``get_similarity_scores`` (one row
        per key name, one column per field), or an empty list when either
        input contains no non-blank entry.
    """
    print("keynames", keyNames)
    print("fields", fields)
    task = 'Given a keyName, find similarity score against provided fields'

    # Trim whitespace around each entry and drop blanks so inputs such as
    # "a, b," do not yield padded or empty texts.
    key_names = [name for name in (part.strip() for part in keyNames.split(',')) if name]
    passages = [field for field in (part.strip() for part in fields.split(',')) if field]
    if not key_names or not passages:
        return []

    # The embedding model expects each query to be prefixed with a one-sentence
    # task instruction; passages are embedded without one.
    queries = [f'Instruct: {task}\nQuery: {name}' for name in key_names]
    scores = get_similarity_scores(queries, passages, model, tokenizer)
    return scores
# Load the pretrained tokenizer and model identified by MODEL_NAME.
MODEL_NAME = 'Salesforce/SFR-Embedding-Mistral'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Wire similarity_ui into a two-textbox Gradio interface and start it.
demo = gr.Interface(
    fn=similarity_ui,
    inputs=[gr.Textbox(), gr.Textbox()],
    outputs=gr.Textbox(),
    title="Similarity Score Calculator",
    description="Enter a Key Name and 3 Fields to find similarity scores",
)
demo.launch()