Spaces:
Runtime error
Runtime error
File size: 3,067 Bytes
3004443 2619b03 3004443 2619b03 3004443 c96e9d6 4132514 3004443 71667b3 3004443 2619b03 59dea20 3004443 2619b03 3004443 2619b03 3004443 71667b3 2619b03 71667b3 2619b03 71667b3 2619b03 71667b3 3004443 71667b3 3004443 71667b3 3004443 71667b3 3004443 2619b03 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
#from transformers import AlbertTokenizer, AlbertModel
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
#This is a quick evaluation on a few cases
# base
# large
#tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
#model = AlbertModel.from_pretrained("albert-base-v2")
#'sentence-transformers/paraphrase-albert-base-v2'
# Identifier handed to SentenceTransformer for the model under evaluation.
# NOTE(review): presumably a local training-output directory relative to the
# working directory -- confirm it exists where this script is run.
model_name = 'output/training_OnlineConstrativeLoss-2023-03-10_11-17-15'
# Loaded once at module import; get_sbert_embedding() reuses this instance.
model_sbert = SentenceTransformer(model_name)
def get_sbert_embedding(input_text):
    """Encode ``input_text`` with the module-level ``model_sbert``.

    The value returned by ``model_sbert.encode(input_text)`` is converted
    with ``.tolist()`` before being handed back to the caller.
    """
    return model_sbert.encode(input_text).tolist()
# --- Sample addresses used as evaluation fixtures -------------------------

# Mountain Blvd group: a2 differs from a1 only in the house number;
# a5 is a1 with "Blvd" spelled out as "Boulevard".
a1 = "65 Mountain Blvd Ext, Warren, NJ 07059"
a2 = "112 Mountain Blvd Ext, Warren, NJ 07059"
a5 = "65 Mountain Boulevard Ext, Warren, NJ 07059"

# Unrelated addresses.
a3 = "1677 NJ-27 #2, Edison, NJ 08817"
a4 = "5078 S Maryland Pkwy, Las Vegas, NV 89119"
a6 = "123 Broad St, New York, NY, 10304-2345"

# Technology Center Drive group: a7 is the upper-case reference;
# a8 / a8x use a different house number; a9 / a10 abbreviate "Drive"
# as "Dr." (a9 additionally carries a ZIP+4 suffix).
a7 = "440 TECHNOLOGY CENTER DRIVE, Boston, MA 10034"
a8 = "200 Technology Center Drive, Boston, MA 10034"
a8x = "87 Technology Center Drive, Boston, MA 10034"
a9 = "440 Technology Center Dr., Boston, MA 10034-0345"
a10 = "440 Technology Center Dr., Boston, MA 10034"
#def get_embedding(input_text):
# encoded_input = tokenizer(input_text, return_tensors='pt')
# input_ids = encoded_input.input_ids
# input_num_tokens = input_ids.shape[1]
#
# print( "Number of input tokens: " + str(input_num_tokens))
# print("Length of input: " + str(len(input_text)))
#
# list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
#
# print( "Tokens : " + ' '.join(list_of_tokens))
# with torch.no_grad():
#
# outputs = model(**encoded_input)
# last_hidden_states = outputs[0]
# sentence_embedding = torch.mean(last_hidden_states[0], dim=0)
# #sentence_embedding = output.last_hidden_state[0][0]
# return sentence_embedding.tolist()
# Encode the sample addresses one call at a time, in the order listed.
# a3 is currently not encoded:
#e3 = get_sbert_embedding(a3)
(
    e1, e2, e4, e5, e6,
    e7, e8, e8x, e9, e10,
) = map(
    get_sbert_embedding,
    (
        a1, a2, a4, a5, a6,
        a7, a8, a8x, a9, a10,
    ),
)
# Each row: (left label, left address, left embedding,
#            right label, right address, right embedding, expected verdict).
comparisons = (
    ("a1", a1, e1, "a2", a2, e2, "Different"),
    ("a1", a1, e1, "a4", a4, e4, "Different"),
    ("a1", a1, e1, "a5", a5, e5, "Same"),
    ("a7", a7, e7, "a8", a8, e8, "Different"),
    ("a7", a7, e7, "a8x", a8x, e8x, "Different"),
    ("a7", a7, e7, "a9", a9, e9, "Same"),
    ("a7", a7, e7, "a10", a10, e10, "Same"),
)

# For every pair, print a header naming both addresses and the expected
# verdict, followed by the cosine similarity of their embeddings.
for lhs_name, lhs_text, lhs_vec, rhs_name, rhs_text, rhs_vec, verdict in comparisons:
    print(f"{lhs_name} \"{lhs_text}\" to \"{rhs_text}\" {rhs_name} - expected {verdict}")
    print(cosine_similarity([lhs_vec], [rhs_vec]))
# with base
#a1 to a2
#[[0.99512167]]
#a1 to a4
#[[0.94850088]]
#a1 to a5
#[[0.99636901]]
# with large
#a1 to a2
#[[0.99682108]]
#a1 to a4
#[[0.94006972]]
#a1 to a5
#[[0.99503919]] |