#from transformers import AlbertTokenizer, AlbertModel
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
# This is a quick evaluation of the fine-tuned model on a few address pairs.
# Earlier runs used ALBERT base and large directly (commented out below);
# their similarity scores are recorded at the bottom of this file.
#tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
#model = AlbertModel.from_pretrained("albert-base-v2")
#'sentence-transformers/paraphrase-albert-base-v2'
# Locally fine-tuned SentenceTransformer checkpoints from successive training runs;
# only the most recent one is actually loaded.
#model_name = 'output/training_OnlineConstrativeLoss-2023-03-10_11-17-15'
#model_name = 'output/training_OnlineConstrativeLoss-2023-03-11_00-24-35'
#model_name = 'output/training_OnlineConstrativeLoss-2023-03-11_01-00-19'
model_name = 'output/training_OnlineConstrativeLoss-2023-03-12_00-42-41'
model_sbert = SentenceTransformer(model_name)
def get_sbert_embedding(input_text):
    # Encode a single string with the loaded model and return the vector as a plain list.
    embedding = model_sbert.encode(input_text)
    return embedding.tolist()
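# A small helper (a sketch, not part of the original evaluation) that wraps the
# encode + cosine_similarity pattern repeated below; the name `similarity` is
# illustrative only.
def similarity(text_a, text_b):
    # Encode both strings and return their cosine similarity as a float.
    emb_a = model_sbert.encode(text_a)
    emb_b = model_sbert.encode(text_b)
    return float(cosine_similarity([emb_a], [emb_b])[0][0])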
a1 = "65 Mountain Blvd Ext, Warren, NJ 07059"
a2 = "112 Mountain Blvd Ext, Warren, NJ 07059"
a3 = "1677 NJ-27 #2, Edison, NJ 08817"
a4 = "5078 S Maryland Pkwy, Las Vegas, NV 89119"
a5 = "65 Mountain Boulevard Ext, Warren, NJ 07059"
a6 = "123 Broad St, New York, NY, 10304-2345"
a7 = "440 TECHNOLOGY CENTER DRIVE, Boston, MA 10034"
a8 = "200 Technology Center Drive, Boston, MA 10034"
a8x = "87 Technology Center Drive, Boston, MA 10034"
a9 = "440 Technology Center Dr., Boston, MA 10034-0345"
a10 = "440 Technology Center Dr., Boston, MA 10034"
a11 = "872 Route 13, Cortlandville NY 13045"
a12 = "87-2 Route 13, Cortlandville NY 13045"
a13 = "87-5 Route 13, Cortlandville NY 13045"
a14 = "257 37 US Rt 11, Evans Mills NY 13637"
a15 = "257-37 US Route 11, Evans Mills NY 13637"
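# The pairs above mix formatting variants of the same address (Blvd vs. Boulevard,
# Dr. vs. Drive, ZIP vs. ZIP+4, "257 37" vs. "257-37") with genuinely different
# street numbers, so the comparisons below check that the model treats the former
# as Same and the latter as Different.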
#def get_embedding(input_text):
# encoded_input = tokenizer(input_text, return_tensors='pt')
# input_ids = encoded_input.input_ids
# input_num_tokens = input_ids.shape[1]
#
# print( "Number of input tokens: " + str(input_num_tokens))
# print("Length of input: " + str(len(input_text)))
#
# list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
#
# print( "Tokens : " + ' '.join(list_of_tokens))
# with torch.no_grad():
#
# outputs = model(**encoded_input)
# last_hidden_states = outputs[0]
# sentence_embedding = torch.mean(last_hidden_states[0], dim=0)
# #sentence_embedding = output.last_hidden_state[0][0]
# return sentence_embedding.tolist()
e1 = get_sbert_embedding(a1)
e2 = get_sbert_embedding(a2)
#e3 = get_sbert_embedding(a3)
e4 = get_sbert_embedding(a4)
e5 = get_sbert_embedding(a5)
e6 = get_sbert_embedding(a6)
e7 = get_sbert_embedding(a7)
e8 = get_sbert_embedding(a8)
e8x = get_sbert_embedding(a8x)
e9 = get_sbert_embedding(a9)
e10 = get_sbert_embedding(a10)
e11 = get_sbert_embedding(a11)
e12 = get_sbert_embedding(a12)
e13 = get_sbert_embedding(a13)
e14 = get_sbert_embedding(a14)
e15 = get_sbert_embedding(a15)
print(f"a1 \"{a1}\" to \"{a2}\" a2 - expected Different")
print(cosine_similarity([e1], [e2]))
print(f"a1 \"{a1}\" to \"{a4}\" a4 - expected Different")
print(cosine_similarity([e1], [e4]))
print(f"a1 \"{a1}\" to \"{a5}\" a5 - expected Same")
print(cosine_similarity([e1], [e5]))
print(f"a7 \"{a7}\" to \"{a8}\" a8 - expected Different")
print(cosine_similarity([e7], [e8]))
print(f"a7 \"{a7}\" to \"{a8x}\" a8x - expected Different")
print(cosine_similarity([e7], [e8x]))
print(f"a7 \"{a7}\" to \"{a9}\" a9 - expected Same")
print(cosine_similarity([e7], [e9]))
print(f"a7 \"{a7}\" to \"{a10}\" a10 - expected Same")
print(cosine_similarity([e7], [e10]))
print(f"a11 \"{a11}\" to \"{a12}\" a12 - expected Same")
print(cosine_similarity([e11], [e12]))
print(f"a11 \"{a11}\" to \"{a13}\" a13 - expected Different")
print(cosine_similarity([e11], [e13]))
print(f"a14 \"{a14}\" to \"{a15}\" a15 - expected Same")
print(cosine_similarity([e14], [e15]))
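# Optional sketch (uncomment to run): the same comparisons can be done in one shot,
# since SentenceTransformer.encode accepts a list of strings and cosine_similarity
# returns the full pairwise matrix when given a single 2-D array.
#addresses = [a1, a2, a4, a5, a7, a8, a8x, a9, a10, a11, a12, a13, a14, a15]
#embeddings = model_sbert.encode(addresses)
#print(cosine_similarity(embeddings))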
# with base
#a1 to a2
#[[0.99512167]]
#a1 to a4
#[[0.94850088]]
#a1 to a5
#[[0.99636901]]
# with large
#a1 to a2
#[[0.99682108]]
#a1 to a4
#[[0.94006972]]
#a1 to a5
#[[0.99503919]]