File size: 3,957 Bytes
3004443
2619b03
3004443
2619b03
3004443
c96e9d6
4132514
 
3004443
 
 
71667b3
6005876
 
 
3004443
 
 
 
 
2619b03
 
 
 
 
 
59dea20
3004443
 
 
 
 
6005876
 
 
 
 
3004443
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2619b03
3004443
 
 
 
 
 
 
 
 
 
 
6005876
 
 
 
 
 
71667b3
2619b03
71667b3
2619b03
71667b3
2619b03
 
71667b3
3004443
71667b3
3004443
 
71667b3
3004443
 
71667b3
3004443
6005876
 
 
 
 
 
 
 
 
2619b03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#from transformers import AlbertTokenizer, AlbertModel
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Quick sanity check: print the cosine similarity of sentence embeddings for a few hand-picked address pairs.

# base
# large
#tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
#model = AlbertModel.from_pretrained("albert-base-v2")
#'sentence-transformers/paraphrase-albert-base-v2'
# Path of the SentenceTransformer model to evaluate.
# Earlier checkpoints that were assigned here and then immediately overwritten
# (only the last assignment ever took effect), kept for reference:
#   output/training_OnlineConstrativeLoss-2023-03-10_11-17-15
#   output/training_OnlineConstrativeLoss-2023-03-11_00-24-35
#   output/training_OnlineConstrativeLoss-2023-03-11_01-00-19
model_name = 'output/training_OnlineConstrativeLoss-2023-03-12_00-42-41'

# Loaded once at module level; get_sbert_embedding() reads this global.
model_sbert = SentenceTransformer(model_name)

def get_sbert_embedding(input_text):
    """Encode *input_text* with the module-level ``model_sbert``.

    The value produced by ``model_sbert.encode`` is converted with
    ``.tolist()`` before being returned, so callers get the embedding in
    list form rather than the encoder's native return type.
    """
    return model_sbert.encode(input_text).tolist()

# Warren, NJ: same street with two house numbers (a1, a2), plus a1 again
# with "Boulevard" spelled out (a5).
a1 = "65 Mountain Blvd Ext, Warren, NJ 07059"
a2 = "112 Mountain Blvd Ext, Warren, NJ 07059"
a5 = "65 Mountain Boulevard Ext, Warren, NJ 07059"

# Addresses in other towns/states.
a3 = "1677 NJ-27 #2, Edison, NJ 08817"
a4 = "5078 S Maryland Pkwy, Las Vegas, NV 89119"
a6 = "123 Broad St, New York, NY, 10304-2345"

# Boston, MA: upper-case street name (a7), other house numbers (a8, a8x),
# and "Dr." abbreviations with and without a ZIP+4 suffix (a9, a10).
a7 = "440 TECHNOLOGY CENTER DRIVE, Boston, MA 10034"
a8 = "200 Technology Center Drive, Boston, MA 10034"
a8x = "87 Technology Center Drive, Boston, MA 10034"
a9 = "440 Technology Center Dr., Boston, MA 10034-0345"
a10 = "440 Technology Center Dr., Boston, MA 10034"

# Cortlandville, NY: plain vs. hyphenated house numbers.
a11 = "872 Route 13, Cortlandville NY 13045"
a12 = "87-2 Route 13, Cortlandville NY 13045"
a13 = "87-5 Route 13, Cortlandville NY 13045"

# Evans Mills, NY: space vs. hyphen in the house number, "Rt" vs. "Route".
a14 = "257 37 US Rt 11, Evans Mills NY 13637"
a15 = "257-37 US Route 11, Evans Mills NY 13637"
#def get_embedding(input_text):
#    encoded_input = tokenizer(input_text, return_tensors='pt')
#    input_ids = encoded_input.input_ids
#    input_num_tokens = input_ids.shape[1]
#
#    print( "Number of input tokens: " + str(input_num_tokens))
#    print("Length of input: " + str(len(input_text)))
#
#    list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
#
#    print( "Tokens : " + ' '.join(list_of_tokens))
#    with torch.no_grad():
#
#        outputs = model(**encoded_input)
#        last_hidden_states = outputs[0]
#        sentence_embedding = torch.mean(last_hidden_states[0], dim=0)
#        #sentence_embedding = output.last_hidden_state[0][0]
#        return sentence_embedding.tolist()

# Encode the addresses one call at a time, in the order listed, and bind each
# result to the matching e<N> name. a3 is not encoded (its call was disabled).
(
    e1, e2, e4, e5, e6,
    e7, e8, e8x, e9, e10,
    e11, e12, e13, e14, e15,
) = [
    get_sbert_embedding(address)
    for address in (
        a1, a2, a4, a5, a6,
        a7, a8, a8x, a9, a10,
        a11, a12, a13, a14, a15,
    )
]

# One row per comparison:
#   (left label, left address, left embedding,
#    right label, right address, right embedding, expected verdict)
comparison_cases = (
    ("a1", a1, e1, "a2", a2, e2, "Different"),
    ("a1", a1, e1, "a4", a4, e4, "Different"),
    ("a1", a1, e1, "a5", a5, e5, "Same"),
    ("a7", a7, e7, "a8", a8, e8, "Different"),
    ("a7", a7, e7, "a8x", a8x, e8x, "Different"),
    ("a7", a7, e7, "a9", a9, e9, "Same"),
    ("a7", a7, e7, "a10", a10, e10, "Same"),
    ("a11", a11, e11, "a12", a12, e12, "Same"),
    ("a11", a11, e11, "a13", a13, e13, "Different"),
    ("a14", a14, e14, "a15", a15, e15, "Same"),
)

for (left_label, left_addr, left_vec,
     right_label, right_addr, right_vec, verdict) in comparison_cases:
    # Header line naming the pair and the expected outcome, followed by the
    # similarity matrix returned for the two embeddings.
    print(f'{left_label} "{left_addr}" to "{right_addr}" {right_label} - expected {verdict}')
    print(cosine_similarity([left_vec], [right_vec]))
# with base
#a1 to a2
#[[0.99512167]]
#a1 to a4
#[[0.94850088]]
#a1 to a5
#[[0.99636901]]

# with large
#a1 to a2
#[[0.99682108]]
#a1 to a4
#[[0.94006972]]
#a1 to a5
#[[0.99503919]]