Feliks Zaslavskiy committed on
Commit b5b5700 · 2 Parent(s): c90e96d 2619b03

Merge remote-tracking branch 'origin/main'

Files changed (1)
  1. data.py +56 -0
data.py ADDED
@@ -0,0 +1,56 @@
+ from transformers import AlbertTokenizer, AlbertModel
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ tokenizer = AlbertTokenizer.from_pretrained("albert-large-v2")
+ model = AlbertModel.from_pretrained("albert-large-v2")
+
+ # Sample street addresses: a1/a2 differ only in house number, a5 spells
+ # out "Boulevard", a3 is a different NJ address, a4 is in another state.
+ a1 = "65 Mountain Blvd Ext, Warren, NJ 07059"
+ a2 = "112 Mountain Blvd Ext, Warren, NJ 07059"
+ a3 = "1677 NJ-27 #2, Edison, NJ 08817"
+ a4 = "5078 S Maryland Pkwy, Las Vegas, NV 89119"
+ a5 = "65 Mountain Boulevard Ext, Warren, NJ 07059"
+
+ def get_embedding(input_text):
+     encoded_input = tokenizer(input_text, return_tensors="pt")
+     input_ids = encoded_input.input_ids
+     input_num_tokens = input_ids.shape[1]
+
+     print("Number of input tokens: " + str(input_num_tokens))
+     print("Length of input: " + str(len(input_text)))
+
+     list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
+     print("Tokens : " + " ".join(list_of_tokens))
+
+     output = model(**encoded_input)
+     # Use the hidden state of the first ([CLS]) token as the sentence embedding.
+     embedding = output.last_hidden_state[0][0]
+     return embedding.tolist()
+
+ e1 = get_embedding(a1)
+ e2 = get_embedding(a2)
+ # e3 = get_embedding(a3)
+ e4 = get_embedding(a4)
+ e5 = get_embedding(a5)
+
+ print("a1 to a2")
+ print(cosine_similarity([e1], [e2]))
+ print("a1 to a4")
+ print(cosine_similarity([e1], [e4]))
+ print("a1 to a5")
+ print(cosine_similarity([e1], [e5]))
+
+ # with albert-base-v2:
+ # a1 to a2
+ # [[0.99512167]]
+ # a1 to a4
+ # [[0.94850088]]
+ # a1 to a5
+ # [[0.99636901]]
+
+ # with albert-large-v2:
+ # a1 to a2
+ # [[0.99682108]]
+ # a1 to a4
+ # [[0.94006972]]
+ # a1 to a5
+ # [[0.99503919]]