Commit
·
e16bf00
1
Parent(s):
c97f477
Update README.md
Browse files
README.md
CHANGED
@@ -11,4 +11,37 @@ def preprocess(text):
|
|
11 |
t = 'http' if t.startswith('http') else t
|
12 |
new_text.append(t)
|
13 |
return " ".join(new_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
```
|
|
|
11 |
t = 'http' if t.startswith('http') else t
|
12 |
new_text.append(t)
|
13 |
return " ".join(new_text)
|
14 |
+
|
15 |
+
def get_embedding(text):
    """Return a single embedding vector for *text*.

    The text is passed through ``preprocess``, tokenized, and fed to the
    model; the first element of the model output is converted to a NumPy
    array, and the vectors of its first entry are averaged along axis 0.

    NOTE(review): presumably the first model output is the last hidden
    state with shape (batch, tokens, hidden) -- confirm against the model
    that is loaded elsewhere in this file.
    """
    cleaned = preprocess(text)
    model_inputs = tokenizer(cleaned, return_tensors='pt')
    outputs = model(**model_inputs)
    # Detach from the autograd graph before converting to NumPy, then keep
    # only the first (and, for a single input string, only) batch entry.
    token_vectors = outputs[0].detach().numpy()[0]
    # Average over axis 0 to collapse the per-token rows into one vector.
    return np.mean(token_vectors, axis=0)
|
22 |
+
|
23 |
+
query = "Acabo de pedir pollo frito 🐣" #spanish

tweets = ["We had a great time! ⚽️", # english
          "We hebben een geweldige tijd gehad! ⛩", # dutch
          "Nous avons passé un bon moment! 🎥", # french
          "Ci siamo divertiti! 🍝"] # italian

# The query does not change between iterations, so embed it once instead of
# re-running the tokenizer and model for every tweet.
query_embedding = get_embedding(query)

# Map each tweet to its similarity with the query. `cosine` is used here as a
# distance, so the similarity is taken as 1 - distance.
d = defaultdict(int)
for tweet in tweets:
    sim = 1 - cosine(query_embedding, get_embedding(tweet))
    d[tweet] = sim

print('Most similar to: ',query)
print('----------------------------------------')
# Print the tweets ranked from most to least similar, with a 1-based rank.
for idx, x in enumerate(sorted(d.items(), key=lambda x: x[1], reverse=True)):
    print(idx+1, x[0])
|
39 |
+
```
|
40 |
+
```
|
41 |
+
Most similar to: Acabo de pedir pollo frito 🐣
|
42 |
+
----------------------------------------
|
43 |
+
1 Ci siamo divertiti! 🍝
|
44 |
+
2 Nous avons passé un bon moment! 🎥
|
45 |
+
3 We had a great time! ⚽️
|
46 |
+
4 We hebben een geweldige tijd gehad! ⛩
|
47 |
```
|