import json

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the JSON data
with open('Datasets/Query/datasets_text.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Prepare sentences and labels
sentences = [item[0] for item in data["annotations"]]
labels = [item[1]['entities'] for item in data["annotations"]]
# Define tags
tags = data["classes"]
# tags = ['<pad>'] + tags

# Count how many annotated entities fall under each tag
tag_counts = {tag: 0 for tag in tags}
for label in labels:
    for entity in label:
        tag_counts[entity[1]] += 1

# Sort tags by frequency (most frequent first), keeping only the tag names
sorted_tags = [tag for tag, _ in sorted(tag_counts.items(), key=lambda item: item[1], reverse=True)]

# Replace spaces in tag names with underscores
sorted_tags = [tag.replace(" ", "_") for tag in sorted_tags]
    
# Load the destinations spreadsheet
destinations = pd.read_excel("Datasets/Places/des_retags_copilot.xlsx")

# Bag-of-words vectors over each destination's tag string
vectorizer = CountVectorizer(max_features=10000, stop_words="english")
tags_vector = vectorizer.fit_transform(destinations["tags"].values.astype('U')).toarray()
# Drop the first row of the tag matrix; its rows are now offset by one relative to `destinations`
tags_vector = tags_vector[1:]

# Vocabulary terms corresponding to the columns of tags_vector
feature_names = vectorizer.get_feature_names_out()
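
# Minimal sketch: one plausible way to use the cosine_similarity import above,
# assuming the goal is to rank destinations against a free-text query by the
# similarity of their tag vectors. The query string and the top-5 cutoff are
# illustrative assumptions.
query = "beach surfing nightlife"  # hypothetical example query
query_vector = vectorizer.transform([query]).toarray()

# Cosine similarity between the query and every destination's tag vector
similarities = cosine_similarity(query_vector, tags_vector)[0]

# Indices of the five most similar destinations; add 1 to compensate for the
# first row dropped from tags_vector above
top_k = np.argsort(similarities)[::-1][:5]
print(destinations.iloc[top_k + 1]["tags"])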