|
from gensim.models import Word2Vec |
|
from collections import defaultdict |
|
import os |
|
import tempfile |
|
|
|
|
|
def load_all_models():
    '''
    Load the CBOW word2vec model of every time slice.

    Return: list of (time slice name, model) tuples in the order
    archaic, classical, early_roman, hellen, late_roman
    '''
    # (time slice name, model file) pairs, in the order they are returned
    model_files = (
        ('archaic', 'models/archaic_cbow.model'),
        ('classical', 'models/classical_cbow.model'),
        ('early_roman', 'models/early_roman_cbow.model'),
        ('hellen', 'models/hellen_cbow.model'),
        ('late_roman', 'models/late_roman_cbow.model'),
    )
    return [(name, load_word2vec_model(path)) for name, path in model_files]
|
|
|
|
|
def load_word2vec_model(model_path):
    '''
    Read a saved word2vec model from disk.

    model_path: path of the model file, passed unchanged to Word2Vec.load

    Return: the object returned by Word2Vec.load for that path
    '''
    loaded_model = Word2Vec.load(model_path)
    return loaded_model
|
|
|
|
|
def get_word_vector(model, word):
    '''
    Look up the vector of a single word.

    model: object that exposes its vectors through a ``wv`` attribute
    word: key to look up in ``model.wv``

    Return: the value stored under ``word`` in ``model.wv``
    '''
    keyed_vectors = model.wv
    return keyed_vectors[word]
|
|
|
|
|
def iterate_over_words(model):
    '''
    Print every word of the model's vocabulary together with its vector.

    One line is printed per entry of ``model.wv.key_to_index`` in the form
    "<index> Word: <word>, Vector: <vector>", where <index> is the value
    stored for the word in ``key_to_index``.

    model: word2vec model whose vocabulary is printed

    Return: None
    '''
    # The index comes straight from key_to_index, so no separate counter
    # is needed (the old manual counter was overwritten on every pass).
    for word, index in model.wv.key_to_index.items():
        vector = get_word_vector(model, word)
        print(f'{index} Word: {word}, Vector: {vector}')
|
|
|
|
|
def model_dictionary(model):
    '''
    Build a word -> vector mapping for the whole vocabulary of a model.

    model: word2vec model to read the vocabulary and vectors from

    Return: defaultdict(list) with one entry per word in
    ``model.wv.key_to_index``. Because it is a defaultdict, looking up a
    word that is not in the vocabulary yields (and stores) an empty list
    instead of raising KeyError.
    '''
    # Named word_vectors rather than dict so the builtin is not shadowed
    word_vectors = defaultdict(list)

    # Only the words are needed here; the indices stored in key_to_index
    # are irrelevant for the mapping.
    for word in model.wv.key_to_index:
        word_vectors[word] = get_word_vector(model, word)

    return word_vectors
|
|
|
|
|
def dot_product(vector_a, vector_b):
    '''
    Compute the dot product of two vectors.

    The vectors are paired element by element with zip, so surplus
    elements of the longer vector are ignored.

    Return: sum of the pairwise products (0 when either vector is empty)
    '''
    total = 0
    for left, right in zip(vector_a, vector_b):
        total = total + left * right
    return total
|
|
|
|
|
def magnitude(vector):
    '''
    Compute the Euclidean length of a vector.

    Return: square root of the sum of the squared components
    (0.0 for an empty vector)
    '''
    squared_total = 0
    for component in vector:
        squared_total = squared_total + component ** 2
    return squared_total ** 0.5
|
|
|
|
|
def cosine_similarity(vector_a, vector_b):
    '''
    Return the cosine similarity of two vectors as a formatted string.

    vector_a, vector_b: sequences of numbers

    Return: the similarity formatted with two decimals (for example
    "0.87"). When either vector has zero magnitude the similarity is
    undefined and "0.00" is returned, so every code path yields the same
    type and results can be compared or sorted together.
    '''
    mag_a = magnitude(vector_a)
    mag_b = magnitude(vector_b)

    if mag_a == 0 or mag_b == 0:
        # Avoid dividing by zero; report "no similarity" in the same
        # string format as a regular result.
        similarity = 0.0
    else:
        similarity = dot_product(vector_a, vector_b) / (mag_a * mag_b)

    return "{:.2f}".format(similarity)
|
|
|
|
|
def get_cosine_similarity(word1, word2, time_slice):
    '''
    Return the cosine similarity of two words within one time slice.

    word1, word2: the words to compare
    time_slice: name of the model file without directory and extension;
    the model is loaded from models/<time_slice>.model

    Return: the result of cosine_similarity for the two word vectors, or
    None when the model file does not exist. A word that is missing from
    the vocabulary is compared as an empty list, because model_dictionary
    returns a defaultdict(list).
    '''
    model_path = f'models/{time_slice}.model'
    if not os.path.exists(model_path):
        return None

    model = load_word2vec_model(model_path)
    # Named word_vectors rather than dict so the builtin is not shadowed
    word_vectors = model_dictionary(model)
    return cosine_similarity(word_vectors[word1], word_vectors[word2])
|
|
|
|
|
def get_cosine_similarity_one_word(word, time_slice1, time_slice2):
    '''
    Return the cosine similarity of one word in two different time slices.

    word: the word to compare with itself across time slices
    time_slice1, time_slice2: names of the model files without directory
    and extension; the models are loaded from models/<time_slice>.model

    Return: the result of cosine_similarity for the word's vector in the
    first and in the second model, or None when either model file does
    not exist. If the word is missing from a model it is compared as an
    empty list, because model_dictionary returns a defaultdict(list).
    '''
    # NOTE(review): comparing vectors taken from two separately loaded
    # models is only meaningful if their vector spaces are aligned -
    # confirm how the models were trained.
    model_path1 = f'models/{time_slice1}.model'
    model_path2 = f'models/{time_slice2}.model'
    if not os.path.exists(model_path1) or not os.path.exists(model_path2):
        return None

    model1 = load_word2vec_model(model_path1)
    model2 = load_word2vec_model(model_path2)

    # Named *_vectors rather than dict1/dict2 to describe the content
    first_vectors = model_dictionary(model1)
    second_vectors = model_dictionary(model2)

    return cosine_similarity(first_vectors[word], second_vectors[word])
|
|
|
|
|
|
|
def validate_nearest_neighbours(word, time_slice_model, n):
    '''
    Check that the inputs of the nearest neighbours function are set.

    Return: False when word is the empty string, time_slice_model equals
    'models/None.model' or n is the empty string; True otherwise
    '''
    has_unset_parameter = (
        word == ''
        or time_slice_model == 'models/None.model'
        or n == ''
    )
    return not has_unset_parameter
|
|
|
|
|
def get_nearest_neighbours(word, time_slice_model, n=10, models=None):
    '''
    Return the nearest neighbours of a word

    word: the word for which the nearest neighbours are calculated
    time_slice_model: name of the time slice model of the input word; the
    model is loaded from models/<time_slice_model>.model
    n: the number of nearest neighbours to return (default: 10); anything
    accepted by int() is allowed, e.g. a numeric string
    models: list of tuples with the name of the time slice and the
    word2vec model. When omitted (None), all models from
    load_all_models() are used; they are loaded on the first such call
    and reused afterwards, instead of being loaded when this module is
    imported.

    Return: list of tuples with the word, the time slice and
    the cosine similarity of the nearest neighbours (the similarity is
    the value returned by cosine_similarity), most similar first. Every
    word of every model is a candidate, so the input word itself is not
    excluded. When a parameter is unset, the single row
    [['Error: not all parameters are set', '', '']] is returned.

    An exception from get_word_vector propagates when the word is not
    part of the time slice model.
    '''
    import heapq

    if not validate_nearest_neighbours(word, time_slice_model, n):
        return [['Error: not all parameters are set', '', '']]

    n = int(n)
    if models is None:
        models = _default_models()

    query_model = load_word2vec_model(f'models/{time_slice_model}.model')
    query_vector = get_word_vector(query_model, word)

    # cosine_similarity returns formatted text, so rank by the numeric
    # value: comparing the strings would order negative similarities
    # wrongly and fail when a float is mixed in.
    return heapq.nlargest(
        n,
        _iter_similarities(query_vector, models),
        key=lambda neighbour: float(neighbour[2]),
    )


def _default_models():
    '''Load all time slice models on first use and reuse them afterwards.'''
    if not hasattr(_default_models, 'cache'):
        _default_models.cache = load_all_models()
    return _default_models.cache


def _iter_similarities(query_vector, models):
    '''Yield (word, time slice name, similarity) for every word of every model.'''
    for entry in models:
        model_name = entry[0]
        model = entry[1]
        for candidate in model.wv.key_to_index:
            candidate_vector = get_word_vector(model, candidate)
            yield candidate, model_name, cosine_similarity(query_vector, candidate_vector)
|
|
|
|
|
def write_to_file(data):
    '''
    Write the string form of the data to a new temporary text file.

    data: any object; str(data) is what gets written

    Return: path of the created file, named temp_<random>.txt. The file
    is not deleted by this function; removing it is up to the caller.
    '''
    # Keep using /tmp where it exists; elsewhere let tempfile pick the
    # platform's default temporary directory instead of failing.
    temp_dir = "/tmp" if os.path.isdir("/tmp") else None
    temp_file_descriptor, temp_file_path = tempfile.mkstemp(prefix="temp_", suffix=".txt", dir=temp_dir)

    # Write through the descriptor mkstemp already opened rather than
    # closing it and reopening the path. UTF-8 is explicit so non-ASCII
    # text does not depend on the locale's default encoding.
    with os.fdopen(temp_file_descriptor, 'w', encoding='utf-8') as temp_file:
        temp_file.write(str(data))

    return temp_file_path
|
|
|
|
|
def main():
    '''
    Demo: print the five nearest neighbours of 'πατήρ' across all models.

    The query vector is taken from the archaic model
    (models/archaic_cbow.model) and compared with every word of every
    model returned by load_all_models().
    '''
    models = load_all_models()

    # get_nearest_neighbours expects the name of the time slice model (it
    # builds the file path itself), and n / models are passed by keyword
    # so they cannot end up in each other's positional slot.
    nearest_neighbours = get_nearest_neighbours('πατήρ', 'archaic_cbow', n=5, models=models)
    print(nearest_neighbours)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Run the demo in main() only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|