# agalma / word2vec.py
# Last commit bdf0a5e by Mark7549: "Added cosine similarity front-end" (file size: 8.8 kB)
import heapq
import os
import tempfile
from collections import defaultdict

from gensim.models import Word2Vec
def load_all_models():
    '''
    Load the word2vec model of every time slice from the models directory.

    Return: list of (slice name, model) tuples in the order archaic,
            classical, early_roman, hellen, late_roman
    '''
    slice_names = ('archaic', 'classical', 'early_roman', 'hellen', 'late_roman')
    return [
        (slice_name, load_word2vec_model(f'models/{slice_name}_cbow.model'))
        for slice_name in slice_names
    ]
def load_selected_models(selected_models):
    '''
    Load the word2vec models that belong to the selected time slices.

    selected_models: iterable of time slice names; "Early Roman", "Late Roman"
                     and "Hellenistic" are mapped to their file stems, every
                     other name is simply lower-cased
    Return: list of [model name, model] pairs, one per selected name
    '''
    # Names whose file stem is not just the lower-cased name
    aliases = (
        ("Early Roman", "early_roman"),
        ("Late Roman", "late_roman"),
        ("Hellenistic", "hellen"),
    )
    loaded = []
    for selection in selected_models:
        stem = next((alias for label, alias in aliases if selection == label), selection)
        model_name = stem.lower() + "_cbow"
        loaded.append([model_name, load_word2vec_model(f'models/{model_name}.model')])
    return loaded
def load_word2vec_model(model_path):
    '''
    Read a saved word2vec model from disk.

    model_path: path of the saved model file
    Return: the object returned by Word2Vec.load
    '''
    loaded_model = Word2Vec.load(model_path)
    return loaded_model
def get_word_vector(model, word):
    '''
    Look up the vector stored for a word in a model.

    model: object whose wv attribute supports lookup by word
    word: the word to look up
    Return: the result of model.wv[word]
    '''
    keyed_vectors = model.wv
    return keyed_vectors[word]
def iterate_over_words(model):
    '''
    Print every word of the model's vocabulary with its index and vector.

    model: object exposing wv.key_to_index (word -> index mapping) and
           lookup of a vector through wv[word]
    Return: None; one line per word is printed
    '''
    vectors = model.wv
    # The printed index is the one stored in key_to_index, so no separate
    # counter has to be maintained.
    for word, index in vectors.key_to_index.items():
        print(f'{index} Word: {word}, Vector: {vectors[word]}')
def model_dictionary(model):
    '''
    Return the dictionary of the word2vec model.

    model: object exposing wv.key_to_index (the vocabulary) and lookup of a
           vector through wv[word]
    Return: defaultdict whose key is the word and whose value is the vector
            of the word; looking up a word that is not in the vocabulary
            yields (and stores) an empty list
    '''
    vectors = model.wv
    # Kept as defaultdict(list) so that callers indexing an unknown word
    # still receive an empty vector instead of a KeyError.
    word_vectors = defaultdict(list)
    for word in vectors.key_to_index:
        word_vectors[word] = vectors[word]
    return word_vectors
def dot_product(vector_a, vector_b):
    '''
    Compute the dot product of two vectors.

    The vectors are paired with zip, so components beyond the length of the
    shorter vector are ignored.
    '''
    component_pairs = zip(vector_a, vector_b)
    return sum(left * right for left, right in component_pairs)
def magnitude(vector):
    '''
    Compute the Euclidean length of a vector: the square root of the sum of
    its squared components.
    '''
    squared_components = (component**2 for component in vector)
    return sum(squared_components) ** 0.5
def cosine_similarity(vector_a, vector_b):
    '''
    Return the cosine similarity of two vectors as a two-decimal string.

    vector_a, vector_b: iterables of numbers
    Return: the similarity formatted with "{:.2f}" (for example "0.87");
            "0.00" when either vector has zero magnitude, which includes
            empty vectors
    '''
    mag_a = magnitude(vector_a)
    mag_b = magnitude(vector_b)
    # Avoid division by zero: a vector without length has no direction
    if mag_a == 0 or mag_b == 0:
        similarity = 0.0
    else:
        similarity = dot_product(vector_a, vector_b) / (mag_a * mag_b)
    # Format every result the same way so callers always receive one type
    return "{:.2f}".format(similarity)
def get_cosine_similarity(word1, time_slice_1, word2, time_slice_2):
    '''
    Return the cosine similarity of two words, each taken from its own time slice.

    word1, word2: the words to compare
    time_slice_1, time_slice_2: time slice names as accepted by
                                convert_time_name_to_model (e.g. 'Archaic')
    Return: the result of cosine_similarity for the two word vectors, or None
            when the model file of either time slice does not exist
    '''
    # TODO: this needs tidying up
    model_paths = [
        f'models/{convert_time_name_to_model(time_slice)}.model'
        for time_slice in (time_slice_1, time_slice_2)
    ]
    # Return if either path does not exist; both models are loaded below, so
    # both files have to be present.
    if not all(os.path.exists(path) for path in model_paths):
        return None
    model_1 = load_word2vec_model(model_paths[0])
    model_2 = load_word2vec_model(model_paths[1])
    dict_1 = model_dictionary(model_1)
    dict_2 = model_dictionary(model_2)
    return cosine_similarity(dict_1[word1], dict_2[word2])
def get_cosine_similarity_one_word(word, time_slice1, time_slice2):
    '''
    Compare the vectors of a single word in two different time slices.

    word: the word to compare with itself
    time_slice1, time_slice2: model names, used as file stems under models/
    Return: the cosine similarity of the two vectors, or None when one of
            the model files does not exist
    '''
    first_path = f'models/{time_slice1}.model'
    second_path = f'models/{time_slice2}.model'
    # Both model files are required
    if not (os.path.exists(first_path) and os.path.exists(second_path)):
        return None
    first_model = load_word2vec_model(first_path)
    second_model = load_word2vec_model(second_path)
    first_vectors = model_dictionary(first_model)
    second_vectors = model_dictionary(second_model)
    return cosine_similarity(first_vectors[word], second_vectors[word])
def validate_nearest_neighbours(word, time_slice_model, n, models):
    '''
    Check the input of the nearest neighbours function.

    Return: False when word or n is an empty string or when time_slice_model
            or models is an empty list, True otherwise
    '''
    has_missing_input = word == '' or time_slice_model == [] or n == '' or models == []
    return not has_missing_input
def convert_model_to_time_name(model_name):
    '''
    Convert the model name to the time slice name.

    model_name: name of a model, either with the '_cbow' suffix
                (e.g. 'early_roman_cbow') or without it (e.g. 'early_roman')
    Return: the time slice name (e.g. 'Early Roman'), or None when the
            model name is not recognised
    '''
    time_names = (
        ('archaic', 'Archaic'),
        ('classical', 'Classical'),
        ('early_roman', 'Early Roman'),
        ('hellen', 'Hellenistic'),
        ('late_roman', 'Late Roman'),
    )
    for slice_name, time_name in time_names:
        # Models are labelled both with and without the '_cbow' suffix in
        # this file, so accept either spelling.
        if model_name in (slice_name, f'{slice_name}_cbow'):
            return time_name
    return None
def convert_time_name_to_model(time_name):
    '''
    Translate a time slice name into the name of its model.

    Return: the model name (e.g. 'hellen_cbow' for 'Hellenistic'), or None
            when the time slice name is not one of the five known names
    '''
    model_names = (
        ('Archaic', 'archaic_cbow'),
        ('Classical', 'classical_cbow'),
        ('Early Roman', 'early_roman_cbow'),
        ('Hellenistic', 'hellen_cbow'),
        ('Late Roman', 'late_roman_cbow'),
    )
    for known_name, model_name in model_names:
        if time_name == known_name:
            return model_name
    return None
def get_nearest_neighbours(word, time_slice_model, n=10, models=None):
    '''
    Return the nearest neighbours of a word

    word: the word for which the nearest neighbours are calculated
    time_slice_model: name of the word2vec model of the time slice of the
                      input word, used as file stem under ./models
                      (e.g. 'archaic_cbow')
    n: the number of nearest neighbours to return (default: 10)
    models: list of pairs with the name of the model and the word2vec model
            (default: the result of load_all_models())

    Return: list of tuples with the word, the time slice and
    the cosine similarity of the nearest neighbours, ordered from the most
    to the least similar
    '''
    # Load the default lazily; calling load_all_models() in the signature
    # would load every model as soon as this module is imported.
    if models is None:
        models = load_all_models()

    input_model = load_word2vec_model(f'models/{time_slice_model}.model')
    vector_1 = get_word_vector(input_model, word)

    def candidates():
        # Yield (word, time slice, similarity) for every word of every model
        for entry in models:
            time_name = convert_model_to_time_name(entry[0])
            model = entry[1]
            for candidate in model.wv.key_to_index:
                vector_2 = get_word_vector(model, candidate)
                yield candidate, time_name, cosine_similarity(vector_1, vector_2)

    # cosine_similarity hands back formatted text; rank on its numeric value,
    # because comparing the text itself orders negative values incorrectly.
    return heapq.nlargest(n, candidates(), key=lambda neighbour: float(neighbour[2]))
def write_to_file(data):
    '''
    Write the data to a new temporary text file.

    data: any object; its str() form is written to the file
    Return: path of the created file (temp_*.txt inside /tmp); the file is
            not deleted by this function
    '''
    # Create random tmp file name
    temp_file_descriptor, temp_file_path = tempfile.mkstemp(prefix="temp_", suffix=".txt", dir="/tmp")
    # Write through the descriptor mkstemp already opened instead of closing
    # it and reopening the path. The encoding is fixed to UTF-8 so that
    # non-ASCII text (such as Greek words) does not depend on the locale.
    with os.fdopen(temp_file_descriptor, 'w', encoding='utf-8') as temp_file:
        temp_file.write(str(data))
    return temp_file_path
def main():
    '''
    Demo: print the five nearest neighbours of 'πατήρ' from the archaic
    model, searched across the models of all five time slices.
    '''
    # The '_cbow' names are the ones convert_model_to_time_name recognises
    model_names = [
        'archaic_cbow',
        'classical_cbow',
        'early_roman_cbow',
        'hellen_cbow',
        'late_roman_cbow',
    ]
    models = [(name, load_word2vec_model(f'models/{name}.model')) for name in model_names]

    # Hand over the models loaded above so the search uses exactly these
    nearest_neighbours = get_nearest_neighbours('πατήρ', 'archaic_cbow', n=5, models=models)
    print(nearest_neighbours)


if __name__ == "__main__":
    main()