Spaces:

GroNLP
/

agalma

Sleeping

App Files Files Community

agalma / word2vec.py

Mark7549

Migrated from gradio to streamlit

0d0f07a over 1 year ago

raw

history blame

7.25 kB

	from gensim.models import Word2Vec
	from collections import defaultdict
	import os
	import tempfile


	def load_all_models():
		'''
		Load the CBOW word2vec model of every time slice

		Return: list of (time slice name, model) tuples in the order
		archaic, classical, early_roman, hellen, late_roman
		'''
		# (time slice name, path of the saved model), in loading order
		time_slices = (
			('archaic', 'models/archaic_cbow.model'),
			('classical', 'models/classical_cbow.model'),
			('early_roman', 'models/early_roman_cbow.model'),
			('hellen', 'models/hellen_cbow.model'),
			('late_roman', 'models/late_roman_cbow.model'),
		)

		return [(name, load_word2vec_model(path)) for name, path in time_slices]


	def load_word2vec_model(model_path):
		'''
		Read a saved word2vec model from disk

		model_path: path of the saved model file
		Return: whatever Word2Vec.load produces for that path
		'''
		loaded_model = Word2Vec.load(model_path)
		return loaded_model


	def get_word_vector(model, word):
		'''
		Look up the vector that the model stores for a word

		model: object whose `wv` attribute can be indexed by word
		word: the word to look up
		Return: the entry of model.wv for that word
		'''
		keyed_vectors = model.wv
		return keyed_vectors[word]


	def iterate_over_words(model):
		'''
		Iterate over all words in the vocabulary and print their vectors

		Every printed line shows the index that model.wv.key_to_index stores
		for the word, the word itself and its vector.
		'''
		keyed_vectors = model.wv

		# The index comes straight from key_to_index, so no separate counter is needed
		for word, index in keyed_vectors.key_to_index.items():
			print(f'{index} Word: {word}, Vector: {keyed_vectors[word]}')


	def model_dictionary(model):
		'''
		Return the dictionary of the word2vec model
		Key is the word and value is the vector of the word

		The result is a defaultdict(list): looking up a word that is not in
		the vocabulary yields an empty list instead of raising KeyError.
		'''
		keyed_vectors = model.wv

		# Named word_vectors rather than dict, so the builtin stays usable
		word_vectors = defaultdict(list)
		for word in keyed_vectors.key_to_index:
			word_vectors[word] = keyed_vectors[word]

		return word_vectors


	def dot_product(vector_a, vector_b):
		'''
		Compute the dot product of two vectors

		Components are paired with zip, so any surplus components of the
		longer vector are ignored.
		'''
		paired_components = zip(vector_a, vector_b)
		products = (left * right for left, right in paired_components)
		return sum(products)


	def magnitude(vector):
		'''
		Return the magnitude of a vector

		This is the Euclidean length: the square root of the sum of the
		squared components. An empty vector has magnitude 0.0.
		'''
		return sum(x ** 2 for x in vector) ** 0.5


	def cosine_similarity(vector_a, vector_b):
		'''
		Return the cosine similarity of two vectors

		Return: the similarity as a string with two decimals (e.g. "0.87").
		When either vector has zero magnitude the similarity is reported
		as "0.00".
		'''
		dot_prod = dot_product(vector_a, vector_b)
		mag_a = magnitude(vector_a)
		mag_b = magnitude(vector_b)

		# Avoid division by zero
		if mag_a == 0 or mag_b == 0:
			similarity = 0.0
		else:
			similarity = dot_prod / (mag_a * mag_b)

		# Format on every path so callers never receive a mix of float and str
		return "{:.2f}".format(similarity)


	def get_cosine_similarity(word1, word2, time_slice):
		'''
		Compute the cosine similarity between two words of one time slice

		time_slice: name of the model file in ./models, without the extension
		Return: result of cosine_similarity for the two word vectors, or None
		when the model file does not exist
		'''
		# TODO: needs to be tidier
		model_path = f'models/{time_slice}.model'

		# Only continue when the model file is present
		if os.path.exists(model_path):
			word_vectors = model_dictionary(load_word2vec_model(model_path))
			return cosine_similarity(word_vectors[word1], word_vectors[word2])

		return None


	def get_cosine_similarity_one_word(word, time_slice1, time_slice2):
		'''
		Compute the cosine similarity of a single word between two time slices

		time_slice1, time_slice2: names of the model files in ./models,
		without the extension
		Return: result of cosine_similarity for the word's vector in each
		model, or None when either model file does not exist
		'''
		first_path = f'models/{time_slice1}.model'
		second_path = f'models/{time_slice2}.model'

		# Both model files have to be present
		if not (os.path.exists(first_path) and os.path.exists(second_path)):
			return None

		first_model = load_word2vec_model(first_path)
		second_model = load_word2vec_model(second_path)

		first_vectors = model_dictionary(first_model)
		second_vectors = model_dictionary(second_model)

		return cosine_similarity(first_vectors[word], second_vectors[word])



	def validate_nearest_neighbours(word, time_slice_model, n):
		'''
		Check the input of the nearest neighbours function

		Return: False when word is empty, time_slice_model equals
		'models/None.model' or n is an empty string; True otherwise
		'''
		return word != '' and time_slice_model != 'models/None.model' and n != ''


	def _default_models():
		'''
		Return the models of all time slices, loading them on first use only
		'''
		if not hasattr(_default_models, 'loaded'):
			_default_models.loaded = load_all_models()
		return _default_models.loaded


	def get_nearest_neighbours(word, time_slice_model, n=10, models=None):
		'''
		Return the nearest neighbours of a word

		word: the word for which the nearest neighbours are calculated
		time_slice_model: name of the model file in ./models (without the
		extension) that holds the vector of the input word
		models: list of tuples with the name of the time slice and the word2vec model
		(default: the result of load_all_models(), loaded on first use)
		n: the number of nearest neighbours to return (default: 10)

		Return: list of tuples with the word, the time slice and
		the cosine similarity of the nearest neighbours, most similar first.
		If a parameter is missing, [['Error: not all parameters are set', '', '']]
		is returned instead.

		NOTE(review): a word that is missing from the time slice model is not
		handled here; the vector lookup presumably raises KeyError - confirm.
		'''

		# Check if all parameters are set
		if not validate_nearest_neighbours(word, time_slice_model, n):
			return [['Error: not all parameters are set', '', '']]

		# Load the default models lazily instead of at import time
		if models is None:
			models = _default_models()

		# The validation allows for n arriving as text, so normalise it to an int
		n = int(n)
		if n <= 0:
			return []

		query_model = load_word2vec_model(f'models/{time_slice_model}.model')
		query_vector = get_word_vector(query_model, word)

		def numeric_similarity(neighbour):
			# Similarities may be formatted strings; comparing them as text
			# orders negative values the wrong way round
			return float(neighbour[2])

		nearest_neighbours = []

		# Iterate over all models
		for entry in models:
			model_name = entry[0]
			model = entry[1]

			# Iterate over all words of the model
			for candidate in model.wv.key_to_index:
				candidate_vector = get_word_vector(model, candidate)
				similarity = cosine_similarity(query_vector, candidate_vector)
				neighbour = (candidate, model_name, similarity)

				# If the list of nearest neighbours is not full yet, add the current word
				if len(nearest_neighbours) < n:
					nearest_neighbours.append(neighbour)
					continue

				# The list is full: replace the least similar entry if this one is closer
				smallest_neighbour = min(nearest_neighbours, key=numeric_similarity)
				if numeric_similarity(neighbour) > numeric_similarity(smallest_neighbour):
					nearest_neighbours.remove(smallest_neighbour)
					nearest_neighbours.append(neighbour)

		return sorted(nearest_neighbours, key=numeric_similarity, reverse=True)


	def write_to_file(data):
		'''
		Write the data to a file

		data: any object; str(data) is what gets written
		Return: path of the newly created file (temp_*.txt in /tmp).
		The file is not removed here; cleaning it up is left to the caller.
		'''
		# Create random tmp file name
		temp_file_descriptor, temp_file_path = tempfile.mkstemp(prefix="temp_", suffix=".txt", dir="/tmp")

		# Write through the descriptor mkstemp already opened instead of reopening the path.
		# UTF-8 is explicit so non-ASCII text does not depend on the locale.
		with os.fdopen(temp_file_descriptor, 'w', encoding='utf-8') as temp_file:
			temp_file.write(str(data))

		return temp_file_path


	def main():
		'''
		Print the five nearest neighbours of 'πατήρ' across all time slices,
		using its vector from the archaic model
		'''
		models = load_all_models()

		# time_slice_model is the model file name, and models must be passed by keyword:
		# the third positional parameter of get_nearest_neighbours is n
		nearest_neighbours = get_nearest_neighbours('πατήρ', 'archaic_cbow', n=5, models=models)
		print(nearest_neighbours)


	if __name__ == "__main__":
		main()