Spaces:

GroNLP
/

agalma

Sleeping

App Files Files Community

Mark7549 committed on Mar 11, 2024

Commit

0d0f07a

1 Parent(s): 47e1289

Migrated from gradio to streamlit

Browse files

Files changed (8) hide show

app.py +44 -0
models/.gitattributes +1 -0
models/archaic_cbow.model +3 -0
models/classical_cbow.model +3 -0
models/early_roman_cbow.model +3 -0
models/hellen_cbow.model +3 -0
models/late_roman_cbow.model +3 -0
word2vec.py +226 -0

app.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import streamlit as st
+from streamlit_option_menu import option_menu
+from word2vec import *
st.set_page_config(page_title="Ancient Greek Word2Vec", layout="centered")

# Horizontal menu; the returned value is compared against the tab labels below
active_tab = option_menu(None, ["Nearest neighbours", "Cosine similarity", "3D graph", 'Dictionary'],
    menu_icon="cast", default_index=0, orientation="horizontal")

# Nearest neighbours tab
if active_tab == "Nearest neighbours":
    st.write("### TO DO: add description of function")
    col1, col2 = st.columns(2)
    with st.container():
        # Word and time slice inputs sit side by side in the two columns
        with col1:
            word = st.text_input("Enter a word", placeholder="ἀνήρ")
        with col2:
            time_slice = st.multiselect("Time slice", ["Archaic", "Classical", "Hellenistic", "Early Roman", "Late Roman"])
        # Keep the slider value: it was previously discarded, so the requested
        # number of neighbours could never be handed to the lookup
        n_neighbours = st.slider("Number of neighbours", 1, 50, 15)
        nearest_neighbours_button = st.button("Find nearest neighbours")
        if nearest_neighbours_button:
            # Placeholder until the lookup is wired to word, time_slice and n_neighbours
            st.write("button pressed")

# Cosine similarity tab (placeholder)
elif active_tab == "Cosine similarity":
    with st.container():
        st.write("Cosine similarity tab")

# 3D graph tab (placeholder)
elif active_tab == "3D graph":
    with st.container():
        st.write("3D graph tab")

# Dictionary tab (placeholder)
elif active_tab == "Dictionary":
    with st.container():
        st.write("Dictionary tab")

models/.gitattributes ADDED Viewed

	@@ -0,0 +1 @@


1	+ *.model filter=lfs diff=lfs merge=lfs -text

models/archaic_cbow.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fdd1887db84078af826ae006bf11f884c808342f1ff9da93fd525052eef08204
+size 1647899

models/classical_cbow.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a50d112100a49d901e45e798591d2040c53bc50c67a48da1e05294f207ed5e2e
+size 6263363

models/early_roman_cbow.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f63942fae0974f4c3e39552d2d574a2f4b84e125c648d428a038e6192ec6f3f8
+size 8483329

models/hellen_cbow.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:027f8bdad4555ad4a4821a65ab2d564275105dda2d02e598e1f5f3435aedd90a
+size 5473215

models/late_roman_cbow.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:53d66deaf1b14067cead5da52e46e75d0944c2140a9b36782e85f01f2ac454f4
+size 3696190

word2vec.py ADDED Viewed

	@@ -0,0 +1,226 @@

+from gensim.models import Word2Vec
+from collections import defaultdict
+import os
+import tempfile
def load_all_models():
    '''
        Load the word2vec model of every time slice

        Return: list of (time slice name, model) tuples in the order
                archaic, classical, early_roman, hellen, late_roman
    '''
    model_files = (
        ('archaic', 'models/archaic_cbow.model'),
        ('classical', 'models/classical_cbow.model'),
        ('early_roman', 'models/early_roman_cbow.model'),
        ('hellen', 'models/hellen_cbow.model'),
        ('late_roman', 'models/late_roman_cbow.model'),
    )
    return [(slice_name, load_word2vec_model(path)) for slice_name, path in model_files]
def load_word2vec_model(model_path):
    '''
        Read a saved word2vec model from disk

        model_path: path of the model file, passed to Word2Vec.load
        Return: the loaded model
    '''
    loaded_model = Word2Vec.load(model_path)
    return loaded_model
def get_word_vector(model, word):
    '''
        Look up the vector stored for a word

        model: object exposing its vectors through the wv attribute
        word: key to look up in model.wv
        Return: the value of model.wv[word]
    '''
    embeddings = model.wv
    return embeddings[word]
def iterate_over_words(model):
    '''
        Iterate over all words in the vocabulary and print their vectors

        model: model whose wv.key_to_index maps every word to its index
        Prints one line per word: the index, the word and its vector
    '''
    # key_to_index already supplies the index of each word, so no separate
    # counter is needed (the old manual counter was overwritten every iteration)
    for word, index in model.wv.key_to_index.items():
        vector = model.wv[word]
        print(f'{index} Word: {word}, Vector: {vector}')
def model_dictionary(model):
    '''
        Return the dictionary of the word2vec model
        Key is the word and value is the vector of the word

        The result is a defaultdict(list), so looking up a word that is not
        in the vocabulary yields an empty list instead of raising KeyError
    '''
    # Named word_vectors rather than dict to avoid shadowing the builtin
    word_vectors = defaultdict(list)
    # Only the words are needed here, not their vocabulary indices
    for word in model.wv.key_to_index:
        word_vectors[word] = model.wv[word]
    return word_vectors
def dot_product(vector_a, vector_b):
    '''
        Multiply the two vectors element by element and return the sum of the products
        Elements are paired with zip, so a longer vector is cut to the length of the shorter one
    '''
    total = 0
    for component_a, component_b in zip(vector_a, vector_b):
        total = total + component_a * component_b
    return total
def magnitude(vector):
    '''
        Return the length of a vector: the square root of the sum of its squared elements
    '''
    squared_total = 0
    for component in vector:
        squared_total = squared_total + component**2
    return squared_total ** 0.5
def cosine_similarity(vector_a, vector_b):
    '''
        Return the cosine similarity of two vectors

        vector_a, vector_b: sequences of numbers
        Return: the similarity as a string formatted with two decimals,
                "0.00" when either vector has magnitude zero
    '''
    mag_a = magnitude(vector_a)
    mag_b = magnitude(vector_b)
    # Avoid division by zero. The zero case goes through the same formatting
    # as every other result so callers always receive the same type
    # (it used to be the float 0.0 while all other results were strings)
    if mag_a == 0 or mag_b == 0:
        similarity = 0.0
    else:
        similarity = dot_product(vector_a, vector_b) / (mag_a * mag_b)
    return "{:.2f}".format(similarity)
def get_cosine_similarity(word1, word2, time_slice):
    '''
        Return the cosine similarity of two words

        word1, word2: the words to compare
        time_slice: name of the model file in ./models, without the .model extension
        Return: the result of cosine_similarity for the two word vectors,
                or None if the model file does not exist
    '''
    # TODO: tidy this up
    model_path = f'models/{time_slice}.model'
    # Return if path does not exist
    if not os.path.exists(model_path):
        return None
    model = load_word2vec_model(model_path)
    # Read only the two vectors needed instead of copying the whole vocabulary
    # into a dictionary first. A word missing from the vocabulary is treated
    # as an empty vector, as before
    vocabulary = model.wv.key_to_index
    vector_1 = get_word_vector(model, word1) if word1 in vocabulary else []
    vector_2 = get_word_vector(model, word2) if word2 in vocabulary else []
    return cosine_similarity(vector_1, vector_2)
def get_cosine_similarity_one_word(word, time_slice1, time_slice2):
    '''
        Return the cosine similarity of one word in two different time slices

        word: the word to compare with itself across time slices
        time_slice1, time_slice2: names of the model files in ./models,
                                  without the .model extension
        Return: the result of cosine_similarity for the two vectors of the word,
                or None if either model file does not exist
    '''
    model_path1 = f'models/{time_slice1}.model'
    model_path2 = f'models/{time_slice2}.model'
    # Return if path does not exist
    if not (os.path.exists(model_path1) and os.path.exists(model_path2)):
        return None
    model1 = load_word2vec_model(model_path1)
    model2 = load_word2vec_model(model_path2)
    # Read only the vector of the word from each model instead of copying both
    # vocabularies into dictionaries. A word missing from a model is treated
    # as an empty vector, as before
    vector_1 = get_word_vector(model1, word) if word in model1.wv.key_to_index else []
    vector_2 = get_word_vector(model2, word) if word in model2.wv.key_to_index else []
    return cosine_similarity(vector_1, vector_2)
def validate_nearest_neighbours(word, time_slice_model, n):
    '''
        Validate the input of the nearest neighbours function

        word: the input word; must not be empty or None
        time_slice_model: the selected time slice; must not be empty, None
                          or the path built from a missing selection
        n: the number of neighbours; must not be empty or None
        Return: True if all parameters are set, False otherwise
    '''
    if not word:
        return False
    # The caller passes the bare time slice name, so a missing selection shows
    # up as None or '' and not only as the path 'models/None.model'
    if time_slice_model is None or time_slice_model == '' or time_slice_model == 'models/None.model':
        return False
    if n is None or n == '':
        return False
    return True
def get_nearest_neighbours(word, time_slice_model, n=10, models=None):
    '''
        Return the nearest neighbours of a word
        word: the word for which the nearest neighbours are calculated
        time_slice_model: name of the model file (without .model) in ./models
                          of the time slice of the input word
        models: list of tuples with the name of the time slice and the word2vec model (default: all in ./models)
        n: the number of nearest neighbours to return (default: 10)
        Return: list of tuples with the word, the time slice and
                the cosine similarity of the nearest neighbours, most similar first;
                a single error row if not all parameters are set
        Raises: whatever the vector lookup raises when the word is not in the
                vocabulary of the time slice model
    '''
    import heapq

    # Check if all parameters are set
    if not validate_nearest_neighbours(word, time_slice_model, n):
        return [['Error: not all parameters are set', '', '']]

    # Load the models on demand. A load_all_models() default in the signature is
    # evaluated once when the module is imported, reading every model from disk
    # even if this function is never called
    if models is None:
        models = load_all_models()

    source_model = load_word2vec_model(f'models/{time_slice_model}.model')
    vector_1 = get_word_vector(source_model, word)

    def candidates():
        # Yield (word, time slice, similarity) for every word of every model
        for model_name, model in models:
            for candidate in model.wv.key_to_index:
                vector_2 = get_word_vector(model, candidate)
                yield (candidate, model_name, cosine_similarity(vector_1, vector_2))

    # cosine_similarity may hand back a formatted string, so rank on the numeric
    # value: comparing the strings mis-orders negative similarities and fails
    # when a float is mixed in. The stored similarity itself is left untouched
    return heapq.nlargest(n, candidates(), key=lambda neighbour: float(neighbour[2]))
def write_to_file(data):
    '''
        Write the data to a file

        data: any object; its str() representation is written
        Return: path of the newly created temporary file in /tmp
    '''
    # Create random tmp file name
    temp_file_descriptor, temp_file_path = tempfile.mkstemp(prefix="temp_", suffix=".txt", dir="/tmp")
    # Write through the descriptor mkstemp already opened instead of closing and
    # reopening the path. UTF-8 is set explicitly so that non-ASCII text
    # (e.g. Greek words) can be written regardless of the locale encoding
    with os.fdopen(temp_file_descriptor, 'w', encoding='utf-8') as temp_file:
        temp_file.write(str(data))
    return temp_file_path
def main():
    '''
        Print the five nearest neighbours of 'πατήρ' in the archaic time slice,
        searched across the models of all time slices
    '''
    models = load_all_models()
    # get_nearest_neighbours builds the model path from the time slice name, so
    # the name is passed rather than a loaded model. models is passed by keyword
    # because the third positional parameter is n
    nearest_neighbours = get_nearest_neighbours('πατήρ', 'archaic_cbow', n=5, models=models)
    print(nearest_neighbours)


if __name__ == "__main__":
    main()