Spaces:

GroNLP
/

agalma

Running

App Files Files Community

Mark7549 committed on Mar 11, 2024

Commit

169869e

1 Parent(s): 14c3a4f

Added option to select models to search word in

Browse files

Files changed (2) hide show

app.py +16 -3
word2vec.py +39 -4

app.py CHANGED Viewed

@@ -20,6 +20,7 @@ if active_tab == "Nearest neighbours":
         with col2:
             time_slice = st.selectbox("Time slice", ["Archaic", "Classical", "Hellenistic", "Early Roman", "Late Roman"])
         n = st.slider("Number of neighbours", 1, 50, 15)
         nearest_neighbours_button = st.button("Find nearest neighbours")
@@ -28,14 +29,26 @@ if active_tab == "Nearest neighbours":
         if nearest_neighbours_button:
             # Rewrite timeslices to model names: Archaic -> archaic_cbow
             time_slice = time_slice.lower() + "_cbow"
-            st.write(time_slice)
             # Check if all fields are filled in
-            if validate_nearest_neighbours(word, time_slice, n) == False:
                 st.error('Please fill in all fields')
             else:
-                nearest_neighbours = get_nearest_neighbours(word, time_slice, n)
                 df = pd.DataFrame(nearest_neighbours, columns=["Word", "Time slice", "Similarity"])
                 st.table(df)

         with col2:
             time_slice = st.selectbox("Time slice", ["Archaic", "Classical", "Hellenistic", "Early Roman", "Late Roman"])
+        models = st.multiselect("Select models to search for neighbours", ["Archaic", "Classical", "Hellenistic", "Early Roman", "Late Roman"])
         n = st.slider("Number of neighbours", 1, 50, 15)
         nearest_neighbours_button = st.button("Find nearest neighbours")
         if nearest_neighbours_button:
             # Rewrite timeslices to model names: Archaic -> archaic_cbow
+            if time_slice == 'Hellenistic':
+                time_slice = 'hellen'
+            elif time_slice == 'Early Roman':
+                time_slice = 'early_roman'
+            elif time_slice == 'Late Roman':
+                time_slice = 'late_roman'
             time_slice = time_slice.lower() + "_cbow"
             # Check if all fields are filled in
+            if validate_nearest_neighbours(word, time_slice, n, models) == False:
                 st.error('Please fill in all fields')
             else:
+                # Rewrite models to list of all loaded models
+                models = load_selected_models(models)
+                nearest_neighbours = get_nearest_neighbours(word, time_slice, n, models)
                 df = pd.DataFrame(nearest_neighbours, columns=["Word", "Time slice", "Similarity"])
                 st.table(df)

word2vec.py CHANGED Viewed

@@ -18,6 +18,24 @@ def load_all_models():
     return [archaic, classical, early_roman, hellen, late_roman]
 def load_word2vec_model(model_path):
     '''
         Load a word2vec model from a file
@@ -120,15 +138,31 @@ def get_cosine_similarity_one_word(word, time_slice1, time_slice2):
-def validate_nearest_neighbours(word, time_slice_model, n):
     '''
         Validate the input of the nearest neighbours function
     '''
-    if word == '' or time_slice_model == [] or n == '':
         return False
     return True
 def get_nearest_neighbours(word, time_slice_model, n=10, models=load_all_models()):
     '''
         Return the nearest neighbours of a word
@@ -149,6 +183,7 @@ def get_nearest_neighbours(word, time_slice_model, n=10, models=load_all_models(
     # Iterate over all models
     for model in models:
         model_name = model[0]
         model = model[1]
         # Iterate over all words of the model
@@ -162,14 +197,14 @@ def get_nearest_neighbours(word, time_slice_model, n=10, models=load_all_models(
             # If the list of nearest neighbours is not full yet, add the current word
             if len(nearest_neighbours) < n:
-                nearest_neighbours.append((word, model_name, cosine_similarity_vectors))
             # If the list of nearest neighbours is full, replace the word with the smallest cosine similarity
             else:
                 smallest_neighbour = min(nearest_neighbours, key=lambda x: x[2])
                 if cosine_similarity_vectors > smallest_neighbour[2]:
                     nearest_neighbours.remove(smallest_neighbour)
-                    nearest_neighbours.append((word, model_name, cosine_similarity_vectors))
     return sorted(nearest_neighbours, key=lambda x: x[2], reverse=True)

     return [archaic, classical, early_roman, hellen, late_roman]
def load_selected_models(selected_models):
    '''
        Load the selected word2vec models

        selected_models: iterable of time slice display names
        (e.g. "Archaic", "Early Roman"). None or an empty selection
        gives an empty list.

        Returns a list of [model_name, model] pairs, where model_name is
        the model file stem (e.g. "early_roman_cbow") and model is the
        result of load_word2vec_model for 'models/<model_name>.model'.
    '''
    # Display names whose file stem is not simply the lower-cased name
    irregular_stems = {
        "Early Roman": "early_roman",
        "Late Roman": "late_roman",
        "Hellenistic": "hellen",
    }
    models = []
    # `or ()` lets a missing selection (None) behave like an empty one
    for selected in selected_models or ():
        stem = irregular_stems.get(selected, selected)
        model_name = stem.lower() + "_cbow"
        models.append([model_name, load_word2vec_model(f'models/{model_name}.model')])
    return models
 def load_word2vec_model(model_path):
     '''
         Load a word2vec model from a file
def validate_nearest_neighbours(word, time_slice_model, n, models):
    '''
        Validate the input of the nearest neighbours function

        word: the word to look up; must not be None, empty or only whitespace
        time_slice_model: the time slice model to search from; must not be
        empty (covers '', [] and None)
        n: number of neighbours; must not be None or ''
        models: the selected models; must not be empty (covers [], () and None)

        Returns True when every field is filled in, False otherwise.
    '''
    # A blank or whitespace-only word cannot be looked up
    if word is None or str(word).strip() == '':
        return False
    # Truthiness treats '', [], () and None alike as "not filled in"
    if not time_slice_model or not models:
        return False
    # n is compared explicitly so that a numeric value is never rejected
    if n is None or n == '':
        return False
    return True
def convert_model_to_time_name(model_name):
    '''
        Convert the model name to the time slice name

        model_name: a model name such as 'archaic_cbow' or 'hellen_cbow'

        Returns the matching time slice name (e.g. 'Hellenistic'). A model
        name that is not in the table is returned unchanged, so the caller
        always gets a usable label instead of None.
    '''
    time_names = {
        'archaic_cbow': 'Archaic',
        'classical_cbow': 'Classical',
        'early_roman_cbow': 'Early Roman',
        'hellen_cbow': 'Hellenistic',
        'late_roman_cbow': 'Late Roman',
    }
    # Fall back to the raw model name for models without a display name
    return time_names.get(model_name, model_name)
 def get_nearest_neighbours(word, time_slice_model, n=10, models=load_all_models()):
     '''
         Return the nearest neighbours of a word
     # Iterate over all models
     for model in models:
         model_name = model[0]
+        time_name = convert_model_to_time_name(model_name)
         model = model[1]
         # Iterate over all words of the model
             # If the list of nearest neighbours is not full yet, add the current word
             if len(nearest_neighbours) < n:
+                nearest_neighbours.append((word, time_name, cosine_similarity_vectors))
             # If the list of nearest neighbours is full, replace the word with the smallest cosine similarity
             else:
                 smallest_neighbour = min(nearest_neighbours, key=lambda x: x[2])
                 if cosine_similarity_vectors > smallest_neighbour[2]:
                     nearest_neighbours.remove(smallest_neighbour)
+                    nearest_neighbours.append((word, time_name, cosine_similarity_vectors))
     return sorted(nearest_neighbours, key=lambda x: x[2], reverse=True)