Jan Mühlnikel committed on
Commit 139b395 · 1 Parent(s): 5ae2590

added semantic search engine

__pycache__/similarity.cpython-310.pyc CHANGED
Binary files a/__pycache__/similarity.cpython-310.pyc and b/__pycache__/similarity.cpython-310.pyc differ
 
similarity.py CHANGED
@@ -8,7 +8,11 @@ Page for similarities
 import streamlit as st
 import pandas as pd
 from scipy.sparse import load_npz
+import pickle
+import faiss
+from sentence_transformers import SentenceTransformer
 import utils.similarity_table as similarity_table
+import utils.semantic_search as semantic_search
 import psutil
 import os
 
@@ -40,14 +44,39 @@ def load_projects():
 
     return projects_df
 
+@st.cache_resource
+def load_model():
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    return model
+
+# LOAD EMBEDDINGS
+@st.cache_data
+def load_embeddings_and_index():
+    # Load embeddings
+    with open("..\synergy-app\src\embeddings.pkl", "rb") as fIn:
+        stored_data = pickle.load(fIn)
+        sentences = stored_data["sentences"]
+        embeddings = stored_data["embeddings"]
+
+    # Load or create FAISS index
+    dimension = embeddings.shape[1]
+    faiss_index = faiss.IndexFlatL2(dimension)
+    faiss_index.add(embeddings)
+
+    return sentences, embeddings, faiss_index
+
 # LOAD DATA
 sim_matrix = load_sim_matrix()
 projects_df = load_projects()
+model = load_model()
+sentences, embeddings, faiss_index = load_embeddings_and_index()
 
 def show_page():
     st.write(f"Current RAM usage of this app: {get_process_memory():.2f} MB")
     st.write("Similarities")
 
+    semantic_search.show_search(model, faiss_index, sentences)
+
     df_subset = projects_df.head(10)
     selected_index = st.selectbox('Select an entry', df_subset.index, format_func=lambda x: df_subset.loc[x, 'iati_id'])
 
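The new load_embeddings_and_index() reads a pre-computed embeddings.pkl holding the keys "sentences" and "embeddings" and wraps the vectors in an exact-search faiss.IndexFlatL2. The commit does not include the script that produces that pickle; a minimal sketch of how it could be generated with the same all-MiniLM-L6-v2 model follows (the helper script name, output path, and sample sentences are assumptions, not part of this repository):

# build_embeddings.py -- hypothetical helper, not part of this commit.
# Encodes texts once and stores them in the layout that
# load_embeddings_and_index() expects: {"sentences": [...], "embeddings": ndarray}.
import pickle
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

# Placeholder texts; the app would index its real project descriptions here.
sentences = [
    "Solar energy project in rural Kenya",
    "Water and sanitation programme in Ghana",
]
embeddings = model.encode(sentences, convert_to_numpy=True)  # float32 array, shape (n, 384)

with open("embeddings.pkl", "wb") as fOut:
    pickle.dump({"sentences": sentences, "embeddings": embeddings}, fOut)

Two details in the committed loader may be worth double-checking: the hard-coded Windows-style path "..\synergy-app\src\embeddings.pkl" will not resolve on a Linux host, and @st.cache_data serializes its return value with pickle, which can fail for a raw FAISS index object; @st.cache_resource is typically used for such unpicklable resources.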
utils/__pycache__/semantic_search.cpython-310.pyc ADDED
Binary file (825 Bytes).
 
utils/semantic_search.py ADDED
@@ -0,0 +1,19 @@
+import pickle
+import faiss
+import streamlit as st
+from sentence_transformers import SentenceTransformer
+
+def show_search(model, faiss_index, sentences):
+    query = st.text_input("Enter your search query:")
+
+    if query:
+        # Convert query to embedding
+        query_embedding = model.encode([query])[0].reshape(1, -1)
+
+        # Perform search
+        D, I = faiss_index.search(query_embedding, k=5)  # Search for top 5 similar items
+
+        # Display results
+        st.write("Top results:")
+        for i in I[0]:
+            st.write(sentences[i])
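show_search() encodes the query with the cached SentenceTransformer, runs an exact k-nearest-neighbour search on the FAISS index, and writes the matching sentences to the page. A self-contained sketch of the same encode-and-search flow outside Streamlit, useful for checking the index on its own (the sample sentences, query, and k value here are illustrative, not taken from the app):

# Standalone check of the encode -> FAISS search path used by show_search().
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
sentences = [
    "Solar energy project in rural Kenya",
    "Water and sanitation programme in Ghana",
    "Vocational training for young women",
]
embeddings = model.encode(sentences, convert_to_numpy=True).astype(np.float32)

index = faiss.IndexFlatL2(embeddings.shape[1])  # exact L2 search, no training step
index.add(embeddings)

# Encode the query as a (1, d) float32 array, then fetch the two nearest neighbours.
query_embedding = model.encode(["renewable power"]).astype(np.float32).reshape(1, -1)
distances, ids = index.search(query_embedding, 2)
for dist, i in zip(distances[0], ids[0]):
    print(f"{dist:.3f}  {sentences[i]}")

With IndexFlatL2 a smaller distance means a closer match; if cosine similarity is preferred instead, the stored embeddings and the query can be L2-normalised and an IndexFlatIP used in place of IndexFlatL2.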