Jan Mühlnikel
committed on
Commit
·
139b395
1
Parent(s):
5ae2590
added semantic search engine
Browse files
__pycache__/similarity.cpython-310.pyc
CHANGED
Binary files a/__pycache__/similarity.cpython-310.pyc and b/__pycache__/similarity.cpython-310.pyc differ
|
|
similarity.py
CHANGED
@@ -8,7 +8,11 @@ Page for similarities
|
|
8 |
import streamlit as st
|
9 |
import pandas as pd
|
10 |
from scipy.sparse import load_npz
|
|
|
|
|
|
|
11 |
import utils.similarity_table as similarity_table
|
|
|
12 |
import psutil
|
13 |
import os
|
14 |
|
@@ -40,14 +44,39 @@ def load_projects():
|
|
40 |
|
41 |
return projects_df
|
42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
# LOAD DATA
|
44 |
sim_matrix = load_sim_matrix()
|
45 |
projects_df = load_projects()
|
|
|
|
|
46 |
|
47 |
def show_page():
|
48 |
st.write(f"Current RAM usage of this app: {get_process_memory():.2f} MB")
|
49 |
st.write("Similarities")
|
50 |
|
|
|
|
|
51 |
df_subset = projects_df.head(10)
|
52 |
selected_index = st.selectbox('Select an entry', df_subset.index, format_func=lambda x: df_subset.loc[x, 'iati_id'])
|
53 |
|
|
|
8 |
import streamlit as st
|
9 |
import pandas as pd
|
10 |
from scipy.sparse import load_npz
|
11 |
+
import pickle
|
12 |
+
import faiss
|
13 |
+
from sentence_transformers import SentenceTransformer
|
14 |
import utils.similarity_table as similarity_table
|
15 |
+
import utils.semantic_search as semantic_search
|
16 |
import psutil
|
17 |
import os
|
18 |
|
|
|
44 |
|
45 |
return projects_df
|
46 |
|
47 |
+
@st.cache_resource
def load_model():
    """Create the sentence-embedding model used for semantic search.

    Wrapped in ``st.cache_resource`` so the function body is not re-run on
    every Streamlit rerun.

    Returns:
        The ``SentenceTransformer`` instance for ``all-MiniLM-L6-v2``.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')
|
51 |
+
|
52 |
+
# LOAD EMBEDDINGS
|
53 |
+
# cache_resource keeps one shared instance of the returned objects;
# cache_data would serialise and copy the whole tuple (including the FAISS
# index) on every rerun.
@st.cache_resource
def load_embeddings_and_index():
    """Load the stored sentence embeddings and build a FAISS index over them.

    Returns:
        A ``(sentences, embeddings, faiss_index)`` tuple: the ``"sentences"``
        and ``"embeddings"`` entries read from the pickle file, and a flat L2
        index to which all embeddings have been added.
    """
    # Forward slashes resolve on Windows and POSIX alike. The previous
    # backslash-separated literal relied on the invalid escape sequences
    # "\s" and "\e" and named a non-existent file on non-Windows hosts.
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only ship an embeddings.pkl produced by this project.
    with open("../synergy-app/src/embeddings.pkl", "rb") as f_in:
        stored_data = pickle.load(f_in)

    sentences = stored_data["sentences"]
    embeddings = stored_data["embeddings"]

    # assumes embeddings is a 2-D float32 array (n_sentences, dim) — TODO confirm
    dimension = embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(dimension)
    faiss_index.add(embeddings)

    return sentences, embeddings, faiss_index
|
67 |
+
|
68 |
# LOAD DATA
|
69 |
sim_matrix = load_sim_matrix()
|
70 |
projects_df = load_projects()
|
71 |
+
model = load_model()
|
72 |
+
sentences, embeddings, faiss_index = load_embeddings_and_index()
|
73 |
|
74 |
def show_page():
|
75 |
st.write(f"Current RAM usage of this app: {get_process_memory():.2f} MB")
|
76 |
st.write("Similarities")
|
77 |
|
78 |
+
semantic_search.show_search(model, faiss_index, sentences)
|
79 |
+
|
80 |
df_subset = projects_df.head(10)
|
81 |
selected_index = st.selectbox('Select an entry', df_subset.index, format_func=lambda x: df_subset.loc[x, 'iati_id'])
|
82 |
|
utils/__pycache__/semantic_search.cpython-310.pyc
ADDED
Binary file (825 Bytes). View file
|
|
utils/semantic_search.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pickle
|
2 |
+
import faiss
|
3 |
+
import streamlit as st
|
4 |
+
from sentence_transformers import SentenceTransformer
|
5 |
+
|
6 |
+
def show_search(model, faiss_index, sentences, top_k=5):
    """Render a search box and list the stored sentences closest to the query.

    Args:
        model: Encoder exposing ``encode(list_of_str)``; the first row of its
            output is used as the query vector.
        faiss_index: Index exposing ``search(vectors, k)``.
        sentences: Sequence indexed by the positions the index returns.
        top_k: Number of nearest neighbours to request. Defaults to 5, the
            value that was previously hard-coded.
    """
    query = st.text_input("Enter your search query:")

    # Nothing to search for on empty or whitespace-only input.
    if not query or not query.strip():
        return

    # Convert the query to a single-row batch, the shape search() expects.
    query_embedding = model.encode([query])[0].reshape(1, -1)

    # Distances are not displayed; only the neighbour positions are used.
    _distances, indices = faiss_index.search(query_embedding, k=top_k)

    st.write("Top results:")
    for idx in indices[0]:
        # FAISS pads the result with -1 when the index holds fewer than
        # top_k vectors; sentences[-1] would wrongly show the last sentence.
        if idx < 0:
            continue
        st.write(sentences[idx])
|