""" Page for similarities """ ################ # DEPENDENCIES # ################ import streamlit as st import pandas as pd from scipy.sparse import load_npz import pickle import faiss from sentence_transformers import SentenceTransformer import modules.result_table as result_table import modules.semantic_search as semantic_search from functions.filter_projects import filter_projects import psutil import os def get_process_memory(): process = psutil.Process(os.getpid()) return process.memory_info().rss / (1024 * 1024) # Catch DATA # Load Similarity matrix @st.cache_data def load_sim_matrix(): loaded_matrix = load_npz("src/similarities.npz") dense_matrix = loaded_matrix.toarray() return dense_matrix # Load Projects DFs @st.cache_data def load_projects(): orgas_df = pd.read_csv("src/projects/project_orgas.csv") region_df = pd.read_csv("src/projects/project_region.csv") sector_df = pd.read_csv("src/projects/project_sector.csv") status_df = pd.read_csv("src/projects/project_status.csv") texts_df = pd.read_csv("src/projects/project_texts.csv") projects_df = pd.merge(orgas_df, region_df, on='iati_id', how='inner') projects_df = pd.merge(projects_df, sector_df, on='iati_id', how='inner') projects_df = pd.merge(projects_df, status_df, on='iati_id', how='inner') projects_df = pd.merge(projects_df, texts_df, on='iati_id', how='inner') return projects_df # Load CRS 3 data @st.cache_data def getCRS3(): # Read in CRS3 CODELISTS crs3_df = pd.read_csv('src/codelists/crs3_codes.csv') CRS3_CODES = crs3_df['code'].tolist() CRS3_NAME = crs3_df['name'].tolist() CRS3_MERGED = {f"{name} - {code}": code for name, code in zip(CRS3_NAME, CRS3_CODES)} return CRS3_MERGED # Load CRS 5 data @st.cache_data def getCRS5(): # Read in CRS3 CODELISTS crs5_df = pd.read_csv('src/codelists/crs5_codes.csv') CRS5_CODES = crs5_df['code'].tolist() CRS5_NAME = crs5_df['name'].tolist() CRS5_MERGED = {code: [f"{name} - {code}"] for name, code in zip(CRS5_NAME, CRS5_CODES)} return CRS5_MERGED # Load SDG data 
@st.cache_data
def getSDG():
    """Return the list of SDG goal names from the SDG codelist CSV."""
    # Read in SDG CODELISTS
    sdg_df = pd.read_csv('src/codelists/sdg_goals.csv')
    SDG_NAMES = sdg_df['name'].tolist()

    return SDG_NAMES


# Load Sentence Transformer Model
@st.cache_resource
def load_model():
    """Load and cache the 'all-MiniLM-L6-v2' SentenceTransformer model."""
    model = SentenceTransformer('all-MiniLM-L6-v2')
    return model


# Load Embeddings
@st.cache_data
def load_embeddings_and_index():
    """Load pickled sentence embeddings and build a FAISS L2 index over them.

    Returns:
        (sentences, embeddings, faiss_index) — the raw sentences, their
        embedding matrix, and an exact-search IndexFlatL2 filled with them.
    """
    # Load embeddings
    with open("src/embeddings.pkl", "rb") as fIn:
        stored_data = pickle.load(fIn)
        sentences = stored_data["sentences"]
        embeddings = stored_data["embeddings"]

    # Load or create FAISS index
    dimension = embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(dimension)
    faiss_index.add(embeddings)

    return sentences, embeddings, faiss_index


# USE CACHE FUNCTIONS
sim_matrix = load_sim_matrix()
projects_df = load_projects()
CRS3_MERGED = getCRS3()
CRS5_MERGED = getCRS5()
SDG_NAMES = getSDG()
model = load_model()
sentences, embeddings, faiss_index = load_embeddings_and_index()


def show_page():
    """Render the similarities page: RAM usage, a CRS3 filter, and results."""
    st.write(f"Current RAM usage of this app: {get_process_memory():.2f} MB")
    st.write("Similarities")

    col1, col2 = st.columns([1, 1])
    with col1:
        # CRS 3 SELECTION
        crs3_option = st.multiselect(
            'CRS 3',
            CRS3_MERGED,
            placeholder="Select"
        )
    with col2:
        st.write("x")

    # CRS CODE LIST: the last 3 characters of each "{name} - {code}" label.
    # NOTE(review): assumes every CRS3 code is exactly 3 digits — confirm
    # against src/codelists/crs3_codes.csv.
    crs3_list = [i[-3:] for i in crs3_option]
    st.write(crs3_list)

    result_df = filter_projects(projects_df, crs3_list)
    st.dataframe(result_df)

    # Disabled prototype kept from the original as a no-op string.
    # NOTE(review): it references `similarity_table`, which is not defined
    # anywhere in this module (the import is `result_table`) — it would
    # raise NameError if re-enabled as-is.
    """
    #semantic_search.show_search(model, faiss_index, sentences)

    df_subset = projects_df.head(10)
    selected_index = st.selectbox('Select an entry', df_subset.index,
                                  format_func=lambda x: df_subset.loc[x, 'iati_id'])
    st.write(selected_index)

    # add index and similarity together
    indecies = range(0, len(sim_matrix))
    similarities = sim_matrix[selected_index]
    zipped_sims = list(zip(indecies, similarities))

    # remove all 0 similarities
    filtered_sims = [(index, similarity) for index, similarity in zipped_sims
                     if similarity != 0]

    # Select and sort top 20 most similar projects
    sorted_sims = sorted(filtered_sims, key=lambda x: x[1], reverse=True)
    top_20_sims = sorted_sims[:20]

    # create result data frame
    index_list = [tup[0] for tup in top_20_sims]
    print(index_list)
    result_df = projects_df.iloc[index_list]
    print(len(result_df))
    print(len(result_df))

    # add other colums to result df
    similarity_list = [tup[1] for tup in top_20_sims]
    result_df["similarity"] = similarity_list

    similarity_table.show_table(result_df, similarity_list)
    """