|
""" |
|
Streamlit page for project similarities: filter projects by CRS3 code and show the resulting matches.
|
""" |
|
|
|
|
|
|
|
|
|
import streamlit as st |
|
import pandas as pd |
|
from scipy.sparse import load_npz |
|
import pickle |
|
import faiss |
|
from sentence_transformers import SentenceTransformer |
|
from modules.result_table import show_table |
|
import modules.semantic_search as semantic_search |
|
from functions.filter_projects import filter_projects |
|
from functions.calc_matches import calc_matches |
|
import psutil |
|
import os |
|
|
|
def get_process_memory():
    """Return the resident set size (RSS) of the current process in MiB."""
    bytes_per_mib = 1024 * 1024
    current_process = psutil.Process(os.getpid())
    rss_bytes = current_process.memory_info().rss
    return rss_bytes / bytes_per_mib
|
|
|
|
|
|
|
@st.cache_data
def load_sim_matrix():
    """Load the stored similarity matrix and return it as a dense array.

    Reads ``src/similarities.npz`` with ``scipy.sparse.load_npz`` and expands
    the sparse matrix with ``toarray()``. The result is cached by
    ``st.cache_data``.
    """
    # NOTE(review): st.cache_data returns a copy of the cached value on each
    # call; for a large dense matrix st.cache_resource may be cheaper --
    # confirm that no caller mutates the matrix before switching.
    sparse_similarities = load_npz("src/similarities.npz")
    return sparse_similarities.toarray()
|
|
|
|
|
@st.cache_data
def load_projects():
    """Build the combined projects table from the project CSV files.

    Reads the five CSV files under ``src/projects/`` (orgas, region, sector,
    status, texts) and inner-joins them one after another on ``iati_id``, so
    only ids that occur in every file remain.

    Returns:
        The merged ``pandas.DataFrame``.
    """
    csv_paths = (
        "src/projects/project_orgas.csv",
        "src/projects/project_region.csv",
        "src/projects/project_sector.csv",
        "src/projects/project_status.csv",
        "src/projects/project_texts.csv",
    )
    # Read everything first, then merge left-to-right in the listed order.
    frames = [pd.read_csv(path) for path in csv_paths]

    merged_df, *remaining_frames = frames
    for part_df in remaining_frames:
        merged_df = pd.merge(merged_df, part_df, on='iati_id', how='inner')

    return merged_df
|
|
|
|
|
@st.cache_data
def getCRS3():
    """Return a mapping from display label to CRS3 code.

    Reads ``src/codelists/crs3_codes.csv`` and maps each label of the form
    ``"<name> - <code>"`` to the corresponding value of the ``code`` column.
    """
    codelist_df = pd.read_csv('src/codelists/crs3_codes.csv')
    codes = codelist_df['code'].tolist()
    names = codelist_df['name'].tolist()

    label_to_code = {}
    for name, code in zip(names, codes):
        label_to_code[f"{name} - {code}"] = code

    return label_to_code
|
|
|
|
|
@st.cache_data
def getCRS5():
    """Return a mapping from CRS5 code to its display label.

    Reads ``src/codelists/crs5_codes.csv`` and maps every value of the
    ``code`` column to a one-element list holding ``"<name> - <code>"``.
    """
    codelist_df = pd.read_csv('src/codelists/crs5_codes.csv')
    codes = codelist_df['code'].tolist()
    names = codelist_df['name'].tolist()

    code_to_labels = {}
    for name, code in zip(names, codes):
        # The label is wrapped in a list, exactly one entry per code.
        code_to_labels[code] = [f"{name} - {code}"]

    return code_to_labels
|
|
|
|
|
@st.cache_data
def getSDG():
    """Return the ``name`` column of ``src/codelists/sdg_goals.csv`` as a list."""
    goals_df = pd.read_csv('src/codelists/sdg_goals.csv')
    return goals_df['name'].tolist()
|
|
|
|
|
@st.cache_resource
def load_model():
    """Create the ``all-MiniLM-L6-v2`` SentenceTransformer.

    Cached with ``st.cache_resource``, so the same model object is reused
    instead of being constructed again on every call.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')
|
|
|
|
|
|
|
@st.cache_resource
def load_embeddings_and_index():
    """Load the stored sentence embeddings and build a FAISS index over them.

    Reads ``src/embeddings.pkl`` (a pickled dict with the keys ``"sentences"``
    and ``"embeddings"``), creates a flat L2 index whose dimension is the
    second axis of the embeddings, and adds all embeddings to it.

    The function is cached with ``st.cache_resource`` rather than
    ``st.cache_data``: ``st.cache_data`` has to pickle the returned tuple and
    hands out a freshly deserialized copy on every call, which is costly for
    the embeddings and the FAISS index and fails outright when the index
    object cannot be pickled. ``st.cache_resource`` stores the objects as-is
    and returns the same instances, matching how ``load_model`` is cached.
    Because the instances are shared, callers must treat them as read-only.

    Returns:
        tuple: ``(sentences, embeddings, faiss_index)``.
    """
    # NOTE(security): pickle.load executes arbitrary code contained in the
    # file; src/embeddings.pkl must be a trusted, project-generated artifact.
    with open("src/embeddings.pkl", "rb") as fIn:
        stored_data = pickle.load(fIn)

    sentences = stored_data["sentences"]
    embeddings = stored_data["embeddings"]

    # The index dimension is taken from the embeddings' second axis.
    dimension = embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(dimension)
    faiss_index.add(embeddings)

    return sentences, embeddings, faiss_index
|
|
|
|
|
# Load the shared data at import time. The loader functions are wrapped in
# Streamlit cache decorators, so repeated runs reuse the stored results.
sim_matrix = load_sim_matrix()
projects_df = load_projects()

# Code lists read from src/codelists/.
CRS3_MERGED = getCRS3()
CRS5_MERGED = getCRS5()
SDG_NAMES = getSDG()

# Sentence-embedding model, plus the stored embeddings and their FAISS index.
model = load_model()
sentences, embeddings, faiss_index = load_embeddings_and_index()
|
|
|
def show_page():
    """Render the similarities page.

    Writes the current memory usage and a heading, offers a CRS3 multiselect
    in the left of two equal columns, filters the projects by the selected
    codes, computes the matches against the similarity matrix and passes the
    two resulting tables to ``show_table``.
    """
    st.write(f"Current RAM usage of this app: {get_process_memory():.2f} MB")
    st.write("Similarities")

    filter_col, placeholder_col = st.columns([1, 1])

    with filter_col:
        selected_labels = st.multiselect('CRS 3', CRS3_MERGED, placeholder="Select")

    with placeholder_col:
        st.write("x")

    # Labels have the form "<name> - <code>"; the last three characters are
    # taken as the code (assumes CRS3 codes are exactly three characters long).
    selected_codes = [label[-3:] for label in selected_labels]

    matching_projects = filter_projects(projects_df, selected_codes)
    first_table, second_table = calc_matches(matching_projects, projects_df, sim_matrix)

    show_table(first_table, second_table)
|
|