"""
Page for similarities
"""

################
# DEPENDENCIES #
################
import streamlit as st
import pandas as pd
from scipy.sparse import load_npz
import pickle
import faiss
from sentence_transformers import SentenceTransformer
import utils.similarity_table as similarity_table
import utils.semantic_search as semantic_search
import psutil
import os

def get_process_memory():
    # resident set size (RSS) of the current process, converted to MB
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / (1024 * 1024)

# CACHE DATA
# Load similarity matrix
@st.cache_data
def load_sim_matrix():
    # densify once so rows can be indexed directly; costs O(n^2) memory for n projects
    loaded_matrix = load_npz("src/similarities.npz")
    dense_matrix = loaded_matrix.toarray()

    return dense_matrix

@st.cache_data
def load_projects():
    orgas_df = pd.read_csv("src/projects/project_orgas.csv")
    region_df = pd.read_csv("src/projects/project_region.csv")
    sector_df = pd.read_csv("src/projects/project_sector.csv")
    status_df = pd.read_csv("src/projects/project_status.csv")
    texts_df = pd.read_csv("src/projects/project_texts.csv")

    # inner joins: keep only projects that appear in all five CSVs
    projects_df = pd.merge(orgas_df, region_df, on='iati_id', how='inner')
    projects_df = pd.merge(projects_df, sector_df, on='iati_id', how='inner')
    projects_df = pd.merge(projects_df, status_df, on='iati_id', how='inner')
    projects_df = pd.merge(projects_df, texts_df, on='iati_id', how='inner')

    return projects_df

@st.cache_resource
def load_model():
    # cache_resource rather than cache_data: the model is an unserializable resource
    model = SentenceTransformer('all-MiniLM-L6-v2')
    return model

# LOAD EMBEDDINGS
@st.cache_data
def load_embeddings_and_index():
    # Load precomputed embeddings; a forward-slash relative path keeps this
    # portable and consistent with the other data files under src/
    with open("src/embeddings.pkl", "rb") as fIn:
        stored_data = pickle.load(fIn)
    sentences = stored_data["sentences"]
    embeddings = stored_data["embeddings"]

    # Build a FAISS index over the embeddings
    # (IndexFlatL2 does exact L2 search; faiss expects float32 vectors)
    dimension = embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(dimension)
    faiss_index.add(embeddings)

    return sentences, embeddings, faiss_index
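
# A minimal sketch (not part of this app) of how an embeddings.pkl with the
# keys used above could be produced offline; the text column name is an
# assumption, not taken from this repo:
#
#   model = SentenceTransformer('all-MiniLM-L6-v2')
#   sentences = texts_df["description"].tolist()   # hypothetical column name
#   embeddings = model.encode(sentences, convert_to_numpy=True)  # float32 array
#   with open("src/embeddings.pkl", "wb") as fOut:
#       pickle.dump({"sentences": sentences, "embeddings": embeddings}, fOut)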

# LOAD DATA
sim_matrix = load_sim_matrix()
projects_df = load_projects()
model = load_model()
sentences, embeddings, faiss_index = load_embeddings_and_index()

def show_page():
    st.write(f"Current RAM usage of this app: {get_process_memory():.2f} MB")
    st.write("Similarities")

    semantic_search.show_search(model, faiss_index, sentences)

    df_subset = projects_df.head(10)
    selected_index = st.selectbox('Select an entry', df_subset.index, format_func=lambda x: df_subset.loc[x, 'iati_id'])

    st.write(selected_index)

    # pair each project index with its similarity to the selected project
    indices = range(len(sim_matrix))
    similarities = sim_matrix[selected_index]
    zipped_sims = list(zip(indices, similarities))

    # remove all zero similarities
    filtered_sims = [(index, similarity) for index, similarity in zipped_sims if similarity != 0]

    # select and sort the 20 most similar projects
    sorted_sims = sorted(filtered_sims, key=lambda x: x[1], reverse=True)
    top_20_sims = sorted_sims[:20]
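
    # A vectorized alternative to the zip/filter/sort above, kept as a sketch
    # (assumes numpy is imported as np; sim_matrix rows are 1-D numpy arrays):
    #
    #   row = sim_matrix[selected_index]
    #   order = np.argsort(row)[::-1]          # project indices, most similar first
    #   order = order[row[order] != 0][:20]    # drop zero similarities, keep top 20
    #   top_20_sims = list(zip(order, row[order]))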

    # create the result data frame from the top-20 indices
    index_list = [tup[0] for tup in top_20_sims]
    result_df = projects_df.iloc[index_list].copy()

    # add the similarity column to the result df
    # (.copy() above prevents pandas' SettingWithCopyWarning on this assignment)
    similarity_list = [tup[1] for tup in top_20_sims]
    result_df["similarity"] = similarity_list

    similarity_table.show_table(result_df, similarity_list)
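
# A sketch of how this page is presumably invoked from the app's entry point;
# the file and module names here are assumptions, not taken from this repo:
#
#   # app.py
#   import page_similarities
#   page_similarities.show_page()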