script update
- app.py +8 -91
- appStore/__init__.py +1 -0
- appStore/prep_data.py +49 -0
- appStore/prep_utils.py +23 -0
app.py
CHANGED
@@ -1,7 +1,5 @@
 import streamlit as st
 import pandas as pd
-from langchain_text_splitters import TokenTextSplitter
-from langchain.docstore.document import Document
 from torch import cuda
 from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings
 from langchain_community.vectorstores import Qdrant
@@ -10,7 +8,7 @@ from langchain.retrievers import ContextualCompressionRetriever
 from langchain.retrievers.document_compressors import CrossEncoderReranker
 from langchain_community.cross_encoders import HuggingFaceCrossEncoder
 from langchain_qdrant import FastEmbedSparse, RetrievalMode
-
+from appStore.prep_data import process_giz_worldwide

 # get the device to be used, either gpu or cpu
 device = 'cuda' if cuda.is_available() else 'cpu'
@@ -20,89 +18,6 @@ st.set_page_config(page_title="SEARCH IATI",layout='wide')
 st.title("SEARCH IATI Database")
 var=st.text_input("enter keyword")

-
-def create_chunks(text):
-    """Takes a text and creates chunks"""
-    # chunk size in terms of token
-    text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=0)
-    texts = text_splitter.split_text(text)
-    return texts
-
-def get_chunks():
-    """
-    this will read the iati files and create the chunks
-    """
-    orgas_df = pd.read_csv("iati_files/project_orgas.csv")
-    region_df = pd.read_csv("iati_files/project_region.csv")
-    sector_df = pd.read_csv("iati_files/project_sector.csv")
-    status_df = pd.read_csv("iati_files/project_status.csv")
-    texts_df = pd.read_csv("iati_files/project_texts.csv")
-
-    projects_df = pd.merge(orgas_df, region_df, on='iati_id', how='inner')
-    projects_df = pd.merge(projects_df, sector_df, on='iati_id', how='inner')
-    projects_df = pd.merge(projects_df, status_df, on='iati_id', how='inner')
-    projects_df = pd.merge(projects_df, texts_df, on='iati_id', how='inner')
-    projects_df = projects_df[projects_df.client.str.contains('bmz')].reset_index(drop=True)
-
-    projects_df.drop(columns= ['orga_abbreviation', 'client',
-                               'orga_full_name', 'country',
-                               'country_flag', 'crs_5_code', 'crs_3_code','country_code_list',
-                               'sgd_pred_code','crs_5_name', 'crs_3_name', 'sgd_pred_str'], inplace=True)
-    print(projects_df.columns)
-    projects_df['text_size'] = projects_df.apply(lambda x: len((x['title_main'] + x['description_main']).split()), axis=1)
-    projects_df['chunks'] = projects_df.apply(lambda x:create_chunks(x['title_main'] + x['description_main']),axis=1)
-    projects_df = projects_df.explode(column=['chunks'], ignore_index=True)
-    projects_df['source'] = 'IATI'
-    projects_df.rename(columns = {'iati_id':'id','iati_orga_id':'org'}, inplace=True)
-
-    #### code for reading the giz_worldwide data
-    giz_df = pd.read_json('iati_files/data_giz_website.json')
-    giz_df = giz_df.rename(columns={'content':'project_description'})
-    giz_df['text_size'] = giz_df.apply(lambda x: len((x['project_name'] + x['project_description']).split()), axis=1)
-    giz_df['chunks'] = giz_df.apply(lambda x:create_chunks(x['project_name'] + x['project_description']),axis=1)
-    giz_df = giz_df.explode(column=['chunks'], ignore_index=True)
-    print(giz_df.columns)
-    giz_df.drop(columns = ['filename', 'url', 'name', 'mail',
-                           'language', 'start_year', 'end_year','poli_trager'], inplace=True)
-    giz_df.rename(columns = {'project_name':'title_main','countries':'country_name',
-                             'client':'org','project_description':'description_main'}, inplace=True)
-    giz_df['source'] = 'GIZ_WORLDWIDE'
-    giz_df['status'] = "None"
-    df = pd.concat([projects_df,giz_df],ignore_index=True)
-    print(df.columns)
-    print(df)
-
-
-
-
-    placeholder= []
-    for i in range(len(df)):
-        placeholder.append(Document(page_content= df.loc[i,'chunks'],
-                                    metadata={"id": df.loc[i,'id'],
-                                              "org":df.loc[i,'org'],
-                                              "country_name":str(df.loc[i,'country_name']),
-                                              "status":df.loc[i,'status'],
-                                              "title_main":df.loc[i,'title_main'],}))
-    return placeholder
-
-# placeholder= []
-# for i in range(len(giz_df)):
-#     placeholder.append(Document(page_content= giz_df.loc[i,'chunks'],
-#                                 metadata={
-#                                     "title_main":giz_df.loc[i,'title_main'],
-#                                     "country_name":str(giz_df.loc[i,'countries']),
-#                                     "client": giz_df_new.loc[i, 'client'],
-#                                     "language":giz_df_new.loc[i, 'language'],
-#                                     "political_sponsor":giz_df.loc[i, 'poli_trager'],
-#                                     "url": giz_df.loc[i, 'url']
-#                                     #"iati_id": giz_df.loc[i,'iati_id'],
-#                                     #"iati_orga_id":giz_df.loc[i,'iati_orga_id'],
-#                                     #"crs_5_name": giz_df.loc[i,'crs_5_name'],
-#                                     #"crs_3_name": giz_df.loc[i,'crs_3_name'],
-#                                     #"sgd_pred_str":giz_df.loc[i,'sgd_pred_str'],
-#                                     #"status":giz_df.loc[i,'status'],
-#                                 }))
-# return placeholder
 def embed_chunks(chunks):
     """
     takes the chunks and does the hybrid embedding for the list of chunks
@@ -160,16 +75,18 @@ def get_context(vectorstore,query):
     return context_retrieved

 # first we create the chunks for iati documents
-
+chunks = process_giz_worldwide()
+for i in range(5):
+    print(chunks.loc[0,'chunks'])
 #print("chunking done")

 # once the chunks are done, we perform hybrid embeddings
 #embed_chunks(chunks)

-vectorstores = get_local_qdrant()
-vectorstore = vectorstores['all']
-button=st.button("search")
-results= get_context(vectorstore, f"find the relevant paragraphs for: {var}")
+# vectorstores = get_local_qdrant()
+# vectorstore = vectorstores['all']
+# button=st.button("search")
+# results= get_context(vectorstore, f"find the relevant paragraphs for: {var}")
 if button:
     st.write(f"Found {len(results)} results for query:{var}")
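Note that with get_local_qdrant, the search button and results commented out in this revision, the closing `if button:` block in app.py references names that are never defined. Below is a minimal sketch, not part of this commit, of how the commented-out search flow could be wired back together with the new helpers. The QdrantVectorStore.from_documents call, the embedding/reranker model names, the local path and the collection name are assumptions for illustration only; the actual embed_chunks and get_local_qdrant implementations are not shown in this diff.

# Sketch only: hybrid (dense + sparse) indexing and reranked retrieval over the GIZ chunks.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_qdrant import QdrantVectorStore, FastEmbedSparse, RetrievalMode

from appStore.prep_data import process_giz_worldwide
from appStore.prep_utils import create_documents

# chunk the GIZ data and wrap every chunk as a langchain Document
docs = create_documents(process_giz_worldwide(), 'chunks')

# build a hybrid store; model names, path and collection name are placeholders
dense = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
sparse = FastEmbedSparse(model_name="Qdrant/bm25")
vectorstore = QdrantVectorStore.from_documents(
    docs,
    embedding=dense,
    sparse_embedding=sparse,
    retrieval_mode=RetrievalMode.HYBRID,
    path="./qdrant_local",            # local on-disk Qdrant, assumed
    collection_name="giz_worldwide",  # assumed collection name
)

# rerank the hybrid hits with a cross-encoder, matching the imports already present in app.py
reranker = CrossEncoderReranker(
    model=HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-base"), top_n=5)
retriever = ContextualCompressionRetriever(
    base_compressor=reranker,
    base_retriever=vectorstore.as_retriever(search_kwargs={"k": 20}))
results = retriever.invoke("climate adaptation in coastal regions")

In app.py itself, the final invoke call would take the `var` text input and sit behind the `st.button("search")` guard, mirroring the commented-out lines above.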
appStore/__init__.py
ADDED
@@ -0,0 +1 @@
+# src
appStore/prep_data.py
ADDED
@@ -0,0 +1,49 @@
+import pandas as pd
+path_to_data = "./docStore/"
+
+def process_iati():
+    """
+    this will read the iati files and create the chunks
+    """
+    orgas_df = pd.read_csv(f"{path_to_data}iati_files/project_orgas.csv")
+    region_df = pd.read_csv(f"{path_to_data}iati_files/project_region.csv")
+    sector_df = pd.read_csv(f"{path_to_data}iati_files/project_sector.csv")
+    status_df = pd.read_csv(f"{path_to_data}iati_files/project_status.csv")
+    texts_df = pd.read_csv(f"{path_to_data}iati_files/project_texts.csv")
+
+    projects_df = pd.merge(orgas_df, region_df, on='iati_id', how='inner')
+    projects_df = pd.merge(projects_df, sector_df, on='iati_id', how='inner')
+    projects_df = pd.merge(projects_df, status_df, on='iati_id', how='inner')
+    projects_df = pd.merge(projects_df, texts_df, on='iati_id', how='inner')
+    projects_df = projects_df[projects_df.client.str.contains('bmz')].reset_index(drop=True)
+
+    projects_df.drop(columns= ['orga_abbreviation', 'client',
+                               'orga_full_name', 'country',
+                               'country_flag', 'crs_5_code', 'crs_3_code','country_code_list',
+                               'sgd_pred_code','crs_5_name', 'crs_3_name', 'sgd_pred_str'], inplace=True)
+    #print(projects_df.columns)
+    projects_df['text_size'] = projects_df.apply(lambda x: len((x['title_main'] + x['description_main']).split()), axis=1)
+    projects_df['chunks'] = projects_df.apply(lambda x:create_chunks(x['title_main'] + x['description_main']),axis=1)
+    projects_df = projects_df.explode(column=['chunks'], ignore_index=True)
+    projects_df['source'] = 'IATI'
+    projects_df.rename(columns = {'iati_id':'id','iati_orga_id':'org'}, inplace=True)
+
+    return projects_df
+
+def process_giz_worldwide():
+    """
+    this will read the giz_worldwide files and create the chunks
+    """
+    giz_df = pd.read_json(f'{path_to_data}giz_worldwide/data_giz_website.json')
+    giz_df = giz_df.rename(columns={'content':'project_description'})
+    giz_df['text_size'] = giz_df.apply(lambda x: len((x['project_name'] + x['project_description']).split()), axis=1)
+    giz_df['chunks'] = giz_df.apply(lambda x:create_chunks(x['project_name'] + x['project_description']),axis=1)
+    print("initial df length:",len(giz_df))
+    giz_df = giz_df.explode(column=['chunks'], ignore_index=True)
+    print("new df length:",len(giz_df))
+    print(giz_df.columns)
+    #giz_df.drop(columns = ['filename', 'url', 'name', 'mail',
+    #                       'language', 'start_year', 'end_year','poli_trager'], inplace=True)
+    giz_df['source'] = 'GIZ_WORLDWIDE'
+    return giz_df
+
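Both process_iati and process_giz_worldwide call create_chunks, which this commit defines in appStore/prep_utils.py but does not import into prep_data.py, so running either function as committed would raise a NameError. A minimal usage sketch, assuming that import is added and that the CSV/JSON files exist under ./docStore/:

# assumed addition at the top of appStore/prep_data.py (not in this commit):
# from appStore.prep_utils import create_chunks

# then, from the app:
from appStore.prep_data import process_giz_worldwide
from appStore.prep_utils import create_documents

giz_chunks = process_giz_worldwide()               # one row per ~400-token chunk after explode()
giz_docs = create_documents(giz_chunks, 'chunks')  # every remaining column becomes Document metadata
print(len(giz_docs), list(giz_docs[0].metadata))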
appStore/prep_utils.py
ADDED
@@ -0,0 +1,23 @@
+from langchain_text_splitters import TokenTextSplitter
+from langchain.docstore.document import Document
+import pandas as pd
+
+path_to_data = "./docStore/"
+
+def create_chunks(text, tokens_count=400, overlap_count=0):
+    """Takes a text and creates chunks"""
+    # chunk size in terms of token
+    text_splitter = TokenTextSplitter(chunk_size=tokens_count, chunk_overlap=overlap_count)
+    texts = text_splitter.split_text(text)
+    return texts
+
+def create_documents(df, text_column):
+    """
+    takes df and creates the langchain doc
+    """
+    placeholder= []
+    for i in range(len(df)):
+        metadata = {col: df.loc[i, col] for col in list(set(df.columns) - {text_column})}
+        placeholder.append(Document(page_content= df.loc[i,text_column],
+                                    metadata=metadata))
+    return placeholder
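A small self-contained example, not taken from the repository, showing how the two helpers compose: create_chunks splits a text into token-bounded pieces and create_documents turns every column other than the text column into Document metadata. The toy column names below are made up for illustration.

import pandas as pd
from appStore.prep_utils import create_chunks, create_documents

toy = pd.DataFrame({
    "id": ["XM-DAC-0001", "XM-DAC-0002"],
    "title_main": ["Water project", "Health project"],
    "description_main": ["Improving access to safe drinking water in rural districts.",
                         "Strengthening community health systems and training staff."],
})
# chunk title + description per row (small chunk size just for the demo)
toy["chunks"] = toy.apply(lambda x: create_chunks(x["title_main"] + " " + x["description_main"],
                                                  tokens_count=20), axis=1)
toy = toy.explode(column=["chunks"], ignore_index=True)  # one row per chunk, as in prep_data.py

docs = create_documents(toy, "chunks")
print(docs[0].page_content)  # the chunk text
print(docs[0].metadata)      # {'id': ..., 'title_main': ..., 'description_main': ...}

The resulting Document list is what app.py would feed into the hybrid Qdrant index.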