ppsingh committed
Commit e44062d · 1 Parent(s): 1f6b181

script update

Files changed (4)
  1. app.py +10 -93
  2. appStore/__init__.py +1 -0
  3. appStore/prep_data.py +50 -0
  4. appStore/prep_utils.py +23 -0
app.py CHANGED
@@ -1,7 +1,5 @@
import streamlit as st
import pandas as pd
- from langchain_text_splitters import TokenTextSplitter
- from langchain.docstore.document import Document
from torch import cuda
from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import Qdrant
@@ -10,7 +8,7 @@ from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain_qdrant import FastEmbedSparse, RetrievalMode
-
+ from appStore.prep_data import process_giz_worldwide

# get the device to be used, either gpu or cpu
device = 'cuda' if cuda.is_available() else 'cpu'
@@ -20,89 +18,6 @@ st.set_page_config(page_title="SEARCH IATI",layout='wide')
st.title("SEARCH IATI Database")
var=st.text_input("enter keyword")

-
- def create_chunks(text):
-     """TAKES A TEXT AND CERATES CREATES CHUNKS"""
-     # chunk size in terms of token
-     text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=0)
-     texts = text_splitter.split_text(text)
-     return texts
-
- def get_chunks():
-     """
-     this will read the iati files and create the chunks
-     """
-     orgas_df = pd.read_csv("iati_files/project_orgas.csv")
-     region_df = pd.read_csv("iati_files/project_region.csv")
-     sector_df = pd.read_csv("iati_files/project_sector.csv")
-     status_df = pd.read_csv("iati_files/project_status.csv")
-     texts_df = pd.read_csv("iati_files/project_texts.csv")
-
-     projects_df = pd.merge(orgas_df, region_df, on='iati_id', how='inner')
-     projects_df = pd.merge(projects_df, sector_df, on='iati_id', how='inner')
-     projects_df = pd.merge(projects_df, status_df, on='iati_id', how='inner')
-     projects_df = pd.merge(projects_df, texts_df, on='iati_id', how='inner')
-     projects_df = projects_df[projects_df.client.str.contains('bmz')].reset_index(drop=True)
-
-     projects_df.drop(columns= ['orga_abbreviation', 'client',
-                                'orga_full_name', 'country',
-                                'country_flag', 'crs_5_code', 'crs_3_code','country_code_list',
-                                'sgd_pred_code','crs_5_name', 'crs_3_name', 'sgd_pred_str'], inplace=True)
-     print(projects_df.columns)
-     projects_df['text_size'] = projects_df.apply(lambda x: len((x['title_main'] + x['description_main']).split()), axis=1)
-     projects_df['chunks'] = projects_df.apply(lambda x:create_chunks(x['title_main'] + x['description_main']),axis=1)
-     projects_df = projects_df.explode(column=['chunks'], ignore_index=True)
-     projects_df['source'] = 'IATI'
-     projects_df.rename(columns = {'iati_id':'id','iati_orga_id':'org'}, inplace=True)
-
-     #### code for eading the giz_worldwide data
-     giz_df = pd.read_json('iati_files/data_giz_website.json')
-     giz_df = giz_df.rename(columns={'content':'project_description'})
-     giz_df['text_size'] = giz_df.apply(lambda x: len((x['project_name'] + x['project_description']).split()), axis=1)
-     giz_df['chunks'] = giz_df.apply(lambda x:create_chunks(x['project_name'] + x['project_description']),axis=1)
-     giz_df = giz_df.explode(column=['chunks'], ignore_index=True)
-     print(giz_df.columns)
-     giz_df.drop(columns = ['filename', 'url', 'name', 'mail',
-                            'language', 'start_year', 'end_year','poli_trager'], inplace=True)
-     giz_df.rename(columns = {'project_name':'title_main','countries':'country_name',
-                              'client':'org','project_description':'description_main'}, inplace=True)
-     giz_df['source'] = 'GIZ_WORLDWIDE'
-     giz_df['status'] = "None"
-     df = pd.concat([projects_df,giz_df],ignore_index=True)
-     print(df.columns)
-     print(df)
-
-     placeholder= []
-     for i in range(len(df)):
-         placeholder.append(Document(page_content= df.loc[i,'chunks'],
-                                     metadata={"id": df.loc[i,'id'],
-                                               "org":df.loc[i,'org'],
-                                               "country_name":str(df.loc[i,'country_name']),
-                                               "status":df.loc[i,'status'],
-                                               "title_main":df.loc[i,'title_main'],}))
-     return placeholder
-
- # placeholder= []
- # for i in range(len(giz_df)):
- #     placeholder.append(Document(page_content= giz_df.loc[i,'chunks'],
- #                                 metadata={
- #                                     "title_main":giz_df.loc[i,'title_main'],
- #                                     "country_name":str(giz_df.loc[i,'countries']),
- #                                     "client": giz_df_new.loc[i, 'client'],
- #                                     "language":giz_df_new.loc[i, 'language'],
- #                                     "political_sponsor":giz_df.loc[i, 'poli_trager'],
- #                                     "url": giz_df.loc[i, 'url']
- #                                     #"iati_id": giz_df.loc[i,'iati_id'],
- #                                     #"iati_orga_id":giz_df.loc[i,'iati_orga_id'],
- #                                     #"crs_5_name": giz_df.loc[i,'crs_5_name'],
- #                                     #"crs_3_name": giz_df.loc[i,'crs_3_name'],
- #                                     #"sgd_pred_str":giz_df.loc[i,'sgd_pred_str'],
- #                                     #"status":giz_df.loc[i,'status'],
- #                                 }))
- # return placeholder
def embed_chunks(chunks):
    """
    takes the chunks and does the hybrid embedding for the list of chunks
@@ -160,16 +75,18 @@ def get_context(vectorstore,query):
    return context_retrieved

# first we create the chunks for iati documents
- #chunks = get_chunks()
+ chunks = process_giz_worldwide()
+ for i in range(5):
+     print(chunks.loc[i, 'chunks'])
#print("chunking done")

# once the chunks are done, we perform hybrid embeddings
#embed_chunks(chunks)

- vectorstores = get_local_qdrant()
- vectorstore = vectorstores['all']
- button=st.button("search")
- results= get_context(vectorstore, f"find the relvant paragraphs for: {var}")
- if button:
-     st.write(f"Found {len(results)} results for query:{var}")
+ # vectorstores = get_local_qdrant()
+ # vectorstore = vectorstores['all']
+ # button = st.button("search")
+ # results = get_context(vectorstore, f"find the relevant paragraphs for: {var}")
+ # if button:
+ #     st.write(f"Found {len(results)} results for query: {var}")
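The imports app.py keeps (FastEmbedSparse, RetrievalMode, CrossEncoderReranker, ContextualCompressionRetriever) outline the hybrid-search pipeline that the now-disabled embed_chunks/get_local_qdrant/get_context calls would drive. A minimal sketch of that pipeline with langchain_qdrant, assuming an in-memory collection and placeholder model and collection names (none of these choices are taken from this commit, whose function bodies are not shown in the diff):

from langchain.docstore.document import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_qdrant import QdrantVectorStore, FastEmbedSparse, RetrievalMode

docs = [Document(page_content="sample chunk", metadata={"id": "XM-DAC-1", "source": "IATI"})]

# dense and sparse vectors stored together, queried in HYBRID mode
vectorstore = QdrantVectorStore.from_documents(
    docs,
    embedding=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    sparse_embedding=FastEmbedSparse(model_name="Qdrant/bm25"),
    retrieval_mode=RetrievalMode.HYBRID,
    location=":memory:",          # assumption; the app reads a local Qdrant store instead
    collection_name="iati",       # assumption
)

# cross-encoder reranking on top of the hybrid retriever
reranker = CrossEncoderReranker(
    model=HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-base"), top_n=3)
retriever = ContextualCompressionRetriever(
    base_compressor=reranker,
    base_retriever=vectorstore.as_retriever(search_kwargs={"k": 10}),
)
results = retriever.invoke("climate adaptation")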
 
 
appStore/__init__.py ADDED
@@ -0,0 +1 @@
+ # src
appStore/prep_data.py ADDED
@@ -0,0 +1,50 @@
+ import pandas as pd
+ from appStore.prep_utils import create_chunks
+
+ path_to_data = "./docStore/"
+
+ def process_iati():
+     """
+     Read the IATI files and create the chunks.
+     """
+     orgas_df = pd.read_csv(f"{path_to_data}iati_files/project_orgas.csv")
+     region_df = pd.read_csv(f"{path_to_data}iati_files/project_region.csv")
+     sector_df = pd.read_csv(f"{path_to_data}iati_files/project_sector.csv")
+     status_df = pd.read_csv(f"{path_to_data}iati_files/project_status.csv")
+     texts_df = pd.read_csv(f"{path_to_data}iati_files/project_texts.csv")
+
+     projects_df = pd.merge(orgas_df, region_df, on='iati_id', how='inner')
+     projects_df = pd.merge(projects_df, sector_df, on='iati_id', how='inner')
+     projects_df = pd.merge(projects_df, status_df, on='iati_id', how='inner')
+     projects_df = pd.merge(projects_df, texts_df, on='iati_id', how='inner')
+     projects_df = projects_df[projects_df.client.str.contains('bmz')].reset_index(drop=True)
+
+     projects_df.drop(columns=['orga_abbreviation', 'client',
+                               'orga_full_name', 'country',
+                               'country_flag', 'crs_5_code', 'crs_3_code', 'country_code_list',
+                               'sgd_pred_code', 'crs_5_name', 'crs_3_name', 'sgd_pred_str'], inplace=True)
+     #print(projects_df.columns)
+     projects_df['text_size'] = projects_df.apply(lambda x: len((x['title_main'] + x['description_main']).split()), axis=1)
+     projects_df['chunks'] = projects_df.apply(lambda x: create_chunks(x['title_main'] + x['description_main']), axis=1)
+     projects_df = projects_df.explode(column=['chunks'], ignore_index=True)
+     projects_df['source'] = 'IATI'
+     projects_df.rename(columns={'iati_id': 'id', 'iati_orga_id': 'org'}, inplace=True)
+
+     return projects_df
+
+ def process_giz_worldwide():
+     """
+     Read the giz_worldwide files and create the chunks.
+     """
+     giz_df = pd.read_json(f'{path_to_data}giz_worldwide/data_giz_website.json')
+     giz_df = giz_df.rename(columns={'content': 'project_description'})
+     giz_df['text_size'] = giz_df.apply(lambda x: len((x['project_name'] + x['project_description']).split()), axis=1)
+     giz_df['chunks'] = giz_df.apply(lambda x: create_chunks(x['project_name'] + x['project_description']), axis=1)
+     print("initial df length:", len(giz_df))
+     giz_df = giz_df.explode(column=['chunks'], ignore_index=True)
+     print("new df length:", len(giz_df))
+     print(giz_df.columns)
+     #giz_df.drop(columns=['filename', 'url', 'name', 'mail',
+     #                     'language', 'start_year', 'end_year', 'poli_trager'], inplace=True)
+     giz_df['source'] = 'GIZ_WORLDWIDE'
+     return giz_df
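A short usage sketch wiring the two loaders to create_documents from prep_utils (shown below). One caveat carried over from the deleted get_chunks() in app.py: that code renamed the GIZ columns to match the IATI schema before concatenating, which process_giz_worldwide does not do yet, so the rename here is an assumption about the intended schema:

import pandas as pd
from appStore.prep_data import process_iati, process_giz_worldwide
from appStore.prep_utils import create_documents

iati_df = process_iati()
giz_df = process_giz_worldwide()
# align the GIZ columns with the IATI schema, as the deleted get_chunks() did
giz_df = giz_df.rename(columns={'project_name': 'title_main', 'countries': 'country_name',
                                'client': 'org', 'project_description': 'description_main'})
df = pd.concat([iati_df, giz_df], ignore_index=True)

# every column except 'chunks' becomes Document metadata
docs = create_documents(df, 'chunks')
print(len(docs), docs[0].metadata.keys())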
appStore/prep_utils.py ADDED
@@ -0,0 +1,23 @@
+ from langchain_text_splitters import TokenTextSplitter
+ from langchain.docstore.document import Document
+ import pandas as pd
+
+ path_to_data = "./docStore/"
+
+ def create_chunks(text, tokens_count=400, overlap_count=0):
+     """Takes a text and creates chunks."""
+     # chunk size in terms of tokens
+     text_splitter = TokenTextSplitter(chunk_size=tokens_count, chunk_overlap=overlap_count)
+     texts = text_splitter.split_text(text)
+     return texts
+
+ def create_documents(df, text_column):
+     """
+     Takes a df and creates the langchain Documents.
+     """
+     placeholder = []
+     for i in range(len(df)):
+         metadata = {col: df.loc[i, col] for col in list(set(df.columns) - {text_column})}
+         placeholder.append(Document(page_content=df.loc[i, text_column],
+                                     metadata=metadata))
+     return placeholder
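Both loaders lean on these helpers. create_chunks splits on tokenizer tokens (TokenTextSplitter uses a tiktoken encoding by default), not on characters or words, so the default tokens_count=400 with zero overlap yields non-overlapping 400-token windows. A quick sanity check, with made-up filler text:

from appStore.prep_utils import create_chunks

text = "climate adaptation project " * 350   # very roughly 1,000+ tokens of filler
chunks = create_chunks(text)                  # default: 400-token windows, no overlap
print(len(chunks))                            # expect around 3-4 chunks
print(len(chunks[-1].split()))                # the last chunk carries the remainder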