scholarly360 committed on
Commit
a420b55
·
1 Parent(s): cd7382a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -38
app.py CHANGED
@@ -8,6 +8,7 @@ from pathlib import Path
8
  import os
9
  os.environ['OPENAI_API_KEY'] = os.environ['OPEN_API_KEY']
10
  st.title("Contracts Multiple File Search ")
 
11
 
12
  from langchain.retrievers import BM25Retriever, EnsembleRetriever
13
  from langchain.schema import Document
@@ -102,17 +103,18 @@ def split_into_sentences_with_offsets(text):
102
  doc = nlp(text)
103
  return [(sent.text, sent.start_char, sent.end_char) for sent in doc.sents]
104
 
105
- def util_get_list_page_and_passage(docs):
106
  page_documents = []
107
  passage_documents = []
108
- for txt_index, txt_page in enumerate(docs):
109
- page_document = txt_page.get_text()##.encode("utf8") # get plain text (is in UTF-8)
110
- page_documents.append(page_document)
111
- sections = split_into_sentences_with_offsets(page_document)
112
- for sub_sub_index, sub_sub_item in enumerate(sections):
113
- sub_text=sub_sub_item[0]
114
- passage_document = Document(page_content=sub_text, metadata={"page_index": txt_index})
115
- passage_documents.append(passage_document)
 
116
  return(page_documents,passage_documents)
117
 
118
  # def util_index_chromadb_passages():
@@ -144,23 +146,29 @@ def util_get_list_pageno_and_contents(some_query_passage, page_documents,passage
144
  ''' page no starts with index 1 '''
145
 
146
  return_value = []
147
-
148
  rescore = reranker.compute_score([[some_query_passage , x.page_content] for x in passage_nodes])
149
- print('rescore ' , rescore)
150
- print(rescore)
151
- max_pos_index = rescore.index(max(rescore))
152
- print("Maximum Index position: ",max_pos_index)
153
- print(passage_nodes[max_pos_index].page_content)
154
 
155
- #Document(page_content=sub_text, metadata={"page_index": txt_index})
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
 
158
- for index, item in enumerate(passage_nodes):
159
- page_no = passage_nodes[index].metadata['page_index']
160
- page_content = page_documents[page_no]
161
- if(index==max_pos_index):
162
- return_value.append((page_no+1,page_content))
163
- return(passage_nodes[max_pos_index].page_content, return_value)
164
 
165
  # # def util_openai_extract_entity(example_passage, example_entity, page_content):
166
  # # import openai
@@ -263,8 +271,9 @@ with st.form("my_form"):
263
 
264
  if submitted and (uploaded_files is not None):
265
  list_docs, list_save_path = util_upload_file_and_return_list_docs(uploaded_files)
266
- print('list_docs ' ,list_docs)
267
- print('list_save_path ' , list_save_path)
 
268
  bm25_retriever = BM25Retriever.from_documents(passage_documents)
269
  bm25_retriever.k = 2
270
  chroma_vectorstore = Chroma.from_documents(passage_documents, embedding)
@@ -272,18 +281,19 @@ with st.form("my_form"):
272
  ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, chroma_retriever],weights=[0.25, 0.75])
273
  passage_nodes = ensemble_retriever.get_relevant_documents(single_example_passage)
274
  print('len(passage_nodes):', len(passage_nodes))
275
- found_passage, page_list_retrieve = util_get_list_pageno_and_contents(single_example_passage, page_documents,passage_documents,passage_nodes)
276
- print('len(page_list_retrieve):', len(page_list_retrieve))
277
- if(len(page_list_retrieve)>0):
278
- page_list_retrieve = list(set(page_list_retrieve))
279
- for iindex in page_list_retrieve:
280
- page_no = iindex[0]
281
- page_content = iindex[1]
282
- annotated_text(" ",annotation("RELEVANT PAGENO : ", str(page_no), font_family="Comic Sans MS", border="2px dashed red"),)
283
- util_openai_format(single_example_passage, page_content)
284
- annotated_text(" ",annotation("RELEVANT PASSAGE : ", "", font_family="Comic Sans MS", border="2px dashed red"),)
285
- st.write(found_passage)
286
- pchroma_client = chromadb.Client()
287
- for citem in pchroma_client.list_collections():
288
- print(citem.name)
 
289
 
 
8
  import os
9
  os.environ['OPENAI_API_KEY'] = os.environ['OPEN_API_KEY']
10
  st.title("Contracts Multiple File Search ")
11
+ import pandas as pd
12
 
13
  from langchain.retrievers import BM25Retriever, EnsembleRetriever
14
  from langchain.schema import Document
 
103
  doc = nlp(text)
104
  return [(sent.text, sent.start_char, sent.end_char) for sent in doc.sents]
105
 
106
+ def util_get_list_page_and_passage(list_docs, list_save_path):
107
  page_documents = []
108
  passage_documents = []
109
+ for ind_doc, docs in enumerate(list_docs):
110
+ for txt_index, txt_page in enumerate(docs):
111
+ page_document = txt_page.get_text()##.encode("utf8") # get plain text (is in UTF-8)
112
+ page_documents.append(page_document)
113
+ sections = split_into_sentences_with_offsets(page_document)
114
+ for sub_sub_index, sub_sub_item in enumerate(sections):
115
+ sub_text=sub_sub_item[0]
116
+ passage_document = Document(page_content=sub_text, metadata={"page_index": txt_index, "page_file" : list_save_path[ind_doc]})
117
+ passage_documents.append(passage_document)
118
  return(page_documents,passage_documents)
119
 
120
  # def util_index_chromadb_passages():
 
146
  ''' page no starts with index 1 '''
147
 
148
  return_value = []
 
149
  rescore = reranker.compute_score([[some_query_passage , x.page_content] for x in passage_nodes])
 
 
 
 
 
150
 
151
+ tmp_array = []
152
+ for i, x in enumerate(passage_nodes):
153
+ tmp_dict = {"passage_content":x.page_content, "page_no":x.metadata['page_index']+1, "page_content": passage_documents[x.metadata['page_index']], "score": rescore[i] }
154
+ df = pd.DataFrame(tmp_array)
155
+
156
+
157
+ # print('rescore ' , rescore)
158
+ # print(rescore)
159
+ # max_pos_index = rescore.index(max(rescore))
160
+ # print("Maximum Index position: ",max_pos_index)
161
+ # print(passage_nodes[max_pos_index].page_content)
162
+
163
+ # #Document(page_content=sub_text, metadata={"page_index": txt_index})
164
 
165
 
166
+ # for index, item in enumerate(passage_nodes):
167
+ # page_no = passage_nodes[index]
168
+ # page_content = page_documents[page_no]
169
+ # if(index==max_pos_index):
170
+ # return_value.append((page_no+1,page_content))
171
+ return(df)
172
 
173
  # # def util_openai_extract_entity(example_passage, example_entity, page_content):
174
  # # import openai
 
271
 
272
  if submitted and (uploaded_files is not None):
273
  list_docs, list_save_path = util_upload_file_and_return_list_docs(uploaded_files)
274
+ # print('list_docs ' ,list_docs)
275
+ # print('list_save_path ' , list_save_path)
276
+ page_documents , passage_documents = util_get_list_page_and_passage(list_docs, list_save_path)
277
  bm25_retriever = BM25Retriever.from_documents(passage_documents)
278
  bm25_retriever.k = 2
279
  chroma_vectorstore = Chroma.from_documents(passage_documents, embedding)
 
281
  ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, chroma_retriever],weights=[0.25, 0.75])
282
  passage_nodes = ensemble_retriever.get_relevant_documents(single_example_passage)
283
  print('len(passage_nodes):', len(passage_nodes))
284
+ df = util_get_list_pageno_and_contents(single_example_passage, page_documents,passage_documents,passage_nodes)
285
+ st.write(df)
286
+ # print('len(page_list_retrieve):', len(page_list_retrieve))
287
+ # if(len(page_list_retrieve)>0):
288
+ # page_list_retrieve = list(set(page_list_retrieve))
289
+ # for iindex in page_list_retrieve:
290
+ # page_no = iindex[0]
291
+ # page_content = iindex[1]
292
+ # annotated_text(" ",annotation("RELEVANT PAGENO : ", str(page_no), font_family="Comic Sans MS", border="2px dashed red"),)
293
+ # util_openai_format(single_example_passage, page_content)
294
+ # annotated_text(" ",annotation("RELEVANT PASSAGE : ", "", font_family="Comic Sans MS", border="2px dashed red"),)
295
+ # st.write(found_passage)
296
+ # pchroma_client = chromadb.Client()
297
+ # for citem in pchroma_client.list_collections():
298
+ # print(citem.name)
299