Commit
·
a420b55
1
Parent(s):
cd7382a
Update app.py
Browse files
app.py
CHANGED
@@ -8,6 +8,7 @@ from pathlib import Path
|
|
8 |
import os
|
9 |
os.environ['OPENAI_API_KEY'] = os.environ['OPEN_API_KEY']
|
10 |
st.title("Contracts Multiple File Search ")
|
|
|
11 |
|
12 |
from langchain.retrievers import BM25Retriever, EnsembleRetriever
|
13 |
from langchain.schema import Document
|
@@ -102,17 +103,18 @@ def split_into_sentences_with_offsets(text):
|
|
102 |
doc = nlp(text)
|
103 |
return [(sent.text, sent.start_char, sent.end_char) for sent in doc.sents]
|
104 |
|
105 |
-
def util_get_list_page_and_passage(
|
106 |
page_documents = []
|
107 |
passage_documents = []
|
108 |
-
for
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
|
|
116 |
return(page_documents,passage_documents)
|
117 |
|
118 |
# def util_index_chromadb_passages():
|
@@ -144,23 +146,29 @@ def util_get_list_pageno_and_contents(some_query_passage, page_documents,passage
|
|
144 |
''' page no starts with index 1 '''
|
145 |
|
146 |
return_value = []
|
147 |
-
|
148 |
rescore = reranker.compute_score([[some_query_passage , x.page_content] for x in passage_nodes])
|
149 |
-
print('rescore ' , rescore)
|
150 |
-
print(rescore)
|
151 |
-
max_pos_index = rescore.index(max(rescore))
|
152 |
-
print("Maximum Index position: ",max_pos_index)
|
153 |
-
print(passage_nodes[max_pos_index].page_content)
|
154 |
|
155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
|
157 |
|
158 |
-
for index, item in enumerate(passage_nodes):
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
return(
|
164 |
|
165 |
# # def util_openai_extract_entity(example_passage, example_entity, page_content):
|
166 |
# # import openai
|
@@ -263,8 +271,9 @@ with st.form("my_form"):
|
|
263 |
|
264 |
if submitted and (uploaded_files is not None):
|
265 |
list_docs, list_save_path = util_upload_file_and_return_list_docs(uploaded_files)
|
266 |
-
print('list_docs ' ,list_docs)
|
267 |
-
print('list_save_path ' , list_save_path)
|
|
|
268 |
bm25_retriever = BM25Retriever.from_documents(passage_documents)
|
269 |
bm25_retriever.k = 2
|
270 |
chroma_vectorstore = Chroma.from_documents(passage_documents, embedding)
|
@@ -272,18 +281,19 @@ with st.form("my_form"):
|
|
272 |
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, chroma_retriever],weights=[0.25, 0.75])
|
273 |
passage_nodes = ensemble_retriever.get_relevant_documents(single_example_passage)
|
274 |
print('len(passage_nodes):', len(passage_nodes))
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
|
|
289 |
|
|
|
8 |
import os
|
9 |
os.environ['OPENAI_API_KEY'] = os.environ['OPEN_API_KEY']
|
10 |
st.title("Contracts Multiple File Search ")
|
11 |
+
import pandas as pd
|
12 |
|
13 |
from langchain.retrievers import BM25Retriever, EnsembleRetriever
|
14 |
from langchain.schema import Document
|
|
|
103 |
doc = nlp(text)
|
104 |
return [(sent.text, sent.start_char, sent.end_char) for sent in doc.sents]
|
105 |
|
106 |
+
def util_get_list_page_and_passage(list_docs, list_save_path):
    """Extract per-page texts and per-sentence passage Documents from parsed files.

    Args:
        list_docs: iterable of parsed documents; each yields page objects
            exposing ``get_text()`` (PyMuPDF-style — TODO confirm against
            util_upload_file_and_return_list_docs).
        list_save_path: saved file path per document, parallel to list_docs;
            recorded in each passage's metadata as "page_file".

    Returns:
        (page_documents, passage_documents): list of raw page texts, and list
        of langchain ``Document`` objects, one per sentence, with metadata
        {"page_index": <page number within its file>, "page_file": <path>}.
    """
    page_documents = []
    passage_documents = []
    for doc_index, doc in enumerate(list_docs):
        for page_index, page in enumerate(doc):
            page_text = page.get_text()  # plain text of the page
            page_documents.append(page_text)
            # split_into_sentences_with_offsets returns (text, start, end) tuples;
            # only the sentence text is needed here.
            for sentence, _start, _end in split_into_sentences_with_offsets(page_text):
                passage_documents.append(
                    Document(
                        page_content=sentence,
                        metadata={"page_index": page_index,
                                  "page_file": list_save_path[doc_index]},
                    )
                )
    return (page_documents, passage_documents)
|
119 |
|
120 |
# def util_index_chromadb_passages():
|
|
|
146 |
''' page no starts with index 1 '''
|
147 |
|
148 |
return_value = []
|
|
|
149 |
rescore = reranker.compute_score([[some_query_passage , x.page_content] for x in passage_nodes])
|
|
|
|
|
|
|
|
|
|
|
150 |
|
151 |
+
tmp_array = []
|
152 |
+
for i, x in enumerate(passage_nodes):
|
153 |
+
tmp_dict = {"passage_content":x.page_content, "page_no":x.metadata['page_index']+1, "page_content": passage_documents[x.metadata['page_index']], "score": rescore[i] }
|
154 |
+
df = pd.DataFrame(tmp_array)
|
155 |
+
|
156 |
+
|
157 |
+
# print('rescore ' , rescore)
|
158 |
+
# print(rescore)
|
159 |
+
# max_pos_index = rescore.index(max(rescore))
|
160 |
+
# print("Maximum Index position: ",max_pos_index)
|
161 |
+
# print(passage_nodes[max_pos_index].page_content)
|
162 |
+
|
163 |
+
# #Document(page_content=sub_text, metadata={"page_index": txt_index})
|
164 |
|
165 |
|
166 |
+
# for index, item in enumerate(passage_nodes):
|
167 |
+
# page_no = passage_nodes[index]
|
168 |
+
# page_content = page_documents[page_no]
|
169 |
+
# if(index==max_pos_index):
|
170 |
+
# return_value.append((page_no+1,page_content))
|
171 |
+
return(df)
|
172 |
|
173 |
# # def util_openai_extract_entity(example_passage, example_entity, page_content):
|
174 |
# # import openai
|
|
|
271 |
|
272 |
if submitted and (uploaded_files is not None):
|
273 |
list_docs, list_save_path = util_upload_file_and_return_list_docs(uploaded_files)
|
274 |
+
# print('list_docs ' ,list_docs)
|
275 |
+
# print('list_save_path ' , list_save_path)
|
276 |
+
page_documents , passage_documents = util_get_list_page_and_passage(list_docs, list_save_path)
|
277 |
bm25_retriever = BM25Retriever.from_documents(passage_documents)
|
278 |
bm25_retriever.k = 2
|
279 |
chroma_vectorstore = Chroma.from_documents(passage_documents, embedding)
|
|
|
281 |
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, chroma_retriever],weights=[0.25, 0.75])
|
282 |
passage_nodes = ensemble_retriever.get_relevant_documents(single_example_passage)
|
283 |
print('len(passage_nodes):', len(passage_nodes))
|
284 |
+
df = util_get_list_pageno_and_contents(single_example_passage, page_documents,passage_documents,passage_nodes)
|
285 |
+
st.write(df)
|
286 |
+
# print('len(page_list_retrieve):', len(page_list_retrieve))
|
287 |
+
# if(len(page_list_retrieve)>0):
|
288 |
+
# page_list_retrieve = list(set(page_list_retrieve))
|
289 |
+
# for iindex in page_list_retrieve:
|
290 |
+
# page_no = iindex[0]
|
291 |
+
# page_content = iindex[1]
|
292 |
+
# annotated_text(" ",annotation("RELEVANT PAGENO : ", str(page_no), font_family="Comic Sans MS", border="2px dashed red"),)
|
293 |
+
# util_openai_format(single_example_passage, page_content)
|
294 |
+
# annotated_text(" ",annotation("RELEVANT PASSAGE : ", "", font_family="Comic Sans MS", border="2px dashed red"),)
|
295 |
+
# st.write(found_passage)
|
296 |
+
# pchroma_client = chromadb.Client()
|
297 |
+
# for citem in pchroma_client.list_collections():
|
298 |
+
# print(citem.name)
|
299 |
|