Commit
·
04426d9
1
Parent(s):
c0cc382
Update app.py
Browse files
app.py
CHANGED
@@ -29,7 +29,7 @@ def util_upload_file_and_return_list_docs(uploaded_files):
|
|
29 |
save_path = Path(os.getcwd(), uploaded_file.name)
|
30 |
with open(save_path, mode='wb') as w:
|
31 |
w.write(uploaded_file.getvalue())
|
32 |
-
print('save_path:', save_path)
|
33 |
docs = fitz.open(save_path)
|
34 |
list_docs.append(docs)
|
35 |
list_save_path.append(save_path)
|
@@ -104,18 +104,18 @@ def split_into_sentences_with_offsets(text):
|
|
104 |
return [(sent.text, sent.start_char, sent.end_char) for sent in doc.sents]
|
105 |
|
106 |
def util_get_list_page_and_passage(list_docs, list_save_path):
|
107 |
-
page_documents = []
|
108 |
passage_documents = []
|
109 |
for ind_doc, docs in enumerate(list_docs):
|
110 |
for txt_index, txt_page in enumerate(docs):
|
111 |
page_document = txt_page.get_text()##.encode("utf8") # get plain text (is in UTF-8)
|
112 |
-
page_documents.append(page_document)
|
113 |
sections = split_into_sentences_with_offsets(page_document)
|
114 |
for sub_sub_index, sub_sub_item in enumerate(sections):
|
115 |
sub_text=sub_sub_item[0]
|
116 |
-
passage_document = Document(page_content=sub_text, metadata={"page_index": txt_index, "file_name" : str(list_save_path[ind_doc])})
|
117 |
passage_documents.append(passage_document)
|
118 |
-
return(
|
119 |
|
120 |
# def util_index_chromadb_passages():
|
121 |
# ##### PROCESSING
|
@@ -150,26 +150,16 @@ def util_get_list_pageno_and_contents(some_query_passage, page_documents,passage
|
|
150 |
print('rescore :: ',rescore)
|
151 |
tmp_array = []
|
152 |
for i, x in enumerate(passage_nodes):
|
153 |
-
tmp_dict = {"passage_content":x.page_content,
|
|
|
|
|
|
|
|
|
154 |
tmp_array.append(tmp_dict)
|
155 |
df = pd.DataFrame(tmp_array)
|
156 |
df = df.sort_values(by='score', ascending=False)
|
157 |
df = df.drop_duplicates(subset=['file_name'], keep='first')
|
158 |
-
|
159 |
-
# print('rescore ' , rescore)
|
160 |
-
# print(rescore)
|
161 |
-
# max_pos_index = rescore.index(max(rescore))
|
162 |
-
# print("Maximum Index position: ",max_pos_index)
|
163 |
-
# print(passage_nodes[max_pos_index].page_content)
|
164 |
-
|
165 |
-
# #Document(page_content=sub_text, metadata={"page_index": txt_index})
|
166 |
-
|
167 |
-
|
168 |
-
# for index, item in enumerate(passage_nodes):
|
169 |
-
# page_no = passage_nodes[index]
|
170 |
-
# page_content = page_documents[page_no]
|
171 |
-
# if(index==max_pos_index):
|
172 |
-
# return_value.append((page_no+1,page_content))
|
173 |
return(df)
|
174 |
|
175 |
# # def util_openai_extract_entity(example_passage, example_entity, page_content):
|
@@ -243,7 +233,7 @@ def util_openai_modify_prompt(example_prompt, page_content):
|
|
243 |
# tmp_list = sorted(tmp_list, key=itemgetter(2), reverse=True)
|
244 |
# return(tmp_list)
|
245 |
|
246 |
-
|
247 |
passage_documents = []
|
248 |
|
249 |
with st.form("my_form"):
|
@@ -275,7 +265,7 @@ with st.form("my_form"):
|
|
275 |
list_docs, list_save_path = util_upload_file_and_return_list_docs(uploaded_files)
|
276 |
# print('list_docs ' ,list_docs)
|
277 |
# print('list_save_path ' , list_save_path)
|
278 |
-
|
279 |
bm25_retriever = BM25Retriever.from_documents(passage_documents)
|
280 |
bm25_retriever.k = 2
|
281 |
chroma_vectorstore = Chroma.from_documents(passage_documents, embedding)
|
@@ -283,7 +273,7 @@ with st.form("my_form"):
|
|
283 |
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, chroma_retriever],weights=[0.25, 0.75])
|
284 |
passage_nodes = ensemble_retriever.get_relevant_documents(single_example_passage)
|
285 |
print('len(passage_nodes):', len(passage_nodes))
|
286 |
-
df = util_get_list_pageno_and_contents(single_example_passage,
|
287 |
st.write(df)
|
288 |
# print('len(page_list_retrieve):', len(page_list_retrieve))
|
289 |
# if(len(page_list_retrieve)>0):
|
|
|
29 |
# Persist the uploaded file next to the app, then open it with PyMuPDF
# and record both the document handle and its on-disk path.
save_path = Path(os.getcwd(), uploaded_file.name)
save_path.write_bytes(uploaded_file.getvalue())
docs = fitz.open(save_path)
list_docs.append(docs)
list_save_path.append(save_path)
|
|
|
104 |
return [(sent.text, sent.start_char, sent.end_char) for sent in doc.sents]
|
105 |
|
106 |
def util_get_list_page_and_passage(list_docs, list_save_path):
    """Split every page of every open PDF into sentence-level passages.

    Args:
        list_docs: open fitz (PyMuPDF) documents, iterable over pages.
        list_save_path: file paths, parallel to ``list_docs``.

    Returns:
        list of ``Document`` objects, one per sentence, whose metadata
        records the full page text (``page_content``), the zero-based
        page index (``page_index``) and the source file name
        (``file_name``) — everything the result table later needs.
    """
    passage_documents = []
    for ind_doc, docs in enumerate(list_docs):
        for txt_index, txt_page in enumerate(docs):
            # Plain text of the page (UTF-8 str from PyMuPDF).
            page_document = txt_page.get_text()
            # split_into_sentences_with_offsets yields (text, start, end);
            # only the sentence text is used here.
            for sub_text, _start, _end in split_into_sentences_with_offsets(page_document):
                passage_documents.append(
                    Document(
                        page_content=sub_text,
                        metadata={
                            "page_content": page_document,
                            "page_index": txt_index,
                            "file_name": str(list_save_path[ind_doc]),
                        },
                    )
                )
    return passage_documents
|
119 |
|
120 |
# def util_index_chromadb_passages():
|
121 |
# ##### PROCESSING
|
|
|
150 |
print('rescore :: ',rescore)
|
151 |
tmp_array = []
|
152 |
for i, x in enumerate(passage_nodes):
|
153 |
+
tmp_dict = {"passage_content":x.page_content,
|
154 |
+
"page_no":int(x.metadata['page_index'])+1,
|
155 |
+
"page_content":str(x.metadata['page_content']),
|
156 |
+
"file_name": str(x.metadata['file_name']),
|
157 |
+
"score" : float(rescore[i])}
|
158 |
tmp_array.append(tmp_dict)
|
159 |
df = pd.DataFrame(tmp_array)
|
160 |
df = df.sort_values(by='score', ascending=False)
|
161 |
df = df.drop_duplicates(subset=['file_name'], keep='first')
|
162 |
+
df = df[["passage_content","file_name""page_no","page_content"]]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
return(df)
|
164 |
|
165 |
# # def util_openai_extract_entity(example_passage, example_entity, page_content):
|
|
|
233 |
# tmp_list = sorted(tmp_list, key=itemgetter(2), reverse=True)
|
234 |
# return(tmp_list)
|
235 |
|
236 |
+
|
237 |
passage_documents = []
|
238 |
|
239 |
with st.form("my_form"):
|
|
|
265 |
list_docs, list_save_path = util_upload_file_and_return_list_docs(uploaded_files)
|
266 |
# print('list_docs ' ,list_docs)
|
267 |
# print('list_save_path ' , list_save_path)
|
268 |
+
passage_documents = util_get_list_page_and_passage(list_docs, list_save_path)
|
269 |
bm25_retriever = BM25Retriever.from_documents(passage_documents)
|
270 |
bm25_retriever.k = 2
|
271 |
chroma_vectorstore = Chroma.from_documents(passage_documents, embedding)
|
|
|
273 |
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, chroma_retriever],weights=[0.25, 0.75])
|
274 |
passage_nodes = ensemble_retriever.get_relevant_documents(single_example_passage)
|
275 |
print('len(passage_nodes):', len(passage_nodes))
|
276 |
+
df = util_get_list_pageno_and_contents(single_example_passage,passage_documents,passage_nodes)
|
277 |
st.write(df)
|
278 |
# print('len(page_list_retrieve):', len(page_list_retrieve))
|
279 |
# if(len(page_list_retrieve)>0):
|