scholarly360 committed on
Commit
04426d9
·
1 Parent(s): c0cc382

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -24
app.py CHANGED
@@ -29,7 +29,7 @@ def util_upload_file_and_return_list_docs(uploaded_files):
29
  save_path = Path(os.getcwd(), uploaded_file.name)
30
  with open(save_path, mode='wb') as w:
31
  w.write(uploaded_file.getvalue())
32
- print('save_path:', save_path)
33
  docs = fitz.open(save_path)
34
  list_docs.append(docs)
35
  list_save_path.append(save_path)
@@ -104,18 +104,18 @@ def split_into_sentences_with_offsets(text):
104
  return [(sent.text, sent.start_char, sent.end_char) for sent in doc.sents]
105
 
106
  def util_get_list_page_and_passage(list_docs, list_save_path):
107
- page_documents = []
108
  passage_documents = []
109
  for ind_doc, docs in enumerate(list_docs):
110
  for txt_index, txt_page in enumerate(docs):
111
  page_document = txt_page.get_text()##.encode("utf8") # get plain text (is in UTF-8)
112
- page_documents.append(page_document)
113
  sections = split_into_sentences_with_offsets(page_document)
114
  for sub_sub_index, sub_sub_item in enumerate(sections):
115
  sub_text=sub_sub_item[0]
116
- passage_document = Document(page_content=sub_text, metadata={"page_index": txt_index, "file_name" : str(list_save_path[ind_doc])})
117
  passage_documents.append(passage_document)
118
- return(page_documents,passage_documents)
119
 
120
  # def util_index_chromadb_passages():
121
  # ##### PROCESSING
@@ -150,26 +150,16 @@ def util_get_list_pageno_and_contents(some_query_passage, page_documents,passage
150
  print('rescore :: ',rescore)
151
  tmp_array = []
152
  for i, x in enumerate(passage_nodes):
153
- tmp_dict = {"passage_content":x.page_content, "page_no":int(x.metadata['page_index'])+1, "file_name": str(x.metadata['file_name']), "score" : float(rescore[i])}
 
 
 
 
154
  tmp_array.append(tmp_dict)
155
  df = pd.DataFrame(tmp_array)
156
  df = df.sort_values(by='score', ascending=False)
157
  df = df.drop_duplicates(subset=['file_name'], keep='first')
158
-
159
- # print('rescore ' , rescore)
160
- # print(rescore)
161
- # max_pos_index = rescore.index(max(rescore))
162
- # print("Maximum Index position: ",max_pos_index)
163
- # print(passage_nodes[max_pos_index].page_content)
164
-
165
- # #Document(page_content=sub_text, metadata={"page_index": txt_index})
166
-
167
-
168
- # for index, item in enumerate(passage_nodes):
169
- # page_no = passage_nodes[index]
170
- # page_content = page_documents[page_no]
171
- # if(index==max_pos_index):
172
- # return_value.append((page_no+1,page_content))
173
  return(df)
174
 
175
  # # def util_openai_extract_entity(example_passage, example_entity, page_content):
@@ -243,7 +233,7 @@ def util_openai_modify_prompt(example_prompt, page_content):
243
  # tmp_list = sorted(tmp_list, key=itemgetter(2), reverse=True)
244
  # return(tmp_list)
245
 
246
- page_documents = []
247
  passage_documents = []
248
 
249
  with st.form("my_form"):
@@ -275,7 +265,7 @@ with st.form("my_form"):
275
  list_docs, list_save_path = util_upload_file_and_return_list_docs(uploaded_files)
276
  # print('list_docs ' ,list_docs)
277
  # print('list_save_path ' , list_save_path)
278
- page_documents , passage_documents = util_get_list_page_and_passage(list_docs, list_save_path)
279
  bm25_retriever = BM25Retriever.from_documents(passage_documents)
280
  bm25_retriever.k = 2
281
  chroma_vectorstore = Chroma.from_documents(passage_documents, embedding)
@@ -283,7 +273,7 @@ with st.form("my_form"):
283
  ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, chroma_retriever],weights=[0.25, 0.75])
284
  passage_nodes = ensemble_retriever.get_relevant_documents(single_example_passage)
285
  print('len(passage_nodes):', len(passage_nodes))
286
- df = util_get_list_pageno_and_contents(single_example_passage, page_documents,passage_documents,passage_nodes)
287
  st.write(df)
288
  # print('len(page_list_retrieve):', len(page_list_retrieve))
289
  # if(len(page_list_retrieve)>0):
 
29
  save_path = Path(os.getcwd(), uploaded_file.name)
30
  with open(save_path, mode='wb') as w:
31
  w.write(uploaded_file.getvalue())
32
+ #print('save_path:', save_path)
33
  docs = fitz.open(save_path)
34
  list_docs.append(docs)
35
  list_save_path.append(save_path)
 
104
  return [(sent.text, sent.start_char, sent.end_char) for sent in doc.sents]
105
 
106
def util_get_list_page_and_passage(list_docs, list_save_path):
    """Split every page of every document into sentence-level passages.

    Args:
        list_docs: open fitz (PyMuPDF) documents, parallel to list_save_path.
        list_save_path: save paths, one per document in list_docs.

    Returns:
        A list of ``Document`` passages. Each passage's metadata carries the
        full page text (``page_content``), the zero-based page index
        (``page_index``), and the source file name (``file_name``).
    """
    passage_documents = []
    for ind_doc, docs in enumerate(list_docs):
        for txt_index, txt_page in enumerate(docs):
            # Plain-text extraction of the page (get_text returns a str).
            page_document = txt_page.get_text()
            sections = split_into_sentences_with_offsets(page_document)
            for sub_sub_index, sub_sub_item in enumerate(sections):
                sub_text = sub_sub_item[0]
                # Keep the whole page's text in metadata so downstream
                # code can display page context for a matched passage.
                passage_document = Document(
                    page_content=sub_text,
                    metadata={"page_content": page_document,
                              "page_index": txt_index,
                              "file_name": str(list_save_path[ind_doc])})
                passage_documents.append(passage_document)
    return passage_documents
119
 
120
  # def util_index_chromadb_passages():
121
  # ##### PROCESSING
 
150
  print('rescore :: ',rescore)
151
  tmp_array = []
152
  for i, x in enumerate(passage_nodes):
153
+ tmp_dict = {"passage_content":x.page_content,
154
+ "page_no":int(x.metadata['page_index'])+1,
155
+ "page_content":str(x.metadata['page_content']),
156
+ "file_name": str(x.metadata['file_name']),
157
+ "score" : float(rescore[i])}
158
  tmp_array.append(tmp_dict)
159
  df = pd.DataFrame(tmp_array)
160
  df = df.sort_values(by='score', ascending=False)
161
  df = df.drop_duplicates(subset=['file_name'], keep='first')
162
+ df = df[["passage_content","file_name","page_no","page_content"]]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  return(df)
164
 
165
  # # def util_openai_extract_entity(example_passage, example_entity, page_content):
 
233
  # tmp_list = sorted(tmp_list, key=itemgetter(2), reverse=True)
234
  # return(tmp_list)
235
 
236
+
237
  passage_documents = []
238
 
239
  with st.form("my_form"):
 
265
  list_docs, list_save_path = util_upload_file_and_return_list_docs(uploaded_files)
266
  # print('list_docs ' ,list_docs)
267
  # print('list_save_path ' , list_save_path)
268
+ passage_documents = util_get_list_page_and_passage(list_docs, list_save_path)
269
  bm25_retriever = BM25Retriever.from_documents(passage_documents)
270
  bm25_retriever.k = 2
271
  chroma_vectorstore = Chroma.from_documents(passage_documents, embedding)
 
273
  ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, chroma_retriever],weights=[0.25, 0.75])
274
  passage_nodes = ensemble_retriever.get_relevant_documents(single_example_passage)
275
  print('len(passage_nodes):', len(passage_nodes))
276
+ df = util_get_list_pageno_and_contents(single_example_passage,passage_documents,passage_nodes)
277
  st.write(df)
278
  # print('len(page_list_retrieve):', len(page_list_retrieve))
279
  # if(len(page_list_retrieve)>0):