Update app.py
app.py (changed)
@@ -52,17 +52,22 @@ def extract_text_from_file(file):
 
         # To read file as string:
         file_text = stringio.read()
+
+        return file_text, None
 
     # read pdf file
     elif file.type == "application/pdf":
         pdfReader = PdfFileReader(file)
         count = pdfReader.numPages
         all_text = ""
+        pdf_title = pdfReader.getDocumentInfo().title
 
         for i in range(count):
             page = pdfReader.getPage(i)
             all_text += page.extractText()
         file_text = all_text
+
+        return file_text, pdf_title
 
     # read docx file
     elif (
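The PDF branch above uses the legacy PyPDF2 names (PdfFileReader, numPages, getDocumentInfo, getPage, extractText), which were deprecated and later removed in PyPDF2 3.x. If the app ever moves to a current PyPDF2, the same branch would look roughly like the sketch below; extract_pdf_text is a hypothetical helper, not something in app.py, and it keeps the (text, title) return contract this commit introduces.

    # Hypothetical sketch only: the modern PyPDF2 (>= 3.0) equivalent of the PDF branch.
    from PyPDF2 import PdfReader

    def extract_pdf_text(file):
        reader = PdfReader(file)
        # .metadata can be None, and .title can be missing, so guard both
        pdf_title = reader.metadata.title if reader.metadata else None
        # extract_text() may return None for pages without a text layer
        all_text = "".join(page.extract_text() or "" for page in reader.pages)
        return all_text, pdf_title

Either way, the title pulled from the PDF metadata can be None when the document carries no title, which is why the header display added further down only fires when pdf_title is truthy.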
@@ -70,8 +75,8 @@ def extract_text_from_file(file):
         == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
     ):
         file_text = docx2txt.process(file)
-
-
+
+        return file_text, None
 
 def preprocess_plain_text(text,window_size=3):
 
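With this hunk and the ones above, every branch of extract_text_from_file now returns a (file_text, title) pair, with a real title only for PDFs. Assembled from the diff, the updated function reads roughly as follows; the io.StringIO setup in the plain-text branch is an assumption, since that part sits above line 52 and is not shown here.

    import io
    import docx2txt
    from PyPDF2 import PdfFileReader

    def extract_text_from_file(file):
        # plain-text upload; the StringIO setup is assumed, it is outside this diff
        if file.type == "text/plain":
            stringio = io.StringIO(file.getvalue().decode("utf-8"))
            return stringio.read(), None

        # PDF upload: also pull the title out of the document metadata
        elif file.type == "application/pdf":
            pdfReader = PdfFileReader(file)
            pdf_title = pdfReader.getDocumentInfo().title
            all_text = ""
            for i in range(pdfReader.numPages):
                all_text += pdfReader.getPage(i).extractText()
            return all_text, pdf_title

        # DOCX upload
        elif (
            file.type
            == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        ):
            return docx2txt.process(file), None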
@@ -171,6 +176,10 @@ def search_func(query, top_k=2):
     if url_text:
 
         st.write(f"Document Header: {title}")
+
+    elif pdf_title:
+
+        st.write(f"Document Header: {pdf_title}")
 
     ##### BM25 search (lexical search) #####
     bm25_scores = bm25.get_scores(bm25_tokenizer(query))
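Note that search_func reads url_text, title, and pdf_title as module-level names rather than parameters; pdf_title is assigned in the upload branch shown in the last hunk, and since the PDF title metadata can be None, the elif simply skips the header in that case. A minimal, self-contained illustration of that scoping pattern, with placeholder names:

    import streamlit as st

    url_text = ""            # placeholders standing in for the app's globals
    title = None
    pdf_title = "My report"  # set by the upload branch when the PDF has title metadata

    def show_document_header():
        # a function can read module-level names without declaring them global
        if url_text:
            st.write(f"Document Header: {title}")
        elif pdf_title:
            # skipped entirely when pdf_title is None or empty
            st.write(f"Document Header: {pdf_title}")

    show_document_header()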
@@ -178,7 +187,7 @@ def search_func(query, top_k=2):
     bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
     bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
 
-    st.
+    st.subheader(f"Top-{top_k} lexical search (BM25) hits")
     for hit in bm25_hits[0:top_k]:
         st.write("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']].replace("\n", " ")))
 
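The bm25.get_scores(bm25_tokenizer(query)) call and the corpus_id/score dictionaries match the rank_bm25 package's BM25Okapi API. The index, tokenizer, and top_n selection are all built outside this hunk, so the sketch below is an assumption about how they typically fit together rather than a copy of the app's code.

    # Assumed construction of the lexical-search pieces used in this hunk (rank_bm25).
    import string
    import numpy as np
    from rank_bm25 import BM25Okapi

    def bm25_tokenizer(text):
        # lowercase, strip punctuation, drop empty tokens
        return [t.strip(string.punctuation).lower()
                for t in text.split() if t.strip(string.punctuation)]

    passages = ["BM25 is a lexical ranking function.",
                "Bi-encoders embed queries and passages into vectors."]
    bm25 = BM25Okapi([bm25_tokenizer(p) for p in passages])

    query = "lexical ranking"
    top_k = 2
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    top_n = np.argpartition(bm25_scores, -top_k)[-top_k:]  # indices of the best-scoring passages
    bm25_hits = sorted(({"corpus_id": int(i), "score": float(bm25_scores[i])} for i in top_n),
                       key=lambda x: x["score"], reverse=True)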
@@ -200,14 +209,14 @@ def search_func(query, top_k=2):
 
     # Output of top-3 hits from bi-encoder
     st.markdown("\n-------------------------\n")
-    st.
+    st.subheader(f"Top-{top_k} Bi-Encoder Retrieval hits")
     hits = sorted(hits, key=lambda x: x['score'], reverse=True)
     for hit in hits[0:top_k]:
         st.write("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']].replace("\n", " ")))
 
     # Output of top-3 hits from re-ranker
     st.markdown("\n-------------------------\n")
-    st.
+    st.subheader(f"Top-{top_k} Cross-Encoder Re-ranker hits")
     hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
     for hit in hits[0:top_k]:
         st.write("\t{:.3f}\t{}".format(hit['cross-score'], passages[hit['corpus_id']].replace("\n", " ")))
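The hits list carries both a bi-encoder 'score' and a 'cross-score', which is the standard retrieve-and-re-rank pattern from sentence-transformers (util.semantic_search for retrieval, a CrossEncoder for re-scoring). The model names below are placeholders, since the app's model setup is outside this diff; the shape of hits, however, is exactly what the two loops in this hunk iterate over.

    # Hedged sketch of the retrieve-and-re-rank step behind this hunk (sentence-transformers).
    from sentence_transformers import SentenceTransformer, CrossEncoder, util

    bi_encoder = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")           # assumed model
    cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")    # assumed model

    passages = ["BM25 is a lexical ranking function.",
                "Bi-encoders embed queries and passages into vectors."]
    corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True)

    query = "how are passages embedded?"
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)

    # semantic_search returns, per query, a list of {'corpus_id': ..., 'score': ...} dicts
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=2)[0]

    # the cross-encoder re-scores (query, passage) pairs; keeping the result under a
    # separate 'cross-score' key lets the app show both rankings side by side
    cross_scores = cross_encoder.predict([[query, passages[h["corpus_id"]]] for h in hits])
    for h, score in zip(hits, cross_scores):
        h["cross-score"] = float(score)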
@@ -284,7 +293,8 @@ if validators.url(url_text):
 
 elif upload_doc:
 
-
+    text, pdf_title = extract_text_from_file(upload_doc)
+    passages = preprocess_plain_text(text,window_size=window_size)
 
     search = st.button("Search")
 
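The upload branch now unpacks the (text, pdf_title) pair from extract_text_from_file and hands the text to preprocess_plain_text(text, window_size=window_size). Its body is not part of this diff; given the window_size=3 default seen earlier, it presumably groups sentences into fixed-size windows, roughly like this hypothetical splitter:

    # Hypothetical stand-in for preprocess_plain_text: group sentences into windows of
    # `window_size` so each passage stays short enough for the encoders above.
    import re

    def split_into_passages(text, window_size=3):
        # naive sentence split; the real app may use a proper sentence tokenizer
        sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
        return [" ".join(sentences[i:i + window_size])
                for i in range(0, len(sentences), window_size)]

    text = "First sentence. Second one. A third sentence. And a fourth."
    passages = split_into_passages(text, window_size=3)
    # -> ['First sentence. Second one. A third sentence.', 'And a fourth.']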