semantic-search-with-retrieve-and-rerank

Sleeping

App Files Files Community

nickmuchi committed on May 6, 2022

Commit

7580e3c

1 Parent(s): 35f456f

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -9

app.py CHANGED Viewed

@@ -6,8 +6,7 @@ from rank_bm25 import BM25Okapi
 from sklearn.feature_extraction import _stop_words
 import string
 import numpy as np
-from bs4 import BeautifulSoup
-import time
 from newspaper import Article
 import base64
 import docx2txt
@@ -168,9 +167,16 @@ def bm25_api(passages):
 bi_enc_options = ["multi-qa-mpnet-base-dot-v1","all-mpnet-base-v2","multi-qa-MiniLM-L6-cos-v1"]
 # This function will search all wikipedia articles for passages that
 # answer the query
-def search_func(query, top_k=2):
     st.write(f"Search Query: {query}")
     if url_text:
@@ -188,8 +194,9 @@ def search_func(query, top_k=2):
     bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
     st.subheader(f"Top-{top_k} lexical search (BM25) hits")
-    for hit in bm25_hits[0:top_k]:
-        st.write("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']].replace("\n", " ")))
     ##### Semantic Search #####
     # Encode the query using the bi-encoder and find potentially relevant passages
@@ -211,15 +218,17 @@ def search_func(query, top_k=2):
     st.markdown("\n-------------------------\n")
     st.subheader(f"Top-{top_k} Bi-Encoder Retrieval hits")
     hits = sorted(hits, key=lambda x: x['score'], reverse=True)
-    for hit in hits[0:top_k]:
-        st.write("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']].replace("\n", " ")))
     # Output of top-k hits from re-ranker
     st.markdown("\n-------------------------\n")
     st.subheader(f"Top-{top_k} Cross-Encoder Re-ranker hits")
     hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
-    for hit in hits[0:top_k]:
-        st.write("\t{:.3f}\t{}".format(hit['cross-score'], passages[hit['corpus_id']].replace("\n", " ")))
 #Streamlit App

 from sklearn.feature_extraction import _stop_words
 import string
 import numpy as np
+import pandas as pd
 from newspaper import Article
 import base64
 import docx2txt
 bi_enc_options = ["multi-qa-mpnet-base-dot-v1","all-mpnet-base-v2","multi-qa-MiniLM-L6-cos-v1"]
+def display_df_as_table(model,top_k,score):
+    # Build (not render) a two-column Score/Text DataFrame from the first top_k entries of `model`, a sequence of hit dicts; `score` is the key read from each hit, and each hit's 'corpus_id' indexes `passages`, which is not a parameter and must exist in an outer scope
+    df = pd.DataFrame([(hit[score],passages[hit['corpus_id']]) for hit in model[0:top_k]],columns=['Score','Text'])
+    df['Score'] = round(df['Score'],2)  # round scores to two decimal places
+    return df  # the DataFrame is returned; nothing is written to the page here
 # This function will search all wikipedia articles for passages that
 # answer the query
+def search_func(query, top_k=top_k):
     st.write(f"Search Query: {query}")
     if url_text:
     bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
     st.subheader(f"Top-{top_k} lexical search (BM25) hits")
+    bm25_df = display_df_as_table(bm25_hits,top_k,'score')
+    st.write(bm25_df.to_html(index=False), unsafe_allow_html=True)
     ##### Semantic Search #####
     # Encode the query using the bi-encoder and find potentially relevant passages
     st.markdown("\n-------------------------\n")
     st.subheader(f"Top-{top_k} Bi-Encoder Retrieval hits")
     hits = sorted(hits, key=lambda x: x['score'], reverse=True)
+    cross_df = display_df_as_table(hits,top_k,'score')
+    st.write(cross_df.to_html(index=False), unsafe_allow_html=True)
     # Output of top-k hits from re-ranker
     st.markdown("\n-------------------------\n")
     st.subheader(f"Top-{top_k} Cross-Encoder Re-ranker hits")
     hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
+    rerank_df = display_df_as_table(hits,top_k,'cross-score')
+    st.write(rerank_df.to_html(index=False), unsafe_allow_html=True)
 #Streamlit App