Update app.py
app.py
CHANGED
@@ -107,9 +107,9 @@ def preprocess_plain_text(text,window_size=3):
             end_idx = min(start_idx+window_size, len(paragraph))
             passages.append(" ".join(paragraph[start_idx:end_idx]))
 
-
-
-
+    st.write(f"Paragraphs: {len(paragraphs)}")
+    st.write(f"Sentences: {sum([len(p) for p in paragraphs])}")
+    st.write(f"Passages: {len(passages)}")
 
     return passages
 
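The three added lines log corpus statistics as the passages are built. For context, a sliding-window splitter along the lines below produces the `paragraphs` and `passages` that the new `st.write` calls count; this is a minimal sketch, not the app's full function, and the newline-based paragraph split and nltk sentence tokenizer are assumptions:

```python
# Minimal sketch of a sliding-window passage builder consistent with the
# context lines above. The paragraph split on "\n" and the nltk sentence
# tokenizer are assumptions; the app's preprocess_plain_text may differ.
from nltk.tokenize import sent_tokenize  # requires nltk's "punkt" data

def preprocess_plain_text_sketch(text, window_size=3):
    # Each paragraph becomes a list of sentences.
    paragraphs = [sent_tokenize(p.strip()) for p in text.split("\n") if p.strip()]
    passages = []
    for paragraph in paragraphs:
        # Slide a window of `window_size` sentences over each paragraph.
        for start_idx in range(0, len(paragraph), window_size):
            end_idx = min(start_idx + window_size, len(paragraph))
            passages.append(" ".join(paragraph[start_idx:end_idx]))
    return passages
```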
@@ -121,10 +121,10 @@ def bi_encode(bi_enc,passages):
     bi_encoder = SentenceTransformer(bi_enc)
 
     #Compute the embeddings using the multi-process pool
-
+    st.write('encoding passages into a vector space...')
     corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)
 
-
+    st.write(f"Embeddings computed. Shape: {corpus_embeddings.shape}")
 
     return corpus_embeddings
 
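The two new logging lines bracket the standard sentence-transformers encode call. Standalone, the embedding step looks roughly like this (the model name is taken from the bi_enc_options list in the next hunk; the sample passages are placeholders):

```python
from sentence_transformers import SentenceTransformer

# Encode passages into dense vectors. convert_to_tensor=True returns one
# torch tensor of shape (num_passages, embedding_dim), which is what the
# logged corpus_embeddings.shape reports.
bi_encoder = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
passages = ["First example passage.", "Second example passage."]
corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)
print(corpus_embeddings.shape)  # e.g. torch.Size([2, 768])
```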
@@ -169,7 +169,9 @@ bi_enc_options = ["multi-qa-mpnet-base-dot-v1","all-mpnet-base-v2","multi-qa-Min
 def search_func(query, top_k=2):
     st.write(f"Search Query: {query}")
 
-
+    if url_text:
+
+        st.write(f"Document Header: {title}")
 
     ##### BM25 search (lexical search) #####
     bm25_scores = bm25.get_scores(bm25_tokenizer(query))
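The new `if url_text:` branch echoes the document title before the search runs. `bm25` is presumably a rank_bm25 index built over the tokenized passages earlier in app.py; the lexical half of search_func then reduces to something like the sketch below, where the tokenizer and sample passages are placeholders:

```python
import numpy as np
from rank_bm25 import BM25Okapi

def bm25_tokenizer(text):
    # Placeholder tokenizer; the app's version may also strip punctuation
    # and stopwords before indexing.
    return text.lower().split()

passages = ["Streamlit renders the results.", "BM25 ranks passages lexically."]
bm25 = BM25Okapi([bm25_tokenizer(p) for p in passages])

# Score every passage against the query and keep the top_k hits.
query = "lexical passage ranking"
bm25_scores = bm25.get_scores(bm25_tokenizer(query))
top_k = 2
for idx in np.argsort(bm25_scores)[::-1][:top_k]:
    print(f"{bm25_scores[idx]:.3f}\t{passages[idx]}")
```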