Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -109,7 +109,7 @@ with col3:
|
|
109 |
|
110 |
# Checkbox to control whether to show only exact matches
|
111 |
show_exact_matches = st.checkbox("Show only exact matches", value=False)
|
112 |
-
button = st.button("Refresh Search")
|
113 |
|
114 |
def filter_results(results, country_filter, region_filter, end_year_range):
|
115 |
filtered = []
|
@@ -143,195 +143,195 @@ def filter_results(results, country_filter, region_filter, end_year_range):
|
|
143 |
filtered.append(r)
|
144 |
return filtered
|
145 |
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
168 |
|
169 |
-
|
170 |
filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
|
171 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
|
173 |
-
|
174 |
-
|
175 |
-
if show_exact_matches:
|
176 |
-
# 1) Display heading
|
177 |
-
st.write(f"Showing **Top 15 Lexical Search results** for query: {var}")
|
178 |
-
|
179 |
-
# 2) Do a simple substring check (case-insensitive)
|
180 |
-
# We'll create a new list lexical_substring_filtered
|
181 |
-
query_substring = var.strip().lower()
|
182 |
-
lexical_substring_filtered = []
|
183 |
-
for r in lexical_all:
|
184 |
-
# page_content in lowercase
|
185 |
-
page_text_lower = r.payload["page_content"].lower()
|
186 |
-
# Keep this result only if the query substring is found
|
187 |
-
if query_substring in page_text_lower:
|
188 |
-
lexical_substring_filtered.append(r)
|
189 |
-
|
190 |
-
# 3) Now apply your region/country/year filter on that new list
|
191 |
-
filtered_lexical = filter_results(
|
192 |
-
lexical_substring_filtered, country_filter, region_filter, end_year_range
|
193 |
-
)
|
194 |
-
|
195 |
-
# 4) Remove duplicates
|
196 |
-
filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
|
197 |
-
|
198 |
-
# 5) If empty after substring + filters + dedupe, show a custom message
|
199 |
-
if not filtered_lexical_no_dupe:
|
200 |
-
st.write('No exact matches, consider unchecking "Show only exact matches"')
|
201 |
-
else:
|
202 |
-
# 6) Display the first 15 matching results
|
203 |
-
for res in filtered_lexical_no_dupe[:15]:
|
204 |
-
project_name = res.payload['metadata'].get('project_name', 'Project Link')
|
205 |
-
url = res.payload['metadata'].get('url', '#')
|
206 |
-
st.markdown(f"#### [{project_name}]({url})")
|
207 |
-
|
208 |
-
# Snippet logic (80 words)
|
209 |
-
full_text = res.payload['page_content']
|
210 |
-
words = full_text.split()
|
211 |
-
preview_word_count = 80
|
212 |
-
preview_text = " ".join(words[:preview_word_count])
|
213 |
-
remainder_text = " ".join(words[preview_word_count:])
|
214 |
-
st.write(preview_text + ("..." if remainder_text else ""))
|
215 |
-
|
216 |
-
# Keywords
|
217 |
-
top_keywords = extract_top_keywords(full_text, top_n=5)
|
218 |
-
if top_keywords:
|
219 |
-
st.markdown(f"_{' · '.join(top_keywords)}_")
|
220 |
-
|
221 |
-
# Metadata
|
222 |
-
metadata = res.payload.get('metadata', {})
|
223 |
-
countries = metadata.get('countries', "[]")
|
224 |
-
client_name = metadata.get('client', 'Unknown Client')
|
225 |
-
start_year = metadata.get('start_year', None)
|
226 |
-
end_year_ = metadata.get('end_year', None)
|
227 |
-
|
228 |
-
try:
|
229 |
-
c_list = json.loads(countries.replace("'", '"'))
|
230 |
-
except json.JSONDecodeError:
|
231 |
-
c_list = []
|
232 |
-
|
233 |
-
# Only keep country names if the region lookup (get_country_name)
|
234 |
-
# returns something different than the raw code.
|
235 |
-
matched_countries = []
|
236 |
-
for code in c_list:
|
237 |
-
if len(code) == 2:
|
238 |
-
resolved_name = get_country_name(code.upper(), region_df)
|
239 |
-
# If get_country_name didn't find a match,
|
240 |
-
# it typically just returns the same code (like "XX").
|
241 |
-
# We'll consider "successfully looked up" if
|
242 |
-
# resolved_name != code.upper().
|
243 |
-
if resolved_name.upper() != code.upper():
|
244 |
-
matched_countries.append(resolved_name)
|
245 |
-
|
246 |
-
# Format the year range
|
247 |
-
start_year_str = f"{int(round(float(start_year)))}" if start_year else "Unknown"
|
248 |
-
end_year_str = f"{int(round(float(end_year_)))}" if end_year_ else "Unknown"
|
249 |
-
|
250 |
-
# Build the final string
|
251 |
-
if matched_countries:
|
252 |
-
# We have at least 1 valid country name
|
253 |
-
additional_text = (
|
254 |
-
f"**{', '.join(matched_countries)}**, commissioned by **{client_name}**, "
|
255 |
-
f"**{start_year_str}-{end_year_str}**"
|
256 |
-
)
|
257 |
-
else:
|
258 |
-
# No valid countries found
|
259 |
-
additional_text = (
|
260 |
-
f"Commissioned by **{client_name}**, **{start_year_str}-{end_year_str}**"
|
261 |
-
)
|
262 |
-
|
263 |
-
st.markdown(additional_text)
|
264 |
-
st.divider()
|
265 |
|
|
|
|
|
266 |
else:
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
st.
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
f"Commissioned by **{client_name}**, **{start_year_str}-{end_year_str}**"
|
331 |
-
)
|
332 |
-
|
333 |
-
st.markdown(additional_text)
|
334 |
-
st.divider()
|
335 |
|
336 |
|
337 |
# for i in results:
|
|
|
109 |
|
110 |
# Checkbox to control whether to show only exact matches
|
111 |
show_exact_matches = st.checkbox("Show only exact matches", value=False)
|
112 |
+
####L button = st.button("Refresh Search")
|
113 |
|
114 |
def filter_results(results, country_filter, region_filter, end_year_range):
|
115 |
filtered = []
|
|
|
143 |
filtered.append(r)
|
144 |
return filtered
|
145 |
|
146 |
+
####Lif button:
|
147 |
+
# 1) Adjust limit so we get more than 15 results
|
148 |
+
results = hybrid_search(client, var, collection_name, limit=500) # e.g., 100 or 200
|
149 |
+
|
150 |
+
# results is a tuple: (semantic_results, lexical_results)
|
151 |
+
semantic_all = results[0]
|
152 |
+
lexical_all = results[1]
|
153 |
+
|
154 |
+
# 2) Filter out content shorter than 70 chars (an interim fix for the problem that very short paragraphs with only a few characters get a high similarity score)
|
155 |
+
semantic_all = [
|
156 |
+
r for r in semantic_all if len(r.payload["page_content"]) >= 70
|
157 |
+
]
|
158 |
+
lexical_all = [
|
159 |
+
r for r in lexical_all if len(r.payload["page_content"]) >= 70
|
160 |
+
]
|
161 |
+
|
162 |
+
# 2) Apply a threshold to SEMANTIC results (score >= 0.4)
|
163 |
+
semantic_thresholded = [r for r in semantic_all if r.score >= 0.4]
|
164 |
+
|
165 |
+
# 2) Filter the entire sets
|
166 |
+
filtered_semantic = filter_results(semantic_thresholded, country_filter, region_filter, end_year_range)
|
167 |
+
filtered_lexical = filter_results(lexical_all, country_filter, region_filter, end_year_range)
|
168 |
+
|
169 |
+
filtered_semantic_no_dupe = remove_duplicates(filtered_semantic)
|
170 |
+
filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
|
171 |
+
|
172 |
+
|
173 |
+
# 3) Retrieve top 15 *after* filtering
|
174 |
+
# Check user preference
|
175 |
+
if show_exact_matches:
|
176 |
+
# 1) Display heading
|
177 |
+
st.write(f"Showing **Top 15 Lexical Search results** for query: {var}")
|
178 |
+
|
179 |
+
# 2) Do a simple substring check (case-insensitive)
|
180 |
+
# We'll create a new list lexical_substring_filtered
|
181 |
+
query_substring = var.strip().lower()
|
182 |
+
lexical_substring_filtered = []
|
183 |
+
for r in lexical_all:
|
184 |
+
# page_content in lowercase
|
185 |
+
page_text_lower = r.payload["page_content"].lower()
|
186 |
+
# Keep this result only if the query substring is found
|
187 |
+
if query_substring in page_text_lower:
|
188 |
+
lexical_substring_filtered.append(r)
|
189 |
+
|
190 |
+
# 3) Now apply your region/country/year filter on that new list
|
191 |
+
filtered_lexical = filter_results(
|
192 |
+
lexical_substring_filtered, country_filter, region_filter, end_year_range
|
193 |
+
)
|
194 |
|
195 |
+
# 4) Remove duplicates
|
196 |
filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
|
197 |
|
198 |
+
# 5) If empty after substring + filters + dedupe, show a custom message
|
199 |
+
if not filtered_lexical_no_dupe:
|
200 |
+
st.write('No exact matches, consider unchecking "Show only exact matches"')
|
201 |
+
else:
|
202 |
+
# 6) Display the first 15 matching results
|
203 |
+
for res in filtered_lexical_no_dupe[:15]:
|
204 |
+
project_name = res.payload['metadata'].get('project_name', 'Project Link')
|
205 |
+
url = res.payload['metadata'].get('url', '#')
|
206 |
+
st.markdown(f"#### [{project_name}]({url})")
|
207 |
+
|
208 |
+
# Snippet logic (80 words)
|
209 |
+
full_text = res.payload['page_content']
|
210 |
+
words = full_text.split()
|
211 |
+
preview_word_count = 80
|
212 |
+
preview_text = " ".join(words[:preview_word_count])
|
213 |
+
remainder_text = " ".join(words[preview_word_count:])
|
214 |
+
st.write(preview_text + ("..." if remainder_text else ""))
|
215 |
+
|
216 |
+
# Keywords
|
217 |
+
top_keywords = extract_top_keywords(full_text, top_n=5)
|
218 |
+
if top_keywords:
|
219 |
+
st.markdown(f"_{' · '.join(top_keywords)}_")
|
220 |
+
|
221 |
+
# Metadata
|
222 |
+
metadata = res.payload.get('metadata', {})
|
223 |
+
countries = metadata.get('countries', "[]")
|
224 |
+
client_name = metadata.get('client', 'Unknown Client')
|
225 |
+
start_year = metadata.get('start_year', None)
|
226 |
+
end_year_ = metadata.get('end_year', None)
|
227 |
+
|
228 |
+
try:
|
229 |
+
c_list = json.loads(countries.replace("'", '"'))
|
230 |
+
except json.JSONDecodeError:
|
231 |
+
c_list = []
|
232 |
+
|
233 |
+
# Only keep country names if the region lookup (get_country_name)
|
234 |
+
# returns something different than the raw code.
|
235 |
+
matched_countries = []
|
236 |
+
for code in c_list:
|
237 |
+
if len(code) == 2:
|
238 |
+
resolved_name = get_country_name(code.upper(), region_df)
|
239 |
+
# If get_country_name didn't find a match,
|
240 |
+
# it typically just returns the same code (like "XX").
|
241 |
+
# We'll consider "successfully looked up" if
|
242 |
+
# resolved_name != code.upper().
|
243 |
+
if resolved_name.upper() != code.upper():
|
244 |
+
matched_countries.append(resolved_name)
|
245 |
+
|
246 |
+
# Format the year range
|
247 |
+
start_year_str = f"{int(round(float(start_year)))}" if start_year else "Unknown"
|
248 |
+
end_year_str = f"{int(round(float(end_year_)))}" if end_year_ else "Unknown"
|
249 |
+
|
250 |
+
# Build the final string
|
251 |
+
if matched_countries:
|
252 |
+
# We have at least 1 valid country name
|
253 |
+
additional_text = (
|
254 |
+
f"**{', '.join(matched_countries)}**, commissioned by **{client_name}**, "
|
255 |
+
f"**{start_year_str}-{end_year_str}**"
|
256 |
+
)
|
257 |
+
else:
|
258 |
+
# No valid countries found
|
259 |
+
additional_text = (
|
260 |
+
f"Commissioned by **{client_name}**, **{start_year_str}-{end_year_str}**"
|
261 |
+
)
|
262 |
+
|
263 |
+
st.markdown(additional_text)
|
264 |
+
st.divider()
|
265 |
|
266 |
+
else:
|
267 |
+
st.write(f"Showing **Top 15 Semantic Search results** for query: {var}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
268 |
|
269 |
+
if not filtered_semantic_no_dupe:
|
270 |
+
st.write("No relevant results found.")
|
271 |
else:
|
272 |
+
# Show the top 15 from filtered_semantic
|
273 |
+
for res in filtered_semantic_no_dupe[:15]:
|
274 |
+
project_name = res.payload['metadata'].get('project_name', 'Project Link')
|
275 |
+
url = res.payload['metadata'].get('url', '#')
|
276 |
+
st.markdown(f"#### [{project_name}]({url})")
|
277 |
+
|
278 |
+
# Snippet logic
|
279 |
+
full_text = res.payload['page_content']
|
280 |
+
words = full_text.split()
|
281 |
+
preview_word_count = 80
|
282 |
+
preview_text = " ".join(words[:preview_word_count])
|
283 |
+
remainder_text = " ".join(words[preview_word_count:])
|
284 |
+
st.write(preview_text + ("..." if remainder_text else ""))
|
285 |
+
|
286 |
+
# Keywords
|
287 |
+
top_keywords = extract_top_keywords(full_text, top_n=5)
|
288 |
+
if top_keywords:
|
289 |
+
st.markdown(f"_{' · '.join(top_keywords)}_")
|
290 |
+
|
291 |
+
# Metadata
|
292 |
+
metadata = res.payload.get('metadata', {})
|
293 |
+
countries = metadata.get('countries', "[]")
|
294 |
+
client_name = metadata.get('client', 'Unknown Client')
|
295 |
+
start_year = metadata.get('start_year', None)
|
296 |
+
end_year_ = metadata.get('end_year', None)
|
297 |
+
|
298 |
+
try:
|
299 |
+
c_list = json.loads(countries.replace("'", '"'))
|
300 |
+
except json.JSONDecodeError:
|
301 |
+
c_list = []
|
302 |
+
|
303 |
+
# Only keep country names if the region lookup (get_country_name)
|
304 |
+
# returns something different than the raw code.
|
305 |
+
matched_countries = []
|
306 |
+
for code in c_list:
|
307 |
+
if len(code) == 2:
|
308 |
+
resolved_name = get_country_name(code.upper(), region_df)
|
309 |
+
# If get_country_name didn't find a match,
|
310 |
+
# it typically just returns the same code (like "XX").
|
311 |
+
# We'll consider "successfully looked up" if
|
312 |
+
# resolved_name != code.upper().
|
313 |
+
if resolved_name.upper() != code.upper():
|
314 |
+
matched_countries.append(resolved_name)
|
315 |
+
|
316 |
+
# Format the year range
|
317 |
+
start_year_str = f"{int(round(float(start_year)))}" if start_year else "Unknown"
|
318 |
+
end_year_str = f"{int(round(float(end_year_)))}" if end_year_ else "Unknown"
|
319 |
+
|
320 |
+
# Build the final string
|
321 |
+
if matched_countries:
|
322 |
+
# We have at least 1 valid country name
|
323 |
+
additional_text = (
|
324 |
+
f"**{', '.join(matched_countries)}**, commissioned by **{client_name}**, "
|
325 |
+
f"**{start_year_str}-{end_year_str}**"
|
326 |
+
)
|
327 |
+
else:
|
328 |
+
# No valid countries found
|
329 |
+
additional_text = (
|
330 |
+
f"Commissioned by **{client_name}**, **{start_year_str}-{end_year_str}**"
|
331 |
+
)
|
332 |
+
|
333 |
+
st.markdown(additional_text)
|
334 |
+
st.divider()
|
|
|
|
|
|
|
|
|
|
|
335 |
|
336 |
|
337 |
# for i in results:
|