leavoigt committed on
Commit
d6bab54
verified
1 Parent(s): 6bbd8f6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +184 -184
app.py CHANGED
@@ -109,7 +109,7 @@ with col3:
109
 
110
  # Checkbox to control whether to show only exact matches
111
  show_exact_matches = st.checkbox("Show only exact matches", value=False)
112
- button = st.button("Refresh Search")
113
 
114
  def filter_results(results, country_filter, region_filter, end_year_range):
115
  filtered = []
@@ -143,195 +143,195 @@ def filter_results(results, country_filter, region_filter, end_year_range):
143
  filtered.append(r)
144
  return filtered
145
 
146
- if button:
147
- # 1) Adjust limit so we get more than 15 results
148
- results = hybrid_search(client, var, collection_name, limit=500) # e.g., 100 or 200
149
-
150
- # results is a tuple: (semantic_results, lexical_results)
151
- semantic_all = results[0]
152
- lexical_all = results[1]
153
-
154
- # 2) Filter out content < 20 chars (as intermediate fix to problem that e.g. super short paragraphs with few chars get high similarity score)
155
- semantic_all = [
156
- r for r in semantic_all if len(r.payload["page_content"]) >= 70
157
- ]
158
- lexical_all = [
159
- r for r in lexical_all if len(r.payload["page_content"]) >= 70
160
- ]
161
-
162
- # 2) Apply a threshold to SEMANTIC results (score >= 0.4)
163
- semantic_thresholded = [r for r in semantic_all if r.score >= 0.4]
164
-
165
- # 2) Filter the entire sets
166
- filtered_semantic = filter_results(semantic_thresholded, country_filter, region_filter, end_year_range)
167
- filtered_lexical = filter_results(lexical_all, country_filter, region_filter, end_year_range)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
 
169
- filtered_semantic_no_dupe = remove_duplicates(filtered_semantic)
170
  filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
171
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
- # 3) Retrieve top 15 *after* filtering
174
- # Check user preference
175
- if show_exact_matches:
176
- # 1) Display heading
177
- st.write(f"Showing **Top 15 Lexical Search results** for query: {var}")
178
-
179
- # 2) Do a simple substring check (case-insensitive)
180
- # We'll create a new list lexical_substring_filtered
181
- query_substring = var.strip().lower()
182
- lexical_substring_filtered = []
183
- for r in lexical_all:
184
- # page_content in lowercase
185
- page_text_lower = r.payload["page_content"].lower()
186
- # Keep this result only if the query substring is found
187
- if query_substring in page_text_lower:
188
- lexical_substring_filtered.append(r)
189
-
190
- # 3) Now apply your region/country/year filter on that new list
191
- filtered_lexical = filter_results(
192
- lexical_substring_filtered, country_filter, region_filter, end_year_range
193
- )
194
-
195
- # 4) Remove duplicates
196
- filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
197
-
198
- # 5) If empty after substring + filters + dedupe, show a custom message
199
- if not filtered_lexical_no_dupe:
200
- st.write('No exact matches, consider unchecking "Show only exact matches"')
201
- else:
202
- # 6) Display the first 15 matching results
203
- for res in filtered_lexical_no_dupe[:15]:
204
- project_name = res.payload['metadata'].get('project_name', 'Project Link')
205
- url = res.payload['metadata'].get('url', '#')
206
- st.markdown(f"#### [{project_name}]({url})")
207
-
208
- # Snippet logic (80 words)
209
- full_text = res.payload['page_content']
210
- words = full_text.split()
211
- preview_word_count = 80
212
- preview_text = " ".join(words[:preview_word_count])
213
- remainder_text = " ".join(words[preview_word_count:])
214
- st.write(preview_text + ("..." if remainder_text else ""))
215
-
216
- # Keywords
217
- top_keywords = extract_top_keywords(full_text, top_n=5)
218
- if top_keywords:
219
- st.markdown(f"_{' · '.join(top_keywords)}_")
220
-
221
- # Metadata
222
- metadata = res.payload.get('metadata', {})
223
- countries = metadata.get('countries', "[]")
224
- client_name = metadata.get('client', 'Unknown Client')
225
- start_year = metadata.get('start_year', None)
226
- end_year_ = metadata.get('end_year', None)
227
-
228
- try:
229
- c_list = json.loads(countries.replace("'", '"'))
230
- except json.JSONDecodeError:
231
- c_list = []
232
-
233
- # Only keep country names if the region lookup (get_country_name)
234
- # returns something different than the raw code.
235
- matched_countries = []
236
- for code in c_list:
237
- if len(code) == 2:
238
- resolved_name = get_country_name(code.upper(), region_df)
239
- # If get_country_name didn't find a match,
240
- # it typically just returns the same code (like "XX").
241
- # We'll consider "successfully looked up" if
242
- # resolved_name != code.upper().
243
- if resolved_name.upper() != code.upper():
244
- matched_countries.append(resolved_name)
245
-
246
- # Format the year range
247
- start_year_str = f"{int(round(float(start_year)))}" if start_year else "Unknown"
248
- end_year_str = f"{int(round(float(end_year_)))}" if end_year_ else "Unknown"
249
-
250
- # Build the final string
251
- if matched_countries:
252
- # We have at least 1 valid country name
253
- additional_text = (
254
- f"**{', '.join(matched_countries)}**, commissioned by **{client_name}**, "
255
- f"**{start_year_str}-{end_year_str}**"
256
- )
257
- else:
258
- # No valid countries found
259
- additional_text = (
260
- f"Commissioned by **{client_name}**, **{start_year_str}-{end_year_str}**"
261
- )
262
-
263
- st.markdown(additional_text)
264
- st.divider()
265
 
 
 
266
  else:
267
- st.write(f"Showing **Top 15 Semantic Search results** for query: {var}")
268
-
269
- if not filtered_semantic_no_dupe:
270
- st.write("No relevant results found.")
271
- else:
272
- # Show the top 15 from filtered_semantic
273
- for res in filtered_semantic_no_dupe[:15]:
274
- project_name = res.payload['metadata'].get('project_name', 'Project Link')
275
- url = res.payload['metadata'].get('url', '#')
276
- st.markdown(f"#### [{project_name}]({url})")
277
-
278
- # Snippet logic
279
- full_text = res.payload['page_content']
280
- words = full_text.split()
281
- preview_word_count = 80
282
- preview_text = " ".join(words[:preview_word_count])
283
- remainder_text = " ".join(words[preview_word_count:])
284
- st.write(preview_text + ("..." if remainder_text else ""))
285
-
286
- # Keywords
287
- top_keywords = extract_top_keywords(full_text, top_n=5)
288
- if top_keywords:
289
- st.markdown(f"_{' '.join(top_keywords)}_")
290
-
291
- # Metadata
292
- metadata = res.payload.get('metadata', {})
293
- countries = metadata.get('countries', "[]")
294
- client_name = metadata.get('client', 'Unknown Client')
295
- start_year = metadata.get('start_year', None)
296
- end_year_ = metadata.get('end_year', None)
297
-
298
- try:
299
- c_list = json.loads(countries.replace("'", '"'))
300
- except json.JSONDecodeError:
301
- c_list = []
302
-
303
- # Only keep country names if the region lookup (get_country_name)
304
- # returns something different than the raw code.
305
- matched_countries = []
306
- for code in c_list:
307
- if len(code) == 2:
308
- resolved_name = get_country_name(code.upper(), region_df)
309
- # If get_country_name didn't find a match,
310
- # it typically just returns the same code (like "XX").
311
- # We'll consider "successfully looked up" if
312
- # resolved_name != code.upper().
313
- if resolved_name.upper() != code.upper():
314
- matched_countries.append(resolved_name)
315
-
316
- # Format the year range
317
- start_year_str = f"{int(round(float(start_year)))}" if start_year else "Unknown"
318
- end_year_str = f"{int(round(float(end_year_)))}" if end_year_ else "Unknown"
319
-
320
- # Build the final string
321
- if matched_countries:
322
- # We have at least 1 valid country name
323
- additional_text = (
324
- f"**{', '.join(matched_countries)}**, commissioned by **{client_name}**, "
325
- f"**{start_year_str}-{end_year_str}**"
326
- )
327
- else:
328
- # No valid countries found
329
- additional_text = (
330
- f"Commissioned by **{client_name}**, **{start_year_str}-{end_year_str}**"
331
- )
332
-
333
- st.markdown(additional_text)
334
- st.divider()
335
 
336
 
337
  # for i in results:
 
109
 
110
  # Checkbox to control whether to show only exact matches
111
  show_exact_matches = st.checkbox("Show only exact matches", value=False)
112
+ ####L button = st.button("Refresh Search")
113
 
114
  def filter_results(results, country_filter, region_filter, end_year_range):
115
  filtered = []
 
143
  filtered.append(r)
144
  return filtered
145
 
146
+ ####Lif button:
147
+ # 1) Adjust limit so we get more than 15 results
148
+ results = hybrid_search(client, var, collection_name, limit=500) # e.g., 100 or 200
149
+
150
+ # results is a tuple: (semantic_results, lexical_results)
151
+ semantic_all = results[0]
152
+ lexical_all = results[1]
153
+
154
+ # 2) Filter out content < 20 chars (as intermediate fix to problem that e.g. super short paragraphs with few chars get high similarity score)
155
+ semantic_all = [
156
+ r for r in semantic_all if len(r.payload["page_content"]) >= 70
157
+ ]
158
+ lexical_all = [
159
+ r for r in lexical_all if len(r.payload["page_content"]) >= 70
160
+ ]
161
+
162
+ # 2) Apply a threshold to SEMANTIC results (score >= 0.4)
163
+ semantic_thresholded = [r for r in semantic_all if r.score >= 0.4]
164
+
165
+ # 2) Filter the entire sets
166
+ filtered_semantic = filter_results(semantic_thresholded, country_filter, region_filter, end_year_range)
167
+ filtered_lexical = filter_results(lexical_all, country_filter, region_filter, end_year_range)
168
+
169
+ filtered_semantic_no_dupe = remove_duplicates(filtered_semantic)
170
+ filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
171
+
172
+
173
+ # 3) Retrieve top 15 *after* filtering
174
+ # Check user preference
175
+ if show_exact_matches:
176
+ # 1) Display heading
177
+ st.write(f"Showing **Top 15 Lexical Search results** for query: {var}")
178
+
179
+ # 2) Do a simple substring check (case-insensitive)
180
+ # We'll create a new list lexical_substring_filtered
181
+ query_substring = var.strip().lower()
182
+ lexical_substring_filtered = []
183
+ for r in lexical_all:
184
+ # page_content in lowercase
185
+ page_text_lower = r.payload["page_content"].lower()
186
+ # Keep this result only if the query substring is found
187
+ if query_substring in page_text_lower:
188
+ lexical_substring_filtered.append(r)
189
+
190
+ # 3) Now apply your region/country/year filter on that new list
191
+ filtered_lexical = filter_results(
192
+ lexical_substring_filtered, country_filter, region_filter, end_year_range
193
+ )
194
 
195
+ # 4) Remove duplicates
196
  filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
197
 
198
+ # 5) If empty after substring + filters + dedupe, show a custom message
199
+ if not filtered_lexical_no_dupe:
200
+ st.write('No exact matches, consider unchecking "Show only exact matches"')
201
+ else:
202
+ # 6) Display the first 15 matching results
203
+ for res in filtered_lexical_no_dupe[:15]:
204
+ project_name = res.payload['metadata'].get('project_name', 'Project Link')
205
+ url = res.payload['metadata'].get('url', '#')
206
+ st.markdown(f"#### [{project_name}]({url})")
207
+
208
+ # Snippet logic (80 words)
209
+ full_text = res.payload['page_content']
210
+ words = full_text.split()
211
+ preview_word_count = 80
212
+ preview_text = " ".join(words[:preview_word_count])
213
+ remainder_text = " ".join(words[preview_word_count:])
214
+ st.write(preview_text + ("..." if remainder_text else ""))
215
+
216
+ # Keywords
217
+ top_keywords = extract_top_keywords(full_text, top_n=5)
218
+ if top_keywords:
219
+ st.markdown(f"_{' · '.join(top_keywords)}_")
220
+
221
+ # Metadata
222
+ metadata = res.payload.get('metadata', {})
223
+ countries = metadata.get('countries', "[]")
224
+ client_name = metadata.get('client', 'Unknown Client')
225
+ start_year = metadata.get('start_year', None)
226
+ end_year_ = metadata.get('end_year', None)
227
+
228
+ try:
229
+ c_list = json.loads(countries.replace("'", '"'))
230
+ except json.JSONDecodeError:
231
+ c_list = []
232
+
233
+ # Only keep country names if the region lookup (get_country_name)
234
+ # returns something different than the raw code.
235
+ matched_countries = []
236
+ for code in c_list:
237
+ if len(code) == 2:
238
+ resolved_name = get_country_name(code.upper(), region_df)
239
+ # If get_country_name didn't find a match,
240
+ # it typically just returns the same code (like "XX").
241
+ # We'll consider "successfully looked up" if
242
+ # resolved_name != code.upper().
243
+ if resolved_name.upper() != code.upper():
244
+ matched_countries.append(resolved_name)
245
+
246
+ # Format the year range
247
+ start_year_str = f"{int(round(float(start_year)))}" if start_year else "Unknown"
248
+ end_year_str = f"{int(round(float(end_year_)))}" if end_year_ else "Unknown"
249
+
250
+ # Build the final string
251
+ if matched_countries:
252
+ # We have at least 1 valid country name
253
+ additional_text = (
254
+ f"**{', '.join(matched_countries)}**, commissioned by **{client_name}**, "
255
+ f"**{start_year_str}-{end_year_str}**"
256
+ )
257
+ else:
258
+ # No valid countries found
259
+ additional_text = (
260
+ f"Commissioned by **{client_name}**, **{start_year_str}-{end_year_str}**"
261
+ )
262
+
263
+ st.markdown(additional_text)
264
+ st.divider()
265
 
266
+ else:
267
+ st.write(f"Showing **Top 15 Semantic Search results** for query: {var}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
 
269
+ if not filtered_semantic_no_dupe:
270
+ st.write("No relevant results found.")
271
  else:
272
+ # Show the top 15 from filtered_semantic
273
+ for res in filtered_semantic_no_dupe[:15]:
274
+ project_name = res.payload['metadata'].get('project_name', 'Project Link')
275
+ url = res.payload['metadata'].get('url', '#')
276
+ st.markdown(f"#### [{project_name}]({url})")
277
+
278
+ # Snippet logic
279
+ full_text = res.payload['page_content']
280
+ words = full_text.split()
281
+ preview_word_count = 80
282
+ preview_text = " ".join(words[:preview_word_count])
283
+ remainder_text = " ".join(words[preview_word_count:])
284
+ st.write(preview_text + ("..." if remainder_text else ""))
285
+
286
+ # Keywords
287
+ top_keywords = extract_top_keywords(full_text, top_n=5)
288
+ if top_keywords:
289
+ st.markdown(f"_{' '.join(top_keywords)}_")
290
+
291
+ # Metadata
292
+ metadata = res.payload.get('metadata', {})
293
+ countries = metadata.get('countries', "[]")
294
+ client_name = metadata.get('client', 'Unknown Client')
295
+ start_year = metadata.get('start_year', None)
296
+ end_year_ = metadata.get('end_year', None)
297
+
298
+ try:
299
+ c_list = json.loads(countries.replace("'", '"'))
300
+ except json.JSONDecodeError:
301
+ c_list = []
302
+
303
+ # Only keep country names if the region lookup (get_country_name)
304
+ # returns something different than the raw code.
305
+ matched_countries = []
306
+ for code in c_list:
307
+ if len(code) == 2:
308
+ resolved_name = get_country_name(code.upper(), region_df)
309
+ # If get_country_name didn't find a match,
310
+ # it typically just returns the same code (like "XX").
311
+ # We'll consider "successfully looked up" if
312
+ # resolved_name != code.upper().
313
+ if resolved_name.upper() != code.upper():
314
+ matched_countries.append(resolved_name)
315
+
316
+ # Format the year range
317
+ start_year_str = f"{int(round(float(start_year)))}" if start_year else "Unknown"
318
+ end_year_str = f"{int(round(float(end_year_)))}" if end_year_ else "Unknown"
319
+
320
+ # Build the final string
321
+ if matched_countries:
322
+ # We have at least 1 valid country name
323
+ additional_text = (
324
+ f"**{', '.join(matched_countries)}**, commissioned by **{client_name}**, "
325
+ f"**{start_year_str}-{end_year_str}**"
326
+ )
327
+ else:
328
+ # No valid countries found
329
+ additional_text = (
330
+ f"Commissioned by **{client_name}**, **{start_year_str}-{end_year_str}**"
331
+ )
332
+
333
+ st.markdown(additional_text)
334
+ st.divider()
 
 
 
 
 
335
 
336
 
337
  # for i in results: