annikwag committed on
Commit
529dce6
verified
1 Parent(s): a2f475d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -43
app.py CHANGED
@@ -153,14 +153,14 @@ if button:
153
 
154
  # 2) Filter out content < 20 chars (as intermediate fix to problem that e.g. super short paragraphs with few chars get high similarity score)
155
  semantic_all = [
156
- r for r in semantic_all if len(r.payload["page_content"]) >= 20
157
  ]
158
  lexical_all = [
159
- r for r in lexical_all if len(r.payload["page_content"]) >= 20
160
  ]
161
 
162
  # 2) Apply a threshold to SEMANTIC results (score >= 0.3)
163
- semantic_thresholded = [r for r in semantic_all if r.score >= 0.3]
164
 
165
  # 2) Filter the entire sets
166
  filtered_semantic = filter_results(semantic_thresholded, country_filter, region_filter, end_year_range)
@@ -173,47 +173,78 @@ if button:
173
  # 3) Now we take the top 10 *after* filtering
174
  # Check user preference
175
  if show_exact_matches:
 
176
  st.write(f"Showing **Top 10 Lexical Search results** for query: {var}")
177
- # Show the top 10 from filtered_lexical
178
- for res in filtered_lexical_no_dupe[:10]:
179
- project_name = res.payload['metadata'].get('project_name', 'Project Link')
180
- url = res.payload['metadata'].get('url', '#')
181
- st.markdown(f"#### [{project_name}]({url})")
182
-
183
- # Snippet logic (80 words)
184
- full_text = res.payload['page_content']
185
- words = full_text.split()
186
- preview_word_count = 80
187
- preview_text = " ".join(words[:preview_word_count])
188
- remainder_text = " ".join(words[preview_word_count:])
189
- st.write(preview_text + ("..." if remainder_text else ""))
190
-
191
- # Keywords
192
- top_keywords = extract_top_keywords(full_text, top_n=5)
193
- if top_keywords:
194
- st.markdown(f"_{' '.join(top_keywords)}_")
195
-
196
- # Metadata
197
- metadata = res.payload.get('metadata', {})
198
- countries = metadata.get('countries', "[]")
199
- client_name = metadata.get('client', 'Unknown Client')
200
- start_year = metadata.get('start_year', None)
201
- end_year_ = metadata.get('end_year', None)
202
-
203
- try:
204
- c_list = json.loads(countries.replace("'", '"'))
205
- country_names = [get_country_name(code.upper(), region_df) for code in c_list if len(code) == 2]
206
- except json.JSONDecodeError:
207
- country_names = []
208
-
209
- start_year_str = f"{int(round(float(start_year)))}" if start_year else "Unknown"
210
- end_year_str = f"{int(round(float(end_year_)))}" if end_year_ else "Unknown"
211
-
212
- additional_text = (
213
- f"**{', '.join(country_names)}**, commissioned by **{client_name}**, **{start_year_str}-{end_year_str}**"
214
- )
215
- st.markdown(additional_text)
216
- st.divider()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  else:
218
  st.write(f"Showing **Top 10 Semantic Search results** for query: {var}")
219
 
 
153
 
154
  # 2) Filter out content < 70 chars (as intermediate fix to problem that e.g. super short paragraphs with few chars get high similarity score)
155
  semantic_all = [
156
+ r for r in semantic_all if len(r.payload["page_content"]) >= 70
157
  ]
158
  lexical_all = [
159
+ r for r in lexical_all if len(r.payload["page_content"]) >= 70
160
  ]
161
 
162
  # 2) Apply a threshold to SEMANTIC results (score >= 0.4)
163
+ semantic_thresholded = [r for r in semantic_all if r.score >= 0.4]
164
 
165
  # 2) Filter the entire sets
166
  filtered_semantic = filter_results(semantic_thresholded, country_filter, region_filter, end_year_range)
 
173
  # 3) Now we take the top 10 *after* filtering
174
  # Check user preference
175
  if show_exact_matches:
176
+ # 1) Display heading
177
  st.write(f"Showing **Top 10 Lexical Search results** for query: {var}")
178
+
179
+ # 2) Do a simple substring check (case-insensitive)
180
+ # We'll create a new list lexical_substring_filtered
181
+ query_substring = var.strip().lower()
182
+ lexical_substring_filtered = []
183
+ for r in lexical_all:
184
+ # page_content in lowercase
185
+ page_text_lower = r.payload["page_content"].lower()
186
+ # Keep this result only if the query substring is found
187
+ if query_substring in page_text_lower:
188
+ lexical_substring_filtered.append(r)
189
+
190
+ # 3) Now apply your region/country/year filter on that new list
191
+ filtered_lexical = filter_results(
192
+ lexical_substring_filtered, country_filter, region_filter, end_year_range
193
+ )
194
+
195
+ # 4) Remove duplicates
196
+ filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
197
+
198
+ # 5) If empty after substring + filters + dedupe, show a custom message
199
+ if not filtered_lexical_no_dupe:
200
+ st.write('No exact matches, consider unchecking "Show only exact matches"')
201
+ else:
202
+ # 6) Display the first 10 matching results
203
+ for res in filtered_lexical_no_dupe[:10]:
204
+ project_name = res.payload['metadata'].get('project_name', 'Project Link')
205
+ url = res.payload['metadata'].get('url', '#')
206
+ st.markdown(f"#### [{project_name}]({url})")
207
+
208
+ # Snippet logic (80 words)
209
+ full_text = res.payload['page_content']
210
+ words = full_text.split()
211
+ preview_word_count = 80
212
+ preview_text = " ".join(words[:preview_word_count])
213
+ remainder_text = " ".join(words[preview_word_count:])
214
+ st.write(preview_text + ("..." if remainder_text else ""))
215
+
216
+ # Keywords
217
+ top_keywords = extract_top_keywords(full_text, top_n=5)
218
+ if top_keywords:
219
+ st.markdown(f"_{' · '.join(top_keywords)}_")
220
+
221
+ # Metadata
222
+ metadata = res.payload.get('metadata', {})
223
+ countries = metadata.get('countries', "[]")
224
+ client_name = metadata.get('client', 'Unknown Client')
225
+ start_year = metadata.get('start_year', None)
226
+ end_year_ = metadata.get('end_year', None)
227
+
228
+ try:
229
+ c_list = json.loads(countries.replace("'", '"'))
230
+ country_names = [
231
+ get_country_name(code.upper(), region_df)
232
+ for code in c_list
233
+ if len(code) == 2
234
+ ]
235
+ except json.JSONDecodeError:
236
+ country_names = []
237
+
238
+ start_year_str = f"{int(round(float(start_year)))}" if start_year else "Unknown"
239
+ end_year_str = f"{int(round(float(end_year_)))}" if end_year_ else "Unknown"
240
+
241
+ additional_text = (
242
+ f"**{', '.join(country_names)}**, commissioned by **{client_name}**, "
243
+ f"**{start_year_str}-{end_year_str}**"
244
+ )
245
+ st.markdown(additional_text)
246
+ st.divider()
247
+
248
  else:
249
  st.write(f"Showing **Top 10 Semantic Search results** for query: {var}")
250