Spaces: Running on CPU Upgrade
Update app.py
app.py CHANGED
@@ -153,14 +153,14 @@ if button:
 
         # 2) Filter out content < 20 chars (as intermediate fix to problem that e.g. super short paragraphs with few chars get high similarity score)
         semantic_all = [
-            r for r in semantic_all if len(r.payload["page_content"]) >= 20
+            r for r in semantic_all if len(r.payload["page_content"]) >= 70
         ]
         lexical_all = [
-            r for r in lexical_all if len(r.payload["page_content"]) >= 20
+            r for r in lexical_all if len(r.payload["page_content"]) >= 70
         ]
 
         # 2) Apply a threshold to SEMANTIC results (score >= 0.3)
-        semantic_thresholded = [r for r in semantic_all if r.score >= 0.3]
+        semantic_thresholded = [r for r in semantic_all if r.score >= 0.4]
 
         # 2) Filter the entire sets
         filtered_semantic = filter_results(semantic_thresholded, country_filter, region_filter, end_year_range)
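Side note on the hunk above: it raises two cutoffs at once, the minimum page_content length (20 to 70 characters) and the semantic score floor (0.3 to 0.4). A minimal standalone sketch of that pattern, assuming Qdrant-style result objects with a payload dict and a float score; the Result class and threshold() helper are illustrative, not part of app.py:

from dataclasses import dataclass, field

# Illustrative stand-in for a scored search hit (e.g. a Qdrant ScoredPoint).
@dataclass
class Result:
    score: float
    payload: dict = field(default_factory=dict)

MIN_CHARS = 70   # length floor introduced by this commit
MIN_SCORE = 0.4  # semantic score floor introduced by this commit

def threshold(results, min_chars=MIN_CHARS, min_score=None):
    """Drop results with short page_content; optionally drop low scores too."""
    kept = [r for r in results if len(r.payload["page_content"]) >= min_chars]
    if min_score is not None:
        kept = [r for r in kept if r.score >= min_score]
    return kept

demo = [
    Result(0.9, {"page_content": "too short"}),   # fails length filter
    Result(0.35, {"page_content": "x" * 120}),    # fails score floor
    Result(0.8, {"page_content": "y" * 120}),     # passes both
]
assert len(threshold(demo)) == 2
assert len(threshold(demo, min_score=MIN_SCORE)) == 1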
@@ -173,47 +173,78 @@ if button:
         # 3) Now we take the top 10 *after* filtering
         # Check user preference
         if show_exact_matches:
+            # 1) Display heading
             st.write(f"Showing **Top 10 Lexical Search results** for query: {var}")
-
-
-
-
-
-
-            #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+            # 2) Do a simple substring check (case-insensitive)
+            # We'll create a new list lexical_substring_filtered
+            query_substring = var.strip().lower()
+            lexical_substring_filtered = []
+            for r in lexical_all:
+                # page_content in lowercase
+                page_text_lower = r.payload["page_content"].lower()
+                # Keep this result only if the query substring is found
+                if query_substring in page_text_lower:
+                    lexical_substring_filtered.append(r)
+
+            # 3) Now apply your region/country/year filter on that new list
+            filtered_lexical = filter_results(
+                lexical_substring_filtered, country_filter, region_filter, end_year_range
+            )
+
+            # 4) Remove duplicates
+            filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
+
+            # 5) If empty after substring + filters + dedupe, show a custom message
+            if not filtered_lexical_no_dupe:
+                st.write('No exact matches, consider unchecking "Show only exact matches"')
+            else:
+                # 6) Display the first 10 matching results
+                for res in filtered_lexical_no_dupe[:10]:
+                    project_name = res.payload['metadata'].get('project_name', 'Project Link')
+                    url = res.payload['metadata'].get('url', '#')
+                    st.markdown(f"#### [{project_name}]({url})")
+
+                    # Snippet logic (80 words)
+                    full_text = res.payload['page_content']
+                    words = full_text.split()
+                    preview_word_count = 80
+                    preview_text = " ".join(words[:preview_word_count])
+                    remainder_text = " ".join(words[preview_word_count:])
+                    st.write(preview_text + ("..." if remainder_text else ""))
+
+                    # Keywords
+                    top_keywords = extract_top_keywords(full_text, top_n=5)
+                    if top_keywords:
+                        st.markdown(f"_{' · '.join(top_keywords)}_")
+
+                    # Metadata
+                    metadata = res.payload.get('metadata', {})
+                    countries = metadata.get('countries', "[]")
+                    client_name = metadata.get('client', 'Unknown Client')
+                    start_year = metadata.get('start_year', None)
+                    end_year_ = metadata.get('end_year', None)
+
+                    try:
+                        c_list = json.loads(countries.replace("'", '"'))
+                        country_names = [
+                            get_country_name(code.upper(), region_df)
+                            for code in c_list
+                            if len(code) == 2
+                        ]
+                    except json.JSONDecodeError:
+                        country_names = []
+
+                    start_year_str = f"{int(round(float(start_year)))}" if start_year else "Unknown"
+                    end_year_str = f"{int(round(float(end_year_)))}" if end_year_ else "Unknown"
+
+                    additional_text = (
+                        f"**{', '.join(country_names)}**, commissioned by **{client_name}**, "
+                        f"**{start_year_str}-{end_year_str}**"
+                    )
+                    st.markdown(additional_text)
+                    st.divider()
+
         else:
             st.write(f"Showing **Top 10 Semantic Search results** for query: {var}")
 
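The exact-match branch added above boils down to: substring-match the query, apply the metadata filters, dedupe, then render the first 10 hits. A sketch of that flow with simplified stand-ins; app.py's filter_results and remove_duplicates helpers are not shown in this diff, so the dedupe key and filter behavior below are assumptions:

def substring_filter(results, query):
    """Case-insensitive substring check against page_content."""
    needle = query.strip().lower()
    return [r for r in results if needle in r.payload["page_content"].lower()]

def dedupe(results):
    """Keep the first result per distinct page_content (assumed dedupe key)."""
    seen, unique = set(), []
    for r in results:
        key = r.payload["page_content"]
        if key not in seen:
            seen.add(key)
            unique.append(r)
    return unique

# Usage mirroring the diff (metadata filtering via filter_results omitted):
#     hits = dedupe(substring_filter(lexical_all, var))[:10]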
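One detail in the metadata block deserves a note: countries is stored as a stringified Python list, so the new code rewrites single quotes to double quotes before json.loads. That works for bare two-letter codes but would mangle any value containing an apostrophe; ast.literal_eval parses the literal directly and avoids the rewrite. A small sketch with a made-up sample value:

import ast
import json

countries = "['de', 'fr', 'ci']"  # hypothetical payload value

# The diff's approach: make the string JSON-compatible, then parse.
c_list = json.loads(countries.replace("'", '"'))

# A sturdier alternative for Python-style list literals:
try:
    c_list = ast.literal_eval(countries)
except (ValueError, SyntaxError):
    c_list = []

# Mirror the diff's guard: only two-letter entries count as country codes.
codes = [code.upper() for code in c_list if len(code) == 2]
assert codes == ["DE", "FR", "CI"]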