Spaces: Running on CPU Upgrade
Update app.py
app.py CHANGED
@@ -85,132 +85,130 @@ with col2:
     # Checkbox to control whether to show only exact matches
     show_exact_matches = st.checkbox("Show only exact matches", value=False)

+    def filter_results(results, country_filter, end_year_range):
+        filtered = []
+        for r in results:
+            metadata = r.payload.get('metadata', {})
+            countries = metadata.get('countries', "[]")
+            end_year_val = float(metadata.get('end_year', 0))
+
+            # Convert countries to a list
+            try:
+                c_list = json.loads(countries.replace("'", '"'))
+                c_list = [code.upper() for code in c_list if len(code) == 2]
+            except json.JSONDecodeError:
+                c_list = []
+
+            # Translate selected country name to iso2
+            selected_iso_code = country_name_mapping.get(country_filter, None)
+
+            # Filtering
+            if (
+                (country_filter == "All/Not allocated" or selected_iso_code in c_list)
+                and (end_year_range[0] <= end_year_val <= end_year_range[1])
+            ):
+                filtered.append(r)
+        return filtered
+
+
     if button:
+        # 1) Use a bigger limit so we get more than 10 results.
+        #    We'll filter them first, then slice the top 10 from the filtered set.
+        results = hybrid_search(client, var, collection_name, limit=100)  # e.g., 100 or 200

-            metadata = res.payload.get('metadata', {})
-            countries = metadata.get('countries', "[]")
-            end_year = float(metadata.get('end_year', 0))
-
-            # Process countries string to a list
-            try:
-                country_list = json.loads(countries.replace("'", '"'))
-                # Normalize to uppercase and filter only 2-digit ISO codes
-                country_list = [code.upper() for code in country_list if len(code) == 2]
-            except json.JSONDecodeError:
-                country_list = []
-
-            # Translate selected country name back to 2-digit ISO code
-            selected_iso_code = country_name_mapping.get(country_filter, None)
-
-            # Apply country and year filters
-            if (country_filter == "All/Not allocated" or selected_iso_code in country_list) and (end_year_range[0] <= end_year <= end_year_range[1]):
-                filtered.append(res)
-            return filtered
+        # results is a tuple: (semantic_results, lexical_results)
+        semantic_all = results[0]
+        lexical_all = results[1]

+        # 2) Filter the entire sets
+        filtered_semantic = filter_results(semantic_all, country_filter, end_year_range)
+        filtered_lexical = filter_results(lexical_all, country_filter, end_year_range)

-        #
+        # 3) Now we take the top 10 *after* filtering
+        # Check user preference
         if show_exact_matches:
             st.write(f"Showing **Top 10 Lexical Search results** for query: {var}")
-            for res in filtered_lexical_results[:10]:
+            # Show the top 10 from filtered_lexical
+            for res in filtered_lexical[:10]:
                 project_name = res.payload['metadata'].get('project_name', 'Project Link')
                 url = res.payload['metadata'].get('url', '#')
                 st.markdown(f"#### [{project_name}]({url})")
+
+                # Snippet logic (80 words)
                 full_text = res.payload['page_content']
-                # Split the text by whitespace
                 words = full_text.split()
-                preview_word_count = 120
-                # Create the short preview and the remainder
+                preview_word_count = 80
                 preview_text = " ".join(words[:preview_word_count])
                 remainder_text = " ".join(words[preview_word_count:])
-                # Always display the preview_text
                 st.write(preview_text + ("..." if remainder_text else ""))
+
+                # Keywords
                 top_keywords = extract_top_keywords(full_text, top_n=5)
                 if top_keywords:
-                    st.markdown(f"_{' · '.join(top_keywords)}_")
+                    st.markdown(f"_{' · '.join(top_keywords)}_")
+
+                # Metadata
                 metadata = res.payload.get('metadata', {})
                 countries = metadata.get('countries', "[]")
+                client_name = metadata.get('client', 'Unknown Client')
                 start_year = metadata.get('start_year', None)
+                end_year_ = metadata.get('end_year', None)

-                # Process countries
                 try:
-                    country_names = [get_country_name(code.upper(), region_df) for code in country_list if len(code) == 2]
-                    country_names = country_names if country_names else country_list  # Fallback if no names found
+                    c_list = json.loads(countries.replace("'", '"'))
+                    country_names = [get_country_name(code.upper(), region_df) for code in c_list if len(code) == 2]
                 except json.JSONDecodeError:
+                    country_names = []

-                end_year = f"{int(round(float(end_year)))}" if end_year else "Unknown"
+                start_year_str = f"{int(round(float(start_year)))}" if start_year else "Unknown"
+                end_year_str = f"{int(round(float(end_year_)))}" if end_year_ else "Unknown"

+                additional_text = (
+                    f"**{', '.join(country_names)}**, commissioned by **{client_name}**, **{start_year_str}-{end_year_str}**"
+                )
                 st.markdown(additional_text)
                 st.divider()
         else:
             st.write(f"Showing **Top 10 Semantic Search results** for query: {var}")
-            for res in filtered_semantic_results[:10]:
+            # Show the top 10 from filtered_semantic
+            for res in filtered_semantic[:10]:
                 project_name = res.payload['metadata'].get('project_name', 'Project Link')
                 url = res.payload['metadata'].get('url', '#')
                 st.markdown(f"#### [{project_name}]({url})")
+
+                # Snippet logic
                 full_text = res.payload['page_content']
-                # Split the text by whitespace
                 words = full_text.split()
-                preview_word_count = 40
-                # Create the short preview and the remainder
+                preview_word_count = 80
                 preview_text = " ".join(words[:preview_word_count])
                 remainder_text = " ".join(words[preview_word_count:])
-                # Always display the preview_text
                 st.write(preview_text + ("..." if remainder_text else ""))
+
+                # Keywords
                 top_keywords = extract_top_keywords(full_text, top_n=5)
                 if top_keywords:
-                    st.markdown(f"_{' · '.join(top_keywords)}_")
-                #
+                    st.markdown(f"_{' · '.join(top_keywords)}_")
+
+                # Metadata
                 metadata = res.payload.get('metadata', {})
                 countries = metadata.get('countries', "[]")
+                client_name = metadata.get('client', 'Unknown Client')
                 start_year = metadata.get('start_year', None)
+                end_year_ = metadata.get('end_year', None)

-                # Process countries
                 try:
-                    country_names = [get_country_name(code.upper(), region_df) for code in country_list if len(code) == 2]
-                    country_names = country_names if country_names else country_list
+                    c_list = json.loads(countries.replace("'", '"'))
+                    country_names = [get_country_name(code.upper(), region_df) for code in c_list if len(code) == 2]
                 except json.JSONDecodeError:
+                    country_names = []

-                end_year = f"{int(round(float(end_year)))}" if end_year else "Unknown"
+                start_year_str = f"{int(round(float(start_year)))}" if start_year else "Unknown"
+                end_year_str = f"{int(round(float(end_year_)))}" if end_year_ else "Unknown"

+                additional_text = (
+                    f"**{', '.join(country_names)}**, commissioned by **{client_name}**, **{start_year_str}-{end_year_str}**"
+                )
                 st.markdown(additional_text)
                 st.divider()
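
The substance of this commit is the reordering: fetch a larger candidate set (limit=100), filter by country and end-year range, and only then slice the top 10, so narrow filters no longer starve the result list. Below is a minimal, self-contained sketch of that behavior. FakeResult, the sample country_name_mapping, and the end_year_range value are stand-ins invented for illustration; the filter body mirrors the new filter_results() from the diff, but this is not the app itself.

# Sketch only: stand-in types and data, not the app's real Qdrant/Streamlit objects.
import json
from dataclasses import dataclass, field

@dataclass
class FakeResult:                        # stands in for a scored search result with a payload dict
    payload: dict = field(default_factory=dict)

country_name_mapping = {"Kenya": "KE"}   # assumed shape: display name -> ISO2 code
end_year_range = (2015, 2025)            # assumed shape of the year-range slider output

def filter_results(results, country_filter, end_year_range):
    # Mirrors the helper added in this commit.
    filtered = []
    for r in results:
        metadata = r.payload.get('metadata', {})
        countries = metadata.get('countries', "[]")
        end_year_val = float(metadata.get('end_year', 0))
        try:
            c_list = json.loads(countries.replace("'", '"'))
            c_list = [code.upper() for code in c_list if len(code) == 2]
        except json.JSONDecodeError:
            c_list = []
        selected_iso_code = country_name_mapping.get(country_filter, None)
        if (
            (country_filter == "All/Not allocated" or selected_iso_code in c_list)
            and (end_year_range[0] <= end_year_val <= end_year_range[1])
        ):
            filtered.append(r)
    return filtered

# 100 candidates; only every third one matches the Kenya filter.
candidates = [
    FakeResult({'metadata': {'countries': "['KE']" if i % 3 == 0 else "['BR']",
                             'end_year': 2020}})
    for i in range(100)
]

top10 = filter_results(candidates, "Kenya", end_year_range)[:10]
print(len(top10))  # 10 -- filtering candidates[:10] instead would leave only 4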