annikwag committed on
Commit
59e8a6b
verified
1 Parent(s): 49a9d87

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -79
app.py CHANGED
@@ -85,132 +85,130 @@ with col2:
85
  # Checkbox to control whether to show only exact matches
86
  show_exact_matches = st.checkbox("Show only exact matches", value=False)
87
 
88
- button = st.button("Search")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  if button:
90
- results = hybrid_search(client, var, collection_name)
 
 
91
 
92
- def filter_results(results, country_filter, end_year_range):
93
- filtered = []
94
- for res in results:
95
- metadata = res.payload.get('metadata', {})
96
- countries = metadata.get('countries', "[]")
97
- end_year = float(metadata.get('end_year', 0))
98
-
99
- # Process countries string to a list
100
- try:
101
- country_list = json.loads(countries.replace("'", '"'))
102
- # Normalize to uppercase and filter only 2-digit ISO codes
103
- country_list = [code.upper() for code in country_list if len(code) == 2]
104
- except json.JSONDecodeError:
105
- country_list = []
106
-
107
- # Translate selected country name back to 2-digit ISO code
108
- selected_iso_code = country_name_mapping.get(country_filter, None)
109
-
110
- # Apply country and year filters
111
- if (country_filter == "All/Not allocated" or selected_iso_code in country_list) and (end_year_range[0] <= end_year <= end_year_range[1]):
112
- filtered.append(res)
113
- return filtered
114
 
 
 
 
115
 
116
- # Check user preference for exact matches
 
117
  if show_exact_matches:
118
  st.write(f"Showing **Top 10 Lexical Search results** for query: {var}")
119
- lexical_results = results[1] # Lexical results are in index 1
120
- filtered_lexical_results = filter_results(lexical_results, country_filter, end_year_range)
121
- for res in filtered_lexical_results[:10]:
122
  project_name = res.payload['metadata'].get('project_name', 'Project Link')
123
  url = res.payload['metadata'].get('url', '#')
124
  st.markdown(f"#### [{project_name}]({url})")
125
- # ------- Display first 4 lines + expander -------
 
126
  full_text = res.payload['page_content']
127
- # Split the text by whitespace
128
  words = full_text.split()
129
- # For instance, show only the first 40 words
130
- preview_word_count = 120
131
- # Create the short preview and the remainder
132
  preview_text = " ".join(words[:preview_word_count])
133
  remainder_text = " ".join(words[preview_word_count:])
134
- # Always display the preview_text
135
  st.write(preview_text + ("..." if remainder_text else ""))
136
- # ------ Extract top 5 keywords and display ------
 
137
  top_keywords = extract_top_keywords(full_text, top_n=5)
138
  if top_keywords:
139
- st.markdown(f"_{' · '.join(top_keywords)}_") # italic + ' · ' separator
140
- # ------- Additional info below the text -------
 
141
  metadata = res.payload.get('metadata', {})
142
  countries = metadata.get('countries', "[]")
143
- client = metadata.get('client', 'Unknown Client')
144
  start_year = metadata.get('start_year', None)
145
- end_year = metadata.get('end_year', None)
146
 
147
- # Process countries
148
  try:
149
- country_list = json.loads(countries.replace("'", '"'))
150
- # Normalize to uppercase and map to country names
151
- country_names = [get_country_name(code.upper(), region_df) for code in country_list if len(code) == 2]
152
- country_names = country_names if country_names else country_list # Fallback if no names found
153
  except json.JSONDecodeError:
154
- country_names = countries
155
 
156
- # Format start and end year
157
- start_year = f"{int(round(float(start_year)))}" if start_year else "Unknown"
158
- end_year = f"{int(round(float(end_year)))}" if end_year else "Unknown"
159
 
160
- # Generate additional text with Markdown for bold formatting
161
- additional_text = f"**{', '.join(country_names)}**, commissioned by **{client}**, **{start_year}-{end_year}**"
 
162
  st.markdown(additional_text)
163
-
164
-
165
  st.divider()
166
  else:
167
  st.write(f"Showing **Top 10 Semantic Search results** for query: {var}")
168
- semantic_results = results[0] # Semantic results are in index 0
169
- filtered_semantic_results = filter_results(semantic_results, country_filter, end_year_range)
170
- for res in filtered_semantic_results[:10]:
171
  project_name = res.payload['metadata'].get('project_name', 'Project Link')
172
  url = res.payload['metadata'].get('url', '#')
173
  st.markdown(f"#### [{project_name}]({url})")
174
- # ------- Display first 4 lines + expander -------
 
175
  full_text = res.payload['page_content']
176
- # Split the text by whitespace
177
  words = full_text.split()
178
- # For instance, show only the first 40 words
179
- preview_word_count = 40
180
- # Create the short preview and the remainder
181
  preview_text = " ".join(words[:preview_word_count])
182
  remainder_text = " ".join(words[preview_word_count:])
183
- # Always display the preview_text
184
  st.write(preview_text + ("..." if remainder_text else ""))
185
- # ------ Extract top 5 keywords and display ------
 
186
  top_keywords = extract_top_keywords(full_text, top_n=5)
187
  if top_keywords:
188
- st.markdown(f"_{' · '.join(top_keywords)}_") # italic + ' · ' separator
189
-
190
- # Additional text below the content
191
  metadata = res.payload.get('metadata', {})
192
  countries = metadata.get('countries', "[]")
193
- client = metadata.get('client', 'Unknown Client')
194
  start_year = metadata.get('start_year', None)
195
- end_year = metadata.get('end_year', None)
196
 
197
- # Process countries
198
  try:
199
- country_list = json.loads(countries.replace("'", '"'))
200
- country_names = [get_country_name(code.upper(), region_df) for code in country_list if len(code) == 2]
201
- country_names = country_names if country_names else country_list
202
  except json.JSONDecodeError:
203
- country_names = countries
204
 
205
- # Format start and end year
206
- start_year = f"{int(round(float(start_year)))}" if start_year else "Unknown"
207
- end_year = f"{int(round(float(end_year)))}" if end_year else "Unknown"
208
 
209
- # Generate additional text with Markdown for bold formatting
210
- additional_text = f"**{', '.join(country_names)}**, commissioned by **{client}**, **{start_year}-{end_year}**"
 
211
  st.markdown(additional_text)
212
-
213
-
214
  st.divider()
215
 
216
 
 
85
  # Checkbox to control whether to show only exact matches
86
  show_exact_matches = st.checkbox("Show only exact matches", value=False)
87
 
88
+ def filter_results(results, country_filter, end_year_range):
89
+ filtered = []
90
+ for r in results:
91
+ metadata = r.payload.get('metadata', {})
92
+ countries = metadata.get('countries', "[]")
93
+ end_year_val = float(metadata.get('end_year', 0))
94
+
95
+ # Convert countries to a list
96
+ try:
97
+ c_list = json.loads(countries.replace("'", '"'))
98
+ c_list = [code.upper() for code in c_list if len(code) == 2]
99
+ except json.JSONDecodeError:
100
+ c_list = []
101
+
102
+ # Translate selected country name to iso2
103
+ selected_iso_code = country_name_mapping.get(country_filter, None)
104
+
105
+ # Filtering
106
+ if (
107
+ (country_filter == "All/Not allocated" or selected_iso_code in c_list)
108
+ and (end_year_range[0] <= end_year_val <= end_year_range[1])
109
+ ):
110
+ filtered.append(r)
111
+ return filtered
112
+
113
+
114
  if button:
115
+ # 1) Use a bigger limit so we get more than 10 results
116
+ # We'll filter them first, then slice the top 10 from the filtered set.
117
+ results = hybrid_search(client, var, collection_name, limit=100) # e.g., 100 or 200
118
 
119
+ # results is a tuple: (semantic_results, lexical_results)
120
+ semantic_all = results[0]
121
+ lexical_all = results[1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
+ # 2) Filter the entire sets
124
+ filtered_semantic = filter_results(semantic_all, country_filter, end_year_range)
125
+ filtered_lexical = filter_results(lexical_all, country_filter, end_year_range)
126
 
127
+ # 3) Now we take the top 10 *after* filtering
128
+ # Check user preference
129
  if show_exact_matches:
130
  st.write(f"Showing **Top 10 Lexical Search results** for query: {var}")
131
+ # Show the top 10 from filtered_lexical
132
+ for res in filtered_lexical[:10]:
 
133
  project_name = res.payload['metadata'].get('project_name', 'Project Link')
134
  url = res.payload['metadata'].get('url', '#')
135
  st.markdown(f"#### [{project_name}]({url})")
136
+
137
+ # Snippet logic (80 words)
138
  full_text = res.payload['page_content']
 
139
  words = full_text.split()
140
+ preview_word_count = 80
 
 
141
  preview_text = " ".join(words[:preview_word_count])
142
  remainder_text = " ".join(words[preview_word_count:])
 
143
  st.write(preview_text + ("..." if remainder_text else ""))
144
+
145
+ # Keywords
146
  top_keywords = extract_top_keywords(full_text, top_n=5)
147
  if top_keywords:
148
+ st.markdown(f"_{' · '.join(top_keywords)}_")
149
+
150
+ # Metadata
151
  metadata = res.payload.get('metadata', {})
152
  countries = metadata.get('countries', "[]")
153
+ client_name = metadata.get('client', 'Unknown Client')
154
  start_year = metadata.get('start_year', None)
155
+ end_year_ = metadata.get('end_year', None)
156
 
 
157
  try:
158
+ c_list = json.loads(countries.replace("'", '"'))
159
+ country_names = [get_country_name(code.upper(), region_df) for code in c_list if len(code) == 2]
 
 
160
  except json.JSONDecodeError:
161
+ country_names = []
162
 
163
+ start_year_str = f"{int(round(float(start_year)))}" if start_year else "Unknown"
164
+ end_year_str = f"{int(round(float(end_year_)))}" if end_year_ else "Unknown"
 
165
 
166
+ additional_text = (
167
+ f"**{', '.join(country_names)}**, commissioned by **{client_name}**, **{start_year_str}-{end_year_str}**"
168
+ )
169
  st.markdown(additional_text)
 
 
170
  st.divider()
171
  else:
172
  st.write(f"Showing **Top 10 Semantic Search results** for query: {var}")
173
+ # Show the top 10 from filtered_semantic
174
+ for res in filtered_semantic[:10]:
 
175
  project_name = res.payload['metadata'].get('project_name', 'Project Link')
176
  url = res.payload['metadata'].get('url', '#')
177
  st.markdown(f"#### [{project_name}]({url})")
178
+
179
+ # Snippet logic
180
  full_text = res.payload['page_content']
 
181
  words = full_text.split()
182
+ preview_word_count = 80
 
 
183
  preview_text = " ".join(words[:preview_word_count])
184
  remainder_text = " ".join(words[preview_word_count:])
 
185
  st.write(preview_text + ("..." if remainder_text else ""))
186
+
187
+ # Keywords
188
  top_keywords = extract_top_keywords(full_text, top_n=5)
189
  if top_keywords:
190
+ st.markdown(f"_{' · '.join(top_keywords)}_")
191
+
192
+ # Metadata
193
  metadata = res.payload.get('metadata', {})
194
  countries = metadata.get('countries', "[]")
195
+ client_name = metadata.get('client', 'Unknown Client')
196
  start_year = metadata.get('start_year', None)
197
+ end_year_ = metadata.get('end_year', None)
198
 
 
199
  try:
200
+ c_list = json.loads(countries.replace("'", '"'))
201
+ country_names = [get_country_name(code.upper(), region_df) for code in c_list if len(code) == 2]
 
202
  except json.JSONDecodeError:
203
+ country_names = []
204
 
205
+ start_year_str = f"{int(round(float(start_year)))}" if start_year else "Unknown"
206
+ end_year_str = f"{int(round(float(end_year_)))}" if end_year_ else "Unknown"
 
207
 
208
+ additional_text = (
209
+ f"**{', '.join(country_names)}**, commissioned by **{client_name}**, **{start_year_str}-{end_year_str}**"
210
+ )
211
  st.markdown(additional_text)
 
 
212
  st.divider()
213
 
214