ola13 committed on
Commit
61841be
·
1 Parent(s): cd7903c

maintenance message

Browse files
Files changed (1) hide show
  1. app.py +17 -43
app.py CHANGED
@@ -10,9 +10,7 @@ from huggingface_hub import HfApi
10
  hf_api = HfApi()
11
  roots_datasets = {
12
  dset.id.split("/")[-1]: dset
13
- for dset in hf_api.list_datasets(
14
- author="bigscience-data", use_auth_token=os.environ.get("bigscience_data_token")
15
- )
16
  }
17
 
18
 
@@ -64,9 +62,7 @@ def process_pii(text):
64
  for tag in PII_TAGS:
65
  text = text.replace(
66
  PII_PREFIX + tag,
67
- """<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(
68
- tag
69
- ),
70
  )
71
  return text
72
 
@@ -133,9 +129,7 @@ def format_result(result, highlight_terms, exact_search, datasets_filter=None):
133
  return "<p>" + result_html + "</p>"
134
 
135
 
136
- def format_result_page(
137
- language, results, highlight_terms, num_results, exact_search, datasets_filter=None
138
- ) -> gr.HTML:
139
  filtered_num_results = 0
140
  header_html = ""
141
 
@@ -160,9 +154,7 @@ def format_result_page(
160
  continue
161
  results_for_lang_html = ""
162
  for result in results_for_lang:
163
- result_html = format_result(
164
- result, highlight_terms, exact_search, datasets_filter
165
- )
166
  if result_html != "":
167
  filtered_num_results += 1
168
  results_for_lang_html += result_html
@@ -204,9 +196,7 @@ def extract_results_from_payload(query, language, payload, exact_search):
204
  text = result["text"]
205
  url = (
206
  result["meta"]["url"]
207
- if "meta" in result
208
- and result["meta"] is not None
209
- and "url" in result["meta"]
210
  else None
211
  )
212
  docid = result["docid"]
@@ -244,11 +234,7 @@ def request_payload(query, language, exact_search, num_results=10, received_resu
244
  post_data = {"query": query, "k": num_results, "received_results": received_results}
245
  if language != "detect_language":
246
  post_data["lang"] = language
247
- address = (
248
- os.environ.get("address_exact_search")
249
- if exact_search
250
- else os.environ.get("address")
251
- )
252
  output = requests.post(
253
  address,
254
  headers={"Content-type": "application/json"},
@@ -259,10 +245,12 @@ def request_payload(query, language, exact_search, num_results=10, received_resu
259
  return payload
260
 
261
 
262
- title = (
263
  """<p style="text-align: center; font-size:28px"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>"""
264
- )
265
  description = """
 
 
 
 
266
  The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
267
  of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). The ROOTS Search
268
  Tool allows you to search through the ROOTS corpus. We serve a BM25 index for each language or group of languages
@@ -379,9 +367,7 @@ if __name__ == "__main__":
379
  payload,
380
  exact_search,
381
  )
382
- result_page = format_result_page(
383
- lang, processed_results, highlight_terms, num_results, exact_search
384
- )
385
  return (
386
  processed_results,
387
  highlight_terms,
@@ -402,19 +388,13 @@ if __name__ == "__main__":
402
  datasets,
403
  ) = run_query(query, lang, k, dropdown_input, 0)
404
  has_more_results = exact_search and (num_results > k)
405
- current_results = (
406
- len(next(iter(processed_results.values())))
407
- if len(processed_results) > 0
408
- else 0
409
- )
410
  return [
411
  processed_results,
412
  highlight_terms,
413
  num_results,
414
  exact_search,
415
- gr.update(visible=True)
416
- if current_results > 0
417
- else gr.update(visible=False),
418
  gr.Dropdown.update(choices=datasets, value=datasets),
419
  gr.update(visible=has_more_results),
420
  current_results,
@@ -437,12 +417,8 @@ if __name__ == "__main__":
437
  result_page,
438
  datasets,
439
  ) = run_query(query, lang, k, dropdown_input, received_results)
440
- current_results = sum(
441
- len(results) for results in processed_results.values()
442
- )
443
- has_more_results = exact_search and (
444
- received_results + current_results < num_results
445
- )
446
  print("received_results", received_results)
447
  print("current_results", current_results)
448
  print("has_more_results", has_more_results)
@@ -451,9 +427,7 @@ if __name__ == "__main__":
451
  highlight_terms,
452
  num_results,
453
  exact_search,
454
- gr.update(visible=True)
455
- if current_results > 0
456
- else gr.update(visible=False),
457
  gr.Dropdown.update(choices=datasets, value=datasets),
458
  gr.update(visible=current_results >= k and has_more_results),
459
  received_results + current_results,
 
10
  hf_api = HfApi()
11
  roots_datasets = {
12
  dset.id.split("/")[-1]: dset
13
+ for dset in hf_api.list_datasets(author="bigscience-data", use_auth_token=os.environ.get("bigscience_data_token"))
 
 
14
  }
15
 
16
 
 
62
  for tag in PII_TAGS:
63
  text = text.replace(
64
  PII_PREFIX + tag,
65
+ """<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(tag),
 
 
66
  )
67
  return text
68
 
 
129
  return "<p>" + result_html + "</p>"
130
 
131
 
132
+ def format_result_page(language, results, highlight_terms, num_results, exact_search, datasets_filter=None) -> gr.HTML:
 
 
133
  filtered_num_results = 0
134
  header_html = ""
135
 
 
154
  continue
155
  results_for_lang_html = ""
156
  for result in results_for_lang:
157
+ result_html = format_result(result, highlight_terms, exact_search, datasets_filter)
 
 
158
  if result_html != "":
159
  filtered_num_results += 1
160
  results_for_lang_html += result_html
 
196
  text = result["text"]
197
  url = (
198
  result["meta"]["url"]
199
+ if "meta" in result and result["meta"] is not None and "url" in result["meta"]
 
 
200
  else None
201
  )
202
  docid = result["docid"]
 
234
  post_data = {"query": query, "k": num_results, "received_results": received_results}
235
  if language != "detect_language":
236
  post_data["lang"] = language
237
+ address = os.environ.get("address_exact_search") if exact_search else os.environ.get("address")
 
 
 
 
238
  output = requests.post(
239
  address,
240
  headers={"Content-type": "application/json"},
 
245
  return payload
246
 
247
 
248
+ title = """<p style="text-align: center; font-size:28px"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>"""
 
 
249
  description = """
250
+ # We're running maintenance works on the exact search index, so it may not work properly until the end of the day,
251
+ Monday 27th of March.
252
+
253
+
254
  The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
255
  of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). The ROOTS Search
256
  Tool allows you to search through the ROOTS corpus. We serve a BM25 index for each language or group of languages
 
367
  payload,
368
  exact_search,
369
  )
370
+ result_page = format_result_page(lang, processed_results, highlight_terms, num_results, exact_search)
 
 
371
  return (
372
  processed_results,
373
  highlight_terms,
 
388
  datasets,
389
  ) = run_query(query, lang, k, dropdown_input, 0)
390
  has_more_results = exact_search and (num_results > k)
391
+ current_results = len(next(iter(processed_results.values()))) if len(processed_results) > 0 else 0
 
 
 
 
392
  return [
393
  processed_results,
394
  highlight_terms,
395
  num_results,
396
  exact_search,
397
+ gr.update(visible=True) if current_results > 0 else gr.update(visible=False),
 
 
398
  gr.Dropdown.update(choices=datasets, value=datasets),
399
  gr.update(visible=has_more_results),
400
  current_results,
 
417
  result_page,
418
  datasets,
419
  ) = run_query(query, lang, k, dropdown_input, received_results)
420
+ current_results = sum(len(results) for results in processed_results.values())
421
+ has_more_results = exact_search and (received_results + current_results < num_results)
 
 
 
 
422
  print("received_results", received_results)
423
  print("current_results", current_results)
424
  print("has_more_results", has_more_results)
 
427
  highlight_terms,
428
  num_results,
429
  exact_search,
430
+ gr.update(visible=True) if current_results > 0 else gr.update(visible=False),
 
 
431
  gr.Dropdown.update(choices=datasets, value=datasets),
432
  gr.update(visible=current_results >= k and has_more_results),
433
  received_results + current_results,