Spaces:
Runtime error
Runtime error
maintenance message
Browse files
app.py
CHANGED
@@ -10,9 +10,7 @@ from huggingface_hub import HfApi
|
|
10 |
hf_api = HfApi()
|
11 |
roots_datasets = {
|
12 |
dset.id.split("/")[-1]: dset
|
13 |
-
for dset in hf_api.list_datasets(
|
14 |
-
author="bigscience-data", use_auth_token=os.environ.get("bigscience_data_token")
|
15 |
-
)
|
16 |
}
|
17 |
|
18 |
|
@@ -64,9 +62,7 @@ def process_pii(text):
|
|
64 |
for tag in PII_TAGS:
|
65 |
text = text.replace(
|
66 |
PII_PREFIX + tag,
|
67 |
-
"""<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(
|
68 |
-
tag
|
69 |
-
),
|
70 |
)
|
71 |
return text
|
72 |
|
@@ -133,9 +129,7 @@ def format_result(result, highlight_terms, exact_search, datasets_filter=None):
|
|
133 |
return "<p>" + result_html + "</p>"
|
134 |
|
135 |
|
136 |
-
def format_result_page(
|
137 |
-
language, results, highlight_terms, num_results, exact_search, datasets_filter=None
|
138 |
-
) -> gr.HTML:
|
139 |
filtered_num_results = 0
|
140 |
header_html = ""
|
141 |
|
@@ -160,9 +154,7 @@ def format_result_page(
|
|
160 |
continue
|
161 |
results_for_lang_html = ""
|
162 |
for result in results_for_lang:
|
163 |
-
result_html = format_result(
|
164 |
-
result, highlight_terms, exact_search, datasets_filter
|
165 |
-
)
|
166 |
if result_html != "":
|
167 |
filtered_num_results += 1
|
168 |
results_for_lang_html += result_html
|
@@ -204,9 +196,7 @@ def extract_results_from_payload(query, language, payload, exact_search):
|
|
204 |
text = result["text"]
|
205 |
url = (
|
206 |
result["meta"]["url"]
|
207 |
-
if "meta" in result
|
208 |
-
and result["meta"] is not None
|
209 |
-
and "url" in result["meta"]
|
210 |
else None
|
211 |
)
|
212 |
docid = result["docid"]
|
@@ -244,11 +234,7 @@ def request_payload(query, language, exact_search, num_results=10, received_resu
|
|
244 |
post_data = {"query": query, "k": num_results, "received_results": received_results}
|
245 |
if language != "detect_language":
|
246 |
post_data["lang"] = language
|
247 |
-
address = (
|
248 |
-
os.environ.get("address_exact_search")
|
249 |
-
if exact_search
|
250 |
-
else os.environ.get("address")
|
251 |
-
)
|
252 |
output = requests.post(
|
253 |
address,
|
254 |
headers={"Content-type": "application/json"},
|
@@ -259,10 +245,12 @@ def request_payload(query, language, exact_search, num_results=10, received_resu
|
|
259 |
return payload
|
260 |
|
261 |
|
262 |
-
title = (
|
263 |
-
"""<p style="text-align: center; font-size:28px"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>"""
|
264 |
-
)
|
265 |
description = """
|
|
|
|
|
|
|
|
|
266 |
The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
|
267 |
of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). The ROOTS Search
|
268 |
Tool allows you to search through the ROOTS corpus. We serve a BM25 index for each language or group of languages
|
@@ -379,9 +367,7 @@ if __name__ == "__main__":
|
|
379 |
payload,
|
380 |
exact_search,
|
381 |
)
|
382 |
-
result_page = format_result_page(
|
383 |
-
lang, processed_results, highlight_terms, num_results, exact_search
|
384 |
-
)
|
385 |
return (
|
386 |
processed_results,
|
387 |
highlight_terms,
|
@@ -402,19 +388,13 @@ if __name__ == "__main__":
|
|
402 |
datasets,
|
403 |
) = run_query(query, lang, k, dropdown_input, 0)
|
404 |
has_more_results = exact_search and (num_results > k)
|
405 |
-
current_results = (
|
406 |
-
len(next(iter(processed_results.values())))
|
407 |
-
if len(processed_results) > 0
|
408 |
-
else 0
|
409 |
-
)
|
410 |
return [
|
411 |
processed_results,
|
412 |
highlight_terms,
|
413 |
num_results,
|
414 |
exact_search,
|
415 |
-
gr.update(visible=True)
|
416 |
-
if current_results > 0
|
417 |
-
else gr.update(visible=False),
|
418 |
gr.Dropdown.update(choices=datasets, value=datasets),
|
419 |
gr.update(visible=has_more_results),
|
420 |
current_results,
|
@@ -437,12 +417,8 @@ if __name__ == "__main__":
|
|
437 |
result_page,
|
438 |
datasets,
|
439 |
) = run_query(query, lang, k, dropdown_input, received_results)
|
440 |
-
current_results = sum(
|
441 |
-
len(results) for results in processed_results.values()
|
442 |
-
)
|
443 |
-
has_more_results = exact_search and (
|
444 |
-
received_results + current_results < num_results
|
445 |
-
)
|
446 |
print("received_results", received_results)
|
447 |
print("current_results", current_results)
|
448 |
print("has_more_results", has_more_results)
|
@@ -451,9 +427,7 @@ if __name__ == "__main__":
|
|
451 |
highlight_terms,
|
452 |
num_results,
|
453 |
exact_search,
|
454 |
-
gr.update(visible=True)
|
455 |
-
if current_results > 0
|
456 |
-
else gr.update(visible=False),
|
457 |
gr.Dropdown.update(choices=datasets, value=datasets),
|
458 |
gr.update(visible=current_results >= k and has_more_results),
|
459 |
received_results + current_results,
|
|
|
10 |
hf_api = HfApi()
|
11 |
roots_datasets = {
|
12 |
dset.id.split("/")[-1]: dset
|
13 |
+
for dset in hf_api.list_datasets(author="bigscience-data", use_auth_token=os.environ.get("bigscience_data_token"))
|
|
|
|
|
14 |
}
|
15 |
|
16 |
|
|
|
62 |
for tag in PII_TAGS:
|
63 |
text = text.replace(
|
64 |
PII_PREFIX + tag,
|
65 |
+
"""<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(tag),
|
|
|
|
|
66 |
)
|
67 |
return text
|
68 |
|
|
|
129 |
return "<p>" + result_html + "</p>"
|
130 |
|
131 |
|
132 |
+
def format_result_page(language, results, highlight_terms, num_results, exact_search, datasets_filter=None) -> gr.HTML:
|
|
|
|
|
133 |
filtered_num_results = 0
|
134 |
header_html = ""
|
135 |
|
|
|
154 |
continue
|
155 |
results_for_lang_html = ""
|
156 |
for result in results_for_lang:
|
157 |
+
result_html = format_result(result, highlight_terms, exact_search, datasets_filter)
|
|
|
|
|
158 |
if result_html != "":
|
159 |
filtered_num_results += 1
|
160 |
results_for_lang_html += result_html
|
|
|
196 |
text = result["text"]
|
197 |
url = (
|
198 |
result["meta"]["url"]
|
199 |
+
if "meta" in result and result["meta"] is not None and "url" in result["meta"]
|
|
|
|
|
200 |
else None
|
201 |
)
|
202 |
docid = result["docid"]
|
|
|
234 |
post_data = {"query": query, "k": num_results, "received_results": received_results}
|
235 |
if language != "detect_language":
|
236 |
post_data["lang"] = language
|
237 |
+
address = os.environ.get("address_exact_search") if exact_search else os.environ.get("address")
|
|
|
|
|
|
|
|
|
238 |
output = requests.post(
|
239 |
address,
|
240 |
headers={"Content-type": "application/json"},
|
|
|
245 |
return payload
|
246 |
|
247 |
|
248 |
+
title = """<p style="text-align: center; font-size:28px"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>"""
|
|
|
|
|
249 |
description = """
|
250 |
+
# We're running maintenance works on the exact search index, so it may not work properly until the end of the day,
|
251 |
+
Monday 27th of March.
|
252 |
+
|
253 |
+
|
254 |
The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
|
255 |
of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). The ROOTS Search
|
256 |
Tool allows you to search through the ROOTS corpus. We serve a BM25 index for each language or group of languages
|
|
|
367 |
payload,
|
368 |
exact_search,
|
369 |
)
|
370 |
+
result_page = format_result_page(lang, processed_results, highlight_terms, num_results, exact_search)
|
|
|
|
|
371 |
return (
|
372 |
processed_results,
|
373 |
highlight_terms,
|
|
|
388 |
datasets,
|
389 |
) = run_query(query, lang, k, dropdown_input, 0)
|
390 |
has_more_results = exact_search and (num_results > k)
|
391 |
+
current_results = len(next(iter(processed_results.values()))) if len(processed_results) > 0 else 0
|
|
|
|
|
|
|
|
|
392 |
return [
|
393 |
processed_results,
|
394 |
highlight_terms,
|
395 |
num_results,
|
396 |
exact_search,
|
397 |
+
gr.update(visible=True) if current_results > 0 else gr.update(visible=False),
|
|
|
|
|
398 |
gr.Dropdown.update(choices=datasets, value=datasets),
|
399 |
gr.update(visible=has_more_results),
|
400 |
current_results,
|
|
|
417 |
result_page,
|
418 |
datasets,
|
419 |
) = run_query(query, lang, k, dropdown_input, received_results)
|
420 |
+
current_results = sum(len(results) for results in processed_results.values())
|
421 |
+
has_more_results = exact_search and (received_results + current_results < num_results)
|
|
|
|
|
|
|
|
|
422 |
print("received_results", received_results)
|
423 |
print("current_results", current_results)
|
424 |
print("has_more_results", has_more_results)
|
|
|
427 |
highlight_terms,
|
428 |
num_results,
|
429 |
exact_search,
|
430 |
+
gr.update(visible=True) if current_results > 0 else gr.update(visible=False),
|
|
|
|
|
431 |
gr.Dropdown.update(choices=datasets, value=datasets),
|
432 |
gr.update(visible=current_results >= k and has_more_results),
|
433 |
received_results + current_results,
|