Spaces:
Runtime error
Runtime error
readd exact
Browse files
app.py
CHANGED
@@ -71,6 +71,23 @@ def extract_lang_from_docid(docid):
|
|
71 |
return docid.split("_")[1]
|
72 |
|
73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
def format_result(result, highlight_terms, exact_search, datasets_filter=None):
|
75 |
text, url, docid = result
|
76 |
if datasets_filter is not None:
|
@@ -80,11 +97,17 @@ def format_result(result, highlight_terms, exact_search, datasets_filter=None):
|
|
80 |
return ""
|
81 |
|
82 |
if exact_search:
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
tokens_html
|
87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
else:
|
89 |
tokens = text.split()
|
90 |
tokens_html = []
|
@@ -247,8 +270,6 @@ def request_payload(query, language, exact_search, num_results=10, received_resu
|
|
247 |
|
248 |
title = """<p style="text-align: center; font-size:28px"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>"""
|
249 |
description = """
|
250 |
-
## We're running maintenance works on the exact search index, so it may not work properly until the end of the day, Thursday 30th of March.
|
251 |
-
|
252 |
|
253 |
The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
|
254 |
of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). The ROOTS Search
|
|
|
71 |
return docid.split("_")[1]
|
72 |
|
73 |
|
74 |
+
# Built once at import time: in exact-search mode format_result calls
# normalize() once per token, so these invariants must not be rebuilt per call.
_ARTICLES_RE = re.compile(r"\b(a|an|the)\b")
_STRIP_PUNCTUATION = str.maketrans("", "", string.punctuation)


def normalize(document):
    """Return a canonical form of *document* for exact-match comparison.

    Steps, applied in this order:

    1. lowercase the text;
    2. delete every character found in ``string.punctuation`` (ASCII only);
    3. replace the standalone words "a", "an" and "the" with a space;
    4. collapse whitespace runs to single spaces and trim both ends.

    Args:
        document: The string to normalize.

    Returns:
        The normalized string. It is empty when the input held nothing but
        articles, punctuation and whitespace.
    """
    lowered = document.lower()
    without_punctuation = lowered.translate(_STRIP_PUNCTUATION)
    # Articles are replaced by a space; the split/join below then absorbs
    # the extra whitespace this leaves behind.
    without_articles = _ARTICLES_RE.sub(" ", without_punctuation)
    return " ".join(without_articles.split())
|
89 |
+
|
90 |
+
|
91 |
def format_result(result, highlight_terms, exact_search, datasets_filter=None):
|
92 |
text, url, docid = result
|
93 |
if datasets_filter is not None:
|
|
|
97 |
return ""
|
98 |
|
99 |
if exact_search:
|
100 |
+
highlight_terms = normalize(highlight_terms).split()
|
101 |
+
print("highlight_terms", highlight_terms)
|
102 |
+
tokens = text.split()
|
103 |
+
tokens_html = []
|
104 |
+
for token in tokens:
|
105 |
+
norm_token = normalize(token)
|
106 |
+
if norm_token in highlight_terms:
|
107 |
+
tokens_html.append("<b>{}</b>".format(token))
|
108 |
+
else:
|
109 |
+
tokens_html.append(token)
|
110 |
+
tokens_html = " ".join(tokens_html)
|
111 |
else:
|
112 |
tokens = text.split()
|
113 |
tokens_html = []
|
|
|
270 |
|
271 |
title = """<p style="text-align: center; font-size:28px"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>"""
|
272 |
description = """
|
|
|
|
|
273 |
|
274 |
The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
|
275 |
of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). The ROOTS Search
|