Spaces:

bigscience-data
/

roots-search

Runtime error

App Files Community

ola13 committed on Mar 31, 2023

Commit

58c2d21

1 Parent(s): 98c2f5d

readd exact

Browse files

Files changed (1) hide show

app.py +28 -7

app.py CHANGED Viewed

@@ -71,6 +71,23 @@ def extract_lang_from_docid(docid):
     return docid.split("_")[1]
 def format_result(result, highlight_terms, exact_search, datasets_filter=None):
     text, url, docid = result
     if datasets_filter is not None:
@@ -80,11 +97,17 @@ def format_result(result, highlight_terms, exact_search, datasets_filter=None):
             return ""
     if exact_search:
-        query_start = text.find(highlight_terms)
-        query_end = query_start + len(highlight_terms)
-        tokens_html = text[0:query_start]
-        tokens_html += "<b>{}</b>".format(text[query_start:query_end])
-        tokens_html += text[query_end:]
     else:
         tokens = text.split()
         tokens_html = []
@@ -247,8 +270,6 @@ def request_payload(query, language, exact_search, num_results=10, received_resu
 title = """<p style="text-align: center; font-size:28px"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>"""
 description = """
-## We're running maintenance works on the exact search index, so it may not work properly until the end of the day, Thursday 30th of March.
 The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
 of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). The ROOTS Search

     return docid.split("_")[1]
+def normalize(document):
+    def remove_articles(text):
+        return re.sub(r"\b(a|an|the)\b", " ", text)
+    def white_space_fix(text):
+        return " ".join(text.split())
+    def remove_punc(text):
+        exclude = set(string.punctuation)
+        return "".join(ch for ch in text if ch not in exclude)
+    def lower(text):
+        return text.lower()
+    return white_space_fix(remove_articles(remove_punc(lower(document))))
 def format_result(result, highlight_terms, exact_search, datasets_filter=None):
     text, url, docid = result
     if datasets_filter is not None:
             return ""
     if exact_search:
+        highlight_terms = normalize(highlight_terms).split()
+        print("highlight_terms", highlight_terms)
+        tokens = text.split()
+        tokens_html = []
+        for token in tokens:
+            norm_token = normalize(token)
+            if norm_token in highlight_terms:
+                tokens_html.append("<b>{}</b>".format(token))
+            else:
+                tokens_html.append(token)
+        tokens_html = " ".join(tokens_html)
     else:
         tokens = text.split()
         tokens_html = []
 title = """<p style="text-align: center; font-size:28px"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>"""
 description = """
 The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
 of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). The ROOTS Search