ola13 committed on
Commit
58c2d21
·
1 Parent(s): 98c2f5d

readd exact

Browse files
Files changed (1) hide show
  1. app.py +28 -7
app.py CHANGED
@@ -71,6 +71,23 @@ def extract_lang_from_docid(docid):
71
  return docid.split("_")[1]
72
 
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  def format_result(result, highlight_terms, exact_search, datasets_filter=None):
75
  text, url, docid = result
76
  if datasets_filter is not None:
@@ -80,11 +97,17 @@ def format_result(result, highlight_terms, exact_search, datasets_filter=None):
80
  return ""
81
 
82
  if exact_search:
83
- query_start = text.find(highlight_terms)
84
- query_end = query_start + len(highlight_terms)
85
- tokens_html = text[0:query_start]
86
- tokens_html += "<b>{}</b>".format(text[query_start:query_end])
87
- tokens_html += text[query_end:]
 
 
 
 
 
 
88
  else:
89
  tokens = text.split()
90
  tokens_html = []
@@ -247,8 +270,6 @@ def request_payload(query, language, exact_search, num_results=10, received_resu
247
 
248
  title = """<p style="text-align: center; font-size:28px"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>"""
249
  description = """
250
- ## We're running maintenance works on the exact search index, so it may not work properly until the end of the day, Thursday 30th of March.
251
-
252
 
253
  The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
254
  of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). The ROOTS Search
 
71
  return docid.split("_")[1]
72
 
73
 
74
+ def normalize(document):
75
+ def remove_articles(text):
76
+ return re.sub(r"\b(a|an|the)\b", " ", text)
77
+
78
+ def white_space_fix(text):
79
+ return " ".join(text.split())
80
+
81
+ def remove_punc(text):
82
+ exclude = set(string.punctuation)
83
+ return "".join(ch for ch in text if ch not in exclude)
84
+
85
+ def lower(text):
86
+ return text.lower()
87
+
88
+ return white_space_fix(remove_articles(remove_punc(lower(document))))
89
+
90
+
91
  def format_result(result, highlight_terms, exact_search, datasets_filter=None):
92
  text, url, docid = result
93
  if datasets_filter is not None:
 
97
  return ""
98
 
99
  if exact_search:
100
+ highlight_terms = normalize(highlight_terms).split()
101
+ print("highlight_terms", highlight_terms)
102
+ tokens = text.split()
103
+ tokens_html = []
104
+ for token in tokens:
105
+ norm_token = normalize(token)
106
+ if norm_token in highlight_terms:
107
+ tokens_html.append("<b>{}</b>".format(token))
108
+ else:
109
+ tokens_html.append(token)
110
+ tokens_html = " ".join(tokens_html)
111
  else:
112
  tokens = text.split()
113
  tokens_html = []
 
270
 
271
  title = """<p style="text-align: center; font-size:28px"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>"""
272
  description = """
 
 
273
 
274
  The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
275
  of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). The ROOTS Search