Kevin Hu committed on
Commit
1a2e406
·
1 Parent(s): d3d83ec

Refine synonym query. (#3855)

Browse files

### What problem does this PR solve?

### Type of change

- [x] Performance Improvement

Files changed (2) hide show
  1. conf/mapping.json +9 -1
  2. rag/nlp/query.py +16 -9
conf/mapping.json CHANGED
@@ -140,13 +140,21 @@
140
  }
141
  },
142
  {
143
- "string": {
144
  "match": "*_fea",
145
  "mapping": {
146
  "type": "rank_feature"
147
  }
148
  }
149
  },
 
 
 
 
 
 
 
 
150
  {
151
  "dense_vector": {
152
  "match": "*_512_vec",
 
140
  }
141
  },
142
  {
143
+ "rank_feature": {
144
  "match": "*_fea",
145
  "mapping": {
146
  "type": "rank_feature"
147
  }
148
  }
149
  },
150
+ {
151
+ "rank_features": {
152
+ "match": "*_feas",
153
+ "mapping": {
154
+ "type": "rank_features"
155
+ }
156
+ }
157
+ },
158
  {
159
  "dense_vector": {
160
  "match": "*_512_vec",
rag/nlp/query.py CHANGED
@@ -120,7 +120,7 @@ class FulltextQueryer:
120
  keywords.append(tt)
121
  twts = self.tw.weights([tt])
122
  syns = self.syn.lookup(tt)
123
- if syns: keywords.extend(syns)
124
  logging.debug(json.dumps(twts, ensure_ascii=False))
125
  tms = []
126
  for tk, w in sorted(twts, key=lambda x: x[1] * -1):
@@ -140,17 +140,24 @@ class FulltextQueryer:
140
  sm = [FulltextQueryer.subSpecialChar(m) for m in sm if len(m) > 1]
141
  sm = [m for m in sm if len(m) > 1]
142
 
143
- keywords.append(re.sub(r"[ \\\"']+", "", tk))
144
- keywords.extend(sm)
145
- if len(keywords) >= 12:
146
- break
147
 
148
  tk_syns = self.syn.lookup(tk)
 
 
 
 
 
 
 
 
149
  tk = FulltextQueryer.subSpecialChar(tk)
150
  if tk.find(" ") > 0:
151
  tk = '"%s"' % tk
152
  if tk_syns:
153
- tk = f"({tk} %s)" % " ".join(tk_syns)
154
  if sm:
155
  tk = f'{tk} OR "%s" OR ("%s"~2)^0.5' % (" ".join(sm), " ".join(sm))
156
  if tk.strip():
@@ -159,14 +166,14 @@ class FulltextQueryer:
159
  tms = " ".join([f"({t})^{w}" for t, w in tms])
160
 
161
  if len(twts) > 1:
162
- tms += ' ("%s"~4)^1.5' % (" ".join([t for t, _ in twts]))
163
  if re.match(r"[0-9a-z ]+$", tt):
164
  tms = f'("{tt}" OR "%s")' % rag_tokenizer.tokenize(tt)
165
 
166
  syns = " OR ".join(
167
  [
168
- '"%s"^0.7'
169
- % FulltextQueryer.subSpecialChar(rag_tokenizer.tokenize(s))
170
  for s in syns
171
  ]
172
  )
 
120
  keywords.append(tt)
121
  twts = self.tw.weights([tt])
122
  syns = self.syn.lookup(tt)
123
+ if syns and len(keywords) < 32: keywords.extend(syns)
124
  logging.debug(json.dumps(twts, ensure_ascii=False))
125
  tms = []
126
  for tk, w in sorted(twts, key=lambda x: x[1] * -1):
 
140
  sm = [FulltextQueryer.subSpecialChar(m) for m in sm if len(m) > 1]
141
  sm = [m for m in sm if len(m) > 1]
142
 
143
+ if len(keywords) < 32:
144
+ keywords.append(re.sub(r"[ \\\"']+", "", tk))
145
+ keywords.extend(sm)
 
146
 
147
  tk_syns = self.syn.lookup(tk)
148
+ tk_syns = [FulltextQueryer.subSpecialChar(s) for s in tk_syns]
149
+ if len(keywords) < 32: keywords.extend([s for s in tk_syns if s])
150
+ tk_syns = [rag_tokenizer.fine_grained_tokenize(s) for s in tk_syns if s]
151
+ tk_syns = [f"\"{s}\"" if s.find(" ")>0 else s for s in tk_syns]
152
+
153
+ if len(keywords) >= 32:
154
+ break
155
+
156
  tk = FulltextQueryer.subSpecialChar(tk)
157
  if tk.find(" ") > 0:
158
  tk = '"%s"' % tk
159
  if tk_syns:
160
+ tk = f"({tk} OR (%s)^0.2)" % " ".join(tk_syns)
161
  if sm:
162
  tk = f'{tk} OR "%s" OR ("%s"~2)^0.5' % (" ".join(sm), " ".join(sm))
163
  if tk.strip():
 
166
  tms = " ".join([f"({t})^{w}" for t, w in tms])
167
 
168
  if len(twts) > 1:
169
+ tms += ' ("%s"~2)^1.5' % rag_tokenizer.tokenize(tt)
170
  if re.match(r"[0-9a-z ]+$", tt):
171
  tms = f'("{tt}" OR "%s")' % rag_tokenizer.tokenize(tt)
172
 
173
  syns = " OR ".join(
174
  [
175
+ '"%s"'
176
+ % rag_tokenizer.tokenize(FulltextQueryer.subSpecialChar(s))
177
  for s in syns
178
  ]
179
  )