Kevin Hu committed on
Commit
0f25ebd
·
1 Parent(s): 916b3cc

Make language detection more robust (#3287)

Browse files

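### What problem does this PR solve?

The `isChinese` language check now runs on the text before `EsQueryer.rmWWW` is applied. `rmWWW` is instead called inside the non-Chinese branch and again just before the `self.tw.split(txt)` loop that follows.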


### Type of change

- [x] Performance Improvement

Files changed (1) hide show
  1. rag/nlp/query.py +2 -1
rag/nlp/query.py CHANGED
@@ -63,9 +63,9 @@ class EsQueryer:
63
  rag_tokenizer.tradi2simp(
64
  rag_tokenizer.strQ2B(
65
  txt.lower()))).strip()
66
- txt = EsQueryer.rmWWW(txt)
67
 
68
  if not self.isChinese(txt):
 
69
  tks = rag_tokenizer.tokenize(txt).split(" ")
70
  tks_w = self.tw.weights(tks)
71
  tks_w = [(re.sub(r"[ \\\"'^]", "", tk), w) for tk, w in tks_w]
@@ -89,6 +89,7 @@ class EsQueryer:
89
  return False
90
  return True
91
 
 
92
  qs, keywords = [], []
93
  for tt in self.tw.split(txt)[:256]: # .split(" "):
94
  if not tt:
 
63
  rag_tokenizer.tradi2simp(
64
  rag_tokenizer.strQ2B(
65
  txt.lower()))).strip()
 
66
 
67
  if not self.isChinese(txt):
68
+ txt = EsQueryer.rmWWW(txt)
69
  tks = rag_tokenizer.tokenize(txt).split(" ")
70
  tks_w = self.tw.weights(tks)
71
  tks_w = [(re.sub(r"[ \\\"'^]", "", tk), w) for tk, w in tks_w]
 
89
  return False
90
  return True
91
 
92
+ txt = EsQueryer.rmWWW(txt)
93
  qs, keywords = [], []
94
  for tt in self.tw.split(txt)[:256]: # .split(" "):
95
  if not tt: