Kevin Hu
committed on
Commit
·
0f25ebd
1
Parent(s):
916b3cc
Make language judgement more robust (#3287)
Browse files

### What problem does this PR solve?
### Type of change
- [x] Performance Improvement
- rag/nlp/query.py +2 -1
rag/nlp/query.py
CHANGED
@@ -63,9 +63,9 @@ class EsQueryer:
|
|
63 |
rag_tokenizer.tradi2simp(
|
64 |
rag_tokenizer.strQ2B(
|
65 |
txt.lower()))).strip()
|
66 |
-
txt = EsQueryer.rmWWW(txt)
|
67 |
|
68 |
if not self.isChinese(txt):
|
|
|
69 |
tks = rag_tokenizer.tokenize(txt).split(" ")
|
70 |
tks_w = self.tw.weights(tks)
|
71 |
tks_w = [(re.sub(r"[ \\\"'^]", "", tk), w) for tk, w in tks_w]
|
@@ -89,6 +89,7 @@ class EsQueryer:
|
|
89 |
return False
|
90 |
return True
|
91 |
|
|
|
92 |
qs, keywords = [], []
|
93 |
for tt in self.tw.split(txt)[:256]: # .split(" "):
|
94 |
if not tt:
|
|
|
63 |
rag_tokenizer.tradi2simp(
|
64 |
rag_tokenizer.strQ2B(
|
65 |
txt.lower()))).strip()
|
|
|
66 |
|
67 |
if not self.isChinese(txt):
|
68 |
+
txt = EsQueryer.rmWWW(txt)
|
69 |
tks = rag_tokenizer.tokenize(txt).split(" ")
|
70 |
tks_w = self.tw.weights(tks)
|
71 |
tks_w = [(re.sub(r"[ \\\"'^]", "", tk), w) for tk, w in tks_w]
|
|
|
89 |
return False
|
90 |
return True
|
91 |
|
92 |
+
txt = EsQueryer.rmWWW(txt)
|
93 |
qs, keywords = [], []
|
94 |
for tt in self.tw.split(txt)[:256]: # .split(" "):
|
95 |
if not tt:
|