Kevin Hu
committed on
Commit
·
75f6aef
1
Parent(s):
8ce7a30
accelerate term weight calculation (#3206)
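Browse files
### What problem does this PR solve?
`Dealer.weights` always ran pre-tokenization and token merging on every input token, even when the caller had already split the text into tokens. This PR adds a `preprocess` parameter (default `True`) to `Dealer.weights`, and the callers in `rag/nlp/query.py` now pass `preprocess=False` to skip that step. It also removes the `"Minimax"` entry from `ChatModel`, leaving `"MiniMax"`.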
### Type of change
- [x] Performance Improvement
- rag/llm/__init__.py +0 -1
- rag/nlp/query.py +3 -3
- rag/nlp/term_weight.py +15 -9
rag/llm/__init__.py
CHANGED
|
@@ -83,7 +83,6 @@ ChatModel = {
|
|
| 83 |
"VolcEngine": VolcEngineChat,
|
| 84 |
"BaiChuan": BaiChuanChat,
|
| 85 |
"MiniMax": MiniMaxChat,
|
| 86 |
-
"Minimax": MiniMaxChat,
|
| 87 |
"Mistral": MistralChat,
|
| 88 |
"Gemini": GeminiChat,
|
| 89 |
"Bedrock": BedrockChat,
|
|
|
|
| 83 |
"VolcEngine": VolcEngineChat,
|
| 84 |
"BaiChuan": BaiChuanChat,
|
| 85 |
"MiniMax": MiniMaxChat,
|
|
|
|
| 86 |
"Mistral": MistralChat,
|
| 87 |
"Gemini": GeminiChat,
|
| 88 |
"Bedrock": BedrockChat,
|
rag/nlp/query.py
CHANGED
|
@@ -165,7 +165,7 @@ class EsQueryer:
|
|
| 165 |
d = {}
|
| 166 |
if isinstance(tks, str):
|
| 167 |
tks = tks.split(" ")
|
| 168 |
-
for t, c in self.tw.weights(tks):
|
| 169 |
if t not in d:
|
| 170 |
d[t] = 0
|
| 171 |
d[t] += c
|
|
@@ -177,9 +177,9 @@ class EsQueryer:
|
|
| 177 |
|
| 178 |
def similarity(self, qtwt, dtwt):
|
| 179 |
if isinstance(dtwt, type("")):
|
| 180 |
-
dtwt = {t: w for t, w in self.tw.weights(self.tw.split(dtwt))}
|
| 181 |
if isinstance(qtwt, type("")):
|
| 182 |
-
qtwt = {t: w for t, w in self.tw.weights(self.tw.split(qtwt))}
|
| 183 |
s = 1e-9
|
| 184 |
for k, v in qtwt.items():
|
| 185 |
if k in dtwt:
|
|
|
|
| 165 |
d = {}
|
| 166 |
if isinstance(tks, str):
|
| 167 |
tks = tks.split(" ")
|
| 168 |
+
for t, c in self.tw.weights(tks, preprocess=False):
|
| 169 |
if t not in d:
|
| 170 |
d[t] = 0
|
| 171 |
d[t] += c
|
|
|
|
| 177 |
|
| 178 |
def similarity(self, qtwt, dtwt):
|
| 179 |
if isinstance(dtwt, type("")):
|
| 180 |
+
dtwt = {t: w for t, w in self.tw.weights(self.tw.split(dtwt), preprocess=False)}
|
| 181 |
if isinstance(qtwt, type("")):
|
| 182 |
+
qtwt = {t: w for t, w in self.tw.weights(self.tw.split(qtwt), preprocess=False)}
|
| 183 |
s = 1e-9
|
| 184 |
for k, v in qtwt.items():
|
| 185 |
if k in dtwt:
|
rag/nlp/term_weight.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
#
|
| 2 |
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
| 3 |
#
|
| 4 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
@@ -158,7 +158,7 @@ class Dealer:
|
|
| 158 |
tks.append(t)
|
| 159 |
return tks
|
| 160 |
|
| 161 |
-
def weights(self, tks):
|
| 162 |
def skill(t):
|
| 163 |
if t not in self.sk:
|
| 164 |
return 1
|
|
@@ -222,14 +222,20 @@ class Dealer:
|
|
| 222 |
def idf(s, N): return math.log10(10 + ((N - s + 0.5) / (s + 0.5)))
|
| 223 |
|
| 224 |
tw = []
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
idf2 = np.array([idf(df(t), 1000000000) for t in tt])
|
| 229 |
wts = (0.3 * idf1 + 0.7 * idf2) * \
|
| 230 |
-
np.array([ner(t) * postag(t) for t in
|
| 231 |
-
|
| 232 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
|
| 234 |
S = np.sum([s for _, s in tw])
|
| 235 |
return [(t, s / S) for t, s in tw]
|
|
|
|
| 1 |
+
#
|
| 2 |
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
| 3 |
#
|
| 4 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
| 158 |
tks.append(t)
|
| 159 |
return tks
|
| 160 |
|
| 161 |
+
def weights(self, tks, preprocess=True):
|
| 162 |
def skill(t):
|
| 163 |
if t not in self.sk:
|
| 164 |
return 1
|
|
|
|
| 222 |
def idf(s, N): return math.log10(10 + ((N - s + 0.5) / (s + 0.5)))
|
| 223 |
|
| 224 |
tw = []
|
| 225 |
+
if not preprocess:
|
| 226 |
+
idf1 = np.array([idf(freq(t), 10000000) for t in tks])
|
| 227 |
+
idf2 = np.array([idf(df(t), 1000000000) for t in tks])
|
|
|
|
| 228 |
wts = (0.3 * idf1 + 0.7 * idf2) * \
|
| 229 |
+
np.array([ner(t) * postag(t) for t in tks])
|
| 230 |
+
tw = zip(tks, wts)
|
| 231 |
+
else:
|
| 232 |
+
for tk in tks:
|
| 233 |
+
tt = self.tokenMerge(self.pretoken(tk, True))
|
| 234 |
+
idf1 = np.array([idf(freq(t), 10000000) for t in tt])
|
| 235 |
+
idf2 = np.array([idf(df(t), 1000000000) for t in tt])
|
| 236 |
+
wts = (0.3 * idf1 + 0.7 * idf2) * \
|
| 237 |
+
np.array([ner(t) * postag(t) for t in tt])
|
| 238 |
+
tw.extend(zip(tt, wts))
|
| 239 |
|
| 240 |
S = np.sum([s for _, s in tw])
|
| 241 |
return [(t, s / S) for t, s in tw]
|