Spaces:

retopara
/

ragflow

Build error

Kevin Hu committed on Nov 5, 2024

Commit

75f6aef

1 Parent(s): 8ce7a30

accelerate term weight calculation (#3206)

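### What problem does this PR solve?

Adds a `preprocess` parameter (default `True`) to `Dealer.weights` in `rag/nlp/term_weight.py`. When `preprocess=False`, the tokens are weighted in a single batch instead of being passed one by one through `pretoken`/`tokenMerge`; the `EsQueryer` call sites in `rag/nlp/query.py` now pass `preprocess=False`. It also removes the `"Minimax"` entry from `ChatModel`, leaving `"MiniMax"`.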

### Type of change

- [x] Performance Improvement

Files changed (3) hide show

rag/llm/__init__.py CHANGED Viewed

@@ -83,7 +83,6 @@ ChatModel = {
     "VolcEngine": VolcEngineChat,
     "BaiChuan": BaiChuanChat,
     "MiniMax": MiniMaxChat,
-    "Minimax": MiniMaxChat,
     "Mistral": MistralChat,
     "Gemini": GeminiChat,
     "Bedrock": BedrockChat,

     "VolcEngine": VolcEngineChat,
     "BaiChuan": BaiChuanChat,
     "MiniMax": MiniMaxChat,
     "Mistral": MistralChat,
     "Gemini": GeminiChat,
     "Bedrock": BedrockChat,

rag/nlp/query.py CHANGED Viewed

@@ -165,7 +165,7 @@ class EsQueryer:
             d = {}
             if isinstance(tks, str):
                 tks = tks.split(" ")
-            for t, c in self.tw.weights(tks):
                 if t not in d:
                     d[t] = 0
                 d[t] += c
@@ -177,9 +177,9 @@ class EsQueryer:
     def similarity(self, qtwt, dtwt):
         if isinstance(dtwt, type("")):
-            dtwt = {t: w for t, w in self.tw.weights(self.tw.split(dtwt))}
         if isinstance(qtwt, type("")):
-            qtwt = {t: w for t, w in self.tw.weights(self.tw.split(qtwt))}
         s = 1e-9
         for k, v in qtwt.items():
             if k in dtwt:

             d = {}
             if isinstance(tks, str):
                 tks = tks.split(" ")
+            for t, c in self.tw.weights(tks, preprocess=False):
                 if t not in d:
                     d[t] = 0
                 d[t] += c
     def similarity(self, qtwt, dtwt):
         if isinstance(dtwt, type("")):
+            dtwt = {t: w for t, w in self.tw.weights(self.tw.split(dtwt), preprocess=False)}
         if isinstance(qtwt, type("")):
+            qtwt = {t: w for t, w in self.tw.weights(self.tw.split(qtwt), preprocess=False)}
         s = 1e-9
         for k, v in qtwt.items():
             if k in dtwt:

rag/nlp/term_weight.py CHANGED Viewed

@@ -1,4 +1,4 @@
-#
 #  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
@@ -158,7 +158,7 @@ class Dealer:
                 tks.append(t)
         return tks
-    def weights(self, tks):
         def skill(t):
             if t not in self.sk:
                 return 1
@@ -222,14 +222,20 @@ class Dealer:
         def idf(s, N): return math.log10(10 + ((N - s + 0.5) / (s + 0.5)))
         tw = []
-        for tk in tks:
-            tt = self.tokenMerge(self.pretoken(tk, True))
-            idf1 = np.array([idf(freq(t), 10000000) for t in tt])
-            idf2 = np.array([idf(df(t), 1000000000) for t in tt])
             wts = (0.3 * idf1 + 0.7 * idf2) * \
-                np.array([ner(t) * postag(t) for t in tt])
-            tw.extend(zip(tt, wts))
         S = np.sum([s for _, s in tw])
         return [(t, s / S) for t, s in tw]

+    #
 #  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
                 tks.append(t)
         return tks
+    def weights(self, tks, preprocess=True):
         def skill(t):
             if t not in self.sk:
                 return 1
         def idf(s, N): return math.log10(10 + ((N - s + 0.5) / (s + 0.5)))
         tw = []
+        if not preprocess:
+            idf1 = np.array([idf(freq(t), 10000000) for t in tks])
+            idf2 = np.array([idf(df(t), 1000000000) for t in tks])
             wts = (0.3 * idf1 + 0.7 * idf2) * \
+                np.array([ner(t) * postag(t) for t in tks])
+            tw = zip(tks, wts)
+        else:
+            for tk in tks:
+                tt = self.tokenMerge(self.pretoken(tk, True))
+                idf1 = np.array([idf(freq(t), 10000000) for t in tt])
+                idf2 = np.array([idf(df(t), 1000000000) for t in tt])
+                wts = (0.3 * idf1 + 0.7 * idf2) * \
+                    np.array([ner(t) * postag(t) for t in tt])
+                tw.extend(zip(tt, wts))
         S = np.sum([s for _, s in tw])
         return [(t, s / S) for t, s in tw]