Kevin Hu committed
Commit · 2d7e5db
1 Parent(s): db89829

Feat: Add question parameter to edit chunk modal (#3875)

### What problem does this PR solve?
Close #3873
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- api/apps/chunk_app.py +8 -3
- api/apps/sdk/doc.py +21 -3
- conf/infinity_mapping.json +2 -0
- rag/nlp/query.py +1 -0
- rag/nlp/search.py +7 -5
- rag/svr/task_executor.py +12 -10
- sdk/python/ragflow_sdk/modules/chunk.py +1 -0
- sdk/python/ragflow_sdk/modules/document.py +3 -3
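
The change that recurs through the diffs below is how the text to embed is chosen: when a chunk has questions attached, the newline-joined questions are embedded in place of the chunk body. A minimal sketch of that selection logic, with illustrative names rather than code from this PR:

```python
def pick_embedding_text(content_with_weight: str, question_kwd: list[str]) -> str:
    """Mirror of the ternary used in chunk_app.py and sdk/doc.py below:
    questions, when present, replace the chunk body as the embedding input."""
    return "\n".join(question_kwd) if question_kwd else content_with_weight
```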
api/apps/chunk_app.py
CHANGED
```diff
@@ -68,6 +68,7 @@ def list_chunk():
             "doc_id": sres.field[id]["doc_id"],
             "docnm_kwd": sres.field[id]["docnm_kwd"],
             "important_kwd": sres.field[id].get("important_kwd", []),
+            "question_kwd": sres.field[id].get("question_kwd", []),
             "image_id": sres.field[id].get("img_id", ""),
             "available_int": sres.field[id].get("available_int", 1),
             "positions": json.loads(sres.field[id].get("position_list", "[]")),
@@ -115,7 +116,7 @@ def get():
 @manager.route('/set', methods=['POST'])
 @login_required
 @validate_request("doc_id", "chunk_id", "content_with_weight",
-                  "important_kwd")
+                  "important_kwd", "question_kwd")
 def set():
     req = request.json
     d = {
@@ -125,6 +126,8 @@ def set():
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
     d["important_kwd"] = req["important_kwd"]
     d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"]))
+    d["question_kwd"] = req["question_kwd"]
+    d["question_tks"] = rag_tokenizer.tokenize("\n".join(req["question_kwd"]))
     if "available_int" in req:
         d["available_int"] = req["available_int"]
 
@@ -152,7 +155,7 @@ def set():
         d = beAdoc(d, arr[0], arr[1], not any(
             [rag_tokenizer.is_chinese(t) for t in q + a]))
 
-    v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
+    v, c = embd_mdl.encode([doc.name, req["content_with_weight"] if not d["question_kwd"] else "\n".join(d["question_kwd"])])
     v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
     d["q_%d_vec" % len(v)] = v.tolist()
     settings.docStoreConn.update({"id": req["chunk_id"]}, d, search.index_name(tenant_id), doc.kb_id)
@@ -213,6 +216,8 @@ def create():
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
     d["important_kwd"] = req.get("important_kwd", [])
     d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", [])))
+    d["question_kwd"] = req.get("question_kwd", [])
+    d["question_tks"] = rag_tokenizer.tokenize("\n".join(req.get("question_kwd", [])))
     d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
     d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
 
@@ -237,7 +242,7 @@ def create():
     embd_id = DocumentService.get_embd_id(req["doc_id"])
     embd_mdl = LLMBundle(tenant_id, LLMType.EMBEDDING.value, embd_id)
 
-    v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
+    v, c = embd_mdl.encode([doc.name, req["content_with_weight"] if not d["question_kwd"] else "\n".join(d["question_kwd"])])
     v = 0.1 * v[0] + 0.9 * v[1]
    d["q_%d_vec" % len(v)] = v.tolist()
     settings.docStoreConn.insert([d], search.index_name(tenant_id), doc.kb_id)
```
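
Note that `/set` now lists `question_kwd` in `@validate_request`, so web-API callers must always send it; an empty list preserves the old behavior. A hedged sketch of the updated request, assuming the blueprint is mounted under `/v1/chunk` and whatever auth header your deployment uses:

```python
import requests

# Hypothetical endpoint prefix and auth header; adjust to your deployment.
resp = requests.post(
    "http://localhost:9380/v1/chunk/set",
    headers={"Authorization": "Bearer <token>"},
    json={
        "doc_id": "<doc_id>",
        "chunk_id": "<chunk_id>",
        "content_with_weight": "Updated chunk text ...",
        "important_kwd": ["keyword"],
        "question_kwd": ["What does this chunk describe?"],  # new required field
    },
)
print(resp.json())
```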
api/apps/sdk/doc.py
CHANGED
```diff
@@ -844,6 +844,7 @@ def list_chunks(tenant_id, dataset_id, document_id):
             "doc_id": sres.field[id]["doc_id"],
             "docnm_kwd": sres.field[id]["docnm_kwd"],
             "important_kwd": sres.field[id].get("important_kwd", []),
+            "question_kwd": sres.field[id].get("question_kwd", []),
             "img_id": sres.field[id].get("img_id", ""),
             "available_int": sres.field[id].get("available_int", 1),
             "positions": sres.field[id].get("position_int", "").split("\t"),
@@ -879,6 +880,7 @@ def list_chunks(tenant_id, dataset_id, document_id):
         "content_with_weight": "content",
         "doc_id": "document_id",
         "important_kwd": "important_keywords",
+        "question_kwd": "questions",
         "img_id": "image_id",
         "available_int": "available",
     }
@@ -978,6 +980,11 @@ def add_chunk(tenant_id, dataset_id, document_id):
             return get_error_data_result(
                 "`important_keywords` is required to be a list"
             )
+    if "questions" in req:
+        if type(req["questions"]) != list:
+            return get_error_data_result(
+                "`questions` is required to be a list"
+            )
     md5 = hashlib.md5()
     md5.update((req["content"] + document_id).encode("utf-8"))
 
@@ -992,6 +999,10 @@ def add_chunk(tenant_id, dataset_id, document_id):
     d["important_tks"] = rag_tokenizer.tokenize(
         " ".join(req.get("important_keywords", []))
     )
+    d["question_kwd"] = req.get("questions", [])
+    d["question_tks"] = rag_tokenizer.tokenize(
+        "\n".join(req.get("questions", []))
+    )
     d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
     d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
     d["kb_id"] = dataset_id
@@ -1001,7 +1012,7 @@ def add_chunk(tenant_id, dataset_id, document_id):
     embd_mdl = TenantLLMService.model_instance(
         tenant_id, LLMType.EMBEDDING.value, embd_id
     )
-    v, c = embd_mdl.encode([doc.name, req["content"]])
+    v, c = embd_mdl.encode([doc.name, req["content"] if not d["question_kwd"] else "\n".join(d["question_kwd"])])
     v = 0.1 * v[0] + 0.9 * v[1]
     d["q_%d_vec" % len(v)] = v.tolist()
     settings.docStoreConn.insert([d], search.index_name(tenant_id), dataset_id)
@@ -1013,6 +1024,7 @@ def add_chunk(tenant_id, dataset_id, document_id):
         "content_with_weight": "content",
         "doc_id": "document_id",
         "important_kwd": "important_keywords",
+        "question_kwd": "questions",
         "kb_id": "dataset_id",
         "create_timestamp_flt": "create_timestamp",
         "create_time": "create_time",
@@ -1166,8 +1178,13 @@ def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
     if "important_keywords" in req:
         if not isinstance(req["important_keywords"], list):
             return get_error_data_result("`important_keywords` should be a list")
-        d["important_kwd"] = req.get("important_keywords")
+        d["important_kwd"] = req.get("important_keywords", [])
         d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
+    if "questions" in req:
+        if not isinstance(req["questions"], list):
+            return get_error_data_result("`questions` should be a list")
+        d["question_kwd"] = req.get("questions")
+        d["question_tks"] = rag_tokenizer.tokenize("\n".join(req["questions"]))
     if "available" in req:
         d["available_int"] = int(req["available"])
     embd_id = DocumentService.get_embd_id(document_id)
@@ -1185,7 +1202,7 @@ def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
             d, arr[0], arr[1], not any([rag_tokenizer.is_chinese(t) for t in q + a])
         )
 
-    v, c = embd_mdl.encode([doc.name, d["content_with_weight"]])
+    v, c = embd_mdl.encode([doc.name, d["content_with_weight"] if not d.get("question_kwd") else "\n".join(d["question_kwd"])])
     v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
     d["q_%d_vec" % len(v)] = v.tolist()
     settings.docStoreConn.update({"id": chunk_id}, d, search.index_name(tenant_id), dataset_id)
@@ -1353,6 +1370,7 @@ def retrieval_test(tenant_id):
         "content_with_weight": "content",
         "doc_id": "document_id",
         "important_kwd": "important_keywords",
+        "question_kwd": "questions",
         "docnm_kwd": "document_keyword",
     }
     rename_chunk = {}
```
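
At the documented REST layer the same field surfaces as `questions` (see the `question_kwd` → `questions` renames above), and it must be a list. A hedged example of adding a chunk with questions, assuming the dataset/document route shown in the SDK below is served under an `/api/v1` prefix with an API-key header:

```python
import requests

BASE = "http://localhost:9380/api/v1"  # assumed prefix
headers = {"Authorization": "Bearer <api_key>"}  # assumed auth scheme

resp = requests.post(
    f"{BASE}/datasets/<dataset_id>/documents/<document_id>/chunks",
    headers=headers,
    json={
        "content": "Newton's second law states F = m * a.",
        "important_keywords": ["force", "mass", "acceleration"],
        "questions": ["What is Newton's second law?"],  # rejected unless it is a list
    },
)
print(resp.json())
```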
conf/infinity_mapping.json
CHANGED
```diff
@@ -11,6 +11,8 @@
   "name_kwd": {"type": "varchar", "default": ""},
   "important_kwd": {"type": "varchar", "default": ""},
   "important_tks": {"type": "varchar", "default": ""},
+  "question_kwd": {"type": "varchar", "default": ""},
+  "question_tks": {"type": "varchar", "default": ""},
   "content_with_weight": {"type": "varchar", "default": ""},
   "content_ltks": {"type": "varchar", "default": ""},
   "content_sm_ltks": {"type": "varchar", "default": ""},
```
rag/nlp/query.py
CHANGED
```diff
@@ -31,6 +31,7 @@ class FulltextQueryer:
             "title_sm_tks^5",
             "important_kwd^30",
             "important_tks^20",
+            "question_tks^20",
             "content_ltks^2",
             "content_sm_ltks",
         ]
```
rag/nlp/search.py
CHANGED
```diff
@@ -74,7 +74,7 @@ class Dealer:
         offset, limit = pg * ps, (pg + 1) * ps
 
         src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd",
-                                 "doc_id", "position_list", "knowledge_graph_kwd",
+                                 "doc_id", "position_list", "knowledge_graph_kwd", "question_kwd", "question_tks",
                                  "available_int", "content_with_weight", "pagerank_fea"])
         kwds = set([])
 
@@ -251,8 +251,9 @@ class Dealer:
         for i in sres.ids:
             content_ltks = sres.field[i][cfield].split()
             title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
+            question_tks = [t for t in sres.field[i].get("question_tks", "").split() if t]
             important_kwd = sres.field[i].get("important_kwd", [])
-            tks = content_ltks + title_tks*2 + important_kwd*5
+            tks = content_ltks + title_tks*2 + important_kwd*5 + question_tks*6
             ins_tw.append(tks)
 
         sim, tksim, vtsim = self.qryr.hybrid_similarity(sres.query_vector,
@@ -322,11 +323,14 @@ class Dealer:
         sim = tsim = vsim = [1]*len(sres.ids)
         idx = list(range(len(sres.ids)))
 
+        def floor_sim(score):
+            return (int(score * 100.)%100)/100.
+
         dim = len(sres.query_vector)
         vector_column = f"q_{dim}_vec"
         zero_vector = [0.0] * dim
         for i in idx:
-            if sim[i] < similarity_threshold:
+            if floor_sim(sim[i]) < similarity_threshold:
                 break
             if len(ranks["chunks"]) >= page_size:
                 if aggs:
@@ -337,8 +341,6 @@ class Dealer:
             dnm = chunk["docnm_kwd"]
             did = chunk["doc_id"]
             position_list = chunk.get("position_list", "[]")
-            if not position_list:
-                position_list = "[]"
             d = {
                 "chunk_id": id,
                 "content_ltks": chunk["content_ltks"],
```
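
`floor_sim` keeps only the fractional part of a score (to two decimals) before comparing against `similarity_threshold`; presumably this stops whole-number boosts layered on top of the 0-1 similarity (e.g. pagerank) from bypassing the cut-off, though that rationale is inferred, not stated in the diff. A quick illustration of what it returns:

```python
def floor_sim(score):
    return (int(score * 100.) % 100) / 100.

print(floor_sim(0.75))  # 0.75 -> plain similarities pass through
print(floor_sim(1.5))   # 0.5  -> the integer part is stripped
print(floor_sim(2.25))  # 0.25
```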
rag/svr/task_executor.py
CHANGED
```diff
@@ -255,13 +255,8 @@ def build_chunks(task, progress_callback):
         progress_callback(msg="Start to generate questions for every chunk ...")
         chat_mdl = LLMBundle(task["tenant_id"], LLMType.CHAT, llm_name=task["llm_id"], lang=task["language"])
         for d in docs:
-
-            d["
-            qst = rag_tokenizer.tokenize(qst)
-            if "content_ltks" in d:
-                d["content_ltks"] += " " + qst
-            if "content_sm_ltks" in d:
-                d["content_sm_ltks"] += " " + rag_tokenizer.fine_grained_tokenize(qst)
+            d["question_kwd"] = question_proposal(chat_mdl, d["content_with_weight"], task["parser_config"]["auto_questions"]).split("\n")
+            d["question_tks"] = rag_tokenizer.tokenize("\n".join(d["question_kwd"]))
         progress_callback(msg="Question generation completed in {:.2f}s".format(timer() - st))
 
         return docs
@@ -275,9 +270,16 @@ def init_kb(row, vector_size: int):
 def embedding(docs, mdl, parser_config=None, callback=None):
     if parser_config is None:
         parser_config = {}
-    batch_size =
-    tts, cnts = [
-
+    batch_size = 16
+    tts, cnts = [], []
+    for d in docs:
+        tts.append(rmSpace(d["title_tks"]))
+        c = "\n".join(d.get("question_kwd", []))
+        if not c:
+            c = d["content_with_weight"]
+        c = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", c)
+        cnts.append(c)
+
     tk_count = 0
     if len(tts) == len(cnts):
         tts_ = np.array([])
```
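
Auto-generated questions now land in `question_kwd`/`question_tks` instead of being appended to the content tokens, and `embedding()` prefers them (newline-joined) over `content_with_weight` when building the texts to encode. Generation is still driven by the task's parser config; a hedged fragment, since the exact shape of `parser_config` is not shown in this diff:

```python
# Assumption: "auto_questions" is the per-chunk question count passed to
# question_proposal() in build_chunks(); the returned text is split on
# newlines into d["question_kwd"] and tokenized into d["question_tks"].
parser_config = {"auto_questions": 3}
```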
sdk/python/ragflow_sdk/modules/chunk.py
CHANGED
```diff
@@ -6,6 +6,7 @@ class Chunk(Base):
         self.id = ""
         self.content = ""
         self.important_keywords = []
+        self.questions = []
         self.create_time = ""
         self.create_timestamp = 0.0
         self.dataset_id = None
```
sdk/python/ragflow_sdk/modules/document.py
CHANGED
```diff
@@ -61,9 +61,9 @@ class Document(Base):
             return chunks
         raise Exception(res.get("message"))
 
-    def add_chunk(self, content: str, important_keywords: list[str] = []):
-        res = self.post(f'/datasets/{self.dataset_id}/documents/{self.id}/chunks',
-                        {"content":content,"important_keywords":important_keywords})
+    def add_chunk(self, content: str, important_keywords: list[str] = [], questions: list[str] = []):
+        res = self.post(f'/datasets/{self.dataset_id}/documents/{self.id}/chunks',
+                        {"content":content,"important_keywords":important_keywords, "questions": questions})
         res = res.json()
         if res.get("code") == 0:
             return Chunk(self.rag,res["data"].get("chunk"))
```
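
For SDK users the new `questions` argument is optional and defaults to an empty list, and the returned `Chunk` now carries a `questions` attribute (see chunk.py above). A short usage sketch, assuming a reachable RAGFlow server and placeholder identifiers:

```python
from ragflow_sdk import RAGFlow

# Placeholder credentials and names; the dataset/document lookup calls are
# assumed from the existing SDK surface, not added by this PR.
rag = RAGFlow(api_key="<api_key>", base_url="http://localhost:9380")
dataset = rag.list_datasets(name="my_dataset")[0]
doc = dataset.list_documents(id="<document_id>")[0]

chunk = doc.add_chunk(
    content="The mitochondria is the powerhouse of the cell.",
    important_keywords=["mitochondria"],
    questions=["What is the powerhouse of the cell?"],
)
print(chunk.questions)
```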