Kevin Hu
committed on
Commit
·
6a44b6e
1
Parent(s):
d696cd8
fix uploading docx for mind map (#2064)
Browse files
### What problem does this PR solve?
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
api/db/services/document_service.py
CHANGED
|
@@ -17,6 +17,8 @@ import hashlib
|
|
| 17 |
import json
|
| 18 |
import os
|
| 19 |
import random
|
|
|
|
|
|
|
| 20 |
from concurrent.futures import ThreadPoolExecutor
|
| 21 |
from copy import deepcopy
|
| 22 |
from datetime import datetime
|
|
@@ -33,7 +35,7 @@ from graphrag.mind_map_extractor import MindMapExtractor
|
|
| 33 |
from rag.settings import SVR_QUEUE_NAME
|
| 34 |
from rag.utils.es_conn import ELASTICSEARCH
|
| 35 |
from rag.utils.minio_conn import MINIO
|
| 36 |
-
from rag.nlp import search
|
| 37 |
|
| 38 |
from api.db import FileType, TaskStatus, ParserType, LLMType
|
| 39 |
from api.db.db_models import DB, Knowledgebase, Tenant, Task
|
|
@@ -432,6 +434,9 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
|
|
| 432 |
parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": False}
|
| 433 |
exe = ThreadPoolExecutor(max_workers=12)
|
| 434 |
threads = []
|
|
|
|
|
|
|
|
|
|
| 435 |
for d, blob in files:
|
| 436 |
kwargs = {
|
| 437 |
"callback": dummy,
|
|
@@ -504,6 +509,9 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
|
|
| 504 |
"id": get_uuid(),
|
| 505 |
"doc_id": doc_id,
|
| 506 |
"kb_id": [kb.id],
|
|
|
|
|
|
|
|
|
|
| 507 |
"content_with_weight": mind_map,
|
| 508 |
"knowledge_graph_kwd": "mind_map"
|
| 509 |
})
|
|
|
|
| 17 |
import json
|
| 18 |
import os
|
| 19 |
import random
|
| 20 |
+
import re
|
| 21 |
+
import traceback
|
| 22 |
from concurrent.futures import ThreadPoolExecutor
|
| 23 |
from copy import deepcopy
|
| 24 |
from datetime import datetime
|
|
|
|
| 35 |
from rag.settings import SVR_QUEUE_NAME
|
| 36 |
from rag.utils.es_conn import ELASTICSEARCH
|
| 37 |
from rag.utils.minio_conn import MINIO
|
| 38 |
+
from rag.nlp import search, rag_tokenizer
|
| 39 |
|
| 40 |
from api.db import FileType, TaskStatus, ParserType, LLMType
|
| 41 |
from api.db.db_models import DB, Knowledgebase, Tenant, Task
|
|
|
|
| 434 |
parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": False}
|
| 435 |
exe = ThreadPoolExecutor(max_workers=12)
|
| 436 |
threads = []
|
| 437 |
+
doc_nm = {}
|
| 438 |
+
for d, blob in files:
|
| 439 |
+
doc_nm[d["id"]] = d["name"]
|
| 440 |
for d, blob in files:
|
| 441 |
kwargs = {
|
| 442 |
"callback": dummy,
|
|
|
|
| 509 |
"id": get_uuid(),
|
| 510 |
"doc_id": doc_id,
|
| 511 |
"kb_id": [kb.id],
|
| 512 |
+
"docnm_kwd": doc_nm[doc_id],
|
| 513 |
+
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc_nm[doc_id])),
|
| 514 |
+
"content_ltks": "",
|
| 515 |
"content_with_weight": mind_map,
|
| 516 |
"knowledge_graph_kwd": "mind_map"
|
| 517 |
})
|