Kevin Hu committed · Commit 44731b3 · 1 Parent(s): 8efa7c5

add auto keywords and auto-question (#2965)
### What problem does this PR solve?
#2687
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- api/apps/api_app.py +1 -2
- api/apps/chunk_app.py +2 -1
- api/apps/sdk/chat.py +2 -3
- api/apps/sdk/dify_retrieval.py +20 -5
- api/apps/sdk/doc.py +23 -38
- api/db/services/dialog_service.py +53 -1
- rag/nlp/__init__.py +0 -11
- rag/svr/task_executor.py +18 -0
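
The feature is driven by two per-dataset `parser_config` flags that `rag/svr/task_executor.py` reads while building chunks (see that diff below). A minimal sketch of the expected shape, assuming a plain Python dict; the key names come from this PR, the values and surrounding code are illustrative only:

```python
# Illustrative only: key names are from this PR; the values are assumptions.
parser_config = {
    "auto_keywords": 5,    # > 0 enables keyword_extraction; the value is passed as top-N
    "auto_questions": 3,   # > 0 enables question_proposal for every chunk
}

# Inside build(), the executor checks these per chunk, e.g.:
#   if row["parser_config"].get("auto_keywords", 0): ...
#   if row["parser_config"].get("auto_questions", 0): ...
```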
api/apps/api_app.py CHANGED

@@ -25,7 +25,7 @@ from api.db import FileType, LLMType, ParserType, FileSource
 from api.db.db_models import APIToken, Task, File
 from api.db.services import duplicate_name
 from api.db.services.api_service import APITokenService, API4ConversationService
-from api.db.services.dialog_service import DialogService, chat
+from api.db.services.dialog_service import DialogService, chat, keyword_extraction
 from api.db.services.document_service import DocumentService, doc_upload_and_parse
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
@@ -38,7 +38,6 @@ from api.utils.api_utils import server_error_response, get_data_error_result, ge
     generate_confirmation_token
 
 from api.utils.file_utils import filename_type, thumbnail
-from rag.nlp import keyword_extraction
 from rag.utils.storage_factory import STORAGE_IMPL
 
 from api.db.services.canvas_service import UserCanvasService
api/apps/chunk_app.py CHANGED

@@ -21,8 +21,9 @@ from flask import request
 from flask_login import login_required, current_user
 from elasticsearch_dsl import Q
 
+from api.db.services.dialog_service import keyword_extraction
 from rag.app.qa import rmPrefix, beAdoc
-from rag.nlp import search, rag_tokenizer
+from rag.nlp import search, rag_tokenizer
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils import rmSpace
 from api.db import LLMType, ParserType
api/apps/sdk/chat.py CHANGED

@@ -16,16 +16,15 @@
 from flask import request
 
 from api.db import StatusEnum
-from api.db.db_models import TenantLLM
 from api.db.services.dialog_service import DialogService
 from api.db.services.knowledgebase_service import KnowledgebaseService
-from api.db.services.llm_service import
+from api.db.services.llm_service import TenantLLMService
 from api.db.services.user_service import TenantService
-from api.settings import RetCode
 from api.utils import get_uuid
 from api.utils.api_utils import get_error_data_result, token_required
 from api.utils.api_utils import get_result
 
+
 @manager.route('/chat', methods=['POST'])
 @token_required
 def create(tenant_id):
api/apps/sdk/dify_retrieval.py CHANGED

@@ -1,10 +1,25 @@
+#
+#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 from flask import request, jsonify
 
-from db import LLMType, ParserType
-from db.services.knowledgebase_service import KnowledgebaseService
-from db.services.llm_service import LLMBundle
-from settings import retrievaler, kg_retrievaler, RetCode
-from utils.api_utils import validate_request, build_error_result, apikey_required
+from api.db import LLMType, ParserType
+from api.db.services.knowledgebase_service import KnowledgebaseService
+from api.db.services.llm_service import LLMBundle
+from api.settings import retrievaler, kg_retrievaler, RetCode
+from api.utils.api_utils import validate_request, build_error_result, apikey_required
 
 
 @manager.route('/dify/retrieval', methods=['POST'])
api/apps/sdk/doc.py CHANGED

@@ -1,48 +1,37 @@
+#
+#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 import pathlib
-import re
 import datetime
-import json
-import traceback
-
-from botocore.docs.method import document_model_driven_method
-from flask import request
-from flask_login import login_required, current_user
-from elasticsearch_dsl import Q
-from pygments import highlight
-from sphinx.addnodes import document
 
+from api.db.services.dialog_service import keyword_extraction
 from rag.app.qa import rmPrefix, beAdoc
-from rag.nlp import
-from rag.utils.es_conn import ELASTICSEARCH
-from rag.utils import rmSpace
+from rag.nlp import rag_tokenizer
 from api.db import LLMType, ParserType
-from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import TenantLLMService
-from api.
-from api.utils.api_utils import server_error_response, get_error_data_result, validate_request
-from api.db.services.document_service import DocumentService
-from api.settings import RetCode, retrievaler, kg_retrievaler
-from api.utils.api_utils import get_result
+from api.settings import kg_retrievaler
 import hashlib
 import re
-from api.utils.api_utils import
-
-from api.db.db_models import Task, File
-
+from api.utils.api_utils import token_required
+from api.db.db_models import Task
 from api.db.services.task_service import TaskService, queue_tasks
-from api.
-
-from api.utils.api_utils import server_error_response, get_error_data_result, validate_request
-
-from api.utils.api_utils import get_result, get_result, get_error_data_result
-
-from functools import partial
+from api.utils.api_utils import server_error_response
+from api.utils.api_utils import get_result, get_error_data_result
 from io import BytesIO
-
 from elasticsearch_dsl import Q
 from flask import request, send_file
-from flask_login import login_required
-
 from api.db import FileSource, TaskStatus, FileType
 from api.db.db_models import File
 from api.db.services.document_service import DocumentService
@@ -50,8 +39,7 @@ from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.settings import RetCode, retrievaler
-from api.utils.api_utils import construct_json_result
-from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio, email
+from api.utils.api_utils import construct_json_result
 from rag.nlp import search
 from rag.utils import rmSpace
 from rag.utils.es_conn import ELASTICSEARCH
@@ -365,7 +353,6 @@ def list_chunks(tenant_id,dataset_id,document_id):
     return get_result(data=res)
 
 
-
 @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['POST'])
 @token_required
 def create(tenant_id,dataset_id,document_id):
@@ -454,7 +441,6 @@ def rm_chunk(tenant_id,dataset_id,document_id):
     return get_result()
 
 
-
 @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk/<chunk_id>', methods=['PUT'])
 @token_required
 def update_chunk(tenant_id,dataset_id,document_id,chunk_id):
@@ -512,7 +498,6 @@ def update_chunk(tenant_id,dataset_id,document_id,chunk_id):
     return get_result()
 
 
-
 @manager.route('/retrieval', methods=['POST'])
 @token_required
 def retrieval_test(tenant_id):
api/db/services/dialog_service.py CHANGED

@@ -28,7 +28,6 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import LLMService, TenantLLMService, LLMBundle
 from api.settings import chat_logger, retrievaler, kg_retrievaler
 from rag.app.resume import forbidden_select_fields4resume
-from rag.nlp import keyword_extraction
 from rag.nlp.search import index_name
 from rag.utils import rmSpace, num_tokens_from_string, encoder
 from api.utils.file_utils import get_project_base_directory
@@ -80,6 +79,7 @@ class ConversationService(CommonService):
 
         return list(sessions.dicts())
 
+
 def message_fit_in(msg, max_length=4000):
     def count():
         nonlocal msg
@@ -456,6 +456,58 @@ def rewrite(tenant_id, llm_id, question):
     return ans
 
 
+def keyword_extraction(chat_mdl, content, topn=3):
+    prompt = f"""
+Role: You're a text analyzer.
+Task: extract the most important keywords/phrases of a given piece of text content.
+Requirements:
+  - Summarize the text content, and give top {topn} important keywords/phrases.
+  - The keywords MUST be in language of the given piece of text content.
+  - The keywords are delimited by ENGLISH COMMA.
+  - Keywords ONLY in output.
+
+### Text Content
+{content}
+
+"""
+    msg = [
+        {"role": "system", "content": prompt},
+        {"role": "user", "content": "Output: "}
+    ]
+    _, msg = message_fit_in(msg, chat_mdl.max_length)
+    kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.2})
+    if isinstance(kwd, tuple): kwd = kwd[0]
+    if kwd.find("**ERROR**") >= 0: return ""
+    return kwd
+
+
+def question_proposal(chat_mdl, content, topn=3):
+    prompt = f"""
+Role: You're a text analyzer.
+Task: propose {topn} questions about a given piece of text content.
+Requirements:
+  - Understand and summarize the text content, and propose top {topn} important questions.
+  - The questions SHOULD NOT have overlapping meanings.
+  - The questions SHOULD cover the main content of the text as much as possible.
+  - The questions MUST be in language of the given piece of text content.
+  - One question per line.
+  - Question ONLY in output.
+
+### Text Content
+{content}
+
+"""
+    msg = [
+        {"role": "system", "content": prompt},
+        {"role": "user", "content": "Output: "}
+    ]
+    _, msg = message_fit_in(msg, chat_mdl.max_length)
+    kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.2})
+    if isinstance(kwd, tuple): kwd = kwd[0]
+    if kwd.find("**ERROR**") >= 0: return ""
+    return kwd
+
+
 def full_question(tenant_id, llm_id, messages):
     if llm_id2llm_type(llm_id) == "image2text":
         chat_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, llm_id)
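
For reference, a hypothetical usage sketch of the two new helpers. It assumes an `LLMBundle` chat model constructed as elsewhere in this PR; the tenant id, model name, and sample text are placeholders:

```python
from api.db import LLMType
from api.db.services.dialog_service import keyword_extraction, question_proposal
from api.db.services.llm_service import LLMBundle

# Placeholders: substitute a real tenant id and chat model id.
chat_mdl = LLMBundle("tenant-id", LLMType.CHAT, llm_name="chat-model-id")
text = "RAGFlow splits documents into chunks and indexes them for retrieval."

keywords = keyword_extraction(chat_mdl, text, topn=3)    # comma-separated keywords, "" on model error
questions = question_proposal(chat_mdl, text, topn=3)    # one question per line, "" on model error
```

Both helpers reuse `message_fit_in` to trim the prompt to the model's context window and return an empty string when the model reply contains `**ERROR**`.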
rag/nlp/__init__.py CHANGED

@@ -570,14 +570,3 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
 
     return cks, images
 
-
-def keyword_extraction(chat_mdl, content):
-    prompt = """
-You're a question analyzer.
-1. Please give me the most important keyword/phrase of this question.
-Answer format: (in language of user's question)
-- keyword:
-"""
-    kwd = chat_mdl.chat(prompt, [{"role": "user", "content": content}], {"temperature": 0.2})
-    if isinstance(kwd, tuple): return kwd[0]
-    return kwd
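
With the old single-keyword prompt removed from `rag.nlp`, the call sites touched above switch to the richer helper added in `api/db/services/dialog_service.py`. A brief sketch of the migration (`chat_mdl` is a placeholder chat model bundle):

```python
# Old helper (removed here): keyword_extraction(chat_mdl, content)
# New helper: accepts a top-N count and returns a comma-separated keyword string.
from api.db.services.dialog_service import keyword_extraction

kwd = keyword_extraction(chat_mdl, "some chunk text", topn=3)
```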
rag/svr/task_executor.py CHANGED

@@ -34,6 +34,7 @@ import pandas as pd
 from elasticsearch_dsl import Q
 
 from api.db import LLMType, ParserType
+from api.db.services.dialog_service import keyword_extraction, question_proposal
 from api.db.services.document_service import DocumentService
 from api.db.services.llm_service import LLMBundle
 from api.db.services.task_service import TaskService
@@ -198,6 +199,23 @@ def build(row):
         d["_id"] = md5.hexdigest()
         d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
         d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
+
+        if row["parser_config"].get("auto_keywords", 0):
+            chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
+            d["important_kwd"] = keyword_extraction(chat_mdl, ck["content_with_weight"],
+                                                    row["parser_config"]["auto_keywords"]).split(",")
+            d["important_tks"] = rag_tokenizer.tokenize(" ".join(d["important_kwd"]))
+
+        if row["parser_config"].get("auto_questions", 0):
+            chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
+            qst = question_proposal(chat_mdl, ck["content_with_weight"], row["parser_config"]["auto_keywords"])
+            ck["content_with_weight"] = f"Question: \n{qst}\n\nAnswer:\n" + ck["content_with_weight"]
+            qst = rag_tokenizer.tokenize(qst)
+            if "content_ltks" in ck:
+                ck["content_ltks"] += " " + qst
+            if "content_sm_ltks" in ck:
+                ck["content_sm_ltks"] += " " + rag_tokenizer.fine_grained_tokenize(qst)
+
         if not d.get("image"):
             docs.append(d)
             continue
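
Putting it together, a chunk processed with both flags enabled ends up with fields roughly like the following. This is an illustrative sketch only: the field names come from the diff above, the values are invented.

```python
# Illustrative result of the enrichment step in build(); values are made up.
enriched_doc = {
    "important_kwd": ["retrieval", "chunking", "indexing"],   # keyword_extraction(...).split(",")
    "important_tks": "retrieval chunking indexing",           # rag_tokenizer.tokenize of the keywords
}
enriched_chunk = {
    # question_proposal output is prepended so the questions are embedded and indexed with the chunk:
    "content_with_weight": "Question: \nWhat does the executor index?\n\nAnswer:\n<original chunk text>",
    "content_ltks": "<original tokens> <tokenized questions>",
}
```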