Supports obtaining PDF documents from web pages (#1107)
Browse files
### What problem does this PR solve?
Knowledge base management supports crawling information from web pages
and generating PDF documents
### Type of change
- [x] New Feature (Support document from web pages)
- api/apps/document_app.py +68 -1
- api/utils/web_utils.py +82 -0
- requirements.txt +3 -1
- web/src/hooks/documentHooks.ts +32 -10
- web/src/locales/en.ts +1 -0
- web/src/locales/zh-traditional.ts +1 -0
- web/src/locales/zh.ts +1 -0
- web/src/pages/add-knowledge/components/knowledge-file/document-toolbar.tsx +19 -4
- web/src/pages/add-knowledge/components/knowledge-file/hooks.ts +32 -0
- web/src/pages/add-knowledge/components/knowledge-file/index.tsx +16 -1
- web/src/pages/add-knowledge/components/knowledge-file/model.ts +21 -0
- web/src/pages/add-knowledge/components/knowledge-file/web-crawl-modal.tsx +54 -0
- web/src/services/kbService.ts +5 -0
- web/src/utils/api.ts +1 -0
api/apps/document_app.py
CHANGED
|
@@ -39,6 +39,7 @@ from api.settings import RetCode
|
|
| 39 |
from api.utils.api_utils import get_json_result
|
| 40 |
from rag.utils.minio_conn import MINIO
|
| 41 |
from api.utils.file_utils import filename_type, thumbnail
|
|
|
|
| 42 |
|
| 43 |
|
| 44 |
@manager.route('/upload', methods=['POST'])
|
|
@@ -289,7 +290,7 @@ def run():
|
|
| 289 |
return get_data_error_result(retmsg="Tenant not found!")
|
| 290 |
ELASTICSEARCH.deleteByQuery(
|
| 291 |
Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
|
| 292 |
-
|
| 293 |
if str(req["run"]) == TaskStatus.RUNNING.value:
|
| 294 |
TaskService.filter_delete([Task.doc_id == id])
|
| 295 |
e, doc = DocumentService.get_by_id(id)
|
|
@@ -416,3 +417,69 @@ def get_image(image_id):
|
|
| 416 |
return response
|
| 417 |
except Exception as e:
|
| 418 |
return server_error_response(e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
from api.utils.api_utils import get_json_result
|
| 40 |
from rag.utils.minio_conn import MINIO
|
| 41 |
from api.utils.file_utils import filename_type, thumbnail
|
| 42 |
+
from api.utils.web_utils import html2pdf, is_valid_url
|
| 43 |
|
| 44 |
|
| 45 |
@manager.route('/upload', methods=['POST'])
|
|
|
|
| 290 |
return get_data_error_result(retmsg="Tenant not found!")
|
| 291 |
ELASTICSEARCH.deleteByQuery(
|
| 292 |
Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
|
| 293 |
+
|
| 294 |
if str(req["run"]) == TaskStatus.RUNNING.value:
|
| 295 |
TaskService.filter_delete([Task.doc_id == id])
|
| 296 |
e, doc = DocumentService.get_by_id(id)
|
|
|
|
| 417 |
return response
|
| 418 |
except Exception as e:
|
| 419 |
return server_error_response(e)
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
@manager.route('/web_crawl', methods=['POST'])
@login_required
def web_crawl():
    """Render a web page to PDF and register it as a knowledge-base document.

    Reads the form fields ``kb_id``, ``name`` and ``url`` from the request.
    The page at ``url`` is converted with ``html2pdf``, the resulting blob is
    stored through ``MINIO`` and a document row plus a file entry are created.

    Returns:
        A JSON result with ``data=True`` on success. Missing or invalid
        arguments yield ``RetCode.ARGUMENT_ERROR``; any failure while
        rendering or storing yields ``RetCode.SERVER_ERROR`` with the error
        text in ``retmsg``.

    Raises:
        LookupError: if the knowledge base cannot be found (raised before the
            guarded section, so it is not converted to a JSON result here).
    """
    kb_id = request.form.get("kb_id")
    if not kb_id:
        return get_json_result(
            data=False, retmsg='Lack of "KB ID"', retcode=RetCode.ARGUMENT_ERROR)
    name = request.form.get("name")
    url = request.form.get("url")
    if not name:
        return get_json_result(
            data=False, retmsg='Lack of "name"', retcode=RetCode.ARGUMENT_ERROR)
    if not url:
        return get_json_result(
            data=False, retmsg='Lack of "url"', retcode=RetCode.ARGUMENT_ERROR)
    if not is_valid_url(url):
        return get_json_result(
            data=False, retmsg='The URL format is invalid', retcode=RetCode.ARGUMENT_ERROR)
    # Security: the URL is user input and is opened by a browser running on
    # the server. is_valid_url also accepts file:// and ftp://, which would
    # let a caller render local server files into a PDF, so only web schemes
    # are allowed here.
    if not url.lower().startswith(("http://", "https://")):
        return get_json_result(
            data=False, retmsg='Only http and https URLs are supported',
            retcode=RetCode.ARGUMENT_ERROR)
    e, kb = KnowledgebaseService.get_by_id(kb_id)
    if not e:
        raise LookupError("Can't find this knowledgebase!")

    root_folder = FileService.get_root_folder(current_user.id)
    pf_id = root_folder["id"]
    FileService.init_knowledgebase_docs(pf_id, current_user.id)
    kb_root_folder = FileService.get_kb_folder(current_user.id)
    kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])

    try:
        filename = duplicate_name(
            DocumentService.query,
            name=name + ".pdf",
            kb_id=kb.id)
        filetype = filename_type(filename)
        if filetype == FileType.OTHER.value:
            raise RuntimeError("This type of file has not been supported yet!")

        # Append "_" until the storage key is unused in this bucket.
        location = filename
        while MINIO.obj_exist(kb_id, location):
            location += "_"
        blob = html2pdf(url)
        MINIO.put(kb_id, location, blob)
        doc = {
            "id": get_uuid(),
            "kb_id": kb.id,
            "parser_id": kb.parser_id,
            "parser_config": kb.parser_config,
            "created_by": current_user.id,
            "type": filetype,
            "name": filename,
            "location": location,
            "size": len(blob),
            "thumbnail": thumbnail(filename, blob)
        }
        # Compare against the enum's value, consistent with the
        # FileType.OTHER.value check above (filetype is compared to .value).
        if doc["type"] == FileType.VISUAL.value:
            doc["parser_id"] = ParserType.PICTURE.value
        if re.search(r"\.(ppt|pptx|pages)$", filename):
            doc["parser_id"] = ParserType.PRESENTATION.value
        DocumentService.insert(doc)
        FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
    except Exception as e:
        # Pass the message text, not the exception object, so the payload
        # stays JSON-serialisable.
        return get_json_result(
            data=False, retmsg=str(e), retcode=RetCode.SERVER_ERROR)
    return get_json_result(data=True)
|
api/utils/web_utils.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import json
|
| 3 |
+
import base64
|
| 4 |
+
|
| 5 |
+
from selenium import webdriver
|
| 6 |
+
from selenium.webdriver.chrome.options import Options
|
| 7 |
+
from selenium.webdriver.chrome.service import Service
|
| 8 |
+
from selenium.common.exceptions import TimeoutException
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support.expected_conditions import staleness_of
|
| 11 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 12 |
+
from selenium.webdriver.common.by import By
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def html2pdf(
    source: str,
    timeout: int = 2,
    install_driver: bool = True,
    print_options: dict = None,
):
    """Convert the page at ``source`` to a PDF via ``__get_pdf_from_html``.

    Args:
        source: URL handed to the browser.
        timeout: seconds passed on as the page wait timeout.
        install_driver: when true, the driver is obtained through
            ``ChromeDriverManager`` by the underlying helper.
        print_options: extra options merged over the default print options;
            ``None`` (the default) means no overrides.

    Returns:
        Whatever ``__get_pdf_from_html`` returns for these arguments.
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    options = {} if print_options is None else print_options
    return __get_pdf_from_html(source, timeout, install_driver, options)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def __send_devtools(driver, cmd, params=None):
    """Send a DevTools command through the driver's command executor.

    Posts ``{"cmd": cmd, "params": params}`` to the session's
    ``chromium/send_command_and_get_result`` resource and returns the
    ``"value"`` entry of the response.

    Args:
        driver: object exposing ``session_id`` and a ``command_executor`` with
            ``_url`` and ``_request`` (private attributes of the executor).
        cmd: DevTools command name, e.g. ``"Page.printToPDF"``.
        params: command parameters; ``None`` is sent as an empty mapping.

    Raises:
        RuntimeError: if the executor returns an empty or missing response.
    """
    if params is None:
        params = {}
    resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
    url = driver.command_executor._url + resource
    body = json.dumps({"cmd": cmd, "params": params})
    response = driver.command_executor._request("POST", url, body)

    if not response:
        # A falsy response has no usable "value"; report the command instead
        # of dereferencing it.
        raise RuntimeError("Empty response for DevTools command %r" % cmd)

    return response.get("value")
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def __get_pdf_from_html(
    path: str,
    timeout: int,
    install_driver: bool,
    print_options: dict
):
    """Open ``path`` in headless Chrome and return the printed PDF as bytes.

    Args:
        path: URL passed to ``driver.get``.
        timeout: seconds to wait for the initial ``<html>`` element to go
            stale before printing.
        install_driver: when true, the driver binary is resolved through
            ``ChromeDriverManager().install()``.
        print_options: overrides merged over the default ``Page.printToPDF``
            options; ``None`` is treated as no overrides.

    Returns:
        The base64-decoded ``"data"`` field of the ``Page.printToPDF`` result.
    """
    webdriver_options = Options()
    webdriver_options.add_argument("--headless")
    webdriver_options.add_argument("--disable-gpu")
    webdriver_options.add_argument("--no-sandbox")
    webdriver_options.add_argument("--disable-dev-shm-usage")
    # NOTE(review): "images": 2 presumably disables image loading - confirm
    # against the Chrome preference documentation.
    webdriver_options.experimental_options["prefs"] = {
        "profile.default_content_settings": {"images": 2},
    }

    if install_driver:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=webdriver_options)
    else:
        driver = webdriver.Chrome(options=webdriver_options)

    try:
        driver.get(path)

        try:
            WebDriverWait(driver, timeout).until(
                staleness_of(driver.find_element(by=By.TAG_NAME, value="html"))
            )
        except TimeoutException:
            # The root element did not go stale within the timeout; proceed
            # to print the page as it is.
            pass

        # Print on both outcomes of the wait. Previously the PDF was produced
        # only in the timeout handler, so a page whose root element did go
        # stale returned None.
        calculated_print_options = {
            "landscape": False,
            "displayHeaderFooter": False,
            "printBackground": True,
            "preferCSSPageSize": True,
        }
        calculated_print_options.update(print_options or {})
        result = __send_devtools(
            driver, "Page.printToPDF", calculated_print_options)
        return base64.b64decode(result["data"])
    finally:
        # Always shut the browser down, including when loading or printing
        # raises, so no Chrome process is left behind.
        driver.quit()
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def is_valid_url(url: str) -> bool:
    """Report whether ``url`` starts with something shaped like a URL.

    The check is a prefix match (``re.match``): an ``http``, ``https``,
    ``ftp`` or ``file`` scheme followed by ``://`` and at least two
    characters from the allowed set. Text after the matched prefix is not
    inspected.
    """
    url_prefix = r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]"
    found = re.match(url_prefix, url)
    return found is not None
|
| 81 |
+
|
| 82 |
+
|
requirements.txt
CHANGED
|
@@ -138,4 +138,6 @@ umap-learn
|
|
| 138 |
fasttext==0.9.2
|
| 139 |
volcengine==1.0.141
|
| 140 |
readability-lxml==0.8.1
|
| 141 |
-
html_text==0.6.2
|
|
|
|
|
|
|
|
|
| 138 |
fasttext==0.9.2
|
| 139 |
volcengine==1.0.141
|
| 140 |
readability-lxml==0.8.1
|
| 141 |
+
html_text==0.6.2
|
| 142 |
+
selenium==4.21.0
|
| 143 |
+
webdriver-manager==4.0.1
|
web/src/hooks/documentHooks.ts
CHANGED
|
@@ -1,13 +1,13 @@
|
|
| 1 |
-
import {
|
| 2 |
-
import {
|
| 3 |
-
import {
|
| 4 |
-
import {
|
| 5 |
-
import {
|
| 6 |
-
import {
|
| 7 |
-
import {
|
| 8 |
-
import {
|
| 9 |
-
import {
|
| 10 |
-
import {
|
| 11 |
|
| 12 |
export const useGetDocumentUrl = (documentId?: string) => {
|
| 13 |
const getDocumentUrl = useCallback(
|
|
@@ -207,6 +207,28 @@ export const useUploadDocument = () => {
|
|
| 207 |
return uploadDocument;
|
| 208 |
};
|
| 209 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
export const useRunDocument = () => {
|
| 211 |
const dispatch = useDispatch();
|
| 212 |
|
|
|
|
| 1 |
+
import {IChunk, IKnowledgeFile} from '@/interfaces/database/knowledge';
|
| 2 |
+
import {IChangeParserConfigRequestBody} from '@/interfaces/request/document';
|
| 3 |
+
import {api_host} from '@/utils/api';
|
| 4 |
+
import {buildChunkHighlights} from '@/utils/documentUtils';
|
| 5 |
+
import {UploadFile} from 'antd';
|
| 6 |
+
import {useCallback, useMemo, useState} from 'react';
|
| 7 |
+
import {IHighlight} from 'react-pdf-highlighter';
|
| 8 |
+
import {useDispatch, useSelector} from 'umi';
|
| 9 |
+
import {useGetKnowledgeSearchParams} from './routeHook';
|
| 10 |
+
import {useOneNamespaceEffectsLoading} from './storeHooks';
|
| 11 |
|
| 12 |
export const useGetDocumentUrl = (documentId?: string) => {
|
| 13 |
const getDocumentUrl = useCallback(
|
|
|
|
| 207 |
return uploadDocument;
|
| 208 |
};
|
| 209 |
|
| 210 |
+
/**
 * Returns a callback that dispatches the `kFModel/web_crawl` effect for the
 * knowledge base taken from the current route's search params.
 *
 * The callback takes the document name and the page URL and returns whatever
 * `dispatch` returns for that action.
 */
export const useWebCrawl = () => {
  const dispatch = useDispatch();
  const { knowledgeId } = useGetKnowledgeSearchParams();
  return useCallback(
    (name: string, url: string) => {
      try {
        return dispatch<any>({
          type: 'kFModel/web_crawl',
          payload: {
            name,
            url,
            kb_id: knowledgeId,
          },
        });
      } catch (errorInfo) {
        console.log('Failed:', errorInfo);
      }
    },
    // knowledgeId is read inside the callback; listing it keeps the callback
    // from sending a stale kb_id after the route's knowledge base changes.
    [dispatch, knowledgeId],
  );
};
|
| 231 |
+
|
| 232 |
export const useRunDocument = () => {
|
| 233 |
const dispatch = useDispatch();
|
| 234 |
|
web/src/locales/en.ts
CHANGED
|
@@ -81,6 +81,7 @@ export default {
|
|
| 81 |
searchFiles: 'Search your files',
|
| 82 |
localFiles: 'Local files',
|
| 83 |
emptyFiles: 'Create empty file',
|
|
|
|
| 84 |
chunkNumber: 'Chunk Number',
|
| 85 |
uploadDate: 'Upload Date',
|
| 86 |
chunkMethod: 'Chunk Method',
|
|
|
|
| 81 |
searchFiles: 'Search your files',
|
| 82 |
localFiles: 'Local files',
|
| 83 |
emptyFiles: 'Create empty file',
|
| 84 |
+
webCrawl: 'Web Crawl',
|
| 85 |
chunkNumber: 'Chunk Number',
|
| 86 |
uploadDate: 'Upload Date',
|
| 87 |
chunkMethod: 'Chunk Method',
|
web/src/locales/zh-traditional.ts
CHANGED
|
@@ -80,6 +80,7 @@ export default {
|
|
| 80 |
searchFiles: '搜索文件',
|
| 81 |
localFiles: '本地文件',
|
| 82 |
emptyFiles: '新建空文件',
|
|
|
|
| 83 |
chunkNumber: '分塊數',
|
| 84 |
uploadDate: '上傳日期',
|
| 85 |
chunkMethod: '解析方法',
|
|
|
|
| 80 |
searchFiles: '搜索文件',
|
| 81 |
localFiles: '本地文件',
|
| 82 |
emptyFiles: '新建空文件',
|
| 83 |
+
webCrawl: '網頁抓取',
|
| 84 |
chunkNumber: '分塊數',
|
| 85 |
uploadDate: '上傳日期',
|
| 86 |
chunkMethod: '解析方法',
|
web/src/locales/zh.ts
CHANGED
|
@@ -80,6 +80,7 @@ export default {
|
|
| 80 |
searchFiles: '搜索文件',
|
| 81 |
localFiles: '本地文件',
|
| 82 |
emptyFiles: '新建空文件',
|
|
|
|
| 83 |
chunkNumber: '分块数',
|
| 84 |
uploadDate: '上传日期',
|
| 85 |
chunkMethod: '解析方法',
|
|
|
|
| 80 |
searchFiles: '搜索文件',
|
| 81 |
localFiles: '本地文件',
|
| 82 |
emptyFiles: '新建空文件',
|
| 83 |
+
webCrawl: '网页抓取',
|
| 84 |
chunkNumber: '分块数',
|
| 85 |
uploadDate: '上传日期',
|
| 86 |
chunkMethod: '解析方法',
|
web/src/pages/add-knowledge/components/knowledge-file/document-toolbar.tsx
CHANGED
|
@@ -29,13 +29,15 @@ import styles from './index.less';
|
|
| 29 |
interface IProps {
|
| 30 |
selectedRowKeys: string[];
|
| 31 |
showCreateModal(): void;
|
|
|
|
| 32 |
showDocumentUploadModal(): void;
|
| 33 |
}
|
| 34 |
|
| 35 |
const DocumentToolbar = ({
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
|
|
|
| 39 |
}: IProps) => {
|
| 40 |
const { t } = useTranslate('knowledgeDetails');
|
| 41 |
const { fetchDocumentList } = useFetchDocumentListOnMount();
|
|
@@ -66,6 +68,19 @@ const DocumentToolbar = ({
|
|
| 66 |
{ type: 'divider' },
|
| 67 |
{
|
| 68 |
key: '2',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
onClick: showCreateModal,
|
| 70 |
label: (
|
| 71 |
<div>
|
|
@@ -77,7 +92,7 @@ const DocumentToolbar = ({
|
|
| 77 |
),
|
| 78 |
},
|
| 79 |
];
|
| 80 |
-
}, [showDocumentUploadModal, showCreateModal, t]);
|
| 81 |
|
| 82 |
const handleDelete = useCallback(() => {
|
| 83 |
showDeleteConfirm({
|
|
|
|
| 29 |
interface IProps {
|
| 30 |
selectedRowKeys: string[];
|
| 31 |
showCreateModal(): void;
|
| 32 |
+
showWebCrawlModal(): void;
|
| 33 |
showDocumentUploadModal(): void;
|
| 34 |
}
|
| 35 |
|
| 36 |
const DocumentToolbar = ({
|
| 37 |
+
selectedRowKeys,
|
| 38 |
+
showCreateModal,
|
| 39 |
+
showWebCrawlModal,
|
| 40 |
+
showDocumentUploadModal,
|
| 41 |
}: IProps) => {
|
| 42 |
const { t } = useTranslate('knowledgeDetails');
|
| 43 |
const { fetchDocumentList } = useFetchDocumentListOnMount();
|
|
|
|
| 68 |
{ type: 'divider' },
|
| 69 |
{
|
| 70 |
key: '2',
|
| 71 |
+
onClick: showWebCrawlModal,
|
| 72 |
+
label: (
|
| 73 |
+
<div>
|
| 74 |
+
<Button type="link">
|
| 75 |
+
<FileTextOutlined />
|
| 76 |
+
{t('webCrawl')}
|
| 77 |
+
</Button>
|
| 78 |
+
</div>
|
| 79 |
+
),
|
| 80 |
+
},
|
| 81 |
+
{ type: 'divider' },
|
| 82 |
+
{
|
| 83 |
+
key: '3',
|
| 84 |
onClick: showCreateModal,
|
| 85 |
label: (
|
| 86 |
<div>
|
|
|
|
| 92 |
),
|
| 93 |
},
|
| 94 |
];
|
| 95 |
+
}, [showDocumentUploadModal, showWebCrawlModal, showCreateModal, t]);
|
| 96 |
|
| 97 |
const handleDelete = useCallback(() => {
|
| 98 |
showDeleteConfirm({
|
web/src/pages/add-knowledge/components/knowledge-file/hooks.ts
CHANGED
|
@@ -7,6 +7,7 @@ import {
|
|
| 7 |
useSelectRunDocumentLoading,
|
| 8 |
useSetDocumentParser,
|
| 9 |
useUploadDocument,
|
|
|
|
| 10 |
} from '@/hooks/documentHooks';
|
| 11 |
import { useGetKnowledgeSearchParams } from '@/hooks/routeHook';
|
| 12 |
import { useOneNamespaceEffectsLoading } from '@/hooks/storeHooks';
|
|
@@ -286,6 +287,37 @@ export const useHandleUploadDocument = () => {
|
|
| 286 |
};
|
| 287 |
};
|
| 288 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
export const useHandleRunDocumentByIds = (id: string) => {
|
| 290 |
const loading = useSelectRunDocumentLoading();
|
| 291 |
const runDocumentByIds = useRunDocument();
|
|
|
|
| 7 |
useSelectRunDocumentLoading,
|
| 8 |
useSetDocumentParser,
|
| 9 |
useUploadDocument,
|
| 10 |
+
useWebCrawl,
|
| 11 |
} from '@/hooks/documentHooks';
|
| 12 |
import { useGetKnowledgeSearchParams } from '@/hooks/routeHook';
|
| 13 |
import { useOneNamespaceEffectsLoading } from '@/hooks/storeHooks';
|
|
|
|
| 287 |
};
|
| 288 |
};
|
| 289 |
|
| 290 |
+
/**
 * Bundles the modal visibility state, the submit handler and the loading flag
 * for the web-crawl dialog.
 *
 * The submit handler resolves to 0 and closes the modal when the crawl effect
 * returns 0; otherwise it resolves to -1 and leaves the modal open.
 */
export const useHandleWebCrawl = () => {
  const { visible, hideModal, showModal } = useSetModalState();
  const crawl = useWebCrawl();

  const onWebCrawlUploadOk = useCallback(
    async (name: string, url: string) => {
      const retcode = await crawl(name, url);
      if (retcode !== 0) {
        return -1;
      }
      hideModal();
      return 0;
    },
    [crawl, hideModal],
  );

  const webCrawlUploadLoading = useOneNamespaceEffectsLoading('kFModel', [
    'web_crawl',
  ]);

  return {
    webCrawlUploadLoading,
    onWebCrawlUploadOk,
    webCrawlUploadVisible: visible,
    hideWebCrawlUploadModal: hideModal,
    showWebCrawlUploadModal: showModal,
  };
};
|
| 320 |
+
|
| 321 |
export const useHandleRunDocumentByIds = (id: string) => {
|
| 322 |
const loading = useSelectRunDocumentLoading();
|
| 323 |
const runDocumentByIds = useRunDocument();
|
web/src/pages/add-knowledge/components/knowledge-file/index.tsx
CHANGED
|
@@ -12,6 +12,7 @@ import { Divider, Flex, Switch, Table, Typography } from 'antd';
|
|
| 12 |
import type { ColumnsType } from 'antd/es/table';
|
| 13 |
import { useTranslation } from 'react-i18next';
|
| 14 |
import CreateFileModal from './create-file-modal';
|
|
|
|
| 15 |
import DocumentToolbar from './document-toolbar';
|
| 16 |
import {
|
| 17 |
useChangeDocumentParser,
|
|
@@ -19,7 +20,7 @@ import {
|
|
| 19 |
useFetchDocumentListOnMount,
|
| 20 |
useGetPagination,
|
| 21 |
useGetRowSelection,
|
| 22 |
-
useHandleUploadDocument,
|
| 23 |
useNavigateToOtherPage,
|
| 24 |
useRenameDocument,
|
| 25 |
} from './hooks';
|
|
@@ -69,6 +70,13 @@ const KnowledgeFile = () => {
|
|
| 69 |
onDocumentUploadOk,
|
| 70 |
documentUploadLoading,
|
| 71 |
} = useHandleUploadDocument();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
const { t } = useTranslation('translation', {
|
| 73 |
keyPrefix: 'knowledgeDetails',
|
| 74 |
});
|
|
@@ -170,6 +178,7 @@ const KnowledgeFile = () => {
|
|
| 170 |
<DocumentToolbar
|
| 171 |
selectedRowKeys={rowSelection.selectedRowKeys as string[]}
|
| 172 |
showCreateModal={showCreateModal}
|
|
|
|
| 173 |
showDocumentUploadModal={showDocumentUploadModal}
|
| 174 |
></DocumentToolbar>
|
| 175 |
<Table
|
|
@@ -211,6 +220,12 @@ const KnowledgeFile = () => {
|
|
| 211 |
loading={documentUploadLoading}
|
| 212 |
onOk={onDocumentUploadOk}
|
| 213 |
></FileUploadModal>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
</div>
|
| 215 |
);
|
| 216 |
};
|
|
|
|
| 12 |
import type { ColumnsType } from 'antd/es/table';
|
| 13 |
import { useTranslation } from 'react-i18next';
|
| 14 |
import CreateFileModal from './create-file-modal';
|
| 15 |
+
import WebCrawlModal from './web-crawl-modal';
|
| 16 |
import DocumentToolbar from './document-toolbar';
|
| 17 |
import {
|
| 18 |
useChangeDocumentParser,
|
|
|
|
| 20 |
useFetchDocumentListOnMount,
|
| 21 |
useGetPagination,
|
| 22 |
useGetRowSelection,
|
| 23 |
+
useHandleUploadDocument, useHandleWebCrawl,
|
| 24 |
useNavigateToOtherPage,
|
| 25 |
useRenameDocument,
|
| 26 |
} from './hooks';
|
|
|
|
| 70 |
onDocumentUploadOk,
|
| 71 |
documentUploadLoading,
|
| 72 |
} = useHandleUploadDocument();
|
| 73 |
+
const {
|
| 74 |
+
webCrawlUploadVisible,
|
| 75 |
+
hideWebCrawlUploadModal,
|
| 76 |
+
showWebCrawlUploadModal,
|
| 77 |
+
onWebCrawlUploadOk,
|
| 78 |
+
webCrawlUploadLoading,
|
| 79 |
+
} = useHandleWebCrawl();
|
| 80 |
const { t } = useTranslation('translation', {
|
| 81 |
keyPrefix: 'knowledgeDetails',
|
| 82 |
});
|
|
|
|
| 178 |
<DocumentToolbar
|
| 179 |
selectedRowKeys={rowSelection.selectedRowKeys as string[]}
|
| 180 |
showCreateModal={showCreateModal}
|
| 181 |
+
showWebCrawlModal={showWebCrawlUploadModal}
|
| 182 |
showDocumentUploadModal={showDocumentUploadModal}
|
| 183 |
></DocumentToolbar>
|
| 184 |
<Table
|
|
|
|
| 220 |
loading={documentUploadLoading}
|
| 221 |
onOk={onDocumentUploadOk}
|
| 222 |
></FileUploadModal>
|
| 223 |
+
<WebCrawlModal
|
| 224 |
+
visible={webCrawlUploadVisible}
|
| 225 |
+
hideModal={hideWebCrawlUploadModal}
|
| 226 |
+
loading={webCrawlUploadLoading}
|
| 227 |
+
onOk={onWebCrawlUploadOk}
|
| 228 |
+
></WebCrawlModal>
|
| 229 |
</div>
|
| 230 |
);
|
| 231 |
};
|
web/src/pages/add-knowledge/components/knowledge-file/model.ts
CHANGED
|
@@ -232,6 +232,27 @@ const model: DvaModel<KFModelState> = {
|
|
| 232 |
}
|
| 233 |
return data;
|
| 234 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
},
|
| 236 |
subscriptions: {
|
| 237 |
setup({ dispatch, history }) {
|
|
|
|
| 232 |
}
|
| 233 |
return data;
|
| 234 |
},
|
| 235 |
+
*web_crawl({ payload = {} }, { call, put }) {
  // Submit name/url/kb_id as multipart form data to the web_crawl service.
  const body = new FormData();
  body.append('name', payload.name);
  body.append('url', payload.url);
  body.append('kb_id', payload.kb_id);

  const { data } = yield call(kbService.web_crawl, body);
  const { retcode } = data;

  if (retcode === 0) {
    message.success(i18n.t('message.uploaded'));
  }
  // The document list is re-fetched on success and also on retcode 500.
  if (retcode === 0 || retcode === 500) {
    yield put({
      type: 'getKfList',
      payload: { kb_id: payload.kb_id },
    });
  }
  return retcode;
},
|
| 256 |
},
|
| 257 |
subscriptions: {
|
| 258 |
setup({ dispatch, history }) {
|
web/src/pages/add-knowledge/components/knowledge-file/web-crawl-modal.tsx
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { IModalManagerChildrenProps } from '@/components/modal-manager';
|
| 2 |
+
import { Form, Input, Modal } from 'antd';
|
| 3 |
+
import React from 'react';
|
| 4 |
+
import {useTranslate} from "@/hooks/commonHooks";
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
interface IProps extends Omit<IModalManagerChildrenProps, 'showModal'> {
  // Shown as the OK button's loading state while the crawl request runs.
  loading: boolean;
  // Called with the validated form values when the user confirms.
  onOk: (name: string, url: string) => void;
  showModal?(): void;
}

/**
 * Dialog asking for a document name and a page URL, then handing both to
 * `onOk` once the form validates.
 */
const WebCrawlModal: React.FC<IProps> = ({
  visible,
  hideModal,
  loading,
  onOk,
}) => {
  const [form] = Form.useForm();
  const { t } = useTranslate('knowledgeDetails');
  const handleOk = async () => {
    const values = await form.validateFields();
    onOk(values.name, values.url);
  };

  return (
    <Modal
      title={t('webCrawl')}
      open={visible}
      onOk={handleOk}
      onCancel={hideModal}
      confirmLoading={loading}
    >
      <Form
        form={form}
        name="validateOnly"
        labelCol={{ span: 4 }}
        wrapperCol={{ span: 20 }}
        style={{ maxWidth: 600 }}
        autoComplete="off"
      >
        <Form.Item
          label="Name"
          name="name"
          rules={[
            { required: true, message: 'Please input name!' },
            // The limit now matches the message; it was 10 while the message
            // promised 128 characters.
            {
              max: 128,
              message: 'The maximum length of name is 128 characters',
            },
          ]}
        >
          <Input placeholder="Document name" />
        </Form.Item>
        <Form.Item
          label="URL"
          name="url"
          rules={[
            { required: true, message: 'Please input url!' },
            {
              pattern: new RegExp(
                '(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]',
              ),
              message: 'Please enter a valid URL!',
            },
          ]}
        >
          <Input placeholder="https://www.baidu.com" />
        </Form.Item>
      </Form>
    </Modal>
  );
};
export default WebCrawlModal;
|
web/src/services/kbService.ts
CHANGED
|
@@ -26,6 +26,7 @@ const {
|
|
| 26 |
document_run,
|
| 27 |
get_document_file,
|
| 28 |
document_upload,
|
|
|
|
| 29 |
} = api;
|
| 30 |
|
| 31 |
const methods = {
|
|
@@ -87,6 +88,10 @@ const methods = {
|
|
| 87 |
url: document_upload,
|
| 88 |
method: 'post',
|
| 89 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
// chunk管理
|
| 91 |
chunk_list: {
|
| 92 |
url: chunk_list,
|
|
|
|
| 26 |
document_run,
|
| 27 |
get_document_file,
|
| 28 |
document_upload,
|
| 29 |
+
web_crawl,
|
| 30 |
} = api;
|
| 31 |
|
| 32 |
const methods = {
|
|
|
|
| 88 |
url: document_upload,
|
| 89 |
method: 'post',
|
| 90 |
},
|
| 91 |
+
web_crawl: {
|
| 92 |
+
url: web_crawl,
|
| 93 |
+
method: 'post',
|
| 94 |
+
},
|
| 95 |
// chunk管理
|
| 96 |
chunk_list: {
|
| 97 |
url: chunk_list,
|
web/src/utils/api.ts
CHANGED
|
@@ -48,6 +48,7 @@ export default {
|
|
| 48 |
document_thumbnails: `${api_host}/document/thumbnails`,
|
| 49 |
get_document_file: `${api_host}/document/get`,
|
| 50 |
document_upload: `${api_host}/document/upload`,
|
|
|
|
| 51 |
|
| 52 |
// chat
|
| 53 |
setDialog: `${api_host}/dialog/set`,
|
|
|
|
| 48 |
document_thumbnails: `${api_host}/document/thumbnails`,
|
| 49 |
get_document_file: `${api_host}/document/get`,
|
| 50 |
document_upload: `${api_host}/document/upload`,
|
| 51 |
+
web_crawl: `${api_host}/document/web_crawl`,
|
| 52 |
|
| 53 |
// chat
|
| 54 |
setDialog: `${api_host}/dialog/set`,
|