balibabu
committed on
Commit
·
2b252d0
1
Parent(s):
240ac86
feat: Added explanation on the parsing method of knowledge graph #1594 (#1916)
Browse files
### What problem does this PR solve?
feat: Added explanation on the parsing method of knowledge graph #1594
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- web/src/assets/svg/chunk-method/knowledge-graph-01.svg +0 -0
- web/src/assets/svg/chunk-method/knowledge-graph-02.svg +0 -0
- web/src/components/chunk-method-modal/hooks.ts +2 -2
- web/src/locales/en.ts +8 -1
- web/src/locales/zh-traditional.ts +8 -1
- web/src/locales/zh.ts +8 -1
- web/src/pages/add-knowledge/components/knowledge-setting/category-panel.tsx +2 -1
- web/src/pages/add-knowledge/components/knowledge-setting/hooks.ts +6 -1
- web/src/pages/add-knowledge/components/knowledge-setting/utils.ts +1 -0
web/src/assets/svg/chunk-method/knowledge-graph-01.svg
ADDED
|
web/src/assets/svg/chunk-method/knowledge-graph-02.svg
ADDED
|
web/src/components/chunk-method-modal/hooks.ts
CHANGED
@@ -27,7 +27,7 @@ const ParserListMap = new Map([
|
|
27 |
'one',
|
28 |
'qa',
|
29 |
'manual',
|
30 |
-
'knowledge_graph'
|
31 |
],
|
32 |
],
|
33 |
[
|
@@ -67,7 +67,7 @@ const ParserListMap = new Map([
|
|
67 |
],
|
68 |
[['md'], ['naive', 'qa', 'knowledge_graph']],
|
69 |
[['json'], ['naive', 'knowledge_graph']],
|
70 |
-
[['eml'], ['email']]
|
71 |
]);
|
72 |
|
73 |
const getParserList = (
|
|
|
27 |
'one',
|
28 |
'qa',
|
29 |
'manual',
|
30 |
+
'knowledge_graph',
|
31 |
],
|
32 |
],
|
33 |
[
|
|
|
67 |
],
|
68 |
[['md'], ['naive', 'qa', 'knowledge_graph']],
|
69 |
[['json'], ['naive', 'knowledge_graph']],
|
70 |
+
[['eml'], ['email']],
|
71 |
]);
|
72 |
|
73 |
const getParserList = (
|
web/src/locales/en.ts
CHANGED
@@ -199,7 +199,7 @@ export default {
|
|
199 |
We assume manual has hierarchical section structure. We use the lowest section titles as pivots to slice documents.
|
200 |
So, the figures and tables in the same section will not be sliced apart, and chunk size might be large.
|
201 |
</p>`,
|
202 |
-
naive: `<p>Supported file formats are <b>DOCX, EXCEL, PPT, IMAGE, PDF, TXT</b>.</p>
|
203 |
<p>This method apply the naive ways to chunk files: </p>
|
204 |
<p>
|
205 |
<li>Successive text will be sliced into pieces using vision detection model.</li>
|
@@ -271,6 +271,13 @@ export default {
|
|
271 |
</p><p>
|
272 |
If you want to summarize something that needs all the context of an article and the selected LLM's context length covers the document length, you can try this method.
|
273 |
</p>`,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
274 |
useRaptor: 'Use RAPTOR to enhance retrieval',
|
275 |
useRaptorTip:
|
276 |
'Recursive Abstractive Processing for Tree-Organized Retrieval, please refer to https://huggingface.co/papers/2401.18059',
|
|
|
199 |
We assume manual has hierarchical section structure. We use the lowest section titles as pivots to slice documents.
|
200 |
So, the figures and tables in the same section will not be sliced apart, and chunk size might be large.
|
201 |
</p>`,
|
202 |
+
naive: `<p>Supported file formats are <b>DOCX, EXCEL, PPT, IMAGE, PDF, TXT, MD, JSON, EML</b>.</p>
|
203 |
<p>This method apply the naive ways to chunk files: </p>
|
204 |
<p>
|
205 |
<li>Successive text will be sliced into pieces using vision detection model.</li>
|
|
|
271 |
</p><p>
|
272 |
If you want to summarize something that needs all the context of an article and the selected LLM's context length covers the document length, you can try this method.
|
273 |
</p>`,
|
274 |
+
knowledgeGraph: `<p>Supported file formats are <b>DOCX, EXCEL, PPT, IMAGE, PDF, TXT, MD, JSON, EML</b>
|
275 |
+
|
276 |
+
<p>After files being chunked, it uses chunks to extract knowledge graph and mind map of the entire document. This method apply the naive ways to chunk files:
|
277 |
+
Successive text will be sliced into pieces each of which is around 512 token number.</p>
|
278 |
+
<p>Next, chunks will be transmited to LLM to extract nodes and relationships of a knowledge graph, and a mind map.</p>
|
279 |
+
|
280 |
+
Mind the entiry type you need to specify.</p>`,
|
281 |
useRaptor: 'Use RAPTOR to enhance retrieval',
|
282 |
useRaptorTip:
|
283 |
'Recursive Abstractive Processing for Tree-Organized Retrieval, please refer to https://huggingface.co/papers/2401.18059',
|
web/src/locales/zh-traditional.ts
CHANGED
@@ -190,7 +190,7 @@ export default {
|
|
190 |
我們假設手冊具有分層部分結構。我們使用最低的部分標題作為對文檔進行切片的樞軸。
|
191 |
因此,同一部分中的圖和表不會被分割,並且塊大小可能會很大。
|
192 |
</p>`,
|
193 |
-
naive: `<p>支持的文件格式為<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT</b>。</p>
|
194 |
<p>此方法將簡單的方法應用於塊文件:</p>
|
195 |
<p>
|
196 |
<li>系統將使用視覺檢測模型將連續文本分割成多個片段。</li>
|
@@ -244,6 +244,13 @@ export default {
|
|
244 |
</p><p>
|
245 |
如果你要總結的東西需要一篇文章的全部上下文,並且所選LLM的上下文長度覆蓋了文檔長度,你可以嘗試這種方法。
|
246 |
</p>`,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
247 |
useRaptor: '使用RAPTOR文件增強策略',
|
248 |
useRaptorTip: '請參考 https://huggingface.co/papers/2401.18059',
|
249 |
prompt: '提示詞',
|
|
|
190 |
我們假設手冊具有分層部分結構。我們使用最低的部分標題作為對文檔進行切片的樞軸。
|
191 |
因此,同一部分中的圖和表不會被分割,並且塊大小可能會很大。
|
192 |
</p>`,
|
193 |
+
naive: `<p>支持的文件格式為<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML</b>。</p>
|
194 |
<p>此方法將簡單的方法應用於塊文件:</p>
|
195 |
<p>
|
196 |
<li>系統將使用視覺檢測模型將連續文本分割成多個片段。</li>
|
|
|
244 |
</p><p>
|
245 |
如果你要總結的東西需要一篇文章的全部上下文,並且所選LLM的上下文長度覆蓋了文檔長度,你可以嘗試這種方法。
|
246 |
</p>`,
|
247 |
+
knowledgeGraph: `<p>支援的檔案格式為<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML</b>
|
248 |
+
|
249 |
+
<p>文件分塊後,使用分塊擷取整個文件的知識圖譜和心智圖。此方法將簡單的方法應用於區塊檔案:
|
250 |
+
連續的文字將被分割成多個片段,每個片段大約有 512 個令牌數。
|
251 |
+
<p>接下來,區塊將傳送到LLM以提取知識圖譜和思維導圖的節點和關係。
|
252 |
+
|
253 |
+
<p>請注意您需要指定的條目類型。</p></p>`,
|
254 |
useRaptor: '使用RAPTOR文件增強策略',
|
255 |
useRaptorTip: '請參考 https://huggingface.co/papers/2401.18059',
|
256 |
prompt: '提示詞',
|
web/src/locales/zh.ts
CHANGED
@@ -191,7 +191,7 @@ export default {
|
|
191 |
我们假设手册具有分层部分结构。 我们使用最低的部分标题作为对文档进行切片的枢轴。
|
192 |
因此,同一部分中的图和表不会被分割,并且块大小可能会很大。
|
193 |
</p>`,
|
194 |
-
naive: `<p>支持的文件格式为<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT</b>。</p>
|
195 |
<p>此方法将简单的方法应用于块文件:</p>
|
196 |
<p>
|
197 |
<li>系统将使用视觉检测模型将连续文本分割成多个片段。</li>
|
@@ -261,6 +261,13 @@ export default {
|
|
261 |
</p><p>
|
262 |
如果你要总结的东西需要一篇文章的全部上下文,并且所选LLM的上下文长度覆盖了文档长度,你可以尝试这种方法。
|
263 |
</p>`,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
264 |
useRaptor: '使用召回增强RAPTOR策略',
|
265 |
useRaptorTip: '请参考 https://huggingface.co/papers/2401.18059',
|
266 |
prompt: '提示词',
|
|
|
191 |
我们假设手册具有分层部分结构。 我们使用最低的部分标题作为对文档进行切片的枢轴。
|
192 |
因此,同一部分中的图和表不会被分割,并且块大小可能会很大。
|
193 |
</p>`,
|
194 |
+
naive: `<p>支持的文件格式为<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML</b>。</p>
|
195 |
<p>此方法将简单的方法应用于块文件:</p>
|
196 |
<p>
|
197 |
<li>系统将使用视觉检测模型将连续文本分割成多个片段。</li>
|
|
|
261 |
</p><p>
|
262 |
如果你要总结的东西需要一篇文章的全部上下文,并且所选LLM的上下文长度覆盖了文档长度,你可以尝试这种方法。
|
263 |
</p>`,
|
264 |
+
knowledgeGraph: `<p>支持的文件格式为<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML</b>
|
265 |
+
|
266 |
+
<p>文件分块后,使用分块提取整个文档的知识图谱和思维导图。此方法将简单的方法应用于分块文件:
|
267 |
+
连续的文本将被切成大约 512 个 token 数的块。</p>
|
268 |
+
<p>接下来,将分块传输到 LLM 以提取知识图谱和思维导图的节点和关系。</p>
|
269 |
+
|
270 |
+
注意您需要指定的条目类型。</p>`,
|
271 |
useRaptor: '使用召回增强RAPTOR策略',
|
272 |
useRaptorTip: '请参考 https://huggingface.co/papers/2401.18059',
|
273 |
prompt: '提示词',
|
web/src/pages/add-knowledge/components/knowledge-setting/category-panel.tsx
CHANGED
@@ -3,6 +3,7 @@ import { useTranslate } from '@/hooks/common-hooks';
|
|
3 |
import { useSelectParserList } from '@/hooks/user-setting-hooks';
|
4 |
import { Col, Divider, Empty, Row, Typography } from 'antd';
|
5 |
import DOMPurify from 'dompurify';
|
|
|
6 |
import { useMemo } from 'react';
|
7 |
import styles from './index.less';
|
8 |
import { ImageMap } from './utils';
|
@@ -18,7 +19,7 @@ const CategoryPanel = ({ chunkMethod }: { chunkMethod: string }) => {
|
|
18 |
if (item) {
|
19 |
return {
|
20 |
title: item.label,
|
21 |
-
description: t(item.value),
|
22 |
};
|
23 |
}
|
24 |
return { title: '', description: '' };
|
|
|
3 |
import { useSelectParserList } from '@/hooks/user-setting-hooks';
|
4 |
import { Col, Divider, Empty, Row, Typography } from 'antd';
|
5 |
import DOMPurify from 'dompurify';
|
6 |
+
import camelCase from 'lodash/camelCase';
|
7 |
import { useMemo } from 'react';
|
8 |
import styles from './index.less';
|
9 |
import { ImageMap } from './utils';
|
|
|
19 |
if (item) {
|
20 |
return {
|
21 |
title: item.label,
|
22 |
+
description: t(camelCase(item.value)),
|
23 |
};
|
24 |
}
|
25 |
return { title: '', description: '' };
|
web/src/pages/add-knowledge/components/knowledge-setting/hooks.ts
CHANGED
@@ -37,6 +37,9 @@ export const useSubmitKnowledgeConfiguration = (form: FormInstance) => {
|
|
37 |
};
|
38 |
};
|
39 |
|
|
|
|
|
|
|
40 |
export const useFetchKnowledgeConfigurationOnMount = (form: FormInstance) => {
|
41 |
const parserList = useSelectParserList();
|
42 |
const allOptions = useSelectLlmOptionsByModelType();
|
@@ -62,7 +65,9 @@ export const useFetchKnowledgeConfigurationOnMount = (form: FormInstance) => {
|
|
62 |
}, [form, knowledgeDetails]);
|
63 |
|
64 |
return {
|
65 |
-
parserList
|
|
|
|
|
66 |
embeddingModelOptions: allOptions[LlmModelType.Embedding],
|
67 |
disabled: knowledgeDetails.chunk_num > 0,
|
68 |
};
|
|
|
37 |
};
|
38 |
};
|
39 |
|
40 |
+
// Values that do not need to be displayed in the analysis method Select
|
41 |
+
const HiddenFields = ['email', 'picture', 'audio'];
|
42 |
+
|
43 |
export const useFetchKnowledgeConfigurationOnMount = (form: FormInstance) => {
|
44 |
const parserList = useSelectParserList();
|
45 |
const allOptions = useSelectLlmOptionsByModelType();
|
|
|
65 |
}, [form, knowledgeDetails]);
|
66 |
|
67 |
return {
|
68 |
+
parserList: parserList.filter(
|
69 |
+
(x) => !HiddenFields.some((y) => y === x.value),
|
70 |
+
),
|
71 |
embeddingModelOptions: allOptions[LlmModelType.Embedding],
|
72 |
disabled: knowledgeDetails.chunk_num > 0,
|
73 |
};
|
web/src/pages/add-knowledge/components/knowledge-setting/utils.ts
CHANGED
@@ -15,6 +15,7 @@ export const ImageMap = {
|
|
15 |
resume: getImageName('resume', 2),
|
16 |
table: getImageName('table', 2),
|
17 |
one: getImageName('one', 2),
|
|
|
18 |
};
|
19 |
|
20 |
export const TextMap = {
|
|
|
15 |
resume: getImageName('resume', 2),
|
16 |
table: getImageName('table', 2),
|
17 |
one: getImageName('one', 2),
|
18 |
+
knowledge_graph: getImageName('knowledge-graph', 2),
|
19 |
};
|
20 |
|
21 |
export const TextMap = {
|