balibabu committed
Commit 2b252d0
1 Parent(s): 240ac86

feat: Added an explanation of the knowledge graph parsing method #1594 (#1916)


### What problem does this PR solve?

feat: Added an explanation of the knowledge graph parsing method #1594

### Type of change


- [x] New Feature (non-breaking change which adds functionality)

web/src/assets/svg/chunk-method/knowledge-graph-01.svg ADDED
web/src/assets/svg/chunk-method/knowledge-graph-02.svg ADDED
web/src/components/chunk-method-modal/hooks.ts CHANGED
@@ -27,7 +27,7 @@ const ParserListMap = new Map([
       'one',
       'qa',
       'manual',
-      'knowledge_graph'
+      'knowledge_graph',
     ],
   ],
   [
@@ -67,7 +67,7 @@ const ParserListMap = new Map([
   ],
   [['md'], ['naive', 'qa', 'knowledge_graph']],
   [['json'], ['naive', 'knowledge_graph']],
-  [['eml'], ['email']]
+  [['eml'], ['email']],
 ]);
 
 const getParserList = (
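For context, `ParserListMap` maps groups of file extensions to the chunk methods the modal offers for them; the changes above are the trailing commas left after the `knowledge_graph` entries were added. A minimal sketch of how such a map can be queried — the real `getParserList` implementation is outside this diff, and `lookupParsers` is a hypothetical helper:

```ts
// Hypothetical lookup over the extension → chunk-method map shown above;
// the actual getParserList in hooks.ts may differ.
const ParserListMap = new Map<string[], string[]>([
  [['md'], ['naive', 'qa', 'knowledge_graph']],
  [['json'], ['naive', 'knowledge_graph']],
  [['eml'], ['email']],
]);

const lookupParsers = (ext: string): string[] => {
  for (const [extensions, parsers] of ParserListMap) {
    if (extensions.includes(ext)) {
      return parsers; // chunk methods available for this extension
    }
  }
  return [];
};

console.log(lookupParsers('md')); // ['naive', 'qa', 'knowledge_graph']
```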
web/src/locales/en.ts CHANGED
@@ -199,7 +199,7 @@ export default {
       We assume manual has hierarchical section structure. We use the lowest section titles as pivots to slice documents.
       So, the figures and tables in the same section will not be sliced apart, and chunk size might be large.
       </p>`,
-    naive: `<p>Supported file formats are <b>DOCX, EXCEL, PPT, IMAGE, PDF, TXT</b>.</p>
+    naive: `<p>Supported file formats are <b>DOCX, EXCEL, PPT, IMAGE, PDF, TXT, MD, JSON, EML</b>.</p>
       <p>This method apply the naive ways to chunk files: </p>
       <p>
       <li>Successive text will be sliced into pieces using vision detection model.</li>
@@ -271,6 +271,13 @@ export default {
       </p><p>
       If you want to summarize something that needs all the context of an article and the selected LLM's context length covers the document length, you can try this method.
       </p>`,
+    knowledgeGraph: `<p>Supported file formats are <b>DOCX, EXCEL, PPT, IMAGE, PDF, TXT, MD, JSON, EML</b>
+
+    <p>After the files are chunked, the chunks are used to extract a knowledge graph and a mind map of the entire document. This method applies the naive way to chunk files:
+    successive text will be sliced into pieces, each of which is around 512 tokens.</p>
+    <p>Next, the chunks will be transmitted to the LLM to extract the nodes and relationships of a knowledge graph, and a mind map.</p>
+
+    Mind the entity types you need to specify.</p>`,
     useRaptor: 'Use RAPTOR to enhance retrieval',
     useRaptorTip:
       'Recursive Abstractive Processing for Tree-Organized Retrieval, please refer to https://huggingface.co/papers/2401.18059',
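The new `knowledgeGraph` description summarizes the pipeline: naive chunking into roughly 512-token pieces, then an LLM pass that extracts graph nodes, relationships, and a mind map, guided by the entity types the user specifies. A rough conceptual sketch of that flow — the actual extraction runs in the RAGFlow backend, and `extractGraphFromChunk` below is a placeholder, not a real API:

```ts
// Conceptual sketch of the flow described in the knowledgeGraph string above.
// None of this is part of this PR's code; it only illustrates the description.
interface GraphNode {
  name: string;
  type: string; // one of the entity types the user specifies
}
interface GraphEdge {
  source: string;
  target: string;
  relation: string;
}

const CHUNK_TOKENS = 512; // approximate chunk size mentioned in the description

// Naive fixed-size chunking over a pre-tokenized document.
const chunkTokens = (tokens: string[]): string[][] => {
  const chunks: string[][] = [];
  for (let i = 0; i < tokens.length; i += CHUNK_TOKENS) {
    chunks.push(tokens.slice(i, i + CHUNK_TOKENS));
  }
  return chunks;
};

// Placeholder for the per-chunk LLM call that would return nodes and
// relationships; merging the per-chunk results yields the document graph.
const extractGraphFromChunk = async (
  _chunkText: string,
  _entityTypes: string[],
): Promise<{ nodes: GraphNode[]; edges: GraphEdge[] }> => {
  return { nodes: [], edges: [] };
};
```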
web/src/locales/zh-traditional.ts CHANGED
@@ -190,7 +190,7 @@ export default {
       我們假設手冊具有分層部分結構。我們使用最低的部分標題作為對文檔進行切片的樞軸。
       因此,同一部分中的圖和表不會被分割,並且塊大小可能會很大。
       </p>`,
-    naive: `<p>支持的文件格式為<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT</b>。</p>
+    naive: `<p>支持的文件格式為<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML</b>。</p>
       <p>此方法將簡單的方法應用於塊文件:</p>
       <p>
       <li>系統將使用視覺檢測模型將連續文本分割成多個片段。</li>
@@ -244,6 +244,13 @@ export default {
       </p><p>
       如果你要總結的東西需要一篇文章的全部上下文,並且所選LLM的上下文長度覆蓋了文檔長度,你可以嘗試這種方法。
       </p>`,
+    knowledgeGraph: `<p>支援的檔案格式為<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML</b>
+
+    <p>文件分塊後,使用分塊擷取整個文件的知識圖譜和心智圖。此方法將簡單的方法應用於區塊檔案:
+    連續的文字將被分割成多個片段,每個片段大約有 512 個令牌數。
+    <p>接下來,區塊將傳送到LLM以提取知識圖譜和思維導圖的節點和關係。
+
+    <p>請注意您需要指定的條目類型。</p></p>`,
     useRaptor: '使用RAPTOR文件增強策略',
     useRaptorTip: '請參考 https://huggingface.co/papers/2401.18059',
     prompt: '提示詞',
web/src/locales/zh.ts CHANGED
@@ -191,7 +191,7 @@ export default {
       我们假设手册具有分层部分结构。 我们使用最低的部分标题作为对文档进行切片的枢轴。
       因此,同一部分中的图和表不会被分割,并且块大小可能会很大。
       </p>`,
-    naive: `<p>支持的文件格式为<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT</b>。</p>
+    naive: `<p>支持的文件格式为<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML</b>。</p>
       <p>此方法将简单的方法应用于块文件:</p>
       <p>
       <li>系统将使用视觉检测模型将连续文本分割成多个片段。</li>
@@ -261,6 +261,13 @@ export default {
       </p><p>
       如果你要总结的东西需要一篇文章的全部上下文,并且所选LLM的上下文长度覆盖了文档长度,你可以尝试这种方法。
       </p>`,
+    knowledgeGraph: `<p>支持的文件格式为<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML</b>
+
+    <p>文件分块后,使用分块提取整个文档的知识图谱和思维导图。此方法将简单的方法应用于分块文件:
+    连续的文本将被切成大约 512 个 token 数的块。</p>
+    <p>接下来,将分块传输到 LLM 以提取知识图谱和思维导图的节点和关系。</p>
+
+    注意您需要指定的条目类型。</p>`,
     useRaptor: '使用召回增强RAPTOR策略',
     useRaptorTip: '请参考 https://huggingface.co/papers/2401.18059',
     prompt: '提示词',
web/src/pages/add-knowledge/components/knowledge-setting/category-panel.tsx CHANGED
@@ -3,6 +3,7 @@ import { useTranslate } from '@/hooks/common-hooks';
 import { useSelectParserList } from '@/hooks/user-setting-hooks';
 import { Col, Divider, Empty, Row, Typography } from 'antd';
 import DOMPurify from 'dompurify';
+import camelCase from 'lodash/camelCase';
 import { useMemo } from 'react';
 import styles from './index.less';
 import { ImageMap } from './utils';
@@ -18,7 +19,7 @@ const CategoryPanel = ({ chunkMethod }: { chunkMethod: string }) => {
     if (item) {
       return {
         title: item.label,
-        description: t(item.value),
+        description: t(camelCase(item.value)),
       };
     }
     return { title: '', description: '' };
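The key point of this change: parser values from the backend are snake_case (e.g. `knowledge_graph`), while the description keys added to the locale files above are camelCase (`knowledgeGraph`), so the value is converted before the translation lookup. A quick illustration:

```ts
import camelCase from 'lodash/camelCase';

// The backend parser value is snake_case; the locale key is camelCase,
// so the value is converted before calling t().
console.log(camelCase('knowledge_graph')); // 'knowledgeGraph'
console.log(camelCase('naive')); // 'naive' — existing keys are unaffected
```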
web/src/pages/add-knowledge/components/knowledge-setting/hooks.ts CHANGED
@@ -37,6 +37,9 @@ export const useSubmitKnowledgeConfiguration = (form: FormInstance) => {
   };
 };
 
+// Values that do not need to be displayed in the parsing method Select
+const HiddenFields = ['email', 'picture', 'audio'];
+
 export const useFetchKnowledgeConfigurationOnMount = (form: FormInstance) => {
   const parserList = useSelectParserList();
   const allOptions = useSelectLlmOptionsByModelType();
@@ -62,7 +65,9 @@ export const useFetchKnowledgeConfigurationOnMount = (form: FormInstance) => {
   }, [form, knowledgeDetails]);
 
   return {
-    parserList,
+    parserList: parserList.filter(
+      (x) => !HiddenFields.some((y) => y === x.value),
+    ),
     embeddingModelOptions: allOptions[LlmModelType.Embedding],
     disabled: knowledgeDetails.chunk_num > 0,
   };
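The effect of the new filter is to hide the `email`, `picture`, and `audio` parsers from the knowledge-base settings select while leaving the rest of the list untouched. A small sketch, assuming `parserList` items have the `{ label, value }` shape used in `category-panel.tsx` (the labels below are made up for illustration):

```ts
const HiddenFields = ['email', 'picture', 'audio'];

// Hypothetical parser list; only the `value` fields matter for the filter.
const parserList = [
  { label: 'General', value: 'naive' },
  { label: 'Knowledge Graph', value: 'knowledge_graph' },
  { label: 'Email', value: 'email' },
];

const visible = parserList.filter(
  (x) => !HiddenFields.some((y) => y === x.value),
);
console.log(visible.map((x) => x.value)); // ['naive', 'knowledge_graph']
```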
web/src/pages/add-knowledge/components/knowledge-setting/utils.ts CHANGED
@@ -15,6 +15,7 @@ export const ImageMap = {
   resume: getImageName('resume', 2),
   table: getImageName('table', 2),
   one: getImageName('one', 2),
+  knowledge_graph: getImageName('knowledge-graph', 2),
 };
 
 export const TextMap = {
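The new `knowledge_graph` entry ties the chunk method to the two SVGs added at the top of this commit. `getImageName` is defined elsewhere in `utils.ts`; the sketch below is only a guess at its behavior, consistent with the added file names `knowledge-graph-01.svg` and `knowledge-graph-02.svg`:

```ts
// Guessed shape of getImageName: produce `length` numbered image names for a
// chunk method, matching the SVGs under web/src/assets/svg/chunk-method/.
const getImageName = (name: string, length: number): string[] =>
  Array.from({ length }, (_, idx) => `${name}-0${idx + 1}`);

console.log(getImageName('knowledge-graph', 2));
// ['knowledge-graph-01', 'knowledge-graph-02']
```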