aopstudio committed on
Commit
1043572
·
1 Parent(s): dee8078

Optimize docx handle method in laws parser (#1302)

Browse files

### What problem does this PR solve?

Optimize the docx handling method in the laws parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Files changed (2) hide show
  1. rag/app/laws.py +76 -4
  2. rag/nlp/__init__.py +7 -2
rag/app/laws.py CHANGED
@@ -18,7 +18,7 @@ from docx import Document
18
 
19
  from api.db import ParserType
20
  from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
21
- make_colon_as_title, add_positions, tokenize_chunks, find_codec
22
  from rag.nlp import rag_tokenizer
23
  from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
24
  from rag.settings import cron_logger
@@ -32,7 +32,7 @@ class Docx(DocxParser):
32
  line = re.sub(r"\u3000", " ", line).strip()
33
  return line
34
 
35
- def __call__(self, filename, binary=None, from_page=0, to_page=100000):
36
  self.doc = Document(
37
  filename) if not binary else Document(BytesIO(binary))
38
  pn = 0
@@ -50,6 +50,74 @@ class Docx(DocxParser):
50
  pn += 1
51
  return [l for l in lines if l]
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  class Pdf(PdfParser):
55
  def __init__(self):
@@ -94,11 +162,16 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
94
  doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
95
  pdf_parser = None
96
  sections = []
 
 
 
97
  if re.search(r"\.docx$", filename, re.IGNORECASE):
98
  callback(0.1, "Start to parse.")
99
  for txt in Docx()(filename, binary):
100
  sections.append(txt)
101
  callback(0.8, "Finish parsing.")
 
 
102
 
103
  elif re.search(r"\.pdf$", filename, re.IGNORECASE):
104
  pdf_parser = Pdf() if kwargs.get(
@@ -143,8 +216,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
143
  raise NotImplementedError(
144
  "file type not supported yet(doc, docx, pdf, txt supported)")
145
 
146
- # is it English
147
- eng = lang.lower() == "english" # is_english(sections)
148
  # Remove 'Contents' part
149
  remove_contents_table(sections, eng)
150
 
 
18
 
19
  from api.db import ParserType
20
  from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
21
+ make_colon_as_title, add_positions, tokenize_chunks, find_codec, docx_question_level
22
  from rag.nlp import rag_tokenizer
23
  from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
24
  from rag.settings import cron_logger
 
32
  line = re.sub(r"\u3000", " ", line).strip()
33
  return line
34
 
35
+ def old_call(self, filename, binary=None, from_page=0, to_page=100000):
36
  self.doc = Document(
37
  filename) if not binary else Document(BytesIO(binary))
38
  pn = 0
 
50
  pn += 1
51
  return [l for l in lines if l]
52
 
53
def __call__(self, filename, binary=None, from_page=0, to_page=100000):
    """Parse a docx file into hierarchical text sections.

    Every non-empty paragraph inside the page window is classified by
    ``docx_question_level`` either as a title ("question", levels 1-6) or
    as body text ("answer"). Titles are arranged into a tree of
    ``DocxNode`` objects according to their level, and one section is
    emitted for each node that has body text or children. A section
    consists of the node's title, its body text and the titles of its
    direct children.

    Text that appears before the first title is stored as the body text
    of the (untitled) root node, so it is emitted with the first section
    instead of being dropped; a document without any title therefore
    still produces one section.

    Args:
        filename: Path of the docx file; used when ``binary`` is falsy.
        binary: Raw docx content; when truthy it is parsed instead of
            ``filename``.
        from_page: First page index (inclusive) whose paragraphs are used.
        to_page: Page index (exclusive) at which paragraphs stop being
            used.

    Returns:
        list[str]: The non-empty, cleaned section texts in breadth-first
        order of the title tree.
    """
    # Local import keeps the method self-contained.
    from collections import deque

    self.doc = Document(
        filename) if not binary else Document(BytesIO(binary))
    paragraphs = self.doc.paragraphs
    bull = bullets_category([p.text for p in paragraphs])

    # Pass an explicit fresh child list so the root never shares a list
    # between parser calls.
    root = DocxNode('', '', 0, [], None)
    point = root

    def attach(anchor, question, answer, level):
        """Add a finished title under the nearest shallower ancestor of ``anchor``."""
        # Climb until the anchor is strictly shallower than the new node;
        # the root has level 0, so titles (level >= 1) always stop there.
        while level <= anchor.level:
            anchor = anchor.parent
        node = DocxNode(question, answer, level, [], anchor)
        anchor.childs.append(node)
        return node

    pn = 0
    last_question, last_answer, last_level = "", "", -1
    for p in paragraphs:
        if pn > to_page:
            break
        question_level, p_text = 0, ''
        if from_page <= pn < to_page and p.text.strip():
            question_level, p_text = docx_question_level(p, bull)
        if not question_level or question_level > 6:
            # Body text: accumulate it under the pending title.
            last_answer = f'{last_answer}\n{p_text}'
        else:
            # A new title closes whatever was pending before it.
            if last_question:
                point = attach(point, last_question, last_answer, last_level)
            else:
                # Nothing pending yet: this is text preceding the first
                # title, keep it on the root instead of discarding it.
                root.answer = last_answer
            last_question, last_answer, last_level = p_text, '', question_level

        # Advance the page counter on rendered or explicit page breaks.
        for run in p.runs:
            run_xml = run._element.xml
            if 'lastRenderedPageBreak' in run_xml:
                pn += 1
                continue
            if 'w:br' in run_xml and 'type="page"' in run_xml:
                pn += 1

    # Flush the last pending title, or the whole text if no title was seen.
    if last_question:
        point = attach(point, last_question, last_answer, last_level)
    else:
        root.answer = last_answer

    # Breadth-first walk: each node with content becomes one section.
    lines = []
    pending = deque([root])
    while pending:
        node = pending.popleft()
        if not node.childs and not node.answer.strip():
            continue
        parts = [self.__clean(node.question), self.__clean(node.answer)]
        for child in node.childs:
            parts.append(self.__clean(child.question))
            pending.append(child)
        lines.append(self.__clean('\n'.join(parts)))
    return [line for line in lines if line]
106
class DocxNode:
    """Node of the title tree built while parsing a docx file.

    Attributes are plain data: ``question`` is the title text, ``answer``
    the body text collected under it, ``level`` the nesting depth,
    ``childs`` the list of child nodes and ``parent`` the enclosing node
    (``None`` for a root).
    """

    def __init__(self, question: str = '', answer: str = '', level: int = 0,
                 childs: "list | None" = None, parent=None) -> None:
        self.question = question
        self.answer = answer
        self.level = level
        # Use a fresh list per instance: a literal ``[]`` default is created
        # once and would be shared by every node built without ``childs``.
        self.childs = [] if childs is None else childs
        self.parent = parent

    def __str__(self) -> str:
        return f'''
            question:{self.question},
            answer:{self.answer},
            level:{self.level},
            childs:{self.childs}
        '''
120
+
121
 
122
  class Pdf(PdfParser):
123
  def __init__(self):
 
162
  doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
163
  pdf_parser = None
164
  sections = []
165
+ # is it English
166
+ eng = lang.lower() == "english" # is_english(sections)
167
+
168
  if re.search(r"\.docx$", filename, re.IGNORECASE):
169
  callback(0.1, "Start to parse.")
170
  for txt in Docx()(filename, binary):
171
  sections.append(txt)
172
  callback(0.8, "Finish parsing.")
173
+ chunks = sections
174
+ return tokenize_chunks(chunks, doc, eng, pdf_parser)
175
 
176
  elif re.search(r"\.pdf$", filename, re.IGNORECASE):
177
  pdf_parser = Pdf() if kwargs.get(
 
216
  raise NotImplementedError(
217
  "file type not supported yet(doc, docx, pdf, txt supported)")
218
 
219
+
 
220
  # Remove 'Contents' part
221
  remove_contents_table(sections, eng)
222
 
rag/nlp/__init__.py CHANGED
@@ -514,11 +514,16 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
514
 
515
  return cks
516
 
517
def docx_question_level(p):
    """Return the heading level and cleaned text of a docx paragraph.

    Args:
        p: Paragraph-like object exposing ``style.name`` and ``text``.

    Returns:
        tuple[int, str]: ``(level, text)``. ``level`` is the number at the
        end of a ``Heading N`` style name, or 0 when the paragraph is not
        a numbered heading. ``text`` is the paragraph text with U+3000
        replaced by a space and surrounding whitespace stripped.
    """
    txt = re.sub(r"\u3000", " ", p.text).strip()
    style_name = p.style.name
    if style_name.startswith('Heading'):
        try:
            return int(style_name.split(' ')[-1]), txt
        except ValueError:
            # Heading styles without a numeric suffix (e.g. a style named
            # just "Heading") carry no level; treat them as plain text.
            pass
    return 0, txt
 
 
 
 
 
522
 
523
  def concat_img(img1, img2):
524
  if img1 and not img2:
 
514
 
515
  return cks
516
 
517
def docx_question_level(p, bull=-1):
    """Classify a docx paragraph as a title of some level or as plain text.

    Args:
        p: Paragraph-like object exposing ``style.name`` and ``text``.
        bull: Index into ``BULLET_PATTERN`` selecting the list of title
            patterns to try when the paragraph is not a numbered heading.
            A negative value disables pattern matching.

    Returns:
        tuple[int, str]: ``(level, text)``. ``level`` is the number at the
        end of a ``Heading N`` style name, otherwise the 1-based position
        of the first matching pattern in ``BULLET_PATTERN[bull]``,
        otherwise 0. ``text`` is the paragraph text with U+3000 replaced
        by a space and surrounding whitespace stripped.
    """
    # Clean the text once; every return path hands back the same value.
    txt = re.sub(r"\u3000", " ", p.text).strip()
    style_name = p.style.name
    if style_name.startswith('Heading'):
        try:
            return int(style_name.split(' ')[-1]), txt
        except ValueError:
            # Heading styles without a numeric suffix (e.g. a style named
            # just "Heading") carry no level; fall back to the bullet
            # patterns instead of crashing the whole parse.
            pass
    if bull < 0:
        return 0, txt
    for j, title in enumerate(BULLET_PATTERN[bull]):
        if re.match(title, txt):
            return j + 1, txt
    return 0, txt
527
 
528
  def concat_img(img1, img2):
529
  if img1 and not img2: