kuschzzp
Kevin Hu
committed on
Commit
·
ff43695
1
Parent(s):
5fb0114
Fix:#3230 When parsing a docx file using the Book parsing method, to_page is always -1, resulting in a block count of 0 even if parsing is successful (#3249)
Browse files
### What problem does this PR solve?
When parsing a docx file using the Book parsing method, to_page is
always -1, resulting in a block count of 0 even if parsing is successful.
Fixes #3230
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
Co-authored-by: Kevin Hu <[email protected]>
- api/db/db_models.py +1 -1
- deepdoc/parser/docx_parser.py +2 -2
api/db/db_models.py
CHANGED
|
@@ -840,7 +840,7 @@ class Task(DataBaseModel):
|
|
| 840 |
doc_id = CharField(max_length=32, null=False, index=True)
|
| 841 |
from_page = IntegerField(default=0)
|
| 842 |
|
| 843 |
-
to_page = IntegerField(default=-1)
|
| 844 |
|
| 845 |
begin_at = DateTimeField(null=True, index=True)
|
| 846 |
process_duation = FloatField(default=0)
|
|
|
|
| 840 |
doc_id = CharField(max_length=32, null=False, index=True)
|
| 841 |
from_page = IntegerField(default=0)
|
| 842 |
|
| 843 |
+
to_page = IntegerField(default=100000000)
|
| 844 |
|
| 845 |
begin_at = DateTimeField(null=True, index=True)
|
| 846 |
process_duation = FloatField(default=0)
|
deepdoc/parser/docx_parser.py
CHANGED
|
@@ -110,7 +110,7 @@ class RAGFlowDocxParser:
|
|
| 110 |
return lines
|
| 111 |
return ["\n".join(lines)]
|
| 112 |
|
| 113 |
-
def __call__(self, fnm, from_page=0, to_page=
|
| 114 |
self.doc = Document(fnm) if isinstance(
|
| 115 |
fnm, str) else Document(BytesIO(fnm))
|
| 116 |
pn = 0 # parsed page
|
|
@@ -130,7 +130,7 @@ class RAGFlowDocxParser:
|
|
| 130 |
if 'lastRenderedPageBreak' in run._element.xml:
|
| 131 |
pn += 1
|
| 132 |
|
| 133 |
-
secs.append(("".join(runs_within_single_paragraph), p.style.name)) # then concat run.text as part of the paragraph
|
| 134 |
|
| 135 |
tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
|
| 136 |
return secs, tbls
|
|
|
|
| 110 |
return lines
|
| 111 |
return ["\n".join(lines)]
|
| 112 |
|
| 113 |
+
def __call__(self, fnm, from_page=0, to_page=100000000):
|
| 114 |
self.doc = Document(fnm) if isinstance(
|
| 115 |
fnm, str) else Document(BytesIO(fnm))
|
| 116 |
pn = 0 # parsed page
|
|
|
|
| 130 |
if 'lastRenderedPageBreak' in run._element.xml:
|
| 131 |
pn += 1
|
| 132 |
|
| 133 |
+
secs.append(("".join(runs_within_single_paragraph), p.style.name if hasattr(p.style, 'name') else '')) # then concat run.text as part of the paragraph
|
| 134 |
|
| 135 |
tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
|
| 136 |
return secs, tbls
|