H
Kevin Hu
committed on
Commit
·
dda4c86
1
Parent(s):
971f83c
Fix docx parser line bug (#1715)
Browse files
### What problem does this PR solve?
#1704
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
---------
Co-authored-by: Kevin Hu <[email protected]>
- deepdoc/parser/pdf_parser.py +1 -1
- rag/app/naive.py +4 -1
deepdoc/parser/pdf_parser.py
CHANGED
|
@@ -952,7 +952,7 @@ class RAGFlowPdfParser:
|
|
| 952 |
fnm, str) else pdfplumber.open(BytesIO(fnm))
|
| 953 |
self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
|
| 954 |
enumerate(self.pdf.pages[page_from:page_to])]
|
| 955 |
-
self.page_chars = [[{**c, 'top':
|
| 956 |
self.pdf.pages[page_from:page_to]]
|
| 957 |
self.total_page = len(self.pdf.pages)
|
| 958 |
except Exception as e:
|
|
|
|
| 952 |
fnm, str) else pdfplumber.open(BytesIO(fnm))
|
| 953 |
self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
|
| 954 |
enumerate(self.pdf.pages[page_from:page_to])]
|
| 955 |
+
self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']} for c in page.dedupe_chars().chars if self._has_color(c)] for page in
|
| 956 |
self.pdf.pages[page_from:page_to]]
|
| 957 |
self.total_page = len(self.pdf.pages)
|
| 958 |
except Exception as e:
|
rag/app/naive.py
CHANGED
|
@@ -23,6 +23,8 @@ from rag.utils import num_tokens_from_string
|
|
| 23 |
from PIL import Image
|
| 24 |
from functools import reduce
|
| 25 |
from markdown import markdown
|
|
|
|
|
|
|
| 26 |
class Docx(DocxParser):
|
| 27 |
def __init__(self):
|
| 28 |
pass
|
|
@@ -81,7 +83,8 @@ class Docx(DocxParser):
|
|
| 81 |
continue
|
| 82 |
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
|
| 83 |
pn += 1
|
| 84 |
-
new_line = [(line[0], reduce(concat_img, line[1])) for line in lines]
|
|
|
|
| 85 |
tbls = []
|
| 86 |
for tb in self.doc.tables:
|
| 87 |
html= "<table>"
|
|
|
|
| 23 |
from PIL import Image
|
| 24 |
from functools import reduce
|
| 25 |
from markdown import markdown
|
| 26 |
+
|
| 27 |
+
|
| 28 |
class Docx(DocxParser):
|
| 29 |
def __init__(self):
|
| 30 |
pass
|
|
|
|
| 83 |
continue
|
| 84 |
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
|
| 85 |
pn += 1
|
| 86 |
+
new_line = [(line[0], reduce(concat_img, line[1]) if line[1] else None) for line in lines]
|
| 87 |
+
|
| 88 |
tbls = []
|
| 89 |
for tb in self.doc.tables:
|
| 90 |
html= "<table>"
|