File size: 3,941 Bytes
96a1a44 f666f56 64a0633 cdba7f7 96a1a44 cdba7f7 f666f56 96a1a44 64a0633 96a1a44 b83edb4 279ca43 96a1a44 cdba7f7 b83edb4 96a1a44 b83edb4 96a1a44 b83edb4 4e03dc3 96a1a44 e6acaf6 96a1a44 e6acaf6 96a1a44 41c7a59 a8294f2 96a1a44 6224edc 96a1a44 41c7a59 96a1a44 bcb7249 96a1a44 64a0633 96a1a44 51482f3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
import copy
import re
from api.db import ParserType
from rag.nlp import huqie, tokenize, tokenize_table, add_positions
from deepdoc.parser import PdfParser
from rag.utils import num_tokens_from_string
class Pdf(PdfParser):
    """PDF parser set up with the ``ParserType.MANUAL`` model species.

    Calling an instance runs OCR, layout analysis, table analysis and text
    merging on a PDF, then folds every title box into the box that follows it.
    """

    def __init__(self):
        # Assigned before the base initializer runs. The attribute name
        # (spelling included) is kept as-is since code outside this block may
        # read it.
        # NOTE(review): presumably consumed by PdfParser.__init__ — confirm.
        self.model_speciess = ParserType.MANUAL.value
        super().__init__()

    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        """Parse a PDF into tagged text sections plus its tables/figures.

        Args:
            filename: Source handed to ``__images__`` when ``binary`` is falsy.
            binary: Handed to ``__images__`` instead of ``filename`` when truthy.
            from_page: Forwarded to ``__images__``.
            to_page: Forwarded to ``__images__``.
            zoomin: Zoom factor forwarded to the OCR, layout, table and
                line-tag steps.
            callback: Optional progress reporter. It is called both as
                ``callback(msg=...)`` and as ``callback(progress, message)``.
                When omitted, progress reports are silently dropped.

        Returns:
            A ``(sections, tbls)`` tuple: ``sections`` is a list holding each
            remaining box's text with ``self._line_tag(box, zoomin)`` appended,
            and ``tbls`` is the value returned by ``_extract_table_figure``.
        """
        if callback is None:
            # The signature advertises callback as optional, so fall back to a
            # no-op that accepts both call shapes used below.
            def _silent(prog=None, msg=""):
                return None

            callback = _silent

        callback(msg="OCR is running...")
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page,
            callback
        )
        callback(msg="OCR finished.")

        from timeit import default_timer as timer
        start = timer()
        self._layouts_rec(zoomin)
        callback(0.65, "Layout analysis finished.")
        print("paddle layouts:", timer() - start)
        self._table_transformer_job(zoomin)
        callback(0.67, "Table analysis finished.")
        self._text_merge()
        self._concat_downward(concat_between_pages=False)
        self._filter_forpages()
        callback(0.68, "Text merging finished")
        tbls = self._extract_table_figure(True, zoomin, True, True)

        # Collapse runs of tabs, spaces and ideographic spaces to one space.
        for b in self.boxes:
            b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())

        # Merge chunks that share the same bullet style.
        self._merge_with_same_bullet()

        # Fold each title box into the box right after it. The index is not
        # advanced after a merge, so consecutive titles keep accumulating into
        # the first non-title box. A title in the last position has no
        # successor and is left as its own box.
        i = 0
        while i + 1 < len(self.boxes):
            b = self.boxes[i]
            if "title" not in b.get("layoutno", ""):
                i += 1
                continue
            b_ = self.boxes[i + 1]
            b_["text"] = b["text"] + "\n" + b_["text"]
            b_["x0"] = min(b["x0"], b_["x0"])
            b_["x1"] = max(b["x1"], b_["x1"])
            b_["top"] = b["top"]
            self.boxes.pop(i)

        callback(0.8, "Parsing finished")

        return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes], tbls
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
    """
    Only pdf is supported.

    Parse the PDF with ``Pdf`` and group its sections into chunks: sections
    are accumulated until the running token count exceeds 128, at which point
    the accumulated sections are emitted as one chunk before the next section
    is added.

    Args:
        filename: Document name; must end in ``.pdf`` (case-insensitive). It is
            also handed to the parser when ``binary`` is falsy.
        binary: Handed to the parser instead of ``filename`` when truthy.
        from_page: Forwarded to the parser.
        to_page: Forwarded to the parser.
        lang: ``"english"`` (any letter case) selects English handling for the
            table tokenization step.
        callback: Optional progress reporter forwarded to the parser. A no-op
            is substituted when omitted.
        **kwargs: Accepted and ignored.

    Returns:
        The list returned by ``tokenize_table`` with one dict appended per
        text chunk.

    Raises:
        NotImplementedError: If ``filename`` does not end in ``.pdf``.
    """
    # Reject unsupported inputs before any parser is constructed.
    if not re.search(r"\.pdf$", filename, re.IGNORECASE):
        raise NotImplementedError("file type not supported yet(pdf supported)")

    if callback is None:
        # The parser invokes the callback unconditionally, both as
        # callback(msg=...) and as callback(progress, message).
        def _silent(prog=None, msg=""):
            return None

        callback = _silent

    pdf_parser = Pdf()
    cks, tbls = pdf_parser(filename if not binary else binary,
                           from_page=from_page, to_page=to_page, callback=callback)

    doc = {
        "docnm_kwd": filename
    }
    doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])

    # The language flag comes from the caller-supplied `lang`.
    eng = lang.lower() == "english"
    res = tokenize_table(tbls, doc, eng)

    def emit(pieces):
        """Build one chunk document from the accumulated sections."""
        d = copy.deepcopy(doc)
        ck = "\n".join(pieces)
        # NOTE(review): text chunks use pdf_parser.is_english while tables use
        # the `lang`-derived flag above — confirm this difference is intended.
        tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
        d["image"], poss = pdf_parser.crop(ck, need_position=True)
        add_positions(d, poss)
        res.append(d)

    max_tokens = 128
    pending = []
    tk_cnt = 0
    for txt in cks:
        # Flush before adding the next section once the budget is exceeded, so
        # a chunk always contains the section that pushed it over the limit.
        if tk_cnt > max_tokens:
            emit(pending)
            pending = []
            tk_cnt = 0
        pending.append(txt)
        # Tokens are counted on the text with position tags stripped.
        tk_cnt += num_tokens_from_string(pdf_parser.remove_tag(txt))
    if pending:
        emit(pending)

    return res
if __name__ == "__main__":
    import sys

    def dummy(prog=None, msg=""):
        """No-op progress callback.

        Both parameters are optional and ``msg`` is accepted by keyword,
        because the parser reports progress as ``callback(msg=...)`` as well
        as ``callback(progress, message)``.
        """
        pass

    # Fail with a usage hint instead of an IndexError when no path is given.
    if len(sys.argv) < 2:
        sys.exit("usage: " + sys.argv[0] + " <file.pdf>")

    chunk(sys.argv[1], callback=dummy)
|