KevinHuSh
		
	committed on
		
		
					Commit 
							
							·
						
						072f9dd
	
1
								Parent(s):
							
							e32ef75
								
Add app to rag module: presentation & laws (#43)
Browse files- api/utils/file_utils.py +1 -1
- rag/app/__init__.py +48 -0
- rag/app/laws.py +192 -0
- rag/app/presentation.py +127 -0
- rag/nlp/huchunk.py +0 -5
- rag/parser/pdf_parser.py +104 -97
- rag/settings.py +1 -0
    	
        api/utils/file_utils.py
    CHANGED
    
    | @@ -150,4 +150,4 @@ def filename_type(filename): | |
| 150 | 
             
                    return FileType.AURAL.value
         | 
| 151 |  | 
| 152 | 
             
                if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename):
         | 
| 153 | 
            -
                    return FileType.VISUAL
         | 
|  | |
| 150 | 
             
                    return FileType.AURAL.value
         | 
| 151 |  | 
| 152 | 
             
                if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename):
         | 
| 153 | 
            +
                    return FileType.VISUAL
         | 
    	
        rag/app/__init__.py
    ADDED
    
    | @@ -0,0 +1,48 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import re
         | 
| 2 | 
            +
             | 
| 3 | 
            +
             | 
def callback__(progress, msg, func):
    """Forward (progress, msg) to *func* when a callback was supplied.

    A ``None`` callback is silently ignored, so callers do not need to
    guard every progress report themselves.
    """
    if func:
        func(progress, msg)
| 7 | 
            +
             | 
| 8 | 
            +
             | 
# Bullet/heading regexes grouped by document numbering style.  Groups, in
# order: Chinese legal structure (编/章/节/条 + parenthesized numerals),
# dotted decimal numbering, Chinese enumerations, and English headings
# (PART/Chapter/Section/Article).  bullets_category() picks the group that
# matches a document's sections best.
BULLET_PATTERN = [[
        r"第[零一二三四五六七八九十百]+编",
        r"第[零一二三四五六七八九十百]+章",
        r"第[零一二三四五六七八九十百]+节",
        r"第[零一二三四五六七八九十百]+条",
        r"[\((][零一二三四五六七八九十百]+[\))]",
    ], [
        r"[0-9]{,3}[\. 、]",
        r"[0-9]{,2}\.[0-9]{,2}",
        r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
        r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
    ], [
        r"[零一二三四五六七八九十百]+[ 、]",
        r"[\((][零一二三四五六七八九十百]+[\))]",
        r"[\((][0-9]{,2}[\))]",
    ] ,[
        r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)",
        r"Chapter (I+V?|VI*|XI|IX|X)",
        r"Section [0-9]+",
        r"Article [0-9]+"
    ]
    ]
| 31 | 
            +
             | 
| 32 | 
            +
             | 
def bullets_category(sections):
    """Return the index of the BULLET_PATTERN group that fits *sections* best.

    Each group scores one point per section that matches at least one of its
    regexes; the first group with the strictly highest positive score wins.
    Returns -1 when no group matches any section at all.
    """
    scores = []
    for patterns in BULLET_PATTERN:
        matched = sum(
            1 for sec in sections if any(re.match(p, sec) for p in patterns))
        scores.append(matched)
    best, best_score = -1, 0
    for idx, score in enumerate(scores):
        if score > best_score:
            best, best_score = idx, score
    return best
    	
        rag/app/laws.py
    ADDED
    
    | @@ -0,0 +1,192 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import copy
         | 
| 2 | 
            +
            import re
         | 
| 3 | 
            +
            from io import BytesIO
         | 
| 4 | 
            +
            from docx import Document
         | 
| 5 | 
            +
            import numpy as np
         | 
| 6 | 
            +
            from rag.app import callback__, bullets_category, BULLET_PATTERN
         | 
| 7 | 
            +
            from rag.nlp import huqie
         | 
| 8 | 
            +
            from rag.parser.pdf_parser import HuParser
         | 
| 9 | 
            +
             | 
| 10 | 
            +
             | 
class Docx(object):
    """Extract the non-empty paragraph texts from a .docx document."""

    def __init__(self):
        pass

    def __clean(self, line):
        # Ideographic spaces (U+3000) are common in CJK documents;
        # normalize them to ASCII spaces before trimming.
        return re.sub(r"\u3000", " ", line).strip()

    def __call__(self, filename, binary=None):
        if binary:
            self.doc = Document(BytesIO(binary))
        else:
            self.doc = Document(filename)
        cleaned = (self.__clean(p.text) for p in self.doc.paragraphs)
        return [line for line in cleaned if line]
| 24 | 
            +
             | 
| 25 | 
            +
             | 
class Pdf(HuParser):
    """PDF text extractor: OCR, layout analysis, then vertical line merging."""

    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        """Return merged text lines (with position tags) for the page range.

        Progress is reported through *callback* via callback__; roughly the
        first half of the budget covers OCR, the rest layout analysis and
        text extraction.
        """
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page)
        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 2,
                   "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)), callback)

        from timeit import default_timer as timer
        start = timer()
        self._layouts_paddle(zoomin)
        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 2,
                   "Page {}~{}: Layout analysis finished".format(from_page, min(to_page, self.total_page)), callback)
        print("paddle layouts:", timer() - start)
        bxs = self.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)

        # Heuristic language detection: English when more than 80% of the
        # boxes start with a latin letter.
        eng = 0
        for b in bxs:
            if re.match(r"[a-zA-Z]", b["text"].strip()):
                eng += 1
        # BUGFIX: guard against an empty box list (ZeroDivisionError).
        eng = bool(bxs) and eng / len(bxs) > 0.8

        # Merge boxes vertically into logical lines/paragraphs.
        i = 0
        while i + 1 < len(bxs):
            b = bxs[i]
            b_ = bxs[i + 1]
            # Drop bare page numbers sitting at a page boundary.
            if b["page_number"] < b_["page_number"] and re.match(r"[0-9  •一—-]+$", b["text"]):
                bxs.pop(i)
                continue
            # Signals that the boxes SHOULD be concatenated: the current box
            # ends mid-sentence, or the next box starts with closing
            # punctuation.  NOTE(review): assumes box texts are non-empty
            # after strip() — confirm with the OCR output.
            concatting_feats = [
                b["text"].strip()[-1] in ",;:'\",、‘“;:",
                len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
                # BUGFIX: was b["text"] — the *next* box's leading
                # punctuation is what decides the merge.
                b_["text"].strip()[0] in "。;?!?”)),,、:",
            ]
            # Signals that the boxes should NOT be concatenated.
            feats = [
                # BUGFIX: the original compared b with itself, which is
                # always False; layout regions must differ between b and b_.
                b.get("layoutno", 0) != b_.get("layoutno", 0),
                b["text"].strip()[-1] in "。?!?",
                eng and b["text"].strip()[-1] in ".!?",
                b["page_number"] == b_["page_number"]
                and b_["top"] - b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
                b["page_number"] < b_["page_number"] and abs(
                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4
            ]
            if any(feats) and not any(concatting_feats):
                i += 1
                continue
            # Merge b_ down into b and drop b_.
            b["bottom"] = b_["bottom"]
            b["text"] += b_["text"]
            b["x0"] = min(b["x0"], b_["x0"])
            b["x1"] = max(b["x1"], b_["x1"])
            bxs.pop(i + 1)

        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 2,
                   "Page {}~{}: Text extraction finished".format(from_page, min(to_page, self.total_page)), callback)

        return [b["text"] + self._line_tag(b, zoomin) for b in bxs]
| 90 | 
            +
             | 
| 91 | 
            +
             | 
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
    """Chunk a law/regulation document (.docx, .pdf or .txt) hierarchically.

    The document is split into sections, an optional table of contents is
    stripped, and the sections are grouped along the heading hierarchy
    detected with bullets_category(); every chunk keeps its chain of
    ancestor headings.  Returns a list of dicts ready for indexing
    (tokenized title/content, plus the cropped page image for PDFs).
    """
    doc = {
        "docnm_kwd": filename,
        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
    pdf_parser = None
    sections = []
    if re.search(r"\.docx?$", filename, re.IGNORECASE):
        for txt in Docx()(filename, binary):
            sections.append(txt)
    if re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf()
        for txt in pdf_parser(filename if not binary else binary,
                              from_page=from_page, to_page=to_page, callback=callback):
            sections.append(txt)
    if re.search(r"\.txt$", filename, re.IGNORECASE):
        if binary:
            txt = binary.decode("utf-8")
        else:
            # utf-8 pinned explicitly for consistency with the binary branch
            with open(filename, "r", encoding="utf-8") as f:
                txt = f.read()
        # BUGFIX: the split used to happen only in the file-path branch, so
        # .txt content passed through *binary* produced no sections at all.
        sections = [l for l in txt.split("\n") if l]

    if not sections:
        # BUGFIX: empty input used to crash with ZeroDivisionError below.
        return []

    # Heuristic language detection: English when >80% of the sections
    # start with a latin letter.
    eng = 0
    for sec in sections:
        if re.match(r"[a-zA-Z]", sec.strip()):
            eng += 1
    eng = eng / len(sections) > 0.8

    # Strip the table of contents: find a "Contents"/目录/目次 header, skip
    # blank-ish entries, remember the first TOC entry's prefix, then delete
    # everything up to the body line that repeats this prefix.
    i = 0
    while i < len(sections):
        if not re.match(r"(Contents|目录|目次)$", re.sub(r"( | |\u3000)+", "", sections[i].split("@@")[0])):
            i += 1
            continue
        sections.pop(i)
        if i >= len(sections):
            break
        prefix = sections[i].strip()[:3] if not eng else " ".join(sections[i].strip().split(" ")[:2])
        while not prefix:
            sections.pop(i)
            if i >= len(sections):
                break
            prefix = sections[i].strip()[:3] if not eng else " ".join(sections[i].strip().split(" ")[:2])
        # BUGFIX: the original popped unconditionally here, raising
        # IndexError when the blank-skipping loop ran off the list end.
        if i >= len(sections):
            break
        sections.pop(i)
        if i >= len(sections) or not prefix:
            break
        # NOTE(review): *prefix* is used as a regex verbatim; headings
        # containing regex metacharacters may need re.escape — confirm.
        for j in range(i, min(i + 128, len(sections))):
            if not re.match(prefix, sections[j]):
                continue
            for k in range(i, j):
                sections.pop(i)
            break

    # Rank every section within the best-matching bullet hierarchy;
    # sections matching no pattern get the lowest rank (group length).
    bull = bullets_category(sections)
    projs = [len(BULLET_PATTERN[bull])] * len(sections)
    for i, sec in enumerate(sections):
        for j, p in enumerate(BULLET_PATTERN[bull]):
            if re.match(p, sec.strip()):
                projs[i] = j
                break
    readed = [0] * len(sections)
    cks = []
    for pr in range(len(BULLET_PATTERN[bull]) - 1, 1, -1):
        for i in range(len(sections)):
            if readed[i] or projs[i] < pr:
                continue
            # Walk upward collecting the chain of ancestor headings.
            p = projs[i]
            readed[i] = 1
            ck = [sections[i]]
            for j in range(i - 1, -1, -1):
                if projs[j] >= p:
                    continue
                ck.append(sections[j])
                readed[j] = 1
                p = projs[j]
                if p == 0:
                    break
            cks.append(ck[::-1])

    res = []
    # Wrap each chunk up as an ES document.
    for ck in cks:
        print("\n-".join(ck))
        ck = "\n".join(ck)
        d = copy.deepcopy(doc)
        if pdf_parser:
            d["image"] = pdf_parser.crop(ck)
            ck = pdf_parser.remove_tag(ck)
        d["content_ltks"] = huqie.qie(ck)
        d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
        res.append(d)
    return res
| 188 | 
            +
             | 
| 189 | 
            +
             | 
if __name__ == "__main__":
    import sys

    # Manual smoke test: chunk the document given on the command line.
    chunk(sys.argv[1])
    	
        rag/app/presentation.py
    ADDED
    
    | @@ -0,0 +1,127 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import copy
         | 
| 2 | 
            +
            import re
         | 
| 3 | 
            +
            from io import BytesIO
         | 
| 4 | 
            +
            from pptx import Presentation
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            from rag.app import callback__
         | 
| 7 | 
            +
            from rag.nlp import huqie
         | 
| 8 | 
            +
            from rag.parser.pdf_parser import HuParser
         | 
| 9 | 
            +
             | 
| 10 | 
            +
             | 
class Ppt(object):
    """Extract per-slide text (python-pptx) and thumbnails (Aspose.Slides)."""

    def __init__(self):
        super().__init__()

    def __extract(self, shape):
        """Recursively pull the text out of a single shape.

        Tables (shape_type 19) are rendered as "header: cell" pairs, text
        frames verbatim, grouped shapes (shape_type 6) recursively; any
        other shape yields None.
        """
        if shape.shape_type == 19:  # table
            tb = shape.table
            rows = []
            for i in range(1, len(tb.rows)):
                rows.append("; ".join([tb.cell(0, j).text + ": " + tb.cell(i, j).text
                                       for j in range(len(tb.columns)) if tb.cell(i, j)]))
            return "\n".join(rows)

        if shape.has_text_frame:
            return shape.text_frame.text

        if shape.shape_type == 6:  # grouped shapes
            texts = []
            for p in shape.shapes:
                t = self.__extract(p)
                if t:
                    texts.append(t)
            return "\n".join(texts)

    def __call__(self, fnm, from_page, to_page, callback=None):
        """Return [(slide_text, jpeg_bytes), ...] for the requested slides.

        *fnm* may be a filesystem path (str) or the raw file bytes.
        """
        ppt = Presentation(fnm) if isinstance(
            fnm, str) else Presentation(
            BytesIO(fnm))
        txts = []
        self.total_page = len(ppt.slides)
        for i, slide in enumerate(ppt.slides[from_page: to_page]):
            texts = []
            for shape in slide.shapes:
                txt = self.__extract(shape)
                if txt:
                    texts.append(txt)
            txts.append("\n".join(texts))
            callback__((i + 1) / self.total_page / 2, "", callback)

        callback__((min(to_page, self.total_page) - from_page) / self.total_page,
                   "Page {}~{}: Text extraction finished".format(from_page, min(to_page, self.total_page)), callback)
        import aspose.slides as slides
        import aspose.pydrawing as drawing
        imgs = []
        # BUGFIX: BytesIO(fnm) raised TypeError when *fnm* was a path
        # string; mirror the str/bytes handling used for python-pptx above.
        with slides.Presentation(fnm if isinstance(fnm, str) else BytesIO(fnm)) as presentation:
            for i, slide in enumerate(presentation.slides[from_page: to_page]):
                buffered = BytesIO()
                slide.get_thumbnail(0.5, 0.5).save(buffered, drawing.imaging.ImageFormat.jpeg)
                imgs.append(buffered.getvalue())
        assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
        callback__((min(to_page, self.total_page) - from_page) / self.total_page,
                   "Page {}~{}: Image extraction finished".format(from_page, min(to_page, self.total_page)), callback)

        return [(txts[i], imgs[i]) for i in range(len(txts))]
| 62 | 
            +
             | 
| 63 | 
            +
             | 
class Pdf(HuParser):
    """Per-page PDF extractor: yields one (text, image) pair per page."""

    def __init__(self):
        super().__init__()

    def __garbage(self, txt):
        # Filter out boxes carrying no real content: bare numbers,
        # percentages, date-like fragments, and very short strings.
        txt = txt.lower().strip()
        if re.match(r"[0-9\.,%/-]+$", txt):
            return True
        return len(txt) < 3

    def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
        """OCR the requested pages and return [(page_text, page_image), ...]."""
        self.__images__(filename if not binary else binary, zoomin, from_page, to_page)
        callback__((min(to_page, self.total_page) - from_page) / self.total_page,
                   "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)), callback)
        assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(len(self.boxes), len(self.page_images))
        #################### More precisely ###################
        # self._layouts_paddle(zoomin)
        # self._text_merge()
        # pages = {}
        # for b in self.boxes:
        #     if self.__garbage(b["text"]):continue
        #     if b["page_number"] not in pages: pages[b["page_number"]] = []
        #     pages[b["page_number"]].append(b["text"])
        # for i, lines in pages.items():
        #     res.append(("\n".join(lines), self.page_images[i-1]))
        # return res
        ########################################

        res = []
        for page_boxes, page_image in zip(self.boxes, self.page_images):
            kept = [b["text"] for b in page_boxes if not self.__garbage(b["text"])]
            res.append(("\n".join(kept), page_image))
        return res
| 96 | 
            +
             | 
| 97 | 
            +
             | 
def chunk(filename, binary=None,  from_page=0, to_page=100000, callback=None):
    """Chunk a presentation (.pptx/.ppt/.pdf) into one document per slide/page.

    Each returned dict carries the tokenized title/content plus the slide
    or page image.  Unsupported formats report failure through *callback*
    and return None (unchanged behavior).
    """
    doc = {
        "docnm_kwd": filename,
        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])

    def _wrap(txt, img):
        # One ES document per slide/page: copy the shared metadata and
        # attach the tokenized content and the rendered image.
        d = copy.deepcopy(doc)
        d["content_ltks"] = huqie.qie(txt)
        d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
        d["image"] = img
        return d

    res = []
    if re.search(r"\.pptx?$", filename, re.IGNORECASE):
        for txt, img in Ppt()(filename if not binary else binary, from_page, to_page, callback):
            res.append(_wrap(txt, img))
        return res
    if re.search(r"\.pdf$", filename, re.IGNORECASE):
        for txt, img in Pdf()(filename if not binary else binary, from_page=from_page,
                              to_page=to_page, callback=callback):
            res.append(_wrap(txt, img))
        return res
    callback__(-1, "This kind of presentation document did not support yet!", callback)
| 122 | 
            +
             | 
| 123 | 
            +
             | 
| 124 | 
            +
            if __name__== "__main__":
         | 
| 125 | 
            +
                import sys
         | 
| 126 | 
            +
                print(chunk(sys.argv[1]))
         | 
| 127 | 
            +
             | 
    	
        rag/nlp/huchunk.py
    CHANGED
    
    | @@ -352,11 +352,6 @@ class ExcelChunker(HuChunker): | |
| 352 |  | 
| 353 | 
             
            class PptChunker(HuChunker):
         | 
| 354 |  | 
| 355 | 
            -
                @dataclass
         | 
| 356 | 
            -
                class Fields:
         | 
| 357 | 
            -
                    text_chunks: List = None
         | 
| 358 | 
            -
                    table_chunks: List = None
         | 
| 359 | 
            -
             | 
| 360 | 
             
                def __init__(self):
         | 
| 361 | 
             
                    super().__init__()
         | 
| 362 |  | 
|  | |
| 352 |  | 
| 353 | 
             
            class PptChunker(HuChunker):
         | 
| 354 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 355 | 
             
                def __init__(self):
         | 
| 356 | 
             
                    super().__init__()
         | 
| 357 |  | 
    	
        rag/parser/pdf_parser.py
    CHANGED
    
    | @@ -370,7 +370,7 @@ class HuParser: | |
| 370 | 
             
                        res.append(lts)
         | 
| 371 | 
             
                    return res
         | 
| 372 |  | 
| 373 | 
            -
                def  | 
| 374 | 
             
                    logging.info("Table processing...")
         | 
| 375 | 
             
                    imgs, pos = [], []
         | 
| 376 | 
             
                    tbcnt = [0]
         | 
| @@ -416,6 +416,50 @@ class HuParser: | |
| 416 | 
             
                                pg.append(it)
         | 
| 417 | 
             
                        self.tb_cpns.extend(pg)
         | 
| 418 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 419 | 
             
                def __ocr_paddle(self, pagenum, img, chars, ZM=3):
         | 
| 420 | 
             
                    bxs = self.ocr.ocr(np.array(img), cls=True)[0]
         | 
| 421 | 
             
                    if not bxs:
         | 
| @@ -453,7 +497,7 @@ class HuParser: | |
| 453 |  | 
| 454 | 
             
                    self.boxes.append(bxs)
         | 
| 455 |  | 
| 456 | 
            -
                def  | 
| 457 | 
             
                    assert len(self.page_images) == len(self.boxes)
         | 
| 458 | 
             
                    # Tag layout type
         | 
| 459 | 
             
                    boxes = []
         | 
| @@ -524,7 +568,24 @@ class HuParser: | |
| 524 |  | 
| 525 | 
             
                    self.boxes = boxes
         | 
| 526 |  | 
| 527 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 528 | 
             
                    # merge adjusted boxes
         | 
| 529 | 
             
                    bxs = self.boxes
         | 
| 530 |  | 
| @@ -537,6 +598,7 @@ class HuParser: | |
| 537 | 
             
                        tt = b.get("text", "").strip()
         | 
| 538 | 
             
                        return tt and any([tt.find(t.strip()) == 0 for t in txts])
         | 
| 539 |  | 
|  | |
| 540 | 
             
                    i = 0
         | 
| 541 | 
             
                    while i < len(bxs) - 1:
         | 
| 542 | 
             
                        b = bxs[i]
         | 
| @@ -567,7 +629,8 @@ class HuParser: | |
| 567 | 
             
                        i += 1
         | 
| 568 | 
             
                    self.boxes = bxs
         | 
| 569 |  | 
| 570 | 
            -
             | 
|  | |
| 571 | 
             
                    for i in range(len(self.boxes)):
         | 
| 572 | 
             
                        mh = self.mean_height[self.boxes[i]["page_number"] - 1]
         | 
| 573 | 
             
                        self.boxes[i]["in_row"] = 0
         | 
| @@ -583,49 +646,6 @@ class HuParser: | |
| 583 | 
             
                                break
         | 
| 584 | 
             
                            j += 1
         | 
| 585 |  | 
| 586 | 
            -
                    def gather(kwd, fzy=10, ption=0.6):
         | 
| 587 | 
            -
                        eles = self.sort_Y_firstly(
         | 
| 588 | 
            -
                            [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
         | 
| 589 | 
            -
                        eles = self.__layouts_cleanup(self.boxes, eles, 5, ption)
         | 
| 590 | 
            -
                        return self.sort_Y_firstly(eles, 0)
         | 
| 591 | 
            -
             | 
| 592 | 
            -
                    headers = gather(r".*header$")
         | 
| 593 | 
            -
                    rows = gather(r".* (row|header)")
         | 
| 594 | 
            -
                    spans = gather(r".*spanning")
         | 
| 595 | 
            -
                    clmns = sorted([r for r in self.tb_cpns if re.match(
         | 
| 596 | 
            -
                        r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
         | 
| 597 | 
            -
                    clmns = self.__layouts_cleanup(self.boxes, clmns, 5, 0.5)
         | 
| 598 | 
            -
                    for b in self.boxes:
         | 
| 599 | 
            -
                        if b.get("layout_type", "") != "table":
         | 
| 600 | 
            -
                            continue
         | 
| 601 | 
            -
                        ii = self.__find_overlapped_with_threashold(b, rows, thr=0.3)
         | 
| 602 | 
            -
                        if ii is not None:
         | 
| 603 | 
            -
                            b["R"] = ii
         | 
| 604 | 
            -
                            b["R_top"] = rows[ii]["top"]
         | 
| 605 | 
            -
                            b["R_bott"] = rows[ii]["bottom"]
         | 
| 606 | 
            -
             | 
| 607 | 
            -
                        ii = self.__find_overlapped_with_threashold(b, headers, thr=0.3)
         | 
| 608 | 
            -
                        if ii is not None:
         | 
| 609 | 
            -
                            b["H_top"] = headers[ii]["top"]
         | 
| 610 | 
            -
                            b["H_bott"] = headers[ii]["bottom"]
         | 
| 611 | 
            -
                            b["H_left"] = headers[ii]["x0"]
         | 
| 612 | 
            -
                            b["H_right"] = headers[ii]["x1"]
         | 
| 613 | 
            -
                            b["H"] = ii
         | 
| 614 | 
            -
             | 
| 615 | 
            -
                        ii = self.__find_overlapped_with_threashold(b, clmns, thr=0.3)
         | 
| 616 | 
            -
                        if ii is not None:
         | 
| 617 | 
            -
                            b["C"] = ii
         | 
| 618 | 
            -
                            b["C_left"] = clmns[ii]["x0"]
         | 
| 619 | 
            -
                            b["C_right"] = clmns[ii]["x1"]
         | 
| 620 | 
            -
             | 
| 621 | 
            -
                        ii = self.__find_overlapped_with_threashold(b, spans, thr=0.3)
         | 
| 622 | 
            -
                        if ii is not None:
         | 
| 623 | 
            -
                            b["H_top"] = spans[ii]["top"]
         | 
| 624 | 
            -
                            b["H_bott"] = spans[ii]["bottom"]
         | 
| 625 | 
            -
                            b["H_left"] = spans[ii]["x0"]
         | 
| 626 | 
            -
                            b["H_right"] = spans[ii]["x1"]
         | 
| 627 | 
            -
                            b["SP"] = ii
         | 
| 628 | 
            -
             | 
| 629 | 
             
                    # concat between rows
         | 
| 630 | 
             
                    boxes = deepcopy(self.boxes)
         | 
| 631 | 
             
                    blocks = []
         | 
| @@ -633,8 +653,6 @@ class HuParser: | |
| 633 | 
             
                        chunks = []
         | 
| 634 |  | 
| 635 | 
             
                        def dfs(up, dp):
         | 
| 636 | 
            -
                            if not up["text"].strip() or up["text"].strip() in garbage:
         | 
| 637 | 
            -
                                return
         | 
| 638 | 
             
                            chunks.append(up)
         | 
| 639 | 
             
                            i = dp
         | 
| 640 | 
             
                            while i < min(dp + 12, len(boxes)):
         | 
| @@ -658,8 +676,7 @@ class HuParser: | |
| 658 | 
             
                                    i += 1
         | 
| 659 | 
             
                                    continue
         | 
| 660 |  | 
| 661 | 
            -
                                if not down["text"].strip() | 
| 662 | 
            -
                                        or down["text"].strip() in garbage:
         | 
| 663 | 
             
                                    i += 1
         | 
| 664 | 
             
                                    continue
         | 
| 665 |  | 
| @@ -1444,18 +1461,19 @@ class HuParser: | |
| 1444 | 
             
                            return j
         | 
| 1445 | 
             
                    return
         | 
| 1446 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1447 | 
             
                def __filterout_scraps(self, boxes, ZM):
         | 
| 1448 | 
            -
                    def line_tag(bx):
         | 
| 1449 | 
            -
                        pn = [bx["page_number"]]
         | 
| 1450 | 
            -
                        top = bx["top"] - self.page_cum_height[pn[0] - 1]
         | 
| 1451 | 
            -
                        bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
         | 
| 1452 | 
            -
                        while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
         | 
| 1453 | 
            -
                            bott -= self.page_images[pn[-1] - 1].size[1] / ZM
         | 
| 1454 | 
            -
                            pn.append(pn[-1] + 1)
         | 
| 1455 | 
            -
             | 
| 1456 | 
            -
                        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
         | 
| 1457 | 
            -
                            .format("-".join([str(p) for p in pn]),
         | 
| 1458 | 
            -
                                    bx["x0"], bx["x1"], top, bott)
         | 
| 1459 |  | 
| 1460 | 
             
                    def width(b):
         | 
| 1461 | 
             
                        return b["x1"] - b["x0"]
         | 
| @@ -1520,14 +1538,14 @@ class HuParser: | |
| 1520 | 
             
                        boxes.pop(0)
         | 
| 1521 | 
             
                        mw = np.mean(widths)
         | 
| 1522 | 
             
                        if mj or mw / pw >= 0.35 or mw > 200:
         | 
| 1523 | 
            -
                            res.append("\n".join([c["text"] +  | 
| 1524 | 
             
                        else:
         | 
| 1525 | 
             
                            logging.debug("REMOVED: " +
         | 
| 1526 | 
             
                                          "<<".join([c["text"] for c in lines]))
         | 
| 1527 |  | 
| 1528 | 
             
                    return "\n\n".join(res)
         | 
| 1529 |  | 
| 1530 | 
            -
                def  | 
| 1531 | 
             
                    self.lefted_chars = []
         | 
| 1532 | 
             
                    self.mean_height = []
         | 
| 1533 | 
             
                    self.mean_width = []
         | 
| @@ -1537,22 +1555,25 @@ class HuParser: | |
| 1537 | 
             
                    self.page_layout = []
         | 
| 1538 | 
             
                    try:
         | 
| 1539 | 
             
                        self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
         | 
| 1540 | 
            -
                        self.page_images = [p.to_image(resolution=72*zoomin).annotated for i,p in | 
| 1541 | 
            -
             | 
|  | |
|  | |
|  | |
| 1542 | 
             
                    except Exception as e:
         | 
| 1543 | 
             
                        self.pdf = fitz.open(fnm) if isinstance(fnm, str) else fitz.open(stream=fnm, filetype="pdf")
         | 
| 1544 | 
             
                        self.page_images = []
         | 
| 1545 | 
             
                        self.page_chars = []
         | 
| 1546 | 
             
                        mat = fitz.Matrix(zoomin, zoomin)
         | 
| 1547 | 
            -
                         | 
| 1548 | 
            -
             | 
|  | |
| 1549 | 
             
                            img = Image.frombytes("RGB", [pix.width, pix.height],
         | 
| 1550 | 
             
                                                  pix.samples)
         | 
| 1551 | 
             
                            self.page_images.append(img)
         | 
| 1552 | 
             
                            self.page_chars.append([])
         | 
| 1553 |  | 
| 1554 | 
             
                    logging.info("Images converted.")
         | 
| 1555 | 
            -
             | 
| 1556 | 
             
                    for i, img in enumerate(self.page_images):
         | 
| 1557 | 
             
                        chars = self.page_chars[i]
         | 
| 1558 | 
             
                        self.mean_height.append(
         | 
| @@ -1561,40 +1582,26 @@ class HuParser: | |
| 1561 | 
             
                        self.mean_width.append(
         | 
| 1562 | 
             
                            np.median(sorted([c["width"] for c in chars])) if chars else 8
         | 
| 1563 | 
             
                        )
         | 
| 1564 | 
            -
                         | 
| 1565 | 
            -
             | 
| 1566 | 
            -
             | 
| 1567 | 
            -
             | 
| 1568 | 
            -
             | 
| 1569 | 
            -
             | 
|  | |
| 1570 | 
             
                        self.__ocr_paddle(i + 1, img, chars, zoomin)
         | 
| 1571 | 
            -
                    self.__layouts_paddle(zoomin)
         | 
| 1572 |  | 
| 1573 | 
             
                    self.page_cum_height = np.cumsum(self.page_cum_height)
         | 
| 1574 | 
            -
                    assert len(self.page_cum_height) == len(self.page_images)
         | 
| 1575 |  | 
| 1576 | 
            -
             | 
| 1577 | 
            -
                     | 
| 1578 | 
            -
             | 
| 1579 | 
            -
             | 
| 1580 | 
            -
             | 
| 1581 | 
            -
             | 
| 1582 | 
            -
             | 
| 1583 | 
            -
                    logging.debug("GARBAGE:" + ",".join(garbage))
         | 
| 1584 | 
            -
                    self.boxes = [b for b in self.boxes if b["text"] not in garbage]
         | 
| 1585 | 
            -
             | 
| 1586 | 
            -
                    # cumlative Y
         | 
| 1587 | 
            -
                    for i in range(len(self.boxes)):
         | 
| 1588 | 
            -
                        self.boxes[i]["top"] += \
         | 
| 1589 | 
            -
                            self.page_cum_height[self.boxes[i]["page_number"] - 1]
         | 
| 1590 | 
            -
                        self.boxes[i]["bottom"] += \
         | 
| 1591 | 
            -
                            self.page_cum_height[self.boxes[i]["page_number"] - 1]
         | 
| 1592 | 
            -
             | 
| 1593 | 
            -
                    self.__table_transformer_job(zoomin)
         | 
| 1594 | 
            -
                    self.__text_merge(garbage)
         | 
| 1595 | 
             
                    self.__filter_forpages()
         | 
| 1596 | 
             
                    tbls = self.__extract_table_figure(need_image, zoomin, return_html)
         | 
| 1597 | 
            -
             | 
| 1598 | 
             
                    return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
         | 
| 1599 |  | 
| 1600 | 
             
                def remove_tag(self, txt):
         | 
|  | |
| 370 | 
             
                        res.append(lts)
         | 
| 371 | 
             
                    return res
         | 
| 372 |  | 
| 373 | 
            +
                def _table_transformer_job(self, ZM):
         | 
| 374 | 
             
                    logging.info("Table processing...")
         | 
| 375 | 
             
                    imgs, pos = [], []
         | 
| 376 | 
             
                    tbcnt = [0]
         | 
|  | |
| 416 | 
             
                                pg.append(it)
         | 
| 417 | 
             
                        self.tb_cpns.extend(pg)
         | 
| 418 |  | 
| 419 | 
            +
                    def gather(kwd, fzy=10, ption=0.6):
         | 
| 420 | 
            +
                        eles = self.sort_Y_firstly(
         | 
| 421 | 
            +
                            [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
         | 
| 422 | 
            +
                        eles = self.__layouts_cleanup(self.boxes, eles, 5, ption)
         | 
| 423 | 
            +
                        return self.sort_Y_firstly(eles, 0)
         | 
| 424 | 
            +
             | 
| 425 | 
            +
                    # add R,H,C,SP tag to boxes within table layout
         | 
| 426 | 
            +
                    headers = gather(r".*header$")
         | 
| 427 | 
            +
                    rows = gather(r".* (row|header)")
         | 
| 428 | 
            +
                    spans = gather(r".*spanning")
         | 
| 429 | 
            +
                    clmns = sorted([r for r in self.tb_cpns if re.match(
         | 
| 430 | 
            +
                        r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
         | 
| 431 | 
            +
                    clmns = self.__layouts_cleanup(self.boxes, clmns, 5, 0.5)
         | 
| 432 | 
            +
                    for b in self.boxes:
         | 
| 433 | 
            +
                        if b.get("layout_type", "") != "table":
         | 
| 434 | 
            +
                            continue
         | 
| 435 | 
            +
                        ii = self.__find_overlapped_with_threashold(b, rows, thr=0.3)
         | 
| 436 | 
            +
                        if ii is not None:
         | 
| 437 | 
            +
                            b["R"] = ii
         | 
| 438 | 
            +
                            b["R_top"] = rows[ii]["top"]
         | 
| 439 | 
            +
                            b["R_bott"] = rows[ii]["bottom"]
         | 
| 440 | 
            +
             | 
| 441 | 
            +
                        ii = self.__find_overlapped_with_threashold(b, headers, thr=0.3)
         | 
| 442 | 
            +
                        if ii is not None:
         | 
| 443 | 
            +
                            b["H_top"] = headers[ii]["top"]
         | 
| 444 | 
            +
                            b["H_bott"] = headers[ii]["bottom"]
         | 
| 445 | 
            +
                            b["H_left"] = headers[ii]["x0"]
         | 
| 446 | 
            +
                            b["H_right"] = headers[ii]["x1"]
         | 
| 447 | 
            +
                            b["H"] = ii
         | 
| 448 | 
            +
             | 
| 449 | 
            +
                        ii = self.__find_overlapped_with_threashold(b, clmns, thr=0.3)
         | 
| 450 | 
            +
                        if ii is not None:
         | 
| 451 | 
            +
                            b["C"] = ii
         | 
| 452 | 
            +
                            b["C_left"] = clmns[ii]["x0"]
         | 
| 453 | 
            +
                            b["C_right"] = clmns[ii]["x1"]
         | 
| 454 | 
            +
             | 
| 455 | 
            +
                        ii = self.__find_overlapped_with_threashold(b, spans, thr=0.3)
         | 
| 456 | 
            +
                        if ii is not None:
         | 
| 457 | 
            +
                            b["H_top"] = spans[ii]["top"]
         | 
| 458 | 
            +
                            b["H_bott"] = spans[ii]["bottom"]
         | 
| 459 | 
            +
                            b["H_left"] = spans[ii]["x0"]
         | 
| 460 | 
            +
                            b["H_right"] = spans[ii]["x1"]
         | 
| 461 | 
            +
                            b["SP"] = ii
         | 
| 462 | 
            +
             | 
| 463 | 
             
                def __ocr_paddle(self, pagenum, img, chars, ZM=3):
         | 
| 464 | 
             
                    bxs = self.ocr.ocr(np.array(img), cls=True)[0]
         | 
| 465 | 
             
                    if not bxs:
         | 
|  | |
| 497 |  | 
| 498 | 
             
                    self.boxes.append(bxs)
         | 
| 499 |  | 
| 500 | 
            +
                def _layouts_paddle(self, ZM):
         | 
| 501 | 
             
                    assert len(self.page_images) == len(self.boxes)
         | 
| 502 | 
             
                    # Tag layout type
         | 
| 503 | 
             
                    boxes = []
         | 
|  | |
| 568 |  | 
| 569 | 
             
                    self.boxes = boxes
         | 
| 570 |  | 
| 571 | 
            +
                    garbage = set()
         | 
| 572 | 
            +
                    for k in self.garbages.keys():
         | 
| 573 | 
            +
                        self.garbages[k] = Counter(self.garbages[k])
         | 
| 574 | 
            +
                        for g, c in self.garbages[k].items():
         | 
| 575 | 
            +
                            if c > 1:
         | 
| 576 | 
            +
                                garbage.add(g)
         | 
| 577 | 
            +
             | 
| 578 | 
            +
                    logging.debug("GARBAGE:" + ",".join(garbage))
         | 
| 579 | 
            +
                    self.boxes = [b for b in self.boxes if b["text"].strip() not in garbage]
         | 
| 580 | 
            +
             | 
| 581 | 
            +
                    # cumlative Y
         | 
| 582 | 
            +
                    for i in range(len(self.boxes)):
         | 
| 583 | 
            +
                        self.boxes[i]["top"] += \
         | 
| 584 | 
            +
                            self.page_cum_height[self.boxes[i]["page_number"] - 1]
         | 
| 585 | 
            +
                        self.boxes[i]["bottom"] += \
         | 
| 586 | 
            +
                            self.page_cum_height[self.boxes[i]["page_number"] - 1]
         | 
| 587 | 
            +
             | 
| 588 | 
            +
                def _text_merge(self):
         | 
| 589 | 
             
                    # merge adjusted boxes
         | 
| 590 | 
             
                    bxs = self.boxes
         | 
| 591 |  | 
|  | |
| 598 | 
             
                        tt = b.get("text", "").strip()
         | 
| 599 | 
             
                        return tt and any([tt.find(t.strip()) == 0 for t in txts])
         | 
| 600 |  | 
| 601 | 
            +
                    # horizontally merge adjacent box with the same layout
         | 
| 602 | 
             
                    i = 0
         | 
| 603 | 
             
                    while i < len(bxs) - 1:
         | 
| 604 | 
             
                        b = bxs[i]
         | 
|  | |
| 629 | 
             
                        i += 1
         | 
| 630 | 
             
                    self.boxes = bxs
         | 
| 631 |  | 
| 632 | 
            +
                def _concat_downward(self):
         | 
| 633 | 
            +
                    # count boxes in the same row as a feature
         | 
| 634 | 
             
                    for i in range(len(self.boxes)):
         | 
| 635 | 
             
                        mh = self.mean_height[self.boxes[i]["page_number"] - 1]
         | 
| 636 | 
             
                        self.boxes[i]["in_row"] = 0
         | 
|  | |
| 646 | 
             
                                break
         | 
| 647 | 
             
                            j += 1
         | 
| 648 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 649 | 
             
                    # concat between rows
         | 
| 650 | 
             
                    boxes = deepcopy(self.boxes)
         | 
| 651 | 
             
                    blocks = []
         | 
|  | |
| 653 | 
             
                        chunks = []
         | 
| 654 |  | 
| 655 | 
             
                        def dfs(up, dp):
         | 
|  | |
|  | |
| 656 | 
             
                            chunks.append(up)
         | 
| 657 | 
             
                            i = dp
         | 
| 658 | 
             
                            while i < min(dp + 12, len(boxes)):
         | 
|  | |
| 676 | 
             
                                    i += 1
         | 
| 677 | 
             
                                    continue
         | 
| 678 |  | 
| 679 | 
            +
                                if not down["text"].strip():
         | 
|  | |
| 680 | 
             
                                    i += 1
         | 
| 681 | 
             
                                    continue
         | 
| 682 |  | 
|  | |
| 1461 | 
             
                            return j
         | 
| 1462 | 
             
                    return
         | 
| 1463 |  | 
| 1464 | 
            +
                def _line_tag(self, bx, ZM):
         | 
| 1465 | 
            +
                    pn = [bx["page_number"]]
         | 
| 1466 | 
            +
                    top = bx["top"] - self.page_cum_height[pn[0] - 1]
         | 
| 1467 | 
            +
                    bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
         | 
| 1468 | 
            +
                    while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
         | 
| 1469 | 
            +
                        bott -= self.page_images[pn[-1] - 1].size[1] / ZM
         | 
| 1470 | 
            +
                        pn.append(pn[-1] + 1)
         | 
| 1471 | 
            +
             | 
| 1472 | 
            +
                    return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
         | 
| 1473 | 
            +
                        .format("-".join([str(p) for p in pn]),
         | 
| 1474 | 
            +
                                bx["x0"], bx["x1"], top, bott)
         | 
| 1475 | 
            +
             | 
| 1476 | 
             
                def __filterout_scraps(self, boxes, ZM):
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1477 |  | 
| 1478 | 
             
                    def width(b):
         | 
| 1479 | 
             
                        return b["x1"] - b["x0"]
         | 
|  | |
| 1538 | 
             
                        boxes.pop(0)
         | 
| 1539 | 
             
                        mw = np.mean(widths)
         | 
| 1540 | 
             
                        if mj or mw / pw >= 0.35 or mw > 200:
         | 
| 1541 | 
            +
                            res.append("\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
         | 
| 1542 | 
             
                        else:
         | 
| 1543 | 
             
                            logging.debug("REMOVED: " +
         | 
| 1544 | 
             
                                          "<<".join([c["text"] for c in lines]))
         | 
| 1545 |  | 
| 1546 | 
             
                    return "\n\n".join(res)
         | 
| 1547 |  | 
| 1548 | 
            +
                def __images__(self, fnm, zoomin=3, page_from=0, page_to=299):
         | 
| 1549 | 
             
                    self.lefted_chars = []
         | 
| 1550 | 
             
                    self.mean_height = []
         | 
| 1551 | 
             
                    self.mean_width = []
         | 
|  | |
| 1555 | 
             
                    self.page_layout = []
         | 
| 1556 | 
             
                    try:
         | 
| 1557 | 
             
                        self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
         | 
| 1558 | 
            +
                        self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
         | 
| 1559 | 
            +
                                            enumerate(self.pdf.pages[page_from:page_to])]
         | 
| 1560 | 
            +
                        self.page_chars = [[c for c in self.pdf.pages[i].chars if self._has_color(c)] for i in
         | 
| 1561 | 
            +
                                           range(len(self.page_images))]
         | 
| 1562 | 
            +
                        self.total_page = len(self.pdf.pages)
         | 
| 1563 | 
             
                    except Exception as e:
         | 
| 1564 | 
             
                        self.pdf = fitz.open(fnm) if isinstance(fnm, str) else fitz.open(stream=fnm, filetype="pdf")
         | 
| 1565 | 
             
                        self.page_images = []
         | 
| 1566 | 
             
                        self.page_chars = []
         | 
| 1567 | 
             
                        mat = fitz.Matrix(zoomin, zoomin)
         | 
| 1568 | 
            +
                        self.total_page = len(self.pdf)
         | 
| 1569 | 
            +
                        for page in self.pdf[page_from:page_to]:
         | 
| 1570 | 
            +
                            pix = page.getPixmap(matrix=mat)
         | 
| 1571 | 
             
                            img = Image.frombytes("RGB", [pix.width, pix.height],
         | 
| 1572 | 
             
                                                  pix.samples)
         | 
| 1573 | 
             
                            self.page_images.append(img)
         | 
| 1574 | 
             
                            self.page_chars.append([])
         | 
| 1575 |  | 
| 1576 | 
             
                    logging.info("Images converted.")
         | 
|  | |
| 1577 | 
             
                    for i, img in enumerate(self.page_images):
         | 
| 1578 | 
             
                        chars = self.page_chars[i]
         | 
| 1579 | 
             
                        self.mean_height.append(
         | 
|  | |
| 1582 | 
             
                        self.mean_width.append(
         | 
| 1583 | 
             
                            np.median(sorted([c["width"] for c in chars])) if chars else 8
         | 
| 1584 | 
             
                        )
         | 
| 1585 | 
            +
                        self.page_cum_height.append(img.size[1] / zoomin)
         | 
| 1586 | 
            +
                        # if i > 0:
         | 
| 1587 | 
            +
                        #     if not chars:
         | 
| 1588 | 
            +
                        #         self.page_cum_height.append(img.size[1] / zoomin)
         | 
| 1589 | 
            +
                        #     else:
         | 
| 1590 | 
            +
                        #         self.page_cum_height.append(
         | 
| 1591 | 
            +
                        #             np.max([c["bottom"] for c in chars]))
         | 
| 1592 | 
             
                        self.__ocr_paddle(i + 1, img, chars, zoomin)
         | 
|  | |
| 1593 |  | 
| 1594 | 
             
                    self.page_cum_height = np.cumsum(self.page_cum_height)
         | 
| 1595 | 
            +
                    assert len(self.page_cum_height) == len(self.page_images)+1
         | 
| 1596 |  | 
| 1597 | 
            +
                def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
         | 
| 1598 | 
            +
                    self.__images__(fnm, zoomin)
         | 
| 1599 | 
            +
                    self._layouts_paddle(zoomin)
         | 
| 1600 | 
            +
                    self._table_transformer_job(zoomin)
         | 
| 1601 | 
            +
                    self._text_merge()
         | 
| 1602 | 
            +
                    self._concat_downward()
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1603 | 
             
                    self.__filter_forpages()
         | 
| 1604 | 
             
                    tbls = self.__extract_table_figure(need_image, zoomin, return_html)
         | 
|  | |
| 1605 | 
             
                    return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
         | 
| 1606 |  | 
| 1607 | 
             
                def remove_tag(self, txt):
         | 
    	
        rag/settings.py
    CHANGED
    
    | @@ -35,3 +35,4 @@ LoggerFactory.LEVEL = 10 | |
| 35 | 
             
            es_logger = getLogger("es")
         | 
| 36 | 
             
            minio_logger = getLogger("minio")
         | 
| 37 | 
             
            cron_logger = getLogger("cron_logger")
         | 
|  | 
|  | |
| 35 | 
             
            es_logger = getLogger("es")
         | 
| 36 | 
             
            minio_logger = getLogger("minio")
         | 
| 37 | 
             
            cron_logger = getLogger("cron_logger")
         | 
| 38 | 
            +
            chunk_logger = getLogger("chunk_logger")
         |