File size: 1,314 Bytes
79fc11d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import layoutparser as lp
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import json

# Module-level layout detector used by analyze_layout(); it is constructed
# once, when this module is imported.
model = lp.Detectron2LayoutModel(
    # Model configuration identifier resolved by layoutparser ("lp://" scheme).
    config_path="lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config",
    # Overrides MODEL.ROI_HEADS.SCORE_THRESH_TEST with 0.8 — presumably the
    # minimum detection score kept at inference; confirm against Detectron2 docs.
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
    # Maps integer class ids to human-readable names — presumably these become
    # the `type` of each detected block; verify against layoutparser docs.
    label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
)

def load_images(uploaded_file):
    """Load the page images contained in an uploaded document.

    Args:
        uploaded_file: Object exposing a ``name`` attribute. It may be a
            readable file-like object (e.g. an in-memory upload) or a
            path-like object such as ``pathlib.Path``.

    Returns:
        A list of PIL images: one per page for a PDF, or a single-element
        list for any other input, which is opened as an image.
    """
    # Compare case-insensitively so "SCAN.PDF" is treated as a PDF too.
    if uploaded_file.name.lower().endswith(".pdf"):
        if hasattr(uploaded_file, "read"):
            # convert_from_path() needs a filesystem path; a file-like upload
            # has to be rendered from its raw bytes instead.
            from pdf2image import convert_from_bytes

            return convert_from_bytes(uploaded_file.read())
        # Path-like input: keep the original path-based conversion.
        return convert_from_path(uploaded_file)
    return [Image.open(uploaded_file)]

def analyze_layout(image):
    """Run the module-level layout model on *image* and return its result."""
    return model.detect(image)

def extract_text_from_blocks(image, layout):
    """OCR every block of *layout* and describe it as a plain dict.

    Args:
        image: The page image the layout was detected on.
        layout: Iterable of layout blocks; each must provide
            ``crop_image(image)``, ``type`` and ``coordinates``.

    Returns:
        A list with one dict per block, in layout order, holding the keys
        ``"type"``, ``"text"`` (OCR output, whitespace-stripped) and
        ``"coords"``.
    """

    def _describe(region):
        # Crop the block out of the page and OCR just that region.
        ocr_output = pytesseract.image_to_string(region.crop_image(image))
        return {
            "type": region.type,
            "text": ocr_output.strip(),
            "coords": region.coordinates,
        }

    return [_describe(region) for region in layout]

def rule_based_kv_extraction(blocks):
    """Pick invoice fields out of OCR blocks using keyword rules.

    Each block's text is matched case-insensitively against the rules in
    order, and only the first matching rule applies to that block. When
    several blocks match the same field, the last one wins.

    Args:
        blocks: Iterable of dicts that each carry a ``"text"`` string.

    Returns:
        A dict mapping any of ``"Invoice Number"``, ``"Total Amount"`` and
        ``"Customer Name"`` to the full original text of the matching block.
    """
    # (field label, keywords that must ALL occur in the lower-cased text);
    # the order encodes the precedence between rules.
    rules = (
        ("Invoice Number", ("invoice", "number")),
        ("Total Amount", ("total",)),
        ("Customer Name", ("customer",)),
    )

    extracted = {}
    for entry in blocks:
        raw_text = entry["text"]
        lowered = raw_text.lower()
        field = next(
            (
                label
                for label, keywords in rules
                if all(word in lowered for word in keywords)
            ),
            None,
        )
        if field is not None:
            extracted[field] = raw_text
    return extracted