"""Layout-aware OCR helpers: detect page regions, OCR them, pull out key fields."""

import json

import layoutparser as lp
import numpy as np
import pytesseract
from pdf2image import convert_from_bytes, convert_from_path
from PIL import Image

# Layout detector built once at import time and shared by analyze_layout().
# The label map translates the model's integer class ids into region names.
model = lp.Detectron2LayoutModel(
    config_path="lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config",
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
    label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
)


def load_images(uploaded_file):
    """Return the page images contained in *uploaded_file* as a list.

    Args:
        uploaded_file: A path-like object or a file-like object. Its ``name``
            attribute (or the value itself, for plain strings) is used to
            decide whether the input is a PDF.

    Returns:
        One image per PDF page, or a single-element list holding the opened
        image for every other input.
    """
    # Plain strings have no ``name``; fall back to the value itself.
    filename = str(getattr(uploaded_file, "name", uploaded_file))

    # Compare case-insensitively so "SCAN.PDF" is still treated as a PDF.
    if not filename.lower().endswith(".pdf"):
        return [Image.open(uploaded_file)]

    # convert_from_path() needs a filesystem path, which an in-memory upload
    # does not have, so file-like objects are rendered from their bytes.
    if hasattr(uploaded_file, "read"):
        return convert_from_bytes(uploaded_file.read())
    return convert_from_path(uploaded_file)


def analyze_layout(image):
    """Run the module-level layout model on *image* and return its detections."""
    return model.detect(image)


def extract_text_from_blocks(image, layout):
    """OCR every detected block of *image*.

    Args:
        image: The page image, either a PIL image or an array.
        layout: Iterable of layout blocks exposing ``crop_image``, ``type``
            and ``coordinates``.

    Returns:
        A list of dicts with the keys ``"type"``, ``"text"`` (whitespace
        stripped) and ``"coords"``, one per block, in layout order.
    """
    # NOTE(review): layoutparser's crop_image() appears to slice its argument
    # like an array, which PIL images do not support, so PIL pages are
    # converted once up front rather than per block.
    if isinstance(image, Image.Image):
        image = np.asarray(image.convert("RGB"))

    blocks = []
    for block in layout:
        region = block.crop_image(image)
        text = pytesseract.image_to_string(region)
        blocks.append(
            {
                "type": block.type,
                "text": text.strip(),
                "coords": block.coordinates,
            }
        )
    return blocks


def rule_based_kv_extraction(blocks):
    """Pick out invoice fields from OCR'd blocks using keyword rules.

    Each block is matched case-insensitively against the rules in order
    (invoice number, then total, then customer); only the first matching rule
    applies to a block. The full block text is stored as the value, and a
    later matching block overwrites an earlier one for the same field.

    Args:
        blocks: Iterable of dicts with a ``"text"`` entry.

    Returns:
        A dict containing whichever of ``"Invoice Number"``,
        ``"Total Amount"`` and ``"Customer Name"`` were found.
    """
    data = {}
    for block in blocks:
        raw_text = block["text"]
        lowered = raw_text.lower()
        if "invoice" in lowered and "number" in lowered:
            data["Invoice Number"] = raw_text
        elif "total" in lowered:
            data["Total Amount"] = raw_text
        elif "customer" in lowered:
            data["Customer Name"] = raw_text
    return data