"""Invoice OCR helpers: PDF pages -> layout blocks -> OCR text -> key fields."""

import functools
import json

import layoutparser as lp
import numpy as np
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

# (keyword, output field) pairs, checked in this order for every block.
# The first keyword found in a block decides which single field it fills.
_KEYWORD_FIELDS = (
    ("invoice", "invoice_number"),
    ("total", "total_amount"),
    ("customer", "customer_name"),
)


def convert_pdf_to_images(pdf_path):
    """Render the PDF at *pdf_path* with pdf2image and return the result.

    Args:
        pdf_path: Path of the PDF file, forwarded to ``convert_from_path``.

    Returns:
        Whatever ``pdf2image.convert_from_path`` returns for that path
        (one image per page, per the pdf2image documentation).
    """
    return convert_from_path(pdf_path)


@functools.lru_cache(maxsize=1)
def _load_layout_model():
    """Build the PubLayNet Faster R-CNN layout model once and reuse it.

    Constructing the model is far more expensive than running it, so the
    instance is cached instead of being rebuilt for every page.
    """
    return lp.Detectron2LayoutModel(
        config_path='lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
        # Only keep detections scoring at least 0.8.
        extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
        label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
    )


def analyze_layout(image):
    """Detect layout regions in *image* with the PubLayNet model.

    Args:
        image: Page image passed unchanged to the model's ``detect`` method.

    Returns:
        The layout object returned by ``Detectron2LayoutModel.detect``; its
        blocks are labelled "Text", "Title", "List", "Table" or "Figure".
    """
    return _load_layout_model().detect(image)


def extract_text_from_blocks(image, layout):
    """OCR every block of *layout* and return the results as plain dicts.

    Args:
        image: The page the layout was detected on. A PIL image is converted
            to a NumPy array first; any other value is used as given.
        layout: Iterable of layout blocks exposing ``crop_image``, ``type``
            and ``coordinates``.

    Returns:
        A list with one dict per block, in layout order, with the keys
        ``"type"``, ``"text"`` (OCR output, stripped of surrounding
        whitespace) and ``"coordinates"``.
    """
    # layoutparser documents crop_image as taking a NumPy array, while
    # pdf2image produces PIL images; convert once so both inputs work.
    if isinstance(image, Image.Image):
        image = np.asarray(image)

    blocks = []
    for block in layout:
        segment_image = block.crop_image(image)
        text = pytesseract.image_to_string(segment_image)
        blocks.append({
            "type": block.type,
            "text": text.strip(),
            "coordinates": block.coordinates,
        })
    return blocks


def extract_key_values(blocks):
    """Pick out invoice fields from OCR'd blocks by keyword matching.

    Each block's text is searched case-insensitively for "invoice", then
    "total", then "customer"; the first keyword found decides which field
    the block fills. The field receives the block's entire text, not a
    parsed value. When several blocks match the same field, the last one
    in *blocks* wins.

    Args:
        blocks: Iterable of dicts that each have a ``"text"`` string, such
            as the output of ``extract_text_from_blocks``.

    Returns:
        A dict containing any of ``"invoice_number"``, ``"total_amount"``
        and ``"customer_name"`` that were matched; empty if none were.
    """
    data = {}
    for block in blocks:
        text = block["text"]
        lowered = text.lower()  # lower-case once instead of once per keyword
        for keyword, field in _KEYWORD_FIELDS:
            if keyword in lowered:
                data[field] = text
                break  # at most one field per block, mirroring an elif chain
    return data