Midterm_Task_5 / layout_extractor.py
resolverkatla's picture
Update
79fc11d
raw
history blame
1.25 kB
import layoutparser as lp
import pytesseract
import json
from pdf2image import convert_from_path
from PIL import Image
def convert_pdf_to_images(pdf_path):
    """Render the PDF at *pdf_path* into images.

    Thin wrapper that delegates to ``pdf2image.convert_from_path`` with its
    default settings and hands back whatever that call returns.
    """
    page_images = convert_from_path(pdf_path)
    return page_images
# Lazily-built detector shared by every analyze_layout() call.
_LAYOUT_MODEL = None


def _get_layout_model():
    """Return the shared PubLayNet layout detector, building it on first use."""
    global _LAYOUT_MODEL
    if _LAYOUT_MODEL is None:
        _LAYOUT_MODEL = lp.Detectron2LayoutModel(
            config_path='lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
            # Drop detections scoring below 0.8.
            extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
            label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
        )
    return _LAYOUT_MODEL


def analyze_layout(image):
    """Detect layout regions in *image* with the PubLayNet Faster R-CNN model.

    The detector is constructed once and reused on later calls, so the model
    is not rebuilt for every page of a multi-page document. The cache is a
    plain module global and is not guarded by a lock.

    Args:
        image: The page image handed straight to the model's ``detect``.

    Returns:
        The layout object produced by ``model.detect(image)``; its elements
        are labelled via the map Text / Title / List / Table / Figure.
    """
    return _get_layout_model().detect(image)
def extract_text_from_blocks(image, layout):
    """OCR every region of *layout* and return one record per region.

    Args:
        image: The page image each region is cropped from.
        layout: Iterable of detected regions; each must provide
            ``crop_image(image)``, ``type`` and ``coordinates``.

    Returns:
        A list of dicts, in layout order, with keys ``"type"``, ``"text"``
        (the OCR output with surrounding whitespace stripped) and
        ``"coordinates"``.
    """

    def describe(region):
        # Crop first, then OCR only the cropped region.
        cropped = region.crop_image(image)
        recognized = pytesseract.image_to_string(cropped)
        return {
            "type": region.type,
            "text": recognized.strip(),
            "coordinates": region.coordinates,
        }

    return [describe(region) for region in layout]
def extract_key_values(blocks):
    """Pick out invoice, total and customer fields from OCR'd blocks.

    Each block's ``"text"`` is searched case-insensitively for the keywords
    "invoice", "total" and "customer", in that priority order. The first
    keyword found decides which field receives the block's full text. When
    several blocks match the same field, the last one wins.

    Args:
        blocks: Iterable of dicts that each carry a ``"text"`` string.

    Returns:
        A dict containing any of ``"invoice_number"``, ``"total_amount"``
        and ``"customer_name"`` that were matched.
    """
    # Order matters: earlier keywords take precedence within a block.
    keyword_fields = (
        ("invoice", "invoice_number"),
        ("total", "total_amount"),
        ("customer", "customer_name"),
    )
    fields = {}
    for entry in blocks:
        content = entry["text"]
        lowered = content.lower()
        for keyword, field_name in keyword_fields:
            if keyword in lowered:
                fields[field_name] = content
                break
    return fields