Midterm_Task_5 / layout_extractor.py
resolverkatla's picture
Update
4a27fa8
import layoutparser as lp
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
def convert_pdf_to_images(pdf_path):
    """Render the PDF at *pdf_path* into page images.

    Thin wrapper around ``pdf2image.convert_from_path`` using its default
    settings; whatever that call returns is handed back unchanged
    (per pdf2image, a list of PIL images, one per page).

    Args:
        pdf_path: Path to the PDF file to render.

    Returns:
        The result of ``convert_from_path(pdf_path)``.
    """
    page_images = convert_from_path(pdf_path)
    return page_images
# ✅ Use EfficientDet instead of Detectron2 for better compatibility
# Lazily-built model shared by every analyze_layout() call, so the weights
# are loaded once instead of once per page.
_LAYOUT_MODEL = None


def analyze_layout(image):
    """Detect layout regions on a page image with the PubLayNet EfficientDet model.

    The model is created on first use and reused afterwards.

    Args:
        image: The page image to analyse; passed straight to the model's
            ``detect`` method.

    Returns:
        The layout object returned by ``model.detect(image)``.
    """
    global _LAYOUT_MODEL
    if _LAYOUT_MODEL is None:
        _LAYOUT_MODEL = lp.EfficientDetLayoutModel(
            "lp://efficientdet/PubLayNet",
            # EfficientDet takes a dict here; the Detectron2-style
            # ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.6] list is not understood
            # by this backend. This key is the EfficientDet score cut-off.
            extra_config={"output_confidence_threshold": 0.6},
            # The EfficientDet PubLayNet weights use class ids starting at 1
            # (unlike the 0-based Detectron2 map), so the map must match.
            label_map={1: "Text", 2: "Title", 3: "List", 4: "Table", 5: "Figure"},
        )
    return _LAYOUT_MODEL.detect(image)
def extract_text_from_blocks(image, layout):
    """OCR every detected layout block and return one record per block.

    Args:
        image: The page image the layout was detected on. May be a PIL image
            (as produced by ``convert_pdf_to_images``) or an array; it is
            normalised to a NumPy array before cropping.
        layout: Iterable of layout blocks exposing ``crop_image``, ``type``
            and ``coordinates``.

    Returns:
        A list of dicts, in layout order, each with the keys ``"type"``
        (the block's type), ``"text"`` (OCR output with surrounding
        whitespace stripped) and ``"coordinates"`` (the block's coordinates).
    """
    # Local import: numpy is not imported at the top of this file.
    import numpy as np

    # layoutparser crops by array slicing, which a PIL image does not support;
    # np.asarray is a no-op for inputs that are already arrays.
    page = np.asarray(image)

    blocks = []
    for block in layout:
        segment_image = block.crop_image(page)
        text = pytesseract.image_to_string(segment_image)
        blocks.append({
            "type": block.type,
            "text": text.strip(),
            "coordinates": block.coordinates,
        })
    return blocks
def extract_key_values(blocks):
    """Pick out invoice fields from OCR'd blocks by keyword matching.

    Each block's text is lower-cased and checked for the keywords
    "invoice", "total" and "customer", in that order; the first keyword
    found decides which field the block fills. The stored value is the
    block's full original text. When several blocks map to the same field,
    the last one wins.

    Args:
        blocks: Iterable of dicts that each carry a ``"text"`` string.

    Returns:
        A dict containing any of the keys ``"Invoice Number"``,
        ``"Total Amount"`` and ``"Customer Name"`` that were matched.
    """
    # Priority order matters: only the first keyword hit per block is used.
    keyword_to_field = (
        ("invoice", "Invoice Number"),
        ("total", "Total Amount"),
        ("customer", "Customer Name"),
    )

    extracted = {}
    for entry in blocks:
        raw_text = entry["text"]
        lowered = raw_text.lower()
        field = next(
            (name for keyword, name in keyword_to_field if keyword in lowered),
            None,
        )
        if field is not None:
            extracted[field] = raw_text
    return extracted