Midterm_Task_5 / processor.py
resolverkatla's picture
Update
79fc11d
raw
history blame
1.31 kB
import layoutparser as lp
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import json
# Shared layout detector, built once at import time and reused by
# analyze_layout().  The config URI selects the PubLayNet Faster R-CNN
# (R50-FPN, 3x schedule) entry from layoutparser's model catalog.
_LAYOUT_CONFIG_URI = "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config"

# Flat [key, value] override list handed to the detector config; presumably
# drops detections scoring below 0.8 at inference time -- confirm against the
# Detectron2 config reference.
_LAYOUT_CONFIG_OVERRIDES = ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8]

# Class index -> human-readable block type reported on each detected block.
_LAYOUT_LABELS = {
    0: "Text",
    1: "Title",
    2: "List",
    3: "Table",
    4: "Figure",
}

model = lp.Detectron2LayoutModel(
    config_path=_LAYOUT_CONFIG_URI,
    extra_config=_LAYOUT_CONFIG_OVERRIDES,
    label_map=_LAYOUT_LABELS,
)
def load_images(uploaded_file):
    """Load an uploaded document as a list of PIL images.

    PDFs are rasterised with pdf2image (one image per page); anything else
    is opened with ``PIL.Image.open`` and returned as a one-element list.

    Args:
        uploaded_file: A filesystem path (``str`` / ``os.PathLike``) or an
            upload object exposing a ``name`` attribute.  For PDFs, an
            upload object whose ``name`` is not an existing file must also
            provide ``read()`` returning the raw PDF bytes.

    Returns:
        list: The page images for a PDF, or ``[image]`` for a single image.
    """
    import os

    # Upload wrappers carry the filename in ``.name``; a bare path is its
    # own name.  str() guards against non-string names (e.g. descriptors).
    source_name = str(getattr(uploaded_file, "name", uploaded_file))

    # Compare case-insensitively so "SCAN.PDF" is not sent to Image.open.
    if not source_name.lower().endswith(".pdf"):
        return [Image.open(uploaded_file)]

    # convert_from_path needs a real path, not a file object.
    if isinstance(uploaded_file, (str, os.PathLike)):
        return convert_from_path(uploaded_file)
    if os.path.exists(source_name):
        # Temp-file style upload: ``.name`` is the on-disk location.
        return convert_from_path(source_name)

    # In-memory upload: ``.name`` is only a label, so rasterise the bytes.
    from pdf2image import convert_from_bytes

    return convert_from_bytes(uploaded_file.read())
def analyze_layout(image):
    """Run the module-level layout ``model`` on *image*.

    Returns whatever ``model.detect`` produces for the image, unchanged.
    """
    return model.detect(image)
def extract_text_from_blocks(image, layout):
    """OCR every detected layout block of *image*.

    Args:
        image: The page image, either a PIL image or a NumPy array.
        layout: Iterable of detected blocks; each must provide
            ``crop_image``, ``type`` and ``coordinates``.

    Returns:
        list[dict]: One dict per block, in layout order, with keys
        ``"type"`` (block label), ``"text"`` (OCR output with surrounding
        whitespace stripped) and ``"coords"`` (the block's coordinates).
    """
    import numpy as np

    # layoutparser crops via array slicing, which a PIL image does not
    # support.  Convert once up front; an ndarray passes through unchanged.
    pixels = np.asarray(image)

    blocks = []
    for block in layout:
        cropped = block.crop_image(pixels)
        text = pytesseract.image_to_string(cropped)
        blocks.append(
            {
                "type": block.type,
                "text": text.strip(),
                "coords": block.coordinates,
            }
        )
    return blocks
def rule_based_kv_extraction(blocks):
    """Pick out invoice fields from OCR'd blocks using keyword rules.

    Each block's text is lower-cased and tested against the rules in order;
    the first rule whose keywords all occur wins and the block's original
    text is stored under that rule's field name.  A later block matching
    the same field overwrites the earlier value.

    Args:
        blocks: Iterable of dicts, each with a ``"text"`` string.

    Returns:
        dict: Any of ``"Invoice Number"``, ``"Total Amount"`` and
        ``"Customer Name"`` mapped to the full text of the matching block.
    """
    # Order matters: a block is claimed by the first rule it satisfies.
    rules = (
        ("Invoice Number", ("invoice", "number")),
        ("Total Amount", ("total",)),
        ("Customer Name", ("customer",)),
    )

    extracted = {}
    for entry in blocks:
        raw_text = entry["text"]
        lowered = raw_text.lower()
        for field, keywords in rules:
            if all(word in lowered for word in keywords):
                extracted[field] = raw_text
                break
    return extracted