Spaces:
Sleeping
Sleeping
import layoutparser as lp | |
import pytesseract | |
from pdf2image import convert_from_path | |
from PIL import Image | |
import json | |
# Settings for the shared layout detector, built once at import time.
_LAYOUT_CONFIG_PATH = "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config"
# Key/value override forwarded to the detector config; presumably the minimum
# confidence a detection needs to be kept -- confirm against Detectron2 docs.
_LAYOUT_EXTRA_CONFIG = ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8]
# Maps the detector's integer class ids to human-readable block types.
_LAYOUT_LABEL_MAP = {0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}

model = lp.Detectron2LayoutModel(
    config_path=_LAYOUT_CONFIG_PATH,
    extra_config=_LAYOUT_EXTRA_CONFIG,
    label_map=_LAYOUT_LABEL_MAP,
)
def load_images(uploaded_file):
    """Load an uploaded document as a list of images.

    A source whose name ends in ``.pdf`` (any letter case) is rasterised with
    pdf2image; anything else is opened as a single image with PIL.

    Args:
        uploaded_file: Either a filesystem path (``str`` or ``os.PathLike``)
            or a file-like upload object exposing a ``.name`` attribute.

    Returns:
        A list of images: the pages produced by pdf2image for a PDF, or a
        one-element list holding the opened image otherwise.
    """
    import os

    is_path = isinstance(uploaded_file, (str, os.PathLike))
    name = os.fsdecode(uploaded_file) if is_path else uploaded_file.name

    # Case-insensitive so that "SCAN.PDF" is not mistaken for an image.
    if not name.lower().endswith(".pdf"):
        return [Image.open(uploaded_file)]

    if is_path:
        return convert_from_path(uploaded_file)

    # convert_from_path needs a path, not the upload object itself. When the
    # object's name points at a real file (temp-file style uploads), use it.
    if os.path.isfile(name):
        return convert_from_path(name)

    # Otherwise the upload only exists in memory: rasterise from its bytes.
    from pdf2image import convert_from_bytes

    return convert_from_bytes(uploaded_file.read())
def analyze_layout(image):
    """Run the module-level layout ``model`` on *image* and return its result."""
    return model.detect(image)
def extract_text_from_blocks(image, layout):
    """OCR each detected layout block of a page.

    Args:
        image: The page image, as a PIL image or an array.
        layout: Iterable of layout blocks; each must provide
            ``crop_image``, ``type`` and ``coordinates``.

    Returns:
        A list with one dict per block, in layout order, with keys
        ``"type"``, ``"text"`` (OCR output with surrounding whitespace
        stripped) and ``"coords"`` (the block's ``coordinates``).
    """
    import numpy as np

    # layoutparser's crop_image slices its argument with array indexing, which
    # a PIL image does not support; convert once, outside the loop. An input
    # that is already an array passes through unchanged.
    pixels = np.asarray(image)

    blocks = []
    for block in layout:
        cropped = block.crop_image(pixels)
        text = pytesseract.image_to_string(cropped)
        blocks.append(
            {
                "type": block.type,
                "text": text.strip(),
                "coords": block.coordinates,
            }
        )
    return blocks
def rule_based_kv_extraction(blocks):
    """Pick key/value fields out of OCR blocks using keyword rules.

    Each block's text is matched case-insensitively against the rules below,
    in order; only the first matching rule applies to a block. The stored
    value is the block's original text, and a later matching block replaces
    an earlier one for the same field.

    Args:
        blocks: Iterable of dicts, each with a ``"text"`` string.

    Returns:
        A dict mapping the matched field names to block text.
    """
    # (field name, keywords that must ALL appear in the lowercased text)
    rules = (
        ("Invoice Number", ("invoice", "number")),
        ("Total Amount", ("total",)),
        ("Customer Name", ("customer",)),
    )

    extracted = {}
    for entry in blocks:
        lowered = entry["text"].lower()
        for field, keywords in rules:
            if all(word in lowered for word in keywords):
                extracted[field] = entry["text"]
                break
    return extracted