Midterm_Task_5 / processor.py
resolverkatla's picture
Update
bcf1f17
import layoutparser as lp
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
def load_images(uploaded_file):
    """Load an uploaded document as a list of PIL images.

    PDFs are rasterized into one image per page; any other file is opened
    as a single image and returned in a one-element list.

    Args:
        uploaded_file: Either a path-like object or an uploaded file-like
            object. It must expose a ``name`` attribute, which is used to
            recognize PDFs by their extension (case-insensitively).

    Returns:
        list: The page image(s) of the document.
    """
    if uploaded_file.name.lower().endswith(".pdf"):
        if hasattr(uploaded_file, "read"):
            # convert_from_path only accepts a filesystem path; an in-memory
            # upload has to be handed over as raw bytes instead.
            from pdf2image import convert_from_bytes

            return convert_from_bytes(uploaded_file.read())
        return convert_from_path(uploaded_file)
    return [Image.open(uploaded_file)]
# Lazily created layout model, shared by all calls to analyze_layout so the
# weights are loaded once instead of once per page.
_LAYOUT_MODEL = None


def _get_layout_model():
    """Return the shared PubLayNet EfficientDet model, creating it on first use."""
    global _LAYOUT_MODEL
    if _LAYOUT_MODEL is None:
        _LAYOUT_MODEL = lp.EfficientDetLayoutModel(
            "lp://efficientdet/PubLayNet",
            # EfficientDet models take their options as a dict; the
            # Detectron2-style ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.6]
            # list is not understood by this backend.
            extra_config={"output_confidence_threshold": 0.6},
            # The EfficientDet PubLayNet weights use class ids starting at 1
            # (unlike the 0-based Detectron2 label map).
            label_map={
                1: "Text",
                2: "Title",
                3: "List",
                4: "Table",
                5: "Figure",
            },
        )
    return _LAYOUT_MODEL


def analyze_layout(image):
    """Detect layout regions (text, titles, lists, tables, figures) in a page.

    Args:
        image: The page image to analyze, passed unchanged to the model's
            ``detect`` method.

    Returns:
        The layout returned by the model's ``detect`` call; detections
        scoring below the 0.6 confidence threshold are dropped.
    """
    return _get_layout_model().detect(image)
def extract_text_from_blocks(image, layout):
    """Run OCR on every detected layout block of a page.

    Args:
        image: The page image the layout was detected on, either a PIL
            image or an array-like image that supports 2-D slicing.
        layout: Iterable of layout blocks exposing ``type``,
            ``coordinates`` and ``crop_image``.

    Returns:
        list[dict]: One dict per block with the keys ``"type"``,
        ``"text"`` (OCR output with surrounding whitespace stripped) and
        ``"coordinates"``.
    """
    # layoutparser's crop_image slices the image like an array, which PIL
    # images do not support, so PIL inputs are cropped with PIL itself.
    is_pil_image = isinstance(image, Image.Image)

    blocks = []
    for block in layout:
        if is_pil_image:
            x_1, y_1, x_2, y_2 = block.coordinates
            segment_image = image.crop(
                (int(x_1), int(y_1), int(x_2), int(y_2))
            )
        else:
            segment_image = block.crop_image(image)

        text = pytesseract.image_to_string(segment_image)
        blocks.append({
            "type": block.type,
            "text": text.strip(),
            "coordinates": block.coordinates,
        })
    return blocks
def rule_based_kv_extraction(blocks):
    """Pick out invoice fields from OCR'd blocks using keyword rules.

    Each block's text is matched case-insensitively against the keywords
    "invoice", "total" and "customer", in that priority order. The first
    keyword found decides which field the block's original text is stored
    under. When several blocks map to the same field, the last one wins.

    Args:
        blocks: Iterable of dicts that each carry a ``"text"`` string.

    Returns:
        dict: Maps the field names "Invoice Number", "Total Amount" and
        "Customer Name" to the matching block text; fields with no
        matching block are absent.
    """
    # Ordered by priority: only the first keyword present in a block counts.
    keyword_fields = (
        ("invoice", "Invoice Number"),
        ("total", "Total Amount"),
        ("customer", "Customer Name"),
    )

    extracted = {}
    for entry in blocks:
        raw_text = entry["text"]
        lowered = raw_text.lower()
        field = next(
            (name for keyword, name in keyword_fields if keyword in lowered),
            None,
        )
        if field is not None:
            extracted[field] = raw_text
    return extracted