Spaces:
Sleeping
Sleeping
File size: 1,286 Bytes
79fc11d 4a27fa8 79fc11d 4a27fa8 79fc11d 4a27fa8 79fc11d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
import layoutparser as lp
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
def convert_pdf_to_images(pdf_path):
    """Render the PDF stored at *pdf_path* into images using pdf2image.

    The result of ``convert_from_path`` is returned unchanged
    (presumably one image per page -- confirm against the pdf2image docs).
    """
    page_images = convert_from_path(pdf_path)
    return page_images
# ✅ Use EfficientDet instead of Detectron2 for better compatibility
# Detector shared by every analyze_layout() call. It is built lazily on the
# first call so that the model is not re-created for each image.
_LAYOUT_MODEL = None


def _get_layout_model():
    """Return the shared EfficientDet PubLayNet detector, building it on first use."""
    global _LAYOUT_MODEL
    if _LAYOUT_MODEL is None:
        _LAYOUT_MODEL = lp.EfficientDetLayoutModel(
            "lp://efficientdet/PubLayNet",
            # EfficientDet expects a dict of options here; the Detectron2-style
            # ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.6] list is not understood.
            # Detections scoring below 0.6 are dropped.
            extra_config={"output_confidence_threshold": 0.6},
            # label_map is deliberately not passed: for "lp://" configs the
            # library is documented to apply the dataset's own label map.
            # NOTE(review): the EfficientDet PubLayNet ids appear to be
            # 1-based, unlike the 0-based Detectron2 map -- confirm.
        )
    return _LAYOUT_MODEL


def analyze_layout(image):
    """Detect the layout blocks of a page image.

    Args:
        image: The page image handed to the model's ``detect`` method.

    Returns:
        The layout produced by ``detect`` for *image*.
    """
    return _get_layout_model().detect(image)
def _crop_block(image, block):
    """Return the region of *image* covered by *block*."""
    if isinstance(image, Image.Image):
        # PIL images (what pdf2image yields) cannot be sliced like arrays,
        # so crop by bounding box instead. int() truncation is used so the
        # box matches integer array slicing.
        # Assumes block.coordinates is (x_1, y_1, x_2, y_2) -- TODO confirm.
        left, top, right, bottom = (int(value) for value in block.coordinates)
        return image.crop((left, top, right, bottom))
    return block.crop_image(image)


def extract_text_from_blocks(image, layout):
    """Run OCR on every layout block and collect the results.

    Args:
        image: The page image the layout refers to; either a PIL image or
            an object that ``block.crop_image`` accepts.
        layout: Iterable of blocks exposing ``type``, ``coordinates`` and
            ``crop_image``.

    Returns:
        A list with one dict per block, in layout order, holding the block's
        ``"type"``, its whitespace-stripped OCR ``"text"`` and its
        ``"coordinates"``. An empty layout yields an empty list.
    """
    blocks = []
    for block in layout:
        segment_image = _crop_block(image, block)
        text = pytesseract.image_to_string(segment_image)
        blocks.append({
            "type": block.type,
            "text": text.strip(),
            "coordinates": block.coordinates,
        })
    return blocks
def extract_key_values(blocks):
    """Pick out invoice fields from OCR'd blocks by keyword.

    Each block's ``"text"`` is lower-cased and checked against the keywords
    "invoice", "total" and "customer", in that order. The first keyword
    found decides which field receives the block's original text. A later
    block matching the same field replaces the earlier value.

    Args:
        blocks: Iterable of dicts that each carry a ``"text"`` entry.

    Returns:
        A dict containing whichever of ``"Invoice Number"``,
        ``"Total Amount"`` and ``"Customer Name"`` were matched.
    """
    # Order matters: it reproduces the priority of an if/elif chain.
    keyword_to_field = (
        ("invoice", "Invoice Number"),
        ("total", "Total Amount"),
        ("customer", "Customer Name"),
    )
    extracted = {}
    for entry in blocks:
        raw_text = entry["text"]
        lowered = raw_text.lower()
        for keyword, field in keyword_to_field:
            if keyword in lowered:
                extracted[field] = raw_text
                break
    return extracted
|