Spaces:

resolverkatla
/

Midterm_Task_5

Sleeping

Midterm_Task_5 / processor.py

Update

79fc11d 4 months ago

1.31 kB

	import layoutparser as lp
	import pytesseract
	from pdf2image import convert_from_path
	from PIL import Image
	import json

	# Layout detector shared by the whole module; built once at import time.
	# The config URI selects the PubLayNet Faster R-CNN (R50-FPN, 3x) weights.
	_PUBLAYNET_CONFIG = "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config"
	# Class-id -> readable label mapping handed to the model.
	_PUBLAYNET_LABELS = {0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}
	# Flat key/value override list for the underlying Detectron2 config.
	_DETECTRON_OVERRIDES = ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8]

	model = lp.Detectron2LayoutModel(
	    config_path=_PUBLAYNET_CONFIG,
	    extra_config=_DETECTRON_OVERRIDES,
	    label_map=_PUBLAYNET_LABELS,
	)

	def load_images(uploaded_file):
	    """Load an uploaded document as a list of PIL images.

	    Args:
	        uploaded_file: File-like upload object exposing a ``name``
	            attribute. NOTE(review): presumably a Gradio or Streamlit
	            upload handle -- confirm against the caller.

	    Returns:
	        A list of ``PIL.Image.Image`` objects: one per page for a PDF,
	        or a single-element list for any other file.
	    """
	    import os

	    filename = uploaded_file.name
	    # Case-insensitive match so "SCAN.PDF" is not sent to Image.open.
	    if filename.lower().endswith(".pdf"):
	        # convert_from_path needs a filesystem path, not the file object.
	        if os.path.exists(filename):
	            return convert_from_path(filename)
	        # ``name`` is only a display name (in-memory upload), so rasterise
	        # the PDF from its raw bytes instead.
	        from pdf2image import convert_from_bytes

	        return convert_from_bytes(uploaded_file.read())
	    return [Image.open(uploaded_file)]

	def analyze_layout(image):
	    """Run the module-level layout model on ``image``.

	    Returns whatever ``model.detect`` produces for the image, unchanged.
	    """
	    return model.detect(image)

	def extract_text_from_blocks(image, layout):
	    """OCR every detected layout block of a page image.

	    Args:
	        image: The page image, either a PIL image or an array.
	        layout: Iterable of detected blocks, each exposing ``crop_image``,
	            ``type`` and ``coordinates``.

	    Returns:
	        A list with one dict per block, holding ``"type"`` (the block's
	        label), ``"text"`` (OCR output with surrounding whitespace
	        stripped) and ``"coords"`` (the block's coordinates).
	    """
	    import numpy as np

	    # layoutparser crops by slicing the image as an array, and a PIL image
	    # is not subscriptable. Convert once here; an array input passes
	    # through unchanged.
	    pixels = np.asarray(image)

	    blocks = []
	    for block in layout:
	        cropped = block.crop_image(pixels)
	        text = pytesseract.image_to_string(cropped)
	        blocks.append({
	            "type": block.type,
	            "text": text.strip(),
	            "coords": block.coordinates,
	        })
	    return blocks

	def rule_based_kv_extraction(blocks):
	    """Pick out invoice fields from OCR blocks using keyword rules.

	    Each block's text is lower-cased and tested against the rules in
	    order; the first rule whose keywords all occur in the text claims the
	    block. The stored value is the block's full original text. When
	    several blocks match the same field, the last one wins.

	    Args:
	        blocks: Iterable of dicts that each contain a ``"text"`` string.

	    Returns:
	        A dict with any of the keys ``"Invoice Number"``,
	        ``"Total Amount"`` and ``"Customer Name"`` that were found.
	    """
	    # Ordered (field, required keywords) pairs; order decides precedence.
	    rules = (
	        ("Invoice Number", ("invoice", "number")),
	        ("Total Amount", ("total",)),
	        ("Customer Name", ("customer",)),
	    )

	    extracted = {}
	    for entry in blocks:
	        lowered = entry["text"].lower()
	        for field, keywords in rules:
	            if all(word in lowered for word in keywords):
	                extracted[field] = entry["text"]
	                break  # first matching rule claims the block
	    return extracted