Spaces:

resolverkatla
/

Midterm_Task_5

Sleeping

Midterm_Task_5 / layout_extractor.py

Update

79fc11d 4 months ago

1.25 kB

	import layoutparser as lp
	import pytesseract
	import json
	from pdf2image import convert_from_path
	from PIL import Image

	def convert_pdf_to_images(pdf_path):
	    """Render the PDF at ``pdf_path`` to images.

	    Thin wrapper around ``pdf2image.convert_from_path`` called with its
	    default settings.

	    Args:
	        pdf_path: Path of the PDF file to render.

	    Returns:
	        Whatever ``convert_from_path`` returns for ``pdf_path`` (per the
	        pdf2image documentation, a list of PIL images, one per page).
	    """
	    pages = convert_from_path(pdf_path)
	    return pages

	# Detection models keyed by score threshold, so the (expensive) model
	# construction happens once per process instead of once per page.
	_LAYOUT_MODELS = {}


	def _get_layout_model(score_threshold):
	    """Return the PubLayNet detector for ``score_threshold``, building it on first use."""
	    model = _LAYOUT_MODELS.get(score_threshold)
	    if model is None:
	        model = lp.Detectron2LayoutModel(
	            config_path='lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
	            extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", score_threshold],
	            label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
	        )
	        _LAYOUT_MODELS[score_threshold] = model
	    return model


	def analyze_layout(image, score_threshold=0.8):
	    """Detect layout regions on a page image with a PubLayNet Faster R-CNN model.

	    The detector is created lazily and reused across calls; previously a new
	    ``Detectron2LayoutModel`` was constructed for every image.

	    Args:
	        image: Page image handed unchanged to the model's ``detect`` method.
	        score_threshold: Value passed as ``MODEL.ROI_HEADS.SCORE_THRESH_TEST``.
	            Defaults to 0.8, the value that used to be hard-coded.

	    Returns:
	        The layout object returned by ``detect``; detected classes are
	        labelled "Text", "Title", "List", "Table" or "Figure".
	    """
	    return _get_layout_model(score_threshold).detect(image)

	def extract_text_from_blocks(image, layout):
	    """Run OCR on every detected layout block and return one record per block.

	    Args:
	        image: The page image the layout was detected on. Objects that do not
	            support subscripting (such as PIL images) are converted to a
	            NumPy array first; array inputs are used as they are.
	        layout: Iterable of layout blocks exposing ``crop_image(image)``,
	            ``type`` and ``coordinates``.

	    Returns:
	        A list of dicts, in layout order, with the keys ``"type"``,
	        ``"text"`` (OCR output with surrounding whitespace stripped) and
	        ``"coordinates"``.
	    """
	    # layoutparser crops by slicing the image like an array; a PIL image is
	    # not subscriptable, so cropping it directly would raise a TypeError.
	    if not hasattr(image, "__getitem__"):
	        import numpy as np  # local import: the module does not import numpy itself

	        image = np.asarray(image)

	    return [
	        {
	            "type": block.type,
	            "text": pytesseract.image_to_string(block.crop_image(image)).strip(),
	            "coordinates": block.coordinates,
	        }
	        for block in layout
	    ]

	def extract_key_values(blocks):
	    """Pick invoice fields out of OCR'd blocks by case-insensitive keyword search.

	    Each block's text is checked for the substrings "invoice", "total" and
	    "customer", in that order; the first one found decides which field the
	    block's full text is stored under. Matching is by plain substring, so
	    "subtotal" also counts as "total". When several blocks map to the same
	    field, the last one wins.

	    Args:
	        blocks: Iterable of dicts that each have a ``"text"`` string.

	    Returns:
	        A dict containing any of ``"invoice_number"``, ``"total_amount"`` and
	        ``"customer_name"``; fields with no matching block are absent.
	    """
	    # Order matters: a block is assigned to the first keyword it contains.
	    keyword_fields = (
	        ("invoice", "invoice_number"),
	        ("total", "total_amount"),
	        ("customer", "customer_name"),
	    )

	    data = {}
	    for entry in blocks:
	        raw_text = entry["text"]
	        lowered = raw_text.lower()
	        field = next(
	            (name for keyword, name in keyword_fields if keyword in lowered),
	            None,
	        )
	        if field is not None:
	            data[field] = raw_text
	    return data