Spaces:

resolverkatla
/

Midterm_Task_5

Sleeping

Midterm_Task_5 / processor.py

Update

bcf1f17 4 months ago

1.3 kB

	import functools
	import os

	import layoutparser as lp
	import numpy as np
	import pytesseract
	from pdf2image import convert_from_bytes, convert_from_path
	from PIL import Image

	def load_images(uploaded_file):
	    """Load an uploaded document as a list of PIL images.

	    Args:
	        uploaded_file: Either a filesystem path (``str``, ``bytes`` or
	            ``os.PathLike``) or a binary file-like upload object that
	            exposes the original file name as ``.name``.

	    Returns:
	        One image per page when the input is a PDF, otherwise a
	        single-element list holding the opened image.
	    """
	    if isinstance(uploaded_file, (str, bytes, os.PathLike)):
	        path = os.fsdecode(uploaded_file)
	        # Case-insensitive so "SCAN.PDF" is not handed to the image decoder.
	        if path.lower().endswith(".pdf"):
	            return convert_from_path(path)
	        return [Image.open(path)]
	    if uploaded_file.name.lower().endswith(".pdf"):
	        # convert_from_path needs a real filesystem path, while an upload
	        # object's ``name`` may be just the client-side file name; rasterise
	        # from the raw bytes instead.
	        if hasattr(uploaded_file, "seek"):
	            # Rewind in case the caller already consumed part of the stream.
	            uploaded_file.seek(0)
	        return convert_from_bytes(uploaded_file.read())
	    return [Image.open(uploaded_file)]

	@functools.lru_cache(maxsize=1)
	def _get_layout_model():
	    """Build the PubLayNet EfficientDet layout model once and reuse it."""
	    return lp.EfficientDetLayoutModel(
	        "lp://efficientdet/PubLayNet",
	        # EfficientDet models take a dict here; the Detectron2-style
	        # ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.6] list is not understood.
	        extra_config={"output_confidence_threshold": 0.6},
	        # NOTE(review): layoutparser's own PubLayNet label map for
	        # EfficientDet may use 1-based class ids -- confirm these keys
	        # against the labels the model actually emits.
	        label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
	    )


	def analyze_layout(image):
	    """Detect layout regions on a single page image.

	    The detection model is created on first use and cached, so calling this
	    once per page does not reload the weights every time.

	    Args:
	        image: The page image to analyse.

	    Returns:
	        Whatever ``model.detect`` returns for the image (the detected layout).
	    """
	    return _get_layout_model().detect(image)

	def extract_text_from_blocks(image, layout):
	    """Run OCR on every detected layout block of a page.

	    Args:
	        image: The page image, as a PIL image or a NumPy array.
	        layout: Iterable of layout blocks; each must provide
	            ``crop_image(image)``, ``type`` and ``coordinates``.

	    Returns:
	        A list with one dict per block, in layout order, holding the keys
	        ``"type"``, ``"text"`` (whitespace-stripped OCR output) and
	        ``"coordinates"``.
	    """
	    # layoutparser crops by array slicing, which PIL images do not support;
	    # np.asarray leaves an existing array untouched.
	    pixels = np.asarray(image)
	    return [
	        {
	            "type": block.type,
	            "text": pytesseract.image_to_string(block.crop_image(pixels)).strip(),
	            "coordinates": block.coordinates,
	        }
	        for block in layout
	    ]

	def rule_based_kv_extraction(blocks):
	    """Pick invoice fields out of OCR blocks using keyword rules.

	    Each block's text is matched case-insensitively against the keywords
	    "invoice", "total" and "customer", in that priority order; only the
	    first matching keyword applies to a block. The stored value is the
	    block's full original text, and a later matching block overwrites an
	    earlier one for the same field.

	    Args:
	        blocks: Iterable of dicts that each contain a ``"text"`` string.

	    Returns:
	        A dict with any of the keys ``"Invoice Number"``, ``"Total Amount"``
	        and ``"Customer Name"`` that were found.
	    """
	    # Ordered: earlier entries take precedence when a block matches several.
	    keyword_fields = (
	        ("invoice", "Invoice Number"),
	        ("total", "Total Amount"),
	        ("customer", "Customer Name"),
	    )
	    data = {}
	    for block in blocks:
	        original = block["text"]
	        lowered = original.lower()
	        for keyword, field in keyword_fields:
	            if keyword in lowered:
	                data[field] = original
	                break
	    return data