import layoutparser as lp
import pytesseract
import json
from pdf2image import convert_from_path
from PIL import Image

def convert_pdf_to_images(pdf_path):
    """Render the PDF at *pdf_path* into page images.

    Delegates to ``pdf2image.convert_from_path`` with its default
    settings and returns whatever that call produces.
    """
    pages = convert_from_path(pdf_path)
    return pages

# Lazily created detector shared by every analyze_layout() call.
_LAYOUT_MODEL = None

def _get_layout_model():
    """Return the shared PubLayNet detector, building it on first use."""
    global _LAYOUT_MODEL
    if _LAYOUT_MODEL is None:
        _LAYOUT_MODEL = lp.Detectron2LayoutModel(
            config_path='lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
            # Only keep detections scored at 0.8 or higher.
            extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
            label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
        )
    return _LAYOUT_MODEL

def analyze_layout(image):
    """Detect layout regions in a page image.

    The Detectron2 model is built once and reused on later calls rather
    than being rebuilt for every page, since construction is far more
    expensive than a single detection.

    Args:
        image: The page image handed to the model's ``detect`` method.

    Returns:
        The layout object produced by ``detect``; its blocks carry one of
        the labels "Text", "Title", "List", "Table" or "Figure".
    """
    return _get_layout_model().detect(image)

def extract_text_from_blocks(image, layout):
    """OCR every block of *layout* and collect the recognised text.

    Args:
        image: The page the layout was detected on. Either a PIL image
            (as produced by pdf2image) or a numpy array is accepted.
        layout: Iterable of layout blocks exposing ``crop_image``,
            ``type`` and ``coordinates``.

    Returns:
        A list with one dict per block, in layout order, holding the
        block's ``"type"``, its OCR ``"text"`` with surrounding whitespace
        stripped, and its ``"coordinates"`` exactly as the block reports
        them.
    """
    # Local import keeps this function self-contained.
    import numpy as np

    # crop_image slices the page with array indexing, which PIL images do
    # not support; asarray converts them and leaves ndarrays untouched.
    page = np.asarray(image)

    blocks = []
    for block in layout:
        segment_image = block.crop_image(page)
        text = pytesseract.image_to_string(segment_image)
        blocks.append({
            "type": block.type,
            "text": text.strip(),
            # NOTE(review): coordinates may not be plain Python floats —
            # confirm before JSON-serialising the result.
            "coordinates": block.coordinates,
        })
    return blocks

def extract_key_values(blocks):
    """Pick invoice fields out of OCR'd blocks by keyword matching.

    Each block's ``"text"`` is compared case-insensitively against the
    keywords "invoice", "total" and "customer", in that priority order.
    The first keyword found decides which field receives the block's full
    text. A later block matching the same field overwrites an earlier one.

    Args:
        blocks: Iterable of dicts that each contain a ``"text"`` entry.

    Returns:
        A dict with any of the keys ``"invoice_number"``,
        ``"total_amount"`` and ``"customer_name"`` that were matched.
    """
    # Order matters: earlier rules win when a text contains several keywords.
    rules = (
        ("invoice", "invoice_number"),
        ("total", "total_amount"),
        ("customer", "customer_name"),
    )
    fields = {}
    for entry in blocks:
        content = entry["text"]
        lowered = content.lower()
        target = next(
            (field for keyword, field in rules if keyword in lowered),
            None,
        )
        if target is not None:
            fields[target] = content
    return fields