# File size: 1,865 Bytes

import streamlit as st
import os
import tempfile
from huggingface_hub import snapshot_download
from pdf2image import convert_from_path
from PIL import Image
import fitz  # PyMuPDF

# Step 1: Fetch the model snapshot the first time the app runs.
# The download is skipped whenever MODEL_DIR already exists on disk; the
# contents of an existing directory are not inspected.
MODEL_DIR = "./pdf-extract-kit"
if not os.path.exists(MODEL_DIR):
    with st.spinner("Downloading model..."):
        snapshot_download(
            repo_id="opendatalab/pdf-extract-kit-1.0",
            local_dir=MODEL_DIR,
            max_workers=20,
        )

# Step 2: Import model logic dynamically
# The recognizer code is expected to live in MODEL_DIR/inference, so that
# directory is put on the import path before importing from it.
import sys

_INFERENCE_DIR = os.path.join(MODEL_DIR, "inference")
# Streamlit re-executes this script in the same process on every interaction;
# guard the append so sys.path does not grow by one entry per rerun.
if _INFERENCE_DIR not in sys.path:
    sys.path.append(_INFERENCE_DIR)

try:
    from table_recognizer import TableRecognizer
except ImportError as exc:
    st.error("❌ Unable to load TableRecognizer. Check model directory structure.")
    # Show the underlying error too: an ImportError can also come from a
    # missing dependency of the module, not only from a wrong directory layout.
    st.exception(exc)
    st.stop()

# Step 3: Set up recognizer
@st.cache_resource
def _load_table_model():
    """Build the table recognizer once and reuse it on later script reruns.

    Streamlit re-executes the whole script on every widget interaction;
    without caching, the recognizer would be constructed again each time.

    NOTE(review): st.cache_resource shares the returned object across
    sessions -- confirm TableRecognizer is safe to share that way.

    Returns:
        The TableRecognizer instance configured for CPU inference.
    """
    return TableRecognizer(
        model_dir=os.path.join(MODEL_DIR, "models", "table_recognition"),
        device="cpu",  # Change to 'cuda' if using GPU
    )


table_model = _load_table_model()

st.title("📄 PDF Table Extractor")

uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
if uploaded_file is not None:
    # convert_from_path takes a filesystem path, so the upload is spooled to
    # a temporary file first. getvalue() returns the whole buffer regardless
    # of the current read position.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
        tmp_pdf.write(uploaded_file.getvalue())
        tmp_pdf_path = tmp_pdf.name

    try:
        images = convert_from_path(tmp_pdf_path)
    finally:
        # The file was created with delete=False, so it must be removed
        # explicitly; otherwise every upload/rerun leaves a PDF behind.
        os.remove(tmp_pdf_path)

    # Render each page followed by whatever tables the recognizer reports.
    for i, img in enumerate(images):
        st.subheader(f"Page {i + 1}")
        st.image(img, caption="Original Page", use_column_width=True)

        # Step 4: Run Table Recognizer
        with st.spinner("Extracting tables..."):
            # Assumes the model accepts a PIL image and returns an iterable
            # of table results -- TODO confirm against TableRecognizer.
            table_results = table_model(img)

        if table_results:
            for idx, table in enumerate(table_results):
                st.markdown(f"#### Table {idx + 1}")
                # Assumes table["data"] is a 2D list or pandas DataFrame.
                st.dataframe(table["data"])
        else:
            st.info("No tables detected on this page.")