# Spaces: Build error  -- hosting-page header text, not program code
import os
import sys
import tempfile

import fitz  # PyMuPDF
import streamlit as st
from huggingface_hub import snapshot_download
from pdf2image import convert_from_path
from PIL import Image
# Streamlit app: upload a PDF, render each page to an image, run a table
# recognizer over every page and display the tables it reports.

# Step 1: Download model if not present
MODEL_DIR = "./pdf-extract-kit"
if not os.path.exists(MODEL_DIR):
    with st.spinner("Downloading model..."):
        snapshot_download(
            repo_id="opendatalab/pdf-extract-kit-1.0",
            local_dir=MODEL_DIR,
            max_workers=20,
        )

# Step 2: Import model logic dynamically
# Streamlit re-executes this script in the same process on every user
# interaction, so only extend sys.path when the entry is not already there;
# an unconditional append would add a duplicate entry on each rerun.
_INFERENCE_DIR = os.path.join(MODEL_DIR, "inference")
if _INFERENCE_DIR not in sys.path:
    sys.path.append(_INFERENCE_DIR)

try:
    from table_recognizer import TableRecognizer
except ImportError:
    # NOTE(review): the leading "β" looks like a mis-encoded emoji -- confirm
    # the intended character before changing this user-facing string.
    st.error("β Unable to load TableRecognizer. Check model directory structure.")
    st.stop()


# Step 3: Set up recognizer
@st.cache_resource
def _load_table_model():
    """Build the table recognizer once and reuse it across script reruns."""
    return TableRecognizer(
        model_dir=os.path.join(MODEL_DIR, "models", "table_recognition"),
        device="cpu",  # Change to 'cuda' if using GPU
    )


table_model = _load_table_model()

# NOTE(review): the leading "π" looks like a mis-encoded emoji -- confirm.
st.title("π PDF Table Extractor")
uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

if uploaded_file is not None:
    # convert_from_path needs a filesystem path, so spill the upload to a
    # temporary file. getvalue() returns the whole buffer regardless of the
    # current stream position, unlike read().
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
        tmp_pdf.write(uploaded_file.getvalue())
        tmp_pdf_path = tmp_pdf.name

    try:
        images = convert_from_path(tmp_pdf_path)
    finally:
        # The file was created with delete=False, so it must be removed
        # explicitly; otherwise every upload leaves a PDF behind on disk.
        os.remove(tmp_pdf_path)

    for page_number, img in enumerate(images, start=1):
        st.subheader(f"Page {page_number}")
        st.image(img, caption="Original Page", use_column_width=True)

        # Step 4: Run Table Recognizer
        with st.spinner("Extracting tables..."):
            # Assumes the model accepts a PIL image and returns an iterable
            # of table results -- TODO confirm against TableRecognizer.
            table_results = table_model(img)

        if table_results:
            for table_number, table in enumerate(table_results, start=1):
                st.markdown(f"#### Table {table_number}")
                # Assumes table["data"] is a 2D list or pandas DataFrame.
                st.dataframe(table["data"])
        else:
            st.info("No tables detected on this page.")