Spaces:
Build error
Build error
File size: 1,865 Bytes
005a185 b39b068 c062f7b d29af94 c062f7b 005a185 c062f7b d29af94 005a185 d29af94 005a185 d29af94 c062f7b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
import streamlit as st
import os
import tempfile
from huggingface_hub import snapshot_download
from pdf2image import convert_from_path
from PIL import Image
import fitz # PyMuPDF
# Step 1: Download model if not present.
# The existence of MODEL_DIR is the only "already downloaded" marker, so the
# download is skipped whenever the directory is there.
# NOTE(review): an interrupted download leaves the directory in place and is
# therefore not retried -- confirm whether that needs handling.
MODEL_DIR = "./pdf-extract-kit"
if not os.path.exists(MODEL_DIR):
    with st.spinner("Downloading model..."):
        snapshot_download(
            repo_id="opendatalab/pdf-extract-kit-1.0",
            local_dir=MODEL_DIR,
            max_workers=20,
        )
# Step 2: Import model logic dynamically.
# table_recognizer is expected to live in MODEL_DIR/inference, so that folder
# must be importable before the import below is attempted.
import sys

_inference_dir = os.path.join(MODEL_DIR, "inference")
# Streamlit re-executes this script in the same process on every interaction;
# guard the append so sys.path does not accumulate duplicate entries.
if _inference_dir not in sys.path:
    sys.path.append(_inference_dir)

try:
    from table_recognizer import TableRecognizer
except ImportError:
    # Without the recognizer nothing below can work: report and halt the script.
    st.error("❌ Unable to load TableRecognizer. Check model directory structure.")
    st.stop()
# Step 3: Instantiate the table recognizer from the model directory.
_table_model_dir = os.path.join(MODEL_DIR, "models", "table_recognition")
table_model = TableRecognizer(
    model_dir=_table_model_dir,
    device="cpu",  # Change to 'cuda' if using GPU
)
st.title("📄 PDF Table Extractor")
uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

if uploaded_file:
    # convert_from_path needs a path on disk, so the upload is written to a
    # temporary file first. delete=False keeps the file after the handle is
    # closed; it is removed explicitly once conversion has finished.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
        # getvalue() returns the whole upload regardless of the current
        # stream position, unlike read().
        tmp_pdf.write(uploaded_file.getvalue())
        tmp_pdf_path = tmp_pdf.name

    try:
        images = convert_from_path(tmp_pdf_path)
    finally:
        # Always clean up the temporary PDF, even if conversion fails.
        os.remove(tmp_pdf_path)

    for page_number, img in enumerate(images, start=1):
        st.subheader(f"Page {page_number}")
        st.image(img, caption="Original Page", use_column_width=True)

        # Step 4: Run Table Recognizer
        with st.spinner("Extracting tables..."):
            # Assumes the model accepts a PIL image and returns an iterable of
            # table results -- TODO confirm against TableRecognizer.
            table_results = table_model(img)

        if table_results:
            for table_number, table in enumerate(table_results, start=1):
                st.markdown(f"#### Table {table_number}")
                # Assumes table["data"] is a 2D list or pandas DataFrame.
                st.dataframe(table["data"])
        else:
            st.info("No tables detected on this page.")
|