Spaces:

ignaciaginting
/

extract_from_doc

Build error

App Files Community

extract_from_doc / app.py

ignaciaginting

Update app.py

c062f7b verified 2 months ago

raw

history blame

1.87 kB

	import streamlit as st
	import os
	import tempfile
	from huggingface_hub import snapshot_download
	from pdf2image import convert_from_path
	from PIL import Image
	import fitz # PyMuPDF

	# Step 1: Fetch the model snapshot the first time the app runs.
	# MODEL_DIR is the local directory the Hugging Face snapshot is written to;
	# later steps build their paths relative to it.
	MODEL_DIR = "./pdf-extract-kit"

	# Only the existence of the directory is checked, not its contents.
	# NOTE(review): an interrupted download would leave the directory in place
	# and be treated as complete -- confirm whether that can happen in practice.
	_model_dir_present = os.path.exists(MODEL_DIR)
	if not _model_dir_present:
	    with st.spinner("Downloading model..."):
	        snapshot_download(
	            repo_id="opendatalab/pdf-extract-kit-1.0",
	            local_dir=MODEL_DIR,
	            max_workers=20,
	        )

	# Step 2: Import model logic dynamically
	import sys

	# Directory inside the downloaded snapshot that is expected to provide the
	# table_recognizer module.
	# NOTE(review): the snapshot layout is not visible from this file -- confirm
	# that an "inference/" directory containing table_recognizer.py exists.
	_INFERENCE_DIR = os.path.join(MODEL_DIR, "inference")

	# Streamlit re-executes this script on every user interaction within the
	# same Python process, so register the directory only once; an unconditional
	# append would add a duplicate sys.path entry on each rerun.
	if _INFERENCE_DIR not in sys.path:
	    sys.path.append(_INFERENCE_DIR)

	try:
	    from table_recognizer import TableRecognizer
	except ImportError as import_error:
	    st.error("❌ Unable to load TableRecognizer. Check model directory structure.")
	    # Show the underlying failure as well: a missing transitive dependency
	    # raises ImportError too and would otherwise look like a layout problem.
	    st.exception(import_error)
	    st.stop()

	# Step 3: Set up recognizer
	@st.cache_resource
	def _load_table_recognizer():
	    """Build the TableRecognizer once and reuse it across script reruns.

	    Streamlit re-executes the whole script on every widget interaction;
	    without caching, the model would be constructed again each time.
	    The cached object is shared by all sessions.
	    NOTE(review): confirm TableRecognizer tolerates being shared between
	    concurrent sessions.
	    """
	    return TableRecognizer(
	        model_dir=os.path.join(MODEL_DIR, "models", "table_recognition"),
	        device="cpu",  # Change to 'cuda' if using GPU
	    )


	table_model = _load_table_recognizer()

	st.title("📄 PDF Table Extractor")

	uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
	if uploaded_file:
	    # convert_from_path needs a real file on disk, so the upload is written
	    # to a temporary directory that is removed as soon as conversion ends.
	    # (The previous NamedTemporaryFile(delete=False) was never deleted and
	    # leaked one PDF per script rerun.)
	    with tempfile.TemporaryDirectory() as tmp_dir:
	        tmp_pdf_path = os.path.join(tmp_dir, "upload.pdf")
	        with open(tmp_pdf_path, "wb") as tmp_pdf:
	            # getvalue() returns the whole upload regardless of the current
	            # read position, unlike read().
	            tmp_pdf.write(uploaded_file.getvalue())

	        # NOTE(review): assumes the returned page images are fully loaded in
	        # memory (no output_folder is passed), so they remain usable after
	        # the temporary directory is gone -- confirm against pdf2image.
	        images = convert_from_path(tmp_pdf_path)

	    for page_number, page_image in enumerate(images, start=1):
	        st.subheader(f"Page {page_number}")
	        st.image(page_image, caption="Original Page", use_column_width=True)

	        # Step 4: Run Table Recognizer
	        with st.spinner("Extracting tables..."):
	            # NOTE(review): assumes the model is callable with a PIL image
	            # and returns a sequence of tables -- confirm against the
	            # TableRecognizer implementation.
	            table_results = table_model(page_image)

	        if not table_results:
	            st.info("No tables detected on this page.")
	            continue

	        for table_number, table in enumerate(table_results, start=1):
	            st.markdown(f"#### Table {table_number}")
	            # NOTE(review): assumes table["data"] is a 2D list or a pandas
	            # DataFrame -- confirm against the recognizer's output format.
	            st.dataframe(table["data"])