Spaces:

ignaciaginting
/

extract_from_doc

Build error

App Files Community

ignaciaginting committed on May 6

Commit

c062f7b

verified ·

1 Parent(s): fc93aac

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -19

app.py CHANGED Viewed

@@ -1,22 +1,36 @@
 import streamlit as st
-import fitz  # PyMuPDF
-from huggingface_hub import snapshot_download
 import os
 from pdf2image import convert_from_path
 from PIL import Image
-import tempfile
-# Download the model if not already downloaded
-model_dir = "./pdf-extract-kit"
-if not os.path.exists(model_dir):
-    snapshot_download(repo_id="opendatalab/pdf-extract-kit-1.0", local_dir=model_dir, max_workers=20)
-st.title("PDF Table Extractor with PDF-Extract-Kit-1.0")
-uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
 if uploaded_file:
-    st.write("Converting PDF to images...")
     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
         tmp_pdf.write(uploaded_file.read())
         tmp_pdf_path = tmp_pdf.name
@@ -24,9 +38,16 @@ if uploaded_file:
     images = convert_from_path(tmp_pdf_path)
     for i, img in enumerate(images):
-        st.image(img, caption=f"Page {i+1}", use_column_width=True)
-        # Here you would call the table detection model on each image
-        st.info("🛠 Table detection model would run here... (to be implemented)")
-    st.success("Done processing PDF!")

 import streamlit as st
 import os
+import tempfile
+from huggingface_hub import snapshot_download
 from pdf2image import convert_from_path
 from PIL import Image
+import fitz  # PyMuPDF
+# Step 1: Download the opendatalab/pdf-extract-kit-1.0 snapshot into MODEL_DIR unless that path already exists (only existence is checked, so a partially downloaded directory is not retried).
+MODEL_DIR = "./pdf-extract-kit"
+if not os.path.exists(MODEL_DIR):
+    with st.spinner("Downloading model..."):
+        snapshot_download(repo_id="opendatalab/pdf-extract-kit-1.0", local_dir=MODEL_DIR, max_workers=20)
+# Step 2: Put MODEL_DIR/inference on the import path, then import TableRecognizer from it. NOTE(review): assumes the snapshot ships inference/table_recognizer.py -- confirm.
+import sys
+sys.path.append(MODEL_DIR + "/inference")  # appended on every script run; lets the import below search this directory
+try:
+    from table_recognizer import TableRecognizer
+except ImportError:  # report the failure in the UI and halt this script run instead of raising
+    st.error("❌ Unable to load TableRecognizer. Check model directory structure.")
+    st.stop()
+# Step 3: Build the recognizer from MODEL_DIR/models/table_recognition on CPU. NOTE(review): the TableRecognizer constructor signature is not visible here -- confirm it accepts model_dir/device.
+table_model = TableRecognizer(
+    model_dir=os.path.join(MODEL_DIR, "models", "table_recognition"),
+    device="cpu"  # Change to 'cuda' if using GPU
+)
+st.title("📄 PDF Table Extractor")
+uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
 if uploaded_file:
     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:  # delete=False keeps the file on disk after the with-block so it can be reopened by path; no cleanup of it is visible in this section
         tmp_pdf.write(uploaded_file.read())
         tmp_pdf_path = tmp_pdf.name
     images = convert_from_path(tmp_pdf_path)  # pdf2image call; each returned image is displayed below as one page
     for i, img in enumerate(images):
+        st.subheader(f"Page {i + 1}")
+        st.image(img, caption="Original Page", use_column_width=True)
+        # Step 4: Run the table recognizer on this page image and render each returned table
+        with st.spinner("Extracting tables..."):
+            table_results = table_model(img)  # This assumes model takes a PIL image and returns result
+        if table_results:  # a falsy result falls through to the "No tables detected" message
+            for idx, table in enumerate(table_results):
+                st.markdown(f"#### Table {idx + 1}")
+                st.dataframe(table["data"])  # Assuming table["data"] is a 2D list or pandas DataFrame
+        else:
+            st.info("No tables detected on this page.")