ignaciaginting committed on
Commit
c062f7b
·
verified ·
1 Parent(s): fc93aac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -19
app.py CHANGED
@@ -1,22 +1,36 @@
1
  import streamlit as st
2
- import fitz # PyMuPDF
3
- from huggingface_hub import snapshot_download
4
  import os
 
 
5
  from pdf2image import convert_from_path
6
  from PIL import Image
7
- import tempfile
8
-
9
- # Download the model if not already downloaded
10
- model_dir = "./pdf-extract-kit"
11
- if not os.path.exists(model_dir):
12
- snapshot_download(repo_id="opendatalab/pdf-extract-kit-1.0", local_dir=model_dir, max_workers=20)
13
-
14
- st.title("PDF Table Extractor with PDF-Extract-Kit-1.0")
15
-
16
- uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  if uploaded_file:
19
- st.write("Converting PDF to images...")
20
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
21
  tmp_pdf.write(uploaded_file.read())
22
  tmp_pdf_path = tmp_pdf.name
@@ -24,9 +38,16 @@ if uploaded_file:
24
  images = convert_from_path(tmp_pdf_path)
25
 
26
  for i, img in enumerate(images):
27
- st.image(img, caption=f"Page {i+1}", use_column_width=True)
28
-
29
- # Here you would call the table detection model on each image
30
- st.info("🛠 Table detection model would run here... (to be implemented)")
31
-
32
- st.success("Done processing PDF!")
 
 
 
 
 
 
 
 
1
  import streamlit as st
 
 
2
  import os
3
+ import tempfile
4
+ from huggingface_hub import snapshot_download
5
  from pdf2image import convert_from_path
6
  from PIL import Image
7
+ import fitz # PyMuPDF
 
 
 
 
 
 
 
 
 
8
 
9
+ # Step 1: Download model if not present
10
+ MODEL_DIR = "./pdf-extract-kit"
11
+ if not os.path.exists(MODEL_DIR):
12
+ with st.spinner("Downloading model..."):
13
+ snapshot_download(repo_id="opendatalab/pdf-extract-kit-1.0", local_dir=MODEL_DIR, max_workers=20)
14
+
15
+ # Step 2: Import model logic dynamically
16
+ import sys
17
+ sys.path.append(MODEL_DIR + "/inference")
18
+ try:
19
+ from table_recognizer import TableRecognizer
20
+ except ImportError:
21
+ st.error("❌ Unable to load TableRecognizer. Check model directory structure.")
22
+ st.stop()
23
+
24
+ # Step 3: Set up recognizer
25
+ table_model = TableRecognizer(
26
+ model_dir=os.path.join(MODEL_DIR, "models", "table_recognition"),
27
+ device="cpu" # Change to 'cuda' if using GPU
28
+ )
29
+
30
+ st.title("📄 PDF Table Extractor")
31
+
32
+ uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
33
  if uploaded_file:
 
34
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
35
  tmp_pdf.write(uploaded_file.read())
36
  tmp_pdf_path = tmp_pdf.name
 
38
  images = convert_from_path(tmp_pdf_path)
39
 
40
  for i, img in enumerate(images):
41
+ st.subheader(f"Page {i + 1}")
42
+ st.image(img, caption="Original Page", use_column_width=True)
43
+
44
+ # Step 4: Run Table Recognizer
45
+ with st.spinner("Extracting tables..."):
46
+ table_results = table_model(img) # This assumes model takes a PIL image and returns result
47
+
48
+ if table_results:
49
+ for idx, table in enumerate(table_results):
50
+ st.markdown(f"#### Table {idx + 1}")
51
+ st.dataframe(table["data"]) # Assuming table["data"] is a 2D list or pandas DataFrame
52
+ else:
53
+ st.info("No tables detected on this page.")