Spaces:
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -6,9 +6,7 @@ import os
|
|
6 |
import io
|
7 |
from PIL import Image
|
8 |
import pandas as pd
|
9 |
-
import
|
10 |
-
import camelot
|
11 |
-
from PyPDF2 import PdfReader
|
12 |
|
13 |
def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
|
14 |
"""
|
@@ -55,17 +53,12 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
|
|
55 |
except Exception as e:
|
56 |
print(f"Error extracting image: {e}")
|
57 |
|
58 |
-
# Enhanced table extraction
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
camelot_tables = camelot.read_pdf(pdf_file)
|
65 |
-
for table in camelot_tables:
|
66 |
-
tables.append(table.df)
|
67 |
-
except Exception as e:
|
68 |
-
print(f"camelot also failed: {e}. No tables extracted.")
|
69 |
|
70 |
# Format extracted data based on user selection
|
71 |
if output_format == "JSON":
|
|
|
6 |
import io
|
7 |
from PIL import Image
|
8 |
import pandas as pd
|
9 |
+
import pdfplumber
|
|
|
|
|
10 |
|
11 |
def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
|
12 |
"""
|
|
|
53 |
except Exception as e:
|
54 |
print(f"Error extracting image: {e}")
|
55 |
|
56 |
+
# Enhanced table extraction using pdfplumber
|
57 |
+
with pdfplumber.open(pdf_file) as pdf:
|
58 |
+
for page_num, page in enumerate(pdf.pages):
|
59 |
+
for table in page.extract_tables():
|
60 |
+
df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
|
61 |
+
tables.append(df)
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
# Format extracted data based on user selection
|
64 |
if output_format == "JSON":
|