sblumenf committed on
Commit
6544d14
·
verified ·
1 Parent(s): 2f9a0a2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -14
app.py CHANGED
@@ -6,9 +6,7 @@ import os
6
  import io
7
  from PIL import Image
8
  import pandas as pd
9
- import tabula
10
- import camelot
11
- from PyPDF2 import PdfReader
12
 
13
  def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
14
  """
@@ -55,17 +53,12 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
55
  except Exception as e:
56
  print(f"Error extracting image: {e}")
57
 
58
- # Enhanced table extraction (tabula-py preferred, fallback to camelot)
59
- try:
60
- tables = tabula.read_pdf(pdf_file, pages='all', multiple_tables=True)
61
- except Exception as e:
62
- print(f"tabula-py failed: {e}. Trying camelot...")
63
- try:
64
- camelot_tables = camelot.read_pdf(pdf_file)
65
- for table in camelot_tables:
66
- tables.append(table.df)
67
- except Exception as e:
68
- print(f"camelot also failed: {e}. No tables extracted.")
69
 
70
  # Format extracted data based on user selection
71
  if output_format == "JSON":
 
6
  import io
7
  from PIL import Image
8
  import pandas as pd
9
+ import pdfplumber
 
 
10
 
11
  def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
12
  """
 
53
  except Exception as e:
54
  print(f"Error extracting image: {e}")
55
 
56
+ # Enhanced table extraction using pdfplumber
57
+ with pdfplumber.open(pdf_file) as pdf:
58
+ for page_num, page in enumerate(pdf.pages):
59
+ for table in page.extract_tables():
60
+ df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
61
+ tables.append(df)
 
 
 
 
 
62
 
63
  # Format extracted data based on user selection
64
  if output_format == "JSON":