sblumenf committed on
Commit
0a3a380
·
verified ·
1 Parent(s): 6544d14

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -2
app.py CHANGED
@@ -57,14 +57,24 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
57
  with pdfplumber.open(pdf_file) as pdf:
58
  for page_num, page in enumerate(pdf.pages):
59
  for table in page.extract_tables():
60
- df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
 
 
 
 
 
 
 
 
 
 
61
  tables.append(df)
62
 
63
  # Format extracted data based on user selection
64
  if output_format == "JSON":
65
  json_data = {
66
  "text": text,
67
- "tables": [table.to_dict() for table in tables],
68
  "images": images
69
  }
70
  download_data = json.dumps(json_data, indent=4) # Add indentation for readability
 
57
  with pdfplumber.open(pdf_file) as pdf:
58
  for page_num, page in enumerate(pdf.pages):
59
  for table in page.extract_tables():
60
+ # Handle potential duplicate columns
61
+ if len(table) > 0 and len(set(table[0])) != len(table[0]):
62
+ # If duplicate columns exist, try to create unique column names
63
+ unique_columns = []
64
+ for col in table[0]:
65
+ if col in unique_columns:
66
+ col = f"{col}_{unique_columns.count(col)}" # Append a counter
67
+ unique_columns.append(col)
68
+ df = pd.DataFrame(table[1:], columns=unique_columns)
69
+ else:
70
+ df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
71
  tables.append(df)
72
 
73
  # Format extracted data based on user selection
74
  if output_format == "JSON":
75
  json_data = {
76
  "text": text,
77
+ "tables": [table.to_dict(orient='records') for table in tables], # Use 'records' for better handling of duplicate columns
78
  "images": images
79
  }
80
  download_data = json.dumps(json_data, indent=4) # Add indentation for readability