Spaces:

sblumenf
/

pdf-convert

Sleeping

sblumenf committed on Dec 12, 2024

Commit

0a3a380

verified ·

1 Parent(s): 6544d14

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -57,14 +57,24 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
             with pdfplumber.open(pdf_file) as pdf:
                 for page_num, page in enumerate(pdf.pages):
                     for table in page.extract_tables():
-                        df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
                         tables.append(df)
             # Format extracted data based on user selection
             if output_format == "JSON":
                 json_data = {
                     "text": text,
-                    "tables": [table.to_dict() for table in tables],
                     "images": images
                 }
                 download_data = json.dumps(json_data, indent=4)  # Add indentation for readability

             with pdfplumber.open(pdf_file) as pdf:
                 for page_num, page in enumerate(pdf.pages):
                     for table in page.extract_tables():
+                        # Handle potential duplicate columns
+                        if len(table) > 0 and len(set(table[0])) != len(table[0]):
+                            # If duplicate columns exist, try to create unique column names
+                            unique_columns = []
+                            for col in table[0]:
+                                if col in unique_columns:
+                                    col = f"{col}_{unique_columns.count(col)}"  # Append a counter
+                                unique_columns.append(col)
+                            df = pd.DataFrame(table[1:], columns=unique_columns)
+                        else:
+                            df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
                         tables.append(df)
             # Format extracted data based on user selection
             if output_format == "JSON":
                 json_data = {
                     "text": text,
+                    "tables": [table.to_dict(orient='records') for table in tables],  # Use 'records' for better handling of duplicate columns
                     "images": images
                 }
                 download_data = json.dumps(json_data, indent=4)  # Add indentation for readability