Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -57,14 +57,24 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
|
|
57 |
with pdfplumber.open(pdf_file) as pdf:
|
58 |
for page_num, page in enumerate(pdf.pages):
|
59 |
for table in page.extract_tables():
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
tables.append(df)
|
62 |
|
63 |
# Format extracted data based on user selection
|
64 |
if output_format == "JSON":
|
65 |
json_data = {
|
66 |
"text": text,
|
67 |
-
"tables": [table.to_dict() for table in tables],
|
68 |
"images": images
|
69 |
}
|
70 |
download_data = json.dumps(json_data, indent=4) # Add indentation for readability
|
|
|
57 |
with pdfplumber.open(pdf_file) as pdf:
|
58 |
for page_num, page in enumerate(pdf.pages):
|
59 |
for table in page.extract_tables():
|
60 |
+
# Handle potential duplicate columns
|
61 |
+
if len(table) > 0 and len(set(table[0])) != len(table[0]):
|
62 |
+
# If duplicate columns exist, try to create unique column names
|
63 |
+
unique_columns = []
|
64 |
+
for col in table[0]:
|
65 |
+
if col in unique_columns:
|
66 |
+
col = f"{col}_{unique_columns.count(col)}" # Append a counter
|
67 |
+
unique_columns.append(col)
|
68 |
+
df = pd.DataFrame(table[1:], columns=unique_columns)
|
69 |
+
else:
|
70 |
+
df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
|
71 |
tables.append(df)
|
72 |
|
73 |
# Format extracted data based on user selection
|
74 |
if output_format == "JSON":
|
75 |
json_data = {
|
76 |
"text": text,
|
77 |
+
"tables": [table.to_dict(orient='records') for table in tables], # Use 'records' for better handling of duplicate columns
|
78 |
"images": images
|
79 |
}
|
80 |
download_data = json.dumps(json_data, indent=4) # Add indentation for readability
|