Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -36,15 +36,15 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
|
|
36 |
elif isinstance(element, (LTFigure, LTImage)):
|
37 |
try:
|
38 |
if hasattr(element, 'stream'):
|
39 |
-
image_data = element.stream.
|
40 |
image = Image.open(io.BytesIO(image_data))
|
41 |
image_filename = f"extracted_image_{len(images)}.png"
|
42 |
image.save(image_filename)
|
43 |
images.append({"filename": image_filename})
|
44 |
else:
|
45 |
for child in element:
|
46 |
-
if isinstance(child, LTImage):
|
47 |
-
image_data = child.stream.
|
48 |
image = Image.open(io.BytesIO(image_data))
|
49 |
image_filename = f"extracted_image_{len(images)}.png"
|
50 |
image.save(image_filename)
|
@@ -55,16 +55,19 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
|
|
55 |
with pdfplumber.open(pdf_file) as pdf:
|
56 |
for page_num, page in enumerate(pdf.pages):
|
57 |
for table in page.extract_tables():
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
col
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
|
|
|
|
|
|
68 |
|
69 |
with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix="." + output_format.lower()) as tmp:
|
70 |
if output_format == "JSON":
|
|
|
36 |
elif isinstance(element, (LTFigure, LTImage)):
|
37 |
try:
|
38 |
if hasattr(element, 'stream'):
|
39 |
+
image_data = element.stream.get_rawdata()
|
40 |
image = Image.open(io.BytesIO(image_data))
|
41 |
image_filename = f"extracted_image_{len(images)}.png"
|
42 |
image.save(image_filename)
|
43 |
images.append({"filename": image_filename})
|
44 |
else:
|
45 |
for child in element:
|
46 |
+
if isinstance(child, LTImage) and hasattr(child, 'stream'):
|
47 |
+
image_data = child.stream.get_rawdata()
|
48 |
image = Image.open(io.BytesIO(image_data))
|
49 |
image_filename = f"extracted_image_{len(images)}.png"
|
50 |
image.save(image_filename)
|
|
|
55 |
with pdfplumber.open(pdf_file) as pdf:
|
56 |
for page_num, page in enumerate(pdf.pages):
|
57 |
for table in page.extract_tables():
|
58 |
+
try:
|
59 |
+
if len(table) > 0 and len(set(table[0])) != len(table[0]):
|
60 |
+
unique_columns = []
|
61 |
+
for col in table[0]:
|
62 |
+
if col in unique_columns:
|
63 |
+
col = f"{col}_{unique_columns.count(col)}"
|
64 |
+
unique_columns.append(col)
|
65 |
+
df = pd.DataFrame(table[1:], columns=unique_columns)
|
66 |
+
else:
|
67 |
+
df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
|
68 |
+
tables.append(df)
|
69 |
+
except Exception as e:
|
70 |
+
print(f"Error processing table: {e}")
|
71 |
|
72 |
with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix="." + output_format.lower()) as tmp:
|
73 |
if output_format == "JSON":
|