Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -36,15 +36,15 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
|
|
| 36 |
elif isinstance(element, (LTFigure, LTImage)):
|
| 37 |
try:
|
| 38 |
if hasattr(element, 'stream'):
|
| 39 |
-
image_data = element.stream.
|
| 40 |
image = Image.open(io.BytesIO(image_data))
|
| 41 |
image_filename = f"extracted_image_{len(images)}.png"
|
| 42 |
image.save(image_filename)
|
| 43 |
images.append({"filename": image_filename})
|
| 44 |
else:
|
| 45 |
for child in element:
|
| 46 |
-
if isinstance(child, LTImage):
|
| 47 |
-
image_data = child.stream.
|
| 48 |
image = Image.open(io.BytesIO(image_data))
|
| 49 |
image_filename = f"extracted_image_{len(images)}.png"
|
| 50 |
image.save(image_filename)
|
|
@@ -55,16 +55,19 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
|
|
| 55 |
with pdfplumber.open(pdf_file) as pdf:
|
| 56 |
for page_num, page in enumerate(pdf.pages):
|
| 57 |
for table in page.extract_tables():
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
col
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix="." + output_format.lower()) as tmp:
|
| 70 |
if output_format == "JSON":
|
|
|
|
| 36 |
elif isinstance(element, (LTFigure, LTImage)):
|
| 37 |
try:
|
| 38 |
if hasattr(element, 'stream'):
|
| 39 |
+
image_data = element.stream.get_rawdata()
|
| 40 |
image = Image.open(io.BytesIO(image_data))
|
| 41 |
image_filename = f"extracted_image_{len(images)}.png"
|
| 42 |
image.save(image_filename)
|
| 43 |
images.append({"filename": image_filename})
|
| 44 |
else:
|
| 45 |
for child in element:
|
| 46 |
+
if isinstance(child, LTImage) and hasattr(child, 'stream'):
|
| 47 |
+
image_data = child.stream.get_rawdata()
|
| 48 |
image = Image.open(io.BytesIO(image_data))
|
| 49 |
image_filename = f"extracted_image_{len(images)}.png"
|
| 50 |
image.save(image_filename)
|
|
|
|
| 55 |
with pdfplumber.open(pdf_file) as pdf:
|
| 56 |
for page_num, page in enumerate(pdf.pages):
|
| 57 |
for table in page.extract_tables():
|
| 58 |
+
try:
|
| 59 |
+
if len(table) > 0 and len(set(table[0])) != len(table[0]):
|
| 60 |
+
unique_columns = []
|
| 61 |
+
for col in table[0]:
|
| 62 |
+
if col in unique_columns:
|
| 63 |
+
col = f"{col}_{unique_columns.count(col)}"
|
| 64 |
+
unique_columns.append(col)
|
| 65 |
+
df = pd.DataFrame(table[1:], columns=unique_columns)
|
| 66 |
+
else:
|
| 67 |
+
df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
|
| 68 |
+
tables.append(df)
|
| 69 |
+
except Exception as e:
|
| 70 |
+
print(f"Error processing table: {e}")
|
| 71 |
|
| 72 |
with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix="." + output_format.lower()) as tmp:
|
| 73 |
if output_format == "JSON":
|