Spaces:
Sleeping
Sleeping
File size: 5,583 Bytes
7dec78f e92fbe1 7dec78f 17d36dc b8d5f22 6544d14 7cb3598 6a30f2e b8d5f22 7cb3598 b8d5f22 875f540 b8d5f22 17d36dc b8d5f22 6544d14 0a3a380 3403d47 0a3a380 875f540 6544d14 b8d5f22 ce01472 3403d47 ce01472 3403d47 ce01472 3403d47 ce01472 7cb3598 6a30f2e 3403d47 b8d5f22 6a30f2e b8d5f22 875f540 7dec78f 3403d47 7dec78f 5e94ef1 ce01472 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
import html
import io
import json
import os
import tempfile
import traceback

import gradio as gr
import pandas as pd
import pdfplumber
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
from PIL import Image
def _iter_images(element):
    """Yield every LTImage at or below *element*, descending into nested figures."""
    if isinstance(element, LTImage):
        yield element
    elif isinstance(element, LTFigure):
        for child in element:
            yield from _iter_images(child)


def _extract_text_and_images(pdf_path):
    """Return ``(text, images)`` collected from the pdfminer layout of *pdf_path*."""
    text_parts = []
    images = []
    with open(pdf_path, "rb") as file:
        for page in extract_pages(file):
            for element in page:
                if isinstance(element, LTTextBoxHorizontal):
                    text_parts.append(element.get_text())
                    continue
                for lt_image in _iter_images(element):
                    # One try per image so a single undecodable image does
                    # not discard the remaining images of the same figure.
                    try:
                        # PDFStream exposes its decoded bytes via get_data().
                        image_data = lt_image.stream.get_data()
                        image = Image.open(io.BytesIO(image_data))
                        image_filename = f"extracted_image_{len(images)}.png"
                        # NOTE(review): images are written to the current
                        # working directory under a per-call counter, so
                        # successive calls overwrite each other's files.
                        image.save(image_filename)
                    except Exception as e:
                        # Best effort: report and keep going.
                        print(f"Error extracting image: {e}")
                    else:
                        images.append({"filename": image_filename})
    return "".join(text_parts), images


def _unique_column_names(header):
    """Return *header* with blank cells named and repeated names suffixed until unique."""
    names = []
    seen = set()
    for position, cell in enumerate(header, start=1):
        # Blank/None header cells get a positional placeholder name.
        base = str(cell) if cell else f"column_{position}"
        candidate = base
        suffix = 1
        # Keep bumping the suffix until the name is unused, so three or more
        # identical headers still end up distinct (a, a_1, a_2, ...).
        while candidate in seen:
            candidate = f"{base}_{suffix}"
            suffix += 1
        seen.add(candidate)
        names.append(candidate)
    return names


def _extract_tables(pdf_path):
    """Return one DataFrame per table pdfplumber finds in *pdf_path*."""
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                if not table:
                    # No header row to read; nothing to record.
                    continue
                header, *rows = table
                columns = _unique_column_names(header) if header else None
                tables.append(pd.DataFrame(rows, columns=columns))
    return tables


def _render_json(text, tables, images):
    """Serialize the extracted content as an indented JSON document."""
    payload = {
        "text": text,
        "tables": [table.to_dict(orient="records") for table in tables],
        "images": images,
    }
    return json.dumps(payload, ensure_ascii=False, indent=4)


def _render_markdown(text, tables, images):
    """Render the extracted content as a Markdown document."""
    parts = [f"# Extracted Text\n\n{text}\n\n# Tables\n"]
    for number, table in enumerate(tables, start=1):
        parts.append(f"## Table {number}\n")
        parts.append(table.to_markdown(index=False) + "\n\n")
    parts.append("\n\n# Images\n\n")
    for image in images:
        image_path = os.path.join(os.getcwd(), image["filename"])
        parts.append(f"![Image]({image_path})\n")
    return "".join(parts)


def _render_html(text, tables, images):
    """Render the extracted content as an HTML fragment."""
    # The text comes straight from the uploaded PDF, so escape it before
    # embedding it in markup.
    parts = [f"<p>{html.escape(text)}</p>\n\n<h2>Tables</h2>\n"]
    for number, table in enumerate(tables, start=1):
        parts.append(f"<h2>Table {number}</h2>\n")
        parts.append(table.to_html() + "<br>")
    parts.append("\n\n<h2>Images</h2>\n\n")
    for image in images:
        image_path = os.path.join(os.getcwd(), image["filename"])
        parts.append(
            f'<img src="{html.escape(image_path, quote=True)}" alt="Image"><br>\n'
        )
    return "".join(parts)


# Maps each supported output format to the function that renders it.
_RENDERERS = {
    "JSON": _render_json,
    "Markdown": _render_markdown,
    "HTML": _render_html,
}


def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
    """
    Parses a PDF file, extracts text, tables, and images, and formats the output.

    Text and images are read from the pdfminer layout; tables are read with
    pdfplumber. The formatted result is written to a temporary file that is
    not deleted automatically, so it can be offered as a download.

    Args:
        pdf_file: Path to the uploaded PDF file, or an upload object exposing
            the path through a ``name`` attribute.
        output_format: Desired output format ("JSON", "Markdown", or "HTML").
        progress: Gradio Progress object. Kept in the signature for Gradio;
            this function does not update it.

    Returns:
        tuple: The extracted text and the path of the generated download file.
            Returns an empty string and None if there is an error, including
            a missing file or an unsupported output format.
    """
    try:
        if pdf_file is None:
            raise ValueError("No PDF file was provided.")
        if output_format not in _RENDERERS:
            raise ValueError(f"Unsupported output format: {output_format!r}")

        # Plain paths are used as-is; upload wrappers carry the path in .name.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = os.fspath(pdf_file)
        else:
            pdf_path = pdf_file.name

        text, images = _extract_text_and_images(pdf_path)
        tables = _extract_tables(pdf_path)
        rendered = _RENDERERS[output_format](text, tables, images)

        # Explicit UTF-8: the output may contain non-ASCII text regardless of
        # the platform's default encoding.
        with tempfile.NamedTemporaryFile(
            mode="w",
            delete=False,
            suffix="." + output_format.lower(),
            encoding="utf-8",
        ) as tmp:
            tmp.write(rendered)
            download_path = tmp.name
        return text, download_path
    except Exception as main_e:
        # Top-level UI boundary: log everything and hand back empty outputs
        # instead of propagating the failure.
        traceback.print_exc()  # Print full traceback to console
        print(f"A main error occurred: {main_e}")
        return "", None
# Gradio UI: a file upload and an output-format selector feed parse_pdf, which
# returns the extracted text and a downloadable file.
iface = gr.Interface(
    fn=parse_pdf,
    inputs=[
        "file",
        # Preselect a format so parse_pdf does not receive None when the user
        # submits without touching the dropdown.
        gr.Dropdown(
            ["JSON", "Markdown", "HTML"],
            value="JSON",
            label="Output Format",
        ),
    ],
    outputs=[
        gr.Text(label="Output Text"),
        gr.File(label="Download Output"),
    ],
    title="PDF Parser",
    description="Parse a PDF and choose the output format.",
)

if __name__ == "__main__":
    # No share argument is passed: sharing is temporarily left off for debugging.
    iface.launch()
|