Spaces:
Sleeping
Sleeping
File size: 5,833 Bytes
7dec78f e92fbe1 7dec78f 17d36dc b8d5f22 6544d14 7cb3598 6a30f2e 41a1dac b8d5f22 28f23fa e2fb9c7 28f23fa e2fb9c7 28f23fa 41a1dac b8d5f22 7cb3598 b8d5f22 875f540 b8d5f22 e2fb9c7 28f23fa b8d5f22 41a1dac 6544d14 e17150e b8d5f22 ce01472 3403d47 41a1dac 3403d47 ce01472 3403d47 41a1dac 3403d47 ce01472 3403d47 41a1dac 3403d47 ce01472 7cb3598 6a30f2e 41a1dac b8d5f22 6a30f2e b8d5f22 875f540 7dec78f 3403d47 7dec78f 5e94ef1 e2fb9c7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
import html
import io
import json
import os
import re
import tempfile
import traceback

import gradio as gr
import pandas as pd
import pdfplumber
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
from PIL import Image
def save_image(element, images):
    """Save the image(s) carried by a layout element and record their filenames.

    If ``element`` has a truthy ``stream`` attribute, its raw bytes are opened
    with PIL and written to ``extracted_image_<n>.png`` in the current working
    directory, where ``n`` is the number of images recorded so far.  If it has
    no stream but is iterable (a container such as a figure), its children are
    searched recursively so that images nested inside it are saved as well.

    Args:
        element: A layout element; either an image-like object exposing
            ``stream.get_rawdata()`` or an iterable container of elements.
        images: List that receives one ``{"filename": ...}`` dict per image
            saved.  Mutated in place.

    Extraction is best-effort: any failure while decoding or saving an image
    is printed and swallowed so one bad image does not abort the whole parse.
    """
    stream = getattr(element, "stream", None)
    if stream:
        try:
            image_data = stream.get_rawdata()
            image = Image.open(io.BytesIO(image_data))
            image_filename = f"extracted_image_{len(images)}.png"
            image.save(image_filename)
            images.append({"filename": image_filename})
        except Exception as e:
            print(f"Error extracting image: {e}")
        return

    # A container has no stream of its own; any images it holds are among its
    # children, so descend instead of giving up on the whole element.
    try:
        children = iter(element)
    except TypeError:
        print("No stream data for image element")
        return

    for child in children:
        # Only follow children that can hold an image (stream-bearing or
        # nested containers); other leaves are skipped silently.
        if hasattr(child, "stream") or hasattr(child, "__iter__"):
            save_image(child, images)
def detect_headers(text):
    """Mark header-looking lines of ``text`` with a leading ``# ``.

    The text is split on newlines and every line is stripped of surrounding
    whitespace.  A stripped line counts as a header when it matches any of:

    * one or more digits, a period, then a whitespace character;
    * nothing but uppercase ASCII letters and whitespace;
    * a capital letter, one or more lowercase letters, a whitespace
      character, then a digit.

    Returns:
        The stripped lines joined back together, each terminated by a
        newline, with header lines prefixed by ``# ``.
    """
    header_res = tuple(
        re.compile(expr)
        for expr in (r"^\d+\.\s", r"^[A-Z\s]+$", r"^[A-Z][a-z]+\s\d")
    )

    def render(raw_line):
        stripped = raw_line.strip()
        is_header = any(rx.match(stripped) for rx in header_res)
        return f"# {stripped}\n" if is_header else f"{stripped}\n"

    return "".join(render(raw_line) for raw_line in text.split('\n'))
def _dedupe_columns(header):
    """Return the header cells with repeated names made unique.

    The first occurrence of a name is kept as-is.  Each later occurrence gets
    a ``_<n>`` suffix, with ``n`` increased until the result does not collide
    with any name already emitted.
    """
    taken = set()
    suffix_counts = {}
    unique_columns = []
    for col in header:
        name = col
        while name in taken:
            suffix_counts[col] = suffix_counts.get(col, 0) + 1
            name = f"{col}_{suffix_counts[col]}"
        taken.add(name)
        unique_columns.append(name)
    return unique_columns


def _render_markdown(formatted_text, tables, images):
    """Build the Markdown document: extracted text, tables, then image links."""
    parts = [f"# Extracted Text\n\n{formatted_text}\n\n# Tables\n"]
    for i, table in enumerate(tables):
        if not table.columns.duplicated().any():
            parts.append(f"## Table {i+1}\n")
            parts.append(table.to_markdown(index=False) + "\n\n")
    parts.append("\n\n# Images\n\n")
    for image in images:
        image_path = os.path.join(os.getcwd(), image["filename"])
        parts.append(f"![Image]({image_path})\n")
    return "".join(parts)


def _render_html(formatted_text, tables, images):
    """Build the HTML document: extracted text, tables, then ``<img>`` tags."""
    # The text and paths originate from an uploaded PDF, so escape them
    # before embedding them in markup.
    parts = [f"<p>{html.escape(formatted_text)}</p>\n\n<h2>Tables</h2>\n"]
    for i, table in enumerate(tables):
        if not table.columns.duplicated().any():
            parts.append(f"<h2>Table {i+1}</h2>\n")
            parts.append(table.to_html() + "<br>")
    parts.append("\n\n<h2>Images</h2>\n\n")
    for image in images:
        image_path = os.path.join(os.getcwd(), image["filename"])
        parts.append(
            f'<img src="{html.escape(image_path, quote=True)}" alt="Image"><br>\n'
        )
    return "".join(parts)


def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
    """Parse a PDF, extract its text, tables and images, and write the result.

    Text boxes are concatenated and passed through ``detect_headers``; figure
    and image elements are handed to ``save_image``; tables are read with
    pdfplumber into DataFrames whose first row is used as the header.

    Args:
        pdf_file: Path to the uploaded PDF file.
        output_format: Desired output format ("JSON", "Markdown", or "HTML").
            Any other string yields an empty output file.
        progress: Gradio Progress object (currently unused by the body).

    Returns:
        tuple: ``(formatted_text, download_path)`` where ``download_path`` is
        the name of a temporary file holding the rendered output.  The file
        is created with ``delete=False`` so it outlives this call.
        Returns ``("", None)`` if any error occurs.
    """
    try:
        text_parts = []
        tables = []
        images = []

        with open(pdf_file, 'rb') as file:
            for page in extract_pages(file):
                for element in page:
                    if isinstance(element, LTTextBoxHorizontal):
                        text_parts.append(element.get_text())
                    elif isinstance(element, (LTFigure, LTImage)):
                        print(f"Processing element: {type(element)}")
                        save_image(element, images)
        formatted_text = detect_headers("".join(text_parts))

        with pdfplumber.open(pdf_file) as pdf:
            for page in pdf.pages:
                for table in page.extract_tables():
                    try:
                        if not table:
                            continue
                        header = table[0]
                        # An empty header row falls back to default columns.
                        columns = _dedupe_columns(header) if header else None
                        tables.append(pd.DataFrame(table[1:], columns=columns))
                    except Exception as e:
                        # Best-effort: one malformed table must not abort the parse.
                        print(f"Error processing table: {e}")

        # Explicit UTF-8 so non-ASCII text (JSON is written with
        # ensure_ascii=False) does not depend on the platform's default encoding.
        with tempfile.NamedTemporaryFile(
            mode="w",
            encoding="utf-8",
            delete=False,
            suffix="." + output_format.lower(),
        ) as tmp:
            if output_format == "JSON":
                json_data = {
                    "text": formatted_text,
                    "tables": [
                        table.to_dict(orient='records')
                        for table in tables
                        if not table.columns.duplicated().any()
                    ],
                    "images": images,
                }
                json.dump(json_data, tmp, ensure_ascii=False, indent=4)
            elif output_format == "Markdown":
                tmp.write(_render_markdown(formatted_text, tables, images))
            elif output_format == "HTML":
                tmp.write(_render_html(formatted_text, tables, images))
            download_path = tmp.name
        return formatted_text, download_path
    except Exception as main_e:
        traceback.print_exc()  # Print full traceback to console
        print(f"A main error occurred: {main_e}")
        return "", None
# Input side of the UI: the format selector shown next to the file upload.
_format_dropdown = gr.Dropdown(["JSON", "Markdown", "HTML"])

# Output side of the UI: the extracted text and the downloadable result file.
_text_output = gr.Text(label="Output Text")
_file_output = gr.File(label="Download Output")

# Wire parse_pdf to the widgets; its two return values map onto the two
# outputs in order.
iface = gr.Interface(
    fn=parse_pdf,
    inputs=["file", _format_dropdown],
    outputs=[_text_output, _file_output],
    title="PDF Parser",
    description="Parse a PDF and choose the output format.",
)

if __name__ == "__main__":
    # Temporarily disable sharing for debugging
    iface.launch()
|