Spaces:

sblumenf
/

pdf-convert

Sleeping

File size: 5,850 Bytes

import html
import io
import json
import os
import tempfile

import gradio as gr
import pandas as pd
import pdfplumber
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
from PIL import Image

def _unique_column_names(header):
    """Return the header cells as column labels, suffixing repeats so all are unique.

    The first occurrence of a label is kept as-is; later occurrences become
    ``<label>_1``, ``<label>_2``, ... skipping any name that is already taken.
    """
    used = set()
    names = []
    for label in header:
        name = label
        suffix = 0
        while name in used:
            suffix += 1
            name = f"{label}_{suffix}"
        used.add(name)
        names.append(name)
    return names


def _table_to_dataframe(table):
    """Convert one pdfplumber table (list of rows) to a DataFrame, or None if it is empty."""
    if not table:
        return None
    header, *rows = table
    # An empty header row falls back to pandas' default integer column labels.
    columns = _unique_column_names(header) if header else None
    return pd.DataFrame(rows, columns=columns)


def _iter_images(element):
    """Yield every LTImage at or below ``element``, descending into nested figures."""
    if isinstance(element, LTImage):
        yield element
    elif isinstance(element, LTFigure):
        for child in element:
            yield from _iter_images(child)


def _extract_text_and_images(pdf_path, progress):
    """Collect the text of all horizontal text boxes and save embedded images as PNG files."""
    text_parts = []
    images = []
    with open(pdf_path, 'rb') as file:
        pages = list(extract_pages(file))  # Materialized so the page count is known
        for i, page in enumerate(pages):
            progress(i / len(pages))  # Update progress bar
            for element in page:
                if isinstance(element, LTTextBoxHorizontal):
                    text_parts.append(element.get_text())
                    continue
                for lt_image in _iter_images(element):
                    image_filename = f"extracted_image_{len(images)}.png"
                    # Best effort: an image that cannot be decoded is reported
                    # and skipped without affecting the remaining images.
                    try:
                        image_data = lt_image.stream.get_data()
                        with Image.open(io.BytesIO(image_data)) as image:
                            image.save(image_filename)
                    except Exception as e:
                        print(f"Error extracting image: {e}")
                    else:
                        images.append({"filename": image_filename})
    return "".join(text_parts), images


def _extract_tables(pdf_path):
    """Extract every non-empty table in the PDF as a DataFrame using pdfplumber."""
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                df = _table_to_dataframe(table)
                if df is not None:
                    tables.append(df)
    return tables


def _render_json(text, tables, images):
    """Serialize the extracted content as an indented JSON document."""
    json_data = {
        "text": text,
        "tables": [table.to_dict(orient='records') for table in tables],
        "images": images
    }
    return json.dumps(json_data, indent=4)


def _render_markdown(text, tables, images):
    """Render the extracted content as Markdown."""
    parts = [f"# Extracted Text\n\n{text}\n\n# Tables\n"]
    for i, table in enumerate(tables):
        parts.append(f"## Table {i+1}\n")
        parts.append(table.to_markdown(index=False) + "\n\n")

    # Images are referenced by absolute path (current working directory).
    parts.append("\n\n# Images\n\n")
    for image in images:
        image_path = os.path.join(os.getcwd(), image["filename"])
        parts.append(f'![Image]({image_path})\n')
    return "".join(parts)


def _render_html(text, tables, images):
    """Render the extracted content as HTML, escaping text that came from the PDF."""
    parts = [f"<p>{html.escape(text)}</p>\n\n<h2>Tables</h2>\n"]
    for i, table in enumerate(tables):
        parts.append(f"<h2>Table {i+1}</h2>\n")
        parts.append(table.to_html() + "<br>")

    # Images are referenced by absolute path (current working directory).
    parts.append("\n\n<h2>Images</h2>\n\n")
    for image in images:
        image_path = os.path.join(os.getcwd(), image["filename"])
        parts.append(f'<img src="{html.escape(image_path)}" alt="Image"><br>\n')
    return "".join(parts)


# Output format name -> (renderer, file extension of the download).
_RENDERERS = {
    "JSON": (_render_json, ".json"),
    "Markdown": (_render_markdown, ".md"),
    "HTML": (_render_html, ".html"),
}


def _write_download_file(content, suffix):
    """Write ``content`` to a new UTF-8 temporary file and return its path."""
    # delete=False: the file has to outlive this function so that the caller
    # (the Gradio File output) can read it after we return.
    with tempfile.NamedTemporaryFile(
        "w", suffix=suffix, prefix="pdf_extract_", delete=False, encoding="utf-8"
    ) as handle:
        handle.write(content)
    return handle.name


def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
    """
    Parses a PDF file, extracts text, tables, and images, and formats the output.

    Text comes from pdfminer's horizontal text boxes, tables from pdfplumber,
    and embedded images are saved as ``extracted_image_<n>.png`` in the current
    working directory. The formatted result is written to a temporary file so
    it can be offered as a download.

    Args:
        pdf_file: Path to the uploaded PDF file, or an object exposing the
            path as a ``name`` attribute.
        output_format: Desired output format ("JSON", "Markdown", or "HTML").
        progress: Gradio Progress object for displaying progress.

    Returns:
        tuple: Extracted text and the path of the file containing the output
               in the specified format.
               Returns an empty string and None if there is an error
               (including a missing file or an unsupported output format).
    """
    try:
        if pdf_file is None:
            raise ValueError("No PDF file was provided.")
        if output_format not in _RENDERERS:
            raise ValueError(f"Unsupported output format: {output_format!r}")

        # Accept both a plain path and a file wrapper carrying the path in `.name`.
        pdf_path = getattr(pdf_file, "name", pdf_file)

        text, images = _extract_text_and_images(pdf_path, progress)
        tables = _extract_tables(pdf_path)

        render, suffix = _RENDERERS[output_format]
        download_path = _write_download_file(render(text, tables, images), suffix)
        return text, download_path

    except Exception as main_e:
        # Top-level handler boundary: report the problem and return empty outputs.
        print(f"A main error occurred: {main_e}")
        return "", None # Return empty string and None in case of error

# Gradio UI: one PDF upload plus an output-format selector in, the extracted
# text and a downloadable file out. The `progress` argument of parse_pdf is
# not listed as an input; it is filled from the function's default value.
iface = gr.Interface(
    fn=parse_pdf,
    inputs=[
        gr.File(label="PDF File", file_types=[".pdf"]),
        # A default value ensures a format is always passed to parse_pdf,
        # even when the user submits without touching the dropdown.
        gr.Dropdown(["JSON", "Markdown", "HTML"], value="JSON", label="Output Format"),
    ],
    outputs=[
        gr.Text(label="Output Text"),
        gr.File(label="Download Output")
    ],
    title="PDF Parser",
    description="Parse a PDF and choose the output format."
)

if __name__ == "__main__":
    # Serve locally only; no public share link is created.
    iface.launch(share=False)