Spaces:

sblumenf
/

pdf-convert

Sleeping

File size: 2,931 Bytes

7dec78f

import html
import json
import tempfile

import gradio as gr
import mistletoe  # for Markdown table generation (optional)
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage

def parse_pdf(pdf_file, output_format):
    """Extract the text of a PDF and write it out in the requested format.

    Args:
        pdf_file: Path to the PDF, or an object exposing that path as
            ``.name`` (NOTE(review): some Gradio versions pass a tempfile
            wrapper to "file" inputs, others a plain path string -- confirm
            against the installed version).
        output_format: One of "JSON", "Markdown" or "HTML".

    Returns:
        A ``(text, output_path)`` tuple: the concatenated text of every
        horizontal text box, and the path of a temporary file holding the
        UTF-8 encoded document in the requested format. The temporary file is
        not deleted by this function.

    Raises:
        ValueError: If ``output_format`` is not supported or ``pdf_file`` is
            ``None``.
    """
    suffixes = {"JSON": ".json", "Markdown": ".md", "HTML": ".html"}

    # Validate up front: previously an unselected/unknown format only failed
    # after the whole PDF had been parsed, with an UnboundLocalError.
    if output_format not in suffixes:
        raise ValueError(
            f"Unsupported output format: {output_format!r}; "
            f"expected one of {', '.join(suffixes)}"
        )
    if pdf_file is None:
        raise ValueError("No PDF file was provided")

    # Accept both a plain path and a file-like wrapper carrying the path.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    text_parts = []
    tables = []  # Placeholder: table extraction is not implemented yet
    images = []  # JSON-safe descriptions of figure/image elements

    with open(pdf_path, "rb") as file:
        # extract_pages is lazy, so it must be consumed while the file is open.
        for page in extract_pages(file):
            for element in page:
                if isinstance(element, LTTextBoxHorizontal):
                    text_parts.append(element.get_text())
                elif isinstance(element, (LTFigure, LTImage)):
                    # Raw layout objects are not JSON serializable; keep a
                    # plain-dict summary instead.
                    images.append(_describe_image(element))

    text = "".join(text_parts)

    if output_format == "JSON":
        document = json.dumps({
            "text": text,
            "tables": tables,
            "images": images,
        })
    elif output_format == "Markdown":
        # No tables are extracted yet, so the table slot stays empty. The
        # former mistletoe call was invalid: mistletoe.markdown() renders
        # Markdown text to HTML and has no ``convert_table`` option.
        markdown_tables = ""
        document = f"# Extracted Text\n\n{text}\n\n{markdown_tables}\n\n# Images\n"
    else:  # "HTML"
        html_tables = "<table></table>"  # Placeholder until tables are extracted
        # Escape the PDF text so characters such as "<" or "&" cannot break
        # (or inject markup into) the generated page.
        document = (
            f"<p>{html.escape(text)}</p>\n\n<h2>Tables</h2>\n{html_tables}"
            f"\n\n<h2>Images</h2>\n"
        )

    # The download component is fed a file path rather than raw bytes, so
    # persist the document to a temporary file and return its path.
    with tempfile.NamedTemporaryFile(
        mode="wb",
        prefix="pdf_convert_",
        suffix=suffixes[output_format],
        delete=False,
    ) as output_file:
        output_file.write(document.encode("utf-8"))

    return text, output_file.name


def _describe_image(element):
    """Summarize a figure/image layout element as a JSON-serializable dict."""
    bbox = getattr(element, "bbox", None)
    return {
        "type": type(element).__name__,
        "name": getattr(element, "name", None),
        "bbox": list(bbox) if bbox is not None else None,
    }

# Input widgets: the uploaded PDF and the export format to generate.
format_dropdown = gr.Dropdown(["JSON", "Markdown", "HTML"])
input_components = ["file", format_dropdown]

# Output widgets: the extracted text and the generated file for download.
text_output = gr.Text(label="Output Text")
file_output = gr.File(label="Download Output")

# Web UI that forwards the inputs to parse_pdf and displays its two results.
iface = gr.Interface(
    title="PDF Parser",
    description="Parse a PDF and choose the output format.",
    fn=parse_pdf,
    inputs=input_components,
    outputs=[text_output, file_output],
)

if __name__ == "__main__":
    # share=True asks Gradio to create a public link for the app.
    iface.launch(share=True)