Spaces:

sblumenf
/

pdf-convert

Sleeping

File size: 1,358 Bytes

7dec78f
 
 
 
c506d0d
f598e4b
7dec78f
546291c
7dec78f
 
 
 
 
 
f15272f
7dec78f
546291c
7dec78f
 
546291c
 
7dec78f
 
546291c
 
c506d0d
546291c
7dec78f
 
 
 
 
 
 
 
 
 
 
 
5e94ef1
546291c

import html
import json
import os  # Import os for file path manipulation
import tempfile

import gradio as gr
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage

# File extension of the download produced for each supported output format.
_OUTPUT_SUFFIXES = {"JSON": ".json", "Markdown": ".md", "HTML": ".html"}


def _render_html(text):
    """Return a minimal standalone HTML page that shows *text* verbatim."""
    return (
        "<!DOCTYPE html>\n"
        "<html>\n"
        '<head><meta charset="utf-8"><title>PDF Output</title></head>\n'
        # Escape so that "<", ">" and "&" in the PDF text cannot become markup.
        f"<body>\n<pre>{html.escape(text)}</pre>\n</body>\n"
        "</html>\n"
    )


def parse_pdf(pdf_file, output_format):
    """Extract the text of a PDF and write it to a downloadable file.

    Args:
        pdf_file: The uploaded PDF. Either a filesystem path or an object
            exposing the path as ``.name``. ``None`` means nothing was
            uploaded.
        output_format: One of ``"JSON"``, ``"Markdown"`` or ``"HTML"``.

    Returns:
        A ``(text, download_path)`` tuple: the extracted text and the path of
        a temporary file holding the converted output. When ``pdf_file`` is
        ``None`` the result is ``("", None)``.

    Raises:
        ValueError: If ``output_format`` is not a supported format.
    """
    # Validate first so an unknown format fails with a clear message instead
    # of an unbound ``download_data`` further down.
    if output_format not in _OUTPUT_SUFFIXES:
        supported = ", ".join(_OUTPUT_SUFFIXES)
        raise ValueError(
            f"Unsupported output format {output_format!r}; "
            f"choose one of: {supported}"
        )

    # Nothing uploaded: return empty outputs rather than crashing.
    if pdf_file is None:
        return "", None

    # Accept both a plain path and an upload wrapper that carries ``.name``.
    pdf_path = os.fspath(getattr(pdf_file, "name", pdf_file))
    text = extract_text(pdf_path)

    # TODO: table and image extraction are not implemented yet; the keys are
    # kept in the JSON output so its shape stays stable once they are.
    tables = []
    images = []

    if output_format == "JSON":
        json_data = {
            "text": text,
            "tables": tables,
            "images": images,
        }
        download_data = json.dumps(json_data, ensure_ascii=False, indent=2)
    elif output_format == "Markdown":
        # Minimal conversion: the extracted plain text is emitted unchanged.
        download_data = text
    else:  # "HTML" -- the only remaining validated format
        download_data = _render_html(text)

    # The File output component needs a path on disk, not the content itself.
    # delete=False keeps the file alive after this function returns so it can
    # still be served for download.
    with tempfile.NamedTemporaryFile(
        "w",
        suffix=_OUTPUT_SUFFIXES[output_format],
        delete=False,
        encoding="utf-8",
    ) as handle:
        handle.write(download_data)

    return text, handle.name

# Web UI: upload a PDF, pick an output format, get the extracted text plus a
# downloadable converted file.
iface = gr.Interface(
    fn=parse_pdf,
    inputs=[
        gr.File(label="PDF File", file_types=[".pdf"]),
        # A default value ensures parse_pdf never receives None as the format
        # when the user submits without touching the dropdown.
        gr.Dropdown(
            ["JSON", "Markdown", "HTML"],
            value="JSON",
            label="Output Format",
        ),
    ],
    outputs=[
        gr.Text(label="Output Text"),
        gr.File(label="Download Output"),
    ],
    title="PDF Parser",
    description="Parse a PDF and choose the output format.",
)

if __name__ == "__main__":
    # share=False because Gradio warns about sharing on Hugging Face Spaces.
    iface.launch(share=False)