# pdf-convert / app.py
# Last change by sblumenf: "Update app.py" (commit 5ebff26, verified); file size 2.04 kB.
import html
import json
import os

import gradio as gr
import PyPDF2
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal, LTFigure
# Output formats accepted by parse_pdf (mirrors the choices offered in the UI).
_OUTPUT_FORMATS = ("JSON", "Markdown", "HTML")


def _figure_summary(figure, index):
    """Return a JSON-serializable description of one extracted figure."""
    # NOTE(review): `name` and `bbox` are read defensively because the figure
    # objects come from pdfminer; confirm these attributes against the
    # installed pdfminer version.
    bbox = getattr(figure, "bbox", None)
    return {
        "index": index,
        "name": str(getattr(figure, "name", "") or ""),
        "bbox": [float(coord) for coord in bbox] if bbox is not None else None,
    }


def _render_markdown(text, figure_summaries):
    """Render the extracted text and a figure list as Markdown."""
    parts = [f"# Extracted Text\n\n{text}\n\n# Figures\n"]
    for summary in figure_summaries:
        label = summary["name"] or "unnamed"
        # Figure image export is not implemented, so list the figure instead
        # of emitting an image link that points nowhere.
        parts.append(f"\n- Figure {summary['index']}: {label}")
    return "".join(parts)


def _render_html(text, figure_summaries):
    """Render the extracted text and a figure list as HTML."""
    # Escape PDF-derived text so markup characters in the document cannot
    # break (or inject into) the generated HTML.
    parts = [f"<p>{html.escape(text)}</p>\n"]
    for summary in figure_summaries:
        label = html.escape(summary["name"] or "unnamed")
        parts.append(f"<p>Figure {summary['index']}: {label}</p>\n")
    return "".join(parts)


def parse_pdf(pdf_file, output_format):
    """Extract text and figures from a PDF and render them as a string.

    Args:
        pdf_file: Path to the PDF, or an object exposing the path through a
            ``name`` attribute (e.g. an uploaded temporary file).
        output_format: One of ``"JSON"``, ``"Markdown"`` or ``"HTML"``.

    Returns:
        A string in the requested format. JSON output is a serialized object
        with ``"text"`` and ``"figures"`` keys; each figure is described by
        its index, name and bounding box.

    Raises:
        ValueError: If ``output_format`` is not supported or no file is given.
    """
    if output_format not in _OUTPUT_FORMATS:
        raise ValueError(
            f"Unsupported output format {output_format!r}; "
            f"expected one of {', '.join(_OUTPUT_FORMATS)}."
        )

    # Accept both a plain path and a file-like upload object. Path objects are
    # checked first because their own ``name`` attribute is only the basename.
    if isinstance(pdf_file, (str, bytes, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = getattr(pdf_file, "name", None)
    if not pdf_path:
        raise ValueError("No PDF file was provided.")

    text_parts = []
    figures = []
    # extract_pages yields pages lazily, so it must be fully consumed while
    # the file is still open.
    with open(pdf_path, "rb") as file:
        for page in extract_pages(file):
            for element in page:
                if isinstance(element, LTTextBoxHorizontal):
                    text_parts.append(element.get_text())
                elif isinstance(element, LTFigure):
                    figures.append(element)
    # TODO: table extraction is not implemented yet.

    text = "".join(text_parts)
    figure_summaries = [
        _figure_summary(figure, index)
        for index, figure in enumerate(figures, start=1)
    ]

    if output_format == "JSON":
        # Serialize explicitly: raw layout objects are not JSON-serializable
        # and the UI output component expects text.
        return json.dumps(
            {"text": text, "figures": figure_summaries},
            ensure_ascii=False,
            indent=2,
        )
    if output_format == "Markdown":
        return _render_markdown(text, figure_summaries)
    return _render_html(text, figure_summaries)
# Create the Gradio interface: a file upload plus a format selector feed
# parse_pdf, whose result is shown as plain text.
iface = gr.Interface(
    fn=parse_pdf,
    inputs=[
        "file",
        # Preselect a format so the first submit never passes None as the
        # output format.
        gr.Dropdown(
            ["JSON", "Markdown", "HTML"],
            value="JSON",
            label="Output format",
        ),
    ],
    outputs="text",
    title="PDF Parser",
    description="Parse a PDF and choose the output format.",
)

# Launch the Gradio app only when this file is run as a script, so importing
# the module does not start a server.
if __name__ == "__main__":
    iface.launch()