# Spaces: Sleeping  (apparently hosting-page status text captured along with the source; not part of the program)
import html
import json
import os  # Import os for file path manipulation
import tempfile

import gradio as gr
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
# Supported output formats mapped to the extension of the generated download.
_DOWNLOAD_SUFFIXES = {"JSON": ".json", "Markdown": ".md", "HTML": ".html"}


def _iter_images(container):
    """Yield every LTImage under *container*, descending into nested figures."""
    for item in container:
        if isinstance(item, LTImage):
            yield item
        elif isinstance(item, LTFigure):
            yield from _iter_images(item)


def _collect_images(pdf_path):
    """Return one ``{"page": ..., "name": ...}`` dict per image in the PDF."""
    return [
        {"page": page.pageid, "name": image.name}
        for page in extract_pages(pdf_path)
        for image in _iter_images(page)
    ]


def _render_download(title, text, tables, images, output_format):
    """Serialise the extracted content as JSON, Markdown or HTML text."""
    if output_format == "JSON":
        return json.dumps({"text": text, "tables": tables, "images": images})
    if output_format == "Markdown":
        return f"# {title}\n\n{text}"
    # HTML: the PDF content is untrusted, so escape it before embedding.
    return (
        "<!DOCTYPE html>\n"
        "<html>\n"
        f'<head><meta charset="utf-8"><title>{html.escape(title)}</title></head>\n'
        "<body>\n"
        f"<pre>{html.escape(text)}</pre>\n"
        "</body>\n"
        "</html>\n"
    )


def parse_pdf(pdf_file, output_format):
    """Extract a PDF's text and write it to a downloadable file.

    Args:
        pdf_file: The uploaded PDF, either a filesystem path or an object
            exposing the path as ``.name``. ``None`` means nothing was
            uploaded.
        output_format: One of ``"JSON"``, ``"Markdown"`` or ``"HTML"``.

    Returns:
        A ``(text, download_path)`` tuple: the extracted plain text and the
        path of a temporary file holding the converted output. Returns
        ``("", None)`` when no file was supplied.

    Raises:
        ValueError: If ``output_format`` is not a supported format.
    """
    if pdf_file is None:
        return "", None
    # Validate before parsing so a bad format fails fast instead of after
    # the (potentially slow) PDF extraction.
    if output_format not in _DOWNLOAD_SUFFIXES:
        supported = ", ".join(_DOWNLOAD_SUFFIXES)
        raise ValueError(
            f"Unsupported output format {output_format!r}; expected one of: {supported}"
        )

    # Accept both a plain path and a file wrapper that carries it in ``.name``.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    title = os.path.splitext(os.path.basename(pdf_path))[0]

    text = extract_text(pdf_path)
    tables = []  # Table detection is not implemented; key kept for a stable schema.
    images = _collect_images(pdf_path)

    download_data = _render_download(title, text, tables, images, output_format)

    # The file output needs a path rather than the content itself.
    # delete=False keeps the file on disk after this function returns.
    with tempfile.NamedTemporaryFile(
        "w",
        suffix=_DOWNLOAD_SUFFIXES[output_format],
        prefix=f"{title}_",
        delete=False,
        encoding="utf-8",
    ) as handle:
        handle.write(download_data)
    return text, handle.name
# Web UI: a PDF upload and a format selector go in; the extracted text and a
# downloadable converted file come out.
iface = gr.Interface(
    fn=parse_pdf,
    inputs=[
        gr.File(label="PDF File", file_types=[".pdf"]),
        # Pre-select a format so an untouched dropdown never passes None.
        gr.Dropdown(
            ["JSON", "Markdown", "HTML"],
            value="JSON",
            label="Output Format",
        ),
    ],
    outputs=[
        gr.Text(label="Output Text"),
        gr.File(label="Download Output"),
    ],
    title="PDF Parser",
    description="Parse a PDF and choose the output format.",
)

if __name__ == "__main__":
    # Set share=False as Gradio warns about it on Hugging Face Spaces.
    iface.launch(share=False)