"""Gradio app that parses an uploaded PDF and offers the result as a download."""

import html
import json
import os  # Used to normalise the uploaded file into a filesystem path
import tempfile

import gradio as gr
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage


def _resolve_pdf_path(pdf_file):
    """Return a filesystem path for the value Gradio passes for the upload.

    Depending on the Gradio version/configuration the "file" input delivers
    either a path string or a tempfile-like object exposing ``.name``.
    """
    if isinstance(pdf_file, (str, os.PathLike)):
        return os.fspath(pdf_file)
    return pdf_file.name


def _iter_images(container):
    """Yield every LTImage under *container*, descending into LTFigure nodes."""
    for element in container:
        if isinstance(element, LTImage):
            yield element
        elif isinstance(element, LTFigure):
            # Images are normally wrapped in a figure, possibly nested.
            yield from _iter_images(element)


def _collect_images(pdf_path):
    """Return one ``{"name", "page"}`` dict per image found in the PDF.

    ``name`` is the image's resource name as reported by pdfminer; the image
    bytes themselves are not exported.
    """
    return [
        {"name": image.name, "page": page_number}
        for page_number, page_layout in enumerate(extract_pages(pdf_path), start=1)
        for image in _iter_images(page_layout)
    ]


def _render_json(text, tables, images):
    """Serialize the extracted data as a JSON document."""
    payload = {"text": text, "tables": tables, "images": images}
    return json.dumps(payload, ensure_ascii=False, indent=2)


def _render_markdown(text, tables, images):
    """Render the extracted data as a minimal Markdown document."""
    lines = ["# Extracted Text", "", text.strip()]
    if images:
        lines += ["", "## Images", ""]
        lines += [f"- {image['name']} (page {image['page']})" for image in images]
    return "\n".join(lines) + "\n"


def _render_html(text, tables, images):
    """Render the extracted data as a minimal standalone HTML page."""
    # The text comes from an arbitrary uploaded PDF, so it must be escaped.
    parts = [
        "<!DOCTYPE html>",
        '<html><head><meta charset="utf-8"><title>Extracted Text</title></head>',
        "<body>",
        f"<pre>{html.escape(text)}</pre>",
    ]
    if images:
        items = "".join(
            f"<li>{html.escape(str(image['name']))} (page {image['page']})</li>"
            for image in images
        )
        parts.append(f"<ul>{items}</ul>")
    parts.append("</body></html>")
    return "\n".join(parts) + "\n"


# Output format -> (renderer, file extension of the generated download).
_RENDERERS = {
    "JSON": (_render_json, ".json"),
    "Markdown": (_render_markdown, ".md"),
    "HTML": (_render_html, ".html"),
}


def parse_pdf(pdf_file, output_format):
    """Extract the contents of a PDF and write them out in the chosen format.

    Args:
        pdf_file: The uploaded PDF as supplied by Gradio's "file" input
            (a path string or an object with a ``.name`` path attribute).
        output_format: One of ``"JSON"``, ``"Markdown"`` or ``"HTML"``.

    Returns:
        A ``(text, download_path)`` tuple: the extracted plain text for the
        text box, and the path of a temporary file holding the formatted
        output, which is what the ``gr.File`` output component expects.

    Raises:
        gr.Error: If no file was uploaded or the format is not supported.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file.")
    if output_format not in _RENDERERS:
        # Previously this path fell through and hit an unbound local.
        raise gr.Error(f"Unsupported output format: {output_format!r}")

    pdf_path = _resolve_pdf_path(pdf_file)

    text = extract_text(pdf_path)
    images = _collect_images(pdf_path)
    # NOTE: pdfminer has no table detection; the key is kept so the output
    # schema stays stable once real table extraction is added.
    tables = []

    render, extension = _RENDERERS[output_format]
    download_data = render(text, tables, images)

    # gr.File serves a file from disk, so persist the rendered output and
    # return its path rather than the raw string. delete=False keeps the file
    # alive after the handle closes so Gradio can read it.
    with tempfile.NamedTemporaryFile(
        mode="w",
        suffix=extension,
        prefix="parsed_pdf_",
        delete=False,
        encoding="utf-8",
    ) as handle:
        handle.write(download_data)
        download_path = handle.name

    return text, download_path


iface = gr.Interface(
    fn=parse_pdf,
    inputs=[
        "file",
        # A default value avoids passing None when the user picks nothing.
        gr.Dropdown(["JSON", "Markdown", "HTML"], value="JSON", label="Output Format"),
    ],
    outputs=[
        gr.Text(label="Output Text"),
        gr.File(label="Download Output"),
    ],
    title="PDF Parser",
    description="Parse a PDF and choose the output format.",
)

if __name__ == "__main__":
    # share=False because Gradio warns about sharing on Hugging Face Spaces.
    iface.launch(share=False)