import html
import json
import os  # Import os for file path manipulation
import tempfile

import gradio as gr
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage

def _collect_image_names(container, page_number, found):
    """Append a ``{"page", "name"}`` dict to *found* for each LTImage under *container*."""
    for child in container:
        if isinstance(child, LTImage):
            found.append({"page": page_number, "name": child.name})
        elif isinstance(child, LTFigure):
            # Figures can nest, so keep descending until the images are reached.
            _collect_image_names(child, page_number, found)


def _extract_pdf_content(pdf_path):
    """Return ``(text, tables, images)`` read from the PDF at *pdf_path*."""
    text_parts = []
    images = []
    for page_number, page in enumerate(extract_pages(pdf_path), start=1):
        text_parts.extend(
            element.get_text()
            for element in page
            if isinstance(element, LTTextBoxHorizontal)
        )
        _collect_image_names(page, page_number, images)
    # No table detection is performed here; the key is kept so the JSON
    # schema stays stable when real table extraction is plugged in.
    tables = []
    return "".join(text_parts), tables, images


def _render_download(output_format, text, tables, images):
    """Serialize the extracted content as JSON, Markdown, or HTML text."""
    if output_format == "JSON":
        payload = {
            "text": text,
            "tables": tables,
            "images": images,
        }
        return json.dumps(payload, ensure_ascii=False, indent=2)

    if output_format == "Markdown":
        lines = ["# Extracted Text", "", text.strip()]
        if images:
            lines += ["", "## Images", ""]
            lines += [f"- {img['name']} (page {img['page']})" for img in images]
        return "\n".join(lines) + "\n"

    # HTML: escape the text so PDF content cannot inject markup.
    image_items = "".join(
        f"<li>{html.escape(str(img['name']))} (page {img['page']})</li>"
        for img in images
    )
    image_section = f"<h2>Images</h2>\n<ul>{image_items}</ul>\n" if images else ""
    return (
        "<!DOCTYPE html>\n"
        '<html>\n<head><meta charset="utf-8"><title>Parsed PDF</title></head>\n'
        f"<body>\n<pre>{html.escape(text)}</pre>\n{image_section}</body>\n</html>\n"
    )


def parse_pdf(pdf_file, output_format):
    """Parse a PDF and return its text plus a downloadable export file.

    Args:
        pdf_file: The uploaded PDF, either a filesystem path or an object
            exposing the path as ``.name`` (the shape Gradio passes depends
            on its version and the File component's configuration).
        output_format: One of ``"JSON"``, ``"Markdown"`` or ``"HTML"``.

    Returns:
        A ``(text, download_path)`` tuple: the extracted text, and the path
        of a temporary file holding the export. A path is returned, not the
        content, because a ``gr.File`` output expects a file on disk.

    Raises:
        ValueError: If ``output_format`` is not supported or no file was
            provided.
    """
    suffixes = {"JSON": ".json", "Markdown": ".md", "HTML": ".html"}
    # Validate before doing any parsing work so bad requests fail fast with
    # a clear message instead of an unbound-variable error later on.
    if output_format not in suffixes:
        raise ValueError(
            f"Unsupported output format: {output_format!r}; "
            f"expected one of {sorted(suffixes)}"
        )
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")

    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    text, tables, images = _extract_pdf_content(pdf_path)
    content = _render_download(output_format, text, tables, images)

    # delete=False: the file must outlive this call so the UI can serve it.
    with tempfile.NamedTemporaryFile(
        mode="w",
        suffix=suffixes[output_format],
        prefix="parsed_pdf_",
        delete=False,
        encoding="utf-8",
    ) as handle:
        handle.write(content)

    return text, handle.name

# Web UI: a PDF upload and a format selector go in; the extracted text and a
# downloadable export file come out.
iface = gr.Interface(
    fn=parse_pdf,
    inputs=[
        # Same component the "file" shortcut creates, limited to PDF uploads.
        gr.File(label="PDF File", file_types=[".pdf"]),
        # Explicit default so the first submission already carries a valid
        # format instead of relying on Gradio's default-selection behavior.
        gr.Dropdown(
            choices=["JSON", "Markdown", "HTML"],
            value="JSON",
            label="Output Format",
        ),
    ],
    outputs=[
        gr.Text(label="Output Text"),
        gr.File(label="Download Output"),
    ],
    title="PDF Parser",
    description="Parse a PDF and choose the output format.",
)

if __name__ == "__main__":
    # share=False: no public share link is requested; Gradio warns when one
    # is requested on Hugging Face Spaces.
    iface.launch(share=False)