Spaces:
Sleeping
Sleeping
File size: 1,358 Bytes
7dec78f c506d0d f598e4b 7dec78f 546291c 7dec78f f15272f 7dec78f 546291c 7dec78f 546291c 7dec78f 546291c c506d0d 546291c 7dec78f 5e94ef1 546291c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
import json
import gradio as gr
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
import os # Import os for file path manipulation
def _iter_image_names(layout_item):
    """Yield the ``name`` of every ``LTImage`` inside ``layout_item``.

    Images are looked for inside ``LTFigure`` containers, which are walked
    recursively because figures may nest.
    """
    if isinstance(layout_item, LTImage):
        yield layout_item.name
    elif isinstance(layout_item, LTFigure):
        for child in layout_item:
            yield from _iter_image_names(child)


def parse_pdf(pdf_file, output_format):
    """Extract the text of a PDF and write it to a downloadable file.

    Args:
        pdf_file: The uploaded PDF, given either as a filesystem path or as
            an object whose ``name`` attribute holds that path.
        output_format: One of ``"JSON"``, ``"Markdown"`` or ``"HTML"``.

    Returns:
        A ``(text, download_path)`` tuple: the extracted plain text and the
        path of a temporary file holding the text in the requested format.

    Raises:
        ValueError: If no file was supplied or ``output_format`` is not one
            of the supported formats.
    """
    # Imported locally so this function does not depend on edits to the
    # module's import block.
    import html
    import tempfile

    # Validate inputs before doing any (potentially slow) PDF parsing.
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    suffixes = {"JSON": ".json", "Markdown": ".md", "HTML": ".html"}
    if output_format not in suffixes:
        raise ValueError(f"Unsupported output format: {output_format!r}")

    # Accept both a plain path and a file wrapper exposing ``.name``.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    text = extract_text(pdf_path)
    title = os.path.splitext(os.path.basename(pdf_path))[0]

    if output_format == "JSON":
        # Only the JSON output reports images, so the layout pass is
        # skipped for the other formats.
        images = []
        for page_number, page in enumerate(extract_pages(pdf_path), start=1):
            for element in page:
                for image_name in _iter_image_names(element):
                    images.append({"page": page_number, "name": image_name})
        json_data = {
            "text": text,
            "tables": [],  # Table detection is not implemented here.
            "images": images,
        }
        download_data = json.dumps(json_data, ensure_ascii=False, indent=2)
    elif output_format == "Markdown":
        download_data = f"# {title}\n\n{text}"
    else:  # "HTML" -- the only remaining validated format.
        # Escape the extracted text so PDF content cannot inject markup.
        download_data = (
            "<!DOCTYPE html>\n"
            "<html>\n"
            f'<head><meta charset="utf-8"><title>{html.escape(title)}</title></head>\n'
            f"<body>\n<pre>{html.escape(text)}</pre>\n</body>\n"
            "</html>\n"
        )

    # The download output needs a file on disk rather than raw content;
    # delete=False keeps the file available after this function returns.
    with tempfile.NamedTemporaryFile(
        "w",
        suffix=suffixes[output_format],
        delete=False,
        encoding="utf-8",
    ) as handle:
        handle.write(download_data)
    return text, handle.name
# Gradio UI: a file upload plus a format selector feed parse_pdf, whose two
# return values populate the text box and the download component.
iface = gr.Interface(
    fn=parse_pdf,
    inputs=[
        "file",
        # A default value ensures parse_pdf never receives None as the
        # format when the user submits without touching the dropdown.
        gr.Dropdown(
            ["JSON", "Markdown", "HTML"],
            value="JSON",
            label="Output Format",
        ),
    ],
    outputs=[
        gr.Text(label="Output Text"),
        gr.File(label="Download Output"),
    ],
    title="PDF Parser",
    description="Parse a PDF and choose the output format.",
)

if __name__ == "__main__":
    # share=False because Gradio warns about sharing on Hugging Face Spaces.
    iface.launch(share=False)