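# pdf-convert / app.py
# Hosting-page header captured with the file (not part of the program):
# uploader "sblumenf", commit message "Update app.py",
# revision b7f45c8 (verified), file size 2.38 kB.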
import gradio as gr
import fitz # PyMuPDF for PDF handling
def convert_pdf(input_file, output_format):
    """
    Convert an uploaded PDF file to Markdown, HTML, or JSON.

    The text of every page is extracted with PyMuPDF and written to a file
    named ``output.<ext>`` in the current working directory.

    Args:
        input_file: Uploaded PDF file. Either an object exposing the file's
            path as ``.name`` or a plain path string.
        output_format: Dropdown label of the desired format: one of
            "Markdown (.md)", "HTML (.html)" or "JSON (.json)".

    Returns:
        Path to the converted file ("output.md", "output.html" or
        "output.json"), or the string "Unsupported output format!" when
        ``output_format`` is not one of the labels above (including ``None``).

    Raises:
        ValueError: If a supported format was requested but ``input_file``
            is ``None``.
    """
    import json  # only the JSON branch needs it; kept function-local

    # Dropdown label -> (file extension, PyMuPDF get_text() mode).
    # The extension comes from this table rather than from the label's first
    # word, which would name the Markdown output "output.markdown".
    formats = {
        "Markdown (.md)": ("md", "text"),
        "HTML (.html)": ("html", "html"),
        "JSON (.json)": ("json", "text"),
    }

    # Validate the format before touching the PDF so that a bad (or missing)
    # selection neither opens the document nor raises on None.
    selected = formats.get(output_format)
    if selected is None:
        return "Unsupported output format!"
    extension, text_mode = selected

    if input_file is None:
        raise ValueError("No PDF file was uploaded.")

    # Accept both an upload object with a .name path and a bare path string.
    pdf_path = getattr(input_file, "name", input_file)
    output_file_path = f"output.{extension}"

    pdf_document = fitz.open(pdf_path)
    try:
        # Extract every page up front; basic extraction only (Markdown output
        # is plain page text without added markdown syntax).
        page_texts = [
            pdf_document.load_page(page_num).get_text(text_mode)
            for page_num in range(pdf_document.page_count)
        ]
    finally:
        # Always release the document, even if extraction fails.
        pdf_document.close()

    # Explicit UTF-8 so non-ASCII PDF text does not depend on the locale.
    with open(output_file_path, "w", encoding="utf-8") as f:
        if extension == "json":
            # One record per page, with 1-based page numbers.
            json.dump(
                [
                    {"page": page_num, "text": text}
                    for page_num, text in enumerate(page_texts, start=1)
                ],
                f,
            )
        else:
            # Markdown and HTML are the per-page outputs written back to back.
            f.writelines(page_texts)

    return output_file_path
# --- Gradio UI wiring (component-object API, per the original "Gradio v3.x" note) ---

# Format selector; its labels are the exact strings convert_pdf compares against.
output_format_dropdown = gr.Dropdown(
    label="Select Output File Format",
    choices=[
        "Markdown (.md)",
        "HTML (.html)",
        "JSON (.json)",
    ],
)

# Upload slot for the source PDF and download slot for the converted result.
file_input = gr.File(label="Upload PDF File")
output_file = gr.File(label="Download Converted File")

# Single-function interface: (uploaded file, chosen format) -> converted file.
gr_interface = gr.Interface(
    title="PDF Converter",
    description=(
        "Upload a PDF file and select the desired output format "
        "(Markdown, HTML, or JSON)."
    ),
    fn=convert_pdf,
    inputs=[file_input, output_format_dropdown],
    outputs=output_file,
)

# Start the web app as soon as the script runs.
gr_interface.launch()