# Spaces:
# Sleeping
# Sleeping
import json
import os
import tempfile

import fitz  # PyMuPDF for PDF handling
import gradio as gr
def convert_pdf(input_file, output_format):
    """
    Convert a PDF file to the specified format.

    Args:
        input_file: Uploaded PDF file. Either an object exposing the file
            path as ``.name`` or a plain path string (which of the two the
            UI passes depends on the Gradio version/configuration --
            TODO confirm against the deployed version).
        output_format: Desired output format; one of "Markdown (.md)",
            "HTML (.html)" or "JSON (.json)".

    Returns:
        Path to the converted file, or the literal string
        "Unsupported output format!" when ``output_format`` is not one of
        the supported labels (including ``None``). The message is returned
        rather than raised to stay compatible with the previous behaviour.

    Raises:
        ValueError: If ``input_file`` is ``None``.
    """
    # Map each supported label to the file extension it advertises.
    extensions = {
        "Markdown (.md)": "md",
        "HTML (.html)": "html",
        "JSON (.json)": "json",
    }
    extension = extensions.get(output_format)
    if extension is None:
        # Validate before touching the PDF so an unselected/unknown format
        # yields the documented message instead of an AttributeError.
        return "Unsupported output format!"
    if input_file is None:
        raise ValueError("No PDF file was provided.")

    # Accept both a file-like wrapper (path in ``.name``) and a plain path.
    pdf_path = getattr(input_file, "name", input_file)

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as pdf_document:
        page_numbers = range(pdf_document.page_count)
        if extension == "json":
            # One record per page, numbered from 1.
            pages = [
                {
                    "page": page_num + 1,
                    "text": pdf_document.load_page(page_num).get_text("text"),
                }
                for page_num in page_numbers
            ]
            content = json.dumps(pages)
        else:
            # Markdown output is plain extracted text (no markdown syntax is
            # added); HTML output uses PyMuPDF's per-page HTML rendering.
            text_mode = "html" if extension == "html" else "text"
            content = "".join(
                pdf_document.load_page(page_num).get_text(text_mode)
                for page_num in page_numbers
            )

    # Write into a fresh temporary directory so that concurrent conversions
    # do not overwrite each other's "output.*" file.
    output_dir = tempfile.mkdtemp(prefix="pdf_convert_")
    output_file_path = os.path.join(output_dir, f"output.{extension}")
    # Explicit UTF-8 keeps non-ASCII PDF text from failing on platforms
    # whose default encoding cannot represent it.
    with open(output_file_path, "w", encoding="utf-8") as f:
        f.write(content)
    return output_file_path
# --- Gradio UI wiring -------------------------------------------------------
# The format labels below are compared verbatim inside convert_pdf, so they
# must stay in sync with the branches handled there.
_FORMAT_CHOICES = ["Markdown (.md)", "HTML (.html)", "JSON (.json)"]

# Input widgets: format selector and PDF upload.
output_format_dropdown = gr.Dropdown(
    label="Select Output File Format",
    choices=_FORMAT_CHOICES,
)
file_input = gr.File(label="Upload PDF File")

# Output widget: the converted file offered for download.
output_file = gr.File(label="Download Converted File")

# Bind convert_pdf(file, format) to the widgets; the inputs list order matches
# the function's parameter order.
gr_interface = gr.Interface(
    convert_pdf,
    [file_input, output_format_dropdown],
    output_file,
    title="PDF Converter",
    description="Upload a PDF file and select the desired output format (Markdown, HTML, or JSON).",
)

# Start the web app as soon as the module is executed.
gr_interface.launch()