Spaces:
Sleeping
Sleeping
File size: 2,379 Bytes
919f74f 8b85809 f3515e2 b2971fd f3515e2 b7f45c8 b2971fd f3515e2 8b85809 f3515e2 8b85809 f3515e2 8b85809 f3515e2 8b85809 f3515e2 8b85809 f3515e2 8b85809 f3515e2 0f6f41c f3515e2 0f6f41c f3515e2 0f6f41c f3515e2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
import gradio as gr
import fitz # PyMuPDF for PDF handling
def convert_pdf(input_file, output_format):
    """Convert an uploaded PDF to Markdown, HTML, or JSON.

    The result is written to ``output.<ext>`` in the current working
    directory; the same file name is reused (and overwritten) on every call
    for a given format.

    Args:
        input_file: The uploaded PDF. Either an object exposing the file
            path as ``.name`` or a plain path string.
        output_format: One of the dropdown labels ``"Markdown (.md)"``,
            ``"HTML (.html)"`` or ``"JSON (.json)"``.

    Returns:
        Path to the converted file, or the string
        ``"Unsupported output format!"`` when ``output_format`` is not one
        of the labels above (including ``None``).
    """
    import json

    # Map each UI label to the extension it advertises, so that Markdown
    # output is really named ``output.md`` rather than ``output.markdown``.
    extensions = {
        "Markdown (.md)": "md",
        "HTML (.html)": "html",
        "JSON (.json)": "json",
    }
    extension = extensions.get(output_format)
    if extension is None:
        # Validate before touching the PDF so nothing is opened (and left
        # open) for a request that cannot be served.
        return "Unsupported output format!"

    pdf_path = input_file if isinstance(input_file, str) else input_file.name
    output_file_path = f"output.{extension}"

    pdf_document = fitz.open(pdf_path)
    try:
        pages = (
            pdf_document.load_page(page_num)
            for page_num in range(pdf_document.page_count)
        )
        if extension == "json":
            # One record per page, numbered from 1.
            content = json.dumps(
                [
                    {"page": page_num, "text": page.get_text("text")}
                    for page_num, page in enumerate(pages, start=1)
                ]
            )
        else:
            # Markdown is plain extracted text for now; HTML uses
            # PyMuPDF's own HTML rendering of each page.
            text_mode = "html" if extension == "html" else "text"
            content = "".join(page.get_text(text_mode) for page in pages)
    finally:
        # Always release the underlying file handle, even if extraction fails.
        pdf_document.close()

    # Explicit UTF-8 so non-ASCII PDF text does not depend on the locale.
    with open(output_file_path, "w", encoding="utf-8") as f:
        f.write(content)
    return output_file_path
# UI components, declared up front using the Gradio v3.x component classes.
file_input = gr.File(label="Upload PDF File")
output_format_dropdown = gr.Dropdown(
    label="Select Output File Format",
    choices=["Markdown (.md)", "HTML (.html)", "JSON (.json)"],
)
output_file = gr.File(label="Download Converted File")

# Wire the converter to the UI: (uploaded file, chosen format) -> download.
gr_interface = gr.Interface(
    title="PDF Converter",
    description="Upload a PDF file and select the desired output format (Markdown, HTML, or JSON).",
    fn=convert_pdf,
    inputs=[file_input, output_format_dropdown],
    outputs=output_file,
)

# Start the web app.
gr_interface.launch()
|