Spaces:

sblumenf
/

pdf-convert

Sleeping

App Files Community

sblumenf committed on Dec 11, 2024

Commit

8b85809

verified ·

1 Parent(s): 1e8f4c1

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -19

app.py CHANGED Viewed

@@ -1,19 +1,6 @@
-import subprocess
-import sys
-# Install the 'marker' package from GitHub if not already installed
-try:
-    import marker
-except ImportError:
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "git+https://github.com/VikParuchuri/marker.git"])
-# Verify the marker package is installed and check its contents
-import marker
-print("Available modules in marker:", dir(marker))
-from marker.pdf import PDF  # Updated import path
-import os
 import gradio as gr
 def convert_pdf(input_file, output_format):
     """
@@ -26,19 +13,33 @@ def convert_pdf(input_file, output_format):
     Returns:
         Path to the converted file.
     """
-    pdf = PDF(input_file.name)  # Initialize the PDF object
     output_file_path = f"output.{output_format.split(' ')[0].lower()}"
     if output_format == "Markdown (.md)":
         with open(output_file_path, "w") as f:
-            f.write(pdf.to_markdown())
     elif output_format == "HTML (.html)":
         with open(output_file_path, "w") as f:
-            f.write(pdf.to_html())
     elif output_format == "JSON (.json)":
         with open(output_file_path, "w") as f:
-            f.write(pdf.to_json())
     else:
         return "Unsupported output format!"

 import gradio as gr
+import fitz  # PyMuPDF for PDF handling
+import os
 def convert_pdf(input_file, output_format):
     """
     Returns:
         Path to the converted file.
     """
+    pdf_document = fitz.open(input_file.name)  # Open the PDF file with PyMuPDF
     output_file_path = f"output.{output_format.split(' ')[0].lower()}"
     if output_format == "Markdown (.md)":
+        # Extract text and convert to markdown format (this is basic extraction)
         with open(output_file_path, "w") as f:
+            for page_num in range(pdf_document.page_count):
+                page = pdf_document.load_page(page_num)
+                f.write(page.get_text("text"))  # You can enhance this by adding markdown syntax
     elif output_format == "HTML (.html)":
+        # Convert PDF to HTML format
         with open(output_file_path, "w") as f:
+            html_content = ""
+            for page_num in range(pdf_document.page_count):
+                page = pdf_document.load_page(page_num)
+                html_content += page.get_text("html")  # Extract HTML content
+            f.write(html_content)
     elif output_format == "JSON (.json)":
+        # Convert PDF to simple JSON format (extracting text and metadata)
+        import json
         with open(output_file_path, "w") as f:
+            json_content = []
+            for page_num in range(pdf_document.page_count):
+                page = pdf_document.load_page(page_num)
+                json_content.append({"page": page_num + 1, "text": page.get_text("text")})
+            json.dump(json_content, f)
     else:
         return "Unsupported output format!"