sblumenf committed on
Commit
8b85809
·
verified ·
1 Parent(s): 1e8f4c1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -19
app.py CHANGED
@@ -1,19 +1,6 @@
1
- import subprocess
2
- import sys
3
-
4
- # Install the 'marker' package from GitHub if not already installed
5
- try:
6
- import marker
7
- except ImportError:
8
- subprocess.check_call([sys.executable, "-m", "pip", "install", "git+https://github.com/VikParuchuri/marker.git"])
9
-
10
- # Verify the marker package is installed and check its contents
11
- import marker
12
- print("Available modules in marker:", dir(marker))
13
-
14
- from marker.pdf import PDF # Updated import path
15
- import os
16
  import gradio as gr
 
 
17
 
18
  def convert_pdf(input_file, output_format):
19
  """
@@ -26,19 +13,33 @@ def convert_pdf(input_file, output_format):
26
  Returns:
27
  Path to the converted file.
28
  """
29
- pdf = PDF(input_file.name) # Initialize the PDF object
30
 
31
  output_file_path = f"output.{output_format.split(' ')[0].lower()}"
32
 
33
  if output_format == "Markdown (.md)":
 
34
  with open(output_file_path, "w") as f:
35
- f.write(pdf.to_markdown())
 
 
36
  elif output_format == "HTML (.html)":
 
37
  with open(output_file_path, "w") as f:
38
- f.write(pdf.to_html())
 
 
 
 
39
  elif output_format == "JSON (.json)":
 
 
40
  with open(output_file_path, "w") as f:
41
- f.write(pdf.to_json())
 
 
 
 
42
  else:
43
  return "Unsupported output format!"
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import fitz # PyMuPDF for PDF handling
3
+ import os
4
 
5
  def convert_pdf(input_file, output_format):
6
  """
 
13
  Returns:
14
  Path to the converted file.
15
  """
16
+ pdf_document = fitz.open(input_file.name) # Open the PDF file with PyMuPDF
17
 
18
  output_file_path = f"output.{output_format.split(' ')[0].lower()}"
19
 
20
  if output_format == "Markdown (.md)":
21
+ # Extract text and convert to markdown format (this is basic extraction)
22
  with open(output_file_path, "w") as f:
23
+ for page_num in range(pdf_document.page_count):
24
+ page = pdf_document.load_page(page_num)
25
+ f.write(page.get_text("text")) # You can enhance this by adding markdown syntax
26
  elif output_format == "HTML (.html)":
27
+ # Convert PDF to HTML format
28
  with open(output_file_path, "w") as f:
29
+ html_content = ""
30
+ for page_num in range(pdf_document.page_count):
31
+ page = pdf_document.load_page(page_num)
32
+ html_content += page.get_text("html") # Extract HTML content
33
+ f.write(html_content)
34
  elif output_format == "JSON (.json)":
35
+ # Convert PDF to simple JSON format (extracting text and metadata)
36
+ import json
37
  with open(output_file_path, "w") as f:
38
+ json_content = []
39
+ for page_num in range(pdf_document.page_count):
40
+ page = pdf_document.load_page(page_num)
41
+ json_content.append({"page": page_num + 1, "text": page.get_text("text")})
42
+ json.dump(json_content, f)
43
  else:
44
  return "Unsupported output format!"
45