sblumenf committed on
Commit
8b0be64
·
verified ·
1 Parent(s): 06bae88

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -57
app.py CHANGED
@@ -1,65 +1,27 @@
1
  import gradio as gr
2
- import fitz # PyMuPDF for PDF handling
3
 
4
def convert_pdf(input_file, output_format):
    """
    Convert a PDF file to the specified format.

    Args:
        input_file: Uploaded PDF file; its ``name`` attribute is used as the
            path of the PDF to open.
        output_format: Desired output format label: "Markdown (.md)",
            "HTML (.html)" or "JSON (.json)".

    Returns:
        Path to the converted file ("output.md", "output.html" or
        "output.json"), or the string "Unsupported output format!" when
        ``output_format`` is not one of the supported labels.
    """
    import json

    # Explicit mapping so the file extension matches the one shown in each
    # dropdown label (e.g. "Markdown (.md)" -> "output.md").
    extensions = {
        "Markdown (.md)": "md",
        "HTML (.html)": "html",
        "JSON (.json)": "json",
    }
    extension = extensions.get(output_format)
    if extension is None:
        # Reject unknown formats before opening the PDF or creating a file.
        return "Unsupported output format!"

    output_file_path = f"output.{extension}"

    # The context manager closes the document even if extraction raises.
    with fitz.open(input_file.name) as pdf_document:
        if extension == "json":
            # One record per page: 1-based page number plus its plain text.
            pages = [
                {"page": page_number, "text": page.get_text("text")}
                for page_number, page in enumerate(pdf_document, start=1)
            ]
            content = json.dumps(pages)
        else:
            # Markdown output is plain extracted text; HTML uses PyMuPDF's
            # own HTML rendering of each page.
            text_mode = "html" if extension == "html" else "text"
            content = "".join(page.get_text(text_mode) for page in pdf_document)

    # PDF text is frequently non-ASCII; do not rely on the platform default.
    with open(output_file_path, "w", encoding="utf-8") as f:
        f.write(content)

    return output_file_path
47
-
48
# Gradio components for the converter UI (component-style API, Gradio v3.x).

# Dropdown listing the three formats convert_pdf understands.
output_format_dropdown = gr.Dropdown(
    ["Markdown (.md)", "HTML (.html)", "JSON (.json)"],
    label="Select Output File Format",
)

# Upload widget for the source PDF.
file_input = gr.File(label="Upload PDF File")

# Download widget that receives the path returned by convert_pdf.
output_file = gr.File(label="Download Converted File")

# Wire the components to convert_pdf: (file, format) in, converted file out.
gr_interface = gr.Interface(
    convert_pdf,
    [file_input, output_format_dropdown],
    output_file,
    title="PDF Converter",
    description="Upload a PDF file and select the desired output format (Markdown, HTML, or JSON).",
)

# Start the web app as soon as the module is executed.
gr_interface.launch()
 
 
1
  import gradio as gr
2
import fitz  # PyMuPDF: the package is installed as "PyMuPDF" but imported as "fitz"
3
 
4
+ # Function to extract text from a PDF
5
def extract_pdf_text(file):
    """Return the plain text of every page of an uploaded PDF, in page order.

    Args:
        file: Uploaded file object whose ``name`` attribute is the path of
            the PDF to open, or ``None`` when no file has been provided.

    Returns:
        The text of all pages concatenated into one string; an empty string
        when ``file`` is ``None``.
    """
    # The callback can fire with no upload (e.g. the file input was cleared);
    # return an empty result instead of failing on ``None.name``.
    if file is None:
        return ""

    # The context manager closes the document even if extraction raises.
    with fitz.open(file.name) as doc:
        return "".join(page.get_text() for page in doc)
11
 
12
# Gradio interface

# NOTE(review): this dropdown is defined but not wired into the interface;
# extract_pdf_text takes only the uploaded file and returns only text.
# Connect it as an input once the function accepts an output format.
output_format_dropdown = gr.Dropdown(
    choices=["txt", "pdf", "docx"],
    label="Output Format",
    value="txt",  # initial selection; `default` was the pre-3.0 keyword
)

iface = gr.Interface(
    fn=extract_pdf_text,
    inputs=gr.File(label="Upload PDF File"),
    # extract_pdf_text returns a single string, so there must be exactly one
    # output component; listing more makes every call fail with an
    # output-count mismatch.
    outputs=gr.Textbox(label="Extracted Text"),
    live=True,  # re-run extraction whenever the uploaded file changes
)

if __name__ == "__main__":
    iface.launch()