Spaces:

sblumenf
/

pdf-convert

Sleeping

App Files Files Community

sblumenf committed on Dec 11, 2024

Commit

8b0be64

verified ·

1 Parent(s): 06bae88

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -57

app.py CHANGED Viewed

@@ -1,65 +1,27 @@
 import gradio as gr
-import fitz  # PyMuPDF for PDF handling
-def convert_pdf(input_file, output_format):
-    """
-    Convert a PDF file to the specified format.
-    Args:
-        input_file: Uploaded PDF file.
-        output_format: Desired output format (Markdown, HTML, JSON).
-    Returns:
-        Path to the converted file.
-    """
-    # Open the PDF file using PyMuPDF (fitz)
-    pdf_document = fitz.open(input_file.name)
-    output_file_path = f"output.{output_format.split(' ')[0].lower()}"
-    if output_format == "Markdown (.md)":
-        # Extract text and convert to markdown format (this is basic extraction)
-        with open(output_file_path, "w") as f:
-            for page_num in range(pdf_document.page_count):
-                page = pdf_document.load_page(page_num)
-                f.write(page.get_text("text"))  # You can enhance this by adding markdown syntax
-    elif output_format == "HTML (.html)":
-        # Convert PDF to HTML format
-        with open(output_file_path, "w") as f:
-            html_content = ""
-            for page_num in range(pdf_document.page_count):
-                page = pdf_document.load_page(page_num)
-                html_content += page.get_text("html")  # Extract HTML content
-            f.write(html_content)
-    elif output_format == "JSON (.json)":
-        # Convert PDF to simple JSON format (extracting text and metadata)
-        import json
-        with open(output_file_path, "w") as f:
-            json_content = []
-            for page_num in range(pdf_document.page_count):
-                page = pdf_document.load_page(page_num)
-                json_content.append({"page": page_num + 1, "text": page.get_text("text")})
-            json.dump(json_content, f)
-    else:
-        return "Unsupported output format!"
-    return output_file_path
-# Update inputs and outputs for Gradio v3.x
 output_format_dropdown = gr.Dropdown(
-    choices=["Markdown (.md)", "HTML (.html)", "JSON (.json)"],
-    label="Select Output File Format",
 )
-file_input = gr.File(label="Upload PDF File")
-output_file = gr.File(label="Download Converted File")
-gr_interface = gr.Interface(
-    fn=convert_pdf,
-    inputs=[file_input, output_format_dropdown],
-    outputs=output_file,
-    title="PDF Converter",
-    description="Upload a PDF file and select the desired output format (Markdown, HTML, or JSON).",
 )
-gr_interface.launch()

 import gradio as gr
+import fitz  # PyMuPDF installs its importable module under the name "fitz"
+# Function to extract text from a PDF
+def extract_pdf_text(file):
+    """Return the text of all pages of the uploaded PDF, in page order ("" if no file)."""
+    if file is None:  # nothing uploaded yet, or the upload was cleared
+        return ""
+    with fitz.open(file.name) as doc:  # context manager closes the document
+        return "".join(page.get_text() for page in doc)
+# Gradio interface
 output_format_dropdown = gr.Dropdown(
+    choices=["txt", "pdf", "docx"],
+    label="Output Format",
+    value="txt"  # initial selection is passed as "value"; Dropdown has no "default" parameter
 )
+iface = gr.Interface(
+    fn=extract_pdf_text,
+    inputs=gr.File(label="Upload PDF File"),
+    outputs=gr.Textbox(label="Extracted Text"),  # fn returns one string, so exactly one output
+    live=True
 )
+if __name__ == "__main__":
+    iface.launch()