Spaces:

sblumenf
/

pdf-convert

Sleeping

App Files Community

sblumenf committed on Dec 12, 2024

Commit

b8d5f22

verified ·

1 Parent(s): f9c1d23

Update app.py

Browse files

Files changed (1) hide show

app.py +98 -74

app.py CHANGED Viewed

@@ -1,90 +1,114 @@
 import json
 import gradio as gr
-from pdfminer.high_level import extract_pages, extract_text
 from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
 import os
 import io
 from PIL import Image
-def parse_pdf(pdf_file, output_format):
-    with open(pdf_file, 'rb') as file:
-        pages = extract_pages(file)
-        text = ""
-        tables = []
-        images = []
-        for page in pages:
-            for element in page:
-                if isinstance(element, LTTextBoxHorizontal):
-                    text += element.get_text()
-                elif isinstance(element, (LTFigure, LTImage)):
-                    # Extract image data
-                    if hasattr(element, 'stream'):
-                        image_data = element.stream.read()
-                        image = Image.open(io.BytesIO(image_data))
-                        image_filename = f"extracted_image_{len(images)}.png"
-                        image.save(image_filename)
-                        images.append({"filename": image_filename})
-                    else:
-                        # Handle LTFigure (potentially nested LTImage)
-                        for child in element:
-                            if isinstance(child, LTImage):
-                                image_data = child.stream.read()
                                 image = Image.open(io.BytesIO(image_data))
                                 image_filename = f"extracted_image_{len(images)}.png"
                                 image.save(image_filename)
                                 images.append({"filename": image_filename})
-                        # You can add logic here to handle other child elements within LTFigure
-        # Implement table extraction logic using Camelot
-        import camelot
-        tables = camelot.read_pdf(pdf_file)
-    # Convert extracted data to desired format and populate download_data
-    if output_format == "JSON":
-        json_data = {
-            "text": text,
-            "tables": [table.df.to_dict() for table in tables],
-            "images": images
-        }
-        download_data = json.dumps(json_data)
-    elif output_format == "Markdown":
-        markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
-        for table in tables:
-            markdown_text += table.df.to_markdown(index=False) + "\n\n"
-        # Image embedding in Markdown (using relative paths)
-        image_tags = []
-        for image in images:
-            image_path = os.path.join(os.getcwd(), image["filename"])  # Replace with your path logic
-            image_tags.append(f'![Image {len(image_tags) + 1}]({image_path})')
-        markdown_text += "\n\n# Images\n\n" + "\n".join(image_tags)
-        download_data = markdown_text
-    elif output_format == "HTML":
-        html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
-        for table in tables:
-            html_text += table.df.to_html() + "<br>"
-        # Image embedding in HTML (using relative paths)
-        image_tags = []
-        for image in images:
-            image_path = os.path.join(os.getcwd(), image["filename"])  # Replace with your path logic
-            image_tags.append(f'<img src="{image_path}" alt="Image {len(image_tags) + 1}">')
-        html_text += "\n\n<h2>Images</h2>\n\n" + "\n".join(image_tags)
-        download_data = html_text.encode("utf-8")  # Encode for HTML download
-    return text, download_data
 iface = gr.Interface(
     fn=parse_pdf,
-    inputs=["file", gr.Dropdown(["JSON", "Markdown", "HTML"])],
     outputs=[
         gr.Text(label="Output Text"),
         gr.File(label="Download Output")

 import json
 import gradio as gr
+from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
 import os
 import io
 from PIL import Image
+import pandas as pd
+import tabula
+import camelot
+def _iter_images(element):
+    """Yield every LTImage at or below ``element``.
+
+    An LTImage is yielded directly; an LTFigure is walked recursively because
+    figures can wrap images (or further figures) as children.
+    """
+    if isinstance(element, LTImage):
+        yield element
+    elif isinstance(element, LTFigure):
+        for child in element:
+            yield from _iter_images(child)
+
+
+def _save_image(lt_image, images):
+    """Save one LTImage as ``extracted_image_<n>.png`` and record it in ``images``.
+
+    Raises whatever PIL raises when the stream bytes are not a decodable image;
+    the caller decides whether that is fatal.
+    """
+    # get_data() is the accessor pdfminer itself uses for an image stream's bytes.
+    image_data = lt_image.stream.get_data()
+    image = Image.open(io.BytesIO(image_data))
+    image_filename = f"extracted_image_{len(images)}.png"
+    image.save(image_filename)
+    images.append({"filename": image_filename})
+
+
+def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
+    """
+    Parse a PDF, extract its text, tables and images, and write the formatted result to a file.
+
+    Args:
+        pdf_file: Path to the uploaded PDF file (an object exposing the path
+            as ``.name`` is accepted as well).
+        output_format: Desired output format ("JSON", "Markdown", or "HTML").
+        progress: Gradio Progress object for displaying progress.
+
+    Returns:
+        tuple: Extracted text and the path of the generated download file.
+               Returns an empty string and None if there is an error.
+    """
+    # Function-scope imports (standard library only) keep the block self-contained.
+    import html
+    import tempfile
+
+    try:
+        # NOTE(review): some Gradio versions hand over a tempfile wrapper instead
+        # of a plain path string; fall back to the value itself when there is no .name.
+        pdf_path = getattr(pdf_file, "name", pdf_file)
+        text_parts = []
+        images = []
+        with open(pdf_path, 'rb') as file:
+            # extract_pages() yields pages lazily and therefore has no len();
+            # materialise it while the file is open so the progress fraction works.
+            pages = list(extract_pages(file))
+            total_pages = len(pages)
+            for i, page in enumerate(pages):
+                progress(i / total_pages)  # Update progress bar
+                for element in page:
+                    if isinstance(element, LTTextBoxHorizontal):
+                        text_parts.append(element.get_text())
+                        continue
+                    for lt_image in _iter_images(element):
+                        try:
+                            _save_image(lt_image, images)
+                        except Exception as image_error:
+                            # Best effort: one undecodable image must not abort the run.
+                            print(f"Error extracting image: {image_error}")
+        text = "".join(text_parts)
+
+        # Table extraction (tabula-py preferred, fallback to camelot).
+        tables = []
+        try:
+            tables = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True)
+        except Exception as tabula_error:
+            print(f"tabula-py failed: {tabula_error}. Trying camelot...")
+            try:
+                tables = [table.df for table in camelot.read_pdf(pdf_path)]
+            except Exception as camelot_error:
+                print(f"camelot also failed: {camelot_error}. No tables extracted.")
+
+        # Images are referenced by absolute path, built from the working
+        # directory that _save_image() wrote them into.
+        image_paths = [os.path.join(os.getcwd(), image["filename"]) for image in images]
+
+        # Format extracted data based on user selection
+        if output_format == "JSON":
+            json_data = {
+                "text": text,
+                "tables": [table.to_dict() for table in tables],
+                "images": images
+            }
+            download_data = json.dumps(json_data, indent=4)  # Add indentation for readability
+            suffix = ".json"
+        elif output_format == "Markdown":
+            parts = [f"# Extracted Text\n\n{text}\n\n# Tables\n"]
+            for i, table in enumerate(tables):
+                parts.append(f"## Table {i+1}\n")
+                parts.append(table.to_markdown(index=False) + "\n\n")
+            parts.append("\n\n# Images\n\n")
+            parts.extend(f'![Image]({image_path})\n' for image_path in image_paths)
+            download_data = "".join(parts)
+            suffix = ".md"
+        elif output_format == "HTML":
+            # The text comes from an arbitrary PDF, so escape it before embedding in markup.
+            parts = [f"<p>{html.escape(text)}</p>\n\n<h2>Tables</h2>\n"]
+            for i, table in enumerate(tables):
+                parts.append(f"<h2>Table {i+1}</h2>\n")
+                parts.append(table.to_html() + "<br>")
+            parts.append("\n\n<h2>Images</h2>\n\n")
+            parts.extend(
+                f'<img src="{html.escape(image_path)}" alt="Image"><br>\n'
+                for image_path in image_paths
+            )
+            download_data = "".join(parts)
+            suffix = ".html"
+        else:
+            # Previously this fell through and failed on an unbound download_data.
+            raise ValueError(f"Unsupported output format: {output_format!r}")
+
+        # The gr.File output expects a file path, not the payload itself, so the
+        # result is written to a temporary file that outlives this call.
+        with tempfile.NamedTemporaryFile(
+            "w", suffix=suffix, delete=False, encoding="utf-8"
+        ) as out_file:
+            out_file.write(download_data)
+        return text, out_file.name
+    except Exception as main_e:
+        # Top-level boundary for the UI callback: report and return the documented
+        # error value instead of propagating.
+        print(f"A main error occurred: {main_e}")
+        return "", None  # Return empty string and None in case of error
 iface = gr.Interface(
     fn=parse_pdf,
+    inputs=["file", gr.Dropdown(["JSON", "Markdown", "HTML"]), gr.Progress()],
     outputs=[
         gr.Text(label="Output Text"),
         gr.File(label="Download Output")