Spaces:

sblumenf
/

pdf-convert

Sleeping

App Files Files Community

sblumenf committed on Dec 12, 2024

Commit

17d36dc

verified ·

1 Parent(s): c82a3c1

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -9

app.py CHANGED Viewed

@@ -2,27 +2,83 @@ import json
 import gradio as gr
 from pdfminer.high_level import extract_pages, extract_text
 from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
-import os  # Import os for file path manipulation
 def parse_pdf(pdf_file, output_format):
-    # ... (Your existing parsing logic)
     # Convert extracted data to desired format and populate download_data
     if output_format == "JSON":
         json_data = {
             "text": text,
-            "tables": tables,  # Replace with actual table data
-            "images": images  # List of dictionaries with filenames
         }
-        download_data = json.dumps(json_data)  # No need to encode as Gradio handles it
     elif output_format == "Markdown":
-        # ... (Your Markdown conversion logic)
         download_data = markdown_text
     elif output_format == "HTML":
-        # ... (Your HTML conversion logic)
-        download_data = html_text
     return text, download_data
@@ -38,4 +94,4 @@ iface = gr.Interface(
 )
 if __name__ == "__main__":
-    iface.launch(share=False)  # Set share=False as Gradio warns about it on Hugging Face Spaces

 import gradio as gr
 from pdfminer.high_level import extract_pages, extract_text
 from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
+import os
+import io
+from PIL import Image
 def _iter_images(element):
     """Yield every LTImage at or below *element*, descending into nested figures."""
     if isinstance(element, LTImage):
         yield element
     elif isinstance(element, LTFigure):
         for child in element:
             yield from _iter_images(child)


 def _save_image(lt_image, index):
     """Write one embedded image to ``extracted_image_<index>.png`` in the working directory.

     Returns the filename, or ``None`` when Pillow cannot decode or re-encode
     the image stream.
     """
     stream = lt_image.stream
     # pdfminer's PDFStream exposes get_rawdata()/get_data(), not read().
     # The raw bytes are the still-encoded image file (e.g. a JPEG), which is
     # what Pillow needs; get_rawdata() may return None (e.g. once the stream
     # has already been decoded), so fall back to get_data().
     image_data = stream.get_rawdata()
     if image_data is None:
         image_data = stream.get_data()

     image_filename = f"extracted_image_{index}.png"
     try:
         with Image.open(io.BytesIO(image_data)) as image:
             image.save(image_filename)
     except OSError:
         # Pillow raises UnidentifiedImageError (an OSError) for streams that
         # are not a standalone image file, and OSError for modes PNG cannot
         # store. One bad image should not abort the whole conversion.
         return None
     return image_filename


 def parse_pdf(pdf_file, output_format):
     """Extract text, tables and images from a PDF and render them for download.

     Args:
         pdf_file: Path of the PDF. It is opened in binary mode for pdfminer
             and also passed to Camelot for table extraction.
         output_format: One of ``"JSON"``, ``"Markdown"`` or ``"HTML"``.

     Returns:
         A ``(text, download_data)`` tuple. ``text`` is the concatenated text
         of all horizontal text boxes. ``download_data`` is a ``str`` for JSON
         and Markdown, and UTF-8 encoded ``bytes`` for HTML.

     Raises:
         ValueError: If ``output_format`` is not one of the supported formats.
     """
     from html import escape

     # Validate before doing any work: previously an unknown format parsed the
     # whole PDF, wrote image files, and then failed with UnboundLocalError.
     if output_format not in ("JSON", "Markdown", "HTML"):
         raise ValueError(f"Unsupported output format: {output_format!r}")

     text_parts = []
     images = []
     with open(pdf_file, "rb") as file:
         for page in extract_pages(file):
             for element in page:
                 if isinstance(element, LTTextBoxHorizontal):
                     text_parts.append(element.get_text())
                     continue
                 for lt_image in _iter_images(element):
                     image_filename = _save_image(lt_image, len(images))
                     if image_filename is not None:
                         images.append({"filename": image_filename})
     text = "".join(text_parts)

     # Imported lazily, as in the original, so Camelot is only loaded on use.
     import camelot

     # NOTE(review): read_pdf is called with its default page selection;
     # confirm that this covers every page the text extraction covers.
     tables = camelot.read_pdf(pdf_file)

     # Images are saved in the current working directory, so these are
     # absolute paths built from it.
     image_paths = [os.path.join(os.getcwd(), image["filename"]) for image in images]

     # Convert extracted data to desired format and populate download_data
     if output_format == "JSON":
         json_data = {
             "text": text,
             "tables": [table.df.to_dict() for table in tables],
             "images": images,
         }
         download_data = json.dumps(json_data)
     elif output_format == "Markdown":
         table_md = "".join(
             table.df.to_markdown(index=False) + "\n\n" for table in tables
         )
         image_tags = [
             f"![Image {number}]({path})"
             for number, path in enumerate(image_paths, start=1)
         ]
         download_data = (
             f"# Extracted Text\n\n{text}\n\n# Tables\n"
             + table_md
             + "\n\n# Images\n\n"
             + "\n".join(image_tags)
         )
     else:  # "HTML"
         table_html = "".join(table.df.to_html() + "<br>" for table in tables)
         image_tags = [
             f'<img src="{escape(path, quote=True)}" alt="Image {number}">'
             for number, path in enumerate(image_paths, start=1)
         ]
         # The text comes from an uploaded PDF, so escape it before embedding
         # it in markup.
         html_text = (
             f"<p>{escape(text)}</p>\n\n<h2>Tables</h2>\n"
             + table_html
             + "\n\n<h2>Images</h2>\n\n"
             + "\n".join(image_tags)
         )
         download_data = html_text.encode("utf-8")  # Encode for HTML download
     return text, download_data
 )
 if __name__ == "__main__":
+    iface.launch(share=False)  # share=False: Gradio warns about share links on Hugging Face Spaces