Spaces:

sblumenf
/

pdf-convert

Sleeping

App Files Files Community

sblumenf committed on Dec 12, 2024

Commit

7dec78f

verified ·

1 Parent(s): 432b041

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -21

app.py CHANGED Viewed

@@ -1,21 +1,73 @@
-To create a public link, set `share=True` in `launch()`.
-Traceback (most recent call last):
-  File "/usr/local/lib/python3.10/site-packages/gradio/queueing.py", line 624, in process_events
-    response = await route_utils.call_process_api(
-  File "/usr/local/lib/python3.10/site-packages/gradio/route_utils.py", line 323, in call_process_api
-    output = await app.get_blocks().process_api(
-  File "/usr/local/lib/python3.10/site-packages/gradio/blocks.py", line 2043, in process_api
-    result = await self.call_function(
-  File "/usr/local/lib/python3.10/site-packages/gradio/blocks.py", line 1590, in call_function
-    prediction = await anyio.to_thread.run_sync(  # type: ignore
-  File "/usr/local/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
-    return await get_async_backend().run_sync_in_worker_thread(
-  File "/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2505, in run_sync_in_worker_thread
-    return await future
-  File "/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 1005, in run
-    result = context.run(func, *args)
-  File "/usr/local/lib/python3.10/site-packages/gradio/utils.py", line 865, in wrapper
-    response = f(*args, **kwargs)
-  File "/home/user/app/app.py", line 35, in parse_pdf
-    download_data = json.dumps(json_data).encode("utf-8")  # Encode JSON for download
-NameError: name 'json' is not defined

+import json
+import gradio as gr
+from pdfminer.high_level import extract_pages, extract_text
+from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
+import mistletoe  # for Markdown table generation (optional)
+def parse_pdf(pdf_file, output_format):
+    with open(pdf_file, 'rb') as file:
+        pages = extract_pages(file)
+        text = ""
+        tables = []  # Placeholder for extracted table data
+        images = []  # Placeholder for extracted image data
+        for page in pages:
+            for element in page:
+                if isinstance(element, LTTextBoxHorizontal):
+                    text += element.get_text()
+                elif isinstance(element, LTFigure):
+                    # Extract image data (e.g., save as image, convert to base64)
+                    images.append(element)
+                elif isinstance(element, LTImage):
+                    # Extract image data (e.g., save as image, convert to base64)
+                    images.append(element)
+        # Implement table extraction logic (e.g., using heuristics or advanced techniques)
+        # You can use libraries like Camelot for complex tables
+        # ...
+    # Convert extracted data to desired format and populate download_data
+    if output_format == "JSON":
+        json_data = {
+            "text": text,
+            "tables": tables,  # Replace with actual table data
+            "images": images  # Replace with actual image data (e.g., base64)
+        }
+        download_data = json.dumps(json_data).encode("utf-8")  # Encode JSON for download
+    elif output_format == "Markdown":
+        # Implement table conversion using mistletoe or other Markdown libraries
+        markdown_tables = mistletoe.markdown(convert_table=True)(tables)  # Example using mistletoe
+        markdown_text = f"# Extracted Text\n\n{text}\n\n{markdown_tables}\n\n# Images\n"
+        # Implement image conversion (e.g., relative paths or base64 encoding)
+        # ...
+        download_data = markdown_text.encode("utf-8")
+    elif output_format == "HTML":
+        # Implement table conversion using HTML table tags
+        html_tables = "<table>"  # Start of HTML table (replace with actual table structure)
+        # ... (Implement table data conversion to HTML)
+        html_tables += "</table>"
+        html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n{html_tables}\n\n<h2>Images</h2>\n"
+        # Implement image conversion using `<img>` tag
+        # ...
+        download_data = html_text.encode("utf-8")
+    return text, download_data
+iface = gr.Interface(
+    fn=parse_pdf,
+    inputs=["file", gr.Dropdown(["JSON", "Markdown", "HTML"])],
+    outputs=[
+        gr.Text(label="Output Text"),
+        gr.File(label="Download Output")
+    ],
+    title="PDF Parser",
+    description="Parse a PDF and choose the output format."
+)
+if __name__ == "__main__":
+    iface.launch(share=True)  # Set share=True to create a public link