Spaces:

sblumenf
/

pdf-convert

Sleeping

App Files Community

sblumenf committed on Dec 12, 2024

Commit

432b041

verified ·

1 Parent(s): 4d96b5c

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -69

app.py CHANGED Viewed

@@ -1,69 +1,21 @@
-import gradio as gr
-from pdfminer.high_level import extract_pages, extract_text
-from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
-def parse_pdf(pdf_file, output_format):
-    with open(pdf_file, 'rb') as file:
-        pages = extract_pages(file)
-        text = ""
-        tables = []
-        images = []
-        download_data = None  # Initialize an empty variable for download data
-        for page in pages:
-            for element in page:
-                if isinstance(element, LTTextBoxHorizontal):
-                    text += element.get_text()
-                elif isinstance(element, LTFigure):
-                    # Extract image data (e.g., save as image, convert to base64)
-                    images.append(element)
-                elif isinstance(element, LTImage):
-                    # Extract image data (e.g., save as image, convert to base64)
-                    images.append(element)
-        # Implement table extraction logic (e.g., using heuristics or advanced techniques)
-        # ...
-    # Convert extracted data to desired format and populate download_data
-    if output_format == "JSON":
-        json_data = {
-            "text": text,
-            "tables": tables,  # Implement table conversion to JSON
-            "images": images  # Implement image conversion to JSON (e.g., base64)
-        }
-        download_data = json.dumps(json_data).encode("utf-8")  # Encode JSON for download
-    elif output_format == "Markdown":
-        markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
-        # Implement table conversion to Markdown
-        # ...
-        markdown_text += "\n# Images\n"
-        # Implement image conversion to Markdown (e.g., embedding images)
-        # ...
-        download_data = markdown_text.encode("utf-8")  # Encode Markdown for download
-    elif output_format == "HTML":
-        html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
-        # Implement table conversion to HTML
-        # ...
-        html_text += "<h2>Images</h2>\n"
-        # Implement image conversion to HTML (e.g., embedding images)
-        # ...
-        download_data = html_text.encode("utf-8")  # Encode HTML for download
-    return text, download_data
-iface = gr.Interface(
-    fn=parse_pdf,
-    inputs=["file", gr.Dropdown(["JSON", "Markdown", "HTML"])],
-    outputs=[
-        gr.Text(label="Output Text"),
-        gr.File(label="Download Output")
-    ],
-    title="PDF Parser",
-    description="Parse a PDF and choose the output format."
-)
-if __name__ == "__main__":
-    iface.launch(share=True)  # Set share=True to create a public link

+To create a public link, set `share=True` in `launch()`.
+Traceback (most recent call last):
+  File "/usr/local/lib/python3.10/site-packages/gradio/queueing.py", line 624, in process_events
+    response = await route_utils.call_process_api(
+  File "/usr/local/lib/python3.10/site-packages/gradio/route_utils.py", line 323, in call_process_api
+    output = await app.get_blocks().process_api(
+  File "/usr/local/lib/python3.10/site-packages/gradio/blocks.py", line 2043, in process_api
+    result = await self.call_function(
+  File "/usr/local/lib/python3.10/site-packages/gradio/blocks.py", line 1590, in call_function
+    prediction = await anyio.to_thread.run_sync(  # type: ignore
+  File "/usr/local/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
+    return await get_async_backend().run_sync_in_worker_thread(
+  File "/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2505, in run_sync_in_worker_thread
+    return await future
+  File "/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 1005, in run
+    result = context.run(func, *args)
+  File "/usr/local/lib/python3.10/site-packages/gradio/utils.py", line 865, in wrapper
+    response = f(*args, **kwargs)
+  File "/home/user/app/app.py", line 35, in parse_pdf
+    download_data = json.dumps(json_data).encode("utf-8")  # Encode JSON for download
+NameError: name 'json' is not defined