sblumenf committed on
Commit
432b041
·
verified ·
1 Parent(s): 4d96b5c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -69
app.py CHANGED
@@ -1,69 +1,21 @@
1
- import gradio as gr
2
- from pdfminer.high_level import extract_pages, extract_text
3
- from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
4
-
5
def parse_pdf(pdf_file, output_format):
    """Extract text and image elements from a PDF and package them for download.

    Args:
        pdf_file: Path of the PDF to parse; it is opened in binary mode.
        output_format: One of "JSON", "Markdown" or "HTML". Any other value
            (including None) yields no download payload.

    Returns:
        A ``(text, download_data)`` tuple. ``text`` is the concatenated text
        of every horizontal text box found in the document. ``download_data``
        is the UTF-8 encoded document in the requested format, or None when
        the format is not recognised.
    """
    # Imported locally so this function does not rely on module-level imports;
    # the previous version crashed with "NameError: name 'json' is not defined".
    import html
    import json

    text_parts = []
    tables = []  # TODO: table extraction is not implemented yet
    images = []

    with open(pdf_file, "rb") as file:
        # Pages are consumed inside the `with` so the file is still open
        # while pdfminer reads from it.
        for page in extract_pages(file):
            for element in page:
                if isinstance(element, LTTextBoxHorizontal):
                    text_parts.append(element.get_text())
                elif isinstance(element, (LTFigure, LTImage)):
                    # TODO: extract the actual image data (e.g. base64).
                    images.append(element)

    text = "".join(text_parts)
    download_data = None

    if output_format == "JSON":
        json_data = {
            "text": text,
            "tables": tables,  # TODO: table conversion to JSON
            # pdfminer layout objects are not JSON serializable, so emit a
            # small descriptor per element until real image export exists.
            "images": [
                {
                    "type": type(image).__name__,
                    "name": getattr(image, "name", None),
                    "bbox": list(getattr(image, "bbox", ())),
                }
                for image in images
            ],
        }
        download_data = json.dumps(json_data).encode("utf-8")

    elif output_format == "Markdown":
        markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
        # TODO: table conversion to Markdown
        markdown_text += "\n# Images\n"
        # TODO: image conversion to Markdown (e.g. embedding images)
        download_data = markdown_text.encode("utf-8")

    elif output_format == "HTML":
        # The text comes from an arbitrary PDF, so escape it before embedding
        # it in markup.
        html_text = f"<p>{html.escape(text)}</p>\n\n<h2>Tables</h2>\n"
        # TODO: table conversion to HTML
        html_text += "<h2>Images</h2>\n"
        # TODO: image conversion to HTML (e.g. embedding images)
        download_data = html_text.encode("utf-8")

    # NOTE(review): the second value feeds a gr.File output, which presumably
    # expects a file path rather than raw bytes -- confirm against the Gradio
    # File component docs and write the payload to a temp file if so.
    return text, download_data
56
-
57
# Gradio front end: an uploaded file and a format choice are passed to
# parse_pdf, whose two return values populate the text box and the download.
iface = gr.Interface(
    title="PDF Parser",
    description="Parse a PDF and choose the output format.",
    fn=parse_pdf,
    inputs=[
        "file",
        gr.Dropdown(["JSON", "Markdown", "HTML"]),
    ],
    outputs=[gr.Text(label="Output Text"), gr.File(label="Download Output")],
)


if __name__ == "__main__":
    # share=True asks Gradio to create a public link for the app.
    iface.launch(share=True)
 
1
+ To create a public link, set `share=True` in `launch()`.
2
+ Traceback (most recent call last):
3
+ File "/usr/local/lib/python3.10/site-packages/gradio/queueing.py", line 624, in process_events
4
+ response = await route_utils.call_process_api(
5
+ File "/usr/local/lib/python3.10/site-packages/gradio/route_utils.py", line 323, in call_process_api
6
+ output = await app.get_blocks().process_api(
7
+ File "/usr/local/lib/python3.10/site-packages/gradio/blocks.py", line 2043, in process_api
8
+ result = await self.call_function(
9
+ File "/usr/local/lib/python3.10/site-packages/gradio/blocks.py", line 1590, in call_function
10
+ prediction = await anyio.to_thread.run_sync( # type: ignore
11
+ File "/usr/local/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
12
+ return await get_async_backend().run_sync_in_worker_thread(
13
+ File "/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2505, in run_sync_in_worker_thread
14
+ return await future
15
+ File "/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 1005, in run
16
+ result = context.run(func, *args)
17
+ File "/usr/local/lib/python3.10/site-packages/gradio/utils.py", line 865, in wrapper
18
+ response = f(*args, **kwargs)
19
+ File "/home/user/app/app.py", line 35, in parse_pdf
20
+ download_data = json.dumps(json_data).encode("utf-8") # Encode JSON for download
21
+ NameError: name 'json' is not defined