sblumenf committed on
Commit
7dec78f
·
verified ·
1 Parent(s): 432b041

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -21
app.py CHANGED
@@ -1,21 +1,73 @@
1
- To create a public link, set `share=True` in `launch()`.
2
- Traceback (most recent call last):
3
- File "/usr/local/lib/python3.10/site-packages/gradio/queueing.py", line 624, in process_events
4
- response = await route_utils.call_process_api(
5
- File "/usr/local/lib/python3.10/site-packages/gradio/route_utils.py", line 323, in call_process_api
6
- output = await app.get_blocks().process_api(
7
- File "/usr/local/lib/python3.10/site-packages/gradio/blocks.py", line 2043, in process_api
8
- result = await self.call_function(
9
- File "/usr/local/lib/python3.10/site-packages/gradio/blocks.py", line 1590, in call_function
10
- prediction = await anyio.to_thread.run_sync( # type: ignore
11
- File "/usr/local/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
12
- return await get_async_backend().run_sync_in_worker_thread(
13
- File "/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2505, in run_sync_in_worker_thread
14
- return await future
15
- File "/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 1005, in run
16
- result = context.run(func, *args)
17
- File "/usr/local/lib/python3.10/site-packages/gradio/utils.py", line 865, in wrapper
18
- response = f(*args, **kwargs)
19
- File "/home/user/app/app.py", line 35, in parse_pdf
20
- download_data = json.dumps(json_data).encode("utf-8") # Encode JSON for download
21
- NameError: name 'json' is not defined
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import gradio as gr
3
+ from pdfminer.high_level import extract_pages, extract_text
4
+ from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
5
+ import mistletoe # for Markdown table generation (optional)
6
+
7
def parse_pdf(pdf_file, output_format):
    """Extract the text of a PDF and export it in the requested format.

    Args:
        pdf_file: Path of the PDF to read; it is handed to ``open``.
        output_format: One of ``"JSON"``, ``"Markdown"`` or ``"HTML"``.

    Returns:
        A ``(text, download_path)`` tuple: the extracted text, and the path
        of a temporary file holding the UTF-8 encoded export. A path (not
        raw bytes) is returned because the value feeds a ``gr.File``
        output, which expects a file path.

    Raises:
        ValueError: If ``output_format`` is not one of the supported
            formats (this includes ``None``, i.e. nothing selected).
    """
    # Function-scope imports: neither module is imported at file level.
    import html
    import tempfile

    suffixes = {"JSON": ".json", "Markdown": ".md", "HTML": ".html"}
    # Validate up front so an unsupported choice fails with a clear message
    # instead of an UnboundLocalError after the whole PDF has been parsed.
    if not isinstance(output_format, str) or output_format not in suffixes:
        raise ValueError(
            f"Unsupported output format: {output_format!r}; "
            f"expected one of {', '.join(suffixes)}"
        )

    text_parts = []
    tables = []  # Placeholder: table extraction is not implemented yet.
    images = []  # JSON-serializable descriptions of figures/images found.

    # extract_pages() yields pages lazily, so the file has to stay open
    # until every page has been consumed.
    with open(pdf_file, "rb") as file:
        for page in extract_pages(file):
            for element in page:
                if isinstance(element, LTTextBoxHorizontal):
                    text_parts.append(element.get_text())
                elif isinstance(element, (LTFigure, LTImage)):
                    # Layout objects themselves cannot be passed to
                    # json.dumps, so keep only plain metadata about them.
                    images.append(
                        {
                            "type": type(element).__name__,
                            "name": element.name,
                            "bbox": list(element.bbox),
                        }
                    )

    text = "".join(text_parts)

    if output_format == "JSON":
        content = json.dumps(
            {
                "text": text,
                "tables": tables,
                "images": images,
            }
        )
    elif output_format == "Markdown":
        # No tables are extracted yet, so there is nothing to render here.
        markdown_tables = ""
        content = (
            f"# Extracted Text\n\n{text}\n\n{markdown_tables}\n\n# Images\n"
        )
    else:  # "HTML" -- the only remaining validated choice
        # Empty table shell until table extraction is implemented.
        html_tables = "<table></table>"
        # The text comes from an uploaded document: escape it so that
        # characters such as "<" or "&" cannot break or inject markup.
        content = (
            f"<p>{html.escape(text)}</p>\n\n<h2>Tables</h2>\n{html_tables}"
            "\n\n<h2>Images</h2>\n"
        )

    # delete=False: the file must outlive this call so the caller can
    # serve it as a download.
    with tempfile.NamedTemporaryFile(
        mode="wb",
        prefix="parsed_",
        suffix=suffixes[output_format],
        delete=False,
    ) as handle:
        handle.write(content.encode("utf-8"))

    return text, handle.name
60
+
61
# Gradio front end: a file upload and a format selector go in; the extracted
# text and a downloadable export come out.
iface = gr.Interface(
    fn=parse_pdf,
    inputs=[
        "file",
        gr.Dropdown(choices=["JSON", "Markdown", "HTML"]),
    ],
    outputs=[
        gr.Text(label="Output Text"),
        gr.File(label="Download Output"),
    ],
    title="PDF Parser",
    description="Parse a PDF and choose the output format.",
)

if __name__ == "__main__":
    # share=True asks Gradio to create a public link for the app.
    iface.launch(share=True)