sblumenf committed on
Commit
b8d5f22
·
verified ·
1 Parent(s): f9c1d23

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -74
app.py CHANGED
@@ -1,90 +1,114 @@
1
  import json
2
  import gradio as gr
3
- from pdfminer.high_level import extract_pages, extract_text
4
  from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
5
  import os
6
  import io
7
  from PIL import Image
8
-
9
- def parse_pdf(pdf_file, output_format):
10
- with open(pdf_file, 'rb') as file:
11
- pages = extract_pages(file)
12
-
13
- text = ""
14
- tables = []
15
- images = []
16
-
17
- for page in pages:
18
- for element in page:
19
- if isinstance(element, LTTextBoxHorizontal):
20
- text += element.get_text()
21
- elif isinstance(element, (LTFigure, LTImage)):
22
- # Extract image data
23
- if hasattr(element, 'stream'):
24
- image_data = element.stream.read()
25
- image = Image.open(io.BytesIO(image_data))
26
- image_filename = f"extracted_image_{len(images)}.png"
27
- image.save(image_filename)
28
- images.append({"filename": image_filename})
29
- else:
30
- # Handle LTFigure (potentially nested LTImage)
31
- for child in element:
32
- if isinstance(child, LTImage):
33
- image_data = child.stream.read()
 
 
 
 
 
 
 
 
34
  image = Image.open(io.BytesIO(image_data))
35
  image_filename = f"extracted_image_{len(images)}.png"
36
  image.save(image_filename)
37
  images.append({"filename": image_filename})
38
- # You can add logic here to handle other child elements within LTFigure
39
-
40
- # Implement table extraction logic using Camelot
41
- import camelot
42
- tables = camelot.read_pdf(pdf_file)
43
-
44
- # Convert extracted data to desired format and populate download_data
45
- if output_format == "JSON":
46
- json_data = {
47
- "text": text,
48
- "tables": [table.df.to_dict() for table in tables],
49
- "images": images
50
- }
51
- download_data = json.dumps(json_data)
52
-
53
- elif output_format == "Markdown":
54
- markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
55
- for table in tables:
56
- markdown_text += table.df.to_markdown(index=False) + "\n\n"
57
-
58
- # Image embedding in Markdown (using relative paths)
59
- image_tags = []
60
- for image in images:
61
- image_path = os.path.join(os.getcwd(), image["filename"]) # Replace with your path logic
62
- image_tags.append(f'![Image {len(image_tags) + 1}]({image_path})')
63
-
64
- markdown_text += "\n\n# Images\n\n" + "\n".join(image_tags)
65
-
66
- download_data = markdown_text
67
-
68
- elif output_format == "HTML":
69
- html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
70
- for table in tables:
71
- html_text += table.df.to_html() + "<br>"
72
-
73
- # Image embedding in HTML (using relative paths)
74
- image_tags = []
75
- for image in images:
76
- image_path = os.path.join(os.getcwd(), image["filename"]) # Replace with your path logic
77
- image_tags.append(f'<img src="{image_path}" alt="Image {len(image_tags) + 1}">')
78
-
79
- html_text += "\n\n<h2>Images</h2>\n\n" + "\n".join(image_tags)
80
-
81
- download_data = html_text.encode("utf-8") # Encode for HTML download
82
-
83
- return text, download_data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
  iface = gr.Interface(
86
  fn=parse_pdf,
87
- inputs=["file", gr.Dropdown(["JSON", "Markdown", "HTML"])],
88
  outputs=[
89
  gr.Text(label="Output Text"),
90
  gr.File(label="Download Output")
 
1
  import json
2
  import gradio as gr
3
+ from pdfminer.high_level import extract_pages
4
  from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
5
  import os
6
  import io
7
  from PIL import Image
8
+ import pandas as pd
9
+ import tabula
10
+ import camelot
11
+
12
def _iter_pdf_images(element):
    """Yield every LTImage at or below *element*.

    LTFigure containers can nest, so they are walked recursively instead of
    only inspecting their direct children.
    """
    if isinstance(element, LTImage):
        yield element
    elif isinstance(element, LTFigure):
        for child in element:
            yield from _iter_pdf_images(child)


def _extract_tables(pdf_path):
    """Return the tables found in *pdf_path* as a list of DataFrames.

    tabula-py is tried first; camelot is used only when tabula raises.
    An empty list is returned when both extractors fail.
    """
    try:
        return tabula.read_pdf(pdf_path, pages='all', multiple_tables=True)
    except Exception as e:
        print(f"tabula-py failed: {e}. Trying camelot...")
    try:
        return [table.df for table in camelot.read_pdf(pdf_path)]
    except Exception as e:
        print(f"camelot also failed: {e}. No tables extracted.")
        return []


def _render_output(text, tables, images, output_format):
    """Serialize the extracted text, tables and image records as one string.

    *output_format* must be "JSON", "Markdown" or "HTML"; the caller
    validates it before any parsing work is done.
    """
    from html import escape

    # Images are saved into the current working directory, so the embedded
    # links are absolute paths built from os.getcwd().
    image_paths = [os.path.join(os.getcwd(), image["filename"]) for image in images]

    if output_format == "JSON":
        json_data = {
            "text": text,
            "tables": [table.to_dict() for table in tables],
            "images": images,
        }
        return json.dumps(json_data, indent=4)

    if output_format == "Markdown":
        parts = [f"# Extracted Text\n\n{text}\n\n# Tables\n"]
        for i, table in enumerate(tables):
            parts.append(f"## Table {i+1}\n")
            parts.append(table.to_markdown(index=False) + "\n\n")
        parts.append("\n\n# Images\n\n")
        parts.extend(f'![Image]({image_path})\n' for image_path in image_paths)
        return "".join(parts)

    # HTML: the text comes from an uploaded PDF, so it is escaped before being
    # placed into markup (DataFrame.to_html escapes cell values by default).
    parts = [f"<p>{escape(text)}</p>\n\n<h2>Tables</h2>\n"]
    for i, table in enumerate(tables):
        parts.append(f"<h2>Table {i+1}</h2>\n")
        parts.append(table.to_html() + "<br>")
    parts.append("\n\n<h2>Images</h2>\n\n")
    parts.extend(
        f'<img src="{escape(image_path)}" alt="Image"><br>\n'
        for image_path in image_paths
    )
    return "".join(parts)


def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
    """
    Parses a PDF file, extracts text, tables, and images, and formats the output.

    Args:
        pdf_file: Path to the uploaded PDF file, or an object exposing the
            path as ``.name`` (e.g. a temporary-file wrapper).
        output_format: Desired output format ("JSON", "Markdown", or "HTML").
        progress: Gradio Progress object for displaying progress.

    Returns:
        tuple: The extracted text and the path of a temporary file holding
            the formatted output (UTF-8), suitable for a file-download
            component.
            Returns an empty string and None if there is an error, including
            a missing file or an unsupported output format.
    """
    import tempfile

    suffixes = {"JSON": ".json", "Markdown": ".md", "HTML": ".html"}

    # Validate up front so an unusable request fails before the expensive
    # parsing work instead of after it.
    if pdf_file is None:
        print("A main error occurred: no PDF file was provided")
        return "", None
    if output_format not in suffixes:
        print(f"A main error occurred: unsupported output format {output_format!r}")
        return "", None

    pdf_path = getattr(pdf_file, "name", pdf_file)

    try:
        text_parts = []
        images = []

        with open(pdf_path, 'rb') as file:
            # extract_pages() is a lazy generator: materialize it so the page
            # count is known for the progress bar, and keep the file open for
            # the whole loop because layout objects may still read from it.
            pages = list(extract_pages(file))
            total_pages = len(pages)

            # Iterate through pages and extract text and images
            for i, page in enumerate(pages):
                progress(i / total_pages)  # Update progress bar
                for element in page:
                    if isinstance(element, LTTextBoxHorizontal):
                        text_parts.append(element.get_text())
                        continue
                    for pdf_image in _iter_pdf_images(element):
                        # One unreadable image must not discard its siblings,
                        # so each image is guarded individually.
                        try:
                            image_data = pdf_image.stream.get_data()
                            # NOTE(review): PIL presumably only opens streams
                            # that are a complete image file (e.g. JPEG); raw
                            # pixel streams end up in the except branch below.
                            image = Image.open(io.BytesIO(image_data))
                            image_filename = f"extracted_image_{len(images)}.png"
                            image.save(image_filename)
                            images.append({"filename": image_filename})
                        except Exception as e:
                            print(f"Error extracting image: {e}")

        text = "".join(text_parts)

        # Enhanced table extraction (tabula-py preferred, fallback to camelot)
        tables = _extract_tables(pdf_path)

        # Format extracted data based on user selection
        download_data = _render_output(text, tables, images, output_format)

        # A file-download component needs a path on disk, not the raw content.
        with tempfile.NamedTemporaryFile(
            "w",
            suffix=suffixes[output_format],
            prefix="parsed_pdf_",
            delete=False,
            encoding="utf-8",
        ) as out_file:
            out_file.write(download_data)
        return text, out_file.name

    except Exception as main_e:
        # Top-level boundary for the UI callback: report and degrade gracefully.
        print(f"A main error occurred: {main_e}")
        return "", None  # Return empty string and None in case of error
108
 
109
  iface = gr.Interface(
110
  fn=parse_pdf,
111
+ inputs=["file", gr.Dropdown(["JSON", "Markdown", "HTML"]), gr.Progress()],
112
  outputs=[
113
  gr.Text(label="Output Text"),
114
  gr.File(label="Download Output")