sblumenf committed
Commit 3403d47 · verified · 1 Parent(s): 1f2e0af

Update app.py

Files changed (1):
  1. app.py +40 -61

app.py CHANGED
@@ -7,29 +7,16 @@ import io
 from PIL import Image
 import pandas as pd
 import pdfplumber
+import tempfile  # Import tempfile
 
 def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
-    """
-    Parses a PDF file, extracts text, tables, and images, and formats the output.
-
-    Args:
-        pdf_file: Path to the uploaded PDF file.
-        output_format: Desired output format ("JSON", "Markdown", or "HTML").
-        progress: Gradio Progress object for displaying progress.
-
-    Returns:
-        tuple: Extracted text and download data in the specified format.
-               Returns an empty string and None if there is an error.
-    """
     try:
         with open(pdf_file, 'rb') as file:
             text = ""
             tables = []
             images = []
 
-            # Iterate directly over pages
             for page in extract_pages(file):
-                # progress(i / len(pages)) # Update progress bar (if you still want to use a progress bar, you'll need to determine the total number of pages beforehand)
                 for element in page:
                     if isinstance(element, LTTextBoxHorizontal):
                         text += element.get_text()
@@ -52,64 +39,56 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
                         except Exception as e:
                             print(f"Error extracting image: {e}")
 
-        # Enhanced table extraction using pdfplumber
         with pdfplumber.open(pdf_file) as pdf:
             for page_num, page in enumerate(pdf.pages):
                 for table in page.extract_tables():
-                    # Handle potential duplicate columns
                     if len(table) > 0 and len(set(table[0])) != len(table[0]):
-                        # If duplicate columns exist, try to create unique column names
                        unique_columns = []
                        for col in table[0]:
                            if col in unique_columns:
-                                col = f"{col}_{unique_columns.count(col)}"  # Append a counter
+                                col = f"{col}_{unique_columns.count(col)}"
                            unique_columns.append(col)
                        df = pd.DataFrame(table[1:], columns=unique_columns)
                     else:
                         df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
                     tables.append(df)
 
-        # Format extracted data based on user selection
-        if output_format == "JSON":
-            json_data = {
-                "text": text,
-                "tables": [
-                    table.to_dict(orient='records')
-                    for table in tables
-                    if not table.columns.duplicated().any()
-                ],  # Use 'records' for better handling of duplicate columns
-                "images": images
-            }
-            download_data = json.dumps(json_data, indent=4)  # Add indentation for readability
-        elif output_format == "Markdown":
-            markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
-            for i, table in enumerate(tables):
-                if not table.columns.duplicated().any():  # Check for duplicate columns
-                    markdown_text += f"## Table {i+1}\n"
-                    markdown_text += table.to_markdown(index=False) + "\n\n"
-
-            # Image embedding in Markdown (using relative paths)
-            markdown_text += "\n\n# Images\n\n"
-            for image in images:
-                image_path = os.path.join(os.getcwd(), image["filename"])
-                markdown_text += f'![Image]({image_path})\n'
-
-            download_data = markdown_text
-        elif output_format == "HTML":
-            html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
-            for i, table in enumerate(tables):
-                if not table.columns.duplicated().any():  # Check for duplicate columns
-                    html_text += f"<h2>Table {i+1}</h2>\n"
-                    html_text += table.to_html() + "<br>"
-
-            # Image embedding in HTML (using relative paths)
-            html_text += "\n\n<h2>Images</h2>\n\n"
-            for image in images:
-                image_path = os.path.join(os.getcwd(), image["filename"])
-                html_text += f'<img src="{image_path}" alt="Image"><br>\n'
-
-            download_data = html_text.encode("utf-8")  # Encode for HTML download
-        return text, download_data
+        # Use a temporary file for the download
+        with tempfile.NamedTemporaryFile(mode="w+b", delete=False, suffix="." + output_format.lower()) as tmp:
+            if output_format == "JSON":
+                json_data = {
+                    "text": text,
+                    "tables": [table.to_dict(orient='records') for table in tables if not table.columns.duplicated().any()],
+                    "images": images
+                }
+                json.dump(json_data, tmp, indent=4)
+                download_path = tmp.name
+            elif output_format == "Markdown":
+                markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
+                for i, table in enumerate(tables):
+                    if not table.columns.duplicated().any():
+                        markdown_text += f"## Table {i+1}\n"
+                        markdown_text += table.to_markdown(index=False) + "\n\n"
+                markdown_text += "\n\n# Images\n\n"
+                for image in images:
+                    image_path = os.path.join(os.getcwd(), image["filename"])
+                    markdown_text += f'![Image]({image_path})\n'
+                tmp.write(markdown_text.encode('utf-8'))
+                download_path = tmp.name
+            elif output_format == "HTML":
+                html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
+                for i, table in enumerate(tables):
+                    if not table.columns.duplicated().any():
+                        html_text += f"<h2>Table {i+1}</h2>\n"
+                        html_text += table.to_html() + "<br>"
+                html_text += "\n\n<h2>Images</h2>\n\n"
+                for image in images:
+                    image_path = os.path.join(os.getcwd(), image["filename"])
+                    html_text += f'<img src="{image_path}" alt="Image"><br>\n'
+                tmp.write(html_text.encode('utf-8'))
+                download_path = tmp.name
+
+        return text, download_path
 
     except Exception as main_e:
         print(f"A main error occurred: {main_e}")
@@ -117,7 +96,7 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
 
 iface = gr.Interface(
     fn=parse_pdf,
-    inputs=["file", gr.Dropdown(["JSON", "Markdown", "HTML"])],  # Remove gr.Progress() from inputs
+    inputs=["file", gr.Dropdown(["JSON", "Markdown", "HTML"])],
     outputs=[
         gr.Text(label="Output Text"),
         gr.File(label="Download Output")
@@ -127,4 +106,4 @@ iface = gr.Interface(
 )
 
 if __name__ == "__main__":
-    iface.launch(share=True)  # Set share=True to create a public link
+    iface.launch(share=True)
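
The substance of the commit is the download path: instead of returning an in-memory string, parse_pdf now writes the chosen format to a tempfile.NamedTemporaryFile created with delete=False and hands the file's path to the gr.File output. Below is a minimal sketch of that pattern, kept separate from the app; export_for_download and the sample call are placeholders, not part of the commit. One detail worth noting: json.dump emits str, so the JSON branch here opens the temporary file in text mode rather than the "w+b" mode used in the diff.

import json
import tempfile

def export_for_download(text, tables, images, output_format):
    """Write extracted data to a temp file and return its path for gr.File.

    Illustrative helper: `tables` is a list of pandas DataFrames and `images`
    a list of dicts with a "filename" key, mirroring what parse_pdf builds.
    """
    suffix = {"JSON": ".json", "Markdown": ".md", "HTML": ".html"}[output_format]

    if output_format == "JSON":
        payload = {
            "text": text,
            "tables": [t.to_dict(orient="records") for t in tables],
            "images": images,
        }
        # json.dump writes str, so the file must be opened in text mode.
        with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8",
                                         suffix=suffix, delete=False) as tmp:
            json.dump(payload, tmp, indent=4)
            return tmp.name

    # Markdown and HTML are assembled as strings and written as UTF-8 bytes.
    body = text if output_format == "Markdown" else f"<p>{text}</p>"
    with tempfile.NamedTemporaryFile(mode="wb", suffix=suffix, delete=False) as tmp:
        tmp.write(body.encode("utf-8"))
        return tmp.name

# Example: a Markdown download for some extracted text, no tables or images.
print(export_for_download("Hello from page 1", [], [], "Markdown"))

Because the file is created with delete=False, it survives the with block, so Gradio can still serve it when the user clicks the download link; removing it later is left to the OS or a cleanup job.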
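
The duplicate-header handling kept in the table loop can also be exercised on its own. The sketch below is illustrative rather than lifted from the commit: dedupe_headers and the sample table are made up, and it counts repeats with a dict instead of unique_columns.count(col), which keeps names unique even when a header appears more than twice.

import pandas as pd

def dedupe_headers(header):
    """Return header names made unique by suffixing repeats with a counter."""
    seen = {}
    unique = []
    for col in header:
        if col in seen:
            seen[col] += 1
            unique.append(f"{col}_{seen[col]}")
        else:
            seen[col] = 0
            unique.append(col)
    return unique

# A pdfplumber-style table whose first row repeats a column name.
table = [
    ["Item", "Qty", "Item"],
    ["pens", "3", "blue"],
    ["pads", "2", "yellow"],
]
df = pd.DataFrame(table[1:], columns=dedupe_headers(table[0]))
print(list(df.columns))  # ['Item', 'Qty', 'Item_1']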