sblumenf committed on
Commit
875f540
·
verified ·
1 Parent(s): 0a3a380

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -12
app.py CHANGED
@@ -19,18 +19,17 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
19
 
20
  Returns:
21
  tuple: Extracted text and download data in the specified format.
22
- Returns an empty string and None if there is an error.
23
  """
24
  try:
25
  with open(pdf_file, 'rb') as file:
26
- pages = list(extract_pages(file)) # Convert generator to list
27
  text = ""
28
  tables = []
29
  images = []
30
 
31
- # Iterate through pages and extract text and images
32
- for i, page in enumerate(pages):
33
- progress(i / len(pages)) # Update progress bar
34
  for element in page:
35
  if isinstance(element, LTTextBoxHorizontal):
36
  text += element.get_text()
@@ -67,7 +66,7 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
67
  unique_columns.append(col)
68
  df = pd.DataFrame(table[1:], columns=unique_columns)
69
  else:
70
- df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
71
  tables.append(df)
72
 
73
  # Format extracted data based on user selection
@@ -87,8 +86,8 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
87
  # Image embedding in Markdown (using relative paths)
88
  markdown_text += "\n\n# Images\n\n"
89
  for image in images:
90
- image_path = os.path.join(os.getcwd(), image["filename"])
91
- markdown_text += f'![Image]({image_path})\n'
92
 
93
  download_data = markdown_text
94
  elif output_format == "HTML":
@@ -100,19 +99,19 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
100
  # Image embedding in HTML (using relative paths)
101
  html_text += "\n\n<h2>Images</h2>\n\n"
102
  for image in images:
103
- image_path = os.path.join(os.getcwd(), image["filename"])
104
- html_text += f'<img src="{image_path}" alt="Image"><br>\n'
105
 
106
  download_data = html_text.encode("utf-8") # Encode for HTML download
107
  return text, download_data
108
 
109
  except Exception as main_e:
110
  print(f"A main error occurred: {main_e}")
111
- return "", None # Return empty string and None in case of error
112
 
113
  iface = gr.Interface(
114
  fn=parse_pdf,
115
- inputs=["file", gr.Dropdown(["JSON", "Markdown", "HTML"])], # Remove gr.Progress() from inputs
116
  outputs=[
117
  gr.Text(label="Output Text"),
118
  gr.File(label="Download Output")
 
19
 
20
  Returns:
21
  tuple: Extracted text and download data in the specified format.
22
+ Returns an empty string and None if there is an error.
23
  """
24
  try:
25
  with open(pdf_file, 'rb') as file:
 
26
  text = ""
27
  tables = []
28
  images = []
29
 
30
+ # Iterate directly over pages
31
+ for page in extract_pages(file):
32
+ # progress(i / len(pages)) # Update progress bar (if you still want to use a progress bar, you'll need to determine the total number of pages beforehand)
33
  for element in page:
34
  if isinstance(element, LTTextBoxHorizontal):
35
  text += element.get_text()
 
66
  unique_columns.append(col)
67
  df = pd.DataFrame(table[1:], columns=unique_columns)
68
  else:
69
+ df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
70
  tables.append(df)
71
 
72
  # Format extracted data based on user selection
 
86
  # Image embedding in Markdown (using relative paths)
87
  markdown_text += "\n\n# Images\n\n"
88
  for image in images:
89
+ image_path = os.path.join(os.getcwd(), image["filename"])
90
+ markdown_text += f'![Image]({image_path})\n'
91
 
92
  download_data = markdown_text
93
  elif output_format == "HTML":
 
99
  # Image embedding in HTML (using relative paths)
100
  html_text += "\n\n<h2>Images</h2>\n\n"
101
  for image in images:
102
+ image_path = os.path.join(os.getcwd(), image["filename"])
103
+ html_text += f'<img src="{image_path}" alt="Image"><br>\n'
104
 
105
  download_data = html_text.encode("utf-8") # Encode for HTML download
106
  return text, download_data
107
 
108
  except Exception as main_e:
109
  print(f"A main error occurred: {main_e}")
110
+ return "", None
111
 
112
  iface = gr.Interface(
113
  fn=parse_pdf,
114
+ inputs=["file", gr.Dropdown(["JSON", "Markdown", "HTML"])], # Remove gr.Progress() from inputs
115
  outputs=[
116
  gr.Text(label="Output Text"),
117
  gr.File(label="Download Output")