sblumenf committed on
Commit
1f2e0af
·
verified ·
1 Parent(s): 875f540

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -6
app.py CHANGED
@@ -73,15 +73,20 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
73
  if output_format == "JSON":
74
  json_data = {
75
  "text": text,
76
- "tables": [table.to_dict(orient='records') for table in tables], # Use 'records' for better handling of duplicate columns
 
 
 
 
77
  "images": images
78
  }
79
  download_data = json.dumps(json_data, indent=4) # Add indentation for readability
80
  elif output_format == "Markdown":
81
  markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
82
  for i, table in enumerate(tables):
83
- markdown_text += f"## Table {i+1}\n"
84
- markdown_text += table.to_markdown(index=False) + "\n\n"
 
85
 
86
  # Image embedding in Markdown (using relative paths)
87
  markdown_text += "\n\n# Images\n\n"
@@ -93,8 +98,9 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
93
  elif output_format == "HTML":
94
  html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
95
  for i, table in enumerate(tables):
96
- html_text += f"<h2>Table {i+1}</h2>\n"
97
- html_text += table.to_html() + "<br>"
 
98
 
99
  # Image embedding in HTML (using relative paths)
100
  html_text += "\n\n<h2>Images</h2>\n\n"
@@ -121,4 +127,4 @@ iface = gr.Interface(
121
  )
122
 
123
  if __name__ == "__main__":
124
- iface.launch(share=False)
 
73
  if output_format == "JSON":
74
  json_data = {
75
  "text": text,
76
+ "tables": [
77
+ table.to_dict(orient='records')
78
+ for table in tables
79
+ if not table.columns.duplicated().any()
80
+ ], # Use 'records' for better handling of duplicate columns
81
  "images": images
82
  }
83
  download_data = json.dumps(json_data, indent=4) # Add indentation for readability
84
  elif output_format == "Markdown":
85
  markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
86
  for i, table in enumerate(tables):
87
+ if not table.columns.duplicated().any(): # Check for duplicate columns
88
+ markdown_text += f"## Table {i+1}\n"
89
+ markdown_text += table.to_markdown(index=False) + "\n\n"
90
 
91
  # Image embedding in Markdown (using relative paths)
92
  markdown_text += "\n\n# Images\n\n"
 
98
  elif output_format == "HTML":
99
  html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
100
  for i, table in enumerate(tables):
101
+ if not table.columns.duplicated().any(): # Check for duplicate columns
102
+ html_text += f"<h2>Table {i+1}</h2>\n"
103
+ html_text += table.to_html() + "<br>"
104
 
105
  # Image embedding in HTML (using relative paths)
106
  html_text += "\n\n<h2>Images</h2>\n\n"
 
127
  )
128
 
129
  if __name__ == "__main__":
130
+ iface.launch(share=True) # Set share=True to create a public link