sblumenf committed on
Commit
ce01472
·
verified ·
1 Parent(s): 5e96fa0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -5
app.py CHANGED
@@ -66,14 +66,14 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
66
  df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
67
  tables.append(df)
68
 
69
- with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix="." + output_format.lower()) as tmp:
70
  if output_format == "JSON":
71
  json_data = {
72
  "text": text,
73
  "tables": [table.to_dict(orient='records') for table in tables if not table.columns.duplicated().any()],
74
  "images": images
75
  }
76
- json.dump(json_data, tmp, ensure_ascii=False, indent=4) # Ensure ASCII compatibility
77
  elif output_format == "Markdown":
78
  markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
79
  for i, table in enumerate(tables):
@@ -84,7 +84,7 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
84
  for image in images:
85
  image_path = os.path.join(os.getcwd(), image["filename"])
86
  markdown_text += f'![Image]({image_path})\n'
87
- tmp.write(markdown_text.encode('utf-8'))
88
  elif output_format == "HTML":
89
  html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
90
  for i, table in enumerate(tables):
@@ -95,7 +95,7 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
95
  for image in images:
96
  image_path = os.path.join(os.getcwd(), image["filename"])
97
  html_text += f'<img src="{image_path}" alt="Image"><br>\n'
98
- tmp.write(html_text.encode('utf-8'))
99
  download_path = tmp.name
100
 
101
  return text, download_path
@@ -117,4 +117,4 @@ iface = gr.Interface(
117
  )
118
 
119
  if __name__ == "__main__":
120
- iface.launch() # Temporarily disable sharing for debugging
 
66
  df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
67
  tables.append(df)
68
 
69
+ with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix="." + output_format.lower()) as tmp:
70
  if output_format == "JSON":
71
  json_data = {
72
  "text": text,
73
  "tables": [table.to_dict(orient='records') for table in tables if not table.columns.duplicated().any()],
74
  "images": images
75
  }
76
+ json.dump(json_data, tmp, ensure_ascii=False, indent=4)
77
  elif output_format == "Markdown":
78
  markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
79
  for i, table in enumerate(tables):
 
84
  for image in images:
85
  image_path = os.path.join(os.getcwd(), image["filename"])
86
  markdown_text += f'![Image]({image_path})\n'
87
+ tmp.write(markdown_text)
88
  elif output_format == "HTML":
89
  html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
90
  for i, table in enumerate(tables):
 
95
  for image in images:
96
  image_path = os.path.join(os.getcwd(), image["filename"])
97
  html_text += f'<img src="{image_path}" alt="Image"><br>\n'
98
+ tmp.write(html_text)
99
  download_path = tmp.name
100
 
101
  return text, download_path
 
117
  )
118
 
119
  if __name__ == "__main__":
120
+ iface.launch() # Temporarily disable sharing for debugging