sblumenf committed on
Commit
7cb3598
·
verified ·
1 Parent(s): 3403d47

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -9
app.py CHANGED
@@ -7,9 +7,21 @@ import io
7
  from PIL import Image
8
  import pandas as pd
9
  import pdfplumber
10
- import tempfile # Import tempfile
11
 
12
  def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
 
 
 
 
 
 
 
 
 
 
 
 
13
  try:
14
  with open(pdf_file, 'rb') as file:
15
  text = ""
@@ -53,7 +65,6 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
53
  df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
54
  tables.append(df)
55
 
56
- # Use a temporary file for the download
57
  with tempfile.NamedTemporaryFile(mode="w+b", delete=False, suffix="." + output_format.lower()) as tmp:
58
  if output_format == "JSON":
59
  json_data = {
@@ -61,8 +72,7 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
61
  "tables": [table.to_dict(orient='records') for table in tables if not table.columns.duplicated().any()],
62
  "images": images
63
  }
64
- json.dump(json_data, tmp, indent=4)
65
- download_path = tmp.name
66
  elif output_format == "Markdown":
67
  markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
68
  for i, table in enumerate(tables):
@@ -73,8 +83,7 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
73
  for image in images:
74
  image_path = os.path.join(os.getcwd(), image["filename"])
75
  markdown_text += f'![Image]({image_path})\n'
76
- tmp.write(markdown_text.encode('utf-8'))
77
- download_path = tmp.name
78
  elif output_format == "HTML":
79
  html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
80
  for i, table in enumerate(tables):
@@ -85,9 +94,8 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
85
  for image in images:
86
  image_path = os.path.join(os.getcwd(), image["filename"])
87
  html_text += f'<img src="{image_path}" alt="Image"><br>\n'
88
- tmp.write(html_text.encode('utf-8'))
89
- download_path = tmp.name
90
-
91
  return text, download_path
92
 
93
  except Exception as main_e:
 
7
  from PIL import Image
8
  import pandas as pd
9
  import pdfplumber
10
+ import tempfile
11
 
12
  def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
13
+ """
14
+ Parses a PDF file, extracts text, tables, and images, and formats the output.
15
+
16
+ Args:
17
+ pdf_file: Path to the uploaded PDF file.
18
+ output_format: Desired output format ("JSON", "Markdown", or "HTML").
19
+ progress: Gradio Progress object for displaying progress.
20
+
21
+ Returns:
22
+ tuple: Extracted text and download data in the specified format.
23
+ Returns an empty string and None if there is an error.
24
+ """
25
  try:
26
  with open(pdf_file, 'rb') as file:
27
  text = ""
 
65
  df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
66
  tables.append(df)
67
 
 
68
  with tempfile.NamedTemporaryFile(mode="w+b", delete=False, suffix="." + output_format.lower()) as tmp:
69
  if output_format == "JSON":
70
  json_data = {
 
72
  "tables": [table.to_dict(orient='records') for table in tables if not table.columns.duplicated().any()],
73
  "images": images
74
  }
75
+ json.dump(json_data, tmp, indent=4)
 
76
  elif output_format == "Markdown":
77
  markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
78
  for i, table in enumerate(tables):
 
83
  for image in images:
84
  image_path = os.path.join(os.getcwd(), image["filename"])
85
  markdown_text += f'![Image]({image_path})\n'
86
+ tmp.write(markdown_text.encode('utf-8'))
 
87
  elif output_format == "HTML":
88
  html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
89
  for i, table in enumerate(tables):
 
94
  for image in images:
95
  image_path = os.path.join(os.getcwd(), image["filename"])
96
  html_text += f'<img src="{image_path}" alt="Image"><br>\n'
97
+ tmp.write(html_text.encode('utf-8'))
98
+ download_path = tmp.name
 
99
  return text, download_path
100
 
101
  except Exception as main_e: