sblumenf committed on
Commit
4d96b5c
·
verified ·
1 Parent(s): 893b405

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -6
app.py CHANGED
@@ -9,7 +9,7 @@ def parse_pdf(pdf_file, output_format):
9
  text = ""
10
  tables = []
11
  images = []
12
- markdown_text = "" # Initialize markdown_text outside conditional blocks
13
 
14
  for page in pages:
15
  for element in page:
@@ -25,14 +25,15 @@ def parse_pdf(pdf_file, output_format):
25
  # Implement table extraction logic (e.g., using heuristics or advanced techniques)
26
  # ...
27
 
28
- # Convert extracted data to desired format
29
  if output_format == "JSON":
30
  json_data = {
31
  "text": text,
32
  "tables": tables, # Implement table conversion to JSON
33
  "images": images # Implement image conversion to JSON (e.g., base64)
34
  }
35
- return json_data
 
36
  elif output_format == "Markdown":
37
  markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
38
  # Implement table conversion to Markdown
@@ -40,7 +41,8 @@ def parse_pdf(pdf_file, output_format):
40
  markdown_text += "\n# Images\n"
41
  # Implement image conversion to Markdown (e.g., embedding images)
42
  # ...
43
- return markdown_text
 
44
  elif output_format == "HTML":
45
  html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
46
  # Implement table conversion to HTML
@@ -48,7 +50,9 @@ def parse_pdf(pdf_file, output_format):
48
  html_text += "<h2>Images</h2>\n"
49
  # Implement image conversion to HTML (e.g., embedding images)
50
  # ...
51
- return html_text
 
 
52
 
53
  iface = gr.Interface(
54
  fn=parse_pdf,
@@ -62,4 +66,4 @@ iface = gr.Interface(
62
  )
63
 
64
  if __name__ == "__main__":
65
- iface.launch()
 
9
  text = ""
10
  tables = []
11
  images = []
12
+ download_data = None # Initialize an empty variable for download data
13
 
14
  for page in pages:
15
  for element in page:
 
25
  # Implement table extraction logic (e.g., using heuristics or advanced techniques)
26
  # ...
27
 
28
+ # Convert extracted data to desired format and populate download_data
29
  if output_format == "JSON":
30
  json_data = {
31
  "text": text,
32
  "tables": tables, # Implement table conversion to JSON
33
  "images": images # Implement image conversion to JSON (e.g., base64)
34
  }
35
+ download_data = json.dumps(json_data).encode("utf-8") # Encode JSON for download
36
+
37
  elif output_format == "Markdown":
38
  markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
39
  # Implement table conversion to Markdown
 
41
  markdown_text += "\n# Images\n"
42
  # Implement image conversion to Markdown (e.g., embedding images)
43
  # ...
44
+ download_data = markdown_text.encode("utf-8") # Encode Markdown for download
45
+
46
  elif output_format == "HTML":
47
  html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
48
  # Implement table conversion to HTML
 
50
  html_text += "<h2>Images</h2>\n"
51
  # Implement image conversion to HTML (e.g., embedding images)
52
  # ...
53
+ download_data = html_text.encode("utf-8") # Encode HTML for download
54
+
55
+ return text, download_data
56
 
57
  iface = gr.Interface(
58
  fn=parse_pdf,
 
66
  )
67
 
68
  if __name__ == "__main__":
69
+ iface.launch(share=True) # Set share=True to create a public link