sblumenf committed on
Commit
17d36dc
·
verified ·
1 Parent(s): c82a3c1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -9
app.py CHANGED
@@ -2,27 +2,83 @@ import json
2
  import gradio as gr
3
  from pdfminer.high_level import extract_pages, extract_text
4
  from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
5
- import os # Import os for file path manipulation
 
 
6
 
7
  def parse_pdf(pdf_file, output_format):
8
- # ... (Your existing parsing logic)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  # Convert extracted data to desired format and populate download_data
11
  if output_format == "JSON":
12
  json_data = {
13
  "text": text,
14
- "tables": tables, # Replace with actual table data
15
- "images": images # List of dictionaries with filenames
16
  }
17
- download_data = json.dumps(json_data) # No need to encode as Gradio handles it
18
 
19
  elif output_format == "Markdown":
20
- # ... (Your Markdown conversion logic)
 
 
 
 
 
 
 
 
 
 
 
21
  download_data = markdown_text
22
 
23
  elif output_format == "HTML":
24
- # ... (Your HTML conversion logic)
25
- download_data = html_text
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  return text, download_data
28
 
@@ -38,4 +94,4 @@ iface = gr.Interface(
38
  )
39
 
40
  if __name__ == "__main__":
41
- iface.launch(share=False) # Set share=False as Gradio warns about it on Hugging Face Spaces
 
2
  import gradio as gr
3
  from pdfminer.high_level import extract_pages, extract_text
4
  from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
5
+ import os
6
+ import io
7
+ from PIL import Image
8
 
9
  def parse_pdf(pdf_file, output_format):
10
+ with open(pdf_file, 'rb') as file:
11
+ pages = extract_pages(file)
12
+
13
+ text = ""
14
+ tables = []
15
+ images = []
16
+
17
+ for page in pages:
18
+ for element in page:
19
+ if isinstance(element, LTTextBoxHorizontal):
20
+ text += element.get_text()
21
+ elif isinstance(element, (LTFigure, LTImage)):
22
+ # Extract image data
23
+ if hasattr(element, 'stream'):
24
+ image_data = element.stream.read()
25
+ image = Image.open(io.BytesIO(image_data))
26
+ image_filename = f"extracted_image_{len(images)}.png"
27
+ image.save(image_filename)
28
+ images.append({"filename": image_filename})
29
+ else:
30
+ # Handle LTFigure (potentially nested LTImage)
31
+ for child in element:
32
+ if isinstance(child, LTImage):
33
+ image_data = child.stream.read()
34
+ image = Image.open(io.BytesIO(image_data))
35
+ image_filename = f"extracted_image_{len(images)}.png"
36
+ image.save(image_filename)
37
+ images.append({"filename": image_filename})
38
+ # You can add logic here to handle other child elements within LTFigure
39
+
40
+ # Implement table extraction logic using Camelot
41
+ import camelot
42
+ tables = camelot.read_pdf(pdf_file)
43
 
44
  # Convert extracted data to desired format and populate download_data
45
  if output_format == "JSON":
46
  json_data = {
47
  "text": text,
48
+ "tables": [table.df.to_dict() for table in tables],
49
+ "images": images
50
  }
51
+ download_data = json.dumps(json_data)
52
 
53
  elif output_format == "Markdown":
54
+ markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
55
+ for table in tables:
56
+ markdown_text += table.df.to_markdown(index=False) + "\n\n"
57
+
58
+ # Image embedding in Markdown (using relative paths)
59
+ image_tags = []
60
+ for image in images:
61
+ image_path = os.path.join(os.getcwd(), image["filename"]) # Replace with your path logic
62
+ image_tags.append(f'![Image {len(image_tags) + 1}]({image_path})')
63
+
64
+ markdown_text += "\n\n# Images\n\n" + "\n".join(image_tags)
65
+
66
  download_data = markdown_text
67
 
68
  elif output_format == "HTML":
69
+ html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
70
+ for table in tables:
71
+ html_text += table.df.to_html() + "<br>"
72
+
73
+ # Image embedding in HTML (using relative paths)
74
+ image_tags = []
75
+ for image in images:
76
+ image_path = os.path.join(os.getcwd(), image["filename"]) # Replace with your path logic
77
+ image_tags.append(f'<img src="{image_path}" alt="Image {len(image_tags) + 1}">')
78
+
79
+ html_text += "\n\n<h2>Images</h2>\n\n" + "\n".join(image_tags)
80
+
81
+ download_data = html_text.encode("utf-8") # Encode for HTML download
82
 
83
  return text, download_data
84
 
 
94
  )
95
 
96
  if __name__ == "__main__":
97
+ iface.launch(share=False)