Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -9,7 +9,7 @@ def parse_pdf(pdf_file, output_format):
|
|
9 |
text = ""
|
10 |
tables = []
|
11 |
images = []
|
12 |
-
|
13 |
|
14 |
for page in pages:
|
15 |
for element in page:
|
@@ -25,14 +25,15 @@ def parse_pdf(pdf_file, output_format):
|
|
25 |
# Implement table extraction logic (e.g., using heuristics or advanced techniques)
|
26 |
# ...
|
27 |
|
28 |
-
# Convert extracted data to desired format
|
29 |
if output_format == "JSON":
|
30 |
json_data = {
|
31 |
"text": text,
|
32 |
"tables": tables, # Implement table conversion to JSON
|
33 |
"images": images # Implement image conversion to JSON (e.g., base64)
|
34 |
}
|
35 |
-
|
|
|
36 |
elif output_format == "Markdown":
|
37 |
markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
|
38 |
# Implement table conversion to Markdown
|
@@ -40,7 +41,8 @@ def parse_pdf(pdf_file, output_format):
|
|
40 |
markdown_text += "\n# Images\n"
|
41 |
# Implement image conversion to Markdown (e.g., embedding images)
|
42 |
# ...
|
43 |
-
|
|
|
44 |
elif output_format == "HTML":
|
45 |
html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
|
46 |
# Implement table conversion to HTML
|
@@ -48,7 +50,9 @@ def parse_pdf(pdf_file, output_format):
|
|
48 |
html_text += "<h2>Images</h2>\n"
|
49 |
# Implement image conversion to HTML (e.g., embedding images)
|
50 |
# ...
|
51 |
-
|
|
|
|
|
52 |
|
53 |
iface = gr.Interface(
|
54 |
fn=parse_pdf,
|
@@ -62,4 +66,4 @@ iface = gr.Interface(
|
|
62 |
)
|
63 |
|
64 |
if __name__ == "__main__":
|
65 |
-
iface.launch()
|
|
|
9 |
text = ""
|
10 |
tables = []
|
11 |
images = []
|
12 |
+
download_data = None # Initialize an empty variable for download data
|
13 |
|
14 |
for page in pages:
|
15 |
for element in page:
|
|
|
25 |
# Implement table extraction logic (e.g., using heuristics or advanced techniques)
|
26 |
# ...
|
27 |
|
28 |
+
# Convert extracted data to desired format and populate download_data
|
29 |
if output_format == "JSON":
|
30 |
json_data = {
|
31 |
"text": text,
|
32 |
"tables": tables, # Implement table conversion to JSON
|
33 |
"images": images # Implement image conversion to JSON (e.g., base64)
|
34 |
}
|
35 |
+
download_data = json.dumps(json_data).encode("utf-8") # Encode JSON for download
|
36 |
+
|
37 |
elif output_format == "Markdown":
|
38 |
markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
|
39 |
# Implement table conversion to Markdown
|
|
|
41 |
markdown_text += "\n# Images\n"
|
42 |
# Implement image conversion to Markdown (e.g., embedding images)
|
43 |
# ...
|
44 |
+
download_data = markdown_text.encode("utf-8") # Encode Markdown for download
|
45 |
+
|
46 |
elif output_format == "HTML":
|
47 |
html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
|
48 |
# Implement table conversion to HTML
|
|
|
50 |
html_text += "<h2>Images</h2>\n"
|
51 |
# Implement image conversion to HTML (e.g., embedding images)
|
52 |
# ...
|
53 |
+
download_data = html_text.encode("utf-8") # Encode HTML for download
|
54 |
+
|
55 |
+
return text, download_data
|
56 |
|
57 |
iface = gr.Interface(
|
58 |
fn=parse_pdf,
|
|
|
66 |
)
|
67 |
|
68 |
if __name__ == "__main__":
|
69 |
+
iface.launch(share=True) # Set share=True to create a public link
|