Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -19,18 +19,17 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
|
|
19 |
|
20 |
Returns:
|
21 |
tuple: Extracted text and download data in the specified format.
|
22 |
-
|
23 |
"""
|
24 |
try:
|
25 |
with open(pdf_file, 'rb') as file:
|
26 |
-
pages = list(extract_pages(file)) # Convert generator to list
|
27 |
text = ""
|
28 |
tables = []
|
29 |
images = []
|
30 |
|
31 |
-
# Iterate
|
32 |
-
for i, page in enumerate(pages):
|
33 |
-
progress(i / len(pages)) # Update progress bar
|
34 |
for element in page:
|
35 |
if isinstance(element, LTTextBoxHorizontal):
|
36 |
text += element.get_text()
|
@@ -67,7 +66,7 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
|
|
67 |
unique_columns.append(col)
|
68 |
df = pd.DataFrame(table[1:], columns=unique_columns)
|
69 |
else:
|
70 |
-
df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
|
71 |
tables.append(df)
|
72 |
|
73 |
# Format extracted data based on user selection
|
@@ -87,8 +86,8 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
|
|
87 |
# Image embedding in Markdown (using relative paths)
|
88 |
markdown_text += "\n\n# Images\n\n"
|
89 |
for image in images:
|
90 |
-
|
91 |
-
|
92 |
|
93 |
download_data = markdown_text
|
94 |
elif output_format == "HTML":
|
@@ -100,19 +99,19 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
|
|
100 |
# Image embedding in HTML (using relative paths)
|
101 |
html_text += "\n\n<h2>Images</h2>\n\n"
|
102 |
for image in images:
|
103 |
-
|
104 |
-
|
105 |
|
106 |
download_data = html_text.encode("utf-8") # Encode for HTML download
|
107 |
return text, download_data
|
108 |
|
109 |
except Exception as main_e:
|
110 |
print(f"A main error occurred: {main_e}")
|
111 |
-
return "", None
|
112 |
|
113 |
iface = gr.Interface(
|
114 |
fn=parse_pdf,
|
115 |
-
inputs=["file", gr.Dropdown(["JSON", "Markdown", "HTML"])],
|
116 |
outputs=[
|
117 |
gr.Text(label="Output Text"),
|
118 |
gr.File(label="Download Output")
|
|
|
19 |
|
20 |
Returns:
|
21 |
tuple: Extracted text and download data in the specified format.
|
22 |
+
Returns an empty string and None if there is an error.
|
23 |
"""
|
24 |
try:
|
25 |
with open(pdf_file, 'rb') as file:
|
|
|
26 |
text = ""
|
27 |
tables = []
|
28 |
images = []
|
29 |
|
30 |
+
# Iterate directly over pages
|
31 |
+
for page in extract_pages(file):
|
32 |
+
# progress(i / len(pages)) # Update progress bar (if you still want to use a progress bar, you'll need to determine the total number of pages beforehand)
|
33 |
for element in page:
|
34 |
if isinstance(element, LTTextBoxHorizontal):
|
35 |
text += element.get_text()
|
|
|
66 |
unique_columns.append(col)
|
67 |
df = pd.DataFrame(table[1:], columns=unique_columns)
|
68 |
else:
|
69 |
+
df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
|
70 |
tables.append(df)
|
71 |
|
72 |
# Format extracted data based on user selection
|
|
|
86 |
# Image embedding in Markdown (using paths under the current working directory)
|
87 |
markdown_text += "\n\n# Images\n\n"
|
88 |
for image in images:
|
89 |
+
image_path = os.path.join(os.getcwd(), image["filename"])
|
90 |
+
markdown_text += f'![Image]({image_path})\n'
|
91 |
|
92 |
download_data = markdown_text
|
93 |
elif output_format == "HTML":
|
|
|
99 |
# Image embedding in HTML (using paths under the current working directory)
|
100 |
html_text += "\n\n<h2>Images</h2>\n\n"
|
101 |
for image in images:
|
102 |
+
image_path = os.path.join(os.getcwd(), image["filename"])
|
103 |
+
html_text += f'<img src="{image_path}" alt="Image"><br>\n'
|
104 |
|
105 |
download_data = html_text.encode("utf-8") # Encode for HTML download
|
106 |
return text, download_data
|
107 |
|
108 |
except Exception as main_e:
|
109 |
print(f"A main error occurred: {main_e}")
|
110 |
+
return "", None
|
111 |
|
112 |
iface = gr.Interface(
|
113 |
fn=parse_pdf,
|
114 |
+
inputs=["file", gr.Dropdown(["JSON", "Markdown", "HTML"])], # Remove gr.Progress() from inputs
|
115 |
outputs=[
|
116 |
gr.Text(label="Output Text"),
|
117 |
gr.File(label="Download Output")
|