Spaces:

sblumenf
/

pdf-convert

Sleeping

App Files Files Community

sblumenf committed on Dec 12, 2024

Commit

875f540

verified ·

1 Parent(s): 0a3a380

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -12

app.py CHANGED Viewed

@@ -19,18 +19,17 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
     Returns:
         tuple: Extracted text and download data in the specified format.
-               Returns an empty string and None if there is an error.
     """
     try:
         with open(pdf_file, 'rb') as file:
-            pages = list(extract_pages(file))  # Convert generator to list
             text = ""
             tables = []
             images = []
-            # Iterate through pages and extract text and images
-            for i, page in enumerate(pages):
-                progress(i / len(pages))  # Update progress bar
                 for element in page:
                     if isinstance(element, LTTextBoxHorizontal):
                         text += element.get_text()
@@ -67,7 +66,7 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
                                 unique_columns.append(col)
                             df = pd.DataFrame(table[1:], columns=unique_columns)
                         else:
-                            df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
                         tables.append(df)
             # Format extracted data based on user selection
@@ -87,8 +86,8 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
                 # Image embedding in Markdown (using relative paths)
                 markdown_text += "\n\n# Images\n\n"
                 for image in images:
-                  image_path = os.path.join(os.getcwd(), image["filename"])
-                  markdown_text += f'![Image]({image_path})\n'
                 download_data = markdown_text
             elif output_format == "HTML":
@@ -100,19 +99,19 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
                 # Image embedding in HTML (using relative paths)
                 html_text += "\n\n<h2>Images</h2>\n\n"
                 for image in images:
-                  image_path = os.path.join(os.getcwd(), image["filename"])
-                  html_text += f'<img src="{image_path}" alt="Image"><br>\n'
                 download_data = html_text.encode("utf-8")  # Encode for HTML download
             return text, download_data
     except Exception as main_e:
         print(f"A main error occurred: {main_e}")
-        return "", None # Return empty string and None in case of error
 iface = gr.Interface(
     fn=parse_pdf,
-    inputs=["file", gr.Dropdown(["JSON", "Markdown", "HTML"])], # Remove gr.Progress() from inputs
     outputs=[
         gr.Text(label="Output Text"),
         gr.File(label="Download Output")

     Returns:
         tuple: Extracted text and download data in the specified format.
+            Returns an empty string and None if there is an error.
     """
     try:
         with open(pdf_file, 'rb') as file:
             text = ""
             tables = []
             images = []
+            # Iterate directly over pages
+            for page in extract_pages(file):
+                # progress(i / len(pages))  # Update progress bar (if you still want to use a progress bar, you'll need to determine the total number of pages beforehand)
                 for element in page:
                     if isinstance(element, LTTextBoxHorizontal):
                         text += element.get_text()
                                 unique_columns.append(col)
                             df = pd.DataFrame(table[1:], columns=unique_columns)
                         else:
+                            df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
                         tables.append(df)
             # Format extracted data based on user selection
                 # Image embedding in Markdown (using relative paths)
                 markdown_text += "\n\n# Images\n\n"
                 for image in images:
+                    image_path = os.path.join(os.getcwd(), image["filename"])
+                    markdown_text += f'![Image]({image_path})\n'
                 download_data = markdown_text
             elif output_format == "HTML":
                 # Image embedding in HTML (using relative paths)
                 html_text += "\n\n<h2>Images</h2>\n\n"
                 for image in images:
+                    image_path = os.path.join(os.getcwd(), image["filename"])
+                    html_text += f'<img src="{image_path}" alt="Image"><br>\n'
                 download_data = html_text.encode("utf-8")  # Encode for HTML download
             return text, download_data
     except Exception as main_e:
         print(f"A main error occurred: {main_e}")
+        return "", None
 iface = gr.Interface(
     fn=parse_pdf,
+    inputs=["file", gr.Dropdown(["JSON", "Markdown", "HTML"])],  # Remove gr.Progress() from inputs
     outputs=[
         gr.Text(label="Output Text"),
         gr.File(label="Download Output")