Update app.py
app.py CHANGED
@@ -9,6 +9,7 @@ import pandas as pd
 import pdfplumber
 import tempfile
 import traceback
+import re
 
 def save_image(element, images):
     try:
@@ -23,6 +24,20 @@ def save_image(element, images):
     except Exception as e:
         print(f"Error extracting image: {e}")
 
+def detect_headers(text):
+    """Detect headers in the text and format them."""
+    lines = text.split('\n')
+    formatted_text = ""
+    header_patterns = [r"^\d+\.\s", r"^[A-Z\s]+$", r"^[A-Z][a-z]+\s\d"]
+
+    for line in lines:
+        if any(re.match(pattern, line.strip()) for pattern in header_patterns):
+            formatted_text += f"# {line.strip()}\n"
+        else:
+            formatted_text += f"{line.strip()}\n"
+
+    return formatted_text
+
 def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
     """
     Parses a PDF file, extracts text, tables, and images, and formats the output.
@@ -50,6 +65,8 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
             print(f"Processing element: {type(element)}")
            save_image(element, images)
 
+        formatted_text = detect_headers(text)
+
         with pdfplumber.open(pdf_file) as pdf:
             for page_num, page in enumerate(pdf.pages):
                 for table in page.extract_tables():
@@ -70,13 +87,13 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
         with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix="." + output_format.lower()) as tmp:
             if output_format == "JSON":
                 json_data = {
-                    "text":
+                    "text": formatted_text,
                     "tables": [table.to_dict(orient='records') for table in tables if not table.columns.duplicated().any()],
                     "images": images
                 }
                 json.dump(json_data, tmp, ensure_ascii=False, indent=4)
             elif output_format == "Markdown":
-                markdown_text = f"# Extracted Text\n\n{
+                markdown_text = f"# Extracted Text\n\n{formatted_text}\n\n# Tables\n"
                 for i, table in enumerate(tables):
                     if not table.columns.duplicated().any():
                         markdown_text += f"## Table {i+1}\n"
@@ -87,7 +104,7 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
                         markdown_text += f'\n'
                 tmp.write(markdown_text)
             elif output_format == "HTML":
-                html_text = f"<p>{
+                html_text = f"<p>{formatted_text}</p>\n\n<h2>Tables</h2>\n"
                 for i, table in enumerate(tables):
                     if not table.columns.duplicated().any():
                         html_text += f"<h2>Table {i+1}</h2>\n"
@@ -99,7 +116,7 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
                 tmp.write(html_text)
             download_path = tmp.name
 
-        return
+        return formatted_text, download_path
 
     except Exception as main_e:
         traceback.print_exc()  # Print full traceback to console
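The substance of this commit is the new detect_headers helper and the switch from raw extracted text to formatted_text in every output path. Below is a minimal standalone sketch of the helper's behaviour: the function body is copied from the diff, while the sample text and the print call are illustrative additions, not part of app.py.

```python
import re

def detect_headers(text):
    """Detect headers in the text and format them."""
    lines = text.split('\n')
    formatted_text = ""
    header_patterns = [r"^\d+\.\s", r"^[A-Z\s]+$", r"^[A-Z][a-z]+\s\d"]

    for line in lines:
        if any(re.match(pattern, line.strip()) for pattern in header_patterns):
            formatted_text += f"# {line.strip()}\n"
        else:
            formatted_text += f"{line.strip()}\n"

    return formatted_text

# Illustrative input (not from the commit): a numbered heading, a body sentence,
# an all-caps line, and a "Word <digit>" line.
sample = "1. Introduction\nThis report summarises the results.\nMETHODS\nChapter 2 covers the setup."
print(detect_headers(sample))
# "1. Introduction" matches ^\d+\.\s, "METHODS" matches ^[A-Z\s]+$, and
# "Chapter 2 ..." matches ^[A-Z][a-z]+\s\d, so those three lines gain a leading
# "# "; the body sentence is passed through unchanged.
```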
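The changed return statement now hands back two values, formatted_text and download_path. The Gradio wiring itself is not part of this diff, so the following is only a hypothetical sketch, assuming app.py exposes parse_pdf as shown above and maps the two return values onto a text box and a file download:

```python
import gradio as gr

from app import parse_pdf  # assumption: app.py defines parse_pdf as in the diff

demo = gr.Interface(
    fn=parse_pdf,
    inputs=[
        gr.File(label="PDF file"),
        gr.Radio(["JSON", "Markdown", "HTML"], label="Output format"),
    ],
    outputs=[
        gr.Textbox(label="Extracted text"),  # would receive formatted_text
        gr.File(label="Download"),           # would receive download_path
    ],
)

if __name__ == "__main__":
    demo.launch()
```

The progress=gr.Progress() default parameter is injected by Gradio automatically, so it is not listed among the inputs.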