Update app.py
app.py CHANGED
@@ -1,200 +1,214 @@
-import gradio as gr
import requests
import pdfplumber
from pdf2image import convert_from_path, convert_from_bytes
import pytesseract
from PIL import Image
-import os
-import io
-import re
-import urllib.parse
-from datetime import datetime
-from huggingface_hub import HfApi, create_repo
-import logging
-import subprocess

-#
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

-#
HF_TOKEN = os.getenv("HF_TOKEN")
-REPO_NAME = "pdf-images-extracted"
hf_api = HfApi()

def check_poppler():
    try:
-        result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True)
-        # pdftoppm -v typically prints version info to stderr
        version_info_log = result.stderr.strip() if result.stderr else result.stdout.strip()
        if version_info_log:
-
-            logger.info(f"Poppler version check: {version_info_log.splitlines()[0]}")
        else:
-            logger.info("Poppler 'pdftoppm -v' ran.")
-        # The main goal is to confirm 'pdftoppm' is executable.
-        # FileNotFoundError is the primary concern for "not found".
        return True
    except FileNotFoundError:
        logger.error("Poppler (pdftoppm command) not found. Ensure poppler-utils is installed and in PATH.")
        return False
-    except Exception as e:
        logger.error(f"An unexpected error occurred during Poppler check: {str(e)}")
        return False

def ensure_hf_dataset():
    try:
-        if not HF_TOKEN:
-            # This case should ideally be caught before attempting dataset operations
-            # However, having a check here is a good safeguard.
-            logger.error("HF_TOKEN is not set. Cannot ensure Hugging Face dataset.")
-            return "Error: HF_TOKEN is not set. Please configure it in Space secrets."
-
-        # Use hf_api instance which might be pre-configured with token, or pass token explicitly
-        # create_repo will use token from HfApi if initialized with one, or passed token, or env.
-        repo_id_obj = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
        logger.info(f"Dataset repo ensured: {repo_id_obj.repo_id}")
-        return repo_id_obj.repo_id
    except Exception as e:
-        logger.error(f"Hugging Face dataset error: {str(e)}")
-        return f"Error: Failed to access or create dataset '{REPO_NAME}': {str(e)}"

-def upload_image_to_hf(image_pil, filename_base):
-    # filename_base should not include extension, it will be added.
    repo_id_or_error = ensure_hf_dataset()
    if isinstance(repo_id_or_error, str) and repo_id_or_error.startswith("Error"):
-        return repo_id_or_error
-
-    repo_id = repo_id_or_error # Now it's confirmed to be the repo_id string

    try:
-
-
-
-
-
-
-
-
-
        file_url = hf_api.upload_file(
-            path_or_fileobj=temp_path,
            path_in_repo=repo_filename,
            repo_id=repo_id,
            repo_type="dataset",
-            token=HF_TOKEN
        )
-        os.remove(temp_path)
        logger.info(f"Successfully uploaded image: {file_url}")
        return file_url
    except Exception as e:
-        logger.error(f"Image upload error for {filename_base}: {str(e)}")
-
-        if os.path.exists(temp_path):
            try:
-                os.remove(temp_path)
            except OSError as ose:
-                logger.error(f"Error removing temp file {temp_path}: {ose}")
-        return f"Error uploading image {filename_base}: {str(e)}"

-def extract_text_from_pdf(pdf_input_source): # pdf_input_source is URL string or uploaded file object
    try:
-        if isinstance(pdf_input_source, str): # Indicates a URL
            logger.info(f"Fetching PDF from URL for text extraction: {pdf_input_source}")
-            response = requests.get(pdf_input_source, stream=True, timeout=
            response.raise_for_status()
            pdf_file_like_object = io.BytesIO(response.content)
            logger.info("PDF downloaded successfully from URL.")
-        else: # Uploaded file object
-            logger.info(f"Processing uploaded PDF file for text extraction.")
            pdf_file_like_object = pdf_input_source

        with pdfplumber.open(pdf_file_like_object) as pdf:
            full_text = ""
            for i, page in enumerate(pdf.pages):
-
-
-                full_text += page_text + "\n\n" # Add double newline as page separator
-
-                logger.debug(f"Extracting tables from page {i+1}")
                tables = page.extract_tables()
                if tables:
-                    for table_data in tables:
-
-
-
-
-
-
        logger.info("Text and table extraction successful.")
-        return full_text
    except Exception as e:
        logger.error(f"Text extraction error: {str(e)}", exc_info=True)
        return f"Error extracting text: {str(e)}"

-def extract_images_from_pdf(pdf_input_source): # pdf_input_source is URL string or uploaded file object
    if not check_poppler():
        return "Error: poppler-utils not found or not working correctly. Image extraction depends on it."

    try:
-
-        if isinstance(pdf_input_source, str): # Indicates a URL
            logger.info(f"Fetching PDF from URL for image extraction: {pdf_input_source}")
-            response = requests.get(pdf_input_source, stream=True, timeout=
            response.raise_for_status()
-            logger.info("PDF downloaded successfully, converting to images.")
-
-
-
-
-
-
-
-            images = convert_from_path(file_path, dpi=200)

-        logger.info(f"Successfully extracted {len(images)} image(s) from PDF.")
-        return images
    except Exception as e:
        logger.error(f"Image extraction error: {str(e)}", exc_info=True)
        return f"Error extracting images: {str(e)}"

-def format_to_markdown(text_content, images_list):
-    markdown_output = "# Extracted PDF Content\n\n"
-
-    # Normalize newlines: multiple consecutive newlines become a single blank line (two \n chars)
-    text_content = re.sub(r'\n\s*\n+', '\n\n', text_content.strip())
-
-    lines = text_content.split('\n') # Split by single newline. Blank lines between paragraphs become empty strings.
-
-    for i, line_text in enumerate(lines):
-        line_stripped = line_text.strip()
-
-        if not line_stripped: # Handle blank lines explicitly
-            # Add a single newline to markdown. This helps maintain paragraph separation.
-            markdown_output += "\n"
-            continue
-
-        # Regex for various list markers: "1.", "*", "-", "+" followed by space and content
-        list_match = re.match(r'^\s*(?:(?:\d+\.)|[*+-])\s+(.*)', line_stripped)
-
-        is_heading_candidate = line_stripped.isupper() and 5 < len(line_stripped) < 100 # Length constraint for ALL CAPS headings
-
-        if is_heading_candidate and not list_match: # Check it's not an ALL CAPS list item
-            markdown_output += f"## {line_stripped}\n\n"
-        elif list_match:
-            list_item_text = list_match.group(1) # Get the content part of the list item
-            markdown_output += f"- {list_item_text}\n" # Single newline for list items to keep them together
-        else:
-            # Default: treat as a paragraph line, add double newline for Markdown paragraph
-            markdown_output += f"{line_text}\n\n"

-
-    markdown_output =
-

-    if isinstance(images_list, list) and images_list:
        markdown_output += "## Extracted Images\n\n"
-        for i, img_pil in enumerate(images_list):
            ocr_text = ""
            try:
                ocr_text = pytesseract.image_to_string(img_pil).strip()
@@ -203,131 +217,138 @@ def format_to_markdown(text_content, images_list):
                logger.error(f"OCR error for image {i+1}: {str(ocr_e)}")
                ocr_text = f"OCR failed: {str(ocr_e)}"

-
-

-
-
-            if ocr_text and not ocr_text.startswith("OCR failed:"):
-                markdown_output += f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n"
-            elif ocr_text: # OCR failed message
-                markdown_output += f"**Image {i+1} OCR Note:** {ocr_text}\n\n"
-
-            else: # Error during upload or from ensure_hf_dataset
-                error_message = str(image_url_or_error) # Ensure it's a string
-                markdown_output += f"**Image {i+1} (Upload Error):** {error_message}\n\n"

    return markdown_output.strip()


-def process_pdf(pdf_file_upload, pdf_url_input):
-
-

-
-
-
-
-    # Let's allow text extraction to proceed but warn about images.
-    # For a stricter approach, uncomment return:
-    # return current_status, current_status
-
-    pdf_input_source = None
-
-    if pdf_url_input and pdf_url_input.strip():
-        resolved_url = urllib.parse.unquote(pdf_url_input.strip())
-        current_status = f"Attempting to download PDF from URL: {resolved_url}"
-        logger.info(current_status)
-        try:
-            # Use HEAD request to check URL validity and content type quickly
-            response = requests.head(resolved_url, allow_redirects=True, timeout=10)
-            response.raise_for_status()
-            content_type = response.headers.get('content-type', '').lower()
-            if 'application/pdf' not in content_type:
-                current_status = f"Error: URL does not point to a PDF file (Content-Type: {content_type})."
-                logger.error(current_status)
-                return current_status, current_status
-            pdf_input_source = resolved_url # Use the URL string as the source
-            logger.info("PDF URL validated.")
-        except requests.RequestException as e:
-            current_status = f"Error accessing URL '{resolved_url}': {str(e)}"
-            logger.error(current_status)
-            return current_status, current_status
-    elif pdf_file_upload:
-        # pdf_file_upload is a tempfile._TemporaryFileWrapper object from Gradio
-        pdf_input_source = pdf_file_upload
-        current_status = f"Processing uploaded PDF file: {pdf_file_upload.name}"
-        logger.info(current_status)
-    else:
-        current_status = "Error: No PDF file uploaded and no PDF URL provided."
-        logger.error(current_status)
-        return current_status, current_status
-
-    current_status = "Extracting text and tables from PDF..."
-    logger.info(current_status)
-    extracted_text = extract_text_from_pdf(pdf_input_source)
-    if isinstance(extracted_text, str) and extracted_text.startswith("Error extracting text:"):
-        current_status = f"Text extraction failed. {extracted_text}"
-        logger.error(current_status)
-        # Decide if to stop or continue for images
-        # For now, let's return the error directly
-        return extracted_text, current_status
-
-    # If pdf_input_source was a URL, extract_text_from_pdf already downloaded it.
-    # For extract_images_from_pdf, we need to pass the URL or file path again.
-    # If it was an uploaded file, its stream might have been consumed or pointer moved.
-    # It's safer to re-open/re-access for different libraries if they don't handle streams well.
-    # However, pdfplumber and pdf2image should handle file paths/objects correctly.
-    # If pdf_input_source is a file object, reset its read pointer if necessary.
-    if hasattr(pdf_input_source, 'seek') and not isinstance(pdf_input_source, str):
-        pdf_input_source.seek(0)
-
-    current_status = "Extracting images from PDF..."
-    logger.info(current_status)
-    extracted_images = extract_images_from_pdf(pdf_input_source)
-    if isinstance(extracted_images, str) and extracted_images.startswith("Error"): # Error string from extraction
-        current_status = f"Image extraction failed or partially failed. {extracted_images}"
-        logger.warning(current_status) # Warning, as text might still be useful
-        # We can proceed to format markdown with text and image error.
-        # Set images to empty list to avoid error in format_to_markdown
-        extracted_images = [] # Or pass the error string to be included by format_to_markdown
-        # Let format_to_markdown handle this, for now, we will pass the error string if it happened
-        # No, format_to_markdown expects a list of images or an error string from check_poppler
-        # if isinstance(extracted_images, str) -> it's an error string, that is fine.

-    markdown_result = format_to_markdown(extracted_text, extracted_images)
-
-
-
-
-
-    logger.info(current_status)
-    return markdown_result, current_status
-
-# Gradio Interface
-iface = gr.Interface(
-    fn=process_pdf,
-    inputs=[
-        gr.File(label="Upload PDF File", file_types=[".pdf"]),
-        gr.Textbox(label="Or Enter PDF URL", placeholder="e.g., https://example.com/file.pdf"),
-    ],
-    outputs=[
-        gr.Markdown(label="Markdown Output"),
-        gr.Textbox(label="Processing Status", interactive=False),
-    ],
-    title="PDF to Markdown Converter",
-    description="Convert a PDF (uploaded file or URL) to Markdown. Extracts text, tables, and images. Images are uploaded to a Hugging Face dataset. Requires HF_TOKEN in Spaces Secrets for image functionality.",
-    allow_flagging="never",
-)
-
-if __name__ == "__main__":
-    logger.info("Starting Gradio app...")
    try:
-
-
-
-
    except Exception as e:
-        logger.error(f"
-
-
+import os
+import io
+import re
+import logging
+import subprocess
+from datetime import datetime
+import urllib.parse
+import tempfile
+
+from flask import Flask, request, render_template, redirect, url_for
+from werkzeug.utils import secure_filename # For secure file handling
+
import requests
import pdfplumber
from pdf2image import convert_from_path, convert_from_bytes
import pytesseract
from PIL import Image
+from huggingface_hub import HfApi, create_repo, HfHubHTTPError
+
+# --- Flask App Initialization ---
+app = Flask(__name__)
+app.config['UPLOAD_FOLDER'] = tempfile.gettempdir() # Use system temp dir
+app.config['MAX_CONTENT_LENGTH'] = 30 * 1024 * 1024 # 30 MB limit for uploads

+# --- Logging Configuration ---
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

+# --- Hugging Face Configuration ---
HF_TOKEN = os.getenv("HF_TOKEN")
+HF_DATASET_REPO_NAME = os.getenv("HF_DATASET_REPO_NAME", "pdf-images-extracted") # Allow override via env var
hf_api = HfApi()

+
+# --- PDF Processing Helper Functions (Adapted from Gradio version) ---
+
def check_poppler():
    try:
+        result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True, check=False)
        version_info_log = result.stderr.strip() if result.stderr else result.stdout.strip()
        if version_info_log:
+            logger.info(f"Poppler version check: {version_info_log.splitlines()[0] if version_info_log else 'No version output'}")
        else:
+            logger.info("Poppler 'pdftoppm -v' ran. Assuming Poppler is present.")
        return True
    except FileNotFoundError:
        logger.error("Poppler (pdftoppm command) not found. Ensure poppler-utils is installed and in PATH.")
        return False
+    except Exception as e:
        logger.error(f"An unexpected error occurred during Poppler check: {str(e)}")
        return False

def ensure_hf_dataset():
+    if not HF_TOKEN:
+        logger.warning("HF_TOKEN is not set. Cannot ensure Hugging Face dataset. Image uploads will fail.")
+        return "Error: HF_TOKEN is not set. Please configure it in Space secrets for image uploads."
    try:
+        repo_id_obj = create_repo(repo_id=HF_DATASET_REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
        logger.info(f"Dataset repo ensured: {repo_id_obj.repo_id}")
+        return repo_id_obj.repo_id
+    except HfHubHTTPError as e:
+        if e.response.status_code == 409: # Conflict, repo already exists
+            logger.info(f"Dataset repo '{HF_DATASET_REPO_NAME}' already exists.")
+            return f"{hf_api.whoami(token=HF_TOKEN)['name']}/{HF_DATASET_REPO_NAME}" # Construct repo_id
+        logger.error(f"Hugging Face dataset error (HTTP {e.response.status_code}): {str(e)}")
+        return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}': {str(e)}"
    except Exception as e:
+        logger.error(f"Hugging Face dataset error: {str(e)}", exc_info=True)
+        return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}': {str(e)}"
+

+def upload_image_to_hf(image_pil, filename_base):
    repo_id_or_error = ensure_hf_dataset()
    if isinstance(repo_id_or_error, str) and repo_id_or_error.startswith("Error"):
+        return repo_id_or_error

+    repo_id = repo_id_or_error
+    temp_image_path = None
    try:
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+        repo_filename = f"images/{filename_base}_{timestamp}.png" # Path in repo
+
+        # Save PIL image to a temporary file to upload
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".png", dir=app.config['UPLOAD_FOLDER']) as tmp_file:
+            temp_image_path = tmp_file.name
+            image_pil.save(temp_image_path, format="PNG")
+
+        logger.info(f"Attempting to upload {temp_image_path} to {repo_id}/{repo_filename}")
        file_url = hf_api.upload_file(
+            path_or_fileobj=temp_image_path,
            path_in_repo=repo_filename,
            repo_id=repo_id,
            repo_type="dataset",
+            token=HF_TOKEN
        )
        logger.info(f"Successfully uploaded image: {file_url}")
        return file_url
    except Exception as e:
+        logger.error(f"Image upload error for {filename_base}: {str(e)}", exc_info=True)
+        return f"Error uploading image {filename_base}: {str(e)}"
+    finally:
+        if temp_image_path and os.path.exists(temp_image_path):
            try:
+                os.remove(temp_image_path)
            except OSError as ose:
+                logger.error(f"Error removing temp image file {temp_image_path}: {ose}")

+def extract_text_from_pdf(pdf_input_source): # pdf_input_source is URL string or local file path
    try:
+        pdf_file_like_object = None
+        if isinstance(pdf_input_source, str) and pdf_input_source.startswith(('http://', 'https://')):
            logger.info(f"Fetching PDF from URL for text extraction: {pdf_input_source}")
+            response = requests.get(pdf_input_source, stream=True, timeout=30)
            response.raise_for_status()
            pdf_file_like_object = io.BytesIO(response.content)
            logger.info("PDF downloaded successfully from URL.")
+        elif isinstance(pdf_input_source, str) and os.path.exists(pdf_input_source): # Local file path
+            logger.info(f"Processing local PDF file for text extraction: {pdf_input_source}")
+            # pdfplumber.open can take a path directly
            pdf_file_like_object = pdf_input_source
+        else:
+            logger.error(f"Invalid pdf_input_source for text extraction: {pdf_input_source}")
+            return "Error: Invalid input for PDF text extraction (must be URL or valid file path)."

        with pdfplumber.open(pdf_file_like_object) as pdf:
            full_text = ""
            for i, page in enumerate(pdf.pages):
+                page_text = page.extract_text(layout=True, x_density=1, y_density=1) or ""
+                full_text += page_text + "\n\n"
                tables = page.extract_tables()
                if tables:
+                    for table_data in tables:
+                        if table_data:
+                            header = [" | ".join(str(cell) if cell is not None else "" for cell in table_data[0])]
+                            separator = [" | ".join(["---"] * len(table_data[0]))]
+                            body = [" | ".join(str(cell) if cell is not None else "" for cell in row) for row in table_data[1:]]
+                            table_md_lines = header + separator + body
+                            full_text += f"**Table:**\n" + "\n".join(table_md_lines) + "\n\n"
        logger.info("Text and table extraction successful.")
+        return full_text.strip()
+    except requests.RequestException as e:
+        logger.error(f"URL fetch error for text extraction: {str(e)}", exc_info=True)
+        return f"Error fetching PDF from URL: {str(e)}"
    except Exception as e:
        logger.error(f"Text extraction error: {str(e)}", exc_info=True)
        return f"Error extracting text: {str(e)}"

+def extract_images_from_pdf(pdf_input_source): # pdf_input_source is URL string or local file path
    if not check_poppler():
        return "Error: poppler-utils not found or not working correctly. Image extraction depends on it."

+    images_pil = []
    try:
+        if isinstance(pdf_input_source, str) and pdf_input_source.startswith(('http://', 'https://')):
            logger.info(f"Fetching PDF from URL for image extraction: {pdf_input_source}")
+            response = requests.get(pdf_input_source, stream=True, timeout=30)
            response.raise_for_status()
+            logger.info("PDF downloaded successfully from URL, converting to images.")
+            images_pil = convert_from_bytes(response.content, dpi=200)
+        elif isinstance(pdf_input_source, str) and os.path.exists(pdf_input_source): # Local file path
+            logger.info(f"Processing local PDF file for image extraction: {pdf_input_source}")
+            images_pil = convert_from_path(pdf_input_source, dpi=200)
+        else:
+            logger.error(f"Invalid pdf_input_source for image extraction: {pdf_input_source}")
+            return "Error: Invalid input for PDF image extraction (must be URL or valid file path)."

+        logger.info(f"Successfully extracted {len(images_pil)} image(s) from PDF.")
+        return images_pil
+    except requests.RequestException as e:
+        logger.error(f"URL fetch error for image extraction: {str(e)}", exc_info=True)
+        return f"Error fetching PDF from URL for image extraction: {str(e)}"
    except Exception as e:
        logger.error(f"Image extraction error: {str(e)}", exc_info=True)
        return f"Error extracting images: {str(e)}"


+def format_to_markdown(text_content, images_input):
+    markdown_output = "# Extracted PDF Content\n\n"
+    if text_content.startswith("Error"): # If text extraction itself failed
+        markdown_output += f"**Text Extraction Note:**\n{text_content}\n\n"
+    else:
+        text_content = re.sub(r'\n\s*\n+', '\n\n', text_content.strip())
+        lines = text_content.split('\n')
+        is_in_list = False
+        for line_text in lines:
+            line_stripped = line_text.strip()
+            if not line_stripped:
+                markdown_output += "\n"
+                is_in_list = False
+                continue
+            list_match = re.match(r'^\s*(?:(?:\d+\.)|[*+-])\s+(.*)', line_stripped)
+            is_heading_candidate = line_stripped.isupper() and 5 < len(line_stripped) < 100
+            if is_heading_candidate and not list_match:
+                markdown_output += f"## {line_stripped}\n\n"
+                is_in_list = False
+            elif list_match:
+                list_item_text = list_match.group(1)
+                markdown_output += f"- {list_item_text}\n"
+                is_in_list = True
+            else:
+                if is_in_list: markdown_output += "\n"
+                markdown_output += f"{line_text}\n\n"
+                is_in_list = False
+        markdown_output = re.sub(r'\n\s*\n+', '\n\n', markdown_output.strip()) + "\n\n"

+    if isinstance(images_input, list) and images_input:
        markdown_output += "## Extracted Images\n\n"
+        if not HF_TOKEN:
+            markdown_output += "**Note:** `HF_TOKEN` not set. Images were extracted but not uploaded to Hugging Face Hub.\n\n"
+
+        for i, img_pil in enumerate(images_input):
            ocr_text = ""
            try:
                ocr_text = pytesseract.image_to_string(img_pil).strip()
            except Exception as ocr_e:
                logger.error(f"OCR error for image {i+1}: {str(ocr_e)}")
                ocr_text = f"OCR failed: {str(ocr_e)}"

+            if HF_TOKEN: # Only attempt upload if token is present
+                image_filename_base = f"extracted_image_{i+1}"
+                image_url_or_error = upload_image_to_hf(img_pil, image_filename_base)
+                if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
+                    markdown_output += f"![Image {i+1}]({image_url_or_error})\n"
+                else:
+                    markdown_output += f"**Image {i+1} (Upload Error):** {str(image_url_or_error)}\n\n"
+            else: # No token, show placeholder or local info if we were saving them locally
+                markdown_output += f"**Image {i+1} (not uploaded due to missing HF_TOKEN)**\n"
+
+            if ocr_text:
+                markdown_output += f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n"

+    elif isinstance(images_input, str) and images_input.startswith("Error"):
+        markdown_output += f"## Image Extraction Note\n\n{images_input}\n\n"

    return markdown_output.strip()

+# --- Flask Routes ---

+@app.route('/', methods=['GET'])
+def index():
+    return render_template('index.html')

|
245 |
+
def process_pdf_route():
|
246 |
+
pdf_file = request.files.get('pdf_file')
|
247 |
+
pdf_url = request.form.get('pdf_url', '').strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
248 |
|
249 |
+
status_message = "Starting PDF processing..."
|
250 |
+
error_message = None
|
251 |
+
markdown_output = None
|
252 |
+
temp_pdf_path = None
|
253 |
+
pdf_input_source = None # This will be a URL string or a local file path
|
254 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
255 |
try:
|
256 |
+
if pdf_file and pdf_file.filename:
|
257 |
+
if not pdf_file.filename.lower().endswith('.pdf'):
|
258 |
+
raise ValueError("Uploaded file is not a PDF.")
|
259 |
+
|
260 |
+
filename = secure_filename(pdf_file.filename)
|
261 |
+
# Save to a temporary file
|
262 |
+
fd, temp_pdf_path = tempfile.mkstemp(suffix=".pdf", prefix="upload_", dir=app.config['UPLOAD_FOLDER'])
|
263 |
+
os.close(fd) # close file descriptor from mkstemp
|
264 |
+
pdf_file.save(temp_pdf_path)
|
265 |
+
logger.info(f"Uploaded PDF saved to temporary path: {temp_pdf_path}")
|
266 |
+
pdf_input_source = temp_pdf_path
|
267 |
+
status_message = f"Processing uploaded PDF: {filename}"
|
268 |
+
|
269 |
+
elif pdf_url:
|
270 |
+
pdf_url = urllib.parse.unquote(pdf_url)
|
271 |
+
# Basic URL validation
|
272 |
+
if not (pdf_url.startswith('http://') or pdf_url.startswith('https://')):
|
273 |
+
raise ValueError("Invalid URL scheme. Must be http or https.")
|
274 |
+
if not pdf_url.lower().endswith('.pdf'):
|
275 |
+
logger.warning(f"URL {pdf_url} does not end with .pdf. Proceeding with caution.")
|
276 |
+
# Allow proceeding but log warning, actual check is content-type or processing error
|
277 |
+
|
278 |
+
# Quick check with HEAD request (optional, but good practice)
|
279 |
+
try:
|
280 |
+
head_resp = requests.head(pdf_url, allow_redirects=True, timeout=10)
|
281 |
+
head_resp.raise_for_status()
|
282 |
+
content_type = head_resp.headers.get('content-type', '').lower()
|
283 |
+
if 'application/pdf' not in content_type:
|
284 |
+
logger.warning(f"URL {pdf_url} content-type is '{content_type}', not 'application/pdf'.")
|
285 |
+
# Depending on strictness, could raise ValueError here
|
286 |
+
except requests.RequestException as re:
|
287 |
+
logger.error(f"Failed HEAD request for URL {pdf_url}: {re}")
|
288 |
+
# Proceed, main request in extract functions will handle final failure
|
289 |
+
|
290 |
+
pdf_input_source = pdf_url
|
291 |
+
status_message = f"Processing PDF from URL: {pdf_url}"
|
292 |
+
else:
|
293 |
+
raise ValueError("No PDF file uploaded and no PDF URL provided.")
|
294 |
+
|
295 |
+
# --- Core Processing ---
|
296 |
+
status_message += "\nExtracting text..."
|
297 |
+
logger.info(status_message)
|
298 |
+
extracted_text = extract_text_from_pdf(pdf_input_source)
|
299 |
+
if isinstance(extracted_text, str) and extracted_text.startswith("Error"):
|
300 |
+
# Let format_to_markdown handle displaying this error within its structure
|
301 |
+
logger.error(f"Text extraction resulted in error: {extracted_text}")
|
302 |
+
|
303 |
+
status_message += "\nExtracting images..."
|
304 |
+
logger.info(status_message)
|
305 |
+
extracted_images = extract_images_from_pdf(pdf_input_source) # list of PIL images or error string
|
306 |
+
if isinstance(extracted_images, str) and extracted_images.startswith("Error"):
|
307 |
+
logger.error(f"Image extraction resulted in error: {extracted_images}")
|
308 |
+
|
309 |
+
status_message += "\nFormatting to Markdown..."
|
310 |
+
logger.info(status_message)
|
311 |
+
markdown_output = format_to_markdown(extracted_text, extracted_images)
|
312 |
+
|
313 |
+
status_message = "Processing complete."
|
314 |
+
if isinstance(extracted_text, str) and extracted_text.startswith("Error"):
|
315 |
+
status_message += f" (Text extraction issues: {extracted_text.split(':', 1)[1].strip()})"
|
316 |
+
if isinstance(extracted_images, str) and extracted_images.startswith("Error"):
|
317 |
+
status_message += f" (Image extraction issues: {extracted_images.split(':', 1)[1].strip()})"
|
318 |
+
if not HF_TOKEN and isinstance(extracted_images, list) and extracted_images:
|
319 |
+
status_message += " (Note: HF_TOKEN not set, images not uploaded to Hub)"
|
320 |
+
|
321 |
+
|
322 |
+
except ValueError as ve:
|
323 |
+
logger.error(f"Input validation error: {str(ve)}")
|
324 |
+
error_message = str(ve)
|
325 |
+
status_message = "Processing failed."
|
326 |
except Exception as e:
|
327 |
+
logger.error(f"An unexpected error occurred during processing: {str(e)}", exc_info=True)
|
328 |
+
error_message = f"An unexpected error occurred: {str(e)}"
|
329 |
+
status_message = "Processing failed due to an unexpected error."
|
330 |
+
finally:
|
331 |
+
if temp_pdf_path and os.path.exists(temp_pdf_path):
|
332 |
+
try:
|
333 |
+
os.remove(temp_pdf_path)
|
334 |
+
logger.info(f"Removed temporary PDF: {temp_pdf_path}")
|
335 |
+
except OSError as ose:
|
336 |
+
logger.error(f"Error removing temporary PDF {temp_pdf_path}: {ose}")
|
337 |
+
|
338 |
+
return render_template('index.html',
|
339 |
+
markdown_output=markdown_output,
|
340 |
+
status_message=status_message,
|
341 |
+
error_message=error_message)
|
342 |
+
|
343 |
+
|
344 |
+
+# --- Main Execution ---
+if __name__ == '__main__':
+    # This is for local development. For Hugging Face Spaces, Gunicorn is used via Dockerfile CMD.
+    # Poppler check at startup for local dev convenience
+    if not check_poppler():
+        logger.warning("Poppler utilities might not be installed correctly. PDF processing might fail.")
+
+    # Ensure UPLOAD_FOLDER exists
+    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+
+    app.run(host='0.0.0.0', port=int(os.getenv("PORT", 7860)), debug=True)
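
The table handling in the new `extract_text_from_pdf` converts each `page.extract_tables()` result into pipe-delimited Markdown: a header row, a `---` separator row, then the body rows. A standalone sketch of that same transformation, using hypothetical sample data in the nested-list shape `pdfplumber` returns:

```python
# Standalone sketch of the table-to-Markdown step in extract_text_from_pdf.
# `table_data` is hypothetical sample data mimicking pdfplumber's
# page.extract_tables() output: a list of rows, each a list of cell
# values (None for empty cells).
table_data = [
    ["Name", "Qty", None],
    ["Widget", "2", "in stock"],
]

# First row becomes the header, a "---" row separates it, the rest is the body.
header = " | ".join(str(cell) if cell is not None else "" for cell in table_data[0])
separator = " | ".join(["---"] * len(table_data[0]))
body = [" | ".join(str(cell) if cell is not None else "" for cell in row) for row in table_data[1:]]
print("\n".join([header, separator] + body))
# Prints the header row, the --- separator, then "Widget | 2 | in stock"
```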
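`format_to_markdown` classifies each text line with two heuristics: a list-marker regex (`1.`, `*`, `-`, `+` followed by whitespace) and an ALL-CAPS heading check bounded to 6–99 characters, with the heading branch taken only when the line is not also a list item. A quick standalone check of the same rules, with hypothetical sample lines:

```python
import re

# Same regex and length bounds as format_to_markdown above.
LIST_RE = re.compile(r'^\s*(?:(?:\d+\.)|[*+-])\s+(.*)')

def classify(line: str) -> str:
    s = line.strip()
    if not s:
        return "blank"
    m = LIST_RE.match(s)
    heading = s.isupper() and 5 < len(s) < 100
    if heading and not m:
        return "heading"          # ALL CAPS, not a list item
    if m:
        return "list item: " + m.group(1)
    return "paragraph"

# "1. FIRST ITEM" is ALL CAPS too, but the list match takes precedence.
for sample in ["INTRODUCTION AND SCOPE", "1. FIRST ITEM", "- bullet text", "Plain sentence."]:
    print(f"{sample!r} -> {classify(sample)}")
```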
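The `/process` route accepts either a multipart upload in the `pdf_file` field or a `pdf_url` form field, and responds with the rendered `index.html` template (which this commit assumes exists under `templates/`). A minimal client sketch using `requests`, assuming the app is running locally on its default port 7860; the file path and URL below are placeholders:

```python
import requests

BASE = "http://localhost:7860"  # app.run's default port above

# Multipart upload; the field name must match request.files.get('pdf_file').
with open("sample.pdf", "rb") as fh:  # placeholder path
    resp = requests.post(f"{BASE}/process",
                         files={"pdf_file": ("sample.pdf", fh, "application/pdf")})
print(resp.status_code)  # the route responds with rendered HTML, not JSON

# Alternatively, submit a remote PDF via the 'pdf_url' form field.
resp = requests.post(f"{BASE}/process",
                     data={"pdf_url": "https://example.com/file.pdf"})
print(resp.status_code)
```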