Spaces:

broadfield-dev
/

pdf2markdown

Sleeping

App Files Files Community

broadfield-dev committed on Jun 2

Commit

81314aa

verified ·

1 Parent(s): 57afa22

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -80

app.py CHANGED Viewed

@@ -14,16 +14,15 @@ import logging
 import subprocess
 # Set up logging
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # Initialize Hugging Face API
-HF_TOKEN = os.getenv("HF_TOKEN")  # Set in Hugging Face Spaces Secrets
-REPO_NAME = "pdf-images-extracted"  # Hugging Face dataset repo
 hf_api = HfApi()
 def check_poppler():
-    """Check if poppler-utils is installed."""
     try:
         result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True)
         logger.info(f"Poppler version: {result.stdout}")
@@ -33,29 +32,23 @@ def check_poppler():
         return False
 def ensure_hf_dataset():
-    """Create or get Hugging Face dataset repository."""
     try:
         if not HF_TOKEN:
             raise ValueError("HF_TOKEN is not set")
         repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
-        logger.info(f"Successfully accessed/created dataset repo: {repo_id}")
         return repo_id
     except Exception as e:
-        logger.error(f"Failed to create/access dataset repo: {str(e)}")
-        return f"Error: Failed to create/access dataset repo: {str(e)}"
 def upload_image_to_hf(image, filename):
-    """Upload an image to Hugging Face dataset and return its URL."""
     repo_id = ensure_hf_dataset()
     if isinstance(repo_id, str) and repo_id.startswith("Error"):
         return repo_id
     try:
-        # Save image temporarily
         temp_path = f"/tmp/temp_{filename}.png"
         image.save(temp_path, format="PNG")
-        # Upload to Hugging Face dataset
         file_url = hf_api.upload_file(
             path_or_fileobj=temp_path,
             path_in_repo=f"images/{filename}.png",
@@ -64,20 +57,19 @@ def upload_image_to_hf(image, filename):
             token=HF_TOKEN
         )
         os.remove(temp_path)
-        logger.info(f"Uploaded image to: {file_url}")
         return file_url
     except Exception as e:
-        logger.error(f"Error uploading image: {str(e)}")
         return f"Error uploading image: {str(e)}"
 def extract_text_from_pdf(pdf_input):
-    """Extract text from PDF using pdfplumber."""
     try:
-        if isinstance(pdf_input, str):  # URL case
-            response = requests.get(pdf_input, stream=True)
             response.raise_for_status()
             pdf_file = io.BytesIO(response.content)
-        else:  # File upload case
             pdf_file = pdf_input
         with pdfplumber.open(pdf_file) as pdf:
             text = ""
@@ -89,46 +81,35 @@ def extract_text_from_pdf(pdf_input):
                     text += "**Table:**\n" + "\n".join([" | ".join(str(cell) for cell in row) for row in table]) + "\n\n"
         return text
     except Exception as e:
-        logger.error(f"Error extracting text: {str(e)}")
         return f"Error extracting text: {str(e)}"
 def extract_images_from_pdf(pdf_input):
-    """Extract images from PDF and convert to PIL images."""
     if not check_poppler():
-        return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."
     try:
-        if isinstance(pdf_input, str):  # URL case
-            logger.info(f"Downloading PDF from URL: {pdf_input}")
-            response = requests.get(pdf_input, stream=True)
             response.raise_for_status()
             images = convert_from_bytes(response.content)
-        else:  # File upload case
-            logger.info(f"Processing uploaded PDF: {pdf_input.name}")
             images = convert_from_path(pdf_input.name)
         return images
     except Exception as e:
-        logger.error(f"Error extracting images: {str(e)}")
         return f"Error extracting images: {str(e)}"
 def format_to_markdown(text, images):
-    """Convert extracted text and images to Markdown format."""
     markdown_output = "# Extracted PDF Content\n\n"
-    # Clean and format text
-    text = re.sub(r'\n\s*\n+', '\n\n', text.strip())  # Normalize newlines
     lines = text.split("\n")
     for line in lines:
-        # Detect headings (heuristic: all caps or specific keywords)
         if line.isupper() and len(line) > 5:
             markdown_output += f"## {line}\n\n"
-        # Detect lists (lines starting with numbers or bullets)
         elif re.match(r'^\s*[\d\-*+]\.\s+', line):
             markdown_output += f"- {line.strip()[2:]}\n"
         else:
             markdown_output += f"{line}\n\n"
-    # Add images with Hugging Face dataset URLs
     if isinstance(images, list) and images:
         markdown_output += "## Extracted Images\n\n"
         for i, image in enumerate(images):
@@ -136,82 +117,79 @@ def format_to_markdown(text, images):
             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
             filename = f"image_{i}_{timestamp}"
             image_url = upload_image_to_hf(image, filename)
             if not image_url.startswith("Error"):
                 markdown_output += f"![Image {i+1}]({image_url})\n"
                 if ocr_text:
                     markdown_output += f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n"
             else:
                 markdown_output += f"**Image {i+1} Error:** {image_url}\n\n"
     return markdown_output
 def process_pdf(pdf_input, pdf_url):
-    """Main function to process PDF input (file or URL) and generate Markdown."""
-    status = ["Starting PDF processing..."]
-    logger.info("Starting PDF processing at %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S PDT"))
-    def update_status(message):
-        status[0] = message
-        return status[0]
     if not HF_TOKEN:
-        update_status("Error: HF_TOKEN not set.")
-        return "Error: HF_TOKEN not set in Spaces Secrets.", status[0]
-    # Log poppler status
-    logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
-    update_status("Checking poppler-utils...")
-    # Decode URL-encoded string if provided
     if pdf_url and pdf_url.strip():
         pdf_url = urllib.parse.unquote(pdf_url)
-        logger.info(f"Decoded URL: {pdf_url}")
-        update_status(f"Downloading PDF from URL: {pdf_url}")
         try:
-            response = requests.head(pdf_url, allow_redirects=True)
             response.raise_for_status()
             pdf_input = pdf_url
         except requests.RequestException as e:
-            logger.error(f"Error accessing URL: {str(e)}")
-            update_status(f"Error accessing URL: {str(e)}")
-            return f"Error accessing URL: {str(e)}", status[0]
     elif not pdf_input:
-        update_status("Error: No PDF provided.")
-        return "Error: Please provide a PDF file or URL.", status[0]
-    update_status("Extracting text from PDF...")
     text = extract_text_from_pdf(pdf_input)
-    update_status("Extracting images from PDF...")
-    images = extract_images_from_pdf(pdf_input)
     if isinstance(text, str) and text.startswith("Error"):
-        update_status("Text extraction failed.")
-        return text, status[0]
     if isinstance(images, str) and images.startswith("Error"):
-        update_status("Image extraction failed.")
-        return images, status[0]
-    update_status("Formatting output as Markdown...")
     markdown_output = format_to_markdown(text, images)
-    update_status("Processing complete.")
-    return markdown_output, status[0]
 # Gradio Interface
 iface = gr.Interface(
     fn=process_pdf,
     inputs=[
-        gr.File(label="Upload PDF File", type="filepath"),
-        gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF"),
     ],
     outputs=[
         gr.Markdown(label="Markdown Output"),
         gr.Textbox(label="Processing Status", interactive=False),
     ],
     title="PDF to Markdown Converter",
-    description="Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets.",
     allow_flagging="never"
 )
 if __name__ == "__main__":
-    iface.launch(server_name="0.0.0.0", server_port=7860)

 import subprocess
 # Set up logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 logger = logging.getLogger(__name__)
 # Initialize Hugging Face API
+HF_TOKEN = os.getenv("HF_TOKEN")
+REPO_NAME = "pdf-images-extracted"
 hf_api = HfApi()
 def check_poppler():
     try:
         result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True)
         logger.info(f"Poppler version: {result.stdout}")
         return False
 def ensure_hf_dataset():
     try:
         if not HF_TOKEN:
             raise ValueError("HF_TOKEN is not set")
         repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
+        logger.info(f"Dataset repo: {repo_id}")
         return repo_id
     except Exception as e:
+        logger.error(f"Dataset error: {str(e)}")
+        return f"Error: Failed to access dataset: {str(e)}"
 def upload_image_to_hf(image, filename):
     repo_id = ensure_hf_dataset()
     if isinstance(repo_id, str) and repo_id.startswith("Error"):
         return repo_id
     try:
         temp_path = f"/tmp/temp_{filename}.png"
         image.save(temp_path, format="PNG")
         file_url = hf_api.upload_file(
             path_or_fileobj=temp_path,
             path_in_repo=f"images/{filename}.png",
             token=HF_TOKEN
         )
         os.remove(temp_path)
+        logger.info(f"Uploaded image: {file_url}")
         return file_url
     except Exception as e:
+        logger.error(f"Image upload error: {str(e)}")
         return f"Error uploading image: {str(e)}"
 def extract_text_from_pdf(pdf_input):
     try:
+        if isinstance(pdf_input, str):
+            response = requests.get(pdf_input, stream=True, timeout=10)
             response.raise_for_status()
             pdf_file = io.BytesIO(response.content)
+        else:
             pdf_file = pdf_input
         with pdfplumber.open(pdf_file) as pdf:
             text = ""
                     text += "**Table:**\n" + "\n".join([" | ".join(str(cell) for cell in row) for row in table]) + "\n\n"
         return text
     except Exception as e:
+        logger.error(f"Text extraction error: {str(e)}")
         return f"Error extracting text: {str(e)}"
 def extract_images_from_pdf(pdf_input):
     if not check_poppler():
+        return "Error: poppler-utils not found."
     try:
+        if isinstance(pdf_input, str):
+            response = requests.get(pdf_input, stream=True, timeout=10)
             response.raise_for_status()
             images = convert_from_bytes(response.content)
+        else:
             images = convert_from_path(pdf_input.name)
         return images
     except Exception as e:
+        logger.error(f"Image extraction error: {str(e)}")
         return f"Error extracting images: {str(e)}"
 def format_to_markdown(text, images):
     markdown_output = "# Extracted PDF Content\n\n"
+    text = re.sub(r'\n\s*\n+', '\n\n', text.strip())
     lines = text.split("\n")
     for line in lines:
         if line.isupper() and len(line) > 5:
             markdown_output += f"## {line}\n\n"
         elif re.match(r'^\s*[\d\-*+]\.\s+', line):
             markdown_output += f"- {line.strip()[2:]}\n"
         else:
             markdown_output += f"{line}\n\n"
     if isinstance(images, list) and images:
         markdown_output += "## Extracted Images\n\n"
         for i, image in enumerate(images):
             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
             filename = f"image_{i}_{timestamp}"
             image_url = upload_image_to_hf(image, filename)
             if not image_url.startswith("Error"):
                 markdown_output += f"![Image {i+1}]({image_url})\n"
                 if ocr_text:
                     markdown_output += f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n"
             else:
                 markdown_output += f"**Image {i+1} Error:** {image_url}\n\n"
     return markdown_output
 def process_pdf(pdf_input, pdf_url):
+    status = "Starting PDF processing..."
+    logger.info(status)
     if not HF_TOKEN:
+        status = "Error: HF_TOKEN not set."
+        logger.error(status)
+        return status, status
     if pdf_url and pdf_url.strip():
         pdf_url = urllib.parse.unquote(pdf_url)
+        status = f"Downloading PDF from URL: {pdf_url}"
+        logger.info(status)
         try:
+            response = requests.head(pdf_url, allow_redirects=True, timeout=5)
             response.raise_for_status()
             pdf_input = pdf_url
         except requests.RequestException as e:
+            status = f"Error accessing URL: {str(e)}"
+            logger.error(status)
+            return status, status
     elif not pdf_input:
+        status = "Error: No PDF provided."
+        logger.error(status)
+        return status, status
+    status = "Extracting text..."
+    logger.info(status)
     text = extract_text_from_pdf(pdf_input)
     if isinstance(text, str) and text.startswith("Error"):
+        status = "Text extraction failed."
+        logger.error(status)
+        return text, status
+    status = "Extracting images..."
+    logger.info(status)
+    images = extract_images_from_pdf(pdf_input)
     if isinstance(images, str) and images.startswith("Error"):
+        status = "Image extraction failed."
+        logger.error(status)
+        return images, status
+    status = "Formatting output..."
+    logger.info(status)
     markdown_output = format_to_markdown(text, images)
+    status = "Processing complete."
+    logger.info(status)
+    return markdown_output, status
 # Gradio Interface
 iface = gr.Interface(
     fn=process_pdf,
     inputs=[
+        gr.File(label="Upload PDF File", file_types=[".pdf"]),
+        gr.Textbox(label="PDF URL", placeholder="Enter PDF URL (e.g., https://example.com/file.pdf)"),
     ],
     outputs=[
         gr.Markdown(label="Markdown Output"),
         gr.Textbox(label="Processing Status", interactive=False),
     ],
     title="PDF to Markdown Converter",
+    description="Convert a PDF file or URL to Markdown. Extracts text, images, and tables, with images uploaded to a Hugging Face dataset. Supports URL-encoded strings. Requires HF_TOKEN in Spaces Secrets.",
     allow_flagging="never"
 )
 if __name__ == "__main__":
+    logger.info("Starting Gradio app...")
+    try:
+        iface.launch(server_name="0.0.0.0", server_port=7860, prevent_thread_lock=True)
+        logger.info("Gradio app started successfully.")
+    except Exception as e:
+        logger.error(f"Failed to start Gradio app: {str(e)}")
+        raise