broadfield-dev committed on
Commit
aec5733
·
verified ·
1 Parent(s): 5010ab5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -37
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import gradio as gr
2
  import requests
3
- import PyPDF2
4
  from pdf2image import convert_from_path, convert_from_bytes
5
  import pytesseract
6
  from PIL import Image
@@ -19,7 +19,7 @@ logger = logging.getLogger(__name__)
19
 
20
  # Initialize Hugging Face API
21
  HF_TOKEN = os.getenv("HF_TOKEN") # Set in Hugging Face Spaces Secrets
22
- REPO_NAME = "broadfield-dev/pdf-images-extracted" # Hugging Face dataset repo
23
  hf_api = HfApi()
24
 
25
  def check_poppler():
@@ -36,11 +36,11 @@ def ensure_hf_dataset():
36
  """Create or get Hugging Face dataset repository."""
37
  try:
38
  repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
39
- logger.info(f"Using dataset repo: {repo_id}")
40
  return repo_id
41
  except Exception as e:
42
- logger.error(f"Error creating dataset repo: {str(e)}")
43
- return f"Error creating dataset repo: {str(e)}"
44
 
45
  def upload_image_to_hf(image, filename):
46
  """Upload an image to Hugging Face dataset and return its URL."""
@@ -68,8 +68,9 @@ def upload_image_to_hf(image, filename):
68
  logger.error(f"Error uploading image: {str(e)}")
69
  return f"Error uploading image: {str(e)}"
70
 
71
- def extract_text_from_pdf(pdf_input):
72
- """Extract text from PDF (URL or file) using PyPDF2."""
 
73
  try:
74
  if isinstance(pdf_input, str): # URL case
75
  response = requests.get(pdf_input, stream=True)
@@ -77,18 +78,22 @@ def extract_text_from_pdf(pdf_input):
77
  pdf_file = io.BytesIO(response.content)
78
  else: # File upload case
79
  pdf_file = pdf_input
80
- reader = PyPDF2.PdfReader(pdf_file)
81
- text = ""
82
- for page in reader.pages:
83
- page_text = page.extract_text() or ""
84
- text += page_text + "\n\n"
 
 
 
85
  return text
86
  except Exception as e:
87
  logger.error(f"Error extracting text: {str(e)}")
88
  return f"Error extracting text: {str(e)}"
89
 
90
- def extract_images_from_pdf(pdf_input):
91
- """Extract images from PDF (URL or file) and convert to PIL images."""
 
92
  if not check_poppler():
93
  return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."
94
 
@@ -106,15 +111,16 @@ def extract_images_from_pdf(pdf_input):
106
  logger.error(f"Error extracting images: {str(e)}")
107
  return f"Error extracting images: {str(e)}"
108
 
109
- def format_to_markdown(text, images):
110
  """Convert extracted text and images to Markdown format."""
 
111
  markdown_output = "# Extracted PDF Content\n\n"
112
 
113
  # Clean and format text
114
  text = re.sub(r'\n\s*\n', '\n\n', text.strip()) # Remove excessive newlines
115
  lines = text.split("\n")
116
  for line in lines:
117
- # Detect headings (simple heuristic: all caps or specific keywords)
118
  if line.isupper() and len(line) > 5:
119
  markdown_output += f"## {line}\n\n"
120
  # Detect lists (lines starting with numbers or bullets)
@@ -127,6 +133,7 @@ def format_to_markdown(text, images):
127
  if isinstance(images, list) and images:
128
  markdown_output += "## Extracted Images\n\n"
129
  for i, image in enumerate(images):
 
130
  ocr_text = pytesseract.image_to_string(image).strip()
131
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
132
  filename = f"image_{i}_{timestamp}"
@@ -141,11 +148,14 @@ def format_to_markdown(text, images):
141
 
142
  return markdown_output
143
 
144
- def process_pdf(pdf_input, pdf_url):
145
  """Main function to process PDF input (file or URL) and generate Markdown."""
146
  logger.info("Starting PDF processing at %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S PDT"))
 
 
147
  if not HF_TOKEN:
148
- return "Error: HF_TOKEN not set in Spaces Secrets."
 
149
 
150
  # Log poppler status
151
  logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
@@ -154,39 +164,55 @@ def process_pdf(pdf_input, pdf_url):
154
  if pdf_url and pdf_url.strip():
155
  pdf_url = urllib.parse.unquote(pdf_url)
156
  logger.info(f"Decoded URL: {pdf_url}")
 
157
  try:
158
  response = requests.head(pdf_url, allow_redirects=True)
159
  response.raise_for_status()
160
  pdf_input = pdf_url
161
  except requests.RequestException as e:
162
  logger.error(f"Error accessing URL: {str(e)}")
163
- return f"Error accessing URL: {str(e)}"
 
164
  elif not pdf_input:
165
- return "Error: Please provide a PDF file or URL."
 
166
 
167
- text = extract_text_from_pdf(pdf_input)
168
- images = extract_images_from_pdf(pdf_input)
169
 
170
  if isinstance(text, str) and text.startswith("Error"):
171
- return text
 
172
  if isinstance(images, str) and images.startswith("Error"):
173
- return images
 
174
 
175
- markdown_output = format_to_markdown(text, images)
176
- return markdown_output
 
177
 
178
  # Gradio Interface
179
- iface = gr.Interface(
180
- fn=process_pdf,
181
- inputs=[
182
- gr.File(label="Upload PDF File", type="filepath"),
183
- gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF (supports URL-encoded strings with spaces)"),
184
- ],
185
- outputs=gr.Markdown(label="Markdown Output"),
186
- title="PDF to Markdown Converter",
187
- description="Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets. Uses Docker to ensure poppler-utils and tesseract-ocr are installed.",
188
- flagging_dir="/tmp/flagged" # Set writable flagging directory
189
- )
 
 
 
 
 
 
 
 
 
 
190
 
191
  if __name__ == "__main__":
192
  # In Hugging Face Spaces, share=False is sufficient as Spaces handles the server
 
1
  import gradio as gr
2
  import requests
3
+ import pdfplumber
4
  from pdf2image import convert_from_path, convert_from_bytes
5
  import pytesseract
6
  from PIL import Image
 
19
 
20
  # Initialize Hugging Face API
21
  HF_TOKEN = os.getenv("HF_TOKEN") # Set in Hugging Face Spaces Secrets
22
+ REPO_NAME = "pdf-images-extracted" # Hugging Face dataset repo
23
  hf_api = HfApi()
24
 
25
  def check_poppler():
 
36
  """Create or get Hugging Face dataset repository."""
37
  try:
38
  repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
39
+ logger.info(f"Successfully accessed/created dataset repo: {repo_id}")
40
  return repo_id
41
  except Exception as e:
42
+ logger.error(f"Failed to create/access dataset repo: {str(e)}")
43
+ return f"Error: Failed to create/access dataset repo: {str(e)}"
44
 
45
  def upload_image_to_hf(image, filename):
46
  """Upload an image to Hugging Face dataset and return its URL."""
 
68
  logger.error(f"Error uploading image: {str(e)}")
69
  return f"Error uploading image: {str(e)}"
70
 
71
+ def extract_text_from_pdf(pdf_input, status_callback):
72
+ """Extract text from PDF using pdfplumber."""
73
+ status_callback("Extracting text from PDF...")
74
  try:
75
  if isinstance(pdf_input, str): # URL case
76
  response = requests.get(pdf_input, stream=True)
 
78
  pdf_file = io.BytesIO(response.content)
79
  else: # File upload case
80
  pdf_file = pdf_input
81
+ with pdfplumber.open(pdf_file) as pdf:
82
+ text = ""
83
+ for page in pdf.pages:
84
+ page_text = page.extract_text() or ""
85
+ text += page_text + "\n\n"
86
+ tables = page.extract_tables()
87
+ for table in tables:
88
+ text += "**Table:**\n" + "\n".join([" | ".join(str(cell) for cell in row) for row in table]) + "\n\n"
89
  return text
90
  except Exception as e:
91
  logger.error(f"Error extracting text: {str(e)}")
92
  return f"Error extracting text: {str(e)}"
93
 
94
+ def extract_images_from_pdf(pdf_input, status_callback):
95
+ """Extract images from PDF and convert to PIL images."""
96
+ status_callback("Extracting images from PDF...")
97
  if not check_poppler():
98
  return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."
99
 
 
111
  logger.error(f"Error extracting images: {str(e)}")
112
  return f"Error extracting images: {str(e)}"
113
 
114
+ def format_to_markdown(text, images, status_callback):
115
  """Convert extracted text and images to Markdown format."""
116
+ status_callback("Formatting output as Markdown...")
117
  markdown_output = "# Extracted PDF Content\n\n"
118
 
119
  # Clean and format text
120
  text = re.sub(r'\n\s*\n', '\n\n', text.strip()) # Remove excessive newlines
121
  lines = text.split("\n")
122
  for line in lines:
123
+ # Detect headings (heuristic: all caps or specific keywords)
124
  if line.isupper() and len(line) > 5:
125
  markdown_output += f"## {line}\n\n"
126
  # Detect lists (lines starting with numbers or bullets)
 
133
  if isinstance(images, list) and images:
134
  markdown_output += "## Extracted Images\n\n"
135
  for i, image in enumerate(images):
136
+ status_callback(f"Uploading image {i+1}...")
137
  ocr_text = pytesseract.image_to_string(image).strip()
138
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
139
  filename = f"image_{i}_{timestamp}"
 
148
 
149
  return markdown_output
150
 
151
+ def process_pdf(pdf_input, pdf_url, status_callback):
152
  """Main function to process PDF input (file or URL) and generate Markdown."""
153
  logger.info("Starting PDF processing at %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S PDT"))
154
+ status_callback("Starting PDF processing...")
155
+
156
  if not HF_TOKEN:
157
+ status_callback("Error: HF_TOKEN not set.")
158
+ return "Error: HF_TOKEN not set in Spaces Secrets.", ""
159
 
160
  # Log poppler status
161
  logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
 
164
  if pdf_url and pdf_url.strip():
165
  pdf_url = urllib.parse.unquote(pdf_url)
166
  logger.info(f"Decoded URL: {pdf_url}")
167
+ status_callback(f"Downloading PDF from URL: {pdf_url}")
168
  try:
169
  response = requests.head(pdf_url, allow_redirects=True)
170
  response.raise_for_status()
171
  pdf_input = pdf_url
172
  except requests.RequestException as e:
173
  logger.error(f"Error accessing URL: {str(e)}")
174
+ status_callback(f"Error accessing URL: {str(e)}")
175
+ return f"Error accessing URL: {str(e)}", ""
176
  elif not pdf_input:
177
+ status_callback("Error: No PDF provided.")
178
+ return "Error: Please provide a PDF file or URL.", ""
179
 
180
+ text = extract_text_from_pdf(pdf_input, status_callback)
181
+ images = extract_images_from_pdf(pdf_input, status_callback)
182
 
183
  if isinstance(text, str) and text.startswith("Error"):
184
+ status_callback("Text extraction failed.")
185
+ return text, ""
186
  if isinstance(images, str) and images.startswith("Error"):
187
+ status_callback("Image extraction failed.")
188
+ return images, ""
189
 
190
+ markdown_output = format_to_markdown(text, images, status_callback)
191
+ status_callback("Processing complete.")
192
+ return markdown_output, ""
193
 
194
  # Gradio Interface
195
with gr.Blocks() as iface:
    gr.Markdown("# PDF to Markdown Converter")
    gr.Markdown("Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets.")

    with gr.Row():
        pdf_input = gr.File(label="Upload PDF File", type="filepath")
        pdf_url = gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF")

    status = gr.Textbox(label="Processing Status", interactive=False)
    output = gr.Markdown(label="Markdown Output")

    submit_btn = gr.Button("Process PDF")

    def _run_process(pdf_file, url):
        """Adapter between the Gradio click event and process_pdf.

        Gradio `inputs` must be UI components, so the `status_callback`
        parameter of process_pdf cannot be wired as an input component
        (passing a plain function there raises at startup). Instead,
        collect the status messages emitted during processing and surface
        the most recent one in the status textbox.

        Returns a (markdown, status_message) tuple matching `outputs`.
        """
        messages = []
        markdown, _ = process_pdf(pdf_file, url, messages.append)
        return markdown, (messages[-1] if messages else "")

    submit_btn.click(
        fn=_run_process,
        # Components only — the status callback is handled inside the adapter.
        inputs=[pdf_input, pdf_url],
        outputs=[output, status],
    )
216
 
217
  if __name__ == "__main__":
218
  # In Hugging Face Spaces, share=False is sufficient as Spaces handles the server