Spaces:

broadfield-dev
/

pdf2markdown

Sleeping

App Files Files Community

broadfield-dev committed on Jun 2

Commit

f86ad35

verified ·

1 Parent(s): ba5d90f

Create app.py

Browse files

Files changed (1) hide show

app.py +154 -0

app.py ADDED Viewed

	@@ -0,0 +1,154 @@

+import gradio as gr
+import requests
+import PyPDF2
+from pdf2image import convert_from_path, convert_from_bytes
+import pytesseract
+from PIL import Image
+import io
+import os
+from huggingface_hub import HfApi, create_repo
+import re
+import markdown
+from datetime import datetime
+# Hugging Face Hub configuration.
+# Access token, read from the HF_TOKEN environment variable (None when it is unset).
+HF_TOKEN = os.environ.get("HF_TOKEN")
+# Name of the Hub repository that receives the uploaded images.
+REPO_NAME = "pdf-images-extracted"
+# Shared Hub client used for the file uploads.
+hf_api = HfApi()
+def ensure_hf_repo():
+    """Create the image repository on the Hugging Face Hub if it does not exist yet.
+
+    Returns:
+        The repository id as a plain string on success, or a string starting
+        with "Error creating repo:" when the Hub call fails (callers detect
+        failure by that prefix).
+    """
+    try:
+        repo_url = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, exist_ok=True)
+        # create_repo() returns a RepoUrl (the full https://... URL); the upload
+        # API expects the bare "namespace/name" id, which RepoUrl exposes as
+        # .repo_id. Fall back to the raw value for versions returning a plain str.
+        return str(getattr(repo_url, "repo_id", repo_url))
+    except Exception as e:
+        return f"Error creating repo: {str(e)}"
+def upload_image_to_hf(image, filename):
+    """Upload an image to the Hugging Face Hub and return a direct link to it.
+
+    Args:
+        image: Object with a PIL-style ``save(fp, format=...)`` method.
+        filename: Base name (without extension) used for ``images/<filename>.png``.
+
+    Returns:
+        The ``resolve`` URL of the uploaded PNG, or a string starting with
+        "Error" when the repository could not be prepared or the upload failed.
+    """
+    repo_id = ensure_hf_repo()
+    if isinstance(repo_id, str) and repo_id.startswith("Error"):
+        return repo_id
+    # ensure_hf_repo() may hand back a RepoUrl; the upload needs "namespace/name".
+    repo_id = getattr(repo_id, "repo_id", repo_id)
+    try:
+        from huggingface_hub import hf_hub_url
+        path_in_repo = f"images/{filename}.png"
+        # Encode in memory: no temp file in the working directory, so nothing
+        # is left behind when the upload raises.
+        buffer = io.BytesIO()
+        image.save(buffer, format="PNG")
+        hf_api.upload_file(
+            path_or_fileobj=buffer.getvalue(),
+            path_in_repo=path_in_repo,
+            repo_id=repo_id,
+            token=HF_TOKEN,
+        )
+        # upload_file() reports the commit/blob page, which cannot be embedded
+        # as an image; the resolve URL serves the file content itself.
+        return hf_hub_url(repo_id=repo_id, filename=path_in_repo)
+    except Exception as e:
+        return f"Error uploading image: {str(e)}"
+def extract_text_from_pdf(pdf_file):
+    """Extract the text of every page of a PDF using PyPDF2.
+
+    Args:
+        pdf_file: An http(s) URL, a local file path, or a binary file object.
+
+    Returns:
+        The text of all pages, each page followed by a blank line, or a string
+        starting with "Error extracting text:" when anything fails.
+    """
+    try:
+        source = pdf_file
+        if isinstance(pdf_file, str) and pdf_file.strip().lower().startswith(("http://", "https://")):
+            # PdfReader treats a str as a local path, so a URL must be downloaded first.
+            response = requests.get(pdf_file.strip(), timeout=30)
+            response.raise_for_status()
+            source = io.BytesIO(response.content)
+        reader = PyPDF2.PdfReader(source)
+        # extract_text() may yield None for pages without a text layer.
+        return "".join(f"{page.extract_text() or ''}\n\n" for page in reader.pages)
+    except Exception as e:
+        return f"Error extracting text: {str(e)}"
+def extract_images_from_pdf(pdf_file):
+    """Render a PDF with pdf2image and return the resulting images.
+
+    Args:
+        pdf_file: An http(s) URL, a local file path (what ``gr.File`` delivers
+            with ``type="filepath"``), or an object whose ``name`` attribute
+            is the path of the PDF.
+
+    Returns:
+        The list produced by ``convert_from_bytes`` / ``convert_from_path``, or
+        a string starting with "Error extracting images:" when anything fails.
+    """
+    try:
+        if isinstance(pdf_file, str):
+            # Only strings with an http(s) scheme are remote; any other string
+            # is a path on disk and must not be sent through requests.
+            if pdf_file.strip().lower().startswith(("http://", "https://")):
+                response = requests.get(pdf_file.strip(), timeout=30)
+                response.raise_for_status()
+                return convert_from_bytes(response.content)
+            return convert_from_path(pdf_file)
+        # Upload object case: the path is carried in its .name attribute.
+        return convert_from_path(pdf_file.name)
+    except Exception as e:
+        return f"Error extracting images: {str(e)}"
+def format_to_markdown(text, images):
+    """Convert extracted text and images to a Markdown document.
+
+    Text lines are classified with simple heuristics: an all-uppercase line
+    longer than five characters becomes a ``##`` heading; a line starting with
+    a bullet (``-``, ``*``, ``+``, a bullet dot) or a number followed by ``.``
+    or ``)`` becomes a ``-`` list item; everything else becomes its own
+    paragraph. Blank lines are dropped.
+
+    Args:
+        text: Extracted text; ``None`` or an empty string yields only the title.
+        images: List of images to OCR, upload and link. Any value that is not a
+            non-empty list is ignored.
+
+    Returns:
+        The Markdown document as a string.
+    """
+    parts = ["# Extracted PDF Content\n\n"]
+    list_item = re.compile(r'^(?:\d+[.)]|[-*+\u2022])\s+(.*)$')
+    in_list = False
+    for raw_line in (text or "").splitlines():
+        line = raw_line.strip()
+        if not line:
+            continue
+        is_heading = line.isupper() and len(line) > 5
+        item = None if is_heading else list_item.match(line)
+        if item:
+            parts.append(f"- {item.group(1).strip()}\n")
+            in_list = True
+            continue
+        if in_list:
+            # A blank line ends the list; otherwise the next paragraph would be
+            # absorbed into the last item as a continuation line.
+            parts.append("\n")
+            in_list = False
+        parts.append(f"## {line}\n\n" if is_heading else f"{line}\n\n")
+    if in_list:
+        parts.append("\n")
+    # Add images with Hugging Face URLs
+    if isinstance(images, list) and images:
+        parts.append("## Extracted Images\n\n")
+        # One timestamp per document; the index keeps the file names distinct.
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        for i, image in enumerate(images):
+            image_url = str(upload_image_to_hf(image, f"image_{i}_{timestamp}"))
+            if image_url.startswith("Error"):
+                parts.append(f"**Image {i+1} Error:** {image_url}\n\n")
+                continue
+            parts.append(f"![Image {i+1}]({image_url})\n\n")
+            # OCR the image so text inside charts or scans is included; an OCR
+            # failure is reported inline instead of aborting the conversion.
+            try:
+                ocr_text = pytesseract.image_to_string(image).strip()
+            except Exception as e:
+                parts.append(f"**Image {i+1} OCR Error:** {str(e)}\n\n")
+                continue
+            if ocr_text:
+                parts.append(f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n")
+    return "".join(parts)
+def process_pdf(pdf_input, pdf_url):
+    """Process a PDF given as an upload or a URL and return Markdown.
+
+    Args:
+        pdf_input: The uploaded PDF as provided by the Gradio file input, or a
+            falsy value when nothing was uploaded.
+        pdf_url: URL of a PDF; takes precedence over ``pdf_input`` when it is
+            not blank.
+
+    Returns:
+        The generated Markdown, or a message starting with "Error" when no
+        input was given, the URL is unreachable, or an extraction step failed.
+    """
+    url = pdf_url.strip() if isinstance(pdf_url, str) else ""
+    if url:
+        # Probe the URL first. requests.head() does not follow redirects by
+        # default, so they are enabled explicitly; network failures are turned
+        # into an error message instead of an unhandled exception.
+        try:
+            response = requests.head(url, allow_redirects=True, timeout=15)
+        except requests.RequestException as e:
+            return f"Error: Invalid URL or inaccessible PDF: {url} ({e})"
+        if response.status_code != 200:
+            return f"Error: Invalid URL or inaccessible PDF: {url}"
+        pdf_file = url
+    elif pdf_input:
+        # Process uploaded PDF
+        pdf_file = pdf_input
+    else:
+        return "Error: Please provide a PDF file or URL."
+    # The extractors report failure as "Error extracting ...:" strings. Match
+    # that exact prefix so a document whose text merely begins with "Error" is
+    # not mistaken for a failure, and stop before rendering pages if the text
+    # step already failed.
+    text = extract_text_from_pdf(pdf_file)
+    if isinstance(text, str) and text.startswith("Error extracting text:"):
+        return text
+    images = extract_images_from_pdf(pdf_file)
+    if isinstance(images, str) and images.startswith("Error extracting images:"):
+        return images
+    return format_to_markdown(text, images)
+# Gradio Interface
+# An uploaded file and a URL box feed process_pdf; the returned string is rendered as Markdown.
+_APP_DESCRIPTION = "Upload a PDF file or provide a PDF URL to convert it into a Markdown document. Images and charts are extracted, uploaded to Hugging Face Hub, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved as much as possible."
+iface = gr.Interface(
+    title="PDF to Markdown Converter",
+    description=_APP_DESCRIPTION,
+    fn=process_pdf,
+    inputs=[
+        gr.File(label="Upload PDF File", type="filepath"),
+        gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF"),
+    ],
+    outputs=gr.Markdown(label="Markdown Output"),
+)
+if __name__ == "__main__":
+    # The app is only launched when a Hugging Face token is configured.
+    if HF_TOKEN:
+        iface.launch()
+    else:
+        print("Error: Please set HF_TOKEN environment variable with your Hugging Face API token.")