import gradio as gr
import requests
import PyPDF2
from pdf2image import convert_from_path, convert_from_bytes
import pytesseract
from PIL import Image
import io
import os
from huggingface_hub import HfApi, create_repo
import re
from datetime import datetime
import urllib.parse
import logging
import subprocess

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize Hugging Face API
HF_TOKEN = os.getenv("HF_TOKEN")  # Set in Hugging Face Spaces Secrets
REPO_NAME = "pdf-images-extracted"  # Hugging Face dataset repo
hf_api = HfApi()


def check_poppler():
    """Check if poppler-utils is installed."""
    try:
        # pdftoppm prints its version information to stderr, not stdout
        result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True)
        logger.info(f"Poppler version: {result.stderr.strip() or result.stdout.strip()}")
        return True
    except FileNotFoundError:
        logger.error("Poppler not found in PATH.")
        return False


def ensure_hf_dataset():
    """Create or get the Hugging Face dataset repository and return its repo_id."""
    try:
        repo_url = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
        logger.info(f"Using dataset repo: {repo_url.repo_id}")
        return repo_url.repo_id
    except Exception as e:
        logger.error(f"Error creating dataset repo: {str(e)}")
        return f"Error creating dataset repo: {str(e)}"


def upload_image_to_hf(image, filename):
    """Upload an image to the Hugging Face dataset and return its URL."""
    repo_id = ensure_hf_dataset()
    if isinstance(repo_id, str) and repo_id.startswith("Error"):
        return repo_id
    try:
        # Save image temporarily
        temp_path = f"/tmp/temp_{filename}.png"
        image.save(temp_path, format="PNG")
        # Upload to the Hugging Face dataset
        hf_api.upload_file(
            path_or_fileobj=temp_path,
            path_in_repo=f"images/{filename}.png",
            repo_id=repo_id,
            repo_type="dataset",
            token=HF_TOKEN
        )
        os.remove(temp_path)
        # Build a direct link to the uploaded file in the dataset repo
        file_url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/images/{filename}.png"
        logger.info(f"Uploaded image to: {file_url}")
        return file_url
    except Exception as e:
        logger.error(f"Error uploading image: {str(e)}")
        return f"Error uploading image: {str(e)}"


def extract_text_from_pdf(pdf_input):
    """Extract text from a PDF (URL or local file path) using PyPDF2."""
    try:
        if isinstance(pdf_input, str) and pdf_input.lower().startswith(("http://", "https://")):  # URL case
            response = requests.get(pdf_input, stream=True)
            response.raise_for_status()
            pdf_file = io.BytesIO(response.content)
        else:  # Uploaded file case (gr.File with type="filepath" passes a path string)
            pdf_file = pdf_input
        reader = PyPDF2.PdfReader(pdf_file)
        text = ""
        for page in reader.pages:
            page_text = page.extract_text() or ""
            text += page_text + "\n\n"
        return text
    except Exception as e:
        logger.error(f"Error extracting text: {str(e)}")
        return f"Error extracting text: {str(e)}"


def extract_images_from_pdf(pdf_input):
    """Extract page images from a PDF (URL or local file path) as PIL images."""
    if not check_poppler():
        return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."
    try:
        if isinstance(pdf_input, str) and pdf_input.lower().startswith(("http://", "https://")):  # URL case
            logger.info(f"Downloading PDF from URL: {pdf_input}")
            response = requests.get(pdf_input, stream=True)
            response.raise_for_status()
            images = convert_from_bytes(response.content)
        else:  # Uploaded file case (local file path)
            logger.info(f"Processing uploaded PDF: {pdf_input}")
            images = convert_from_path(pdf_input)
        return images
    except Exception as e:
        logger.error(f"Error extracting images: {str(e)}")
        return f"Error extracting images: {str(e)}"


def format_to_markdown(text, images):
    """Convert extracted text and images to Markdown format."""
    markdown_output = "# Extracted PDF Content\n\n"
    # Clean and format text
    text = re.sub(r'\n\s*\n', '\n\n', text.strip())  # Remove excessive newlines
    lines = text.split("\n")
    for line in lines:
        # Detect headings (simple heuristic: all-caps lines longer than 5 characters)
        if line.isupper() and len(line) > 5:
            markdown_output += f"## {line}\n\n"
        # Detect list items (lines starting with "1."-style numbers or -, *, + bullets)
        elif re.match(r'^\s*(\d+\.|[-*+])\s+', line):
            item_text = re.sub(r'^\s*(\d+\.|[-*+])\s+', '', line)
            markdown_output += f"- {item_text}\n"
        else:
            markdown_output += f"{line}\n\n"
    # Add images with Hugging Face dataset URLs
    if isinstance(images, list) and images:
        markdown_output += "## Extracted Images\n\n"
        for i, image in enumerate(images):
            ocr_text = pytesseract.image_to_string(image).strip()
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"image_{i}_{timestamp}"
            image_url = upload_image_to_hf(image, filename)
            if not image_url.startswith("Error"):
                markdown_output += f"![Image {i+1}]({image_url})\n"
                if ocr_text:
                    markdown_output += f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n"
            else:
                markdown_output += f"**Image {i+1} Error:** {image_url}\n\n"
    return markdown_output


def process_pdf(pdf_input, pdf_url):
    """Main function to process the PDF input (file or URL) and generate Markdown."""
    logger.info("Starting PDF processing at %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    if not HF_TOKEN:
        return "Error: HF_TOKEN not set in Spaces Secrets."
    # Log poppler status
    logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
    # Decode URL-encoded string if provided
    if pdf_url and pdf_url.strip():
        pdf_url = urllib.parse.unquote(pdf_url)
        logger.info(f"Decoded URL: {pdf_url}")
        try:
            response = requests.head(pdf_url, allow_redirects=True)
            response.raise_for_status()
            pdf_input = pdf_url
        except requests.RequestException as e:
            logger.error(f"Error accessing URL: {str(e)}")
            return f"Error accessing URL: {str(e)}"
    elif not pdf_input:
        return "Error: Please provide a PDF file or URL."
    text = extract_text_from_pdf(pdf_input)
    images = extract_images_from_pdf(pdf_input)
    if isinstance(text, str) and text.startswith("Error"):
        return text
    if isinstance(images, str) and images.startswith("Error"):
        return images
    markdown_output = format_to_markdown(text, images)
    return markdown_output
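# Example (sketch, for ad-hoc local testing only, not exercised by the app itself):
# process_pdf can be called directly with a URL-encoded link; the URL below is a
# placeholder, not a real document.
#
#   markdown = process_pdf(None, "https://example.com/docs/annual%20report.pdf")
#   print(markdown)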
# Gradio Interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=[
        gr.File(label="Upload PDF File", type="filepath"),
        gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF (supports URL-encoded strings with spaces)"),
    ],
    outputs=gr.Markdown(label="Markdown Output"),
    title="PDF to Markdown Converter",
    description=(
        "Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it "
        "into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and "
        "linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces "
        "Secrets. Uses Docker to ensure poppler-utils and tesseract-ocr are installed."
    ),
    flagging_dir="/tmp/flagged"  # Set a writable flagging directory
)

if __name__ == "__main__":
    # In Hugging Face Spaces, share=False is sufficient as Spaces handles the server
    iface.launch(share=False)
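# ---------------------------------------------------------------------------
# Deployment sketch (assumption, not part of the app logic): the description
# above states that Docker installs poppler-utils and tesseract-ocr. A minimal
# Dockerfile along those lines could look like the following; the base image,
# requirements file name, and app file name are assumptions, not confirmed by
# this script.
#
#   FROM python:3.11-slim
#   RUN apt-get update && \
#       apt-get install -y --no-install-recommends poppler-utils tesseract-ocr && \
#       rm -rf /var/lib/apt/lists/*
#   WORKDIR /app
#   COPY requirements.txt .
#   RUN pip install --no-cache-dir -r requirements.txt
#   COPY app.py .
#   EXPOSE 7860
#   CMD ["python", "app.py"]
# ---------------------------------------------------------------------------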