Spaces:
Sleeping
Sleeping
File size: 7,622 Bytes
f86ad35 0e0f376 f86ad35 17a8ae1 f86ad35 0dd31f7 155ac2a dc24da7 155ac2a f86ad35 77541b8 0dd31f7 f86ad35 dc24da7 0dd31f7 f86ad35 0dd31f7 17a8ae1 f86ad35 155ac2a 0dd31f7 f86ad35 0dd31f7 f86ad35 77541b8 f86ad35 0dd31f7 f86ad35 0dd31f7 f86ad35 17a8ae1 f86ad35 155ac2a f86ad35 0e0f376 f86ad35 0e0f376 f86ad35 155ac2a f86ad35 0e0f376 dc24da7 f86ad35 0e0f376 155ac2a 0e0f376 f86ad35 155ac2a 0e0f376 f86ad35 155ac2a f86ad35 0dd31f7 f86ad35 dc24da7 77541b8 dc24da7 0dd31f7 f86ad35 0dd31f7 155ac2a 0dd31f7 0e0f376 0dd31f7 155ac2a 0dd31f7 0e0f376 f86ad35 0e0f376 f86ad35 0e0f376 f86ad35 dc24da7 f86ad35 17a8ae1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 |
import gradio as gr
import requests
import PyPDF2
from pdf2image import convert_from_path, convert_from_bytes
import pytesseract
from PIL import Image
import io
import os
from huggingface_hub import HfApi, create_repo, RepositoryNotFoundError
import re
from datetime import datetime
import urllib.parse
import logging
import subprocess
# Set up logging: configure the root logger at INFO level and create a
# module-scoped logger used by every function in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Initialize Hugging Face API
# HF_TOKEN is read from the environment; os.getenv returns None when it is unset.
HF_TOKEN = os.getenv("HF_TOKEN") # Set in Hugging Face Spaces Secrets
# Repository name handed to create_repo() with repo_type="dataset".
REPO_NAME = "pdf-images-extracted" # Hugging Face dataset repo
# Client instance used for upload_file(); constructed without arguments here.
hf_api = HfApi()
def check_poppler():
    """Check if poppler-utils is installed.

    Runs ``pdftoppm -v`` and logs the version banner it prints.

    Returns:
        bool: True when the ``pdftoppm`` executable could be started,
        False when it is not found on PATH.
    """
    try:
        result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True)
    except FileNotFoundError:
        logger.error("Poppler not found in PATH.")
        return False
    # pdftoppm prints its version banner on stderr, not stdout; fall back to
    # stdout so the log line is never empty if a build behaves differently.
    banner = (result.stderr or result.stdout or "").strip()
    logger.info("Poppler version: %s", banner)
    return True
def ensure_hf_dataset():
    """Create or get the Hugging Face dataset repository.

    Returns:
        str: The repository id (``namespace/name``) on success, or a message
        starting with ``"Error"`` on failure. Callers in this file detect
        failure with ``startswith("Error")``.
    """
    try:
        repo_url = create_repo(
            repo_id=REPO_NAME,
            token=HF_TOKEN,
            repo_type="dataset",
            exist_ok=True,
        )
    except Exception as e:  # boundary: failures are reported as a string
        logger.error(f"Error creating dataset repo: {str(e)}")
        return f"Error creating dataset repo: {str(e)}"
    # create_repo returns a RepoUrl (a str holding the full URL). Upload calls
    # need the "namespace/name" id, exposed as `.repo_id`; fall back to the
    # plain string for library versions that lack the attribute.
    repo_id = getattr(repo_url, "repo_id", None) or str(repo_url)
    logger.info(f"Using dataset repo: {repo_id}")
    return repo_id
def upload_image_to_hf(image, filename):
    """Upload an image to the Hugging Face dataset and return its URL.

    Args:
        image: Object with a PIL-style ``save(fp, format=...)`` method.
        filename: Base name (without extension) used for the file in the
            repository; stored as ``images/<filename>.png``.

    Returns:
        The value returned by ``hf_api.upload_file`` on success, or a string
        starting with ``"Error"`` when the repository cannot be prepared or
        the upload fails.
    """
    repo_id = ensure_hf_dataset()
    if isinstance(repo_id, str) and repo_id.startswith("Error"):
        return repo_id
    try:
        # Encode in memory instead of writing /tmp/temp_<name>.png: nothing is
        # left behind if the upload raises, and concurrent requests cannot
        # collide on a shared temp path.
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        file_url = hf_api.upload_file(
            path_or_fileobj=buffer.getvalue(),
            path_in_repo=f"images/{filename}.png",
            repo_id=repo_id,
            repo_type="dataset",
            token=HF_TOKEN,
        )
    except Exception as e:  # boundary: failures are reported as a string
        logger.error(f"Error uploading image: {str(e)}")
        return f"Error uploading image: {str(e)}"
    # NOTE(review): depending on the huggingface_hub version this may be a
    # blob/commit URL rather than a direct-download URL -- confirm it renders
    # when embedded in Markdown.
    logger.info(f"Uploaded image to: {file_url}")
    return file_url
def extract_text_from_pdf(pdf_input):
    """Extract text from a PDF (URL, local path, or file object) using PyPDF2.

    Args:
        pdf_input: An ``http(s)://`` URL string, a local file path string, or
            a binary file-like object. The Gradio file input in this file is
            configured with ``type="filepath"``, so uploads arrive as path
            strings and must not be mistaken for URLs.

    Returns:
        str: Text of every page, each followed by a blank line, or a message
        starting with ``"Error"`` if anything fails.
    """
    try:
        is_url = isinstance(pdf_input, str) and pdf_input.lower().startswith(
            ("http://", "https://")
        )
        if is_url:
            # The whole body is needed, so no streaming; the timeout keeps a
            # stalled server from hanging the request forever.
            response = requests.get(pdf_input, timeout=30)
            response.raise_for_status()
            pdf_file = io.BytesIO(response.content)
        else:
            # PdfReader accepts both a path string and a file-like object.
            pdf_file = pdf_input
        reader = PyPDF2.PdfReader(pdf_file)
        return "".join(
            f"{page.extract_text() or ''}\n\n" for page in reader.pages
        )
    except Exception as e:  # boundary: failures are reported as a string
        logger.error(f"Error extracting text: {str(e)}")
        return f"Error extracting text: {str(e)}"
def extract_images_from_pdf(pdf_input):
    """Render the pages of a PDF (URL, local path, or file object) as PIL images.

    Args:
        pdf_input: An ``http(s)://`` URL string, a local path (``str`` or
            ``os.PathLike``), or an object whose ``name`` attribute is the
            path of the file on disk.

    Returns:
        list: Images produced by pdf2image, or a string starting with
        ``"Error"`` when poppler is missing or conversion fails.
    """
    if not check_poppler():
        return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."
    try:
        is_url = isinstance(pdf_input, str) and pdf_input.lower().startswith(
            ("http://", "https://")
        )
        if is_url:
            logger.info(f"Downloading PDF from URL: {pdf_input}")
            response = requests.get(pdf_input, timeout=30)
            response.raise_for_status()
            return convert_from_bytes(response.content)
        # Gradio's type="filepath" hands over a plain path string, which has
        # no `.name`; only fall back to `.name` for file-like objects.
        if isinstance(pdf_input, (str, os.PathLike)):
            pdf_path = os.fspath(pdf_input)
        else:
            pdf_path = pdf_input.name
        logger.info(f"Processing uploaded PDF: {pdf_path}")
        return convert_from_path(pdf_path)
    except Exception as e:  # boundary: failures are reported as a string
        logger.error(f"Error extracting images: {str(e)}")
        return f"Error extracting images: {str(e)}"
def format_to_markdown(text, images):
    """Convert extracted text and images to Markdown format.

    Text handling (line by line, blank lines skipped):
      * an all-uppercase line longer than 5 characters becomes a ``##`` heading;
      * a line starting with ``1.`` / ``12)`` or a ``-``, ``*``, ``+``, bullet
        marker followed by whitespace becomes a ``-`` list item;
      * anything else is emitted as its own paragraph.

    Image handling: each image is OCR'd, uploaded via ``upload_image_to_hf``,
    and embedded by URL; upload failures are reported inline.

    Args:
        text: Raw text extracted from the PDF.
        images: List of page images; any non-list or empty value skips the
            image section.

    Returns:
        str: The assembled Markdown document.
    """
    parts = ["# Extracted PDF Content\n\n"]
    # Collapse runs of blank / whitespace-only lines.
    text = re.sub(r'\n\s*\n', '\n\n', text.strip())
    # Marker (number + '.'/')' or a bullet character), whitespace, then the item.
    list_item = re.compile(r'^(?:\d+[.)]|[-*+\u2022])\s+(.*)$')
    in_list = False
    for line in text.split("\n"):
        stripped = line.strip()
        if not stripped:
            continue
        # Heading heuristic is checked first, as in the original ordering.
        is_heading = stripped.isupper() and len(stripped) > 5
        item = None if is_heading else list_item.match(stripped)
        if item:
            parts.append(f"- {item.group(1)}\n")
            in_list = True
            continue
        if in_list:
            # Blank line ends the list so the next block is not absorbed into
            # the last item as a lazy continuation.
            parts.append("\n")
            in_list = False
        if is_heading:
            parts.append(f"## {stripped}\n\n")
        else:
            parts.append(f"{line}\n\n")
    if in_list:
        parts.append("\n")
    # Add images with Hugging Face dataset URLs
    if isinstance(images, list) and images:
        parts.append("## Extracted Images\n\n")
        for i, image in enumerate(images):
            ocr_text = pytesseract.image_to_string(image).strip()
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"image_{i}_{timestamp}"
            # str() guards against a non-str return from the upload helper.
            image_url = str(upload_image_to_hf(image, filename))
            if image_url.startswith("Error"):
                parts.append(f"**Image {i+1} Error:** {image_url}\n\n")
                continue
            # Embed the uploaded image; previously the URL was never emitted.
            parts.append(f"![Image {i+1}]({image_url})\n\n")
            if ocr_text:
                parts.append(
                    f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n"
                )
    return "".join(parts)
def process_pdf(pdf_input, pdf_url):
    """Main function to process PDF input (file or URL) and generate Markdown.

    Args:
        pdf_input: Uploaded PDF as delivered by the Gradio file component
            (a file path string with ``type="filepath"``), or a falsy value.
        pdf_url: Optional URL of a PDF; may be percent-encoded. When
            non-blank it takes precedence over ``pdf_input``.

    Returns:
        str: The generated Markdown, or a message starting with ``"Error"``.
    """
    # %Z prints the zone of the process's local time instead of a fixed label.
    logger.info(
        "Starting PDF processing at %s",
        datetime.now().astimezone().strftime("%Y-%m-%d %H:%M:%S %Z"),
    )
    if not HF_TOKEN:
        return "Error: HF_TOKEN not set in Spaces Secrets."
    # Log poppler status
    logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
    cleaned_url = pdf_url.strip() if pdf_url else ""
    if cleaned_url:
        # Decode URL-encoded string; whitespace is stripped first so a pasted
        # URL with a trailing space or newline still resolves.
        pdf_url = urllib.parse.unquote(cleaned_url)
        logger.info(f"Decoded URL: {pdf_url}")
        try:
            # Cheap reachability probe before downloading the document.
            response = requests.head(pdf_url, allow_redirects=True, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            logger.error(f"Error accessing URL: {str(e)}")
            return f"Error accessing URL: {str(e)}"
        pdf_input = pdf_url
    elif not pdf_input:
        return "Error: Please provide a PDF file or URL."
    text = extract_text_from_pdf(pdf_input)
    # Bail out before rasterising pages if text extraction already failed.
    if isinstance(text, str) and text.startswith("Error"):
        return text
    images = extract_images_from_pdf(pdf_input)
    if isinstance(images, str) and images.startswith("Error"):
        return images
    return format_to_markdown(text, images)
# Gradio Interface
# Two inputs (uploaded file path and URL textbox) map positionally onto
# process_pdf(pdf_input, pdf_url); the returned string is rendered as Markdown.
iface = gr.Interface(
    fn=process_pdf,
    inputs=[
        gr.File(label="Upload PDF File", type="filepath"),
        gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF (supports URL-encoded strings with spaces)"),
    ],
    outputs=gr.Markdown(label="Markdown Output"),
    title="PDF to Markdown Converter",
    description="Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets. Uses Docker to ensure poppler-utils and tesseract-ocr are installed.",
    flagging_dir="/tmp/flagged",  # Set writable flagging directory
)

if __name__ == "__main__":
    # In Hugging Face Spaces, share=False is sufficient as Spaces handles the server
    iface.launch(share=False)