Spaces:
Running
Running
File size: 7,233 Bytes
f86ad35 aec5733 f86ad35 0e0f376 f86ad35 40d5277 f86ad35 0dd31f7 155ac2a dc24da7 155ac2a 81314aa 155ac2a f86ad35 81314aa f86ad35 dc24da7 0dd31f7 f86ad35 9db742a 0dd31f7 81314aa f86ad35 81314aa f86ad35 a492eda 0dd31f7 f86ad35 77541b8 f86ad35 0dd31f7 f86ad35 81314aa f86ad35 81314aa f86ad35 a492eda f86ad35 81314aa 0e0f376 81314aa 0e0f376 aec5733 9db742a aec5733 f86ad35 81314aa f86ad35 a492eda dc24da7 81314aa f86ad35 81314aa 0e0f376 f86ad35 81314aa 0e0f376 f86ad35 81314aa f86ad35 a492eda f86ad35 81314aa f86ad35 a492eda f86ad35 a492eda 81314aa 77541b8 81314aa f86ad35 0dd31f7 81314aa 0dd31f7 81314aa 0e0f376 0dd31f7 81314aa 0e0f376 81314aa a492eda f86ad35 81314aa f86ad35 81314aa a492eda 81314aa f86ad35 a492eda 81314aa a492eda 81314aa a492eda f86ad35 81314aa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
import gradio as gr
import requests
import pdfplumber
from pdf2image import convert_from_path, convert_from_bytes
import pytesseract
from PIL import Image
import io
import os
from huggingface_hub import HfApi, create_repo
import re
from datetime import datetime
import urllib.parse
import logging
import subprocess
# Configure logging once for the whole process: INFO level, timestamped records.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Hugging Face Hub configuration. HF_TOKEN is read from the environment and is
# None when the variable is unset; REPO_NAME is the dataset repo name passed to
# create_repo() further down in this file.
REPO_NAME = "pdf-images-extracted"
HF_TOKEN = os.environ.get("HF_TOKEN")
hf_api = HfApi()
def check_poppler():
    """Report whether Poppler's ``pdftoppm`` executable can be launched.

    Runs ``pdftoppm -v`` and logs whatever version banner it prints.

    Returns:
        bool: True when the executable could be started, False when it could
        not (missing from PATH or otherwise not runnable).
    """
    try:
        result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True)
    except FileNotFoundError:
        logger.error("Poppler not found in PATH.")
        return False
    except OSError as e:
        # e.g. PermissionError: the binary exists but cannot be executed.
        logger.error(f"Poppler could not be executed: {str(e)}")
        return False
    # Poppler's command-line tools print the -v banner on stderr, so reading
    # only stdout logs an empty version; fall back to stdout just in case.
    version = (result.stderr or result.stdout).strip()
    logger.info(f"Poppler version: {version}")
    return True
def ensure_hf_dataset():
    """Create the Hugging Face dataset repo if needed and return its repo id.

    Returns:
        str: The repo id to use for uploads, or a message starting with
        "Error" when HF_TOKEN is missing or the Hub call fails. Callers detect
        failure by checking for that "Error" prefix.
    """
    try:
        if not HF_TOKEN:
            raise ValueError("HF_TOKEN is not set")
        repo_url = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
        # create_repo() returns a RepoUrl: a str subclass whose text is the
        # full repo URL. Hub calls such as upload_file() expect the
        # "namespace/name" id, which RepoUrl exposes as .repo_id. Fall back to
        # the raw return value if the attribute is unavailable.
        repo_id = getattr(repo_url, "repo_id", repo_url)
        logger.info(f"Dataset repo: {repo_id}")
        return repo_id
    except Exception as e:
        logger.error(f"Dataset error: {str(e)}")
        return f"Error: Failed to access dataset: {str(e)}"
def upload_image_to_hf(image, filename):
    """Save ``image`` as a PNG and upload it to the Hugging Face dataset repo.

    Args:
        image: Object with a PIL-style ``save(path, format=...)`` method.
        filename: Base name (without extension) used for both the temporary
            file and the ``images/<filename>.png`` path inside the repo.

    Returns:
        The value returned by ``hf_api.upload_file`` on success, or a string
        starting with "Error" when the dataset is unavailable or the upload
        fails.
    """
    repo_id = ensure_hf_dataset()
    if isinstance(repo_id, str) and repo_id.startswith("Error"):
        return repo_id
    temp_path = f"/tmp/temp_{filename}.png"
    try:
        image.save(temp_path, format="PNG")
        # NOTE(review): depending on the huggingface_hub version, the returned
        # value may be a blob/commit URL rather than a direct file URL --
        # confirm it is suitable for embedding as an image link.
        file_url = hf_api.upload_file(
            path_or_fileobj=temp_path,
            path_in_repo=f"images/{filename}.png",
            repo_id=repo_id,
            repo_type="dataset",
            token=HF_TOKEN,
        )
        logger.info(f"Uploaded image: {file_url}")
        return file_url
    except Exception as e:
        logger.error(f"Image upload error: {str(e)}")
        return f"Error uploading image: {str(e)}"
    finally:
        # Clean up on every path; previously a failed save/upload left the
        # temporary PNG behind.
        if os.path.exists(temp_path):
            os.remove(temp_path)
def extract_text_from_pdf(pdf_input):
    """Extract page text and tables from a PDF.

    Args:
        pdf_input: An ``http(s)://`` URL string (the PDF is downloaded), or
            anything ``pdfplumber.open`` accepts directly, such as a local
            path string or an open file object.

    Returns:
        str: The text of every page, each followed by its tables rendered as
        ``" | "``-separated rows under a ``**Table:**`` marker. On any failure
        a message starting with "Error extracting text:" is returned instead.
    """
    try:
        # Only http(s) strings are downloaded; other strings are handed to
        # pdfplumber as local paths instead of failing inside requests.
        if isinstance(pdf_input, str) and pdf_input.lower().startswith(("http://", "https://")):
            response = requests.get(pdf_input, stream=True, timeout=10)
            response.raise_for_status()
            pdf_file = io.BytesIO(response.content)
        else:
            pdf_file = pdf_input
        parts = []
        with pdfplumber.open(pdf_file) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text(layout=True) or ""
                parts.append(page_text + "\n\n")
                for table in page.extract_tables():
                    # Empty cells come back as None; render them as blanks
                    # rather than the literal text "None".
                    rows = [
                        " | ".join("" if cell is None else str(cell) for cell in row)
                        for row in table
                    ]
                    parts.append("**Table:**\n" + "\n".join(rows) + "\n\n")
        return "".join(parts)
    except Exception as e:
        logger.error(f"Text extraction error: {str(e)}")
        return f"Error extracting text: {str(e)}"
def extract_images_from_pdf(pdf_input):
    """Render a PDF to images with pdf2image.

    Args:
        pdf_input: An ``http(s)://`` URL string (the PDF is downloaded), a
            local path string, or an uploaded-file object exposing the path
            as ``.name``.

    Returns:
        The result of pdf2image's ``convert_from_bytes`` / ``convert_from_path``
        on success, or a string starting with "Error" when Poppler is missing
        or the conversion fails.
    """
    if not check_poppler():
        return "Error: poppler-utils not found."
    try:
        if isinstance(pdf_input, str) and pdf_input.lower().startswith(("http://", "https://")):
            response = requests.get(pdf_input, stream=True, timeout=10)
            response.raise_for_status()
            images = convert_from_bytes(response.content)
        else:
            # Uploaded files may arrive as an object with .name or as a plain
            # path string; accept both instead of failing on a missing .name.
            pdf_path = getattr(pdf_input, "name", pdf_input)
            images = convert_from_path(pdf_path)
        return images
    except Exception as e:
        logger.error(f"Image extraction error: {str(e)}")
        return f"Error extracting images: {str(e)}"
def format_to_markdown(text, images):
    """Build the Markdown document from extracted text and page images.

    Text handling: runs of blank lines are collapsed, all-uppercase lines
    longer than 5 characters become ``##`` headings, numbered (``1.``,
    ``12.``) or bulleted (``-``, ``*``, ``+``) lines become ``-`` list items,
    and every other line becomes its own paragraph.

    Image handling: when ``images`` is a non-empty list, each image is OCR'd
    with pytesseract and uploaded via ``upload_image_to_hf``; the returned URL
    is embedded as a Markdown image followed by the OCR text (if any), or an
    error line is emitted when the upload failed.

    Args:
        text: Extracted text of the PDF.
        images: List of images; any other value (e.g. an error string) or an
            empty list skips the image section.

    Returns:
        str: The assembled Markdown.
    """
    parts = ["# Extracted PDF Content\n\n"]
    text = re.sub(r'\n\s*\n+', '\n\n', text.strip())
    # A list marker is either a (multi-digit) number followed by a dot, or a
    # bare -, * or +; the item text is captured so the marker can be replaced
    # without slicing by a fixed width.
    list_item_re = re.compile(r'^\s*(?:\d+\.|[-*+])\s+(.*)$')
    for line in text.split("\n"):
        if line.isupper() and len(line) > 5:
            parts.append(f"## {line}\n\n")
            continue
        item = list_item_re.match(line)
        if item:
            parts.append(f"- {item.group(1).strip()}\n")
        else:
            parts.append(f"{line}\n\n")
    if isinstance(images, list) and images:
        parts.append("## Extracted Images\n\n")
        for i, image in enumerate(images):
            ocr_text = pytesseract.image_to_string(image).strip()
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"image_{i}_{timestamp}"
            image_url = upload_image_to_hf(image, filename)
            if not image_url.startswith("Error"):
                # Embed the uploaded image; previously the URL was dropped and
                # only a newline was written here.
                parts.append(f"![Image {i+1}]({image_url})\n\n")
                if ocr_text:
                    parts.append(f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n")
            else:
                parts.append(f"**Image {i+1} Error:** {image_url}\n\n")
    return "".join(parts)
def process_pdf(pdf_input, pdf_url):
    """Gradio handler: convert an uploaded PDF or a PDF URL to Markdown.

    A non-blank ``pdf_url`` takes precedence over ``pdf_input``. The URL is
    percent-decoded, stripped of surrounding whitespace and probed with a HEAD
    request before extraction starts.

    Args:
        pdf_input: Uploaded file from the Gradio File component, or None.
        pdf_url: Text from the URL box; may be empty, None or percent-encoded.

    Returns:
        tuple[str, str]: ``(markdown_or_error_message, status_message)``.
    """
    status = "Starting PDF processing..."
    logger.info(status)
    if not HF_TOKEN:
        status = "Error: HF_TOKEN not set."
        logger.error(status)
        return status, status

    # Decode and trim up front so pasted whitespace/newlines never reach
    # requests (previously the URL was only tested with strip(), not stripped).
    cleaned_url = urllib.parse.unquote(pdf_url).strip() if pdf_url else ""
    if cleaned_url:
        status = f"Downloading PDF from URL: {cleaned_url}"
        logger.info(status)
        try:
            response = requests.head(cleaned_url, allow_redirects=True, timeout=5)
            response.raise_for_status()
            pdf_input = cleaned_url
        except requests.RequestException as e:
            status = f"Error accessing URL: {str(e)}"
            logger.error(status)
            return status, status
    elif not pdf_input:
        status = "Error: No PDF provided."
        logger.error(status)
        return status, status

    status = "Extracting text..."
    logger.info(status)
    text = extract_text_from_pdf(pdf_input)
    # The extractors signal failure by returning a string prefixed with "Error".
    if isinstance(text, str) and text.startswith("Error"):
        status = "Text extraction failed."
        logger.error(status)
        return text, status

    status = "Extracting images..."
    logger.info(status)
    images = extract_images_from_pdf(pdf_input)
    if isinstance(images, str) and images.startswith("Error"):
        status = "Image extraction failed."
        logger.error(status)
        return images, status

    status = "Formatting output..."
    logger.info(status)
    markdown_output = format_to_markdown(text, images)
    status = "Processing complete."
    logger.info(status)
    return markdown_output, status
# Gradio UI: two inputs (file upload, URL box) feed process_pdf, whose two
# return values fill the Markdown pane and the read-only status box.
iface = gr.Interface(
    title="PDF to Markdown Converter",
    description="Convert a PDF file or URL to Markdown. Extracts text, images, and tables, with images uploaded to a Hugging Face dataset. Supports URL-encoded strings. Requires HF_TOKEN in Spaces Secrets.",
    fn=process_pdf,
    inputs=[
        gr.File(file_types=[".pdf"], label="Upload PDF File"),
        gr.Textbox(
            placeholder="Enter PDF URL (e.g., https://example.com/file.pdf)",
            label="PDF URL",
        ),
    ],
    outputs=[
        gr.Markdown(label="Markdown Output"),
        gr.Textbox(interactive=False, label="Processing Status"),
    ],
    allow_flagging="never",
)
# Script entry point: start the Gradio server on all interfaces, port 7860.
if __name__ == "__main__":
    logger.info("Starting Gradio app...")
    try:
        # prevent_thread_lock=True makes launch() return right after startup,
        # so the success message below can be logged.
        iface.launch(server_name="0.0.0.0", server_port=7860, prevent_thread_lock=True)
        logger.info("Gradio app started successfully.")
    except Exception as e:
        logger.error(f"Failed to start Gradio app: {str(e)}")
        raise
    # launch() did not block, so without this the script would reach its end
    # and the server would terminate with it. Block until interrupted.
    iface.block_thread()