"""Gradio app that converts a PDF (uploaded file or URL) into Markdown.

Text is read with PyPDF2. Every page is additionally rendered to an image with
pdf2image, OCR'd with Tesseract, uploaded to a Hugging Face Hub repository and
linked from the generated Markdown.

Error convention: the public helpers return a string starting with ``"Error"``
instead of raising, so the message can be shown directly in the UI.
"""
import io
import os
import re
from datetime import datetime

import gradio as gr
import markdown
import PyPDF2
import pytesseract
import requests
from huggingface_hub import HfApi, create_repo, hf_hub_url
from pdf2image import convert_from_bytes, convert_from_path
from PIL import Image

# Initialize Hugging Face API
hf_api = HfApi()
HF_TOKEN = os.getenv("HF_TOKEN")  # Set your Hugging Face API token as an environment variable
REPO_NAME = "pdf-images-extracted"  # Hugging Face repo name

REQUEST_TIMEOUT = 30  # seconds; keeps a dead server from hanging the UI forever

# Prefixes of the in-band error strings returned by the extract_* helpers.
_TEXT_ERROR_PREFIX = "Error extracting text: "
_IMAGE_ERROR_PREFIX = "Error extracting images: "

# "1. item", "2) item", "- item", "* item", "+ item", "• item"
_LIST_ITEM_RE = re.compile(r"(?:\d+[.)]|[-*+\u2022])\s+(.+)")


def ensure_hf_repo():
    """Create the image repository if it does not exist and return its id.

    Returns:
        The repo id usable with ``upload_file`` on success, or a string
        starting with ``"Error"`` on failure.
    """
    try:
        repo_url = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, exist_ok=True)
    except Exception as e:  # boundary to a remote service: report, don't crash the UI
        return f"Error creating repo: {str(e)}"
    # create_repo returns the repository URL; upload_file needs the
    # "namespace/name" id, which RepoUrl exposes as .repo_id. Fall back to the
    # raw value for library versions that return a plain string.
    return getattr(repo_url, "repo_id", repo_url)


def upload_image_to_hf(image, filename, repo_id=None):
    """Upload a PIL image to the Hub as ``images/<filename>.png``.

    Args:
        image: Object with a PIL-style ``save(fp, format=...)`` method.
        filename: Base name (without extension) of the file in the repo.
        repo_id: Target repository. When omitted the repository is resolved
            via ``ensure_hf_repo()``; pass it explicitly to avoid one
            repository round-trip per image.

    Returns:
        A direct-download (``resolve``) URL of the uploaded image, or a string
        starting with ``"Error"`` on failure.
    """
    if repo_id is None:
        repo_id = ensure_hf_repo()
    if isinstance(repo_id, str) and repo_id.startswith("Error"):
        return repo_id

    path_in_repo = f"images/{filename}.png"
    try:
        # Encode in memory: no temp file to leak if the upload fails.
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        hf_api.upload_file(
            path_or_fileobj=buffer.getvalue(),
            path_in_repo=path_in_repo,
            repo_id=repo_id,
            token=HF_TOKEN,
        )
    except Exception as e:  # boundary to a remote service
        return f"Error uploading image: {str(e)}"
    # upload_file's return value points at the HTML "blob" page; Markdown image
    # links need the raw-file URL instead.
    return hf_hub_url(repo_id=repo_id, filename=path_in_repo)


def _is_url(value):
    """Return True if *value* is a string carrying an http(s) scheme."""
    return isinstance(value, str) and value.strip().lower().startswith(
        ("http://", "https://")
    )


def _download(url):
    """Fetch *url* and return the response body; raises ``requests.RequestException``."""
    response = requests.get(url.strip(), timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.content


def _local_path(pdf_file):
    """Return the filesystem path behind *pdf_file*, or None if there is none."""
    if isinstance(pdf_file, os.PathLike):
        return os.fspath(pdf_file)
    if isinstance(pdf_file, str):
        return None if _is_url(pdf_file) else pdf_file
    # Upload wrappers that expose their path as ``.name`` (the shape the
    # original upload branch relied on).
    name = getattr(pdf_file, "name", None)
    if isinstance(name, str) and os.path.isfile(name):
        return name
    return None


def _load_pdf_bytes(pdf_file):
    """Return the raw PDF bytes for bytes, a URL, a local path or a file-like object."""
    if isinstance(pdf_file, (bytes, bytearray)):
        return bytes(pdf_file)
    if _is_url(pdf_file):
        return _download(pdf_file)
    path = _local_path(pdf_file)
    if path is not None:
        with open(path, "rb") as handle:
            return handle.read()
    return pdf_file.read()


def extract_text_from_pdf(pdf_file):
    """Extract text from PDF using PyPDF2.

    Args:
        pdf_file: PDF given as raw bytes, an http(s) URL, a local path or a
            binary file-like object.

    Returns:
        The text of all pages, each followed by a blank line, or a string
        starting with ``"Error extracting text: "`` on failure.
    """
    try:
        reader = PyPDF2.PdfReader(io.BytesIO(_load_pdf_bytes(pdf_file)))
        return "".join((page.extract_text() or "") + "\n\n" for page in reader.pages)
    except Exception as e:  # malformed PDFs raise many different error types
        return f"{_TEXT_ERROR_PREFIX}{str(e)}"


def extract_images_from_pdf(pdf_file):
    """Render every page of the PDF to a PIL image.

    Args:
        pdf_file: PDF given as raw bytes, an http(s) URL, a local path or a
            binary file-like object.

    Returns:
        A list with one image per page, or a string starting with
        ``"Error extracting images: "`` on failure.
    """
    try:
        path = _local_path(pdf_file)
        if path is not None:
            # Let pdf2image read the file itself instead of buffering it here.
            return convert_from_path(path)
        return convert_from_bytes(_load_pdf_bytes(pdf_file))
    except Exception as e:  # covers network, poppler and PDF-format failures
        return f"{_IMAGE_ERROR_PREFIX}{str(e)}"


def format_to_markdown(text, images):
    """Convert extracted text and images to Markdown format.

    Heuristics applied per non-blank line of *text*:
      * all-caps lines longer than 5 characters become ``##`` headings;
      * lines starting with a number or bullet marker become ``-`` list items;
      * everything else becomes its own paragraph.

    Args:
        text: Plain text extracted from the PDF.
        images: List of page images; anything that is not a non-empty list
            is ignored. Each image is OCR'd and uploaded to the Hub.

    Returns:
        The Markdown document as a single string.
    """
    parts = ["# Extracted PDF Content\n\n"]

    in_list = False
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue  # blank lines carry no content; spacing is added below

        item = None if (line.isupper() and len(line) > 5) else _LIST_ITEM_RE.match(line)
        if item:
            parts.append(f"- {item.group(1)}\n")
            in_list = True
            continue

        if in_list:
            # Markdown needs a blank line to terminate the preceding list.
            parts.append("\n")
            in_list = False
        if line.isupper() and len(line) > 5:
            parts.append(f"## {line}\n\n")
        else:
            parts.append(f"{line}\n\n")
    if in_list:
        parts.append("\n")

    # Add images with Hugging Face URLs
    if isinstance(images, list) and images:
        parts.append("## Extracted Images\n\n")
        repo_id = ensure_hf_repo()  # resolve once, not once per image
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        for i, image in enumerate(images):
            # Perform OCR on image to include any text (e.g., in charts)
            ocr_text = pytesseract.image_to_string(image).strip()
            image_url = upload_image_to_hf(
                image, f"image_{i}_{timestamp}", repo_id=repo_id
            )
            if image_url.startswith("Error"):
                parts.append(f"**Image {i+1} Error:** {image_url}\n\n")
                continue
            parts.append(f"![Image {i+1}]({image_url})\n\n")
            if ocr_text:
                parts.append(f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n")

    return "".join(parts)


def process_pdf(pdf_input, pdf_url):
    """Main function to process PDF input (file or URL) and generate Markdown.

    Args:
        pdf_input: Uploaded PDF (a file path with ``gr.File(type="filepath")``),
            or a falsy value when nothing was uploaded.
        pdf_url: URL of a PDF; takes precedence over *pdf_input* when non-blank.

    Returns:
        The generated Markdown, or a string starting with ``"Error"``.
    """
    url = (pdf_url or "").strip()
    if url:
        # Download once with a real GET: many servers reject or redirect HEAD,
        # and the bytes are needed by both extractors anyway.
        if not _is_url(url):
            return f"Error: Invalid URL or inaccessible PDF: {pdf_url}"
        try:
            pdf_file = _download(url)
        except requests.RequestException:
            return f"Error: Invalid URL or inaccessible PDF: {pdf_url}"
    elif pdf_input:
        # Process uploaded PDF
        pdf_file = pdf_input
    else:
        return "Error: Please provide a PDF file or URL."

    # Extract text and images
    text = extract_text_from_pdf(pdf_file)
    # Match the exact prefix so a document whose own text begins with "Error"
    # is not mistaken for a failure.
    if text.startswith(_TEXT_ERROR_PREFIX):
        return text

    images = extract_images_from_pdf(pdf_file)
    if isinstance(images, str):  # success is always a list
        return images

    # Generate Markdown
    return format_to_markdown(text, images)


# Gradio Interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=[
        gr.File(label="Upload PDF File", type="filepath"),
        gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF"),
    ],
    outputs=gr.Markdown(label="Markdown Output"),
    title="PDF to Markdown Converter",
    description="Upload a PDF file or provide a PDF URL to convert it into a Markdown document. Images and charts are extracted, uploaded to Hugging Face Hub, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved as much as possible.",
)

if __name__ == "__main__":
    # Ensure Hugging Face token is set
    if not HF_TOKEN:
        print("Error: Please set HF_TOKEN environment variable with your Hugging Face API token.")
    else:
        iface.launch()