import gradio as gr
import requests
import PyPDF2
from pdf2image import convert_from_path, convert_from_bytes
import pytesseract
from PIL import Image
import io
import os
from huggingface_hub import HfApi, create_repo
import re
from datetime import datetime
import urllib.parse
import logging
import subprocess

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize Hugging Face API
HF_TOKEN = os.getenv("HF_TOKEN")  # Set in Hugging Face Spaces Secrets
REPO_NAME = "pdf-images-extracted"  # Hugging Face dataset repo
hf_api = HfApi()


def check_poppler():
    """Check if poppler-utils is installed."""
    try:
        # pdftoppm prints its version information to stderr, not stdout
        result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True)
        logger.info(f"Poppler version: {result.stderr.strip() or result.stdout.strip()}")
        return True
    except FileNotFoundError:
        logger.error("Poppler not found in PATH.")
        return False


def ensure_hf_dataset():
    """Create or get the Hugging Face dataset repository and return its repo_id."""
    try:
        repo_url = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
        logger.info(f"Using dataset repo: {repo_url.repo_id}")
        return repo_url.repo_id
    except Exception as e:
        logger.error(f"Error creating dataset repo: {str(e)}")
        return f"Error creating dataset repo: {str(e)}"


def upload_image_to_hf(image, filename):
    """Upload an image to the Hugging Face dataset and return its URL."""
    repo_id = ensure_hf_dataset()
    if isinstance(repo_id, str) and repo_id.startswith("Error"):
        return repo_id
    try:
        # Save image temporarily
        temp_path = f"/tmp/temp_{filename}.png"
        image.save(temp_path, format="PNG")
        # Upload to the Hugging Face dataset
        hf_api.upload_file(
            path_or_fileobj=temp_path,
            path_in_repo=f"images/{filename}.png",
            repo_id=repo_id,
            repo_type="dataset",
            token=HF_TOKEN
        )
        os.remove(temp_path)
        # Build a direct link to the uploaded file in the dataset repo
        file_url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/images/{filename}.png"
        logger.info(f"Uploaded image to: {file_url}")
        return file_url
    except Exception as e:
        logger.error(f"Error uploading image: {str(e)}")
        return f"Error uploading image: {str(e)}"


def extract_text_from_pdf(pdf_input):
    """Extract text from a PDF (URL or local file path) using PyPDF2."""
    try:
        if isinstance(pdf_input, str) and pdf_input.lower().startswith(("http://", "https://")):  # URL case
            response = requests.get(pdf_input, stream=True)
            response.raise_for_status()
            pdf_file = io.BytesIO(response.content)
        else:  # Uploaded file case (gr.File with type="filepath" passes a path string)
            pdf_file = pdf_input
        reader = PyPDF2.PdfReader(pdf_file)
        text = ""
        for page in reader.pages:
            page_text = page.extract_text() or ""
            text += page_text + "\n\n"
        return text
    except Exception as e:
        logger.error(f"Error extracting text: {str(e)}")
        return f"Error extracting text: {str(e)}"


def extract_images_from_pdf(pdf_input):
    """Extract page images from a PDF (URL or local file path) as PIL images."""
    if not check_poppler():
        return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."
    try:
        if isinstance(pdf_input, str) and pdf_input.lower().startswith(("http://", "https://")):  # URL case
            logger.info(f"Downloading PDF from URL: {pdf_input}")
            response = requests.get(pdf_input, stream=True)
            response.raise_for_status()
            images = convert_from_bytes(response.content)
        else:  # Uploaded file case (local file path)
            logger.info(f"Processing uploaded PDF: {pdf_input}")
            images = convert_from_path(pdf_input)
        return images
    except Exception as e:
        logger.error(f"Error extracting images: {str(e)}")
        return f"Error extracting images: {str(e)}"


def format_to_markdown(text, images):
    """Convert extracted text and images to Markdown format."""
    markdown_output = "# Extracted PDF Content\n\n"
    # Clean and format text
    text = re.sub(r'\n\s*\n', '\n\n', text.strip())  # Remove excessive newlines
    lines = text.split("\n")
    for line in lines:
        # Detect headings (simple heuristic: all-caps lines longer than 5 characters)
        if line.isupper() and len(line) > 5:
            markdown_output += f"## {line}\n\n"
        # Detect list items (lines starting with "1."-style numbers or -, *, + bullets)
        elif re.match(r'^\s*(\d+\.|[-*+])\s+', line):
            item_text = re.sub(r'^\s*(\d+\.|[-*+])\s+', '', line)
            markdown_output += f"- {item_text}\n"
        else:
            markdown_output += f"{line}\n\n"
    # Add images with Hugging Face dataset URLs
    if isinstance(images, list) and images:
        markdown_output += "## Extracted Images\n\n"
        for i, image in enumerate(images):
            ocr_text = pytesseract.image_to_string(image).strip()
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"image_{i}_{timestamp}"
            image_url = upload_image_to_hf(image, filename)
            if not image_url.startswith("Error"):
                markdown_output += f"![Image {i+1}]({image_url})\n"
                if ocr_text:
                    markdown_output += f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n"
            else:
                markdown_output += f"**Image {i+1} Error:** {image_url}\n\n"
    return markdown_output


def process_pdf(pdf_input, pdf_url):
    """Main function to process the PDF input (file or URL) and generate Markdown."""
    logger.info("Starting PDF processing at %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    if not HF_TOKEN:
        return "Error: HF_TOKEN not set in Spaces Secrets."
    # Log poppler status
    logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
    # Decode URL-encoded string if provided
    if pdf_url and pdf_url.strip():
        pdf_url = urllib.parse.unquote(pdf_url)
        logger.info(f"Decoded URL: {pdf_url}")
        try:
            response = requests.head(pdf_url, allow_redirects=True)
            response.raise_for_status()
            pdf_input = pdf_url
        except requests.RequestException as e:
            logger.error(f"Error accessing URL: {str(e)}")
            return f"Error accessing URL: {str(e)}"
    elif not pdf_input:
        return "Error: Please provide a PDF file or URL."
    text = extract_text_from_pdf(pdf_input)
    images = extract_images_from_pdf(pdf_input)
    if isinstance(text, str) and text.startswith("Error"):
        return text
    if isinstance(images, str) and images.startswith("Error"):
        return images
    markdown_output = format_to_markdown(text, images)
    return markdown_output
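# Example (sketch, for ad-hoc local testing only, not exercised by the app itself):
# process_pdf can be called directly with a URL-encoded link; the URL below is a
# placeholder, not a real document.
#
#   markdown = process_pdf(None, "https://example.com/docs/annual%20report.pdf")
#   print(markdown)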
# Gradio Interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=[
        gr.File(label="Upload PDF File", type="filepath"),
        gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF (supports URL-encoded strings with spaces)"),
    ],
    outputs=gr.Markdown(label="Markdown Output"),
    title="PDF to Markdown Converter",
    description=(
        "Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it "
        "into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and "
        "linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces "
        "Secrets. Uses Docker to ensure poppler-utils and tesseract-ocr are installed."
    ),
    flagging_dir="/tmp/flagged"  # Set a writable flagging directory
)

if __name__ == "__main__":
    # In Hugging Face Spaces, share=False is sufficient as Spaces handles the server
    iface.launch(share=False)
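# ---------------------------------------------------------------------------
# Deployment sketch (assumption, not part of the app logic): the description
# above states that Docker installs poppler-utils and tesseract-ocr. A minimal
# Dockerfile along those lines could look like the following; the base image,
# requirements file name, and app file name are assumptions, not confirmed by
# this script.
#
#   FROM python:3.11-slim
#   RUN apt-get update && \
#       apt-get install -y --no-install-recommends poppler-utils tesseract-ocr && \
#       rm -rf /var/lib/apt/lists/*
#   WORKDIR /app
#   COPY requirements.txt .
#   RUN pip install --no-cache-dir -r requirements.txt
#   COPY app.py .
#   EXPOSE 7860
#   CMD ["python", "app.py"]
# ---------------------------------------------------------------------------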