File size: 5,583 Bytes
f86ad35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import gradio as gr
import requests
import PyPDF2
from pdf2image import convert_from_path, convert_from_bytes
import pytesseract
from PIL import Image
import io
import os
from huggingface_hub import HfApi, create_repo
import re
import markdown
from datetime import datetime

# Shared Hugging Face Hub client used by the upload helper below.
hf_api = HfApi()
# Access token, taken from the HF_TOKEN environment variable (None when unset).
HF_TOKEN = os.environ.get("HF_TOKEN")
# Name of the Hub repository that receives the uploaded images.
REPO_NAME = "pdf-images-extracted"

def ensure_hf_repo():
    """Create the target Hugging Face repository if needed and return its id.

    Returns:
        The repository identifier to pass as ``repo_id`` to later Hub calls,
        or a string starting with ``"Error creating repo:"`` when creation
        fails. Callers detect failure by testing for the ``"Error"`` prefix
        rather than by catching an exception.
    """
    try:
        repo_url = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, exist_ok=True)
    except Exception as e:
        return f"Error creating repo: {str(e)}"

    # create_repo returns a RepoUrl in current huggingface_hub releases: a str
    # holding the full https URL, which is not a valid ``repo_id`` argument.
    # Its ``repo_id`` attribute carries the usable identifier. Fall back to
    # the raw return value for releases that return a plain string.
    return str(getattr(repo_url, "repo_id", repo_url))

def upload_image_to_hf(image, filename):
    """Upload an image to Hugging Face Hub and return its URL.

    Args:
        image: Object exposing ``save(fp, format=...)`` (a PIL image in this
            file's callers).
        filename: Base name, without extension, used for the file stored under
            ``images/`` in the repository.

    Returns:
        The value returned by ``hf_api.upload_file`` converted to ``str``, or a
        string starting with ``"Error"`` when repository creation or the upload
        fails.
    """
    repo_id = ensure_hf_repo()
    if isinstance(repo_id, str) and repo_id.startswith("Error"):
        return repo_id
    # Accept either a plain repo id or a RepoUrl-like object carrying one.
    repo_id = getattr(repo_id, "repo_id", repo_id)

    try:
        # Encode in memory instead of writing a temp file into the working
        # directory: nothing is left behind if the upload raises, and
        # concurrent requests cannot collide on the same temp path.
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")

        file_url = hf_api.upload_file(
            path_or_fileobj=buffer.getvalue(),
            path_in_repo=f"images/{filename}.png",
            repo_id=repo_id,
            token=HF_TOKEN,
        )
        # The caller calls ``.startswith`` on the result, so always hand back
        # a real string.
        # NOTE(review): the returned link may be the Hub "blob" page rather
        # than a raw-file URL - confirm it renders when embedded as an image.
        return str(file_url)
    except Exception as e:
        return f"Error uploading image: {str(e)}"

def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF using PyPDF2.

    Args:
        pdf_file: An ``http(s)://`` URL, a local file path, or any other
            source accepted by ``PyPDF2.PdfReader`` (e.g. a binary stream).

    Returns:
        The text of every page, each followed by a blank line, or a string
        starting with ``"Error extracting text:"`` on failure.
    """
    try:
        source = pdf_file
        # PdfReader would treat a URL string as a local path, so remote PDFs
        # are downloaded first and read from memory.
        if isinstance(pdf_file, str) and pdf_file.lower().startswith(("http://", "https://")):
            response = requests.get(pdf_file, timeout=30)
            response.raise_for_status()
            source = io.BytesIO(response.content)

        reader = PyPDF2.PdfReader(source)
        # extract_text() may yield None/empty for pages without a text layer.
        return "".join((page.extract_text() or "") + "\n\n" for page in reader.pages)
    except Exception as e:
        return f"Error extracting text: {str(e)}"

def extract_images_from_pdf(pdf_file):
    """Convert a PDF to a list of images via pdf2image.

    Args:
        pdf_file: An ``http(s)://`` URL, a local file path string, or an
            object with a ``name`` attribute holding a local path.

    Returns:
        The list returned by pdf2image's ``convert_from_bytes`` /
        ``convert_from_path``, or a string starting with
        ``"Error extracting images:"`` on failure.
    """
    try:
        if isinstance(pdf_file, str):
            # A string is either a remote URL or a local path (the upload
            # widget is configured with type="filepath"); only URLs are
            # fetched over the network.
            if pdf_file.lower().startswith(("http://", "https://")):
                response = requests.get(pdf_file, timeout=30)
                response.raise_for_status()
                return convert_from_bytes(response.content)
            return convert_from_path(pdf_file)
        # File-like upload object: use the path it exposes.
        return convert_from_path(pdf_file.name)
    except Exception as e:
        return f"Error extracting images: {str(e)}"

def format_to_markdown(text, images):
    """Convert extracted text and images to Markdown format.

    Text handling (line by line, blank lines are dropped):
      * a line that is all upper-case and longer than 5 characters becomes a
        ``##`` heading;
      * a line starting with ``1.``/``1)`` style numbering or a ``-``, ``*``
        or ``+`` bullet becomes a ``- `` list item;
      * anything else becomes its own paragraph.

    Args:
        text: Extracted document text.
        images: List of images to OCR, upload and link; any non-list or empty
            value skips the image section.

    Returns:
        The Markdown document as a single string.
    """
    parts = ["# Extracted PDF Content\n\n"]

    in_list = False
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        # Heading heuristic is checked first, so "1. OVERVIEW" stays a heading.
        is_heading = line.isupper() and len(line) > 5
        item = None if is_heading else re.match(r'^(?:\d+[.)]|[-*+])\s+(.*)$', line)
        if item:
            parts.append(f"- {item.group(1).strip()}\n")
            in_list = True
            continue

        if in_list:
            # A blank line ends the list so the next paragraph is not
            # absorbed into the last item.
            parts.append("\n")
            in_list = False
        parts.append(f"## {line}\n\n" if is_heading else f"{line}\n\n")
    if in_list:
        parts.append("\n")

    # Add images with Hugging Face URLs
    if isinstance(images, list) and images:
        parts.append("## Extracted Images\n\n")
        for i, image in enumerate(images):
            # OCR the image so text embedded in it (e.g. in charts) is kept.
            ocr_text = pytesseract.image_to_string(image).strip()
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"image_{i}_{timestamp}"
            # str() guards the prefix test below against non-string results.
            image_url = str(upload_image_to_hf(image, filename))

            if image_url.startswith("Error"):
                parts.append(f"**Image {i+1} Error:** {image_url}\n\n")
                continue

            # Blank line after each image keeps consecutive images from
            # collapsing into one paragraph.
            parts.append(f"![Image {i+1}]({image_url})\n\n")
            if ocr_text:
                parts.append(f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n")

    return "".join(parts)

def process_pdf(pdf_input, pdf_url):
    """Main function to process PDF input (file or URL) and generate Markdown.

    Args:
        pdf_input: Uploaded PDF as supplied by the file widget, or a falsy
            value when nothing was uploaded.
        pdf_url: URL of a PDF; takes precedence over ``pdf_input`` when it is
            non-blank.

    Returns:
        The generated Markdown, or a string starting with ``"Error"`` that
        describes what went wrong.
    """
    url = pdf_url.strip() if pdf_url else ""
    if url:
        # Cheap reachability probe before downloading. Redirects are followed
        # (requests.head does not do so by default) and a timeout keeps a dead
        # host from hanging the request.
        try:
            response = requests.head(url, allow_redirects=True, timeout=15)
        except requests.RequestException as e:
            return f"Error: Invalid URL or inaccessible PDF: {url} ({e})"
        if response.status_code != 200:
            return f"Error: Invalid URL or inaccessible PDF: {url}"
        pdf_file = url
    elif pdf_input:
        pdf_file = pdf_input
    else:
        return "Error: Please provide a PDF file or URL."

    # Match the helpers' exact error prefixes so that document text which
    # merely begins with the word "Error" is not mistaken for a failure.
    text = extract_text_from_pdf(pdf_file)
    if isinstance(text, str) and text.startswith("Error extracting text:"):
        return text

    # Only convert pages to images once text extraction has succeeded.
    images = extract_images_from_pdf(pdf_file)
    if isinstance(images, str) and images.startswith("Error extracting images:"):
        return images

    return format_to_markdown(text, images)

# Gradio Interface
# Wires process_pdf to a two-input form; the inputs are passed positionally,
# so their order must match process_pdf(pdf_input, pdf_url).
iface = gr.Interface(
    fn=process_pdf,
    inputs=[
        # NOTE(review): with type="filepath" the upload presumably arrives as a
        # path string rather than a file object - confirm against the Gradio
        # version in use.
        gr.File(label="Upload PDF File", type="filepath"),
        gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF"),
    ],
    # process_pdf returns Markdown text (or an "Error..." message) as a string.
    outputs=gr.Markdown(label="Markdown Output"),
    title="PDF to Markdown Converter",
    description="Upload a PDF file or provide a PDF URL to convert it into a Markdown document. Images and charts are extracted, uploaded to Hugging Face Hub, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved as much as possible.",
)

if __name__ == "__main__":
    # Start the UI only when a Hugging Face token is configured; otherwise
    # tell the user how to provide one.
    if HF_TOKEN:
        iface.launch()
    else:
        print("Error: Please set HF_TOKEN environment variable with your Hugging Face API token.")