Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import requests
|
3 |
+
import PyPDF2
|
4 |
+
from pdf2image import convert_from_path, convert_from_bytes
|
5 |
+
import pytesseract
|
6 |
+
from PIL import Image
|
7 |
+
import io
|
8 |
+
import os
|
9 |
+
from huggingface_hub import HfApi, create_repo
|
10 |
+
import re
|
11 |
+
import markdown
|
12 |
+
from datetime import datetime
|
13 |
+
|
14 |
+
# Shared Hugging Face Hub client used for all uploads.
hf_api = HfApi()
# API token read from the HF_TOKEN environment variable (None when unset).
HF_TOKEN = os.environ.get("HF_TOKEN")
# Name of the Hub repository that receives the extracted images.
REPO_NAME = "pdf-images-extracted"
|
18 |
+
|
19 |
+
def ensure_hf_repo():
    """Create the image repository on the Hugging Face Hub if it is missing.

    Returns:
        str: The repository id to use for uploads on success, or a message
        starting with ``"Error creating repo:"`` if the Hub call raises.
    """
    try:
        repo_url = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, exist_ok=True)
    except Exception as e:
        return f"Error creating repo: {str(e)}"
    # create_repo returns a RepoUrl whose string value is the full https URL;
    # upload calls need the "namespace/name" id, exposed as `.repo_id`.
    # Fall back to the raw value for clients that return a plain string.
    return str(getattr(repo_url, "repo_id", repo_url))
|
26 |
+
|
27 |
+
def upload_image_to_hf(image, filename):
    """Upload an image to the Hugging Face Hub as ``images/<filename>.png``.

    Args:
        image: Object exposing a PIL-style ``save(fp, format=...)`` method.
        filename: Base name (without extension) for the file in the repo.

    Returns:
        The value returned by ``hf_api.upload_file`` on success, or a string
        starting with ``"Error"`` if the repository could not be prepared or
        the upload failed.
    """
    repo = ensure_hf_repo()
    if isinstance(repo, str) and repo.startswith("Error"):
        return repo
    # The repo helper may hand back a RepoUrl (full URL as a str); the upload
    # API needs the "namespace/name" id, so prefer `.repo_id` when present.
    repo_id = str(getattr(repo, "repo_id", repo))

    try:
        # Encode in memory: no temp file to clean up, no leftover file when
        # the upload raises, and no dependence on a writable working dir.
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        return hf_api.upload_file(
            path_or_fileobj=buffer.getvalue(),
            path_in_repo=f"images/{filename}.png",
            repo_id=repo_id,
            token=HF_TOKEN,
        )
    except Exception as e:
        return f"Error uploading image: {str(e)}"
|
49 |
+
|
50 |
+
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF using PyPDF2.

    Args:
        pdf_file: An ``http(s)://`` URL, a local file path, or any other
            object accepted by ``PyPDF2.PdfReader``.

    Returns:
        str: The page texts, each followed by a blank line, or a message
        starting with ``"Error extracting text:"`` if anything fails.
    """
    try:
        source = pdf_file
        if isinstance(pdf_file, str) and pdf_file.lower().startswith(
            ("http://", "https://")
        ):
            # PdfReader would treat a URL string as a local path, so fetch
            # the document first and read it from memory.
            response = requests.get(pdf_file, timeout=30)
            response.raise_for_status()
            source = io.BytesIO(response.content)

        reader = PyPDF2.PdfReader(source)
        # extract_text() may yield nothing for image-only pages.
        return "".join((page.extract_text() or "") + "\n\n" for page in reader.pages)
    except Exception as e:
        return f"Error extracting text: {str(e)}"
|
61 |
+
|
62 |
+
def extract_images_from_pdf(pdf_file):
    """Convert a PDF into PIL images via pdf2image.

    Args:
        pdf_file: An ``http(s)://`` URL, a local file path string, or an
            uploaded-file object exposing the path as ``.name``.

    Returns:
        The list produced by ``convert_from_bytes``/``convert_from_path``,
        or a string starting with ``"Error extracting images:"`` on failure.
    """
    try:
        if isinstance(pdf_file, str):
            if pdf_file.lower().startswith(("http://", "https://")):
                # Remote document: download it, failing on HTTP errors
                # instead of feeding an error page to the converter.
                response = requests.get(pdf_file, timeout=30)
                response.raise_for_status()
                return convert_from_bytes(response.content)
            # Plain string that is not a URL: a local path, which is what
            # gr.File(type="filepath") passes in.
            return convert_from_path(pdf_file)
        # File-like upload object: its path is available as `.name`.
        return convert_from_path(pdf_file.name)
    except Exception as e:
        return f"Error extracting images: {str(e)}"
|
73 |
+
|
74 |
+
def format_to_markdown(text, images):
    """Convert extracted text and images to a Markdown document.

    Text handling is heuristic: an all-uppercase line longer than five
    characters becomes a ``##`` heading, a line starting with a number
    (``1.`` / ``1)``) or a bullet (``-``, ``*``, ``+``, ``•``) becomes a
    ``-`` list item, and any other non-blank line becomes a paragraph.

    Args:
        text: Raw text extracted from the PDF.
        images: List of PIL images; any other value (or an empty list)
            produces no image section.

    Returns:
        str: The Markdown document.
    """
    parts = ["# Extracted PDF Content\n\n"]

    # Collapse runs of blank lines before splitting into lines.
    text = re.sub(r'\n\s*\n', '\n\n', text.strip())
    list_item = re.compile(r'^(?:\d+[.)]|[-*+\u2022])\s+(.*)$')

    in_list = False
    for raw_line in text.split("\n"):
        line = raw_line.strip()
        if not line:
            # Paragraph spacing is added explicitly below.
            continue

        is_heading = line.isupper() and len(line) > 5
        item = list_item.match(line)
        if item and not is_heading:
            parts.append(f"- {item.group(1)}\n")
            in_list = True
            continue

        if in_list:
            # Blank line so the next block is not absorbed into the list.
            parts.append("\n")
            in_list = False
        parts.append(f"## {line}\n\n" if is_heading else f"{line}\n\n")

    if in_list:
        parts.append("\n")

    # Add images with Hugging Face URLs
    if isinstance(images, list) and images:
        parts.append("## Extracted Images\n\n")
        # One timestamp per document; the index keeps file names unique.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        for i, image in enumerate(images):
            # Perform OCR on image to include any text (e.g., in charts)
            ocr_text = pytesseract.image_to_string(image).strip()
            image_url = str(upload_image_to_hf(image, f"image_{i}_{timestamp}"))

            if image_url.startswith("Error"):
                parts.append(f"**Image {i+1} Error:** {image_url}\n\n")
                continue

            # Embed the uploaded image so it is actually linked in the output.
            parts.append(f"![Image {i+1}]({image_url})\n\n")
            if ocr_text:
                parts.append(f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n")

    return "".join(parts)
|
109 |
+
|
110 |
+
def process_pdf(pdf_input, pdf_url):
    """Process a PDF given as an upload or a URL and return Markdown.

    A non-blank ``pdf_url`` takes precedence over ``pdf_input``.

    Args:
        pdf_input: Uploaded PDF as supplied by the file input, or a falsy
            value when nothing was uploaded.
        pdf_url: URL of a PDF; may be empty, blank, or None.

    Returns:
        str: The generated Markdown, or a message starting with ``"Error"``
        when no input was given, the URL is unreachable, or extraction fails.
    """
    url = pdf_url.strip() if isinstance(pdf_url, str) else ""
    if url:
        # Process PDF from URL. Follow redirects and bound the wait so a
        # moved or unresponsive host yields an error message, not a hang
        # or an unhandled exception.
        try:
            response = requests.head(url, allow_redirects=True, timeout=15)
        except requests.RequestException as e:
            return f"Error: Invalid URL or inaccessible PDF: {url} ({e})"
        if response.status_code != 200:
            return f"Error: Invalid URL or inaccessible PDF: {url}"
        pdf_file = url
    elif pdf_input:
        # Process uploaded PDF
        pdf_file = pdf_input
    else:
        return "Error: Please provide a PDF file or URL."

    # Match the extractors' exact error prefixes so a document whose text
    # merely begins with "Error" is not mistaken for a failure.
    text = extract_text_from_pdf(pdf_file)
    if isinstance(text, str) and text.startswith("Error extracting text:"):
        return text

    # Images are only produced once text extraction has succeeded.
    images = extract_images_from_pdf(pdf_file)
    if isinstance(images, str) and images.startswith("Error extracting images:"):
        return images

    # Generate Markdown
    return format_to_markdown(text, images)
|
136 |
+
|
137 |
+
# Gradio UI: a file upload and a URL textbox feed process_pdf, and the
# returned string is shown in a Markdown component.
iface = gr.Interface(
    title="PDF to Markdown Converter",
    description="Upload a PDF file or provide a PDF URL to convert it into a Markdown document. Images and charts are extracted, uploaded to Hugging Face Hub, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved as much as possible.",
    fn=process_pdf,
    inputs=[
        gr.File(type="filepath", label="Upload PDF File"),
        gr.Textbox(placeholder="Enter the URL of the PDF", label="PDF URL"),
    ],
    outputs=gr.Markdown(label="Markdown Output"),
)
|
148 |
+
|
149 |
+
if __name__ == "__main__":
    # Start the UI only when a Hub token is configured; otherwise print how
    # to set one and do not launch.
    if HF_TOKEN:
        iface.launch()
    else:
        print("Error: Please set HF_TOKEN environment variable with your Hugging Face API token.")
|