Update app.py
app.py CHANGED
@@ -1,200 +1,214 @@
-import gradio as gr
import requests
import pdfplumber
from pdf2image import convert_from_path, convert_from_bytes
import pytesseract
from PIL import Image
-import os
-import io
-import re
-import urllib.parse
-from datetime import datetime
-from huggingface_hub import HfApi, create_repo
-import logging
-import subprocess

-#
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

-#
HF_TOKEN = os.getenv("HF_TOKEN")
-REPO_NAME = "pdf-images-extracted"
hf_api = HfApi()

def check_poppler():
    try:
-        result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True)
-        # pdftoppm -v typically prints version info to stderr
        version_info_log = result.stderr.strip() if result.stderr else result.stdout.strip()
        if version_info_log:
-
-            logger.info(f"Poppler version check: {version_info_log.splitlines()[0]}")
        else:
-            logger.info("Poppler 'pdftoppm -v' ran.")
-        # The main goal is to confirm 'pdftoppm' is executable.
-        # FileNotFoundError is the primary concern for "not found".
        return True
    except FileNotFoundError:
        logger.error("Poppler (pdftoppm command) not found. Ensure poppler-utils is installed and in PATH.")
        return False
-    except Exception as e:
        logger.error(f"An unexpected error occurred during Poppler check: {str(e)}")
        return False

def ensure_hf_dataset():
    try:
-        if not HF_TOKEN:
-            # This case should ideally be caught before attempting dataset operations
-            # However, having a check here is a good safeguard.
-            logger.error("HF_TOKEN is not set. Cannot ensure Hugging Face dataset.")
-            return "Error: HF_TOKEN is not set. Please configure it in Space secrets."
-
-        # Use hf_api instance which might be pre-configured with token, or pass token explicitly
-        # create_repo will use token from HfApi if initialized with one, or passed token, or env.
-        repo_id_obj = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
        logger.info(f"Dataset repo ensured: {repo_id_obj.repo_id}")
-        return repo_id_obj.repo_id
    except Exception as e:
-        logger.error(f"Hugging Face dataset error: {str(e)}")
-        return f"Error: Failed to access or create dataset '{REPO_NAME}': {str(e)}"

-def upload_image_to_hf(image_pil, filename_base):
-    # filename_base should not include extension, it will be added.
    repo_id_or_error = ensure_hf_dataset()
    if isinstance(repo_id_or_error, str) and repo_id_or_error.startswith("Error"):
-        return repo_id_or_error
-
-    repo_id = repo_id_or_error # Now it's confirmed to be the repo_id string

    try:
-
-
-
-
-
-
-
-
-
        file_url = hf_api.upload_file(
-            path_or_fileobj=temp_path,
            path_in_repo=repo_filename,
            repo_id=repo_id,
            repo_type="dataset",
-            token=HF_TOKEN
        )
-        os.remove(temp_path)
        logger.info(f"Successfully uploaded image: {file_url}")
        return file_url
    except Exception as e:
-        logger.error(f"Image upload error for {filename_base}: {str(e)}")
-
-        if os.path.exists(temp_path):
            try:
-                os.remove(temp_path)
            except OSError as ose:
-                logger.error(f"Error removing temp file {temp_path}: {ose}")
-        return f"Error uploading image {filename_base}: {str(e)}"

-def extract_text_from_pdf(pdf_input_source): # pdf_input_source is URL string or uploaded file object
    try:
-        if isinstance(pdf_input_source, str): # Indicates a URL
            logger.info(f"Fetching PDF from URL for text extraction: {pdf_input_source}")
-            response = requests.get(pdf_input_source, stream=True, timeout=
            response.raise_for_status()
            pdf_file_like_object = io.BytesIO(response.content)
            logger.info("PDF downloaded successfully from URL.")
-        else: # Uploaded file object
-            logger.info(f"Processing uploaded PDF file for text extraction.")
            pdf_file_like_object = pdf_input_source

        with pdfplumber.open(pdf_file_like_object) as pdf:
            full_text = ""
            for i, page in enumerate(pdf.pages):
-
-
-                full_text += page_text + "\n\n" # Add double newline as page separator
-
-                logger.debug(f"Extracting tables from page {i+1}")
                tables = page.extract_tables()
                if tables:
-                    for table_data in tables:
-
-
-
-
-
-
        logger.info("Text and table extraction successful.")
-        return full_text
    except Exception as e:
        logger.error(f"Text extraction error: {str(e)}", exc_info=True)
        return f"Error extracting text: {str(e)}"

-def extract_images_from_pdf(pdf_input_source): # pdf_input_source is URL string or uploaded file object
    if not check_poppler():
        return "Error: poppler-utils not found or not working correctly. Image extraction depends on it."

    try:
-
-        if isinstance(pdf_input_source, str): # Indicates a URL
            logger.info(f"Fetching PDF from URL for image extraction: {pdf_input_source}")
-            response = requests.get(pdf_input_source, stream=True, timeout=
            response.raise_for_status()
-            logger.info("PDF downloaded successfully, converting to images.")
-
-
-
-
-
-
-
-            images = convert_from_path(file_path, dpi=200)

-        logger.info(f"Successfully extracted {len(images)} image(s) from PDF.")
-        return images
    except Exception as e:
        logger.error(f"Image extraction error: {str(e)}", exc_info=True)
        return f"Error extracting images: {str(e)}"

-def format_to_markdown(text_content, images_list):
-    markdown_output = "# Extracted PDF Content\n\n"
-
-    # Normalize newlines: multiple consecutive newlines become a single blank line (two \n chars)
-    text_content = re.sub(r'\n\s*\n+', '\n\n', text_content.strip())
-
-    lines = text_content.split('\n') # Split by single newline. Blank lines between paragraphs become empty strings.
-
-    for i, line_text in enumerate(lines):
-        line_stripped = line_text.strip()
-
-        if not line_stripped: # Handle blank lines explicitly
-            # Add a single newline to markdown. This helps maintain paragraph separation.
-            markdown_output += "\n"
-            continue
-
-        # Regex for various list markers: "1.", "*", "-", "+" followed by space and content
-        list_match = re.match(r'^\s*(?:(?:\d+\.)|[*+-])\s+(.*)', line_stripped)
-
-        is_heading_candidate = line_stripped.isupper() and 5 < len(line_stripped) < 100 # Length constraint for ALL CAPS headings
-
-        if is_heading_candidate and not list_match: # Check it's not an ALL CAPS list item
-            markdown_output += f"## {line_stripped}\n\n"
-        elif list_match:
-            list_item_text = list_match.group(1) # Get the content part of the list item
-            markdown_output += f"- {list_item_text}\n" # Single newline for list items to keep them together
-        else:
-            # Default: treat as a paragraph line, add double newline for Markdown paragraph
-            markdown_output += f"{line_text}\n\n"

-
-    markdown_output =
-

-    if isinstance(images_list, list) and images_list:
        markdown_output += "## Extracted Images\n\n"
-        for i, img_pil in enumerate(images_list):
            ocr_text = ""
            try:
                ocr_text = pytesseract.image_to_string(img_pil).strip()
@@ -203,131 +217,138 @@ def format_to_markdown(text_content, images_list):
                logger.error(f"OCR error for image {i+1}: {str(ocr_e)}")
                ocr_text = f"OCR failed: {str(ocr_e)}"

-
-

-
-
-            if ocr_text and not ocr_text.startswith("OCR failed:"):
-                markdown_output += f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n"
-            elif ocr_text: # OCR failed message
-                markdown_output += f"**Image {i+1} OCR Note:** {ocr_text}\n\n"
-
-            else: # Error during upload or from ensure_hf_dataset
-                error_message = str(image_url_or_error) # Ensure it's a string
-                markdown_output += f"**Image {i+1} (Upload Error):** {error_message}\n\n"

    return markdown_output.strip()


-def process_pdf(pdf_file_upload, pdf_url_input):
-
-

-
-
-
-
-    # Let's allow text extraction to proceed but warn about images.
-    # For a stricter approach, uncomment return:
-    # return current_status, current_status
-
-    pdf_input_source = None
-
-    if pdf_url_input and pdf_url_input.strip():
-        resolved_url = urllib.parse.unquote(pdf_url_input.strip())
-        current_status = f"Attempting to download PDF from URL: {resolved_url}"
-        logger.info(current_status)
-        try:
-            # Use HEAD request to check URL validity and content type quickly
-            response = requests.head(resolved_url, allow_redirects=True, timeout=10)
-            response.raise_for_status()
-            content_type = response.headers.get('content-type', '').lower()
-            if 'application/pdf' not in content_type:
-                current_status = f"Error: URL does not point to a PDF file (Content-Type: {content_type})."
-                logger.error(current_status)
-                return current_status, current_status
-            pdf_input_source = resolved_url # Use the URL string as the source
-            logger.info("PDF URL validated.")
-        except requests.RequestException as e:
-            current_status = f"Error accessing URL '{resolved_url}': {str(e)}"
-            logger.error(current_status)
-            return current_status, current_status
-    elif pdf_file_upload:
-        # pdf_file_upload is a tempfile._TemporaryFileWrapper object from Gradio
-        pdf_input_source = pdf_file_upload
-        current_status = f"Processing uploaded PDF file: {pdf_file_upload.name}"
-        logger.info(current_status)
-    else:
-        current_status = "Error: No PDF file uploaded and no PDF URL provided."
-        logger.error(current_status)
-        return current_status, current_status
-
-    current_status = "Extracting text and tables from PDF..."
-    logger.info(current_status)
-    extracted_text = extract_text_from_pdf(pdf_input_source)
-    if isinstance(extracted_text, str) and extracted_text.startswith("Error extracting text:"):
-        current_status = f"Text extraction failed. {extracted_text}"
-        logger.error(current_status)
-        # Decide if to stop or continue for images
-        # For now, let's return the error directly
-        return extracted_text, current_status
-
-    # If pdf_input_source was a URL, extract_text_from_pdf already downloaded it.
-    # For extract_images_from_pdf, we need to pass the URL or file path again.
-    # If it was an uploaded file, its stream might have been consumed or pointer moved.
-    # It's safer to re-open/re-access for different libraries if they don't handle streams well.
-    # However, pdfplumber and pdf2image should handle file paths/objects correctly.
-    # If pdf_input_source is a file object, reset its read pointer if necessary.
-    if hasattr(pdf_input_source, 'seek') and not isinstance(pdf_input_source, str):
-        pdf_input_source.seek(0)
-
-    current_status = "Extracting images from PDF..."
-    logger.info(current_status)
-    extracted_images = extract_images_from_pdf(pdf_input_source)
-    if isinstance(extracted_images, str) and extracted_images.startswith("Error"): # Error string from extraction
-        current_status = f"Image extraction failed or partially failed. {extracted_images}"
-        logger.warning(current_status) # Warning, as text might still be useful
-        # We can proceed to format markdown with text and image error.
-        # Set images to empty list to avoid error in format_to_markdown
-        extracted_images = [] # Or pass the error string to be included by format_to_markdown
-        # Let format_to_markdown handle this, for now, we will pass the error string if it happened
-        # No, format_to_markdown expects a list of images or an error string from check_poppler
-        # if isinstance(extracted_images, str) -> it's an error string, that is fine.

-    markdown_result = format_to_markdown(extracted_text, extracted_images)
-
-
-
-
-
-    logger.info(current_status)
-    return markdown_result, current_status
-
-# Gradio Interface
-iface = gr.Interface(
-    fn=process_pdf,
-    inputs=[
-        gr.File(label="Upload PDF File", file_types=[".pdf"]),
-        gr.Textbox(label="Or Enter PDF URL", placeholder="e.g., https://example.com/file.pdf"),
-    ],
-    outputs=[
-        gr.Markdown(label="Markdown Output"),
-        gr.Textbox(label="Processing Status", interactive=False),
-    ],
-    title="PDF to Markdown Converter",
-    description="Convert a PDF (uploaded file or URL) to Markdown. Extracts text, tables, and images. Images are uploaded to a Hugging Face dataset. Requires HF_TOKEN in Spaces Secrets for image functionality.",
-    allow_flagging="never",
-)
-
-if __name__ == "__main__":
-    logger.info("Starting Gradio app...")
    try:
-
-
-
-
    except Exception as e:
-        logger.error(f"
-
-
+import os
+import io
+import re
+import logging
+import subprocess
+from datetime import datetime
+import urllib.parse
+import tempfile
+
+from flask import Flask, request, render_template, redirect, url_for
+from werkzeug.utils import secure_filename # For secure file handling
+
import requests
import pdfplumber
from pdf2image import convert_from_path, convert_from_bytes
import pytesseract
from PIL import Image
+from huggingface_hub import HfApi, create_repo, HfHubHTTPError
+
+# --- Flask App Initialization ---
+app = Flask(__name__)
+app.config['UPLOAD_FOLDER'] = tempfile.gettempdir() # Use system temp dir
+app.config['MAX_CONTENT_LENGTH'] = 30 * 1024 * 1024 # 30 MB limit for uploads

+# --- Logging Configuration ---
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

+# --- Hugging Face Configuration ---
HF_TOKEN = os.getenv("HF_TOKEN")
+HF_DATASET_REPO_NAME = os.getenv("HF_DATASET_REPO_NAME", "pdf-images-extracted") # Allow override via env var
hf_api = HfApi()

+
+# --- PDF Processing Helper Functions (Adapted from Gradio version) ---
+
def check_poppler():
    try:
+        result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True, check=False)
        version_info_log = result.stderr.strip() if result.stderr else result.stdout.strip()
        if version_info_log:
+            logger.info(f"Poppler version check: {version_info_log.splitlines()[0] if version_info_log else 'No version output'}")
        else:
+            logger.info("Poppler 'pdftoppm -v' ran. Assuming Poppler is present.")
        return True
    except FileNotFoundError:
        logger.error("Poppler (pdftoppm command) not found. Ensure poppler-utils is installed and in PATH.")
        return False
+    except Exception as e:
        logger.error(f"An unexpected error occurred during Poppler check: {str(e)}")
        return False

def ensure_hf_dataset():
+    if not HF_TOKEN:
+        logger.warning("HF_TOKEN is not set. Cannot ensure Hugging Face dataset. Image uploads will fail.")
+        return "Error: HF_TOKEN is not set. Please configure it in Space secrets for image uploads."
    try:
+        repo_id_obj = create_repo(repo_id=HF_DATASET_REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
        logger.info(f"Dataset repo ensured: {repo_id_obj.repo_id}")
+        return repo_id_obj.repo_id
+    except HfHubHTTPError as e:
+        if e.response.status_code == 409: # Conflict, repo already exists
+            logger.info(f"Dataset repo '{HF_DATASET_REPO_NAME}' already exists.")
+            return f"{hf_api.whoami(token=HF_TOKEN)['name']}/{HF_DATASET_REPO_NAME}" # Construct repo_id
+        logger.error(f"Hugging Face dataset error (HTTP {e.response.status_code}): {str(e)}")
+        return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}': {str(e)}"
    except Exception as e:
+        logger.error(f"Hugging Face dataset error: {str(e)}", exc_info=True)
+        return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}': {str(e)}"
+

+def upload_image_to_hf(image_pil, filename_base):
    repo_id_or_error = ensure_hf_dataset()
    if isinstance(repo_id_or_error, str) and repo_id_or_error.startswith("Error"):
+        return repo_id_or_error

+    repo_id = repo_id_or_error
+    temp_image_path = None
    try:
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+        repo_filename = f"images/{filename_base}_{timestamp}.png" # Path in repo
+
+        # Save PIL image to a temporary file to upload
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".png", dir=app.config['UPLOAD_FOLDER']) as tmp_file:
+            temp_image_path = tmp_file.name
+            image_pil.save(temp_image_path, format="PNG")
+
+        logger.info(f"Attempting to upload {temp_image_path} to {repo_id}/{repo_filename}")
        file_url = hf_api.upload_file(
+            path_or_fileobj=temp_image_path,
            path_in_repo=repo_filename,
            repo_id=repo_id,
            repo_type="dataset",
+            token=HF_TOKEN
        )
        logger.info(f"Successfully uploaded image: {file_url}")
        return file_url
    except Exception as e:
+        logger.error(f"Image upload error for {filename_base}: {str(e)}", exc_info=True)
+        return f"Error uploading image {filename_base}: {str(e)}"
+    finally:
+        if temp_image_path and os.path.exists(temp_image_path):
            try:
+                os.remove(temp_image_path)
            except OSError as ose:
+                logger.error(f"Error removing temp image file {temp_image_path}: {ose}")

+def extract_text_from_pdf(pdf_input_source): # pdf_input_source is URL string or local file path
    try:
+        pdf_file_like_object = None
+        if isinstance(pdf_input_source, str) and pdf_input_source.startswith(('http://', 'https://')):
            logger.info(f"Fetching PDF from URL for text extraction: {pdf_input_source}")
+            response = requests.get(pdf_input_source, stream=True, timeout=30)
            response.raise_for_status()
            pdf_file_like_object = io.BytesIO(response.content)
            logger.info("PDF downloaded successfully from URL.")
+        elif isinstance(pdf_input_source, str) and os.path.exists(pdf_input_source): # Local file path
+            logger.info(f"Processing local PDF file for text extraction: {pdf_input_source}")
+            # pdfplumber.open can take a path directly
            pdf_file_like_object = pdf_input_source
+        else:
+            logger.error(f"Invalid pdf_input_source for text extraction: {pdf_input_source}")
+            return "Error: Invalid input for PDF text extraction (must be URL or valid file path)."

        with pdfplumber.open(pdf_file_like_object) as pdf:
            full_text = ""
            for i, page in enumerate(pdf.pages):
+                page_text = page.extract_text(layout=True, x_density=1, y_density=1) or ""
+                full_text += page_text + "\n\n"
                tables = page.extract_tables()
                if tables:
+                    for table_data in tables:
+                        if table_data:
+                            header = [" | ".join(str(cell) if cell is not None else "" for cell in table_data[0])]
+                            separator = [" | ".join(["---"] * len(table_data[0]))]
+                            body = [" | ".join(str(cell) if cell is not None else "" for cell in row) for row in table_data[1:]]
+                            table_md_lines = header + separator + body
+                            full_text += f"**Table:**\n" + "\n".join(table_md_lines) + "\n\n"
        logger.info("Text and table extraction successful.")
+        return full_text.strip()
+    except requests.RequestException as e:
+        logger.error(f"URL fetch error for text extraction: {str(e)}", exc_info=True)
+        return f"Error fetching PDF from URL: {str(e)}"
    except Exception as e:
        logger.error(f"Text extraction error: {str(e)}", exc_info=True)
        return f"Error extracting text: {str(e)}"

+def extract_images_from_pdf(pdf_input_source): # pdf_input_source is URL string or local file path
    if not check_poppler():
        return "Error: poppler-utils not found or not working correctly. Image extraction depends on it."

+    images_pil = []
    try:
+        if isinstance(pdf_input_source, str) and pdf_input_source.startswith(('http://', 'https://')):
            logger.info(f"Fetching PDF from URL for image extraction: {pdf_input_source}")
+            response = requests.get(pdf_input_source, stream=True, timeout=30)
            response.raise_for_status()
+            logger.info("PDF downloaded successfully from URL, converting to images.")
+            images_pil = convert_from_bytes(response.content, dpi=200)
+        elif isinstance(pdf_input_source, str) and os.path.exists(pdf_input_source): # Local file path
+            logger.info(f"Processing local PDF file for image extraction: {pdf_input_source}")
+            images_pil = convert_from_path(pdf_input_source, dpi=200)
+        else:
+            logger.error(f"Invalid pdf_input_source for image extraction: {pdf_input_source}")
+            return "Error: Invalid input for PDF image extraction (must be URL or valid file path)."

+        logger.info(f"Successfully extracted {len(images_pil)} image(s) from PDF.")
+        return images_pil
+    except requests.RequestException as e:
+        logger.error(f"URL fetch error for image extraction: {str(e)}", exc_info=True)
+        return f"Error fetching PDF from URL for image extraction: {str(e)}"
    except Exception as e:
        logger.error(f"Image extraction error: {str(e)}", exc_info=True)
        return f"Error extracting images: {str(e)}"


+def format_to_markdown(text_content, images_input):
+    markdown_output = "# Extracted PDF Content\n\n"
+    if text_content.startswith("Error"): # If text extraction itself failed
+        markdown_output += f"**Text Extraction Note:**\n{text_content}\n\n"
+    else:
+        text_content = re.sub(r'\n\s*\n+', '\n\n', text_content.strip())
+        lines = text_content.split('\n')
+        is_in_list = False
+        for line_text in lines:
+            line_stripped = line_text.strip()
+            if not line_stripped:
+                markdown_output += "\n"
+                is_in_list = False
+                continue
+            list_match = re.match(r'^\s*(?:(?:\d+\.)|[*+-])\s+(.*)', line_stripped)
+            is_heading_candidate = line_stripped.isupper() and 5 < len(line_stripped) < 100
+            if is_heading_candidate and not list_match:
+                markdown_output += f"## {line_stripped}\n\n"
+                is_in_list = False
+            elif list_match:
+                list_item_text = list_match.group(1)
+                markdown_output += f"- {list_item_text}\n"
+                is_in_list = True
+            else:
+                if is_in_list: markdown_output += "\n"
+                markdown_output += f"{line_text}\n\n"
+                is_in_list = False
+        markdown_output = re.sub(r'\n\s*\n+', '\n\n', markdown_output.strip()) + "\n\n"

+    if isinstance(images_input, list) and images_input:
        markdown_output += "## Extracted Images\n\n"
+        if not HF_TOKEN:
+            markdown_output += "**Note:** `HF_TOKEN` not set. Images were extracted but not uploaded to Hugging Face Hub.\n\n"
+
+        for i, img_pil in enumerate(images_input):
            ocr_text = ""
            try:
                ocr_text = pytesseract.image_to_string(img_pil).strip()
            except Exception as ocr_e:
                logger.error(f"OCR error for image {i+1}: {str(ocr_e)}")
                ocr_text = f"OCR failed: {str(ocr_e)}"

+            if HF_TOKEN: # Only attempt upload if token is present
+                image_filename_base = f"extracted_image_{i+1}"
+                image_url_or_error = upload_image_to_hf(img_pil, image_filename_base)
+                if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
+                    markdown_output += f"![Image {i+1}]({image_url_or_error})\n"
+                else:
+                    markdown_output += f"**Image {i+1} (Upload Error):** {str(image_url_or_error)}\n\n"
+            else: # No token, show placeholder or local info if we were saving them locally
+                markdown_output += f"**Image {i+1} (not uploaded due to missing HF_TOKEN)**\n"
+
+            if ocr_text:
+                markdown_output += f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n"

+    elif isinstance(images_input, str) and images_input.startswith("Error"):
+        markdown_output += f"## Image Extraction Note\n\n{images_input}\n\n"

    return markdown_output.strip()

+# --- Flask Routes ---

+@app.route('/', methods=['GET'])
+def index():
+    return render_template('index.html')

|
245 |
+
def process_pdf_route():
|
246 |
+
pdf_file = request.files.get('pdf_file')
|
247 |
+
pdf_url = request.form.get('pdf_url', '').strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
248 |
|
249 |
+
status_message = "Starting PDF processing..."
|
250 |
+
error_message = None
|
251 |
+
markdown_output = None
|
252 |
+
temp_pdf_path = None
|
253 |
+
pdf_input_source = None # This will be a URL string or a local file path
|
254 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
255 |
try:
|
256 |
+
if pdf_file and pdf_file.filename:
|
257 |
+
if not pdf_file.filename.lower().endswith('.pdf'):
|
258 |
+
raise ValueError("Uploaded file is not a PDF.")
|
259 |
+
|
260 |
+
filename = secure_filename(pdf_file.filename)
|
261 |
+
# Save to a temporary file
|
262 |
+
fd, temp_pdf_path = tempfile.mkstemp(suffix=".pdf", prefix="upload_", dir=app.config['UPLOAD_FOLDER'])
|
263 |
+
os.close(fd) # close file descriptor from mkstemp
|
264 |
+
pdf_file.save(temp_pdf_path)
|
265 |
+
logger.info(f"Uploaded PDF saved to temporary path: {temp_pdf_path}")
|
266 |
+
pdf_input_source = temp_pdf_path
|
267 |
+
status_message = f"Processing uploaded PDF: {filename}"
|
268 |
+
|
269 |
+
elif pdf_url:
|
270 |
+
pdf_url = urllib.parse.unquote(pdf_url)
|
271 |
+
# Basic URL validation
|
272 |
+
if not (pdf_url.startswith('http://') or pdf_url.startswith('https://')):
|
273 |
+
raise ValueError("Invalid URL scheme. Must be http or https.")
|
274 |
+
if not pdf_url.lower().endswith('.pdf'):
|
275 |
+
logger.warning(f"URL {pdf_url} does not end with .pdf. Proceeding with caution.")
|
276 |
+
# Allow proceeding but log warning, actual check is content-type or processing error
|
277 |
+
|
278 |
+
# Quick check with HEAD request (optional, but good practice)
|
279 |
+
try:
|
280 |
+
head_resp = requests.head(pdf_url, allow_redirects=True, timeout=10)
|
281 |
+
head_resp.raise_for_status()
|
282 |
+
content_type = head_resp.headers.get('content-type', '').lower()
|
283 |
+
if 'application/pdf' not in content_type:
|
284 |
+
logger.warning(f"URL {pdf_url} content-type is '{content_type}', not 'application/pdf'.")
|
285 |
+
# Depending on strictness, could raise ValueError here
|
286 |
+
except requests.RequestException as re:
|
287 |
+
logger.error(f"Failed HEAD request for URL {pdf_url}: {re}")
|
288 |
+
# Proceed, main request in extract functions will handle final failure
|
289 |
+
|
290 |
+
pdf_input_source = pdf_url
|
291 |
+
status_message = f"Processing PDF from URL: {pdf_url}"
|
292 |
+
else:
|
293 |
+
raise ValueError("No PDF file uploaded and no PDF URL provided.")
|
294 |
+
|
295 |
+
# --- Core Processing ---
|
296 |
+
status_message += "\nExtracting text..."
|
297 |
+
logger.info(status_message)
|
298 |
+
extracted_text = extract_text_from_pdf(pdf_input_source)
|
299 |
+
if isinstance(extracted_text, str) and extracted_text.startswith("Error"):
|
300 |
+
# Let format_to_markdown handle displaying this error within its structure
|
301 |
+
logger.error(f"Text extraction resulted in error: {extracted_text}")
|
302 |
+
|
303 |
+
status_message += "\nExtracting images..."
|
304 |
+
logger.info(status_message)
|
305 |
+
extracted_images = extract_images_from_pdf(pdf_input_source) # list of PIL images or error string
|
306 |
+
if isinstance(extracted_images, str) and extracted_images.startswith("Error"):
|
307 |
+
logger.error(f"Image extraction resulted in error: {extracted_images}")
|
308 |
+
|
309 |
+
status_message += "\nFormatting to Markdown..."
|
310 |
+
logger.info(status_message)
|
311 |
+
markdown_output = format_to_markdown(extracted_text, extracted_images)
|
312 |
+
|
313 |
+
status_message = "Processing complete."
|
314 |
+
if isinstance(extracted_text, str) and extracted_text.startswith("Error"):
|
315 |
+
status_message += f" (Text extraction issues: {extracted_text.split(':', 1)[1].strip()})"
|
316 |
+
if isinstance(extracted_images, str) and extracted_images.startswith("Error"):
|
317 |
+
status_message += f" (Image extraction issues: {extracted_images.split(':', 1)[1].strip()})"
|
318 |
+
if not HF_TOKEN and isinstance(extracted_images, list) and extracted_images:
|
319 |
+
status_message += " (Note: HF_TOKEN not set, images not uploaded to Hub)"
|
320 |
+
|
321 |
+
|
322 |
+
except ValueError as ve:
|
323 |
+
logger.error(f"Input validation error: {str(ve)}")
|
324 |
+
error_message = str(ve)
|
325 |
+
status_message = "Processing failed."
|
326 |
except Exception as e:
|
327 |
+
logger.error(f"An unexpected error occurred during processing: {str(e)}", exc_info=True)
|
328 |
+
error_message = f"An unexpected error occurred: {str(e)}"
|
329 |
+
status_message = "Processing failed due to an unexpected error."
|
330 |
+
finally:
|
331 |
+
if temp_pdf_path and os.path.exists(temp_pdf_path):
|
332 |
+
try:
|
333 |
+
os.remove(temp_pdf_path)
|
334 |
+
logger.info(f"Removed temporary PDF: {temp_pdf_path}")
|
335 |
+
except OSError as ose:
|
336 |
+
logger.error(f"Error removing temporary PDF {temp_pdf_path}: {ose}")
|
337 |
+
|
338 |
+
return render_template('index.html',
|
339 |
+
markdown_output=markdown_output,
|
340 |
+
status_message=status_message,
|
341 |
+
error_message=error_message)
|
342 |
+
|
343 |
+
|
344 |
+
+# --- Main Execution ---
+if __name__ == '__main__':
+    # This is for local development. For Hugging Face Spaces, Gunicorn is used via Dockerfile CMD.
+    # Poppler check at startup for local dev convenience
+    if not check_poppler():
+        logger.warning("Poppler utilities might not be installed correctly. PDF processing might fail.")
+
+    # Ensure UPLOAD_FOLDER exists
+    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+
+    app.run(host='0.0.0.0', port=int(os.getenv("PORT", 7860)), debug=True)
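
The table handling in the new `extract_text_from_pdf` converts each `page.extract_tables()` result into pipe-delimited Markdown: a header row, a `---` separator row, then the body rows. A standalone sketch of that same transformation, using hypothetical sample data in the nested-list shape `pdfplumber` returns:

```python
# Standalone sketch of the table-to-Markdown step in extract_text_from_pdf.
# `table_data` is hypothetical sample data mimicking pdfplumber's
# page.extract_tables() output: a list of rows, each a list of cell
# values (None for empty cells).
table_data = [
    ["Name", "Qty", None],
    ["Widget", "2", "in stock"],
]

# First row becomes the header, a "---" row separates it, the rest is the body.
header = " | ".join(str(cell) if cell is not None else "" for cell in table_data[0])
separator = " | ".join(["---"] * len(table_data[0]))
body = [" | ".join(str(cell) if cell is not None else "" for cell in row) for row in table_data[1:]]
print("\n".join([header, separator] + body))
# Prints the header row, the --- separator, then "Widget | 2 | in stock"
```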
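`format_to_markdown` classifies each text line with two heuristics: a list-marker regex (`1.`, `*`, `-`, `+` followed by whitespace) and an ALL-CAPS heading check bounded to 6–99 characters, with the heading branch taken only when the line is not also a list item. A quick standalone check of the same rules, with hypothetical sample lines:

```python
import re

# Same regex and length bounds as format_to_markdown above.
LIST_RE = re.compile(r'^\s*(?:(?:\d+\.)|[*+-])\s+(.*)')

def classify(line: str) -> str:
    s = line.strip()
    if not s:
        return "blank"
    m = LIST_RE.match(s)
    heading = s.isupper() and 5 < len(s) < 100
    if heading and not m:
        return "heading"          # ALL CAPS, not a list item
    if m:
        return "list item: " + m.group(1)
    return "paragraph"

# "1. FIRST ITEM" is ALL CAPS too, but the list match takes precedence.
for sample in ["INTRODUCTION AND SCOPE", "1. FIRST ITEM", "- bullet text", "Plain sentence."]:
    print(f"{sample!r} -> {classify(sample)}")
```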
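The `/process` route accepts either a multipart upload in the `pdf_file` field or a `pdf_url` form field, and responds with the rendered `index.html` template (which this commit assumes exists under `templates/`). A minimal client sketch using `requests`, assuming the app is running locally on its default port 7860; the file path and URL below are placeholders:

```python
import requests

BASE = "http://localhost:7860"  # app.run's default port above

# Multipart upload; the field name must match request.files.get('pdf_file').
with open("sample.pdf", "rb") as fh:  # placeholder path
    resp = requests.post(f"{BASE}/process",
                         files={"pdf_file": ("sample.pdf", fh, "application/pdf")})
print(resp.status_code)  # the route responds with rendered HTML, not JSON

# Alternatively, submit a remote PDF via the 'pdf_url' form field.
resp = requests.post(f"{BASE}/process",
                     data={"pdf_url": "https://example.com/file.pdf"})
print(resp.status_code)
```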