broadfield-dev committed on
Commit
9bc382d
·
verified ·
1 Parent(s): 052b496

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +262 -241
app.py CHANGED
@@ -1,200 +1,214 @@
1
- import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
2
  import requests
3
  import pdfplumber
4
  from pdf2image import convert_from_path, convert_from_bytes
5
  import pytesseract
6
  from PIL import Image
7
- import io
8
- import os
9
- from huggingface_hub import HfApi, create_repo
10
- import re
11
- from datetime import datetime
12
- import urllib.parse
13
- import logging
14
- import subprocess
15
 
16
- # Set up logging
17
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
18
  logger = logging.getLogger(__name__)
19
 
20
- # Initialize Hugging Face API
21
  HF_TOKEN = os.getenv("HF_TOKEN")
22
- REPO_NAME = "pdf-images-extracted" # Consider making this configurable if needed
23
  hf_api = HfApi()
24
 
 
 
 
25
  def check_poppler():
26
  try:
27
- result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True)
28
- # pdftoppm -v typically prints version info to stderr
29
  version_info_log = result.stderr.strip() if result.stderr else result.stdout.strip()
30
  if version_info_log:
31
- # Log the first line of the version info
32
- logger.info(f"Poppler version check: {version_info_log.splitlines()[0]}")
33
  else:
34
- logger.info("Poppler 'pdftoppm -v' ran, but no version output on stdout/stderr. Poppler is likely present.")
35
- # The main goal is to confirm 'pdftoppm' is executable.
36
- # FileNotFoundError is the primary concern for "not found".
37
  return True
38
  except FileNotFoundError:
39
  logger.error("Poppler (pdftoppm command) not found. Ensure poppler-utils is installed and in PATH.")
40
  return False
41
- except Exception as e: # Catch any other unexpected errors during subprocess execution
42
  logger.error(f"An unexpected error occurred during Poppler check: {str(e)}")
43
  return False
44
 
45
  def ensure_hf_dataset():
 
 
 
46
  try:
47
- if not HF_TOKEN:
48
- # This case should ideally be caught before attempting dataset operations
49
- # However, having a check here is a good safeguard.
50
- logger.error("HF_TOKEN is not set. Cannot ensure Hugging Face dataset.")
51
- return "Error: HF_TOKEN is not set. Please configure it in Space secrets."
52
-
53
- # Use hf_api instance which might be pre-configured with token, or pass token explicitly
54
- # create_repo will use token from HfApi if initialized with one, or passed token, or env.
55
- repo_id_obj = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
56
  logger.info(f"Dataset repo ensured: {repo_id_obj.repo_id}")
57
- return repo_id_obj.repo_id # repo_id_obj is a RepoUrl object or similar
 
 
 
 
 
 
58
  except Exception as e:
59
- logger.error(f"Hugging Face dataset error: {str(e)}")
60
- return f"Error: Failed to access or create dataset '{REPO_NAME}': {str(e)}"
 
61
 
62
- def upload_image_to_hf(image, filename_base):
63
- # filename_base should not include extension, it will be added.
64
  repo_id_or_error = ensure_hf_dataset()
65
  if isinstance(repo_id_or_error, str) and repo_id_or_error.startswith("Error"):
66
- return repo_id_or_error # Return error message from ensure_hf_dataset
67
-
68
- repo_id = repo_id_or_error # Now it's confirmed to be the repo_id string
69
 
 
 
70
  try:
71
- # Create a unique filename with timestamp in the repo to avoid collisions
72
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f") # Added microseconds for more uniqueness
73
- repo_filename = f"images/{filename_base}_{timestamp}.png"
74
-
75
- temp_path = f"/tmp/{filename_base}_{timestamp}.png" # Use unique temp name too
76
- image.save(temp_path, format="PNG")
77
-
78
- logger.info(f"Attempting to upload {temp_path} to {repo_id}/{repo_filename}")
79
-
80
  file_url = hf_api.upload_file(
81
- path_or_fileobj=temp_path,
82
  path_in_repo=repo_filename,
83
  repo_id=repo_id,
84
  repo_type="dataset",
85
- token=HF_TOKEN # Explicitly pass token for clarity
86
  )
87
- os.remove(temp_path)
88
  logger.info(f"Successfully uploaded image: {file_url}")
89
  return file_url
90
  except Exception as e:
91
- logger.error(f"Image upload error for {filename_base}: {str(e)}")
92
- # Clean up temp file if it exists and an error occurred after its creation
93
- if 'temp_path' in locals() and os.path.exists(temp_path):
 
94
  try:
95
- os.remove(temp_path)
96
  except OSError as ose:
97
- logger.error(f"Error removing temp file {temp_path} after upload failure: {ose}")
98
- return f"Error uploading image {filename_base}: {str(e)}"
99
 
100
- def extract_text_from_pdf(pdf_input_source): # Renamed for clarity (source can be path, URL, or file obj)
101
  try:
102
- if isinstance(pdf_input_source, str): # Indicates a URL
 
103
  logger.info(f"Fetching PDF from URL for text extraction: {pdf_input_source}")
104
- response = requests.get(pdf_input_source, stream=True, timeout=20) # Increased timeout slightly
105
  response.raise_for_status()
106
  pdf_file_like_object = io.BytesIO(response.content)
107
  logger.info("PDF downloaded successfully from URL.")
108
- else: # Assumes a file object (e.g., from Gradio upload)
109
- logger.info(f"Processing uploaded PDF file for text extraction: {getattr(pdf_input_source, 'name', 'N/A')}")
 
110
  pdf_file_like_object = pdf_input_source
 
 
 
111
 
112
  with pdfplumber.open(pdf_file_like_object) as pdf:
113
  full_text = ""
114
  for i, page in enumerate(pdf.pages):
115
- logger.debug(f"Extracting text from page {i+1}")
116
- page_text = page.extract_text(layout=True, x_density=1, y_density=1) or "" # x_density/y_density can impact layout accuracy
117
- full_text += page_text + "\n\n" # Add double newline as page separator
118
-
119
- logger.debug(f"Extracting tables from page {i+1}")
120
  tables = page.extract_tables()
121
  if tables:
122
- for table_idx, table_data in enumerate(tables):
123
- logger.debug(f"Processing table {table_idx+1} on page {i+1}")
124
- if table_data: # Ensure table_data is not empty
125
- table_md = "\n".join([" | ".join(str(cell) if cell is not None else "" for cell in row) for row in table_data])
126
- header_separator = " | ".join(["---"] * len(table_data[0])) if table_data[0] else ""
127
- full_text += f"**Table:**\n{table_md[:table_md.find(chr(10)) if table_md.find(chr(10)) > 0 else len(table_md)]}\n{header_separator}\n{table_md[table_md.find(chr(10))+1 if table_md.find(chr(10)) > 0 else '']}\n\n"
128
- # full_text += f"**Table:**\n{table_md}\n\n" # Simpler table version
129
  logger.info("Text and table extraction successful.")
130
- return full_text
 
 
 
131
  except Exception as e:
132
  logger.error(f"Text extraction error: {str(e)}", exc_info=True)
133
  return f"Error extracting text: {str(e)}"
134
 
135
- def extract_images_from_pdf(pdf_input_source): # Renamed for clarity
136
  if not check_poppler():
137
  return "Error: poppler-utils not found or not working correctly. Image extraction depends on it."
138
 
 
139
  try:
140
- images = []
141
- if isinstance(pdf_input_source, str): # Indicates a URL
142
  logger.info(f"Fetching PDF from URL for image extraction: {pdf_input_source}")
143
- response = requests.get(pdf_input_source, stream=True, timeout=20) # Increased timeout
144
  response.raise_for_status()
145
- logger.info("PDF downloaded successfully, converting to images.")
146
- images = convert_from_bytes(response.content, dpi=200) # dpi can be adjusted
147
- else: # Assumes a file object (e.g., from Gradio upload which is a TemporaryFileWrapper)
148
- file_path = getattr(pdf_input_source, 'name', None)
149
- if not file_path:
150
- logger.error("Uploaded PDF file has no name attribute, cannot process for images.")
151
- return "Error: Could not get path from uploaded PDF file for image extraction."
152
- logger.info(f"Processing uploaded PDF file for image extraction: {file_path}")
153
- images = convert_from_path(file_path, dpi=200)
154
 
155
- logger.info(f"Successfully extracted {len(images)} image(s) from PDF.")
156
- return images
 
 
 
157
  except Exception as e:
158
  logger.error(f"Image extraction error: {str(e)}", exc_info=True)
159
  return f"Error extracting images: {str(e)}"
160
 
161
- def format_to_markdown(text_content, images_list):
162
- markdown_output = "# Extracted PDF Content\n\n"
163
-
164
- # Normalize newlines: multiple consecutive newlines become a single blank line (two \n chars)
165
- text_content = re.sub(r'\n\s*\n+', '\n\n', text_content.strip())
166
-
167
- lines = text_content.split('\n') # Split by single newline. Blank lines between paragraphs become empty strings.
168
-
169
- for i, line_text in enumerate(lines):
170
- line_stripped = line_text.strip()
171
-
172
- if not line_stripped: # Handle blank lines explicitly
173
- # Add a single newline to markdown. This helps maintain paragraph separation.
174
- markdown_output += "\n"
175
- continue
176
-
177
- # Regex for various list markers: "1.", "*", "-", "+" followed by space and content
178
- list_match = re.match(r'^\s*(?:(?:\d+\.)|[*+-])\s+(.*)', line_stripped)
179
-
180
- is_heading_candidate = line_stripped.isupper() and 5 < len(line_stripped) < 100 # Length constraint for ALL CAPS headings
181
-
182
- if is_heading_candidate and not list_match: # Check it's not an ALL CAPS list item
183
- markdown_output += f"## {line_stripped}\n\n"
184
- elif list_match:
185
- list_item_text = list_match.group(1) # Get the content part of the list item
186
- markdown_output += f"- {list_item_text}\n" # Single newline for list items to keep them together
187
- else:
188
- # Default: treat as a paragraph line, add double newline for Markdown paragraph
189
- markdown_output += f"{line_text}\n\n"
190
 
191
- # Consolidate potentially excessive newlines that might arise from the logic above
192
- markdown_output = re.sub(r'\n\s*\n+', '\n\n', markdown_output.strip())
193
- markdown_output += "\n\n" # Ensure a blank line at the end of text content before images
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
- if isinstance(images_list, list) and images_list:
196
  markdown_output += "## Extracted Images\n\n"
197
- for i, img_pil in enumerate(images_list):
 
 
 
198
  ocr_text = ""
199
  try:
200
  ocr_text = pytesseract.image_to_string(img_pil).strip()
@@ -203,131 +217,138 @@ def format_to_markdown(text_content, images_list):
203
  logger.error(f"OCR error for image {i+1}: {str(ocr_e)}")
204
  ocr_text = f"OCR failed: {str(ocr_e)}"
205
 
206
- image_filename_base = f"extracted_image_{i+1}"
207
- image_url_or_error = upload_image_to_hf(img_pil, image_filename_base)
 
 
 
 
 
 
 
 
 
 
208
 
209
- if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
210
- markdown_output += f"![Image {i+1}]({image_url_or_error})\n"
211
- if ocr_text and not ocr_text.startswith("OCR failed:"):
212
- markdown_output += f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n"
213
- elif ocr_text: # OCR failed message
214
- markdown_output += f"**Image {i+1} OCR Note:** {ocr_text}\n\n"
215
-
216
- else: # Error during upload or from ensure_hf_dataset
217
- error_message = str(image_url_or_error) # Ensure it's a string
218
- markdown_output += f"**Image {i+1} (Upload Error):** {error_message}\n\n"
219
 
220
  return markdown_output.strip()
221
 
 
222
 
223
- def process_pdf(pdf_file_upload, pdf_url_input):
224
- current_status = "Starting PDF processing..."
225
- logger.info(current_status)
226
 
227
- if not HF_TOKEN:
228
- current_status = "Error: HF_TOKEN is not set. Please set it in Space secrets for image uploads."
229
- logger.error(current_status)
230
- # App can still try to process text, but image uploads will fail.
231
- # Let's allow text extraction to proceed but warn about images.
232
- # For a stricter approach, uncomment return:
233
- # return current_status, current_status
234
-
235
- pdf_input_source = None
236
-
237
- if pdf_url_input and pdf_url_input.strip():
238
- resolved_url = urllib.parse.unquote(pdf_url_input.strip())
239
- current_status = f"Attempting to download PDF from URL: {resolved_url}"
240
- logger.info(current_status)
241
- try:
242
- # Use HEAD request to check URL validity and content type quickly
243
- response = requests.head(resolved_url, allow_redirects=True, timeout=10)
244
- response.raise_for_status()
245
- content_type = response.headers.get('content-type', '').lower()
246
- if 'application/pdf' not in content_type:
247
- current_status = f"Error: URL does not point to a PDF file (Content-Type: {content_type})."
248
- logger.error(current_status)
249
- return current_status, current_status
250
- pdf_input_source = resolved_url # Use the URL string as the source
251
- logger.info("PDF URL validated.")
252
- except requests.RequestException as e:
253
- current_status = f"Error accessing URL '{resolved_url}': {str(e)}"
254
- logger.error(current_status)
255
- return current_status, current_status
256
- elif pdf_file_upload:
257
- # pdf_file_upload is a tempfile._TemporaryFileWrapper object from Gradio
258
- pdf_input_source = pdf_file_upload
259
- current_status = f"Processing uploaded PDF file: {pdf_file_upload.name}"
260
- logger.info(current_status)
261
- else:
262
- current_status = "Error: No PDF file uploaded and no PDF URL provided."
263
- logger.error(current_status)
264
- return current_status, current_status
265
-
266
- current_status = "Extracting text and tables from PDF..."
267
- logger.info(current_status)
268
- extracted_text = extract_text_from_pdf(pdf_input_source)
269
- if isinstance(extracted_text, str) and extracted_text.startswith("Error extracting text:"):
270
- current_status = f"Text extraction failed. {extracted_text}"
271
- logger.error(current_status)
272
- # Decide if to stop or continue for images
273
- # For now, let's return the error directly
274
- return extracted_text, current_status
275
-
276
- # If pdf_input_source was a URL, extract_text_from_pdf already downloaded it.
277
- # For extract_images_from_pdf, we need to pass the URL or file path again.
278
- # If it was an uploaded file, its stream might have been consumed or pointer moved.
279
- # It's safer to re-open/re-access for different libraries if they don't handle streams well.
280
- # However, pdfplumber and pdf2image should handle file paths/objects correctly.
281
- # If pdf_input_source is a file object, reset its read pointer if necessary.
282
- if hasattr(pdf_input_source, 'seek') and not isinstance(pdf_input_source, str):
283
- pdf_input_source.seek(0)
284
-
285
- current_status = "Extracting images from PDF..."
286
- logger.info(current_status)
287
- extracted_images = extract_images_from_pdf(pdf_input_source)
288
- if isinstance(extracted_images, str) and extracted_images.startswith("Error"): # Error string from extraction
289
- current_status = f"Image extraction failed or partially failed. {extracted_images}"
290
- logger.warning(current_status) # Warning, as text might still be useful
291
- # We can proceed to format markdown with text and image error.
292
- # Set images to empty list to avoid error in format_to_markdown
293
- extracted_images = [] # Or pass the error string to be included by format_to_markdown
294
- # Let format_to_markdown handle this, for now, we will pass the error string if it happened
295
- # No, format_to_markdown expects a list of images or an error string from check_poppler
296
- # if isinstance(extracted_images, str) -> it's an error string, that is fine.
297
 
298
- current_status = "Formatting content to Markdown..."
299
- logger.info(current_status)
300
- # Pass the original extracted_images (which could be an error string or list of PIL images)
301
- markdown_result = format_to_markdown(extracted_text, extracted_images)
302
-
303
- current_status = "PDF processing complete."
304
- logger.info(current_status)
305
- return markdown_result, current_status
306
-
307
- # Gradio Interface
308
- iface = gr.Interface(
309
- fn=process_pdf,
310
- inputs=[
311
- gr.File(label="Upload PDF File", file_types=[".pdf"]),
312
- gr.Textbox(label="Or Enter PDF URL", placeholder="e.g., https://example.com/file.pdf"),
313
- ],
314
- outputs=[
315
- gr.Markdown(label="Markdown Output"),
316
- gr.Textbox(label="Processing Status", interactive=False),
317
- ],
318
- title="PDF to Markdown Converter",
319
- description="Convert a PDF (uploaded file or URL) to Markdown. Extracts text, tables, and images. Images are uploaded to a Hugging Face dataset. Requires HF_TOKEN in Spaces Secrets for image functionality.",
320
- allow_flagging="never",
321
- )
322
-
323
- if __name__ == "__main__":
324
- logger.info("Starting Gradio app...")
325
  try:
326
- # When running in Hugging Face Spaces, share=False is recommended.
327
- # The Space itself provides the public URL.
328
- iface.launch(server_name="0.0.0.0", server_port=7860, share=True)
329
- logger.info("Gradio app started successfully.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330
  except Exception as e:
331
- logger.error(f"Failed to start Gradio app: {str(e)}", exc_info=True)
332
- # Re-raise the exception to ensure the script exits if Gradio fails to launch
333
- raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import re
4
+ import logging
5
+ import subprocess
6
+ from datetime import datetime
7
+ import urllib.parse
8
+ import tempfile
9
+
10
+ from flask import Flask, request, render_template, redirect, url_for
11
+ from werkzeug.utils import secure_filename # For secure file handling
12
+
13
  import requests
14
  import pdfplumber
15
  from pdf2image import convert_from_path, convert_from_bytes
16
  import pytesseract
17
  from PIL import Image
18
+ from huggingface_hub import HfApi, create_repo, HfHubHTTPError
19
+
20
# --- Flask App Initialization ---
app = Flask(__name__)
app.config.update(
    UPLOAD_FOLDER=tempfile.gettempdir(),   # scratch space for uploaded PDFs / temp images
    MAX_CONTENT_LENGTH=30 * 1024 * 1024,   # reject request bodies larger than 30 MB
)
 
 
24
 
25
# --- Logging Configuration ---
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# --- Hugging Face Configuration ---
# Token for Hub API calls; when unset, image uploads are skipped (see format_to_markdown).
HF_TOKEN = os.getenv("HF_TOKEN")
# Dataset repo that receives extracted page images; overridable via env var.
HF_DATASET_REPO_NAME = os.getenv("HF_DATASET_REPO_NAME", "pdf-images-extracted")
hf_api = HfApi()
33
 
34
+
35
+ # --- PDF Processing Helper Functions (Adapted from Gradio version) ---
36
+
37
def check_poppler():
    """Return True if the Poppler `pdftoppm` binary is runnable, False otherwise.

    pdf2image requires poppler-utils; this is a startup sanity check. The only
    hard failure is FileNotFoundError (binary missing from PATH) — any exit
    status from `pdftoppm -v` itself is treated as "present".
    """
    try:
        result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True, check=False)
        # pdftoppm prints its version banner to stderr on most builds; fall
        # back to stdout just in case.
        version_info_log = result.stderr.strip() if result.stderr else result.stdout.strip()
        if version_info_log:
            # The enclosing `if` already guarantees non-empty output, so the
            # former `... if version_info_log else 'No version output'`
            # conditional was dead code — splitlines()[0] is always safe here.
            logger.info(f"Poppler version check: {version_info_log.splitlines()[0]}")
        else:
            logger.info("Poppler 'pdftoppm -v' ran. Assuming Poppler is present.")
        return True
    except FileNotFoundError:
        logger.error("Poppler (pdftoppm command) not found. Ensure poppler-utils is installed and in PATH.")
        return False
    except Exception as e:
        logger.error(f"An unexpected error occurred during Poppler check: {str(e)}")
        return False
52
 
53
def ensure_hf_dataset():
    """Create (or confirm the existence of) the Hub dataset repo for images.

    Returns the repo id string on success, or a string starting with
    "Error: ..." describing why the repo could not be ensured.
    """
    if not HF_TOKEN:
        logger.warning("HF_TOKEN is not set. Cannot ensure Hugging Face dataset. Image uploads will fail.")
        return "Error: HF_TOKEN is not set. Please configure it in Space secrets for image uploads."

    failure_prefix = f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}': "
    try:
        ensured = create_repo(
            repo_id=HF_DATASET_REPO_NAME,
            token=HF_TOKEN,
            repo_type="dataset",
            exist_ok=True,
        )
    except HfHubHTTPError as e:
        if e.response.status_code == 409:
            # 409 Conflict: the repo already exists — reconstruct its full id.
            logger.info(f"Dataset repo '{HF_DATASET_REPO_NAME}' already exists.")
            return f"{hf_api.whoami(token=HF_TOKEN)['name']}/{HF_DATASET_REPO_NAME}"
        logger.error(f"Hugging Face dataset error (HTTP {e.response.status_code}): {str(e)}")
        return failure_prefix + str(e)
    except Exception as e:
        logger.error(f"Hugging Face dataset error: {str(e)}", exc_info=True)
        return failure_prefix + str(e)

    logger.info(f"Dataset repo ensured: {ensured.repo_id}")
    return ensured.repo_id
70
+
71
 
72
def upload_image_to_hf(image_pil, filename_base):
    """Upload a PIL image to the configured Hub dataset repo.

    Returns the uploaded file URL on success, or an "Error ..." string on
    failure (including failures propagated from ensure_hf_dataset). The local
    temp PNG is always removed, success or failure.
    """
    dataset_ref = ensure_hf_dataset()
    if isinstance(dataset_ref, str) and dataset_ref.startswith("Error"):
        return dataset_ref

    tmp_png = None
    try:
        # Timestamp (with microseconds) keeps repo filenames collision-free.
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        target_path = f"images/{filename_base}_{stamp}.png"

        # upload_file wants a real file path, so stage the image on disk.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png", dir=app.config['UPLOAD_FOLDER']) as handle:
            tmp_png = handle.name
        image_pil.save(tmp_png, format="PNG")

        logger.info(f"Attempting to upload {tmp_png} to {dataset_ref}/{target_path}")
        file_url = hf_api.upload_file(
            path_or_fileobj=tmp_png,
            path_in_repo=target_path,
            repo_id=dataset_ref,
            repo_type="dataset",
            token=HF_TOKEN,
        )
        logger.info(f"Successfully uploaded image: {file_url}")
        return file_url
    except Exception as e:
        logger.error(f"Image upload error for {filename_base}: {str(e)}", exc_info=True)
        return f"Error uploading image {filename_base}: {str(e)}"
    finally:
        if tmp_png and os.path.exists(tmp_png):
            try:
                os.remove(tmp_png)
            except OSError as ose:
                logger.error(f"Error removing temp image file {tmp_png}: {ose}")
 
107
 
108
def extract_text_from_pdf(pdf_input_source):
    """Extract page text and tables from a PDF given a URL or local path.

    pdf_input_source: an http(s) URL string, or a path to an existing file.
    Returns the extracted text (tables rendered as pipe-delimited Markdown),
    or an "Error ..." string on any failure.
    """
    try:
        if isinstance(pdf_input_source, str) and pdf_input_source.startswith(('http://', 'https://')):
            logger.info(f"Fetching PDF from URL for text extraction: {pdf_input_source}")
            response = requests.get(pdf_input_source, stream=True, timeout=30)
            response.raise_for_status()
            source = io.BytesIO(response.content)
            logger.info("PDF downloaded successfully from URL.")
        elif isinstance(pdf_input_source, str) and os.path.exists(pdf_input_source):
            logger.info(f"Processing local PDF file for text extraction: {pdf_input_source}")
            source = pdf_input_source  # pdfplumber.open accepts a path directly
        else:
            logger.error(f"Invalid pdf_input_source for text extraction: {pdf_input_source}")
            return "Error: Invalid input for PDF text extraction (must be URL or valid file path)."

        def row_as_markdown(row):
            # One table row as a pipe-separated Markdown line; None cells -> "".
            return " | ".join("" if cell is None else str(cell) for cell in row)

        chunks = []
        with pdfplumber.open(source) as pdf:
            for page in pdf.pages:
                chunks.append((page.extract_text(layout=True, x_density=1, y_density=1) or "") + "\n\n")
                for table_data in (page.extract_tables() or []):
                    if not table_data:
                        continue
                    # First row is treated as the header row.
                    md_lines = [row_as_markdown(table_data[0]),
                                " | ".join(["---"] * len(table_data[0]))]
                    md_lines.extend(row_as_markdown(row) for row in table_data[1:])
                    chunks.append("**Table:**\n" + "\n".join(md_lines) + "\n\n")
        logger.info("Text and table extraction successful.")
        return "".join(chunks).strip()
    except requests.RequestException as e:
        logger.error(f"URL fetch error for text extraction: {str(e)}", exc_info=True)
        return f"Error fetching PDF from URL: {str(e)}"
    except Exception as e:
        logger.error(f"Text extraction error: {str(e)}", exc_info=True)
        return f"Error extracting text: {str(e)}"
147
 
148
def extract_images_from_pdf(pdf_input_source):
    """Render each PDF page to a PIL image via pdf2image (Poppler-backed).

    pdf_input_source: an http(s) URL string, or a path to an existing file.
    Returns a list of PIL images, or an "Error ..." string on failure.
    """
    if not check_poppler():
        return "Error: poppler-utils not found or not working correctly. Image extraction depends on it."

    try:
        is_url = isinstance(pdf_input_source, str) and pdf_input_source.startswith(('http://', 'https://'))
        if is_url:
            logger.info(f"Fetching PDF from URL for image extraction: {pdf_input_source}")
            response = requests.get(pdf_input_source, stream=True, timeout=30)
            response.raise_for_status()
            logger.info("PDF downloaded successfully from URL, converting to images.")
            pages = convert_from_bytes(response.content, dpi=200)
        elif isinstance(pdf_input_source, str) and os.path.exists(pdf_input_source):
            logger.info(f"Processing local PDF file for image extraction: {pdf_input_source}")
            pages = convert_from_path(pdf_input_source, dpi=200)
        else:
            logger.error(f"Invalid pdf_input_source for image extraction: {pdf_input_source}")
            return "Error: Invalid input for PDF image extraction (must be URL or valid file path)."

        logger.info(f"Successfully extracted {len(pages)} image(s) from PDF.")
        return pages
    except requests.RequestException as e:
        logger.error(f"URL fetch error for image extraction: {str(e)}", exc_info=True)
        return f"Error fetching PDF from URL for image extraction: {str(e)}"
    except Exception as e:
        logger.error(f"Image extraction error: {str(e)}", exc_info=True)
        return f"Error extracting images: {str(e)}"
175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
 
177
def format_to_markdown(text_content, images_input):
    """Assemble the final Markdown document from extraction results.

    text_content: extracted text, or an "Error ..." string from extraction.
    images_input: a list of PIL images, or an "Error ..." string.
    Returns the Markdown document, stripped of surrounding whitespace.
    """
    md = "# Extracted PDF Content\n\n"

    if text_content.startswith("Error"):
        # Text extraction failed upstream; surface the error inline.
        md += f"**Text Extraction Note:**\n{text_content}\n\n"
    else:
        normalized = re.sub(r'\n\s*\n+', '\n\n', text_content.strip())
        in_list = False
        for raw_line in normalized.split('\n'):
            stripped = raw_line.strip()
            if not stripped:
                md += "\n"
                in_list = False
                continue
            bullet = re.match(r'^\s*(?:(?:\d+\.)|[*+-])\s+(.*)', stripped)
            # Heuristic: short-ish ALL-CAPS lines are treated as headings.
            looks_like_heading = stripped.isupper() and 5 < len(stripped) < 100
            if looks_like_heading and not bullet:
                md += f"## {stripped}\n\n"
                in_list = False
            elif bullet:
                md += f"- {bullet.group(1)}\n"
                in_list = True
            else:
                if in_list:
                    md += "\n"  # blank line closes a list before a paragraph
                md += f"{raw_line}\n\n"
                in_list = False
        # Collapse any runs of blank lines introduced above.
        md = re.sub(r'\n\s*\n+', '\n\n', md.strip()) + "\n\n"

    if isinstance(images_input, list) and images_input:
        md += "## Extracted Images\n\n"
        if not HF_TOKEN:
            md += "**Note:** `HF_TOKEN` not set. Images were extracted but not uploaded to Hugging Face Hub.\n\n"
        for idx, img in enumerate(images_input, start=1):
            ocr_text = ""
            try:
                ocr_text = pytesseract.image_to_string(img).strip()
            except Exception as ocr_e:
                logger.error(f"OCR error for image {idx}: {str(ocr_e)}")
                ocr_text = f"OCR failed: {str(ocr_e)}"

            if HF_TOKEN:
                uploaded = upload_image_to_hf(img, f"extracted_image_{idx}")
                if isinstance(uploaded, str) and not uploaded.startswith("Error"):
                    md += f"![Image {idx}]({uploaded})\n"
                else:
                    md += f"**Image {idx} (Upload Error):** {str(uploaded)}\n\n"
            else:
                md += f"**Image {idx} (not uploaded due to missing HF_TOKEN)**\n"

            if ocr_text:
                md += f"**Image {idx} OCR Text:**\n```\n{ocr_text}\n```\n\n"
    elif isinstance(images_input, str) and images_input.startswith("Error"):
        md += f"## Image Extraction Note\n\n{images_input}\n\n"

    return md.strip()
237
 
238
+ # --- Flask Routes ---
239
 
240
@app.route('/', methods=['GET'])
def index():
    """Serve the upload/URL form page."""
    template_name = 'index.html'
    return render_template(template_name)
243
 
244
@app.route('/process', methods=['POST'])
def process_pdf_route():
    """Handle the form POST: accept an uploaded PDF or a PDF URL, run text and
    image extraction, and render the results page.

    Renders index.html with:
      markdown_output - the assembled Markdown (None on failure)
      status_message  - human-readable progress/result summary
      error_message   - validation or unexpected error text (None on success)
    """
    pdf_file = request.files.get('pdf_file')
    pdf_url = request.form.get('pdf_url', '').strip()

    status_message = "Starting PDF processing..."
    error_message = None
    markdown_output = None
    temp_pdf_path = None
    pdf_input_source = None  # Will be a URL string or a local file path.

    try:
        if pdf_file and pdf_file.filename:
            if not pdf_file.filename.lower().endswith('.pdf'):
                raise ValueError("Uploaded file is not a PDF.")

            filename = secure_filename(pdf_file.filename)
            # Persist the upload to a unique temp file so both extractors can
            # reopen it by path.
            fd, temp_pdf_path = tempfile.mkstemp(suffix=".pdf", prefix="upload_", dir=app.config['UPLOAD_FOLDER'])
            os.close(fd)  # mkstemp hands back an open fd we don't need
            pdf_file.save(temp_pdf_path)
            logger.info(f"Uploaded PDF saved to temporary path: {temp_pdf_path}")
            pdf_input_source = temp_pdf_path
            # BUGFIX: previously an f-string with no placeholder reported
            # "(unknown)"; report the sanitized upload name instead.
            status_message = f"Processing uploaded PDF: {filename}"

        elif pdf_url:
            pdf_url = urllib.parse.unquote(pdf_url)
            if not (pdf_url.startswith('http://') or pdf_url.startswith('https://')):
                raise ValueError("Invalid URL scheme. Must be http or https.")
            if not pdf_url.lower().endswith('.pdf'):
                # Not fatal: content-type check / extractors decide for real.
                logger.warning(f"URL {pdf_url} does not end with .pdf. Proceeding with caution.")

            # Best-effort HEAD request to sanity-check the URL. Failures are
            # logged but not fatal — the GET inside the extractors decides.
            try:
                head_resp = requests.head(pdf_url, allow_redirects=True, timeout=10)
                head_resp.raise_for_status()
                content_type = head_resp.headers.get('content-type', '').lower()
                if 'application/pdf' not in content_type:
                    logger.warning(f"URL {pdf_url} content-type is '{content_type}', not 'application/pdf'.")
            except requests.RequestException as head_err:
                # BUGFIX: the exception variable was named `re`, shadowing the
                # stdlib regex module within this function's scope.
                logger.error(f"Failed HEAD request for URL {pdf_url}: {head_err}")

            pdf_input_source = pdf_url
            status_message = f"Processing PDF from URL: {pdf_url}"
        else:
            raise ValueError("No PDF file uploaded and no PDF URL provided.")

        # --- Core Processing ---
        status_message += "\nExtracting text..."
        logger.info(status_message)
        extracted_text = extract_text_from_pdf(pdf_input_source)
        if isinstance(extracted_text, str) and extracted_text.startswith("Error"):
            # format_to_markdown embeds the error text in its output.
            logger.error(f"Text extraction resulted in error: {extracted_text}")

        status_message += "\nExtracting images..."
        logger.info(status_message)
        extracted_images = extract_images_from_pdf(pdf_input_source)  # list of PIL images or error string
        if isinstance(extracted_images, str) and extracted_images.startswith("Error"):
            logger.error(f"Image extraction resulted in error: {extracted_images}")

        status_message += "\nFormatting to Markdown..."
        logger.info(status_message)
        markdown_output = format_to_markdown(extracted_text, extracted_images)

        status_message = "Processing complete."
        if isinstance(extracted_text, str) and extracted_text.startswith("Error"):
            status_message += f" (Text extraction issues: {extracted_text.split(':', 1)[1].strip()})"
        if isinstance(extracted_images, str) and extracted_images.startswith("Error"):
            status_message += f" (Image extraction issues: {extracted_images.split(':', 1)[1].strip()})"
        if not HF_TOKEN and isinstance(extracted_images, list) and extracted_images:
            status_message += " (Note: HF_TOKEN not set, images not uploaded to Hub)"

    except ValueError as ve:
        logger.error(f"Input validation error: {str(ve)}")
        error_message = str(ve)
        status_message = "Processing failed."
    except Exception as e:
        logger.error(f"An unexpected error occurred during processing: {str(e)}", exc_info=True)
        error_message = f"An unexpected error occurred: {str(e)}"
        status_message = "Processing failed due to an unexpected error."
    finally:
        # Always remove the uploaded temp file, success or failure.
        if temp_pdf_path and os.path.exists(temp_pdf_path):
            try:
                os.remove(temp_pdf_path)
                logger.info(f"Removed temporary PDF: {temp_pdf_path}")
            except OSError as ose:
                logger.error(f"Error removing temporary PDF {temp_pdf_path}: {ose}")

    return render_template('index.html',
                           markdown_output=markdown_output,
                           status_message=status_message,
                           error_message=error_message)
342
+
343
+
344
+ # --- Main Execution ---
345
if __name__ == '__main__':
    # Local development entry point. On Hugging Face Spaces the app is served
    # by Gunicorn via the Dockerfile CMD, not by this block.
    if not check_poppler():
        logger.warning("Poppler utilities might not be installed correctly. PDF processing might fail.")

    # Make sure the temp/upload directory exists before accepting requests.
    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)

    listen_port = int(os.getenv("PORT", 7860))
    app.run(host='0.0.0.0', port=listen_port, debug=True)