broadfield-dev committed on
Commit
a87a8f6
·
verified ·
1 Parent(s): 7aec7b3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +402 -117
app.py CHANGED
@@ -1,128 +1,413 @@
1
- from flask import Flask, request, render_template_string, send_file
2
- import markdown
3
- import imgkit
4
  import os
5
- import traceback
6
- from io import BytesIO
 
 
 
 
 
 
 
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  app = Flask(__name__)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
- # Use a directory within the app's working directory to avoid permission issues
11
- TEMP_DIR = os.path.join(os.getcwd(), "temp")
12
 
13
- # Create temporary directory if it doesn't exist
14
- try:
15
- os.makedirs(TEMP_DIR, exist_ok=True)
16
- except Exception as e:
17
- print(f"Error creating temp directory: {e}")
18
 
19
- @app.route("/", methods=["GET", "POST"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def index():
21
- preview_html = None
22
- download_available = False
23
- download_type = "png"
24
- error_message = None
25
- markdown_text = request.form.get("markdown_text", "") if request.method == "POST" else ""
 
 
 
 
 
 
 
26
 
27
- if request.method == "POST" and markdown_text:
28
  try:
29
- # Convert Markdown to HTML
30
- html_content = markdown.markdown(markdown_text, extensions=['fenced_code', 'tables'])
31
-
32
- # Prepare HTML with basic styling
33
- full_html = f"""
34
- <!DOCTYPE html>
35
- <html>
36
- <head>
37
- <style>
38
- body {{ font-family: Arial, sans-serif; padding: 20px; }}
39
- pre, code {{ background: #f4f4f4; padding: 10px; border-radius: 5px; }}
40
- table {{ border-collapse: collapse; width: 100%; }}
41
- th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
42
- th {{ background-color: #f2f2f2; }}
43
- </style>
44
- </head>
45
- <body>
46
- {html_content}
47
- </body>
48
- </html>
49
- """
50
-
51
- # Save HTML to a temporary file
52
- html_path = os.path.join(TEMP_DIR, "output.html")
53
- with open(html_path, "w", encoding="utf-8") as f:
54
- f.write(full_html)
55
-
56
- # Generate preview HTML
57
- preview_html = full_html
58
- download_available = True
59
- download_type = request.form.get("download_type", "png")
60
-
61
- if "download" in request.form:
62
- if download_type == "html":
63
- return send_file(
64
- html_path,
65
- as_attachment=True,
66
- download_name="output.html",
67
- mimetype="text/html"
68
- )
69
- else: # PNG
70
- # Convert HTML to PNG using imgkit
71
- png_path = os.path.join(TEMP_DIR, "output.png")
72
- imgkit.from_string(full_html, png_path, options={"quiet": ""})
73
- return send_file(
74
- png_path,
75
- as_attachment=True,
76
- download_name="output.png",
77
- mimetype="image/png"
78
- )
79
 
80
  except Exception as e:
81
- error_message = f"Error processing request: {str(e)}"
82
- print(f"Error: {traceback.format_exc()}")
83
-
84
- return render_template_string("""
85
- <!DOCTYPE html>
86
- <html>
87
- <head>
88
- <title>Markdown to PNG/HTML Converter</title>
89
- <style>
90
- body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }
91
- textarea { width: 100%; height: 300px; margin-bottom: 10px; }
92
- select, button { padding: 10px; margin: 5px; }
93
- .preview { border: 1px solid #ddd; padding: 15px; margin-top: 20px; }
94
- .download-btn { background-color: #4CAF50; color: white; border: none; cursor: pointer; }
95
- .download-btn:hover { background-color: #45a049; }
96
- .error { color: red; margin-top: 10px; }
97
- </style>
98
- </head>
99
- <body>
100
- <h1>Markdown to PNG/HTML Converter</h1>
101
- <form method="post">
102
- <textarea name="markdown_text" placeholder="Paste your Markdown here...">{{ markdown_text }}</textarea><br>
103
- <label for="download_type">Output format:</label>
104
- <select name="download_type">
105
- <option value="png" {% if download_type == 'png' %}selected{% endif %}>PNG</option>
106
- <option value="html" {% if download_type == 'html' %}selected{% endif %}>HTML</option>
107
- </select><br>
108
- <button type="submit">Generate Preview</button>
109
- {% if download_available %}
110
- <button type="submit" name="download" value="true" class="download-btn">Download {{ download_type.upper() }}</button>
111
- {% endif %}
112
- </form>
113
- {% if error_message %}
114
- <p class="error">{{ error_message }}</p>
115
- {% endif %}
116
- {% if preview_html %}
117
- <h2>Preview</h2>
118
- <div class="preview">
119
- {{ preview_html | safe }}
120
- </div>
121
- {% endif %}
122
- </body>
123
- </html>
124
- """, preview_html=preview_html, download_available=download_available,
125
- download_type=download_type, error_message=error_message, markdown_text=markdown_text)
126
-
127
- if __name__ == "__main__":
128
- app.run(host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))
 
 
 
 
1
  import os
2
+ import io
3
+ import re
4
+ import logging
5
+ import subprocess
6
+ from datetime import datetime
7
+ import urllib.parse
8
+ import tempfile
9
+ import json # For streaming JSON messages
10
+ import time # For gevent.sleep
11
 
12
+ from flask import Flask, request, render_template, Response, stream_with_context
13
+ from werkzeug.utils import secure_filename
14
+
15
+ # Ensure gevent is imported and monkey patched if needed for other libraries
16
+ # from gevent import monkey
17
+ # monkey.patch_all() # Apply this early if you suspect issues with other libs
18
+
19
+ import requests # For requests.exceptions.HTTPError
20
+ from requests.exceptions import HTTPError as RequestsHTTPError # Specific import for clarity
21
+
22
+ import pdfplumber
23
+ import pdf2image # <<<<<<<<<<<<<<<< CORRECTED: Added this import
24
+ from pdf2image import convert_from_path, convert_from_bytes # Keep these for direct use too
25
+ # from pdf2image.exceptions import ... # If you need to catch specific pdf2image errors
26
+
27
+ import pytesseract
28
+ from PIL import Image
29
+ from huggingface_hub import HfApi, create_repo
30
+
31
# --- Flask App Initialization ---
# Uploads are staged in the system temp directory; request bodies are capped
# at 50 MB (adjust as needed).
app = Flask(__name__)
app.config.update(
    UPLOAD_FOLDER=tempfile.gettempdir(),
    MAX_CONTENT_LENGTH=50 * 1024 * 1024,
)

# --- Logging Configuration ---
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# --- Hugging Face Configuration ---
# Both values come from the environment; the dataset name has a default,
# the token does not (HF_TOKEN is None when the variable is unset).
HF_TOKEN = os.environ.get("HF_TOKEN")
HF_DATASET_REPO_NAME = os.environ.get("HF_DATASET_REPO_NAME", "pdf-images-extracted")
hf_api = HfApi()
44
+
45
+ # --- Helper to yield messages for streaming ---
46
def yield_message(type, data):
    """Serialize one stream event as a single newline-terminated JSON line.

    The event object is ``data`` with a leading ``"type"`` entry. If ``data``
    itself carries a ``"type"`` key, that value replaces the ``type``
    argument (the key keeps its leading position).
    """
    event = {"type": type}
    event.update(data)
    return f"{json.dumps(event)}\n"
49
+
50
+ # --- PDF Processing Helper Functions (Adapted for Streaming) ---
51
+
52
def check_poppler():
    """Report whether the Poppler ``pdftoppm`` executable can be launched.

    Runs ``pdftoppm -v`` and logs the first line of whatever it printed
    (stderr is preferred, stdout is the fallback). The exit status is not
    inspected: being able to start the process at all counts as success.

    Returns:
        True if the command could be started, False if the executable was
        not found or any other error occurred while running it.
    """
    try:
        result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True, check=False)
        version_info_log = (result.stderr or result.stdout).strip()
        if version_info_log:
            # Non-empty here, so the first line always exists; the former
            # 'No version output' fallback could never be selected.
            logger.info(f"Poppler version check: {version_info_log.splitlines()[0]}")
        else:
            logger.info("Poppler 'pdftoppm -v' ran. Assuming Poppler is present.")
        return True
    except FileNotFoundError:
        logger.error("Poppler (pdftoppm command) not found. Ensure poppler-utils is installed and in PATH.")
        return False
    except Exception as e:
        logger.error(f"An unexpected error occurred during Poppler check: {str(e)}")
        return False
67
+
68
def ensure_hf_dataset():
    """Make sure the target Hugging Face dataset repo exists and return its id.

    Returns:
        The repo id string on success. On failure, a human-readable string
        that starts with ``"Error"``; callers detect failure by that prefix.
        When the hub answers HTTP 409, the id is rebuilt from the ``name``
        field of ``whoami``; if that cannot be determined, the bare
        ``HF_DATASET_REPO_NAME`` is returned instead.
    """
    if not HF_TOKEN:
        msg = "HF_TOKEN is not set. Cannot ensure Hugging Face dataset. Image uploads will fail."
        logger.warning(msg)
        return "Error: " + msg

    try:
        repo_url = create_repo(
            repo_id=HF_DATASET_REPO_NAME,
            token=HF_TOKEN,
            repo_type="dataset",
            exist_ok=True,
        )
        logger.info(f"Dataset repo ensured: {repo_url.repo_id}")
        return repo_url.repo_id
    except RequestsHTTPError as e:
        http_response = e.response
        if http_response is None or http_response.status_code != 409:
            status_code = http_response.status_code if http_response is not None else "Unknown"
            logger.error(f"Hugging Face dataset HTTP error (Status: {status_code}): {str(e)}")
            return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}' due to HTTP error: {str(e)}"

        # HTTP 409: the repo is already there; work out its namespaced id.
        logger.info(f"Dataset repo '{HF_DATASET_REPO_NAME}' already exists (HTTP 409).")
        try:
            user_info = hf_api.whoami(token=HF_TOKEN)
            namespace = user_info.get('name') if user_info else None
            if namespace:
                return f"{namespace}/{HF_DATASET_REPO_NAME}"
            logger.warning(f"Could not determine namespace for existing repo '{HF_DATASET_REPO_NAME}'. Using generic ID.")
            return HF_DATASET_REPO_NAME
        except Exception as whoami_e:
            logger.error(f"Could not determine namespace for existing repo via whoami due to: {whoami_e}. Using generic ID.")
            return HF_DATASET_REPO_NAME
    except Exception as e:
        logger.error(f"Hugging Face dataset general error: {str(e)}", exc_info=True)
        return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}': {str(e)}"
98
+
99
+
100
def upload_image_to_hf_stream(image_pil, filename_base, page_num_for_log=""):
    """Upload one PIL image to the Hugging Face dataset repo as a PNG.

    The image is written to a temporary ``.png`` file under
    ``app.config['UPLOAD_FOLDER']``, uploaded to
    ``images/<filename_base>_<page_num_for_log>_<timestamp>.png`` in the
    dataset repo, and the temporary file is removed afterwards.

    Args:
        image_pil: Image object exposing ``save(fp, format=...)``.
        filename_base: Prefix for the file name inside the repo.
        page_num_for_log: Label placed in the repo file name and log messages.

    Returns:
        Whatever ``hf_api.upload_file`` returns on success. On failure, a
        string starting with ``"Error"`` (callers test for that prefix);
        exceptions are logged and converted, never propagated.
    """
    # NOTE(review): ensure_hf_dataset() is called once per image, which looks
    # like one hub round trip per upload - consider resolving the repo once.
    repo_id_or_error = ensure_hf_dataset()
    if isinstance(repo_id_or_error, str) and repo_id_or_error.startswith("Error"):
        return repo_id_or_error

    repo_id = repo_id_or_error
    temp_image_path = None
    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        repo_filename = f"images/{filename_base}_{page_num_for_log}_{timestamp}.png"

        # Ensure UPLOAD_FOLDER exists before writing temp file
        os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png", dir=app.config['UPLOAD_FOLDER']) as tmp_file:
            temp_image_path = tmp_file.name
            # Write through the already-open handle: re-opening the path of
            # a still-open NamedTemporaryFile is not possible on Windows.
            image_pil.save(tmp_file, format="PNG")

        logger.info(f"Attempting to upload {temp_image_path} to {repo_id}/{repo_filename}")
        file_url = hf_api.upload_file(
            path_or_fileobj=temp_image_path, path_in_repo=repo_filename,
            repo_id=repo_id, repo_type="dataset", token=HF_TOKEN
        )
        logger.info(f"Successfully uploaded image: {file_url}")
        return file_url
    except Exception as e:
        logger.error(f"Image upload error for {filename_base}{page_num_for_log}: {str(e)}", exc_info=True)
        return f"Error uploading image {filename_base}{page_num_for_log}: {str(e)}"
    finally:
        # delete=False above means this function owns the file's removal.
        if temp_image_path and os.path.exists(temp_image_path):
            try:
                os.remove(temp_image_path)
            except OSError as ose:
                logger.error(f"Error removing temp image file {temp_image_path}: {ose}")
131
+
132
+
133
def format_page_text_to_markdown_chunk(page_text_content):
    """Turn the raw text of one page into a Markdown fragment.

    Line-by-line rules, applied after collapsing runs of blank lines:

    * a line starting with ``N.``, ``*``, ``+`` or ``-`` followed by
      whitespace becomes a ``- `` bullet (numbering is not kept);
    * otherwise, an all-uppercase line of 6 to 99 characters becomes a
      ``## `` heading;
    * any other non-blank line becomes its own paragraph, emitted with its
      original (unstripped) text.

    The result always ends with exactly one blank line (``"\\n\\n"``).
    """
    blank_run = re.compile(r'\n\s*\n+')
    list_marker = re.compile(r'^\s*(?:(?:\d+\.)|[*+-])\s+(.*)')

    normalized = blank_run.sub('\n\n', page_text_content.strip())
    pieces = []
    inside_list = False
    for raw_line in normalized.split('\n'):
        text = raw_line.strip()
        if not text:
            pieces.append("\n")
            inside_list = False
            continue

        item = list_marker.match(text)
        if item is not None:
            # List syntax wins over the heading heuristic.
            pieces.append(f"- {item.group(1)}\n")
            inside_list = True
            continue

        if text.isupper() and 5 < len(text) < 100:
            pieces.append(f"## {text}\n\n")
        else:
            if inside_list:
                # Close the preceding list with a blank line.
                pieces.append("\n")
            pieces.append(f"{raw_line}\n\n")
        inside_list = False

    return blank_run.sub('\n\n', "".join(pieces).strip()) + "\n\n"
158
+
159
+
160
+ # --- Main PDF Processing Logic (Generator Function for Streaming) ---
161
+
162
def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
    """Convert one PDF to Markdown, yielding progress as NDJSON event strings.

    Stages, in order:

    1. If the source is an ``http(s)://`` string, download it fully into
       memory; otherwise treat the argument as a local path.
    2. Extract text page by page with pdfplumber and emit one
       ``markdown_chunk`` event per page.
    3. If Poppler is available, rasterize each page at 150 dpi, OCR it with
       pytesseract, upload it when ``HF_TOKEN`` is set, and emit one
       ``image_md`` event per page image.

    A failure in text extraction is reported as an ``error`` event and does
    not stop the image stage. A download failure ends the stream.

    Args:
        pdf_input_source_path_or_url: Local file path or http/https URL.

    Yields:
        Strings produced by ``yield_message`` with event types
        ``markdown_replace``, ``status``, ``markdown_chunk``, ``image_md``,
        ``error`` and ``final_status``.
    """
    try:
        yield yield_message("markdown_replace", {"content": "# Extracted PDF Content\n\n"})
        time.sleep(0.01)

        yield yield_message("status", {"message": "Opening PDF for text extraction..."})
        time.sleep(0.01)

        source_is_url = isinstance(pdf_input_source_path_or_url, str) and \
            pdf_input_source_path_or_url.startswith(('http://', 'https://'))

        pdf_handle_for_text = None
        pdf_bytes_for_images = None

        if source_is_url:
            # NOTE(review): the URL is caller-supplied and fetched server-side
            # with no host allow-list or size cap - confirm this is acceptable.
            try:
                response = requests.get(pdf_input_source_path_or_url, stream=False, timeout=60)
                response.raise_for_status()
                pdf_bytes_for_images = response.content
                pdf_handle_for_text = io.BytesIO(pdf_bytes_for_images)
                yield yield_message("status", {"message": f"PDF downloaded from URL ({len(pdf_bytes_for_images)/1024:.2f} KB)."})
                time.sleep(0.01)
            except RequestsHTTPError as e:
                # A requests.Response is falsy for 4xx/5xx statuses, so the
                # presence check must be "is not None", not truthiness.
                failed_response = e.response
                status_code = failed_response.status_code if failed_response is not None else 'N/A'
                reason = failed_response.reason if failed_response is not None else str(e)
                logger.error(f"URL fetch HTTP error for PDF processing: {str(e)} (Status: {status_code})", exc_info=True)
                yield yield_message("error", {"message": f"Error fetching PDF from URL (HTTP {status_code}): {reason}"})
                return
            except requests.RequestException as e:
                logger.error(f"URL fetch network error for PDF processing: {str(e)}", exc_info=True)
                yield yield_message("error", {"message": f"Network error fetching PDF from URL: {str(e)}"})
                return
        else:
            pdf_handle_for_text = pdf_input_source_path_or_url

        # ---- Stage: text extraction ----
        total_text_pages = 0
        try:
            with pdfplumber.open(pdf_handle_for_text) as pdf:
                total_text_pages = len(pdf.pages)
                yield yield_message("status", {"message": f"Found {total_text_pages} page(s) for text extraction."})
                time.sleep(0.01)

                for i, page in enumerate(pdf.pages):
                    yield yield_message("status", {"message": f"Extracting text from page {i+1}/{total_text_pages}..."})
                    time.sleep(0.01)

                    page_text = page.extract_text(layout=True, x_density=1, y_density=1) or ""
                    formatted_page_text_md = format_page_text_to_markdown_chunk(page_text)

                    yield yield_message("markdown_chunk", {"content": formatted_page_text_md})
                    time.sleep(0.01)
        except Exception as e:
            # Reported but not fatal: image extraction below still runs.
            logger.error(f"Error during PDF text extraction: {str(e)}", exc_info=True)
            yield yield_message("error", {"message": f"Error during text extraction: {str(e)}"})

        # ---- Stage: page images, OCR and upload ----
        if not check_poppler():
            yield yield_message("error", {"message": "Poppler (for image extraction) not found or not working."})
        else:
            yield yield_message("status", {"message": "Starting image extraction..."})
            yield yield_message("markdown_chunk", {"content": "## Extracted Images\n\n"})
            if not HF_TOKEN:
                yield yield_message("markdown_chunk", {"content": "**Note:** `HF_TOKEN` not set. Images will be described but not uploaded.\n\n"})

            time.sleep(0.01)
            extracted_pil_images_overall_count = 0  # Running image number across both code paths
            try:
                image_source_for_convert = None
                if source_is_url and pdf_bytes_for_images:
                    image_source_for_convert = pdf_bytes_for_images
                    logger.info("Using downloaded bytes for image conversion.")
                elif not source_is_url:
                    image_source_for_convert = pdf_input_source_path_or_url
                    logger.info("Using local file path for image conversion.")

                if image_source_for_convert:
                    try:
                        pdf_info = None
                        if isinstance(image_source_for_convert, bytes):
                            pdf_info = pdf2image.pdfinfo_from_bytes(image_source_for_convert, userpw=None, poppler_path=None)
                        else:
                            pdf_info = pdf2image.pdfinfo_from_path(image_source_for_convert, userpw=None, poppler_path=None)

                        num_image_pages = pdf_info.get("Pages", 0)
                        yield yield_message("status", {"message": f"PDF has {num_image_pages} page(s) for potential image extraction."})

                        # One page per conversion call keeps a single page
                        # image in memory at a time.
                        batch_size = 1
                        for page_idx_start in range(1, num_image_pages + 1, batch_size):
                            page_idx_end = min(page_idx_start + batch_size - 1, num_image_pages)
                            yield yield_message("status", {"message": f"Extracting images from PDF page(s) {page_idx_start}-{page_idx_end}..."})
                            time.sleep(0.01)

                            page_images_pil = []
                            if isinstance(image_source_for_convert, bytes):
                                page_images_pil = convert_from_bytes(image_source_for_convert, dpi=150, first_page=page_idx_start, last_page=page_idx_end)
                            else:
                                page_images_pil = convert_from_path(image_source_for_convert, dpi=150, first_page=page_idx_start, last_page=page_idx_end)

                            for img_idx_in_batch, img_pil in enumerate(page_images_pil):
                                extracted_pil_images_overall_count += 1
                                current_pdf_page_num = page_idx_start + img_idx_in_batch  # Actual PDF page number
                                page_num_for_log = f"pdfpage_{current_pdf_page_num}"

                                yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (from PDF page {current_pdf_page_num}) (OCR & Upload)..."})
                                time.sleep(0.01)

                                ocr_text = ""
                                try:
                                    ocr_text = pytesseract.image_to_string(img_pil).strip()
                                    if ocr_text:
                                        yield yield_message("status", {"message": f" OCR successful for image {extracted_pil_images_overall_count}."})
                                except Exception as ocr_e:
                                    logger.error(f"OCR error for image {extracted_pil_images_overall_count}: {str(ocr_e)}")
                                    ocr_text = f"OCR failed: {str(ocr_e)}"

                                image_md_chunk = ""
                                if HF_TOKEN:
                                    image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_image", page_num_for_log)
                                    if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
                                        image_md_chunk += f"![Image {extracted_pil_images_overall_count}]({image_url_or_error})\n"
                                        yield yield_message("status", {"message": f" Image {extracted_pil_images_overall_count} uploaded."})
                                    else:
                                        image_md_chunk += f"**Image {extracted_pil_images_overall_count} (Upload Error):** {str(image_url_or_error)}\n\n"
                                        yield yield_message("error", {"message": f"Failed to upload image {extracted_pil_images_overall_count}: {str(image_url_or_error)}"})
                                else:
                                    image_md_chunk += f"**Image {extracted_pil_images_overall_count} (not uploaded due to missing HF_TOKEN)**\n"

                                if ocr_text:
                                    image_md_chunk += f"**Image {extracted_pil_images_overall_count} OCR Text:**\n```\n{ocr_text}\n```\n\n"

                                yield yield_message("image_md", {"content": image_md_chunk})
                                time.sleep(0.01)
                    except Exception as e_img_info:
                        logger.error(f"Could not get PDF info for image batching or during batched conversion: {e_img_info}", exc_info=True)
                        yield yield_message("error", {"message": f"Error preparing for image extraction: {e_img_info}. Trying bulk."})

                        # Fallback to bulk conversion. Resume after the pages
                        # already emitted so a mid-run failure does not
                        # re-process and re-upload them (page 1 when nothing
                        # was processed yet, i.e. the whole document).
                        resume_page = extracted_pil_images_overall_count + 1
                        bulk_images_pil = []
                        if isinstance(image_source_for_convert, bytes):
                            bulk_images_pil = convert_from_bytes(image_source_for_convert, dpi=150, first_page=resume_page)
                        else:
                            bulk_images_pil = convert_from_path(image_source_for_convert, dpi=150, first_page=resume_page)

                        yield yield_message("status", {"message": f"Fallback: Extracted {len(bulk_images_pil)} images in bulk."})
                        for bulk_idx, img_pil in enumerate(bulk_images_pil, start=resume_page):
                            extracted_pil_images_overall_count += 1
                            page_num_for_log = f"bulk_image_{bulk_idx}"  # Less precise page info in fallback
                            yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (bulk) (OCR & Upload)..."})
                            ocr_text = ""
                            try:
                                ocr_text = pytesseract.image_to_string(img_pil).strip()
                            except Exception as ocr_e:
                                ocr_text = f"OCR Error: {ocr_e}"

                            image_md_chunk = f"![Image {extracted_pil_images_overall_count} (Fallback)]\n"
                            if HF_TOKEN:
                                image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_image_fallback", page_num_for_log)
                                if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
                                    image_md_chunk = f"![Image {extracted_pil_images_overall_count} (Fallback)]({image_url_or_error})\n"
                                else:
                                    image_md_chunk += f"**Upload Error:** {str(image_url_or_error)}\n"
                            if ocr_text:
                                image_md_chunk += f"**OCR Text:**\n```\n{ocr_text}\n```\n\n"
                            else:
                                image_md_chunk += "\n"
                            yield yield_message("image_md", {"content": image_md_chunk})
                            time.sleep(0.01)

                else:
                    yield yield_message("status", {"message": "No valid source for image extraction."})

            except Exception as e:
                logger.error(f"Error during image extraction/processing: {str(e)}", exc_info=True)
                yield yield_message("error", {"message": f"Error during image extraction: {str(e)}"})

        yield yield_message("final_status", {"message": "All processing stages complete."})

    except Exception as e:
        logger.error(f"Unhandled error in PDF conversion stream: {str(e)}", exc_info=True)
        yield yield_message("error", {"message": f"Critical processing error: {str(e)}"})
342
+
343
+
344
+ # --- Flask Routes ---
345
+
346
@app.route('/', methods=['GET'])
def index():
    """Serve the landing page rendered from the ``index.html`` template."""
    landing_page = render_template('index.html')
    return landing_page
349
+
350
@app.route('/process-stream', methods=['POST'])
def process_pdf_stream():
    """Stream the conversion of an uploaded or remote PDF as NDJSON.

    Reads the ``pdf_file`` upload and the ``pdf_url`` form field; the upload
    takes precedence when both are present. Validation failures are reported
    as ``error`` events inside the stream rather than as HTTP error statuses.

    Returns:
        A streaming ``Response`` with mimetype ``application/x-ndjson`` whose
        lines are the events produced by ``yield_message``.
    """
    pdf_file = request.files.get('pdf_file')
    pdf_url = request.form.get('pdf_url', '').strip()

    def stream_processor():
        """Validate the input, run the conversion, and clean up the upload."""
        temp_pdf_path = None
        pdf_input_source_for_generator = None

        try:
            if pdf_file and pdf_file.filename:
                if not pdf_file.filename.lower().endswith('.pdf'):
                    yield yield_message("error", {"message": "Uploaded file is not a PDF."})
                    return

                filename = secure_filename(pdf_file.filename)
                os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
                fd, temp_path = tempfile.mkstemp(suffix=".pdf", prefix="upload_", dir=app.config['UPLOAD_FOLDER'])
                # Record the path before anything else can fail, so the
                # finally-block removes the file even if save() raises.
                temp_pdf_path = temp_path
                os.close(fd)
                pdf_file.save(temp_path)
                logger.info(f"Uploaded PDF saved to temporary path: {temp_pdf_path}")
                pdf_input_source_for_generator = temp_pdf_path
                yield yield_message("status", {"message": f"Processing uploaded PDF: {filename}"})
                time.sleep(0.01)

            elif pdf_url:
                # NOTE(review): the form value is percent-decoded once more
                # here; presumably the client pre-encodes it - confirm, since
                # this would alter URLs that legitimately contain %-escapes.
                unquoted_url = urllib.parse.unquote(pdf_url)
                if not unquoted_url.startswith(('http://', 'https://')):
                    yield yield_message("error", {"message": "Invalid URL scheme. Must be http or https."})
                    return

                pdf_input_source_for_generator = unquoted_url
                yield yield_message("status", {"message": f"Preparing to process PDF from URL: {unquoted_url}"})
                time.sleep(0.01)
            else:
                yield yield_message("error", {"message": "No PDF file uploaded and no PDF URL provided."})
                return

            yield from generate_pdf_conversion_stream(pdf_input_source_for_generator)

        except Exception as e:
            logger.error(f"Error setting up stream or in initial validation: {str(e)}", exc_info=True)
            yield yield_message("error", {"message": f"Setup error: {str(e)}"})
        finally:
            # Runs when the stream finishes, fails, or is closed early.
            if temp_pdf_path and os.path.exists(temp_pdf_path):
                try:
                    os.remove(temp_pdf_path)
                    logger.info(f"Cleaned up temporary PDF: {temp_pdf_path}")
                except OSError as ose:
                    logger.error(f"Error removing temporary PDF {temp_pdf_path}: {ose}")

    return Response(stream_with_context(stream_processor()), mimetype='application/x-ndjson')
406
+
407
+
408
+ # --- Main Execution ---
409
if __name__ == '__main__':
    if not check_poppler():
        logger.warning("Poppler utilities might not be installed correctly. PDF processing might fail.")
    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
    # The server binds to every interface, so the interactive debugger (which
    # allows arbitrary code execution from the browser) must not be on by
    # default. Opt in explicitly with FLASK_DEBUG=1 for local development.
    debug_enabled = os.getenv("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(host='0.0.0.0', port=int(os.getenv("PORT", 7860)), debug=debug_enabled, threaded=True)