File size: 20,615 Bytes
9bc382d
a87a8f6
8323e8f
a87a8f6
 
 
 
 
 
 
9bc382d
a87a8f6
 
 
 
 
 
 
 
 
 
8323e8f
 
 
a87a8f6
 
 
 
 
 
 
9bc382d
a87a8f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8323e8f
a87a8f6
 
 
 
 
8323e8f
a87a8f6
35151aa
8323e8f
 
35151aa
a87a8f6
 
 
 
8323e8f
 
a87a8f6
 
 
8323e8f
 
 
a87a8f6
 
8323e8f
a87a8f6
 
 
8323e8f
a87a8f6
 
 
8323e8f
 
 
 
a87a8f6
8323e8f
a87a8f6
8323e8f
a87a8f6
 
 
8323e8f
 
 
a87a8f6
 
 
 
8323e8f
a87a8f6
8323e8f
 
a87a8f6
8323e8f
 
a87a8f6
8323e8f
a87a8f6
 
8323e8f
a87a8f6
 
 
 
8323e8f
a87a8f6
 
 
8323e8f
 
a87a8f6
8323e8f
a87a8f6
 
 
8323e8f
a87a8f6
 
 
 
 
 
 
 
 
8323e8f
a87a8f6
 
 
 
8323e8f
a87a8f6
8323e8f
a87a8f6
8323e8f
a87a8f6
 
 
 
 
 
 
 
8323e8f
 
 
a87a8f6
 
 
 
 
8323e8f
a87a8f6
 
8323e8f
 
a87a8f6
8323e8f
a87a8f6
8323e8f
a87a8f6
 
8323e8f
 
a87a8f6
 
 
 
8323e8f
a87a8f6
8323e8f
a87a8f6
8323e8f
a87a8f6
 
8323e8f
 
 
 
 
a87a8f6
 
 
 
8323e8f
a87a8f6
 
8323e8f
 
a87a8f6
8323e8f
a87a8f6
 
 
 
 
 
 
 
 
9bc382d
a87a8f6
 
 
 
 
 
 
 
 
 
 
 
35151aa
 
a87a8f6
 
 
 
 
 
 
8323e8f
a87a8f6
8323e8f
 
 
a87a8f6
8323e8f
a87a8f6
 
 
 
 
 
 
 
 
8323e8f
a87a8f6
 
 
 
 
 
 
 
35151aa
 
a87a8f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8323e8f
a87a8f6
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
import os
import io
import re # Still needed for some image filename manipulation if any, but not for text formatting
import logging
import subprocess
from datetime import datetime
import urllib.parse
import tempfile
import json # For streaming JSON messages
import time # For gevent.sleep

from flask import Flask, request, render_template, Response, stream_with_context
from werkzeug.utils import secure_filename

# Ensure gevent is imported and monkey patched if needed for other libraries
# from gevent import monkey
# monkey.patch_all() # Apply this early if you suspect issues with other libs

import requests # For requests.exceptions.HTTPError
from requests.exceptions import HTTPError as RequestsHTTPError # Specific import for clarity

# pdfplumber is no longer needed
import pdf2image 
from pdf2image import convert_from_path, convert_from_bytes 
# from pdf2image.exceptions import ... # If you need to catch specific pdf2image errors

import pytesseract
from PIL import Image
from huggingface_hub import HfApi, create_repo

# --- Flask App Initialization ---
app = Flask(__name__)
# Temporary files (uploaded PDFs, page images awaiting upload) are written to
# the system temp directory.
app.config['UPLOAD_FOLDER'] = tempfile.gettempdir()
app.config['MAX_CONTENT_LENGTH'] = 50 * 1024 * 1024  # 50 MB limit for uploads, adjust as needed

# --- Logging Configuration ---
# INFO-level logging with a "timestamp - level - message" layout; `logger` is
# the module-level logger used by every function in this file.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# --- Hugging Face Configuration ---
# HF_TOKEN is None when the environment variable is unset; ensure_hf_dataset()
# then returns an error string and page images are not uploaded.
HF_TOKEN = os.getenv("HF_TOKEN")
# Name of the dataset repo that receives the page images (overridable via env).
HF_DATASET_REPO_NAME = os.getenv("HF_DATASET_REPO_NAME", "pdf-images-extracted")
# Shared Hub client used for whoami() and upload_file() calls.
hf_api = HfApi()

# --- Helper to yield messages for streaming ---
def yield_message(type, data):
    """Serialize one stream message as a single newline-terminated JSON line.

    The resulting object carries ``type`` under the ``"type"`` key followed by
    every key of ``data`` (a key named ``"type"`` in ``data`` overrides it).
    """
    envelope = {"type": type, **data}
    serialized = json.dumps(envelope)
    return f"{serialized}\n"

# --- PDF Processing Helper Functions (Adapted for Streaming) ---

def check_poppler():
    """Probe for Poppler by running ``pdftoppm -v``.

    Returns:
        True when the command could be started (its exit status is not
        inspected), False when the executable is missing or the probe raised
        any other exception. The outcome is logged either way.
    """
    try:
        probe = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True, check=False)
        # The version banner is read from stderr first, with stdout as fallback.
        banner = (probe.stderr or probe.stdout).strip()
        if not banner:
            logger.info("Poppler 'pdftoppm -v' ran. Assuming Poppler is present.")
        else:
            first_line = banner.splitlines()[0]
            logger.info(f"Poppler version check: {first_line}")
        return True
    except FileNotFoundError:
        logger.error("Poppler (pdftoppm command) not found. Ensure poppler-utils is installed and in PATH.")
        return False
    except Exception as probe_error:
        logger.error(f"An unexpected error occurred during Poppler check: {str(probe_error)}")
        return False

def ensure_hf_dataset():
    """Create the configured Hugging Face dataset repo if needed and return its id.

    Returns:
        The repo id string on success. On failure (including a missing
        ``HF_TOKEN``) a human-readable string starting with ``"Error"`` is
        returned instead of raising, so callers must check the prefix.
    """
    if not HF_TOKEN:
        msg = "HF_TOKEN is not set. Cannot ensure Hugging Face dataset. Image uploads will fail."
        logger.warning(msg)
        return "Error: " + msg

    try:
        repo_handle = create_repo(repo_id=HF_DATASET_REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
        logger.info(f"Dataset repo ensured: {repo_handle.repo_id}")
        return repo_handle.repo_id
    except RequestsHTTPError as e:
        http_response = e.response
        if http_response is None or http_response.status_code != 409:
            # Any HTTP failure other than "conflict / already exists" is reported as an error.
            status_code = http_response.status_code if http_response is not None else "Unknown"
            logger.error(f"Hugging Face dataset HTTP error (Status: {status_code}): {str(e)}")
            return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}' due to HTTP error: {str(e)}"

        # HTTP 409: the repo already exists; try to qualify its name with the
        # account returned by whoami(), falling back to the bare repo name.
        logger.info(f"Dataset repo '{HF_DATASET_REPO_NAME}' already exists (HTTP 409).")
        try:
            account = hf_api.whoami(token=HF_TOKEN)
            owner = account.get('name') if account else None
            if not owner:
                logger.warning(f"Could not determine namespace for existing repo '{HF_DATASET_REPO_NAME}'. Using generic ID.")
                return HF_DATASET_REPO_NAME
            return f"{owner}/{HF_DATASET_REPO_NAME}"
        except Exception as whoami_e:
            logger.error(f"Could not determine namespace for existing repo via whoami due to: {whoami_e}. Using generic ID.")
            return HF_DATASET_REPO_NAME
    except Exception as e:
        logger.error(f"Hugging Face dataset general error: {str(e)}", exc_info=True)
        return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}': {str(e)}"


def upload_image_to_hf_stream(image_pil, filename_base, page_num_for_log=""):
    """Save a PIL image as a temporary PNG and upload it to the dataset repo.

    Args:
        image_pil: Image object exposing ``save(path, format=...)``.
        filename_base: Prefix for the file name inside the repo's ``images/`` folder.
        page_num_for_log: Label embedded in the repo file name and in log messages.

    Returns:
        Whatever ``hf_api.upload_file`` returns on success, otherwise a string
        starting with ``"Error"`` (either from ``ensure_hf_dataset`` or
        describing the upload failure). Nothing is raised to the caller.
    """
    dataset_ref = ensure_hf_dataset()
    if isinstance(dataset_ref, str) and dataset_ref.startswith("Error"):
        return dataset_ref

    scratch_path = None
    try:
        # Microsecond timestamp keeps repo file names distinct across uploads.
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        path_in_repo = f"images/{filename_base}_{page_num_for_log}_{stamp}.png"

        upload_dir = app.config['UPLOAD_FOLDER']
        os.makedirs(upload_dir, exist_ok=True)
        # Reserve a unique path, close the handle, then let PIL write to it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png", dir=upload_dir) as handle:
            scratch_path = handle.name
        image_pil.save(scratch_path, format="PNG")

        logger.info(f"Attempting to upload {scratch_path} to {dataset_ref}/{path_in_repo}")
        uploaded_url = hf_api.upload_file(
            path_or_fileobj=scratch_path,
            path_in_repo=path_in_repo,
            repo_id=dataset_ref,
            repo_type="dataset",
            token=HF_TOKEN,
        )
        logger.info(f"Successfully uploaded image: {uploaded_url}")
        return uploaded_url
    except Exception as e:
        logger.error(f"Image upload error for {filename_base}{page_num_for_log}: {str(e)}", exc_info=True)
        return f"Error uploading image {filename_base}{page_num_for_log}: {str(e)}"
    finally:
        # Always remove the scratch PNG, whether or not the upload succeeded.
        if scratch_path and os.path.exists(scratch_path):
            try:
                os.remove(scratch_path)
            except OSError as ose:
                logger.error(f"Error removing temp image file {scratch_path}: {ose}")

# format_page_text_to_markdown_chunk function is removed as it's no longer used.

# --- Main PDF Processing Logic (Generator Function for Streaming) ---

def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
    """Yield newline-delimited JSON messages while rendering PDF pages to images and OCR-ing them.

    Args:
        pdf_input_source_path_or_url: An ``http://``/``https://`` URL, in which
            case the PDF is downloaded into memory first; any other value is
            handed to pdf2image as a local file path.

    Yields:
        Strings built by ``yield_message``. The message types emitted here are
        ``markdown_replace``, ``status``, ``markdown_chunk``, ``image_md``,
        ``error`` and ``final_status``.

    Failures are reported to the consumer as ``error`` messages instead of
    being raised. A download failure ends the stream immediately; a failed
    Poppler check or an extraction error is still followed by the closing
    ``final_status`` message.
    """
    try:
        yield yield_message("markdown_replace", {"content": "# Extracted Images and OCR Text\n\n"})
        # Short pause after a message; presumably lets the server flush the
        # chunk to the client -- TODO confirm it is still needed.
        time.sleep(0.01)

        actual_pdf_input_for_images = None
        is_input_bytes = False

        source_is_url = (
            isinstance(pdf_input_source_path_or_url, str)
            and pdf_input_source_path_or_url.startswith(('http://', 'https://'))
        )

        if source_is_url:
            yield yield_message("status", {"message": "Downloading PDF from URL..."})
            time.sleep(0.01)
            try:
                # NOTE(review): the whole body is loaded into memory with no size cap.
                response = requests.get(pdf_input_source_path_or_url, stream=False, timeout=60)
                response.raise_for_status()
                actual_pdf_input_for_images = response.content
                is_input_bytes = True
                yield yield_message("status", {"message": f"PDF downloaded from URL ({len(actual_pdf_input_for_images)/1024:.2f} KB)."})
                time.sleep(0.01)
            except RequestsHTTPError as e:
                # A requests.Response is falsy for 4xx/5xx statuses (its truth
                # value is `response.ok`), so it must be compared against None;
                # a plain truthiness test would always report 'N/A' here.
                failed_response = e.response
                if failed_response is not None:
                    http_status = failed_response.status_code
                    http_reason = failed_response.reason
                else:
                    http_status = 'N/A'
                    http_reason = str(e)
                logger.error(f"URL fetch HTTP error: {str(e)} (Status: {http_status})", exc_info=True)
                yield yield_message("error", {"message": f"Error fetching PDF from URL (HTTP {http_status}): {http_reason}"})
                return
            except requests.RequestException as e:
                logger.error(f"URL fetch network error: {str(e)}", exc_info=True)
                yield yield_message("error", {"message": f"Network error fetching PDF from URL: {str(e)}"})
                return
        else:
            actual_pdf_input_for_images = pdf_input_source_path_or_url
            is_input_bytes = False
            yield yield_message("status", {"message": "Processing local PDF file..."})
            time.sleep(0.01)

        # ----- Image Extraction and OCR -----
        if not check_poppler():
            yield yield_message("error", {"message": "Poppler (for image extraction) not found or not working."})
        else:
            yield yield_message("status", {"message": "Starting image extraction and OCR..."})
            yield yield_message("markdown_chunk", {"content": "## Extracted Images & OCR Text from PDF Pages\n\n"})
            if not HF_TOKEN:
                yield yield_message("markdown_chunk", {"content": "**Note:** `HF_TOKEN` not set. Images will be described but not uploaded.\n\n"})

            time.sleep(0.01)
            extracted_pil_images_overall_count = 0
            try:
                if actual_pdf_input_for_images:
                    try:  # Page-by-page conversion keeps only one rendered page in memory at a time.
                        if is_input_bytes:
                            pdf_info = pdf2image.pdfinfo_from_bytes(actual_pdf_input_for_images, userpw=None, poppler_path=None)
                        else:
                            pdf_info = pdf2image.pdfinfo_from_path(actual_pdf_input_for_images, userpw=None, poppler_path=None)

                        num_image_pages = pdf_info.get("Pages", 0)
                        yield yield_message("status", {"message": f"PDF has {num_image_pages} page(s) for image conversion and OCR."})

                        batch_size = 1
                        for page_idx_start in range(1, num_image_pages + 1, batch_size):
                            page_idx_end = min(page_idx_start + batch_size - 1, num_image_pages)
                            yield yield_message("status", {"message": f"Converting PDF page(s) {page_idx_start}-{page_idx_end} to image(s)..."})
                            time.sleep(0.01)

                            if is_input_bytes:
                                page_images_pil = convert_from_bytes(actual_pdf_input_for_images, dpi=150, first_page=page_idx_start, last_page=page_idx_end)
                            else:
                                page_images_pil = convert_from_path(actual_pdf_input_for_images, dpi=150, first_page=page_idx_start, last_page=page_idx_end)

                            for img_idx_in_batch, img_pil in enumerate(page_images_pil):
                                extracted_pil_images_overall_count += 1
                                current_pdf_page_num = page_idx_start + img_idx_in_batch
                                page_num_for_log = f"pdfpage_{current_pdf_page_num}"

                                yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (from PDF page {current_pdf_page_num}) (OCR & Upload)..."})
                                time.sleep(0.01)

                                ocr_text = ""
                                try:
                                    ocr_text = pytesseract.image_to_string(img_pil).strip()
                                    if ocr_text:
                                        yield yield_message("status", {"message": f"  OCR successful for image {extracted_pil_images_overall_count}."})
                                    else:
                                        yield yield_message("status", {"message": f"  OCR complete for image {extracted_pil_images_overall_count} (no text found)."})
                                except Exception as ocr_e:
                                    # An OCR failure is embedded in the output for this page instead of aborting the run.
                                    logger.error(f"OCR error for image {extracted_pil_images_overall_count}: {str(ocr_e)}")
                                    ocr_text = f"OCR failed: {str(ocr_e)}"

                                image_md_chunk = f"### Image from PDF Page {current_pdf_page_num}\n"
                                if HF_TOKEN:
                                    # The upload helper returns an "Error..." string on failure instead of raising.
                                    image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_page_image", page_num_for_log)
                                    if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
                                        image_md_chunk += f"![Image from PDF Page {current_pdf_page_num}]({image_url_or_error})\n"
                                        yield yield_message("status", {"message": f"  Image {extracted_pil_images_overall_count} uploaded."})
                                    else:
                                        image_md_chunk += f"**Image {extracted_pil_images_overall_count} (Upload Error):** {str(image_url_or_error)}\n\n"
                                        yield yield_message("error", {"message": f"Failed to upload image {extracted_pil_images_overall_count}: {str(image_url_or_error)}"})
                                else:
                                    image_md_chunk += f"**Image {extracted_pil_images_overall_count} (not uploaded due to missing HF_TOKEN)**\n"

                                if ocr_text:
                                    image_md_chunk += f"**OCR Text (from PDF Page {current_pdf_page_num}):**\n```\n{ocr_text}\n```\n\n"
                                else:
                                    image_md_chunk += f"_(No text detected by OCR for image from PDF page {current_pdf_page_num})_\n\n"

                                yield yield_message("image_md", {"content": image_md_chunk})
                                time.sleep(0.01)
                    except Exception as e_img_info:
                        logger.error(f"Could not get PDF info for image batching or during batched conversion: {e_img_info}", exc_info=True)
                        yield yield_message("error", {"message": f"Error preparing for image extraction: {e_img_info}. Trying bulk conversion."})
                        # Fallback: convert every page in a single call.
                        # NOTE(review): if the per-page loop failed after some
                        # pages were already emitted, this re-processes the whole
                        # document, so those pages appear twice in the output.
                        if is_input_bytes:
                            bulk_images_pil = convert_from_bytes(actual_pdf_input_for_images, dpi=150)
                        else:
                            bulk_images_pil = convert_from_path(actual_pdf_input_for_images, dpi=150)

                        yield yield_message("status", {"message": f"Fallback: Converted {len(bulk_images_pil)} PDF pages to images in bulk."})
                        for i, img_pil in enumerate(bulk_images_pil):
                            extracted_pil_images_overall_count += 1
                            page_num_for_log = f"bulk_image_{i+1}"
                            yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (bulk page {i+1}) (OCR & Upload)..."})
                            ocr_text = ""
                            try:
                                ocr_text = pytesseract.image_to_string(img_pil).strip()
                            except Exception as e:
                                ocr_text = f"OCR Error: {e}"

                            image_md_chunk = f"### Image from PDF Page (Bulk {i+1})\n"
                            if HF_TOKEN:
                                image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_page_image_fallback", page_num_for_log)
                                if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
                                    image_md_chunk += f"![Image {extracted_pil_images_overall_count} (Fallback)]({image_url_or_error})\n"
                                else:
                                    image_md_chunk += f"**Upload Error:** {str(image_url_or_error)}\n"
                            else:
                                image_md_chunk += f"**Image {extracted_pil_images_overall_count} (Fallback - not uploaded)**\n"

                            if ocr_text:
                                image_md_chunk += f"**OCR Text (Bulk Page {i+1}):**\n```\n{ocr_text}\n```\n\n"
                            else:
                                image_md_chunk += f"_(No text detected by OCR for bulk image {i+1})_\n\n"
                            yield yield_message("image_md", {"content": image_md_chunk})
                            time.sleep(0.01)

                else:
                    yield yield_message("status", {"message": "No valid PDF input source provided for image extraction."})

            except Exception as e:
                # Reached when the bulk fallback itself fails.
                logger.error(f"Error during image extraction/OCR processing: {str(e)}", exc_info=True)
                yield yield_message("error", {"message": f"Error during image extraction/OCR: {str(e)}"})

        yield yield_message("final_status", {"message": "Image extraction and OCR processing complete."})

    except Exception as e:
        logger.error(f"Unhandled error in PDF conversion stream: {str(e)}", exc_info=True)
        yield yield_message("error", {"message": f"Critical processing error: {str(e)}"})


# --- Flask Routes ---

@app.route('/', methods=['GET'])
def index():
    """Serve the landing page by rendering the ``index.html`` template."""
    return render_template('index.html')

@app.route('/process-stream', methods=['POST'])
def process_pdf_stream():
    """Handle a PDF upload or PDF URL and stream conversion progress as NDJSON.

    Reads the ``pdf_file`` upload and the ``pdf_url`` form field; the upload
    takes precedence when both are present. Validation problems are reported
    as ``error`` messages inside the stream rather than as HTTP error statuses.

    Returns:
        A streaming ``Response`` with mimetype ``application/x-ndjson`` whose
        body is the sequence of messages produced by
        ``generate_pdf_conversion_stream``.
    """
    pdf_file = request.files.get('pdf_file')
    pdf_url = request.form.get('pdf_url', '').strip()

    outer_temp_pdf_path = None

    def stream_processor():
        """Validate the input, run the conversion generator and clean up the temp PDF."""
        nonlocal outer_temp_pdf_path
        pdf_input_source_for_generator = None

        try:
            if pdf_file and pdf_file.filename:
                if not pdf_file.filename.lower().endswith('.pdf'):
                    yield yield_message("error", {"message": "Uploaded file is not a PDF."})
                    return

                filename = secure_filename(pdf_file.filename)
                os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
                # Save to a temporary file that generate_pdf_conversion_stream can access by path
                fd, temp_path = tempfile.mkstemp(suffix=".pdf", prefix="upload_", dir=app.config['UPLOAD_FOLDER'])
                # Record the path for cleanup before anything else can fail, so
                # a failed save() does not leave the file behind.
                outer_temp_pdf_path = temp_path
                os.close(fd)  # Only the path is needed; save() reopens the file itself
                pdf_file.save(temp_path)
                logger.info(f"Uploaded PDF saved to temporary path: {outer_temp_pdf_path}")
                pdf_input_source_for_generator = outer_temp_pdf_path
                yield yield_message("status", {"message": f"Processing uploaded PDF: {filename}"})
                time.sleep(0.01)

            elif pdf_url:
                # NOTE(review): the URL is fetched server-side by the conversion
                # generator; only the scheme is validated here.
                unquoted_url = urllib.parse.unquote(pdf_url)
                if not unquoted_url.startswith(('http://', 'https://')):
                    yield yield_message("error", {"message": "Invalid URL scheme. Must be http or https."})
                    return

                pdf_input_source_for_generator = unquoted_url
                yield yield_message("status", {"message": f"Preparing to process PDF from URL: {unquoted_url}"})
                time.sleep(0.01)
            else:
                yield yield_message("error", {"message": "No PDF file uploaded and no PDF URL provided."})
                return

            yield from generate_pdf_conversion_stream(pdf_input_source_for_generator)

        except Exception as e:
            logger.error(f"Error setting up stream or in initial validation: {str(e)}", exc_info=True)
            yield yield_message("error", {"message": f"Setup error: {str(e)}"})
        finally:
            # Runs when the stream finishes or fails; removes the uploaded PDF copy.
            if outer_temp_pdf_path and os.path.exists(outer_temp_pdf_path):
                try:
                    os.remove(outer_temp_pdf_path)
                    logger.info(f"Cleaned up temporary PDF: {outer_temp_pdf_path}")
                except OSError as ose:
                    logger.error(f"Error removing temporary PDF {outer_temp_pdf_path}: {ose}")

    return Response(stream_with_context(stream_processor()), mimetype='application/x-ndjson')


# --- Main Execution ---
if __name__ == '__main__':
    if not check_poppler():
        logger.warning("Poppler utilities might not be installed correctly. Image processing might fail.")
    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
    # The server listens on all interfaces, so the interactive Werkzeug
    # debugger must not be on by default: it allows arbitrary code execution
    # for anyone who can reach it. Set FLASK_DEBUG=1 to opt in locally.
    debug_enabled = os.getenv("FLASK_DEBUG", "").strip().lower() in ("1", "true", "yes", "on")
    app.run(host='0.0.0.0', port=int(os.getenv("PORT", 7860)), debug=debug_enabled, threaded=True)