seanpedrickcase committed

Commit 2c00d05 · 1 Parent(s): 01c8eb6

Improved paddle and hybrid OCR analysis across all options. Tried to revise requirements for spaces
README.md CHANGED
@@ -131,6 +131,16 @@ Alternatively, you can use the full `requirements.txt` file, that contains refer
 pip install -r requirements.txt
 ```
 
+ Note that the versions of both PaddleOCR and Torch installed by default are the CPU-only versions. If you want to install the equivalent GPU versions, you will need to run the following commands:
+ ```bash
+ pip install paddlepaddle-gpu==3.2.1 --index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/
+ ```
+
+ ```bash
+ pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cu126
+ pip install torchvision --index-url https://download.pytorch.org/whl/cu126
+ ```
+
 ### 3. Run the Application
 
 With all dependencies installed, you can now start the Gradio application.
@@ -938,7 +948,7 @@ The hybrid OCR mode uses several configurable parameters:
 - **HYBRID_OCR_CONFIDENCE_THRESHOLD** (default: 65): Tesseract confidence score below which PaddleOCR will be used for re-extraction
 - **HYBRID_OCR_PADDING** (default: 1): Padding added to word bounding boxes before re-extraction
 - **SAVE_EXAMPLE_HYBRID_IMAGES** (default: False): Save comparison images when using hybrid mode
- - **SAVE_PADDLE_VISUALISATIONS** (default: False): Save images with PaddleOCR bounding boxes overlaid
+ - **SAVE_PAGE_OCR_VISUALISATIONS** (default: False): Save images with PaddleOCR bounding boxes overlaid
 
 ### When to use different OCR models
 
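A quick way to confirm the GPU builds are actually picked up after following the install note in this README hunk is a short Python check; this is a minimal sketch (not part of this commit) and assumes the cu126 wheels installed successfully:

```python
# Minimal sanity check (illustrative only, not part of this commit):
# verify that the GPU builds of PaddlePaddle and PyTorch can see CUDA.
import paddle
import torch

print("PaddlePaddle compiled with CUDA:", paddle.device.is_compiled_with_cuda())
print("PyTorch CUDA available:", torch.cuda.is_available())
```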
app.py CHANGED
@@ -1,9 +1,9 @@
 import os
- import spaces
 from pathlib import Path
 
 import gradio as gr
 import pandas as pd
+ import spaces
 from fastapi import FastAPI, status
 from gradio_image_annotation import image_annotator
 
@@ -260,9 +260,6 @@ app = FastAPI()
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.middleware.trustedhost import TrustedHostMiddleware
 
-
- spaces.annotations
-
 ###
 # Load in Gradio app components
 ###
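For context, `spaces` is the Hugging Face ZeroGPU helper package: the stray `spaces.annotations` statement is dropped and the import is kept with the other top-level imports, since it needs to be imported before any CUDA work happens. A minimal sketch of how the package is typically used (the `redact_page` function here is hypothetical, not taken from this repository):

```python
import spaces


@spaces.GPU  # on ZeroGPU Spaces, requests a GPU for the duration of the call
def redact_page(image):
    # hypothetical GPU-bound OCR / redaction work would go here
    ...
```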
example_data/partnership_toolkit_redact_custom_deny_list.csv CHANGED
@@ -1,4 +1,2 @@
- Sister
- Sister City
- Sister Cities
- Friendship City
+ Friendship City
+ United States
pre-requirements.txt CHANGED
@@ -1,4 +1,4 @@
 # --- PaddleOCR (CUDA 12.6) ---
 --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/
- paddlepaddle-gpu==3.0.0
+ paddlepaddle-gpu<=3.2.1
 paddleocr<=3.3.0
pyproject.toml CHANGED
@@ -55,14 +55,14 @@ test = ["pytest", "pytest-cov"]
 
 # Extra dependencies for PaddleOCR
 paddle = [
-     "paddlepaddle-gpu==3.2.1", # Specific version for compatibility with VLM and torch installation described below -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
-     "paddleocr==3.3.0",
+     "paddlepaddle<=3.2.1", # If you want the GPU-accelerated version, run manually pip install paddlepaddle-gpu<=3.2.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
+     "paddleocr<=3.3.0",
 ]
 
 # Extra dependencies for VLM models
 vlm = [
-     "torch==2.8.0", # should use --index-url https://download.pytorch.org/whl/cu126 for cuda support for paddleocr, need to install manually
-     "torchvision==0.24.0",
+     "torch<=2.8.0", # should use --index-url https://download.pytorch.org/whl/cu126 for cuda support for paddleocr, need to install manually
+     "torchvision>=0.20.1",
     "transformers==4.57.1",
     "accelerate==1.11.0",
 ]
requirements.txt CHANGED
@@ -37,16 +37,14 @@ scikit-learn==1.7.2
 spacy==3.8.7
 spaczz==0.6.1
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
+ transformers==4.57.1
+ accelerate==1.11.0
 
 # --- Testing ---
 pytest>=7.0.0
 pytest-cov>=4.0.0
 
- transformers==4.57.1
- accelerate==1.11.0
-
 # --- PyTorch (CUDA 12.6) ---
- nvidia-nccl-cu12==2.21.5
 --extra-index-url https://download.pytorch.org/whl/cu126
- torch>=2.5.1, <=2.6.0
- torchvision>=0.20.1, <=0.24.0
+ torch<=2.8.0
+ torchvision>=0.20.1
src/app_settings.qmd CHANGED
@@ -327,7 +327,7 @@ Configurations related to text extraction, PII detection, and the redaction proc
 * **Description:** Saves comparison images when using "hybrid-paddle" OCR mode.
 * **Default Value:** `"False"`
 
- * **`SAVE_PADDLE_VISUALISATIONS`**
+ * **`SAVE_PAGE_OCR_VISUALISATIONS`**
 * **Description:** Saves images with PaddleOCR's detected bounding boxes overlaid.
 * **Default Value:** `"False"`
 
src/user_guide.qmd CHANGED
@@ -722,7 +722,7 @@ The hybrid OCR mode uses several configurable parameters:
 - **HYBRID_OCR_CONFIDENCE_THRESHOLD** (default: 65): Tesseract confidence score below which PaddleOCR will be used for re-extraction
 - **HYBRID_OCR_PADDING** (default: 1): Padding added to word bounding boxes before re-extraction
 - **SAVE_EXAMPLE_HYBRID_IMAGES** (default: False): Save comparison images when using hybrid mode
- - **SAVE_PADDLE_VISUALISATIONS** (default: False): Save images with PaddleOCR bounding boxes overlaid
+ - **SAVE_PAGE_OCR_VISUALISATIONS** (default: False): Save images with PaddleOCR bounding boxes overlaid
 
 ### When to use different OCR models
 
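The hybrid OCR parameters documented above are read from environment variables via `get_or_create_env_var` in `tools/config.py` (see the diff below), so they can be overridden without editing code. A minimal sketch with example values, assuming the variables are set before `tools.config` is first imported:

```python
import os

# Example overrides (illustrative values only); set these before tools.config is imported.
os.environ["HYBRID_OCR_CONFIDENCE_THRESHOLD"] = "80"  # re-OCR lines below this Tesseract confidence
os.environ["HYBRID_OCR_PADDING"] = "2"                # extra pixels around each crop before re-extraction
os.environ["SAVE_EXAMPLE_HYBRID_IMAGES"] = "True"     # save before/after comparison crops
os.environ["SAVE_PAGE_OCR_VISUALISATIONS"] = "True"   # save pages with OCR bounding boxes overlaid
```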
tools/config.py CHANGED
@@ -473,6 +473,14 @@ MAX_INPUT_TOKEN_LENGTH = int(
     get_or_create_env_var("MAX_INPUT_TOKEN_LENGTH", "4096")
 ) # Maximum number of tokens to input to the VLM
 
+ VLM_MAX_IMAGE_SIZE = int(
+     get_or_create_env_var("VLM_MAX_IMAGE_SIZE", "1000000")
+ ) # Maximum total pixels (width * height) for images passed to VLM. Images with more pixels will be resized while maintaining aspect ratio. Default is 1000000 (1000x1000).
+
+ VLM_MAX_DPI = float(
+     get_or_create_env_var("VLM_MAX_DPI", "300.0")
+ ) # Maximum DPI for images passed to VLM. Images with higher DPI will be resized accordingly.
+
 USE_FLASH_ATTENTION = convert_string_to_boolean(
     get_or_create_env_var("USE_FLASH_ATTENTION", "False")
 ) # Whether to use flash attention for the VLM
@@ -506,7 +514,7 @@ MODEL_CACHE_PATH = get_or_create_env_var("MODEL_CACHE_PATH", "./model_cache")
 
 
 HYBRID_OCR_CONFIDENCE_THRESHOLD = int(
-     get_or_create_env_var("HYBRID_OCR_CONFIDENCE_THRESHOLD", "65")
+     get_or_create_env_var("HYBRID_OCR_CONFIDENCE_THRESHOLD", "80")
 ) # The tesseract confidence threshold under which the text will be passed to PaddleOCR for re-extraction using the hybrid OCR method.
 HYBRID_OCR_PADDING = int(
     get_or_create_env_var("HYBRID_OCR_PADDING", "1")
@@ -536,17 +544,9 @@ SAVE_EXAMPLE_HYBRID_IMAGES = convert_string_to_boolean(
     get_or_create_env_var("SAVE_EXAMPLE_HYBRID_IMAGES", "False")
 ) # Whether to save example images of Tesseract vs PaddleOCR re-extraction in hybrid OCR mode.
 
- SAVE_PADDLE_VISUALISATIONS = convert_string_to_boolean(
-     get_or_create_env_var("SAVE_PADDLE_VISUALISATIONS", "False")
- ) # Whether to save visualisations of PaddleOCR bounding boxes.
-
- SAVE_TESSERACT_VISUALISATIONS = convert_string_to_boolean(
-     get_or_create_env_var("SAVE_TESSERACT_VISUALISATIONS", "False")
- ) # Whether to save visualisations of Tesseract bounding boxes.
-
- SAVE_TEXTRACT_VISUALISATIONS = convert_string_to_boolean(
-     get_or_create_env_var("SAVE_TEXTRACT_VISUALISATIONS", "False")
- ) # Whether to save visualisations of AWS Textract bounding boxes.
+ SAVE_PAGE_OCR_VISUALISATIONS = convert_string_to_boolean(
+     get_or_create_env_var("SAVE_PAGE_OCR_VISUALISATIONS", "False")
+ ) # Whether to save visualisations of Tesseract, PaddleOCR, and Textract bounding boxes.
 
 # Model storage paths for Lambda compatibility
 PADDLE_MODEL_PATH = get_or_create_env_var(
@@ -565,6 +565,10 @@ SAVE_PREPROCESS_IMAGES = convert_string_to_boolean(
     get_or_create_env_var("SAVE_PREPROCESS_IMAGES", "False")
 ) # Whether to save the pre-processed images.
 
+ SAVE_VLM_INPUT_IMAGES = convert_string_to_boolean(
+     get_or_create_env_var("SAVE_VLM_INPUT_IMAGES", "False")
+ ) # Whether to save input images sent to VLM OCR for debugging.
+
 # Entities for redaction
 CHOSEN_COMPREHEND_ENTITIES = get_or_create_env_var(
     "CHOSEN_COMPREHEND_ENTITIES",
tools/custom_image_analyser_engine.py CHANGED
@@ -29,15 +29,18 @@ from tools.config import (
29
  PADDLE_USE_TEXTLINE_ORIENTATION,
30
  PREPROCESS_LOCAL_OCR_IMAGES,
31
  SAVE_EXAMPLE_HYBRID_IMAGES,
32
- SAVE_PADDLE_VISUALISATIONS,
33
  SAVE_PREPROCESS_IMAGES,
 
34
  SELECTED_MODEL,
35
  TESSERACT_SEGMENTATION_LEVEL,
 
 
36
  )
37
  from tools.helper_functions import clean_unicode_text
38
  from tools.load_spacy_model_custom_recognisers import custom_entities
39
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
40
- from tools.run_vlm import generate_image as vlm_generate_image
41
  from tools.secure_path_utils import validate_folder_containment
42
  from tools.secure_regex_utils import safe_sanitize_text
43
  from tools.word_segmenter import AdaptiveSegmenter
@@ -554,6 +557,84 @@ def _get_tesseract_psm(segmentation_level: str) -> int:
554
  return 11
555
 
556
 
557
  def _vlm_ocr_predict(
558
  image: Image.Image,
559
  prompt: str = "Extract the text content from this image.",
@@ -569,10 +650,47 @@ def _vlm_ocr_predict(
569
  Dictionary in PaddleOCR format with 'rec_texts' and 'rec_scores'
570
  """
571
  try:
572
  # Use the VLM to extract text
573
  # Pass None for parameters to prioritize model-specific defaults from run_vlm.py
574
  # If model defaults are not available, general defaults will be used (matching current values)
575
- extracted_text = vlm_generate_image(
 
576
  text=prompt,
577
  image=image,
578
  max_new_tokens=None, # Use model default if available, otherwise MAX_NEW_TOKENS from config
@@ -582,15 +700,32 @@ def _vlm_ocr_predict(
582
  repetition_penalty=None, # Use model default if available, otherwise 1.3
583
  )
584
 
585
- if extracted_text and extracted_text.strip():
 
 
 
 
 
 
 
 
 
 
 
 
586
  # Clean the text
587
- cleaned_text = extracted_text.strip()
 
 
588
 
589
  # Split into words for compatibility with PaddleOCR format
590
  words = cleaned_text.split()
591
 
592
- # If text has more than 5 words, assume something went wrong and skip it
593
- if len(words) > 5:
 
 
 
594
  return {"rec_texts": [], "rec_scores": []}
595
 
596
  # Create PaddleOCR-compatible result
@@ -601,10 +736,12 @@ def _vlm_ocr_predict(
601
 
602
  return result
603
  else:
 
604
  return {"rec_texts": [], "rec_scores": []}
605
 
606
- except Exception as e:
607
- print(f"VLM OCR error: {e}")
 
608
  return {"rec_texts": [], "rec_scores": []}
609
 
610
 
@@ -814,6 +951,8 @@ class CustomImageAnalyzerEngine:
814
  paddle_results: List[Any],
815
  input_image_width: int = None,
816
  input_image_height: int = None,
 
 
817
  ) -> Dict[str, List]:
818
  """Converts PaddleOCR result format to Tesseract's dictionary format using relative coordinates.
819
 
@@ -825,6 +964,8 @@ class CustomImageAnalyzerEngine:
825
  paddle_results: List of PaddleOCR result dictionaries
826
  input_image_width: Width of the input image passed to PaddleOCR (target dimensions for scaling)
827
  input_image_height: Height of the input image passed to PaddleOCR (target dimensions for scaling)
 
 
828
  """
829
 
830
  output = {
@@ -834,6 +975,7 @@ class CustomImageAnalyzerEngine:
834
  "width": list(),
835
  "height": list(),
836
  "conf": list(),
 
837
  }
838
 
839
  # paddle_results is now a list of dictionaries with detailed information
@@ -848,19 +990,24 @@ class CustomImageAnalyzerEngine:
848
  # Fallback: we'll try to detect from coordinates, but this is less reliable
849
  use_relative_coords = False
850
  else:
851
- use_relative_coords = False
852
 
853
  for page_result in paddle_results:
854
  # Extract text recognition results from the new format
855
  rec_texts = page_result.get("rec_texts", list())
856
  rec_scores = page_result.get("rec_scores", list())
857
  rec_polys = page_result.get("rec_polys", list())
 
858
 
859
  # PaddleOCR may return image dimensions in the result - check for them
860
  # Some versions of PaddleOCR include this information
861
  result_image_width = page_result.get("image_width")
862
  result_image_height = page_result.get("image_height")
863
 
 
 
 
 
864
  # First pass: determine PaddleOCR's coordinate space by finding max coordinates
865
  # This tells us what coordinate space PaddleOCR is actually using
866
  max_x_coord = 0
@@ -879,22 +1026,45 @@ class CustomImageAnalyzerEngine:
879
  max_y_coord = max(max_y_coord, max(y_coords) if y_coords else 0)
880
 
881
  # Determine PaddleOCR's coordinate space dimensions
882
- # Priority: result metadata > detected from coordinates > input dimensions
883
- paddle_coord_width = (
884
- result_image_width
885
- if result_image_width is not None
886
- else max_x_coord if max_x_coord > 0 else input_image_width
887
- )
888
- paddle_coord_height = (
889
- result_image_height
890
- if result_image_height is not None
891
- else max_y_coord if max_y_coord > 0 else input_image_height
892
- )
893
-
894
- # If we couldn't determine PaddleOCR's coordinate space, fall back to input dimensions
895
- if paddle_coord_width is None or paddle_coord_height is None:
 
 
 
 
 
 
 
 
 
 
896
  paddle_coord_width = input_image_width
897
  paddle_coord_height = input_image_height
 
 
 
 
 
 
 
 
 
 
 
 
 
898
  use_relative_coords = False
899
 
900
  if paddle_coord_width <= 0 or paddle_coord_height <= 0:
@@ -905,9 +1075,43 @@ class CustomImageAnalyzerEngine:
905
  paddle_coord_height = input_image_height or 1
906
  use_relative_coords = False
907
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
908
  # Second pass: convert coordinates using relative coordinate approach
909
- for line_text, line_confidence, bounding_box in zip(
910
- rec_texts, rec_scores, rec_polys
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
911
  ):
912
  # bounding_box is now a numpy array with shape (4, 2)
913
  # Convert to list of coordinates if it's a numpy array
@@ -974,6 +1178,7 @@ class CustomImageAnalyzerEngine:
974
  output["width"].append(round(line_width, 2))
975
  output["height"].append(round(line_height, 2))
976
  output["conf"].append(int(line_confidence * 100))
 
977
 
978
  return output
979
 
@@ -1005,6 +1210,7 @@ class CustomImageAnalyzerEngine:
1005
  "width": list(),
1006
  "height": list(),
1007
  "conf": list(),
 
1008
  }
1009
 
1010
  if not line_data or not line_data.get("text"):
@@ -1043,6 +1249,11 @@ class CustomImageAnalyzerEngine:
1043
  for i in range(len(line_data["text"])):
1044
  line_text = line_data["text"][i]
1045
  line_conf = line_data["conf"][i]
 
 
 
 
 
1046
 
1047
  # Get the float values
1048
  f_left = float(line_data["left"][i])
@@ -1171,6 +1382,7 @@ class CustomImageAnalyzerEngine:
1171
  output["width"].append(clamped_width)
1172
  output["height"].append(line_height)
1173
  output["conf"].append(line_conf)
 
1174
  current_left += word_width + estimated_space_width
1175
  continue
1176
 
@@ -1182,6 +1394,8 @@ class CustomImageAnalyzerEngine:
1182
  output["width"].append(word_output["width"][j])
1183
  output["height"].append(word_output["height"][j])
1184
  output["conf"].append(word_output["conf"][j])
 
 
1185
 
1186
  return output
1187
 
@@ -1563,19 +1777,21 @@ class CustomImageAnalyzerEngine:
1563
  self,
1564
  image: Image.Image,
1565
  ocr: Optional[Any] = None,
 
1566
  confidence_threshold: int = HYBRID_OCR_CONFIDENCE_THRESHOLD,
1567
  padding: int = HYBRID_OCR_PADDING,
1568
  image_name: str = "unknown_image_name",
1569
  input_image_width: int = None,
1570
  input_image_height: int = None,
1571
- ) -> Dict[str, list]:
1572
  """
1573
  Performs OCR using PaddleOCR at line level, then VLM for low-confidence lines.
1574
- Returns data in the same dictionary format as pytesseract.image_to_data.
1575
 
1576
  Args:
1577
  image: PIL Image to process
1578
  ocr: PaddleOCR instance (optional, uses self.paddle_ocr if not provided)
 
1579
  confidence_threshold: Confidence threshold below which VLM is used
1580
  padding: Padding to add around line crops
1581
  image_name: Name of the image for logging/debugging
@@ -1583,7 +1799,7 @@ class CustomImageAnalyzerEngine:
1583
  input_image_height: Original image height (before preprocessing)
1584
 
1585
  Returns:
1586
- Dictionary with OCR results in Tesseract format
1587
  """
1588
  if ocr is None:
1589
  if hasattr(self, "paddle_ocr") and self.paddle_ocr is not None:
@@ -1593,6 +1809,9 @@ class CustomImageAnalyzerEngine:
1593
  "No OCR object provided and 'paddle_ocr' is not initialized."
1594
  )
1595
 
 
 
 
1596
  print("Starting hybrid PaddleOCR + VLM OCR process...")
1597
 
1598
  # Get image dimensions
@@ -1604,154 +1823,312 @@ class CustomImageAnalyzerEngine:
1604
  if input_image_height is None:
1605
  input_image_height = img_height
1606
 
1607
- # 1. Get initial line-level results from PaddleOCR
1608
- image_np = np.array(image)
1609
- if len(image_np.shape) == 2:
1610
- image_np = np.stack([image_np] * 3, axis=-1)
1611
 
1612
- paddle_results = ocr.predict(image_np)
 
 
 
 
 
1613
 
1614
- # Convert PaddleOCR results to line-level format
1615
- paddle_line_data = self._convert_paddle_to_tesseract_format(
1616
- paddle_results,
1617
- input_image_width=input_image_width,
1618
- input_image_height=input_image_height,
1619
- )
 
 
 
 
1620
 
1621
- # Prepare final output structure
1622
- final_data = {
1623
- "text": list(),
1624
- "left": list(),
1625
- "top": list(),
1626
- "width": list(),
1627
- "height": list(),
1628
- "conf": list(),
1629
- "model": list(), # Track which model was used for each line
1630
- }
 
 
 
 
 
 
 
1631
 
1632
- num_lines = len(paddle_line_data["text"])
 
 
 
 
 
 
 
 
 
1633
 
1634
- # Process each line
1635
- for i in range(num_lines):
1636
- line_text = paddle_line_data["text"][i]
1637
- line_conf = int(paddle_line_data["conf"][i])
1638
- line_left = float(paddle_line_data["left"][i])
1639
- line_top = float(paddle_line_data["top"][i])
1640
- line_width = float(paddle_line_data["width"][i])
1641
- line_height = float(paddle_line_data["height"][i])
1642
-
1643
- # Skip empty lines
1644
- if not line_text.strip():
1645
- continue
1646
 
1647
- # Initialize model as PaddleOCR (default)
1648
- model_used = "Paddle"
 
1649
 
1650
- # Count words in PaddleOCR output
1651
- paddle_words = line_text.split()
1652
- paddle_word_count = len(paddle_words)
1653
 
1654
- # If confidence is low, use VLM for a second opinion
1655
- if line_conf < confidence_threshold:
1656
- # Calculate crop coordinates with padding
1657
- crop_left = max(0, int(line_left - padding))
1658
- crop_top = max(0, int(line_top - padding))
1659
- crop_right = min(img_width, int(line_left + line_width + padding))
1660
- crop_bottom = min(img_height, int(line_top + line_height + padding))
1661
 
1662
- # Ensure crop dimensions are valid
1663
- if crop_right <= crop_left or crop_bottom <= crop_top:
1664
- # Invalid crop, keep original PaddleOCR result
1665
- final_data["text"].append(clean_unicode_text(line_text))
1666
- final_data["left"].append(line_left)
1667
- final_data["top"].append(line_top)
1668
- final_data["width"].append(line_width)
1669
- final_data["height"].append(line_height)
1670
- final_data["conf"].append(line_conf)
1671
- final_data["model"].append(model_used)
1672
  continue
1673
 
1674
- # Crop the line image
1675
- cropped_image = image.crop(
1676
- (crop_left, crop_top, crop_right, crop_bottom)
1677
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1678
 
1679
- # Use VLM for OCR on this line
1680
- vlm_result = _vlm_ocr_predict(cropped_image)
1681
- vlm_rec_texts = vlm_result.get("rec_texts", [])
1682
- vlm_rec_scores = vlm_result.get("rec_scores", [])
 
 
1683
 
1684
- if vlm_rec_texts and vlm_rec_scores:
1685
- # Combine VLM words into a single text string
1686
- vlm_text = " ".join(vlm_rec_texts)
1687
- vlm_word_count = len(vlm_rec_texts)
1688
- vlm_conf = int(round(np.median(vlm_rec_scores) * 100, 0))
 
 
 
 
1689
 
1690
- # Only replace if word counts match
1691
- if vlm_word_count == paddle_word_count:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1692
  print(
1693
- f" Re-OCR'd line: '{line_text}' (conf: {line_conf}, words: {paddle_word_count}) "
1694
- f"-> '{vlm_text}' (conf: {vlm_conf:.0f}, words: {vlm_word_count}) [VLM]"
1695
  )
1696
 
1697
- # For exporting example image comparisons
1698
- safe_filename = self._create_safe_filename_with_confidence(
1699
- line_text, vlm_text, line_conf, vlm_conf, "VLM"
1700
- )
 
1701
 
1702
- if SAVE_EXAMPLE_HYBRID_IMAGES is True:
1703
- # Normalize and validate image_name to prevent path traversal attacks
1704
- normalized_image_name = os.path.normpath(
1705
- image_name + "_hybrid_paddle_vlm"
1706
  )
1707
- if (
1708
- ".." in normalized_image_name
1709
- or "/" in normalized_image_name
1710
- or "\\" in normalized_image_name
1711
- ):
1712
- normalized_image_name = "safe_image"
1713
 
1714
- hybrid_ocr_examples_folder = (
1715
- self.output_folder
1716
- + f"/hybrid_ocr_examples/{normalized_image_name}"
1717
  )
1718
- # Validate the constructed path is safe
1719
- if not validate_folder_containment(
1720
- hybrid_ocr_examples_folder, OUTPUT_FOLDER
1721
- ):
1722
- raise ValueError(
1723
- f"Unsafe hybrid_ocr_examples folder path: {hybrid_ocr_examples_folder}"
1724
- )
1725
 
1726
- if not os.path.exists(hybrid_ocr_examples_folder):
1727
- os.makedirs(hybrid_ocr_examples_folder)
1728
- output_image_path = (
1729
- hybrid_ocr_examples_folder + f"/{safe_filename}.png"
 
 
 
1730
  )
1731
- print(f"Saving example image to {output_image_path}")
1732
- cropped_image.save(output_image_path)
1733
 
1734
- # Replace with VLM result
1735
- line_text = vlm_text
1736
- line_conf = vlm_conf
1737
- model_used = "VLM"
1738
  else:
1739
- print(
1740
- f" Line: '{line_text}' (conf: {line_conf}, words: {paddle_word_count}) -> "
1741
- f"VLM result '{vlm_text}' (conf: {vlm_conf:.0f}, words: {vlm_word_count}) "
1742
- f"word count mismatch. Keeping PaddleOCR result."
1743
- )
 
1744
 
1745
- # Append the final result (either original PaddleOCR or replaced VLM)
1746
- final_data["text"].append(clean_unicode_text(line_text))
1747
- final_data["left"].append(line_left)
1748
- final_data["top"].append(line_top)
1749
- final_data["width"].append(line_width)
1750
- final_data["height"].append(line_height)
1751
- final_data["conf"].append(int(line_conf))
1752
- final_data["model"].append(model_used)
1753
 
1754
- return final_data
1755
 
1756
  def perform_ocr(
1757
  self, image: Union[str, Image.Image, np.ndarray], ocr: Optional[Any] = None
@@ -1772,11 +2149,16 @@ class CustomImageAnalyzerEngine:
1772
  # Store original dimensions BEFORE preprocessing (needed for coordinate conversion)
1773
  original_image_width = None
1774
  original_image_height = None
 
 
 
1775
 
1776
  if PREPROCESS_LOCAL_OCR_IMAGES:
1777
  print("Pre-processing image...")
1778
  # Get original dimensions before preprocessing
1779
  original_image_width, original_image_height = image.size
 
 
1780
  image, preprocessing_metadata = self.image_preprocessor.preprocess_image(
1781
  image
1782
  )
@@ -1794,9 +2176,15 @@ class CustomImageAnalyzerEngine:
1794
  else:
1795
  preprocessing_metadata = dict()
1796
  original_image_width, original_image_height = image.size
 
 
1797
 
1798
  image_width, image_height = image.size
1799
 
 
 
 
 
1800
  # Note: In testing I haven't seen that this necessarily improves results
1801
  if self.ocr_engine == "hybrid-paddle":
1802
  # Try hybrid with original image for cropping:
@@ -1806,23 +2194,6 @@ class CustomImageAnalyzerEngine:
1806
  # Try hybrid VLM with original image for cropping:
1807
  ocr_data = self._perform_hybrid_ocr(image, image_name=image_name)
1808
 
1809
- elif self.ocr_engine == "hybrid-paddle-vlm":
1810
- # Hybrid PaddleOCR + VLM: use PaddleOCR at line level, then VLM for low-confidence lines
1811
- if ocr is None:
1812
- if hasattr(self, "paddle_ocr") and self.paddle_ocr is not None:
1813
- ocr = self.paddle_ocr
1814
- else:
1815
- raise ValueError(
1816
- "No OCR object provided and 'paddle_ocr' is not initialized."
1817
- )
1818
- ocr_data = self._perform_hybrid_paddle_vlm_ocr(
1819
- image,
1820
- ocr=ocr,
1821
- image_name=image_name,
1822
- input_image_width=original_image_width,
1823
- input_image_height=original_image_height,
1824
- )
1825
-
1826
  elif self.ocr_engine == "tesseract":
1827
 
1828
  ocr_data = pytesseract.image_to_data(
@@ -1832,7 +2203,7 @@ class CustomImageAnalyzerEngine:
1832
  lang=self.tesseract_lang, # Ensure the Tesseract language data (e.g., fra.traineddata) is installed on your system.
1833
  )
1834
 
1835
- elif self.ocr_engine == "paddle":
1836
 
1837
  if ocr is None:
1838
  if hasattr(self, "paddle_ocr") and self.paddle_ocr is not None:
@@ -1863,6 +2234,8 @@ class CustomImageAnalyzerEngine:
1863
  paddle_input_height = image_np.shape[0]
1864
 
1865
  paddle_results = ocr.predict(image_np)
 
 
1866
  else:
1867
  # When using image path, load image to get dimensions
1868
  temp_image = Image.open(image_path)
@@ -1870,9 +2243,13 @@ class CustomImageAnalyzerEngine:
1870
  # For file path, use the original dimensions (before preprocessing)
1871
  # original_image_width and original_image_height are already set above
1872
  paddle_results = ocr.predict(image_path)
 
 
 
 
1873
 
1874
  # Save PaddleOCR visualization with bounding boxes
1875
- if paddle_results and SAVE_PADDLE_VISUALISATIONS is True:
1876
 
1877
  for res in paddle_results:
1878
  # self.output_folder is already validated and normalized at construction time
@@ -1890,24 +2267,137 @@ class CustomImageAnalyzerEngine:
1890
  os.makedirs(paddle_viz_folder, exist_ok=True)
1891
  res.save_to_img(paddle_viz_folder)
1892
 
1893
  ocr_data = self._convert_paddle_to_tesseract_format(
1894
  paddle_results,
1895
  input_image_width=original_image_width,
1896
  input_image_height=original_image_height,
1897
  )
1898
 
1899
  else:
1900
  raise RuntimeError(f"Unsupported OCR engine: {self.ocr_engine}")
1901
 
1902
  # Convert line-level results to word-level if configured and needed
1903
  if CONVERT_LINE_TO_WORD_LEVEL and self._is_line_level_data(ocr_data):
1904
  print("Converting line-level OCR results to word-level...")
1905
- # Check if coordinates need to be scaled to match the preprocessed image
1906
- # For PaddleOCR: _convert_paddle_to_tesseract_format converts coordinates to original image space,
1907
- # but we need to crop from the preprocessed image, so we need to scale coordinates up
 
1908
  # For Tesseract: OCR runs on preprocessed image, so coordinates are already in preprocessed space,
1909
  # matching the preprocessed image we're cropping from - no scaling needed
 
1910
  needs_scaling = False
 
 
 
 
1911
  if (
1912
  PREPROCESS_LOCAL_OCR_IMAGES
1913
  and original_image_width
@@ -1919,7 +2409,19 @@ class CustomImageAnalyzerEngine:
1919
  ):
1920
  # PaddleOCR coordinates are converted to original space by _convert_paddle_to_tesseract_format
1921
  # hybrid-paddle-vlm also uses PaddleOCR and converts to original space
1922
- needs_scaling = True
 
 
 
 
 
 
 
 
 
 
 
 
1923
 
1924
  if needs_scaling:
1925
  # Calculate scale factors from original to preprocessed
@@ -1937,12 +2439,13 @@ class CustomImageAnalyzerEngine:
1937
  "width": [w * scale_x for w in ocr_data["width"]],
1938
  "height": [h * scale_y for h in ocr_data["height"]],
1939
  "conf": ocr_data["conf"],
 
1940
  }
1941
  ocr_data = self._convert_line_to_word_level(
1942
  scaled_ocr_data,
1943
- image_width,
1944
- image_height,
1945
- image,
1946
  image_name=image_name,
1947
  )
1948
  # Scale word-level results back to original image space
@@ -1954,27 +2457,15 @@ class CustomImageAnalyzerEngine:
1954
  ocr_data["width"][i] = ocr_data["width"][i] * scale_factor_x
1955
  ocr_data["height"][i] = ocr_data["height"][i] * scale_factor_y
1956
  else:
 
1957
  ocr_data = self._convert_line_to_word_level(
1958
- ocr_data, image_width, image_height, image, image_name=image_name
 
 
 
 
1959
  )
1960
 
1961
- # Always check for scale_factor, even if preprocessing_metadata is empty
1962
- # This ensures rescaling happens correctly when preprocessing was applied
1963
- scale_factor = (
1964
- preprocessing_metadata.get("scale_factor", 1.0)
1965
- if preprocessing_metadata
1966
- else 1.0
1967
- )
1968
- if scale_factor != 1.0:
1969
- # Skip rescaling for PaddleOCR since _convert_paddle_to_tesseract_format
1970
- # already scales coordinates directly to original image dimensions
1971
- # hybrid-paddle-vlm also uses PaddleOCR and converts to original space
1972
- if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid-paddle-vlm":
1973
- pass
1974
- # print(f"Skipping rescale_ocr_data for PaddleOCR (already scaled to original dimensions)")
1975
- else:
1976
- ocr_data = rescale_ocr_data(ocr_data, scale_factor)
1977
-
1978
  # The rest of your processing pipeline now works for both engines
1979
  ocr_result = ocr_data
1980
 
@@ -1986,9 +2477,7 @@ class CustomImageAnalyzerEngine:
1986
  ]
1987
 
1988
  # Determine default model based on OCR engine if model field is not present
1989
- if "model" in ocr_result and len(ocr_result["model"]) == len(
1990
- ocr_result["text"]
1991
- ):
1992
  # Model field exists and has correct length - use it
1993
  def get_model(idx):
1994
  return ocr_result["model"][idx]
@@ -2002,13 +2491,13 @@ class CustomImageAnalyzerEngine:
2002
  "Paddle"
2003
  if self.ocr_engine == "paddle"
2004
  else (
2005
- "hybrid-paddle"
2006
  if self.ocr_engine == "hybrid-paddle"
2007
  else (
2008
- "VLM"
2009
  if self.ocr_engine == "hybrid-vlm"
2010
  else (
2011
- "hybrid-paddle-vlm"
2012
  if self.ocr_engine == "hybrid-paddle-vlm"
2013
  else None
2014
  )
 
29
  PADDLE_USE_TEXTLINE_ORIENTATION,
30
  PREPROCESS_LOCAL_OCR_IMAGES,
31
  SAVE_EXAMPLE_HYBRID_IMAGES,
32
+ SAVE_PAGE_OCR_VISUALISATIONS,
33
  SAVE_PREPROCESS_IMAGES,
34
+ SAVE_VLM_INPUT_IMAGES,
35
  SELECTED_MODEL,
36
  TESSERACT_SEGMENTATION_LEVEL,
37
+ VLM_MAX_DPI,
38
+ VLM_MAX_IMAGE_SIZE,
39
  )
40
  from tools.helper_functions import clean_unicode_text
41
  from tools.load_spacy_model_custom_recognisers import custom_entities
42
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
43
+ from tools.run_vlm import extract_text_from_image_vlm
44
  from tools.secure_path_utils import validate_folder_containment
45
  from tools.secure_regex_utils import safe_sanitize_text
46
  from tools.word_segmenter import AdaptiveSegmenter
 
557
  return 11
558
 
559
 
560
+ def _prepare_image_for_vlm(image: Image.Image) -> Image.Image:
561
+ """
562
+ Prepare image for VLM by ensuring it doesn't exceed maximum size and DPI limits.
563
+
564
+ Args:
565
+ image: PIL Image to prepare
566
+
567
+ Returns:
568
+ PIL Image that has been resized if necessary to meet size and DPI constraints
569
+ """
570
+ if image is None:
571
+ return image
572
+
573
+ width, height = image.size
574
+
575
+ # Get DPI information (if available)
576
+ dpi = image.info.get("dpi", (72, 72)) # Default to 72 DPI if not specified
577
+ if isinstance(dpi, tuple):
578
+ dpi_x, dpi_y = dpi
579
+ # Use the maximum DPI value
580
+ current_dpi = max(dpi_x, dpi_y)
581
+ else:
582
+ current_dpi = float(dpi) if dpi else 72.0
583
+
584
+ # Calculate scale factors needed
585
+ size_scale = 1.0
586
+ dpi_scale = 1.0
587
+
588
+ # Check if total pixels exceed maximum
589
+ total_pixels = width * height
590
+ if total_pixels > VLM_MAX_IMAGE_SIZE:
591
+ # Calculate scale factor to reduce total pixels to maximum
592
+ # Since area scales with scale^2, we need sqrt of the ratio
593
+ size_scale = (VLM_MAX_IMAGE_SIZE / total_pixels) ** 0.5
594
+ print(
595
+ f"VLM image size check: Image has {total_pixels:,} pixels ({width}x{height}), exceeds maximum {VLM_MAX_IMAGE_SIZE:,} pixels. Will resize by factor {size_scale:.3f}"
596
+ )
597
+
598
+ # Check if DPI exceeds maximum
599
+ if current_dpi > VLM_MAX_DPI:
600
+ dpi_scale = VLM_MAX_DPI / current_dpi
601
+ print(
602
+ f"VLM DPI check: Image DPI {current_dpi:.1f} exceeds maximum {VLM_MAX_DPI:.1f} DPI. Will resize by factor {dpi_scale:.3f}"
603
+ )
604
+
605
+ # Use the smaller scale factor to ensure both constraints are met
606
+ final_scale = min(size_scale, dpi_scale)
607
+
608
+ # Resize if necessary
609
+ if final_scale < 1.0:
610
+ new_width = int(width * final_scale)
611
+ new_height = int(height * final_scale)
612
+ print(
613
+ f"VLM image preparation: Resizing image from {width}x{height} to {new_width}x{new_height} (scale: {final_scale:.3f})"
614
+ )
615
+
616
+ # Use high-quality resampling for downscaling
617
+ image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
618
+
619
+ # Update DPI info if it was set
620
+ if "dpi" in image.info:
621
+ new_dpi = (current_dpi * final_scale, current_dpi * final_scale)
622
+ # Create a copy with updated DPI info
623
+ image_info = image.info.copy()
624
+ image_info["dpi"] = new_dpi
625
+ # Note: PIL doesn't allow direct modification of info dict, so we'll just note it
626
+ print(
627
+ f"VLM image preparation: Effective DPI after resize: {new_dpi[0]:.1f}"
628
+ )
629
+ else:
630
+ total_pixels = width * height
631
+ print(
632
+ f"VLM image preparation: Image size {width}x{height} ({total_pixels:,} pixels) and DPI {current_dpi:.1f} are within limits (max pixels: {VLM_MAX_IMAGE_SIZE:,}, max DPI: {VLM_MAX_DPI})"
633
+ )
634
+
635
+ return image
636
+
637
+
638
  def _vlm_ocr_predict(
639
  image: Image.Image,
640
  prompt: str = "Extract the text content from this image.",
 
650
  Dictionary in PaddleOCR format with 'rec_texts' and 'rec_scores'
651
  """
652
  try:
653
+ # Validate image exists and is not None
654
+ if image is None:
655
+ print("VLM OCR error: Image is None")
656
+ return {"rec_texts": [], "rec_scores": []}
657
+
658
+ # Validate image has valid size (at least 10x10 pixels)
659
+ try:
660
+ width, height = image.size
661
+ if width < 10 or height < 10:
662
+ print(
663
+ f"VLM OCR error: Image is too small ({width}x{height} pixels). Minimum size is 10x10."
664
+ )
665
+ return {"rec_texts": [], "rec_scores": []}
666
+ except Exception as size_error:
667
+ print(f"VLM OCR error: Could not get image size: {size_error}")
668
+ return {"rec_texts": [], "rec_scores": []}
669
+
670
+ # Ensure image is in RGB mode (convert if needed)
671
+ try:
672
+ if image.mode != "RGB":
673
+ print(f"VLM OCR: Converting image from {image.mode} to RGB mode")
674
+ image = image.convert("RGB")
675
+ # Update width/height after conversion (should be same, but ensure consistency)
676
+ width, height = image.size
677
+ except Exception as convert_error:
678
+ print(f"VLM OCR error: Could not convert image to RGB: {convert_error}")
679
+ return {"rec_texts": [], "rec_scores": []}
680
+
681
+ # Check and resize image if it exceeds maximum size or DPI limits
682
+ try:
683
+ image = _prepare_image_for_vlm(image)
684
+ width, height = image.size
685
+ except Exception as prep_error:
686
+ print(f"VLM OCR error: Could not prepare image for VLM: {prep_error}")
687
+ return {"rec_texts": [], "rec_scores": []}
688
+
689
  # Use the VLM to extract text
690
  # Pass None for parameters to prioritize model-specific defaults from run_vlm.py
691
  # If model defaults are not available, general defaults will be used (matching current values)
692
+ print(f"Calling extract_text_from_image_vlm with image size: {width}x{height}")
693
+ extracted_text = extract_text_from_image_vlm(
694
  text=prompt,
695
  image=image,
696
  max_new_tokens=None, # Use model default if available, otherwise MAX_NEW_TOKENS from config
 
700
  repetition_penalty=None, # Use model default if available, otherwise 1.3
701
  )
702
 
703
+ # print(f"VLM OCR extracted text type: {type(extracted_text)}, value: {extracted_text}")
704
+
705
+ # Check if extracted_text is None or empty
706
+ if extracted_text is None:
707
+ # print("VLM OCR warning: extract_text_from_image_vlm returned None")
708
+ return {"rec_texts": [], "rec_scores": []}
709
+
710
+ if not isinstance(extracted_text, str):
711
+ # print(f"VLM OCR warning: extract_text_from_image_vlm returned unexpected type: {type(extracted_text)}")
712
+ return {"rec_texts": [], "rec_scores": []}
713
+
714
+ if extracted_text.strip():
715
+
716
  # Clean the text
717
+
718
+ cleaned_text = re.sub(r"[\r\n]+", " ", extracted_text)
719
+ cleaned_text = cleaned_text.strip()
720
 
721
  # Split into words for compatibility with PaddleOCR format
722
  words = cleaned_text.split()
723
 
724
+ # If text has more than 30 words, assume something went wrong and skip it
725
+ if len(words) > 30:
726
+ print(
727
+ f"VLM OCR warning: Extracted text has {len(words)} words, which exceeds the 30 word limit. Skipping."
728
+ )
729
  return {"rec_texts": [], "rec_scores": []}
730
 
731
  # Create PaddleOCR-compatible result
 
736
 
737
  return result
738
  else:
739
+ # print("VLM OCR warning: Extracted text is empty after stripping")
740
  return {"rec_texts": [], "rec_scores": []}
741
 
742
+ except Exception:
743
+ # print(f"VLM OCR error: {e}")
744
+ # print(f"VLM OCR error traceback: {traceback.format_exc()}")
745
  return {"rec_texts": [], "rec_scores": []}
746
 
747
 
 
951
  paddle_results: List[Any],
952
  input_image_width: int = None,
953
  input_image_height: int = None,
954
+ image_name: str = None,
955
+ image: Image.Image = None,
956
  ) -> Dict[str, List]:
957
  """Converts PaddleOCR result format to Tesseract's dictionary format using relative coordinates.
958
 
 
964
  paddle_results: List of PaddleOCR result dictionaries
965
  input_image_width: Width of the input image passed to PaddleOCR (target dimensions for scaling)
966
  input_image_height: Height of the input image passed to PaddleOCR (target dimensions for scaling)
967
+ image_name: Name of the image
968
+ image: Image object
969
  """
970
 
971
  output = {
 
975
  "width": list(),
976
  "height": list(),
977
  "conf": list(),
978
+ "model": list(),
979
  }
980
 
981
  # paddle_results is now a list of dictionaries with detailed information
 
990
  # Fallback: we'll try to detect from coordinates, but this is less reliable
991
  use_relative_coords = False
992
  else:
993
+ use_relative_coords = True
994
 
995
  for page_result in paddle_results:
996
  # Extract text recognition results from the new format
997
  rec_texts = page_result.get("rec_texts", list())
998
  rec_scores = page_result.get("rec_scores", list())
999
  rec_polys = page_result.get("rec_polys", list())
1000
+ rec_models = page_result.get("rec_models", list())
1001
 
1002
  # PaddleOCR may return image dimensions in the result - check for them
1003
  # Some versions of PaddleOCR include this information
1004
  result_image_width = page_result.get("image_width")
1005
  result_image_height = page_result.get("image_height")
1006
 
1007
+ # PaddleOCR typically returns coordinates in the input image space
1008
+ # However, it may internally resize images, so we need to check if coordinates
1009
+ # are in a different space by comparing with explicit metadata or detecting from coordinates
1010
+
1011
  # First pass: determine PaddleOCR's coordinate space by finding max coordinates
1012
  # This tells us what coordinate space PaddleOCR is actually using
1013
  max_x_coord = 0
 
1026
  max_y_coord = max(max_y_coord, max(y_coords) if y_coords else 0)
1027
 
1028
  # Determine PaddleOCR's coordinate space dimensions
1029
+ # Priority: explicit result metadata > input dimensions (standard PaddleOCR behavior)
1030
+ # Note: PaddleOCR typically returns coordinates in the input image space.
1031
+ # We only use a different coordinate space if PaddleOCR provides explicit metadata.
1032
+ # Using max coordinates to detect coordinate space is unreliable because:
1033
+ # 1. Text might not extend to image edges
1034
+ # 2. There might be padding
1035
+ # 3. Max coordinates don't necessarily equal image dimensions
1036
+ if result_image_width is not None and result_image_height is not None:
1037
+ # Use explicit metadata from PaddleOCR if available (most reliable)
1038
+ paddle_coord_width = result_image_width
1039
+ paddle_coord_height = result_image_height
1040
+ # Only use relative conversion if coordinate space differs from input
1041
+ if (
1042
+ paddle_coord_width != input_image_width
1043
+ or paddle_coord_height != input_image_height
1044
+ ):
1045
+ print(
1046
+ f"PaddleOCR metadata indicates coordinate space ({paddle_coord_width}x{paddle_coord_height}) "
1047
+ f"differs from input ({input_image_width}x{input_image_height}). "
1048
+ f"Using metadata for coordinate conversion."
1049
+ )
1050
+ elif input_image_width is not None and input_image_height is not None:
1051
+ # Default: assume coordinates are in input image space (standard PaddleOCR behavior)
1052
+ # This is the most common case and avoids incorrect scaling
1053
  paddle_coord_width = input_image_width
1054
  paddle_coord_height = input_image_height
1055
+ else:
1056
+ # Fallback: use max coordinates if we have no other information
1057
+ paddle_coord_width = max_x_coord if max_x_coord > 0 else 1
1058
+ paddle_coord_height = max_y_coord if max_y_coord > 0 else 1
1059
+ use_relative_coords = False
1060
+ print(
1061
+ f"Warning: No input dimensions provided. Using detected coordinate space ({paddle_coord_width}x{paddle_coord_height}) from max coordinates."
1062
+ )
1063
+
1064
+ # Validate coordinate space dimensions
1065
+ if paddle_coord_width is None or paddle_coord_height is None:
1066
+ paddle_coord_width = input_image_width or 1
1067
+ paddle_coord_height = input_image_height or 1
1068
  use_relative_coords = False
1069
 
1070
  if paddle_coord_width <= 0 or paddle_coord_height <= 0:
 
1075
  paddle_coord_height = input_image_height or 1
1076
  use_relative_coords = False
1077
 
1078
+ # If coordinate space matches input dimensions, coordinates are already in the correct space
1079
+ # Only use relative coordinate conversion if coordinate space differs from input
1080
+ if (
1081
+ paddle_coord_width == input_image_width
1082
+ and paddle_coord_height == input_image_height
1083
+ and input_image_width is not None
1084
+ and input_image_height is not None
1085
+ ):
1086
+ # Coordinates are already in input space, no conversion needed
1087
+ use_relative_coords = False
1088
+ print(
1089
+ f"PaddleOCR coordinates are in input image space ({input_image_width}x{input_image_height}). "
1090
+ f"Using coordinates directly without conversion."
1091
+ )
1092
+
1093
  # Second pass: convert coordinates using relative coordinate approach
1094
+ # Use default "Paddle" if rec_models is not available or doesn't match length
1095
+ if len(rec_models) != len(rec_texts):
1096
+ print(
1097
+ f"Warning: rec_models length ({len(rec_models)}) doesn't match rec_texts length ({len(rec_texts)}). Using default 'Paddle' for all."
1098
+ )
1099
+ rec_models = ["Paddle"] * len(rec_texts)
1100
+ # Update page_result to keep it consistent
1101
+ page_result["rec_models"] = rec_models
1102
+ else:
1103
+ # Ensure we're using the rec_models from page_result (which may have been modified)
1104
+ rec_models = page_result.get("rec_models", rec_models)
1105
+
1106
+ # Debug: Print model distribution
1107
+ vlm_count = sum(1 for m in rec_models if m == "VLM")
1108
+ if vlm_count > 0:
1109
+ print(
1110
+ f"Found {vlm_count} VLM-labeled lines out of {len(rec_models)} total lines in page_result"
1111
+ )
1112
+
1113
+ for line_text, line_confidence, bounding_box, line_model in zip(
1114
+ rec_texts, rec_scores, rec_polys, rec_models
1115
  ):
1116
  # bounding_box is now a numpy array with shape (4, 2)
1117
  # Convert to list of coordinates if it's a numpy array
 
1178
  output["width"].append(round(line_width, 2))
1179
  output["height"].append(round(line_height, 2))
1180
  output["conf"].append(int(line_confidence * 100))
1181
+ output["model"].append(line_model if line_model else "Paddle")
1182
 
1183
  return output
1184
 
 
1210
  "width": list(),
1211
  "height": list(),
1212
  "conf": list(),
1213
+ "model": list(),
1214
  }
1215
 
1216
  if not line_data or not line_data.get("text"):
 
1249
  for i in range(len(line_data["text"])):
1250
  line_text = line_data["text"][i]
1251
  line_conf = line_data["conf"][i]
1252
+ # Extract model, defaulting to "Paddle" if not available
1253
+ if "model" in line_data and len(line_data["model"]) > i:
1254
+ line_model = line_data["model"][i]
1255
+ else:
1256
+ line_model = "Paddle"
1257
 
1258
  # Get the float values
1259
  f_left = float(line_data["left"][i])
 
1382
  output["width"].append(clamped_width)
1383
  output["height"].append(line_height)
1384
  output["conf"].append(line_conf)
1385
+ output["model"].append(line_model)
1386
  current_left += word_width + estimated_space_width
1387
  continue
1388
 
 
1394
  output["width"].append(word_output["width"][j])
1395
  output["height"].append(word_output["height"][j])
1396
  output["conf"].append(word_output["conf"][j])
1397
+ # Preserve the model from the line-level data
1398
+ output["model"].append(line_model)
1399
 
1400
  return output
1401
 
 
1777
  self,
1778
  image: Image.Image,
1779
  ocr: Optional[Any] = None,
1780
+ paddle_results: List[Any] = None,
1781
  confidence_threshold: int = HYBRID_OCR_CONFIDENCE_THRESHOLD,
1782
  padding: int = HYBRID_OCR_PADDING,
1783
  image_name: str = "unknown_image_name",
1784
  input_image_width: int = None,
1785
  input_image_height: int = None,
1786
+ ) -> List[Any]:
1787
  """
1788
  Performs OCR using PaddleOCR at line level, then VLM for low-confidence lines.
1789
+ Returns modified paddle_results in the same format as PaddleOCR output.
1790
 
1791
  Args:
1792
  image: PIL Image to process
1793
  ocr: PaddleOCR instance (optional, uses self.paddle_ocr if not provided)
1794
+ paddle_results: PaddleOCR results in original format (List of dicts with rec_texts, rec_scores, rec_polys)
1795
  confidence_threshold: Confidence threshold below which VLM is used
1796
  padding: Padding to add around line crops
1797
  image_name: Name of the image for logging/debugging
 
1799
  input_image_height: Original image height (before preprocessing)
1800
 
1801
  Returns:
1802
+ Modified paddle_results with VLM replacements for low-confidence lines
1803
  """
1804
  if ocr is None:
1805
  if hasattr(self, "paddle_ocr") and self.paddle_ocr is not None:
 
1809
  "No OCR object provided and 'paddle_ocr' is not initialized."
1810
  )
1811
 
1812
+ if paddle_results is None or not paddle_results:
1813
+ return paddle_results
1814
+
1815
  print("Starting hybrid PaddleOCR + VLM OCR process...")
1816
 
1817
  # Get image dimensions
 
1823
  if input_image_height is None:
1824
  input_image_height = img_height
1825
 
1826
+ # Create a deep copy of paddle_results to modify
1827
+ modified_paddle_results = copy.deepcopy(paddle_results)
 
 
1828
 
1829
+ # Process each page result in paddle_results
1830
+ for page_result in modified_paddle_results:
1831
+ # Extract text recognition results from the paddle format
1832
+ rec_texts = page_result.get("rec_texts", list())
1833
+ rec_scores = page_result.get("rec_scores", list())
1834
+ rec_polys = page_result.get("rec_polys", list())
1835
 
1836
+ # Initialize rec_models list with "Paddle" as default for all lines
1837
+ num_lines = len(rec_texts)
1838
+ if (
1839
+ "rec_models" not in page_result
1840
+ or len(page_result.get("rec_models", [])) != num_lines
1841
+ ):
1842
+ rec_models = ["Paddle"] * num_lines
1843
+ page_result["rec_models"] = rec_models
1844
+ else:
1845
+ rec_models = page_result["rec_models"]
1846
 
1847
+ # Get image dimensions from result if available
1848
+ result_image_width = page_result.get("image_width")
1849
+ result_image_height = page_result.get("image_height")
1850
+
1851
+ # Determine PaddleOCR's coordinate space dimensions
1852
+ max_x_coord = 0
1853
+ max_y_coord = 0
1854
+ for bounding_box in rec_polys:
1855
+ if hasattr(bounding_box, "tolist"):
1856
+ box = bounding_box.tolist()
1857
+ else:
1858
+ box = bounding_box
1859
+ if box and len(box) > 0:
1860
+ x_coords = [p[0] for p in box]
1861
+ y_coords = [p[1] for p in box]
1862
+ max_x_coord = max(max_x_coord, max(x_coords) if x_coords else 0)
1863
+ max_y_coord = max(max_y_coord, max(y_coords) if y_coords else 0)
1864
 
1865
+ paddle_coord_width = (
1866
+ result_image_width
1867
+ if result_image_width is not None
1868
+ else max_x_coord if max_x_coord > 0 else input_image_width
1869
+ )
1870
+ paddle_coord_height = (
1871
+ result_image_height
1872
+ if result_image_height is not None
1873
+ else max_y_coord if max_y_coord > 0 else input_image_height
1874
+ )
1875
 
1876
+ if paddle_coord_width is None or paddle_coord_height is None:
1877
+ paddle_coord_width = input_image_width or img_width
1878
+ paddle_coord_height = input_image_height or img_height
 
 
 
 
 
 
 
 
 
1879
 
1880
+ if paddle_coord_width <= 0 or paddle_coord_height <= 0:
1881
+ paddle_coord_width = input_image_width or img_width
1882
+ paddle_coord_height = input_image_height or img_height
1883
 
1884
+ # Process each line
1885
+ print(f"Processing {num_lines} lines from PaddleOCR results...")
 
1886
 
1887
+ for i in range(num_lines):
1888
+ line_text = rec_texts[i]
1889
+ line_conf = float(rec_scores[i]) * 100 # Convert to percentage
1890
+ bounding_box = rec_polys[i]
 
 
 
1891
 
1892
+ # Skip empty lines
1893
+ if not line_text.strip():
 
 
 
 
 
 
 
 
1894
  continue
1895
 
1896
+ # Extract bounding box coordinates
1897
+ if hasattr(bounding_box, "tolist"):
1898
+ box = bounding_box.tolist()
1899
+ else:
1900
+ box = bounding_box
1901
+
1902
+ if not box or len(box) == 0:
1903
+ continue
1904
+
1905
+ # Convert polygon to bounding box
1906
+ x_coords = [p[0] for p in box]
1907
+ y_coords = [p[1] for p in box]
1908
+ line_left_paddle = float(min(x_coords))
1909
+ line_top_paddle = float(min(y_coords))
1910
+ line_right_paddle = float(max(x_coords))
1911
+ line_bottom_paddle = float(max(y_coords))
1912
+ line_width_paddle = line_right_paddle - line_left_paddle
1913
+ line_height_paddle = line_bottom_paddle - line_top_paddle
1914
 
1915
+ # Convert to image coordinate space (scale from paddle coordinates to image coordinates)
1916
+ if paddle_coord_width > 0 and paddle_coord_height > 0:
1917
+ rel_left = line_left_paddle / paddle_coord_width
1918
+ rel_top = line_top_paddle / paddle_coord_height
1919
+ rel_width = line_width_paddle / paddle_coord_width
1920
+ rel_height = line_height_paddle / paddle_coord_height
1921
 
1922
+ line_left = rel_left * img_width
1923
+ line_top = rel_top * img_height
1924
+ line_width = rel_width * img_width
1925
+ line_height = rel_height * img_height
1926
+ else:
1927
+ line_left = line_left_paddle
1928
+ line_top = line_top_paddle
1929
+ line_width = line_width_paddle
1930
+ line_height = line_height_paddle
1931
 
1932
+ # Initialize model as PaddleOCR (default)
1933
+
1934
+ # Count words in PaddleOCR output
1935
+ paddle_words = line_text.split()
1936
+ paddle_word_count = len(paddle_words)
1937
+
1938
+ # If confidence is low, use VLM for a second opinion
1939
+ if line_conf < confidence_threshold:
1940
+ # Debug: Print line dimensions before cropping
1941
+ # print(
1942
+ # f" Line {i}: '{line_text[:50]}...' "
1943
+ # f"conf={line_conf}, "
1944
+ # f"bbox=({line_left:.1f}, {line_top:.1f}, {line_width:.1f}, {line_height:.1f})"
1945
+ # )
1946
+
1947
+ # Ensure minimum line height for VLM processing
1948
+ # If line_height is too small, use a minimum height based on typical text line height
1949
+ min_line_height = max(
1950
+ line_height, 20
1951
+ ) # Minimum 20 pixels for text line
1952
+ if line_height < 20:
1953
  print(
1954
+ f" Warning: Line height ({line_height:.1f}px) is too small. "
1955
+ f"Using minimum height of {min_line_height}px for cropping."
1956
  )
1957
 
1958
+ # Calculate crop coordinates
1959
+ crop_left = line_left
1960
+ crop_top = line_top
1961
+ crop_right = line_left + line_width
1962
+ crop_bottom = line_top + min_line_height
1963
 
1964
+ print(
1965
+ f" Crop coordinates: left={crop_left}, top={crop_top}, "
1966
+ f"right={crop_right}, bottom={crop_bottom}, "
1967
+ f"size=({crop_right - crop_left}x{crop_bottom - crop_top})"
1968
+ )
1969
+
1970
+ # Ensure crop dimensions are valid
1971
+ if crop_right <= crop_left or crop_bottom <= crop_top:
1972
+ # Invalid crop, keep original PaddleOCR result
1973
+ continue
1974
+
1975
+ # Crop the line image
1976
+ cropped_image = image.crop(
1977
+ (crop_left, crop_top, crop_right, crop_bottom)
1978
+ )
1979
+
1980
+ # Check if cropped image is too small for VLM processing
1981
+ crop_width = crop_right - crop_left
1982
+ crop_height = crop_bottom - crop_top
1983
+ if crop_width < 10 or crop_height < 10:
1984
+ print(
1985
+ f" Line: '{line_text}' (conf: {line_conf}) -> "
1986
+ f"Cropped image too small ({crop_width}x{crop_height} pixels). "
1987
+ f"Skipping VLM, keeping PaddleOCR result."
1988
+ )
1989
+ # Keep original PaddleOCR result for this line
1990
+ continue
1991
+
1992
+ # Ensure cropped image is in RGB mode before passing to VLM
1993
+ if cropped_image.mode != "RGB":
1994
+ cropped_image = cropped_image.convert("RGB")
1995
+
1996
+ # Save input image for debugging if environment variable is set
1997
+ if SAVE_VLM_INPUT_IMAGES:
1998
+ try:
1999
+ vlm_debug_dir = os.path.join(
2000
+ OUTPUT_FOLDER,
2001
+ "hybrid_paddle_vlm_visualisations/vlm_input_images",
2002
+ )
2003
+ os.makedirs(vlm_debug_dir, exist_ok=True)
2004
+ line_text_safe = safe_sanitize_text(line_text)
2005
+ line_text_shortened = line_text_safe[:20]
2006
+ image_name_safe = safe_sanitize_text(image_name)
2007
+ image_name_shortened = image_name_safe[:20]
2008
+ filename = f"{image_name_shortened}_{line_text_shortened}_vlm_input_image.png"
2009
+ filepath = os.path.join(vlm_debug_dir, filename)
2010
+ cropped_image.save(filepath)
2011
+ print(f"Saved VLM input image to: {filepath}")
2012
+ except Exception as save_error:
2013
+ print(
2014
+ f"Warning: Could not save VLM input image: {save_error}"
2015
  )
 
 
 
 
 
 
2016
 
2017
+ # Use VLM for OCR on this line with error handling
2018
+ vlm_result = None
2019
+ vlm_rec_texts = []
2020
+ vlm_rec_scores = []
2021
+
2022
+ try:
2023
+ vlm_result = _vlm_ocr_predict(cropped_image)
2024
+ vlm_rec_texts = (
2025
+ vlm_result.get("rec_texts", []) if vlm_result else []
2026
+ )
2027
+ vlm_rec_scores = (
2028
+ vlm_result.get("rec_scores", []) if vlm_result else []
2029
+ )
2030
+ except Exception as vlm_error:
2031
+ print(
2032
+ f" VLM OCR failed for line '{line_text[:50]}...' (conf: {line_conf}): {vlm_error}. "
2033
+ f"Keeping original PaddleOCR result."
2034
+ )
2035
+ # Ensure we keep original PaddleOCR result on error
2036
+ vlm_rec_texts = []
2037
+ vlm_rec_scores = []
2038
+
2039
+ if vlm_rec_texts and vlm_rec_scores:
2040
+ # Combine VLM words into a single text string
2041
+ vlm_text = " ".join(vlm_rec_texts)
2042
+ vlm_word_count = len(vlm_rec_texts)
2043
+ vlm_conf = float(
2044
+ np.median(vlm_rec_scores)
2045
+ ) # Keep as 0-1 range for paddle format
2046
+
2047
+ # Only replace if word counts match
2048
+ if (
2049
+ vlm_word_count - paddle_word_count <= 2
2050
+ and vlm_word_count - paddle_word_count >= -2
2051
+ ):
2052
+ print(
2053
+ f" Re-OCR'd line: '{line_text}' (conf: {line_conf:.1f}, words: {paddle_word_count}) "
2054
+ f"-> '{vlm_text}' (conf: {vlm_conf*100:.1f}, words: {vlm_word_count}) [VLM]"
2055
)
2056
 
2057
+ # For exporting example image comparisons
2058
+ safe_filename = self._create_safe_filename_with_confidence(
2059
+ line_text,
2060
+ vlm_text,
2061
+ int(line_conf),
2062
+ int(vlm_conf * 100),
2063
+ "VLM",
2064
  )
 
 
2065
 
2066
+ if SAVE_EXAMPLE_HYBRID_IMAGES is True:
2067
+ # Normalize and validate image_name to prevent path traversal attacks
2068
+ normalized_image_name = os.path.normpath(
2069
+ image_name + "_hybrid_paddle_vlm"
2070
+ )
2071
+ if (
2072
+ ".." in normalized_image_name
2073
+ or "/" in normalized_image_name
2074
+ or "\\" in normalized_image_name
2075
+ ):
2076
+ normalized_image_name = "safe_image"
2077
+
2078
+ hybrid_ocr_examples_folder = (
2079
+ self.output_folder
2080
+ + f"/hybrid_ocr_examples/{normalized_image_name}"
2081
+ )
2082
+ # Validate the constructed path is safe
2083
+ if not validate_folder_containment(
2084
+ hybrid_ocr_examples_folder, OUTPUT_FOLDER
2085
+ ):
2086
+ raise ValueError(
2087
+ f"Unsafe hybrid_ocr_examples folder path: {hybrid_ocr_examples_folder}"
2088
+ )
2089
+
2090
+ if not os.path.exists(hybrid_ocr_examples_folder):
2091
+ os.makedirs(hybrid_ocr_examples_folder)
2092
+ output_image_path = (
2093
+ hybrid_ocr_examples_folder + f"/{safe_filename}.png"
2094
+ )
2095
+ print(f"Saving example image to {output_image_path}")
2096
+ cropped_image.save(output_image_path)
2097
+
2098
+ # Replace with VLM result in paddle_results format
2099
+ # Update rec_texts, rec_scores, and rec_models for this line
2100
+ rec_texts[i] = vlm_text
2101
+ rec_scores[i] = vlm_conf
2102
+ rec_models[i] = "VLM"
2103
+ # Ensure page_result is updated with the modified rec_models list
2104
+ page_result["rec_models"] = rec_models
2105
+ print(
2106
+ f" Set rec_models[{i}] = 'VLM' for line '{vlm_text[:50]}...'"
2107
+ )
2108
+ else:
2109
+ print(
2110
+ f" Line: '{line_text}' (conf: {line_conf:.1f}, words: {paddle_word_count}) -> "
2111
+ f"VLM result '{vlm_text}' (conf: {vlm_conf*100:.1f}, words: {vlm_word_count}) "
2112
+ f"word count mismatch. Keeping PaddleOCR result."
2113
+ )
2114
  else:
2115
+ # VLM returned empty or no results - keep original PaddleOCR result
2116
+ if line_conf < confidence_threshold:
2117
+ print(
2118
+ f" Line: '{line_text}' (conf: {line_conf:.1f}) -> "
2119
+ f"VLM returned no results. Keeping original PaddleOCR result."
2120
+ )
2121
 
2122
+ # Debug: Print summary of model labels before returning
2123
+ for page_idx, page_result in enumerate(modified_paddle_results):
2124
+ rec_models = page_result.get("rec_models", [])
2125
+ vlm_count = sum(1 for m in rec_models if m == "VLM")
2126
+ paddle_count = sum(1 for m in rec_models if m == "Paddle")
2127
+ print(
2128
+ f"Page {page_idx}: {vlm_count} VLM, {paddle_count} Paddle out of {len(rec_models)} total lines"
2129
+ )
2130
 
2131
+ return modified_paddle_results
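In outline, the hybrid step above keeps every PaddleOCR line whose confidence clears the threshold, re-reads low-confidence lines with the VLM on a cropped line image, and only accepts the VLM text when its word count is within two words of the PaddleOCR count. A condensed sketch of that decision, assuming a `run_vlm_on_crop` callable stands in for the real VLM call and all other names are illustrative:

```python
from typing import Callable, List, Tuple

def merge_paddle_and_vlm(
    lines: List[Tuple[str, float]],          # (text, confidence 0-100) per PaddleOCR line
    crops,                                   # one cropped PIL image per line
    run_vlm_on_crop: Callable,               # crop -> (text, confidence 0-1)
    threshold: float = 65.0,
    word_tolerance: int = 2,
) -> List[Tuple[str, float, str]]:
    """Return (text, confidence, model) per line, preferring the VLM only when it looks safe."""
    merged = []
    for (text, conf), crop in zip(lines, crops):
        if conf >= threshold:
            merged.append((text, conf, "Paddle"))
            continue
        try:
            vlm_text, vlm_conf = run_vlm_on_crop(crop)
        except Exception:
            merged.append((text, conf, "Paddle"))   # keep the original on any VLM failure
            continue
        if vlm_text and abs(len(vlm_text.split()) - len(text.split())) <= word_tolerance:
            merged.append((vlm_text, vlm_conf * 100, "VLM"))
        else:
            merged.append((text, conf, "Paddle"))   # word counts diverge too much
    return merged
```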
2132
 
2133
  def perform_ocr(
2134
  self, image: Union[str, Image.Image, np.ndarray], ocr: Optional[Any] = None
 
2149
  # Store original dimensions BEFORE preprocessing (needed for coordinate conversion)
2150
  original_image_width = None
2151
  original_image_height = None
2152
+ original_image_for_visualization = (
2153
+ None # Store original image for visualization
2154
+ )
2155
 
2156
  if PREPROCESS_LOCAL_OCR_IMAGES:
2157
  print("Pre-processing image...")
2158
  # Get original dimensions before preprocessing
2159
  original_image_width, original_image_height = image.size
2160
+ # Store original image for visualization (coordinates are in original space)
2161
+ original_image_for_visualization = image.copy()
2162
  image, preprocessing_metadata = self.image_preprocessor.preprocess_image(
2163
  image
2164
  )
 
2176
  else:
2177
  preprocessing_metadata = dict()
2178
  original_image_width, original_image_height = image.size
2179
+ # When preprocessing is disabled, the current image is the original
2180
+ original_image_for_visualization = image.copy()
2181
 
2182
  image_width, image_height = image.size
2183
 
2184
+ # Store original image for line-to-word conversion when PaddleOCR processes original image
2185
+ original_image_for_cropping = None
2186
+ paddle_processed_original = False
2187
+
2188
  # Note: In testing I haven't seen that this necessarily improves results
2189
  if self.ocr_engine == "hybrid-paddle":
2190
  # Try hybrid with original image for cropping:
 
2194
  # Try hybrid VLM with original image for cropping:
2195
  ocr_data = self._perform_hybrid_ocr(image, image_name=image_name)
2196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2197
  elif self.ocr_engine == "tesseract":
2198
 
2199
  ocr_data = pytesseract.image_to_data(
 
2203
  lang=self.tesseract_lang, # Ensure the Tesseract language data (e.g., fra.traineddata) is installed on your system.
2204
  )
2205
 
2206
+ elif self.ocr_engine == "paddle" or self.ocr_engine == "hybrid-paddle-vlm":
2207
 
2208
  if ocr is None:
2209
  if hasattr(self, "paddle_ocr") and self.paddle_ocr is not None:
 
2234
  paddle_input_height = image_np.shape[0]
2235
 
2236
  paddle_results = ocr.predict(image_np)
2237
+ # PaddleOCR processed the preprocessed image
2238
+ paddle_processed_original = False
2239
  else:
2240
  # When using image path, load image to get dimensions
2241
  temp_image = Image.open(image_path)
 
2243
  # For file path, use the original dimensions (before preprocessing)
2244
  # original_image_width and original_image_height are already set above
2245
  paddle_results = ocr.predict(image_path)
2246
+ # PaddleOCR processed the original image from file path
2247
+ paddle_processed_original = True
2248
+ # Store the original image for cropping
2249
+ original_image_for_cropping = temp_image.copy()
2250
 
2251
  # Save PaddleOCR visualization with bounding boxes
2252
+ if paddle_results and SAVE_PAGE_OCR_VISUALISATIONS is True:
2253
 
2254
  for res in paddle_results:
2255
  # self.output_folder is already validated and normalized at construction time
 
2267
  os.makedirs(paddle_viz_folder, exist_ok=True)
2268
  res.save_to_img(paddle_viz_folder)
2269
 
2270
+ if self.ocr_engine == "hybrid-paddle-vlm":
2271
+
2272
+ paddle_results = self._perform_hybrid_paddle_vlm_ocr(
2273
+ image,
2274
+ ocr=ocr,
2275
+ paddle_results=paddle_results,
2276
+ image_name=image_name,
2277
+ input_image_width=original_image_width,
2278
+ input_image_height=original_image_height,
2279
+ )
2280
+
2281
+ # Debug: Check structure after hybrid processing
2282
+ if paddle_results:
2283
+ print(
2284
+ f"DEBUG: After hybrid, paddle_results length: {len(paddle_results)}"
2285
+ )
2286
+ if len(paddle_results) > 0 and isinstance(paddle_results[0], dict):
2287
+ rec_models = paddle_results[0].get("rec_models", [])
2288
+ vlm_count = sum(1 for m in rec_models if m == "VLM")
2289
+ print(
2290
+ f"DEBUG: After hybrid, first page has {vlm_count} VLM labels out of {len(rec_models)} total"
2291
+ )
2292
+
2293
  ocr_data = self._convert_paddle_to_tesseract_format(
2294
  paddle_results,
2295
  input_image_width=original_image_width,
2296
  input_image_height=original_image_height,
2297
  )
2298
 
2299
+ if SAVE_PAGE_OCR_VISUALISATIONS is True:
2300
+ # Save output to image with identified bounding boxes
2301
+ # Use original image since coordinates are in original image space
2302
+ # Prefer original_image_for_cropping (when PaddleOCR processed from file path),
2303
+ # otherwise use original_image_for_visualization (stored before preprocessing)
2304
+ viz_image = (
2305
+ original_image_for_cropping
2306
+ if original_image_for_cropping is not None
2307
+ else (
2308
+ original_image_for_visualization
2309
+ if original_image_for_visualization is not None
2310
+ else image
2311
+ )
2312
+ )
2313
+ if isinstance(viz_image, Image.Image):
2314
+ # Convert PIL Image to numpy array in BGR format for OpenCV
2315
+ image_cv = cv2.cvtColor(np.array(viz_image), cv2.COLOR_RGB2BGR)
2316
+ else:
2317
+ image_cv = np.array(viz_image)
2318
+ if len(image_cv.shape) == 2:
2319
+ image_cv = cv2.cvtColor(image_cv, cv2.COLOR_GRAY2BGR)
2320
+ elif len(image_cv.shape) == 3 and image_cv.shape[2] == 3:
2321
+ # Assume RGB, convert to BGR
2322
+ image_cv = cv2.cvtColor(image_cv, cv2.COLOR_RGB2BGR)
2323
+
2324
+ # Draw all bounding boxes on the image
2325
+ for i in range(len(ocr_data["text"])):
2326
+ left = int(ocr_data["left"][i])
2327
+ top = int(ocr_data["top"][i])
2328
+ width = int(ocr_data["width"][i])
2329
+ height = int(ocr_data["height"][i])
2330
+ # Ensure coordinates are within image bounds
2331
+ left = max(0, min(left, image_cv.shape[1] - 1))
2332
+ top = max(0, min(top, image_cv.shape[0] - 1))
2333
+ right = max(left + 1, min(left + width, image_cv.shape[1]))
2334
+ bottom = max(top + 1, min(top + height, image_cv.shape[0]))
2335
+ cv2.rectangle(
2336
+ image_cv, (left, top), (right, bottom), (0, 255, 0), 2
2337
+ )
2338
+
2339
+ # Save the visualization once with all boxes drawn
2340
+ paddle_viz_folder = os.path.join(
2341
+ self.output_folder, "paddle_visualisations"
2342
+ )
2343
+ # Double-check the constructed path is safe
2344
+ if not validate_folder_containment(paddle_viz_folder, OUTPUT_FOLDER):
2345
+ raise ValueError(
2346
+ f"Unsafe paddle visualisations folder path: {paddle_viz_folder}"
2347
+ )
2348
+
2349
+ os.makedirs(paddle_viz_folder, exist_ok=True)
2350
+
2351
+ # Generate safe filename
2352
+ if image_name:
2353
+ base_name = os.path.splitext(os.path.basename(image_name))[0]
2354
+ # Sanitize filename to avoid issues with special characters
2355
+ base_name = safe_sanitize_text(base_name, max_length=50)
2356
+ filename = f"{base_name}_ocr_visualisation.jpg"
2357
+ else:
2358
+ timestamp = int(time.time())
2359
+ filename = f"ocr_visualisation_{timestamp}.jpg"
2360
+
2361
+ output_path = os.path.join(paddle_viz_folder, filename)
2362
+ cv2.imwrite(output_path, image_cv)
2363
+ print(f"OCR visualization saved to: {output_path}")
2364
+
2365
  else:
2366
  raise RuntimeError(f"Unsupported OCR engine: {self.ocr_engine}")
2367
 
2368
+ # Always check for scale_factor, even if preprocessing_metadata is empty
2369
+ # This ensures rescaling happens correctly when preprocessing was applied
2370
+ scale_factor = (
2371
+ preprocessing_metadata.get("scale_factor", 1.0)
2372
+ if preprocessing_metadata
2373
+ else 1.0
2374
+ )
2375
+ if scale_factor != 1.0:
2376
+ # Skip rescaling for PaddleOCR since _convert_paddle_to_tesseract_format
2377
+ # already scales coordinates directly to original image dimensions
2378
+ # hybrid-paddle-vlm also uses PaddleOCR and converts to original space
2379
+ if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid-paddle-vlm":
2380
+ pass
2381
+ # print(f"Skipping rescale_ocr_data for PaddleOCR (already scaled to original dimensions)")
2382
+ else:
2383
+ print("rescaling ocr_data with scale_factor: ", scale_factor)
2384
+ ocr_data = rescale_ocr_data(ocr_data, scale_factor)
2385
+
2386
  # Convert line-level results to word-level if configured and needed
2387
  if CONVERT_LINE_TO_WORD_LEVEL and self._is_line_level_data(ocr_data):
2388
  print("Converting line-level OCR results to word-level...")
2389
+ # Check if coordinates need to be scaled to match the image we're cropping from
2390
+ # For PaddleOCR: _convert_paddle_to_tesseract_format converts coordinates to original image space
2391
+ # - If PaddleOCR processed the original image (image_path provided), crop from original image (no scaling)
2392
+ # - If PaddleOCR processed the preprocessed image (no image_path), scale coordinates to preprocessed space and crop from preprocessed image
2393
  # For Tesseract: OCR runs on preprocessed image, so coordinates are already in preprocessed space,
2394
  # matching the preprocessed image we're cropping from - no scaling needed
2395
+
2396
  needs_scaling = False
2397
+ crop_image = image # Default to preprocessed image
2398
+ crop_image_width = image_width
2399
+ crop_image_height = image_height
2400
+
2401
  if (
2402
  PREPROCESS_LOCAL_OCR_IMAGES
2403
  and original_image_width
 
2409
  ):
2410
  # PaddleOCR coordinates are converted to original space by _convert_paddle_to_tesseract_format
2411
  # hybrid-paddle-vlm also uses PaddleOCR and converts to original space
2412
+ if paddle_processed_original:
2413
+ # PaddleOCR processed the original image, so crop from original image
2414
+ # No scaling needed - coordinates are already in original space
2415
+ crop_image = original_image_for_cropping
2416
+ crop_image_width = original_image_width
2417
+ crop_image_height = original_image_height
2418
+ needs_scaling = False
2419
+ print(
2420
+ f"PaddleOCR processed original image. Cropping from original ({original_image_width}x{original_image_height}) without scaling."
2421
+ )
2422
+ else:
2423
+ # PaddleOCR processed the preprocessed image, so scale coordinates to preprocessed space
2424
+ needs_scaling = True
2425
 
2426
  if needs_scaling:
2427
  # Calculate scale factors from original to preprocessed
 
2439
  "width": [w * scale_x for w in ocr_data["width"]],
2440
  "height": [h * scale_y for h in ocr_data["height"]],
2441
  "conf": ocr_data["conf"],
2442
+ "model": ocr_data["model"],
2443
  }
2444
  ocr_data = self._convert_line_to_word_level(
2445
  scaled_ocr_data,
2446
+ crop_image_width,
2447
+ crop_image_height,
2448
+ crop_image,
2449
  image_name=image_name,
2450
  )
2451
  # Scale word-level results back to original image space
 
2457
  ocr_data["width"][i] = ocr_data["width"][i] * scale_factor_x
2458
  ocr_data["height"][i] = ocr_data["height"][i] * scale_factor_y
2459
  else:
2460
+ # No scaling needed - coordinates match the crop image space
2461
  ocr_data = self._convert_line_to_word_level(
2462
+ ocr_data,
2463
+ crop_image_width,
2464
+ crop_image_height,
2465
+ crop_image,
2466
+ image_name=image_name,
2467
  )
2468

2469
  # The rest of your processing pipeline now works for both engines
2470
  ocr_result = ocr_data
2471
 
 
2477
  ]
2478
 
2479
  # Determine default model based on OCR engine if model field is not present
2480
+ if "model" in ocr_result:
 
 
2481
  # Model field exists and has correct length - use it
2482
  def get_model(idx):
2483
  return ocr_result["model"][idx]
 
2491
  "Paddle"
2492
  if self.ocr_engine == "paddle"
2493
  else (
2494
+ "Tesseract"
2495
  if self.ocr_engine == "hybrid-paddle"
2496
  else (
2497
+ "Tesseract"
2498
  if self.ocr_engine == "hybrid-vlm"
2499
  else (
2500
+ "Paddle"
2501
  if self.ocr_engine == "hybrid-paddle-vlm"
2502
  else None
2503
  )
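The coordinate handling above ultimately amounts to moving a Tesseract-style `ocr_data` dictionary between two image coordinate spaces (preprocessed versus original). A small sketch of that transform, assuming the dictionary layout used here (`left`/`top`/`width`/`height` lists of equal length); `scale_ocr_data` is an illustrative helper, not the module's own `rescale_ocr_data`:

```python
from typing import Dict, List, Tuple

def scale_ocr_data(
    ocr_data: Dict[str, List],
    from_size: Tuple[int, int],   # (width, height) the coordinates are currently in
    to_size: Tuple[int, int],     # (width, height) we want them in
) -> Dict[str, List]:
    """Return a copy of ocr_data with box coordinates mapped into to_size space."""
    sx = to_size[0] / from_size[0]
    sy = to_size[1] / from_size[1]
    scaled = dict(ocr_data)  # shallow copy; text/conf/model lists are shared
    scaled["left"] = [x * sx for x in ocr_data["left"]]
    scaled["top"] = [y * sy for y in ocr_data["top"]]
    scaled["width"] = [w * sx for w in ocr_data["width"]]
    scaled["height"] = [h * sy for h in ocr_data["height"]]
    return scaled

# Example: word boxes computed on a 2x upscaled preprocessed page, mapped back to the original size
original_space = scale_ocr_data(
    {"text": ["hello"], "left": [200.0], "top": [100.0], "width": [80.0], "height": [30.0], "conf": [91]},
    from_size=(2480, 3508),
    to_size=(1240, 1754),
)
print(original_space["left"], original_space["width"])
```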
tools/file_redaction.py CHANGED
@@ -61,8 +61,7 @@ from tools.config import (
61
  RETURN_PDF_FOR_REVIEW,
62
  RETURN_REDACTED_PDF,
63
  RUN_AWS_FUNCTIONS,
64
- SAVE_TESSERACT_VISUALISATIONS,
65
- SAVE_TEXTRACT_VISUALISATIONS,
66
  SELECTABLE_TEXT_EXTRACT_OPTION,
67
  TESSERACT_TEXT_EXTRACT_OPTION,
68
  TEXTRACT_TEXT_EXTRACT_OPTION,
@@ -3493,7 +3492,7 @@ def redact_image_pdf(
3493
 
3494
  if not textract_data:
3495
  try:
3496
- print(f"Image object: {image}")
3497
  # Convert the image_path to bytes using an in-memory buffer
3498
  image_buffer = io.BytesIO()
3499
  image.save(
@@ -3658,10 +3657,10 @@ def redact_image_pdf(
3658
  # Save OCR visualization with bounding boxes (works for all OCR methods)
3659
  if (
3660
  text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
3661
- and SAVE_TEXTRACT_VISUALISATIONS is True
3662
  ) or (
3663
  text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION
3664
- and SAVE_TESSERACT_VISUALISATIONS is True
3665
  ):
3666
  if (
3667
  page_line_level_ocr_results_with_words
@@ -5189,7 +5188,11 @@ def visualise_ocr_words_bounding_boxes(
5189
 
5190
  words = line_data.get("words", [])
5191
 
5192
- # Process each word in the line
 
 
 
 
5193
  for word_data in words:
5194
  if not isinstance(word_data, dict):
5195
  continue
@@ -5219,73 +5222,223 @@ def visualise_ocr_words_bounding_boxes(
5219
  if x2 <= x1 or y2 <= y1:
5220
  continue
5221
 
5222
- # Check if word was replaced by a different model (for reference, but text color always uses confidence)
5223
- model = word_data.get("model", None)
5224
- is_replaced = model and model.lower() != base_model_name.lower()
 
 
5225
 
5226
- # Text color always based on confidence (not affected by model replacement)
5227
- text_color = (0, 0, 180) # Default to dark red
5228
- for min_conf, max_conf, conf_color, _ in text_confidence_ranges:
5229
- if min_conf <= conf <= max_conf:
5230
- text_color = conf_color
5231
- break
5232
 
5233
- # Calculate font size to fit text within bounding box
5234
  box_width = x2 - x1
5235
  box_height = y2 - y1
5236
 
5237
- # Start with a reasonable font scale
5238
- font_scale = 0.5
5239
- font_thickness = 1
5240
- font = cv2.FONT_HERSHEY_SIMPLEX
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5241
 
5242
- # Get text size and adjust to fit
5243
- (text_width, text_height), baseline = cv2.getTextSize(
5244
- text, font, font_scale, font_thickness
5245
- )
5246
 
5247
- # Scale font to fit width (with some padding)
5248
- if text_width > 0:
5249
- width_scale = (box_width * 0.9) / text_width
5250
- else:
5251
- width_scale = 1.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5252
 
5253
- # Scale font to fit height (with some padding)
5254
- if text_height > 0:
5255
- height_scale = (box_height * 0.8) / text_height
5256
  else:
5257
- height_scale = 1.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5258
 
5259
- # Use the smaller scale to ensure text fits both dimensions
5260
- font_scale = min(
5261
- font_scale * min(width_scale, height_scale), 2.0
5262
- ) # Cap at 2.0
5263
 
5264
- # Recalculate text size with adjusted font scale
5265
- (text_width, text_height), baseline = cv2.getTextSize(
5266
- text, font, font_scale, font_thickness
5267
- )
 
 
5268
 
5269
- # Center text within bounding box
5270
- text_x = x1 + (box_width - text_width) // 2
5271
- text_y = y1 + (box_height + text_height) // 2 # Baseline adjustment
5272
-
5273
- # Draw text
5274
- cv2.putText(
5275
- text_page,
5276
- text,
5277
- (text_x, text_y),
5278
- font,
5279
- font_scale,
5280
- text_color,
5281
- font_thickness,
5282
- cv2.LINE_AA,
5283
- )
5284
 
5285
- # Draw grey bounding box for replaced words on text page
5286
- if is_replaced:
5287
- box_color = (128, 128, 128) # Grey for model replacements
5288
- cv2.rectangle(text_page, (x1, y1), (x2, y2), box_color, 1)
5289
 
5290
  # Add legend to second page
5291
  if add_legend:
 
61
  RETURN_PDF_FOR_REVIEW,
62
  RETURN_REDACTED_PDF,
63
  RUN_AWS_FUNCTIONS,
64
+ SAVE_PAGE_OCR_VISUALISATIONS,
 
65
  SELECTABLE_TEXT_EXTRACT_OPTION,
66
  TESSERACT_TEXT_EXTRACT_OPTION,
67
  TEXTRACT_TEXT_EXTRACT_OPTION,
 
3492
 
3493
  if not textract_data:
3494
  try:
3495
+ # print(f"Image object: {image}")
3496
  # Convert the image_path to bytes using an in-memory buffer
3497
  image_buffer = io.BytesIO()
3498
  image.save(
 
3657
  # Save OCR visualization with bounding boxes (works for all OCR methods)
3658
  if (
3659
  text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
3660
+ and SAVE_PAGE_OCR_VISUALISATIONS is True
3661
  ) or (
3662
  text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION
3663
+ and SAVE_PAGE_OCR_VISUALISATIONS is True
3664
  ):
3665
  if (
3666
  page_line_level_ocr_results_with_words
 
5188
 
5189
  words = line_data.get("words", [])
5190
 
5191
+ # Group words by bounding box (to handle cases where multiple words share the same box)
5192
+ # Use a small tolerance to consider boxes as "the same" if they're very close
5193
+ bbox_tolerance = 5 # pixels
5194
+ bbox_groups = {} # Maps (x1, y1, x2, y2) to list of word_data
5195
+
5196
  for word_data in words:
5197
  if not isinstance(word_data, dict):
5198
  continue
 
5222
  if x2 <= x1 or y2 <= y1:
5223
  continue
5224
 
5225
+ # Round coordinates to nearest tolerance to group similar boxes
5226
+ x1_rounded = (x1 // bbox_tolerance) * bbox_tolerance
5227
+ y1_rounded = (y1 // bbox_tolerance) * bbox_tolerance
5228
+ x2_rounded = (x2 // bbox_tolerance) * bbox_tolerance
5229
+ y2_rounded = (y2 // bbox_tolerance) * bbox_tolerance
5230
 
5231
+ bbox_key = (x1_rounded, y1_rounded, x2_rounded, y2_rounded)
 
 
 
 
 
5232
 
5233
+ if bbox_key not in bbox_groups:
5234
+ bbox_groups[bbox_key] = []
5235
+ bbox_groups[bbox_key].append(
5236
+ {"word_data": word_data, "original_bbox": (x1, y1, x2, y2)}
5237
+ )
5238
+
5239
+ # Process each group of words
5240
+ for bbox_key, word_group in bbox_groups.items():
5241
+ if not word_group:
5242
+ continue
5243
+
5244
+ # Use the first word's bounding box as the reference (they should all be similar)
5245
+ x1, y1, x2, y2 = word_group[0]["original_bbox"]
5246
  box_width = x2 - x1
5247
  box_height = y2 - y1
5248
 
5249
+ # If only one word in the box, process it normally
5250
+ if len(word_group) == 1:
5251
+ word_data = word_group[0]["word_data"]
5252
+ text = word_data.get("text", "")
5253
+ conf = int(word_data.get("conf", word_data.get("confidence", 0)))
5254
+
5255
+ # Check if word was replaced by a different model
5256
+ model = word_data.get("model", None)
5257
+ is_replaced = model and model.lower() != base_model_name.lower()
5258
+
5259
+ # Text color always based on confidence
5260
+ text_color = (0, 0, 180) # Default to dark red
5261
+ for min_conf, max_conf, conf_color, _ in text_confidence_ranges:
5262
+ if min_conf <= conf <= max_conf:
5263
+ text_color = conf_color
5264
+ break
5265
+
5266
+ # Calculate font size to fit text within bounding box
5267
+ font_scale = 0.5
5268
+ font_thickness = 1
5269
+ font = cv2.FONT_HERSHEY_SIMPLEX
5270
 
5271
+ # Get text size and adjust to fit
5272
+ (text_width, text_height), baseline = cv2.getTextSize(
5273
+ text, font, font_scale, font_thickness
5274
+ )
5275
 
5276
+ # Scale font to fit width (with some padding)
5277
+ if text_width > 0:
5278
+ width_scale = (box_width * 0.9) / text_width
5279
+ else:
5280
+ width_scale = 1.0
5281
+
5282
+ # Scale font to fit height (with some padding)
5283
+ if text_height > 0:
5284
+ height_scale = (box_height * 0.8) / text_height
5285
+ else:
5286
+ height_scale = 1.0
5287
+
5288
+ # Use the smaller scale to ensure text fits both dimensions
5289
+ font_scale = min(
5290
+ font_scale * min(width_scale, height_scale), 2.0
5291
+ ) # Cap at 2.0
5292
+
5293
+ # Recalculate text size with adjusted font scale
5294
+ (text_width, text_height), baseline = cv2.getTextSize(
5295
+ text, font, font_scale, font_thickness
5296
+ )
5297
+
5298
+ # Center text within bounding box
5299
+ text_x = x1 + (box_width - text_width) // 2
5300
+ text_y = y1 + (box_height + text_height) // 2 # Baseline adjustment
5301
+
5302
+ # Draw text
5303
+ cv2.putText(
5304
+ text_page,
5305
+ text,
5306
+ (text_x, text_y),
5307
+ font,
5308
+ font_scale,
5309
+ text_color,
5310
+ font_thickness,
5311
+ cv2.LINE_AA,
5312
+ )
5313
+
5314
+ # Draw grey bounding box for replaced words on text page
5315
+ if is_replaced:
5316
+ box_color = (128, 128, 128) # Grey for model replacements
5317
+ cv2.rectangle(text_page, (x1, y1), (x2, y2), box_color, 1)
5318
 
 
 
 
5319
  else:
5320
+ # Multiple words in the same box - arrange them side by side
5321
+ # Extract texts and determine colors for each word
5322
+ word_texts = []
5323
+ word_colors = []
5324
+ word_is_replaced = []
5325
+
5326
+ for item in word_group:
5327
+ word_data = item["word_data"]
5328
+ text = word_data.get("text", "")
5329
+ conf = int(word_data.get("conf", word_data.get("confidence", 0)))
5330
+ model = word_data.get("model", None)
5331
+ is_replaced = model and model.lower() != base_model_name.lower()
5332
+
5333
+ # Text color based on confidence
5334
+ text_color = (0, 0, 180) # Default to dark red
5335
+ for min_conf, max_conf, conf_color, _ in text_confidence_ranges:
5336
+ if min_conf <= conf <= max_conf:
5337
+ text_color = conf_color
5338
+ break
5339
 
5340
+ word_texts.append(text)
5341
+ word_colors.append(text_color)
5342
+ word_is_replaced.append(is_replaced)
5343
+
5344
+ # Calculate font size to fit all words side by side
5345
+ font_scale = 0.5
5346
+ font_thickness = 1
5347
+ font = cv2.FONT_HERSHEY_SIMPLEX
5348
+
5349
+ # Start with a reasonable font scale and reduce if needed
5350
+ max_font_scale = 2.0
5351
+ min_font_scale = 0.1
5352
+ font_scale = max_font_scale
5353
+
5354
+ # Iteratively shrink the font scale until the combined text fits the box
5355
+ for _ in range(20): # Max iterations
5356
+ # Calculate total width needed for all words with spaces
5357
+ total_width = 0
5358
+ max_text_height = 0
5359
+
5360
+ for i, text in enumerate(word_texts):
5361
+ (text_width, text_height), baseline = cv2.getTextSize(
5362
+ text, font, font_scale, font_thickness
5363
+ )
5364
+ total_width += text_width
5365
+ max_text_height = max(max_text_height, text_height)
5366
 
5367
+ # Add space width between words (except last word)
5368
+ if i < len(word_texts) - 1:
5369
+ (space_width, _), _ = cv2.getTextSize(
5370
+ " ", font, font_scale, font_thickness
5371
+ )
5372
+ total_width += space_width
5373
 
5374
+ # Check if it fits
5375
+ width_fits = total_width <= box_width * 0.9
5376
+ height_fits = max_text_height <= box_height * 0.8
5377
+
5378
+ if width_fits and height_fits:
5379
+ break
5380
+
5381
+ # Reduce font scale
5382
+ font_scale *= 0.9
5383
+ if font_scale < min_font_scale:
5384
+ font_scale = min_font_scale
5385
+ break
5386
+
5387
+ # Recalculate total width and max height with final font scale
5388
+ total_width = 0
5389
+ max_text_height = 0
5390
+ for i, text in enumerate(word_texts):
5391
+ (text_width, text_height), baseline = cv2.getTextSize(
5392
+ text, font, font_scale, font_thickness
5393
+ )
5394
+ total_width += text_width
5395
+ max_text_height = max(max_text_height, text_height)
5396
+
5397
+ # Add space width between words (except last word)
5398
+ if i < len(word_texts) - 1:
5399
+ (space_width, _), _ = cv2.getTextSize(
5400
+ " ", font, font_scale, font_thickness
5401
+ )
5402
+ total_width += space_width
5403
+
5404
+ # Now draw each word side by side
5405
+ current_x = (
5406
+ x1 + (box_width - total_width) // 2
5407
+ ) # Center the combined text
5408
+ text_y = y1 + (box_height + max_text_height) // 2 # Baseline adjustment
5409
+
5410
+ for i, (text, text_color) in enumerate(zip(word_texts, word_colors)):
5411
+ # Get text size with final font scale
5412
+ (text_width, text_height), baseline = cv2.getTextSize(
5413
+ text, font, font_scale, font_thickness
5414
+ )
5415
+
5416
+ # Draw text
5417
+ cv2.putText(
5418
+ text_page,
5419
+ text,
5420
+ (int(current_x), text_y),
5421
+ font,
5422
+ font_scale,
5423
+ text_color,
5424
+ font_thickness,
5425
+ cv2.LINE_AA,
5426
+ )
5427
+
5428
+ # Move to next position
5429
+ current_x += text_width
5430
+
5431
+ # Add space between words (except last word)
5432
+ if i < len(word_texts) - 1:
5433
+ (space_width, _), _ = cv2.getTextSize(
5434
+ " ", font, font_scale, font_thickness
5435
+ )
5436
+ current_x += space_width
5437
 
5438
+ # Draw grey bounding box if any word was replaced
5439
+ if any(word_is_replaced):
5440
+ box_color = (128, 128, 128) # Grey for model replacements
5441
+ cv2.rectangle(text_page, (x1, y1), (x2, y2), box_color, 1)
5442
 
5443
  # Add legend to second page
5444
  if add_legend:
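The visualisation rewrite above groups words whose boxes are effectively identical by snapping coordinates to a small pixel tolerance and keying a dictionary on the snapped tuple, then draws each group's words side by side. A stripped-down sketch of the grouping step (the 5 px tolerance mirrors the value used above; the helper name and sample data are illustrative):

```python
from collections import defaultdict
from typing import Dict, List, Tuple

def group_boxes(words: List[dict], tolerance: int = 5) -> Dict[Tuple[int, int, int, int], List[dict]]:
    """Group word dicts whose (x1, y1, x2, y2) boxes fall within the same tolerance cell."""
    groups = defaultdict(list)
    for word in words:
        x1, y1, x2, y2 = word["bbox"]
        key = tuple((int(v) // tolerance) * tolerance for v in (x1, y1, x2, y2))
        groups[key].append(word)
    return dict(groups)

words = [
    {"text": "invoice", "bbox": (100, 40, 160, 60)},
    {"text": "total", "bbox": (102, 41, 161, 62)},   # lands in the same 5 px cell as the first word
    {"text": "date", "bbox": (300, 40, 390, 60)},
]
for key, group in group_boxes(words).items():
    print(key, [w["text"] for w in group])
```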
tools/run_vlm.py CHANGED
@@ -221,7 +221,7 @@ if SHOW_VLM_MODEL_OPTIONS is True:
221
 
222
 
223
  @spaces.GPU(duration=MAX_SPACES_GPU_RUN_TIME)
224
- def generate_image(
225
  text: str,
226
  image: Image.Image,
227
  max_new_tokens: int = None,
 
221
 
222
 
223
  @spaces.GPU(duration=MAX_SPACES_GPU_RUN_TIME)
224
+ def extract_text_from_image_vlm(
225
  text: str,
226
  image: Image.Image,
227
  max_new_tokens: int = None,
tools/secure_regex_utils.py CHANGED
@@ -267,14 +267,14 @@ def safe_extract_latest_number_from_filename(filename: str) -> Optional[int]:
267
  return None
268
 
269
 
270
- def safe_sanitize_text(text: str, replacement: str = "_") -> str:
271
  """
272
  Safely sanitize text by removing dangerous characters without ReDoS vulnerability.
273
 
274
  Args:
275
  text: The text to sanitize
276
  replacement: Character to replace dangerous characters with
277
-
278
  Returns:
279
  Sanitized text
280
  """
@@ -291,4 +291,7 @@ def safe_sanitize_text(text: str, replacement: str = "_") -> str:
291
  # Remove leading/trailing replacements
292
  sanitized = sanitized.strip(replacement)
293
 
 
 
 
294
  return sanitized
 
267
  return None
268
 
269
 
270
+ def safe_sanitize_text(text: str, replacement: str = "_", max_length: int = 255) -> str:
271
  """
272
  Safely sanitize text by removing dangerous characters without ReDoS vulnerability.
273
 
274
  Args:
275
  text: The text to sanitize
276
  replacement: Character to replace dangerous characters with
277
+ max_length: Maximum length of the text
278
  Returns:
279
  Sanitized text
280
  """
 
291
  # Remove leading/trailing replacements
292
  sanitized = sanitized.strip(replacement)
293
 
294
+ # Truncate to maximum length
295
+ sanitized = sanitized[:max_length]
296
+
297
  return sanitized
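Because `safe_sanitize_text` now truncates as well as sanitises, callers that build filenames from OCR text get a bounded length for free. A hedged usage sketch (the standalone fallback below is illustrative only, not the project's implementation, and the folder names are assumptions):

```python
import os

def sanitize_for_filename(text: str, replacement: str = "_", max_length: int = 50) -> str:
    # Minimal stand-in for safe_sanitize_text: keep alphanumerics, replace the rest, then truncate.
    cleaned = "".join(ch if ch.isalnum() else replacement for ch in text)
    return cleaned.strip(replacement)[:max_length]

line_text = "Total due: £1,234.56 (see attached invoice)"
filename = f"{sanitize_for_filename(line_text, max_length=20)}_vlm_input_image.png"
print(os.path.join("output", "vlm_input_images", filename))
```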
tools/word_segmenter.py CHANGED
@@ -82,14 +82,14 @@ class AdaptiveSegmenter:
82
 
83
  orientation_angle = 0.0
84
  if box_height > box_width:
85
- print(
86
- f"Detected vertical orientation (W:{box_width} < H:{box_height}). Applying 90-degree correction."
87
- )
88
  orientation_angle = 90.0
89
  else:
90
- print(
91
- f"Detected horizontal orientation (W:{box_width} >= H:{box_height}). No orientation correction."
92
- )
93
  M_orient = cv2.getRotationMatrix2D(center, 0, 1.0)
94
  return gray_image, M_orient
95
 
@@ -251,29 +251,29 @@ class AdaptiveSegmenter:
251
  ) -> Tuple[Dict[str, List], bool]:
252
 
253
  if line_image is None:
254
- print(
255
- f"Error: line_image is None in segment function (image_name: {image_name})"
256
- )
257
  return ({}, False)
258
 
259
  # Validate line_image is a valid numpy array
260
  if not isinstance(line_image, np.ndarray):
261
- print(
262
- f"Error: line_image is not a numpy array (type: {type(line_image)}, image_name: {image_name})"
263
- )
264
  return ({}, False)
265
 
266
  # Validate line_image has valid shape and size
267
  if line_image.size == 0:
268
- print(
269
- f"Error: line_image is empty (shape: {line_image.shape}, image_name: {image_name})"
270
- )
271
  return ({}, False)
272
 
273
  if len(line_image.shape) < 2:
274
- print(
275
- f"Error: line_image has invalid shape {line_image.shape} (image_name: {image_name})"
276
- )
277
  return ({}, False)
278
 
279
  # Early return if 1 or fewer words
@@ -283,20 +283,20 @@ class AdaptiveSegmenter:
283
  if len(words) <= 1:
284
  return ({}, False)
285
  else:
286
- print(
287
- f"Error: line_data is empty or does not contain text (image_name: {image_name})"
288
- )
289
  return ({}, False)
290
 
291
- print(f"line_text: {line_text}")
292
  shortened_line_text = line_text.replace(" ", "_")[:10]
293
 
294
  if SHOW_OUTPUT_IMAGES:
295
  os.makedirs(self.output_folder, exist_ok=True)
296
- output_path = f"{self.output_folder}/paddle_visualisations/{image_name}_{shortened_line_text}_original.png"
297
- os.makedirs(f"{self.output_folder}/paddle_visualisations", exist_ok=True)
298
  cv2.imwrite(output_path, line_image)
299
- print(f"\nSaved original image to '{output_path}'")
300
 
301
  gray = cv2.cvtColor(line_image, cv2.COLOR_BGR2GRAY)
302
 
@@ -334,11 +334,22 @@ class AdaptiveSegmenter:
334
  borderMode=cv2.BORDER_REPLICATE,
335
  )
336

337
  # Save deskewed image (optional, only if image_name is provided)
338
  if SHOW_OUTPUT_IMAGES:
339
  os.makedirs(self.output_folder, exist_ok=True)
340
- output_path = f"{self.output_folder}/paddle_visualisations/{image_name}_{shortened_line_text}_deskewed.png"
341
- os.makedirs(f"{self.output_folder}/paddle_visualisations", exist_ok=True)
342
  cv2.imwrite(output_path, deskewed_line_image)
343
  # print(f"\nSaved deskewed image to '{output_path}'")
344
 
@@ -351,6 +362,29 @@ class AdaptiveSegmenter:
351
  block_size = int(avg_char_width_approx * BLOCK_SIZE_FACTOR)
352
  if block_size % 2 == 0:
353
block_size += 1
354
  binary = cv2.adaptiveThreshold(
355
  deskewed_gray,
356
  255,
@@ -360,11 +394,18 @@ class AdaptiveSegmenter:
360
  C_VALUE,
361
  )
362

363
  # Save cropped image (optional, only if image_name is provided)
364
  if SHOW_OUTPUT_IMAGES:
365
  os.makedirs(self.output_folder, exist_ok=True)
366
- output_path = f"{self.output_folder}/paddle_visualisations/{image_name}_{shortened_line_text}_binary.png"
367
- os.makedirs(f"{self.output_folder}/paddle_visualisations", exist_ok=True)
368
  cv2.imwrite(output_path, binary)
369
  # print(f"\nSaved cropped image to '{output_path}'")
370
 
@@ -380,14 +421,25 @@ class AdaptiveSegmenter:
380
  # It's a dilation followed by an erosion
381
  closed_binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel, iterations=1)
382
 
 
 
 
 
 
 
 
 
 
 
 
383
  # (Optional) You could also use a DILATE to make letters thicker
384
  # dilated_binary = cv2.dilate(closed_binary, kernel, iterations=1)
385
  # Use 'closed_binary' (or 'dilated_binary') from now on.
386
 
387
  if SHOW_OUTPUT_IMAGES:
388
  os.makedirs(self.output_folder, exist_ok=True)
389
- output_path = f"{self.output_folder}/paddle_visualisations/{image_name}_{shortened_line_text}_closed_binary.png"
390
- os.makedirs(f"{self.output_folder}/paddle_visualisations", exist_ok=True)
391
  cv2.imwrite(output_path, closed_binary)
392
  # print(f"\nSaved dilated binary image to '{output_path}'")
393
 
@@ -405,7 +457,7 @@ class AdaptiveSegmenter:
405
  # Handle edge case of empty 'areas' array
406
  if len(areas) == 0:
407
  clean_binary = binary
408
- print("Warning: No components found after binarization.")
409
  areas = np.array([0]) # Add a dummy value to prevent crashes
410
 
411
  # --- 1. Calculate the DEFAULT CONSERVATIVE threshold ---
@@ -449,18 +501,18 @@ class AdaptiveSegmenter:
449
 
450
  # --- 3. ADAPTIVE DECISION: Override if conservative threshold is clearly noise ---
451
  if has_clear_gap:
452
- print(
453
- f"Noise Removal: Gap detected. Noise cluster ends at {area_before_gap}px. Aggressive threshold = {aggressive_threshold:.1f}"
454
- )
455
 
456
  # Only use a more aggressive threshold IF our "safe" threshold is clearly
457
  # stuck *inside* the noise cluster.
458
  # e.g., Safe threshold = 1, but noise goes up to 10.
459
  # (We use 0.8 as a buffer, so if thresh=7 and gap=8, we don't switch)
460
  if area_threshold < (area_before_gap * 0.8):
461
- print(
462
- f"Noise Removal: Conservative threshold ({area_threshold:.1f}) is deep in noise cluster (ends at {area_before_gap}px)."
463
- )
464
 
465
  # Instead of using large percentage increases, use a very small absolute increment
466
  # This preserves legitimate small letters/words that might be just above the noise
@@ -498,18 +550,18 @@ class AdaptiveSegmenter:
498
  # Cap at 15 pixels as absolute upper bound
499
  final_threshold = min(final_threshold, 15)
500
 
501
- print(
502
- f"Noise Removal: Using MODERATE threshold: {final_threshold:.1f} (noise ends at {area_before_gap}px, increment: {small_increment}px)"
503
- )
504
  area_threshold = final_threshold
505
  else:
506
- print(
507
- f"Noise Removal: Gap found, but conservative threshold ({area_threshold:.1f}) is sufficient. Sticking with conservative."
508
- )
509
  pass
510
 
511
  # --- 4. Apply the final, determined threshold ---
512
- print(f"Noise Removal: Final area threshold: {area_threshold:.1f}")
513
  for i in range(1, num_labels):
514
  # Use >= to be inclusive of the threshold itself
515
  if stats[i, cv2.CC_STAT_AREA] >= area_threshold:
@@ -518,6 +570,17 @@ class AdaptiveSegmenter:
518
  # No components found, or only background
519
  clean_binary = binary
520

  # Calculate the horizontal projection profile on the cleaned image
522
  horizontal_projection = np.sum(clean_binary, axis=1)
523
 
@@ -539,9 +602,9 @@ class AdaptiveSegmenter:
539
 
540
  # Ensure the crop is valid
541
  if y_start < y_end:
542
- print(
543
- f"Original text height: {text_height}px. Cropping to middle {100 - (2*trim_percentage*100):.0f}% region."
544
- )
545
  # Slice the image to get the vertically cropped ROI
546
  analysis_image = clean_binary[y_start:y_end, :]
547
  else:
@@ -551,18 +614,29 @@ class AdaptiveSegmenter:
551
  # If no text is found, use the original cleaned image
552
  analysis_image = clean_binary
553
 
 
 
 
 
 
 
 
 
 
 
 
554
  # --- Step 3: Hierarchical Adaptive Search (using the new clean_binary) ---
555
  # The rest of the pipeline is identical but now operates on a superior image.
556
  words = line_data["text"][0].split()
557
  target_word_count = len(words)
558
 
559
- print(f"Target word count: {target_word_count}")
560
 
561
  # Save cropped image (optional, only if image_name is provided)
562
  if SHOW_OUTPUT_IMAGES:
563
  os.makedirs(self.output_folder, exist_ok=True)
564
- output_path = f"{self.output_folder}/paddle_visualisations/{image_name}_{shortened_line_text}_clean_binary.png"
565
- os.makedirs(f"{self.output_folder}/paddle_visualisations", exist_ok=True)
566
  cv2.imwrite(output_path, analysis_image)
567
  # print(f"\nSaved cropped image to '{output_path}'")
568
 
@@ -574,7 +648,7 @@ class AdaptiveSegmenter:
574
  target_word_count = len(words)
575
  stage1_succeeded = False
576
 
577
- print("--- Stage 1: Searching with adaptive valley threshold ---")
578
  valley_factors_to_try = np.arange(INITIAL_VALLEY_THRESHOLD_FACTOR, 0.45, 0.05)
579
  for v_factor in valley_factors_to_try:
580
  # Pass the cropped image to the helper
@@ -589,9 +663,9 @@ class AdaptiveSegmenter:
589
  break
590
 
591
  if not stage1_succeeded:
592
- print(
593
- "\n--- Stage 1 failed. Starting Stage 2: Searching with adaptive kernel ---"
594
- )
595
  kernel_factors_to_try = np.arange(INITIAL_KERNEL_WIDTH_FACTOR, 0.5, 0.05)
596
  fixed_valley_factor = MAIN_VALLEY_THRESHOLD_FACTOR
597
  for k_factor in kernel_factors_to_try:
@@ -601,12 +675,34 @@ class AdaptiveSegmenter:
601
  closed_binary = cv2.morphologyEx(
602
  clean_binary, cv2.MORPH_CLOSE, closing_kernel
603
)
604
  # We need to re-apply the same vertical crop to this new image
605
  if len(non_zero_rows) > 0 and y_start < y_end:
606
  analysis_image = closed_binary[y_start:y_end, :]
607
  else:
608
  analysis_image = closed_binary
609

610
  unlabeled_boxes = self._get_boxes_from_profile(
611
  analysis_image,
612
  avg_char_width_approx,
@@ -614,11 +710,11 @@ class AdaptiveSegmenter:
614
  fixed_valley_factor,
615
  )
616
 
617
- print(
618
- f"Testing kernel factor {k_factor:.2f} ({kernel_width}px): Found {len(unlabeled_boxes)} boxes."
619
- )
620
  if abs(target_word_count - len(unlabeled_boxes)) <= match_tolerance:
621
- print("SUCCESS (Stage 2): Found a match.")
622
  best_boxes = unlabeled_boxes
623
  successful_binary_image = (
624
  closed_binary # For Stage 2, the source is the closed_binary
@@ -629,7 +725,7 @@ class AdaptiveSegmenter:
629
  used_fallback = False
630
 
631
  if best_boxes is None:
632
- print("\nWarning: All adaptive searches failed. Falling back.")
633
  fallback_segmenter = HybridWordSegmenter()
634
  used_fallback = True
635
  final_output = fallback_segmenter.refine_words_bidirectional(
@@ -803,22 +899,35 @@ class AdaptiveSegmenter:
803
 
804
  # Visualisation
805
  if SHOW_OUTPUT_IMAGES:
806
- output_path = f"{self.output_folder}/paddle_visualisations/{image_name}_{shortened_line_text}_final_boxes.png"
807
- os.makedirs(f"{self.output_folder}/paddle_visualisations", exist_ok=True)
808
  output_image_vis = line_image.copy()
809
- print(f"\nFinal refined {len(remapped_output['text'])} words:")
810
- for i in range(len(remapped_output["text"])):
811
- word = remapped_output["text"][i]
812
- x, y, w, h = (
813
- int(remapped_output["left"][i]),
814
- int(remapped_output["top"][i]),
815
- int(remapped_output["width"][i]),
816
- int(remapped_output["height"][i]),
817
- )
818
- print(f"- '{word}' at ({x}, {y}, {w}, {h})")
819
- cv2.rectangle(output_image_vis, (x, y), (x + w, y + h), (0, 255, 0), 2)
820
- cv2.imwrite(output_path, output_image_vis)
821
- print(f"\nSaved visualisation to '{output_path}'")
822
 
823
  return remapped_output, used_fallback
824
 
@@ -1076,9 +1185,9 @@ if __name__ == "__main__":
1076
  # image_path = 'input/london_borough_of_lambeth.png'
1077
  image_basename = os.path.basename(image_path)
1078
  image_name = os.path.splitext(image_basename)[0]
1079
- output_path = f"outputs/{image_name}_refined_morph.png"
1080
- if not os.path.exists("outputs"):
1081
- os.makedirs("outputs")
1082
  line_image_cv = cv2.imread(image_path)
1083
  h, w, _ = line_image_cv.shape
1084
 
@@ -1100,20 +1209,28 @@ if __name__ == "__main__":
1100
 
1101
  # Visualisation
1102
  output_image_vis = line_image_cv.copy()
1103
- print(f"\nFinal refined {len(final_word_data['text'])} words:")
1104
- for i in range(len(final_word_data["text"])):
1105
- word = final_word_data["text"][i]
1106
- x, y, w, h = (
1107
- int(final_word_data["left"][i]),
1108
- int(final_word_data["top"][i]),
1109
- int(final_word_data["width"][i]),
1110
- int(final_word_data["height"][i]),
1111
- )
1112
- print(f"- '{word}' at ({x}, {y}, {w}, {h})")
1113
- cv2.rectangle(output_image_vis, (x, y), (x + w, y + h), (0, 255, 0), 2)
 
 
 
 
 
 
 
 
1114
 
1115
- cv2.imwrite(output_path, output_image_vis)
1116
- print(f"\nSaved visualisation to '{output_path}'")
1117
 
1118
  # You can also use matplotlib to display it in a notebook
1119
  import matplotlib.pyplot as plt
 
82
 
83
  orientation_angle = 0.0
84
  if box_height > box_width:
85
+ # print(
86
+ # f"Detected vertical orientation (W:{box_width} < H:{box_height}). Applying 90-degree correction."
87
+ # )
88
  orientation_angle = 90.0
89
  else:
90
+ # print(
91
+ # f"Detected horizontal orientation (W:{box_width} >= H:{box_height}). No orientation correction."
92
+ # )
93
  M_orient = cv2.getRotationMatrix2D(center, 0, 1.0)
94
  return gray_image, M_orient
95
 
 
251
  ) -> Tuple[Dict[str, List], bool]:
252
 
253
  if line_image is None:
254
+ # print(
255
+ # f"Error: line_image is None in segment function (image_name: {image_name})"
256
+ # )
257
  return ({}, False)
258
 
259
  # Validate line_image is a valid numpy array
260
  if not isinstance(line_image, np.ndarray):
261
+ # print(
262
+ # f"Error: line_image is not a numpy array (type: {type(line_image)}, image_name: {image_name})"
263
+ # )
264
  return ({}, False)
265
 
266
  # Validate line_image has valid shape and size
267
  if line_image.size == 0:
268
+ # print(
269
+ # f"Error: line_image is empty (shape: {line_image.shape}, image_name: {image_name})"
270
+ # )
271
  return ({}, False)
272
 
273
  if len(line_image.shape) < 2:
274
+ # print(
275
+ # f"Error: line_image has invalid shape {line_image.shape} (image_name: {image_name})"
276
+ # )
277
  return ({}, False)
278
 
279
  # Early return if 1 or fewer words
 
283
  if len(words) <= 1:
284
  return ({}, False)
285
  else:
286
+ # print(
287
+ # f"Error: line_data is empty or does not contain text (image_name: {image_name})"
288
+ # )
289
  return ({}, False)
290
 
291
+ # print(f"line_text: {line_text}")
292
  shortened_line_text = line_text.replace(" ", "_")[:10]
293
 
294
  if SHOW_OUTPUT_IMAGES:
295
  os.makedirs(self.output_folder, exist_ok=True)
296
+ output_path = f"{self.output_folder}/word_segmentation/{image_name}_{shortened_line_text}_original.png"
297
+ os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
298
  cv2.imwrite(output_path, line_image)
299
+ # print(f"\nSaved original image to '{output_path}'")
300
 
301
  gray = cv2.cvtColor(line_image, cv2.COLOR_BGR2GRAY)
302
 
 
334
  borderMode=cv2.BORDER_REPLICATE,
335
  )
336
 
337
+ # Validate deskewed_line_image before saving
338
+ if (
339
+ deskewed_line_image is None
340
+ or not isinstance(deskewed_line_image, np.ndarray)
341
+ or deskewed_line_image.size == 0
342
+ ):
343
+ # print(
344
+ # f"Error: deskewed_line_image is None or empty (image_name: {image_name})"
345
+ # )
346
+ return ({}, False)
347
+
348
  # Save deskewed image (optional, only if image_name is provided)
349
  if SHOW_OUTPUT_IMAGES:
350
  os.makedirs(self.output_folder, exist_ok=True)
351
+ output_path = f"{self.output_folder}/word_segmentation/{image_name}_{shortened_line_text}_deskewed.png"
352
+ os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
353
  cv2.imwrite(output_path, deskewed_line_image)
354
  # print(f"\nSaved deskewed image to '{output_path}'")
355
 
 
362
  block_size = int(avg_char_width_approx * BLOCK_SIZE_FACTOR)
363
  if block_size % 2 == 0:
364
  block_size += 1
365
+
366
+ # Validate deskewed_gray and ensure block_size is valid
367
+ if deskewed_gray is None or not isinstance(deskewed_gray, np.ndarray):
368
+ # print(
369
+ # f"Error: deskewed_gray is None or not a numpy array (image_name: {image_name})"
370
+ # )
371
+ return ({}, False)
372
+
373
+ if len(deskewed_gray.shape) != 2:
374
+ # print(
375
+ # f"Error: deskewed_gray must be a 2D grayscale image (shape: {deskewed_gray.shape}, image_name: {image_name})"
376
+ # )
377
+ return ({}, False)
378
+
379
+ if block_size < 3:
380
+ # print(
381
+ # f"Warning: block_size ({block_size}) is too small for adaptiveThreshold. "
382
+ # f"Using minimum value of 3. (image_name: {image_name}, "
383
+ # f"img_w: {img_w}, approx_char_count: {approx_char_count}, "
384
+ # f"avg_char_width_approx: {avg_char_width_approx:.2f})"
385
+ # )
386
+ block_size = 3
387
+
388
  binary = cv2.adaptiveThreshold(
389
  deskewed_gray,
390
  255,
 
394
  C_VALUE,
395
  )
396
 
397
+ # Validate binary image before saving
398
+ if binary is None or not isinstance(binary, np.ndarray) or binary.size == 0:
399
+ # print(
400
+ # f"Error: binary image is None or empty (image_name: {image_name})"
401
+ # )
402
+ return ({}, False)
403
+
404
  # Save cropped image (optional, only if image_name is provided)
405
  if SHOW_OUTPUT_IMAGES:
406
  os.makedirs(self.output_folder, exist_ok=True)
407
+ output_path = f"{self.output_folder}/word_segmentation/{image_name}_{shortened_line_text}_binary.png"
408
+ os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
409
  cv2.imwrite(output_path, binary)
410
  # print(f"\nSaved cropped image to '{output_path}'")
411
 
 
421
  # It's a dilation followed by an erosion
422
  closed_binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel, iterations=1)
423
 
424
+ # Validate closed_binary image before saving
425
+ if (
426
+ closed_binary is None
427
+ or not isinstance(closed_binary, np.ndarray)
428
+ or closed_binary.size == 0
429
+ ):
430
+ # print(
431
+ # f"Error: closed_binary image is None or empty (image_name: {image_name})"
432
+ # )
433
+ return ({}, False)
434
+
435
  # (Optional) You could also use a DILATE to make letters thicker
436
  # dilated_binary = cv2.dilate(closed_binary, kernel, iterations=1)
437
  # Use 'closed_binary' (or 'dilated_binary') from now on.
438
 
439
  if SHOW_OUTPUT_IMAGES:
440
  os.makedirs(self.output_folder, exist_ok=True)
441
+ output_path = f"{self.output_folder}/word_segmentation/{image_name}_{shortened_line_text}_closed_binary.png"
442
+ os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
443
  cv2.imwrite(output_path, closed_binary)
444
  # print(f"\nSaved dilated binary image to '{output_path}'")
445
 
 
457
  # Handle edge case of empty 'areas' array
458
  if len(areas) == 0:
459
  clean_binary = binary
460
+ # print("Warning: No components found after binarization.")
461
  areas = np.array([0]) # Add a dummy value to prevent crashes
462
 
463
  # --- 1. Calculate the DEFAULT CONSERVATIVE threshold ---
 
501
 
502
  # --- 3. ADAPTIVE DECISION: Override if conservative threshold is clearly noise ---
503
  if has_clear_gap:
504
+ # print(
505
+ # f"Noise Removal: Gap detected. Noise cluster ends at {area_before_gap}px. Aggressive threshold = {aggressive_threshold:.1f}"
506
+ # )
507
 
508
  # Only use a more aggressive threshold IF our "safe" threshold is clearly
509
  # stuck *inside* the noise cluster.
510
  # e.g., Safe threshold = 1, but noise goes up to 10.
511
  # (We use 0.8 as a buffer, so if thresh=7 and gap=8, we don't switch)
512
  if area_threshold < (area_before_gap * 0.8):
513
+ # print(
514
+ # f"Noise Removal: Conservative threshold ({area_threshold:.1f}) is deep in noise cluster (ends at {area_before_gap}px)."
515
+ # )
516
 
517
  # Instead of using large percentage increases, use a very small absolute increment
518
  # This preserves legitimate small letters/words that might be just above the noise
 
550
  # Cap at 15 pixels as absolute upper bound
551
  final_threshold = min(final_threshold, 15)
552
 
553
+ # print(
554
+ # f"Noise Removal: Using MODERATE threshold: {final_threshold:.1f} (noise ends at {area_before_gap}px, increment: {small_increment}px)"
555
+ # )
556
  area_threshold = final_threshold
557
  else:
558
+ # print(
559
+ # f"Noise Removal: Gap found, but conservative threshold ({area_threshold:.1f}) is sufficient. Sticking with conservative."
560
+ # )
561
  pass
562
 
563
  # --- 4. Apply the final, determined threshold ---
564
+ # print(f"Noise Removal: Final area threshold: {area_threshold:.1f}")
565
  for i in range(1, num_labels):
566
  # Use >= to be inclusive of the threshold itself
567
  if stats[i, cv2.CC_STAT_AREA] >= area_threshold:
 
570
  # No components found, or only background
571
  clean_binary = binary
572
 
573
+ # Validate clean_binary before proceeding
574
+ if (
575
+ clean_binary is None
576
+ or not isinstance(clean_binary, np.ndarray)
577
+ or clean_binary.size == 0
578
+ ):
579
+ # print(
580
+ # f"Error: clean_binary image is None or empty (image_name: {image_name})"
581
+ # )
582
+ return ({}, False)
583
+
584
  # Calculate the horizontal projection profile on the cleaned image
585
  horizontal_projection = np.sum(clean_binary, axis=1)
586
 
 
602
 
603
  # Ensure the crop is valid
604
  if y_start < y_end:
605
+ # print(
606
+ # f"Original text height: {text_height}px. Cropping to middle {100 - (2*trim_percentage*100):.0f}% region."
607
+ # )
608
  # Slice the image to get the vertically cropped ROI
609
  analysis_image = clean_binary[y_start:y_end, :]
610
  else:
 
614
  # If no text is found, use the original cleaned image
615
  analysis_image = clean_binary
616
 
617
+ # Validate analysis_image before proceeding
618
+ if (
619
+ analysis_image is None
620
+ or not isinstance(analysis_image, np.ndarray)
621
+ or analysis_image.size == 0
622
+ ):
623
+ # print(
624
+ # f"Error: analysis_image is None or empty (image_name: {image_name})"
625
+ # )
626
+ return ({}, False)
627
+
628
  # --- Step 3: Hierarchical Adaptive Search (using the new clean_binary) ---
629
  # The rest of the pipeline is identical but now operates on a superior image.
630
  words = line_data["text"][0].split()
631
  target_word_count = len(words)
632
 
633
+ # print(f"Target word count: {target_word_count}")
634
 
635
  # Save cropped image (optional, only if image_name is provided)
636
  if SHOW_OUTPUT_IMAGES:
637
  os.makedirs(self.output_folder, exist_ok=True)
638
+ output_path = f"{self.output_folder}/word_segmentation/{image_name}_{shortened_line_text}_clean_binary.png"
639
+ os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
640
  cv2.imwrite(output_path, analysis_image)
641
  # print(f"\nSaved cropped image to '{output_path}'")
642
 
 
648
  target_word_count = len(words)
649
  stage1_succeeded = False
650
 
651
+ # print("--- Stage 1: Searching with adaptive valley threshold ---")
652
  valley_factors_to_try = np.arange(INITIAL_VALLEY_THRESHOLD_FACTOR, 0.45, 0.05)
653
  for v_factor in valley_factors_to_try:
654
  # Pass the cropped image to the helper
 
663
  break
664
 
665
  if not stage1_succeeded:
666
+ # print(
667
+ # "\n--- Stage 1 failed. Starting Stage 2: Searching with adaptive kernel ---"
668
+ # )
669
  kernel_factors_to_try = np.arange(INITIAL_KERNEL_WIDTH_FACTOR, 0.5, 0.05)
670
  fixed_valley_factor = MAIN_VALLEY_THRESHOLD_FACTOR
671
  for k_factor in kernel_factors_to_try:
 
675
  closed_binary = cv2.morphologyEx(
676
  clean_binary, cv2.MORPH_CLOSE, closing_kernel
677
  )
678
+ # Validate closed_binary before proceeding
679
+ if (
680
+ closed_binary is None
681
+ or not isinstance(closed_binary, np.ndarray)
682
+ or closed_binary.size == 0
683
+ ):
684
+ # print(
685
+ # f"Error: closed_binary in Stage 2 is None or empty (image_name: {image_name}, k_factor: {k_factor:.2f})"
686
+ # )
687
+ continue # Skip this iteration and try next kernel factor
688
+
689
  # We need to re-apply the same vertical crop to this new image
690
  if len(non_zero_rows) > 0 and y_start < y_end:
691
  analysis_image = closed_binary[y_start:y_end, :]
692
  else:
693
  analysis_image = closed_binary
694
 
695
+ # Validate analysis_image before using it
696
+ if (
697
+ analysis_image is None
698
+ or not isinstance(analysis_image, np.ndarray)
699
+ or analysis_image.size == 0
700
+ ):
701
+ # print(
702
+ # f"Error: analysis_image in Stage 2 is None or empty (image_name: {image_name}, k_factor: {k_factor:.2f})"
703
+ # )
704
+ continue # Skip this iteration and try next kernel factor
705
+
706
  unlabeled_boxes = self._get_boxes_from_profile(
707
  analysis_image,
708
  avg_char_width_approx,
 
710
  fixed_valley_factor,
711
  )
712
 
713
+ # print(
714
+ # f"Testing kernel factor {k_factor:.2f} ({kernel_width}px): Found {len(unlabeled_boxes)} boxes."
715
+ # )
716
  if abs(target_word_count - len(unlabeled_boxes)) <= match_tolerance:
717
+ # print("SUCCESS (Stage 2): Found a match.")
718
  best_boxes = unlabeled_boxes
719
  successful_binary_image = (
720
  closed_binary # For Stage 2, the source is the closed_binary
 
725
  used_fallback = False
726
 
727
  if best_boxes is None:
728
+ # print("\nWarning: All adaptive searches failed. Falling back.")
729
  fallback_segmenter = HybridWordSegmenter()
730
  used_fallback = True
731
  final_output = fallback_segmenter.refine_words_bidirectional(
 
899
 
900
  # Visualisation
901
  if SHOW_OUTPUT_IMAGES:
902
+ output_path = f"{self.output_folder}/word_segmentation/{image_name}_{shortened_line_text}_final_boxes.png"
903
+ os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
904
  output_image_vis = line_image.copy()
905
+ # Validate output_image_vis before saving
906
+ if (
907
+ output_image_vis is None
908
+ or not isinstance(output_image_vis, np.ndarray)
909
+ or output_image_vis.size == 0
910
+ ):
911
+ pass
912
+ # print(
913
+ # f"Error: output_image_vis is None or empty (image_name: {image_name})"
914
+ # )
915
+ else:
916
+ # print(f"\nFinal refined {len(remapped_output['text'])} words:")
917
+ for i in range(len(remapped_output["text"])):
918
+ word = remapped_output["text"][i]
919
+ x, y, w, h = (
920
+ int(remapped_output["left"][i]),
921
+ int(remapped_output["top"][i]),
922
+ int(remapped_output["width"][i]),
923
+ int(remapped_output["height"][i]),
924
+ )
925
+ # print(f"- '{word}' at ({x}, {y}, {w}, {h})")
926
+ cv2.rectangle(
927
+ output_image_vis, (x, y), (x + w, y + h), (0, 255, 0), 2
928
+ )
929
+ cv2.imwrite(output_path, output_image_vis)
930
+ # print(f"\nSaved visualisation to '{output_path}'")
931
 
932
  return remapped_output, used_fallback
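The repeated `None` / type / size checks added throughout `segment` all follow one pattern: bail out before handing a bad array to OpenCV. A small helper capturing that pattern (illustrative only; the module keeps the checks inline):

```python
import numpy as np

def is_usable_image(arr, min_dims: int = 2) -> bool:
    """True if arr is a non-empty numpy array with at least min_dims dimensions."""
    return (
        arr is not None
        and isinstance(arr, np.ndarray)
        and arr.size > 0
        and arr.ndim >= min_dims
    )

# Guard style used before each cv2 call:
# if not is_usable_image(deskewed_gray):
#     return ({}, False)
```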
933
 
 
1185
  # image_path = 'input/london_borough_of_lambeth.png'
1186
  image_basename = os.path.basename(image_path)
1187
  image_name = os.path.splitext(image_basename)[0]
1188
+ output_path = f"output/{image_name}_refined_morph.png"
1189
+ if not os.path.exists("output"):
1190
+ os.makedirs("output")
1191
  line_image_cv = cv2.imread(image_path)
1192
  h, w, _ = line_image_cv.shape
1193
 
 
1209
 
1210
  # Visualisation
1211
  output_image_vis = line_image_cv.copy()
1212
+ # Validate output_image_vis before saving
1213
+ if (
1214
+ output_image_vis is None
1215
+ or not isinstance(output_image_vis, np.ndarray)
1216
+ or output_image_vis.size == 0
1217
+ ):
1218
+ print(f"Error: output_image_vis is None or empty (image_name: {image_name})")
1219
+ else:
1220
+ print(f"\nFinal refined {len(final_word_data['text'])} words:")
1221
+ for i in range(len(final_word_data["text"])):
1222
+ word = final_word_data["text"][i]
1223
+ x, y, w, h = (
1224
+ int(final_word_data["left"][i]),
1225
+ int(final_word_data["top"][i]),
1226
+ int(final_word_data["width"][i]),
1227
+ int(final_word_data["height"][i]),
1228
+ )
1229
+ print(f"- '{word}' at ({x}, {y}, {w}, {h})")
1230
+ cv2.rectangle(output_image_vis, (x, y), (x + w, y + h), (0, 255, 0), 2)
1231
 
1232
+ cv2.imwrite(output_path, output_image_vis)
1233
+ print(f"\nSaved visualisation to '{output_path}'")
1234
 
1235
  # You can also use matplotlib to display it in a notebook
1236
  import matplotlib.pyplot as plt