seanpedrickcase committed

Commit 2c00d05 · 1 Parent(s): 01c8eb6

Improved paddle and hybrid OCR analysis across all options. Tried to revise requirements for spaces
README.md CHANGED
@@ -131,6 +131,16 @@ Alternatively, you can use the full `requirements.txt` file, that contains refer
 pip install -r requirements.txt
 ```
 
+ Note that the versions of both PaddleOCR and Torch installed by default are the CPU-only versions. If you want to install the equivalent GPU versions, you will need to run the following commands:
+ ```bash
+ pip install paddlepaddle-gpu==3.2.1 --index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/
+ ```
+
+ ```bash
+ pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cu126
+ pip install torchvision --index-url https://download.pytorch.org/whl/cu126
+ ```
+
 ### 3. Run the Application
 
 With all dependencies installed, you can now start the Gradio application.
@@ -938,7 +948,7 @@ The hybrid OCR mode uses several configurable parameters:
 - **HYBRID_OCR_CONFIDENCE_THRESHOLD** (default: 65): Tesseract confidence score below which PaddleOCR will be used for re-extraction
 - **HYBRID_OCR_PADDING** (default: 1): Padding added to word bounding boxes before re-extraction
 - **SAVE_EXAMPLE_HYBRID_IMAGES** (default: False): Save comparison images when using hybrid mode
- - **SAVE_PADDLE_VISUALISATIONS** (default: False): Save images with PaddleOCR bounding boxes overlaid
+ - **SAVE_PAGE_OCR_VISUALISATIONS** (default: False): Save images with PaddleOCR bounding boxes overlaid
 
 ### When to use different OCR models
 
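A quick way to confirm the GPU builds are actually picked up after following the install note in this README hunk is a short Python check; this is a minimal sketch (not part of this commit) and assumes the cu126 wheels installed successfully:

```python
# Minimal sanity check (illustrative only, not part of this commit):
# verify that the GPU builds of PaddlePaddle and PyTorch can see CUDA.
import paddle
import torch

print("PaddlePaddle compiled with CUDA:", paddle.device.is_compiled_with_cuda())
print("PyTorch CUDA available:", torch.cuda.is_available())
```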
app.py CHANGED
@@ -1,9 +1,9 @@
 import os
- import spaces
 from pathlib import Path
 
 import gradio as gr
 import pandas as pd
+ import spaces
 from fastapi import FastAPI, status
 from gradio_image_annotation import image_annotator
 
@@ -260,9 +260,6 @@ app = FastAPI()
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.middleware.trustedhost import TrustedHostMiddleware
 
-
- spaces.annotations
-
 ###
 # Load in Gradio app components
 ###
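For context, `spaces` is the Hugging Face ZeroGPU helper package: the stray `spaces.annotations` statement is dropped and the import is kept with the other top-level imports, since it needs to be imported before any CUDA work happens. A minimal sketch of how the package is typically used (the `redact_page` function here is hypothetical, not taken from this repository):

```python
import spaces


@spaces.GPU  # on ZeroGPU Spaces, requests a GPU for the duration of the call
def redact_page(image):
    # hypothetical GPU-bound OCR / redaction work would go here
    ...
```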
example_data/partnership_toolkit_redact_custom_deny_list.csv CHANGED
@@ -1,4 +1,2 @@
- Sister
- Sister City
- Sister Cities
- Friendship City
+ Friendship City
+ United States
pre-requirements.txt CHANGED
@@ -1,4 +1,4 @@
 # --- PaddleOCR (CUDA 12.6) ---
 --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/
- paddlepaddle-gpu==3.0.0
+ paddlepaddle-gpu<=3.2.1
 paddleocr<=3.3.0
pyproject.toml CHANGED
@@ -55,14 +55,14 @@ test = ["pytest", "pytest-cov"]
 
 # Extra dependencies for PaddleOCR
 paddle = [
-     "paddlepaddle-gpu==3.2.1", # Specific version for compatibility with VLM and torch installation described below -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
-     "paddleocr==3.3.0",
+     "paddlepaddle<=3.2.1", # If you want the GPU-accelerated version, run manually pip install paddlepaddle-gpu<=3.2.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
+     "paddleocr<=3.3.0",
 ]
 
 # Extra dependencies for VLM models
 vlm = [
-     "torch==2.8.0", # should use --index-url https://download.pytorch.org/whl/cu126 for cuda support for paddleocr, need to install manually
-     "torchvision==0.24.0",
+     "torch<=2.8.0", # should use --index-url https://download.pytorch.org/whl/cu126 for cuda support for paddleocr, need to install manually
+     "torchvision>=0.20.1",
     "transformers==4.57.1",
     "accelerate==1.11.0",
 ]
requirements.txt CHANGED
@@ -37,16 +37,14 @@ scikit-learn==1.7.2
 spacy==3.8.7
 spaczz==0.6.1
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
+ transformers==4.57.1
+ accelerate==1.11.0
 
 # --- Testing ---
 pytest>=7.0.0
 pytest-cov>=4.0.0
 
- transformers==4.57.1
- accelerate==1.11.0
-
 # --- PyTorch (CUDA 12.6) ---
- nvidia-nccl-cu12==2.21.5
 --extra-index-url https://download.pytorch.org/whl/cu126
- torch>=2.5.1, <=2.6.0
- torchvision>=0.20.1, <=0.24.0
+ torch<=2.8.0
+ torchvision>=0.20.1
src/app_settings.qmd CHANGED
@@ -327,7 +327,7 @@ Configurations related to text extraction, PII detection, and the redaction proc
 * **Description:** Saves comparison images when using "hybrid-paddle" OCR mode.
 * **Default Value:** `"False"`
 
- * **`SAVE_PADDLE_VISUALISATIONS`**
+ * **`SAVE_PAGE_OCR_VISUALISATIONS`**
 * **Description:** Saves images with PaddleOCR's detected bounding boxes overlaid.
 * **Default Value:** `"False"`
 
src/user_guide.qmd CHANGED
@@ -722,7 +722,7 @@ The hybrid OCR mode uses several configurable parameters:
 - **HYBRID_OCR_CONFIDENCE_THRESHOLD** (default: 65): Tesseract confidence score below which PaddleOCR will be used for re-extraction
 - **HYBRID_OCR_PADDING** (default: 1): Padding added to word bounding boxes before re-extraction
 - **SAVE_EXAMPLE_HYBRID_IMAGES** (default: False): Save comparison images when using hybrid mode
- - **SAVE_PADDLE_VISUALISATIONS** (default: False): Save images with PaddleOCR bounding boxes overlaid
+ - **SAVE_PAGE_OCR_VISUALISATIONS** (default: False): Save images with PaddleOCR bounding boxes overlaid
 
 ### When to use different OCR models
 
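The hybrid OCR parameters documented above are read from environment variables via `get_or_create_env_var` in `tools/config.py` (see the diff below), so they can be overridden without editing code. A minimal sketch with example values, assuming the variables are set before `tools.config` is first imported:

```python
import os

# Example overrides (illustrative values only); set these before tools.config is imported.
os.environ["HYBRID_OCR_CONFIDENCE_THRESHOLD"] = "80"  # re-OCR lines below this Tesseract confidence
os.environ["HYBRID_OCR_PADDING"] = "2"                # extra pixels around each crop before re-extraction
os.environ["SAVE_EXAMPLE_HYBRID_IMAGES"] = "True"     # save before/after comparison crops
os.environ["SAVE_PAGE_OCR_VISUALISATIONS"] = "True"   # save pages with OCR bounding boxes overlaid
```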
tools/config.py CHANGED
@@ -473,6 +473,14 @@ MAX_INPUT_TOKEN_LENGTH = int(
     get_or_create_env_var("MAX_INPUT_TOKEN_LENGTH", "4096")
 ) # Maximum number of tokens to input to the VLM
 
+ VLM_MAX_IMAGE_SIZE = int(
+     get_or_create_env_var("VLM_MAX_IMAGE_SIZE", "1000000")
+ ) # Maximum total pixels (width * height) for images passed to VLM. Images with more pixels will be resized while maintaining aspect ratio. Default is 1000000 (1000x1000).
+
+ VLM_MAX_DPI = float(
+     get_or_create_env_var("VLM_MAX_DPI", "300.0")
+ ) # Maximum DPI for images passed to VLM. Images with higher DPI will be resized accordingly.
+
 USE_FLASH_ATTENTION = convert_string_to_boolean(
     get_or_create_env_var("USE_FLASH_ATTENTION", "False")
 ) # Whether to use flash attention for the VLM
@@ -506,7 +514,7 @@ MODEL_CACHE_PATH = get_or_create_env_var("MODEL_CACHE_PATH", "./model_cache")
 
 
 HYBRID_OCR_CONFIDENCE_THRESHOLD = int(
-     get_or_create_env_var("HYBRID_OCR_CONFIDENCE_THRESHOLD", "65")
+     get_or_create_env_var("HYBRID_OCR_CONFIDENCE_THRESHOLD", "80")
 ) # The tesseract confidence threshold under which the text will be passed to PaddleOCR for re-extraction using the hybrid OCR method.
 HYBRID_OCR_PADDING = int(
     get_or_create_env_var("HYBRID_OCR_PADDING", "1")
@@ -536,17 +544,9 @@ SAVE_EXAMPLE_HYBRID_IMAGES = convert_string_to_boolean(
     get_or_create_env_var("SAVE_EXAMPLE_HYBRID_IMAGES", "False")
 ) # Whether to save example images of Tesseract vs PaddleOCR re-extraction in hybrid OCR mode.
 
- SAVE_PADDLE_VISUALISATIONS = convert_string_to_boolean(
-     get_or_create_env_var("SAVE_PADDLE_VISUALISATIONS", "False")
- ) # Whether to save visualisations of PaddleOCR bounding boxes.
-
- SAVE_TESSERACT_VISUALISATIONS = convert_string_to_boolean(
-     get_or_create_env_var("SAVE_TESSERACT_VISUALISATIONS", "False")
- ) # Whether to save visualisations of Tesseract bounding boxes.
-
- SAVE_TEXTRACT_VISUALISATIONS = convert_string_to_boolean(
-     get_or_create_env_var("SAVE_TEXTRACT_VISUALISATIONS", "False")
- ) # Whether to save visualisations of AWS Textract bounding boxes.
+ SAVE_PAGE_OCR_VISUALISATIONS = convert_string_to_boolean(
+     get_or_create_env_var("SAVE_PAGE_OCR_VISUALISATIONS", "False")
+ ) # Whether to save visualisations of Tesseract, PaddleOCR, and Textract bounding boxes.
 
 # Model storage paths for Lambda compatibility
 PADDLE_MODEL_PATH = get_or_create_env_var(
@@ -565,6 +565,10 @@ SAVE_PREPROCESS_IMAGES = convert_string_to_boolean(
     get_or_create_env_var("SAVE_PREPROCESS_IMAGES", "False")
 ) # Whether to save the pre-processed images.
 
+ SAVE_VLM_INPUT_IMAGES = convert_string_to_boolean(
+     get_or_create_env_var("SAVE_VLM_INPUT_IMAGES", "False")
+ ) # Whether to save input images sent to VLM OCR for debugging.
+
 # Entities for redaction
 CHOSEN_COMPREHEND_ENTITIES = get_or_create_env_var(
     "CHOSEN_COMPREHEND_ENTITIES",
tools/custom_image_analyser_engine.py CHANGED
@@ -29,15 +29,18 @@ from tools.config import (
29
  PADDLE_USE_TEXTLINE_ORIENTATION,
30
  PREPROCESS_LOCAL_OCR_IMAGES,
31
  SAVE_EXAMPLE_HYBRID_IMAGES,
32
- SAVE_PADDLE_VISUALISATIONS,
33
  SAVE_PREPROCESS_IMAGES,
 
34
  SELECTED_MODEL,
35
  TESSERACT_SEGMENTATION_LEVEL,
 
 
36
  )
37
  from tools.helper_functions import clean_unicode_text
38
  from tools.load_spacy_model_custom_recognisers import custom_entities
39
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
40
- from tools.run_vlm import generate_image as vlm_generate_image
41
  from tools.secure_path_utils import validate_folder_containment
42
  from tools.secure_regex_utils import safe_sanitize_text
43
  from tools.word_segmenter import AdaptiveSegmenter
@@ -554,6 +557,84 @@ def _get_tesseract_psm(segmentation_level: str) -> int:
554
  return 11
555
 
556
 
557
  def _vlm_ocr_predict(
558
  image: Image.Image,
559
  prompt: str = "Extract the text content from this image.",
@@ -569,10 +650,47 @@ def _vlm_ocr_predict(
569
  Dictionary in PaddleOCR format with 'rec_texts' and 'rec_scores'
570
  """
571
  try:
572
  # Use the VLM to extract text
573
  # Pass None for parameters to prioritize model-specific defaults from run_vlm.py
574
  # If model defaults are not available, general defaults will be used (matching current values)
575
- extracted_text = vlm_generate_image(
 
576
  text=prompt,
577
  image=image,
578
  max_new_tokens=None, # Use model default if available, otherwise MAX_NEW_TOKENS from config
@@ -582,15 +700,32 @@ def _vlm_ocr_predict(
582
  repetition_penalty=None, # Use model default if available, otherwise 1.3
583
  )
584
 
585
- if extracted_text and extracted_text.strip():
 
 
 
 
 
 
 
 
 
 
 
 
586
  # Clean the text
587
- cleaned_text = extracted_text.strip()
 
 
588
 
589
  # Split into words for compatibility with PaddleOCR format
590
  words = cleaned_text.split()
591
 
592
- # If text has more than 5 words, assume something went wrong and skip it
593
- if len(words) > 5:
 
 
 
594
  return {"rec_texts": [], "rec_scores": []}
595
 
596
  # Create PaddleOCR-compatible result
@@ -601,10 +736,12 @@ def _vlm_ocr_predict(
601
 
602
  return result
603
  else:
 
604
  return {"rec_texts": [], "rec_scores": []}
605
 
606
- except Exception as e:
607
- print(f"VLM OCR error: {e}")
 
608
  return {"rec_texts": [], "rec_scores": []}
609
 
610
 
@@ -814,6 +951,8 @@ class CustomImageAnalyzerEngine:
814
  paddle_results: List[Any],
815
  input_image_width: int = None,
816
  input_image_height: int = None,
 
 
817
  ) -> Dict[str, List]:
818
  """Converts PaddleOCR result format to Tesseract's dictionary format using relative coordinates.
819
 
@@ -825,6 +964,8 @@ class CustomImageAnalyzerEngine:
825
  paddle_results: List of PaddleOCR result dictionaries
826
  input_image_width: Width of the input image passed to PaddleOCR (target dimensions for scaling)
827
  input_image_height: Height of the input image passed to PaddleOCR (target dimensions for scaling)
 
 
828
  """
829
 
830
  output = {
@@ -834,6 +975,7 @@ class CustomImageAnalyzerEngine:
834
  "width": list(),
835
  "height": list(),
836
  "conf": list(),
 
837
  }
838
 
839
  # paddle_results is now a list of dictionaries with detailed information
@@ -848,19 +990,24 @@ class CustomImageAnalyzerEngine:
848
  # Fallback: we'll try to detect from coordinates, but this is less reliable
849
  use_relative_coords = False
850
  else:
851
- use_relative_coords = False
852
 
853
  for page_result in paddle_results:
854
  # Extract text recognition results from the new format
855
  rec_texts = page_result.get("rec_texts", list())
856
  rec_scores = page_result.get("rec_scores", list())
857
  rec_polys = page_result.get("rec_polys", list())
 
858
 
859
  # PaddleOCR may return image dimensions in the result - check for them
860
  # Some versions of PaddleOCR include this information
861
  result_image_width = page_result.get("image_width")
862
  result_image_height = page_result.get("image_height")
863
 
 
 
 
 
864
  # First pass: determine PaddleOCR's coordinate space by finding max coordinates
865
  # This tells us what coordinate space PaddleOCR is actually using
866
  max_x_coord = 0
@@ -879,22 +1026,45 @@ class CustomImageAnalyzerEngine:
879
  max_y_coord = max(max_y_coord, max(y_coords) if y_coords else 0)
880
 
881
  # Determine PaddleOCR's coordinate space dimensions
882
- # Priority: result metadata > detected from coordinates > input dimensions
883
- paddle_coord_width = (
884
- result_image_width
885
- if result_image_width is not None
886
- else max_x_coord if max_x_coord > 0 else input_image_width
887
- )
888
- paddle_coord_height = (
889
- result_image_height
890
- if result_image_height is not None
891
- else max_y_coord if max_y_coord > 0 else input_image_height
892
- )
893
-
894
- # If we couldn't determine PaddleOCR's coordinate space, fall back to input dimensions
895
- if paddle_coord_width is None or paddle_coord_height is None:
 
 
 
 
 
 
 
 
 
 
896
  paddle_coord_width = input_image_width
897
  paddle_coord_height = input_image_height
 
 
 
 
 
 
 
 
 
 
 
 
 
898
  use_relative_coords = False
899
 
900
  if paddle_coord_width <= 0 or paddle_coord_height <= 0:
@@ -905,9 +1075,43 @@ class CustomImageAnalyzerEngine:
905
  paddle_coord_height = input_image_height or 1
906
  use_relative_coords = False
907
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
908
  # Second pass: convert coordinates using relative coordinate approach
909
- for line_text, line_confidence, bounding_box in zip(
910
- rec_texts, rec_scores, rec_polys
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
911
  ):
912
  # bounding_box is now a numpy array with shape (4, 2)
913
  # Convert to list of coordinates if it's a numpy array
@@ -974,6 +1178,7 @@ class CustomImageAnalyzerEngine:
974
  output["width"].append(round(line_width, 2))
975
  output["height"].append(round(line_height, 2))
976
  output["conf"].append(int(line_confidence * 100))
 
977
 
978
  return output
979
 
@@ -1005,6 +1210,7 @@ class CustomImageAnalyzerEngine:
1005
  "width": list(),
1006
  "height": list(),
1007
  "conf": list(),
 
1008
  }
1009
 
1010
  if not line_data or not line_data.get("text"):
@@ -1043,6 +1249,11 @@ class CustomImageAnalyzerEngine:
1043
  for i in range(len(line_data["text"])):
1044
  line_text = line_data["text"][i]
1045
  line_conf = line_data["conf"][i]
 
 
 
 
 
1046
 
1047
  # Get the float values
1048
  f_left = float(line_data["left"][i])
@@ -1171,6 +1382,7 @@ class CustomImageAnalyzerEngine:
1171
  output["width"].append(clamped_width)
1172
  output["height"].append(line_height)
1173
  output["conf"].append(line_conf)
 
1174
  current_left += word_width + estimated_space_width
1175
  continue
1176
 
@@ -1182,6 +1394,8 @@ class CustomImageAnalyzerEngine:
1182
  output["width"].append(word_output["width"][j])
1183
  output["height"].append(word_output["height"][j])
1184
  output["conf"].append(word_output["conf"][j])
 
 
1185
 
1186
  return output
1187
 
@@ -1563,19 +1777,21 @@ class CustomImageAnalyzerEngine:
1563
  self,
1564
  image: Image.Image,
1565
  ocr: Optional[Any] = None,
 
1566
  confidence_threshold: int = HYBRID_OCR_CONFIDENCE_THRESHOLD,
1567
  padding: int = HYBRID_OCR_PADDING,
1568
  image_name: str = "unknown_image_name",
1569
  input_image_width: int = None,
1570
  input_image_height: int = None,
1571
- ) -> Dict[str, list]:
1572
  """
1573
  Performs OCR using PaddleOCR at line level, then VLM for low-confidence lines.
1574
- Returns data in the same dictionary format as pytesseract.image_to_data.
1575
 
1576
  Args:
1577
  image: PIL Image to process
1578
  ocr: PaddleOCR instance (optional, uses self.paddle_ocr if not provided)
 
1579
  confidence_threshold: Confidence threshold below which VLM is used
1580
  padding: Padding to add around line crops
1581
  image_name: Name of the image for logging/debugging
@@ -1583,7 +1799,7 @@ class CustomImageAnalyzerEngine:
1583
  input_image_height: Original image height (before preprocessing)
1584
 
1585
  Returns:
1586
- Dictionary with OCR results in Tesseract format
1587
  """
1588
  if ocr is None:
1589
  if hasattr(self, "paddle_ocr") and self.paddle_ocr is not None:
@@ -1593,6 +1809,9 @@ class CustomImageAnalyzerEngine:
1593
  "No OCR object provided and 'paddle_ocr' is not initialized."
1594
  )
1595
 
 
 
 
1596
  print("Starting hybrid PaddleOCR + VLM OCR process...")
1597
 
1598
  # Get image dimensions
@@ -1604,154 +1823,312 @@ class CustomImageAnalyzerEngine:
1604
  if input_image_height is None:
1605
  input_image_height = img_height
1606
 
1607
- # 1. Get initial line-level results from PaddleOCR
1608
- image_np = np.array(image)
1609
- if len(image_np.shape) == 2:
1610
- image_np = np.stack([image_np] * 3, axis=-1)
1611
 
1612
- paddle_results = ocr.predict(image_np)
 
 
 
 
 
1613
 
1614
- # Convert PaddleOCR results to line-level format
1615
- paddle_line_data = self._convert_paddle_to_tesseract_format(
1616
- paddle_results,
1617
- input_image_width=input_image_width,
1618
- input_image_height=input_image_height,
1619
- )
 
 
 
 
1620
 
1621
- # Prepare final output structure
1622
- final_data = {
1623
- "text": list(),
1624
- "left": list(),
1625
- "top": list(),
1626
- "width": list(),
1627
- "height": list(),
1628
- "conf": list(),
1629
- "model": list(), # Track which model was used for each line
1630
- }
 
 
 
 
 
 
 
1631
 
1632
- num_lines = len(paddle_line_data["text"])
 
 
 
 
 
 
 
 
 
1633
 
1634
- # Process each line
1635
- for i in range(num_lines):
1636
- line_text = paddle_line_data["text"][i]
1637
- line_conf = int(paddle_line_data["conf"][i])
1638
- line_left = float(paddle_line_data["left"][i])
1639
- line_top = float(paddle_line_data["top"][i])
1640
- line_width = float(paddle_line_data["width"][i])
1641
- line_height = float(paddle_line_data["height"][i])
1642
-
1643
- # Skip empty lines
1644
- if not line_text.strip():
1645
- continue
1646
 
1647
- # Initialize model as PaddleOCR (default)
1648
- model_used = "Paddle"
 
1649
 
1650
- # Count words in PaddleOCR output
1651
- paddle_words = line_text.split()
1652
- paddle_word_count = len(paddle_words)
1653
 
1654
- # If confidence is low, use VLM for a second opinion
1655
- if line_conf < confidence_threshold:
1656
- # Calculate crop coordinates with padding
1657
- crop_left = max(0, int(line_left - padding))
1658
- crop_top = max(0, int(line_top - padding))
1659
- crop_right = min(img_width, int(line_left + line_width + padding))
1660
- crop_bottom = min(img_height, int(line_top + line_height + padding))
1661
 
1662
- # Ensure crop dimensions are valid
1663
- if crop_right <= crop_left or crop_bottom <= crop_top:
1664
- # Invalid crop, keep original PaddleOCR result
1665
- final_data["text"].append(clean_unicode_text(line_text))
1666
- final_data["left"].append(line_left)
1667
- final_data["top"].append(line_top)
1668
- final_data["width"].append(line_width)
1669
- final_data["height"].append(line_height)
1670
- final_data["conf"].append(line_conf)
1671
- final_data["model"].append(model_used)
1672
  continue
1673
 
1674
- # Crop the line image
1675
- cropped_image = image.crop(
1676
- (crop_left, crop_top, crop_right, crop_bottom)
1677
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1678
 
1679
- # Use VLM for OCR on this line
1680
- vlm_result = _vlm_ocr_predict(cropped_image)
1681
- vlm_rec_texts = vlm_result.get("rec_texts", [])
1682
- vlm_rec_scores = vlm_result.get("rec_scores", [])
 
 
1683
 
1684
- if vlm_rec_texts and vlm_rec_scores:
1685
- # Combine VLM words into a single text string
1686
- vlm_text = " ".join(vlm_rec_texts)
1687
- vlm_word_count = len(vlm_rec_texts)
1688
- vlm_conf = int(round(np.median(vlm_rec_scores) * 100, 0))
 
 
 
 
1689
 
1690
- # Only replace if word counts match
1691
- if vlm_word_count == paddle_word_count:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1692
  print(
1693
- f" Re-OCR'd line: '{line_text}' (conf: {line_conf}, words: {paddle_word_count}) "
1694
- f"-> '{vlm_text}' (conf: {vlm_conf:.0f}, words: {vlm_word_count}) [VLM]"
1695
  )
1696
 
1697
- # For exporting example image comparisons
1698
- safe_filename = self._create_safe_filename_with_confidence(
1699
- line_text, vlm_text, line_conf, vlm_conf, "VLM"
1700
- )
 
1701
 
1702
- if SAVE_EXAMPLE_HYBRID_IMAGES is True:
1703
- # Normalize and validate image_name to prevent path traversal attacks
1704
- normalized_image_name = os.path.normpath(
1705
- image_name + "_hybrid_paddle_vlm"
1706
  )
1707
- if (
1708
- ".." in normalized_image_name
1709
- or "/" in normalized_image_name
1710
- or "\\" in normalized_image_name
1711
- ):
1712
- normalized_image_name = "safe_image"
1713
 
1714
- hybrid_ocr_examples_folder = (
1715
- self.output_folder
1716
- + f"/hybrid_ocr_examples/{normalized_image_name}"
1717
  )
1718
- # Validate the constructed path is safe
1719
- if not validate_folder_containment(
1720
- hybrid_ocr_examples_folder, OUTPUT_FOLDER
1721
- ):
1722
- raise ValueError(
1723
- f"Unsafe hybrid_ocr_examples folder path: {hybrid_ocr_examples_folder}"
1724
- )
1725
 
1726
- if not os.path.exists(hybrid_ocr_examples_folder):
1727
- os.makedirs(hybrid_ocr_examples_folder)
1728
- output_image_path = (
1729
- hybrid_ocr_examples_folder + f"/{safe_filename}.png"
 
 
 
1730
  )
1731
- print(f"Saving example image to {output_image_path}")
1732
- cropped_image.save(output_image_path)
1733
 
1734
- # Replace with VLM result
1735
- line_text = vlm_text
1736
- line_conf = vlm_conf
1737
- model_used = "VLM"
1738
  else:
1739
- print(
1740
- f" Line: '{line_text}' (conf: {line_conf}, words: {paddle_word_count}) -> "
1741
- f"VLM result '{vlm_text}' (conf: {vlm_conf:.0f}, words: {vlm_word_count}) "
1742
- f"word count mismatch. Keeping PaddleOCR result."
1743
- )
 
1744
 
1745
- # Append the final result (either original PaddleOCR or replaced VLM)
1746
- final_data["text"].append(clean_unicode_text(line_text))
1747
- final_data["left"].append(line_left)
1748
- final_data["top"].append(line_top)
1749
- final_data["width"].append(line_width)
1750
- final_data["height"].append(line_height)
1751
- final_data["conf"].append(int(line_conf))
1752
- final_data["model"].append(model_used)
1753
 
1754
- return final_data
1755
 
1756
  def perform_ocr(
1757
  self, image: Union[str, Image.Image, np.ndarray], ocr: Optional[Any] = None
@@ -1772,11 +2149,16 @@ class CustomImageAnalyzerEngine:
1772
  # Store original dimensions BEFORE preprocessing (needed for coordinate conversion)
1773
  original_image_width = None
1774
  original_image_height = None
 
 
 
1775
 
1776
  if PREPROCESS_LOCAL_OCR_IMAGES:
1777
  print("Pre-processing image...")
1778
  # Get original dimensions before preprocessing
1779
  original_image_width, original_image_height = image.size
 
 
1780
  image, preprocessing_metadata = self.image_preprocessor.preprocess_image(
1781
  image
1782
  )
@@ -1794,9 +2176,15 @@ class CustomImageAnalyzerEngine:
1794
  else:
1795
  preprocessing_metadata = dict()
1796
  original_image_width, original_image_height = image.size
 
 
1797
 
1798
  image_width, image_height = image.size
1799
 
 
 
 
 
1800
  # Note: In testing I haven't seen that this necessarily improves results
1801
  if self.ocr_engine == "hybrid-paddle":
1802
  # Try hybrid with original image for cropping:
@@ -1806,23 +2194,6 @@ class CustomImageAnalyzerEngine:
1806
  # Try hybrid VLM with original image for cropping:
1807
  ocr_data = self._perform_hybrid_ocr(image, image_name=image_name)
1808
 
1809
- elif self.ocr_engine == "hybrid-paddle-vlm":
1810
- # Hybrid PaddleOCR + VLM: use PaddleOCR at line level, then VLM for low-confidence lines
1811
- if ocr is None:
1812
- if hasattr(self, "paddle_ocr") and self.paddle_ocr is not None:
1813
- ocr = self.paddle_ocr
1814
- else:
1815
- raise ValueError(
1816
- "No OCR object provided and 'paddle_ocr' is not initialized."
1817
- )
1818
- ocr_data = self._perform_hybrid_paddle_vlm_ocr(
1819
- image,
1820
- ocr=ocr,
1821
- image_name=image_name,
1822
- input_image_width=original_image_width,
1823
- input_image_height=original_image_height,
1824
- )
1825
-
1826
  elif self.ocr_engine == "tesseract":
1827
 
1828
  ocr_data = pytesseract.image_to_data(
@@ -1832,7 +2203,7 @@ class CustomImageAnalyzerEngine:
1832
  lang=self.tesseract_lang, # Ensure the Tesseract language data (e.g., fra.traineddata) is installed on your system.
1833
  )
1834
 
1835
- elif self.ocr_engine == "paddle":
1836
 
1837
  if ocr is None:
1838
  if hasattr(self, "paddle_ocr") and self.paddle_ocr is not None:
@@ -1863,6 +2234,8 @@ class CustomImageAnalyzerEngine:
1863
  paddle_input_height = image_np.shape[0]
1864
 
1865
  paddle_results = ocr.predict(image_np)
 
 
1866
  else:
1867
  # When using image path, load image to get dimensions
1868
  temp_image = Image.open(image_path)
@@ -1870,9 +2243,13 @@ class CustomImageAnalyzerEngine:
1870
  # For file path, use the original dimensions (before preprocessing)
1871
  # original_image_width and original_image_height are already set above
1872
  paddle_results = ocr.predict(image_path)
 
 
 
 
1873
 
1874
  # Save PaddleOCR visualization with bounding boxes
1875
- if paddle_results and SAVE_PADDLE_VISUALISATIONS is True:
1876
 
1877
  for res in paddle_results:
1878
  # self.output_folder is already validated and normalized at construction time
@@ -1890,24 +2267,137 @@ class CustomImageAnalyzerEngine:
1890
  os.makedirs(paddle_viz_folder, exist_ok=True)
1891
  res.save_to_img(paddle_viz_folder)
1892
 
1893
  ocr_data = self._convert_paddle_to_tesseract_format(
1894
  paddle_results,
1895
  input_image_width=original_image_width,
1896
  input_image_height=original_image_height,
1897
  )
1898
 
1899
  else:
1900
  raise RuntimeError(f"Unsupported OCR engine: {self.ocr_engine}")
1901
 
1902
  # Convert line-level results to word-level if configured and needed
1903
  if CONVERT_LINE_TO_WORD_LEVEL and self._is_line_level_data(ocr_data):
1904
  print("Converting line-level OCR results to word-level...")
1905
- # Check if coordinates need to be scaled to match the preprocessed image
1906
- # For PaddleOCR: _convert_paddle_to_tesseract_format converts coordinates to original image space,
1907
- # but we need to crop from the preprocessed image, so we need to scale coordinates up
 
1908
  # For Tesseract: OCR runs on preprocessed image, so coordinates are already in preprocessed space,
1909
  # matching the preprocessed image we're cropping from - no scaling needed
 
1910
  needs_scaling = False
 
 
 
 
1911
  if (
1912
  PREPROCESS_LOCAL_OCR_IMAGES
1913
  and original_image_width
@@ -1919,7 +2409,19 @@ class CustomImageAnalyzerEngine:
1919
  ):
1920
  # PaddleOCR coordinates are converted to original space by _convert_paddle_to_tesseract_format
1921
  # hybrid-paddle-vlm also uses PaddleOCR and converts to original space
1922
- needs_scaling = True
 
 
 
 
 
 
 
 
 
 
 
 
1923
 
1924
  if needs_scaling:
1925
  # Calculate scale factors from original to preprocessed
@@ -1937,12 +2439,13 @@ class CustomImageAnalyzerEngine:
1937
  "width": [w * scale_x for w in ocr_data["width"]],
1938
  "height": [h * scale_y for h in ocr_data["height"]],
1939
  "conf": ocr_data["conf"],
 
1940
  }
1941
  ocr_data = self._convert_line_to_word_level(
1942
  scaled_ocr_data,
1943
- image_width,
1944
- image_height,
1945
- image,
1946
  image_name=image_name,
1947
  )
1948
  # Scale word-level results back to original image space
@@ -1954,27 +2457,15 @@ class CustomImageAnalyzerEngine:
1954
  ocr_data["width"][i] = ocr_data["width"][i] * scale_factor_x
1955
  ocr_data["height"][i] = ocr_data["height"][i] * scale_factor_y
1956
  else:
 
1957
  ocr_data = self._convert_line_to_word_level(
1958
- ocr_data, image_width, image_height, image, image_name=image_name
 
 
 
 
1959
  )
1960
 
1961
- # Always check for scale_factor, even if preprocessing_metadata is empty
1962
- # This ensures rescaling happens correctly when preprocessing was applied
1963
- scale_factor = (
1964
- preprocessing_metadata.get("scale_factor", 1.0)
1965
- if preprocessing_metadata
1966
- else 1.0
1967
- )
1968
- if scale_factor != 1.0:
1969
- # Skip rescaling for PaddleOCR since _convert_paddle_to_tesseract_format
1970
- # already scales coordinates directly to original image dimensions
1971
- # hybrid-paddle-vlm also uses PaddleOCR and converts to original space
1972
- if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid-paddle-vlm":
1973
- pass
1974
- # print(f"Skipping rescale_ocr_data for PaddleOCR (already scaled to original dimensions)")
1975
- else:
1976
- ocr_data = rescale_ocr_data(ocr_data, scale_factor)
1977
-
1978
  # The rest of your processing pipeline now works for both engines
1979
  ocr_result = ocr_data
1980
 
@@ -1986,9 +2477,7 @@ class CustomImageAnalyzerEngine:
1986
  ]
1987
 
1988
  # Determine default model based on OCR engine if model field is not present
1989
- if "model" in ocr_result and len(ocr_result["model"]) == len(
1990
- ocr_result["text"]
1991
- ):
1992
  # Model field exists and has correct length - use it
1993
  def get_model(idx):
1994
  return ocr_result["model"][idx]
@@ -2002,13 +2491,13 @@ class CustomImageAnalyzerEngine:
2002
  "Paddle"
2003
  if self.ocr_engine == "paddle"
2004
  else (
2005
- "hybrid-paddle"
2006
  if self.ocr_engine == "hybrid-paddle"
2007
  else (
2008
- "VLM"
2009
  if self.ocr_engine == "hybrid-vlm"
2010
  else (
2011
- "hybrid-paddle-vlm"
2012
  if self.ocr_engine == "hybrid-paddle-vlm"
2013
  else None
2014
  )
 
29
  PADDLE_USE_TEXTLINE_ORIENTATION,
30
  PREPROCESS_LOCAL_OCR_IMAGES,
31
  SAVE_EXAMPLE_HYBRID_IMAGES,
32
+ SAVE_PAGE_OCR_VISUALISATIONS,
33
  SAVE_PREPROCESS_IMAGES,
34
+ SAVE_VLM_INPUT_IMAGES,
35
  SELECTED_MODEL,
36
  TESSERACT_SEGMENTATION_LEVEL,
37
+ VLM_MAX_DPI,
38
+ VLM_MAX_IMAGE_SIZE,
39
  )
40
  from tools.helper_functions import clean_unicode_text
41
  from tools.load_spacy_model_custom_recognisers import custom_entities
42
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
43
+ from tools.run_vlm import extract_text_from_image_vlm
44
  from tools.secure_path_utils import validate_folder_containment
45
  from tools.secure_regex_utils import safe_sanitize_text
46
  from tools.word_segmenter import AdaptiveSegmenter
 
557
  return 11
558
 
559
 
560
+ def _prepare_image_for_vlm(image: Image.Image) -> Image.Image:
561
+ """
562
+ Prepare image for VLM by ensuring it doesn't exceed maximum size and DPI limits.
563
+
564
+ Args:
565
+ image: PIL Image to prepare
566
+
567
+ Returns:
568
+ PIL Image that has been resized if necessary to meet size and DPI constraints
569
+ """
570
+ if image is None:
571
+ return image
572
+
573
+ width, height = image.size
574
+
575
+ # Get DPI information (if available)
576
+ dpi = image.info.get("dpi", (72, 72)) # Default to 72 DPI if not specified
577
+ if isinstance(dpi, tuple):
578
+ dpi_x, dpi_y = dpi
579
+ # Use the maximum DPI value
580
+ current_dpi = max(dpi_x, dpi_y)
581
+ else:
582
+ current_dpi = float(dpi) if dpi else 72.0
583
+
584
+ # Calculate scale factors needed
585
+ size_scale = 1.0
586
+ dpi_scale = 1.0
587
+
588
+ # Check if total pixels exceed maximum
589
+ total_pixels = width * height
590
+ if total_pixels > VLM_MAX_IMAGE_SIZE:
591
+ # Calculate scale factor to reduce total pixels to maximum
592
+ # Since area scales with scale^2, we need sqrt of the ratio
593
+ size_scale = (VLM_MAX_IMAGE_SIZE / total_pixels) ** 0.5
594
+ print(
595
+ f"VLM image size check: Image has {total_pixels:,} pixels ({width}x{height}), exceeds maximum {VLM_MAX_IMAGE_SIZE:,} pixels. Will resize by factor {size_scale:.3f}"
596
+ )
597
+
598
+ # Check if DPI exceeds maximum
599
+ if current_dpi > VLM_MAX_DPI:
600
+ dpi_scale = VLM_MAX_DPI / current_dpi
601
+ print(
602
+ f"VLM DPI check: Image DPI {current_dpi:.1f} exceeds maximum {VLM_MAX_DPI:.1f} DPI. Will resize by factor {dpi_scale:.3f}"
603
+ )
604
+
605
+ # Use the smaller scale factor to ensure both constraints are met
606
+ final_scale = min(size_scale, dpi_scale)
607
+
608
+ # Resize if necessary
609
+ if final_scale < 1.0:
610
+ new_width = int(width * final_scale)
611
+ new_height = int(height * final_scale)
612
+ print(
613
+ f"VLM image preparation: Resizing image from {width}x{height} to {new_width}x{new_height} (scale: {final_scale:.3f})"
614
+ )
615
+
616
+ # Use high-quality resampling for downscaling
617
+ image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
618
+
619
+ # Update DPI info if it was set
620
+ if "dpi" in image.info:
621
+ new_dpi = (current_dpi * final_scale, current_dpi * final_scale)
622
+ # Create a copy with updated DPI info
623
+ image_info = image.info.copy()
624
+ image_info["dpi"] = new_dpi
625
+ # Note: PIL doesn't allow direct modification of info dict, so we'll just note it
626
+ print(
627
+ f"VLM image preparation: Effective DPI after resize: {new_dpi[0]:.1f}"
628
+ )
629
+ else:
630
+ total_pixels = width * height
631
+ print(
632
+ f"VLM image preparation: Image size {width}x{height} ({total_pixels:,} pixels) and DPI {current_dpi:.1f} are within limits (max pixels: {VLM_MAX_IMAGE_SIZE:,}, max DPI: {VLM_MAX_DPI})"
633
+ )
634
+
635
+ return image
636
+
637
+
638
  def _vlm_ocr_predict(
639
  image: Image.Image,
640
  prompt: str = "Extract the text content from this image.",
 
650
  Dictionary in PaddleOCR format with 'rec_texts' and 'rec_scores'
651
  """
652
  try:
653
+ # Validate image exists and is not None
654
+ if image is None:
655
+ print("VLM OCR error: Image is None")
656
+ return {"rec_texts": [], "rec_scores": []}
657
+
658
+ # Validate image has valid size (at least 10x10 pixels)
659
+ try:
660
+ width, height = image.size
661
+ if width < 10 or height < 10:
662
+ print(
663
+ f"VLM OCR error: Image is too small ({width}x{height} pixels). Minimum size is 10x10."
664
+ )
665
+ return {"rec_texts": [], "rec_scores": []}
666
+ except Exception as size_error:
667
+ print(f"VLM OCR error: Could not get image size: {size_error}")
668
+ return {"rec_texts": [], "rec_scores": []}
669
+
670
+ # Ensure image is in RGB mode (convert if needed)
671
+ try:
672
+ if image.mode != "RGB":
673
+ print(f"VLM OCR: Converting image from {image.mode} to RGB mode")
674
+ image = image.convert("RGB")
675
+ # Update width/height after conversion (should be same, but ensure consistency)
676
+ width, height = image.size
677
+ except Exception as convert_error:
678
+ print(f"VLM OCR error: Could not convert image to RGB: {convert_error}")
679
+ return {"rec_texts": [], "rec_scores": []}
680
+
681
+ # Check and resize image if it exceeds maximum size or DPI limits
682
+ try:
683
+ image = _prepare_image_for_vlm(image)
684
+ width, height = image.size
685
+ except Exception as prep_error:
686
+ print(f"VLM OCR error: Could not prepare image for VLM: {prep_error}")
687
+ return {"rec_texts": [], "rec_scores": []}
688
+
689
  # Use the VLM to extract text
690
  # Pass None for parameters to prioritize model-specific defaults from run_vlm.py
691
  # If model defaults are not available, general defaults will be used (matching current values)
692
+ print(f"Calling extract_text_from_image_vlm with image size: {width}x{height}")
693
+ extracted_text = extract_text_from_image_vlm(
694
  text=prompt,
695
  image=image,
696
  max_new_tokens=None, # Use model default if available, otherwise MAX_NEW_TOKENS from config
 
700
  repetition_penalty=None, # Use model default if available, otherwise 1.3
701
  )
702
 
703
+ # print(f"VLM OCR extracted text type: {type(extracted_text)}, value: {extracted_text}")
704
+
705
+ # Check if extracted_text is None or empty
706
+ if extracted_text is None:
707
+ # print("VLM OCR warning: extract_text_from_image_vlm returned None")
708
+ return {"rec_texts": [], "rec_scores": []}
709
+
710
+ if not isinstance(extracted_text, str):
711
+ # print(f"VLM OCR warning: extract_text_from_image_vlm returned unexpected type: {type(extracted_text)}")
712
+ return {"rec_texts": [], "rec_scores": []}
713
+
714
+ if extracted_text.strip():
715
+
716
  # Clean the text
717
+
718
+ cleaned_text = re.sub(r"[\r\n]+", " ", extracted_text)
719
+ cleaned_text = cleaned_text.strip()
720
 
721
  # Split into words for compatibility with PaddleOCR format
722
  words = cleaned_text.split()
723
 
724
+ # If text has more than 30 words, assume something went wrong and skip it
725
+ if len(words) > 30:
726
+ print(
727
+ f"VLM OCR warning: Extracted text has {len(words)} words, which exceeds the 30 word limit. Skipping."
728
+ )
729
  return {"rec_texts": [], "rec_scores": []}
730
 
731
  # Create PaddleOCR-compatible result
 
736
 
737
  return result
738
  else:
739
+ # print("VLM OCR warning: Extracted text is empty after stripping")
740
  return {"rec_texts": [], "rec_scores": []}
741
 
742
+ except Exception:
743
+ # print(f"VLM OCR error: {e}")
744
+ # print(f"VLM OCR error traceback: {traceback.format_exc()}")
745
  return {"rec_texts": [], "rec_scores": []}
746
 
747
 
 
951
  paddle_results: List[Any],
952
  input_image_width: int = None,
953
  input_image_height: int = None,
954
+ image_name: str = None,
955
+ image: Image.Image = None,
956
  ) -> Dict[str, List]:
957
  """Converts PaddleOCR result format to Tesseract's dictionary format using relative coordinates.
958
 
 
964
  paddle_results: List of PaddleOCR result dictionaries
965
  input_image_width: Width of the input image passed to PaddleOCR (target dimensions for scaling)
966
  input_image_height: Height of the input image passed to PaddleOCR (target dimensions for scaling)
967
+ image_name: Name of the image
968
+ image: Image object
969
  """
970
 
971
  output = {
 
975
  "width": list(),
976
  "height": list(),
977
  "conf": list(),
978
+ "model": list(),
979
  }
980
 
981
  # paddle_results is now a list of dictionaries with detailed information
 
990
  # Fallback: we'll try to detect from coordinates, but this is less reliable
991
  use_relative_coords = False
992
  else:
993
+ use_relative_coords = True
994
 
995
  for page_result in paddle_results:
996
  # Extract text recognition results from the new format
997
  rec_texts = page_result.get("rec_texts", list())
998
  rec_scores = page_result.get("rec_scores", list())
999
  rec_polys = page_result.get("rec_polys", list())
1000
+ rec_models = page_result.get("rec_models", list())
1001
 
1002
  # PaddleOCR may return image dimensions in the result - check for them
1003
  # Some versions of PaddleOCR include this information
1004
  result_image_width = page_result.get("image_width")
1005
  result_image_height = page_result.get("image_height")
1006
 
1007
+ # PaddleOCR typically returns coordinates in the input image space
1008
+ # However, it may internally resize images, so we need to check if coordinates
1009
+ # are in a different space by comparing with explicit metadata or detecting from coordinates
1010
+
1011
  # First pass: determine PaddleOCR's coordinate space by finding max coordinates
1012
  # This tells us what coordinate space PaddleOCR is actually using
1013
  max_x_coord = 0
 
1026
  max_y_coord = max(max_y_coord, max(y_coords) if y_coords else 0)
1027
 
1028
  # Determine PaddleOCR's coordinate space dimensions
1029
+ # Priority: explicit result metadata > input dimensions (standard PaddleOCR behavior)
1030
+ # Note: PaddleOCR typically returns coordinates in the input image space.
1031
+ # We only use a different coordinate space if PaddleOCR provides explicit metadata.
1032
+ # Using max coordinates to detect coordinate space is unreliable because:
1033
+ # 1. Text might not extend to image edges
1034
+ # 2. There might be padding
1035
+ # 3. Max coordinates don't necessarily equal image dimensions
1036
+ if result_image_width is not None and result_image_height is not None:
1037
+ # Use explicit metadata from PaddleOCR if available (most reliable)
1038
+ paddle_coord_width = result_image_width
1039
+ paddle_coord_height = result_image_height
1040
+ # Only use relative conversion if coordinate space differs from input
1041
+ if (
1042
+ paddle_coord_width != input_image_width
1043
+ or paddle_coord_height != input_image_height
1044
+ ):
1045
+ print(
1046
+ f"PaddleOCR metadata indicates coordinate space ({paddle_coord_width}x{paddle_coord_height}) "
1047
+ f"differs from input ({input_image_width}x{input_image_height}). "
1048
+ f"Using metadata for coordinate conversion."
1049
+ )
1050
+ elif input_image_width is not None and input_image_height is not None:
1051
+ # Default: assume coordinates are in input image space (standard PaddleOCR behavior)
1052
+ # This is the most common case and avoids incorrect scaling
1053
  paddle_coord_width = input_image_width
1054
  paddle_coord_height = input_image_height
1055
+ else:
1056
+ # Fallback: use max coordinates if we have no other information
1057
+ paddle_coord_width = max_x_coord if max_x_coord > 0 else 1
1058
+ paddle_coord_height = max_y_coord if max_y_coord > 0 else 1
1059
+ use_relative_coords = False
1060
+ print(
1061
+ f"Warning: No input dimensions provided. Using detected coordinate space ({paddle_coord_width}x{paddle_coord_height}) from max coordinates."
1062
+ )
1063
+
1064
+ # Validate coordinate space dimensions
1065
+ if paddle_coord_width is None or paddle_coord_height is None:
1066
+ paddle_coord_width = input_image_width or 1
1067
+ paddle_coord_height = input_image_height or 1
1068
  use_relative_coords = False
1069
 
1070
  if paddle_coord_width <= 0 or paddle_coord_height <= 0:
 
1075
  paddle_coord_height = input_image_height or 1
1076
  use_relative_coords = False
1077
 
1078
+ # If coordinate space matches input dimensions, coordinates are already in the correct space
1079
+ # Only use relative coordinate conversion if coordinate space differs from input
1080
+ if (
1081
+ paddle_coord_width == input_image_width
1082
+ and paddle_coord_height == input_image_height
1083
+ and input_image_width is not None
1084
+ and input_image_height is not None
1085
+ ):
1086
+ # Coordinates are already in input space, no conversion needed
1087
+ use_relative_coords = False
1088
+ print(
1089
+ f"PaddleOCR coordinates are in input image space ({input_image_width}x{input_image_height}). "
1090
+ f"Using coordinates directly without conversion."
1091
+ )
1092
+
1093
  # Second pass: convert coordinates using relative coordinate approach
1094
+ # Use default "Paddle" if rec_models is not available or doesn't match length
1095
+ if len(rec_models) != len(rec_texts):
1096
+ print(
1097
+ f"Warning: rec_models length ({len(rec_models)}) doesn't match rec_texts length ({len(rec_texts)}). Using default 'Paddle' for all."
1098
+ )
1099
+ rec_models = ["Paddle"] * len(rec_texts)
1100
+ # Update page_result to keep it consistent
1101
+ page_result["rec_models"] = rec_models
1102
+ else:
1103
+ # Ensure we're using the rec_models from page_result (which may have been modified)
1104
+ rec_models = page_result.get("rec_models", rec_models)
1105
+
1106
+ # Debug: Print model distribution
1107
+ vlm_count = sum(1 for m in rec_models if m == "VLM")
1108
+ if vlm_count > 0:
1109
+ print(
1110
+ f"Found {vlm_count} VLM-labeled lines out of {len(rec_models)} total lines in page_result"
1111
+ )
1112
+
1113
+ for line_text, line_confidence, bounding_box, line_model in zip(
1114
+ rec_texts, rec_scores, rec_polys, rec_models
1115
  ):
1116
  # bounding_box is now a numpy array with shape (4, 2)
1117
  # Convert to list of coordinates if it's a numpy array
 
1178
  output["width"].append(round(line_width, 2))
1179
  output["height"].append(round(line_height, 2))
1180
  output["conf"].append(int(line_confidence * 100))
1181
+ output["model"].append(line_model if line_model else "Paddle")
1182
 
1183
  return output
1184
 
 
1210
  "width": list(),
1211
  "height": list(),
1212
  "conf": list(),
1213
+ "model": list(),
1214
  }
1215
 
1216
  if not line_data or not line_data.get("text"):
 
1249
  for i in range(len(line_data["text"])):
1250
  line_text = line_data["text"][i]
1251
  line_conf = line_data["conf"][i]
1252
+ # Extract model, defaulting to "Paddle" if not available
1253
+ if "model" in line_data and len(line_data["model"]) > i:
1254
+ line_model = line_data["model"][i]
1255
+ else:
1256
+ line_model = "Paddle"
1257
 
1258
  # Get the float values
1259
  f_left = float(line_data["left"][i])
 
1382
  output["width"].append(clamped_width)
1383
  output["height"].append(line_height)
1384
  output["conf"].append(line_conf)
1385
+ output["model"].append(line_model)
1386
  current_left += word_width + estimated_space_width
1387
  continue
1388
 
 
1394
  output["width"].append(word_output["width"][j])
1395
  output["height"].append(word_output["height"][j])
1396
  output["conf"].append(word_output["conf"][j])
1397
+ # Preserve the model from the line-level data
1398
+ output["model"].append(line_model)
1399
 
1400
  return output
1401
 
 
1777
  self,
1778
  image: Image.Image,
1779
  ocr: Optional[Any] = None,
1780
+ paddle_results: List[Any] = None,
1781
  confidence_threshold: int = HYBRID_OCR_CONFIDENCE_THRESHOLD,
1782
  padding: int = HYBRID_OCR_PADDING,
1783
  image_name: str = "unknown_image_name",
1784
  input_image_width: int = None,
1785
  input_image_height: int = None,
1786
+ ) -> List[Any]:
1787
  """
1788
  Performs OCR using PaddleOCR at line level, then VLM for low-confidence lines.
1789
+ Returns modified paddle_results in the same format as PaddleOCR output.
1790
 
1791
  Args:
1792
  image: PIL Image to process
1793
  ocr: PaddleOCR instance (optional, uses self.paddle_ocr if not provided)
1794
+ paddle_results: PaddleOCR results in original format (List of dicts with rec_texts, rec_scores, rec_polys)
1795
  confidence_threshold: Confidence threshold below which VLM is used
1796
  padding: Padding to add around line crops
1797
  image_name: Name of the image for logging/debugging
 
1799
  input_image_height: Original image height (before preprocessing)
1800
 
1801
  Returns:
1802
+ Modified paddle_results with VLM replacements for low-confidence lines
1803
  """
1804
  if ocr is None:
1805
  if hasattr(self, "paddle_ocr") and self.paddle_ocr is not None:
 
1809
  "No OCR object provided and 'paddle_ocr' is not initialized."
1810
  )
1811
 
1812
+ if paddle_results is None or not paddle_results:
1813
+ return paddle_results
1814
+
1815
  print("Starting hybrid PaddleOCR + VLM OCR process...")
1816
 
1817
  # Get image dimensions
 
1823
  if input_image_height is None:
1824
  input_image_height = img_height
1825
 
1826
+ # Create a deep copy of paddle_results to modify
1827
+ modified_paddle_results = copy.deepcopy(paddle_results)
 
 
1828
 
1829
+ # Process each page result in paddle_results
1830
+ for page_result in modified_paddle_results:
1831
+ # Extract text recognition results from the paddle format
1832
+ rec_texts = page_result.get("rec_texts", list())
1833
+ rec_scores = page_result.get("rec_scores", list())
1834
+ rec_polys = page_result.get("rec_polys", list())
1835
 
1836
+ # Initialize rec_models list with "Paddle" as default for all lines
1837
+ num_lines = len(rec_texts)
1838
+ if (
1839
+ "rec_models" not in page_result
1840
+ or len(page_result.get("rec_models", [])) != num_lines
1841
+ ):
1842
+ rec_models = ["Paddle"] * num_lines
1843
+ page_result["rec_models"] = rec_models
1844
+ else:
1845
+ rec_models = page_result["rec_models"]
1846
 
1847
+ # Get image dimensions from result if available
1848
+ result_image_width = page_result.get("image_width")
1849
+ result_image_height = page_result.get("image_height")
1850
+
1851
+ # Determine PaddleOCR's coordinate space dimensions
1852
+ max_x_coord = 0
1853
+ max_y_coord = 0
1854
+ for bounding_box in rec_polys:
1855
+ if hasattr(bounding_box, "tolist"):
1856
+ box = bounding_box.tolist()
1857
+ else:
1858
+ box = bounding_box
1859
+ if box and len(box) > 0:
1860
+ x_coords = [p[0] for p in box]
1861
+ y_coords = [p[1] for p in box]
1862
+ max_x_coord = max(max_x_coord, max(x_coords) if x_coords else 0)
1863
+ max_y_coord = max(max_y_coord, max(y_coords) if y_coords else 0)
1864
 
1865
+ paddle_coord_width = (
1866
+ result_image_width
1867
+ if result_image_width is not None
1868
+ else max_x_coord if max_x_coord > 0 else input_image_width
1869
+ )
1870
+ paddle_coord_height = (
1871
+ result_image_height
1872
+ if result_image_height is not None
1873
+ else max_y_coord if max_y_coord > 0 else input_image_height
1874
+ )
1875
 
1876
+ if paddle_coord_width is None or paddle_coord_height is None:
1877
+ paddle_coord_width = input_image_width or img_width
1878
+ paddle_coord_height = input_image_height or img_height
 
 
 
 
 
 
 
 
 
1879
 
1880
+ if paddle_coord_width <= 0 or paddle_coord_height <= 0:
1881
+ paddle_coord_width = input_image_width or img_width
1882
+ paddle_coord_height = input_image_height or img_height
1883
 
1884
+ # Process each line
1885
+ print(f"Processing {num_lines} lines from PaddleOCR results...")
 
1886
 
1887
+ for i in range(num_lines):
1888
+ line_text = rec_texts[i]
1889
+ line_conf = float(rec_scores[i]) * 100 # Convert to percentage
1890
+ bounding_box = rec_polys[i]
 
 
 
1891
 
1892
+ # Skip empty lines
1893
+ if not line_text.strip():
 
 
 
 
 
 
 
 
1894
  continue
1895
 
1896
+ # Extract bounding box coordinates
1897
+ if hasattr(bounding_box, "tolist"):
1898
+ box = bounding_box.tolist()
1899
+ else:
1900
+ box = bounding_box
1901
+
1902
+ if not box or len(box) == 0:
1903
+ continue
1904
+
1905
+ # Convert polygon to bounding box
1906
+ x_coords = [p[0] for p in box]
1907
+ y_coords = [p[1] for p in box]
1908
+ line_left_paddle = float(min(x_coords))
1909
+ line_top_paddle = float(min(y_coords))
1910
+ line_right_paddle = float(max(x_coords))
1911
+ line_bottom_paddle = float(max(y_coords))
1912
+ line_width_paddle = line_right_paddle - line_left_paddle
1913
+ line_height_paddle = line_bottom_paddle - line_top_paddle
1914
 
1915
+ # Convert to image coordinate space (scale from paddle coordinates to image coordinates)
1916
+ if paddle_coord_width > 0 and paddle_coord_height > 0:
1917
+ rel_left = line_left_paddle / paddle_coord_width
1918
+ rel_top = line_top_paddle / paddle_coord_height
1919
+ rel_width = line_width_paddle / paddle_coord_width
1920
+ rel_height = line_height_paddle / paddle_coord_height
1921
 
1922
+ line_left = rel_left * img_width
1923
+ line_top = rel_top * img_height
1924
+ line_width = rel_width * img_width
1925
+ line_height = rel_height * img_height
1926
+ else:
1927
+ line_left = line_left_paddle
1928
+ line_top = line_top_paddle
1929
+ line_width = line_width_paddle
1930
+ line_height = line_height_paddle
1931
 
1932
+ # Initialize model as PaddleOCR (default)
1933
+
1934
+ # Count words in PaddleOCR output
1935
+ paddle_words = line_text.split()
1936
+ paddle_word_count = len(paddle_words)
1937
+
1938
+ # If confidence is low, use VLM for a second opinion
1939
+ if line_conf < confidence_threshold:
1940
+ # Debug: Print line dimensions before cropping
1941
+ # print(
1942
+ # f" Line {i}: '{line_text[:50]}...' "
1943
+ # f"conf={line_conf}, "
1944
+ # f"bbox=({line_left:.1f}, {line_top:.1f}, {line_width:.1f}, {line_height:.1f})"
1945
+ # )
1946
+
1947
+ # Ensure minimum line height for VLM processing
1948
+ # If line_height is too small, use a minimum height based on typical text line height
1949
+ min_line_height = max(
1950
+ line_height, 20
1951
+ ) # Minimum 20 pixels for text line
1952
+ if line_height < 20:
1953
  print(
1954
+ f" Warning: Line height ({line_height:.1f}px) is too small. "
1955
+ f"Using minimum height of {min_line_height}px for cropping."
1956
  )
1957
 
1958
+ # Calculate crop coordinates
1959
+ crop_left = line_left
1960
+ crop_top = line_top
1961
+ crop_right = line_left + line_width
1962
+ crop_bottom = line_top + min_line_height
1963
 
1964
+ print(
1965
+ f" Crop coordinates: left={crop_left}, top={crop_top}, "
1966
+ f"right={crop_right}, bottom={crop_bottom}, "
1967
+ f"size=({crop_right - crop_left}x{crop_bottom - crop_top})"
1968
+ )
1969
+
1970
+ # Ensure crop dimensions are valid
1971
+ if crop_right <= crop_left or crop_bottom <= crop_top:
1972
+ # Invalid crop, keep original PaddleOCR result
1973
+ continue
1974
+
1975
+ # Crop the line image
1976
+ cropped_image = image.crop(
1977
+ (crop_left, crop_top, crop_right, crop_bottom)
1978
+ )
1979
+
1980
+ # Check if cropped image is too small for VLM processing
1981
+ crop_width = crop_right - crop_left
1982
+ crop_height = crop_bottom - crop_top
1983
+ if crop_width < 10 or crop_height < 10:
1984
+ print(
1985
+ f" Line: '{line_text}' (conf: {line_conf}) -> "
1986
+ f"Cropped image too small ({crop_width}x{crop_height} pixels). "
1987
+ f"Skipping VLM, keeping PaddleOCR result."
1988
+ )
1989
+ # Keep original PaddleOCR result for this line
1990
+ continue
1991
+
1992
+ # Ensure cropped image is in RGB mode before passing to VLM
1993
+ if cropped_image.mode != "RGB":
1994
+ cropped_image = cropped_image.convert("RGB")
1995
+
1996
+ # Save input image for debugging if environment variable is set
1997
+ if SAVE_VLM_INPUT_IMAGES:
1998
+ try:
1999
+ vlm_debug_dir = os.path.join(
2000
+ OUTPUT_FOLDER,
2001
+ "hybrid_paddle_vlm_visualisations/vlm_input_images",
2002
+ )
2003
+ os.makedirs(vlm_debug_dir, exist_ok=True)
2004
+ line_text_safe = safe_sanitize_text(line_text)
2005
+ line_text_shortened = line_text_safe[:20]
2006
+ image_name_safe = safe_sanitize_text(image_name)
2007
+ image_name_shortened = image_name_safe[:20]
2008
+ filename = f"{image_name_shortened}_{line_text_shortened}_vlm_input_image.png"
2009
+ filepath = os.path.join(vlm_debug_dir, filename)
2010
+ cropped_image.save(filepath)
2011
+ print(f"Saved VLM input image to: {filepath}")
2012
+ except Exception as save_error:
2013
+ print(
2014
+ f"Warning: Could not save VLM input image: {save_error}"
2015
  )
 
 
 
 
 
 
2016
 
2017
+ # Use VLM for OCR on this line with error handling
2018
+ vlm_result = None
2019
+ vlm_rec_texts = []
2020
+ vlm_rec_scores = []
2021
+
2022
+ try:
2023
+ vlm_result = _vlm_ocr_predict(cropped_image)
2024
+ vlm_rec_texts = (
2025
+ vlm_result.get("rec_texts", []) if vlm_result else []
2026
+ )
2027
+ vlm_rec_scores = (
2028
+ vlm_result.get("rec_scores", []) if vlm_result else []
2029
+ )
2030
+ except Exception as vlm_error:
2031
+ print(
2032
+ f" VLM OCR failed for line '{line_text[:50]}...' (conf: {line_conf}): {vlm_error}. "
2033
+ f"Keeping original PaddleOCR result."
2034
+ )
2035
+ # Ensure we keep original PaddleOCR result on error
2036
+ vlm_rec_texts = []
2037
+ vlm_rec_scores = []
2038
+
2039
+ if vlm_rec_texts and vlm_rec_scores:
2040
+ # Combine VLM words into a single text string
2041
+ vlm_text = " ".join(vlm_rec_texts)
2042
+ vlm_word_count = len(vlm_rec_texts)
2043
+ vlm_conf = float(
2044
+ np.median(vlm_rec_scores)
2045
+ ) # Keep as 0-1 range for paddle format
2046
+
2047
+ # Only replace if word counts match
2048
+ if (
2049
+ vlm_word_count - paddle_word_count <= 2
2050
+ and vlm_word_count - paddle_word_count >= -2
2051
+ ):
2052
+ print(
2053
+ f" Re-OCR'd line: '{line_text}' (conf: {line_conf:.1f}, words: {paddle_word_count}) "
2054
+ f"-> '{vlm_text}' (conf: {vlm_conf*100:.1f}, words: {vlm_word_count}) [VLM]"
2055
)
2056
 
2057
+ # For exporting example image comparisons
2058
+ safe_filename = self._create_safe_filename_with_confidence(
2059
+ line_text,
2060
+ vlm_text,
2061
+ int(line_conf),
2062
+ int(vlm_conf * 100),
2063
+ "VLM",
2064
  )
 
 
2065
 
2066
+ if SAVE_EXAMPLE_HYBRID_IMAGES is True:
2067
+ # Normalize and validate image_name to prevent path traversal attacks
2068
+ normalized_image_name = os.path.normpath(
2069
+ image_name + "_hybrid_paddle_vlm"
2070
+ )
2071
+ if (
2072
+ ".." in normalized_image_name
2073
+ or "/" in normalized_image_name
2074
+ or "\\" in normalized_image_name
2075
+ ):
2076
+ normalized_image_name = "safe_image"
2077
+
2078
+ hybrid_ocr_examples_folder = (
2079
+ self.output_folder
2080
+ + f"/hybrid_ocr_examples/{normalized_image_name}"
2081
+ )
2082
+ # Validate the constructed path is safe
2083
+ if not validate_folder_containment(
2084
+ hybrid_ocr_examples_folder, OUTPUT_FOLDER
2085
+ ):
2086
+ raise ValueError(
2087
+ f"Unsafe hybrid_ocr_examples folder path: {hybrid_ocr_examples_folder}"
2088
+ )
2089
+
2090
+ if not os.path.exists(hybrid_ocr_examples_folder):
2091
+ os.makedirs(hybrid_ocr_examples_folder)
2092
+ output_image_path = (
2093
+ hybrid_ocr_examples_folder + f"/{safe_filename}.png"
2094
+ )
2095
+ print(f"Saving example image to {output_image_path}")
2096
+ cropped_image.save(output_image_path)
2097
+
2098
+ # Replace with VLM result in paddle_results format
2099
+ # Update rec_texts, rec_scores, and rec_models for this line
2100
+ rec_texts[i] = vlm_text
2101
+ rec_scores[i] = vlm_conf
2102
+ rec_models[i] = "VLM"
2103
+ # Ensure page_result is updated with the modified rec_models list
2104
+ page_result["rec_models"] = rec_models
2105
+ print(
2106
+ f" Set rec_models[{i}] = 'VLM' for line '{vlm_text[:50]}...'"
2107
+ )
2108
+ else:
2109
+ print(
2110
+ f" Line: '{line_text}' (conf: {line_conf:.1f}, words: {paddle_word_count}) -> "
2111
+ f"VLM result '{vlm_text}' (conf: {vlm_conf*100:.1f}, words: {vlm_word_count}) "
2112
+ f"word count mismatch. Keeping PaddleOCR result."
2113
+ )
2114
  else:
2115
+ # VLM returned empty or no results - keep original PaddleOCR result
2116
+ if line_conf < confidence_threshold:
2117
+ print(
2118
+ f" Line: '{line_text}' (conf: {line_conf:.1f}) -> "
2119
+ f"VLM returned no results. Keeping original PaddleOCR result."
2120
+ )
2121
 
2122
+ # Debug: Print summary of model labels before returning
2123
+ for page_idx, page_result in enumerate(modified_paddle_results):
2124
+ rec_models = page_result.get("rec_models", [])
2125
+ vlm_count = sum(1 for m in rec_models if m == "VLM")
2126
+ paddle_count = sum(1 for m in rec_models if m == "Paddle")
2127
+ print(
2128
+ f"Page {page_idx}: {vlm_count} VLM, {paddle_count} Paddle out of {len(rec_models)} total lines"
2129
+ )
2130
 
2131
+ return modified_paddle_results
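In outline, the hybrid step above keeps every PaddleOCR line whose confidence clears the threshold, re-reads low-confidence lines with the VLM on a cropped line image, and only accepts the VLM text when its word count is within two words of the PaddleOCR count. A condensed sketch of that decision, assuming a `run_vlm_on_crop` callable stands in for the real VLM call and all other names are illustrative:

```python
from typing import Callable, List, Tuple

def merge_paddle_and_vlm(
    lines: List[Tuple[str, float]],          # (text, confidence 0-100) per PaddleOCR line
    crops,                                   # one cropped PIL image per line
    run_vlm_on_crop: Callable,               # crop -> (text, confidence 0-1)
    threshold: float = 65.0,
    word_tolerance: int = 2,
) -> List[Tuple[str, float, str]]:
    """Return (text, confidence, model) per line, preferring the VLM only when it looks safe."""
    merged = []
    for (text, conf), crop in zip(lines, crops):
        if conf >= threshold:
            merged.append((text, conf, "Paddle"))
            continue
        try:
            vlm_text, vlm_conf = run_vlm_on_crop(crop)
        except Exception:
            merged.append((text, conf, "Paddle"))   # keep the original on any VLM failure
            continue
        if vlm_text and abs(len(vlm_text.split()) - len(text.split())) <= word_tolerance:
            merged.append((vlm_text, vlm_conf * 100, "VLM"))
        else:
            merged.append((text, conf, "Paddle"))   # word counts diverge too much
    return merged
```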
2132
 
2133
  def perform_ocr(
2134
  self, image: Union[str, Image.Image, np.ndarray], ocr: Optional[Any] = None
 
2149
  # Store original dimensions BEFORE preprocessing (needed for coordinate conversion)
2150
  original_image_width = None
2151
  original_image_height = None
2152
+ original_image_for_visualization = (
2153
+ None # Store original image for visualization
2154
+ )
2155
 
2156
  if PREPROCESS_LOCAL_OCR_IMAGES:
2157
  print("Pre-processing image...")
2158
  # Get original dimensions before preprocessing
2159
  original_image_width, original_image_height = image.size
2160
+ # Store original image for visualization (coordinates are in original space)
2161
+ original_image_for_visualization = image.copy()
2162
  image, preprocessing_metadata = self.image_preprocessor.preprocess_image(
2163
  image
2164
  )
 
2176
  else:
2177
  preprocessing_metadata = dict()
2178
  original_image_width, original_image_height = image.size
2179
+ # When preprocessing is disabled, the current image is the original
2180
+ original_image_for_visualization = image.copy()
2181
 
2182
  image_width, image_height = image.size
2183
 
2184
+ # Store original image for line-to-word conversion when PaddleOCR processes original image
2185
+ original_image_for_cropping = None
2186
+ paddle_processed_original = False
2187
+
2188
  # Note: In testing I haven't seen that this necessarily improves results
2189
  if self.ocr_engine == "hybrid-paddle":
2190
  # Try hybrid with original image for cropping:
 
2194
  # Try hybrid VLM with original image for cropping:
2195
  ocr_data = self._perform_hybrid_ocr(image, image_name=image_name)
2196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2197
  elif self.ocr_engine == "tesseract":
2198
 
2199
  ocr_data = pytesseract.image_to_data(
 
2203
  lang=self.tesseract_lang, # Ensure the Tesseract language data (e.g., fra.traineddata) is installed on your system.
2204
  )
2205
 
2206
+ elif self.ocr_engine == "paddle" or self.ocr_engine == "hybrid-paddle-vlm":
2207
 
2208
  if ocr is None:
2209
  if hasattr(self, "paddle_ocr") and self.paddle_ocr is not None:
 
2234
  paddle_input_height = image_np.shape[0]
2235
 
2236
  paddle_results = ocr.predict(image_np)
2237
+ # PaddleOCR processed the preprocessed image
2238
+ paddle_processed_original = False
2239
  else:
2240
  # When using image path, load image to get dimensions
2241
  temp_image = Image.open(image_path)
 
2243
  # For file path, use the original dimensions (before preprocessing)
2244
  # original_image_width and original_image_height are already set above
2245
  paddle_results = ocr.predict(image_path)
2246
+ # PaddleOCR processed the original image from file path
2247
+ paddle_processed_original = True
2248
+ # Store the original image for cropping
2249
+ original_image_for_cropping = temp_image.copy()
2250
 
2251
  # Save PaddleOCR visualization with bounding boxes
2252
+ if paddle_results and SAVE_PAGE_OCR_VISUALISATIONS is True:
2253
 
2254
  for res in paddle_results:
2255
  # self.output_folder is already validated and normalized at construction time
 
2267
  os.makedirs(paddle_viz_folder, exist_ok=True)
2268
  res.save_to_img(paddle_viz_folder)
2269
 
2270
+ if self.ocr_engine == "hybrid-paddle-vlm":
2271
+
2272
+ paddle_results = self._perform_hybrid_paddle_vlm_ocr(
2273
+ image,
2274
+ ocr=ocr,
2275
+ paddle_results=paddle_results,
2276
+ image_name=image_name,
2277
+ input_image_width=original_image_width,
2278
+ input_image_height=original_image_height,
2279
+ )
2280
+
2281
+ # Debug: Check structure after hybrid processing
2282
+ if paddle_results:
2283
+ print(
2284
+ f"DEBUG: After hybrid, paddle_results length: {len(paddle_results)}"
2285
+ )
2286
+ if len(paddle_results) > 0 and isinstance(paddle_results[0], dict):
2287
+ rec_models = paddle_results[0].get("rec_models", [])
2288
+ vlm_count = sum(1 for m in rec_models if m == "VLM")
2289
+ print(
2290
+ f"DEBUG: After hybrid, first page has {vlm_count} VLM labels out of {len(rec_models)} total"
2291
+ )
2292
+
2293
  ocr_data = self._convert_paddle_to_tesseract_format(
2294
  paddle_results,
2295
  input_image_width=original_image_width,
2296
  input_image_height=original_image_height,
2297
  )
2298
 
2299
+ if SAVE_PAGE_OCR_VISUALISATIONS is True:
2300
+ # Save output to image with identified bounding boxes
2301
+ # Use original image since coordinates are in original image space
2302
+ # Prefer original_image_for_cropping (when PaddleOCR processed from file path),
2303
+ # otherwise use original_image_for_visualization (stored before preprocessing)
2304
+ viz_image = (
2305
+ original_image_for_cropping
2306
+ if original_image_for_cropping is not None
2307
+ else (
2308
+ original_image_for_visualization
2309
+ if original_image_for_visualization is not None
2310
+ else image
2311
+ )
2312
+ )
2313
+ if isinstance(viz_image, Image.Image):
2314
+ # Convert PIL Image to numpy array in BGR format for OpenCV
2315
+ image_cv = cv2.cvtColor(np.array(viz_image), cv2.COLOR_RGB2BGR)
2316
+ else:
2317
+ image_cv = np.array(viz_image)
2318
+ if len(image_cv.shape) == 2:
2319
+ image_cv = cv2.cvtColor(image_cv, cv2.COLOR_GRAY2BGR)
2320
+ elif len(image_cv.shape) == 3 and image_cv.shape[2] == 3:
2321
+ # Assume RGB, convert to BGR
2322
+ image_cv = cv2.cvtColor(image_cv, cv2.COLOR_RGB2BGR)
2323
+
2324
+ # Draw all bounding boxes on the image
2325
+ for i in range(len(ocr_data["text"])):
2326
+ left = int(ocr_data["left"][i])
2327
+ top = int(ocr_data["top"][i])
2328
+ width = int(ocr_data["width"][i])
2329
+ height = int(ocr_data["height"][i])
2330
+ # Ensure coordinates are within image bounds
2331
+ left = max(0, min(left, image_cv.shape[1] - 1))
2332
+ top = max(0, min(top, image_cv.shape[0] - 1))
2333
+ right = max(left + 1, min(left + width, image_cv.shape[1]))
2334
+ bottom = max(top + 1, min(top + height, image_cv.shape[0]))
2335
+ cv2.rectangle(
2336
+ image_cv, (left, top), (right, bottom), (0, 255, 0), 2
2337
+ )
2338
+
2339
+ # Save the visualization once with all boxes drawn
2340
+ paddle_viz_folder = os.path.join(
2341
+ self.output_folder, "paddle_visualisations"
2342
+ )
2343
+ # Double-check the constructed path is safe
2344
+ if not validate_folder_containment(paddle_viz_folder, OUTPUT_FOLDER):
2345
+ raise ValueError(
2346
+ f"Unsafe paddle visualisations folder path: {paddle_viz_folder}"
2347
+ )
2348
+
2349
+ os.makedirs(paddle_viz_folder, exist_ok=True)
2350
+
2351
+ # Generate safe filename
2352
+ if image_name:
2353
+ base_name = os.path.splitext(os.path.basename(image_name))[0]
2354
+ # Sanitize filename to avoid issues with special characters
2355
+ base_name = safe_sanitize_text(base_name, max_length=50)
2356
+ filename = f"{base_name}_ocr_visualisation.jpg"
2357
+ else:
2358
+ timestamp = int(time.time())
2359
+ filename = f"ocr_visualisation_{timestamp}.jpg"
2360
+
2361
+ output_path = os.path.join(paddle_viz_folder, filename)
2362
+ cv2.imwrite(output_path, image_cv)
2363
+ print(f"OCR visualization saved to: {output_path}")
2364
+
2365
  else:
2366
  raise RuntimeError(f"Unsupported OCR engine: {self.ocr_engine}")
2367
 
2368
+ # Always check for scale_factor, even if preprocessing_metadata is empty
2369
+ # This ensures rescaling happens correctly when preprocessing was applied
2370
+ scale_factor = (
2371
+ preprocessing_metadata.get("scale_factor", 1.0)
2372
+ if preprocessing_metadata
2373
+ else 1.0
2374
+ )
2375
+ if scale_factor != 1.0:
2376
+ # Skip rescaling for PaddleOCR since _convert_paddle_to_tesseract_format
2377
+ # already scales coordinates directly to original image dimensions
2378
+ # hybrid-paddle-vlm also uses PaddleOCR and converts to original space
2379
+ if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid-paddle-vlm":
2380
+ pass
2381
+ # print(f"Skipping rescale_ocr_data for PaddleOCR (already scaled to original dimensions)")
2382
+ else:
2383
+ print("rescaling ocr_data with scale_factor: ", scale_factor)
2384
+ ocr_data = rescale_ocr_data(ocr_data, scale_factor)
2385
+
2386
  # Convert line-level results to word-level if configured and needed
2387
  if CONVERT_LINE_TO_WORD_LEVEL and self._is_line_level_data(ocr_data):
2388
  print("Converting line-level OCR results to word-level...")
2389
+ # Check if coordinates need to be scaled to match the image we're cropping from
2390
+ # For PaddleOCR: _convert_paddle_to_tesseract_format converts coordinates to original image space
2391
+ # - If PaddleOCR processed the original image (image_path provided), crop from original image (no scaling)
2392
+ # - If PaddleOCR processed the preprocessed image (no image_path), scale coordinates to preprocessed space and crop from preprocessed image
2393
  # For Tesseract: OCR runs on preprocessed image, so coordinates are already in preprocessed space,
2394
  # matching the preprocessed image we're cropping from - no scaling needed
2395
+
2396
  needs_scaling = False
2397
+ crop_image = image # Default to preprocessed image
2398
+ crop_image_width = image_width
2399
+ crop_image_height = image_height
2400
+
2401
  if (
2402
  PREPROCESS_LOCAL_OCR_IMAGES
2403
  and original_image_width
 
2409
  ):
2410
  # PaddleOCR coordinates are converted to original space by _convert_paddle_to_tesseract_format
2411
  # hybrid-paddle-vlm also uses PaddleOCR and converts to original space
2412
+ if paddle_processed_original:
2413
+ # PaddleOCR processed the original image, so crop from original image
2414
+ # No scaling needed - coordinates are already in original space
2415
+ crop_image = original_image_for_cropping
2416
+ crop_image_width = original_image_width
2417
+ crop_image_height = original_image_height
2418
+ needs_scaling = False
2419
+ print(
2420
+ f"PaddleOCR processed original image. Cropping from original ({original_image_width}x{original_image_height}) without scaling."
2421
+ )
2422
+ else:
2423
+ # PaddleOCR processed the preprocessed image, so scale coordinates to preprocessed space
2424
+ needs_scaling = True
2425
 
2426
  if needs_scaling:
2427
  # Calculate scale factors from original to preprocessed
 
2439
  "width": [w * scale_x for w in ocr_data["width"]],
2440
  "height": [h * scale_y for h in ocr_data["height"]],
2441
  "conf": ocr_data["conf"],
2442
+ "model": ocr_data["model"],
2443
  }
2444
  ocr_data = self._convert_line_to_word_level(
2445
  scaled_ocr_data,
2446
+ crop_image_width,
2447
+ crop_image_height,
2448
+ crop_image,
2449
  image_name=image_name,
2450
  )
2451
  # Scale word-level results back to original image space
 
2457
  ocr_data["width"][i] = ocr_data["width"][i] * scale_factor_x
2458
  ocr_data["height"][i] = ocr_data["height"][i] * scale_factor_y
2459
  else:
2460
+ # No scaling needed - coordinates match the crop image space
2461
  ocr_data = self._convert_line_to_word_level(
2462
+ ocr_data,
2463
+ crop_image_width,
2464
+ crop_image_height,
2465
+ crop_image,
2466
+ image_name=image_name,
2467
  )
2468

2469
  # The rest of your processing pipeline now works for both engines
2470
  ocr_result = ocr_data
2471
 
 
2477
  ]
2478
 
2479
  # Determine default model based on OCR engine if model field is not present
2480
+ if "model" in ocr_result:
 
 
2481
  # Model field exists and has correct length - use it
2482
  def get_model(idx):
2483
  return ocr_result["model"][idx]
 
2491
  "Paddle"
2492
  if self.ocr_engine == "paddle"
2493
  else (
2494
+ "Tesseract"
2495
  if self.ocr_engine == "hybrid-paddle"
2496
  else (
2497
+ "Tesseract"
2498
  if self.ocr_engine == "hybrid-vlm"
2499
  else (
2500
+ "Paddle"
2501
  if self.ocr_engine == "hybrid-paddle-vlm"
2502
  else None
2503
  )
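The coordinate handling above ultimately amounts to moving a Tesseract-style `ocr_data` dictionary between two image coordinate spaces (preprocessed versus original). A small sketch of that transform, assuming the dictionary layout used here (`left`/`top`/`width`/`height` lists of equal length); `scale_ocr_data` is an illustrative helper, not the module's own `rescale_ocr_data`:

```python
from typing import Dict, List, Tuple

def scale_ocr_data(
    ocr_data: Dict[str, List],
    from_size: Tuple[int, int],   # (width, height) the coordinates are currently in
    to_size: Tuple[int, int],     # (width, height) we want them in
) -> Dict[str, List]:
    """Return a copy of ocr_data with box coordinates mapped into to_size space."""
    sx = to_size[0] / from_size[0]
    sy = to_size[1] / from_size[1]
    scaled = dict(ocr_data)  # shallow copy; text/conf/model lists are shared
    scaled["left"] = [x * sx for x in ocr_data["left"]]
    scaled["top"] = [y * sy for y in ocr_data["top"]]
    scaled["width"] = [w * sx for w in ocr_data["width"]]
    scaled["height"] = [h * sy for h in ocr_data["height"]]
    return scaled

# Example: word boxes computed on a 2x upscaled preprocessed page, mapped back to the original size
original_space = scale_ocr_data(
    {"text": ["hello"], "left": [200.0], "top": [100.0], "width": [80.0], "height": [30.0], "conf": [91]},
    from_size=(2480, 3508),
    to_size=(1240, 1754),
)
print(original_space["left"], original_space["width"])
```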
tools/file_redaction.py CHANGED
@@ -61,8 +61,7 @@ from tools.config import (
61
  RETURN_PDF_FOR_REVIEW,
62
  RETURN_REDACTED_PDF,
63
  RUN_AWS_FUNCTIONS,
64
- SAVE_TESSERACT_VISUALISATIONS,
65
- SAVE_TEXTRACT_VISUALISATIONS,
66
  SELECTABLE_TEXT_EXTRACT_OPTION,
67
  TESSERACT_TEXT_EXTRACT_OPTION,
68
  TEXTRACT_TEXT_EXTRACT_OPTION,
@@ -3493,7 +3492,7 @@ def redact_image_pdf(
3493
 
3494
  if not textract_data:
3495
  try:
3496
- print(f"Image object: {image}")
3497
  # Convert the image_path to bytes using an in-memory buffer
3498
  image_buffer = io.BytesIO()
3499
  image.save(
@@ -3658,10 +3657,10 @@ def redact_image_pdf(
3658
  # Save OCR visualization with bounding boxes (works for all OCR methods)
3659
  if (
3660
  text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
3661
- and SAVE_TEXTRACT_VISUALISATIONS is True
3662
  ) or (
3663
  text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION
3664
- and SAVE_TESSERACT_VISUALISATIONS is True
3665
  ):
3666
  if (
3667
  page_line_level_ocr_results_with_words
@@ -5189,7 +5188,11 @@ def visualise_ocr_words_bounding_boxes(
5189
 
5190
  words = line_data.get("words", [])
5191
 
5192
- # Process each word in the line
 
 
 
 
5193
  for word_data in words:
5194
  if not isinstance(word_data, dict):
5195
  continue
@@ -5219,73 +5222,223 @@ def visualise_ocr_words_bounding_boxes(
5219
  if x2 <= x1 or y2 <= y1:
5220
  continue
5221
 
5222
- # Check if word was replaced by a different model (for reference, but text color always uses confidence)
5223
- model = word_data.get("model", None)
5224
- is_replaced = model and model.lower() != base_model_name.lower()
 
 
5225
 
5226
- # Text color always based on confidence (not affected by model replacement)
5227
- text_color = (0, 0, 180) # Default to dark red
5228
- for min_conf, max_conf, conf_color, _ in text_confidence_ranges:
5229
- if min_conf <= conf <= max_conf:
5230
- text_color = conf_color
5231
- break
5232
 
5233
- # Calculate font size to fit text within bounding box
5234
  box_width = x2 - x1
5235
  box_height = y2 - y1
5236
 
5237
- # Start with a reasonable font scale
5238
- font_scale = 0.5
5239
- font_thickness = 1
5240
- font = cv2.FONT_HERSHEY_SIMPLEX
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5241
 
5242
- # Get text size and adjust to fit
5243
- (text_width, text_height), baseline = cv2.getTextSize(
5244
- text, font, font_scale, font_thickness
5245
- )
5246
 
5247
- # Scale font to fit width (with some padding)
5248
- if text_width > 0:
5249
- width_scale = (box_width * 0.9) / text_width
5250
- else:
5251
- width_scale = 1.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5252
 
5253
- # Scale font to fit height (with some padding)
5254
- if text_height > 0:
5255
- height_scale = (box_height * 0.8) / text_height
5256
  else:
5257
- height_scale = 1.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5258
 
5259
- # Use the smaller scale to ensure text fits both dimensions
5260
- font_scale = min(
5261
- font_scale * min(width_scale, height_scale), 2.0
5262
- ) # Cap at 2.0
5263
 
5264
- # Recalculate text size with adjusted font scale
5265
- (text_width, text_height), baseline = cv2.getTextSize(
5266
- text, font, font_scale, font_thickness
5267
- )
 
 
5268
 
5269
- # Center text within bounding box
5270
- text_x = x1 + (box_width - text_width) // 2
5271
- text_y = y1 + (box_height + text_height) // 2 # Baseline adjustment
5272
-
5273
- # Draw text
5274
- cv2.putText(
5275
- text_page,
5276
- text,
5277
- (text_x, text_y),
5278
- font,
5279
- font_scale,
5280
- text_color,
5281
- font_thickness,
5282
- cv2.LINE_AA,
5283
- )
5284
 
5285
- # Draw grey bounding box for replaced words on text page
5286
- if is_replaced:
5287
- box_color = (128, 128, 128) # Grey for model replacements
5288
- cv2.rectangle(text_page, (x1, y1), (x2, y2), box_color, 1)
5289
 
5290
  # Add legend to second page
5291
  if add_legend:
 
61
  RETURN_PDF_FOR_REVIEW,
62
  RETURN_REDACTED_PDF,
63
  RUN_AWS_FUNCTIONS,
64
+ SAVE_PAGE_OCR_VISUALISATIONS,
 
65
  SELECTABLE_TEXT_EXTRACT_OPTION,
66
  TESSERACT_TEXT_EXTRACT_OPTION,
67
  TEXTRACT_TEXT_EXTRACT_OPTION,
 
3492
 
3493
  if not textract_data:
3494
  try:
3495
+ # print(f"Image object: {image}")
3496
  # Convert the image_path to bytes using an in-memory buffer
3497
  image_buffer = io.BytesIO()
3498
  image.save(
 
3657
  # Save OCR visualization with bounding boxes (works for all OCR methods)
3658
  if (
3659
  text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
3660
+ and SAVE_PAGE_OCR_VISUALISATIONS is True
3661
  ) or (
3662
  text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION
3663
+ and SAVE_PAGE_OCR_VISUALISATIONS is True
3664
  ):
3665
  if (
3666
  page_line_level_ocr_results_with_words
 
5188
 
5189
  words = line_data.get("words", [])
5190
 
5191
+ # Group words by bounding box (to handle cases where multiple words share the same box)
5192
+ # Use a small tolerance to consider boxes as "the same" if they're very close
5193
+ bbox_tolerance = 5 # pixels
5194
+ bbox_groups = {} # Maps (x1, y1, x2, y2) to list of word_data
5195
+
5196
  for word_data in words:
5197
  if not isinstance(word_data, dict):
5198
  continue
 
5222
  if x2 <= x1 or y2 <= y1:
5223
  continue
5224
 
5225
+ # Round coordinates to nearest tolerance to group similar boxes
5226
+ x1_rounded = (x1 // bbox_tolerance) * bbox_tolerance
5227
+ y1_rounded = (y1 // bbox_tolerance) * bbox_tolerance
5228
+ x2_rounded = (x2 // bbox_tolerance) * bbox_tolerance
5229
+ y2_rounded = (y2 // bbox_tolerance) * bbox_tolerance
5230
 
5231
+ bbox_key = (x1_rounded, y1_rounded, x2_rounded, y2_rounded)
 
 
 
 
 
5232
 
5233
+ if bbox_key not in bbox_groups:
5234
+ bbox_groups[bbox_key] = []
5235
+ bbox_groups[bbox_key].append(
5236
+ {"word_data": word_data, "original_bbox": (x1, y1, x2, y2)}
5237
+ )
5238
+
5239
+ # Process each group of words
5240
+ for bbox_key, word_group in bbox_groups.items():
5241
+ if not word_group:
5242
+ continue
5243
+
5244
+ # Use the first word's bounding box as the reference (they should all be similar)
5245
+ x1, y1, x2, y2 = word_group[0]["original_bbox"]
5246
  box_width = x2 - x1
5247
  box_height = y2 - y1
5248
 
5249
+ # If only one word in the box, process it normally
5250
+ if len(word_group) == 1:
5251
+ word_data = word_group[0]["word_data"]
5252
+ text = word_data.get("text", "")
5253
+ conf = int(word_data.get("conf", word_data.get("confidence", 0)))
5254
+
5255
+ # Check if word was replaced by a different model
5256
+ model = word_data.get("model", None)
5257
+ is_replaced = model and model.lower() != base_model_name.lower()
5258
+
5259
+ # Text color always based on confidence
5260
+ text_color = (0, 0, 180) # Default to dark red
5261
+ for min_conf, max_conf, conf_color, _ in text_confidence_ranges:
5262
+ if min_conf <= conf <= max_conf:
5263
+ text_color = conf_color
5264
+ break
5265
+
5266
+ # Calculate font size to fit text within bounding box
5267
+ font_scale = 0.5
5268
+ font_thickness = 1
5269
+ font = cv2.FONT_HERSHEY_SIMPLEX
5270
 
5271
+ # Get text size and adjust to fit
5272
+ (text_width, text_height), baseline = cv2.getTextSize(
5273
+ text, font, font_scale, font_thickness
5274
+ )
5275
 
5276
+ # Scale font to fit width (with some padding)
5277
+ if text_width > 0:
5278
+ width_scale = (box_width * 0.9) / text_width
5279
+ else:
5280
+ width_scale = 1.0
5281
+
5282
+ # Scale font to fit height (with some padding)
5283
+ if text_height > 0:
5284
+ height_scale = (box_height * 0.8) / text_height
5285
+ else:
5286
+ height_scale = 1.0
5287
+
5288
+ # Use the smaller scale to ensure text fits both dimensions
5289
+ font_scale = min(
5290
+ font_scale * min(width_scale, height_scale), 2.0
5291
+ ) # Cap at 2.0
5292
+
5293
+ # Recalculate text size with adjusted font scale
5294
+ (text_width, text_height), baseline = cv2.getTextSize(
5295
+ text, font, font_scale, font_thickness
5296
+ )
5297
+
5298
+ # Center text within bounding box
5299
+ text_x = x1 + (box_width - text_width) // 2
5300
+ text_y = y1 + (box_height + text_height) // 2 # Baseline adjustment
5301
+
5302
+ # Draw text
5303
+ cv2.putText(
5304
+ text_page,
5305
+ text,
5306
+ (text_x, text_y),
5307
+ font,
5308
+ font_scale,
5309
+ text_color,
5310
+ font_thickness,
5311
+ cv2.LINE_AA,
5312
+ )
5313
+
5314
+ # Draw grey bounding box for replaced words on text page
5315
+ if is_replaced:
5316
+ box_color = (128, 128, 128) # Grey for model replacements
5317
+ cv2.rectangle(text_page, (x1, y1), (x2, y2), box_color, 1)
5318
 
 
 
 
5319
  else:
5320
+ # Multiple words in the same box - arrange them side by side
5321
+ # Extract texts and determine colors for each word
5322
+ word_texts = []
5323
+ word_colors = []
5324
+ word_is_replaced = []
5325
+
5326
+ for item in word_group:
5327
+ word_data = item["word_data"]
5328
+ text = word_data.get("text", "")
5329
+ conf = int(word_data.get("conf", word_data.get("confidence", 0)))
5330
+ model = word_data.get("model", None)
5331
+ is_replaced = model and model.lower() != base_model_name.lower()
5332
+
5333
+ # Text color based on confidence
5334
+ text_color = (0, 0, 180) # Default to dark red
5335
+ for min_conf, max_conf, conf_color, _ in text_confidence_ranges:
5336
+ if min_conf <= conf <= max_conf:
5337
+ text_color = conf_color
5338
+ break
5339
 
5340
+ word_texts.append(text)
5341
+ word_colors.append(text_color)
5342
+ word_is_replaced.append(is_replaced)
5343
+
5344
+ # Calculate font size to fit all words side by side
5345
+ font_scale = 0.5
5346
+ font_thickness = 1
5347
+ font = cv2.FONT_HERSHEY_SIMPLEX
5348
+
5349
+ # Start with a reasonable font scale and reduce if needed
5350
+ max_font_scale = 2.0
5351
+ min_font_scale = 0.1
5352
+ font_scale = max_font_scale
5353
+
5354
+ # Iteratively shrink the font scale until the combined text fits the box
5355
+ for _ in range(20): # Max iterations
5356
+ # Calculate total width needed for all words with spaces
5357
+ total_width = 0
5358
+ max_text_height = 0
5359
+
5360
+ for i, text in enumerate(word_texts):
5361
+ (text_width, text_height), baseline = cv2.getTextSize(
5362
+ text, font, font_scale, font_thickness
5363
+ )
5364
+ total_width += text_width
5365
+ max_text_height = max(max_text_height, text_height)
5366
 
5367
+ # Add space width between words (except last word)
5368
+ if i < len(word_texts) - 1:
5369
+ (space_width, _), _ = cv2.getTextSize(
5370
+ " ", font, font_scale, font_thickness
5371
+ )
5372
+ total_width += space_width
5373
 
5374
+ # Check if it fits
5375
+ width_fits = total_width <= box_width * 0.9
5376
+ height_fits = max_text_height <= box_height * 0.8
5377
+
5378
+ if width_fits and height_fits:
5379
+ break
5380
+
5381
+ # Reduce font scale
5382
+ font_scale *= 0.9
5383
+ if font_scale < min_font_scale:
5384
+ font_scale = min_font_scale
5385
+ break
5386
+
5387
+ # Recalculate total width and max height with final font scale
5388
+ total_width = 0
5389
+ max_text_height = 0
5390
+ for i, text in enumerate(word_texts):
5391
+ (text_width, text_height), baseline = cv2.getTextSize(
5392
+ text, font, font_scale, font_thickness
5393
+ )
5394
+ total_width += text_width
5395
+ max_text_height = max(max_text_height, text_height)
5396
+
5397
+ # Add space width between words (except last word)
5398
+ if i < len(word_texts) - 1:
5399
+ (space_width, _), _ = cv2.getTextSize(
5400
+ " ", font, font_scale, font_thickness
5401
+ )
5402
+ total_width += space_width
5403
+
5404
+ # Now draw each word side by side
5405
+ current_x = (
5406
+ x1 + (box_width - total_width) // 2
5407
+ ) # Center the combined text
5408
+ text_y = y1 + (box_height + max_text_height) // 2 # Baseline adjustment
5409
+
5410
+ for i, (text, text_color) in enumerate(zip(word_texts, word_colors)):
5411
+ # Get text size with final font scale
5412
+ (text_width, text_height), baseline = cv2.getTextSize(
5413
+ text, font, font_scale, font_thickness
5414
+ )
5415
+
5416
+ # Draw text
5417
+ cv2.putText(
5418
+ text_page,
5419
+ text,
5420
+ (int(current_x), text_y),
5421
+ font,
5422
+ font_scale,
5423
+ text_color,
5424
+ font_thickness,
5425
+ cv2.LINE_AA,
5426
+ )
5427
+
5428
+ # Move to next position
5429
+ current_x += text_width
5430
+
5431
+ # Add space between words (except last word)
5432
+ if i < len(word_texts) - 1:
5433
+ (space_width, _), _ = cv2.getTextSize(
5434
+ " ", font, font_scale, font_thickness
5435
+ )
5436
+ current_x += space_width
5437
 
5438
+ # Draw grey bounding box if any word was replaced
5439
+ if any(word_is_replaced):
5440
+ box_color = (128, 128, 128) # Grey for model replacements
5441
+ cv2.rectangle(text_page, (x1, y1), (x2, y2), box_color, 1)
5442
 
5443
  # Add legend to second page
5444
  if add_legend:
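The visualisation rewrite above groups words whose boxes are effectively identical by snapping coordinates to a small pixel tolerance and keying a dictionary on the snapped tuple, then draws each group's words side by side. A stripped-down sketch of the grouping step (the 5 px tolerance mirrors the value used above; the helper name and sample data are illustrative):

```python
from collections import defaultdict
from typing import Dict, List, Tuple

def group_boxes(words: List[dict], tolerance: int = 5) -> Dict[Tuple[int, int, int, int], List[dict]]:
    """Group word dicts whose (x1, y1, x2, y2) boxes fall within the same tolerance cell."""
    groups = defaultdict(list)
    for word in words:
        x1, y1, x2, y2 = word["bbox"]
        key = tuple((int(v) // tolerance) * tolerance for v in (x1, y1, x2, y2))
        groups[key].append(word)
    return dict(groups)

words = [
    {"text": "invoice", "bbox": (100, 40, 160, 60)},
    {"text": "total", "bbox": (102, 41, 161, 62)},   # lands in the same 5 px cell as the first word
    {"text": "date", "bbox": (300, 40, 390, 60)},
]
for key, group in group_boxes(words).items():
    print(key, [w["text"] for w in group])
```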
tools/run_vlm.py CHANGED
@@ -221,7 +221,7 @@ if SHOW_VLM_MODEL_OPTIONS is True:
221
 
222
 
223
  @spaces.GPU(duration=MAX_SPACES_GPU_RUN_TIME)
224
- def generate_image(
225
  text: str,
226
  image: Image.Image,
227
  max_new_tokens: int = None,
 
221
 
222
 
223
  @spaces.GPU(duration=MAX_SPACES_GPU_RUN_TIME)
224
+ def extract_text_from_image_vlm(
225
  text: str,
226
  image: Image.Image,
227
  max_new_tokens: int = None,
tools/secure_regex_utils.py CHANGED
@@ -267,14 +267,14 @@ def safe_extract_latest_number_from_filename(filename: str) -> Optional[int]:
267
  return None
268
 
269
 
270
- def safe_sanitize_text(text: str, replacement: str = "_") -> str:
271
  """
272
  Safely sanitize text by removing dangerous characters without ReDoS vulnerability.
273
 
274
  Args:
275
  text: The text to sanitize
276
  replacement: Character to replace dangerous characters with
277
-
278
  Returns:
279
  Sanitized text
280
  """
@@ -291,4 +291,7 @@ def safe_sanitize_text(text: str, replacement: str = "_") -> str:
291
  # Remove leading/trailing replacements
292
  sanitized = sanitized.strip(replacement)
293
 
 
 
 
294
  return sanitized
 
267
  return None
268
 
269
 
270
+ def safe_sanitize_text(text: str, replacement: str = "_", max_length: int = 255) -> str:
271
  """
272
  Safely sanitize text by removing dangerous characters without ReDoS vulnerability.
273
 
274
  Args:
275
  text: The text to sanitize
276
  replacement: Character to replace dangerous characters with
277
+ max_length: Maximum length of the text
278
  Returns:
279
  Sanitized text
280
  """
 
291
  # Remove leading/trailing replacements
292
  sanitized = sanitized.strip(replacement)
293
 
294
+ # Truncate to maximum length
295
+ sanitized = sanitized[:max_length]
296
+
297
  return sanitized
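Because `safe_sanitize_text` now truncates as well as sanitises, callers that build filenames from OCR text get a bounded length for free. A hedged usage sketch (the standalone fallback below is illustrative only, not the project's implementation, and the folder names are assumptions):

```python
import os

def sanitize_for_filename(text: str, replacement: str = "_", max_length: int = 50) -> str:
    # Minimal stand-in for safe_sanitize_text: keep alphanumerics, replace the rest, then truncate.
    cleaned = "".join(ch if ch.isalnum() else replacement for ch in text)
    return cleaned.strip(replacement)[:max_length]

line_text = "Total due: £1,234.56 (see attached invoice)"
filename = f"{sanitize_for_filename(line_text, max_length=20)}_vlm_input_image.png"
print(os.path.join("output", "vlm_input_images", filename))
```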
tools/word_segmenter.py CHANGED
@@ -82,14 +82,14 @@ class AdaptiveSegmenter:
82
 
83
  orientation_angle = 0.0
84
  if box_height > box_width:
85
- print(
86
- f"Detected vertical orientation (W:{box_width} < H:{box_height}). Applying 90-degree correction."
87
- )
88
  orientation_angle = 90.0
89
  else:
90
- print(
91
- f"Detected horizontal orientation (W:{box_width} >= H:{box_height}). No orientation correction."
92
- )
93
  M_orient = cv2.getRotationMatrix2D(center, 0, 1.0)
94
  return gray_image, M_orient
95
 
@@ -251,29 +251,29 @@ class AdaptiveSegmenter:
251
  ) -> Tuple[Dict[str, List], bool]:
252
 
253
  if line_image is None:
254
- print(
255
- f"Error: line_image is None in segment function (image_name: {image_name})"
256
- )
257
  return ({}, False)
258
 
259
  # Validate line_image is a valid numpy array
260
  if not isinstance(line_image, np.ndarray):
261
- print(
262
- f"Error: line_image is not a numpy array (type: {type(line_image)}, image_name: {image_name})"
263
- )
264
  return ({}, False)
265
 
266
  # Validate line_image has valid shape and size
267
  if line_image.size == 0:
268
- print(
269
- f"Error: line_image is empty (shape: {line_image.shape}, image_name: {image_name})"
270
- )
271
  return ({}, False)
272
 
273
  if len(line_image.shape) < 2:
274
- print(
275
- f"Error: line_image has invalid shape {line_image.shape} (image_name: {image_name})"
276
- )
277
  return ({}, False)
278
 
279
  # Early return if 1 or fewer words
@@ -283,20 +283,20 @@ class AdaptiveSegmenter:
283
  if len(words) <= 1:
284
  return ({}, False)
285
  else:
286
- print(
287
- f"Error: line_data is empty or does not contain text (image_name: {image_name})"
288
- )
289
  return ({}, False)
290
 
291
- print(f"line_text: {line_text}")
292
  shortened_line_text = line_text.replace(" ", "_")[:10]
293
 
294
  if SHOW_OUTPUT_IMAGES:
295
  os.makedirs(self.output_folder, exist_ok=True)
296
- output_path = f"{self.output_folder}/paddle_visualisations/{image_name}_{shortened_line_text}_original.png"
297
- os.makedirs(f"{self.output_folder}/paddle_visualisations", exist_ok=True)
298
  cv2.imwrite(output_path, line_image)
299
- print(f"\nSaved original image to '{output_path}'")
300
 
301
  gray = cv2.cvtColor(line_image, cv2.COLOR_BGR2GRAY)
302
 
@@ -334,11 +334,22 @@ class AdaptiveSegmenter:
334
  borderMode=cv2.BORDER_REPLICATE,
335
  )
336

337
  # Save deskewed image (optional, only if image_name is provided)
338
  if SHOW_OUTPUT_IMAGES:
339
  os.makedirs(self.output_folder, exist_ok=True)
340
- output_path = f"{self.output_folder}/paddle_visualisations/{image_name}_{shortened_line_text}_deskewed.png"
341
- os.makedirs(f"{self.output_folder}/paddle_visualisations", exist_ok=True)
342
  cv2.imwrite(output_path, deskewed_line_image)
343
  # print(f"\nSaved deskewed image to '{output_path}'")
344
 
@@ -351,6 +362,29 @@ class AdaptiveSegmenter:
351
  block_size = int(avg_char_width_approx * BLOCK_SIZE_FACTOR)
352
  if block_size % 2 == 0:
353
block_size += 1
354
  binary = cv2.adaptiveThreshold(
355
  deskewed_gray,
356
  255,
@@ -360,11 +394,18 @@ class AdaptiveSegmenter:
360
  C_VALUE,
361
  )
362

363
  # Save cropped image (optional, only if image_name is provided)
364
  if SHOW_OUTPUT_IMAGES:
365
  os.makedirs(self.output_folder, exist_ok=True)
366
- output_path = f"{self.output_folder}/paddle_visualisations/{image_name}_{shortened_line_text}_binary.png"
367
- os.makedirs(f"{self.output_folder}/paddle_visualisations", exist_ok=True)
368
  cv2.imwrite(output_path, binary)
369
  # print(f"\nSaved cropped image to '{output_path}'")
370
 
@@ -380,14 +421,25 @@ class AdaptiveSegmenter:
380
  # It's a dilation followed by an erosion
381
  closed_binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel, iterations=1)
382
 
 
 
 
 
 
 
 
 
 
 
 
383
  # (Optional) You could also use a DILATE to make letters thicker
384
  # dilated_binary = cv2.dilate(closed_binary, kernel, iterations=1)
385
  # Use 'closed_binary' (or 'dilated_binary') from now on.
386
 
387
  if SHOW_OUTPUT_IMAGES:
388
  os.makedirs(self.output_folder, exist_ok=True)
389
- output_path = f"{self.output_folder}/paddle_visualisations/{image_name}_{shortened_line_text}_closed_binary.png"
390
- os.makedirs(f"{self.output_folder}/paddle_visualisations", exist_ok=True)
391
  cv2.imwrite(output_path, closed_binary)
392
  # print(f"\nSaved dilated binary image to '{output_path}'")
393
 
@@ -405,7 +457,7 @@ class AdaptiveSegmenter:
405
  # Handle edge case of empty 'areas' array
406
  if len(areas) == 0:
407
  clean_binary = binary
408
- print("Warning: No components found after binarization.")
409
  areas = np.array([0]) # Add a dummy value to prevent crashes
410
 
411
  # --- 1. Calculate the DEFAULT CONSERVATIVE threshold ---
@@ -449,18 +501,18 @@ class AdaptiveSegmenter:
449
 
450
  # --- 3. ADAPTIVE DECISION: Override if conservative threshold is clearly noise ---
451
  if has_clear_gap:
452
- print(
453
- f"Noise Removal: Gap detected. Noise cluster ends at {area_before_gap}px. Aggressive threshold = {aggressive_threshold:.1f}"
454
- )
455
 
456
  # Only use a more aggressive threshold IF our "safe" threshold is clearly
457
  # stuck *inside* the noise cluster.
458
  # e.g., Safe threshold = 1, but noise goes up to 10.
459
  # (We use 0.8 as a buffer, so if thresh=7 and gap=8, we don't switch)
460
  if area_threshold < (area_before_gap * 0.8):
461
- print(
462
- f"Noise Removal: Conservative threshold ({area_threshold:.1f}) is deep in noise cluster (ends at {area_before_gap}px)."
463
- )
464
 
465
  # Instead of using large percentage increases, use a very small absolute increment
466
  # This preserves legitimate small letters/words that might be just above the noise
@@ -498,18 +550,18 @@ class AdaptiveSegmenter:
498
  # Cap at 15 pixels as absolute upper bound
499
  final_threshold = min(final_threshold, 15)
500
 
501
- print(
502
- f"Noise Removal: Using MODERATE threshold: {final_threshold:.1f} (noise ends at {area_before_gap}px, increment: {small_increment}px)"
503
- )
504
  area_threshold = final_threshold
505
  else:
506
- print(
507
- f"Noise Removal: Gap found, but conservative threshold ({area_threshold:.1f}) is sufficient. Sticking with conservative."
508
- )
509
  pass
510
 
511
  # --- 4. Apply the final, determined threshold ---
512
- print(f"Noise Removal: Final area threshold: {area_threshold:.1f}")
513
  for i in range(1, num_labels):
514
  # Use >= to be inclusive of the threshold itself
515
  if stats[i, cv2.CC_STAT_AREA] >= area_threshold:
@@ -518,6 +570,17 @@ class AdaptiveSegmenter:
518
  # No components found, or only background
519
  clean_binary = binary
520

  # Calculate the horizontal projection profile on the cleaned image
522
  horizontal_projection = np.sum(clean_binary, axis=1)
523
 
@@ -539,9 +602,9 @@ class AdaptiveSegmenter:
539
 
540
  # Ensure the crop is valid
541
  if y_start < y_end:
542
- print(
543
- f"Original text height: {text_height}px. Cropping to middle {100 - (2*trim_percentage*100):.0f}% region."
544
- )
545
  # Slice the image to get the vertically cropped ROI
546
  analysis_image = clean_binary[y_start:y_end, :]
547
  else:
@@ -551,18 +614,29 @@ class AdaptiveSegmenter:
551
  # If no text is found, use the original cleaned image
552
  analysis_image = clean_binary
553
 
 
 
 
 
 
 
 
 
 
 
 
554
  # --- Step 3: Hierarchical Adaptive Search (using the new clean_binary) ---
555
  # The rest of the pipeline is identical but now operates on a superior image.
556
  words = line_data["text"][0].split()
557
  target_word_count = len(words)
558
 
559
- print(f"Target word count: {target_word_count}")
560
 
561
  # Save cropped image (optional, only if image_name is provided)
562
  if SHOW_OUTPUT_IMAGES:
563
  os.makedirs(self.output_folder, exist_ok=True)
564
- output_path = f"{self.output_folder}/paddle_visualisations/{image_name}_{shortened_line_text}_clean_binary.png"
565
- os.makedirs(f"{self.output_folder}/paddle_visualisations", exist_ok=True)
566
  cv2.imwrite(output_path, analysis_image)
567
  # print(f"\nSaved cropped image to '{output_path}'")
568
 
@@ -574,7 +648,7 @@ class AdaptiveSegmenter:
574
  target_word_count = len(words)
575
  stage1_succeeded = False
576
 
577
- print("--- Stage 1: Searching with adaptive valley threshold ---")
578
  valley_factors_to_try = np.arange(INITIAL_VALLEY_THRESHOLD_FACTOR, 0.45, 0.05)
579
  for v_factor in valley_factors_to_try:
580
  # Pass the cropped image to the helper
@@ -589,9 +663,9 @@ class AdaptiveSegmenter:
589
  break
590
 
591
  if not stage1_succeeded:
592
- print(
593
- "\n--- Stage 1 failed. Starting Stage 2: Searching with adaptive kernel ---"
594
- )
595
  kernel_factors_to_try = np.arange(INITIAL_KERNEL_WIDTH_FACTOR, 0.5, 0.05)
596
  fixed_valley_factor = MAIN_VALLEY_THRESHOLD_FACTOR
597
  for k_factor in kernel_factors_to_try:
@@ -601,12 +675,34 @@ class AdaptiveSegmenter:
601
  closed_binary = cv2.morphologyEx(
602
  clean_binary, cv2.MORPH_CLOSE, closing_kernel
603
)
604
  # We need to re-apply the same vertical crop to this new image
605
  if len(non_zero_rows) > 0 and y_start < y_end:
606
  analysis_image = closed_binary[y_start:y_end, :]
607
  else:
608
  analysis_image = closed_binary
609

610
  unlabeled_boxes = self._get_boxes_from_profile(
611
  analysis_image,
612
  avg_char_width_approx,
@@ -614,11 +710,11 @@ class AdaptiveSegmenter:
614
  fixed_valley_factor,
615
  )
616
 
617
- print(
618
- f"Testing kernel factor {k_factor:.2f} ({kernel_width}px): Found {len(unlabeled_boxes)} boxes."
619
- )
620
  if abs(target_word_count - len(unlabeled_boxes)) <= match_tolerance:
621
- print("SUCCESS (Stage 2): Found a match.")
622
  best_boxes = unlabeled_boxes
623
  successful_binary_image = (
624
  closed_binary # For Stage 2, the source is the closed_binary
@@ -629,7 +725,7 @@ class AdaptiveSegmenter:
629
  used_fallback = False
630
 
631
  if best_boxes is None:
632
- print("\nWarning: All adaptive searches failed. Falling back.")
633
  fallback_segmenter = HybridWordSegmenter()
634
  used_fallback = True
635
  final_output = fallback_segmenter.refine_words_bidirectional(
@@ -803,22 +899,35 @@ class AdaptiveSegmenter:
803
 
804
  # Visualisation
805
  if SHOW_OUTPUT_IMAGES:
806
- output_path = f"{self.output_folder}/paddle_visualisations/{image_name}_{shortened_line_text}_final_boxes.png"
807
- os.makedirs(f"{self.output_folder}/paddle_visualisations", exist_ok=True)
808
  output_image_vis = line_image.copy()
809
- print(f"\nFinal refined {len(remapped_output['text'])} words:")
810
- for i in range(len(remapped_output["text"])):
811
- word = remapped_output["text"][i]
812
- x, y, w, h = (
813
- int(remapped_output["left"][i]),
814
- int(remapped_output["top"][i]),
815
- int(remapped_output["width"][i]),
816
- int(remapped_output["height"][i]),
817
- )
818
- print(f"- '{word}' at ({x}, {y}, {w}, {h})")
819
- cv2.rectangle(output_image_vis, (x, y), (x + w, y + h), (0, 255, 0), 2)
820
- cv2.imwrite(output_path, output_image_vis)
821
- print(f"\nSaved visualisation to '{output_path}'")
822
 
823
  return remapped_output, used_fallback
824
 
@@ -1076,9 +1185,9 @@ if __name__ == "__main__":
1076
  # image_path = 'input/london_borough_of_lambeth.png'
1077
  image_basename = os.path.basename(image_path)
1078
  image_name = os.path.splitext(image_basename)[0]
1079
- output_path = f"outputs/{image_name}_refined_morph.png"
1080
- if not os.path.exists("outputs"):
1081
- os.makedirs("outputs")
1082
  line_image_cv = cv2.imread(image_path)
1083
  h, w, _ = line_image_cv.shape
1084
 
@@ -1100,20 +1209,28 @@ if __name__ == "__main__":
1100
 
1101
  # Visualisation
1102
  output_image_vis = line_image_cv.copy()
1103
- print(f"\nFinal refined {len(final_word_data['text'])} words:")
1104
- for i in range(len(final_word_data["text"])):
1105
- word = final_word_data["text"][i]
1106
- x, y, w, h = (
1107
- int(final_word_data["left"][i]),
1108
- int(final_word_data["top"][i]),
1109
- int(final_word_data["width"][i]),
1110
- int(final_word_data["height"][i]),
1111
- )
1112
- print(f"- '{word}' at ({x}, {y}, {w}, {h})")
1113
- cv2.rectangle(output_image_vis, (x, y), (x + w, y + h), (0, 255, 0), 2)
 
 
 
 
 
 
 
 
1114
 
1115
- cv2.imwrite(output_path, output_image_vis)
1116
- print(f"\nSaved visualisation to '{output_path}'")
1117
 
1118
  # You can also use matplotlib to display it in a notebook
1119
  import matplotlib.pyplot as plt
 
82
 
83
  orientation_angle = 0.0
84
  if box_height > box_width:
85
+ # print(
86
+ # f"Detected vertical orientation (W:{box_width} < H:{box_height}). Applying 90-degree correction."
87
+ # )
88
  orientation_angle = 90.0
89
  else:
90
+ # print(
91
+ # f"Detected horizontal orientation (W:{box_width} >= H:{box_height}). No orientation correction."
92
+ # )
93
  M_orient = cv2.getRotationMatrix2D(center, 0, 1.0)
94
  return gray_image, M_orient
95
 
 
251
  ) -> Tuple[Dict[str, List], bool]:
252
 
253
  if line_image is None:
254
+ # print(
255
+ # f"Error: line_image is None in segment function (image_name: {image_name})"
256
+ # )
257
  return ({}, False)
258
 
259
  # Validate line_image is a valid numpy array
260
  if not isinstance(line_image, np.ndarray):
261
+ # print(
262
+ # f"Error: line_image is not a numpy array (type: {type(line_image)}, image_name: {image_name})"
263
+ # )
264
  return ({}, False)
265
 
266
  # Validate line_image has valid shape and size
267
  if line_image.size == 0:
268
+ # print(
269
+ # f"Error: line_image is empty (shape: {line_image.shape}, image_name: {image_name})"
270
+ # )
271
  return ({}, False)
272
 
273
  if len(line_image.shape) < 2:
274
+ # print(
275
+ # f"Error: line_image has invalid shape {line_image.shape} (image_name: {image_name})"
276
+ # )
277
  return ({}, False)
278
 
279
  # Early return if 1 or fewer words
 
283
  if len(words) <= 1:
284
  return ({}, False)
285
  else:
286
+ # print(
287
+ # f"Error: line_data is empty or does not contain text (image_name: {image_name})"
288
+ # )
289
  return ({}, False)
290
 
291
+ # print(f"line_text: {line_text}")
292
  shortened_line_text = line_text.replace(" ", "_")[:10]
293
 
294
  if SHOW_OUTPUT_IMAGES:
295
  os.makedirs(self.output_folder, exist_ok=True)
296
+ output_path = f"{self.output_folder}/word_segmentation/{image_name}_{shortened_line_text}_original.png"
297
+ os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
298
  cv2.imwrite(output_path, line_image)
299
+ # print(f"\nSaved original image to '{output_path}'")
300
 
301
  gray = cv2.cvtColor(line_image, cv2.COLOR_BGR2GRAY)
302
 
 
334
  borderMode=cv2.BORDER_REPLICATE,
335
  )
336
 
337
+ # Validate deskewed_line_image before saving
338
+ if (
339
+ deskewed_line_image is None
340
+ or not isinstance(deskewed_line_image, np.ndarray)
341
+ or deskewed_line_image.size == 0
342
+ ):
343
+ # print(
344
+ # f"Error: deskewed_line_image is None or empty (image_name: {image_name})"
345
+ # )
346
+ return ({}, False)
347
+
348
  # Save deskewed image (optional, only if image_name is provided)
349
  if SHOW_OUTPUT_IMAGES:
350
  os.makedirs(self.output_folder, exist_ok=True)
351
+ output_path = f"{self.output_folder}/word_segmentation/{image_name}_{shortened_line_text}_deskewed.png"
352
+ os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
353
  cv2.imwrite(output_path, deskewed_line_image)
354
  # print(f"\nSaved deskewed image to '{output_path}'")
355
 
 
362
  block_size = int(avg_char_width_approx * BLOCK_SIZE_FACTOR)
363
  if block_size % 2 == 0:
364
  block_size += 1
365
+
366
+ # Validate deskewed_gray and ensure block_size is valid
367
+ if deskewed_gray is None or not isinstance(deskewed_gray, np.ndarray):
368
+ # print(
369
+ # f"Error: deskewed_gray is None or not a numpy array (image_name: {image_name})"
370
+ # )
371
+ return ({}, False)
372
+
373
+ if len(deskewed_gray.shape) != 2:
374
+ # print(
375
+ # f"Error: deskewed_gray must be a 2D grayscale image (shape: {deskewed_gray.shape}, image_name: {image_name})"
376
+ # )
377
+ return ({}, False)
378
+
379
+ if block_size < 3:
380
+ # print(
381
+ # f"Warning: block_size ({block_size}) is too small for adaptiveThreshold. "
382
+ # f"Using minimum value of 3. (image_name: {image_name}, "
383
+ # f"img_w: {img_w}, approx_char_count: {approx_char_count}, "
384
+ # f"avg_char_width_approx: {avg_char_width_approx:.2f})"
385
+ # )
386
+ block_size = 3
387
+
388
  binary = cv2.adaptiveThreshold(
389
  deskewed_gray,
390
  255,
 
394
  C_VALUE,
395
  )
396
 
397
+ # Validate binary image before saving
398
+ if binary is None or not isinstance(binary, np.ndarray) or binary.size == 0:
399
+ # print(
400
+ # f"Error: binary image is None or empty (image_name: {image_name})"
401
+ # )
402
+ return ({}, False)
403
+
404
  # Save cropped image (optional, only if image_name is provided)
405
  if SHOW_OUTPUT_IMAGES:
406
  os.makedirs(self.output_folder, exist_ok=True)
407
+ output_path = f"{self.output_folder}/word_segmentation/{image_name}_{shortened_line_text}_binary.png"
408
+ os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
409
  cv2.imwrite(output_path, binary)
410
  # print(f"\nSaved cropped image to '{output_path}'")
411
 
 
421
  # It's a dilation followed by an erosion
422
  closed_binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel, iterations=1)
423
 
424
+ # Validate closed_binary image before saving
425
+ if (
426
+ closed_binary is None
427
+ or not isinstance(closed_binary, np.ndarray)
428
+ or closed_binary.size == 0
429
+ ):
430
+ # print(
431
+ # f"Error: closed_binary image is None or empty (image_name: {image_name})"
432
+ # )
433
+ return ({}, False)
434
+
435
  # (Optional) You could also use a DILATE to make letters thicker
436
  # dilated_binary = cv2.dilate(closed_binary, kernel, iterations=1)
437
  # Use 'closed_binary' (or 'dilated_binary') from now on.
438
 
439
  if SHOW_OUTPUT_IMAGES:
440
  os.makedirs(self.output_folder, exist_ok=True)
441
+ output_path = f"{self.output_folder}/word_segmentation/{image_name}_{shortened_line_text}_closed_binary.png"
442
+ os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
443
  cv2.imwrite(output_path, closed_binary)
444
  # print(f"\nSaved dilated binary image to '{output_path}'")
445
 
 
457
  # Handle edge case of empty 'areas' array
458
  if len(areas) == 0:
459
  clean_binary = binary
460
+ # print("Warning: No components found after binarization.")
461
  areas = np.array([0]) # Add a dummy value to prevent crashes
462
 
463
  # --- 1. Calculate the DEFAULT CONSERVATIVE threshold ---
 
501
 
502
  # --- 3. ADAPTIVE DECISION: Override if conservative threshold is clearly noise ---
503
  if has_clear_gap:
504
+ # print(
505
+ # f"Noise Removal: Gap detected. Noise cluster ends at {area_before_gap}px. Aggressive threshold = {aggressive_threshold:.1f}"
506
+ # )
507
 
508
  # Only use a more aggressive threshold IF our "safe" threshold is clearly
509
  # stuck *inside* the noise cluster.
510
  # e.g., Safe threshold = 1, but noise goes up to 10.
511
  # (We use 0.8 as a buffer, so if thresh=7 and gap=8, we don't switch)
512
  if area_threshold < (area_before_gap * 0.8):
513
+ # print(
514
+ # f"Noise Removal: Conservative threshold ({area_threshold:.1f}) is deep in noise cluster (ends at {area_before_gap}px)."
515
+ # )
516
 
517
  # Instead of using large percentage increases, use a very small absolute increment
518
  # This preserves legitimate small letters/words that might be just above the noise
 
550
  # Cap at 15 pixels as absolute upper bound
551
  final_threshold = min(final_threshold, 15)
552
 
553
+ # print(
554
+ # f"Noise Removal: Using MODERATE threshold: {final_threshold:.1f} (noise ends at {area_before_gap}px, increment: {small_increment}px)"
555
+ # )
556
  area_threshold = final_threshold
557
  else:
558
+ # print(
559
+ # f"Noise Removal: Gap found, but conservative threshold ({area_threshold:.1f}) is sufficient. Sticking with conservative."
560
+ # )
561
  pass
562
 
563
  # --- 4. Apply the final, determined threshold ---
564
+ # print(f"Noise Removal: Final area threshold: {area_threshold:.1f}")
565
  for i in range(1, num_labels):
566
  # Use >= to be inclusive of the threshold itself
567
  if stats[i, cv2.CC_STAT_AREA] >= area_threshold:
 
570
  # No components found, or only background
571
  clean_binary = binary
572
 
573
+ # Validate clean_binary before proceeding
574
+ if (
575
+ clean_binary is None
576
+ or not isinstance(clean_binary, np.ndarray)
577
+ or clean_binary.size == 0
578
+ ):
579
+ # print(
580
+ # f"Error: clean_binary image is None or empty (image_name: {image_name})"
581
+ # )
582
+ return ({}, False)
583
+
584
  # Calculate the horizontal projection profile on the cleaned image
585
  horizontal_projection = np.sum(clean_binary, axis=1)
586
 
 
602
 
603
  # Ensure the crop is valid
604
  if y_start < y_end:
605
+ # print(
606
+ # f"Original text height: {text_height}px. Cropping to middle {100 - (2*trim_percentage*100):.0f}% region."
607
+ # )
608
  # Slice the image to get the vertically cropped ROI
609
  analysis_image = clean_binary[y_start:y_end, :]
610
  else:
 
614
  # If no text is found, use the original cleaned image
615
  analysis_image = clean_binary
616
 
617
+ # Validate analysis_image before proceeding
618
+ if (
619
+ analysis_image is None
620
+ or not isinstance(analysis_image, np.ndarray)
621
+ or analysis_image.size == 0
622
+ ):
623
+ # print(
624
+ # f"Error: analysis_image is None or empty (image_name: {image_name})"
625
+ # )
626
+ return ({}, False)
627
+
628
  # --- Step 3: Hierarchical Adaptive Search (using the new clean_binary) ---
629
  # The rest of the pipeline is identical but now operates on a superior image.
630
  words = line_data["text"][0].split()
631
  target_word_count = len(words)
632
 
633
+ # print(f"Target word count: {target_word_count}")
634
 
635
  # Save cropped image (optional, only if image_name is provided)
636
  if SHOW_OUTPUT_IMAGES:
637
  os.makedirs(self.output_folder, exist_ok=True)
638
+ output_path = f"{self.output_folder}/word_segmentation/{image_name}_{shortened_line_text}_clean_binary.png"
639
+ os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
640
  cv2.imwrite(output_path, analysis_image)
641
  # print(f"\nSaved cropped image to '{output_path}'")
642
 
 
648
  target_word_count = len(words)
649
  stage1_succeeded = False
650
 
651
+ # print("--- Stage 1: Searching with adaptive valley threshold ---")
652
  valley_factors_to_try = np.arange(INITIAL_VALLEY_THRESHOLD_FACTOR, 0.45, 0.05)
653
  for v_factor in valley_factors_to_try:
654
  # Pass the cropped image to the helper
 
663
  break
664
 
665
  if not stage1_succeeded:
666
+ # print(
667
+ # "\n--- Stage 1 failed. Starting Stage 2: Searching with adaptive kernel ---"
668
+ # )
669
  kernel_factors_to_try = np.arange(INITIAL_KERNEL_WIDTH_FACTOR, 0.5, 0.05)
670
  fixed_valley_factor = MAIN_VALLEY_THRESHOLD_FACTOR
671
  for k_factor in kernel_factors_to_try:
 
675
  closed_binary = cv2.morphologyEx(
676
  clean_binary, cv2.MORPH_CLOSE, closing_kernel
677
  )
678
+ # Validate closed_binary before proceeding
679
+ if (
680
+ closed_binary is None
681
+ or not isinstance(closed_binary, np.ndarray)
682
+ or closed_binary.size == 0
683
+ ):
684
+ # print(
685
+ # f"Error: closed_binary in Stage 2 is None or empty (image_name: {image_name}, k_factor: {k_factor:.2f})"
686
+ # )
687
+ continue # Skip this iteration and try next kernel factor
688
+
689
  # We need to re-apply the same vertical crop to this new image
690
  if len(non_zero_rows) > 0 and y_start < y_end:
691
  analysis_image = closed_binary[y_start:y_end, :]
692
  else:
693
  analysis_image = closed_binary
694
 
695
+ # Validate analysis_image before using it
696
+ if (
697
+ analysis_image is None
698
+ or not isinstance(analysis_image, np.ndarray)
699
+ or analysis_image.size == 0
700
+ ):
701
+ # print(
702
+ # f"Error: analysis_image in Stage 2 is None or empty (image_name: {image_name}, k_factor: {k_factor:.2f})"
703
+ # )
704
+ continue # Skip this iteration and try next kernel factor
705
+
706
  unlabeled_boxes = self._get_boxes_from_profile(
707
  analysis_image,
708
  avg_char_width_approx,
 
710
  fixed_valley_factor,
711
  )
712
 
713
+ # print(
714
+ # f"Testing kernel factor {k_factor:.2f} ({kernel_width}px): Found {len(unlabeled_boxes)} boxes."
715
+ # )
716
  if abs(target_word_count - len(unlabeled_boxes)) <= match_tolerance:
717
+ # print("SUCCESS (Stage 2): Found a match.")
718
  best_boxes = unlabeled_boxes
719
  successful_binary_image = (
720
  closed_binary # For Stage 2, the source is the closed_binary
 
725
  used_fallback = False
726
 
727
  if best_boxes is None:
728
+ # print("\nWarning: All adaptive searches failed. Falling back.")
729
  fallback_segmenter = HybridWordSegmenter()
730
  used_fallback = True
731
  final_output = fallback_segmenter.refine_words_bidirectional(
 
899
 
900
  # Visualisation
901
  if SHOW_OUTPUT_IMAGES:
902
+ output_path = f"{self.output_folder}/word_segmentation/{image_name}_{shortened_line_text}_final_boxes.png"
903
+ os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
904
  output_image_vis = line_image.copy()
905
+ # Validate output_image_vis before saving
906
+ if (
907
+ output_image_vis is None
908
+ or not isinstance(output_image_vis, np.ndarray)
909
+ or output_image_vis.size == 0
910
+ ):
911
+ pass
912
+ # print(
913
+ # f"Error: output_image_vis is None or empty (image_name: {image_name})"
914
+ # )
915
+ else:
916
+ # print(f"\nFinal refined {len(remapped_output['text'])} words:")
917
+ for i in range(len(remapped_output["text"])):
918
+ word = remapped_output["text"][i]
919
+ x, y, w, h = (
920
+ int(remapped_output["left"][i]),
921
+ int(remapped_output["top"][i]),
922
+ int(remapped_output["width"][i]),
923
+ int(remapped_output["height"][i]),
924
+ )
925
+ # print(f"- '{word}' at ({x}, {y}, {w}, {h})")
926
+ cv2.rectangle(
927
+ output_image_vis, (x, y), (x + w, y + h), (0, 255, 0), 2
928
+ )
929
+ cv2.imwrite(output_path, output_image_vis)
930
+ # print(f"\nSaved visualisation to '{output_path}'")
931
 
932
  return remapped_output, used_fallback
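The repeated `None` / type / size checks added throughout `segment` all follow one pattern: bail out before handing a bad array to OpenCV. A small helper capturing that pattern (illustrative only; the module keeps the checks inline):

```python
import numpy as np

def is_usable_image(arr, min_dims: int = 2) -> bool:
    """True if arr is a non-empty numpy array with at least min_dims dimensions."""
    return (
        arr is not None
        and isinstance(arr, np.ndarray)
        and arr.size > 0
        and arr.ndim >= min_dims
    )

# Guard style used before each cv2 call:
# if not is_usable_image(deskewed_gray):
#     return ({}, False)
```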
933
 
 
1185
  # image_path = 'input/london_borough_of_lambeth.png'
1186
  image_basename = os.path.basename(image_path)
1187
  image_name = os.path.splitext(image_basename)[0]
1188
+ output_path = f"output/{image_name}_refined_morph.png"
1189
+ if not os.path.exists("output"):
1190
+ os.makedirs("output")
1191
  line_image_cv = cv2.imread(image_path)
1192
  h, w, _ = line_image_cv.shape
1193
 
 
1209
 
1210
  # Visualisation
1211
  output_image_vis = line_image_cv.copy()
1212
+ # Validate output_image_vis before saving
1213
+ if (
1214
+ output_image_vis is None
1215
+ or not isinstance(output_image_vis, np.ndarray)
1216
+ or output_image_vis.size == 0
1217
+ ):
1218
+ print(f"Error: output_image_vis is None or empty (image_name: {image_name})")
1219
+ else:
1220
+ print(f"\nFinal refined {len(final_word_data['text'])} words:")
1221
+ for i in range(len(final_word_data["text"])):
1222
+ word = final_word_data["text"][i]
1223
+ x, y, w, h = (
1224
+ int(final_word_data["left"][i]),
1225
+ int(final_word_data["top"][i]),
1226
+ int(final_word_data["width"][i]),
1227
+ int(final_word_data["height"][i]),
1228
+ )
1229
+ print(f"- '{word}' at ({x}, {y}, {w}, {h})")
1230
+ cv2.rectangle(output_image_vis, (x, y), (x + w, y + h), (0, 255, 0), 2)
1231
 
1232
+ cv2.imwrite(output_path, output_image_vis)
1233
+ print(f"\nSaved visualisation to '{output_path}'")
1234
 
1235
  # You can also use matplotlib to display it in a notebook
1236
  import matplotlib.pyplot as plt