seanpedrickcase committed on
Commit
c2becd8
·
1 Parent(s): e4493fe

Added an upgraded line-to-word parsing algorithm. Added dependencies and framework for Hugging Face Spaces deployment with ZeroGPU

README.md CHANGED
@@ -10,7 +10,7 @@ license: agpl-3.0
10
  ---
11
  # Document redaction
12
 
13
- version: 1.4.1
14
 
15
  Redact personally identifiable information (PII) from documents (pdf, png, jpg), Word files (docx), or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a full walkthrough of all the features in the app.
16
 
 
10
  ---
11
  # Document redaction
12
 
13
+ version: 1.5.0
14
 
15
  Redact personally identifiable information (PII) from documents (pdf, png, jpg), Word files (docx), or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a full walkthrough of all the features in the app.
16
 
packages.txt ADDED
@@ -0,0 +1,4 @@
1
+ tesseract-ocr
2
+ poppler-utils
3
+ libgl1
4
+ libglib2.0-0
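
These apt packages support the Spaces deployment: tesseract-ocr backs pytesseract, poppler-utils backs pdf2image, and libgl1 / libglib2.0-0 are runtime libraries OpenCV needs. A minimal sketch (not part of the app) that checks the relevant binaries are on PATH before OCR starts, assuming pytesseract and pdf2image are the consumers:

```python
# Sketch only: verify the system packages from packages.txt are usable at runtime.
# Assumes tesseract-ocr is consumed via pytesseract and poppler-utils via pdf2image.
import shutil

def check_system_dependencies() -> dict:
    """Map each required binary to whether it is available on PATH."""
    required = {
        "tesseract": "tesseract-ocr",  # OCR engine driven by pytesseract
        "pdftoppm": "poppler-utils",   # PDF rasteriser used by pdf2image
    }
    return {binary: shutil.which(binary) is not None for binary in required}

if __name__ == "__main__":
    print(check_system_dependencies())
```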
pyproject.toml CHANGED
@@ -4,11 +4,15 @@ build-backend = "setuptools.build_meta"
4
 
5
  [project]
6
  name = "doc_redaction"
7
- version = "1.4.1"
8
  description = "Redact PDF/image-based documents, Word, or CSV/XLSX files using a Gradio-based GUI interface"
9
  readme = "README.md"
10
  requires-python = ">=3.10"
11
 
12
  dependencies = [
13
  "pdfminer.six==20250506",
14
  "pdf2image==1.17.0",
@@ -38,18 +42,32 @@ dependencies = [
38
  "python-docx==1.2.0",
39
  "polars==1.33.1",
40
  "defusedxml==0.7.1",
41
- #"paddlepaddle==3.2.0", # Optional paddle imports - only if you want to use hybrid OCR mode with tesseract and paddleOCR
42
- #"paddleocr==3.3.0"
43
  ]
44
 
45
- [project.urls]
46
- Homepage = "https://seanpedrick-case.github.io/doc_redaction/"
47
- repository = "https://github.com/seanpedrick-case/doc_redaction"
48
-
49
  [project.optional-dependencies]
50
  dev = ["pytest"]
51
  test = ["pytest", "pytest-cov"]
52
53
  # Configuration for Ruff linter:
54
  [tool.ruff]
55
  line-length = 88
 
4
 
5
  [project]
6
  name = "doc_redaction"
7
+ version = "1.5.0"
8
  description = "Redact PDF/image-based documents, Word, or CSV/XLSX files using a Gradio-based GUI interface"
9
  readme = "README.md"
10
  requires-python = ">=3.10"
11
 
12
+ [project.urls]
13
+ Homepage = "https://seanpedrick-case.github.io/doc_redaction/"
14
+ repository = "https://github.com/seanpedrick-case/doc_redaction"
15
+
16
  dependencies = [
17
  "pdfminer.six==20250506",
18
  "pdf2image==1.17.0",
 
42
  "python-docx==1.2.0",
43
  "polars==1.33.1",
44
  "defusedxml==0.7.1",
45
+ "numpy==2.2.6"
 
46
  ]
47
 
48
  [project.optional-dependencies]
49
  dev = ["pytest"]
50
  test = ["pytest", "pytest-cov"]
51
 
52
+ # To install the app with paddle and vlm support via pip, run this from the base folder in the correct Python environment: pip install .[paddle,vlm], or uv pip install .[paddle,vlm] if using uv. Note that CUDA support requires the GPU build of Torch below
53
+
54
+ # New extra for PaddleOCR
55
+ paddle = [
56
+ "paddlepaddle==3.2.0",
57
+ "paddleocr==3.3.0",
58
+ ]
59
+
60
+ # New extra for VLM models (including Torch)
61
+ vlm = [
62
+ "torch==2.6.0", # for CUDA support (needed for GPU PaddleOCR), install manually with --index-url https://download.pytorch.org/whl/cu126
63
+ "torchvision==0.21",
64
+ "transformers==4.57.1",
65
+ "accelerate==1.11.0",
66
+ ]
67
+
68
+
69
+
70
+
71
  # Configuration for Ruff linter:
72
  [tool.ruff]
73
  line-length = 88
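
Because paddle and vlm are optional extras, code paths that rely on them should degrade gracefully when only the base install is present. A minimal sketch of guarded imports under that assumption; the availability flags and import targets here are illustrative, not the app's actual logic:

```python
# Sketch only: guard the optional [paddle] and [vlm] extras so the base install still runs.
# The flag names and imports below are illustrative, not the app's real code.
try:
    from paddleocr import PaddleOCR  # available after: pip install .[paddle]
    PADDLE_AVAILABLE = True
except ImportError:
    PADDLE_AVAILABLE = False

try:
    import torch
    from transformers import AutoModelForVision2Seq  # available after: pip install .[vlm]
    VLM_AVAILABLE = True
except ImportError:
    VLM_AVAILABLE = False

print(f"PaddleOCR available: {PADDLE_AVAILABLE}; VLM stack available: {VLM_AVAILABLE}")
```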
tools/config.py CHANGED
@@ -512,9 +512,9 @@ HYBRID_OCR_PADDING = int(
512
  get_or_create_env_var("HYBRID_OCR_PADDING", "1")
513
  ) # The padding to add to the text when passing it to PaddleOCR for re-extraction using the hybrid OCR method.
514
 
515
- TESSERACT_SEGMENTATION_LEVEL = get_or_create_env_var(
516
- "TESSERACT_SEGMENTATION_LEVEL", "word"
517
- ) # Tesseract segmentation level: "word" (PSM 11) or "line" (PSM 6)
518
 
519
  CONVERT_LINE_TO_WORD_LEVEL = convert_string_to_boolean(
520
  get_or_create_env_var("CONVERT_LINE_TO_WORD_LEVEL", "False")
 
512
  get_or_create_env_var("HYBRID_OCR_PADDING", "1")
513
  ) # The padding to add to the text when passing it to PaddleOCR for re-extraction using the hybrid OCR method.
514
 
515
+ TESSERACT_SEGMENTATION_LEVEL = int(get_or_create_env_var(
516
+ "TESSERACT_SEGMENTATION_LEVEL", "11"
517
+ )) # The Tesseract page segmentation mode (PSM) to use, e.g. 11 for sparse text or 6 for a single uniform block of text
518
 
519
  CONVERT_LINE_TO_WORD_LEVEL = convert_string_to_boolean(
520
  get_or_create_env_var("CONVERT_LINE_TO_WORD_LEVEL", "False")
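
TESSERACT_SEGMENTATION_LEVEL now holds the raw Tesseract page segmentation mode (PSM) number (11 = sparse text, 6 = a single uniform block) rather than the old "word"/"line" strings, and it is interpolated into the --psm flag. A minimal sketch of how that value reaches Tesseract, assuming pytesseract.image_to_data as used by the analyser engine; the image path is hypothetical:

```python
# Sketch only: how the configured PSM value feeds a pytesseract call.
# The config string mirrors the analyser engine; the input image is hypothetical.
import pytesseract
from PIL import Image

TESSERACT_SEGMENTATION_LEVEL = 11  # PSM 11: sparse text; PSM 6: single uniform block

tesseract_config = f"--oem 3 --psm {TESSERACT_SEGMENTATION_LEVEL}"

image = Image.open("example_page.png")  # hypothetical input
ocr_data = pytesseract.image_to_data(
    image,
    config=tesseract_config,
    output_type=pytesseract.Output.DICT,  # dict of lists: text, left, top, width, height, conf
)
print(len(ocr_data["text"]), "tokens returned")
```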
tools/custom_image_analyser_engine.py CHANGED
@@ -42,6 +42,7 @@ from tools.presidio_analyzer_custom import recognizer_result_from_dict
42
  from tools.run_vlm import generate_image as vlm_generate_image
43
  from tools.secure_path_utils import validate_folder_containment
44
  from tools.secure_regex_utils import safe_sanitize_text
 
45
 
46
  if PREPROCESS_LOCAL_OCR_IMAGES == "True":
47
  PREPROCESS_LOCAL_OCR_IMAGES = True
@@ -553,7 +554,7 @@ def _get_tesseract_psm(segmentation_level: str) -> int:
553
 
554
  def _vlm_ocr_predict(
555
  image: Image.Image,
556
- prompt: str = "Extract all text from this image. Return only the text, no other information.",
557
  ) -> Dict[str, Any]:
558
  """
559
  VLM OCR prediction function that mimics PaddleOCR's interface.
@@ -688,7 +689,8 @@ class CustomImageAnalyzerEngine:
688
  if tesseract_config:
689
  self.tesseract_config = tesseract_config
690
  else:
691
- psm_value = _get_tesseract_psm(TESSERACT_SEGMENTATION_LEVEL)
 
692
  self.tesseract_config = f"--oem 3 --psm {psm_value}"
693
  # print(
694
  # f"Tesseract configured for {TESSERACT_SEGMENTATION_LEVEL}-level segmentation (PSM {psm_value})"
@@ -772,152 +774,6 @@ class CustomImageAnalyzerEngine:
772
 
773
  return f"{safe_original}_conf_{conf}_to_{safe_new}_conf_{new_conf}"
774
 
775
- # def _convert_line_to_word_level(
776
- # self, line_data: Dict[str, List], image_width: int, image_height: int
777
- # ) -> Dict[str, List]:
778
- # """
779
- # Converts line-level OCR results to word-level results by splitting text and estimating word positions.
780
-
781
- # Args:
782
- # line_data: Dictionary with line-level OCR data (text, left, top, width, height, conf)
783
- # image_width: Width of the original image
784
- # image_height: Height of the original image
785
-
786
- # Returns:
787
- # Dictionary with word-level OCR data in Tesseract format
788
- # """
789
- # output = {
790
- # "text": list(),
791
- # "left": list(),
792
- # "top": list(),
793
- # "width": list(),
794
- # "height": list(),
795
- # "conf": list(),
796
- # }
797
-
798
- # if not line_data or not line_data.get("text"):
799
- # return output
800
-
801
- # for i in range(len(line_data["text"])):
802
- # line_text = line_data["text"][i]
803
- # line_left = line_data["left"][i]
804
- # line_top = line_data["top"][i]
805
- # line_width = line_data["width"][i]
806
- # line_height = line_data["height"][i]
807
- # line_conf = line_data["conf"][i]
808
-
809
- # # Skip empty lines
810
- # if not line_text.strip():
811
- # continue
812
-
813
- # # Split line into words
814
- # words = line_text.split()
815
- # if not words:
816
- # continue
817
-
818
- # # Calculate character width for this line
819
- # total_chars = len(line_text)
820
- # avg_char_width = line_width / total_chars if total_chars > 0 else 0
821
-
822
- # current_char_offset = 0
823
-
824
- # for word in words:
825
- # # Calculate word width based on character count
826
- # word_width = float(len(word) * avg_char_width)
827
- # word_left = line_left + float(current_char_offset * avg_char_width)
828
-
829
- # # Ensure word doesn't exceed image boundaries
830
- # word_left = max(0, min(word_left, image_width - word_width))
831
- # word_width = min(word_width, image_width - word_left)
832
-
833
- # output["text"].append(word)
834
- # output["left"].append(word_left)
835
- # output["top"].append(line_top)
836
- # output["width"].append(word_width)
837
- # output["height"].append(line_height)
838
- # output["conf"].append(line_conf)
839
-
840
- # # Update offset for the next word (add word length + 1 for the space)
841
- # current_char_offset += len(word) + 1
842
-
843
- # return output
844
-
845
- def _convert_line_to_word_level(
846
- self, line_data: Dict[str, List], image_width: int, image_height: int
847
- ) -> Dict[str, List]:
848
- """
849
- Converts line-level OCR results to word-level by using a more robust
850
- proportional estimation method.
851
- """
852
- output = {
853
- "text": list(), "left": list(), "top": list(), "width": list(),
854
- "height": list(), "conf": list(),
855
- }
856
-
857
- if not line_data or not line_data.get("text"):
858
- return output
859
-
860
- for i in range(len(line_data["text"])):
861
- line_text = line_data["text"][i]
862
- line_left = round(float(line_data["left"][i]), 2)
863
- line_top = round(float(line_data["top"][i]), 2)
864
- line_width = round(float(line_data["width"][i]), 2)
865
- line_height = round(float(line_data["height"][i]), 2)
866
- line_conf = line_data["conf"][i]
867
-
868
- if not line_text.strip():
869
- continue
870
-
871
- words = line_text.split()
872
- if not words:
873
- continue
874
-
875
- # --- Improved Logic Starts Here ---
876
-
877
- # 1. Calculate counts of characters and spaces
878
- num_chars = len("".join(words))
879
- num_spaces = len(words) - 1
880
-
881
- if num_chars == 0:
882
- continue
883
-
884
- # 2. Estimate the width of a single space. A common heuristic is that
885
- # the total space between words takes up a certain fraction of the line.
886
- # Let's assume text characters are, on average, twice as wide as a space.
887
- # So, line_width = (num_chars * 2*space_width) + (num_spaces * space_width)
888
- # This allows us to solve for space_width.
889
- if (num_chars * 2 + num_spaces) > 0:
890
- # Heuristic ratio: average char is 2x a space width
891
- char_space_ratio = 2.0
892
- estimated_space_width = line_width / (num_chars * char_space_ratio + num_spaces)
893
- avg_char_width = estimated_space_width * char_space_ratio
894
- else:
895
- # Fallback to your original method if line has no spaces
896
- avg_char_width = line_width / (num_chars if num_chars > 0 else 1)
897
- estimated_space_width = avg_char_width
898
-
899
- # --- End of Improved Logic ---
900
-
901
- current_left = line_left
902
-
903
- for word in words:
904
- word_width = len(word) * avg_char_width
905
-
906
- # Clamp values to be within image boundaries
907
- clamped_left = max(0, min(current_left, image_width))
908
- clamped_width = max(0, min(word_width, image_width - clamped_left))
909
-
910
- output["text"].append(word)
911
- output["left"].append(clamped_left)
912
- output["top"].append(line_top)
913
- output["width"].append(clamped_width)
914
- output["height"].append(line_height)
915
- output["conf"].append(line_conf) # Still a simplification
916
-
917
- # Update the left offset for the next word
918
- current_left += word_width + estimated_space_width
919
-
920
- return output
921
 
922
  def _is_line_level_data(self, ocr_data: Dict[str, List]) -> bool:
923
  """
@@ -1086,6 +942,142 @@ class CustomImageAnalyzerEngine:
1086
 
1087
  return output
1088
 
1089
  def _visualize_tesseract_bounding_boxes(
1090
  self,
1091
  image: Image.Image,
@@ -1590,9 +1582,47 @@ class CustomImageAnalyzerEngine:
1590
  # Convert line-level results to word-level if configured and needed
1591
  if CONVERT_LINE_TO_WORD_LEVEL and self._is_line_level_data(ocr_data):
1592
  print("Converting line-level OCR results to word-level...")
1593
- ocr_data = self._convert_line_to_word_level(
1594
- ocr_data, image_width, image_height
1595
- )
 
1596
 
1597
  # Always check for scale_factor, even if preprocessing_metadata is empty
1598
  # This ensures rescaling happens correctly when preprocessing was applied
 
42
  from tools.run_vlm import generate_image as vlm_generate_image
43
  from tools.secure_path_utils import validate_folder_containment
44
  from tools.secure_regex_utils import safe_sanitize_text
45
+ from tools.word_segmenter import AdaptiveSegmenter
46
 
47
  if PREPROCESS_LOCAL_OCR_IMAGES == "True":
48
  PREPROCESS_LOCAL_OCR_IMAGES = True
 
554
 
555
  def _vlm_ocr_predict(
556
  image: Image.Image,
557
+ prompt: str = "Extract the text content from this image.",
558
  ) -> Dict[str, Any]:
559
  """
560
  VLM OCR prediction function that mimics PaddleOCR's interface.
 
689
  if tesseract_config:
690
  self.tesseract_config = tesseract_config
691
  else:
692
+ # _get_tesseract_psm does not work correctly, so use the PSM value from TESSERACT_SEGMENTATION_LEVEL directly (defaults to 11)
693
+ psm_value = TESSERACT_SEGMENTATION_LEVEL #_get_tesseract_psm(TESSERACT_SEGMENTATION_LEVEL)
694
  self.tesseract_config = f"--oem 3 --psm {psm_value}"
695
  # print(
696
  # f"Tesseract configured for {TESSERACT_SEGMENTATION_LEVEL}-level segmentation (PSM {psm_value})"
 
774
 
775
  return f"{safe_original}_conf_{conf}_to_{safe_new}_conf_{new_conf}"
776
 
777
 
778
  def _is_line_level_data(self, ocr_data: Dict[str, List]) -> bool:
779
  """
 
942
 
943
  return output
944
 
945
+ def _convert_line_to_word_level(
946
+ self, line_data: Dict[str, List], image_width: int, image_height: int, image: Image.Image, image_name: str = None
947
+ ) -> Dict[str, List]:
948
+ """
949
+ Converts line-level OCR results to word-level using AdaptiveSegmenter.segment().
950
+ This method processes each line individually using the adaptive segmentation algorithm.
951
+
952
+ Args:
953
+ line_data: Dictionary with keys "text", "left", "top", "width", "height", "conf" (all lists)
954
+ image_width: Width of the full image
955
+ image_height: Height of the full image
956
+ image: PIL Image object of the full image
957
+ image_name: Name of the image
958
+ Returns:
959
+ Dictionary with same keys as input, containing word-level bounding boxes
960
+ """
961
+ output = {
962
+ "text": list(), "left": list(), "top": list(), "width": list(),
963
+ "height": list(), "conf": list(),
964
+ }
965
+
966
+ if not line_data or not line_data.get("text"):
967
+ return output
968
+
969
+ # Convert PIL Image to numpy array (BGR format for OpenCV)
970
+ if hasattr(image, 'size'): # PIL Image
971
+ image_np = np.array(image)
972
+ if len(image_np.shape) == 3:
973
+ # Convert RGB to BGR for OpenCV
974
+ image_np = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
975
+ elif len(image_np.shape) == 2:
976
+ # Grayscale - convert to BGR
977
+ image_np = cv2.cvtColor(image_np, cv2.COLOR_GRAY2BGR)
978
+ else:
979
+ # Already numpy array
980
+ image_np = image.copy()
981
+ if len(image_np.shape) == 2:
982
+ image_np = cv2.cvtColor(image_np, cv2.COLOR_GRAY2BGR)
983
+
984
+ segmenter = AdaptiveSegmenter(output_folder=self.output_folder)
985
+
986
+ # Process each line
987
+ for i in range(len(line_data["text"])):
988
+ line_text = line_data["text"][i]
989
+ line_conf = line_data["conf"][i]
990
+
991
+ # Get the float values
992
+ f_left = float(line_data["left"][i])
993
+ f_top = float(line_data["top"][i])
994
+ f_width = float(line_data["width"][i])
995
+ f_height = float(line_data["height"][i])
996
+
997
+ # A simple heuristic to check if coords are normalized
998
+ # If any value is > 1.0, assume they are already pixels
999
+ is_normalized = (f_left <= 1.0 and f_top <= 1.0 and f_width <= 1.0 and f_height <= 1.0)
1000
+
1001
+ if is_normalized:
1002
+ # Convert from normalized (0.0-1.0) to absolute pixels
1003
+ line_left = float(round(f_left * image_width))
1004
+ line_top = float(round(f_top * image_height))
1005
+ line_width = float(round(f_width * image_width))
1006
+ line_height = float(round(f_height * image_height))
1007
+ else:
1008
+ # They are already pixels, just convert to int
1009
+ line_left = float(round(f_left))
1010
+ line_top = float(round(f_top))
1011
+ line_width = float(round(f_width))
1012
+ line_height = float(round(f_height))
1013
+
1014
+ if not line_text.strip():
1015
+ continue
1016
+
1017
+ # Clamp bounding box to image boundaries
1018
+ line_left = int(max(0, min(line_left, image_width - 1)))
1019
+ line_top = int(max(0, min(line_top, image_height - 1)))
1020
+ line_width = int(max(1, min(line_width, image_width - line_left)))
1021
+ line_height = int(max(1, min(line_height, image_height - line_top)))
1022
+
1023
+ print(f"Line left: {line_left}, Line top: {line_top}, Line width: {line_width}, Line height: {line_height}")
1024
+
1025
+ # Crop the line image from the full image
1026
+ line_image = image_np[line_top:line_top + line_height, line_left:line_left + line_width]
1027
+
1028
+ if line_image.size == 0:
1029
+ continue
1030
+
1031
+ # Create single-line data structure for segment method
1032
+ single_line_data = {
1033
+ "text": [line_text],
1034
+ "left": [0], # Relative to cropped image
1035
+ "top": [0],
1036
+ "width": [line_width],
1037
+ "height": [line_height],
1038
+ "conf": [line_conf],
1039
+ }
1040
+
1041
+ # Use AdaptiveSegmenter.segment() to segment this line
1042
+ word_output, _ = segmenter.segment(single_line_data, line_image, image_name=image_name)
1043
+
1044
+ if not word_output or not word_output.get("text"):
1045
+ # If segmentation failed, fall back to proportional estimation
1046
+ words = line_text.split()
1047
+ if words:
1048
+ num_chars = len("".join(words))
1049
+ num_spaces = len(words) - 1
1050
+ if num_chars > 0:
1051
+ char_space_ratio = 2.0
1052
+ estimated_space_width = line_width / (num_chars * char_space_ratio + num_spaces) if (num_chars * char_space_ratio + num_spaces) > 0 else line_width / num_chars
1053
+ avg_char_width = estimated_space_width * char_space_ratio
1054
+ current_left = 0
1055
+ for word in words:
1056
+ word_width = len(word) * avg_char_width
1057
+ clamped_left = max(0, min(current_left, line_width))
1058
+ clamped_width = max(0, min(word_width, line_width - clamped_left))
1059
+ output["text"].append(word)
1060
+ output["left"].append(line_left + clamped_left) # Add line offset
1061
+ output["top"].append(line_top)
1062
+ output["width"].append(clamped_width)
1063
+ output["height"].append(line_height)
1064
+ output["conf"].append(line_conf)
1065
+ current_left += word_width + estimated_space_width
1066
+ continue
1067
+
1068
+ # Adjust coordinates back to full image coordinates
1069
+ for j in range(len(word_output["text"])):
1070
+ output["text"].append(word_output["text"][j])
1071
+ output["left"].append(line_left + word_output["left"][j])
1072
+ output["top"].append(line_top + word_output["top"][j])
1073
+ output["width"].append(word_output["width"][j])
1074
+ output["height"].append(word_output["height"][j])
1075
+ output["conf"].append(word_output["conf"][j])
1076
+
1077
+ print(f"Output: {output}")
1078
+
1079
+ return output
1080
+
1081
  def _visualize_tesseract_bounding_boxes(
1082
  self,
1083
  image: Image.Image,
 
1582
  # Convert line-level results to word-level if configured and needed
1583
  if CONVERT_LINE_TO_WORD_LEVEL and self._is_line_level_data(ocr_data):
1584
  print("Converting line-level OCR results to word-level...")
1585
+ # Check if coordinates need to be scaled to match the preprocessed image
1586
+ # For PaddleOCR: _convert_paddle_to_tesseract_format converts coordinates to original image space,
1587
+ # but we need to crop from the preprocessed image, so we need to scale coordinates up
1588
+ # For Tesseract: OCR runs on preprocessed image, so coordinates are already in preprocessed space,
1589
+ # matching the preprocessed image we're cropping from - no scaling needed
1590
+ needs_scaling = False
1591
+ if PREPROCESS_LOCAL_OCR_IMAGES and original_image_width and original_image_height:
1592
+ if self.ocr_engine == "paddle":
1593
+ # PaddleOCR coordinates are converted to original space by _convert_paddle_to_tesseract_format
1594
+ needs_scaling = True
1595
+
1596
+ if needs_scaling:
1597
+ # Calculate scale factors from original to preprocessed
1598
+ scale_x = image_width / original_image_width
1599
+ scale_y = image_height / original_image_height
1600
+ print(f"Scaling coordinates from original ({original_image_width}x{original_image_height}) to preprocessed ({image_width}x{image_height})")
1601
+ print(f"Scale factors: x={scale_x:.3f}, y={scale_y:.3f}")
1602
+ # Scale coordinates to preprocessed image space for cropping
1603
+ scaled_ocr_data = {
1604
+ "text": ocr_data["text"],
1605
+ "left": [x * scale_x for x in ocr_data["left"]],
1606
+ "top": [y * scale_y for y in ocr_data["top"]],
1607
+ "width": [w * scale_x for w in ocr_data["width"]],
1608
+ "height": [h * scale_y for h in ocr_data["height"]],
1609
+ "conf": ocr_data["conf"],
1610
+ }
1611
+ ocr_data = self._convert_line_to_word_level(
1612
+ scaled_ocr_data, image_width, image_height, image, image_name=image_name
1613
+ )
1614
+ # Scale word-level results back to original image space
1615
+ scale_factor_x = original_image_width / image_width
1616
+ scale_factor_y = original_image_height / image_height
1617
+ for i in range(len(ocr_data["left"])):
1618
+ ocr_data["left"][i] = ocr_data["left"][i] * scale_factor_x
1619
+ ocr_data["top"][i] = ocr_data["top"][i] * scale_factor_y
1620
+ ocr_data["width"][i] = ocr_data["width"][i] * scale_factor_x
1621
+ ocr_data["height"][i] = ocr_data["height"][i] * scale_factor_y
1622
+ else:
1623
+ ocr_data = self._convert_line_to_word_level(
1624
+ ocr_data, image_width, image_height, image, image_name=image_name
1625
+ )
1626
 
1627
  # Always check for scale_factor, even if preprocessing_metadata is empty
1628
  # This ensures rescaling happens correctly when preprocessing was applied
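
The word-level conversion crops each line from the (possibly preprocessed) page image, so PaddleOCR boxes, which arrive in original-image space, are scaled up to the preprocessed size before cropping and the resulting word boxes are scaled back down afterwards. A minimal sketch of that round trip with made-up dimensions:

```python
# Sketch only: the coordinate round trip used before/after line-to-word conversion.
# Dimensions are illustrative; the real values come from the preprocessing metadata.
def scale_boxes(ocr_data: dict, sx: float, sy: float) -> dict:
    """Scale left/top/width/height in a Tesseract-style dict of lists."""
    return {
        "text": ocr_data["text"],
        "left": [x * sx for x in ocr_data["left"]],
        "top": [y * sy for y in ocr_data["top"]],
        "width": [w * sx for w in ocr_data["width"]],
        "height": [h * sy for h in ocr_data["height"]],
        "conf": ocr_data["conf"],
    }

original_w, original_h = 1240, 1754          # original page size (illustrative)
preprocessed_w, preprocessed_h = 2480, 3508  # preprocessed (e.g. 2x upscaled) page size

line_boxes = {"text": ["Hello world"], "left": [100], "top": [200],
              "width": [400], "height": [40], "conf": [95.0]}

# 1. Scale line boxes up into preprocessed space so they match the image being cropped.
up = scale_boxes(line_boxes, preprocessed_w / original_w, preprocessed_h / original_h)
# 2. ... run the line-to-word conversion against the preprocessed image here ...
# 3. Scale the resulting word boxes back down into original-image space.
down = scale_boxes(up, original_w / preprocessed_w, original_h / preprocessed_h)
print(down["left"], down["width"])  # back to [100.0], [400.0]
```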
tools/word_segmenter.py ADDED
@@ -0,0 +1,974 @@
1
+ import cv2
2
+ import numpy as np
3
+ from typing import Dict, List, Tuple
4
+ import os
5
+ from tools.config import OUTPUT_FOLDER
6
+
7
+ INITIAL_KERNEL_WIDTH_FACTOR = 0.05 # Default 0.05
8
+ INITIAL_VALLEY_THRESHOLD_FACTOR = 0.05 # Default 0.05
9
+ MAIN_VALLEY_THRESHOLD_FACTOR = 0.15 # Default 0.15
10
+ C_VALUE = 4 # Default 4
11
+ BLOCK_SIZE_FACTOR = 1.5 # Default 1.5
12
+ MIN_SPACE_FACTOR = 0.3 # Default 0.4
13
+ MATCH_TOLERANCE = 0 # Default 0
14
+ MIN_AREA_THRESHOLD = 6 # Default 6
15
+ DEFAULT_TRIM_PERCENTAGE = 0.15 # Default 0.15
16
+ SHOW_OUTPUT_IMAGES = True # Default False
17
+
18
+ class AdaptiveSegmenter:
19
+ """
20
+ The final, production-ready pipeline. It features:
21
+ 1. Adaptive Thresholding.
22
+ 2. Targeted Noise Removal using Connected Component Analysis to isolate the main text body.
23
+ 3. The robust two-stage adaptive search (Valley -> Kernel).
24
+ 4. CCA for final pixel-perfect refinement.
25
+ """
26
+ def __init__(self, output_folder: str = OUTPUT_FOLDER):
27
+ self.output_folder = output_folder
28
+
29
+ def _deskew_image(self, gray_image: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
30
+ """
31
+ Detects skew using a robust method that normalizes the output of
32
+ cv2.minAreaRect to correctly handle its angle/dimension ambiguity.
33
+ """
34
+ h, w = gray_image.shape
35
+
36
+
37
+
38
+ # Use a single, reliable binarization method for detection.
39
+ block_size = 21
40
+ if h < block_size:
41
+ block_size = h if h % 2 != 0 else h - 1
42
+
43
+ if block_size > 3:
44
+ binary = cv2.adaptiveThreshold(gray_image, 255,
45
+ cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
46
+ cv2.THRESH_BINARY_INV, block_size, 4)
47
+ else:
48
+ _, binary = cv2.threshold(gray_image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
49
+
50
+ opening_kernel = np.ones((2, 2), np.uint8)
51
+ binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, opening_kernel)
52
+
53
+ coords = np.column_stack(np.where(binary > 0))
54
+ if len(coords) < 50:
55
+ print("Warning: Not enough text pixels to detect skew. Skipping.")
56
+ M = cv2.getRotationMatrix2D((w // 2, h // 2), 0, 1.0)
57
+ return gray_image, M
58
+
59
+ rect = cv2.minAreaRect(coords[:, ::-1])
60
+
61
+ rect_width, rect_height = rect[1]
62
+ angle = rect[2]
63
+
64
+ # If the rectangle is described as vertical, normalize it
65
+ if rect_width < rect_height:
66
+ # Swap dimensions
67
+ rect_width, rect_height = rect_height, rect_width
68
+ # Correct the angle
69
+ angle += 90
70
+
71
+ # The angle from minAreaRect is in [-90, 0). After normalization,
72
+ # our angle for a horizontal line will be close to 0 or -90/90.
73
+ # We need one last correction for angles near +/- 90.
74
+ if angle > 45:
75
+ angle -= 90
76
+ elif angle < -45:
77
+ angle += 90
78
+
79
+ correction_angle = angle
80
+
81
+ print(f"Normalized shape (W:{rect_width:.0f}, H:{rect_height:.0f}). Detected angle: {correction_angle:.2f} degrees.")
82
+
83
+ # Final sanity checks on the angle
84
+ MIN_SKEW_THRESHOLD = 0.5 # Ignore angles smaller than this (likely noise)
85
+ MAX_SKEW_THRESHOLD = 15.0 # Angles larger than this are extreme and likely errors
86
+
87
+ if abs(correction_angle) < MIN_SKEW_THRESHOLD:
88
+ print(f"Detected angle {correction_angle:.2f}° is too small (likely noise). Skipping deskew.")
89
+ correction_angle = 0.0
90
+ elif abs(correction_angle) > MAX_SKEW_THRESHOLD:
91
+ print(f"Warning: Corrected angle {correction_angle:.2f}° is extreme. Skipping deskew.")
92
+ correction_angle = 0.0
93
+
94
+ # Create rotation matrix and apply the final correction
95
+ center = (w // 2, h // 2)
96
+ M = cv2.getRotationMatrix2D(center, correction_angle, 1.0)
97
+
98
+ deskewed_gray = cv2.warpAffine(gray_image, M, (w, h),
99
+ flags=cv2.INTER_CUBIC,
100
+ borderMode=cv2.BORDER_REPLICATE)
101
+
102
+ return deskewed_gray, M
103
+
104
+ def _get_boxes_from_profile(self, binary_image: np.ndarray, stable_avg_char_width: float, min_space_factor: float, valley_threshold_factor: float) -> List:
105
+ # This helper function remains IDENTICAL. No changes needed.
106
+ # ... (code from the previous version)
107
+ img_h, img_w = binary_image.shape
108
+ vertical_projection = np.sum(binary_image, axis=0)
109
+ peaks = vertical_projection[vertical_projection > 0]
110
+ if len(peaks) == 0: return []
111
+ avg_peak_height = np.mean(peaks)
112
+ valley_threshold = int(avg_peak_height * valley_threshold_factor)
113
+ min_space_width = int(stable_avg_char_width * min_space_factor)
114
+ patched_projection = vertical_projection.copy()
115
+ in_gap = False; gap_start = 0
116
+ for x, col_sum in enumerate(patched_projection):
117
+ if col_sum <= valley_threshold and not in_gap: in_gap = True; gap_start = x
118
+ elif col_sum > valley_threshold and in_gap:
119
+ in_gap = False
120
+ if (x - gap_start) < min_space_width: patched_projection[gap_start:x] = int(avg_peak_height)
121
+ unlabeled_boxes = []
122
+ in_word = False; start_x = 0
123
+ for x, col_sum in enumerate(patched_projection):
124
+ if col_sum > valley_threshold and not in_word: start_x = x; in_word = True
125
+ elif col_sum <= valley_threshold and in_word: unlabeled_boxes.append((start_x, 0, x - start_x, img_h)); in_word = False
126
+ if in_word: unlabeled_boxes.append((start_x, 0, img_w - start_x, img_h))
127
+ return unlabeled_boxes
128
+
129
+ def segment(self, line_data: Dict[str, List], line_image: np.ndarray, min_space_factor=MIN_SPACE_FACTOR, match_tolerance=MATCH_TOLERANCE, image_name: str = None) -> Tuple[Dict[str, List], bool]:
130
+ if line_image is None: return ({}, False)
131
+
132
+ shortened_line_text = line_data["text"][0].replace(" ", "_")[:10]
133
+
134
+ if SHOW_OUTPUT_IMAGES:
135
+ os.makedirs(self.output_folder, exist_ok=True)
136
+ output_path = f'{self.output_folder}/paddle_visualisations/{image_name}_{shortened_line_text}_original.png'
137
+ os.makedirs(f'{self.output_folder}/paddle_visualisations', exist_ok=True)
138
+ cv2.imwrite(output_path, line_image)
139
+ print(f"\nSaved original image to '{output_path}'")
140
+
141
+
142
+ gray = cv2.cvtColor(line_image, cv2.COLOR_BGR2GRAY)
143
+ # Store the transformation matrix M
144
+ deskewed_gray, M = self._deskew_image(gray)
145
+ h, w = deskewed_gray.shape
146
+ deskewed_line_image = cv2.warpAffine(line_image, M, (w, h),
147
+ flags=cv2.INTER_CUBIC,
148
+ borderMode=cv2.BORDER_REPLICATE)
149
+
150
+
151
+ # Save deskewed image (optional, only if image_name is provided)
152
+ if SHOW_OUTPUT_IMAGES:
153
+ os.makedirs(self.output_folder, exist_ok=True)
154
+ output_path = f'{self.output_folder}/paddle_visualisations/{image_name}_{shortened_line_text}_deskewed.png'
155
+ os.makedirs(f'{self.output_folder}/paddle_visualisations', exist_ok=True)
156
+ cv2.imwrite(output_path, deskewed_line_image)
157
+ print(f"\nSaved deskewed image to '{output_path}'")
158
+
159
+ # --- Step 1: Binarization and Stable Width Calculation (Unchanged) ---
160
+ approx_char_count = len(line_data["text"][0].replace(" ", ""))
161
+ if approx_char_count == 0: return ({}, False)
162
+ img_h, img_w = deskewed_gray.shape
163
+ avg_char_width_approx = img_w / approx_char_count
164
+ block_size = int(avg_char_width_approx * BLOCK_SIZE_FACTOR)
165
+ if block_size % 2 == 0: block_size += 1
166
+ binary = cv2.adaptiveThreshold(deskewed_gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, block_size, C_VALUE)
167
+
168
+ # --- Step 2: Intelligent Noise Removal (Improved) ---
169
+ num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(binary, 8, cv2.CV_32S)
170
+ clean_binary = np.zeros_like(binary)
171
+
172
+ if num_labels > 1:
173
+ areas = stats[1:, cv2.CC_STAT_AREA] # Get all component areas, skip background (label 0)
174
+
175
+ # Handle edge case of empty 'areas' array
176
+ if len(areas) == 0:
177
+ clean_binary = binary
178
+ print("Warning: No components found after binarization.")
179
+ areas = np.array([0]) # Add a dummy value to prevent crashes
180
+
181
+ # --- 1. Calculate the DEFAULT CONSERVATIVE threshold ---
182
+ # This is your existing logic, which works well for *clean* lines.
183
+ p1 = np.percentile(areas, 1)
184
+ img_h, img_w = binary.shape
185
+ estimated_char_height = img_h * 0.7
186
+ estimated_min_letter_area = max(2, int(estimated_char_height * 0.2 * estimated_char_height * 0.15))
187
+
188
+ # This is the "safe" threshold that protects small letters on clean lines.
189
+ area_threshold = max(MIN_AREA_THRESHOLD, min(p1, estimated_min_letter_area))
190
+ print(f"Noise Removal: Initial conservative threshold: {area_threshold:.1f} (p1={p1:.1f}, est_min={estimated_min_letter_area:.1f})")
191
+
192
+ # --- 2. Find a "Noise-to-Text" Gap (to enable AGGRESSIVE mode) ---
193
+ sorted_areas = np.sort(areas)
194
+ has_clear_gap = False
195
+ aggressive_threshold = -1
196
+ area_before_gap = -1
197
+
198
+ if len(sorted_areas) > 10: # Need enough components to analyze
199
+ area_diffs = np.diff(sorted_areas)
200
+ if len(area_diffs) > 0:
201
+ # Use your "gap" logic: find a jump > 3x the 95th percentile jump
202
+ jump_threshold = np.percentile(area_diffs, 95)
203
+ significant_jump_thresh = max(10, jump_threshold * 3) # Add a 10px minimum jump
204
+
205
+ jump_indices = np.where(area_diffs > significant_jump_thresh)[0]
206
+
207
+ if len(jump_indices) > 0:
208
+ has_clear_gap = True
209
+ # This is the index of the *last noise component*
210
+ gap_idx = jump_indices[0]
211
+ area_before_gap = sorted_areas[gap_idx]
212
+
213
+ # The aggressive threshold is 1 pixel *larger* than the biggest noise component
214
+ aggressive_threshold = area_before_gap + 1
215
+
216
+ # --- 3. ADAPTIVE DECISION: Override if conservative threshold is clearly noise ---
217
+ if has_clear_gap:
218
+ print(f"Noise Removal: Gap detected. Noise cluster ends at {area_before_gap}px. Aggressive threshold = {aggressive_threshold:.1f}")
219
+
220
+ # THIS IS THE KEY:
221
+ # Only use the aggressive threshold IF our "safe" threshold is clearly
222
+ # stuck *inside* the noise cluster.
223
+ # e.g., Safe threshold = 1, but noise goes up to 10.
224
+ # (We use 0.8 as a buffer, so if thresh=7 and gap=8, we don't switch)
225
+ if area_threshold < (area_before_gap * 0.8):
226
+ print(f"Noise Removal: Conservative threshold ({area_threshold:.1f}) is deep in noise cluster (ends at {area_before_gap}px).")
227
+ print(f"Noise Removal: Switching to AGGRESSIVE threshold: {aggressive_threshold:.1f}")
228
+ area_threshold = aggressive_threshold
229
+ else:
230
+ print(f"Noise Removal: Gap found, but conservative threshold ({area_threshold:.1f}) is sufficient. Sticking with conservative.")
231
+
232
+ # --- 4. Apply the final, determined threshold ---
233
+ print(f"Noise Removal: Final area threshold: {area_threshold:.1f}")
234
+ for i in range(1, num_labels):
235
+ # Use >= to be inclusive of the threshold itself
236
+ if stats[i, cv2.CC_STAT_AREA] >= area_threshold:
237
+ clean_binary[labels == i] = 255
238
+ else:
239
+ # No components found, or only background
240
+ clean_binary = binary
241
+
242
+ # Calculate the horizontal projection profile on the cleaned image
243
+ horizontal_projection = np.sum(clean_binary, axis=1)
244
+
245
+ # Find the top and bottom boundaries of the text
246
+ non_zero_rows = np.where(horizontal_projection > 0)[0]
247
+ if len(non_zero_rows) > 0:
248
+ text_top = non_zero_rows[0]
249
+ text_bottom = non_zero_rows[-1]
250
+ text_height = text_bottom - text_top
251
+
252
+ # Define a percentage to trim off the top and bottom
253
+ # This is a tunable parameter. 15% is a good starting point.
254
+ trim_percentage = DEFAULT_TRIM_PERCENTAGE
255
+ trim_pixels = int(text_height * trim_percentage)
256
+
257
+ # Calculate new, tighter boundaries
258
+ y_start = text_top + trim_pixels
259
+ y_end = text_bottom - trim_pixels
260
+
261
+ # Ensure the crop is valid
262
+ if y_start < y_end:
263
+ print(f"Original text height: {text_height}px. Cropping to middle {100 - (2*trim_percentage*100):.0f}% region.")
264
+ # Slice the image to get the vertically cropped ROI
265
+ analysis_image = clean_binary[y_start:y_end, :]
266
+ else:
267
+ # If trimming would result in an empty image, use the full text region
268
+ analysis_image = clean_binary[text_top:text_bottom, :]
269
+ else:
270
+ # If no text is found, use the original cleaned image
271
+ analysis_image = clean_binary
272
+
273
+ # Save cropped image (optional, only if image_name is provided)
274
+ if SHOW_OUTPUT_IMAGES:
275
+ if image_name is not None:
276
+ os.makedirs(self.output_folder, exist_ok=True)
277
+ output_path = f'{self.output_folder}/paddle_visualisations/{image_name}_{shortened_line_text}_cropped_adaptive.png'
278
+ os.makedirs(f'{self.output_folder}/paddle_visualisations', exist_ok=True)
279
+ cv2.imwrite(output_path, analysis_image)
280
+ print(f"\nSaved cropped image to '{output_path}'")
281
+
282
+ # --- Step 3: Hierarchical Adaptive Search (using the CROPPED analysis_image) ---
+ words = line_data["text"][0].split()
+ target_word_count = len(words)
+
+ print(f"Target word count: {target_word_count}")
+
+ best_boxes = None
+ successful_binary_image = None
+ stage1_succeeded = False
296
+
297
+ print("--- Stage 1: Searching with adaptive valley threshold ---")
298
+ valley_factors_to_try = np.arange(INITIAL_VALLEY_THRESHOLD_FACTOR, 0.45, 0.05)
299
+ for v_factor in valley_factors_to_try:
300
+ # Pass the cropped image to the helper
301
+ unlabeled_boxes = self._get_boxes_from_profile(analysis_image, avg_char_width_approx, min_space_factor, v_factor)
302
+ if abs(target_word_count - len(unlabeled_boxes)) <= match_tolerance:
303
+ best_boxes = unlabeled_boxes
304
+ successful_binary_image = analysis_image
305
+ stage1_succeeded = True
306
+ break
307
+
308
+ if not stage1_succeeded:
309
+ print("\n--- Stage 1 failed. Starting Stage 2: Searching with adaptive kernel ---")
310
+ kernel_factors_to_try = np.arange(INITIAL_KERNEL_WIDTH_FACTOR, 0.5, 0.05)
311
+ fixed_valley_factor = MAIN_VALLEY_THRESHOLD_FACTOR
312
+ for k_factor in kernel_factors_to_try:
313
+ kernel_width = max(1, int(avg_char_width_approx * k_factor))
314
+ closing_kernel = np.ones((1, kernel_width), np.uint8)
315
+ # Apply closing on the original clean_binary, then crop it
316
+ closed_binary = cv2.morphologyEx(clean_binary, cv2.MORPH_CLOSE, closing_kernel)
317
+ # We need to re-apply the same vertical crop to this new image
318
+ if len(non_zero_rows) > 0 and y_start < y_end:
319
+ analysis_image = closed_binary[y_start:y_end, :]
320
+ else:
321
+ analysis_image = closed_binary
322
+
323
+ unlabeled_boxes = self._get_boxes_from_profile(analysis_image, avg_char_width_approx, min_space_factor, fixed_valley_factor)
324
+
325
+
326
+ print(f"Testing kernel factor {k_factor:.2f} ({kernel_width}px): Found {len(unlabeled_boxes)} boxes.")
327
+ if abs(target_word_count - len(unlabeled_boxes)) <= match_tolerance:
328
+ print(f"SUCCESS (Stage 2): Found a match.")
329
+ best_boxes = unlabeled_boxes
330
+ successful_binary_image = closed_binary # For Stage 2, the source is the closed_binary
331
+ break
332
+
333
+ final_output = None
334
+ used_fallback = False
335
+
336
+ if best_boxes is None:
337
+ print(f"\nWarning: All adaptive searches failed. Falling back.")
338
+ fallback_segmenter = HybridWordSegmenter()
339
+ used_fallback = True
340
+ final_output = fallback_segmenter.refine_words_bidirectional(line_data, deskewed_line_image)
341
+
342
+ else:
343
+ # --- CCA Refinement using the determined successful_binary_image ---
344
+ unlabeled_boxes = best_boxes
345
+ cca_source_image = successful_binary_image
346
+
347
+ # Pick the CCA source image based on which search stage succeeded
+ if stage1_succeeded:
+ cca_source_image = clean_binary
+ else: # Stage 2 succeeded: recreate the successful closed_binary for CCA
+ successful_k_factor = locals().get('k_factor')
+ if successful_k_factor is not None:
+ kernel_width = max(1, int(avg_char_width_approx * successful_k_factor))
+ closing_kernel = np.ones((1, kernel_width), np.uint8)
+ cca_source_image = cv2.morphologyEx(clean_binary, cv2.MORPH_CLOSE, closing_kernel)
+ else:
+ cca_source_image = clean_binary # Fallback
362
+
363
+ # --- Proceed with CCA Refinement ---
364
+ unlabeled_boxes = best_boxes
365
+ num_labels, _, stats, _ = cv2.connectedComponentsWithStats(cca_source_image, 8, cv2.CV_32S)
366
+
367
+ refined_boxes_list = []
368
+ num_to_process = min(len(words), len(unlabeled_boxes))
369
+ for i in range(num_to_process):
370
+ word_label = words[i]
371
+ box_x, _, box_w, _ = unlabeled_boxes[i]
372
+ box_r = box_x + box_w # Box right edge
373
+
374
+ components_in_box = []
375
+ for j in range(1, num_labels): # Skip background
376
+ comp_x = stats[j, cv2.CC_STAT_LEFT]
377
+ comp_w = stats[j, cv2.CC_STAT_WIDTH]
378
+ comp_r = comp_x + comp_w # Component right edge
379
+
380
+ if comp_x < box_r and box_x < comp_r:
381
+ components_in_box.append(stats[j])
382
+
383
+ if not components_in_box: continue
384
+
385
+ # The rest of the CCA union logic is unchanged
386
+ min_x = min(c[cv2.CC_STAT_LEFT] for c in components_in_box)
387
+ min_y = min(c[cv2.CC_STAT_TOP] for c in components_in_box)
388
+ max_r = max(c[cv2.CC_STAT_LEFT] + c[cv2.CC_STAT_WIDTH] for c in components_in_box)
389
+ max_b = max(c[cv2.CC_STAT_TOP] + c[cv2.CC_STAT_HEIGHT] for c in components_in_box)
390
+
391
+ refined_boxes_list.append({
392
+ "text": word_label, "left": min_x, "top": min_y, "width": max_r - min_x, "height": max_b - min_y, "conf": line_data["conf"][0],
393
+ })
394
+
395
+ # Convert to dict format
396
+ final_output = {k: [] for k in ["text", "left", "top", "width", "height", "conf"]}
397
+ for box in refined_boxes_list:
398
+ for key in final_output.keys():
399
+ final_output[key].append(box[key])
400
+
401
+ # --- TRANSFORM COORDINATES BACK ---
402
+
403
+ # Get the inverse transformation matrix
404
+ M_inv = cv2.invertAffineTransform(M)
405
+
406
+ # Create a new list for the re-mapped boxes
407
+ remapped_boxes_list = []
408
+
409
+ # Iterate through the boxes found on the deskewed image
410
+ for i in range(len(final_output["text"])):
411
+ # Get the box coordinates from the deskewed image
412
+ l, t = final_output["left"][i], final_output["top"][i]
413
+ w, h = final_output["width"][i], final_output["height"][i]
414
+
415
+ # Define the 4 corners of this box
416
+ # Use float for accurate transformation
417
+ corners = np.array([
418
+ [l, t],
419
+ [l + w, t],
420
+ [l + w, t + h],
421
+ [l, t + h]
422
+ ], dtype="float32")
423
+
424
+ # Add a '1' to each coordinate for the 2x3 affine matrix
425
+ # shape (4, 1, 2)
426
+ corners_expanded = np.expand_dims(corners, axis=1)
427
+
428
+ # Apply the inverse transformation
429
+ # shape (4, 1, 2)
430
+ original_corners = cv2.transform(corners_expanded, M_inv)
431
+
432
+ # Find the new axis-aligned bounding box in the original image
433
+ # original_corners is now [[ [x1,y1] ], [ [x2,y2] ], ...]
434
+ # We need to squeeze it to get [ [x1,y1], [x2,y2], ...]
435
+ squeezed_corners = original_corners.squeeze(axis=1)
436
+
437
+ # Find the min/max x and y
438
+ min_x = int(np.min(squeezed_corners[:, 0]))
439
+ max_x = int(np.max(squeezed_corners[:, 0]))
440
+ min_y = int(np.min(squeezed_corners[:, 1]))
441
+ max_y = int(np.max(squeezed_corners[:, 1]))
442
+
443
+ # Create the re-mapped box
444
+ remapped_box = {
445
+ "text": final_output["text"][i],
446
+ "left": min_x,
447
+ "top": min_y,
448
+ "width": max_x - min_x,
449
+ "height": max_y - min_y,
450
+ "conf": final_output["conf"][i]
451
+ }
452
+ remapped_boxes_list.append(remapped_box)
453
+
454
+ # Convert the remapped list back to the dictionary format
455
+ remapped_output = {k: [] for k in final_output.keys()}
456
+ for box in remapped_boxes_list:
457
+ for key in remapped_output.keys():
458
+ remapped_output[key].append(box[key])
459
+
460
+ if SHOW_OUTPUT_IMAGES:
461
+ # Visualisation
462
+ output_image_vis = deskewed_line_image.copy()
463
+ print(f"\nFinal refined {len(remapped_output['text'])} words:")
464
+ for i in range(len(remapped_output['text'])):
465
+ word = remapped_output['text'][i]
466
+ x, y, w, h = (
467
+ int(remapped_output['left'][i]), int(remapped_output['top'][i]),
468
+ int(remapped_output['width'][i]), int(remapped_output['height'][i])
469
+ )
470
+ print(f"- '{word}' at ({x}, {y}, {w}, {h})")
471
+ cv2.rectangle(output_image_vis, (x, y), (x + w, y + h), (0, 255, 0), 2)
472
+
473
+ output_path = f'{self.output_folder}/paddle_visualisations/{image_name}_{shortened_line_text}_refined_adaptive.png'
474
+ os.makedirs(f'{self.output_folder}/paddle_visualisations', exist_ok=True)
475
+ cv2.imwrite(output_path, output_image_vis)
476
+ print(f"\nSaved visualisation to '{output_path}'")
477
+
478
+ return remapped_output, used_fallback
479
+
480
+ class HybridWordSegmenter:
481
+ """
482
+ Implements a two-step approach for word segmentation:
483
+ 1. Proportional estimation based on text.
484
+ 2. Image-based refinement with a "Bounded Scan" to prevent
485
+ over-correction.
486
+ """
487
+
488
+ def _convert_line_to_word_level_improved(
489
+ self, line_data: Dict[str, List], image_width: int, image_height: int
490
+ ) -> Dict[str, List]:
491
+ """
492
+ Step 1: Converts line-level OCR results to word-level by using a
493
+ robust proportional estimation method.
494
+ (This function is unchanged from the previous version)
495
+ """
496
+ output = {
497
+ "text": list(), "left": list(), "top": list(), "width": list(),
498
+ "height": list(), "conf": list(),
499
+ }
500
+
501
+ if not line_data or not line_data.get("text"):
502
+ return output
503
+
504
+ i = 0 # Assuming a single line
505
+ line_text = line_data["text"][i]
506
+ line_left = float(line_data["left"][i])
507
+ line_top = float(line_data["top"][i])
508
+ line_width = float(line_data["width"][i])
509
+ line_height = float(line_data["height"][i])
510
+ line_conf = line_data["conf"][i]
511
+
512
+ if not line_text.strip(): return output
513
+ words = line_text.split()
514
+ if not words: return output
515
+ num_chars = len("".join(words))
516
+ num_spaces = len(words) - 1
517
+ if num_chars == 0: return output
518
+
519
+ if (num_chars * 2 + num_spaces) > 0:
520
+ char_space_ratio = 2.0
521
+ estimated_space_width = line_width / (num_chars * char_space_ratio + num_spaces)
522
+ avg_char_width = estimated_space_width * char_space_ratio
523
+ else:
524
+ avg_char_width = line_width / (num_chars if num_chars > 0 else 1)
525
+ estimated_space_width = avg_char_width
526
+
527
+ current_left = line_left
528
+ for word in words:
529
+ word_width = len(word) * avg_char_width
530
+ clamped_left = max(0, min(current_left, image_width))
531
+ clamped_width = max(0, min(word_width, image_width - clamped_left))
532
+ output["text"].append(word)
533
+ output["left"].append(clamped_left)
534
+ output["top"].append(line_top)
535
+ output["width"].append(clamped_width)
536
+ output["height"].append(line_height)
537
+ output["conf"].append(line_conf)
538
+ current_left += word_width + estimated_space_width
539
+ return output
540
+
541
+ def _run_single_pass(
542
+ self,
543
+ initial_boxes: List[Dict],
544
+ vertical_projection: np.ndarray,
545
+ max_scan_distance: int,
546
+ img_w: int,
547
+ direction: str = 'ltr'
548
+ ) -> List[Dict]:
549
+ """Helper function to run one pass of refinement (either LTR or RTL)."""
550
+
551
+ refined_boxes = [box.copy() for box in initial_boxes]
552
+
553
+ if direction == 'ltr':
554
+ last_corrected_right_edge = 0
555
+ indices = range(len(refined_boxes))
556
+ else: # rtl
557
+ next_corrected_left_edge = img_w
558
+ indices = range(len(refined_boxes) - 1, -1, -1)
559
+
560
+ for i in indices:
561
+ box = refined_boxes[i]
562
+ left = int(box['left'])
563
+ right = int(box['left'] + box['width'])
564
+
565
+ left = max(0, min(left, img_w - 1))
566
+ right = max(0, min(right, img_w - 1))
567
+
568
+ new_left, new_right = left, right
569
+
570
+ # Bounded Scan (logic is the same for both directions)
571
+ if right < img_w and vertical_projection[right] > 0:
572
+ scan_limit = min(img_w, right + max_scan_distance)
573
+ for x in range(right + 1, scan_limit):
574
+ if vertical_projection[x] == 0:
575
+ new_right = x
576
+ break
577
+
578
+ if left > 0 and vertical_projection[left] > 0:
579
+ scan_limit = max(0, left - max_scan_distance)
580
+ for x in range(left - 1, scan_limit, -1):
581
+ if vertical_projection[x] == 0:
582
+ new_left = x
583
+ break
584
+
585
+ # Directional De-overlapping
586
+ if direction == 'ltr':
587
+ if new_left < last_corrected_right_edge:
588
+ new_left = last_corrected_right_edge
589
+ last_corrected_right_edge = max(last_corrected_right_edge, new_right)
590
+ else: # rtl
591
+ if new_right > next_corrected_left_edge:
592
+ new_right = next_corrected_left_edge
593
+ next_corrected_left_edge = min(next_corrected_left_edge, new_left)
594
+
595
+ box['left'] = new_left
596
+ box['width'] = max(1, new_right - new_left)
597
+
598
+ return refined_boxes
599
+
600
+ def refine_words_bidirectional(
601
+ self,
602
+ line_data: Dict[str, List],
603
+ line_image: np.ndarray,
604
+ ) -> Dict[str, List]:
605
+ """
606
+ Refines boxes using a more robust bidirectional scan and averaging.
607
+ """
608
+ if line_image is None: return line_data
609
+
610
+ gray = cv2.cvtColor(line_image, cv2.COLOR_BGR2GRAY)
611
+ _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
612
+ img_h, img_w = binary.shape
613
+ vertical_projection = np.sum(binary, axis=0)
614
+
615
+ char_blobs = []
616
+ in_blob = False; blob_start = 0
617
+ for x, col_sum in enumerate(vertical_projection):
618
+ if col_sum > 0 and not in_blob: blob_start = x; in_blob = True
619
+ elif col_sum == 0 and in_blob: char_blobs.append((blob_start, x)); in_blob = False
620
+ if in_blob: char_blobs.append((blob_start, img_w))
621
+
622
+ if not char_blobs:
623
+ return self._convert_line_to_word_level_improved(line_data, img_w, img_h)
624
+
625
+ avg_char_width = np.mean([end - start for start, end in char_blobs])
626
+ max_scan_distance = int(avg_char_width * 1.5)
627
+
628
+ estimated_data = self._convert_line_to_word_level_improved(line_data, img_w, img_h)
629
+ if not estimated_data["text"]: return estimated_data
630
+
631
+ initial_boxes = []
632
+ for i in range(len(estimated_data["text"])):
633
+ initial_boxes.append({
634
+ "text": estimated_data["text"][i], "left": estimated_data["left"][i],
635
+ "top": estimated_data["top"][i], "width": estimated_data["width"][i],
636
+ "height": estimated_data["height"][i], "conf": estimated_data["conf"][i],
637
+ })
638
+
639
+ # 1. & 2. Perform both passes
640
+ ltr_boxes = self._run_single_pass(initial_boxes, vertical_projection, max_scan_distance, img_w, 'ltr')
641
+ rtl_boxes = self._run_single_pass(initial_boxes, vertical_projection, max_scan_distance, img_w, 'rtl')
642
+
643
+ # 3. Average the results
644
+ averaged_boxes = [box.copy() for box in initial_boxes]
645
+ for i in range(len(averaged_boxes)):
646
+ ltr_right = ltr_boxes[i]['left'] + ltr_boxes[i]['width']
647
+ rtl_right = rtl_boxes[i]['left'] + rtl_boxes[i]['width']
648
+
649
+ avg_left = (ltr_boxes[i]['left'] + rtl_boxes[i]['left']) / 2
650
+ avg_right = (ltr_right + rtl_right) / 2
651
+
652
+ averaged_boxes[i]['left'] = int(avg_left)
653
+ averaged_boxes[i]['width'] = int(avg_right - avg_left)
654
+
655
+ # 4. Final De-overlap Pass
656
+ last_corrected_right_edge = 0
657
+ for i, box in enumerate(averaged_boxes):
658
+ if box['left'] < last_corrected_right_edge:
659
+ box['width'] = max(1, box['width'] - (last_corrected_right_edge - box['left']))
660
+ box['left'] = last_corrected_right_edge
661
+
662
+ if box['width'] < 1:
663
+ # Handle edge case where a box is completely eliminated
664
+ if i < len(averaged_boxes) - 1:
665
+ next_left = averaged_boxes[i+1]['left']
666
+ box['width'] = max(1, next_left - box['left'])
667
+ else:
668
+ box['width'] = 1
669
+
670
+ last_corrected_right_edge = box['left'] + box['width']
671
+
672
+ # Convert back to Tesseract-style output dict
673
+ final_output = {k: [] for k in estimated_data.keys()}
674
+ for box in averaged_boxes:
675
+ if box['width'] > 0: # Ensure we don't add zero-width boxes
676
+ for key in final_output.keys():
677
+ final_output[key].append(box[key])
678
+
679
+ return final_output
680
+
681
+ def refine_words_with_image(
682
+
683
+ self,
684
+
685
+ line_data: Dict[str, List],
686
+
687
+ line_image: np.ndarray,
688
+
689
+ ) -> Dict[str, List]:
690
+
691
+ """
692
+
693
+ Step 2: Refines the estimated boxes using image data and a
694
+
695
+ "Bounded Scan" to avoid aggressive corrections.
696
+
697
+ """
698
+
699
+
700
+
701
+ # --- 2a. Get Binarized Image and Projection Profile ---
702
+
703
+ if line_image is None:
704
+
705
+ print("Error: Invalid image passed.")
706
+
707
+ return line_data
708
+
709
+
710
+
711
+ gray = cv2.cvtColor(line_image, cv2.COLOR_BGR2GRAY)
712
+
713
+ _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
714
+
715
+ img_h, img_w = binary.shape
716
+
717
+ vertical_projection = np.sum(binary, axis=0)
718
+
719
+ # --- Calculate Avg. Char Width and Max Scan Distance ---
720
+
721
+ char_blobs = []
722
+
723
+ in_blob = False
724
+
725
+ blob_start = 0
726
+
727
+ for x, col_sum in enumerate(vertical_projection):
728
+
729
+ if col_sum > 0 and not in_blob:
730
+
731
+ blob_start = x
732
+
733
+ in_blob = True
734
+
735
+ elif col_sum == 0 and in_blob:
736
+
737
+ char_blobs.append((blob_start, x))
738
+
739
+ in_blob = False
740
+
741
+ if in_blob:
742
+
743
+ char_blobs.append((blob_start, img_w))
744
+
745
+
746
+
747
+ if not char_blobs:
748
+
749
+ print("No text detected in image for refinement.")
750
+
751
+ return self._convert_line_to_word_level_improved(line_data, img_w, img_h)
752
+
753
+
754
+ avg_char_width = np.mean([end - start for start, end in char_blobs])
755
+
756
+
757
+ # This is our "horizontal buffer". We won't scan further than this.
758
+
759
+ # We use 1.5 as a heuristic, you can tune this.
760
+
761
+ max_scan_distance = int(avg_char_width * 1.5)
762
+
763
+ print(f"Calculated avg char width: {avg_char_width:.2f}px. Max scan: {max_scan_distance}px")
764
+
765
+
766
+
767
+ # --- 2b. Get Initial "Rough Draft" Estimates ---
768
+
769
+ estimated_data = self._convert_line_to_word_level_improved(line_data, img_w, img_h)
770
+
771
+
772
+
773
+ if not estimated_data["text"]:
774
+
775
+ return estimated_data
776
+
777
+
778
+
779
+ initial_boxes = []
780
+
781
+ for i in range(len(estimated_data["text"])):
782
+
783
+ initial_boxes.append({
784
+
785
+ "text": estimated_data["text"][i],
786
+
787
+ "left": estimated_data["left"][i],
788
+
789
+ "top": estimated_data["top"][i],
790
+
791
+ "width": estimated_data["width"][i],
792
+
793
+ "height": estimated_data["height"][i],
794
+
795
+ "conf": estimated_data["conf"][i],
796
+
797
+ })
798
+
799
+ # --- 2c. Iterate, Refine, and De-overlap (Now with Bounded Scan) ---
800
+
801
+ refined_boxes_list = []
802
+
803
+ last_corrected_right_edge = 0
804
+
805
+
806
+
807
+ for i, box in enumerate(initial_boxes):
808
+
809
+ left = int(box['left'])
810
+
811
+ right = int(box['left'] + box['width'])
812
+
813
+
814
+
815
+ left = max(0, min(left, img_w - 1))
816
+
817
+ right = max(0, min(right, img_w - 1))
818
+
819
+
820
+
821
+ new_left = left
822
+
823
+ new_right = right
824
+
825
+
826
+
827
+ # **Check Right Boundary (Bounded Scan)**
828
+
829
+ if right < img_w and vertical_projection[right] > 0:
830
+
831
+ scan_limit = min(img_w, right + max_scan_distance) # Don't scan past buffer
832
+
833
+ for x in range(right + 1, scan_limit):
834
+
835
+ if vertical_projection[x] == 0:
836
+
837
+ new_right = x # Found clear space *within* buffer
838
+
839
+ break
840
+
841
+ # If loop finishes without break, new_right is unchanged
842
+ # (i.e., we give up and keep the original estimate)
843
+
844
+ # **Check Left Boundary (Bounded Scan)**
845
+
846
+ if left > 0 and vertical_projection[left] > 0:
847
+
848
+ scan_limit = max(0, left - max_scan_distance) # Don't scan past buffer
849
+
850
+ for x in range(left - 1, scan_limit, -1):
851
+
852
+ if vertical_projection[x] == 0:
853
+
854
+ new_left = x # Found clear space *within* buffer
855
+
856
+ break
857
+
858
+ # If loop finishes without break, new_left is unchanged
859
+
860
+
861
+
862
+ # **De-overlapping Logic:** (Unchanged)
863
+
864
+ if new_left < last_corrected_right_edge:
865
+
866
+ new_left = last_corrected_right_edge
867
+
868
+
869
+
870
+ # **Validity Check:** (Unchanged)
871
+
872
+ new_width = new_right - new_left
873
+
874
+ if new_width > 1:
875
+
876
+ box['left'] = new_left
877
+
878
+ box['width'] = new_width
879
+
880
+ refined_boxes_list.append(box)
881
+
882
+ last_corrected_right_edge = new_right
883
+
884
+ elif i > 0:
885
+
886
+ # If the box has collapsed (e.g., fully overlapped),
887
+
888
+ # try to give it a 1px space just to keep it from disappearing.
889
+
890
+ # This is an edge case.
891
+
892
+ new_left = last_corrected_right_edge + 1
893
+
894
+ new_right = new_left + 1
895
+
896
+ if new_right < img_w:
897
+
898
+ box['left'] = new_left
899
+
900
+ box['width'] = 1
901
+
902
+ refined_boxes_list.append(box)
903
+
904
+ last_corrected_right_edge = new_right
905
+
906
+
907
+
908
+
909
+
910
+ # --- 2d. Convert back to Tesseract-style output dict ---
911
+
912
+ final_output = {k: [] for k in estimated_data.keys()}
913
+
914
+ for box in refined_boxes_list:
915
+
916
+ for key in final_output.keys():
917
+
918
+ final_output[key].append(box[key])
919
+
920
+
921
+
922
+ return final_output
923
+
924
+ # --- Example Usage ---
925
+ if __name__ == '__main__':
926
+ # Make sure you have the previous class available to import for the fallback
927
+ #image_path = 'inputs/example_partnership_p6_1.PNG'
928
+ #image_path = 'inputs/example_partnership_p6_2.PNG'
929
+ #image_path = 'inputs/example_partnership_p4_1.PNG'
930
+ image_path = 'inputs/line_image_3.png'
931
+ image_basename = os.path.basename(image_path)
932
+ image_name = os.path.splitext(image_basename)[0]
933
+ output_path = f'outputs/{image_name}_refined_morph.png'
934
+ if not os.path.exists("outputs"):
935
+ os.makedirs("outputs")
936
+ line_image_cv = cv2.imread(image_path)
937
+ h, w, _ = line_image_cv.shape
938
+
939
+ # Read in related text
940
+ with open(f'inputs/{image_name}_text.txt', 'r') as file:
941
+ text = file.read()
942
+ line_data = {
943
+ "text": [text],
944
+ "left": [0], "top": [0], "width": [w], "height": [h], "conf": [95.0]
945
+ }
946
+ segmenter = AdaptiveSegmenter()
947
+ final_word_data, used_fallback = segmenter.segment(line_data, line_image_cv)
948
+
949
+ # Visualisation
950
+ output_image_vis = line_image_cv.copy()
951
+ print(f"\nFinal refined {len(final_word_data['text'])} words:")
952
+ for i in range(len(final_word_data['text'])):
953
+ word = final_word_data['text'][i]
954
+ x, y, w, h = (
955
+ int(final_word_data['left'][i]), int(final_word_data['top'][i]),
956
+ int(final_word_data['width'][i]), int(final_word_data['height'][i])
957
+ )
958
+ print(f"- '{word}' at ({x}, {y}, {w}, {h})")
959
+ cv2.rectangle(output_image_vis, (x, y), (x + w, y + h), (0, 255, 0), 2)
960
+
961
+ cv2.imwrite(output_path, output_image_vis)
962
+ print(f"\nSaved visualisation to '{output_path}'")
963
+
964
+ # You can also use matplotlib to display it in a notebook
965
+ import matplotlib.pyplot as plt
966
+ plt.figure(figsize=(10, 5))
967
+ plt.imshow(cv2.cvtColor(output_image_vis, cv2.COLOR_BGR2RGB))
968
+
969
+ if used_fallback:
970
+ plt.title("Refined with Bounded Scan")
971
+ else:
972
+ plt.title("Refined with Morphological Closing")
973
+ plt.axis('off')
974
+ plt.show()
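
The heart of AdaptiveSegmenter._get_boxes_from_profile is a vertical projection profile: sum the binary line image column-wise, treat low columns as valleys, patch valleys narrower than a fraction of the average character width (so gaps between letters stay joined), and read off the remaining above-threshold runs as word boxes. A minimal, self-contained sketch of that idea on a synthetic line image (thresholds simplified from the module constants):

```python
# Sketch only: projection-profile word splitting on a synthetic binary line image.
# The factors mirror MIN_SPACE_FACTOR and MAIN_VALLEY_THRESHOLD_FACTOR but are simplified.
import numpy as np

def boxes_from_projection(binary: np.ndarray, avg_char_width: float,
                          min_space_factor: float = 0.3,
                          valley_threshold_factor: float = 0.15) -> list:
    """Return (x, width) spans for word candidates in a binary (0/255) line image."""
    projection = binary.sum(axis=0)
    peaks = projection[projection > 0]
    if peaks.size == 0:
        return []
    valley_threshold = peaks.mean() * valley_threshold_factor
    min_space = int(avg_char_width * min_space_factor)

    # Patch valleys narrower than a real inter-word space so letters stay joined.
    patched = projection.copy().astype(float)
    in_gap, gap_start = False, 0
    for x, col in enumerate(patched):
        if col <= valley_threshold and not in_gap:
            in_gap, gap_start = True, x
        elif col > valley_threshold and in_gap:
            in_gap = False
            if x - gap_start < min_space:
                patched[gap_start:x] = peaks.mean()

    # Runs above the threshold are word candidates.
    boxes, in_word, start = [], False, 0
    for x, col in enumerate(patched):
        if col > valley_threshold and not in_word:
            start, in_word = x, True
        elif col <= valley_threshold and in_word:
            boxes.append((start, x - start))
            in_word = False
    if in_word:
        boxes.append((start, patched.size - start))
    return boxes

# Two "words" of ink separated by a wide gap, with a narrow (patched) gap inside word 1.
line = np.zeros((10, 60), dtype=np.uint8)
line[:, 5:15] = 255   # word 1, first half
line[:, 16:25] = 255  # word 1, second half (the 1-px gap gets patched)
line[:, 40:55] = 255  # word 2
print(boxes_from_projection(line, avg_char_width=8.0))  # expect two (x, width) spans
```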