seanpedrickcase committed
Commit ad8fef5 · 1 Parent(s): fcd55d0

Fixed duplicate page argument mismatch. Re-added Windows tests. Added refresh token options to CDK. Package updates.

.github/workflows/multi-os-test.yml CHANGED
@@ -15,12 +15,12 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest, macos-latest] # windows-latest removed for now as I have not been able to install tesseract on Windows using this method
+        os: [ubuntu-latest, windows-latest, macos-latest]
         python-version: ["3.11", "3.12", "3.13"]
         exclude:
           # Exclude some combinations to reduce CI time
-          #- os: windows-latest
-          #  python-version: "3.10"
+          - os: windows-latest
+            python-version: "3.11"
           - os: macos-latest
             python-version: "3.11"
 
@@ -51,8 +51,37 @@ jobs:
       - name: Install system dependencies (Windows)
         if: matrix.os == 'windows-latest'
         run: |
-          # Windows dependencies are handled by the Python packages
-          echo "Windows system dependencies handled by Python packages"
+          # Create tools directory
+          mkdir C:\tools
+
+          # Download and install Tesseract
+          $tesseractUrl = "https://github.com/UB-Mannheim/tesseract/releases/download/v5.3.3.20231005/tesseract-ocr-w64-setup-5.3.3.20231005.exe"
+          $tesseractInstaller = "C:\tools\tesseract-installer.exe"
+          Invoke-WebRequest -Uri $tesseractUrl -OutFile $tesseractInstaller
+
+          # Install Tesseract silently
+          Start-Process -FilePath $tesseractInstaller -ArgumentList "/S", "/D=C:\tools\tesseract" -Wait
+
+          # Download and extract Poppler
+          $popplerUrl = "https://github.com/oschwartz10612/poppler-windows/releases/download/v24.02.0-0/Release-24.02.0-0.zip"
+          $popplerZip = "C:\tools\poppler.zip"
+          Invoke-WebRequest -Uri $popplerUrl -OutFile $popplerZip
+
+          # Extract Poppler
+          Expand-Archive -Path $popplerZip -DestinationPath C:\tools\poppler -Force
+
+          # Add to PATH
+          echo "C:\tools\tesseract" >> $env:GITHUB_PATH
+          echo "C:\tools\poppler\poppler-24.02.0\Library\bin" >> $env:GITHUB_PATH
+
+          # Set environment variables for your application
+          echo "TESSERACT_FOLDER=C:\tools\tesseract" >> $env:GITHUB_ENV
+          echo "POPPLER_FOLDER=C:\tools\poppler\poppler-24.02.0\Library\bin" >> $env:GITHUB_ENV
+          echo "TESSERACT_DATA_FOLDER=C:\tools\tesseract\tessdata" >> $env:GITHUB_ENV
+
+          # Verify installation
+          tesseract --version
+          pdftoppm -v
 
       - name: Install Python dependencies
         run: |
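
The Windows step exports TESSERACT_FOLDER, POPPLER_FOLDER and TESSERACT_DATA_FOLDER via GITHUB_ENV, which only matters if the Python code reads them back. A minimal sketch of how a test or helper might consume those variables, assuming pytesseract and pdf2image are the consumers (the repo's actual config handling may differ, and example.pdf is a placeholder):

    import os

    import pytesseract
    from pdf2image import convert_from_path

    # Point pytesseract at the Tesseract binary installed by the CI step;
    # on Linux/macOS the variable is unset and the PATH default is used.
    tesseract_folder = os.environ.get("TESSERACT_FOLDER", "")
    if tesseract_folder:
        pytesseract.pytesseract.tesseract_cmd = os.path.join(
            tesseract_folder, "tesseract.exe"
        )

    # pdf2image only needs poppler_path when Poppler is not already on PATH.
    poppler_folder = os.environ.get("POPPLER_FOLDER", "") or None
    pages = convert_from_path("example.pdf", dpi=300, poppler_path=poppler_folder)
    print(pytesseract.image_to_string(pages[0]))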
cdk/cdk_config.py CHANGED
@@ -219,6 +219,16 @@ COGNITO_USER_POOL_DOMAIN_PREFIX = get_or_create_env_var(
     "COGNITO_USER_POOL_DOMAIN_PREFIX", "redaction-app-domain"
 ) # Should change this to something unique or you'll probably hit an error
 
+COGNITO_REFRESH_TOKEN_VALIDITY = int(
+    get_or_create_env_var("COGNITO_REFRESH_TOKEN_VALIDITY", "480")
+) # Minutes
+COGNITO_ID_TOKEN_VALIDITY = int(
+    get_or_create_env_var("COGNITO_ID_TOKEN_VALIDITY", "60")
+) # Minutes
+COGNITO_ACCESS_TOKEN_VALIDITY = int(
+    get_or_create_env_var("COGNITO_ACCESS_TOKEN_VALIDITY", "60")
+) # Minutes
+
 # Application load balancer
 ALB_NAME = get_or_create_env_var(
     "ALB_NAME", f"{CDK_PREFIX}Alb"[-32:]
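
The three new token-validity settings reuse the existing get_or_create_env_var pattern, so they can be overridden from the environment before synthesis. A rough sketch of the assumed helper behaviour and an override, for illustration only (the real helper in cdk_config.py may behave differently, e.g. by also persisting the value):

    import os

    def get_or_create_env_var(var_name: str, default_value: str) -> str:
        # Assumed behaviour: use the environment value if present,
        # otherwise fall back to (and register) the supplied default.
        value = os.environ.get(var_name, default_value)
        os.environ[var_name] = value
        return value

    # Override example: a 7-day refresh token window (values are in minutes)
    os.environ["COGNITO_REFRESH_TOKEN_VALIDITY"] = str(7 * 24 * 60)
    COGNITO_REFRESH_TOKEN_VALIDITY = int(
        get_or_create_env_var("COGNITO_REFRESH_TOKEN_VALIDITY", "480")
    )  # 10080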
cdk/cdk_stack.py CHANGED
@@ -39,7 +39,10 @@ from cdk_config import (
     CLUSTER_NAME,
     CODEBUILD_PROJECT_NAME,
     CODEBUILD_ROLE_NAME,
+    COGNITO_ACCESS_TOKEN_VALIDITY,
+    COGNITO_ID_TOKEN_VALIDITY,
     COGNITO_REDIRECTION_URL,
+    COGNITO_REFRESH_TOKEN_VALIDITY,
     COGNITO_USER_POOL_CLIENT_NAME,
     COGNITO_USER_POOL_CLIENT_SECRET_NAME,
     COGNITO_USER_POOL_DOMAIN_PREFIX,

@@ -1161,6 +1164,13 @@ class CdkStack(Stack):
                 ],
                 callback_urls=redirect_uris,
             ),
+            refresh_token_validity=Duration.minutes(
+                COGNITO_REFRESH_TOKEN_VALIDITY
+            ),
+            id_token_validity=Duration.minutes(COGNITO_ID_TOKEN_VALIDITY),
+            access_token_validity=Duration.minutes(
+                COGNITO_ACCESS_TOKEN_VALIDITY
+            ),
         )
 
         CfnOutput(
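
Cognito enforces bounds on these lifetimes (roughly 5 minutes to 1 day for access and ID tokens, and 60 minutes to 10 years for refresh tokens), so the 480/60/60-minute defaults sit well inside the allowed ranges. A self-contained sketch of the same Duration-based configuration on a user pool client; the construct IDs are illustrative, not the stack's real ones:

    from aws_cdk import Duration, Stack
    from aws_cdk import aws_cognito as cognito
    from constructs import Construct

    class ExampleAuthStack(Stack):
        def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
            super().__init__(scope, construct_id, **kwargs)

            user_pool = cognito.UserPool(self, "ExampleUserPool")

            # Token lifetimes mirror the new cdk_config.py defaults (in minutes)
            user_pool.add_client(
                "ExampleAppClient",
                refresh_token_validity=Duration.minutes(480),
                id_token_validity=Duration.minutes(60),
                access_token_validity=Duration.minutes(60),
            )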
pyproject.toml CHANGED
@@ -12,33 +12,33 @@ requires-python = ">=3.10"
 dependencies = [
     "pdfminer.six==20250506",
     "pdf2image==1.17.0",
-    "pymupdf==1.26.3",
+    "pymupdf==1.26.4",
     "opencv-python==4.12.0.88",
-    "presidio_analyzer==2.2.359",
-    "presidio_anonymizer==2.2.359",
+    "presidio_analyzer==2.2.360",
+    "presidio_anonymizer==2.2.360",
     "presidio-image-redactor==0.0.57",
-    "pikepdf==9.10.2",
-    "pandas==2.3.1",
-    "scikit-learn==1.7.1",
+    "pikepdf==9.11.0",
+    "pandas==2.3.2",
+    "scikit-learn==1.7.2",
     "spacy==3.8.7",
     # Direct URL dependency for spacy model
     "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
-    "gradio==5.46.1",
-    "boto3==1.40.31",
+    "gradio==5.47.0",
+    "boto3==1.40.37",
     "pyarrow==21.0.0",
     "openpyxl==3.1.5",
-    "Faker==37.5.3",
+    "Faker==37.8.0",
     "python-levenshtein==0.27.1",
     "spaczz==0.6.1",
     # Direct URL dependency for gradio_image_annotator wheel
     "gradio_image_annotation @ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl",
-    "rapidfuzz==3.13.0",
+    "rapidfuzz==3.14.1",
     "python-dotenv==1.0.1",
     "awslambdaric==3.1.1",
     "python-docx==1.2.0",
-    "paddlepaddle==3.1.0",
-    "paddleocr==3.1.1",
     "polars==1.33.1"
+    #"paddlepaddle==3.2.0", # Optional paddle imports - only if you want to use hybrid OCR mode with tesseract and paddleOCR
+    #"paddleocr==3.2.0"
 ]
 
 [project.urls]
requirements.txt CHANGED
@@ -1,32 +1,31 @@
 pdfminer.six==20250506
 pdf2image==1.17.0
-pymupdf==1.26.3
+pymupdf==1.26.4
 opencv-python==4.12.0.88
-presidio_analyzer==2.2.359
-presidio_anonymizer==2.2.359
+presidio_analyzer==2.2.360
+presidio_anonymizer==2.2.360
 presidio-image-redactor==0.0.57
-pikepdf==9.10.2
+pikepdf==9.11.0
 pandas==2.3.2
-scikit-learn==1.7.1
+scikit-learn==1.7.2
 spacy==3.8.7
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
-gradio==5.46.1
+gradio==5.47.0
 polars==1.33.1
-boto3==1.40.31
+boto3==1.40.37
 pyarrow==21.0.0
 openpyxl==3.1.5
-Faker==37.5.3
+Faker==37.8.0
 python-levenshtein==0.27.1
 spaczz==0.6.1
-# The following version
 https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
-rapidfuzz==3.13.0
+rapidfuzz==3.14.1
 python-dotenv==1.0.1
 awslambdaric==3.1.1
 python-docx==1.2.0
 # Optional: uncomment the below to install paddleOCR if you want to use hybrid text extraction (tesseract plus paddleocr)
-# paddlepaddle==3.1.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
-# paddleocr==3.1.1
+# paddlepaddle==3.2.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
+# paddleocr==3.2.0
 
 # Test dependencies
 pytest>=7.0.0
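
With paddlepaddle and paddleocr now optional in both pyproject.toml and requirements.txt, any code that offers the hybrid OCR mode presumably has to tolerate their absence. A minimal sketch of such an import guard, assuming the fallback is Tesseract-only extraction (the names here are illustrative, not the repo's actual modules):

    # Optional PaddleOCR support: degrade gracefully when it is not installed.
    try:
        from paddleocr import PaddleOCR
        PADDLE_AVAILABLE = True
    except ImportError:
        PaddleOCR = None
        PADDLE_AVAILABLE = False

    def make_ocr_engine(lang: str = "en"):
        """Return a PaddleOCR engine when available, otherwise None so the
        caller can fall back to Tesseract-only extraction."""
        return PaddleOCR(lang=lang) if PADDLE_AVAILABLE else None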
tools/find_duplicate_pages.py CHANGED
@@ -36,7 +36,7 @@ def split_text_with_punctuation(text: str) -> List[str]:
     # 2. OR a sequence of one or more characters that are NOT punctuation or whitespace `[^.,?!:;\s]+`
     pattern = re.compile(r"([.,?!:;]+|[^.,?!:;\s]+)")
 
-    final_list = []
+    final_list = list()
     # We first split by whitespace to handle sentences correctly
     for word in text.split():
         # Then, for each whitespace-separated word, we tokenize it further

@@ -381,7 +381,7 @@ def combine_ocr_dataframes(
     - The final combined and processed DataFrame.
     - A list containing the path to the saved output CSV file.
     """
-    all_data = []
+    all_data = list()
 
     for file_identifier, df_initial in input_data:
         df = df_initial.copy() # Work on a copy to avoid side effects

@@ -454,7 +454,7 @@ def combine_ocr_dataframes(
     combined_df = combined_df.copy()[existing_final_columns]
 
     # --- Save Output ---
-    output_files = []
+    output_files = list()
     if output_folder and output_filename:
         os.makedirs(output_folder, exist_ok=True)
         output_path = os.path.join(output_folder, output_filename)

@@ -490,7 +490,7 @@ def combine_ocr_output_text(
     else:
         file_paths_list = input_files
 
-    data_to_process = []
+    data_to_process = list()
     for file_path in file_paths_list:
         try:
             df = pd.read_csv(file_path)

@@ -644,7 +644,7 @@ def save_results_and_redaction_lists(
     Returns:
         list: A list of paths to all generated files.
     """
-    output_paths = []
+    output_paths = list()
     output_folder_path = Path(output_folder)
     output_folder_path.mkdir(exist_ok=True)
 
@@ -657,7 +657,6 @@ def save_results_and_redaction_lists(
     final_df.to_csv(similarity_file_output_path, index=False, encoding="utf-8-sig")
 
     output_paths.append(str(similarity_file_output_path))
-    # print(f"Main results saved to {similarity_file_output_path}")
 
     # 2. Save per-file redaction lists
     # Use 'Page2_File' as the source of duplicate content

@@ -754,7 +753,6 @@ def find_consecutive_sequence_matches(
         A DataFrame with two columns ('Page1_Index', 'Page2_Index') mapping the
         consecutive match, or an empty DataFrame if no match is found.
     """
-    # print(f"Starting sequence search for '{search_file_name}' in '{reference_file_name}'...")
 
     # Step 1: Isolate the data for each file
     search_df = df_filtered[df_filtered["file"] == search_file_name]

@@ -773,7 +771,7 @@ def find_consecutive_sequence_matches(
     reference_indices = reference_df.index.tolist()
 
     query_len = len(query_tokens)
-    all_found_matches = []
+    all_found_matches = list()
 
     print(f"Searching for a sequence of {query_len} tokens...")
 
@@ -784,7 +782,6 @@ def find_consecutive_sequence_matches(
 
         # Step 4: If the window matches the query with or without punctuation on end
         if _sequences_match(query_tokens, window):
-            # print(f"Found a consecutive match starting at reference index: {reference_indices[i]}")
 
             # Get the global indices for this entire matching block
             matching_reference_indices = reference_indices[i : i + query_len]

@@ -874,7 +871,7 @@ def identify_similar_text_sequences(
     # Use the original, simpler path for all-to-all comparisons (including intra-file).
     vectorizer = TfidfVectorizer()
     print("Standard Path: Calculating all-to-all similarity.")
-    progress(0.2, desc="Vectorizing text...")
+    progress(0.2, desc="Vectorising text...")
     tfidf_matrix = vectorizer.fit_transform(df_filtered["text_clean"])
 
     progress(0.3, desc="Calculating similarity matrix...")

@@ -897,12 +894,8 @@ def identify_similar_text_sequences(
     progress(0.7, desc="Aggregating results based on matching strategy")
 
     if greedy_match or min_consecutive_pages > 1:
-        # print("Finding all consecutive page matches of minimum length:", min_consecutive_pages)
-
         # Sort the dataframe to ensure consecutive pages are adjacent
-        similarity_df = (
-            base_similarity_df  # .sort_values(['Page1_Index', 'Page2_Index']).copy()
-        )
+        similarity_df = base_similarity_df
 
         # A new sequence starts if the difference from the previous row is not (1, 1)
         # is_consecutive will be True if a row continues the sequence, False if it's a new one.

@@ -1023,8 +1016,8 @@ def run_duplicate_analysis(
     min_consecutive: int,
     greedy_match: bool,
     combine_pages: bool = True,
-    preview_length: int = 500,
     output_folder: str = OUTPUT_FOLDER,
+    preview_length: int = 500,
     progress=gr.Progress(track_tqdm=True),
 ):
     """

@@ -1039,8 +1032,8 @@ def run_duplicate_analysis(
         min_consecutive (int): The minimum number of consecutive pages that must match for a sequence to be considered a duplicate.
         greedy_match (bool): If True, uses a greedy matching strategy for identifying consecutive sequences.
         combine_pages (bool, optional): If True, text from multiple pages is combined into larger segments for analysis. Defaults to True.
-        preview_length (int, optional): The maximum number of characters to display in the text preview panes. Defaults to 500.
         output_folder (str, optional): The directory where the similarity results and redaction lists will be saved. Defaults to OUTPUT_FOLDER.
+        preview_length (int, optional): The maximum number of characters to display in the text preview panes. Defaults to 500.
         progress (gr.Progress, optional): A Gradio progress tracker object to display progress in the UI.
     """
 
@@ -1079,8 +1072,11 @@ def run_duplicate_analysis(
         progress=progress,
     )
 
+    full_df["text"] = full_df["text"].astype(str)
+
     # Clip text to first 200 characters
     full_df["text"] = full_df["text"].str[:preview_length]
+
     # Preprocess full_data (without preview text) for fast access (run once)
     full_data_by_file = {
         file: df.sort_values("page").set_index("page")

@@ -1135,6 +1131,9 @@ def show_page_previews(
         page1_data = full_data_by_file[file1].loc[[page1], ["text"]].reset_index()
         page2_data = full_data_by_file[file2].loc[[page2], ["text"]].reset_index()
 
+        page1_data["text"] = page1_data["text"].astype(str)
+        page2_data["text"] = page2_data["text"].astype(str)
+
         page1_data["text"] = page1_data["text"].str[:preview_length]
         page2_data["text"] = page2_data["text"].str[:preview_length]
 
@@ -1217,10 +1216,10 @@ def apply_whole_page_redactions_from_list(
         new_annotations_with_bounding_boxes (List[dict], optional): A list of new annotations with bounding boxes. Defaults to an empty list.
     """
     if all_existing_annotations is None:
-        all_existing_annotations = []
+        all_existing_annotations = list()
 
     if new_annotations_with_bounding_boxes is None:
-        new_annotations_with_bounding_boxes = []
+        new_annotations_with_bounding_boxes = list()
 
     all_annotations = all_existing_annotations.copy()
 
@@ -1229,7 +1228,7 @@ def apply_whole_page_redactions_from_list(
         print(f"Warning: {message}")
         raise Warning(message)
 
-    list_whole_pages_to_redact = []
+    list_whole_pages_to_redact = list()
 
     if combine_pages is True:
         # Get list of pages to redact from either dataframe or file

@@ -1270,7 +1269,7 @@ def apply_whole_page_redactions_from_list(
         print(message)
         raise Warning(message)
 
-        list_whole_pages_to_redact = []
+        list_whole_pages_to_redact = list()
         for annotation in new_annotations_with_bounding_boxes:
             from tools.secure_regex_utils import safe_extract_page_number_from_path
 
@@ -1285,7 +1284,7 @@ def apply_whole_page_redactions_from_list(
 
     list_whole_pages_to_redact = list(set(list_whole_pages_to_redact))
 
-    new_annotations = []
+    new_annotations = list()
     # Process each page for redaction
     for page in list_whole_pages_to_redact:
         try:

@@ -1312,8 +1311,7 @@ def apply_whole_page_redactions_from_list(
             )
             continue
 
-        # --- Create a LIST of boxes to add.---
-        boxes_to_add = []
+        boxes_to_add = list()
 
         pymupdf_page = pymupdf_doc[page_index]
 
@@ -1439,7 +1437,7 @@ def create_annotation_objects_from_duplicates(
     Returns:
         List[Dict]: A list of dictionaries, where each dict represents a page and its list of annotation boxes, in the format: [{"image": "path/to/img.png", "boxes": [...]}, ...]
     """
-    final_output = []
+    final_output = list()
 
     if duplicates_df.empty:
         raise Warning("No duplicates found")

@@ -1503,7 +1501,7 @@ def create_annotation_objects_from_duplicates(
             annotations_by_page[page_number].append(box)
 
     # --- Format the final output list using the page-to-image map ---
-    final_output = []
+    final_output = list()
     # Sort by page number for a predictable order
     for page_num, boxes in sorted(annotations_by_page.items()):
         # Look up the image path using the page number
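
The "duplicate page argument mismatch" in the commit message corresponds to the reordering of preview_length and output_folder in run_duplicate_analysis: when a caller (for example a Gradio click handler) supplies its inputs positionally, each value binds to whichever parameter occupies that slot. A stripped-down illustration with hypothetical signatures; the real function takes more parameters than shown here:

    def analysis_old(files, threshold, combine_pages=True,
                     preview_length=500, output_folder="output/"):
        return preview_length, output_folder

    def analysis_new(files, threshold, combine_pages=True,
                     output_folder="output/", preview_length=500):
        return preview_length, output_folder

    # A caller passing (..., combine_pages, output_folder) positionally:
    args = (["doc_ocr.csv"], 0.9, True, "output/")
    print(analysis_old(*args))  # ('output/', 'output/') -- folder lands in preview_length
    print(analysis_new(*args))  # (500, 'output/')        -- binds as intended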
tools/redaction_review.py CHANGED
@@ -1017,8 +1017,6 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
 
     progress(1.0, desc="Completed annotation processing")
 
-    print("final_annotations_list:", final_annotations_list)
-
     return (
         final_annotations_list,
         existing_annotations_list,