seanpedrickcase committed
Commit ad8fef5 · 1 Parent(s): fcd55d0

Fixed duplicate page argument mismatch. Re-added Windows tests. Added refresh token options to CDK. Package updates.

.github/workflows/multi-os-test.yml CHANGED
@@ -15,12 +15,12 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest, macos-latest] # windows-latest removed for now as I have not been able to install tesseract on Windows using this method
+        os: [ubuntu-latest, windows-latest, macos-latest]
         python-version: ["3.11", "3.12", "3.13"]
         exclude:
           # Exclude some combinations to reduce CI time
-          #- os: windows-latest
-          #  python-version: "3.10"
+          - os: windows-latest
+            python-version: "3.11"
           - os: macos-latest
             python-version: "3.11"
 
@@ -51,8 +51,37 @@ jobs:
       - name: Install system dependencies (Windows)
         if: matrix.os == 'windows-latest'
         run: |
-          # Windows dependencies are handled by the Python packages
-          echo "Windows system dependencies handled by Python packages"
+          # Create tools directory
+          mkdir C:\tools
+
+          # Download and install Tesseract
+          $tesseractUrl = "https://github.com/UB-Mannheim/tesseract/releases/download/v5.3.3.20231005/tesseract-ocr-w64-setup-5.3.3.20231005.exe"
+          $tesseractInstaller = "C:\tools\tesseract-installer.exe"
+          Invoke-WebRequest -Uri $tesseractUrl -OutFile $tesseractInstaller
+
+          # Install Tesseract silently
+          Start-Process -FilePath $tesseractInstaller -ArgumentList "/S", "/D=C:\tools\tesseract" -Wait
+
+          # Download and extract Poppler
+          $popplerUrl = "https://github.com/oschwartz10612/poppler-windows/releases/download/v24.02.0-0/Release-24.02.0-0.zip"
+          $popplerZip = "C:\tools\poppler.zip"
+          Invoke-WebRequest -Uri $popplerUrl -OutFile $popplerZip
+
+          # Extract Poppler
+          Expand-Archive -Path $popplerZip -DestinationPath C:\tools\poppler -Force
+
+          # Add to PATH
+          echo "C:\tools\tesseract" >> $env:GITHUB_PATH
+          echo "C:\tools\poppler\poppler-24.02.0\Library\bin" >> $env:GITHUB_PATH
+
+          # Set environment variables for your application
+          echo "TESSERACT_FOLDER=C:\tools\tesseract" >> $env:GITHUB_ENV
+          echo "POPPLER_FOLDER=C:\tools\poppler\poppler-24.02.0\Library\bin" >> $env:GITHUB_ENV
+          echo "TESSERACT_DATA_FOLDER=C:\tools\tesseract\tessdata" >> $env:GITHUB_ENV
+
+          # Verify installation
+          tesseract --version
+          pdftoppm -v
 
       - name: Install Python dependencies
         run: |
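
The Windows step exports TESSERACT_FOLDER, POPPLER_FOLDER and TESSERACT_DATA_FOLDER via GITHUB_ENV, which only matters if the Python code reads them back. A minimal sketch of how a test or helper might consume those variables, assuming pytesseract and pdf2image are the consumers (the repo's actual config handling may differ, and example.pdf is a placeholder):

    import os

    import pytesseract
    from pdf2image import convert_from_path

    # Point pytesseract at the Tesseract binary installed by the CI step;
    # on Linux/macOS the variable is unset and the PATH default is used.
    tesseract_folder = os.environ.get("TESSERACT_FOLDER", "")
    if tesseract_folder:
        pytesseract.pytesseract.tesseract_cmd = os.path.join(
            tesseract_folder, "tesseract.exe"
        )

    # pdf2image only needs poppler_path when Poppler is not already on PATH.
    poppler_folder = os.environ.get("POPPLER_FOLDER", "") or None
    pages = convert_from_path("example.pdf", dpi=300, poppler_path=poppler_folder)
    print(pytesseract.image_to_string(pages[0]))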
cdk/cdk_config.py CHANGED
@@ -219,6 +219,16 @@ COGNITO_USER_POOL_DOMAIN_PREFIX = get_or_create_env_var(
     "COGNITO_USER_POOL_DOMAIN_PREFIX", "redaction-app-domain"
 ) # Should change this to something unique or you'll probably hit an error
 
+COGNITO_REFRESH_TOKEN_VALIDITY = int(
+    get_or_create_env_var("COGNITO_REFRESH_TOKEN_VALIDITY", "480")
+) # Minutes
+COGNITO_ID_TOKEN_VALIDITY = int(
+    get_or_create_env_var("COGNITO_ID_TOKEN_VALIDITY", "60")
+) # Minutes
+COGNITO_ACCESS_TOKEN_VALIDITY = int(
+    get_or_create_env_var("COGNITO_ACCESS_TOKEN_VALIDITY", "60")
+) # Minutes
+
 # Application load balancer
 ALB_NAME = get_or_create_env_var(
     "ALB_NAME", f"{CDK_PREFIX}Alb"[-32:]
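
The three new token-validity settings reuse the existing get_or_create_env_var pattern, so they can be overridden from the environment before synthesis. A rough sketch of the assumed helper behaviour and an override, for illustration only (the real helper in cdk_config.py may behave differently, e.g. by also persisting the value):

    import os

    def get_or_create_env_var(var_name: str, default_value: str) -> str:
        # Assumed behaviour: use the environment value if present,
        # otherwise fall back to (and register) the supplied default.
        value = os.environ.get(var_name, default_value)
        os.environ[var_name] = value
        return value

    # Override example: a 7-day refresh token window (values are in minutes)
    os.environ["COGNITO_REFRESH_TOKEN_VALIDITY"] = str(7 * 24 * 60)
    COGNITO_REFRESH_TOKEN_VALIDITY = int(
        get_or_create_env_var("COGNITO_REFRESH_TOKEN_VALIDITY", "480")
    )  # 10080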
cdk/cdk_stack.py CHANGED
@@ -39,7 +39,10 @@ from cdk_config import (
     CLUSTER_NAME,
     CODEBUILD_PROJECT_NAME,
     CODEBUILD_ROLE_NAME,
+    COGNITO_ACCESS_TOKEN_VALIDITY,
+    COGNITO_ID_TOKEN_VALIDITY,
     COGNITO_REDIRECTION_URL,
+    COGNITO_REFRESH_TOKEN_VALIDITY,
     COGNITO_USER_POOL_CLIENT_NAME,
     COGNITO_USER_POOL_CLIENT_SECRET_NAME,
     COGNITO_USER_POOL_DOMAIN_PREFIX,

@@ -1161,6 +1164,13 @@ class CdkStack(Stack):
                 ],
                 callback_urls=redirect_uris,
             ),
+            refresh_token_validity=Duration.minutes(
+                COGNITO_REFRESH_TOKEN_VALIDITY
+            ),
+            id_token_validity=Duration.minutes(COGNITO_ID_TOKEN_VALIDITY),
+            access_token_validity=Duration.minutes(
+                COGNITO_ACCESS_TOKEN_VALIDITY
+            ),
         )
 
         CfnOutput(
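
Cognito enforces bounds on these lifetimes (roughly 5 minutes to 1 day for access and ID tokens, and 60 minutes to 10 years for refresh tokens), so the 480/60/60-minute defaults sit well inside the allowed ranges. A self-contained sketch of the same Duration-based configuration on a user pool client; the construct IDs are illustrative, not the stack's real ones:

    from aws_cdk import Duration, Stack
    from aws_cdk import aws_cognito as cognito
    from constructs import Construct

    class ExampleAuthStack(Stack):
        def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
            super().__init__(scope, construct_id, **kwargs)

            user_pool = cognito.UserPool(self, "ExampleUserPool")

            # Token lifetimes mirror the new cdk_config.py defaults (in minutes)
            user_pool.add_client(
                "ExampleAppClient",
                refresh_token_validity=Duration.minutes(480),
                id_token_validity=Duration.minutes(60),
                access_token_validity=Duration.minutes(60),
            )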
pyproject.toml CHANGED
@@ -12,33 +12,33 @@ requires-python = ">=3.10"
 dependencies = [
     "pdfminer.six==20250506",
     "pdf2image==1.17.0",
-    "pymupdf==1.26.3",
+    "pymupdf==1.26.4",
     "opencv-python==4.12.0.88",
-    "presidio_analyzer==2.2.359",
-    "presidio_anonymizer==2.2.359",
+    "presidio_analyzer==2.2.360",
+    "presidio_anonymizer==2.2.360",
     "presidio-image-redactor==0.0.57",
-    "pikepdf==9.10.2",
-    "pandas==2.3.1",
-    "scikit-learn==1.7.1",
+    "pikepdf==9.11.0",
+    "pandas==2.3.2",
+    "scikit-learn==1.7.2",
     "spacy==3.8.7",
     # Direct URL dependency for spacy model
     "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
-    "gradio==5.46.1",
-    "boto3==1.40.31",
+    "gradio==5.47.0",
+    "boto3==1.40.37",
     "pyarrow==21.0.0",
     "openpyxl==3.1.5",
-    "Faker==37.5.3",
+    "Faker==37.8.0",
     "python-levenshtein==0.27.1",
     "spaczz==0.6.1",
     # Direct URL dependency for gradio_image_annotator wheel
     "gradio_image_annotation @ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl",
-    "rapidfuzz==3.13.0",
+    "rapidfuzz==3.14.1",
     "python-dotenv==1.0.1",
     "awslambdaric==3.1.1",
     "python-docx==1.2.0",
-    "paddlepaddle==3.1.0",
-    "paddleocr==3.1.1",
     "polars==1.33.1"
+    #"paddlepaddle==3.2.0", # Optional paddle imports - only if you want to use hybrid OCR mode with tesseract and paddleOCR
+    #"paddleocr==3.2.0"
 ]
 
 [project.urls]
requirements.txt CHANGED
@@ -1,32 +1,31 @@
 pdfminer.six==20250506
 pdf2image==1.17.0
-pymupdf==1.26.3
+pymupdf==1.26.4
 opencv-python==4.12.0.88
-presidio_analyzer==2.2.359
-presidio_anonymizer==2.2.359
+presidio_analyzer==2.2.360
+presidio_anonymizer==2.2.360
 presidio-image-redactor==0.0.57
-pikepdf==9.10.2
+pikepdf==9.11.0
 pandas==2.3.2
-scikit-learn==1.7.1
+scikit-learn==1.7.2
 spacy==3.8.7
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
-gradio==5.46.1
+gradio==5.47.0
 polars==1.33.1
-boto3==1.40.31
+boto3==1.40.37
 pyarrow==21.0.0
 openpyxl==3.1.5
-Faker==37.5.3
+Faker==37.8.0
 python-levenshtein==0.27.1
 spaczz==0.6.1
-# The following version
 https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
-rapidfuzz==3.13.0
+rapidfuzz==3.14.1
 python-dotenv==1.0.1
 awslambdaric==3.1.1
 python-docx==1.2.0
 # Optional: uncomment the below to install paddleOCR if you want to use hybrid text extraction (tesseract plus paddleocr)
-# paddlepaddle==3.1.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
-# paddleocr==3.1.1
+# paddlepaddle==3.2.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
+# paddleocr==3.2.0
 
 # Test dependencies
 pytest>=7.0.0
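
With paddlepaddle and paddleocr now optional in both pyproject.toml and requirements.txt, any code that offers the hybrid OCR mode presumably has to tolerate their absence. A minimal sketch of such an import guard, assuming the fallback is Tesseract-only extraction (the names here are illustrative, not the repo's actual modules):

    # Optional PaddleOCR support: degrade gracefully when it is not installed.
    try:
        from paddleocr import PaddleOCR
        PADDLE_AVAILABLE = True
    except ImportError:
        PaddleOCR = None
        PADDLE_AVAILABLE = False

    def make_ocr_engine(lang: str = "en"):
        """Return a PaddleOCR engine when available, otherwise None so the
        caller can fall back to Tesseract-only extraction."""
        return PaddleOCR(lang=lang) if PADDLE_AVAILABLE else None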
tools/find_duplicate_pages.py CHANGED
@@ -36,7 +36,7 @@ def split_text_with_punctuation(text: str) -> List[str]:
     # 2. OR a sequence of one or more characters that are NOT punctuation or whitespace `[^.,?!:;\s]+`
     pattern = re.compile(r"([.,?!:;]+|[^.,?!:;\s]+)")
 
-    final_list = []
+    final_list = list()
     # We first split by whitespace to handle sentences correctly
     for word in text.split():
         # Then, for each whitespace-separated word, we tokenize it further

@@ -381,7 +381,7 @@ def combine_ocr_dataframes(
     - The final combined and processed DataFrame.
     - A list containing the path to the saved output CSV file.
     """
-    all_data = []
+    all_data = list()
 
     for file_identifier, df_initial in input_data:
         df = df_initial.copy() # Work on a copy to avoid side effects

@@ -454,7 +454,7 @@ def combine_ocr_dataframes(
     combined_df = combined_df.copy()[existing_final_columns]
 
     # --- Save Output ---
-    output_files = []
+    output_files = list()
     if output_folder and output_filename:
         os.makedirs(output_folder, exist_ok=True)
         output_path = os.path.join(output_folder, output_filename)

@@ -490,7 +490,7 @@ def combine_ocr_output_text(
     else:
         file_paths_list = input_files
 
-    data_to_process = []
+    data_to_process = list()
     for file_path in file_paths_list:
         try:
             df = pd.read_csv(file_path)

@@ -644,7 +644,7 @@ def save_results_and_redaction_lists(
     Returns:
         list: A list of paths to all generated files.
     """
-    output_paths = []
+    output_paths = list()
     output_folder_path = Path(output_folder)
     output_folder_path.mkdir(exist_ok=True)
 
@@ -657,7 +657,6 @@ def save_results_and_redaction_lists(
     final_df.to_csv(similarity_file_output_path, index=False, encoding="utf-8-sig")
 
     output_paths.append(str(similarity_file_output_path))
-    # print(f"Main results saved to {similarity_file_output_path}")
 
     # 2. Save per-file redaction lists
     # Use 'Page2_File' as the source of duplicate content

@@ -754,7 +753,6 @@ def find_consecutive_sequence_matches(
         A DataFrame with two columns ('Page1_Index', 'Page2_Index') mapping the
         consecutive match, or an empty DataFrame if no match is found.
     """
-    # print(f"Starting sequence search for '{search_file_name}' in '{reference_file_name}'...")
 
     # Step 1: Isolate the data for each file
     search_df = df_filtered[df_filtered["file"] == search_file_name]

@@ -773,7 +771,7 @@ def find_consecutive_sequence_matches(
     reference_indices = reference_df.index.tolist()
 
     query_len = len(query_tokens)
-    all_found_matches = []
+    all_found_matches = list()
 
     print(f"Searching for a sequence of {query_len} tokens...")
 
@@ -784,7 +782,6 @@ def find_consecutive_sequence_matches(
 
         # Step 4: If the window matches the query with or without punctuation on end
         if _sequences_match(query_tokens, window):
-            # print(f"Found a consecutive match starting at reference index: {reference_indices[i]}")
 
             # Get the global indices for this entire matching block
             matching_reference_indices = reference_indices[i : i + query_len]

@@ -874,7 +871,7 @@ def identify_similar_text_sequences(
     # Use the original, simpler path for all-to-all comparisons (including intra-file).
     vectorizer = TfidfVectorizer()
     print("Standard Path: Calculating all-to-all similarity.")
-    progress(0.2, desc="Vectorizing text...")
+    progress(0.2, desc="Vectorising text...")
     tfidf_matrix = vectorizer.fit_transform(df_filtered["text_clean"])
 
     progress(0.3, desc="Calculating similarity matrix...")

@@ -897,12 +894,8 @@ def identify_similar_text_sequences(
     progress(0.7, desc="Aggregating results based on matching strategy")
 
     if greedy_match or min_consecutive_pages > 1:
-        # print("Finding all consecutive page matches of minimum length:", min_consecutive_pages)
-
         # Sort the dataframe to ensure consecutive pages are adjacent
-        similarity_df = (
-            base_similarity_df  # .sort_values(['Page1_Index', 'Page2_Index']).copy()
-        )
+        similarity_df = base_similarity_df
 
         # A new sequence starts if the difference from the previous row is not (1, 1)
         # is_consecutive will be True if a row continues the sequence, False if it's a new one.

@@ -1023,8 +1016,8 @@ def run_duplicate_analysis(
     min_consecutive: int,
     greedy_match: bool,
     combine_pages: bool = True,
-    preview_length: int = 500,
     output_folder: str = OUTPUT_FOLDER,
+    preview_length: int = 500,
     progress=gr.Progress(track_tqdm=True),
 ):
     """

@@ -1039,8 +1032,8 @@ def run_duplicate_analysis(
         min_consecutive (int): The minimum number of consecutive pages that must match for a sequence to be considered a duplicate.
         greedy_match (bool): If True, uses a greedy matching strategy for identifying consecutive sequences.
         combine_pages (bool, optional): If True, text from multiple pages is combined into larger segments for analysis. Defaults to True.
-        preview_length (int, optional): The maximum number of characters to display in the text preview panes. Defaults to 500.
         output_folder (str, optional): The directory where the similarity results and redaction lists will be saved. Defaults to OUTPUT_FOLDER.
+        preview_length (int, optional): The maximum number of characters to display in the text preview panes. Defaults to 500.
         progress (gr.Progress, optional): A Gradio progress tracker object to display progress in the UI.
     """
 
@@ -1079,8 +1072,11 @@ def run_duplicate_analysis(
         progress=progress,
     )
 
+    full_df["text"] = full_df["text"].astype(str)
+
     # Clip text to first 200 characters
     full_df["text"] = full_df["text"].str[:preview_length]
+
     # Preprocess full_data (without preview text) for fast access (run once)
     full_data_by_file = {
         file: df.sort_values("page").set_index("page")

@@ -1135,6 +1131,9 @@ def show_page_previews(
         page1_data = full_data_by_file[file1].loc[[page1], ["text"]].reset_index()
         page2_data = full_data_by_file[file2].loc[[page2], ["text"]].reset_index()
 
+        page1_data["text"] = page1_data["text"].astype(str)
+        page2_data["text"] = page2_data["text"].astype(str)
+
         page1_data["text"] = page1_data["text"].str[:preview_length]
         page2_data["text"] = page2_data["text"].str[:preview_length]
 
@@ -1217,10 +1216,10 @@ def apply_whole_page_redactions_from_list(
         new_annotations_with_bounding_boxes (List[dict], optional): A list of new annotations with bounding boxes. Defaults to an empty list.
     """
     if all_existing_annotations is None:
-        all_existing_annotations = []
+        all_existing_annotations = list()
 
     if new_annotations_with_bounding_boxes is None:
-        new_annotations_with_bounding_boxes = []
+        new_annotations_with_bounding_boxes = list()
 
     all_annotations = all_existing_annotations.copy()
 
@@ -1229,7 +1228,7 @@ def apply_whole_page_redactions_from_list(
         print(f"Warning: {message}")
         raise Warning(message)
 
-    list_whole_pages_to_redact = []
+    list_whole_pages_to_redact = list()
 
     if combine_pages is True:
         # Get list of pages to redact from either dataframe or file

@@ -1270,7 +1269,7 @@ def apply_whole_page_redactions_from_list(
         print(message)
         raise Warning(message)
 
-        list_whole_pages_to_redact = []
+        list_whole_pages_to_redact = list()
         for annotation in new_annotations_with_bounding_boxes:
             from tools.secure_regex_utils import safe_extract_page_number_from_path
 
@@ -1285,7 +1284,7 @@ def apply_whole_page_redactions_from_list(
 
     list_whole_pages_to_redact = list(set(list_whole_pages_to_redact))
 
-    new_annotations = []
+    new_annotations = list()
     # Process each page for redaction
     for page in list_whole_pages_to_redact:
         try:

@@ -1312,8 +1311,7 @@ def apply_whole_page_redactions_from_list(
             )
             continue
 
-        # --- Create a LIST of boxes to add.---
-        boxes_to_add = []
+        boxes_to_add = list()
 
         pymupdf_page = pymupdf_doc[page_index]
 
@@ -1439,7 +1437,7 @@ def create_annotation_objects_from_duplicates(
     Returns:
         List[Dict]: A list of dictionaries, where each dict represents a page and its list of annotation boxes, in the format: [{"image": "path/to/img.png", "boxes": [...]}, ...]
     """
-    final_output = []
+    final_output = list()
 
     if duplicates_df.empty:
         raise Warning("No duplicates found")

@@ -1503,7 +1501,7 @@ def create_annotation_objects_from_duplicates(
             annotations_by_page[page_number].append(box)
 
     # --- Format the final output list using the page-to-image map ---
-    final_output = []
+    final_output = list()
     # Sort by page number for a predictable order
     for page_num, boxes in sorted(annotations_by_page.items()):
         # Look up the image path using the page number
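
The "duplicate page argument mismatch" in the commit message corresponds to the reordering of preview_length and output_folder in run_duplicate_analysis: when a caller (for example a Gradio click handler) supplies its inputs positionally, each value binds to whichever parameter occupies that slot. A stripped-down illustration with hypothetical signatures; the real function takes more parameters than shown here:

    def analysis_old(files, threshold, combine_pages=True,
                     preview_length=500, output_folder="output/"):
        return preview_length, output_folder

    def analysis_new(files, threshold, combine_pages=True,
                     output_folder="output/", preview_length=500):
        return preview_length, output_folder

    # A caller passing (..., combine_pages, output_folder) positionally:
    args = (["doc_ocr.csv"], 0.9, True, "output/")
    print(analysis_old(*args))  # ('output/', 'output/') -- folder lands in preview_length
    print(analysis_new(*args))  # (500, 'output/')        -- binds as intended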
tools/redaction_review.py CHANGED
@@ -1017,8 +1017,6 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
 
     progress(1.0, desc="Completed annotation processing")
 
-    print("final_annotations_list:", final_annotations_list)
-
     return (
         final_annotations_list,
         existing_annotations_list,