Sean Pedrick-Case committed on
Commit 10da194 · unverified · 2 Parent(s): 95ca426 3946be6

Merge pull request #49 from seanpedrick-case/dev_new


Revamped duplicate page/subdocument removal, CDK code, updated documentation, read-only file system compatibility.

.dockerignore CHANGED
@@ -4,10 +4,9 @@
  *.jpg
  *.png
  *.ipynb
+ *.pyc
  examples/*
  processing/*
- input/*
- output/*
  tools/__pycache__/*
  old_code/*
  tesseract/*
@@ -15,9 +14,18 @@ poppler/*
  build/*
  dist/*
  build_deps/*
- logs/*
- config/*
  user_guide/*
- cdk/*
  cdk/config/*
- web/*
+ tld/*
+ cdk/config/*
+ cdk/cdk.out/*
+ cdk/archive/*
+ cdk.json
+ cdk.context.json
+ .quarto/*
+ logs/
+ output/
+ input/
+ feedback/
+ config/
+ usage/
.gitignore CHANGED
@@ -4,6 +4,7 @@
  *.jpg
  *.png
  *.ipynb
+ *.pyc
  examples/*
  processing/*
  input/*
@@ -19,6 +20,14 @@ logs/*
  config/*
  doc_redaction_amplify_app/*
  user_guide/*
- cdk/*
  cdk/config/*
- web/*
+ cdk/cdk.out/*
+ cdk/archive/*
+ tld/*
+ tmp/*
+ cdk.out/*
+ cdk.json
+ cdk.context.json
+ .quarto/*
+ /.quarto/
+ /_site/
Dockerfile CHANGED
@@ -1,14 +1,14 @@
  # Stage 1: Build dependencies and download models
  FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm AS builder
 
- # Install system dependencies. Need to specify -y for poppler to get it to install
+ # Install system dependencies
  RUN apt-get update \
  && apt-get install -y \
  g++ \
  make \
  cmake \
  unzip \
- libcurl4-openssl-dev \
+ libcurl4-openssl-dev \
  git \
  && apt-get clean \
  && rm -rf /var/lib/apt/lists/*
@@ -17,28 +17,20 @@ WORKDIR /src
 
  COPY requirements.txt .
 
- RUN pip install --no-cache-dir --target=/install -r requirements.txt
-
- RUN rm requirements.txt
+ RUN pip install --no-cache-dir --target=/install -r requirements.txt && rm requirements.txt
 
- # Add lambda_entrypoint.py to the container
+ # Add lambda entrypoint and script
  COPY lambda_entrypoint.py .
-
  COPY entrypoint.sh .
 
  # Stage 2: Final runtime image
  FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm
 
- # Define a build argument with a default value
+ # Set build-time and runtime environment variable
  ARG APP_MODE=gradio
-
- # Echo the APP_MODE during the build to confirm its value
- RUN echo "APP_MODE is set to: ${APP_MODE}"
-
- # Set APP_MODE as an environment variable for runtime
  ENV APP_MODE=${APP_MODE}
 
- # Install system dependencies
+ # Install runtime dependencies
  RUN apt-get update \
  && apt-get install -y \
  tesseract-ocr \
@@ -48,30 +40,85 @@ RUN apt-get update \
  && apt-get clean \
  && rm -rf /var/lib/apt/lists/*
 
- # Set up a new user named "user" with user ID 1000
+ # Create non-root user
  RUN useradd -m -u 1000 user
+ ENV APP_HOME=/home/user
 
- # Create required directories
- RUN mkdir -p /home/user/app/{output,input,tld,logs,usage,feedback,config} \
- && chown -R user:user /home/user/app
+ # Set env variables for Gradio & other apps
+ ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp/ \
+ TLDEXTRACT_CACHE=/tmp/tld/ \
+ MPLCONFIGDIR=/tmp/matplotlib_cache/ \
+ GRADIO_OUTPUT_FOLDER=$APP_HOME/app/output/ \
+ GRADIO_INPUT_FOLDER=$APP_HOME/app/input/ \
+ FEEDBACK_LOGS_FOLDER=$APP_HOME/app/feedback/ \
+ ACCESS_LOGS_FOLDER=$APP_HOME/app/logs/ \
+ USAGE_LOGS_FOLDER=$APP_HOME/app/usage/ \
+ CONFIG_FOLDER=$APP_HOME/app/config/ \
+ XDG_CACHE_HOME=/tmp/xdg_cache/user_1000
+
+ # Create the base application directory and set its ownership
+ RUN mkdir -p ${APP_HOME}/app && chown user:user ${APP_HOME}/app
+
+ # Create required sub-folders within the app directory and set their permissions
+ # This ensures these specific directories are owned by 'user'
+ RUN mkdir -p \
+ ${APP_HOME}/app/output \
+ ${APP_HOME}/app/input \
+ ${APP_HOME}/app/logs \
+ ${APP_HOME}/app/usage \
+ ${APP_HOME}/app/feedback \
+ ${APP_HOME}/app/config \
+ && chown user:user \
+ ${APP_HOME}/app/output \
+ ${APP_HOME}/app/input \
+ ${APP_HOME}/app/logs \
+ ${APP_HOME}/app/usage \
+ ${APP_HOME}/app/feedback \
+ ${APP_HOME}/app/config \
+ && chmod 755 \
+ ${APP_HOME}/app/output \
+ ${APP_HOME}/app/input \
+ ${APP_HOME}/app/logs \
+ ${APP_HOME}/app/usage \
+ ${APP_HOME}/app/feedback \
+ ${APP_HOME}/app/config
+
+ # Now handle the /tmp and /var/tmp directories and their subdirectories
+ RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_CACHE_HOME} \
+ && chown user:user /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache ${XDG_CACHE_HOME} \
+ && chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache \
+ && chmod 700 ${XDG_CACHE_HOME}
 
  # Copy installed packages from builder stage
  COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
 
- # Download NLTK data packages - now no longer necessary
- # RUN python -m nltk.downloader --quiet punkt stopwords punkt_tab
+ # Copy app code and entrypoint with correct ownership
+ COPY --chown=user . $APP_HOME/app
 
- # Entrypoint helps to switch between Gradio and Lambda mode
+ # Copy and chmod entrypoint
  COPY entrypoint.sh /entrypoint.sh
-
  RUN chmod +x /entrypoint.sh
 
- # Switch to the "user" user
+ # Switch to user
  USER user
 
- ENV APP_HOME=/home/user
+ # Declare working directory
+ WORKDIR $APP_HOME/app
 
- # Set environmental variables
+ # Declare volumes (NOTE: runtime mounts will override permissions — handle with care)
+ VOLUME ["/tmp/matplotlib_cache"]
+ VOLUME ["/tmp/gradio_tmp"]
+ VOLUME ["/tmp/tld"]
+ VOLUME ["/home/user/app/output"]
+ VOLUME ["/home/user/app/input"]
+ VOLUME ["/home/user/app/logs"]
+ VOLUME ["/home/user/app/usage"]
+ VOLUME ["/home/user/app/feedback"]
+ VOLUME ["/home/user/app/config"]
+ VOLUME ["/tmp"]
+ VOLUME ["/var/tmp"]
+
+ # Set runtime environment
  ENV PATH=$APP_HOME/.local/bin:$PATH \
  PYTHONPATH=$APP_HOME/app \
  PYTHONUNBUFFERED=1 \
@@ -80,20 +127,8 @@ ENV PATH=$APP_HOME/.local/bin:$PATH \
  GRADIO_NUM_PORTS=1 \
  GRADIO_SERVER_NAME=0.0.0.0 \
  GRADIO_SERVER_PORT=7860 \
- GRADIO_ANALYTICS_ENABLED=False \
- TLDEXTRACT_CACHE=$APP_HOME/app/tld/.tld_set_snapshot \
- SYSTEM=spaces
-
- # Set the working directory to the user's home directory
- WORKDIR $APP_HOME/app
-
- # Copy the app code to the container
- COPY --chown=user . $APP_HOME/app
-
- # Ensure permissions are really user:user again after copying
- RUN chown -R user:user $APP_HOME/app && chmod -R u+rwX $APP_HOME/app
+ GRADIO_ANALYTICS_ENABLED=False
 
- ENTRYPOINT [ "/entrypoint.sh" ]
+ ENTRYPOINT ["/entrypoint.sh"]
 
- # Default command for Lambda mode
- CMD [ "lambda_entrypoint.lambda_handler" ]
+ CMD ["lambda_entrypoint.lambda_handler"]
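For orientation, the following is a minimal sketch of how a Python configuration module might pick up the folder variables defined in the ENV block above. It is illustrative only, not code from the repository, and the default paths shown are assumptions; the names mirror the ENV block and the imports in app.py (e.g. OUTPUT_FOLDER, ACCESS_LOGS_FOLDER).

```python
import os

def _folder_from_env(name: str, default: str) -> str:
    """Read a folder path from the environment and make sure it exists."""
    path = os.environ.get(name, default)
    os.makedirs(path, exist_ok=True)
    return path

# Assumed defaults; the real values live in tools/config.py and may differ.
OUTPUT_FOLDER = _folder_from_env("GRADIO_OUTPUT_FOLDER", "output/")
INPUT_FOLDER = _folder_from_env("GRADIO_INPUT_FOLDER", "input/")
ACCESS_LOGS_FOLDER = _folder_from_env("ACCESS_LOGS_FOLDER", "logs/")
USAGE_LOGS_FOLDER = _folder_from_env("USAGE_LOGS_FOLDER", "usage/")
FEEDBACK_LOGS_FOLDER = _folder_from_env("FEEDBACK_LOGS_FOLDER", "feedback/")
```

The /tmp cache locations set above (TLDEXTRACT_CACHE, MPLCONFIGDIR, GRADIO_TEMP_DIR) relate to the read-only file system compatibility mentioned in the commit message: on such deployments only explicitly mounted volumes and temporary paths remain writable.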
 
README.md CHANGED
@@ -5,16 +5,16 @@ colorFrom: blue
  colorTo: yellow
  sdk: docker
  app_file: app.py
- pinned: false
+ pinned: true
  license: agpl-3.0
  ---
  # Document redaction
 
- version: 0.6.8
+ version: 0.7.0
 
  Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
 
- To identify text in documents, the 'local' text/OCR image analysis uses spacy/tesseract, and works ok for documents with typed text. If available, choose 'AWS Textract service' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
+ To identify text in documents, the 'local' text/OCR image analysis uses spacy/tesseract, and works quite well for documents with typed text. If available, choose 'AWS Textract service' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
 
  After redaction, review suggested redactions on the 'Review redactions' tab. The original pdf can be uploaded here alongside a '...redaction_file.csv' to continue a previous redaction/review task. See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or custom terms to always include/ exclude from redaction.
 
@@ -181,6 +181,8 @@ If the table is empty, you can add a new entry, you can add a new row by clickin
 
  ![Manually modify allow or deny list filled](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/allow_list/manually_modify_filled.PNG)
 
+ **Note:** As of version 0.7.0 you can now apply your whole page redaction list directly to the document file currently under review by clicking the 'Apply whole page redaction list to document currently under review' button that appears here.
+
  ### Redacting additional types of personal information
 
  You may want to redact additional types of information beyond the defaults, or you may not be interested in default suggested entity types. There are dates in the example complaint letter. Say we wanted to redact those dates also?
@@ -390,21 +392,49 @@ You can find this option at the bottom of the 'Redaction Settings' tab. Upload m
 
  The files for this section are stored [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/duplicate_page_find_in_app/).
 
- Some redaction tasks involve removing duplicate pages of text that may exist across multiple documents. This feature calculates the similarity of text in all pages of input PDFs, calculates a similarity score, and then flags pages above a certain similarity score (90%) for removal by creating a 'whole page' redaction list file for each input PDF.
-
- ![Example duplicate page outputs](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_output_interface.PNG)
-
- The similarity calculation is based on using the 'ocr_outputs.csv' file that is output every time that you perform a redaction task. From the file folder, upload the four 'ocr_output.csv' files provided in the example folder into the file area. Click 'Identify duplicate pages' and you will see a number of files returned. In case you want to see the original PDFs, they are available [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/duplicate_page_find_in_app/input_pdfs/).
-
- ![Identify duplicate pages interface](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_input_interface.PNG)
-
- First, there is a 'combined_ocr_result...' file that just merges together all the text from the input files. 'page_similarity_results.csv' shows a breakdown of the pages from each file that are most similar to each other above the threshold (90% similarity). You can compare the text in the two columns 'Page_1_Text' and 'Page_2_Text'.
+ Some redaction tasks involve removing duplicate pages of text that may exist in single or multiple documents. This feature helps you find and remove that duplicate content, identifying everything from single identical pages to multi-page sections (subdocuments). The process involves three main steps: configuring the analysis, reviewing the results in the interactive interface, and then using the generated files to perform the redactions.
+
+ ![Example duplicate page inputs](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_input_interface_new.PNG)
+
+ **Step 1: Upload and Configure the Analysis**
+ First, navigate to the "Identify duplicate pages" tab. Upload all the ocr_output.csv files you wish to compare into the file area. These files are generated every time you run a redaction task and contain the text for each page of a document.
+
+ For our example, you can upload the four 'ocr_output.csv' files provided in the example folder into the file area. Click 'Identify duplicate pages' and you will see a number of files returned. In case you want to see the original PDFs, they are available [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/duplicate_page_find_in_app/input_pdfs/).
+
+ The default options will search for matching subdocuments of any length. Before running the analysis, you can configure these matching parameters to tell the tool what you're looking for:
+
+ ![Duplicate matching parameters](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_matching_parameters.PNG)
 
+ *Matching Parameters*
+ - **Similarity Threshold:** A score from 0 to 1. Pages or sequences of pages with a calculated text similarity above this value will be considered a match. The default of 0.9 (90%) is a good starting point for finding near-identical pages.
+ - **Min Word Count:** Pages with fewer words than this value will be completely ignored during the comparison. This is extremely useful for filtering out blank pages, title pages, or boilerplate pages that might otherwise create noise in the results. The default is 10.
+ - **Choosing a Matching Strategy:** You have three main options to find duplicate content.
+ - *'Subdocument' matching (default):* Use this to find the longest possible sequence of matching pages. The tool will find an initial match and then automatically expand it forward page-by-page until the consecutive match breaks. This is the best method for identifying complete copied chapters or sections of unknown length. It is enabled by default by ticking the "Enable 'subdocument' matching" box, and it overrides the settings described below.
+ - *Minimum length subdocument matching:* Use this to find sequences of consecutively matching pages with a minimum page length. For example, setting the slider to 3 will only return sections that are at least 3 pages long. To enable this, untick the "Enable 'subdocument' matching" box and set the "Minimum consecutive pages" slider to a value greater than 1.
+ - *Single Page Matching:* Use this to find all individual page pairs that are similar to each other. Leave the "Enable 'subdocument' matching" box unchecked and keep the "Minimum consecutive pages" slider at 1.
 
+ Once your parameters are set, click the "Identify duplicate pages/subdocuments" button.
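To make the parameters above concrete, here is a small, self-contained sketch of scoring page similarity with a threshold and a minimum word count. It is illustrative only; the app's own logic lives in tools/find_duplicate_pages.py and will differ.

```python
# Illustrative sketch of the comparison described above, not the app's implementation.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def similar_page_pairs(pages_a, pages_b, threshold=0.9, min_word_count=10):
    """Return (page_index_a, page_index_b, score) for pairs scoring above the threshold."""
    keep_a = [i for i, t in enumerate(pages_a) if len(t.split()) >= min_word_count]
    keep_b = [j for j, t in enumerate(pages_b) if len(t.split()) >= min_word_count]
    if not keep_a or not keep_b:
        return []
    vec = TfidfVectorizer().fit([pages_a[i] for i in keep_a] + [pages_b[j] for j in keep_b])
    scores = cosine_similarity(vec.transform([pages_a[i] for i in keep_a]),
                               vec.transform([pages_b[j] for j in keep_b]))
    return [(keep_a[r], keep_b[c], float(scores[r, c]))
            for r in range(scores.shape[0]) for c in range(scores.shape[1])
            if scores[r, c] >= threshold]
```

Subdocument matching would then take each pair found this way and keep extending it while the following page of one file also matches the following page of the other, as described in the 'Subdocument' matching bullet above.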
 
+ **Step 2: Review Results in the Interface**
+ After the analysis is complete, the results will be displayed directly in the interface.
+
+ *Analysis Summary:* A table will appear showing a summary of all the matches found. The columns will change depending on the matching strategy you chose. For subdocument matches, it will show the start and end pages of the matched sequence.
+
+ *Interactive Preview:* This is the most important part of the review process. Click on any row in the summary table. The full text of the matching page(s) will appear side-by-side in the "Full Text Preview" section below, allowing you to instantly verify the accuracy of the match.
+
+ ![Duplicate review interface](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_output_review_overview.PNG)
+
+ **Step 3: Download and Use the Output Files**
+ The analysis also generates a set of downloadable files for your records and for performing redactions.
+
+ - page_similarity_results.csv: This is a detailed report of the analysis you just ran. It shows a breakdown of the pages from each file that are most similar to each other above the similarity threshold. You can compare the text in the two columns 'Page_1_Text' and 'Page_2_Text'. For single-page matches, it will list each pair of matching pages. For subdocument matches, it will list the start and end pages of each matched sequence, along with the total length of the match.
 
  ![Page similarity file example](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/page_similarity_example.PNG)
 
- The remaining output files are suffixed with '_whole_page.csv'. These are the same files that can be used to redact whole pages as described in the ['Full page redaction list example' section](#full-page-redaction-list-example). For each PDF involved in the duplicate detection process, you can upload the relevant '_whole_page.csv' file into the relevant area, then do a new redaction task for the PDF file without any entity types selected. This way, only the suggested whole pages will be suggested for redaction and nothing else.
+ - [Original_Filename]_pages_to_redact.csv: For each input document that was found to contain duplicate content, a separate redaction list is created. This is a simple, one-column CSV file containing a list of all page numbers that should be removed. To use these files, you can upload the original document (i.e. the PDF) on the 'Review redactions' tab and then click the 'Apply relevant duplicate page output to document currently under review' button; the suggested whole-page redactions should then appear on that tab. Alternatively, you can reupload the file into the whole page redaction section as described in the ['Full page redaction list example' section](#full-page-redaction-list-example).
 
- ![Example duplicate page redaction list](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/output_file_2_whole_page_outputs.PNG)
+ ![Example duplicate page redaction list](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_output_interface_new.PNG)
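As a quick way to inspect one of these redaction lists, the one-column file can be read back with pandas. This is a hypothetical snippet: the real filename follows the input document's name, and the column header may differ.

```python
import pandas as pd

# Hypothetical filename; in practice the file is named after the original document.
pages_to_redact = pd.read_csv("my_document_pages_to_redact.csv")
print(pages_to_redact.iloc[:, 0].tolist())  # e.g. [4, 5, 6, 12] - pages flagged for whole-page redaction
```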
 
  If you want to combine the results from this redaction process with previous redaction tasks for the same PDF, you could merge review file outputs following the steps described in [Merging existing redaction review files](#merging-existing-redaction-review-files) above.
 
@@ -505,6 +535,8 @@ Again, a lot can potentially go wrong with AWS solutions that are insecure, so b
 
  ## Modifying existing redaction review files
 
+ *Note:* As of version 0.7.0 you can now modify redaction review files directly in the app on the 'Review redactions' tab. Open the accordion 'View and edit review data' under the file input area. You can edit review file data cells here - press Enter to apply changes. You should see the effect on the current page if you click the 'Save changes on current page to file' button to the right.
+
  You can find the folder containing the files discussed in this section [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/merge_review_files/).
 
  As well as serving as inputs to the document redaction app's review function, the 'review_file.csv' output can be modified outside of the app, and also merged with others from multiple redaction attempts on the same file. This gives you the flexibility to change redaction details outside of the app.
_quarto.yml ADDED
@@ -0,0 +1,28 @@
+ project:
+   type: website
+   output-dir: docs # Common for GitHub Pages
+   render:
+     - "*.qmd"
+
+ website:
+   title: "Document Redaction App"
+   page-navigation: true # Often enabled for floating TOC to highlight current section
+   back-to-top-navigation: true
+   search: true
+   navbar:
+     left:
+       - href: index.qmd
+         text: Home
+       - href: src/user_guide.qmd
+         text: User guide
+       - href: src/faq.qmd
+         text: User FAQ
+       - href: src/installation_guide.qmd
+         text: App installation guide (with CDK)
+       - href: src/app_settings.qmd
+         text: App settings management guide
+
+ format:
+   html:
+     theme: cosmo
+     css: styles.css
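With this configuration, running `quarto render` from the project root should write the rendered site into the docs/ folder set by output-dir above, which is a layout commonly used for publishing through GitHub Pages.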
app.py CHANGED
@@ -3,37 +3,39 @@ import pandas as pd
3
  import gradio as gr
4
  from gradio_image_annotation import image_annotator
5
  from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS
6
- from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
9
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
10
- from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback_dataframe_row, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api, get_all_rows_with_same_text
11
  from tools.data_anonymise import anonymise_data_files
12
  from tools.auth import authenticate_user
13
  from tools.load_spacy_model_custom_recognisers import custom_entities
14
  from tools.custom_csvlogger import CSVLogger_custom
15
- from tools.find_duplicate_pages import identify_similar_pages
16
  from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
17
 
18
  # Suppress downcasting warnings
19
  pd.set_option('future.no_silent_downcasting', True)
20
 
21
  # Convert string environment variables to string or list
22
- SAVE_LOGS_TO_CSV = eval(SAVE_LOGS_TO_CSV)
23
- SAVE_LOGS_TO_DYNAMODB = eval(SAVE_LOGS_TO_DYNAMODB)
 
 
24
 
25
- if CSV_ACCESS_LOG_HEADERS: CSV_ACCESS_LOG_HEADERS = eval(CSV_ACCESS_LOG_HEADERS)
26
- if CSV_FEEDBACK_LOG_HEADERS: CSV_FEEDBACK_LOG_HEADERS = eval(CSV_FEEDBACK_LOG_HEADERS)
27
- if CSV_USAGE_LOG_HEADERS: CSV_USAGE_LOG_HEADERS = eval(CSV_USAGE_LOG_HEADERS)
28
 
29
- if DYNAMODB_ACCESS_LOG_HEADERS: DYNAMODB_ACCESS_LOG_HEADERS = eval(DYNAMODB_ACCESS_LOG_HEADERS)
30
- if DYNAMODB_FEEDBACK_LOG_HEADERS: DYNAMODB_FEEDBACK_LOG_HEADERS = eval(DYNAMODB_FEEDBACK_LOG_HEADERS)
31
- if DYNAMODB_USAGE_LOG_HEADERS: DYNAMODB_USAGE_LOG_HEADERS = eval(DYNAMODB_USAGE_LOG_HEADERS)
32
 
33
- if CHOSEN_COMPREHEND_ENTITIES: CHOSEN_COMPREHEND_ENTITIES = eval(CHOSEN_COMPREHEND_ENTITIES)
34
- if FULL_COMPREHEND_ENTITY_LIST: FULL_COMPREHEND_ENTITY_LIST = eval(FULL_COMPREHEND_ENTITY_LIST)
35
- if CHOSEN_REDACT_ENTITIES: CHOSEN_REDACT_ENTITIES = eval(CHOSEN_REDACT_ENTITIES)
36
- if FULL_ENTITY_LIST: FULL_ENTITY_LIST = eval(FULL_ENTITY_LIST)
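The removed block above turned list-like settings loaded from config strings into Python objects with eval(). For illustration only (this is not necessarily what the commit replaces it with), the standard-library ast.literal_eval handles the same job without executing arbitrary code:

```python
import ast

def parse_list_setting(value, default=None):
    """Parse a config string such as "['PERSON', 'EMAIL_ADDRESS']" into a Python list."""
    if isinstance(value, list):
        return value          # already parsed
    if not value:
        return default or []  # unset or empty setting
    return ast.literal_eval(value)  # fails loudly on anything that is not a plain literal

print(parse_list_setting("['PERSON', 'EMAIL_ADDRESS']"))  # example input, not real app config
```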
37
 
38
  # Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
39
  CHOSEN_COMPREHEND_ENTITIES.extend(custom_entities)
@@ -42,7 +44,7 @@ FULL_COMPREHEND_ENTITY_LIST.extend(custom_entities)
42
  FILE_INPUT_HEIGHT = int(FILE_INPUT_HEIGHT)
43
 
44
  # Create the gradio interface
45
- app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)
46
 
47
  with app:
48
 
@@ -55,7 +57,7 @@ with app:
55
  all_image_annotations_state = gr.State([])
56
 
57
  all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
58
- review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas", wrap=True)
59
 
60
  all_page_line_level_ocr_results = gr.State([])
61
  all_page_line_level_ocr_results_with_children = gr.State([])
@@ -186,6 +188,9 @@ with app:
186
  # Duplicate page detection
187
  in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
188
  duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas", wrap=True)
189
 
190
  # Tracking variables for current page (not visible)
191
  current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
@@ -230,7 +235,7 @@ with app:
230
 
231
  Redact personally identifiable information (PII) from documents (PDF, images), open text, or tabular data (XLSX/CSV/Parquet). Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use the app. Below is a very brief overview.
232
 
233
- To identify text in documents, the 'Local' text/OCR image analysis uses spacy/tesseract, and works ok for documents with typed text. If available, choose 'AWS Textract' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
234
 
235
  After redaction, review suggested redactions on the 'Review redactions' tab. The original pdf can be uploaded here alongside a '...review_file.csv' to continue a previous redaction/review task. See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or custom terms to always include/ exclude from redaction.
236
 
@@ -259,9 +264,9 @@ with app:
259
  local_ocr_output_found_checkbox = gr.Checkbox(value= False, label="Existing local OCR output file found", interactive=False, visible=True)
260
  with gr.Column(scale=4):
261
  with gr.Row(equal_height=True):
262
- total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True)
263
- estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost (£)", value=0.00, precision=2, visible=True)
264
- estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2)
265
 
266
  if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
267
  with gr.Accordion("Assign task to cost code", open = True, visible=True):
@@ -317,7 +322,10 @@ with app:
317
  annotate_zoom_in = gr.Button("Zoom in", visible=False)
318
  annotate_zoom_out = gr.Button("Zoom out", visible=False)
319
  with gr.Row():
320
- clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
321
 
322
  with gr.Row():
323
  with gr.Column(scale=2):
@@ -376,7 +384,8 @@ with app:
376
 
377
  with gr.Accordion("Search all extracted text", open=True):
378
  all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
379
- reset_all_ocr_results_btn = gr.Button(value="Reset OCR output table filter")
 
380
 
381
  with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
382
  convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
@@ -387,13 +396,67 @@ with app:
387
  # IDENTIFY DUPLICATE PAGES TAB
388
  ###
389
  with gr.Tab(label="Identify duplicate pages"):
390
- with gr.Accordion("Identify duplicate pages to redact", open = True):
391
- in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=FILE_INPUT_HEIGHT, file_types=['.csv'])
392
  with gr.Row():
393
- duplicate_threshold_value = gr.Number(value=0.9, label="Minimum similarity to be considered a duplicate (maximum = 1)", scale =1)
394
- find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary", scale = 4)
 
396
- duplicate_pages_out = gr.File(label="Duplicate pages analysis output", file_count="multiple", height=FILE_INPUT_HEIGHT, file_types=['.csv'])
 
398
  ###
399
  # TEXT / TABULAR DATA TAB
@@ -448,6 +511,13 @@ with app:
448
  in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["allow_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Allow list", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, wrap=True)
449
  in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["deny_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Deny list", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, wrap=True)
450
  in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["fully_redacted_pages_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Fully redacted pages", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, datatype='number', wrap=True)
451
 
452
  with gr.Accordion("Select entity types to redact", open = True):
453
  in_redact_entities = gr.Dropdown(value=CHOSEN_REDACT_ENTITIES, choices=FULL_ENTITY_LIST, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
@@ -517,24 +587,24 @@ with app:
517
  cost_code_choice_drop.select(update_cost_code_dataframe_from_dropdown_select, inputs=[cost_code_choice_drop, cost_code_dataframe_base], outputs=[cost_code_dataframe])
518
 
519
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
520
- success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
521
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
522
  success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox])
523
 
524
  # Run redaction function
525
  document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
526
  success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
527
- success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
528
- outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children], api_name="redact_doc")
529
 
530
  # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
531
- current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
532
- outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children])
533
 
534
  # If a file has been completed, the function will continue onto the next document
535
- latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
536
- outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children]).\
537
- success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
538
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
539
  success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox]).\
540
  success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title]).\
@@ -556,62 +626,67 @@ with app:
556
  textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
557
 
558
  convert_textract_outputs_to_ocr_results.click(replace_existing_pdf_input_for_whole_document_outputs, inputs = [s3_whole_document_textract_input_subfolder, doc_file_name_no_extension_textbox, output_folder_textbox, s3_whole_document_textract_default_bucket, in_doc_files, input_folder_textbox], outputs = [in_doc_files, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
559
- success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
560
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
561
  success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox]).\
562
  success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
563
  success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
564
- success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
565
- outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children]).\
566
- success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
567
 
568
  ###
569
  # REVIEW PDF REDACTIONS
570
  ###
 
571
 
572
  # Upload previous files for modifying redactions
573
  upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
574
  success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
575
- success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox], api_name="prepare_doc").\
576
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
577
 
578
  # Page number controls
579
  annotate_current_page.submit(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
580
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
581
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
582
 
583
  annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
584
  success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
585
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
586
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
587
 
588
  annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
589
  success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
590
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
591
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
592
 
593
  annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
594
  success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
595
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
596
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
597
 
598
  annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
599
  success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
600
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
601
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
602
 
603
  annotate_current_page_bottom.submit(update_other_annotator_number_from_current, inputs=[annotate_current_page_bottom], outputs=[annotate_current_page]).\
604
  success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
605
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
606
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
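The repeated page-navigation handlers above all follow the same Gradio pattern: an event listener updates the page number, and chained `.success(...)` steps then persist the current page's annotations, rebuild the annotator view, and re-apply redactions, with each step running only if the previous one finished without error. A minimal, self-contained sketch of that chaining pattern (component and function names here are illustrative placeholders, not the app's real objects) looks like this:

```python
# Minimal sketch of the .click(...).success(...) chaining used by the page controls above.
# Component and function names are placeholders, not the app's real objects.
import gradio as gr

def decrease_page(page):
    # Never go below page 1.
    return max(1, page - 1)

def save_page_annotations(page, annotations):
    # Placeholder for persisting the current page's annotation objects into state.
    return annotations + [f"page {page} saved"]

def redraw_annotator(page, annotations):
    # Placeholder for rebuilding the annotator view for the newly selected page.
    return f"Showing page {page} ({len(annotations)} saved snapshots)"

with gr.Blocks() as demo:
    annotations_state = gr.State([])
    page_number = gr.Number(value=1, precision=0, label="Page")
    previous_page_btn = gr.Button("Previous page")
    annotator_placeholder = gr.Textbox(label="Annotator")

    # Each .success() step only runs if the previous step finished without an error,
    # mirroring the app's "change page -> save state -> redraw -> re-apply" sequences.
    previous_page_btn.click(decrease_page, inputs=page_number, outputs=page_number).\
        success(save_page_annotations, inputs=[page_number, annotations_state], outputs=annotations_state).\
        success(redraw_annotator, inputs=[page_number, annotations_state], outputs=annotator_placeholder)

if __name__ == "__main__":
    demo.launch()
```

Chaining with `.success()` rather than `.then()` stops the sequence as soon as one step raises, so a failed save does not go on to overwrite the annotator view.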
607
 
608
  # Apply page redactions
609
- annotation_button_apply.click(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state], scroll_to_output=True)
610
 
611
  # Save current page redactions
612
  update_current_page_redactions_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
613
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
614
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
615
 
616
  # Review table controls
617
  recogniser_entity_dropdown.select(update_entities_df_recogniser_entities, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, text_entity_dropdown, page_entity_dropdown])
@@ -620,50 +695,52 @@ with app:
620
 
621
  # Clicking on a cell in the recogniser entity dataframe will take you to that page, and also highlight the target redaction box in blue
622
  recogniser_entity_dataframe.select(df_select_callback_dataframe_row, inputs=[recogniser_entity_dataframe], outputs=[selected_entity_dataframe_row, selected_entity_dataframe_row_text]).\
623
- success(update_selected_review_df_row_colour, inputs=[selected_entity_dataframe_row, review_file_state, selected_entity_id, selected_entity_colour], outputs=[review_file_state, selected_entity_id, selected_entity_colour]).\
624
- success(update_annotator_page_from_review_df, inputs=[review_file_state, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_state, annotate_previous_page])
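The dataframe row-selection wiring above relies on Gradio passing a `gr.SelectData` event to the callback, from which the clicked row (and therefore the target page and redaction box) can be recovered. A hedged, standalone sketch of that mechanism, using simplified column names rather than the app's full review-file schema, follows:

```python
# Sketch of a gr.Dataframe .select callback that recovers the clicked row.
# Column names and the downstream jump-to-page behaviour are simplified assumptions.
import gradio as gr
import pandas as pd

def on_row_select(df: pd.DataFrame, evt: gr.SelectData):
    # For a Dataframe selection, evt.index is (row_index, column_index).
    row_idx = evt.index[0]
    selected_row = df.iloc[[row_idx]]
    page_to_show = int(selected_row["page"].iloc[0])
    return selected_row, page_to_show

with gr.Blocks() as demo:
    entity_df = gr.Dataframe(
        value=pd.DataFrame({"page": [1, 2, 2], "label": ["PERSON", "EMAIL", "PHONE"]}),
        interactive=False,
    )
    selected_row_state = gr.Dataframe(visible=False)
    current_page = gr.Number(value=1, precision=0, label="Page")

    # Clicking a cell fires .select; the callback receives the dataframe value plus the event data.
    entity_df.select(on_row_select, inputs=[entity_df], outputs=[selected_row_state, current_page])

if __name__ == "__main__":
    demo.launch()
```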
 
625
 
626
  reset_dropdowns_btn.click(reset_dropdowns, inputs=[recogniser_entity_dataframe_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
627
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
628
 
629
  # Exclude current selection from annotator and outputs
630
  # Exclude only selected row
631
- exclude_selected_row_btn.click(exclude_selected_items_from_redaction, inputs=[review_file_state, selected_entity_dataframe_row, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
632
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
633
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state]).\
634
  success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
635
 
636
  # Exclude all items with same text as selected row
637
  exclude_text_with_same_as_selected_row_btn.click(get_all_rows_with_same_text, inputs=[recogniser_entity_dataframe_base, selected_entity_dataframe_row_text], outputs=[recogniser_entity_dataframe_same_text]).\
638
- success(exclude_selected_items_from_redaction, inputs=[review_file_state, recogniser_entity_dataframe_same_text, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
639
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
640
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state]).\
641
  success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
642
 
643
  # Exclude everything visible in table
644
- exclude_selected_btn.click(exclude_selected_items_from_redaction, inputs=[review_file_state, recogniser_entity_dataframe, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
645
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
646
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state]).\
647
  success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
648
 
649
-
650
-
651
- undo_last_removal_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base]).\
652
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
653
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
654
 
655
  # Review OCR text button
656
- all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_entity_dataframe_row])
657
  reset_all_ocr_results_btn.click(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
658
 
659
  # Convert review file to xfdf Adobe format
660
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
661
- success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, local_ocr_output_found_checkbox]).\
662
  success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
663
 
664
  # Convert xfdf Adobe file back to review_file.csv
665
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
666
- success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, local_ocr_output_found_checkbox]).\
667
  success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
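`convert_df_to_xfdf` and `convert_xfdf_to_dataframe` round-trip review data to and from Adobe's XFDF annotation format so that suggested redactions can be checked in Acrobat. The exact elements the app writes are defined in `tools.redaction_review`; purely as an illustration of the kind of output such a conversion might produce, a minimal export of review rows as XFDF square annotations could look like the sketch below (element names, attributes, and coordinate handling here are assumptions, not the app's actual output):

```python
# Hedged sketch: writing redaction boxes as XFDF square annotations.
# Coordinate handling, element names and attributes are simplified assumptions.
import xml.etree.ElementTree as ET
import pandas as pd

XFDF_NS = "http://ns.adobe.com/xfdf/"

def review_df_to_xfdf(review_df: pd.DataFrame, out_path: str) -> None:
    ET.register_namespace("", XFDF_NS)
    root = ET.Element(f"{{{XFDF_NS}}}xfdf")
    annots = ET.SubElement(root, f"{{{XFDF_NS}}}annots")
    for _, row in review_df.iterrows():
        square = ET.SubElement(annots, f"{{{XFDF_NS}}}square")
        # XFDF pages are zero-indexed; rect is "x1,y1,x2,y2" in PDF points.
        square.set("page", str(int(row["page"]) - 1))
        square.set("rect", f'{row["xmin"]},{row["ymin"]},{row["xmax"]},{row["ymax"]}')
        square.set("color", "#000000")
        contents = ET.SubElement(square, f"{{{XFDF_NS}}}contents")
        contents.text = str(row["label"])
    ET.ElementTree(root).write(out_path, encoding="utf-8", xml_declaration=True)

# Example usage with the review-file columns used elsewhere in the app:
df = pd.DataFrame([{"page": 1, "label": "PERSON", "xmin": 72, "ymin": 700, "xmax": 200, "ymax": 715}])
review_df_to_xfdf(df, "example.xfdf")
```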
668
 
669
  ###
@@ -676,7 +753,7 @@ with app:
676
  success(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number], api_name="redact_data").\
677
  success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
678
 
679
- # Currently only supports redacting one data file at a time
680
  # If the output file count text box changes, keep going with redacting each data file until done
681
  # text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number]).\
682
  # success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
@@ -684,7 +761,42 @@ with app:
684
  ###
685
  # IDENTIFY DUPLICATE PAGES
686
  ###
687
- find_duplicate_pages_btn.click(fn=identify_similar_pages, inputs=[in_duplicate_pages, duplicate_threshold_value, output_folder_textbox], outputs=[duplicate_pages_df, duplicate_pages_out])
688
 
689
  ###
690
  # SETTINGS PAGE INPUT / OUTPUT
@@ -699,6 +811,13 @@ with app:
699
  in_deny_list_state.input(update_dataframe, inputs=[in_deny_list_state], outputs=[in_deny_list_state])
700
  in_fully_redacted_list_state.input(update_dataframe, inputs=[in_fully_redacted_list_state], outputs=[in_fully_redacted_list_state])
701
 
702
  # Merge multiple review csv files together
703
  merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)
704
 
 
3
  import gradio as gr
4
  from gradio_image_annotation import image_annotator
5
  from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS
6
+ from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars, _get_env_list
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
9
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
10
+ from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback_dataframe_row, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api, get_all_rows_with_same_text, increase_bottom_page_count_based_on_top, store_duplicate_selection
11
  from tools.data_anonymise import anonymise_data_files
12
  from tools.auth import authenticate_user
13
  from tools.load_spacy_model_custom_recognisers import custom_entities
14
  from tools.custom_csvlogger import CSVLogger_custom
15
+ from tools.find_duplicate_pages import run_duplicate_analysis, exclude_match, handle_selection_and_preview, apply_whole_page_redactions_from_list
16
  from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
17
 
18
  # Suppress downcasting warnings
19
  pd.set_option('future.no_silent_downcasting', True)
20
 
21
  # Convert string environment variables to string or list
22
+ if SAVE_LOGS_TO_CSV == "True": SAVE_LOGS_TO_CSV = True
23
+ else: SAVE_LOGS_TO_CSV = False
24
+ if SAVE_LOGS_TO_DYNAMODB == "True": SAVE_LOGS_TO_DYNAMODB = True
25
+ else: SAVE_LOGS_TO_DYNAMODB = False
26
 
27
+ if CSV_ACCESS_LOG_HEADERS: CSV_ACCESS_LOG_HEADERS = _get_env_list(CSV_ACCESS_LOG_HEADERS)
28
+ if CSV_FEEDBACK_LOG_HEADERS: CSV_FEEDBACK_LOG_HEADERS = _get_env_list(CSV_FEEDBACK_LOG_HEADERS)
29
+ if CSV_USAGE_LOG_HEADERS: CSV_USAGE_LOG_HEADERS = _get_env_list(CSV_USAGE_LOG_HEADERS)
30
 
31
+ if DYNAMODB_ACCESS_LOG_HEADERS: DYNAMODB_ACCESS_LOG_HEADERS = _get_env_list(DYNAMODB_ACCESS_LOG_HEADERS)
32
+ if DYNAMODB_FEEDBACK_LOG_HEADERS: DYNAMODB_FEEDBACK_LOG_HEADERS = _get_env_list(DYNAMODB_FEEDBACK_LOG_HEADERS)
33
+ if DYNAMODB_USAGE_LOG_HEADERS: DYNAMODB_USAGE_LOG_HEADERS = _get_env_list(DYNAMODB_USAGE_LOG_HEADERS)
34
 
35
+ if CHOSEN_COMPREHEND_ENTITIES: CHOSEN_COMPREHEND_ENTITIES = _get_env_list(CHOSEN_COMPREHEND_ENTITIES)
36
+ if FULL_COMPREHEND_ENTITY_LIST: FULL_COMPREHEND_ENTITY_LIST = _get_env_list(FULL_COMPREHEND_ENTITY_LIST)
37
+ if CHOSEN_REDACT_ENTITIES: CHOSEN_REDACT_ENTITIES = _get_env_list(CHOSEN_REDACT_ENTITIES)
38
+ if FULL_ENTITY_LIST: FULL_ENTITY_LIST = _get_env_list(FULL_ENTITY_LIST)
39
 
40
  # Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
41
  CHOSEN_COMPREHEND_ENTITIES.extend(custom_entities)
 
44
  FILE_INPUT_HEIGHT = int(FILE_INPUT_HEIGHT)
45
 
46
  # Create the gradio interface
47
+ app = gr.Blocks(theme = gr.themes.Default(primary_hue="blue"), fill_width=True) #gr.themes.Base()
48
 
49
  with app:
50
 
 
57
  all_image_annotations_state = gr.State([])
58
 
59
  all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
60
+
61
 
62
  all_page_line_level_ocr_results = gr.State([])
63
  all_page_line_level_ocr_results_with_children = gr.State([])
 
188
  # Duplicate page detection
189
  in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
190
  duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas", wrap=True)
191
+ full_duplicated_data_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="full_duplicated_data_df", visible=False, type="pandas", wrap=True)
192
+ selected_duplicate_data_row_index = gr.Number(value=None, label="selected_duplicate_data_row_index", visible=False)
193
+ full_duplicate_data_by_file = gr.State() # A dictionary of the full duplicate data indexed by file
194
 
195
  # Tracking variables for current page (not visible)
196
  current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
 
235
 
236
  Redact personally identifiable information (PII) from documents (PDF, images), open text, or tabular data (XLSX/CSV/Parquet). Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use the app. Below is a very brief overview.
237
 
238
+ To identify text in documents, the 'Local' text/OCR image analysis uses spaCy/Tesseract, and works well only for documents with typed text. If available, choose 'AWS Textract' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
239
 
240
  After redaction, review suggested redactions on the 'Review redactions' tab. The original pdf can be uploaded here alongside a '...review_file.csv' to continue a previous redaction/review task. See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or custom terms to always include/ exclude from redaction.
241
 
 
264
  local_ocr_output_found_checkbox = gr.Checkbox(value= False, label="Existing local OCR output file found", interactive=False, visible=True)
265
  with gr.Column(scale=4):
266
  with gr.Row(equal_height=True):
267
+ total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True, interactive=False)
268
+ estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost (£)", value=0.00, precision=2, visible=True, interactive=False)
269
+ estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2, interactive=False)
270
 
271
  if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
272
  with gr.Accordion("Assign task to cost code", open = True, visible=True):
 
322
  annotate_zoom_in = gr.Button("Zoom in", visible=False)
323
  annotate_zoom_out = gr.Button("Zoom out", visible=False)
324
  with gr.Row():
325
+ clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
326
+
327
+ with gr.Accordion(label = "View and edit review file data", open=False):
328
+ review_file_df = gr.Dataframe(value=pd.DataFrame(), headers=['image', 'page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id'], row_count = (0, "dynamic"), label="Review file data", visible=True, type="pandas", wrap=True, show_search=True, show_fullscreen_button=True, show_copy_button=True)
329
 
330
  with gr.Row():
331
  with gr.Column(scale=2):
 
384
 
385
  with gr.Accordion("Search all extracted text", open=True):
386
  all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
387
+ reset_all_ocr_results_btn = gr.Button(value="Reset OCR output table filter")
388
+ selected_ocr_dataframe_row = gr.Dataframe(pd.DataFrame(data={"page":[], "text":[]}), col_count=2, type="pandas", visible=False, headers=["page", "text"], wrap=True)
389
 
390
  with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
391
  convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
 
396
  # IDENTIFY DUPLICATE PAGES TAB
397
  ###
398
  with gr.Tab(label="Identify duplicate pages"):
399
+ gr.Markdown("Search for duplicate pages/subdocuments in your ocr_output files. By default, this function will search for duplicate text across multiple pages, and then join consecutive matching pages together into matched 'subdocuments'. The results can be reviewed below, false positives can be removed, and the verified results can then be applied to a document loaded on the 'Review redactions' tab.")
400
+
401
+ with gr.Accordion("Step 1: Configure and run analysis", open = True):
402
+ in_duplicate_pages = gr.File(
403
+ label="Upload one or multiple 'ocr_output.csv' files to find duplicate pages and subdocuments",
404
+ file_count="multiple", height=FILE_INPUT_HEIGHT, file_types=['.csv']
405
+ )
406
+
407
+ with gr.Accordion("Duplicate matching parameters", open = False):
408
+ with gr.Row():
409
+ duplicate_threshold_input = gr.Number(value=0.95, label="Similarity threshold", info="Score (0-1) to consider pages a match.")
410
+ min_word_count_input = gr.Number(value=10, label="Minimum word count", info="Pages with fewer words than this value are ignored.")
411
+
412
+ gr.Markdown("#### Matching Strategy")
413
+ greedy_match_input = gr.Checkbox(
414
+ label="Enable 'subdocument' matching",
415
+ value=True,
416
+ info="If checked, finds the longest possible sequence of matching pages (subdocuments), minimum length one page. Overrides the slider below."
417
+ )
418
+ min_consecutive_pages_input = gr.Slider(
419
+ minimum=1, maximum=20, value=1, step=1,
420
+ label="Minimum consecutive pages (modified subdocument match)",
421
+ info="If the greedy matching option above is unticked, use this to find only subdocuments of at least this many consecutive pages."
422
+ )
423
+
424
+ find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages/subdocuments", variant="primary")
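The controls above (similarity threshold, minimum word count, greedy subdocument matching) suggest a pipeline along the lines of: combine each page's OCR lines into one text, vectorise the pages, compute pairwise cosine similarity, keep pairs above the threshold, and then merge runs of consecutive matching pages into subdocuments. The actual logic lives in `tools.find_duplicate_pages.run_duplicate_analysis`; the sketch below only illustrates that general approach and is not the app's implementation.

```python
# Illustrative sketch of threshold-based duplicate page detection; not the app's code.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_pages(ocr_df: pd.DataFrame, threshold: float = 0.95, min_words: int = 10) -> pd.DataFrame:
    """ocr_df is assumed to have 'page' and 'text' columns, one row per OCR line."""
    pages = ocr_df.groupby("page")["text"].apply(" ".join).reset_index()
    pages = pages[pages["text"].str.split().str.len() >= min_words]

    tfidf = TfidfVectorizer().fit_transform(pages["text"])
    similarity = cosine_similarity(tfidf)

    matches = []
    page_ids = pages["page"].tolist()
    for i in range(len(page_ids)):
        for j in range(i + 1, len(page_ids)):
            if similarity[i, j] >= threshold:
                matches.append({"page_1": page_ids[i], "page_2": page_ids[j],
                                "similarity": round(float(similarity[i, j]), 3)})
    return pd.DataFrame(matches, columns=["page_1", "page_2", "similarity"])

def merge_consecutive(matches: pd.DataFrame) -> list:
    """Greedily join runs of consecutive matching pages into (start, end) subdocuments."""
    if matches.empty:
        return []
    pages = sorted(set(matches["page_2"]))
    runs, start = [], pages[0]
    for prev, curr in zip(pages, pages[1:]):
        if curr != prev + 1:
            runs.append((start, prev))
            start = curr
    runs.append((start, pages[-1]))
    return runs
```

With greedy matching disabled, the minimum-consecutive-pages slider would then simply filter out runs shorter than the chosen length.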
425
+
426
+ with gr.Accordion("Step 2: Review and refine results", open=True):
427
+ gr.Markdown("### Analysis summary\nClick on a row to select it for preview or exclusion.")
428
+
429
+ with gr.Row():
430
+ results_df_preview = gr.Dataframe(
431
+ label="Similarity Results",
432
+ wrap=True,
433
+ show_fullscreen_button=True,
434
+ show_search=True,
435
+ show_copy_button=True
436
+ )
437
  with gr.Row():
438
+ exclude_match_btn = gr.Button(
439
+ value=" Exclude Selected Match",
440
+ variant="stop"
441
+ )
442
+ gr.Markdown("Click a row in the table, then click this button to remove it from the results and update the downloadable files.")
443
+
444
+ gr.Markdown("### Full Text Preview of Selected Match")
445
+ with gr.Row():
446
+ page1_text_preview = gr.Dataframe(label="Match Source (Document 1)", wrap=True, headers=["page", "text"], show_fullscreen_button=True, show_search=True, show_copy_button=True)
447
+ page2_text_preview = gr.Dataframe(label="Match Duplicate (Document 2)", wrap=True, headers=["page", "text"], show_fullscreen_button=True, show_search=True, show_copy_button=True)
448
 
449
+ gr.Markdown("### Downloadable Files")
450
+ duplicate_files_out = gr.File(
451
+ label="Download analysis summary and redaction lists (.csv)",
452
+ file_count="multiple",
453
+ height=FILE_INPUT_HEIGHT
454
+ )
455
+
456
+ with gr.Row():
457
+ apply_match_btn = gr.Button(
458
+ value="Apply relevant duplicate page output to document currently under review",
459
+ variant="secondary")
460
 
461
  ###
462
  # TEXT / TABULAR DATA TAB
 
511
  in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["allow_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Allow list", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, wrap=True)
512
  in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["deny_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Deny list", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, wrap=True)
513
  in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["fully_redacted_pages_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Fully redacted pages", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, datatype='number', wrap=True)
514
+ with gr.Row():
515
+ with gr.Column(scale=2):
516
+ markdown_placeholder = gr.Markdown("")
517
+ with gr.Column(scale=1):
518
+ apply_fully_redacted_list_btn = gr.Button(
519
+ value="Apply whole page redaction list to document currently under review",
520
+ variant="secondary")
521
 
522
  with gr.Accordion("Select entity types to redact", open = True):
523
  in_redact_entities = gr.Dropdown(value=CHOSEN_REDACT_ENTITIES, choices=FULL_ENTITY_LIST, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
 
587
  cost_code_choice_drop.select(update_cost_code_dataframe_from_dropdown_select, inputs=[cost_code_choice_drop, cost_code_dataframe_base], outputs=[cost_code_dataframe])
588
 
589
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
590
+ success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
591
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
592
  success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox])
593
 
594
  # Run redaction function
595
  document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
596
  success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
597
+ success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
598
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children], api_name="redact_doc")
599
 
600
  # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
601
+ current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
602
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children])
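As the comment here describes, `choose_and_run_redactor` processes the document in batches: each run writes the next page number back into `current_loop_page_number`, and that `.change` event re-invokes the function until the last page is reached (a similar `.change` on `latest_file_completed_text`, just below, then moves on to the next file). A stripped-down sketch of this self-retriggering loop, with made-up names and a toy batch worker, is:

```python
# Stripped-down sketch of the "state change re-runs the worker until finished" loop.
# Names and the toy batch worker are illustrative only.
import gradio as gr

BATCH_SIZE = 5
TOTAL_PAGES = 12

def process_batch(current_page, log):
    if current_page >= TOTAL_PAGES:
        # Returning gr.update() leaves the page number untouched, so no further
        # .change event fires and the loop stops.
        return gr.update(), log + "\nAll pages done."
    next_page = min(int(current_page) + BATCH_SIZE, TOTAL_PAGES)
    log = log + f"\nProcessed pages {int(current_page) + 1}-{next_page}."
    # Writing a new value to current_page fires its .change event, which calls this function again.
    return next_page, log

with gr.Blocks() as demo:
    current_page = gr.Number(value=0, precision=0, visible=False)
    progress_log = gr.Textbox(value="", label="Progress log")
    start_btn = gr.Button("Start")

    start_btn.click(process_batch, inputs=[current_page, progress_log], outputs=[current_page, progress_log])
    current_page.change(process_batch, inputs=[current_page, progress_log], outputs=[current_page, progress_log])

if __name__ == "__main__":
    demo.launch()
```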
603
 
604
  # If a file has been completed, the function will continue onto the next document
605
+ latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
606
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children]).\
607
+ success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
608
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
609
  success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox]).\
610
  success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title]).\
 
626
  textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
627
 
628
  convert_textract_outputs_to_ocr_results.click(replace_existing_pdf_input_for_whole_document_outputs, inputs = [s3_whole_document_textract_input_subfolder, doc_file_name_no_extension_textbox, output_folder_textbox, s3_whole_document_textract_default_bucket, in_doc_files, input_folder_textbox], outputs = [in_doc_files, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
629
+ success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
630
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
631
  success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox]).\
632
  success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
633
  success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
634
+ success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
635
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children]).\
636
+ success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
637
 
638
  ###
639
  # REVIEW PDF REDACTIONS
640
  ###
641
+
642
 
643
  # Upload previous files for modifying redactions
644
  upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
645
  success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
646
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox], api_name="prepare_doc").\
647
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
648
+
649
+ # Manual updates to review df
650
+ review_file_df.input(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
651
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
652
 
653
  # Page number controls
654
  annotate_current_page.submit(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
655
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
656
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
657
 
658
  annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
659
  success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
660
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
661
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
662
 
663
  annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
664
  success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
665
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
666
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
667
 
668
  annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
669
  success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
670
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
671
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
672
 
673
  annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
674
  success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
675
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
676
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
677
 
678
  annotate_current_page_bottom.submit(update_other_annotator_number_from_current, inputs=[annotate_current_page_bottom], outputs=[annotate_current_page]).\
679
  success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
680
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
681
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
682
 
683
  # Apply page redactions
684
+ annotation_button_apply.click(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df], scroll_to_output=True)
685
 
686
  # Save current page redactions
687
  update_current_page_redactions_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
688
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
689
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
690
 
691
  # Review table controls
692
  recogniser_entity_dropdown.select(update_entities_df_recogniser_entities, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, text_entity_dropdown, page_entity_dropdown])
 
695
 
696
  # Clicking on a cell in the recogniser entity dataframe will take you to that page, and also highlight the target redaction box in blue
697
  recogniser_entity_dataframe.select(df_select_callback_dataframe_row, inputs=[recogniser_entity_dataframe], outputs=[selected_entity_dataframe_row, selected_entity_dataframe_row_text]).\
698
+ success(update_selected_review_df_row_colour, inputs=[selected_entity_dataframe_row, review_file_df, selected_entity_id, selected_entity_colour], outputs=[review_file_df, selected_entity_id, selected_entity_colour]).\
699
+ success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
700
+ success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
701
 
702
  reset_dropdowns_btn.click(reset_dropdowns, inputs=[recogniser_entity_dataframe_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
703
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
704
 
705
  # Exclude current selection from annotator and outputs
706
  # Exclude only selected row
707
+ exclude_selected_row_btn.click(exclude_selected_items_from_redaction, inputs=[review_file_df, selected_entity_dataframe_row, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
708
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
709
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df]).\
710
  success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
711
 
712
  # Exclude all items with same text as selected row
713
  exclude_text_with_same_as_selected_row_btn.click(get_all_rows_with_same_text, inputs=[recogniser_entity_dataframe_base, selected_entity_dataframe_row_text], outputs=[recogniser_entity_dataframe_same_text]).\
714
+ success(exclude_selected_items_from_redaction, inputs=[review_file_df, recogniser_entity_dataframe_same_text, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
715
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
716
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df]).\
717
  success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
718
 
719
  # Exclude everything visible in table
720
+ exclude_selected_btn.click(exclude_selected_items_from_redaction, inputs=[review_file_df, recogniser_entity_dataframe, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
721
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
722
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df]).\
723
  success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
724
 
725
+ undo_last_removal_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base]).\
726
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
727
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
 
 
728
 
729
  # Review OCR text button
730
+ all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_ocr_dataframe_row]).\
731
+ success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_ocr_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
732
+ success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
733
+
734
  reset_all_ocr_results_btn.click(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
735
 
736
  # Convert review file to xfdf Adobe format
737
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
738
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, local_ocr_output_found_checkbox]).\
739
  success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
740
 
741
  # Convert xfdf Adobe file back to review_file.csv
742
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
743
+ success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, local_ocr_output_found_checkbox]).\
744
  success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
745
 
746
  ###
 
753
  success(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number], api_name="redact_data").\
754
  success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
755
 
756
+ # Currently only supports redacting one data file at a time; the following code block is not used
757
  # If the output file count text box changes, keep going with redacting each data file until done
758
  # text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number]).\
759
  # success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
 
761
  ###
762
  # IDENTIFY DUPLICATE PAGES
763
  ###
764
+ find_duplicate_pages_btn.click(
765
+ fn=run_duplicate_analysis,
766
+ inputs=[
767
+ in_duplicate_pages,
768
+ duplicate_threshold_input,
769
+ min_word_count_input,
770
+ min_consecutive_pages_input,
771
+ greedy_match_input
772
+ ],
773
+ outputs=[
774
+ results_df_preview,
775
+ duplicate_files_out,
776
+ full_duplicate_data_by_file
777
+ ]
778
+ )
779
+
780
+ # full_duplicated_data_df,
781
+ results_df_preview.select(
782
+ fn=handle_selection_and_preview,
783
+ inputs=[results_df_preview, full_duplicate_data_by_file],
784
+ outputs=[selected_duplicate_data_row_index, page1_text_preview, page2_text_preview]
785
+ )
786
+
787
+ # When the user clicks the "Exclude" button
788
+ exclude_match_btn.click(
789
+ fn=exclude_match,
790
+ inputs=[results_df_preview, selected_duplicate_data_row_index],
791
+ outputs=[results_df_preview, duplicate_files_out, page1_text_preview, page2_text_preview]
792
+ )
793
+
794
+ apply_match_btn.click(
795
+ fn=apply_whole_page_redactions_from_list,
796
+ inputs=[in_fully_redacted_list_state, doc_file_name_with_extension_textbox, review_file_df, duplicate_files_out, pdf_doc_state, page_sizes, all_image_annotations_state],
797
+ outputs=[review_file_df, all_image_annotations_state]).\
798
+ success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
799
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
800
 
801
  ###
802
  # SETTINGS PAGE INPUT / OUTPUT
 
811
  in_deny_list_state.input(update_dataframe, inputs=[in_deny_list_state], outputs=[in_deny_list_state])
812
  in_fully_redacted_list_state.input(update_dataframe, inputs=[in_fully_redacted_list_state], outputs=[in_fully_redacted_list_state])
813
 
814
+ apply_fully_redacted_list_btn.click(
815
+ fn=apply_whole_page_redactions_from_list,
816
+ inputs=[in_fully_redacted_list_state, doc_file_name_with_extension_textbox, review_file_df, duplicate_files_out, pdf_doc_state, page_sizes, all_image_annotations_state],
817
+ outputs=[review_file_df, all_image_annotations_state]).\
818
+ success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
819
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
820
+
821
  # Merge multiple review csv files together
822
  merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)
823
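The chains above all follow the same Gradio pattern: a component event such as .click() or .select() runs a first handler, and each chained .success() step only fires if the previous step completed without an error, passing state between handlers through shared components. A minimal, self-contained sketch of that pattern (illustrative only; the component and function names here are made up and are not part of this pull request):

import gradio as gr

def decrease_page(current_page):
    # Mirrors the app's page-navigation handlers: never go below page 1
    return max(1, current_page - 1)

def refresh_view(current_page):
    return f"Showing page {current_page}"

with gr.Blocks() as demo:
    page_number = gr.Number(value=1, precision=0)
    status = gr.Textbox()
    previous_page_btn = gr.Button("Previous page")

    # Same shape as the wiring above: navigate first, then refresh the view,
    # but only if the navigation step succeeded.
    previous_page_btn.click(fn=decrease_page, inputs=[page_number], outputs=[page_number]).\
        success(fn=refresh_view, inputs=[page_number], outputs=[status])

demo.launch()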
 
cdk/__init__.py ADDED
File without changes
cdk/app.py ADDED
@@ -0,0 +1,81 @@
1
+ import os
2
+ from aws_cdk import (App, Environment)
3
+
4
+ # Helper modules for pre-deployment checks, configuration and stack definitions
5
+ from check_resources import check_and_set_context, CONTEXT_FILE
6
+ from cdk_config import AWS_ACCOUNT_ID, AWS_REGION, RUN_USEAST_STACK, USE_CLOUDFRONT
7
+ from cdk_stack import CdkStack, CdkStackCloudfront#, CdkStackMain
8
+ from cdk_functions import load_context_from_file, create_basic_config_env
9
+
10
+ # Initialize the CDK app
11
+ app = App()
12
+
13
+ # --- ENHANCED CONTEXT GENERATION AND LOADING ---
14
+ # 1. Always ensure the old context file is removed before generation
15
+ if os.path.exists(CONTEXT_FILE):
16
+ try:
17
+ os.remove(CONTEXT_FILE)
18
+ print(f"Removed stale context file: {CONTEXT_FILE}")
19
+ except OSError as e:
20
+ print(f"Warning: Could not remove old context file {CONTEXT_FILE}: {e}")
21
+ # Proceed anyway, check_and_set_context might handle overwriting
22
+
23
+ # 2. Always run the pre-check script to generate fresh context
24
+ print("Running pre-check script to generate application context...")
25
+ try:
26
+ check_and_set_context()
27
+ if not os.path.exists(CONTEXT_FILE):
28
+ raise RuntimeError(f"check_and_set_context() finished, but {CONTEXT_FILE} was not created.")
29
+ print(f"Context generated successfully at {CONTEXT_FILE}.")
30
+ except Exception as e:
31
+ raise RuntimeError(f"Failed to generate context via check_and_set_context(): {e}")
32
+
33
+ if os.path.exists(CONTEXT_FILE):
34
+ load_context_from_file(app, CONTEXT_FILE)
35
+ else:
36
+ raise RuntimeError(f"Could not find {CONTEXT_FILE}.")
37
+
38
+ # Create a basic config.env file that the user can use to run the app later. The argument is the folder it is saved into.
39
+ create_basic_config_env("config")
40
+
41
+ # Define the environment for the regional stack (where ALB resides)
42
+ aws_env_regional = Environment(account=AWS_ACCOUNT_ID, region=AWS_REGION)
43
+
44
+ # Create the regional stack (ALB, SGs, etc.)
45
+ # regional_stack = CdkStack(app,
46
+ # "RedactionStackSubnets",
47
+ # env=aws_env_regional,
48
+ # cross_region_references=True)
49
+
50
+ # regional_stack_main = CdkStackMain(app,
51
+ # "RedactionStackMain",
52
+ # env=aws_env_regional,
53
+ # private_subnets=regional_stack.params["private_subnets"],
54
+ # private_route_tables=regional_stack.params["private_route_tables"],
55
+ # public_subnets=regional_stack.params["public_subnets"],
56
+ # public_route_tables=regional_stack.params["public_route_tables"],
57
+ # cross_region_references=True)
58
+
59
+ regional_stack = CdkStack(app,
60
+ "RedactionStack",
61
+ env=aws_env_regional,
62
+ cross_region_references=True)
63
+
64
+ if USE_CLOUDFRONT == 'True' and RUN_USEAST_STACK == 'True':
65
+ # Define the environment for the CloudFront stack (always us-east-1 for CF-level resources like WAFv2 WebACLs for CF)
66
+ aws_env_us_east_1 = Environment(account=AWS_ACCOUNT_ID, region="us-east-1")
67
+
68
+ # Create the CloudFront stack, passing the outputs from the regional stack
69
+ cloudfront_stack = CdkStackCloudfront(
70
+ app,
71
+ "RedactionStackCloudfront",
72
+ env=aws_env_us_east_1,
73
+ alb_arn=regional_stack.params["alb_arn_output"],
74
+ alb_sec_group_id=regional_stack.params["alb_security_group_id"],
75
+ alb_dns_name=regional_stack.params["alb_dns_name"],
76
+ cross_region_references=True
77
+ )
78
+
79
+
80
+ # Synthesize the CloudFormation template
81
+ app.synth(validate_on_synthesis=True)
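The stack names used above come straight from this file, so once context generation succeeds the app can be synthesised and deployed with the usual CDK workflow. A minimal sketch of the context round-trip that app.py relies on (the file name cdk.context.json is from the source; the keys and values below are placeholders):

import json
from aws_cdk import App

# Hypothetical pre-generated context, in the shape check_and_set_context() writes
context_data = {"exists:example-log-bucket": True, "vpc_id": "vpc-0123456789abcdef0"}
with open("cdk.context.json", "w") as f:
    json.dump(context_data, f)

app = App()
with open("cdk.context.json") as f:
    for key, value in json.load(f).items():
        app.node.set_context(key, value)  # same call load_context_from_file() makes

assert app.node.try_get_context("vpc_id") == "vpc-0123456789abcdef0"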
cdk/cdk_config.py ADDED
@@ -0,0 +1,225 @@
1
+ import os
2
+ import tempfile
3
+ from dotenv import load_dotenv
4
+
5
+ # Set or retrieve configuration variables for CDK redaction deployment
6
+
7
+ def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False):
8
+ '''
9
+ Get an environmental variable, and set it to a default value if it doesn't exist
10
+ '''
11
+ # Get the environment variable if it exists
12
+ value = os.environ.get(var_name)
13
+
14
+ # If it doesn't exist, set the environment variable to the default value
15
+ if value is None:
16
+ os.environ[var_name] = default_value
17
+ value = default_value
18
+
19
+ if print_val == True:
20
+ print(f'The value of {var_name} is {value}')
21
+
22
+ return value
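+ # Illustrative behaviour (hypothetical variable name): if MY_SETTING is not set in the
+ # environment, get_or_create_env_var('MY_SETTING', 'default') returns 'default' and also
+ # writes it to os.environ, so later lookups of MY_SETTING see the same value.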
23
+
24
+ def ensure_folder_exists(output_folder:str):
25
+ """Checks if the specified folder exists, creates it if not."""
26
+
27
+ if not os.path.exists(output_folder):
28
+ # Create the folder if it doesn't exist
29
+ os.makedirs(output_folder, exist_ok=True)
30
+ print(f"Created the {output_folder} folder.")
31
+ else:
32
+ print(f"The {output_folder} folder already exists.")
33
+
34
+ def add_folder_to_path(folder_path: str):
35
+ '''
36
+ Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
37
+ '''
38
+
39
+ if os.path.exists(folder_path) and os.path.isdir(folder_path):
40
+ print(folder_path, "folder exists.")
41
+
42
+ # Resolve relative path to absolute path
43
+ absolute_path = os.path.abspath(folder_path)
44
+
45
+ current_path = os.environ['PATH']
46
+ if absolute_path not in current_path.split(os.pathsep):
47
+ full_path_extension = absolute_path + os.pathsep + current_path
48
+ os.environ['PATH'] = full_path_extension
49
+ #print(f"Updated PATH with: ", full_path_extension)
50
+ else:
51
+ print(f"Directory {folder_path} already exists in PATH.")
52
+ else:
53
+ print(f"Folder not found at {folder_path} - not added to PATH")
54
+
55
+ ###
56
+ # LOAD CONFIG FROM ENV FILE
57
+ ###
58
+ CONFIG_FOLDER = get_or_create_env_var('CONFIG_FOLDER', "config/")
59
+
60
+ ensure_folder_exists(CONFIG_FOLDER)
61
+
62
+ # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/cdk_config.env'
63
+ CDK_CONFIG_PATH = get_or_create_env_var('CDK_CONFIG_PATH', 'config/cdk_config.env') # e.g. config/cdk_config.env
64
+
65
+ if CDK_CONFIG_PATH:
66
+ if os.path.exists(CDK_CONFIG_PATH):
67
+ print(f"Loading CDK variables from config file {CDK_CONFIG_PATH}")
68
+ load_dotenv(CDK_CONFIG_PATH)
69
+ else: print("CDK config file not found at location:", CDK_CONFIG_PATH)
70
+
71
+ ###
72
+ # AWS OPTIONS
73
+ ###
74
+ AWS_REGION = get_or_create_env_var('AWS_REGION', '')
75
+ AWS_ACCOUNT_ID = get_or_create_env_var('AWS_ACCOUNT_ID', '')
76
+
77
+ ###
78
+ # CDK OPTIONS
79
+ ###
80
+ CDK_PREFIX = get_or_create_env_var('CDK_PREFIX', '')
81
+ CONTEXT_FILE = get_or_create_env_var('CONTEXT_FILE', 'cdk.context.json') # Define the CDK output context file name
82
+ CDK_FOLDER = get_or_create_env_var('CDK_FOLDER', '') # FULL_PATH_TO_CDK_FOLDER_HERE (with forward slash)
83
+ RUN_USEAST_STACK = get_or_create_env_var('RUN_USEAST_STACK', 'False')
84
+
85
+ ### VPC
86
+ VPC_NAME = get_or_create_env_var('VPC_NAME', '')
87
+ EXISTING_IGW_ID = get_or_create_env_var('EXISTING_IGW_ID', '')
88
+ SINGLE_NAT_GATEWAY_ID = get_or_create_env_var('SINGLE_NAT_GATEWAY_ID', '')
89
+
90
+ ### SUBNETS / ROUTE TABLES / NAT GATEWAY
91
+ PUBLIC_SUBNETS_TO_USE = get_or_create_env_var('PUBLIC_SUBNETS_TO_USE', '') # e.g. ['PublicSubnet1', 'PublicSubnet2']
92
+ PUBLIC_SUBNET_CIDR_BLOCKS = get_or_create_env_var('PUBLIC_SUBNET_CIDR_BLOCKS', '') # e.g. ["10.0.1.0/24", "10.0.2.0/24"]
93
+ PUBLIC_SUBNET_AVAILABILITY_ZONES = get_or_create_env_var('PUBLIC_SUBNET_AVAILABILITY_ZONES', '') # e.g. ["eu-west-2a", "eu-west-2b"]
94
+
95
+ PRIVATE_SUBNETS_TO_USE = get_or_create_env_var('PRIVATE_SUBNETS_TO_USE', '') # e.g. ['PrivateSubnet1', 'PrivateSubnet2']
96
+ PRIVATE_SUBNET_CIDR_BLOCKS = get_or_create_env_var('PRIVATE_SUBNET_CIDR_BLOCKS', '') # e.g. ["10.0.1.0/24", "10.0.2.0/24"]
97
+ PRIVATE_SUBNET_AVAILABILITY_ZONES = get_or_create_env_var('PRIVATE_SUBNET_AVAILABILITY_ZONES', '') # e.g. ["eu-west-2a", "eu-west-2b"]
98
+
99
+ ROUTE_TABLE_BASE_NAME = get_or_create_env_var('ROUTE_TABLE_BASE_NAME', f'{CDK_PREFIX}PrivateRouteTable')
100
+ NAT_GATEWAY_EIP_NAME = get_or_create_env_var('NAT_GATEWAY_EIP_NAME', f"{CDK_PREFIX}NatGatewayEip")
101
+ NAT_GATEWAY_NAME = get_or_create_env_var('NAT_GATEWAY_NAME', f"{CDK_PREFIX}NatGateway")
102
+
103
+ # IAM roles
104
+ AWS_MANAGED_TASK_ROLES_LIST = get_or_create_env_var('AWS_MANAGED_TASK_ROLES_LIST', '["AmazonCognitoReadOnly", "service-role/AmazonECSTaskExecutionRolePolicy", "AmazonS3FullAccess", "AmazonTextractFullAccess", "ComprehendReadOnly", "AmazonDynamoDBFullAccess", "service-role/AWSAppSyncPushToCloudWatchLogs"]')
105
+ POLICY_FILE_LOCATIONS = get_or_create_env_var('POLICY_FILE_LOCATIONS', '') # e.g. '["config/sts_permissions.json"]'
106
+ POLICY_FILE_ARNS = get_or_create_env_var('POLICY_FILE_ARNS', '')
107
+
108
+ # GITHUB REPO
109
+ GITHUB_REPO_USERNAME = get_or_create_env_var('GITHUB_REPO_USERNAME', 'seanpedrick-case')
110
+ GITHUB_REPO_NAME = get_or_create_env_var('GITHUB_REPO_NAME', 'doc_redaction')
111
+ GITHUB_REPO_BRANCH = get_or_create_env_var('GITHUB_REPO_BRANCH', 'main')
112
+
113
+ ### CODEBUILD
114
+ CODEBUILD_ROLE_NAME = get_or_create_env_var('CODEBUILD_ROLE_NAME', f"{CDK_PREFIX}CodeBuildRole")
115
+ CODEBUILD_PROJECT_NAME = get_or_create_env_var('CODEBUILD_PROJECT_NAME', f"{CDK_PREFIX}CodeBuildProject")
116
+
117
+ ### ECR
118
+ ECR_REPO_NAME = get_or_create_env_var('ECR_REPO_NAME', 'doc-redaction') # Beware - cannot have underscores and must be lower case
119
+ ECR_CDK_REPO_NAME = get_or_create_env_var('ECR_CDK_REPO_NAME', f"{CDK_PREFIX}{ECR_REPO_NAME}".lower())
120
+
121
+ ### S3
122
+ S3_LOG_CONFIG_BUCKET_NAME = get_or_create_env_var('S3_LOG_CONFIG_BUCKET_NAME', f"{CDK_PREFIX}s3-logs".lower()) # S3 bucket names need to be lower case
123
+ S3_OUTPUT_BUCKET_NAME = get_or_create_env_var('S3_OUTPUT_BUCKET_NAME', f"{CDK_PREFIX}s3-output".lower())
124
+
125
+ ### ECS
126
+ FARGATE_TASK_DEFINITION_NAME = get_or_create_env_var('FARGATE_TASK_DEFINITION_NAME', f"{CDK_PREFIX}FargateTaskDefinition")
127
+ TASK_DEFINITION_FILE_LOCATION = get_or_create_env_var('TASK_DEFINITION_FILE_LOCATION', CDK_FOLDER + CONFIG_FOLDER + "task_definition.json")
128
+
129
+ CLUSTER_NAME = get_or_create_env_var('CLUSTER_NAME', f"{CDK_PREFIX}Cluster")
130
+ ECS_SERVICE_NAME = get_or_create_env_var('ECS_SERVICE_NAME', f"{CDK_PREFIX}ECSService")
131
+ ECS_TASK_ROLE_NAME = get_or_create_env_var('ECS_TASK_ROLE_NAME', f"{CDK_PREFIX}TaskRole")
132
+ ECS_TASK_EXECUTION_ROLE_NAME = get_or_create_env_var('ECS_TASK_EXECUTION_ROLE_NAME', f"{CDK_PREFIX}ExecutionRole")
133
+ ECS_SECURITY_GROUP_NAME = get_or_create_env_var('ECS_SECURITY_GROUP_NAME', f"{CDK_PREFIX}SecurityGroupECS")
134
+ ECS_LOG_GROUP_NAME = get_or_create_env_var('ECS_LOG_GROUP_NAME', f"/ecs/{ECS_SERVICE_NAME}-logs".lower())
135
+
136
+ ECS_TASK_CPU_SIZE = get_or_create_env_var('ECS_TASK_CPU_SIZE', '1024')
137
+ ECS_TASK_MEMORY_SIZE = get_or_create_env_var('ECS_TASK_MEMORY_SIZE', '4096')
138
+ ECS_USE_FARGATE_SPOT = get_or_create_env_var('USE_FARGATE_SPOT', 'False')
139
+ ECS_READ_ONLY_FILE_SYSTEM = get_or_create_env_var('ECS_READ_ONLY_FILE_SYSTEM', 'True')
140
+
141
+ ### Cognito
142
+ COGNITO_USER_POOL_NAME = get_or_create_env_var('COGNITO_USER_POOL_NAME', f"{CDK_PREFIX}UserPool")
143
+ COGNITO_USER_POOL_CLIENT_NAME = get_or_create_env_var('COGNITO_USER_POOL_CLIENT_NAME', f"{CDK_PREFIX}UserPoolClient")
144
+ COGNITO_USER_POOL_CLIENT_SECRET_NAME = get_or_create_env_var('COGNITO_USER_POOL_CLIENT_SECRET_NAME', f"{CDK_PREFIX}ParamCognitoSecret")
145
+ COGNITO_USER_POOL_DOMAIN_PREFIX = get_or_create_env_var('COGNITO_USER_POOL_DOMAIN_PREFIX', "redaction-app-domain") # Should change this to something unique or you'll probably hit an error
146
+
147
+ # Application load balancer
148
+ ALB_NAME = get_or_create_env_var('ALB_NAME', f"{CDK_PREFIX}Alb"[-32:]) # Application load balancer name can be max 32 characters, so taking the last 32 characters of the suggested name
149
+ ALB_NAME_SECURITY_GROUP_NAME = get_or_create_env_var('ALB_SECURITY_GROUP_NAME', f"{CDK_PREFIX}SecurityGroupALB")
150
+ ALB_TARGET_GROUP_NAME = get_or_create_env_var('ALB_TARGET_GROUP_NAME', f"{CDK_PREFIX}-tg"[-32:]) # Max 32 characters
151
+ EXISTING_LOAD_BALANCER_ARN = get_or_create_env_var('EXISTING_LOAD_BALANCER_ARN', '')
152
+ EXISTING_LOAD_BALANCER_DNS = get_or_create_env_var('EXISTING_LOAD_BALANCER_DNS', 'placeholder_load_balancer_dns.net')
153
+
154
+ ## CLOUDFRONT
155
+ USE_CLOUDFRONT = get_or_create_env_var('USE_CLOUDFRONT', 'True')
156
+ CLOUDFRONT_PREFIX_LIST_ID = get_or_create_env_var('CLOUDFRONT_PREFIX_LIST_ID', 'pl-93a247fa')
157
+ CLOUDFRONT_GEO_RESTRICTION = get_or_create_env_var('CLOUDFRONT_GEO_RESTRICTION', '') # A country that Cloudfront restricts access to. See here: https://docs.aws.amazon.com/AmazonCloudFront/latest/DeveloperGuide/georestrictions.html
158
+ CLOUDFRONT_DISTRIBUTION_NAME = get_or_create_env_var('CLOUDFRONT_DISTRIBUTION_NAME', f"{CDK_PREFIX}CfDist")
159
+ CLOUDFRONT_DOMAIN = get_or_create_env_var('CLOUDFRONT_DOMAIN', "cloudfront_placeholder.net")
160
+
161
+
162
+ # Certificate for Application load balancer (optional, for HTTPS and logins through the ALB)
163
+ ACM_CERTIFICATE_ARN = get_or_create_env_var('ACM_CERTIFICATE_ARN', '')
164
+ SSL_CERTIFICATE_DOMAIN = get_or_create_env_var('SSL_CERTIFICATE_DOMAIN', '') # e.g. example.com or www.example.com
165
+
166
+ # This should be the CloudFront domain, the domain linked to your ACM certificate, or the DNS of your application load balancer in console afterwards
167
+ if USE_CLOUDFRONT == "True":
168
+ COGNITO_REDIRECTION_URL = get_or_create_env_var('COGNITO_REDIRECTION_URL', "https://" + CLOUDFRONT_DOMAIN)
169
+ elif SSL_CERTIFICATE_DOMAIN:
170
+ COGNITO_REDIRECTION_URL = get_or_create_env_var('COGNITO_REDIRECTION_URL', "https://" + SSL_CERTIFICATE_DOMAIN)
171
+ else:
172
+ COGNITO_REDIRECTION_URL = get_or_create_env_var('COGNITO_REDIRECTION_URL', "https://" + EXISTING_LOAD_BALANCER_DNS)
173
+
174
+ # Custom headers e.g. if routing traffic through Cloudfront
175
+ CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '') # Retrieving or setting CUSTOM_HEADER
176
+ CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '') # Retrieving or setting CUSTOM_HEADER_VALUE
177
+
178
+ # Firewall on top of load balancer
179
+ LOAD_BALANCER_WEB_ACL_NAME = get_or_create_env_var('LOAD_BALANCER_WEB_ACL_NAME', f"{CDK_PREFIX}alb-web-acl")
180
+
181
+ # Firewall on top of CloudFront
182
+ WEB_ACL_NAME = get_or_create_env_var('WEB_ACL_NAME', f"{CDK_PREFIX}cloudfront-web-acl")
183
+
184
+ ###
185
+ # File I/O options
186
+ ###
187
+
188
+ OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
189
+ INPUT_FOLDER = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/') # 'input/'
190
+
191
+ # Allow for files to be saved in a temporary folder for increased security in some instances
192
+ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
193
+ # Create a temporary directory
194
+ temp_dir = tempfile.mkdtemp() # mkdtemp keeps the folder after this block; TemporaryDirectory() would delete it when the 'with' context exits
195
+ print(f'Temporary directory created at: {temp_dir}')
196
+
197
+ if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
198
+ if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
199
+
200
+ ###
201
+ # LOGGING OPTIONS
202
+ ###
203
+
204
+ SAVE_LOGS_TO_CSV = get_or_create_env_var('SAVE_LOGS_TO_CSV', 'True')
205
+
206
+ ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
207
+ SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var('SAVE_LOGS_TO_DYNAMODB', 'True')
208
+ ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('ACCESS_LOG_DYNAMODB_TABLE_NAME', f"{CDK_PREFIX}dynamodb-access-log".lower())
209
+ FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('FEEDBACK_LOG_DYNAMODB_TABLE_NAME', f"{CDK_PREFIX}dynamodb-feedback".lower())
210
+ USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('USAGE_LOG_DYNAMODB_TABLE_NAME', f"{CDK_PREFIX}dynamodb-usage".lower())
211
+
212
+ ###
213
+ # REDACTION OPTIONS
214
+ ###
215
+
216
+ # Get some environment variables and Launch the Gradio app
217
+ COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
218
+
219
+ GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))
220
+
221
+ ###
222
+ # WHOLE DOCUMENT API OPTIONS
223
+ ###
224
+
225
+ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = get_or_create_env_var('DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS', '7') # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
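+ # For reference, a config/cdk_config.env file read via CDK_CONFIG_PATH above could look
+ # like the following (illustrative placeholder values only):
+ # AWS_REGION=eu-west-2
+ # AWS_ACCOUNT_ID=123456789012
+ # CDK_PREFIX=redact
+ # VPC_NAME=my-existing-vpc
+ # USE_CLOUDFRONT=True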
cdk/cdk_functions.py ADDED
@@ -0,0 +1,1293 @@
1
+ import boto3
2
+ from botocore.exceptions import ClientError
3
+ import json
4
+ import os
5
+ import pandas as pd
6
+ import ipaddress
7
+ from constructs import Construct
8
+ from dotenv import set_key
9
+ from typing import List, Tuple, Optional, Dict, Any
10
+ from aws_cdk import (
11
+ App,
12
+ CfnTag,
13
+ aws_ec2 as ec2,
14
+ aws_wafv2 as wafv2,
15
+ aws_elasticloadbalancingv2 as elb,
16
+ aws_elasticloadbalancingv2_actions as elb_act,
17
+ aws_certificatemanager as acm, # You might need this if you were looking up a cert, but not strictly for ARN
18
+ aws_cognito as cognito,
19
+ aws_iam as iam,
20
+ CfnOutput,
21
+ Tags
22
+ )
23
+
24
+
25
+
26
+ from cdk_config import PUBLIC_SUBNETS_TO_USE, PRIVATE_SUBNETS_TO_USE, PUBLIC_SUBNET_CIDR_BLOCKS, PRIVATE_SUBNET_CIDR_BLOCKS, PUBLIC_SUBNET_AVAILABILITY_ZONES, PRIVATE_SUBNET_AVAILABILITY_ZONES, POLICY_FILE_LOCATIONS, NAT_GATEWAY_EIP_NAME, S3_LOG_CONFIG_BUCKET_NAME, S3_OUTPUT_BUCKET_NAME, ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, AWS_REGION
27
+
28
+ # --- Function to load context from file ---
29
+ def load_context_from_file(app: App, file_path: str):
30
+ if os.path.exists(file_path):
31
+ with open(file_path, 'r') as f:
32
+ context_data = json.load(f)
33
+ for key, value in context_data.items():
34
+ app.node.set_context(key, value)
35
+ print(f"Loaded context from {file_path}")
36
+ else:
37
+ print(f"Context file not found: {file_path}")
38
+
39
+ # --- Helper to parse environment variables into lists ---
40
+ def _get_env_list(env_list_str: str) -> List[str]:
42
+ """Parses a bracketed, comma-separated string such as '["a", "b"]' into a list of strings."""
43
+ value = env_list_str[1:-1].strip().replace('\"', '').replace("\'","")
43
+ if not value:
44
+ return []
45
+ # Split by comma and filter out any empty strings that might result from extra commas
46
+ return [s.strip() for s in value.split(',') if s.strip()]
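+ # Illustrative behaviour (hypothetical input, in the bracketed list format used by
+ # cdk_config.py): _get_env_list('["10.0.1.0/24", "10.0.2.0/24"]') returns
+ # ['10.0.1.0/24', '10.0.2.0/24'].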
47
+
48
+ # 1. Try to load CIDR/AZs from environment variables
49
+ if PUBLIC_SUBNETS_TO_USE: PUBLIC_SUBNETS_TO_USE = _get_env_list(PUBLIC_SUBNETS_TO_USE)
50
+ if PRIVATE_SUBNETS_TO_USE: PRIVATE_SUBNETS_TO_USE = _get_env_list(PRIVATE_SUBNETS_TO_USE)
51
+
52
+ if PUBLIC_SUBNET_CIDR_BLOCKS: PUBLIC_SUBNET_CIDR_BLOCKS = _get_env_list(PUBLIC_SUBNET_CIDR_BLOCKS)
53
+ if PUBLIC_SUBNET_AVAILABILITY_ZONES: PUBLIC_SUBNET_AVAILABILITY_ZONES = _get_env_list(PUBLIC_SUBNET_AVAILABILITY_ZONES)
54
+ if PRIVATE_SUBNET_CIDR_BLOCKS: PRIVATE_SUBNET_CIDR_BLOCKS = _get_env_list(PRIVATE_SUBNET_CIDR_BLOCKS)
55
+ if PRIVATE_SUBNET_AVAILABILITY_ZONES: PRIVATE_SUBNET_AVAILABILITY_ZONES = _get_env_list(PRIVATE_SUBNET_AVAILABILITY_ZONES)
56
+
57
+ if POLICY_FILE_LOCATIONS: POLICY_FILE_LOCATIONS = _get_env_list(POLICY_FILE_LOCATIONS)
58
+
59
+ def check_for_existing_role(role_name:str):
60
+ try:
61
+ iam = boto3.client('iam')
62
+ #iam.get_role(RoleName=role_name)
63
+
64
+ response = iam.get_role(RoleName=role_name)
65
+ role = response['Role']['Arn']
66
+
67
+ print("Response Role:", role)
68
+
69
+ return True, role, ""
70
+ except iam.exceptions.NoSuchEntityException:
71
+ return False, "", ""
72
+ except Exception as e:
73
+ raise Exception("Getting information on IAM role failed due to:", e)
74
+
75
+ import json
76
+ from typing import List, Dict, Any, Union, Optional
77
+ from aws_cdk import (
78
+ aws_iam as iam,
79
+ )
80
+ from constructs import Construct
81
+
82
+ # Assume POLICY_FILE_LOCATIONS is defined globally or passed as a default
83
+ # For example:
84
+ # POLICY_FILE_LOCATIONS = ["./policies/my_read_policy.json", "./policies/my_write_policy.json"]
85
+
86
+
87
+ def add_statement_to_policy(role: iam.IRole, policy_document: Dict[str, Any]):
88
+ """
89
+ Adds individual policy statements from a parsed policy document to a CDK Role.
90
+
91
+ Args:
92
+ role: The CDK Role construct to attach policies to.
93
+ policy_document: A Python dictionary representing an IAM policy document.
94
+ """
95
+ # Ensure the loaded JSON is a valid policy document structure
96
+ if 'Statement' not in policy_document or not isinstance(policy_document['Statement'], list):
97
+ print(f"Warning: Policy document does not contain a 'Statement' list. Skipping.")
98
+ return # Do not return role, just log and exit
99
+
100
+ for statement_dict in policy_document['Statement']:
101
+ try:
102
+ # Create a CDK PolicyStatement from the dictionary
103
+ cdk_policy_statement = iam.PolicyStatement.from_json(statement_dict)
104
+
105
+ # Add the policy statement to the role
106
+ role.add_to_policy(cdk_policy_statement)
107
+ print(f" - Added statement: {statement_dict.get('Sid', 'No Sid')}")
108
+ except Exception as e:
109
+ print(f"Warning: Could not process policy statement: {statement_dict}. Error: {e}")
110
+
111
+ def add_custom_policies(
112
+ scope: Construct, # Not strictly used here, but good practice if you expand to ManagedPolicies
113
+ role: iam.IRole,
114
+ policy_file_locations: Optional[List[str]] = None,
115
+ custom_policy_text: Optional[str] = None
116
+ ) -> iam.IRole:
117
+ """
118
+ Loads custom policies from JSON files or a string and attaches them to a CDK Role.
119
+
120
+ Args:
121
+ scope: The scope in which to define constructs (if needed, e.g., for iam.ManagedPolicy).
122
+ role: The CDK Role construct to attach policies to.
123
+ policy_file_locations: List of file paths to JSON policy documents.
124
+ custom_policy_text: A JSON string representing a policy document.
125
+
126
+ Returns:
127
+ The modified CDK Role construct.
128
+ """
129
+ if policy_file_locations is None:
130
+ policy_file_locations = []
131
+
132
+ current_source = "unknown source" # For error messages
133
+
134
+ try:
135
+ if policy_file_locations:
136
+ print(f"Attempting to add policies from files to role {role.node.id}...")
137
+ for path in policy_file_locations:
138
+ current_source = f"file: {path}"
139
+ try:
140
+ with open(path, 'r') as f:
141
+ policy_document = json.load(f)
142
+ print(f"Processing policy from {current_source}...")
143
+ add_statement_to_policy(role, policy_document)
144
+ except FileNotFoundError:
145
+ print(f"Warning: Policy file not found at {path}. Skipping.")
146
+ except json.JSONDecodeError as e:
147
+ print(f"Warning: Invalid JSON in policy file {path}: {e}. Skipping.")
148
+ except Exception as e:
149
+ print(f"An unexpected error occurred processing policy from {path}: {e}. Skipping.")
150
+
151
+ if custom_policy_text:
152
+ current_source = "custom policy text string"
153
+ print(f"Attempting to add policy from custom text to role {role.node.id}...")
154
+ try:
155
+ # *** FIX: Parse the JSON string into a Python dictionary ***
156
+ policy_document = json.loads(custom_policy_text)
157
+ print(f"Processing policy from {current_source}...")
158
+ add_statement_to_policy(role, policy_document)
159
+ except json.JSONDecodeError as e:
160
+ print(f"Warning: Invalid JSON in custom_policy_text: {e}. Skipping.")
161
+ except Exception as e:
162
+ print(f"An unexpected error occurred processing policy from custom_policy_text: {e}. Skipping.")
163
+
164
+ # You might want a final success message, but individual processing messages are also good.
165
+ print(f"Finished processing custom policies for role {role.node.id}.")
166
+
167
+ except Exception as e:
168
+ print(f"An unhandled error occurred during policy addition for {current_source}: {e}")
169
+
170
+ return role
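+ # Illustrative call (the role variable is a placeholder; the policy path matches the
+ # example given for POLICY_FILE_LOCATIONS in cdk_config.py):
+ # task_role = add_custom_policies(self, task_role, policy_file_locations=["config/sts_permissions.json"])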
171
+
172
+ # Import the S3 Bucket class if you intend to return a CDK object later
173
+ # from aws_cdk import aws_s3 as s3
174
+
175
+ def check_s3_bucket_exists(bucket_name: str): # Return type hint depends on what you return
176
+ """
177
+ Checks if an S3 bucket with the given name exists and is accessible.
178
+
179
+ Args:
180
+ bucket_name: The name of the S3 bucket to check.
181
+
182
+ Returns:
183
+ A tuple: (bool indicating existence, optional S3 Bucket object or None)
184
+ Note: Returning a Boto3 S3 Bucket object from here is NOT ideal
185
+ for direct use in CDK. You'll likely only need the boolean result
186
+ or the bucket name for CDK lookups/creations.
187
+ For this example, let's return the boolean and the name.
188
+ """
189
+ s3_client = boto3.client('s3')
190
+ try:
191
+ # Use head_bucket to check for existence and access
192
+ s3_client.head_bucket(Bucket=bucket_name)
193
+ print(f"Bucket '{bucket_name}' exists and is accessible.")
194
+ return True, bucket_name # Return True and the bucket name
195
+
196
+ except ClientError as e:
197
+ # If a ClientError occurs, check the error code.
198
+ # '404' means the bucket does not exist.
199
+ # '403' means the bucket exists but you don't have permission.
200
+ error_code = e.response['Error']['Code']
201
+ if error_code == '404':
202
+ print(f"Bucket '{bucket_name}' does not exist.")
203
+ return False, None
204
+ elif error_code == '403':
205
+ # The bucket exists, but you can't access it.
206
+ # Depending on your requirements, this might be treated as "exists"
207
+ # or "not accessible for our purpose". For checking existence,
208
+ # we'll say it exists here, but note the permission issue.
209
+ # NOTE - when I tested this, it was returning 403 even for buckets that don't exist. So I will return False instead
210
+ print(f"Bucket '{bucket_name}' returned 403, which indicates it may exist but is not accessible due to permissions, or that it doesn't exist. Returning False for existence just in case.")
211
+ return False, bucket_name # May exist but is not accessible with current permissions; treated as not available
212
+ else:
213
+ # For other errors, it's better to raise the exception
214
+ # to indicate something unexpected happened.
215
+ print(f"An unexpected AWS ClientError occurred checking bucket '{bucket_name}': {e}")
216
+ # Decide how to handle other errors - raising might be safer
217
+ raise # Re-raise the original exception
218
+ except Exception as e:
219
+ print(f"An unexpected non-ClientError occurred checking bucket '{bucket_name}': {e}")
220
+ # Decide how to handle other errors
221
+ raise # Re-raise the original exception
222
+
223
+ # Example usage in your check_resources.py:
224
+ # exists, bucket_name_if_exists = check_s3_bucket_exists(log_bucket_name)
225
+ # context_data[f"exists:{log_bucket_name}"] = exists
226
+ # # You don't necessarily need to store the name in context if using from_bucket_name
227
+
228
+ # Delete an S3 bucket
229
+ def delete_s3_bucket(bucket_name:str):
230
+ s3 = boto3.client('s3')
231
+
232
+ try:
233
+ # List and delete all objects
234
+ response = s3.list_object_versions(Bucket=bucket_name)
235
+ versions = response.get('Versions', []) + response.get('DeleteMarkers', [])
236
+ for version in versions:
237
+ s3.delete_object(Bucket=bucket_name, Key=version['Key'], VersionId=version['VersionId'])
238
+
239
+ # Delete the bucket
240
+ s3.delete_bucket(Bucket=bucket_name)
241
+ return {'Status': 'SUCCESS'}
242
+ except Exception as e:
243
+ return {'Status': 'FAILED', 'Reason': str(e)}
244
+
245
+ # Function to get subnet ID from subnet name
246
+ def get_subnet_id(vpc, ec2_client, subnet_name: str):
247
+ response = ec2_client.describe_subnets(Filters=[{'Name': 'vpc-id', 'Values': [vpc.vpc_id]}])
248
+
249
+ for subnet in response['Subnets']:
250
+ if subnet.get('Tags') and any(tag['Key'] == 'Name' and tag['Value'] == subnet_name for tag in subnet['Tags']):
251
+ return subnet['SubnetId']
252
+
253
+ return None
254
+
255
+ def check_ecr_repo_exists(repo_name: str) -> tuple[bool, dict]:
256
+ """
257
+ Checks if an ECR repository with the given name exists.
258
+
259
+ Args:
260
+ repo_name: The name of the ECR repository to check.
261
+
262
+ Returns:
263
+ True if the repository exists, False otherwise.
264
+ """
265
+ ecr_client = boto3.client('ecr')
266
+ try:
267
+ print("ecr repo_name to check:", repo_name)
268
+ response = ecr_client.describe_repositories(repositoryNames=[repo_name])
269
+ # If describe_repositories succeeds and returns a list of repositories,
270
+ # and the list is not empty, the repository exists.
271
+ return len(response['repositories']) > 0, response['repositories'][0]
272
+ except ClientError as e:
273
+ # Check for the specific error code indicating the repository doesn't exist
274
+ if e.response['Error']['Code'] == 'RepositoryNotFoundException':
275
+ return False, {}
276
+ else:
277
+ # Re-raise other exceptions to handle unexpected errors
278
+ raise
279
+ except Exception as e:
280
+ print(f"An unexpected error occurred: {e}")
281
+ return False, {}
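+ # Illustrative usage (repository name is a placeholder matching the default ECR_REPO_NAME):
+ # exists, repo_details = check_ecr_repo_exists("doc-redaction")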
282
+
283
+ def check_codebuild_project_exists(project_name: str): # Adjust return type hint as needed
284
+ """
285
+ Checks if a CodeBuild project with the given name exists.
286
+
287
+ Args:
288
+ project_name: The name of the CodeBuild project to check.
289
+
290
+ Returns:
291
+ A tuple:
292
+ - The first element is True if the project exists, False otherwise.
293
+ - The second element is the project object (dictionary) if found,
294
+ None otherwise.
295
+ """
296
+ codebuild_client = boto3.client('codebuild')
297
+ try:
298
+ # Use batch_get_projects with a list containing the single project name
299
+ response = codebuild_client.batch_get_projects(names=[project_name])
300
+
301
+ # The response for batch_get_projects includes 'projects' (found)
302
+ # and 'projectsNotFound' (not found).
303
+ if response['projects']:
304
+ # If the project is found in the 'projects' list
305
+ print(f"CodeBuild project '{project_name}' found.")
306
+ return True, response['projects'][0]['arn'] # Return True and the project details dict
307
+ elif response['projectsNotFound'] and project_name in response['projectsNotFound']:
308
+ # If the project name is explicitly in the 'projectsNotFound' list
309
+ print(f"CodeBuild project '{project_name}' not found.")
310
+ return False, None
311
+ else:
312
+ # This case is less expected for a single name lookup,
313
+ # but could happen if there's an internal issue or the response
314
+ # structure is slightly different than expected for an error.
315
+ # It's safer to assume it wasn't found if not in 'projects'.
316
+ print(f"CodeBuild project '{project_name}' not found (not in 'projects' list).")
317
+ return False, None
318
+
319
+ except ClientError as e:
320
+ # Catch specific ClientErrors. batch_get_projects might not throw
321
+ # 'InvalidInputException' for a non-existent project name if the
322
+ # name format is valid. It typically just lists it in projectsNotFound.
323
+ # However, other ClientErrors are possible (e.g., permissions).
324
+ print(f"An AWS ClientError occurred checking CodeBuild project '{project_name}': {e}")
325
+ # Decide how to handle other ClientErrors - raising might be safer
326
+ raise # Re-raise the original exception
327
+ except Exception as e:
328
+ print(f"An unexpected non-ClientError occurred checking CodeBuild project '{project_name}': {e}")
329
+ # Decide how to handle other errors
330
+ raise # Re-raise the original exception
331
+
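+ # Example usage (illustrative; the project name is a placeholder):
+ # project_exists, project_arn = check_codebuild_project_exists("doc-redaction-codebuild")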
332
+ def get_vpc_id_by_name(vpc_name: str) -> Optional[str]:
333
+ """
334
+ Finds a VPC by its 'Name' tag and returns (VPC ID, list of NAT Gateways in that VPC), or None if the VPC is not found.
335
+ """
336
+ ec2_client = boto3.client('ec2')
337
+ try:
338
+ response = ec2_client.describe_vpcs(
339
+ Filters=[
340
+ {'Name': 'tag:Name', 'Values': [vpc_name]}
341
+ ]
342
+ )
343
+ if response and response['Vpcs']:
344
+ vpc_id = response['Vpcs'][0]['VpcId']
345
+ print(f"VPC '{vpc_name}' found with ID: {vpc_id}")
346
+
347
+ # After finding the VPC ID, also collect the NAT Gateways attached to it:
348
+
349
+ # Look for NAT Gateways in this VPC
350
+ ec2_client = boto3.client('ec2')
351
+ nat_gateways = []
352
+ try:
353
+ response = ec2_client.describe_nat_gateways(
354
+ Filters=[
355
+ {'Name': 'vpc-id', 'Values': [vpc_id]},
356
+ # Optional: Add a tag filter if you consistently tag your NATs
357
+ # {'Name': 'tag:Name', 'Values': [f"{prefix}-nat-gateway"]}
358
+ ]
359
+ )
360
+ nat_gateways = response.get('NatGateways', [])
361
+ except Exception as e:
362
+ print(f"Warning: Could not describe NAT Gateways in VPC '{vpc_id}': {e}")
363
+ # Decide how to handle this error - proceed or raise?
364
+
365
+ # Decide how to identify the specific NAT Gateway you want to check for.
366
+
367
+
368
+
369
+ return vpc_id, nat_gateways
370
+ else:
371
+ print(f"VPC '{vpc_name}' not found.")
372
+ return None
373
+ except Exception as e:
374
+ print(f"An unexpected error occurred finding VPC '{vpc_name}': {e}")
375
+ raise
376
+
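+ # Example usage (illustrative; the VPC name is a placeholder). The function returns a
+ # (vpc_id, nat_gateways) tuple when found, or None when the VPC does not exist:
+ # result = get_vpc_id_by_name("doc-redaction-vpc")
+ # if result:
+ #     vpc_id, nat_gateways = result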
377
+ # --- Helper to fetch all existing subnets in a VPC once ---
378
+ def _get_existing_subnets_in_vpc(vpc_id: str) -> Dict[str, Any]:
379
+ """
380
+ Fetches all subnets in a given VPC.
381
+ Returns a dictionary with 'by_name' (map of name to subnet data),
382
+ 'by_id' (map of id to subnet data), and 'cidr_networks' (list of ipaddress.IPv4Network).
383
+ """
384
+ ec2_client = boto3.client('ec2')
385
+ existing_subnets_data = {
386
+ "by_name": {}, # {subnet_name: {'id': 'subnet-id', 'cidr': 'x.x.x.x/x'}}
387
+ "by_id": {}, # {subnet_id: {'name': 'subnet-name', 'cidr': 'x.x.x.x/x'}}
388
+ "cidr_networks": [] # List of ipaddress.IPv4Network objects
389
+ }
390
+ try:
391
+ response = ec2_client.describe_subnets(Filters=[{'Name': 'vpc-id', 'Values': [vpc_id]}])
392
+ for s in response.get('Subnets', []):
393
+ subnet_id = s['SubnetId']
394
+ cidr_block = s.get('CidrBlock')
395
+ # Extract 'Name' tag, which is crucial for lookup by name
396
+ name_tag = next((tag['Value'] for tag in s.get('Tags', []) if tag['Key'] == 'Name'), None)
397
+
398
+ subnet_info = {'id': subnet_id, 'cidr': cidr_block, 'name': name_tag}
399
+
400
+ if name_tag:
401
+ existing_subnets_data["by_name"][name_tag] = subnet_info
402
+ existing_subnets_data["by_id"][subnet_id] = subnet_info
403
+
404
+ if cidr_block:
405
+ try:
406
+ existing_subnets_data["cidr_networks"].append(ipaddress.ip_network(cidr_block, strict=False))
407
+ except ValueError:
408
+ print(f"Warning: Existing subnet {subnet_id} has an invalid CIDR: {cidr_block}. Skipping for overlap check.")
409
+
410
+ print(f"Fetched {len(response.get('Subnets', []))} existing subnets from VPC '{vpc_id}'.")
411
+ except Exception as e:
412
+ print(f"Error describing existing subnets in VPC '{vpc_id}': {e}. Cannot perform full validation.")
413
+ raise # Re-raise if this essential step fails
414
+
415
+ return existing_subnets_data
416
+
417
+ # --- Modified validate_subnet_creation_parameters to take pre-fetched data ---
418
+ def validate_subnet_creation_parameters(
419
+ vpc_id: str,
420
+ proposed_subnets_data: List[Dict[str, str]], # e.g., [{'name': 'my-public-subnet', 'cidr': '10.0.0.0/24', 'az': 'us-east-1a'}]
421
+ existing_aws_subnets_data: Dict[str, Any] # Pre-fetched data from _get_existing_subnets_in_vpc
422
+ ) -> None:
423
+ """
424
+ Validates proposed subnet names and CIDR blocks against existing AWS subnets
425
+ in the specified VPC and against each other.
426
+ This function uses pre-fetched AWS subnet data.
427
+
428
+ Args:
429
+ vpc_id: The ID of the VPC (for logging/error messages).
430
+ proposed_subnets_data: A list of dictionaries, where each dict represents
431
+ a proposed subnet with 'name', 'cidr', and 'az'.
432
+ existing_aws_subnets_data: Dictionary containing existing AWS subnet data
433
+ (e.g., from _get_existing_subnets_in_vpc).
434
+
435
+ Raises:
436
+ ValueError: If any proposed subnet name or CIDR block
437
+ conflicts with existing AWS resources or other proposed resources.
438
+ """
439
+ if not proposed_subnets_data:
440
+ print("No proposed subnet data provided for validation. Skipping.")
441
+ return
442
+
443
+ print(f"--- Starting pre-synth validation for VPC '{vpc_id}' with proposed subnets ---")
444
+
445
+ print("Existing subnet data:", pd.DataFrame(existing_aws_subnets_data['by_name']))
446
+
447
+ existing_aws_subnet_names = set(existing_aws_subnets_data["by_name"].keys())
448
+ existing_aws_cidr_networks = existing_aws_subnets_data["cidr_networks"]
449
+
450
+ # Sets to track names and list to track networks for internal batch consistency
451
+ proposed_names_seen: set[str] = set()
452
+ proposed_cidr_networks_seen: List[ipaddress.IPv4Network] = []
453
+
454
+ for i, proposed_subnet in enumerate(proposed_subnets_data):
455
+ subnet_name = proposed_subnet.get('name')
456
+ cidr_block_str = proposed_subnet.get('cidr')
457
+ availability_zone = proposed_subnet.get('az')
458
+
459
+ if not all([subnet_name, cidr_block_str, availability_zone]):
460
+ raise ValueError(f"Proposed subnet at index {i} is incomplete. Requires 'name', 'cidr', and 'az'.")
461
+
462
+ # 1. Check for duplicate names within the proposed batch
463
+ if subnet_name in proposed_names_seen:
464
+ raise ValueError(f"Proposed subnet name '{subnet_name}' is duplicated within the input list.")
465
+ proposed_names_seen.add(subnet_name)
466
+
467
+ # 2. Check for duplicate names against existing AWS subnets
468
+ if subnet_name in existing_aws_subnet_names:
469
+ print(f"Proposed subnet name '{subnet_name}' already exists in VPC '{vpc_id}'.")
470
+
471
+ # Parse proposed CIDR
472
+ try:
473
+ proposed_net = ipaddress.ip_network(cidr_block_str, strict=False)
474
+ except ValueError as e:
475
+ raise ValueError(f"Invalid CIDR format '{cidr_block_str}' for proposed subnet '{subnet_name}': {e}")
476
+
477
+ # 3. Check for overlapping CIDRs within the proposed batch
478
+ for existing_proposed_net in proposed_cidr_networks_seen:
479
+ if proposed_net.overlaps(existing_proposed_net):
480
+ raise ValueError(
481
+ f"Proposed CIDR '{cidr_block_str}' for subnet '{subnet_name}' "
482
+ f"overlaps with another proposed CIDR '{str(existing_proposed_net)}' "
483
+ f"within the same batch."
484
+ )
485
+
486
+ # 4. Check for overlapping CIDRs against existing AWS subnets
487
+ for existing_aws_net in existing_aws_cidr_networks:
488
+ if proposed_net.overlaps(existing_aws_net):
489
+ raise ValueError(
490
+ f"Proposed CIDR '{cidr_block_str}' for subnet '{subnet_name}' "
491
+ f"overlaps with an existing AWS subnet CIDR '{str(existing_aws_net)}' "
492
+ f"in VPC '{vpc_id}'."
493
+ )
494
+
495
+ # If all checks pass for this subnet, add its network to the list for subsequent checks
496
+ proposed_cidr_networks_seen.append(proposed_net)
497
+ print(f"Validation successful for proposed subnet '{subnet_name}' with CIDR '{cidr_block_str}'.")
498
+
499
+ print(f"--- All proposed subnets passed pre-synth validation checks for VPC '{vpc_id}'. ---")
500
+
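+ # Example usage combining the two helpers above (illustrative; the VPC ID, subnet name,
+ # CIDR and AZ are placeholders). Raises ValueError if a proposed subnet clashes with
+ # existing subnets or with another proposed subnet:
+ # existing = _get_existing_subnets_in_vpc("vpc-0123456789abcdef0")
+ # validate_subnet_creation_parameters(
+ #     "vpc-0123456789abcdef0",
+ #     [{'name': 'app-private-subnet-1', 'cidr': '10.0.10.0/24', 'az': 'eu-west-2a'}],
+ #     existing)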
501
+ # --- Modified check_subnet_exists_by_name (Uses pre-fetched data) ---
502
+ def check_subnet_exists_by_name(
503
+ subnet_name: str,
504
+ existing_aws_subnets_data: Dict[str, Any]
505
+ ) -> Tuple[bool, Optional[str]]:
506
+ """
507
+ Checks if a subnet with the given name exists within the pre-fetched data.
508
+
509
+ Args:
510
+ subnet_name: The 'Name' tag value of the subnet to check.
511
+ existing_aws_subnets_data: Dictionary containing existing AWS subnet data
512
+ (e.g., from _get_existing_subnets_in_vpc).
513
+
514
+ Returns:
515
+ A tuple:
516
+ - The first element is True if the subnet exists, False otherwise.
517
+ - The second element is the Subnet ID if found, None otherwise.
518
+ """
519
+ subnet_info = existing_aws_subnets_data["by_name"].get(subnet_name)
520
+ if subnet_info:
521
+ print(f"Subnet '{subnet_name}' found with ID: {subnet_info['id']}")
522
+ return True, subnet_info['id']
523
+ else:
524
+ print(f"Subnet '{subnet_name}' not found.")
525
+ return False, None
526
+
527
+ def create_nat_gateway(
528
+ scope: Construct,
529
+ public_subnet_for_nat: ec2.ISubnet, # Expects a proper ISubnet
530
+ nat_gateway_name: str,
531
+ nat_gateway_id_context_key: str
532
+ ) -> str:
533
+ """
534
+ Creates a single NAT Gateway in the specified public subnet.
535
+ It does not handle lookup from context; the calling stack should do that.
536
+ Returns the CloudFormation Ref of the NAT Gateway ID.
537
+ """
538
+ print(f"Defining a new NAT Gateway '{nat_gateway_name}' in subnet '{public_subnet_for_nat.subnet_id}'.")
539
+
540
+ # Create an Elastic IP for the NAT Gateway
541
+ eip = ec2.CfnEIP(scope, NAT_GATEWAY_EIP_NAME,
542
+ tags=[CfnTag(key="Name", value=NAT_GATEWAY_EIP_NAME)]
543
+ )
544
+
545
+ # Create the NAT Gateway
546
+ nat_gateway_logical_id = nat_gateway_name.replace('-', '') + "NatGateway"
547
+ nat_gateway = ec2.CfnNatGateway(scope, nat_gateway_logical_id,
548
+ subnet_id=public_subnet_for_nat.subnet_id, # Associate with the public subnet
549
+ allocation_id=eip.attr_allocation_id, # Associate with the EIP
550
+ tags=[CfnTag(key="Name", value=nat_gateway_name)]
551
+ )
552
+ # The NAT GW depends on the EIP. The dependency on the subnet is implicit via subnet_id.
553
+ nat_gateway.add_dependency(eip)
554
+
555
+ # *** CRUCIAL: Use CfnOutput to export the ID after deployment ***
556
+ # This is how you will get the ID to put into cdk.context.json
557
+ CfnOutput(scope, "SingleNatGatewayIdOutput",
558
+ value=nat_gateway.ref,
559
+ description=f"Physical ID of the Single NAT Gateway. Add this to cdk.context.json under the key '{nat_gateway_id_context_key}'.",
560
+ export_name=f"{scope.stack_name}-NatGatewayId" # Make export name unique
561
+ )
562
+
563
+ print(f"CDK: Defined new NAT Gateway '{nat_gateway.ref}'. Its physical ID will be available in the stack outputs after deployment.")
564
+ # Return the tokenised reference for use within this synthesis
565
+ return nat_gateway.ref
566
+
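+ # Example usage inside a stack (illustrative; the name and context key are placeholders).
+ # The returned value is a CloudFormation token, not the physical NAT Gateway ID:
+ # nat_gw_ref = create_nat_gateway(self, self.public_subnets[0],
+ #                                 nat_gateway_name="app-nat-gateway",
+ #                                 nat_gateway_id_context_key="single_nat_gateway_id")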
567
+ def create_subnets(
568
+ scope: Construct,
569
+ vpc: ec2.IVpc,
570
+ prefix: str,
571
+ subnet_names: List[str],
572
+ cidr_blocks: List[str],
573
+ availability_zones: List[str],
574
+ is_public: bool,
575
+ internet_gateway_id: Optional[str] = None,
576
+ single_nat_gateway_id: Optional[str] = None
577
+ ) -> Tuple[List[ec2.CfnSubnet], List[ec2.CfnRouteTable]]:
578
+ """
579
+ Creates subnets using L2 constructs but returns the underlying L1 Cfn objects
580
+ for backward compatibility.
581
+ """
582
+ # --- Validations remain the same ---
583
+ if not (len(subnet_names) == len(cidr_blocks) == len(availability_zones) > 0):
584
+ raise ValueError("Subnet names, CIDR blocks, and Availability Zones lists must be non-empty and match in length.")
585
+ if is_public and not internet_gateway_id:
586
+ raise ValueError("internet_gateway_id must be provided for public subnets.")
587
+ if not is_public and not single_nat_gateway_id:
588
+ raise ValueError("single_nat_gateway_id must be provided for private subnets when using a single NAT Gateway.")
589
+
590
+ # --- We will populate these lists with the L1 objects to return ---
591
+ created_subnets: List[ec2.CfnSubnet] = []
592
+ created_route_tables: List[ec2.CfnRouteTable] = []
593
+
594
+ subnet_type_tag = "public" if is_public else "private"
595
+
596
+ for i, subnet_name in enumerate(subnet_names):
597
+ logical_id = f"{prefix}{subnet_type_tag.capitalize()}Subnet{i+1}"
598
+
599
+ # 1. Create the L2 Subnet (this is the easy part)
600
+ subnet = ec2.Subnet(
601
+ scope,
602
+ logical_id,
603
+ vpc_id=vpc.vpc_id,
604
+ cidr_block=cidr_blocks[i],
605
+ availability_zone=availability_zones[i],
606
+ map_public_ip_on_launch=is_public
607
+ )
608
+ Tags.of(subnet).add("Name", subnet_name)
609
+ Tags.of(subnet).add("Type", subnet_type_tag)
610
+
611
+ if is_public:
612
+ # The subnet's route_table is automatically created by the L2 Subnet construct
613
+ try:
614
+ subnet.add_route(
615
+ "DefaultInternetRoute", # A logical ID for the CfnRoute resource
616
+ router_id=internet_gateway_id,
617
+ router_type=ec2.RouterType.GATEWAY,
618
+ # destination_cidr_block="0.0.0.0/0" is the default for this method
619
+ )
620
+ except Exception as e:
621
+ print("Could not create IGW route for public subnet due to:", e)
622
+ print(f"CDK: Defined public L2 subnet '{subnet_name}' and added IGW route.")
623
+ else:
624
+ try:
625
+ # Using .add_route() for private subnets as well for consistency
626
+ subnet.add_route(
627
+ "DefaultNatRoute", # A logical ID for the CfnRoute resource
628
+ router_id=single_nat_gateway_id,
629
+ router_type=ec2.RouterType.NAT_GATEWAY,
630
+ )
631
+ except Exception as e:
632
+ print("Could not create NAT gateway route for public subnet due to:", e)
633
+ print(f"CDK: Defined private L2 subnet '{subnet_name}' and added NAT GW route.")
634
+
635
+ route_table = subnet.route_table
636
+
637
+ created_subnets.append(subnet)
638
+ created_route_tables.append(route_table)
639
+
640
+ return created_subnets, created_route_tables
641
+
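+ # Example usage (illustrative; names, CIDRs, AZs and the IGW ID are placeholders):
+ # subnets, route_tables = create_subnets(
+ #     self, vpc, CDK_PREFIX, ["app-public-subnet-1", "app-public-subnet-2"],
+ #     ["10.0.1.0/24", "10.0.2.0/24"], ["eu-west-2a", "eu-west-2b"],
+ #     is_public=True, internet_gateway_id="igw-0123456789abcdef0")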
642
+ def ingress_rule_exists(security_group, peer, port=None):
643
+ for rule in security_group.connections.security_groups:
644
+ if port:
645
+ if rule.peer == peer and rule.connection == port:
646
+ return True
647
+ else:
648
+ if rule.peer == peer:
649
+ return True
650
+ return False
651
+
652
+ def check_for_existing_user_pool(user_pool_name:str):
653
+ cognito_client = boto3.client("cognito-idp")
654
+ list_pools_response = cognito_client.list_user_pools(MaxResults=60) # MaxResults up to 60
655
+
656
+ # ListUserPools might require pagination if you have more than 60 pools
657
+ # This simple example doesn't handle pagination, which could miss your pool
658
+
659
+ existing_user_pool_id = ""
660
+
661
+ for pool in list_pools_response.get('UserPools', []):
662
+ if pool.get('Name') == user_pool_name:
663
+ existing_user_pool_id = pool['Id']
664
+ print(f"Found existing user pool by name '{user_pool_name}' with ID: {existing_user_pool_id}")
665
+ break # Found the one we're looking for
666
+
667
+ if existing_user_pool_id:
668
+ return True, existing_user_pool_id, pool
669
+ else:
670
+ return False, "", ""
671
+
672
+ def check_for_existing_user_pool_client(user_pool_id: str, user_pool_client_name: str):
673
+ """
674
+ Checks if a Cognito User Pool Client with the given name exists in the specified User Pool.
675
+
676
+ Args:
677
+ user_pool_id: The ID of the Cognito User Pool.
678
+ user_pool_client_name: The name of the User Pool Client to check for.
679
+
680
+ Returns:
681
+ A tuple:
682
+ - True, client_id, client_details if the client exists.
683
+ - False, "", {} otherwise.
684
+ """
685
+ cognito_client = boto3.client("cognito-idp")
686
+ next_token = None
687
+
688
+
689
+ while True:
690
+ try:
691
+ kwargs = {"UserPoolId": user_pool_id, "MaxResults": 60}
692
+ if next_token:
693
+ kwargs["NextToken"] = next_token
694
+ response = cognito_client.list_user_pool_clients(**kwargs)
695
696
+ except cognito_client.exceptions.ResourceNotFoundException:
697
+ print(f"Error: User pool with ID '{user_pool_id}' not found.")
698
+ return False, "", {}
699
+
700
+ except cognito_client.exceptions.InvalidParameterException:
701
+ print(f"Error: No app clients for '{user_pool_id}' found.")
702
+ return False, "", {}
703
+
704
+ except Exception as e:
705
+ print("Could not check User Pool clients due to:", e)
706
+
707
+ for client in response.get('UserPoolClients', []):
708
+ if client.get('ClientName') == user_pool_client_name:
709
+ print(f"Found existing user pool client '{user_pool_client_name}' with ID: {client['ClientId']}")
710
+ return True, client['ClientId'], client
711
+
712
+ next_token = response.get('NextToken')
713
+ if not next_token:
714
+ break
715
+
716
+ return False, "", {}
717
+
718
+ def check_for_secret(secret_name: str, secret_value: dict = None):
719
+ """
720
+ Checks if a Secrets Manager secret with the given name exists.
721
+ Returns the secret value if it does, without creating anything.
722
+
723
+ Args:
724
+ secret_name: The name of the Secrets Manager secret.
725
+ secret_value: Reserved for future use; the current implementation only checks for existence.
726
+
727
+ Returns:
728
+ A tuple: (True, get_secret_value response) if the secret exists, (False, {}) otherwise.
729
+ """
730
+ secretsmanager_client = boto3.client("secretsmanager")
731
+
732
+ try:
733
+ # Try to get the secret. If it doesn't exist, a ResourceNotFoundException will be raised.
734
+ secret_value = secretsmanager_client.get_secret_value(SecretId=secret_name)
735
+ print(f"Secret '{secret_name}' already exists.")
736
+ return True, secret_value
737
+ except secretsmanager_client.exceptions.ResourceNotFoundException:
738
+ print("Secret not found")
739
+ return False, {}
740
+ except Exception as e:
741
+ # Handle other potential exceptions during the get operation
742
+ print(f"Error checking for secret '{secret_name}': {e}")
743
+ return False, {}
744
+
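+ # Example usage (illustrative; the secret name is a placeholder):
+ # secret_exists, secret_response = check_for_secret("cognito-user-pool-client-secret")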
745
+ def check_alb_exists(load_balancer_name: str, region_name: str = None) -> tuple[bool, dict]:
746
+ """
747
+ Checks if an Application Load Balancer (ALB) with the given name exists.
748
+
749
+ Args:
750
+ load_balancer_name: The name of the ALB to check.
751
+ region_name: The AWS region to check in. If None, uses the default
752
+ session region.
753
+
754
+ Returns:
755
+ A tuple:
756
+ - The first element is True if the ALB exists, False otherwise.
757
+ - The second element is the ALB object (dictionary) if found,
758
+ None otherwise. Specifically, it returns the first element of
759
+ the LoadBalancers list from the describe_load_balancers response.
760
+ """
761
+ if region_name:
762
+ elbv2_client = boto3.client('elbv2', region_name=region_name)
763
+ else:
764
+ elbv2_client = boto3.client('elbv2')
765
+ try:
766
+ response = elbv2_client.describe_load_balancers(Names=[load_balancer_name])
767
+ if response['LoadBalancers']:
768
+ return True, response['LoadBalancers'][0] # Return True and the first ALB object
769
+ else:
770
+ return False, {}
771
+ except ClientError as e:
772
+ # If the error indicates the ALB doesn't exist, return False
773
+ if e.response['Error']['Code'] == 'LoadBalancerNotFound':
774
+ return False, {}
775
+ else:
776
+ # Re-raise other exceptions
777
+ raise
778
+ except Exception as e:
779
+ print(f"An unexpected error occurred: {e}")
780
+ return False, {}
781
+
782
+ def check_fargate_task_definition_exists(task_definition_name: str, region_name: str = None) -> tuple[bool, dict]:
783
+ """
784
+ Checks if a Fargate task definition with the given name exists.
785
+
786
+ Args:
787
+ task_definition_name: The name or ARN of the task definition to check.
788
+ region_name: The AWS region to check in. If None, uses the default
789
+ session region.
790
+
791
+ Returns:
792
+ A tuple:
793
+ - The first element is True if the task definition exists, False otherwise.
794
+ - The second element is the task definition object (dictionary) if found,
795
+ None otherwise. Specifically, it returns the 'taskDefinition'
796
+ object from the describe_task_definition response.
797
+ """
798
+ if region_name:
799
+ ecs_client = boto3.client('ecs', region_name=region_name)
800
+ else:
801
+ ecs_client = boto3.client('ecs')
802
+ try:
803
+ response = ecs_client.describe_task_definition(taskDefinition=task_definition_name)
804
+ # If describe_task_definition succeeds, it returns the task definition.
805
+ # We can directly return True and the task definition.
806
+ return True, response['taskDefinition']
807
+ except ClientError as e:
808
+ # Check for the error code indicating the task definition doesn't exist.
809
+ if e.response['Error']['Code'] == 'ClientException' and 'task definition' in e.response['Error'].get('Message', '').lower():
810
+ return False, {}
811
+ else:
812
+ # Re-raise other exceptions.
813
+ raise
814
+ except Exception as e:
815
+ print(f"An unexpected error occurred: {e}")
816
+ return False, {}
817
+
818
+ def check_ecs_service_exists(cluster_name: str, service_name: str, region_name: str = None) -> tuple[bool, dict]:
819
+ """
820
+ Checks if an ECS service with the given name exists in the specified cluster.
821
+
822
+ Args:
823
+ cluster_name: The name or ARN of the ECS cluster.
824
+ service_name: The name of the ECS service to check.
825
+ region_name: The AWS region to check in. If None, uses the default
826
+ session region.
827
+
828
+ Returns:
829
+ A tuple:
830
+ - The first element is True if the service exists, False otherwise.
831
+ - The second element is the service object (dictionary) if found,
832
+ None otherwise.
833
+ """
834
+ if region_name:
835
+ ecs_client = boto3.client('ecs', region_name=region_name)
836
+ else:
837
+ ecs_client = boto3.client('ecs')
838
+ try:
839
+ response = ecs_client.describe_services(cluster=cluster_name, services=[service_name])
840
+ if response['services']:
841
+ return True, response['services'][0] # Return True and the first service object
842
+ else:
843
+ return False, {}
844
+ except ClientError as e:
845
+ # Check for the error code indicating the service doesn't exist.
846
+ if e.response['Error']['Code'] == 'ClusterNotFoundException':
847
+ return False, {}
848
+ elif e.response['Error']['Code'] == 'ServiceNotFoundException':
849
+ return False, {}
850
+ else:
851
+ # Re-raise other exceptions.
852
+ raise
853
+ except Exception as e:
854
+ print(f"An unexpected error occurred: {e}")
855
+ return False, {}
856
+
857
+ def check_cloudfront_distribution_exists(distribution_name: str, region_name: str = None) -> tuple[bool, dict | None]:
858
+ """
859
+ Checks if a CloudFront distribution with the given name exists.
860
+
861
+ Args:
862
+ distribution_name: The name of the CloudFront distribution to check.
863
+ region_name: The AWS region to check in. If None, uses the default
864
+ session region. Note: CloudFront is a global service,
865
+ so the region is usually 'us-east-1', but this parameter
866
+ is included for completeness.
867
+
868
+ Returns:
869
+ A tuple:
870
+ - The first element is True if the distribution exists, False otherwise.
871
+ - The second element is the distribution object (dictionary) if found,
872
+ None otherwise. Specifically, it returns the first element of the
873
+ DistributionList from the ListDistributions response.
874
+ """
875
+ if region_name:
876
+ cf_client = boto3.client('cloudfront', region_name=region_name)
877
+ else:
878
+ cf_client = boto3.client('cloudfront')
879
+ try:
880
+ response = cf_client.list_distributions()
881
+ if 'Items' in response['DistributionList']:
882
+ for distribution in response['DistributionList']['Items']:
883
+ # CloudFront doesn't directly filter by name, so we have to iterate.
884
+ if distribution.get('Aliases', {}).get('Items') and distribution_name in distribution['Aliases']['Items']:
885
+ return True, distribution
886
+ return False, None
887
+ else:
888
+ return False, None
889
+ except ClientError as e:
890
+ # If the error indicates the Distribution doesn't exist, return False
891
+ if e.response['Error']['Code'] == 'NoSuchDistribution':
892
+ return False, None
893
+ else:
894
+ # Re-raise other exceptions
895
+ raise
896
+ except Exception as e:
897
+ print(f"An unexpected error occurred: {e}")
898
+ return False, None
899
+
900
+ def create_web_acl_with_common_rules(scope:Construct, web_acl_name: str, waf_scope:str="CLOUDFRONT"):
901
+ '''
902
+ Use CDK to create a web ACL based on an AWS common rule set with overrides.
903
+ This function now expects a 'scope' argument, typically 'self' from your stack,
904
+ as CfnWebACL requires a construct scope.
905
+ '''
906
+
907
+ # Create full list of rules
908
+ rules = []
909
+ aws_ruleset_names = [
910
+ "AWSManagedRulesCommonRuleSet",
911
+ "AWSManagedRulesKnownBadInputsRuleSet",
912
+ "AWSManagedRulesAmazonIpReputationList"
913
+ ]
914
+
915
+ # Use a separate counter to assign unique priorities sequentially
916
+ priority_counter = 1
917
+
918
+ for aws_rule_name in aws_ruleset_names:
919
+ current_rule_action_overrides = None
920
+
921
+ # All managed rule groups need an override_action.
922
+ # 'none' means use the managed rule group's default action.
923
+ current_override_action = wafv2.CfnWebACL.OverrideActionProperty(none={})
924
+
925
+ current_priority = priority_counter
926
+ priority_counter += 1
927
+
928
+ if aws_rule_name == "AWSManagedRulesCommonRuleSet":
929
+ current_rule_action_overrides = [
930
+ wafv2.CfnWebACL.RuleActionOverrideProperty(
931
+ name="SizeRestrictions_BODY",
932
+ action_to_use=wafv2.CfnWebACL.RuleActionProperty(
933
+ allow={}
934
+ )
935
+ )
936
+ ]
937
+ # No need to set current_override_action here, it's already set above.
938
+ # If you wanted this specific rule to have a *fixed* priority, you'd handle it differently
939
+ # For now, it will get priority 1 from the counter.
940
+
941
+ rule_property = wafv2.CfnWebACL.RuleProperty(
942
+ name=aws_rule_name,
943
+ priority=current_priority,
944
+ statement=wafv2.CfnWebACL.StatementProperty(
945
+ managed_rule_group_statement=wafv2.CfnWebACL.ManagedRuleGroupStatementProperty(
946
+ vendor_name="AWS",
947
+ name=aws_rule_name,
948
+ rule_action_overrides=current_rule_action_overrides
949
+ )
950
+ ),
951
+ visibility_config=wafv2.CfnWebACL.VisibilityConfigProperty(
952
+ cloud_watch_metrics_enabled=True,
953
+ metric_name=aws_rule_name,
954
+ sampled_requests_enabled=True
955
+ ),
956
+ override_action=current_override_action # THIS IS THE CRUCIAL PART FOR ALL MANAGED RULES
957
+ )
958
+
959
+ rules.append(rule_property)
960
+
961
+ # Add the rate limit rule
962
+ rate_limit_priority = priority_counter # Use the next available priority
963
+ rules.append(wafv2.CfnWebACL.RuleProperty(
964
+ name="RateLimitRule",
965
+ priority=rate_limit_priority,
966
+ statement=wafv2.CfnWebACL.StatementProperty(
967
+ rate_based_statement=wafv2.CfnWebACL.RateBasedStatementProperty(
968
+ limit=1000,
969
+ aggregate_key_type="IP"
970
+ )
971
+ ),
972
+ visibility_config=wafv2.CfnWebACL.VisibilityConfigProperty(
973
+ cloud_watch_metrics_enabled=True,
974
+ metric_name="RateLimitRule",
975
+ sampled_requests_enabled=True
976
+ ),
977
+ action=wafv2.CfnWebACL.RuleActionProperty(
978
+ block={}
979
+ )
980
+ ))
981
+
982
+ web_acl = wafv2.CfnWebACL(
983
+ scope,
984
+ "WebACL",
985
+ name=web_acl_name,
986
+ default_action=wafv2.CfnWebACL.DefaultActionProperty(allow={}),
987
+ scope=waf_scope,
988
+ visibility_config=wafv2.CfnWebACL.VisibilityConfigProperty(
989
+ cloud_watch_metrics_enabled=True,
990
+ metric_name="webACL",
991
+ sampled_requests_enabled=True
992
+ ),
993
+ rules=rules
994
+ )
995
+
996
+ CfnOutput(scope, "WebACLArn", value=web_acl.attr_arn)
997
+
998
+ return web_acl
999
+
1000
+ def check_web_acl_exists(web_acl_name: str, scope: str, region_name: str = None) -> tuple[bool, dict]:
1001
+ """
1002
+ Checks if a Web ACL with the given name and scope exists.
1003
+
1004
+ Args:
1005
+ web_acl_name: The name of the Web ACL to check.
1006
+ scope: The scope of the Web ACL ('CLOUDFRONT' or 'REGIONAL').
1007
+ region_name: The AWS region to check in. Required for REGIONAL scope.
1008
+ If None, uses the default session region. For CLOUDFRONT,
1009
+ the region should be 'us-east-1'.
1010
+
1011
+ Returns:
1012
+ A tuple:
1013
+ - The first element is True if the Web ACL exists, False otherwise.
1014
+ - The second element is the Web ACL object (dictionary) if found,
1015
+ None otherwise.
1016
+ """
1017
+ if scope not in ['CLOUDFRONT', 'REGIONAL']:
1018
+ raise ValueError("Scope must be either 'CLOUDFRONT' or 'REGIONAL'")
1019
+
1020
+ if scope == 'REGIONAL' and not region_name:
1021
+ raise ValueError("Region name is required for REGIONAL scope")
1022
+
1023
+ if scope == 'CLOUDFRONT':
1024
+ region_name = 'us-east-1' # CloudFront scope requires us-east-1
1025
+
1026
+ if region_name:
1027
+ waf_client = boto3.client('wafv2', region_name=region_name)
1028
+ else:
1029
+ waf_client = boto3.client('wafv2')
1030
+ try:
1031
+ response = waf_client.list_web_acls(Scope=scope)
1032
+ if 'WebACLs' in response:
1033
+ for web_acl in response['WebACLs']:
1034
+ if web_acl['Name'] == web_acl_name:
1035
+ # Fetch the full Web ACL object (wafv2 exposes get_web_acl, which also requires the Id).
1036
+ get_response = waf_client.get_web_acl(Name=web_acl_name, Scope=scope, Id=web_acl['Id'])
1037
+ return True, get_response['WebACL']
1038
+ return False, {}
1039
+ else:
1040
+ return False, {}
1041
+ except ClientError as e:
1042
+ # Check for the error code indicating the web ACL doesn't exist.
1043
+ if e.response['Error']['Code'] == 'ResourceNotFoundException':
1044
+ return False, {}
1045
+ else:
1046
+ # Re-raise other exceptions.
1047
+ raise
1048
+ except Exception as e:
1049
+ print(f"An unexpected error occurred: {e}")
1050
+ return False, {}
1051
+
1052
+ def add_alb_https_listener_with_cert(
1053
+ scope: Construct,
1054
+ logical_id: str, # A unique ID for this listener construct
1055
+ alb: elb.ApplicationLoadBalancer,
1056
+ acm_certificate_arn: Optional[str], # Optional: If None, no HTTPS listener will be created
1057
+ default_target_group: elb.ITargetGroup, # Mandatory: The target group to forward traffic to
1058
+ listener_port_https: int = 443,
1059
+ listener_open_to_internet: bool = False, # Be cautious with True, ensure ALB security group restricts access
1060
+ # --- Cognito Authentication Parameters ---
1061
+ enable_cognito_auth: bool = False,
1062
+ cognito_user_pool: Optional[cognito.IUserPool] = None,
1063
+ cognito_user_pool_client: Optional[cognito.IUserPoolClient] = None,
1064
+ cognito_user_pool_domain: Optional[str] = None, # E.g., "my-app-domain" for "my-app-domain.auth.region.amazoncognito.com"
1065
+ cognito_auth_scope: Optional[str] = "openid profile email", # Default recommended scope
1066
+ cognito_auth_on_unauthenticated_request: elb.UnauthenticatedAction = elb.UnauthenticatedAction.AUTHENTICATE,
1067
+ stickiness_cookie_duration=None
1068
+ # --- End Cognito Parameters ---
1069
+ ) -> Optional[elb.ApplicationListener]:
1070
+ """
1071
+ Conditionally adds an HTTPS listener to an ALB with an ACM certificate,
1072
+ and optionally enables Cognito User Pool authentication.
1073
+
1074
+ Args:
1075
+ scope (Construct): The scope in which to define this construct (e.g., your CDK Stack).
1076
+ logical_id (str): A unique logical ID for the listener construct within the stack.
1077
+ alb (elb.ApplicationLoadBalancer): The Application Load Balancer to add the listener to.
1078
+ acm_certificate_arn (Optional[str]): The ARN of the ACM certificate to attach.
1079
+ If None, the HTTPS listener will NOT be created.
1080
+ default_target_group (elb.ITargetGroup): The default target group for the listener to forward traffic to.
1081
+ This is mandatory for a functional listener.
1082
+ listener_port_https (int): The HTTPS port to listen on (default: 443).
1083
+ listener_open_to_internet (bool): Whether the listener should allow connections from all sources.
1084
+ If False (recommended), ensure your ALB's security group allows
1085
+ inbound traffic on this port from desired sources.
1086
+ enable_cognito_auth (bool): Set to True to enable Cognito User Pool authentication.
1087
+ cognito_user_pool (Optional[cognito.IUserPool]): The Cognito User Pool object. Required if enable_cognito_auth is True.
1088
+ cognito_user_pool_client (Optional[cognito.IUserPoolClient]): The Cognito User Pool App Client object. Required if enable_cognito_auth is True.
1089
+ cognito_user_pool_domain (Optional[str]): The domain prefix for your Cognito User Pool. Required if enable_cognito_auth is True.
1090
+ cognito_auth_scope (Optional[str]): The scope for the Cognito authentication.
1091
+ cognito_auth_on_unauthenticated_request (elb.UnauthenticatedAction): Action for unauthenticated requests.
1092
+ Defaults to AUTHENTICATE (redirect to login).
1093
+
1094
+ Returns:
1095
+ Optional[elb.ApplicationListener]: The created ApplicationListener if successful,
1096
+ None if no ACM certificate ARN was provided.
1097
+ """
1098
+ https_listener = None
1099
+ if acm_certificate_arn:
1100
+ certificates_list = [elb.ListenerCertificate.from_arn(acm_certificate_arn)]
1101
+ print(f"Attempting to add ALB HTTPS listener on port {listener_port_https} with ACM certificate: {acm_certificate_arn}")
1102
+
1103
+ # Determine the default action based on whether Cognito auth is enabled
1104
+ default_action = None
1105
+ if enable_cognito_auth:
1106
+ if not all([cognito_user_pool, cognito_user_pool_client, cognito_user_pool_domain]):
1107
+ raise ValueError(
1108
+ "Cognito User Pool, Client, and Domain must be provided if enable_cognito_auth is True."
1109
+ )
1110
+ print(f"Enabling Cognito authentication with User Pool: {cognito_user_pool.user_pool_id}")
1111
+
1112
+ default_action = elb_act.AuthenticateCognitoAction(
1113
+ next=elb.ListenerAction.forward([default_target_group]), # After successful auth, forward to TG
1114
+ user_pool=cognito_user_pool,
1115
+ user_pool_client=cognito_user_pool_client,
1116
+ user_pool_domain=cognito_user_pool_domain,
1117
+ scope=cognito_auth_scope,
1118
+ on_unauthenticated_request=cognito_auth_on_unauthenticated_request,
1119
+ session_timeout=stickiness_cookie_duration
1120
+ # Additional options you might want to configure:
1121
+ # session_cookie_name="AWSELBCookies"
1122
+ )
1123
+ else:
1124
+ default_action = elb.ListenerAction.forward([default_target_group])
1125
+ print("Cognito authentication is NOT enabled for this listener.")
1126
+
1127
+ # Add the HTTPS listener
1128
+ https_listener = alb.add_listener(
1129
+ logical_id,
1130
+ port=listener_port_https,
1131
+ open=listener_open_to_internet,
1132
+ certificates=certificates_list,
1133
+ default_action=default_action # Use the determined default action
1134
+ )
1135
+ print(f"ALB HTTPS listener on port {listener_port_https} defined.")
1136
+ else:
1137
+ print("ACM_CERTIFICATE_ARN is not provided. Skipping HTTPS listener creation.")
1138
+
1139
+ return https_listener
1140
+
1141
+
1142
+ def ensure_folder_exists(output_folder:str):
1143
+ """Checks if the specified folder exists, creates it if not."""
1144
+
1145
+ if not os.path.exists(output_folder):
1146
+ # Create the folder if it doesn't exist
1147
+ os.makedirs(output_folder, exist_ok=True)
1148
+ print(f"Created the {output_folder} folder.")
1149
+ else:
1150
+ print(f"The {output_folder} folder already exists.")
1151
+
1152
+ def create_basic_config_env(out_dir:str="config", S3_LOG_CONFIG_BUCKET_NAME=S3_LOG_CONFIG_BUCKET_NAME, S3_OUTPUT_BUCKET_NAME=S3_OUTPUT_BUCKET_NAME, ACCESS_LOG_DYNAMODB_TABLE_NAME=ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME=USAGE_LOG_DYNAMODB_TABLE_NAME):
1153
+ '''
1154
+ Create a basic config.env file for the user to use with their newly deployed redaction app.
1155
+ '''
1156
+ variables = {
1157
+ 'COGNITO_AUTH':'1',
1158
+ 'RUN_AWS_FUNCTIONS':'1',
1159
+ 'DISPLAY_FILE_NAMES_IN_LOGS':'False',
1160
+ 'SESSION_OUTPUT_FOLDER':'True',
1161
+ 'SAVE_LOGS_TO_DYNAMODB':'True',
1162
+ 'SHOW_COSTS':'True',
1163
+ 'SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS':'True',
1164
+ 'LOAD_PREVIOUS_TEXTRACT_JOBS_S3':'True',
1165
+ 'DOCUMENT_REDACTION_BUCKET':S3_LOG_CONFIG_BUCKET_NAME,
1166
+ 'TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET':S3_OUTPUT_BUCKET_NAME,
1167
+ 'ACCESS_LOG_DYNAMODB_TABLE_NAME':ACCESS_LOG_DYNAMODB_TABLE_NAME,
1168
+ 'FEEDBACK_LOG_DYNAMODB_TABLE_NAME':FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
1169
+ 'USAGE_LOG_DYNAMODB_TABLE_NAME':USAGE_LOG_DYNAMODB_TABLE_NAME
1170
1171
+ }
1172
+
1173
+ # Write variables to .env file
1174
+ ensure_folder_exists(out_dir + "/")
1175
+ env_file_path = os.path.abspath(os.path.join(out_dir, 'config.env'))
1176
+
1177
+ # It's good practice to ensure the file exists before calling set_key repeatedly.
1178
+ # set_key will create it, but for a loop, it might be cleaner to ensure it's empty/exists once.
1179
+ if not os.path.exists(env_file_path):
1180
+ with open(env_file_path, 'w') as f:
1181
+ pass # Create empty file
1182
+
1183
+ for key, value in variables.items():
1184
+ set_key(env_file_path, key, str(value), quote_mode="never")
1185
+
1186
+ return variables
1187
+
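+ # Example usage (illustrative; bucket and table names default to the values imported
+ # from cdk_config unless overridden):
+ # create_basic_config_env(out_dir="config")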
1188
+ def start_codebuild_build(PROJECT_NAME:str, AWS_REGION:str = AWS_REGION):
1189
+ '''
1190
+ Start an existing Codebuild project build
1191
+ '''
1192
+
1193
+ # --- Initialize CodeBuild client ---
1194
+ client = boto3.client('codebuild', region_name=AWS_REGION)
1195
+
1196
+ try:
1197
+ print(f"Attempting to start build for project: {PROJECT_NAME}")
1198
+
1199
+ response = client.start_build(
1200
+ projectName=PROJECT_NAME
1201
+ )
1202
+
1203
+ build_id = response['build']['id']
1204
+ print(f"Successfully started build with ID: {build_id}")
1205
+ print(f"Build ARN: {response['build']['arn']}")
1206
+ print(f"Build URL (approximate - construct based on region and ID):")
1207
+ print(f"https://{AWS_REGION}.console.aws.amazon.com/codesuite/codebuild/projects/{PROJECT_NAME}/build/{build_id.split(':')[-1]}/detail")
1208
+
1209
+ # You can inspect the full response if needed
1210
+ # print("\nFull response:")
1211
+ # import json
1212
+ # print(json.dumps(response, indent=2))
1213
+
1214
+ except client.exceptions.ResourceNotFoundException:
1215
+ print(f"Error: Project '{PROJECT_NAME}' not found in region '{AWS_REGION}'.")
1216
+ except Exception as e:
1217
+ print(f"An unexpected error occurred: {e}")
1218
+
1219
+ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str, RUN_AWS_FUNCTIONS:str = "1"):
1220
+ """
1221
+ Uploads a file from local machine to Amazon S3.
1222
+
1223
+ Args:
1224
+ - local_file_paths: Local file path(s) of the file(s) to upload.
1225
+ - s3_key: Key (path) to the file in the S3 bucket.
1226
+ - s3_bucket: Name of the S3 bucket.
1227
+
1228
+ Returns:
1229
+ - Message as variable/printed to console
1230
+ """
1231
+ final_out_message = []
1232
+ final_out_message_str = ""
1233
+
1234
+ if RUN_AWS_FUNCTIONS == "1":
1235
+ try:
1236
+ if s3_bucket and local_file_paths:
1237
+
1238
+ s3_client = boto3.client('s3', region_name=AWS_REGION)
1239
+
1240
+ if isinstance(local_file_paths, str):
1241
+ local_file_paths = [local_file_paths]
1242
+
1243
+ for file in local_file_paths:
1244
+ if s3_client:
1245
+ #print(s3_client)
1246
+ try:
1247
+ # Get file name off file path
1248
+ file_name = os.path.basename(file)
1249
+
1250
+ s3_key_full = s3_key + file_name
1251
+ print("S3 key: ", s3_key_full)
1252
+
1253
+ s3_client.upload_file(file, s3_bucket, s3_key_full)
1254
+ out_message = "File " + file_name + " uploaded successfully!"
1255
+ print(out_message)
1256
+
1257
+ except Exception as e:
1258
+ out_message = f"Error uploading file(s): {e}"
1259
+ print(out_message)
1260
+
1261
+ final_out_message.append(out_message)
1262
+ final_out_message_str = '\n'.join(final_out_message)
1263
+
1264
+ else: final_out_message_str = "Could not connect to AWS."
1265
+ else: final_out_message_str = "At least one essential variable is empty, could not upload to S3"
1266
+ except Exception as e:
1267
+ final_out_message_str = "Could not upload files to S3 due to: " + str(e)
1268
+ print(final_out_message_str)
1269
+ else:
1270
+ final_out_message_str = "App not set to run AWS functions"
1271
+
1272
+ return final_out_message_str
1273
+
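+ # Example usage (illustrative; the file path, key prefix and bucket are placeholders):
+ # message = upload_file_to_s3(["config/config.env"], "config/", "example-config-bucket")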
1274
+ # Initialize ECS client
1275
+ def start_ecs_task(cluster_name, service_name):
1276
+ ecs_client = boto3.client('ecs')
1277
+
1278
+ try:
1279
+ # Update the service to set the desired count to 1
1280
+ response = ecs_client.update_service(
1281
+ cluster=cluster_name,
1282
+ service=service_name,
1283
+ desiredCount=1
1284
+ )
1285
+ return {
1286
+ "statusCode": 200,
1287
+ "body": f"Service {service_name} in cluster {cluster_name} has been updated to 1 task."
1288
+ }
1289
+ except Exception as e:
1290
+ return {
1291
+ "statusCode": 500,
1292
+ "body": f"Error updating service: {str(e)}"
1293
+ }
cdk/cdk_stack.py ADDED
@@ -0,0 +1,1317 @@
1
+ import os
2
+ import json # You might still need json if loading task_definition.json
3
+ from typing import List, Dict, Any
4
+ from aws_cdk import (
5
+ Stack,
6
+ CfnTag, # <-- Import CfnTag directly
7
+ CfnOutput, # <-- Import CfnOutput directly
8
+ Duration,
9
+ RemovalPolicy,
10
+ SecretValue,
11
+ aws_ec2 as ec2,
12
+ aws_ecr as ecr,
13
+ aws_s3 as s3,
14
+ aws_ecs as ecs,
15
+ aws_iam as iam,
16
+ aws_codebuild as codebuild,
17
+ aws_cognito as cognito,
18
+ aws_secretsmanager as secretsmanager,
19
+ aws_cloudfront as cloudfront,
20
+ aws_cloudfront_origins as origins,
21
+ aws_elasticloadbalancingv2 as elbv2,
22
+ aws_logs as logs,
23
+ aws_wafv2 as wafv2,
24
+ aws_dynamodb as dynamodb # Import the DynamoDB module
25
+ )
26
+
27
+ from constructs import Construct
28
+ from cdk_config import CDK_PREFIX, VPC_NAME, AWS_MANAGED_TASK_ROLES_LIST, GITHUB_REPO_USERNAME, GITHUB_REPO_NAME, GITHUB_REPO_BRANCH, ECS_TASK_MEMORY_SIZE, ECS_TASK_CPU_SIZE, CUSTOM_HEADER, CUSTOM_HEADER_VALUE, AWS_REGION, CLOUDFRONT_GEO_RESTRICTION, DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS, GRADIO_SERVER_PORT, PUBLIC_SUBNETS_TO_USE, PUBLIC_SUBNET_CIDR_BLOCKS, PUBLIC_SUBNET_AVAILABILITY_ZONES, PRIVATE_SUBNETS_TO_USE, PRIVATE_SUBNET_CIDR_BLOCKS, PRIVATE_SUBNET_AVAILABILITY_ZONES, CODEBUILD_PROJECT_NAME, ECS_SECURITY_GROUP_NAME, ALB_NAME_SECURITY_GROUP_NAME, ALB_NAME, COGNITO_USER_POOL_NAME, COGNITO_USER_POOL_CLIENT_NAME, COGNITO_USER_POOL_CLIENT_SECRET_NAME, FARGATE_TASK_DEFINITION_NAME, ECS_SERVICE_NAME, WEB_ACL_NAME, CLOUDFRONT_DISTRIBUTION_NAME, ECS_TASK_ROLE_NAME, ALB_TARGET_GROUP_NAME, S3_LOG_CONFIG_BUCKET_NAME, S3_OUTPUT_BUCKET_NAME, ACM_CERTIFICATE_ARN, CLUSTER_NAME, CODEBUILD_ROLE_NAME, ECS_TASK_EXECUTION_ROLE_NAME, ECR_CDK_REPO_NAME, ECS_LOG_GROUP_NAME, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, TASK_DEFINITION_FILE_LOCATION, EXISTING_IGW_ID, SINGLE_NAT_GATEWAY_ID, NAT_GATEWAY_NAME, COGNITO_USER_POOL_DOMAIN_PREFIX, COGNITO_REDIRECTION_URL, AWS_ACCOUNT_ID, ECS_USE_FARGATE_SPOT, ECS_READ_ONLY_FILE_SYSTEM, USE_CLOUDFRONT, LOAD_BALANCER_WEB_ACL_NAME
29
+ from cdk_functions import create_subnets, create_web_acl_with_common_rules, add_custom_policies, add_alb_https_listener_with_cert, create_nat_gateway # Only keep CDK-native functions
30
+
31
+ def _get_env_list(env_var_name: str) -> List[str]:
32
+ """Parses a comma-separated environment variable into a list of strings."""
33
+ value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
34
+ if not value:
35
+ return []
36
+ # Split by comma and filter out any empty strings that might result from extra commas
37
+ return [s.strip() for s in value.split(',') if s.strip()]
38
+
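+ # Example (illustrative): a config value such as "['subnet-a', 'subnet-b']"
+ # is parsed to the list ['subnet-a', 'subnet-b'].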
39
+ # 1. Try to load CIDR/AZs from environment variables
40
+ if PUBLIC_SUBNETS_TO_USE: PUBLIC_SUBNETS_TO_USE = _get_env_list(PUBLIC_SUBNETS_TO_USE)
41
+ if PRIVATE_SUBNETS_TO_USE: PRIVATE_SUBNETS_TO_USE = _get_env_list(PRIVATE_SUBNETS_TO_USE)
42
+
43
+ if PUBLIC_SUBNET_CIDR_BLOCKS: PUBLIC_SUBNET_CIDR_BLOCKS = _get_env_list(PUBLIC_SUBNET_CIDR_BLOCKS)
44
+ if PUBLIC_SUBNET_AVAILABILITY_ZONES: PUBLIC_SUBNET_AVAILABILITY_ZONES = _get_env_list(PUBLIC_SUBNET_AVAILABILITY_ZONES)
45
+ if PRIVATE_SUBNET_CIDR_BLOCKS: PRIVATE_SUBNET_CIDR_BLOCKS = _get_env_list(PRIVATE_SUBNET_CIDR_BLOCKS)
46
+ if PRIVATE_SUBNET_AVAILABILITY_ZONES: PRIVATE_SUBNET_AVAILABILITY_ZONES = _get_env_list(PRIVATE_SUBNET_AVAILABILITY_ZONES)
47
+
48
+ if AWS_MANAGED_TASK_ROLES_LIST: AWS_MANAGED_TASK_ROLES_LIST = _get_env_list(AWS_MANAGED_TASK_ROLES_LIST)
49
+ class CdkStack(Stack):
50
+
51
+ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
52
+ super().__init__(scope, construct_id, **kwargs)
53
+
54
+ # --- Helper to get context values ---
55
+ def get_context_bool(key: str, default: bool = False) -> bool:
56
+ return self.node.try_get_context(key) or default
57
+
58
+ def get_context_str(key: str, default: str = None) -> str:
59
+ return self.node.try_get_context(key) or default
60
+
61
+ def get_context_dict(key: str, default: dict = None) -> dict:
62
+ return self.node.try_get_context(key) or default
63
+
64
+ def get_context_list_of_dicts(key: str) -> List[Dict[str, Any]]:
65
+ ctx_value = self.node.try_get_context(key)
66
+ if not isinstance(ctx_value, list):
67
+ print(f"Warning: Context key '{key}' not found or not a list. Returning empty list.")
68
+ return []
69
+ # Optional: Add validation that all items in the list are dicts
70
+ return ctx_value
71
+
72
+
73
+ # --- VPC and Subnets (Assuming VPC is always lookup, Subnets are created/returned by create_subnets) ---
74
+ # --- VPC Lookup (Always lookup as per your assumption) ---
75
+ try:
76
+ vpc = ec2.Vpc.from_lookup(
77
+ self,
78
+ "VPC",
79
+ vpc_name=VPC_NAME
80
+ )
81
+ print("Successfully looked up VPC:", vpc.vpc_id)
82
+ except Exception as e:
83
+ raise Exception(f"Could not look up VPC with name '{VPC_NAME}' due to: {e}")
84
+
85
+ # --- Subnet Handling (Check Context and Create/Import) ---
86
+ # Initialize lists to hold ISubnet objects (L2) and CfnSubnet/CfnRouteTable (L1)
87
+ # We will store ISubnet for consistency, as CfnSubnet has a .subnet_id property
88
+ self.public_subnets: List[ec2.ISubnet] = []
89
+ self.private_subnets: List[ec2.ISubnet] = []
90
+ # Store L1 CfnRouteTables explicitly if you need to reference them later
91
+ self.private_route_tables_cfn: List[ec2.CfnRouteTable] = []
92
+ self.public_route_tables_cfn: List[ec2.CfnRouteTable] = [] # New: to store public RTs
93
+
94
+ names_to_create_private = []
95
+ names_to_create_public = []
96
+
97
+ if not PUBLIC_SUBNETS_TO_USE and not PRIVATE_SUBNETS_TO_USE:
98
+ print("Warning: No public or private subnets specified in *_SUBNETS_TO_USE. Attempting to select from existing VPC subnets.")
99
+
100
+ print("vpc.public_subnets:", vpc.public_subnets)
101
+ print("vpc.private_subnets:", vpc.private_subnets)
102
+
103
+ # public_subnets_by_az: Dict[str, List[ec2.ISubnet]] = {}
104
+ # private_subnets_by_az: Dict[str, List[ec2.ISubnet]] = {}
105
+
106
+ # Iterate through the subnets exposed by the Vpc L2 construct.
107
+ # for subnet in vpc.public_subnets:
108
+ # az = subnet.availability_zone
109
+ # if az not in public_subnets_by_az:
110
+ # public_subnets_by_az[az] = []
111
+ # public_subnets_by_az[az].append(subnet)
112
+
113
+ selected_public_subnets = vpc.select_subnets(subnet_type=ec2.SubnetType.PUBLIC, one_per_az=True)
114
+ private_subnets_egress = vpc.select_subnets(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS, one_per_az=True)
115
+ private_subnets_isolated = vpc.select_subnets(subnet_type=ec2.SubnetType.PRIVATE_ISOLATED, one_per_az=True)
116
+
117
+ combined_subnet_objects = []
118
+
119
+ if private_subnets_egress.subnets:
120
+ # Add the first PRIVATE_WITH_EGRESS subnet
121
+ combined_subnet_objects.append(private_subnets_egress.subnets[0])
122
+ else:
123
+ self.node.add_warning("No PRIVATE_WITH_EGRESS subnets found to select the first one.")
124
+
125
+ # Add all PRIVATE_ISOLATED subnets *except* the first one (if they exist)
126
+ if len(private_subnets_isolated.subnets) > 1:
127
+ combined_subnet_objects.extend(private_subnets_isolated.subnets[1:])
128
+ elif private_subnets_isolated.subnets: # Only 1 isolated subnet, add a warning if [1:] was desired
129
+ self.node.add_warning("Only one PRIVATE_ISOLATED subnet found, private_subnets_isolated.subnets[1:] will be empty.")
130
+ else:
131
+ self.node.add_warning("No PRIVATE_ISOLATED subnets found.")
132
+
133
+ # Create an ec2.SelectedSubnets object from the combined private subnet list.
134
+ selected_private_subnets = vpc.select_subnets(
135
+ subnets=combined_subnet_objects
136
+ )
137
+
138
+ print("selected_public_subnets:", selected_public_subnets)
139
+ print("selected_private_subnets:", selected_private_subnets)
140
+
141
+
142
+ #self.private_route_tables_cfn = []
143
+
144
+ # for subnet in vpc.private_subnets:
145
+ # az = subnet.availability_zone
146
+ # if az not in private_subnets_by_az:
147
+ # private_subnets_by_az[az] = []
148
+ # private_subnets_by_az[az].append(subnet)
149
+
150
+ #selected_public_subnets: List[ec2.ISubnet] = []
151
+ #selected_private_subnets: List[ec2.ISubnet] = []
152
+
153
+ # Select one public subnet per AZ, preferring the first one found
154
+ # for az in sorted(public_subnets_by_az.keys()):
155
+ # if public_subnets_by_az[az]:
156
+ # selected_public_subnets.append(public_subnets_by_az[az][0])
157
+ # print(f"Selected existing public subnet: {public_subnets_by_az[az][0].subnet_id} from AZ {az}.")
158
+
159
+ # Select one private subnet per AZ, preferring the first one found
160
+ # for az in sorted(private_subnets_by_az.keys()):
161
+ # if private_subnets_by_az[az]:
162
+ # selected_private_subnets.append(private_subnets_by_az[az][0])
163
+ # print(f"Selected existing private subnet: {private_subnets_by_az[az][0].subnet_id} from AZ {az}.")
164
+
165
+ if len(selected_public_subnets.subnet_ids) < 2 or len(selected_private_subnets.subnet_ids) < 2:
166
+ raise Exception("Need at least two public or private subnets in different availability zones")
167
+
168
+ if not selected_public_subnets and not selected_private_subnets:
169
+ # If no subnets could be found even with automatic selection, raise an error.
170
+ # This ensures the stack doesn't proceed if it absolutely needs subnets.
171
+ print("Error: No existing public or private subnets could be found in the VPC for automatic selection. "
172
+ "You must either specify subnets in *_SUBNETS_TO_USE or ensure the VPC has discoverable subnets.")
173
+ raise RuntimeError("No suitable subnets found for automatic selection.")
174
+ else:
175
+ self.public_subnets = selected_public_subnets.subnets
176
+ self.private_subnets = selected_private_subnets.subnets
177
+ print(f"Automatically selected {len(self.public_subnets)} public and {len(self.private_subnets)} private subnets based on VPC discovery.")
178
+
179
+ print("self.public_subnets:", self.public_subnets)
180
+ print("self.private_subnets:", self.private_subnets)
181
+ # Since subnets are now assigned, we can exit this processing block.
182
+ # The rest of the original code (which iterates *_SUBNETS_TO_USE) will be skipped.
183
+
184
+ checked_public_subnets_ctx = get_context_dict("checked_public_subnets")
185
+ checked_private_subnets_ctx = get_context_dict("checked_private_subnets")
186
+
187
+ public_subnets_data_for_creation_ctx = get_context_list_of_dicts("public_subnets_to_create")
188
+ private_subnets_data_for_creation_ctx = get_context_list_of_dicts("private_subnets_to_create")
189
+
190
+ # --- 3. Process Public Subnets ---
191
+ print("\n--- Processing Public Subnets ---")
192
+ # Import existing public subnets
193
+ if checked_public_subnets_ctx:
194
+ for i, subnet_name in enumerate(PUBLIC_SUBNETS_TO_USE):
195
+ subnet_info = checked_public_subnets_ctx.get(subnet_name)
196
+ if subnet_info and subnet_info.get("exists"):
197
+ subnet_id = subnet_info.get("id")
198
+ if not subnet_id:
199
+ raise RuntimeError(f"Context for existing public subnet '{subnet_name}' is missing 'id'.")
200
+ try:
201
+ imported_subnet = ec2.Subnet.from_subnet_id(
202
+ self, f"ImportedPublicSubnet{subnet_name.replace('-', '')}{i}", subnet_id
203
+ )
204
+ #self.public_subnets.append(imported_subnet)
205
+ print(f"Imported existing public subnet: {subnet_name} (ID: {subnet_id})")
206
+ except Exception as e:
207
+ raise RuntimeError(f"Failed to import public subnet '{subnet_name}' with ID '{subnet_id}'. Error: {e}")
208
+
209
+ # Create new public subnets based on public_subnets_data_for_creation_ctx
210
+ if public_subnets_data_for_creation_ctx:
211
+ names_to_create_public = [s['name'] for s in public_subnets_data_for_creation_ctx]
212
+ cidrs_to_create_public = [s['cidr'] for s in public_subnets_data_for_creation_ctx]
213
+ azs_to_create_public = [s['az'] for s in public_subnets_data_for_creation_ctx]
214
+
215
+ if names_to_create_public:
216
+ print(f"Attempting to create {len(names_to_create_public)} new public subnets: {names_to_create_public}")
217
+ newly_created_public_subnets, newly_created_public_rts_cfn = create_subnets(
218
+ self, vpc, CDK_PREFIX, names_to_create_public, cidrs_to_create_public, azs_to_create_public,
219
+ is_public=True,
220
+ internet_gateway_id=EXISTING_IGW_ID
221
+ )
222
+ self.public_subnets.extend(newly_created_public_subnets)
223
+ self.public_route_tables_cfn.extend(newly_created_public_rts_cfn)
224
+
225
+ if not self.public_subnets:
226
+ raise Exception("No public subnets found or created, exiting.")
227
+
228
+
229
+ # --- NAT Gateway Creation/Lookup ---
230
+ self.single_nat_gateway_id = None
231
+
232
+ nat_gw_id_from_context = SINGLE_NAT_GATEWAY_ID
233
+
234
+ if nat_gw_id_from_context:
235
+ print(f"Using existing NAT Gateway ID from context: {nat_gw_id_from_context}")
236
+ self.single_nat_gateway_id = nat_gw_id_from_context
237
+ else:
238
+ # If not in context, create a new one, but only if we have a public subnet.
239
+ if self.public_subnets:
240
+ print("NAT Gateway ID not found in context. Creating a new one.")
241
+ # Place the NAT GW in the first available public subnet
242
+ first_public_subnet = self.public_subnets[0]
243
+
244
+ self.single_nat_gateway_id = create_nat_gateway(
245
+ self,
246
+ first_public_subnet,
247
+ nat_gateway_name=NAT_GATEWAY_NAME,
248
+ nat_gateway_id_context_key=SINGLE_NAT_GATEWAY_ID
249
+ )
250
+ else:
251
+ print("WARNING: No public subnets available. Cannot create a NAT Gateway.")
252
+
253
+
254
+ # --- 4. Process Private Subnets ---
255
+ print("\n--- Processing Private Subnets ---")
256
+ # ... (rest of your existing subnet processing logic for checked_private_subnets_ctx) ...
257
+ # (This part for importing existing subnets remains the same)
258
+
259
+ # Create new private subnets
260
+ if private_subnets_data_for_creation_ctx:
261
+ names_to_create_private = [s['name'] for s in private_subnets_data_for_creation_ctx]
262
+ cidrs_to_create_private = [s['cidr'] for s in private_subnets_data_for_creation_ctx]
263
+ azs_to_create_private = [s['az'] for s in private_subnets_data_for_creation_ctx]
264
+
265
+ if names_to_create_private:
266
+ print(f"Attempting to create {len(names_to_create_private)} new private subnets: {names_to_create_private}")
267
+ # --- CALL THE NEW CREATE_SUBNETS FUNCTION FOR PRIVATE ---
268
+ # Ensure self.single_nat_gateway_id is available before this call
269
+ if not self.single_nat_gateway_id:
270
+ raise ValueError("A single NAT Gateway ID is required for private subnets but was not resolved.")
271
+
272
+ newly_created_private_subnets_cfn, newly_created_private_rts_cfn = create_subnets(
273
+ self, vpc, CDK_PREFIX, names_to_create_private, cidrs_to_create_private, azs_to_create_private,
274
+ is_public=False,
275
+ single_nat_gateway_id=self.single_nat_gateway_id # Pass the single NAT Gateway ID
276
+ )
277
+ self.private_subnets.extend(newly_created_private_subnets_cfn)
278
+ self.private_route_tables_cfn.extend(newly_created_private_rts_cfn)
279
+ print(f"Successfully defined {len(newly_created_private_subnets_cfn)} new private subnets and their route tables for creation.")
280
+ else:
281
+ print("No private subnets specified for creation in context ('private_subnets_to_create').")
282
+
283
+ if not self.private_subnets:
284
+ raise Exception("No private subnets found or created, exiting.")
285
+
286
+ # --- 5. Sanity Check and Output ---
287
+
288
+ # Output the single NAT Gateway ID for verification
289
+ if self.single_nat_gateway_id:
290
+ CfnOutput(self, "SingleNatGatewayId", value=self.single_nat_gateway_id,
291
+ description="ID of the single NAT Gateway used for private subnets.")
292
+ else:
293
+ raise Exception("No single NAT Gateway was created or resolved.")
294
+
295
+ # --- Outputs for other stacks/regions ---
296
+ # These are crucial for cross-stack, cross-region referencing
297
+
298
+ self.params = dict()
299
+ self.params["vpc_id"] = vpc.vpc_id
300
+ self.params["private_subnets"] = self.private_subnets
301
+ self.params["private_route_tables"] = self.private_route_tables_cfn
302
+ self.params["public_subnets"] = self.public_subnets
303
+ self.params["public_route_tables"] = self.public_route_tables_cfn
304
+
305
+
306
+ #class CdkStackMain(Stack):
307
+ # def __init__(self, scope: Construct, construct_id: str, private_subnets:List[ec2.ISubnet]=[], private_route_tables: List[ec2.CfnRouteTable]=[], public_subnets:List[ec2.ISubnet]=[], public_route_tables: List[ec2.CfnRouteTable]=[], **kwargs) -> None:
308
+ # super().__init__(scope, construct_id, **kwargs)
309
+
310
+ # --- Helper to get context values ---
311
+ # def get_context_bool(key: str, default: bool = False) -> bool:
312
+ # return self.node.try_get_context(key) or default
313
+
314
+ # def get_context_str(key: str, default: str = None) -> str:
315
+ # return self.node.try_get_context(key) or default
316
+
317
+ # def get_context_dict(key: str, default: dict = None) -> dict:
318
+ # return self.node.try_get_context(key) or default
319
+
320
+ # def get_context_list_of_dicts(key: str) -> List[Dict[str, Any]]:
321
+ # ctx_value = self.node.try_get_context(key)
322
+
323
+ # if not isinstance(ctx_value, list):
324
+ # print(f"Warning: Context key '{key}' not found or not a list. Returning empty list.")
325
+ # return []
326
+ # # Optional: Add validation that all items in the list are dicts
327
+ # return ctx_value
328
+
329
+ # self.private_subnets: List[ec2.ISubnet] = private_subnets
330
+ # self.private_route_tables_cfn: List[ec2.CfnRouteTable] = private_route_tables
331
+ # self.public_subnets: List[ec2.ISubnet] = public_subnets
332
+ # self.public_route_tables_cfn: List[ec2.CfnRouteTable] = public_route_tables
333
+
334
+ private_subnet_selection = ec2.SubnetSelection(subnets=self.private_subnets)
335
+ public_subnet_selection = ec2.SubnetSelection(subnets=self.public_subnets)
336
+
337
+ for sub in private_subnet_selection.subnets:
338
+ print("private subnet:", sub.subnet_id, "is in availability zone:", sub.availability_zone)
339
+
340
+ for sub in public_subnet_selection.subnets:
341
+ print("public subnet:", sub.subnet_id, "is in availability zone:", sub.availability_zone)
342
+
343
+ # try:
344
+ # vpc = ec2.Vpc.from_lookup(
345
+ # self,
346
+ # "VPC",
347
+ # vpc_name=VPC_NAME
348
+ # )
349
+ # print("Successfully looked up VPC")
350
+ # except Exception as e:
351
+ # raise Exception(f"Could not look up VPC with name '{VPC_NAME}' due to: {e}")
352
+
353
+ print("Private subnet route tables:", self.private_route_tables_cfn)
354
+
355
+ # Add the S3 Gateway Endpoint to the VPC
356
+ if private_subnets_data_for_creation_ctx: # guard on the context list itself, so this does not NameError when no new private subnets are requested
357
+ try:
358
+ s3_gateway_endpoint = vpc.add_gateway_endpoint(
359
+ "S3GatewayEndpoint",
360
+ service=ec2.GatewayVpcEndpointAwsService.S3, subnets=[private_subnet_selection])
361
+ except Exception as e:
362
+ print("Could not add S3 gateway endpoint to subnets due to:", e)
363
+
364
+ #Output some useful information
365
+ CfnOutput(self, "VpcIdOutput", value=vpc.vpc_id,
366
+ description="The ID of the VPC where the S3 Gateway Endpoint is deployed.")
367
+ CfnOutput(self, "S3GatewayEndpointService", value=s3_gateway_endpoint.vpc_endpoint_id,
368
+ description="The id for the S3 Gateway Endpoint.") # Specify the S3 service
369
+
370
+ # --- IAM Roles ---
371
+ try:
372
+ codebuild_role_name = CODEBUILD_ROLE_NAME
373
+ custom_sts_kms_policy = """{
374
+ "Version": "2012-10-17",
375
+ "Statement": [
376
+ {
377
+ "Sid": "STSCallerIdentity",
378
+ "Effect": "Allow",
379
+ "Action": [
380
+ "sts:GetCallerIdentity"
381
+ ],
382
+ "Resource": "*"
383
+ },
384
+ {
385
+ "Sid": "KMSAccess",
386
+ "Effect": "Allow",
387
+ "Action": [
388
+ "kms:Encrypt",
389
+ "kms:Decrypt",
390
+ "kms:GenerateDataKey"
391
+ ],
392
+ "Resource": "*"
393
+ }
394
+ ]
395
+ }"""
396
+
397
+ if get_context_bool(f"exists:{codebuild_role_name}"):
398
+ # If exists, lookup/import the role using ARN from context
399
+ role_arn = get_context_str(f"arn:{codebuild_role_name}")
400
+ if not role_arn:
401
+ raise ValueError(f"Context value 'arn:{codebuild_role_name}' is required if role exists.")
402
+ codebuild_role = iam.Role.from_role_arn(self, "CodeBuildRole", role_arn=role_arn)
403
+ print("Using existing CodeBuild role")
404
+ else:
405
+ # If not exists, create the role
406
+ codebuild_role = iam.Role(
407
+ self, "CodeBuildRole", # Logical ID
408
+ role_name=codebuild_role_name, # Explicit resource name
409
+ assumed_by=iam.ServicePrincipal("codebuild.amazonaws.com")
410
+ )
411
+ codebuild_role.add_managed_policy(iam.ManagedPolicy.from_aws_managed_policy_name(f"EC2InstanceProfileForImageBuilderECRContainerBuilds"))
412
+ print("Successfully created new CodeBuild role")
413
+
414
+ task_role_name = ECS_TASK_ROLE_NAME
415
+ if get_context_bool(f"exists:{task_role_name}"):
416
+ role_arn = get_context_str(f"arn:{task_role_name}")
417
+ if not role_arn:
418
+ raise ValueError(f"Context value 'arn:{task_role_name}' is required if role exists.")
419
+ task_role = iam.Role.from_role_arn(self, "TaskRole", role_arn=role_arn)
420
+ print("Using existing ECS task role")
421
+ else:
422
+ task_role = iam.Role(
423
+ self, "TaskRole", # Logical ID
424
+ role_name=task_role_name, # Explicit resource name
425
+ assumed_by=iam.ServicePrincipal("ecs-tasks.amazonaws.com")
426
+ )
427
+ for role in AWS_MANAGED_TASK_ROLES_LIST:
428
+ print(f"Adding {role} to policy")
429
+ task_role.add_managed_policy(iam.ManagedPolicy.from_aws_managed_policy_name(f"{role}"))
430
+ task_role = add_custom_policies(self, task_role, custom_policy_text=custom_sts_kms_policy)
431
+ print("Successfully created new ECS task role")
432
+
433
+ execution_role_name = ECS_TASK_EXECUTION_ROLE_NAME
434
+ if get_context_bool(f"exists:{execution_role_name}"):
435
+ role_arn = get_context_str(f"arn:{execution_role_name}")
436
+ if not role_arn:
437
+ raise ValueError(f"Context value 'arn:{execution_role_name}' is required if role exists.")
438
+ execution_role = iam.Role.from_role_arn(self, "ExecutionRole", role_arn=role_arn)
439
+ print("Using existing ECS execution role")
440
+ else:
441
+ execution_role = iam.Role(
442
+ self, "ExecutionRole", # Logical ID
443
+ role_name=execution_role_name, # Explicit resource name
444
+ assumed_by=iam.ServicePrincipal("ecs-tasks.amazonaws.com")
445
+ )
446
+ for role in AWS_MANAGED_TASK_ROLES_LIST:
447
+ execution_role.add_managed_policy(iam.ManagedPolicy.from_aws_managed_policy_name(f"{role}"))
448
+ execution_role = add_custom_policies(self, execution_role, custom_policy_text=custom_sts_kms_policy)
449
+ print("Successfully created new ECS execution role")
450
+
451
+ except Exception as e:
452
+ raise Exception("Failed at IAM role step due to:", e)
453
+
454
+ # --- S3 Buckets ---
455
+ try:
456
+ log_bucket_name = S3_LOG_CONFIG_BUCKET_NAME
457
+ if get_context_bool(f"exists:{log_bucket_name}"):
458
+ bucket = s3.Bucket.from_bucket_name(self, "LogConfigBucket", bucket_name=log_bucket_name)
459
+ print("Using existing S3 bucket", log_bucket_name)
460
+ else:
461
+ bucket = s3.Bucket(self, "LogConfigBucket", bucket_name=log_bucket_name,
462
+ versioned=False, # Set to True if you need versioning
463
+ # IMPORTANT: Set removal_policy to DESTROY
464
+ removal_policy=RemovalPolicy.DESTROY,
465
+ # IMPORTANT: Set auto_delete_objects to True to empty the bucket before deletion
466
+ auto_delete_objects=True
467
+ ) # Explicitly set bucket_name
468
+ print("Created S3 bucket", log_bucket_name)
469
+
470
+ # Add policies - this will apply to both created and imported buckets
471
+ # CDK handles idempotent policy additions
472
+ bucket.add_to_resource_policy(
473
+ iam.PolicyStatement(
474
+ effect=iam.Effect.ALLOW,
475
+ principals=[task_role], # Pass the role object directly
476
+ actions=["s3:GetObject", "s3:PutObject"],
477
+ resources=[f"{bucket.bucket_arn}/*"]
478
+ )
479
+ )
480
+ bucket.add_to_resource_policy(
481
+ iam.PolicyStatement(
482
+ effect=iam.Effect.ALLOW,
483
+ principals=[task_role],
484
+ actions=["s3:ListBucket"],
485
+ resources=[bucket.bucket_arn]
486
+ )
487
+ )
488
+
489
+ output_bucket_name = S3_OUTPUT_BUCKET_NAME
490
+ if get_context_bool(f"exists:{output_bucket_name}"):
491
+ output_bucket = s3.Bucket.from_bucket_name(self, "OutputBucket", bucket_name=output_bucket_name)
492
+ print("Using existing Output bucket", output_bucket_name)
493
+ else:
494
+ output_bucket = s3.Bucket(self, "OutputBucket", bucket_name=output_bucket_name,
495
+ lifecycle_rules=[
496
+ s3.LifecycleRule(
497
+ expiration=Duration.days(int(DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS))
498
+ )
499
+ ],
500
+ versioned=False, # Set to True if you need versioning
501
+ # IMPORTANT: Set removal_policy to DESTROY
502
+ removal_policy=RemovalPolicy.DESTROY,
503
+ # IMPORTANT: Set auto_delete_objects to True to empty the bucket before deletion
504
+ auto_delete_objects=True
505
+ )
506
+ print("Created Output bucket:", output_bucket_name)
507
+
508
+ # Add policies to output bucket
509
+ output_bucket.add_to_resource_policy(
510
+ iam.PolicyStatement(
511
+ effect=iam.Effect.ALLOW,
512
+ principals=[task_role],
513
+ actions=["s3:GetObject", "s3:PutObject"],
514
+ resources=[f"{output_bucket.bucket_arn}/*"]
515
+ )
516
+ )
517
+ output_bucket.add_to_resource_policy(
518
+ iam.PolicyStatement(
519
+ effect=iam.Effect.ALLOW,
520
+ principals=[task_role],
521
+ actions=["s3:ListBucket"],
522
+ resources=[output_bucket.bucket_arn]
523
+ )
524
+ )
525
+
526
+ except Exception as e:
527
+ raise Exception("Could not handle S3 buckets due to:", e)
528
+
529
+ # --- Elastic Container Registry ---
530
+ try:
531
+ full_ecr_repo_name = ECR_CDK_REPO_NAME
532
+ if get_context_bool(f"exists:{full_ecr_repo_name}"):
533
+ ecr_repo = ecr.Repository.from_repository_name(self, "ECRRepo", repository_name=full_ecr_repo_name)
534
+ print("Using existing ECR repository")
535
+ else:
536
+ ecr_repo = ecr.Repository(self, "ECRRepo", repository_name=full_ecr_repo_name) # Explicitly set repository_name
537
+ print("Created ECR repository", full_ecr_repo_name)
538
+
539
+ ecr_image_loc = ecr_repo.repository_uri
540
+ except Exception as e:
541
+ raise Exception("Could not handle ECR repo due to:", e)
542
+
543
+ # --- CODEBUILD ---
544
+ try:
545
+ codebuild_project_name = CODEBUILD_PROJECT_NAME
546
+ if get_context_bool(f"exists:{codebuild_project_name}"):
547
+ # Lookup CodeBuild project by ARN from context
548
+ project_arn = get_context_str(f"arn:{codebuild_project_name}")
549
+ if not project_arn:
550
+ raise ValueError(f"Context value 'arn:{codebuild_project_name}' is required if project exists.")
551
+ codebuild_project = codebuild.Project.from_project_arn(self, "CodeBuildProject", project_arn=project_arn)
552
+ print("Using existing CodeBuild project")
553
+ else:
554
+ codebuild_project = codebuild.Project(self,
555
+ "CodeBuildProject", # Logical ID
556
+ project_name=codebuild_project_name, # Explicit resource name
557
+ source=codebuild.Source.git_hub(
558
+ owner=GITHUB_REPO_USERNAME,
559
+ repo=GITHUB_REPO_NAME,
560
+ branch_or_ref=GITHUB_REPO_BRANCH
561
+ ),
562
+ environment=codebuild.BuildEnvironment(
563
+ build_image=codebuild.LinuxBuildImage.STANDARD_7_0,
564
+ privileged=True,
565
+ environment_variables={"ECR_REPO_NAME": codebuild.BuildEnvironmentVariable(value=full_ecr_repo_name),
566
+ "AWS_DEFAULT_REGION": codebuild.BuildEnvironmentVariable(value=AWS_REGION),
567
+ "AWS_ACCOUNT_ID": codebuild.BuildEnvironmentVariable(value=AWS_ACCOUNT_ID)}
568
+ ),
569
+ build_spec=codebuild.BuildSpec.from_object({
570
+ "version": "0.2",
571
+ "phases": {
572
+ "pre_build": {
573
+ "commands": [
574
+ "echo Logging in to Amazon ECR",
575
+ "aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
576
+ ]
577
+ },
578
+ "build": {
579
+ "commands": [
580
+ "echo Building the Docker image",
581
+ "docker build -t $ECR_REPO_NAME:latest .",
582
+ "docker tag $ECR_REPO_NAME:latest $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO_NAME:latest"
583
+ ]
584
+ },
585
+ "post_build": {
586
+ "commands": [
587
+ "echo Pushing the Docker image",
588
+ "docker push $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO_NAME:latest"
589
+ ]
590
+ }
591
+ }
592
+ })
593
+ )
594
+ print("Successfully created CodeBuild project", codebuild_project_name)
595
+
596
+ # Grant permissions - applies to both created and imported project role
597
+ ecr_repo.grant_pull_push(codebuild_project.role)
598
+
599
+ except Exception as e:
600
+ raise Exception("Could not handle Codebuild project due to:", e)
601
+
602
+ # --- Security Groups ---
603
+ try:
604
+ ecs_security_group_name = ECS_SECURITY_GROUP_NAME
605
+ # Following checks by name don't really work
606
+ # Use CDK's from_lookup_by_name which handles lookup or throws an error if not found
607
+ #try:
608
+ # ecs_security_group = ec2.SecurityGroup.from_lookup_by_name(
609
+ # self, "ECSSecurityGroup", vpc=vpc, security_group_name=ecs_security_group_name
610
+ # )
611
+ # print(f"Using existing Security Group: {ecs_security_group_name}")
612
+ # except Exception: # If lookup fails, create
613
+ try:
614
+ ecs_security_group = ec2.SecurityGroup(
615
+ self,
616
+ "ECSSecurityGroup", # Logical ID
617
+ security_group_name=ecs_security_group_name, # Explicit resource name
618
+ vpc=vpc,
619
+ )
620
+ print(f"Created Security Group: {ecs_security_group_name}")
621
+ except Exception as e: # If lookup fails, create
622
+ print("Failed to create ECS security group due to:", e)
623
+
624
+ alb_security_group_name = ALB_NAME_SECURITY_GROUP_NAME
625
+ # try:
626
+ # alb_security_group = ec2.SecurityGroup.from_lookup_by_name(
627
+ # self, "ALBSecurityGroup", vpc=vpc, security_group_name=alb_security_group_name
628
+ # )
629
+ # print(f"Using existing Security Group: {alb_security_group_name}")
630
+ # except Exception: # If lookup fails, create
631
+ try:
632
+ alb_security_group = ec2.SecurityGroup(
633
+ self,
634
+ "ALBSecurityGroup", # Logical ID
635
+ security_group_name=alb_security_group_name, # Explicit resource name
636
+ vpc=vpc
637
+ )
638
+ print(f"Created Security Group: {alb_security_group_name}")
639
+ except Exception as e: # If lookup fails, create
640
+ print("Failed to create ALB security group due to:", e)
641
+
642
+ # Define Ingress Rules - CDK will manage adding/removing these as needed
643
+ ec2_port_gradio_server_port = ec2.Port.tcp(int(GRADIO_SERVER_PORT)) # Ensure port is int
644
+ ecs_security_group.add_ingress_rule(
645
+ peer=alb_security_group,
646
+ connection=ec2_port_gradio_server_port,
647
+ description="ALB traffic",
648
+ )
649
+
650
+ alb_security_group.add_ingress_rule(
651
+ peer=ec2.Peer.prefix_list("pl-93a247fa"),
652
+ connection=ec2.Port.all_traffic(),
653
+ description="CloudFront traffic",
654
+ )
655
+
656
+ except Exception as e:
657
+ raise Exception("Could not handle security groups due to:", e)
658
+
659
+
660
+ # --- DynamoDB tables for logs (optional) ---
661
+
662
+ if SAVE_LOGS_TO_DYNAMODB == 'True':
663
+ try:
664
+ print("Creating DynamoDB tables for logs")
665
+
666
+ dynamodb_table_access = dynamodb.Table(self, "RedactionAccessDataTable",
667
+ table_name=ACCESS_LOG_DYNAMODB_TABLE_NAME,
668
+ partition_key=dynamodb.Attribute(
669
+ name="id",
670
+ type=dynamodb.AttributeType.STRING),
671
+ billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
672
+ removal_policy=RemovalPolicy.DESTROY)
673
+
674
+ dynamodb_table_feedback = dynamodb.Table(self, "RedactionFeedbackDataTable",
675
+ table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
676
+ partition_key=dynamodb.Attribute(
677
+ name="id",
678
+ type=dynamodb.AttributeType.STRING),
679
+ billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
680
+ removal_policy=RemovalPolicy.DESTROY)
681
+
682
+ dynamodb_table_usage = dynamodb.Table(self, "RedactionUsageDataTable",
683
+ table_name=USAGE_LOG_DYNAMODB_TABLE_NAME,
684
+ partition_key=dynamodb.Attribute(
685
+ name="id",
686
+ type=dynamodb.AttributeType.STRING),
687
+ billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
688
+ removal_policy=RemovalPolicy.DESTROY)
689
+
690
+ except Exception as e:
691
+ raise Exception("Could not create DynamoDB tables due to:", e)
692
+
693
+ # --- ALB ---
694
+ try:
695
+ load_balancer_name = ALB_NAME
696
+ if len(load_balancer_name) > 32: load_balancer_name = load_balancer_name[-32:]
697
+ if get_context_bool(f"exists:{load_balancer_name}"):
698
+ # Lookup ALB by ARN from context
699
+ alb_arn = get_context_str(f"arn:{load_balancer_name}")
700
+ if not alb_arn:
701
+ raise ValueError(f"Context value 'arn:{load_balancer_name}' is required if ALB exists.")
702
+ alb = elbv2.ApplicationLoadBalancer.from_lookup(
703
+ self, "ALB", # Logical ID
704
+ load_balancer_arn=alb_arn
705
+ )
706
+ print(f"Using existing Application Load Balancer {load_balancer_name}.")
707
+ else:
708
+ alb = elbv2.ApplicationLoadBalancer(
709
+ self,
710
+ "ALB", # Logical ID
711
+ load_balancer_name=load_balancer_name, # Explicit resource name
712
+ vpc=vpc,
713
+ internet_facing=True,
714
+ security_group=alb_security_group, # Link to SG
715
+ vpc_subnets=public_subnet_selection # Link to subnets
716
+ )
717
+ print("Successfully created new Application Load Balancer")
718
+ except Exception as e:
719
+ raise Exception("Could not handle application load balancer due to:", e)
720
+
721
+
722
+
723
+ # --- Cognito User Pool ---
724
+ try:
725
+ if get_context_bool(f"exists:{COGNITO_USER_POOL_NAME}"):
726
+ # Lookup by ID from context
727
+ user_pool_id = get_context_str(f"id:{COGNITO_USER_POOL_NAME}")
728
+ if not user_pool_id:
729
+ raise ValueError(f"Context value 'id:{COGNITO_USER_POOL_NAME}' is required if User Pool exists.")
730
+ user_pool = cognito.UserPool.from_user_pool_id(self, "UserPool", user_pool_id=user_pool_id)
731
+ print(f"Using existing user pool {user_pool_id}.")
732
+ else:
733
+ user_pool = cognito.UserPool(self, "UserPool",
734
+ user_pool_name=COGNITO_USER_POOL_NAME,
735
+ mfa=cognito.Mfa.OFF, # Adjust as needed
736
+ sign_in_aliases=cognito.SignInAliases(email=True),
737
+ removal_policy=RemovalPolicy.DESTROY) # Adjust as needed
738
+ print(f"Created new user pool {user_pool.user_pool_id}.")
739
+
740
+ # If you're using a certificate, assume that you will be using the ALB Cognito login features. You need different redirect URLs to accept the token that comes from Cognito authentication.
741
+ if ACM_CERTIFICATE_ARN:
742
+ redirect_uris = [COGNITO_REDIRECTION_URL, COGNITO_REDIRECTION_URL + "/oauth2/idpresponse"]
743
+ else:
744
+ redirect_uris = [COGNITO_REDIRECTION_URL]
745
+
746
+ user_pool_client_name = COGNITO_USER_POOL_CLIENT_NAME
747
+ if get_context_bool(f"exists:{user_pool_client_name}"):
748
+ # Lookup by ID from context (requires User Pool object)
749
+ user_pool_client_id = get_context_str(f"id:{user_pool_client_name}")
750
+ if not user_pool_client_id:
751
+ raise ValueError(f"Context value 'id:{user_pool_client_name}' is required if User Pool Client exists.")
752
+ user_pool_client = cognito.UserPoolClient.from_user_pool_client_id(self, "UserPoolClient", user_pool_client_id=user_pool_client_id)
753
+ print(f"Using existing user pool client {user_pool_client_id}.")
754
+ else:
755
+ user_pool_client = cognito.UserPoolClient(self, "UserPoolClient",
756
+ auth_flows=cognito.AuthFlow(user_srp=True, user_password=True), # Example: enable SRP for secure sign-in
757
+ user_pool=user_pool,
758
+ generate_secret=True,
759
+ user_pool_client_name=user_pool_client_name,
760
+ supported_identity_providers=[cognito.UserPoolClientIdentityProvider.COGNITO],
761
+ o_auth=cognito.OAuthSettings(
762
+ flows=cognito.OAuthFlows(authorization_code_grant=True),
763
+ scopes=[cognito.OAuthScope.OPENID, cognito.OAuthScope.EMAIL, cognito.OAuthScope.PROFILE],
764
+ callback_urls=redirect_uris
765
+ )
766
+ )
767
+
768
+ CfnOutput(self, "CognitoAppClientId", value=user_pool_client.user_pool_client_id)
769
+
770
+ print(f"Created new user pool client {user_pool_client.user_pool_client_id}.")
771
+
772
+ # Add a domain to the User Pool (crucial for ALB integration)
773
+ user_pool_domain = user_pool.add_domain(
774
+ "UserPoolDomain",
775
+ cognito_domain=cognito.CognitoDomainOptions(
776
+ domain_prefix=COGNITO_USER_POOL_DOMAIN_PREFIX)
777
+ )
778
+
779
+ # Apply removal_policy to the created UserPoolDomain construct
780
+ user_pool_domain.apply_removal_policy(policy=RemovalPolicy.DESTROY)
781
+
782
+ CfnOutput(self, "CognitoUserPoolLoginUrl", value=user_pool_domain.base_url())
783
+
784
+ except Exception as e:
785
+ raise Exception("Could not handle Cognito resources due to:", e)
786
+
787
+ # --- Secrets Manager Secret ---
788
+ try:
789
+ secret_name = COGNITO_USER_POOL_CLIENT_SECRET_NAME
790
+ if get_context_bool(f"exists:{secret_name}"):
791
+ # Lookup by name
792
+ secret = secretsmanager.Secret.from_secret_name_v2(self, "CognitoSecret", secret_name=secret_name)
793
+ print(f"Using existing Secret {secret_name}.")
794
+ else:
795
+ secret = secretsmanager.Secret(self, "CognitoSecret", # Logical ID
796
+ secret_name=secret_name, # Explicit resource name
797
+ secret_object_value={
798
+ "REDACTION_USER_POOL_ID": SecretValue.unsafe_plain_text(user_pool.user_pool_id), # Use the CDK attribute
799
+ "REDACTION_CLIENT_ID": SecretValue.unsafe_plain_text(user_pool_client.user_pool_client_id), # Use the CDK attribute
800
+ "REDACTION_CLIENT_SECRET": user_pool_client.user_pool_client_secret # Use the CDK attribute
801
+ }
802
+ )
803
+ print(f"Created new secret {secret_name}.")
804
+
805
+ except Exception as e:
806
+ raise Exception("Could not handle Secrets Manager secret due to:", e)
807
+
808
+ # --- Fargate Task Definition ---
809
+ try:
810
+ # For task definitions, re-creating with the same logical ID creates new revisions.
811
+ # If you want to use a *specific existing revision*, you'd need to look it up by ARN.
812
+ # If you want to update the latest revision, defining it here is the standard.
813
+ # Let's assume we always define it here to get revision management.
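+ # (For reference, a specific existing revision could instead be imported with something like
+ # ecs.TaskDefinition.from_task_definition_arn(self, "ImportedTaskDef", "arn:aws:ecs:<region>:<account>:task-definition/<family>:<revision>"),
+ # where the ARN shown is a placeholder - that route is not taken here.)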
814
+ fargate_task_definition_name = FARGATE_TASK_DEFINITION_NAME
815
+
816
+ read_only_file_system = ECS_READ_ONLY_FILE_SYSTEM == 'True'
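+ # When ECS_READ_ONLY_FILE_SYSTEM is enabled the container root filesystem is read-only, so every path the app writes to (logs, feedback, usage, input, output, config, /tmp, etc.) is mapped onto the task's ephemeral volume via the mount points defined below (when the task definition is built inline).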
817
+
818
+ if os.path.exists(TASK_DEFINITION_FILE_LOCATION):
819
+ with open(TASK_DEFINITION_FILE_LOCATION) as f: # Use correct path
820
+ task_def_params = json.load(f)
821
+ # Need to ensure taskRoleArn and executionRoleArn in JSON are correct ARN strings
822
+ else:
823
+ epheremal_storage_volume_name = "appEphemeralVolume"
824
+
825
+ task_def_params = {}
826
+ task_def_params['taskRoleArn'] = task_role.role_arn # Use CDK role object ARN
827
+ task_def_params['executionRoleArn'] = execution_role.role_arn # Use CDK role object ARN
828
+ task_def_params['memory'] = ECS_TASK_MEMORY_SIZE
829
+ task_def_params['cpu'] = ECS_TASK_CPU_SIZE
830
+ container_def = {
831
+ "name": full_ecr_repo_name,
832
+ "image": ecr_image_loc + ":latest",
833
+ "essential": True,
834
+ "portMappings": [{"containerPort": int(GRADIO_SERVER_PORT), "hostPort": int(GRADIO_SERVER_PORT), "protocol": "tcp", "appProtocol": "http"}],
835
+ "logConfiguration": {"logDriver": "awslogs", "options": {"awslogs-group": ECS_LOG_GROUP_NAME, "awslogs-region": AWS_REGION, "awslogs-stream-prefix": "ecs"}},
836
+ "environmentFiles": [{"value": bucket.bucket_arn + "/config.env", "type": "s3"}],
837
+ "memoryReservation": int(task_def_params['memory']) - 512, # Reserve some memory for the container
838
+ "mountPoints": [
839
+ {
840
+ "sourceVolume": epheremal_storage_volume_name,
841
+ "containerPath": "/home/user/app/logs",
842
+ "readOnly": False
843
+ },
844
+ {
845
+ "sourceVolume": epheremal_storage_volume_name,
846
+ "containerPath": "/home/user/app/feedback",
847
+ "readOnly": False
848
+ },
849
+ {
850
+ "sourceVolume": epheremal_storage_volume_name,
851
+ "containerPath": "/home/user/app/usage",
852
+ "readOnly": False
853
+ },
854
+ {
855
+ "sourceVolume": epheremal_storage_volume_name,
856
+ "containerPath": "/home/user/app/input",
857
+ "readOnly": False
858
+ },
859
+ {
860
+ "sourceVolume": epheremal_storage_volume_name,
861
+ "containerPath": "/home/user/app/output",
862
+ "readOnly": False
863
+ },
864
+ {
865
+ "sourceVolume": epheremal_storage_volume_name,
866
+ "containerPath": "/home/user/app/tmp",
867
+ "readOnly": False
868
+ },
869
+ {
870
+ "sourceVolume": epheremal_storage_volume_name,
871
+ "containerPath": "/home/user/app/config",
872
+ "readOnly": False
873
+ },
874
+ {
875
+ "sourceVolume": epheremal_storage_volume_name,
876
+ "containerPath": "/tmp/matplotlib_cache",
877
+ "readOnly": False
878
+ },
879
+ {
880
+ "sourceVolume": epheremal_storage_volume_name,
881
+ "containerPath": "/tmp",
882
+ "readOnly": False
883
+ },
884
+ {
885
+ "sourceVolume": epheremal_storage_volume_name,
886
+ "containerPath": "/var/tmp",
887
+ "readOnly": False
888
+ },
889
+ {
890
+ "sourceVolume": epheremal_storage_volume_name,
891
+ "containerPath": "/tmp/tld",
892
+ "readOnly": False
893
+ },
894
+ {
895
+ "sourceVolume": epheremal_storage_volume_name,
896
+ "containerPath": "/tmp/gradio_tmp",
897
+ "readOnly": False
898
+ }
899
+ ],
900
+ "readonlyRootFilesystem": read_only_file_system,
901
+ }
902
+ task_def_params['containerDefinitions'] = [container_def]
903
+
904
+
905
+ log_group_name_from_config=task_def_params['containerDefinitions'][0]['logConfiguration']['options']['awslogs-group']
906
+
907
+ cdk_managed_log_group = logs.LogGroup(self, "MyTaskLogGroup", # CDK Logical ID
908
+ log_group_name=log_group_name_from_config,
909
+ retention=logs.RetentionDays.ONE_MONTH, # Example: set retention
910
+ removal_policy=RemovalPolicy.DESTROY # If you want it deleted when stack is deleted
911
+ )
912
+
913
+ epheremal_storage_volume_cdk_obj = ecs.Volume(
914
+ name=epheremal_storage_volume_name
915
+ )
916
+
917
+ fargate_task_definition = ecs.FargateTaskDefinition(
918
+ self,
919
+ "FargateTaskDefinition", # Logical ID
920
+ family=fargate_task_definition_name,
921
+ cpu=int(task_def_params['cpu']),
922
+ memory_limit_mib=int(task_def_params['memory']),
923
+ task_role=task_role,
924
+ execution_role=execution_role,
925
+ runtime_platform=ecs.RuntimePlatform(
926
+ cpu_architecture=ecs.CpuArchitecture.X86_64,
927
+ operating_system_family=ecs.OperatingSystemFamily.LINUX
928
+ ),
929
+ # 1. Specify the total ephemeral storage for the task
930
+ ephemeral_storage_gib=21, # Minimum is 21 GiB
931
+ # 2. Define the volume at the task level
932
+ # This volume will use the ephemeral storage configured above.
933
+ volumes=[epheremal_storage_volume_cdk_obj]
934
+ )
935
+ print("Fargate task definition defined.")
936
+
937
+
938
+
939
+ # Add container definitions to the task definition object
940
+ if task_def_params['containerDefinitions']:
941
+ container_def_params = task_def_params['containerDefinitions'][0]
942
+
943
+ if container_def_params.get('environmentFiles'):
944
+ env_files = []
945
+ for env_file_param in container_def_params['environmentFiles']:
946
+ # Need to parse the ARN to get the bucket object and key
947
+ env_file_arn_parts = env_file_param['value'].split(":::")
948
+ bucket_name_and_key = env_file_arn_parts[-1]
949
+ env_bucket_name, env_key = bucket_name_and_key.split("/", 1)
950
+
951
+ env_file = ecs.EnvironmentFile.from_bucket(bucket, env_key)
952
+
953
+ env_files.append(env_file)
954
+
955
+ container = fargate_task_definition.add_container(
956
+ container_def_params['name'],
957
+ image=ecs.ContainerImage.from_registry(container_def_params['image']),
958
+
959
+ logging=ecs.LogDriver.aws_logs(
960
+ stream_prefix=container_def_params['logConfiguration']['options']['awslogs-stream-prefix'],
961
+ log_group=cdk_managed_log_group
962
+ ),
963
+ secrets={
964
+ "AWS_USER_POOL_ID": ecs.Secret.from_secrets_manager(secret, "REDACTION_USER_POOL_ID"),
965
+ "AWS_CLIENT_ID": ecs.Secret.from_secrets_manager(secret, "REDACTION_CLIENT_ID"),
966
+ "AWS_CLIENT_SECRET": ecs.Secret.from_secrets_manager(secret, "REDACTION_CLIENT_SECRET")
967
+ },
968
+ environment_files=env_files,
969
+ readonly_root_filesystem=read_only_file_system
970
+ )
971
+
972
+ for port_mapping in container_def_params['portMappings']:
973
+ container.add_port_mappings(
974
+ ecs.PortMapping(
975
+ container_port=int(port_mapping['containerPort']),
976
+ host_port=int(port_mapping['hostPort']),
977
+ name="port-" + str(port_mapping['containerPort']),
978
+ app_protocol=ecs.AppProtocol.http,
979
+ protocol=ecs.Protocol.TCP
980
+ )
981
+ )
982
+
983
+ container.add_port_mappings(ecs.PortMapping(
984
+ container_port=80,
985
+ host_port=80,
986
+ name="port-80",
987
+ app_protocol=ecs.AppProtocol.http,
988
+ protocol=ecs.Protocol.TCP
989
+ ))
990
+
991
+ if container_def_params.get('mountPoints'):
992
+ mount_points=[]
993
+ for mount_point in container_def_params['mountPoints']:
994
+ mount_points.append(ecs.MountPoint(container_path=mount_point['containerPath'], read_only=mount_point['readOnly'], source_volume=epheremal_storage_volume_name))
995
+ container.add_mount_points(*mount_points)
996
+
997
+ except Exception as e:
998
+ raise Exception("Could not handle Fargate task definition due to:", e)
999
+
1000
+
1001
+ # --- ECS Cluster ---
1002
+ try:
1003
+ cluster = ecs.Cluster(
1004
+ self,
1005
+ "ECSCluster", # Logical ID
1006
+ cluster_name=CLUSTER_NAME, # Explicit resource name
1007
+ enable_fargate_capacity_providers=True,
1008
+ vpc=vpc
1009
+ )
1010
+ print("Successfully created new ECS cluster")
1011
+ except Exception as e:
1012
+ raise Exception("Could not handle ECS cluster due to:", e)
1013
+
1014
+
1015
+ # --- ECS Service ---
1016
+ try:
1017
+ ecs_service_name = ECS_SERVICE_NAME
1018
+
1019
+ if ECS_USE_FARGATE_SPOT == 'True': use_fargate_spot = "FARGATE_SPOT"
1020
+ else: use_fargate_spot = "FARGATE" # fall back to on-demand Fargate for any value other than 'True', avoiding an unbound variable
1021
+
1022
+ # Check if service exists - from_service_arn or from_service_name (needs cluster)
1023
+ try:
1024
+ # from_service_name is useful if you have the cluster object
1025
+ ecs_service = ecs.FargateService.from_service_attributes(
1026
+ self, "ECSService", # Logical ID
1027
+ cluster=cluster, # Requires the cluster object
1028
+ service_name=ecs_service_name
1029
+ )
1030
+ print(f"Using existing ECS service {ecs_service_name}.")
1031
+ except Exception:
1032
+ # Service will be created with a count of 0, because you haven't yet actually built the initial Docker container with CodeBuild
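+ # (Once the first image has been pushed, the desired count can be raised, e.g. with "aws ecs update-service --cluster <cluster> --service <service> --desired-count 1" - command shown for illustration only.)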
1033
+ ecs_service = ecs.FargateService(
1034
+ self,
1035
+ "ECSService", # Logical ID
1036
+ service_name=ecs_service_name, # Explicit resource name
1037
+ platform_version=ecs.FargatePlatformVersion.LATEST,
1038
+ capacity_provider_strategies=[ecs.CapacityProviderStrategy(capacity_provider=use_fargate_spot, base=0, weight=1)],
1039
+ cluster=cluster,
1040
+ task_definition=fargate_task_definition, # Link to TD
1041
+ security_groups=[ecs_security_group], # Link to SG
1042
+ vpc_subnets=ec2.SubnetSelection(subnets=self.private_subnets), # Link to subnets
1043
+ min_healthy_percent=0,
1044
+ max_healthy_percent=100,
1045
+ desired_count=0
1046
+ )
1047
+ print("Successfully created new ECS service")
1048
+
1049
+ # Note: Auto-scaling setup would typically go here if needed for the service
1050
+
1051
+ except Exception as e:
1052
+ raise Exception("Could not handle ECS service due to:", e)
1053
+
1054
+ # --- Grant Secret Read Access (Applies to both created and imported roles) ---
1055
+ try:
1056
+ secret.grant_read(task_role)
1057
+ secret.grant_read(execution_role)
1058
+ except Exception as e:
1059
+ raise Exception("Could not grant access to Secrets Manager due to:", e)
1060
+
1061
+ # --- ALB TARGET GROUPS AND LISTENERS ---
1062
+ # This section should primarily define the resources if they are managed by this stack.
1063
+ # CDK handles adding/removing targets and actions on updates.
1064
+ # If they might pre-exist outside the stack, you need lookups.
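+ # (For example, a pre-existing target group could be imported with elbv2.ApplicationTargetGroup.from_target_group_attributes(self, "ImportedTG", target_group_arn="<existing-arn>") - not used in this stack.)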
1065
+ cookie_duration = Duration.hours(12)
1066
+ target_group_name = ALB_TARGET_GROUP_NAME # Explicit resource name
1067
+ cloudfront_distribution_url = "cloudfront_placeholder.net" # Need to replace this afterwards with the actual cloudfront_distribution.domain_name
1068
+
1069
+ try:
1070
+ # --- CREATING TARGET GROUPS AND ADDING THE CLOUDFRONT LISTENER RULE ---
1071
+
1072
+ target_group = elbv2.ApplicationTargetGroup(
1073
+ self,
1074
+ "AppTargetGroup", # Logical ID
1075
+ target_group_name=target_group_name, # Explicit resource name
1076
+ port=int(GRADIO_SERVER_PORT), # Ensure port is int
1077
+ protocol=elbv2.ApplicationProtocol.HTTP,
1078
+ targets=[ecs_service], # Link to ECS Service
1079
+ stickiness_cookie_duration=cookie_duration,
1080
+ vpc=vpc, # Target Groups need VPC
1081
+ )
1082
+ print(f"ALB target group {target_group_name} defined.")
1083
+
1084
+ # First HTTP
1085
+ listener_port = 80
1086
+ # Check if Listener exists - from_listener_arn or lookup by port/ALB
1087
+
1088
+ http_listener = alb.add_listener(
1089
+ "HttpListener", # Logical ID
1090
+ port=listener_port,
1091
+ open=False, # Be cautious with open=True, usually restrict source SG
1092
+ )
1093
+ print(f"ALB listener on port {listener_port} defined.")
1094
+
1095
+
1096
+ if ACM_CERTIFICATE_ARN:
1097
+ http_listener.add_action(
1098
+ "DefaultAction", # Logical ID for the default action
1099
+ action=elbv2.ListenerAction.redirect(protocol='HTTPS',
1100
+ host='#{host}',
1101
+ port='443',
1102
+ path='/#{path}',
1103
+ query='#{query}')
1104
+ )
1105
+ else:
1106
+ if USE_CLOUDFRONT == 'True':
1107
+
1108
+ # The following default action can be added for the listener after a host header rule is added to the listener manually in the Console as suggested in the above comments.
1109
+ http_listener.add_action(
1110
+ "DefaultAction", # Logical ID for the default action
1111
+ action=elbv2.ListenerAction.fixed_response(
1112
+ status_code=403,
1113
+ content_type="text/plain",
1114
+ message_body="Access denied",
1115
+ ),
1116
+ )
1117
+
1118
+ # Add the Listener Rule for the specific CloudFront Host Header
1119
+ http_listener.add_action(
1120
+ "CloudFrontHostHeaderRule",
1121
+ action=elbv2.ListenerAction.forward(target_groups=[target_group],stickiness_duration=cookie_duration),
1122
+ priority=1, # Example priority. Adjust as needed. Lower is evaluated first.
1123
+ conditions=[
1124
+ elbv2.ListenerCondition.host_headers([cloudfront_distribution_url]) # May have to redefine url in console afterwards if not specified in config file
1125
+ ]
1126
+ )
1127
+
1128
+ else:
1129
+ # Add the Listener Rule for the specific CloudFront Host Header
1130
+ http_listener.add_action(
1131
+ "CloudFrontHostHeaderRule",
1132
+ action=elbv2.ListenerAction.forward(target_groups=[target_group],stickiness_duration=cookie_duration)
1133
+ )
1134
+
1135
+ print("Added targets and actions to ALB HTTP listener.")
1136
+
1137
+ # Now the same for HTTPS if you have an ACM certificate
1138
+ if ACM_CERTIFICATE_ARN:
1139
+ listener_port_https = 443
1140
+ # Check if Listener exists - from_listener_arn or lookup by port/ALB
1141
+
1142
+ https_listener = add_alb_https_listener_with_cert(
1143
+ self,
1144
+ "MyHttpsListener", # Logical ID for the HTTPS listener
1145
+ alb,
1146
+ acm_certificate_arn=ACM_CERTIFICATE_ARN,
1147
+ default_target_group=target_group,
1148
+ enable_cognito_auth=True,
1149
+ cognito_user_pool=user_pool,
1150
+ cognito_user_pool_client=user_pool_client,
1151
+ cognito_user_pool_domain=user_pool_domain,
1152
+ listener_open_to_internet=True,
1153
+ stickiness_cookie_duration=cookie_duration
1154
+ )
1155
+
1156
+ if https_listener:
1157
+ CfnOutput(self, "HttpsListenerArn", value=https_listener.listener_arn)
1158
+
1159
+ print(f"ALB listener on port {listener_port_https} defined.")
1160
+
1161
+ # if USE_CLOUDFRONT == 'True':
1162
+ # # Add default action to the listener
1163
+ # https_listener.add_action(
1164
+ # "DefaultAction", # Logical ID for the default action
1165
+ # action=elbv2.ListenerAction.fixed_response(
1166
+ # status_code=403,
1167
+ # content_type="text/plain",
1168
+ # message_body="Access denied",
1169
+ # ),
1170
+ # )
1171
+
1172
+ # # Add the Listener Rule for the specific CloudFront Host Header
1173
+ # https_listener.add_action(
1174
+ # "CloudFrontHostHeaderRuleHTTPS",
1175
+ # action=elbv2.ListenerAction.forward(target_groups=[target_group],stickiness_duration=cookie_duration),
1176
+ # priority=1, # Example priority. Adjust as needed. Lower is evaluated first.
1177
+ # conditions=[
1178
+ # elbv2.ListenerCondition.host_headers([cloudfront_distribution_url])
1179
+ # ]
1180
+ # )
1181
+ # else:
1182
+ # https_listener.add_action(
1183
+ # "CloudFrontHostHeaderRuleHTTPS",
1184
+ # action=elbv2.ListenerAction.forward(target_groups=[target_group],stickiness_duration=cookie_duration))
1185
+
1186
+ print("Added targets and actions to ALB HTTPS listener.")
1187
+
1188
+ except Exception as e:
1189
+ raise Exception("Could not handle ALB target groups and listeners due to:", e)
1190
+
1191
+ # Create WAF to attach to load balancer
1192
+ try:
1193
+ web_acl_name = LOAD_BALANCER_WEB_ACL_NAME
1194
+ if get_context_bool(f"exists:{web_acl_name}"):
1195
+ # Lookup WAF ACL by ARN from context
1196
+ web_acl_arn = get_context_str(f"arn:{web_acl_name}")
1197
+ if not web_acl_arn:
1198
+ raise ValueError(f"Context value 'arn:{web_acl_name}' is required if Web ACL exists.")
1199
+
1200
+ web_acl = create_web_acl_with_common_rules(self, web_acl_name, waf_scope="REGIONAL") # Assuming it takes scope and name
1201
+ print(f"Handled ALB WAF web ACL {web_acl_name}.")
1202
+ else:
1203
+ web_acl = create_web_acl_with_common_rules(self, web_acl_name, waf_scope="REGIONAL") # Assuming it takes scope and name
1204
+ print(f"Created ALB WAF web ACL {web_acl_name}.")
1205
+
1206
+ alb_waf_association = wafv2.CfnWebACLAssociation(self, id="alb_waf_association", resource_arn=alb.load_balancer_arn, web_acl_arn=web_acl.attr_arn)
1207
+
1208
+ except Exception as e:
1209
+ raise Exception("Could not handle create ALB WAF web ACL due to:", e)
1210
+
1211
+ # --- Outputs for other stacks/regions ---
1212
+
1213
+ self.params = dict()
1214
+ self.params["alb_arn_output"] = alb.load_balancer_arn
1215
+ self.params["alb_security_group_id"] = alb_security_group.security_group_id
1216
+ self.params["alb_dns_name"] = alb.load_balancer_dns_name
1217
+
1218
+ CfnOutput(self, "AlbArnOutput",
1219
+ value=alb.load_balancer_arn,
1220
+ description="ARN of the Application Load Balancer",
1221
+ export_name=f"{self.stack_name}-AlbArn") # Export name must be unique within the account/region
1222
+
1223
+ CfnOutput(self, "AlbSecurityGroupIdOutput",
1224
+ value=alb_security_group.security_group_id,
1225
+ description="ID of the ALB's Security Group",
1226
+ export_name=f"{self.stack_name}-AlbSgId")
1227
+ CfnOutput(self, "ALBName", value=alb.load_balancer_name)
1228
+
1229
+
1230
+ CfnOutput(self, "RegionalAlbDnsName", value=alb.load_balancer_dns_name)
1231
+
1232
+ CfnOutput(self, "CognitoPoolId", value=user_pool.user_pool_id)
1233
+ # Add other outputs if needed
1234
+
1235
+ CfnOutput(self, "ECRRepoUri", value=ecr_repo.repository_uri)
1236
+
1237
+ # --- CLOUDFRONT DISTRIBUTION in separate stack (us-east-1 required) ---
1238
+ class CdkStackCloudfront(Stack):
1239
+
1240
+ def __init__(self, scope: Construct, construct_id: str, alb_arn: str, alb_sec_group_id:str, alb_dns_name:str, **kwargs) -> None:
1241
+ super().__init__(scope, construct_id, **kwargs)
1242
+
1243
+ # --- Helper to get context values ---
1244
+ def get_context_bool(key: str, default: bool = False) -> bool:
1245
+ return self.node.try_get_context(key) or default
1246
+
1247
+ def get_context_str(key: str, default: str = None) -> str:
1248
+ return self.node.try_get_context(key) or default
1249
+
1250
+ def get_context_dict(scope: Construct, key: str, default: dict = None) -> dict:
1251
+ return scope.node.try_get_context(key) or default
1252
+
1253
+ print(f"CloudFront Stack: Received ALB ARN: {alb_arn}")
1254
+ print(f"CloudFront Stack: Received ALB Security Group ID: {alb_sec_group_id}")
1255
+
1256
+ if not alb_arn:
1257
+ raise ValueError("ALB ARN must be provided to CloudFront stack")
1258
+ if not alb_sec_group_id:
1259
+ raise ValueError("ALB Security Group ID must be provided to CloudFront stack")
1260
+
1261
+ # 2. Import the ALB using its ARN
1262
+ # This imports an existing ALB as a construct in the CloudFront stack's context.
1263
+ # CloudFormation will understand this reference at deploy time.
1264
+ alb = elbv2.ApplicationLoadBalancer.from_application_load_balancer_attributes(
1265
+ self, "ImportedAlb", load_balancer_arn=alb_arn, security_group_id=alb_sec_group_id, load_balancer_dns_name=alb_dns_name
1266
+ )
1267
+
1268
+ try:
1269
+ web_acl_name = WEB_ACL_NAME
1270
+ if get_context_bool(f"exists:{web_acl_name}"):
1271
+ # Lookup WAF ACL by ARN from context
1272
+ web_acl_arn = get_context_str(f"arn:{web_acl_name}")
1273
+ if not web_acl_arn:
1274
+ raise ValueError(f"Context value 'arn:{web_acl_name}' is required if Web ACL exists.")
1275
+
1276
+ web_acl = create_web_acl_with_common_rules(self, web_acl_name) # Assuming it takes scope and name
1277
+ print(f"Handled Cloudfront WAF web ACL {web_acl_name}.")
1278
+ else:
1279
+ web_acl = create_web_acl_with_common_rules(self, web_acl_name) # Assuming it takes scope and name
1280
+ print(f"Created Cloudfront WAF web ACL {web_acl_name}.")
1281
+
1282
+
1283
+ # Add ALB as CloudFront Origin
1284
+ origin = origins.LoadBalancerV2Origin(
1285
+ alb, # Use the created or looked-up ALB object
1286
+ custom_headers={CUSTOM_HEADER: CUSTOM_HEADER_VALUE},
1287
+ origin_shield_enabled=False,
1288
+ protocol_policy=cloudfront.OriginProtocolPolicy.HTTP_ONLY,
1289
+ )
1290
+
1291
+ if CLOUDFRONT_GEO_RESTRICTION: geo_restrict = cloudfront.GeoRestriction.allowlist(CLOUDFRONT_GEO_RESTRICTION)
1292
+ else: geo_restrict = None
1293
+
1294
+ cloudfront_distribution = cloudfront.Distribution(
1295
+ self,
1296
+ "CloudFrontDistribution", # Logical ID
1297
+ comment=CLOUDFRONT_DISTRIBUTION_NAME, # Use name as comment for easier identification
1298
+ geo_restriction=geo_restrict,
1299
+ default_behavior=cloudfront.BehaviorOptions(
1300
+ origin=origin,
1301
+ viewer_protocol_policy=cloudfront.ViewerProtocolPolicy.REDIRECT_TO_HTTPS,
1302
+ allowed_methods=cloudfront.AllowedMethods.ALLOW_ALL,
1303
+ cache_policy=cloudfront.CachePolicy.CACHING_DISABLED,
1304
+ origin_request_policy=cloudfront.OriginRequestPolicy.ALL_VIEWER,
1305
+ ),
1306
+ web_acl_id=web_acl.attr_arn
1307
+ )
1308
+ print(f"Cloudfront distribution {CLOUDFRONT_DISTRIBUTION_NAME} defined.")
1309
+
1310
+ except Exception as e:
1311
+ raise Exception("Could not handle Cloudfront distribution due to:", e)
1312
+
1313
+
1314
+ # --- Outputs ---
1315
+ CfnOutput(self, "CloudFrontDistributionURL",
1316
+ value=cloudfront_distribution.domain_name)
1317
+
cdk/check_resources.py ADDED
@@ -0,0 +1,297 @@
1
+ import json
2
+ import os
3
+ from cdk_config import CDK_PREFIX, VPC_NAME, AWS_REGION, PUBLIC_SUBNETS_TO_USE, PRIVATE_SUBNETS_TO_USE, CODEBUILD_ROLE_NAME, ECS_TASK_ROLE_NAME, ECS_TASK_EXECUTION_ROLE_NAME, S3_LOG_CONFIG_BUCKET_NAME, S3_OUTPUT_BUCKET_NAME, ECR_CDK_REPO_NAME, CODEBUILD_PROJECT_NAME, ALB_NAME, COGNITO_USER_POOL_NAME, COGNITO_USER_POOL_CLIENT_NAME, COGNITO_USER_POOL_CLIENT_SECRET_NAME, WEB_ACL_NAME, CONTEXT_FILE, PUBLIC_SUBNET_CIDR_BLOCKS, PRIVATE_SUBNET_CIDR_BLOCKS, PUBLIC_SUBNET_AVAILABILITY_ZONES, PRIVATE_SUBNET_AVAILABILITY_ZONES, CDK_FOLDER, CDK_CONFIG_PATH # Import necessary config
4
+ from cdk_functions import ( # Import your check functions (assuming they use Boto3)
5
+ get_vpc_id_by_name,
6
+ check_subnet_exists_by_name,
7
+ check_for_existing_role,
8
+ check_s3_bucket_exists,
9
+ check_ecr_repo_exists,
10
+ check_codebuild_project_exists,
11
+ check_alb_exists,
12
+ check_for_existing_user_pool,
13
+ check_for_existing_user_pool_client,
14
+ check_for_secret,
15
+ check_cloudfront_distribution_exists,
16
+ check_web_acl_exists,
17
+ _get_existing_subnets_in_vpc,
18
+ validate_subnet_creation_parameters
19
+ # Add other check functions as needed
20
+ )
21
+
22
+ from typing import List, Dict, Any
23
+
24
+ cdk_folder = CDK_FOLDER #<FULL_PATH_TO_CDK_FOLDER_HERE>
25
+
26
+ # Full path needed to find config file
27
+ os.environ["CDK_CONFIG_PATH"] = cdk_folder + CDK_CONFIG_PATH
28
+
29
+ # --- Helper to parse environment variables into lists ---
30
+ def _get_env_list(env_var_name: str) -> List[str]:
31
+ """Parses a comma-separated environment variable into a list of strings."""
32
+ value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
33
+ if not value:
34
+ return []
35
+ # Split by comma and filter out any empty strings that might result from extra commas
36
+ return [s.strip() for s in value.split(',') if s.strip()]
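+ # e.g. a raw value of "['public-subnet-1', 'public-subnet-2']" becomes ['public-subnet-1', 'public-subnet-2'] (subnet names here are illustrative)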
37
+
38
+
39
+ if PUBLIC_SUBNETS_TO_USE and not isinstance(PUBLIC_SUBNETS_TO_USE, list): PUBLIC_SUBNETS_TO_USE = _get_env_list(PUBLIC_SUBNETS_TO_USE)
40
+ if PRIVATE_SUBNETS_TO_USE and not isinstance(PRIVATE_SUBNETS_TO_USE, list): PRIVATE_SUBNETS_TO_USE = _get_env_list(PRIVATE_SUBNETS_TO_USE)
41
+ if PUBLIC_SUBNET_CIDR_BLOCKS and not isinstance(PUBLIC_SUBNET_CIDR_BLOCKS, list): PUBLIC_SUBNET_CIDR_BLOCKS = _get_env_list(PUBLIC_SUBNET_CIDR_BLOCKS)
42
+ if PUBLIC_SUBNET_AVAILABILITY_ZONES and not isinstance(PUBLIC_SUBNET_AVAILABILITY_ZONES, list): PUBLIC_SUBNET_AVAILABILITY_ZONES = _get_env_list(PUBLIC_SUBNET_AVAILABILITY_ZONES)
43
+ if PRIVATE_SUBNET_CIDR_BLOCKS and not isinstance(PRIVATE_SUBNET_CIDR_BLOCKS, list): PRIVATE_SUBNET_CIDR_BLOCKS = _get_env_list(PRIVATE_SUBNET_CIDR_BLOCKS)
44
+ if PRIVATE_SUBNET_AVAILABILITY_ZONES and not isinstance(PRIVATE_SUBNET_AVAILABILITY_ZONES, list): PRIVATE_SUBNET_AVAILABILITY_ZONES = _get_env_list(PRIVATE_SUBNET_AVAILABILITY_ZONES)
45
+
46
+ # Check for the existence of elements in your AWS environment to see if it's necessary to create new versions of the same
47
+
48
+ def check_and_set_context():
49
+ context_data = {}
50
+
51
+ # --- Find the VPC ID first ---
52
+ print("VPC_NAME:", VPC_NAME)
53
+ vpc_id, nat_gateways = get_vpc_id_by_name(VPC_NAME)
54
+
55
+ # If you expect only one, or one per AZ and you're creating one per AZ in CDK:
56
+ if nat_gateways:
57
+ # For simplicity, let's just check if *any* NAT exists in the VPC
58
+ # A more robust check would match by subnet, AZ, or a specific tag.
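+ # e.g. (illustrative, assuming the returned dicts carry a boto3-style 'Tags' list): [gw for gw in nat_gateways if any(t.get('Key') == 'Name' and t.get('Value') == '<expected-name>' for t in gw.get('Tags', []))]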
59
+ context_data["exists:NatGateway"] = True
60
+ context_data["id:NatGateway"] = nat_gateways[0]['NatGatewayId'] # Store the ID of the first one found
61
+ else:
62
+ context_data["exists:NatGateway"] = False
63
+ context_data["id:NatGateway"] = None
64
+
65
+ if not vpc_id:
66
+ # If the VPC doesn't exist, you might not be able to check/create subnets.
67
+ # Decide how to handle this: raise an error, set a flag, etc.
68
+ raise RuntimeError(f"Required VPC '{VPC_NAME}' not found. Cannot proceed with subnet checks.")
69
+
70
+ context_data["vpc_id"] = vpc_id # Store VPC ID in context
71
+
72
+ # SUBNET CHECKS
73
+ # (context_data already holds the VPC ID and NAT Gateway details gathered above, so it is deliberately not re-initialised here)
74
+ all_proposed_subnets_data: List[Dict[str, str]] = []
75
+
76
+ # Flag to indicate if full validation mode (with CIDR/AZs) is active
77
+ full_validation_mode = False
78
+
79
+ # Determine if full validation mode is possible/desired
80
+ # It's 'desired' if CIDR/AZs are provided, and their lengths match the name lists.
81
+ public_ready_for_full_validation = (
82
+ len(PUBLIC_SUBNETS_TO_USE) > 0 and
83
+ len(PUBLIC_SUBNET_CIDR_BLOCKS) == len(PUBLIC_SUBNETS_TO_USE) and
84
+ len(PUBLIC_SUBNET_AVAILABILITY_ZONES) == len(PUBLIC_SUBNETS_TO_USE)
85
+ )
86
+ private_ready_for_full_validation = (
87
+ len(PRIVATE_SUBNETS_TO_USE) > 0 and
88
+ len(PRIVATE_SUBNET_CIDR_BLOCKS) == len(PRIVATE_SUBNETS_TO_USE) and
89
+ len(PRIVATE_SUBNET_AVAILABILITY_ZONES) == len(PRIVATE_SUBNETS_TO_USE)
90
+ )
91
+
92
+ # Activate full validation if *any* type of subnet (public or private) has its full details provided.
93
+ # You might adjust this logic if you require ALL subnet types to have CIDRs, or NONE.
94
+ if public_ready_for_full_validation or private_ready_for_full_validation:
95
+ full_validation_mode = True
96
+
97
+ # If some are ready but others aren't, print a warning or raise an error based on your strictness
98
+ if public_ready_for_full_validation and not private_ready_for_full_validation and PRIVATE_SUBNETS_TO_USE:
99
+ print("Warning: Public subnets have CIDRs/AZs, but private subnets do not. Only public will be fully validated/created with CIDRs.")
100
+ if private_ready_for_full_validation and not public_ready_for_full_validation and PUBLIC_SUBNETS_TO_USE:
101
+ print("Warning: Private subnets have CIDRs/AZs, but public subnets do not. Only private will be fully validated/created with CIDRs.")
102
+
103
+ # Prepare data for validate_subnet_creation_parameters for all subnets that have full details
104
+ if public_ready_for_full_validation:
105
+ for i, name in enumerate(PUBLIC_SUBNETS_TO_USE):
106
+ all_proposed_subnets_data.append({
107
+ 'name': name,
108
+ 'cidr': PUBLIC_SUBNET_CIDR_BLOCKS[i],
109
+ 'az': PUBLIC_SUBNET_AVAILABILITY_ZONES[i]
110
+ })
111
+ if private_ready_for_full_validation:
112
+ for i, name in enumerate(PRIVATE_SUBNETS_TO_USE):
113
+ all_proposed_subnets_data.append({
114
+ 'name': name,
115
+ 'cidr': PRIVATE_SUBNET_CIDR_BLOCKS[i],
116
+ 'az': PRIVATE_SUBNET_AVAILABILITY_ZONES[i]
117
+ })
118
+
119
+
120
+ print(f"Target VPC ID for Boto3 lookup: {vpc_id}")
121
+
122
+ # Fetch all existing subnets in the target VPC once to avoid repeated API calls
123
+ try:
124
+ existing_aws_subnets = _get_existing_subnets_in_vpc(vpc_id)
125
+ except Exception as e:
126
+ print(f"Failed to fetch existing VPC subnets. Aborting. Error: {e}")
127
+ raise SystemExit(1) # Exit immediately if we can't get baseline data
128
+
129
+ print("\n--- Running Name-Only Subnet Existence Check Mode ---")
130
+ # Fallback: check only by name using the existing data
131
+ checked_public_subnets = {}
132
+ if PUBLIC_SUBNETS_TO_USE:
133
+ for subnet_name in PUBLIC_SUBNETS_TO_USE:
134
+ print("subnet_name:", subnet_name)
135
+ exists, subnet_id = check_subnet_exists_by_name(subnet_name, existing_aws_subnets)
136
+ checked_public_subnets[subnet_name] = {"exists": exists, "id": subnet_id}
137
+
138
+ # If the subnet exists, remove it from the proposed subnets list
139
+ if checked_public_subnets[subnet_name]["exists"] == True:
140
+ all_proposed_subnets_data = [
141
+ subnet for subnet in all_proposed_subnets_data
142
+ if subnet['name'] != subnet_name
143
+ ]
144
+
145
+ context_data["checked_public_subnets"] = checked_public_subnets
146
+
147
+ checked_private_subnets = {}
148
+ if PRIVATE_SUBNETS_TO_USE:
149
+ for subnet_name in PRIVATE_SUBNETS_TO_USE:
150
+ print("subnet_name:", subnet_name)
151
+ exists, subnet_id = check_subnet_exists_by_name(subnet_name, existing_aws_subnets)
152
+ checked_private_subnets[subnet_name] = {"exists": exists, "id": subnet_id}
153
+
154
+ # If the subnet exists, remove it from the proposed subnets list
155
+ if checked_private_subnets[subnet_name]["exists"] == True:
156
+ all_proposed_subnets_data = [
157
+ subnet for subnet in all_proposed_subnets_data
158
+ if subnet['name'] != subnet_name
159
+ ]
160
+
161
+ context_data["checked_private_subnets"] = checked_private_subnets
162
+
163
+
164
+
165
+ print("\nName-only existence subnet check complete.\n")
166
+
167
+ if full_validation_mode:
168
+ print("\n--- Running in Full Subnet Validation Mode (CIDR/AZs provided) ---")
169
+ try:
170
+ validate_subnet_creation_parameters(vpc_id, all_proposed_subnets_data, existing_aws_subnets)
171
+ print("\nPre-synth validation successful. Proceeding with CDK synth.\n")
172
+
173
+ # Populate context_data for downstream CDK construct creation
174
+ context_data["public_subnets_to_create"] = []
175
+ if public_ready_for_full_validation:
176
+ for i, name in enumerate(PUBLIC_SUBNETS_TO_USE):
177
+ context_data["public_subnets_to_create"].append({
178
+ 'name': name,
179
+ 'cidr': PUBLIC_SUBNET_CIDR_BLOCKS[i],
180
+ 'az': PUBLIC_SUBNET_AVAILABILITY_ZONES[i],
181
+ 'is_public': True
182
+ })
183
+ context_data["private_subnets_to_create"] = []
184
+ if private_ready_for_full_validation:
185
+ for i, name in enumerate(PRIVATE_SUBNETS_TO_USE):
186
+ context_data["private_subnets_to_create"].append({
187
+ 'name': name,
188
+ 'cidr': PRIVATE_SUBNET_CIDR_BLOCKS[i],
189
+ 'az': PRIVATE_SUBNET_AVAILABILITY_ZONES[i],
190
+ 'is_public': False
191
+ })
192
+
193
+ except Exception as e:
194
+ print(f"\nFATAL ERROR: Subnet parameter validation failed: {e}\n")
195
+ raise SystemExit(1) # Exit if validation fails
196
+
197
+ # Example checks and setting context values
198
+ # IAM Roles
199
+ role_name = CODEBUILD_ROLE_NAME
200
+ exists, _, _ = check_for_existing_role(role_name)
201
+ context_data[f"exists:{role_name}"] = exists # Use boolean
202
+ if exists:
203
+ _, role_arn, _ = check_for_existing_role(role_name) # Get ARN if needed
204
+ context_data[f"arn:{role_name}"] = role_arn
205
+
206
+ role_name = ECS_TASK_ROLE_NAME
207
+ exists, _, _ = check_for_existing_role(role_name)
208
+ context_data[f"exists:{role_name}"] = exists
209
+ if exists:
210
+ _, role_arn, _ = check_for_existing_role(role_name)
211
+ context_data[f"arn:{role_name}"] = role_arn
212
+
213
+ role_name = ECS_TASK_EXECUTION_ROLE_NAME
214
+ exists, _, _ = check_for_existing_role(role_name)
215
+ context_data[f"exists:{role_name}"] = exists
216
+ if exists:
217
+ _, role_arn, _ = check_for_existing_role(role_name)
218
+ context_data[f"arn:{role_name}"] = role_arn
219
+
220
+ # S3 Buckets
221
+ bucket_name = S3_LOG_CONFIG_BUCKET_NAME
222
+ exists, _ = check_s3_bucket_exists(bucket_name)
223
+ context_data[f"exists:{bucket_name}"] = exists
224
+ if exists:
225
+ # You might not need the ARN if using from_bucket_name
226
+ pass
227
+
228
+ output_bucket_name = S3_OUTPUT_BUCKET_NAME
229
+ exists, _ = check_s3_bucket_exists(output_bucket_name)
230
+ context_data[f"exists:{output_bucket_name}"] = exists
231
+ if exists:
232
+ pass
233
+
234
+ # ECR Repository
235
+ repo_name = ECR_CDK_REPO_NAME
236
+ exists, _ = check_ecr_repo_exists(repo_name)
237
+ context_data[f"exists:{repo_name}"] = exists
238
+ if exists:
239
+ pass # from_repository_name is sufficient
240
+
241
+ # CodeBuild Project
242
+ project_name = CODEBUILD_PROJECT_NAME
243
+ exists, _ = check_codebuild_project_exists(project_name)
244
+ context_data[f"exists:{project_name}"] = exists
245
+ if exists:
246
+ # Need a way to get the ARN from the check function
247
+ _, project_arn = check_codebuild_project_exists(project_name) # Assuming it returns ARN
248
+ context_data[f"arn:{project_name}"] = project_arn
249
+
250
+ # ALB (by name lookup)
251
+ alb_name = ALB_NAME
252
+ exists, _ = check_alb_exists(alb_name, region_name=AWS_REGION)
253
+ context_data[f"exists:{alb_name}"] = exists
254
+ if exists:
255
+ _, alb_object = check_alb_exists(alb_name, region_name=AWS_REGION) # Assuming check returns object
256
+ print("alb_object:", alb_object)
257
+ context_data[f"arn:{alb_name}"] = alb_object['LoadBalancerArn']
258
+
259
+
260
+ # Cognito User Pool (by name)
261
+ user_pool_name = COGNITO_USER_POOL_NAME
262
+ exists, user_pool_id, _ = check_for_existing_user_pool(user_pool_name)
263
+ context_data[f"exists:{user_pool_name}"] = exists
264
+ if exists:
265
+ context_data[f"id:{user_pool_name}"] = user_pool_id
266
+
267
+ # Cognito User Pool Client (by name and pool ID) - requires User Pool ID from check
268
+ if user_pool_id:
269
+ user_pool_id_for_client_check = user_pool_id #context_data.get(f"id:{user_pool_name}") # Use ID from context
270
+ user_pool_client_name = COGNITO_USER_POOL_CLIENT_NAME
271
+ if user_pool_id_for_client_check:
272
+ exists, client_id, _ = check_for_existing_user_pool_client(user_pool_client_name, user_pool_id_for_client_check)
273
+ context_data[f"exists:{user_pool_client_name}"] = exists
274
+ if exists:
275
+ context_data[f"id:{user_pool_client_name}"] = client_id
276
+
277
+ # Secrets Manager Secret (by name)
278
+ secret_name = COGNITO_USER_POOL_CLIENT_SECRET_NAME
279
+ exists, _ = check_for_secret(secret_name)
280
+ context_data[f"exists:{secret_name}"] = exists
281
+ # You might not need the ARN if using from_secret_name_v2
282
+
283
+
284
+ # WAF Web ACL (by name and scope)
285
+ web_acl_name = WEB_ACL_NAME
286
+ exists, _ = check_web_acl_exists(web_acl_name, scope="CLOUDFRONT") # Assuming check returns object
287
+ context_data[f"exists:{web_acl_name}"] = exists
288
+ if exists:
289
+ _, existing_web_acl = check_web_acl_exists(web_acl_name, scope="CLOUDFRONT")
290
+ context_data[f"arn:{web_acl_name}"] = existing_web_acl.attr_arn
291
+
292
+ # Write the context data to the file
293
+ with open(CONTEXT_FILE, "w") as f:
294
+ json.dump(context_data, f, indent=2)
295
+
296
+ print(f"Context data written to {CONTEXT_FILE}")
297
+
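For context, a hypothetical sketch (not the repository's actual stack code, which is not shown in this diff) of how a CDK stack could consume the context file written above to import an existing resource rather than create a new one; the resource names and construct IDs are assumptions:

```python
# Hypothetical sketch - resource names and construct IDs are assumptions.
import json

import aws_cdk as cdk
from aws_cdk import aws_iam as iam
from constructs import Construct

CONTEXT_FILE = "cdk.context.json"  # assumed to match the script above


class ExampleStack(cdk.Stack):
    def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
        super().__init__(scope, construct_id, **kwargs)

        with open(CONTEXT_FILE) as f:
            ctx = json.load(f)

        role_name = "example-codebuild-role"  # placeholder for CODEBUILD_ROLE_NAME
        if ctx.get(f"exists:{role_name}"):
            # Reuse the role found by the pre-synth checks
            iam.Role.from_role_arn(self, "CodeBuildRole", ctx[f"arn:{role_name}"])
        else:
            iam.Role(
                self,
                "CodeBuildRole",
                assumed_by=iam.ServicePrincipal("codebuild.amazonaws.com"),
                role_name=role_name,
            )


app = cdk.App()
ExampleStack(app, "ExampleStack")
app.synth()
```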
cdk/post_cdk_build_quickstart.py ADDED
@@ -0,0 +1,27 @@
1
+ import time
2
+ from cdk_config import CODEBUILD_PROJECT_NAME, S3_LOG_CONFIG_BUCKET_NAME, CLUSTER_NAME, ECS_SERVICE_NAME
3
+ from cdk_functions import start_codebuild_build, upload_file_to_s3, start_ecs_task, create_basic_config_env
4
+ from tqdm import tqdm
5
+
6
+ # Create basic config.env file that user can use to run the app later. Input is the folder it is saved into.
7
+ create_basic_config_env("config")
8
+
9
+ # Start codebuild build
10
+ print("Starting CodeBuild project.")
11
+ start_codebuild_build(PROJECT_NAME=CODEBUILD_PROJECT_NAME)
12
+
13
+ # Upload config.env file to S3 bucket
14
+ upload_file_to_s3(local_file_paths="config/config.env", s3_key="", s3_bucket=S3_LOG_CONFIG_BUCKET_NAME)
15
+
16
+ total_seconds = 480 # 8 minutes * 60 seconds/minute
17
+ update_interval = 1 # Update every second
18
+
19
+ print("Waiting eight minutes for the CodeBuild container to build.")
20
+
21
+ # tqdm iterates over a range, and you perform a small sleep in each iteration
22
+ for i in tqdm(range(total_seconds), desc="Building container"):
23
+ time.sleep(update_interval)
24
+
25
+ # Start task on ECS
26
+ print("Starting ECS task")
27
+ start_ecs_task(cluster_name=CLUSTER_NAME, service_name=ECS_SERVICE_NAME)
cdk/requirements.txt ADDED
@@ -0,0 +1,5 @@
1
+ aws-cdk-lib==2.200.2
2
+ boto3==1.38.35
3
+ pandas==2.2.3
4
+ nodejs==0.1.1
5
+ python-dotenv==1.0.1
index.qmd ADDED
@@ -0,0 +1,23 @@
1
+ ---
2
+ title: "Home"
3
+ ---
4
+
5
+ version: 0.7.0
6
+
7
+ Welcome to the Document Redaction App documentation. This site provides comprehensive documentation for the Document Redaction App.
8
+
9
+ Navigate through the sections to learn how to install, use, and manage the application. Below is a brief introduction to the app.
10
+
11
+ ## Document redaction
12
+
13
+ Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](src/user_guide.qmd) for a walkthrough on how to use the app.
14
+
15
+ ![Handwriting and signatures redacted example](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/refs/heads/main/review_redactions/Signatures%20and%20handwriting%20found.PNG)
16
+
17
+ To identify text in documents, the app provides several options. 'Local' text/OCR image analysis uses spaCy/Tesseract, and works quite well for documents with typed text. If available, choose the 'AWS Textract service' to handle more complex elements, e.g. signatures or handwriting. The app then identifies personal information for redaction. The 'Local' PII detection option is based on spaCy; it is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
18
+
19
+ After redaction, suggested redactions can be reviewed and modified on the 'Review redactions' tab. The original PDF can be uploaded here alongside a '...redaction_file.csv' to continue a previous redaction/review task. See the 'Redaction settings' tab to choose which pages to redact, the types of information to redact (e.g. people, places), or custom terms to always include in or exclude from redaction.
20
+
21
+ NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.
22
+
23
+
pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
 
5
  [project]
6
  name = "doc_redaction"
7
- version = "0.6.8"
8
  description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
9
  readme = "README.md"
10
  requires-python = ">=3.10"
@@ -23,8 +23,8 @@ dependencies = [
23
  "spacy==3.8.4",
24
  # Direct URL dependency for spacy model
25
  "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
26
- "gradio==5.29.1",
27
- "boto3==1.38.4",
28
  "pyarrow==19.0.1",
29
  "openpyxl==3.1.5",
30
  "Faker==36.1.1",
@@ -39,7 +39,7 @@ dependencies = [
39
  ]
40
 
41
  [project.urls]
42
- Homepage = "https://seanpedrick-case.github.io/doc_redaction/README.html"
43
  repository = "https://github.com/seanpedrick-case/doc_redaction"
44
 
45
  [project.optional-dependencies]
 
4
 
5
  [project]
6
  name = "doc_redaction"
7
+ version = "0.7.0"
8
  description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
9
  readme = "README.md"
10
  requires-python = ">=3.10"
 
23
  "spacy==3.8.4",
24
  # Direct URL dependency for spacy model
25
  "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
26
+ "gradio==5.34.0",
27
+ "boto3==1.38.35",
28
  "pyarrow==19.0.1",
29
  "openpyxl==3.1.5",
30
  "Faker==36.1.1",
 
39
  ]
40
 
41
  [project.urls]
42
+ Homepage = "https://seanpedrick-case.github.io/doc_redaction/"
43
  repository = "https://github.com/seanpedrick-case/doc_redaction"
44
 
45
  [project.optional-dependencies]
requirements.txt CHANGED
@@ -10,8 +10,8 @@ pandas==2.2.3
10
  scikit-learn==1.6.1
11
  spacy==3.8.4
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
13
- gradio==5.29.1
14
- boto3==1.38.4
15
  pyarrow==19.0.1
16
  openpyxl==3.1.5
17
  Faker==36.1.1
 
10
  scikit-learn==1.6.1
11
  spacy==3.8.4
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
13
+ gradio==5.34.0
14
+ boto3==1.38.35
15
  pyarrow==19.0.1
16
  openpyxl==3.1.5
17
  Faker==36.1.1
src/app_settings.qmd ADDED
@@ -0,0 +1,481 @@
1
+ ---
2
+ title: "App settings management guide"
3
+ format:
4
+ html:
5
+ toc: true # Enable the table of contents
6
+ toc-depth: 3 # Include headings up to level 2 (##)
7
+ toc-title: "On this page" # Optional: Title for your TOC
8
+ ---
9
+
10
+ Settings for the redaction app can be set from outside by changing values in the `config.env` file stored in your local config folder, or in S3 if running on AWS. This guide provides an overview of how to configure the application using environment variables. The application loads configurations using `os.environ.get()`. It first attempts to load variables from the file specified by `APP_CONFIG_PATH` (which defaults to `config/app_config.env`). If `AWS_CONFIG_PATH` is also set (e.g., to `config/aws_config.env`), variables are loaded from that file as well. Environment variables set directly in the system will always take precedence over those defined in these `.env` files.
11
+
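As a minimal sketch of this layered loading (using `python-dotenv`; the app's real logic lives in `tools/config.py` and may differ), the precedence rules described above could be implemented like this:

```python
# Minimal sketch only - the app's actual loading logic is in tools/config.py.
import os
from dotenv import load_dotenv

APP_CONFIG_PATH = os.environ.get("APP_CONFIG_PATH", "config/app_config.env")
AWS_CONFIG_PATH = os.environ.get("AWS_CONFIG_PATH", "")

# override=False keeps any variable already set in the real environment,
# so system environment variables take precedence over values in the .env files.
if os.path.exists(APP_CONFIG_PATH):
    load_dotenv(APP_CONFIG_PATH, override=False)
if AWS_CONFIG_PATH and os.path.exists(AWS_CONFIG_PATH):
    load_dotenv(AWS_CONFIG_PATH, override=False)

RUN_AWS_FUNCTIONS = os.environ.get("RUN_AWS_FUNCTIONS", "0")
print(f"RUN_AWS_FUNCTIONS = {RUN_AWS_FUNCTIONS}")
```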
12
+ ## App Configuration File (config.env)
13
+
14
+ This section details variables related to the main application configuration file.
15
+
16
+ * **`APP_CONFIG_PATH`**
17
+ * **Description:** Specifies the path to the application configuration `.env` file. This file contains various settings that control the application's behavior.
18
+ * **Default Value:** `config/app_config.env`
19
+ * **Configuration:** Set as an environment variable directly. This variable defines where to load other application configurations, so it cannot be set within `config/app_config.env` itself.
20
+
21
+ ## AWS Options
22
+
23
+ This section covers configurations related to AWS services used by the application.
24
+
25
+ * **`AWS_CONFIG_PATH`**
26
+ * **Description:** Specifies the path to the AWS configuration `.env` file. This file is intended to store AWS credentials and specific settings.
27
+ * **Default Value:** `''` (empty string)
28
+ * **Configuration:** Set as an environment variable directly. This variable defines an additional source for AWS-specific configurations.
29
+
30
+ * **`RUN_AWS_FUNCTIONS`**
31
+ * **Description:** Enables or disables AWS-specific functionalities within the application. Set to `"1"` to enable and `"0"` to disable.
32
+ * **Default Value:** `"0"`
33
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
34
+
35
+ * **`AWS_REGION`**
36
+ * **Description:** Defines the AWS region where services like S3, Cognito, and Textract are located.
37
+ * **Default Value:** `''`
38
+ * **Configuration:** Set as an environment variable directly, or include in `config/aws_config.env` (if `AWS_CONFIG_PATH` is configured).
39
+
40
+ * **`AWS_CLIENT_ID`**
41
+ * **Description:** The client ID for AWS Cognito, used for user authentication.
42
+ * **Default Value:** `''`
43
+ * **Configuration:** Set as an environment variable directly, or include in `config/aws_config.env` (if `AWS_CONFIG_PATH` is configured).
44
+
45
+ * **`AWS_CLIENT_SECRET`**
46
+ * **Description:** The client secret for AWS Cognito, used in conjunction with the client ID for authentication.
47
+ * **Default Value:** `''`
48
+ * **Configuration:** Set as an environment variable directly, or include in `config/aws_config.env` (if `AWS_CONFIG_PATH` is configured).
49
+
50
+ * **`AWS_USER_POOL_ID`**
51
+ * **Description:** The user pool ID for AWS Cognito, identifying the user directory.
52
+ * **Default Value:** `''`
53
+ * **Configuration:** Set as an environment variable directly, or include in `config/aws_config.env` (if `AWS_CONFIG_PATH` is configured).
54
+
55
+ * **`AWS_ACCESS_KEY`**
56
+ * **Description:** The AWS access key ID for programmatic access to AWS services.
57
+ * **Default Value:** `''` (Note: Often found in the environment or AWS credentials file.)
58
+ * **Configuration:** Set as an environment variable directly, or include in `config/aws_config.env` (if `AWS_CONFIG_PATH` is configured). It's also commonly configured via shared AWS credentials files or IAM roles.
59
+
60
+ * **`AWS_SECRET_KEY`**
61
+ * **Description:** The AWS secret access key corresponding to the AWS access key ID.
62
+ * **Default Value:** `''` (Note: Often found in the environment or AWS credentials file.)
63
+ * **Configuration:** Set as an environment variable directly, or include in `config/aws_config.env` (if `AWS_CONFIG_PATH` is configured). It's also commonly configured via shared AWS credentials files or IAM roles.
64
+
65
+ * **`DOCUMENT_REDACTION_BUCKET`**
66
+ * **Description:** The name of the S3 bucket used for storing documents related to the redaction process.
67
+ * **Default Value:** `''`
68
+ * **Configuration:** Set as an environment variable directly, or include in `config/aws_config.env` (if `AWS_CONFIG_PATH` is configured).
69
+
70
+ * **`CUSTOM_HEADER`**
71
+ * **Description:** Specifies a custom header name to be included in requests, often used for services like AWS CloudFront.
72
+ * **Default Value:** `''`
73
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
74
+
75
+ * **`CUSTOM_HEADER_VALUE`**
76
+ * **Description:** The value for the custom header specified by `CUSTOM_HEADER`.
77
+ * **Default Value:** `''`
78
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
79
+
80
+ ## Image Options
81
+
82
+ Settings related to image processing within the application.
83
+
84
+ * **`IMAGES_DPI`**
85
+ * **Description:** Dots Per Inch (DPI) setting for image processing, affecting the resolution and quality of processed images.
86
+ * **Default Value:** `'300.0'`
87
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
88
+
89
+ * **`LOAD_TRUNCATED_IMAGES`**
90
+ * **Description:** Controls whether the application attempts to load truncated images. Set to `'True'` to enable.
91
+ * **Default Value:** `'True'`
92
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
93
+
94
+ * **`MAX_IMAGE_PIXELS`**
95
+ * **Description:** Sets the maximum number of pixels for an image that the application will process. Leave blank for no limit. This can help prevent issues with very large images.
96
+ * **Default Value:** `''`
97
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
98
+
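As an illustration, the image options above roughly correspond to standard Pillow settings; exactly where the app applies them is not shown here, so treat this as a sketch:

```python
# Illustrative only - shows the Pillow settings these options typically map to.
from PIL import Image, ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True   # LOAD_TRUNCATED_IMAGES='True'
Image.MAX_IMAGE_PIXELS = None            # MAX_IMAGE_PIXELS='' (no pixel limit)

IMAGES_DPI = 300.0                       # IMAGES_DPI='300.0', used when rasterising PDF pages
print(f"Rendering pages at {IMAGES_DPI} DPI")
```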
99
+ ## File I/O Options
100
+
101
+ Configuration for input and output file handling.
102
+
103
+ * **`SESSION_OUTPUT_FOLDER`**
104
+ * **Description:** If set to `'True'`, the application will save output and input files into session-specific subfolders, helping to organise files from different user sessions.
105
+ * **Default Value:** `'False'`
106
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
107
+
108
+ * **`GRADIO_OUTPUT_FOLDER`** (aliased as `OUTPUT_FOLDER`)
109
+ * **Description:** Specifies the default output folder for files generated by Gradio components. Can be set to "TEMP" to use a temporary directory.
110
+ * **Default Value:** `'output/'`
111
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
112
+
113
+ * **`GRADIO_INPUT_FOLDER`** (aliased as `INPUT_FOLDER`)
114
+ * **Description:** Specifies the default input folder for files used by Gradio components. Can be set to "TEMP" to use a temporary directory.
115
+ * **Default Value:** `'input/'`
116
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
117
+
118
+ ## Logging Options
119
+
120
+ Settings for configuring application logging, including log formats and storage locations.
121
+
122
+ * **`SAVE_LOGS_TO_CSV`**
123
+ * **Description:** Enables or disables saving logs to CSV files. Set to `'True'` to enable.
124
+ * **Default Value:** `'True'`
125
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
126
+
127
+ * **`USE_LOG_SUBFOLDERS`**
128
+ * **Description:** If enabled (`'True'`), logs will be stored in subfolders based on date and hostname, aiding in log organisation.
129
+ * **Default Value:** `'True'`
130
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
131
+
132
+ * **`FEEDBACK_LOGS_FOLDER`**
133
+ * **Description:** Specifies the base folder for storing feedback logs. If `USE_LOG_SUBFOLDERS` is true, date/hostname subfolders will be created within this folder.
134
+ * **Default Value:** `'feedback/'`
135
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
136
+
137
+ * **`ACCESS_LOGS_FOLDER`**
138
+ * **Description:** Specifies the base folder for storing access logs. If `USE_LOG_SUBFOLDERS` is true, date/hostname subfolders will be created within this folder.
139
+ * **Default Value:** `'logs/'`
140
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
141
+
142
+ * **`USAGE_LOGS_FOLDER`**
143
+ * **Description:** Specifies the base folder for storing usage logs. If `USE_LOG_SUBFOLDERS` is true, date/hostname subfolders will be created within this folder.
144
+ * **Default Value:** `'usage/'`
145
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
146
+
147
+ * **`DISPLAY_FILE_NAMES_IN_LOGS`**
148
+ * **Description:** If set to `'True'`, file names will be included in the log entries.
149
+ * **Default Value:** `'False'`
150
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
151
+
152
+ * **`CSV_ACCESS_LOG_HEADERS`**
153
+ * **Description:** Defines custom headers for CSV access logs. If left blank, component labels will be used as headers.
154
+ * **Default Value:** `''`
155
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
156
+
157
+ * **`CSV_FEEDBACK_LOG_HEADERS`**
158
+ * **Description:** Defines custom headers for CSV feedback logs. If left blank, component labels will be used as headers.
159
+ * **Default Value:** `''`
160
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
161
+
162
+ * **`CSV_USAGE_LOG_HEADERS`**
163
+ * **Description:** Defines custom headers for CSV usage logs.
164
+ * **Default Value:** A predefined list of header names. Refer to `tools/config.py` for the complete list.
165
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
166
+
167
+ * **`SAVE_LOGS_TO_DYNAMODB`**
168
+ * **Description:** Enables or disables saving logs to AWS DynamoDB. Set to `'True'` to enable. Requires appropriate AWS setup.
169
+ * **Default Value:** `'False'`
170
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
171
+
172
+ * **`ACCESS_LOG_DYNAMODB_TABLE_NAME`**
173
+ * **Description:** The name of the DynamoDB table used for storing access logs.
174
+ * **Default Value:** `'redaction_access_log'`
175
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
176
+
177
+ * **`DYNAMODB_ACCESS_LOG_HEADERS`**
178
+ * **Description:** Specifies the headers (attributes) for the DynamoDB access log table.
179
+ * **Default Value:** `''`
180
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
181
+
182
+ * **`FEEDBACK_LOG_DYNAMODB_TABLE_NAME`**
183
+ * **Description:** The name of the DynamoDB table used for storing feedback logs.
184
+ * **Default Value:** `'redaction_feedback'`
185
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
186
+
187
+ * **`DYNAMODB_FEEDBACK_LOG_HEADERS`**
188
+ * **Description:** Specifies the headers (attributes) for the DynamoDB feedback log table.
189
+ * **Default Value:** `''`
190
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
191
+
192
+ * **`USAGE_LOG_DYNAMODB_TABLE_NAME`**
193
+ * **Description:** The name of the DynamoDB table used for storing usage logs.
194
+ * **Default Value:** `'redaction_usage'`
195
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
196
+
197
+ * **`DYNAMODB_USAGE_LOG_HEADERS`**
198
+ * **Description:** Specifies the headers (attributes) for the DynamoDB usage log table.
199
+ * **Default Value:** `''`
200
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
201
+
202
+ * **`LOGGING`**
203
+ * **Description:** Enables or disables general console logging. Set to `'True'` to enable.
204
+ * **Default Value:** `'False'`
205
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
206
+
207
+ * **`LOG_FILE_NAME`**
208
+ * **Description:** Specifies the name for the CSV log file if `SAVE_LOGS_TO_CSV` is enabled.
209
+ * **Default Value:** `'log.csv'`
210
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
211
+
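For reference, a hedged sketch of what writing a usage-log entry to the DynamoDB table named above could look like with `boto3`; the attribute names are illustrative assumptions, not the app's actual log schema:

```python
# Hedged sketch - illustrative attribute names, not the app's actual log schema.
from datetime import datetime, timezone

import boto3

dynamodb = boto3.resource("dynamodb")      # region comes from AWS_REGION / credentials
table = dynamodb.Table("redaction_usage")  # USAGE_LOG_DYNAMODB_TABLE_NAME default

table.put_item(
    Item={
        "id": "example-session-id",        # assumed partition key
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "task": "redact_pdf",
    }
)
```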
212
+ ## Redaction Options
213
+
214
+ Configurations related to the text redaction process, including PII detection models and external tool paths.
215
+
216
+ * **`TESSERACT_FOLDER`**
217
+ * **Description:** Path to the local Tesseract OCR installation folder, used for text extraction. Only required if Tesseract is not on the system path, or if you are running a version of the app packaged as an .exe with PyInstaller.
218
+ * **Default Value:** `""` (empty string)
219
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
220
+
221
+ * **`POPPLER_FOLDER`**
222
+ * **Description:** Path to the local Poppler installation's `bin` folder. Poppler is used for PDF processing. Only required if Poppler is not on the system path, or if you are running a version of the app packaged as an .exe with PyInstaller.
223
+ * **Default Value:** `""` (empty string)
224
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
225
+
226
+ * **`SELECTABLE_TEXT_EXTRACT_OPTION`**
227
+ * **Description:** Display name in the UI for the text extraction method that processes selectable text directly from PDFs.
228
+ * **Default Value:** `"Local model - selectable text"`
229
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
230
+
231
+ * **`TESSERACT_TEXT_EXTRACT_OPTION`**
232
+ * **Description:** Display name in the UI for the text extraction method using local Tesseract OCR (for PDFs without selectable text).
233
+ * **Default Value:** `"Local OCR model - PDFs without selectable text"`
234
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
235
+
236
+ * **`TEXTRACT_TEXT_EXTRACT_OPTION`**
237
+ * **Description:** Display name in the UI for the text extraction method using AWS Textract service.
238
+ * **Default Value:** `"AWS Textract service - all PDF types"`
239
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
240
+
241
+ * **`NO_REDACTION_PII_OPTION`**
242
+ * **Description:** Display name in the UI for the option to only extract text without performing any PII detection or redaction.
243
+ * **Default Value:** `"Only extract text (no redaction)"`
244
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
245
+
246
+ * **`LOCAL_PII_OPTION`**
247
+ * **Description:** Display name in the UI for the PII detection method using a local model.
248
+ * **Default Value:** `"Local"`
249
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
250
+
251
+ * **`AWS_PII_OPTION`**
252
+ * **Description:** Display name in the UI for the PII detection method using AWS Comprehend.
253
+ * **Default Value:** `"AWS Comprehend"`
254
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
255
+
256
+ * **`SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS`**
257
+ * **Description:** Controls whether local text extraction options (selectable text, Tesseract) are shown in the UI. Set to `'True'` to show.
258
+ * **Default Value:** `'True'`
259
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
260
+
261
+ * **`SHOW_AWS_TEXT_EXTRACTION_OPTIONS`**
262
+ * **Description:** Controls whether AWS Textract text extraction option is shown in the UI. Set to `'True'` to show.
263
+ * **Default Value:** `'True'`
264
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
265
+
266
+ * **`DEFAULT_TEXT_EXTRACTION_MODEL`**
267
+ * **Description:** Sets the default text extraction model selected in the UI. Defaults to `TEXTRACT_TEXT_EXTRACT_OPTION` if AWS options are shown; otherwise, defaults to `SELECTABLE_TEXT_EXTRACT_OPTION`.
268
+ * **Default Value:** Value of `TEXTRACT_TEXT_EXTRACT_OPTION` if `SHOW_AWS_TEXT_EXTRACTION_OPTIONS` is True, else value of `SELECTABLE_TEXT_EXTRACT_OPTION`.
269
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`. Provide one of the text extraction option display names.
270
+
271
+ * **`SHOW_LOCAL_PII_DETECTION_OPTIONS`**
272
+ * **Description:** Controls whether the local PII detection option is shown in the UI. Set to `'True'` to show.
273
+ * **Default Value:** `'True'`
274
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
275
+
276
+ * **`SHOW_AWS_PII_DETECTION_OPTIONS`**
277
+ * **Description:** Controls whether the AWS Comprehend PII detection option is shown in the UI. Set to `'True'` to show.
278
+ * **Default Value:** `'True'`
279
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
280
+
281
+ * **`DEFAULT_PII_DETECTION_MODEL`**
282
+ * **Description:** Sets the default PII detection model selected in the UI. Defaults to `AWS_PII_OPTION` if AWS options are shown; otherwise, defaults to `LOCAL_PII_OPTION`.
283
+ * **Default Value:** Value of `AWS_PII_OPTION` if `SHOW_AWS_PII_DETECTION_OPTIONS` is True, else value of `LOCAL_PII_OPTION`.
284
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`. Provide one of the PII detection option display names.
285
+
286
+ * **`CHOSEN_COMPREHEND_ENTITIES`**
287
+ * **Description:** A list of AWS Comprehend PII entity types to be redacted when using AWS Comprehend.
288
+ * **Default Value:** A predefined list of entity types. Refer to `tools/config.py` for the complete list.
289
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`. This should be a string representation of a Python list.
290
+
291
+ * **`FULL_COMPREHEND_ENTITY_LIST`**
292
+ * **Description:** The complete list of PII entity types supported by AWS Comprehend that can be selected for redaction.
293
+ * **Default Value:** A predefined list of entity types. Refer to `tools/config.py` for the complete list.
294
+ * **Configuration:** This is typically an informational variable reflecting the capabilities of AWS Comprehend and is not meant to be changed by users directly affecting redaction behavior (use `CHOSEN_COMPREHEND_ENTITIES` for that). Set as an environment variable directly, or include in `config/app_config.env`.
295
+
296
+ * **`CHOSEN_REDACT_ENTITIES`**
297
+ * **Description:** A list of local PII entity types to be redacted when using the local PII detection model.
298
+ * **Default Value:** A predefined list of entity types. Refer to `tools/config.py` for the complete list.
299
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`. This should be a string representation of a Python list.
300
+
301
+ * **`FULL_ENTITY_LIST`**
302
+ * **Description:** The complete list of PII entity types supported by the local PII detection model that can be selected for redaction.
303
+ * **Default Value:** A predefined list of entity types. Refer to `tools/config.py` for the complete list.
304
+ * **Configuration:** This is typically an informational variable reflecting the capabilities of the local model and is not meant to be changed by users directly affecting redaction behavior (use `CHOSEN_REDACT_ENTITIES` for that). Set as an environment variable directly, or include in `config/app_config.env`.
305
+
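To make the "string representation of a Python list" format concrete, here is a small hedged example of setting and parsing such a value; the entity names are illustrative, and the app's own parsing lives in `tools/config.py`:

```python
# Illustrative only - entity names are examples, and the app's parsing may differ.
import ast
import os

os.environ["CHOSEN_REDACT_ENTITIES"] = "['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS']"

chosen_redact_entities = ast.literal_eval(os.environ["CHOSEN_REDACT_ENTITIES"])
print(chosen_redact_entities)  # ['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS']
```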
306
+ * **`PAGE_BREAK_VALUE`**
307
+ * **Description:** Defines a page count after which a function might restart. (Note: Currently not activated).
308
+ * **Default Value:** `'99999'`
309
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
310
+
311
+ * **`MAX_TIME_VALUE`**
312
+ * **Description:** Specifies the maximum time (in arbitrary units, likely seconds or milliseconds depending on implementation) for a process before it might be timed out.
313
+ * **Default Value:** `'999999'`
314
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
315
+
316
+ * **`CUSTOM_BOX_COLOUR`**
317
+ * **Description:** Allows specifying a custom color for the redaction boxes drawn on documents (e.g., "grey", "red", "#FF0000"). If empty, a default color is used.
318
+ * **Default Value:** `""` (empty string)
319
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
320
+
321
+ * **`REDACTION_LANGUAGE`**
322
+ * **Description:** Specifies the language for redaction processing. Currently, only "en" (English) is supported.
323
+ * **Default Value:** `"en"`
324
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
325
+
326
+ * **`RETURN_PDF_END_OF_REDACTION`**
327
+ * **Description:** If set to `'True'`, the application will return a PDF document at the end of the redaction task.
328
+ * **Default Value:** `"True"`
329
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
330
+
331
+ * **`COMPRESS_REDACTED_PDF`**
332
+ * **Description:** If set to `'True'`, the redacted PDF output will be compressed. This can reduce file size but may cause issues on systems with low memory.
333
+ * **Default Value:** `"False"`
334
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
335
+
336
+ ## App Run Options
337
+
338
+ General runtime configurations for the application.
339
+
340
+ * **`TLDEXTRACT_CACHE`**
341
+ * **Description:** Path to the cache file used by the `tldextract` library, which helps in accurately extracting top-level domains (TLDs) from URLs.
342
+ * **Default Value:** `'tld/.tld_set_snapshot'`
343
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
344
+
345
+ * **`COGNITO_AUTH`**
346
+ * **Description:** Enables or disables AWS Cognito authentication for the application. Set to `'1'` to enable.
347
+ * **Default Value:** `'0'`
348
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
349
+
350
+ * **`RUN_DIRECT_MODE`**
351
+ * **Description:** If set to `'1'`, runs the application in a "direct mode", which might alter certain behaviors (e.g., UI elements, processing flow).
352
+ * **Default Value:** `'0'`
353
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
354
+
355
+ * **`MAX_QUEUE_SIZE`**
356
+ * **Description:** The maximum number of requests that can be queued in the Gradio interface.
357
+ * **Default Value:** `'5'` (integer)
358
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
359
+
360
+ * **`MAX_FILE_SIZE`**
361
+ * **Description:** Maximum file size allowed for uploads (e.g., "250mb", "1gb").
362
+ * **Default Value:** `'250mb'`
363
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
364
+
365
+ * **`GRADIO_SERVER_PORT`**
366
+ * **Description:** The network port on which the Gradio server will listen.
367
+ * **Default Value:** `'7860'` (integer)
368
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
369
+
370
+ * **`ROOT_PATH`**
371
+ * **Description:** The root path for the application, useful if running behind a reverse proxy (e.g., `/app`).
372
+ * **Default Value:** `''` (empty string)
373
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
374
+
375
+ * **`DEFAULT_CONCURRENCY_LIMIT`**
376
+ * **Description:** The default concurrency limit for Gradio event handlers, controlling how many requests can be processed simultaneously.
377
+ * **Default Value:** `'3'`
378
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
379
+
380
+ * **`GET_DEFAULT_ALLOW_LIST`**
381
+ * **Description:** If set, enables the use of a default allow list for user access or specific functionalities. The exact behavior depends on application logic.
382
+ * **Default Value:** `''` (empty string)
383
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
384
+
385
+ * **`ALLOW_LIST_PATH`**
386
+ * **Description:** Path to a local CSV file containing an allow list (e.g., `config/default_allow_list.csv`).
387
+ * **Default Value:** `''` (empty string)
388
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
389
+
390
+ * **`S3_ALLOW_LIST_PATH`**
391
+ * **Description:** Path to an allow list CSV file stored in an S3 bucket (e.g., `default_allow_list.csv`). Requires `DOCUMENT_REDACTION_BUCKET` to be set.
392
+ * **Default Value:** `''` (empty string)
393
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
394
+
395
+ * **`FILE_INPUT_HEIGHT`**
396
+ * **Description:** Sets the height (in pixels or other CSS unit) of the file input component in the Gradio UI.
397
+ * **Default Value:** `'200'`
398
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
399
+
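As a hedged sketch of how these run options typically feed into a Gradio app (the real launch code is in `app.py` and may differ), the queue and launch settings could be wired up like this:

```python
# Hedged sketch - a placeholder Blocks app, not the real redaction UI.
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("Document redaction app placeholder")

demo.queue(max_size=5, default_concurrency_limit=3)  # MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT
demo.launch(
    server_port=7860,        # GRADIO_SERVER_PORT
    root_path="",            # ROOT_PATH (e.g. "/app" behind a reverse proxy)
    max_file_size="250mb",   # MAX_FILE_SIZE
)
```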
400
+ ## Cost Code Options
401
+
402
+ Settings related to tracking and applying cost codes for application usage.
403
+
404
+ * **`SHOW_COSTS`**
405
+ * **Description:** If set to `'True'`, cost-related information will be displayed in the UI.
406
+ * **Default Value:** `'False'`
407
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
408
+
409
+ * **`GET_COST_CODES`**
410
+ * **Description:** Enables fetching and using cost codes within the application. Set to `'True'` to enable.
411
+ * **Default Value:** `'False'`
412
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
413
+
414
+ * **`DEFAULT_COST_CODE`**
415
+ * **Description:** Specifies a default cost code to be used if cost codes are enabled but none is selected by the user.
416
+ * **Default Value:** `''` (empty string)
417
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
418
+
419
+ * **`COST_CODES_PATH`**
420
+ * **Description:** Path to a local CSV file containing available cost codes (e.g., `config/COST_CENTRES.csv`).
421
+ * **Default Value:** `''` (empty string)
422
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
423
+
424
+ * **`S3_COST_CODES_PATH`**
425
+ * **Description:** Path to a cost codes CSV file stored in an S3 bucket (e.g., `COST_CENTRES.csv`). Requires `DOCUMENT_REDACTION_BUCKET` to be set.
426
+ * **Default Value:** `''` (empty string)
427
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
428
+
429
+ * **`ENFORCE_COST_CODES`**
430
+ * **Description:** If set to `'True'` and `GET_COST_CODES` is also enabled, makes the selection of a cost code mandatory for users.
431
+ * **Default Value:** `'False'`
432
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
433
+
434
+ ## Whole Document API Options
435
+
436
+ Configurations for features related to processing whole documents via APIs, particularly AWS Textract for large documents.
437
+
438
+ * **`SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS`**
439
+ * **Description:** Controls whether UI options for whole document Textract calls are displayed. (Note: Mentioned as not currently implemented in the source).
440
+ * **Default Value:** `'False'`
441
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
442
+
443
+ * **`TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET`**
444
+ * **Description:** The S3 bucket used for input and output of whole document analysis with AWS Textract.
445
+ * **Default Value:** `''` (empty string)
446
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
447
+
448
+ * **`TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER`**
449
+ * **Description:** The subfolder within `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET` where input documents for Textract analysis are placed.
450
+ * **Default Value:** `'input'`
451
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
452
+
453
+ * **`TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER`**
454
+ * **Description:** The subfolder within `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET` where output results from Textract analysis are stored.
455
+ * **Default Value:** `'output'`
456
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
457
+
458
+ * **`LOAD_PREVIOUS_TEXTRACT_JOBS_S3`**
459
+ * **Description:** If set to `'True'`, the application will attempt to load data from previous Textract jobs stored in S3.
460
+ * **Default Value:** `'False'`
461
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
462
+
463
+ * **`TEXTRACT_JOBS_S3_LOC`**
464
+ * **Description:** The S3 subfolder (within `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET`) where Textract job data (output) is stored.
465
+ * **Default Value:** `'output'`
466
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
467
+
468
+ * **`TEXTRACT_JOBS_S3_INPUT_LOC`**
469
+ * **Description:** The S3 subfolder (within `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET`) where Textract job input is stored.
470
+ * **Default Value:** `'input'`
471
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
472
+
473
+ * **`TEXTRACT_JOBS_LOCAL_LOC`**
474
+ * **Description:** The local subfolder where Textract job data is stored if not using S3 or as a cache.
475
+ * **Default Value:** `'output'`
476
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
477
+
478
+ * **`DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS`**
479
+ * **Description:** Specifies the number of past days for which to display whole document Textract jobs in the UI.
480
+ * **Default Value:** `'7'`
481
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
src/faq.qmd ADDED
@@ -0,0 +1,222 @@
1
+ ---
2
+ title: "User FAQ"
3
+ format:
4
+ html:
5
+ toc: true # Enable the table of contents
6
+ toc-depth: 3 # Include headings up to level 2 (##)
7
+ toc-title: "On this page" # Optional: Title for your TOC
8
+ ---
9
+
10
+ ## General Advice:
11
+ * **Read the User Guide**: Many common questions are addressed in the detailed User Guide sections.
12
+ * **Start Simple**: If you're new, try redacting with default options first before customising extensively.
13
+ * **Human Review is Key**: Always manually review the `...redacted.pdf` or use the '**Review redactions**' tab. No automated system is perfect.
14
+ * **Save Incrementally**: When working on the '**Review redactions**' tab, use the '**Save changes on current page to file**' button periodically, especially for large documents.
15
+
16
+ ## General questions
17
+
18
+ #### What is document redaction and what does this app do?
19
+ Document redaction is the process of removing sensitive or personally identifiable information (PII) from documents. This application is a tool that automates this process for various document types, including PDFs, images, open text, and tabular data (`XLSX`/`CSV`/`Parquet`). It identifies potential PII using different methods and allows users to review, modify, and export the suggested redactions.
20
+
21
+ #### What types of documents and data can be redacted?
22
+ The app can handle a variety of formats. For documents, it supports `PDF`s and images (`JPG`, `PNG`). For tabular data, it works with `XLSX`, `CSV`, and `Parquet` files. Additionally, it can redact open text that is copied and pasted directly into the application interface.
23
+
24
+ #### How does the app identify text and PII for redaction?
25
+ The app employs several methods for text extraction and PII identification. Text can be extracted directly from selectable `PDF` text, using a local Optical Character Recognition (OCR) model for image-based content, or through the **AWS Textract service** for more complex documents, handwriting, and signatures (if available). For PII identification, it can use a local model based on the `spacy` package or the **AWS Comprehend service** for more accurate results (if available).
26
+
27
+ #### Can I customise what information is redacted?
28
+ Yes, the app offers extensive customisation options. You can define terms that should never be redacted (an '**allow list**'), terms that should always be redacted (a '**deny list**'), and specify entire pages to be fully redacted using `CSV` files. You can also select specific types of entities to redact, such as dates, or remove default entity types that are not relevant to your needs.
29
+
30
+ #### How can I review and modify the suggested redactions?
31
+ The app provides a dedicated '**Review redactions**' tab with a visual interface. You can upload the original document and the generated review file (`CSV`) to see the suggested redactions overlaid on the document. Here, you can move, resize, delete, and add new redaction boxes. You can also filter suggested redactions based on criteria and exclude them individually or in groups.
32
+
33
+ #### Can I work with tabular data or copy and pasted text?
34
+ Yes, the app has a dedicated tab for redacting tabular data files (`XLSX`/`CSV`) and open text. For tabular data, you can upload your file and select which columns to redact. For open text, you can simply paste the text into a box. You can then choose the redaction method and the desired output format for the anonymised data.
35
+
36
+ #### What are the options for the anonymisation format of redacted text?
37
+ When redacting tabular data or open text, you have several options for how the redacted information is replaced. The default is to replace the text with '**REDACTED**'. Other options include replacing it with the entity type (e.g., 'PERSON'), redacting completely (removing the text), replacing it with a consistent hash value, or masking it with stars ('*').
38
+
39
+ #### Can I export or import redactions to/from other software like Adobe Acrobat?
40
+ Yes, the app supports exporting and importing redaction data using the **Adobe Acrobat** comment file format (`.xfdf`). You can export suggested redactions from the app to an `.xfdf` file that can be opened in **Adobe**. Conversely, you can import an `.xfdf` file created in **Adobe** into the app to generate a review file (`CSV`) for further work within the application.
41
+
42
+ ## Troubleshooting
43
+
44
+ #### Q1: The app missed some personal information or redacted things it shouldn't have. Is it broken?
45
+ A: Not necessarily. The app is not 100% accurate and is designed as an aid. The `README` explicitly states: "**NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed by a human before using the final outputs.**"
46
+ * **Solution**: Always use the '**Review redactions**' tab to manually inspect, add, remove, or modify redactions.
47
+
48
+ #### Q2: I uploaded a `PDF`, but no text was found, or redactions are very poor using the '**Local model - selectable text**' option.
49
+ A: This option only works if your `PDF` has actual selectable text. If your `PDF` is an image scan (even if it looks like text), this method won't work well.
50
+ * **Solution**:
51
+ * Try the '**Local OCR model - PDFs without selectable text**' option. This uses Tesseract OCR to "read" the text from images.
52
+ * For best results, especially with complex documents, handwriting, or signatures, use the '**AWS Textract service - all PDF types**' if available.
53
+
54
+ #### Q3: Handwriting or signatures are not being redacted properly.
55
+ A: The '**Local**' text/OCR methods (selectable text or Tesseract) struggle with handwriting and signatures.
56
+ * **Solution**:
57
+ * Use the '**AWS Textract service**' for text extraction.
58
+ * Ensure that on the main '**Redact PDFs/images**' tab, under "**Optional - select signature extraction**" (when **AWS Textract** is chosen), you have enabled handwriting and/or signature detection. Note that signature detection has higher cost implications.
59
+
60
+ #### Q4: The options for '**AWS Textract service**' or '**AWS Comprehend**' are missing or greyed out.
61
+ A: These services are typically only available when the app is running in an **AWS** environment or has been specifically configured by your system admin to access these services (e.g., via `API` keys).
62
+ * **Solution**:
63
+ * Check if your instance of the app is supposed to have **AWS** services enabled.
64
+ * If running outside **AWS**, see the "**Using AWS Textract and Comprehend when not running in an AWS environment**" section in the advanced guide. This involves configuring **AWS** access keys, which should be done with IT and data security approval.
65
+
66
+ #### Q5: I re-processed the same document, and it seems to be taking a long time and potentially costing more with **AWS** services. Can I avoid this?
67
+ A: Yes. If you have previously processed a document with **AWS Textract** or the **Local OCR** model, the app generates a `.json` output file (`..._textract.json` or `..._ocr_results_with_words.json`).
68
+ * **Solution**: When re-uploading your original document for redaction, also upload the corresponding `.json` file. The app should detect this (the "**Existing Textract output file found**" box may be checked), skipping the expensive text extraction step.
69
+
70
+ #### Q6: My app crashed, or I reloaded the page. Are my output files lost?
71
+ A: If you are logged in via **AWS Cognito** and the server hasn't been shut down, you might be able to recover them.
72
+ * **Solution**: Go to the '**Redaction settings**' tab, scroll to the bottom, and look for '**View all output files from this session**'.
73
+
74
+ #### Q7: My custom allow list (terms to never redact) or deny list (terms to always redact) isn't working.
75
+ A: There are a few common reasons:
76
+ * **File Format**: Ensure your list is a `.csv` file with terms in the first column only, with no column header.
77
+ * **Case Sensitivity**: Terms in the allow/deny list are case sensitive.
78
+ * **Deny List & 'CUSTOM' Entity**: For a deny list to work, you must select the '**CUSTOM**' entity type in '**Redaction settings**' under '**Entities to redact**'.
79
+ * **Manual Additions**: If you manually added terms in the app interface (under '**Manually modify custom allow...**'), ensure you pressed `Enter` after typing each term in its cell.
80
+ * **Fuzzy Search for Deny List**: If you intend to use fuzzy matching for your deny list, ensure '**CUSTOM_FUZZY**' is selected as an entity type, and you've configured the "**maximum number of spelling mistakes allowed.**"
81
+
82
+ #### Q8: I'm trying to review redactions, but the `PDF` in the viewer looks like it's already redacted with black boxes.
83
+ A: You likely uploaded the `...redacted.pdf` file instead of the original document.
84
+ * **Solution**: On the '**Review redactions**' tab, ensure you upload the original, unredacted `PDF` alongside the `..._review_file.csv`.
85
+
86
+ #### Q9: I can't move or pan the document in the '**Review redactions**' viewer when zoomed in.
87
+ A: You are likely in "**add redaction boxes**" mode.
88
+ * **Solution**: Scroll to the bottom of the document viewer pane and click the hand icon. This switches to "**modify mode**," allowing you to pan the document by clicking and dragging, and also to move/resize existing redaction boxes.
89
+
90
+ #### Q10: I accidentally clicked "**Exclude all items in table from redactions**" on the '**Review redactions**' tab without filtering, and now all my redactions are gone!
91
+ A: This can happen if you don't apply a filter first.
92
+ * **Solution**: Click the '**Undo last element removal**' button immediately. This should restore the redactions. Always ensure you have clicked the blue tick icon next to the search box to apply your filter before using "**Exclude all items...**".
93
+
94
+ #### Q11: Redaction of my `CSV` or `XLSX` file isn't working correctly.
95
+ A: The app expects a specific format for tabular data.
96
+ * **Solution**: Ensure your data file has a simple table format, with the table starting in the first cell (`A1`). There should be no other information or multiple tables within the sheet you intend to redact. For `XLSX` files, each sheet to be redacted must follow this format.
97
+
98
+ #### Q12: The "**Identify duplicate pages**" feature isn't finding duplicates I expect, or it's flagging too many pages.
99
+ A: This feature uses text similarity based on the `ocr_outputs.csv` files and has a default similarity threshold (e.g., 90%).
100
+ * **Solution**:
101
+ * Ensure you've uploaded the correct `ocr_outputs.csv` files for all documents you're comparing.
102
+ * Review the `page_similarity_results.csv` output to see the similarity scores. The 90% threshold might be too high or too low for your specific documents. The current version of the app described doesn't seem to allow changing this threshold in the `UI`, so you'd mainly use the output to inform your manual review.
103
+
104
+ #### Q13: I exported a review file to Adobe (`.xfdf`), but when I open it in Adobe Acrobat, it can't find the `PDF` or shows no redactions.
105
+ A: When **Adobe Acrobat** prompts you, it needs to be pointed to the exact original `PDF`.
106
+ * **Solution**: Ensure you select the original, unredacted `PDF` file that was used to generate the `..._review_file.csv` (and subsequently the `.xfdf` file) when **Adobe Acrobat** asks for the associated document.
107
+
108
+ #### Q14: My **AWS Textract API** job (submitted via "**Submit whole document to AWS Textract API...**") is taking a long time, or I don't know if it's finished.
109
+ A: Large documents can take time. The document estimates about five seconds per page as a rough guide.
110
+ * **Solution**:
111
+ * After submitting, a **Job ID** will appear.
112
+ * Periodically click the '**Check status of Textract job and download**' button. Processing continues in the background.
113
+ * Once ready, the `_textract.json` output will appear in the output area.
114
+
115
+ #### Q15: I'm trying to redact specific terms from my deny list, but they are not being picked up, even though the '**CUSTOM**' entity is selected.
116
+ A: The deny list matches whole words with exact spelling by default.
117
+ * **Solution**:
118
+ * Double-check the spelling and case in your deny list.
119
+ * If you expect misspellings to be caught, you need to use the '**CUSTOM_FUZZY**' entity type and configure the "**maximum number of spelling mistakes allowed**" under '**Redaction settings**'. Then, upload your deny list.
120
+
121
+ #### Q16: I set the "**Lowest page to redact**" and "**Highest page to redact**" in '**Redaction settings**', but the app still seems to process or show redactions outside this range.
122
+ A: The page range setting primarily controls which pages have redactions applied in the final `...redacted.pdf`. The underlying text extraction (especially with OCR/Textract) might still process the whole document to generate the `...ocr_results.csv` or `..._textract.json`. When reviewing, the `review_file.csv` might initially contain all potential redactions found across the document.
123
+ * **Solution**:
124
+ * Ensure the `...redacted.pdf` correctly reflects the page range.
125
+ * When reviewing, use the page navigation and filters on the '**Review redactions**' tab to focus on your desired page range. The final application of redactions from the review tab should also respect the range if it's still set, but primarily it works off the `review_file.csv`.
126
+
127
+ #### Q17: My "**Full page redaction list**" isn't working. I uploaded a `CSV` with page numbers, but those pages aren't blacked out.
128
+ A: Common issues include:
129
+ * **File Format**: Ensure your list is a `.csv` file with page numbers in the first column only, with no column header. Each page number should be on a new row.
130
+ * **Redaction Task**: Simply uploading the list doesn't automatically redact. You need to:
131
+ 1. Upload the `PDF` you want to redact.
132
+ 2. Upload the full page redaction `CSV` in '**Redaction settings**'.
133
+ 3. It's often best to deselect all other entity types in '**Redaction settings**' if you only want to redact these full pages.
134
+ 4. Run the '**Redact document**' process. The output `...redacted.pdf` should show the full pages redacted, and the `...review_file.csv` will list these pages.
135
+
136
+ #### Q18: I merged multiple `...review_file.csv` files, but the output seems to have duplicate redaction boxes or some are missing.
137
+ A: The merge feature simply combines all rows from the input review files.
138
+ * **Solution**:
139
+ * **Duplicates**: If the same redaction (same location, text, label) was present in multiple input files, it will appear multiple times in the merged file. You'll need to manually remove these duplicates on the '**Review redactions**' tab or by editing the merged `...review_file.csv` in a spreadsheet editor before review.
140
+ * **Missing**: Double-check that all intended `...review_file.csv` files were correctly uploaded for the merge. Ensure the files themselves contained the expected redactions.
141
+
142
+ #### Q19: I imported an `.xfdf` Adobe comment file, but the `review_file.csv` generated doesn't accurately reflect the highlights or comments I made in Adobe Acrobat.
143
+ A: The app converts Adobe's comment/highlight information into its review_file format. Discrepancies can occur if:
144
+ * **Comment Types**: The app primarily looks for highlight-style annotations that it can interpret as redaction areas. Other Adobe comment types (e.g., sticky notes without highlights, text strike-throughs not intended as redactions) might not translate.
145
+ * **Complexity**: Very complex or unusually shaped Adobe annotations might not convert perfectly.
146
+ * **PDF Version**: Ensure the `PDF` uploaded alongside the `.xfdf` is the exact same original, unredacted `PDF` that the comments were made on in Adobe.
147
+ * **Solution**: After import, always open the generated `review_file.csv` (with the original `PDF`) on the '**Review redactions**' tab to verify and adjust as needed.
148
+
149
+ #### Q20: The **Textract API** job status table (under "**Submit whole document to AWS Textract API...**") only shows recent jobs, or I can't find an older **Job ID** I submitted.
150
+ A: The table showing **Textract** job statuses might have a limit or only show jobs from the current session or within a certain timeframe (e.g., "up to seven days old" is mentioned).
151
+ * **Solution**:
152
+ * It's good practice to note down the **Job ID** immediately after submission if you plan to check it much later.
153
+ * If the `_textract.json` file was successfully created from a previous job, you can re-upload that `.json` file with your original `PDF` to bypass the `API` call and proceed directly to redaction or OCR conversion.
154
+
155
+ #### Q21: I edited a `...review_file.csv` in Excel (e.g., changed coordinates, labels, colors), but when I upload it to the '**Review redactions**' tab, the boxes are misplaced, the wrong color, or it causes errors.
156
+ A: The `review_file.csv` has specific columns and data formats (e.g., coordinates, `RGB` color tuples like `(0,0,255)`).
157
+ * **Solution**:
158
+ * **Coordinates (xmin, ymin, xmax, ymax)**: Ensure these are numeric and make sense for `PDF` coordinates. Drastic incorrect changes can misplace boxes.
159
+ * **Colors**: Ensure the color column uses the `(R,G,B)` format, e.g., `(0,0,255)` for blue, not hex codes or color names, unless the app specifically handles that (the guide mentions `RGB`).
160
+ * **CSV Integrity**: Ensure you save the file strictly as a `CSV`. Excel sometimes adds extra formatting or changes delimiters if not saved carefully.
161
+ * **Column Order**: Do not change the order of columns in the `review_file.csv`.
162
+ * **Test Small Changes**: Modify one or two rows/values first to see the effect before making bulk changes.
163
+
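+ If you've made bulk edits in a spreadsheet, a quick programmatic sanity check can catch formatting problems before you re-upload. Below is a minimal, hypothetical sketch in Python: the coordinate column names follow the guidance above, but the colour column name (`color`) and the file name are assumptions, so check them against a `review_file.csv` actually produced by the app.
+
+ ```python
+ # Hypothetical sanity check for a hand-edited review file before re-upload.
+ # The "color" column name and file name are assumptions - verify against a real review_file.csv.
+ import ast
+ import pandas as pd
+
+ df = pd.read_csv("example_review_file.csv")
+
+ # Coordinates should all be numeric
+ for col in ["xmin", "ymin", "xmax", "ymax"]:
+     assert pd.to_numeric(df[col], errors="coerce").notna().all(), f"Non-numeric value in {col}"
+
+ # Colours should parse as (R, G, B) tuples of three values
+ for value in df["color"]:
+     rgb = ast.literal_eval(str(value))
+     assert isinstance(rgb, tuple) and len(rgb) == 3, f"Unexpected colour format: {value}"
+
+ print("Review file looks structurally OK")
+ ```
+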
164
+ #### Q22: The cost and time estimation feature isn't showing up, or it's giving unexpected results.
165
+ A: This feature depends on admin configuration and certain conditions.
166
+ * **Solution**:
167
+ * **Admin Enabled**: Confirm with your system admin that the cost/time estimation feature is enabled in the app's configuration.
168
+ * **AWS Services**: Estimation is typically most relevant when using **AWS Textract** or **Comprehend**. If you're only using '**Local**' models, the estimation might be simpler or not show **AWS**-related costs.
169
+ * **Existing Output**: If "**Existing Textract output file found**" is checked (because you uploaded a pre-existing `_textract.json`), the estimated cost and time should be significantly lower for the **Textract** part of the process.
170
+
171
+ #### Q23: I'm prompted for a "**cost code**," but I don't know what to enter, or my search isn't finding it.
172
+ A: Cost code selection is an optional feature enabled by system admins for tracking **AWS** usage.
173
+ * **Solution**:
174
+ * **Contact Admin/Team**: If you're unsure which cost code to use, consult your team lead or the system administrator who manages the redaction app. They should provide the correct code or guidance.
175
+ * **Search Tips**: Try searching by project name, department, or any known identifiers for your cost center. The search might be case-sensitive or require exact phrasing.
176
+
177
+ #### Q24: I selected "**hash**" as the anonymisation output format for my tabular data, but the output still shows "**REDACTED**" or something else.
178
+ A: Ensure the selection was correctly registered before redacting.
179
+ * **Solution**:
180
+ * Double-check on the '**Open text or Excel/csv files**' tab, under '**Anonymisation output format**,' that "**hash**" (or your desired format) is indeed selected.
181
+ * Try re-selecting it and then click '**Redact text/data files**' again.
182
+ * If the issue persists, it might be a bug or a specific interaction with your data type that prevents hashing. Report this to your app administrator. "**Hash**" should replace PII with a consistent unique `ID` for each unique piece of PII.
183
+
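+ As an illustration of what consistent hashing means in practice, the sketch below shows how the same piece of PII always maps to the same ID. This is a conceptual example only; the app's actual hashing scheme may differ.
+
+ ```python
+ # Conceptual example only: a consistent ID per unique piece of PII.
+ import hashlib
+
+ def hash_pii(value: str) -> str:
+     return "ID_" + hashlib.sha256(value.encode("utf-8")).hexdigest()[:8]
+
+ print(hash_pii("Jane Smith"))                            # always the same output for the same input
+ print(hash_pii("Jane Smith") == hash_pii("Jane Smith"))  # True: same PII, same ID
+ ```
+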
184
+ #### Q25: I'm using '**CUSTOM_FUZZY**' for my deny list. I have "**Should fuzzy search match on entire phrases in deny list**" checked, but it's still matching individual words within my phrases or matching things I don't expect.
185
+ A: Fuzzy matching on entire phrases can be complex. The "**maximum number of spelling mistakes allowed**" applies to the entire phrase.
186
+ * **Solution**:
187
+ * **Mistake Count**: If your phrase is long and the allowed mistakes are few, it might not find matches if the errors are distributed. Conversely, too many allowed mistakes on a short phrase can lead to over-matching. Experiment with the mistake count.
188
+ * **Specificity**: If "**match on entire phrases**" is unchecked, it will fuzzy match each individual word (excluding stop words) in your deny list phrases. This can be very broad. Ensure this option is set according to your needs.
189
+ * **Test with Simple Phrases**: Try a very simple phrase with a known, small number of errors to see if the core fuzzy logic is working as you expect, then build up complexity.
190
+
191
+ #### Q26: I "**locked in**" a new redaction box format on the '**Review redactions**' tab (label, colour), but now I want to change it or go back to the pop-up for each new box.
192
+ A: When a format is locked, a new icon (described as looking like a "**gift tag**") appears at the bottom of the document viewer.
193
+ * **Solution**:
194
+ * Click the "**gift tag**" icon at the bottom of the document viewer pane.
195
+ * This will allow you to change the default locked format.
196
+ * To go back to the pop-up appearing for each new box, click the lock icon within that "**gift tag**" menu again to "**unlock**" it (it should turn from blue to its original state).
197
+
198
+ #### Q27: I clicked "**Redact document**," processing seemed to complete (e.g., progress bar finished, "complete" message shown), but no output files (`...redacted.pdf`, `...review_file.csv`) appeared in the output area.
199
+ A: This could be due to various reasons:
200
+ * **No PII Found**: If absolutely no PII was detected according to your settings (entities, allow/deny lists), the app might not generate a `...redacted.pdf` if there's nothing to redact, though a `review_file.csv` (potentially empty) and `ocr_results.csv` should still ideally appear.
201
+ * **Error During File Generation**: An unhandled error might have occurred silently during the final file creation step.
202
+ * **Browser/UI Issue**: The `UI` might not have refreshed to show the files.
203
+ * **Permissions**: In rare cases, if running locally, there might be file system permission issues preventing the app from writing outputs.
204
+ * **Solution**:
205
+ * Try refreshing the browser page (if feasible without losing input data, or after re-uploading).
206
+ * Check the '**Redaction settings**' tab for '**View all output files from this session**' (if logged in via Cognito) – they might be listed there.
207
+ * Try a very simple document with obvious PII and default settings to see if any output is generated.
208
+ * Check browser developer console (`F12`) for any error messages.
209
+
210
+ #### Q28: When reviewing, I click on a row in the '**Search suggested redactions**' table. The page changes, but the specific redaction box isn't highlighted, or the view doesn't scroll to it.
211
+ A: The highlighting feature (which should change the colour of the selected redaction box to blue) is an aid rather than a guarantee.
212
+ * **Solution**:
213
+ * Ensure you are on the correct page. The table click should take you there.
214
+ * The highlighting might be subtle or conflict with other `UI` elements. Manually scan the page for the text/label mentioned in the table row.
215
+ * Scrolling to the exact box isn't explicitly guaranteed, especially on very dense pages. The main function is page navigation.
216
+
217
+ #### Q29: I rotated a page in the '**Review redactions**' document viewer, and now all subsequent pages are also rotated, or if I navigate away and back, the rotation is lost.
218
+ A: The `README` states: "**When you switch page, the viewer will stay in your selected orientation, so if it looks strange, just rotate the page again and hopefully it will look correct!**"
219
+ * **Solution**:
220
+ * The rotation is a viewing aid for the current page session in the viewer. It does not permanently alter the original `PDF`.
221
+ * If subsequent pages appear incorrectly rotated, use the rotation buttons again for that new page.
222
+ * The rotation state might reset if you reload files or perform certain actions. Simply re-apply rotation as needed for viewing.
src/installation_guide.qmd ADDED
@@ -0,0 +1,233 @@
1
+ ---
2
+ title: "App installation guide (with CDK)"
3
+ format:
4
+ html:
5
+ toc: true # Enable the table of contents
6
+ toc-depth: 3 # Include headings up to level 2 (##)
7
+ toc-title: "On this page" # Optional: Title for your TOC
8
+ ---
9
+
10
+ # Introduction
11
+
12
+ This guide gives an overview of how to install the app in an AWS environment using the code in the `cdk/` folder of this GitHub repo. The most important thing you need is some familiarity with AWS and how to use it via the console or command line, as well as administrator access to at least one region. Then follow the steps below.
13
+
14
+ ## Prerequisites
15
+
16
+ * Install git on your computer from: [https://git-scm.com](https://git-scm.com)
17
+ * You will also need to install nodejs and npm: [https://docs.npmjs.com/downloading-and-installing-node-js-and-npm](https://docs.npmjs.com/downloading-and-installing-node-js-and-npm)
18
+ * You will need an AWS Administrator account in your desired region to install.
19
+ * You will need AWS CDK v2 installed: [https://docs.aws.amazon.com/cdk/v2/guide/getting_started.html](https://docs.aws.amazon.com/cdk/v2/guide/getting_started.html)
20
+ * You will need to bootstrap the environment with CDK in both your primary region, and `us-east-1` if installing CloudFront and associated WAF.
21
+ ```bash
22
+ # Bootstrap your primary region
23
+ cdk bootstrap aws://<YOUR_AWS_ACCOUNT>/eu-west-1
24
+
25
+ # Bootstrap the us-east-1 region
26
+ cdk bootstrap aws://<YOUR_AWS_ACCOUNT>/us-east-1
27
+ ```
28
+ * In command line, write:
29
+ ```bash
30
+ git clone https://github.com/seanpedrick-case/doc_redaction.git
31
+ ```
32
+
33
+ # VPC and ACM Certificate
34
+
35
+ This CDK code is designed to work within an existing VPC; it does not create a new VPC for you. If you don't already have one, you will need to create it yourself.
36
+
37
+ Additionally, to get full HTTPS data transfer through the app, you will need an SSL certificate registered with AWS Certificate Manager.
38
+
39
+ You can either use the SSL certificate from a domain, or import an existing certificate into Certificate Manager. Ask your IT admin if you need help with this.
40
+
41
+ ## If getting an SSL certificate for an existing domain
42
+
43
+ Make sure to point the certificate to `*.<domain-name>`.
44
+
45
+ Update your DNS records to include the CNAME record given by AWS. After your stack has been created, you will also need to create a CNAME DNS record for your domain pointing to your load balancer DNS with a subdomain, e.g., `redaction.<domain-name>`.
46
+
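+ For illustration, the record pointing your subdomain at the load balancer would look something like the following in your DNS provider's settings (the values are placeholders):
+
+ ```
+ redaction.<domain-name>    CNAME    <your-load-balancer-DNS-name>
+ ```
+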
47
+ 1. Create a Python environment and install the packages from `requirements.txt`.
48
+
49
+ You need a `cdk.json` file in the `cdk` folder. It should contain the following:
50
+
51
+ ```json
52
+ {
53
+ "app": "<PATH TO PYTHON ENVIRONMENT FOLDER WHERE REQUIREMENTS HAVE BEEN LOADED>python.exe app.py",
54
+ "context": {
55
+ "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true,
56
+ "@aws-cdk/core:stackRelativeExports": true,
57
+ "@aws-cdk/aws-rds:lowercaseDbIdentifier": true,
58
+ "@aws-cdk/aws-lambda:recognizeVersionProps": true,
59
+ "@aws-cdk/aws-lambda:recognizeLayerVersion": true,
60
+ "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true,
61
+ "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true,
62
+ "@aws-cdk/core:newStyleStackSynthesis": true,
63
+ "aws-cdk:enableDiffNoFail": true,
64
+ "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": true,
65
+ "@aws-cdk/aws-apigateway:disableCloudWatchRole": true,
66
+ "@aws-cdk/core:target-partitions": [
67
+ "aws",
68
+ "aws-cn"
69
+ ]
70
+ }
71
+ }
72
+ ```
73
+
74
+ 2. Create a `cdk_config.env` file in the `config` subfolder. As a minimum, it is useful to put the following details in this env file (below are example values; other possible variables can be seen in `cdk_config.py` in the `cdk` folder).
75
+
76
+ ```ini
77
+ CDK_PREFIX=example-prefix # Prefix to most created elements in your stack
78
+ VPC_NAME=example-vpc-name # Name of the VPC within which all the other elements will be created
79
+ AWS_REGION=us-west-1 # Region where elements will be created
80
+ AWS_ACCOUNT_ID=1234567890 # AWS account ID that has administrator access that you will use for deploying the stack
81
+ CDK_FOLDER=C:/path_to_cdk_folder/ # The place where the cdk folder code is located
82
+ CONTEXT_FILE=C:/path_to_cdk_folder/cdk.context.json
83
+ EXISTING_IGW_ID=igw-1234567890 # (optional) The ID for an existing internet gateway that you want to use instead of creating a new one
84
+ SINGLE_NAT_GATEWAY_ID=nat-123456789 # (optional) The ID for an existing NAT gateway that you want to use instead of creating a new one
85
+ COGNITO_USER_POOL_DOMAIN_PREFIX=lambeth-redaction-37924 # The prefix of the login / user sign up domain that you want to use with Cognito login. Should not contain the terms amazon, aws, or cognito.
86
+ RUN_USEAST_STACK=False # Set this to True only if you have permissions to create a Cloudfront distribution and web ACL on top of it in the us-east-1 region. If you don't, the section below shows how you can create the CloudFront resource manually and map it to your application load balancer (as you should have permissions for that if you are admin in your region).
87
+ ```
88
+
89
+ **Note: If you are using an SSL certificate with Cognito login on the application load balancer, you can set COGNITO_AUTH to 0 above, as you don't need the second login step to get to the app**
90
+
91
+ # Subnets
92
+
93
+ ### NOTE: I would generally advise creating new subnets, as then you can be sure about connectivity between the AWS resources that underpin your app.
94
+
95
+ * If you set no subnets, the app will try to use existing private and public subnets. This approach is risky, as the app may overlap with IP addresses assigned to existing AWS resources. It is advised to at least specify existing subnets that you know are available, or to create your own using one of the methods below.
96
+
97
+ * If you want to use existing subnets, you can list them in the following environment variables:
98
+ * `PUBLIC_SUBNETS_TO_USE=["PublicSubnet1", "PublicSubnet2", "PublicSubnet3"]`
99
+ * `PRIVATE_SUBNETS_TO_USE=["PrivateSubnet1", "PrivateSubnet2", "PrivateSubnet3"]`
100
+
101
+ * If you want to create new subnets, you need to also specify CIDR blocks and availability zones for the new subnets. The app will check with you upon deployment whether these CIDR blocks are available before trying to create.
102
+ * `PUBLIC_SUBNET_CIDR_BLOCKS=['10.222.333.0/28', '10.222.333.16/28', '10.222.333.32/28']`
103
+ * `PUBLIC_SUBNET_AVAILABILITY_ZONES=['eu-east-1a', 'eu-east-1b', 'eu-east-1c']`
104
+ * `PRIVATE_SUBNET_CIDR_BLOCKS=['10.222.333.48/28', '10.222.333.64/28', '10.222.333.80/28']`
105
+ * `PRIVATE_SUBNET_AVAILABILITY_ZONES=['eu-east-1a', 'eu-east-1b', 'eu-east-1c']`
106
+
107
+ If you try to create subnets in invalid CIDR blocks / availability zones, the console output will tell you, and it will show the currently occupied CIDR blocks to help you find space for the new subnets you want to create.
108
+
109
+ 3. In the command line, go to your `cdk` folder in the redaction app folder and run `cdk deploy --all`. This will attempt to deploy the first stack defined in the `app.py` file.
110
+
111
+ Hopefully everything will deploy successfully and you will be able to see your new stack in CloudFormation in the AWS console.
112
+
113
+ 4. Tasks for after CDK deployment
114
+
115
+ # Tasks performed by `post_cdk_build_quickstart.py`
116
+
117
+ **Note:** The following tasks are performed by the `post_cdk_build_quickstart.py` file that you can find in the `cdk` folder. You will need to run this while logged in with AWS SSO through the command line. The steps below describe how to do the same tasks in the AWS console, in case the `.py` file doesn't work for you.
118
+
119
+ ## Codebuild
120
+
121
+ You need to run the CodeBuild project after the stack has finished building, as there will be no container image in ECR yet.
122
+
123
+ Go to CodeBuild -> your project -> click Start build. Check the logs; the build should be progressing.
124
+
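+ If you prefer to script this step (similar in spirit to what `post_cdk_build_quickstart.py` does, though the actual script may differ), a minimal boto3 sketch is shown below; the project name is a placeholder.
+
+ ```python
+ # Hedged sketch: start the CodeBuild project programmatically.
+ # "your-project-name" is a placeholder - use the project created by the stack.
+ import boto3
+
+ codebuild = boto3.client("codebuild")
+ response = codebuild.start_build(projectName="your-project-name")
+ print(response["build"]["id"], response["build"]["buildStatus"])
+ ```
+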
125
+ ## Create a `config.env` file and upload to S3
126
+
127
+ The Fargate task definition references a `config.env` file.
128
+
129
+ You need to create a `config.env` file to upload to the S3 bucket, containing the following variables:
130
+
131
+ ```ini
132
+ COGNITO_AUTH=1
133
+ RUN_AWS_FUNCTIONS=1
134
+ SESSION_OUTPUT_FOLDER=True # If this is False it currently seems to fail to allow for writable log directories
135
+ ```
136
+
137
+ Go to S3 and choose the new `...-logs` bucket that you created. Upload the `config.env` file into this bucket.
138
+
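+ Alternatively, this upload can be scripted; a minimal boto3 sketch (the bucket name is a placeholder):
+
+ ```python
+ # Hedged sketch: upload config.env to the logs bucket created by the stack.
+ import boto3
+
+ s3 = boto3.client("s3")
+ s3.upload_file("config.env", "your-prefix-logs-bucket-name", "config.env")
+ ```
+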
139
+ ## Update Elastic Container Service
140
+
141
+ Now that the app container is in Elastic Container Registry, you can proceed to run the app on a Fargate server.
142
+
143
+ Go to your new cluster, your new service, and select 'Update service'.
144
+
145
+ Select 'Force new deployment', and then set 'Desired number of tasks' to 1.
146
+
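+ The same step can also be scripted; a minimal boto3 sketch (cluster and service names are placeholders):
+
+ ```python
+ # Hedged sketch: force a new deployment and set the desired task count to 1.
+ import boto3
+
+ ecs = boto3.client("ecs")
+ ecs.update_service(
+     cluster="your-cluster-name",
+     service="your-service-name",
+     desiredCount=1,
+     forceNewDeployment=True,
+ )
+ ```
+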
147
+ # Additional Manual Tasks
148
+
149
+ # Update DNS records for your domain (If using a domain for the SSL certificate)
150
+
151
+ To do this, you need to create a CNAME DNS record on a subdomain of your main domain registration (e.g., `redaction.<domain-name>`) pointing to your load balancer's DNS name.
152
+
153
+ # Cognito
154
+
155
+ Go to Cognito and create a user with your own email address. Generate a password.
156
+
157
+ Go to Cognito -> App clients -> Login pages -> View login page.
158
+
159
+ Enter the email and temporary password details that come in the email (don't include the last full stop!).
160
+
161
+ Change your password.
162
+
163
+ ## Set MFA (optional)
164
+ On the Cognito user pool page you can also enable MFA, if you are using an SSL certificate with Cognito login on the Application Load Balancer. Go to Cognito -> your user pool -> Sign in -> Multi-factor authentication.
165
+
166
+ # Create CloudFront distribution
167
+ **Note: this is only relevant if you set `RUN_USEAST_STACK` to 'False' during CDK deployment**
168
+
169
+ If you were not able to create a CloudFront distribution via CDK, you should be able to do it through the console. I would advise using CloudFront as the front end to the app.
170
+
171
+ Create a new CloudFront distribution.
172
+
173
+ * **If you have used an SSL certificate in your CDK code:**
174
+ * **For Origin:**
175
+ * Choose the domain name associated with the certificate as the origin.
176
+ * Choose HTTPS only as the protocol.
177
+ * Keep everything else default.
178
+ * **For Behavior (modify default behavior):**
179
+ * Under Viewer protocol policy choose 'Redirect HTTP to HTTPS'.
180
+
181
+ * **If you have not used an SSL certificate in your CDK code:**
182
+ * **For Origin:**
183
+ * Choose your elastic load balancer as the origin. This will fill in the elastic load balancer DNS.
184
+ * Choose HTTP only as the protocol.
185
+ * Keep everything else default.
186
+ * **For Behavior (modify default behavior):**
187
+ * Under Viewer protocol policy choose 'HTTP and HTTPS'.
188
+
189
+ ## Security features
190
+
191
+ In your CloudFront distribution, under 'Security' -> Edit -> Enable security protections.
192
+
193
+ Choose rate limiting (default is fine).
194
+
195
+ Create.
196
+
197
+ In CloudFront geographic restrictions -> Countries -> choose an Allow list of countries.
198
+
199
+ Click again on Edit.
200
+
201
+ With AWS WAF protection enabled, you should see a link titled 'View details of your configuration'.
202
+
203
+ Go to Rules -> `AWS-AWSManagedRulesCommonRuleSet`, click Edit.
204
+
205
+ Under `SizeRestrictions_BODY` choose rule action override 'Override to Allow'. This is needed to allow for file upload to the app.
206
+
207
+ # Change Cognito redirection URL to your CloudFront distribution
208
+
209
+ Go to Cognito -> your user pool -> App Clients -> Login pages -> Managed login configuration.
210
+
211
+ Ensure that the callback URL is:
212
+ * If not using an SSL certificate and Cognito login - `https://<CloudFront domain name>`
213
+ * If using an SSL certificate, you should have three:
214
+ * `https://<CloudFront domain name>`
215
+ * `https://<CloudFront domain name>/oauth2/idpresponse`
216
+ * `https://<CloudFront domain name>/oauth/idpresponse`
217
+
218
+ # Force traffic to come from specific CloudFront distribution (optional)
219
+
220
+ Note that this only potentially helps with security if you are not using an SSL certificate with Cognito login on your application load balancer.
221
+
222
+ Go to EC2 - Load Balancers -> Your load balancer -> Listeners -> Your listener -> Add rule.
223
+
224
+ * Add Condition -> Host header.
225
+ * Change Host header value to your CloudFront distribution without the `https://` or `http://` at the front.
226
+ * Forward to redaction target group.
227
+ * Turn on group stickiness for 12 hours.
228
+ * Next.
229
+ * Choose priority 1.
230
+
231
+ Then, change the default listener rule.
232
+
233
+ * Under Routing action change to 'Return fixed response'.
src/management_guide.qmd ADDED
@@ -0,0 +1,226 @@
1
+ ---
2
+ title: "User and AWS instance management guide"
3
+ format:
4
+ html:
5
+ toc: true # Enable the table of contents
6
+ toc-depth: 3 # Include headings up to level 2 (##)
7
+ toc-title: "On this page" # Optional: Title for your TOC
8
+ ---
9
+
10
+ This guide gives an overview of how to manage users of the redaction app, and how to start, stop, and manage instances of the app running on AWS Cloud.
11
+
12
+ # User management guide
13
+
14
+ This guide provides an overview for administrators to manage users within an AWS Cognito User Pool, specifically for an application utilising phone-app-based Two-Factor Authentication (2FA).
15
+
16
+ ## Managing Users in AWS Cognito User Pools
17
+
18
+ AWS Cognito User Pools provide a secure and scalable user directory for your applications. This guide focuses on common administrative tasks within the AWS Management Console.
19
+
20
+ ### Accessing Your User Pool
21
+
22
+ 1. Log in to the AWS Management Console.
23
+ 2. Navigate to **Cognito** (you can use the search bar).
24
+ 3. In the left navigation pane, select **User Pools**.
25
+ 4. Click on the name of the user pool associated with your redaction app.
26
+
27
+ ### Creating Users
28
+
29
+ Creating a new user in Cognito involves setting their initial credentials and attributes.
30
+
31
+ 1. From your User Pool's dashboard, click on the **Users** tab.
32
+ 2. Click the **Create user** button.
33
+ 3. **Username:** Enter a unique username for the user. This is what they will use to log in.
34
+ 4. **Temporary password:**
35
+ * Select **Generate a password** to have Cognito create a strong, temporary password.
36
+ * Alternatively, you can choose **Set a password** and enter one manually. If you do this, ensure it meets the password policy configured for your user pool.
37
+ * **Important:** Cognito will typically require users to change this temporary password upon their first login.
38
+ 5. **Email:** Enter the user's email address. This is crucial for communication and potentially for password recovery if configured.
39
+ 6. **Phone number (optional):** The phone number is not needed for login or user management in this app, so you can leave this blank.
40
+ 7. **Mark email as verified/Mark phone number as verified:** For new users, you can choose to automatically verify their email and/or phone number. If unchecked, the user might need to verify these themselves during the signup process (depending on your User Pool's verification settings).
41
+ 8. **Groups (optional):** If you have defined groups in your user pool, you can add the user to relevant groups here. Groups are useful for managing permissions and access control within your application.
42
+ 9. Click **Create user**.
43
+
44
+ ### Information to Give to Users to Sign Up
45
+
46
+ Once a user is created, they'll need specific information to access the application.
47
+
48
+ * **Application URL:** The web address of your redaction app's login page.
49
+ * **Username:** The username you created for them in Cognito.
50
+ * **Temporary Password:** The temporary password you generated or set.
51
+ * **Instructions for First Login:**
52
+ * "Upon your first login, you will be prompted to change your temporary password to a new, secure password."
53
+ * "You will also need to set up Two-Factor Authentication using a phone authenticator app (e.g., Google Authenticator, Authy)."
54
+
55
+ ### Resetting User Access (Password Reset)
56
+
57
+ If a user forgets their password or needs their access reset, you can do this in the console.
58
+
59
+ 1. From your User Pool's dashboard, click on the **Users** tab.
60
+ 2. Locate the user you wish to reset. You can use the search bar.
61
+ 3. Click on the user's username.
62
+ 4. On the user details page, click the **Reset password** button.
63
+ 5. Cognito will generate a new temporary password and mark the user to change it on next login.
64
+ 6. **Important:** You will need to communicate this new temporary password to the user securely.
65
+
66
+ ### Two-Factor Authentication (2FA) with Apps Only
67
+
68
+ Your application uses phone app-based 2FA. This section covers what administrators need to know.
69
+
70
+ #### How it Works for the User
71
+
72
+ When a user logs in for the first time or when 2FA is enabled for their account, they will be prompted to set up 2FA. This typically involves:
73
+
74
+ 1. **Scanning a QR Code:** The application will display a QR code.
75
+ 2. **Using an Authenticator App:** The user opens their authenticator app (e.g., Google Authenticator, Authy, Microsoft Authenticator) and scans the QR code.
76
+ 3. **Entering a Code:** The authenticator app will generate a time-based one-time password (TOTP). The user enters this code into the application to verify the setup.
77
+
78
+ #### Administrator's Role in 2FA
79
+
80
+ As an administrator, you generally don't directly "set up" the user's 2FA device in the console. The user performs this self-enrollment process within the application. However, you can manage the 2FA status of a user:
81
+
82
+ 1. **Enabling/Disabling 2FA for a User:**
83
+ * From your User Pool's dashboard, click on the **Users** tab.
84
+ * Click on the user's username.
85
+ * Under the "Multi-factor authentication (MFA)" section, you'll see the current MFA status.
86
+ * If 2FA is not enabled, you might have the option to "Enable MFA" for the user. If your user pool requires 2FA, it might be automatically enabled upon signup.
87
+ * You can also **Disable MFA** for a user if necessary. This will remove their registered 2FA device and they will no longer be prompted for a 2FA code during login until they re-enroll.
88
+ 2. **Removing a User's 2FA Device:** If a user loses their phone or needs to re-configure 2FA, you can remove their existing MFA device.
89
+ * On the user's details page, under the "Multi-factor authentication (MFA)" section, you will see a list of registered MFA devices (if any).
90
+ * Select the device and click **Remove**.
91
+ * The next time the user logs in, they will be prompted to set up 2FA again.
92
+
93
+ ### Other Useful Information for Administrators
94
+
95
+ * **User Status:** In the "Users" tab, you'll see the status of each user (e.g., `CONFIRMED`, `UNCONFIRMED`, `FORCE_CHANGE_PASSWORD`, `ARCHIVED`, `COMPROMISED`).
96
+ * `CONFIRMED`: User has confirmed their account and set their password.
97
+ * `UNCONFIRMED`: User has been created but hasn't confirmed their account (e.g., through email verification) or changed their temporary password.
98
+ * `FORCE_CHANGE_PASSWORD`: User must change their password on next login.
99
+ * **Searching and Filtering Users:** The "Users" tab provides search and filtering options to quickly find specific users or groups of users.
100
+ * **User Attributes:** You can view and sometimes edit user attributes (like email, phone number, custom attributes) on the user's detail page.
101
+ * **Groups:**
102
+ * You can create and manage groups under the **Groups** tab of your User Pool.
103
+ * Groups are useful for organising users and applying different permissions or configurations through AWS Identity and Access Management (IAM) roles.
104
+ * **User Pool Settings:**
105
+ * Explore the various settings under the **User Pool Properties** tab (e.g., Policies, MFA and verifications, Message customisations).
106
+ * **Policies:** Define password complexity requirements.
107
+ * **MFA and verifications:** Configure whether MFA is optional, required, or disabled, and the types of MFA allowed (SMS, TOTP). Ensure "Authenticator apps" is enabled for your setup.
108
+ * **Message customisations:** Customise the email and SMS messages sent by Cognito (e.g., for verification codes, password resets).
109
+ * **Monitoring and Logging:**
110
+ * Integrate your Cognito User Pool with AWS CloudWatch to monitor user activities and potential issues.
111
+ * Enable CloudTrail logging for Cognito to track API calls and administrative actions.
112
+ * **Security Best Practices:**
113
+ * Always use strong, unique passwords for your AWS console login.
114
+ * Enable MFA for your AWS console login.
115
+ * Regularly review user access and permissions.
116
+ * Educate users on strong password practices and the importance of 2FA.
117
+
118
+ By understanding these features and following best practices, administrators can effectively manage users within their AWS Cognito User Pool, ensuring secure and smooth operation of their redaction application.
119
+
120
+ # Guide to running app instances on AWS
121
+
122
+ This guide provides basic instructions for administrators to manage service tasks within AWS Elastic Container Service (ECS) using the AWS Management Console, focusing on scaling services on and off and forcing redeployments.
123
+
124
+ ## Basic Service Task Management in AWS ECS Console
125
+
126
+ AWS Elastic Container Service (ECS) allows you to run, stop, and manage Docker containers on a cluster. This guide focuses on managing your ECS *services*, which maintain a desired number of tasks (container instances).
127
+
128
+ ### Accessing Your ECS Cluster and Services
129
+
130
+ 1. Log in to the AWS Management Console.
131
+ 2. Navigate to **ECS (Elastic Container Service)** (you can use the search bar).
132
+ 3. In the left navigation pane, select **Clusters**.
133
+ 4. Click on the name of the ECS cluster where your redaction app's service is running.
134
+
135
+ ### Understanding Services and Tasks
136
+
137
+ Before we dive into management, let's clarify key concepts:
138
+
139
+ * **Task Definition:** A blueprint for your application. It specifies the Docker image, CPU, memory, environment variables, port mappings, and other configurations for your containers.
140
+ * **Task:** An actual running instance of a task definition. It's an individual container or a set of tightly coupled containers running together.
141
+ * **Service:** A mechanism that allows you to run and maintain a specified number of identical tasks simultaneously in an ECS cluster. The service ensures that if a task fails or stops, it's replaced. It also handles load balancing and scaling.
142
+
143
+ ### Setting the Number of Running Tasks to 0 (Turning Everything Off)
144
+
145
+ Setting the desired number of tasks to 0 for a service effectively "turns off" your application by stopping all its running containers.
146
+
147
+ 1. From your Cluster's dashboard, click on the **Services** tab.
148
+ 2. Locate the service associated with your redaction app (e.g., `redaction-app-service`).
149
+ 3. Select the service by checking the box next to its name.
150
+ 4. Click the **Update** button.
151
+ 5. On the "Configure service" page, find the **Number of tasks** field.
152
+ 6. Change the value in this field to `0`.
153
+ 7. Scroll to the bottom and click **Update service**.
154
+
155
+ **What happens next:**
156
+
157
+ * ECS will begin terminating all running tasks associated with that service.
158
+ * The "Running tasks" count for your service will gradually decrease to 0.
159
+ * Your application will become inaccessible as its containers are stopped.
160
+
161
+ **Important Considerations:**
162
+
163
+ * **Cost Savings:** Setting tasks to 0 can save costs by stopping the consumption of compute resources (CPU, memory) for your containers.
164
+ * **Associated Resources:** This action *only* stops the ECS tasks. It does not stop underlying EC2 instances (if using EC2 launch type), associated databases, load balancers, or other AWS resources. You'll need to manage those separately if you want to completely shut down your environment.
165
+ * **Container Images:** Your Docker images will still reside in Amazon ECR (or wherever you store them).
166
+ * **Downtime:** This action will cause immediate downtime for your application.
167
+
168
+ ### Turning the Desired Number of Tasks On
169
+
170
+ To bring your application back online, you'll set the desired number of tasks to your operational value (usually 1 or more).
171
+
172
+ 1. From your Cluster's dashboard, click on the **Services** tab.
173
+ 2. Locate the service associated with your redaction app.
174
+ 3. Select the service by checking the box next to its name.
175
+ 4. Click the **Update** button.
176
+ 5. On the "Configure service" page, find the **Number of tasks** field.
177
+ 6. Change the value in this field to your desired number of running tasks (e.g., `1`, `2`, etc.).
178
+ 7. Scroll to the bottom and click **Update service**.
179
+
180
+ **What happens next:**
181
+
182
+ * ECS will begin launching new tasks based on your service's configuration and task definition.
183
+ * The "Running tasks" count will increase until it reaches your desired number.
184
+ * Once tasks are running and healthy (according to your health checks), your application should become accessible again.
185
+
186
+ **Important Considerations:**
187
+
188
+ * **Startup Time:** Allow some time for tasks to pull images, start containers, and pass health checks before your application is fully available.
189
+ * **Resource Availability:** Ensure your ECS cluster has sufficient available resources (EC2 instances or Fargate capacity) to launch the desired number of tasks.
190
+
191
+ ### Forcing Redeployment
192
+
193
+ Forcing a redeployment is useful when you've updated your task definition (e.g., pushed a new Docker image, changed environment variables) but the service hasn't automatically picked up the new version. It's also useful for "restarting" a service.
194
+
195
+ 1. From your Cluster's dashboard, click on the **Services** tab.
196
+ 2. Locate the service you want to redeploy.
197
+ 3. Select the service by checking the box next to its name.
198
+ 4. Click the **Update** button.
199
+ 5. On the "Configure service" page, scroll down to the **Deployment options** section.
200
+ 6. Check the box next to **Force new deployment**.
201
+ 7. Scroll to the bottom and click **Update service**.
202
+
203
+ **What happens next:**
204
+
205
+ * ECS will initiate a new deployment for your service.
206
+ * It will launch new tasks using the *latest active task definition revision* associated with your service.
207
+ * Existing tasks will be drained and terminated according to your service's deployment configuration (e.g., `minimum healthy percent`, `maximum percent`).
208
+ * This process effectively replaces all running tasks with fresh instances.
209
+
210
+ **Important Considerations:**
211
+
212
+ * **Latest Task Definition:** Ensure you have activated the correct and latest task definition revision before forcing a new deployment if your intention is to deploy new code. You can update the task definition used by a service via the "Update" service flow.
213
+ * **Downtime (minimal if configured correctly):** If your service has a properly configured load balancer and healthy deployment settings (e.g., blue/green or rolling updates), forced redeployments should result in minimal to no downtime. ECS will bring up new tasks before shutting down old ones.
214
+ * **Troubleshooting:** If a deployment gets stuck or tasks fail to start, check the "Events" tab of your service for error messages. Also, check the CloudWatch logs for your tasks.
215
+
216
+ ### Other Useful Information for Administrators
217
+
218
+ * **Service Events:** On your service's detail page, click the **Events** tab. This provides a chronological log of actions taken by the ECS service, such as task launches, stops, and scaling events. This is invaluable for troubleshooting.
219
+ * **Tasks Tab:** On your service's detail page, click the **Tasks** tab to see a list of all individual tasks running (or recently stopped) for that service. You can click on individual tasks to view their details, including logs, network configuration, and CPU/memory utilisation.
220
+ * **Logs:** For each task, you can often find a link to its CloudWatch Logs under the "Logs" section of the task details. This is critical for debugging application errors.
221
+ * **Metrics:** The **Metrics** tab on your service provides graphs for CPU utilisation, memory utilisation, and the number of running tasks, helping you monitor your service's performance.
222
+ * **Deployment Configuration:** When updating a service, review the **Deployment options** section. This allows you to control how new deployments are rolled out (e.g., minimum healthy percent, maximum percent). Proper configuration here ensures minimal impact during updates.
223
+ * **Auto Scaling (beyond basic management):** For dynamic scaling based on demand, explore **Service Auto Scaling**. This allows ECS to automatically adjust the desired number of tasks up or down based on metrics like CPU utilisation or request count.
224
+ * **Task Definitions:** Before updating a service, you might need to create a new revision of your task definition if you're deploying new code or configuration changes to your containers. You can find Task Definitions in the left navigation pane under ECS.
225
+
226
+ By mastering these basic service management operations in the AWS Console, administrators can effectively control the lifecycle of their ECS-based applications.
src/styles.css ADDED
@@ -0,0 +1 @@
1
+ /* Custom styles can be added here later */
src/user_guide.qmd ADDED
@@ -0,0 +1,543 @@
1
+ ---
2
+ title: "User guide"
3
+ format:
4
+ html:
5
+ toc: true # Enable the table of contents
6
+ toc-depth: 3 # Include headings up to level 3 (##)
7
+ toc-title: "On this page" # Optional: Title for your TOC
8
+ ---
9
+
10
+ ## Table of contents
11
+
12
+ - [Example data files](#example-data-files)
13
+ - [Basic redaction](#basic-redaction)
14
+ - [Customising redaction options](#customising-redaction-options)
15
+ - [Custom allow, deny, and page redaction lists](#custom-allow-deny-and-page-redaction-lists)
16
+ - [Allow list example](#allow-list-example)
17
+ - [Deny list example](#deny-list-example)
18
+ - [Full page redaction list example](#full-page-redaction-list-example)
19
+ - [Redacting additional types of personal information](#redacting-additional-types-of-personal-information)
20
+ - [Redacting only specific pages](#redacting-only-specific-pages)
21
+ - [Handwriting and signature redaction](#handwriting-and-signature-redaction)
22
+ - [Reviewing and modifying suggested redactions](#reviewing-and-modifying-suggested-redactions)
23
+ - [Redacting tabular data files (CSV/XLSX) or copy and pasted text](#redacting-tabular-data-files-xlsxcsv-or-copy-and-pasted-text)
24
+
25
+ See the [advanced user guide here](#advanced-user-guide):
26
+ - [Merging redaction review files](#merging-redaction-review-files)
27
+ - [Identifying and redacting duplicate pages](#identifying-and-redacting-duplicate-pages)
28
+ - [Fuzzy search and redaction](#fuzzy-search-and-redaction)
29
+ - [Export redactions to and import from Adobe Acrobat](#export-to-and-import-from-adobe)
30
+ - [Exporting to Adobe Acrobat](#exporting-to-adobe-acrobat)
31
+ - [Importing from Adobe Acrobat](#importing-from-adobe-acrobat)
32
+ - [Using the AWS Textract document API](#using-the-aws-textract-document-api)
33
+ - [Using AWS Textract and Comprehend when not running in an AWS environment](#using-aws-textract-and-comprehend-when-not-running-in-an-aws-environment)
34
+ - [Modifying existing redaction review files](#modifying-existing-redaction-review-files)
35
+
36
+ ## Example data files
37
+
38
+ Please try these example files to follow along with this guide:
39
+ - [Example of files sent to a professor before applying](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/example_of_emails_sent_to_a_professor_before_applying.pdf)
40
+ - [Example complaint letter (jpg)](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/example_complaint_letter.jpg)
41
+ - [Partnership Agreement Toolkit (for signatures and more advanced usage)](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/Partnership-Agreement-Toolkit_0_0.pdf)
42
+ - [Dummy case note data](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/combined_case_notes.csv)
43
+
44
+ ## Basic redaction
45
+
46
+ The document redaction app can detect personally-identifiable information (PII) in documents. Documents can be redacted directly, or suggested redactions can be reviewed and modified using a graphical user interface. Basic document redaction can be performed quickly using the default options.
47
+
48
+ Download the example PDFs above to your computer. Open up the redaction app with the link provided by email.
49
+
50
+ ![Upload files](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/file_upload_highlight.PNG)
51
+
52
+ ### Upload files to the app
53
+
54
+ The 'Redact PDFs/images' tab currently accepts PDFs and image files (JPG, PNG) for redaction. Click on the 'Drop files here or Click to Upload' area of the screen, and select one of the three different [example files](#example-data-files) (they should all be stored in the same folder if you want them to be redacted at the same time).
55
+
56
+ ### Text extraction
57
+
58
+ First, select one of the three text extraction options:
59
+ - **'Local model - selectable text'** - This will read text directly from PDFs that have selectable text to redact (using PikePDF). This is fine for most PDFs, but will find nothing if the PDF does not have selectable text, and it is not good for handwriting or signatures. If it encounters an image file, it will send it onto the second option below.
60
+ - **'Local OCR model - PDFs without selectable text'** - This option will use a simple Optical Character Recognition (OCR) model (Tesseract) to pull out text from a PDF/image that it 'sees'. This can handle most typed text in PDFs/images without selectable text, but struggles with handwriting/signatures. If you are interested in the latter, then you should use the third option if available.
61
+ - **'AWS Textract service - all PDF types'** - Only available for instances of the app running on AWS. AWS Textract is a service that performs OCR on documents within their secure service. This is a more advanced version of OCR compared to the local option, and carries a (relatively small) cost. Textract excels in complex documents based on images, or documents that contain a lot of handwriting and signatures.
62
+
63
+ ### Optional - select signature extraction
64
+ If you chose the AWS Textract service above, you can choose if you want handwriting and/or signatures redacted by default. Choosing signatures here will have a cost implication, as identifying signatures will cost ~£2.66 ($3.50) per 1,000 pages vs ~£1.14 ($1.50) per 1,000 pages without signature detection.
65
+
66
+ ![AWS Textract handwriting and signature options](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/textract_handwriting_signatures.PNG)
67
+
68
+ ### PII redaction method
69
+
70
+ If you are running with the AWS service enabled, here you will also have a choice for PII redaction method:
71
+ - **'Only extract text - (no redaction)'** - If you are only interested in getting the text out of the document for further processing (e.g. to find duplicate pages, or to review text on the Review redactions page)
72
+ - **'Local'** - This uses the spacy package to rapidly detect PII in extracted text. This method is often sufficient if you are just interested in redacting specific terms defined in a custom list.
73
+ - **'AWS Comprehend'** - This method calls an AWS service to provide more accurate identification of PII in extracted text.
74
+
75
+ ### Optional - costs and time estimation
76
+ If the option is enabled (by your system admin, in the config file), you will see a cost and time estimate for the redaction process. 'Existing Textract output file found' will be checked automatically if previous Textract text extraction files exist in the output folder, or have been [previously uploaded by the user](#aws-textract-outputs) (saving time and money for redaction).
77
+
78
+ ![Cost and time estimation](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/costs_and_time.PNG)
79
+
80
+ ### Optional - cost code selection
81
+ If the option is enabled (by your system admin, in the config file), you may be prompted to select a cost code before continuing with the redaction task.
82
+
83
+ ![Cost code selection](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/cost_code_selection.PNG)
84
+
85
+ The relevant cost code can be found either by: 1. Using the search bar above the data table to find relevant cost codes, then clicking on the relevant row, or 2. typing it directly into the dropdown to the right, where it should filter as you type.
86
+
87
+ ### Optional - Submit whole documents to Textract API
88
+ If this option is enabled (by your system admin, in the config file), you will have the option to submit whole documents in quick succession to the AWS Textract service to get extracted text outputs quickly (faster than using the 'Redact document' process described here). This feature is described in more detail in the [advanced user guide](#using-the-aws-textract-document-api).
89
+
90
+ ![Textract document API](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/textract_document_api.PNG)
91
+
92
+ ### Redact the document
93
+
94
+ Click 'Redact document'. After loading in the document, the app should be able to process about 30 pages per minute (depending on the redaction methods chosen above). When ready, you should see a message saying that processing is complete, with output files appearing in the bottom right.
95
+
96
+ ### Redaction outputs
97
+
98
+ ![Redaction outputs](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/redaction_outputs.PNG)
99
+
100
+ - **'...redacted.pdf'** files contain the original pdf with suggested redacted text deleted and replaced by a black box on top of the document.
101
+ - **'...ocr_results.csv'** files contain the line-by-line text outputs from the entire document. This file can be useful for later searching through for any terms of interest in the document (e.g. using Excel or a similar program).
102
+ - **'...review_file.csv'** files are the review files that contain details and locations of all of the suggested redactions in the document. This file is key to the [review process](#reviewing-and-modifying-suggested-redactions), and should be downloaded to use later for this.
103
+
104
+ ### Additional AWS Textract / local OCR outputs
105
+
106
+ If you have used the AWS Textract option for extracting text, you may also see a '..._textract.json' file. This file contains all the relevant extracted text information that comes from the AWS Textract service. You can keep this file and upload it at a later date alongside your input document, which will enable you to skip calling AWS Textract every single time you want to do a redaction task, as follows:
107
+
108
+ ![Document upload alongside Textract](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/document_upload_with_textract.PNG)
109
+
110
+ Similarly, if you have used the 'Local OCR method' to extract text, you may see a '..._ocr_results_with_words.json' file. This file works in the same way as the AWS Textract .json results described above, and can be uploaded alongside an input document to save time on text extraction in future in the same way.
111
+
112
+ ### Downloading output files from previous redaction tasks
113
+
114
+ If you are logged in via AWS Cognito and you lose your app page for some reason (e.g. from a crash, reloading), it is possible recover your previous output files, provided the server has not been shut down since you redacted the document. Go to 'Redaction settings', then scroll to the bottom to see 'View all output files from this session'.
115
+
116
+ ![View all output files](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/view_all_output_files.PNG)
117
+
118
+ ### Basic redaction summary
119
+
120
+ We have covered redacting documents with the default redaction options. The '...redacted.pdf' file output may be enough for your purposes. But it is very likely that you will need to customise your redaction options, which we will cover below.
121
+
122
+ ## Customising redaction options
123
+
124
+ On the 'Redaction settings' page, there are a number of options that you can tweak to better match your use case and needs.
125
+
126
+ ### Custom allow, deny, and page redaction lists
127
+
128
+ The app allows you to specify terms that should never be redacted (an allow list), terms that should always be redacted (a deny list), and also to provide a list of page numbers for pages that should be fully redacted.
129
+
130
+ ![Custom allow, deny, and page redaction lists](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/allow_list/allow_deny_full_page_list.PNG)
131
+
132
+ #### Allow list example
133
+
134
+ It may be the case that specific terms that are frequently redacted are not actually personal information in your context, and so do not need to be redacted.
135
+
136
+ In the redacted outputs of the 'Example of files sent to a professor before applying' PDF, you can see that it is frequently redacting references to Dr Hyde's lab in the main body of the text. Let's say that references to Dr Hyde were not considered personal information in this context. You can exclude this term from redaction (and others) by providing an 'allow list' file. This is simply a csv that contains the case sensitive terms to exclude in the first column, in our example, 'Hyde' and 'Muller glia'. The example file is provided [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/allow_list/allow_list.csv).
137
+
138
+ To import this for use with your redaction tasks, go to the 'Redaction settings' tab, click on the 'Import allow list file' button halfway down, and select the csv file you have created. It will be applied the next time you hit the redact button, so go back to the first tab and redact the document again to see the effect.
139
+
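+ If you would rather create this file with a short script than in a spreadsheet editor, a minimal sketch using pandas is below (the terms are taken from the example above; the output is a plain one-column csv with no header, as the app expects):
+ 
+ ```python
+ import pandas as pd
+ 
+ # Each row is a case-sensitive term that should never be redacted.
+ allow_terms = ["Hyde", "Muller glia"]
+ 
+ # Write a one-column csv with no header row, matching the allow list format.
+ pd.DataFrame(allow_terms).to_csv("allow_list.csv", index=False, header=False)
+ ```
+ 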
140
+ #### Deny list example
141
+
142
+ Say you wanted to remove specific terms from a document. In this app you can do this by providing a custom deny list as a csv. Like the allow list described above, this should be a one-column csv without a column header. The app will suggest a redaction for each term in the list, matched with exact spelling as whole words only, so it will not select text from within longer words. To enable this feature, the 'CUSTOM' tag needs to be chosen as a redaction entity [(the process for adding/removing entity types to redact is described below)](#redacting-additional-types-of-personal-information).
143
+
144
+ Here is an example using the [Partnership Agreement Toolkit file](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/Partnership-Agreement-Toolkit_0_0.pdf). This is an [example of a custom deny list file](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/allow_list/partnership_toolkit_redact_custom_deny_list.csv). 'Sister', 'Sister City',
145
+ 'Sister Cities', and 'Friendship City' have been listed as specific terms to redact. You can see the outputs of this redaction process on the review page:
146
+
147
+ ![Deny list redaction Partnership file](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/allow_list/deny_list_partnership_example.PNG).
148
+
149
+ You can see that the app has highlighted all instances of these terms on the page shown. You can then consider each of these terms for modification or removal on the review page [explained here](#reviewing-and-modifying-suggested-redactions).
150
+
151
+ #### Full page redaction list example
152
+
153
+ There may be full pages in a document that you want to redact. The app also provides the capability of redacting pages completely based on a list of input page numbers in a csv. The format of the input file is the same as that for the allow and deny lists described above - a one-column csv without a column header. An [example of this is here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/allow_list/partnership_toolkit_redact_some_pages.csv). You can see an example of the redacted page on the review page:
154
+
155
+ ![Whole page partnership redaction](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/allow_list/whole_page_partnership_example.PNG).
156
+
157
+ Using the above approaches to allow, deny, and full page redaction lists will give you an output [like this](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/allow_list/Partnership-Agreement-Toolkit_0_0_redacted.pdf).
158
+
159
+ #### Adding to the loaded allow, deny, and whole page lists in-app
160
+
161
+ If you open the accordion below the allow list options called 'Manually modify custom allow...', you should be able to see a few tables with options to add new rows:
162
+
163
+ ![Manually modify allow or deny list](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/allow_list/manually_modify.PNG)
164
+
165
+ If the table is empty, you can add a new row by clicking on the '+' item below each table header. If there is existing data, you may need to click on the three dots to the right and select 'Add row below'. Type the item you wish to keep/remove in the cell, and then (important) press Enter to add this new item to the allow/deny/whole page list. Your output tables should look something like below.
166
+
167
+ ![Manually modify allow or deny list filled](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/allow_list/manually_modify_filled.PNG)
168
+
169
+ **Note:** As of version 0.7.0 you can now apply your whole page redaction list directly to the document file currently under review by clicking the 'Apply whole page redaction list to document currently under review' button that appears here.
170
+
171
+ ### Redacting additional types of personal information
172
+
173
+ You may want to redact additional types of information beyond the defaults, or you may not be interested in some of the default entity types. For example, there are dates in the example complaint letter. Suppose we wanted to redact those dates as well.
174
+
175
+ Under the 'Redaction settings' tab, go to 'Entities to redact (click close to down arrow for full list)'. Different dropdowns are provided according to whether you are using the Local service to redact PII, or the AWS Comprehend service. Click within the empty box close to the dropdown arrow and you should see a list of possible 'entities' to redact. Select 'DATE_TIME' and it should appear in the main list. To remove items, click on the 'x' next to their name.
176
+
177
+ ![Redacting additional types of information dropdown](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/additional_entities/additional_entities_select.PNG)
178
+
179
+ Now, go back to the main screen and click 'Redact Document' again. You should now get a redacted version of 'Example complaint letter' that has the dates and times removed.
180
+
181
+ If you want to redact different files, I suggest you refresh your browser page to start a new session and unload all previous data.
182
+
183
+ ## Redacting only specific pages
184
+
185
+ Suppose we are only interested in redacting page 1 of the loaded documents. On the Redaction settings tab, set 'Lowest page to redact' to 1, and 'Highest page to redact' also to 1. When you next redact your documents, only the first page will be modified.
186
+
187
+ ![Selecting specific pages to redact](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/allow_list/select_pages.PNG)
188
+
189
+ ## Handwriting and signature redaction
190
+
191
+ The file [Partnership Agreement Toolkit (for signatures and more advanced usage)](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/Partnership-Agreement-Toolkit_0_0.pdf) is provided as an example document to test AWS Textract + redaction with a document that contains signatures. If you have access to AWS Textract in the app, try removing all entity types from redaction on the Redaction settings tab by clicking the big X to the right of 'Entities to redact'.
192
+
193
+ To ensure that handwriting and signature detection are enabled (they are enabled by default), on the front screen go to the 'AWS Textract signature detection' area to enable/disable the following options:
194
+
195
+ ![Handwriting and signatures](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/textract_handwriting_signatures.PNG)
196
+
197
+ The outputs should show handwriting/signatures redacted (see pages 5 - 7), which you can inspect and modify on the 'Review redactions' tab.
198
+
199
+ ![Handwriting and signatures redacted example](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/refs/heads/main/review_redactions/Signatures%20and%20handwriting%20found.PNG)
200
+
201
+ ## Reviewing and modifying suggested redactions
202
+
203
+ Sometimes the app will suggest redactions that are incorrect, or will miss personal information entirely. The app allows you to review and modify suggested redactions to compensate for this. You can do this on the 'Review redactions' tab.
204
+
205
+ We will go through ways to review suggested redactions with an example. On the first tab, 'PDFs/images', upload the ['Example of files sent to a professor before applying.pdf'](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/example_of_emails_sent_to_a_professor_before_applying.pdf) file. Let's stick with the 'Local model - selectable text' option, and click 'Redact document'. Once the outputs are created, go to the 'Review redactions' tab.
206
+
207
+ On the 'Review redactions' tab you have a visual interface that allows you to inspect and modify redactions suggested by the app. There are quite a few options to look at, so we'll go from top to bottom.
208
+
209
+ ![Review redactions](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/review_redactions.PNG)
210
+
211
+ ### Uploading documents for review
212
+
213
+ At the top is a file upload area where you can upload the original, unredacted PDF alongside the '..._review_file.csv' that is produced by the redaction process. Once you have uploaded these two files, click the '**Review redactions based on original PDF...**' button to load in the files for review. This will allow you to visualise and modify the suggested redactions using the interface below.
214
+
215
+ Optionally, you can also upload one of the '..._ocr_output.csv' files that come out of a redaction task here, so that you can navigate the extracted text from the document.
216
+
217
+ ![Search extracted text](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/search_extracted_text.PNG)
218
+
219
+ You can upload all three files to the box (the unredacted document, the '..._review_file.csv', and the '..._ocr_output.csv' file) before clicking '**Review redactions based on original PDF...**', as in the image below:
220
+
221
+ ![Upload three files for review](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/upload_three_files.PNG)
222
+
223
+ **NOTE:** ensure you upload the ***unredacted*** document here and not the redacted version, otherwise you will be checking over a document that already has redaction boxes applied!
224
+
225
+ ### Page navigation
226
+
227
+ You can change the page viewed either by clicking 'Previous page' or 'Next page', or by typing a specific page number in the 'Current page' box and pressing Enter on your keyboard. Each time you switch pages, the app will save any redactions you have made on the page you are moving from, so you will not lose your changes.
228
+
229
+ You can also navigate to different pages by clicking on rows in the tables under 'Search suggested redactions' to the right, or 'search all extracted text' (if enabled) beneath that.
230
+
231
+ ### The document viewer pane
232
+
233
+ On the selected page, each redaction is highlighted with a box next to its suggested redaction label (e.g. person, email).
234
+
235
+ ![Document view pane](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/document_viewer_pane.PNG)
236
+
237
+ There are a number of different options for adding and modifying redaction boxes, and for moving around the page, in the document viewer pane. To zoom in and out of the page, use your mouse wheel. To move around the page while zoomed, you need to be in modify mode. Scroll to the bottom of the document viewer to see the relevant controls. You should see a box icon, a hand icon, and two arrows pointing counter-clockwise and clockwise.
238
+
239
+ ![Change redaction mode](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/change_review_mode.PNG)
240
+
241
+ Click on the hand icon to go into modify mode. Clicking and holding on the document viewer then allows you to move around the page when zoomed in. To rotate the page, you can click on either of the round arrow buttons to turn in that direction.
242
+
243
+ **NOTE:** When you switch page, the viewer will stay in your selected orientation, so if it looks strange, just rotate the page again and hopefully it will look correct!
244
+
245
+ #### Modify existing redactions (hand icon)
246
+
247
+ After clicking on the hand icon, the interface allows you to modify existing redaction boxes. When in this mode, you can click and hold on an existing box to move it.
248
+
249
+ ![Modify existing redaction box](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/modify_existing_redaction_box.PNG)
250
+
251
+ Click on one of the small boxes at the edges to change the size of the box. To delete a box, click on it to highlight it, then press delete on your keyboard. Alternatively, double click on a box and click 'Remove' on the box that appears.
252
+
253
+ ![Remove existing redaction box](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/existing_redaction_box_remove.PNG)
254
+
255
+ #### Add new redaction boxes (box icon)
256
+
257
+ To change to 'add redaction boxes' mode, scroll to the bottom of the page. Click on the box icon, and your cursor will change into a crosshair. Now you can add new redaction boxes where you wish. A popup will appear when you create a new box so you can select a label and colour for the new box.
258
+
259
+ #### 'Locking in' new redaction box format
260
+
261
+ It is possible to lock in a chosen format for new redaction boxes so that you don't have the popup appearing each time. When you make a new box, select the options for your 'locked' format, and then click on the lock icon on the left side of the popup, which should turn blue.
262
+
263
+ ![Lock redaction box format](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/new_redaction_box_lock_mode.PNG)
264
+
265
+ You can now add new redaction boxes without a popup appearing. If you want to change or 'unlock' your chosen box format, you can click on the new icon that has appeared at the bottom of the document viewer pane that looks a little like a gift tag. You can then change the defaults, or click on the lock icon again to 'unlock' the new box format - popups will then appear again each time you create a new box.
266
+
267
+ ![Change or unlock redaction box format](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/change_review_mode_with_lock.PNG)
268
+
269
+ ### Apply redactions to PDF and Save changes on current page
270
+
271
+ Once you have reviewed all the redactions in your document and you are happy with the outputs, you can click 'Apply revised redactions to PDF' to create a new '_redacted.pdf' output alongside a new '_review_file.csv' output.
272
+
273
+ If you are working on a page and haven't saved for a while, you can click 'Save changes on current page to file' to ensure that they are saved to an updated 'review_file.csv' output.
274
+
275
+ ![Review modified outputs](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/review_mod_outputs.PNG)
276
+
277
+ ### Selecting and removing redaction boxes using the 'Search suggested redactions' table
278
+
279
+ The table shows a list of all the suggested redactions in the document alongside the page, label, and text (if available).
280
+
281
+ ![Search suggested redaction area](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/list_find_labels.PNG)
282
+
283
+ If you click on one of the rows in this table, you will be taken to the page of the redaction. Clicking on a redaction row on the same page will change the colour of the redaction box to blue to help you locate it in the document viewer (this only applies while using the app, not in redacted output PDFs).
284
+
285
+ ![Search suggested redaction area](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/review_row_highlight.PNG)
286
+
287
+ You can choose a specific entity type to see which pages the entity is present on. If you want to go to the page specified in the table, you can click on a cell in the table and the review page will be changed to that page.
288
+
289
+ To filter the 'Search suggested redactions' table you can:
290
+ 1. Click on one of the dropdowns (Redaction category, Page, Text), and select an option, or
291
+ 2. Write text in the 'Filter' box just above the table. Click the blue box to apply the filter to the table.
292
+
293
+ Once you have filtered the table, or selected a row from the table, you have a few options underneath on what you can do with the filtered rows:
294
+
295
+ - Click the **Exclude all redactions in table** button to remove all redactions visible in the table from the document. **Important:** ensure that you have clicked the blue tick icon next to the search box before doing this, or you will remove all redactions from the document. If you do end up doing this, click the 'Undo last element removal' button below to restore the redactions.
296
+ - Click the **Exclude specific redaction row** button to remove only the redaction from the last row you clicked on from the document. The currently selected row is visible below.
297
+ - Click the **Exclude all redactions with the same text as selected row** button to remove all redactions from the document that are exactly the same as the selected row text.
298
+
299
+ **NOTE**: After excluding redactions using any of the above options, click the 'Reset filters' button below to ensure that the dropdowns and table return to seeing all remaining redactions in the document.
300
+
301
+ If you make a mistake, click the 'Undo last element removal' button to restore the 'Search suggested redactions' table to its previous state (only the most recent action can be undone).
302
+
303
+ ### Navigating through the document using the 'Search all extracted text'
304
+
305
+ The 'search all extracted text' table will contain text if you have just redacted a document, or if you have uploaded a '..._ocr_output.csv' file alongside a document file and review file on the Review redactions tab as [described above](#uploading-documents-for-review).
306
+
307
+ You can navigate through the document using this table. When you click on a row, the Document viewer pane to the left will change to the selected page.
308
+
309
+ ![Search suggested redaction area](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/select_extracted_text.PNG)
310
+
311
+ You can search through the extracted text by using the search bar just above the table, which should filter as you type. To apply the filter and 'cut' the table, click on the blue tick inside the box next to your search term. To return the table to its original content, click the 'Reset OCR output table filter' button below the table.
312
+
313
+ ![Search suggested redaction area](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/search_extracted_text.PNG)
314
+
315
+ ## Redacting tabular data files (XLSX/CSV) or copy and pasted text
316
+
317
+ ### Tabular data files (XLSX/CSV)
318
+
319
+ The app can be used to redact tabular data files such as xlsx or csv files. For this to work properly, your data file needs to be in a simple table format, with a single table starting from the first cell (A1), and no other information in the sheet. Similarly for .xlsx files, each sheet in the file that you want to redact should be in this simple format.
320
+
321
+ To demonstrate this, we can use [the example csv file 'combined_case_notes.csv'](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/combined_case_notes.csv), which is a small dataset of dummy social care case notes. Go to the 'Open text or Excel/csv files' tab. Drop the file into the upload area. After the file is loaded, you should see the suggested columns for redaction in the box underneath. You can select and deselect columns to redact as you wish from this list.
322
+
323
+ ![csv upload](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/tabular_files/file_upload_csv_columns.PNG)
324
+
325
+ If you were instead to upload an xlsx file, you would also see a list of all the sheets in the xlsx file that can be redacted. The 'Select columns' area underneath will suggest a list of all columns in the file across all sheets.
326
+
327
+ ![xlsx upload](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/tabular_files/file_upload_xlsx_columns.PNG)
328
+
329
+ Once you have chosen your input file and sheets/columns to redact, you can choose the redaction method. 'Local' will use the same local model as used for documents on the first tab. 'AWS Comprehend' will give better results, at a slight cost.
330
+
331
+ When you click Redact text/data files, you will see the progress of the redaction task by file and sheet, and you will receive a csv output with the redacted data.
332
+
333
+ ### Choosing output anonymisation format
334
+ You can also choose the anonymisation format of your output results. Open the tab 'Anonymisation output format' to see the options. By default, any detected PII will be replaced with the word 'REDACTED' in the cell. You can choose one of the following options as the form of replacement for the redacted text:
335
+ - replace with 'REDACTED': Replaced by the word 'REDACTED' (default)
336
+ - replace with <ENTITY_NAME>: Replaced by e.g. 'PERSON' for people, 'EMAIL_ADDRESS' for emails etc.
337
+ - redact completely: Text is removed completely and replaced by nothing.
338
+ - hash: Replaced by a unique long ID code that is consistent with entity text. I.e. a particular name will always have the same ID code.
339
+ - mask: Replace with stars '*'.
340
+
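+ To make these options concrete, here is a purely illustrative sketch of how a single cell might come out under each format. The exact replacement text is decided by the app, so treat the values below as indicative only:
+ 
+ ```python
+ # Hypothetical input cell: "Please call John Smith tomorrow."
+ illustrative_outputs = {
+     "replace with 'REDACTED'": "Please call REDACTED tomorrow.",
+     "replace with <ENTITY_NAME>": "Please call PERSON tomorrow.",
+     "redact completely": "Please call tomorrow.",
+     "hash": "Please call 3f2a9c... tomorrow.",  # the same name always maps to the same ID code
+     "mask": "Please call ********** tomorrow.",
+ }
+ ```
+ 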
341
+ ### Redacting copy and pasted text
342
+ You can also write open text into an input box and redact that using the same methods as described above. To do this, write or paste text into the 'Enter open text' box that appears when you open the 'Redact open text' tab. Then select a redaction method, and an anonymisation output format as described above. The redacted text will be printed in the output textbox, and will also be saved to a simple csv file in the output file box.
343
+
344
+ ![Text analysis output](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/tabular_files/text_anonymisation_outputs.PNG)
345
+
346
+ ### Redaction log outputs
347
+ A list of the suggested redaction outputs from the tabular data / open text data redaction is available on the Redaction settings page under 'Log file outputs'.
348
+
349
+ # ADVANCED USER GUIDE
350
+
351
+ This advanced user guide will go over some of the features recently added to the app, including: modifying and merging redaction review files, identifying and redacting duplicate pages across multiple PDFs, 'fuzzy' search and redact, and exporting redactions to Adobe Acrobat.
352
+
353
+ ## Table of contents
354
+
355
+ - [Merging redaction review files](#merging-redaction-review-files)
356
+ - [Identifying and redacting duplicate pages](#identifying-and-redacting-duplicate-pages)
357
+ - [Fuzzy search and redaction](#fuzzy-search-and-redaction)
358
+ - [Export redactions to and import from Adobe Acrobat](#export-to-and-import-from-adobe)
359
+ - [Exporting to Adobe Acrobat](#exporting-to-adobe-acrobat)
360
+ - [Importing from Adobe Acrobat](#importing-from-adobe-acrobat)
361
+ - [Using the AWS Textract document API](#using-the-aws-textract-document-api)
362
+ - [Using AWS Textract and Comprehend when not running in an AWS environment](#using-aws-textract-and-comprehend-when-not-running-in-an-aws-environment)
363
+ - [Modifying existing redaction review files](#modifying-existing-redaction-review-files)
364
+
365
+
366
+ ## Merging redaction review files
367
+
368
+ Say you have run multiple redaction tasks on the same document, and you want to merge all of these redactions together. You could do this in your spreadsheet editor, but this could be fiddly, especially if you are dealing with multiple review files or large numbers of redactions. The app has a feature to combine multiple review files together to create a 'merged' review file.
369
+
370
+ ![Merging review files in the user interface](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/merge_review_files/img/merge_review_files_interface.PNG)
371
+
372
+ You can find this option at the bottom of the 'Redaction Settings' tab. Upload multiple review files here to get a single output 'merged' review_file. In the example files, merging the 'review_file_custom.csv' and 'review_file_local.csv' files gives you an output containing redaction boxes from both. This combined review file can then be uploaded into the review tab following the usual procedure.
373
+
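+ The in-app merge is the recommended route, but as a rough illustration of what the merge amounts to, a pandas sketch (assuming the review files share the same columns, and using the example file names from this folder) could look like this:
+ 
+ ```python
+ import pandas as pd
+ 
+ # Combine two review files for the same document into one 'merged' review file,
+ # dropping any redaction boxes that appear in both.
+ custom = pd.read_csv("review_file_custom.csv")
+ local = pd.read_csv("review_file_local.csv")
+ 
+ merged = pd.concat([custom, local], ignore_index=True).drop_duplicates()
+ merged.to_csv("review_file_merged.csv", index=False)
+ ```
+ 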
374
+ ![Merging review files outputs in spreadsheet](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/merge_review_files/img/merged_review_file_outputs_csv.PNG)
375
+
376
+ ## Identifying and redacting duplicate pages
377
+
378
+ The files for this section are stored [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/duplicate_page_find_in_app/).
379
+
380
+ Some redaction tasks involve removing duplicate pages of text that may exist across multiple documents. This feature helps you find and remove duplicate content that may exist in single or multiple documents. It can identify everything from single identical pages to multi-page sections (subdocuments). The process involves three main steps: configuring the analysis, reviewing the results in the interactive interface, and then using the generated files to perform the redactions.
381
+
382
+ ![Example duplicate page inputs](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_input_interface_new.PNG)
383
+
384
+ **Step 1: Upload and Configure the Analysis**
385
+ First, navigate to the "Identify duplicate pages" tab. Upload all the ocr_output.csv files you wish to compare into the file area. These files are generated every time you run a redaction task and contain the text for each page of a document.
386
+
387
+ For our example, you can upload the four 'ocr_output.csv' files provided in the example folder into the file area. Click 'Identify duplicate pages' and you will see a number of files returned. In case you want to see the original PDFs, they are available [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/duplicate_page_find_in_app/input_pdfs/).
388
+
389
+ The default options will search for matching subdocuments of any length. Before running the analysis, you can configure these matching parameters to tell the tool what you're looking for:
390
+
391
+ ![Duplicate matching parameters](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_matching_parameters.PNG)
392
+
393
+ *Matching Parameters*
394
+ - **Similarity Threshold:** A score from 0 to 1. Pages or sequences of pages with a calculated text similarity above this value will be considered a match. The default of 0.9 (90%) is a good starting point for finding near-identical pages.
395
+ - **Min Word Count:** Pages with fewer words than this value will be completely ignored during the comparison. This is extremely useful for filtering out blank pages, title pages, or boilerplate pages that might otherwise create noise in the results. The default is 10.
396
+ - **Choosing a Matching Strategy:** You have three main options to find duplicate content.
397
+ - *'Subdocument' matching (default):* Use this to find the longest possible sequence of matching pages. The tool will find an initial match and then automatically expand it forward page-by-page until the consecutive match breaks. This is the best method for identifying complete copied chapters or sections of unknown length. It is enabled by default via the "Enable 'subdocument' matching" tick box, and overrides the two options described below.
398
+ - *Minimum length subdocument matching:* Use this to find sequences of consecutively matching pages with a minimum page length. For example, setting the slider to 3 will only return sections that are at least 3 pages long. How to enable: untick the "Enable 'subdocument' matching" box and set the "Minimum consecutive pages" slider to a value greater than 1.
399
+ - *Single Page Matching:* Use this to find all individual page pairs that are similar to each other. Leave the "Enable 'subdocument' matching" box unchecked and keep the "Minimum consecutive pages" slider at 1.
400
+
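+ The app runs this analysis for you, but if it helps to understand what the threshold and minimum word count control, below is a rough, simplified sketch of single-page matching using TF-IDF and cosine similarity. This is not the app's actual implementation, and the example page texts are made up:
+ 
+ ```python
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+ 
+ SIMILARITY_THRESHOLD = 0.9   # pages scoring above this are treated as a match
+ MIN_WORD_COUNT = 10          # pages with fewer words than this are ignored
+ 
+ # One string per page, e.g. pulled from an '..._ocr_output.csv' file.
+ page_texts = [
+     "This page contains a long boilerplate section about partnership terms and conditions.",
+     "Short page",
+     "This page contains a long boilerplate section about partnership terms and conditions.",
+ ]
+ 
+ # Keep only pages with enough words to compare meaningfully, remembering their page numbers.
+ kept = [(i + 1, text) for i, text in enumerate(page_texts) if len(text.split()) >= MIN_WORD_COUNT]
+ page_numbers = [number for number, _ in kept]
+ documents = [text for _, text in kept]
+ 
+ # Score every kept page against every other kept page.
+ scores = cosine_similarity(TfidfVectorizer().fit_transform(documents))
+ 
+ for a in range(len(documents)):
+     for b in range(a + 1, len(documents)):
+         if scores[a, b] >= SIMILARITY_THRESHOLD:
+             print(f"Pages {page_numbers[a]} and {page_numbers[b]} look like duplicates "
+                   f"(score {scores[a, b]:.2f})")
+ ```
+ 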
401
+ Once your parameters are set, click the "Identify duplicate pages/subdocuments" button.
402
+
403
+ **Step 2: Review Results in the Interface**
404
+ After the analysis is complete, the results will be displayed directly in the interface.
405
+
406
+ *Analysis Summary:* A table will appear showing a summary of all the matches found. The columns will change depending on the matching strategy you chose. For subdocument matches, it will show the start and end pages of the matched sequence.
407
+
408
+ *Interactive Preview:* This is the most important part of the review process. Click on any row in the summary table. The full text of the matching page(s) will appear side-by-side in the "Full Text Preview" section below, allowing you to instantly verify the accuracy of the match.
409
+
410
+ ![Duplicate review interface](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_output_review_overview.PNG)
411
+
412
+ **Step 3: Download and Use the Output Files**
413
+ The analysis also generates a set of downloadable files for your records and for performing redactions.
414
+
415
+
416
+ - page_similarity_results.csv: This is a detailed report of the analysis you just ran. It shows a breakdown of the pages from each file that are most similar to each other above the similarity threshold. You can compare the text in the two columns 'Page_1_Text' and 'Page_2_Text'. For single-page matches, it will list each pair of matching pages. For subdocument matches, it will list the start and end pages of each matched sequence, along with the total length of the match.
417
+
418
+ ![Page similarity file example](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/page_similarity_example.PNG)
419
+
420
+ - [Original_Filename]_pages_to_redact.csv: For each input document that was found to contain duplicate content, a separate redaction list is created. This is a simple, one-column CSV file containing a list of all page numbers that should be removed. To use one of these files, you can upload the original document (i.e. the PDF) on the 'Review redactions' tab and then click on the 'Apply relevant duplicate page output to document currently under review' button; the whole pages suggested for redaction should then appear on the 'Review redactions' tab. Alternatively, you can reupload the file into the whole page redaction section as described in the ['Full page redaction list example' section](#full-page-redaction-list-example).
421
+
422
+ ![Example duplicate page redaction list](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_output_interface_new.PNG)
423
+
424
+ If you want to combine the results from this redaction process with previous redaction tasks for the same PDF, you could merge review file outputs following the steps described in [Merging redaction review files](#merging-redaction-review-files) above.
425
+
426
+ ## Fuzzy search and redaction
427
+
428
+ The files for this section are stored [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/fuzzy_search/).
429
+
430
+ Sometimes you may be searching for terms that are slightly misspelled throughout a document, for example names. The document redaction app gives the option of searching for long phrases that may contain spelling mistakes, a method called 'fuzzy matching'.
431
+
432
+ To do this, go to the Redaction Settings and the 'Select entity types to redact' area. In the box relevant to your chosen redaction method (local or AWS Comprehend), select 'CUSTOM_FUZZY' from the list. Next, we can select the maximum number of spelling mistakes allowed in the search (up to nine). Here, you can either type in a number or use the small arrows to the right of the box. Change this option to 3. This will allow for a maximum of three 'changes' in the text needed to match the desired search terms.
433
+
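+ For context, the 'changes' counted here are essentially single-character edits (insertions, deletions, or substitutions). The app has its own matching logic, but as a minimal sketch of the idea, a standard Levenshtein edit distance check looks like this (the misspelled phrase below is a made-up example):
+ 
+ ```python
+ def levenshtein(a: str, b: str) -> int:
+     """Minimum number of single-character edits needed to turn a into b."""
+     previous = list(range(len(b) + 1))
+     for i, ca in enumerate(a, start=1):
+         current = [i]
+         for j, cb in enumerate(b, start=1):
+             current.append(min(
+                 previous[j] + 1,               # delete a character from a
+                 current[j - 1] + 1,            # insert a character into a
+                 previous[j - 1] + (ca != cb),  # substitute (free if the characters already match)
+             ))
+         previous = current
+     return previous[-1]
+ 
+ MAX_MISTAKES = 3  # the value set in the app above
+ 
+ # 'Freindship Cty' is three edits away from 'Friendship City', so it would still count as a match.
+ print(levenshtein("Freindship Cty", "Friendship City") <= MAX_MISTAKES)  # True
+ ```
+ 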
434
+ The other option ('Should fuzzy search match on entire phrases in deny list') we can leave as it is - this option controls whether the fuzzy search matches against the whole phrase, or against each individual word in the search phrase (apart from stop words).
435
+
436
+ Next, we can upload a deny list on the same page to do the fuzzy search. A relevant deny list file can be found [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/fuzzy_search/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv) - you can upload it following [these steps](#deny-list-example). You will notice that the suggested deny list has spelling mistakes compared to phrases found in the example document.
437
+
438
+ ![Deny list example with spelling mistakes](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/fuzzy_search/img/fuzzy_deny_list_example.PNG)
439
+
440
+ Upload the [Partnership-Agreement-Toolkit file](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/Partnership-Agreement-Toolkit_0_0.pdf) into the 'Redact document' area on the first tab. Now, press the 'Redact document' button.
441
+
442
+ Using this deny list with spelling mistakes, the app will fuzzy match these terms to the correct text in the document. After redaction is complete, go to the Review redactions tab to check the results. You should see that the phrases in the deny list have been successfully matched.
443
+
444
+ ![Fuzzy match review outputs](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/fuzzy_search/img/fuzzy_search_review.PNG)
445
+
446
+ ## Export to and import from Adobe
447
+
448
+ Files for this section are stored [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/export_to_adobe/).
449
+
450
+ ### Exporting to Adobe Acrobat
451
+
452
+ The Document Redaction app has a feature to export suggested redactions to Adobe, and likewise to import Adobe comment files into the app. The file format used is the .xfdf Adobe comment file format - [you can find more information about how to use these files here](https://helpx.adobe.com/uk/acrobat/using/importing-exporting-comments.html).
453
+
454
+ To convert suggested redactions to Adobe format, you need to have the original PDF and a review file csv in the input box at the top of the Review redactions page.
455
+
456
+ ![Input area for files for Adobe export](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/export_to_adobe/img/adobe_export_input_area.PNG)
457
+
458
+ Then, you can find the export to Adobe option at the bottom of the Review redactions tab. Adobe comment files will be output here.
459
+
460
+ ![Adobe export/import options](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/export_to_adobe/img/export_to_adobe_interface.PNG)
461
+
462
+ Once the input files are ready, you can click on the 'Convert review file to Adobe comment format' button. You should see a file appear in the output box with a '.xfdf' file type. To use this in Adobe, after downloading it to your computer, you should be able to double click on it, and a pop-up box will appear asking you to find the PDF file associated with it. Find the original PDF file used for your redaction task. The file should then open in Adobe Acrobat with the suggested redactions.
463
+
464
+ ![Suggested redactions in Adobe Acrobat](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/export_to_adobe/img/adobe_redact_example.PNG)
465
+
466
+ ### Importing from Adobe Acrobat
467
+
468
+ The app also allows you to import .xfdf files from Adobe Acrobat. To do this, go to the same Adobe import/export area as described above at the bottom of the Review Redactions tab. In this box, you need to upload a .xfdf Adobe comment file, along with the relevant original PDF for redaction.
469
+
470
+ ![Adobe import interface](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/export_to_adobe/img/import_from_adobe_interface.PNG)
471
+
472
+ When you click the 'convert .xfdf comment file to review_file.csv' button, the app should take you up to the top of the screen where the new review file has been created and can be downloaded.
473
+
474
+ ![Outputs from Adobe import](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/export_to_adobe/img/import_from_adobe_interface_outputs.PNG)
475
+
476
+ ## Using the AWS Textract document API
477
+
478
+ This option can be enabled by your system admin in the config file (the 'SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS' environment variable, and subsequent variables). Using this, you will have the option to submit whole documents in quick succession to the AWS Textract service to get extracted text outputs quickly (faster than using the standard 'Redact document' process described earlier).
479
+
480
+ ### Starting a new Textract API job
481
+
482
+ To use this feature, first upload a document file in the file input box [in the usual way](#upload-files-to-the-app) on the first tab of the app. Under AWS Textract signature detection you can select whether or not you would like to analyse signatures (with a [cost implication](#optional---select-signature-extraction)).
483
+
484
+ Then, open the section under the heading 'Submit whole document to AWS Textract API...'.
485
+
486
+ ![Textract document API menu](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/textract_document_api.PNG)
487
+
488
+ Click 'Analyse document with AWS Textract API call'. After a few seconds, the job should be submitted to the AWS Textract service. The box 'Job ID to check status' should now have an ID filled in, and the table should gain a row with details of the new API job (alongside any previous jobs from up to seven days old).
489
+
490
+ Click the button underneath, 'Check status of Textract job and download', to see progress on the job. Processing will continue in the background until the job is ready, so it is worth periodically clicking this button to see if the outputs are ready. In testing, and as a rough estimate, it seems like this process takes about five seconds per page. However, this has not been tested with very large documents. Once ready, the '_textract.json' output should appear below.
491
+
492
+ ### Textract API job outputs
493
+
494
+ The '_textract.json' output can be used to speed up further redaction tasks as [described previously](#optional---costs-and-time-estimation) - when it is uploaded alongside the document, the 'Existing Textract output file found' flag should be ticked.
495
+
496
+ ![Textract document API initial outputs](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/textract_api/textract_api_initial_outputs.PNG)
497
+
498
+ You can now easily get the '..._ocr_output.csv' redaction output based on this '_textract.json' (described in [Redaction outputs](#redaction-outputs)) by clicking on the 'Convert Textract job outputs to OCR results' button. You can then use this file e.g. for [identifying duplicate pages](#identifying-and-redacting-duplicate-pages), or for redaction review.
499
+
500
+ ## Using AWS Textract and Comprehend when not running in an AWS environment
501
+
502
+ AWS Textract and Comprehend give much better results for text extraction and document redaction than the local model options in the app. The most secure way to access them in the Redaction app is to run the app in a secure AWS environment with relevant permissions. Alternatively, you could run the app on your own system while logged in to AWS SSO with relevant permissions.
503
+
504
+ However, it is possible to access these services directly via API from outside an AWS environment by creating IAM users and access keys with relevant permissions to access AWS Textract and Comprehend services. Please check with your IT and data security teams that this approach is acceptable for your data before trying the following approaches.
505
+
506
+ To do this, in your AWS environment you will need to create a new user with permissions for "textract:AnalyzeDocument", "textract:DetectDocumentText", and "comprehend:DetectPiiEntities". Under security credentials, create new access keys - note down the access key and secret key.
507
+
508
+ ### Direct access by passing AWS access keys through app
509
+ The Redaction Settings tab now has boxes for entering the AWS access key and secret key. If you paste the relevant keys into these boxes before performing redaction, you should be able to use these services in the app.
510
+
511
+ ### Picking up AWS access keys through an .env file
512
+ The app can also pick up AWS access key details through a .env file located at 'config/aws_config.env' (the default), or at an alternative .env file location specified by the environment variable AWS_CONFIG_PATH. The env file should contain just two lines, like the following:
513
+
514
+ AWS_ACCESS_KEY= your-access-key
515
+ AWS_SECRET_KEY= your-secret-key
516
+
517
+ The app should then pick up these keys when trying to access the AWS Textract and Comprehend services during redaction.
518
+
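+ The app reads these values itself during redaction, but purely as an illustration of how such keys end up being used, a sketch with python-dotenv and boto3 might look like the following (the region name is an assumption - use whichever region your services run in):
+ 
+ ```python
+ import os
+ 
+ import boto3
+ from dotenv import load_dotenv
+ 
+ # Load AWS_ACCESS_KEY / AWS_SECRET_KEY from the config file described above.
+ load_dotenv("config/aws_config.env")
+ 
+ credentials = {
+     "aws_access_key_id": os.environ["AWS_ACCESS_KEY"],
+     "aws_secret_access_key": os.environ["AWS_SECRET_KEY"],
+     "region_name": "eu-west-2",  # assumption - set this to your own region
+ }
+ 
+ textract = boto3.client("textract", **credentials)
+ comprehend = boto3.client("comprehend", **credentials)
+ ```
+ 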
519
+ Again, a lot can potentially go wrong with AWS solutions that are insecure, so before trying the above please consult with your AWS and data security teams.
520
+
521
+ ## Modifying existing redaction review files
522
+
523
+ *Note:* As of version 0.7.0 you can now modify redaction review files directly in the app on the 'Review redactions' tab. Open the accordion 'View and edit review data' under the file input area. You can edit review file data cells here - press Enter to apply changes. You should see the effect on the current page if you click the 'Save changes on current page to file' button to the right.
524
+
525
+ You can find the folder containing the files discussed in this section [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/merge_review_files/).
526
+
527
+ As well as serving as inputs to the document redaction app's review function, the 'review_file.csv' output can be modified outside of the app, and also merged with others from multiple redaction attempts on the same file. This gives you the flexibility to change redaction details without using the app.
528
+
529
+ If you open up a 'review_file' csv output using spreadsheet software such as Microsoft Excel, you can easily modify redaction properties. Open the file '[Partnership-Agreement-Toolkit_0_0_redacted.pdf_review_file_local.csv](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/merge_review_files/Partnership-Agreement-Toolkit_0_0.pdf_review_file_local.csv)', and you should see a spreadsheet with just four suggested redactions (see below). The following instructions are for using Excel.
530
+
531
+ ![Review file before](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/merge_review_files/img/review_file_before.PNG)
532
+
533
+ The first thing we can do is remove the first row - 'et' is suggested as a person, but is obviously not a genuine instance of personal information. Right click on the row number and select 'Delete' from the menu. Next, let's imagine that what the app identified as a 'phone number' was in fact another type of number, and so we wanted to change the label. Simply click on the relevant label cell and change it, let's say to 'SECURITY_NUMBER'. You could also use 'Find & Select' -> 'Replace' from the top ribbon menu if you wanted to change a number of labels simultaneously.
534
+
535
+ What if we wanted to change the colour of the 'email address' entry on the redaction review tab of the redaction app? The colours in a review file are based on an RGB scale with three numbers ranging from 0-255. [You can find suitable colours here](https://rgbcolorpicker.com). Using this scale, if I wanted my review box to be pure blue, I could change the cell value to (0,0,255).
536
+
537
+ Imagine that a redaction box was slightly too small, and I didn't want to use the in-app options to change the size. In the review file csv, we can modify e.g. the ymin and ymax values for any box to increase the extent of the redaction box. For the 'email address' entry, let's decrease ymin by 5, and increase ymax by 5.
538
+
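+ If you would rather script these edits than make them by hand, a rough pandas sketch of the same three changes is below. The column names ('label', 'color', 'ymin', 'ymax') and the entity label values are based on the description above - check them against your own review file before relying on anything like this:
+ 
+ ```python
+ import pandas as pd
+ 
+ review = pd.read_csv("Partnership-Agreement-Toolkit_0_0.pdf_review_file_local.csv")
+ 
+ # 1. Drop the spurious 'et' person redaction (the first data row).
+ review = review.iloc[1:]
+ 
+ # 2. Relabel the phone number entry (the label values here are assumptions).
+ review.loc[review["label"] == "PHONE_NUMBER", "label"] = "SECURITY_NUMBER"
+ 
+ # 3. Make the email address box pure blue and slightly taller.
+ is_email = review["label"] == "EMAIL_ADDRESS"
+ review.loc[is_email, "color"] = "(0,0,255)"
+ review.loc[is_email, "ymin"] = review.loc[is_email, "ymin"] - 5
+ review.loc[is_email, "ymax"] = review.loc[is_email, "ymax"] + 5
+ 
+ review.to_csv("Partnership-Agreement-Toolkit_0_0.pdf_review_file_local_mod.csv", index=False)
+ ```
+ 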
539
+ I have saved an output file following the above steps as '[Partnership-Agreement-Toolkit_0_0_redacted.pdf_review_file_local_mod.csv](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/merge_review_files/outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file_local_mod.csv)' in the same folder as the original. Let's upload this file to the app along with the original pdf to see how the redactions look now.
540
+
541
+ ![Review file after modification](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/merge_review_files/img/partnership_redactions_after.PNG)
542
+
543
+ We can see from the above that we have successfully removed a redaction box, changed labels, colours, and redaction box sizes.
tld/.tld_set_snapshot DELETED
The diff for this file is too large to render. See raw diff
 
tools/aws_functions.py CHANGED
@@ -228,5 +228,6 @@ def upload_log_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=
228
  print(final_out_message_str)
229
  else:
230
  final_out_message_str = "App not set to run AWS functions"
 
231
 
232
  return final_out_message_str
 
228
  print(final_out_message_str)
229
  else:
230
  final_out_message_str = "App not set to run AWS functions"
231
+ print(final_out_message_str)
232
 
233
  return final_out_message_str
tools/config.py CHANGED
@@ -64,10 +64,12 @@ def add_folder_to_path(folder_path: str):
64
  # LOAD CONFIG FROM ENV FILE
65
  ###
66
 
67
- ensure_folder_exists("config/")
 
 
68
 
69
  # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/app_config.env'
70
- APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', 'config/app_config.env') # e.g. config/app_config.env
71
 
72
  if APP_CONFIG_PATH:
73
  if os.path.exists(APP_CONFIG_PATH):
@@ -75,10 +77,6 @@ if APP_CONFIG_PATH:
75
  load_dotenv(APP_CONFIG_PATH)
76
  else: print("App config file not found at location:", APP_CONFIG_PATH)
77
 
78
-
79
-
80
-
81
-
82
  ###
83
  # AWS OPTIONS
84
  ###
@@ -149,6 +147,12 @@ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
149
  if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
150
 
151
 
 
 
 
 
 
 
152
  ###
153
  # LOGGING OPTIONS
154
  ###
@@ -182,7 +186,7 @@ DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS',
182
 
183
  CSV_ACCESS_LOG_HEADERS = get_or_create_env_var('CSV_ACCESS_LOG_HEADERS', '') # If blank, uses component labels
184
  CSV_FEEDBACK_LOG_HEADERS = get_or_create_env_var('CSV_FEEDBACK_LOG_HEADERS', '') # If blank, uses component labels
185
- CSV_USAGE_LOG_HEADERS = get_or_create_env_var('CSV_USAGE_LOG_HEADERS', '["session_hash_textbox", "doc_full_file_name_textbox", "data_full_file_name_textbox", "actual_time_taken_number", "total_page_count", "textract_query_number", "pii_detection_method", "comprehend_query_number", "cost_code", "textract_handwriting_signature", "host_name_textbox", "text_extraction_method", "is_this_a_textract_api_call"]') # If blank, uses component labels
186
 
187
  ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
188
 
@@ -310,7 +314,7 @@ COMPRESS_REDACTED_PDF = get_or_create_env_var("COMPRESS_REDACTED_PDF","False") #
310
  # APP RUN OPTIONS
311
  ###
312
 
313
- TLDEXTRACT_CACHE = get_or_create_env_var('TLDEXTRACT_CACHE', 'tld/.tld_set_snapshot')
314
  try:
315
  extract = TLDExtract(cache_dir=TLDEXTRACT_CACHE)
316
  except:
 
64
  # LOAD CONFIG FROM ENV FILE
65
  ###
66
 
67
+ CONFIG_FOLDER = get_or_create_env_var('CONFIG_FOLDER', 'config/')
68
+
69
+ ensure_folder_exists(CONFIG_FOLDER)
70
 
71
  # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/app_config.env'
72
+ APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', CONFIG_FOLDER + 'app_config.env') # e.g. config/app_config.env
73
 
74
  if APP_CONFIG_PATH:
75
  if os.path.exists(APP_CONFIG_PATH):
 
77
  load_dotenv(APP_CONFIG_PATH)
78
  else: print("App config file not found at location:", APP_CONFIG_PATH)
79
 
 
 
 
 
80
  ###
81
  # AWS OPTIONS
82
  ###
 
147
  if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
148
 
149
 
150
+ GRADIO_TEMP_DIR = get_or_create_env_var('GRADIO_TEMP_DIR', 'tmp/gradio_tmp/') # Default Gradio temp folder
151
+ MPLCONFIGDIR = get_or_create_env_var('MPLCONFIGDIR', 'tmp/matplotlib_cache/') # Matplotlib cache folder
152
+
153
+ ensure_folder_exists(GRADIO_TEMP_DIR)
154
+ ensure_folder_exists(MPLCONFIGDIR)
155
+
156
  ###
157
  # LOGGING OPTIONS
158
  ###
 
186
 
187
  CSV_ACCESS_LOG_HEADERS = get_or_create_env_var('CSV_ACCESS_LOG_HEADERS', '') # If blank, uses component labels
188
  CSV_FEEDBACK_LOG_HEADERS = get_or_create_env_var('CSV_FEEDBACK_LOG_HEADERS', '') # If blank, uses component labels
189
+ CSV_USAGE_LOG_HEADERS = get_or_create_env_var('CSV_USAGE_LOG_HEADERS', '["session_hash_textbox", "doc_full_file_name_textbox", "data_full_file_name_textbox", "actual_time_taken_number", "total_page_count", "textract_query_number", "pii_detection_method", "comprehend_query_number", "cost_code", "textract_handwriting_signature", "host_name_textbox", "text_extraction_method", "is_this_a_textract_api_call"]') # If blank, uses component labels
190
 
191
  ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
192
 
 
314
  # APP RUN OPTIONS
315
  ###
316
 
317
+ TLDEXTRACT_CACHE = get_or_create_env_var('TLDEXTRACT_CACHE', 'tmp/tld/')
318
  try:
319
  extract = TLDExtract(cache_dir=TLDEXTRACT_CACHE)
320
  except:
tools/file_conversion.py CHANGED
@@ -385,28 +385,32 @@ def convert_pymupdf_to_image_coords(pymupdf_page:Page, x1:float, y1:float, x2:fl
385
 
386
  return x1_image, y1_image, x2_image, y2_image
387
 
388
- def redact_whole_pymupdf_page(rect_height:float, rect_width:float, image:Image, page:Page, custom_colours, border:float = 5, image_dimensions:dict={}):
389
  # Small border to page that remains white
390
- border = 5
391
  # Define the coordinates for the Rect
392
  whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
393
- whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
394
 
395
- # whole_page_image_x1, whole_page_image_y1, whole_page_image_x2, whole_page_image_y2 = convert_pymupdf_to_image_coords(page, whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2, image, image_dimensions=image_dimensions)
 
 
 
 
396
 
397
  # Create new image annotation element based on whole page coordinates
398
  whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
399
 
400
  # Write whole page annotation to annotation boxes
401
  whole_page_img_annotation_box = {}
402
- whole_page_img_annotation_box["xmin"] = whole_page_x1 #whole_page_image_x1
403
- whole_page_img_annotation_box["ymin"] = whole_page_y1 #whole_page_image_y1
404
- whole_page_img_annotation_box["xmax"] = whole_page_x2 #whole_page_image_x2
405
- whole_page_img_annotation_box["ymax"] = whole_page_y2 #whole_page_image_y2
406
  whole_page_img_annotation_box["color"] = (0,0,0)
407
  whole_page_img_annotation_box["label"] = "Whole page"
408
 
409
- redact_single_box(page, whole_page_rect, whole_page_img_annotation_box, custom_colours)
 
410
 
411
  return whole_page_img_annotation_box
412
 
@@ -1292,7 +1296,13 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
1292
  df = pd.DataFrame({
1293
  "image": [anno.get("image") for anno in all_annotations],
1294
  # Ensure 'boxes' defaults to an empty list if missing or None
1295
- "boxes": [anno.get("boxes") if isinstance(anno.get("boxes"), list) else [] for anno in all_annotations]
 
 
 
 
 
 
1296
  })
1297
 
1298
  # 2. Calculate the page number using the helper function
 
385
 
386
  return x1_image, y1_image, x2_image, y2_image
387
 
388
+ def redact_whole_pymupdf_page(rect_height:float, rect_width:float, page:Page, custom_colours:bool=False, border:float = 5, redact_pdf:bool=True):
389
  # Small border to page that remains white
390
+
391
  # Define the coordinates for the Rect
392
  whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
 
393
 
394
+ # If border is a tiny value, assume that we want relative values
395
+ if border < 0.1:
396
+ whole_page_x2, whole_page_y2 = 1 - border, 1 - border # Top-right corner
397
+ else:
398
+ whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
399
 
400
  # Create new image annotation element based on whole page coordinates
401
  whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
402
 
403
  # Write whole page annotation to annotation boxes
404
  whole_page_img_annotation_box = {}
405
+ whole_page_img_annotation_box["xmin"] = whole_page_x1
406
+ whole_page_img_annotation_box["ymin"] = whole_page_y1
407
+ whole_page_img_annotation_box["xmax"] = whole_page_x2
408
+ whole_page_img_annotation_box["ymax"] = whole_page_y2
409
  whole_page_img_annotation_box["color"] = (0,0,0)
410
  whole_page_img_annotation_box["label"] = "Whole page"
411
 
412
+ if redact_pdf == True:
413
+ redact_single_box(page, whole_page_rect, whole_page_img_annotation_box, custom_colours)
414
 
415
  return whole_page_img_annotation_box
416
 
 
1296
  df = pd.DataFrame({
1297
  "image": [anno.get("image") for anno in all_annotations],
1298
  # Ensure 'boxes' defaults to an empty list if missing or None
1299
+ "boxes": [
1300
+ anno.get("boxes") if isinstance(anno.get("boxes"), list)
1301
+ else [anno.get("boxes")] if isinstance(anno.get("boxes"), dict)
1302
+ else []
1303
+ for anno in all_annotations
1304
+ ]
1305
+
1306
  })
1307
 
1308
  # 2. Calculate the page number using the helper function
tools/file_redaction.py CHANGED
@@ -1114,7 +1114,7 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
1114
  # If whole page is to be redacted, do that here
1115
  if redact_whole_page == True:
1116
 
1117
- whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5, image_dimensions=image_dimensions)
1118
  all_image_annotation_boxes.append(whole_page_img_annotation_box)
1119
 
1120
  out_annotation_boxes = {
@@ -1372,10 +1372,19 @@ def redact_image_pdf(file_path:str,
1372
  if current_loop_page == 0: page_loop_start = 0
1373
  else: page_loop_start = current_loop_page
1374
 
1375
- progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
1376
 
1377
- all_line_level_ocr_results_df_list = [all_line_level_ocr_results_df]
1378
- all_pages_decision_process_table_list = [all_pages_decision_process_table]
 
 
 
 
 
 
 
 
 
1379
 
1380
  # Go through each page
1381
  for page_no in progress_bar:
@@ -1525,7 +1534,10 @@ def redact_image_pdf(file_path:str,
1525
  'height': result.height
1526
  } for result in page_line_level_ocr_results['results']])
1527
 
1528
- all_line_level_ocr_results_df_list.append(line_level_ocr_results_df)
 
 
 
1529
 
1530
  if pii_identification_method != NO_REDACTION_PII_OPTION:
1531
  # Step 2: Analyse text and identify PII
@@ -1637,7 +1649,10 @@ def redact_image_pdf(file_path:str,
1637
  'page': reported_page_number
1638
  } for result in page_merged_redaction_bboxes])
1639
 
1640
- all_pages_decision_process_table_list.append(decision_process_table)
 
 
 
1641
 
1642
  decision_process_table = fill_missing_ids(decision_process_table)
1643
  decision_process_table.to_csv(output_folder + "decision_process_table_with_ids.csv")
@@ -1685,8 +1700,11 @@ def redact_image_pdf(file_path:str,
1685
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1686
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1687
 
1688
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
1689
- all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
 
 
 
1690
 
1691
 
1692
  current_loop_page += 1
@@ -1733,9 +1751,11 @@ def redact_image_pdf(file_path:str,
1733
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1734
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1735
 
 
 
1736
 
1737
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
1738
- all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
1739
 
1740
  return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
1741
 
@@ -1758,8 +1778,8 @@ def redact_image_pdf(file_path:str,
1758
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1759
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1760
 
1761
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
1762
- all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
1763
 
1764
  # Convert decision table and ocr results to relative coordinates
1765
  all_pages_decision_process_table = divide_coordinates_by_page_sizes(all_pages_decision_process_table, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
@@ -2002,11 +2022,11 @@ def redact_text_pdf(
2002
  tic = time.perf_counter()
2003
 
2004
  if isinstance(all_line_level_ocr_results_df, pd.DataFrame):
2005
- all_line_level_ocr_results_df_list = [all_line_level_ocr_results_df]
2006
 
2007
  if isinstance(all_pages_decision_process_table, pd.DataFrame):
2008
  # Convert decision outputs to list of dataframes:
2009
- all_pages_decision_process_table_list = [all_pages_decision_process_table]
2010
 
2011
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
2012
  out_message = "Connection to AWS Comprehend service not found."
@@ -2133,7 +2153,7 @@ def redact_text_pdf(
2133
  page_decision_process_table = create_text_redaction_process_results(page_analyser_results, page_redaction_bounding_boxes, current_loop_page)
2134
 
2135
  if not page_decision_process_table.empty:
2136
- all_pages_decision_process_table_list.append(page_decision_process_table)
2137
 
2138
  # Else, user chose not to run redaction
2139
  else:
@@ -2145,7 +2165,7 @@ def redact_text_pdf(
2145
  if not page_text_ocr_outputs.empty:
2146
  page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
2147
  page_text_ocr_outputs = page_text_ocr_outputs.loc[:, ["page", "text", "left", "top", "width", "height"]]
2148
- all_line_level_ocr_results_df_list.append(page_text_ocr_outputs)
2149
 
2150
  toc = time.perf_counter()
2151
 
@@ -2168,8 +2188,8 @@ def redact_text_pdf(
2168
  annotations_all_pages.append(page_image_annotations)
2169
 
2170
  # Write logs
2171
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
2172
- all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
2173
 
2174
 
2175
  current_loop_page += 1
@@ -2193,16 +2213,16 @@ def redact_text_pdf(
2193
  progress.close(_tqdm=progress_bar)
2194
 
2195
  # Write logs
2196
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
2197
 
2198
  return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
2199
 
2200
  # Write all page outputs
2201
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
2202
 
2203
- #print("all_line_level_ocr_results_df_list:", all_line_level_ocr_results_df_list)
2204
 
2205
- all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
2206
 
2207
  #print("all_line_level_ocr_results_df after concat:", all_line_level_ocr_results_df)
2208
 
 
1114
  # If whole page is to be redacted, do that here
1115
  if redact_whole_page == True:
1116
 
1117
+ whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, page, custom_colours, border = 5)
1118
  all_image_annotation_boxes.append(whole_page_img_annotation_box)
1119
 
1120
  out_annotation_boxes = {
 
1372
  if current_loop_page == 0: page_loop_start = 0
1373
  else: page_loop_start = current_loop_page
1374
 
1375
+ progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
1376
 
1377
+ # If there's data from a previous run (passed in via the DataFrame parameters), add it
1378
+ all_line_level_ocr_results_list = []
1379
+ all_pages_decision_process_list = []
1380
+
1381
+ if not all_line_level_ocr_results_df.empty:
1382
+ all_line_level_ocr_results_list.extend(all_line_level_ocr_results_df.to_dict('records'))
1383
+ if not all_pages_decision_process_table.empty:
1384
+ all_pages_decision_process_list.extend(all_pages_decision_process_table.to_dict('records'))
1385
+
1386
+ #all_line_level_ocr_results_list = [all_line_level_ocr_results_df.to_dict('records')]#[all_line_level_ocr_results_df]
1387
+ #all_pages_decision_process_list = [all_pages_decision_process_table.to_dict('records')]#[all_pages_decision_process_table]
1388
 
1389
  # Go through each page
1390
  for page_no in progress_bar:
 
1534
  'height': result.height
1535
  } for result in page_line_level_ocr_results['results']])
1536
 
1537
+ #all_line_level_ocr_results_list.append(line_level_ocr_results_df.to_dict('records'))
1538
+
1539
+ if not line_level_ocr_results_df.empty: # Ensure there are records to add
1540
+ all_line_level_ocr_results_list.extend(line_level_ocr_results_df.to_dict('records'))
1541
 
1542
  if pii_identification_method != NO_REDACTION_PII_OPTION:
1543
  # Step 2: Analyse text and identify PII
 
1649
  'page': reported_page_number
1650
  } for result in page_merged_redaction_bboxes])
1651
 
1652
+ #all_pages_decision_process_list.append(decision_process_table.to_dict('records'))
1653
+
1654
+ if not decision_process_table.empty: # Ensure there are records to add
1655
+ all_pages_decision_process_list.extend(decision_process_table.to_dict('records'))
1656
 
1657
  decision_process_table = fill_missing_ids(decision_process_table)
1658
  decision_process_table.to_csv(output_folder + "decision_process_table_with_ids.csv")
 
1700
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1701
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1702
 
1703
+ #all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
1704
+ #all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
1705
+
1706
+ all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list)
1707
+ all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
1708
 
1709
 
1710
  current_loop_page += 1
 
1751
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1752
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1753
 
1754
+ #all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
1755
+ #all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
1756
 
1757
+ all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list)
1758
+ all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
1759
 
1760
  return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
1761
 
 
1778
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1779
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1780
 
1781
+ all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list) #pd.concat(all_pages_decision_process_list)
1782
+ all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list) #pd.concat(all_line_level_ocr_results_list)
1783
 
1784
  # Convert decision table and ocr results to relative coordinates
1785
  all_pages_decision_process_table = divide_coordinates_by_page_sizes(all_pages_decision_process_table, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
 
2022
  tic = time.perf_counter()
2023
 
2024
  if isinstance(all_line_level_ocr_results_df, pd.DataFrame):
2025
+ all_line_level_ocr_results_list = [all_line_level_ocr_results_df]
2026
 
2027
  if isinstance(all_pages_decision_process_table, pd.DataFrame):
2028
  # Convert decision outputs to list of dataframes:
2029
+ all_pages_decision_process_list = [all_pages_decision_process_table]
2030
 
2031
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
2032
  out_message = "Connection to AWS Comprehend service not found."
 
2153
  page_decision_process_table = create_text_redaction_process_results(page_analyser_results, page_redaction_bounding_boxes, current_loop_page)
2154
 
2155
  if not page_decision_process_table.empty:
2156
+ all_pages_decision_process_list.append(page_decision_process_table)
2157
 
2158
  # Else, user chose not to run redaction
2159
  else:
 
2165
  if not page_text_ocr_outputs.empty:
2166
  page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
2167
  page_text_ocr_outputs = page_text_ocr_outputs.loc[:, ["page", "text", "left", "top", "width", "height"]]
2168
+ all_line_level_ocr_results_list.append(page_text_ocr_outputs)
2169
 
2170
  toc = time.perf_counter()
2171
 
 
2188
  annotations_all_pages.append(page_image_annotations)
2189
 
2190
  # Write logs
2191
+ all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
2192
+ all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
2193
 
2194
 
2195
  current_loop_page += 1
 
2213
  progress.close(_tqdm=progress_bar)
2214
 
2215
  # Write logs
2216
+ all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
2217
 
2218
  return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
2219
 
2220
  # Write all page outputs
2221
+ all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
2222
 
2223
+ #print("all_line_level_ocr_results_list:", all_line_level_ocr_results_list)
2224
 
2225
+ all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
2226
 
2227
  #print("all_line_level_ocr_results_df after concat:", all_line_level_ocr_results_df)
2228
 
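The pattern adopted in redact_image_pdf above is to accumulate per-page rows as plain dict records and build a single DataFrame once at the end, instead of concatenating a growing list of per-page DataFrames. A minimal sketch of the same idea (the page data is hypothetical):

import pandas as pd

all_line_level_ocr_results_list = []

for page_no in range(1, 4):
    # Stand-in for the per-page OCR results DataFrame built inside the redaction loop
    line_level_ocr_results_df = pd.DataFrame(
        [{"page": page_no, "text": f"example line on page {page_no}"}]
    )
    if not line_level_ocr_results_df.empty:  # only extend when there are records to add
        all_line_level_ocr_results_list.extend(line_level_ocr_results_df.to_dict("records"))

# One DataFrame constructed once, after the loop
all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
print(all_line_level_ocr_results_df)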
tools/find_duplicate_pages.py CHANGED
@@ -1,32 +1,19 @@
1
  import pandas as pd
2
- #import argparse
3
- #import glob
4
  import os
5
  import re
6
  from tools.helper_functions import OUTPUT_FOLDER
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sklearn.metrics.pairwise import cosine_similarity
9
- # import nltk
10
- # from nltk.corpus import stopwords
11
- # from nltk.tokenize import word_tokenize
12
- # from nltk.stem import PorterStemmer
13
- #import spacy
14
- import numpy as np
15
- import random
16
- import string
17
- from typing import List
18
  from gradio import Progress
19
-
20
- import en_core_web_lg #en_core_web_sm
 
 
21
  nlp = en_core_web_lg.load()
22
- #from tqdm import tqdm
23
-
24
- # nltk.download('punkt')
25
- # nltk.download('stopwords')
26
- # nltk.download('punkt_tab')
27
-
28
- similarity_threshold = 0.9
29
 
 
30
 
31
  def combine_ocr_output_text(input_files:List[str], output_folder:str=OUTPUT_FOLDER):
32
  """
@@ -96,31 +83,11 @@ def process_data(df:pd.DataFrame, column:str):
96
  def _clean_text(raw_text):
97
  # Remove HTML tags
98
  clean = re.sub(r'<.*?>', '', raw_text)
99
- # clean = re.sub(r'&nbsp;', ' ', clean)
100
- # clean = re.sub(r'\r\n', ' ', clean)
101
- # clean = re.sub(r'&lt;', ' ', clean)
102
- # clean = re.sub(r'&gt;', ' ', clean)
103
- # clean = re.sub(r'<strong>', ' ', clean)
104
- # clean = re.sub(r'</strong>', ' ', clean)
105
-
106
- # Replace non-breaking space \xa0 with a space
107
- # clean = clean.replace(u'\xa0', u' ')
108
- # Remove extra whitespace
109
  clean = ' '.join(clean.split())
110
-
111
- # # Tokenize the text
112
- # words = word_tokenize(clean.lower())
113
-
114
- # # Remove punctuation and numbers
115
- # words = [word for word in words if word.isalpha()]
116
-
117
- # # Remove stopwords
118
- # words = [word for word in words if word not in stop_words]
119
-
120
  # Join the cleaned words back into a string
121
  return clean
122
 
123
- # Function to apply lemmatization and remove stopwords
124
  def _apply_lemmatization(text):
125
  doc = nlp(text)
126
  # Keep only alphabetic tokens and remove stopwords
@@ -133,119 +100,497 @@ def process_data(df:pd.DataFrame, column:str):
133
 
134
  return df
135
 
136
- def identify_similar_pages(input_files: List[str], similarity_threshold: float = 0.9, output_folder:str=OUTPUT_FOLDER, progress=Progress(track_tqdm=True)):
137
- output_paths = []
138
 
139
- progress(0.1, desc="Cleaning input text")
140
 
141
- # Load and clean data
142
- df, output_files = combine_ocr_output_text(input_files)
143
- output_paths.extend(output_files)
144
- df = process_data(df, 'text') # Assume this returns 'text_clean', 'file', and 'page' columns
145
 
146
- # Vectorize text
 
 
147
  vectorizer = TfidfVectorizer()
148
- tfidf_matrix = vectorizer.fit_transform(df['text_clean'])
149
 
150
  progress(0.3, desc="Calculating text similarity")
151
 
152
- # Compute sparse cosine similarity
153
- similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False) # Keep sparse format
154
 
155
- # Extract indices of similar pages above threshold
156
- coo_matrix = similarity_matrix.tocoo()
157
- similar_pages = np.array([(i, j, v) for i, j, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data) if v > similarity_threshold])
158
 
159
- if similar_pages.size == 0:
160
- return pd.DataFrame(), output_paths # Return empty if no matches
161
 
 
162
 
 
163
 
164
- # Create a DataFrame for similar pairs
165
- similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
166
-
167
- # Remove duplicate pairs (keep one direction)
168
- similarity_df = similarity_df[similarity_df['Page1_Index'] < similarity_df['Page2_Index']]
169
 
170
- progress(0.8, desc="Mapping back results")
171
- # Map indices to metadata
172
- # index_map = df[['file', 'page', 'text']].to_dict(orient='index')
173
- # similarity_df['Page1_File'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['file'])
174
- # similarity_df['Page2_File'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['file'])
175
- # similarity_df['Page1_Page'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['page'])
176
- # similarity_df['Page2_Page'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['page'])
177
- # similarity_df['Page1_Text'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['text'][0:200])
178
- # similarity_df['Page2_Text'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['text'][0:200])
179
 
180
- # Create a DataFrame with the metadata
181
- metadata_df = df[['file', 'page', 'text']].reset_index()
182
 
183
- # Merge to get the metadata for Page1
184
- similarity_df = similarity_df.merge(metadata_df, left_on='Page1_Index', right_on='index', suffixes=('', '_Page1'))
185
- similarity_df = similarity_df.rename(columns={'file': 'Page1_File', 'page': 'Page1_Page', 'text': 'Page1_Text'})
186
 
187
- # Merge to get the metadata for Page2
188
- similarity_df = similarity_df.merge(metadata_df, left_on='Page2_Index', right_on='index', suffixes=('', '_Page2'))
189
- similarity_df = similarity_df.rename(columns={'file': 'Page2_File', 'page': 'Page2_Page', 'text': 'Page2_Text'})
190
 
191
- # Optionally, drop the index columns if not needed
192
- #similarity_df = similarity_df.drop(columns=['index_Page1', 'index_Page2'])
193
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
- similarity_df["Similarity_Score"] = similarity_df["Similarity_Score"].round(3)
196
 
197
- # Sort results
198
- similarity_df_out = similarity_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
199
- similarity_df_out = similarity_df_out.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page", "Similarity_Score"], ascending=[True, True, True, True, False])
200
 
201
- similarity_df_out['Page1_Text'] = similarity_df_out['Page1_Text'][0:100]
202
- similarity_df_out['Page2_Text'] = similarity_df_out['Page2_Text'][0:100]
 
 
 
 
 
203
 
204
- progress(0.8, desc="Saving output files")
205
 
206
- # Save results
207
- similarity_file_output_path = output_folder + 'page_similarity_results.csv'
208
- similarity_df_out.to_csv(similarity_file_output_path, index=False)
209
- output_paths.append(similarity_file_output_path)
210
 
211
- # Save per-file redaction lists
212
- for redact_file in similarity_df_out['Page2_File'].unique():
213
- output_file_name = output_folder + redact_file + "_whole_page.csv"
214
- whole_pages_to_redact_df = similarity_df_out.loc[similarity_df_out['Page2_File'] == redact_file, ['Page2_Page']].drop_duplicates(['Page2_Page']).sort_values('Page2_Page')
215
- whole_pages_to_redact_df.to_csv(output_file_name, header=False, index=False)
216
- output_paths.append(output_file_name)
 
 
 
 
217
 
218
- return similarity_df_out, output_paths
219
 
220
- # Perturb text
221
- # Apply the perturbation function with a 10% error probability
222
- def perturb_text_with_errors(series:pd.Series):
 
223
 
224
- def _perturb_text(text, error_probability=0.1):
225
- words = text.split() # Split text into words
226
- perturbed_words = []
227
 
228
- for word in words:
229
- if random.random() < error_probability: # Add a random error
230
- perturbation_type = random.choice(['char_error', 'extra_space', 'extra_punctuation'])
231
-
232
- if perturbation_type == 'char_error': # Introduce a character error
233
- idx = random.randint(0, len(word) - 1)
234
- char = random.choice(string.ascii_lowercase) # Add a random letter
235
- word = word[:idx] + char + word[idx:]
236
-
237
- elif perturbation_type == 'extra_space': # Add extra space around a word
238
- word = ' ' + word + ' '
239
 
240
- elif perturbation_type == 'extra_punctuation': # Add punctuation to the word
241
- punctuation = random.choice(string.punctuation)
242
- idx = random.randint(0, len(word)) # Insert punctuation randomly
243
- word = word[:idx] + punctuation + word[idx:]
244
 
245
- perturbed_words.append(word)
246
-
247
- return ' '.join(perturbed_words)
248
 
249
- series = series.apply(lambda x: _perturb_text(x, error_probability=0.1))
250
 
251
- return series
 
1
  import pandas as pd
 
 
2
  import os
3
  import re
4
  from tools.helper_functions import OUTPUT_FOLDER
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
  from sklearn.metrics.pairwise import cosine_similarity
7
+ from typing import List, Tuple
8
+ import gradio as gr
 
 
 
 
 
 
 
9
  from gradio import Progress
10
+ from pathlib import Path
11
+ from pymupdf import Document
12
+ from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe
13
+ import en_core_web_lg
14
  nlp = en_core_web_lg.load()
15
 
16
+ similarity_threshold = 0.95
17
 
18
  def combine_ocr_output_text(input_files:List[str], output_folder:str=OUTPUT_FOLDER):
19
  """
 
83
  def _clean_text(raw_text):
84
  # Remove HTML tags
85
  clean = re.sub(r'<.*?>', '', raw_text)
86
  clean = ' '.join(clean.split())
87
  # Join the cleaned words back into a string
88
  return clean
89
 
90
+ # Function to apply lemmatisation and remove stopwords
91
  def _apply_lemmatization(text):
92
  doc = nlp(text)
93
  # Keep only alphabetic tokens and remove stopwords
 
100
 
101
  return df
102
 
103
+ def map_metadata_single_page(similarity_df:pd.DataFrame, metadata_source_df:pd.DataFrame, preview_length:int=200):
104
+ """Helper to map metadata for single page results."""
105
+ metadata_df = metadata_source_df[['file', 'page', 'text']]
106
+ results_df = similarity_df.merge(metadata_df, left_on='Page1_Index', right_index=True)\
107
+ .rename(columns={'file': 'Page1_File', 'page': 'Page1_Page', 'text': 'Page1_Text'})
108
+ results_df = results_df.merge(metadata_df, left_on='Page2_Index', right_index=True, suffixes=('_1', '_2'))\
109
+ .rename(columns={'file': 'Page2_File', 'page': 'Page2_Page', 'text': 'Page2_Text'})
110
+ results_df["Similarity_Score"] = results_df["Similarity_Score"].round(3)
111
+ final_df = results_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
112
+ final_df = final_df.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page"])
113
+ final_df['Page1_Text'] = final_df['Page1_Text'].str[:preview_length]
114
+ final_df['Page2_Text'] = final_df['Page2_Text'].str[:preview_length]
115
+ return final_df
116
+
117
+ def map_metadata_subdocument(subdocument_df:pd.DataFrame, metadata_source_df:pd.DataFrame, preview_length:int=200):
118
+ """Helper to map metadata for subdocument results."""
119
+ metadata_df = metadata_source_df[['file', 'page', 'text']]
120
 
121
+ subdocument_df = subdocument_df.merge(metadata_df, left_on='Page1_Start_Index', right_index=True)\
122
+ .rename(columns={'file': 'Page1_File', 'page': 'Page1_Start_Page', 'text': 'Page1_Text'})
123
+ subdocument_df = subdocument_df.merge(metadata_df[['page']], left_on='Page1_End_Index', right_index=True)\
124
+ .rename(columns={'page': 'Page1_End_Page'})
125
+ subdocument_df = subdocument_df.merge(metadata_df, left_on='Page2_Start_Index', right_index=True)\
126
+ .rename(columns={'file': 'Page2_File', 'page': 'Page2_Start_Page', 'text': 'Page2_Text'})
127
+ subdocument_df = subdocument_df.merge(metadata_df[['page']], left_on='Page2_End_Index', right_index=True)\
128
+ .rename(columns={'page': 'Page2_End_Page'})
129
+
130
+ cols = ['Page1_File', 'Page1_Start_Page', 'Page1_End_Page',
131
+ 'Page2_File', 'Page2_Start_Page', 'Page2_End_Page',
132
+ 'Match_Length', 'Page1_Text', 'Page2_Text']
133
+
134
+ # Add Avg_Similarity if it exists (it won't for greedy match unless we add it)
135
+ if 'Avg_Similarity' in subdocument_df.columns:
136
+ subdocument_df['Avg_Similarity'] = subdocument_df['Avg_Similarity'].round(3)
137
+ cols.insert(7, 'Avg_Similarity')
138
+
139
+ final_df = subdocument_df[cols]
140
+ final_df = final_df.sort_values(['Page1_File', 'Page1_Start_Page', 'Page2_File', 'Page2_Start_Page'])
141
+ final_df['Page1_Text'] = final_df['Page1_Text'].str[:preview_length]
142
+ final_df['Page2_Text'] = final_df['Page2_Text'].str[:preview_length]
143
+
144
+ return final_df
145
+
146
+ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str) -> list:
147
+ """
148
+ Saves the main results DataFrame and generates per-file redaction lists.
149
+ This function is extracted to be reusable.
150
+
151
+ Args:
152
+ final_df (pd.DataFrame): The DataFrame containing the final match results.
153
+ output_folder (str): The folder to save the output files.
154
+
155
+ Returns:
156
+ list: A list of paths to all generated files.
157
+ """
158
+ output_paths = []
159
+ output_folder_path = Path(output_folder)
160
+ output_folder_path.mkdir(exist_ok=True)
161
+
162
+ if final_df.empty:
163
+ print("No matches to save.")
164
+ return []
165
+
166
+ # 1. Save the main results DataFrame
167
+ similarity_file_output_path = output_folder_path / 'page_similarity_results.csv'
168
+ final_df.to_csv(similarity_file_output_path, index=False)
169
+
170
+ output_paths.append(str(similarity_file_output_path))
171
+ print(f"Main results saved to {similarity_file_output_path}")
172
+
173
+ # 2. Save per-file redaction lists
174
+ # Use 'Page2_File' as the source of duplicate content
175
+ grouping_col = 'Page2_File'
176
+ if grouping_col not in final_df.columns:
177
+ print("Warning: 'Page2_File' column not found. Cannot generate redaction lists.")
178
+ return output_paths
179
+
180
+ for redact_file, group in final_df.groupby(grouping_col):
181
+ output_file_name_stem = Path(redact_file).stem
182
+ output_file_path = output_folder_path / f"{output_file_name_stem}_pages_to_redact.csv"
183
+
184
+ all_pages_to_redact = set()
185
+ is_subdocument_match = 'Page2_Start_Page' in group.columns
186
+
187
+ if is_subdocument_match:
188
+ for _, row in group.iterrows():
189
+ pages_in_range = range(int(row['Page2_Start_Page']), int(row['Page2_End_Page']) + 1)
190
+ all_pages_to_redact.update(pages_in_range)
191
+ else:
192
+ pages = group['Page2_Page'].unique()
193
+ all_pages_to_redact.update(pages)
194
+
195
+ if all_pages_to_redact:
196
+ redaction_df = pd.DataFrame(sorted(list(all_pages_to_redact)), columns=['Page_to_Redact'])
197
+ redaction_df.to_csv(output_file_path, header=False, index=False)
198
 
199
+ output_paths.append(str(output_file_path))
200
+ print(f"Redaction list for {redact_file} saved to {output_file_path}")
201
+
202
+ return output_paths
203
+
204
+ def identify_similar_pages(
205
+ df_combined: pd.DataFrame,
206
+ similarity_threshold: float = 0.9,
207
+ min_word_count: int = 10,
208
+ min_consecutive_pages: int = 1,
209
+ greedy_match: bool = False, # NEW parameter
210
+ output_folder: str = OUTPUT_FOLDER,
211
+ progress=Progress(track_tqdm=True)
212
+ ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
213
+ """
214
+ Identifies similar pages with three possible strategies:
215
+ 1. Single Page: If greedy_match=False and min_consecutive_pages=1.
216
+ 2. Fixed-Length Subdocument: If greedy_match=False and min_consecutive_pages > 1.
217
+ 3. Greedy Consecutive Match: If greedy_match=True.
218
+ """
219
+
220
+ output_paths = []
221
+ progress(0.1, desc="Processing and filtering text")
222
+ df = process_data(df_combined, 'text')
223
+ df['word_count'] = df['text_clean'].str.split().str.len().fillna(0)
224
+ original_row_count = len(df)
225
+ df_filtered = df[df['word_count'] >= min_word_count].copy()
226
+ df_filtered.reset_index(drop=True, inplace=True)
227
+
228
+ print(f"Filtered out {original_row_count - len(df_filtered)} pages with fewer than {min_word_count} words.")
229
 
230
+ if len(df_filtered) < 2:
231
+ return pd.DataFrame(), [], df_combined
232
+
233
  vectorizer = TfidfVectorizer()
234
+ tfidf_matrix = vectorizer.fit_transform(df_filtered['text_clean'])
235
 
236
  progress(0.3, desc="Calculating text similarity")
237
+ similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
238
+ coo_matrix = similarity_matrix.tocoo()
239
+
240
+ # Create a DataFrame of all individual page pairs above the threshold.
241
+ # This is the base for all three matching strategies.
242
+ similar_pages = [
243
+ (r, c, v) for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
244
+ if r < c and v >= similarity_threshold
245
+ ]
246
+
247
+ if not similar_pages:
248
+ return pd.DataFrame(), [], df_combined
249
+
250
+ base_similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
251
 
252
+ progress(0.6, desc="Aggregating results based on matching strategy")
253
+
254
+ if greedy_match:
255
+ print("Finding matches using greedy consecutive strategy.")
256
+
257
+ # A set of pairs for fast lookups of (page1_idx, page2_idx)
258
+ valid_pairs_set = set(zip(base_similarity_df['Page1_Index'], base_similarity_df['Page2_Index']))
259
+
260
+ # Keep track of indices that have been used in a sequence
261
+ consumed_indices_1 = set()
262
+ consumed_indices_2 = set()
263
+
264
+ all_sequences = []
265
 
266
+ # Iterate through all potential starting pairs, sorted for consistent results
267
+ sorted_pairs = base_similarity_df.sort_values(['Page1_Index', 'Page2_Index'])
 
268
 
269
+ for _, row in sorted_pairs.iterrows():
270
+ start_idx1, start_idx2 = int(row['Page1_Index']), int(row['Page2_Index'])
271
+
272
+ # If this pair has already been consumed by a previous sequence, skip it
273
+ if start_idx1 in consumed_indices_1 or start_idx2 in consumed_indices_2:
274
+ continue
275
+
276
+ # This is a new sequence, start expanding it
277
+ current_sequence = [(start_idx1, start_idx2)]
278
+ k = 1
279
+ while True:
280
+ next_idx1 = start_idx1 + k
281
+ next_idx2 = start_idx2 + k
282
+
283
+ # Check if the next pair in the sequence is a valid match
284
+ if (next_idx1, next_idx2) in valid_pairs_set and \
285
+ next_idx1 not in consumed_indices_1 and \
286
+ next_idx2 not in consumed_indices_2:
287
+ current_sequence.append((next_idx1, next_idx2))
288
+ k += 1
289
+ else:
290
+ # The sequence has ended
291
+ break
292
+
293
+ # Record the found sequence and mark all its pages as consumed
294
+ sequence_indices_1 = [p[0] for p in current_sequence]
295
+ sequence_indices_2 = [p[1] for p in current_sequence]
296
+
297
+ all_sequences.append({
298
+ 'Page1_Start_Index': sequence_indices_1[0], 'Page1_End_Index': sequence_indices_1[-1],
299
+ 'Page2_Start_Index': sequence_indices_2[0], 'Page2_End_Index': sequence_indices_2[-1],
300
+ 'Match_Length': len(current_sequence)
301
+ })
302
+
303
+ consumed_indices_1.update(sequence_indices_1)
304
+ consumed_indices_2.update(sequence_indices_2)
305
+
306
+ if not all_sequences:
307
+ return pd.DataFrame(), [], df_combined
308
+
309
+ subdocument_df = pd.DataFrame(all_sequences)
310
+ # We can add back the average similarity if needed, but it requires more lookups.
311
+ # For now, we'll omit it for simplicity in the greedy approach.
312
+ # ... (The rest is metadata mapping, same as the subdocument case)
313
+
314
+ elif min_consecutive_pages > 1:
315
+ # --- STRATEGY 2: Fixed-Length Subdocument Matching ---
316
+ print(f"Finding consecutive page matches (min_consecutive_pages > 1)")
317
+ similarity_df = base_similarity_df.copy()
318
+ similarity_df.sort_values(['Page1_Index', 'Page2_Index'], inplace=True)
319
+ is_consecutive = (similarity_df['Page1_Index'].diff() == 1) & (similarity_df['Page2_Index'].diff() == 1)
320
+ block_id = is_consecutive.eq(False).cumsum()
321
+ grouped = similarity_df.groupby(block_id)
322
+ agg_results = grouped.agg(
323
+ Page1_Start_Index=('Page1_Index', 'first'), Page2_Start_Index=('Page2_Index', 'first'),
324
+ Page1_End_Index=('Page1_Index', 'last'), Page2_End_Index=('Page2_Index', 'last'),
325
+ Match_Length=('Page1_Index', 'size'), Avg_Similarity=('Similarity_Score', 'mean')
326
+ ).reset_index(drop=True)
327
+ subdocument_df = agg_results[agg_results['Match_Length'] >= min_consecutive_pages].copy()
328
+ if subdocument_df.empty: return pd.DataFrame(), [], df_combined
329
+
330
+ else:
331
+ # --- STRATEGY 1: Single Page Matching ---
332
+ print(f"Finding single page matches (min_consecutive_pages=1)")
333
+ final_df = map_metadata_single_page(base_similarity_df, df_filtered)
334
+ # The rest of the logic (saving files) is handled after this if/else block
335
+ pass # The final_df is already prepared
336
+
337
+ # --- Map metadata and format output ---
338
+ # This block now handles the output for both subdocument strategies (2 and 3)
339
+ if greedy_match or min_consecutive_pages > 1:
340
+ final_df = map_metadata_subdocument(subdocument_df, df_filtered)
341
 
342
+ progress(0.8, desc="Saving output files")
343
 
344
+ output_paths = save_results_and_redaction_lists(final_df, output_folder)
345
 
346
+ return final_df, output_paths, df_combined
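To make the greedy consecutive strategy above concrete, here is a small self-contained sketch that mirrors the sequence-expansion loop in identify_similar_pages, run on a hard-coded set of similar page pairs (the indices are hypothetical):

# Pairs (3,10), (4,11), (5,12) collapse into one subdocument match of length 3;
# (20,30) remains a single-page match.
valid_pairs_set = {(3, 10), (4, 11), (5, 12), (20, 30)}
consumed_1, consumed_2, sequences = set(), set(), []

for start_1, start_2 in sorted(valid_pairs_set):
    if start_1 in consumed_1 or start_2 in consumed_2:
        continue
    sequence = [(start_1, start_2)]
    k = 1
    while (start_1 + k, start_2 + k) in valid_pairs_set and \
          start_1 + k not in consumed_1 and start_2 + k not in consumed_2:
        sequence.append((start_1 + k, start_2 + k))
        k += 1
    sequences.append({
        "Page1_Start_Index": sequence[0][0], "Page1_End_Index": sequence[-1][0],
        "Page2_Start_Index": sequence[0][1], "Page2_End_Index": sequence[-1][1],
        "Match_Length": len(sequence),
    })
    consumed_1.update(p[0] for p in sequence)
    consumed_2.update(p[1] for p in sequence)

print(sequences)
# [{'Page1_Start_Index': 3, 'Page1_End_Index': 5, ..., 'Match_Length': 3},
#  {'Page1_Start_Index': 20, ..., 'Match_Length': 1}]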
 
 
 
 
347
 
348
+ # ==============================================================================
349
+ # GRADIO HELPER FUNCTIONS
350
+ # ==============================================================================
 
 
 
 
 
 
351
 
352
+ # full_data:pd.DataFrame,
353
+ def handle_selection_and_preview(evt: gr.SelectData, results_df:pd.DataFrame, full_duplicate_data_by_file: dict):
354
+ """
355
+ This single function handles a user selecting a row. It:
356
+ 1. Determines the selected row index.
357
+ 2. Calls the show_page_previews function to get the text data.
358
+ 3. Returns all the necessary outputs for the UI.
359
+ """
360
+ # If the user deselects, the event might be None.
361
+ if not evt:
362
+ return None, None, None # Clear state and both preview panes
363
 
364
+ # 1. Get the selected index
365
+ selected_index = evt.index[0]
 
366
 
367
+ # 2. Get the preview data
368
+ page1_data, page2_data = show_page_previews(full_duplicate_data_by_file, results_df, evt)
 
369
 
370
+ # 3. Return all three outputs in the correct order
371
+ return selected_index, page1_data, page2_data
372
 
373
+ def exclude_match(results_df:pd.DataFrame, selected_index:int, output_folder="./output/"):
374
+ """
375
+ Removes a selected row from the results DataFrame, regenerates output files,
376
+ and clears the text preview panes.
377
+ """
378
+ if selected_index is None:
379
+ gr.Warning("No match selected. Please click on a row in the table first.")
380
+ # Return the original dataframe and update=False for the files
381
+ return results_df, gr.update(), None, None
382
+
383
+ if results_df.empty:
384
+ gr.Warning("No duplicate page results found, nothing to exclude.")
385
+ return results_df, gr.update(), None, None
386
 
387
+ # Drop the selected row
388
+ updated_df = results_df.drop(selected_index).reset_index(drop=True)
389
+
390
+ # Recalculate all output files using the helper function
391
+ new_output_paths = save_results_and_redaction_lists(updated_df, output_folder)
392
+
393
+ gr.Info(f"Match at row {selected_index} excluded. Output files have been updated.")
394
+
395
+ # Return the updated dataframe, the new file list, and clear the preview panes
396
+ return updated_df, new_output_paths, None, None
397
 
398
+ def run_duplicate_analysis(files:list[str], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool, preview_length:int=500, progress=gr.Progress(track_tqdm=True)):
399
+ """
400
+ Wrapper function updated to include the 'greedy_match' boolean.
401
+ """
402
+ if not files:
403
+ gr.Warning("Please upload files to analyze.")
404
+ return None, None, None
405
+
406
+ progress(0, desc="Combining input files...")
407
+ df_combined, _ = combine_ocr_output_text(files)
408
+
409
+ if df_combined.empty:
410
+ gr.Warning("No data found in the uploaded files.")
411
+ return None, None, None
412
+
413
+ # Call the main analysis function with the new parameter
414
+ results_df, output_paths, full_df = identify_similar_pages(
415
+ df_combined=df_combined,
416
+ similarity_threshold=threshold,
417
+ min_word_count=min_words,
418
+ min_consecutive_pages=int(min_consecutive),
419
+ greedy_match=greedy_match,
420
+ progress=progress
421
+ )
422
+
423
+ # Clip text to the first preview_length characters for the preview
424
+ full_df['text'] = full_df['text'].str[:preview_length]
425
+
426
+ # Preprocess full_data (without preview text) for fast access (run once)
427
+ full_data_by_file = {
428
+ file: df.sort_values('page').set_index('page')
429
+ for file, df in full_df.drop(["text_clean"],axis=1).groupby('file')
430
+ }
431
+
432
+ if results_df.empty:
433
+ gr.Info(f"No duplicate pages found, no results returned.")
434
+
435
+ return results_df, output_paths, full_data_by_file # full_df,
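run_duplicate_analysis is written to be bound directly to a Gradio event. A hypothetical wiring sketch follows; the component names and layout are assumptions for illustration, not taken from the app code:

import gradio as gr
from tools.find_duplicate_pages import run_duplicate_analysis

with gr.Blocks() as demo:
    in_files = gr.File(file_count="multiple", label="OCR output CSVs")
    threshold = gr.Slider(0.5, 1.0, value=0.9, label="Similarity threshold")
    min_words = gr.Number(value=10, label="Minimum words per page")
    min_consecutive = gr.Number(value=1, label="Minimum consecutive pages")
    greedy = gr.Checkbox(value=False, label="Greedy consecutive matching")
    results = gr.Dataframe(label="Duplicate page matches")
    out_files = gr.File(label="Output files", file_count="multiple")
    full_data_state = gr.State()

    # Outputs mirror the function's return values: results table, file paths, per-file page data
    gr.Button("Find duplicate pages").click(
        run_duplicate_analysis,
        inputs=[in_files, threshold, min_words, min_consecutive, greedy],
        outputs=[results, out_files, full_data_state],
    )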
436
 
437
+ def show_page_previews(full_data_by_file: dict, results_df: pd.DataFrame, evt: gr.SelectData, preview_length:int=500):
438
+ """
439
+ Optimized version using pre-partitioned and indexed full_data.
440
+ Triggered when a user selects a row in the results DataFrame.
441
+ """
442
+ if not full_data_by_file or results_df is None or not evt:
443
+ return None, None
444
 
445
+ selected_row = results_df.iloc[evt.index[0], :]
446
 
447
+ is_subdocument_match = 'Page1_Start_Page' in selected_row
 
 
 
448
 
449
+ if is_subdocument_match:
450
+ file1, start1, end1 = selected_row['Page1_File'], selected_row['Page1_Start_Page'], selected_row['Page1_End_Page']
451
+ file2, start2, end2 = selected_row['Page2_File'], selected_row['Page2_Start_Page'], selected_row['Page2_End_Page']
452
+
453
+ page1_data = full_data_by_file[file1].loc[start1:end1, ['text']].reset_index()
454
+ page2_data = full_data_by_file[file2].loc[start2:end2, ['text']].reset_index()
455
+
456
+ else:
457
+ file1, page1 = selected_row['Page1_File'], selected_row['Page1_Page']
458
+ file2, page2 = selected_row['Page2_File'], selected_row['Page2_Page']
459
 
460
+ page1_data = full_data_by_file[file1].loc[[page1], ['text']].reset_index()
461
+ page2_data = full_data_by_file[file2].loc[[page2], ['text']].reset_index()
462
+
463
+ page1_data['text'] = page1_data['text'].str[:preview_length]
464
+ page2_data['text'] = page2_data['text'].str[:preview_length]
465
+
466
+ return page1_data[['page', 'text']], page2_data[['page', 'text']]
467
+
468
+ def apply_whole_page_redactions_from_list(duplicate_page_numbers_df:pd.DataFrame, doc_file_name_with_extension_textbox:str, review_file_state:pd.DataFrame, duplicate_output_paths:list[str], pymupdf_doc:object, page_sizes:list[dict], all_existing_annotations:list[dict]):
469
+ '''
470
+ Take a list of suggested whole pages to redact and apply them to the review file data for the PDF currently under review.
471
+ '''
472
+ # Create a copy of annotations to avoid modifying the original
473
+ all_annotations = all_existing_annotations.copy()
474
 
475
+ if not pymupdf_doc:
476
+ print("Warning: No document file currently under review. Please upload a document on the 'Review redactions' tab to apply whole page redactions.")
477
+ raise Warning("No document file currently under review. Please upload a document on the 'Review redactions' tab to apply whole page redactions.")
478
+ return review_file_state, all_annotations
479
 
480
+ # Initialize list of pages to redact
481
+ list_whole_pages_to_redact = []
482
+
483
+ # Get list of pages to redact from either dataframe or file
484
+ if not duplicate_page_numbers_df.empty:
485
+ list_whole_pages_to_redact = duplicate_page_numbers_df.iloc[:, 0].tolist()
486
+ elif duplicate_output_paths:
487
+ expected_duplicate_pages_to_redact_name = f"{doc_file_name_with_extension_textbox}"
488
+ whole_pages_list = pd.DataFrame() # Initialize empty DataFrame
489
 
490
+ for output_file in duplicate_output_paths:
491
+ # Note: output_file.name might not be available if output_file is just a string path
492
+ # If it's a Path object or similar, .name is fine. Otherwise, parse from string.
493
+ file_name_from_path = output_file.split('/')[-1] if isinstance(output_file, str) else output_file.name
494
+ if expected_duplicate_pages_to_redact_name in file_name_from_path:
495
+ whole_pages_list = pd.read_csv(output_file, header=None) # Use output_file directly if it's a path
496
+ break
497
+
498
+ if not whole_pages_list.empty:
499
+ list_whole_pages_to_redact = whole_pages_list.iloc[:, 0].tolist()
500
+
501
+ # Convert to set to remove duplicates, then back to list
502
+ list_whole_pages_to_redact = list(set(list_whole_pages_to_redact))
503
+
504
+ if not list_whole_pages_to_redact:
505
+ # Assuming gr is defined (e.g., gradio)
506
+ print("No relevant list of whole pages to redact found, returning inputs.")
507
+ raise Warning("Warning: No relevant list of whole pages to redact found, returning inputs.")
508
+ return review_file_state, all_existing_annotations
509
+
510
+ new_annotations = []
511
+
512
+ # Process each page for redaction
513
+ for page in list_whole_pages_to_redact:
514
+ try:
515
+ page_index = int(page) - 1
516
+ if page_index < 0 or page_index >= len(pymupdf_doc):
517
+ print(f"Page {page} is out of bounds for a document with {len(pymupdf_doc)} pages, skipping.")
518
+ continue
519
 
520
+ pymupdf_page = pymupdf_doc[page_index]
521
+
522
+ # Find the matching page size dictionary
523
+ page_size = next((size for size in page_sizes if size["page"] == int(page)), None)
524
 
525
+ if not page_size:
526
+ print(f"Page {page} not found in page_sizes object, skipping.")
527
+ continue
528
+
529
+ rect_height = page_size["cropbox_height"]
530
+ rect_width = page_size["cropbox_width"]
531
+ image = page_size["image_path"] # This `image` likely represents the page identifier
532
+
533
+ # Create the whole page redaction box
534
+ annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, pymupdf_page, border=0.005, redact_pdf=False)
535
+
536
+ # Find existing annotation for this image/page
537
+ current_page_existing_boxes_group = next((annot_group for annot_group in all_annotations if annot_group["image"] == image), None)
538
+
539
+ new_annotation_group = {
540
+ "image": image,
541
+ "boxes": [annotation_box]
542
+ }
543
+
544
+ if current_page_existing_boxes_group:
545
+ # Check if we already have a whole page redaction for this page
546
+ if not any(box["label"] == "Whole page" for box in current_page_existing_boxes_group["boxes"]):
547
+ current_page_existing_boxes_group["boxes"].append(annotation_box)
548
+
549
+ else:
550
+ # Optional: Print a message if a whole-page redaction already exists for this page
551
+ print(f"Whole page redaction for page {page} already exists in annotations, skipping addition.")
552
+ pass
553
+ else: # Create new annotation entry
554
+
555
+ all_annotations.append(new_annotation_group)
556
+
557
+ new_annotations.append(new_annotation_group)
558
+
559
+ except Exception as e:
560
+ print(f"Error processing page {page}: {str(e)}")
561
+ continue
562
+
563
+ # Convert annotations to dataframe and combine with existing review file
564
+ whole_page_review_file = convert_annotation_data_to_dataframe(new_annotations)
565
+
566
+ # Ensure all required columns are present in both DataFrames before concat
567
+ # This is a common point of error if DFs have different schemas
568
+ expected_cols = ['image', 'page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
569
+
570
+ for col in expected_cols:
571
+ if col not in review_file_state.columns:
572
+ review_file_state[col] = None # Or an appropriate default value
573
+ if col not in whole_page_review_file.columns:
574
+ whole_page_review_file[col] = None
575
+
576
+ review_file_out = pd.concat([review_file_state, whole_page_review_file], ignore_index=True)
577
+ review_file_out = review_file_out.sort_values(by=["page", "ymin", "xmin"])
578
+
579
+ # --- Remove duplicate entries from the final DataFrame ---
580
+ dedup_subset_cols = ['page', 'label', 'text', 'id']
581
+
582
+ # Ensure these columns exist before trying to use them as subset for drop_duplicates
583
+ if all(col in review_file_out.columns for col in dedup_subset_cols):
584
+ review_file_out = review_file_out.drop_duplicates(
585
+ subset=dedup_subset_cols,
586
+ keep='first' # Keep the first occurrence of a duplicate redaction
587
+ )
588
+ else:
589
+ print(f"Warning: Not all columns required for de-duplication ({dedup_subset_cols}) are present in review_file_out. Skipping specific de-duplication.")
590
+ # You might want a fallback or to inspect what's missing
591
+
592
+ review_file_out.to_csv(OUTPUT_FOLDER + "review_file_out_after_whole_page.csv")
593
 
594
+ gr.Info("Successfully created whole page redactions. Go to the 'Review redactions' tab to see them.")
595
 
596
+ return review_file_out, all_annotations
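For reference, the annotation structure that apply_whole_page_redactions_from_list builds for each duplicate page looks like the sketch below. The values are hypothetical; the box keys match those set by redact_whole_pymupdf_page, here with the relative border of 0.005 used above:

# One annotation group per page image, containing a single whole-page box
whole_page_box = {
    "xmin": 0.005, "ymin": 0.005,   # relative coordinates, border = 0.005
    "xmax": 0.995, "ymax": 0.995,
    "color": (0, 0, 0),
    "label": "Whole page",
}

new_annotation_group = {
    "image": "input/example_doc/page_3.png",  # hypothetical page image path from page_sizes
    "boxes": [whole_page_box],
}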
tools/helper_functions.py CHANGED
@@ -146,6 +146,14 @@ def ensure_output_folder_exists(output_folder:str):
146
  else:
147
  print(f"The {output_folder} folder already exists.")
148
 
 
 
 
 
 
 
 
 
149
  def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
150
  '''
151
  When file is loaded, update the column dropdown choices and write to relevant data states.
 
146
  else:
147
  print(f"The {output_folder} folder already exists.")
148
 
149
+ def _get_env_list(env_var_name: str) -> List[str]:
150
+ """Parses a bracketed, comma-separated environment variable value (e.g. "['A', 'B']") into a list of strings."""
151
+ value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
152
+ if not value:
153
+ return []
154
+ # Split by comma and filter out any empty strings that might result from extra commas
155
+ return [s.strip() for s in value.split(',') if s.strip()]
156
+
157
  def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
158
  '''
159
  When file is loaded, update the column dropdown choices and write to relevant data states.
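Note that the new _get_env_list helper above expects the raw environment variable value rather than the variable name: the surrounding brackets are stripped, quotes are removed, and the remainder is split on commas. A standalone sketch of that behaviour (the example value is hypothetical):

from typing import List

def _get_env_list(env_value: str) -> List[str]:
    # Mirrors the helper above: strip the surrounding brackets, drop quotes,
    # then split on commas, ignoring empty entries
    value = env_value[1:-1].strip().replace('"', '').replace("'", "")
    if not value:
        return []
    return [s.strip() for s in value.split(',') if s.strip()]

print(_get_env_list("['NAME', 'EMAIL_ADDRESS', 'PHONE_NUMBER']"))
# ['NAME', 'EMAIL_ADDRESS', 'PHONE_NUMBER']
print(_get_env_list("[]"))
# []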
tools/redaction_review.py CHANGED
@@ -180,7 +180,7 @@ def update_annotator_page_from_review_df(
180
  ) -> Tuple[object, List[dict], int, List[dict], pd.DataFrame, int]: # Correcting return types based on usage
181
  '''
182
  Update the visible annotation object and related objects with the latest review file information,
183
- optimizing by processing only the current page's data.
184
  '''
185
  # Assume current_image_annotations_state is List[dict] and current_page_annotator is dict
186
  out_image_annotations_state: List[dict] = list(current_image_annotations_state) # Make a copy to avoid modifying input in place
@@ -220,7 +220,6 @@ def update_annotator_page_from_review_df(
220
  else:
221
  print("Warning: Page sizes DataFrame became empty after processing.")
222
 
223
- # --- OPTIMIZATION: Process only the current page's data from review_df ---
224
  if not review_df.empty:
225
  # Filter review_df for the current page
226
  # Ensure 'page' column in review_df is comparable to page_num_reported
@@ -1040,9 +1039,12 @@ def reset_dropdowns(df:pd.DataFrame):
1040
 
1041
  return recogniser_entities_drop, text_entities_drop, page_entities_drop
1042
 
 
 
 
1043
  def df_select_callback_dataframe_row(df: pd.DataFrame, evt: gr.SelectData):
1044
 
1045
- row_value_page = evt.row_value[0] # This is the page number value
1046
  row_value_label = evt.row_value[1] # This is the label number value
1047
  row_value_text = evt.row_value[2] # This is the text number value
1048
  row_value_id = evt.row_value[3] # This is the text number value
@@ -1072,13 +1074,22 @@ def df_select_callback_cost(df: pd.DataFrame, evt: gr.SelectData):
1072
 
1073
  def df_select_callback_ocr(df: pd.DataFrame, evt: gr.SelectData):
1074
 
1075
- row_value_page = evt.row_value[0] # This is the page_number value
1076
  row_value_text = evt.row_value[1] # This is the text contents
1077
 
1078
  row_value_df = pd.DataFrame(data={"page":[row_value_page], "text":[row_value_text]})
1079
 
1080
  return row_value_page, row_value_df
1081
 
 
 
 
 
 
 
 
 
 
1082
  def get_all_rows_with_same_text(df: pd.DataFrame, text: str):
1083
  '''
1084
  Get all rows with the same text as the selected row
 
180
  ) -> Tuple[object, List[dict], int, List[dict], pd.DataFrame, int]: # Correcting return types based on usage
181
  '''
182
  Update the visible annotation object and related objects with the latest review file information,
183
+ optimising by processing only the current page's data.
184
  '''
185
  # Assume current_image_annotations_state is List[dict] and current_page_annotator is dict
186
  out_image_annotations_state: List[dict] = list(current_image_annotations_state) # Make a copy to avoid modifying input in place
 
220
  else:
221
  print("Warning: Page sizes DataFrame became empty after processing.")
222
 
 
223
  if not review_df.empty:
224
  # Filter review_df for the current page
225
  # Ensure 'page' column in review_df is comparable to page_num_reported
 
1039
 
1040
  return recogniser_entities_drop, text_entities_drop, page_entities_drop
1041
 
1042
+ def increase_bottom_page_count_based_on_top(page_number:int):
1043
+ return int(page_number)
1044
+
1045
  def df_select_callback_dataframe_row(df: pd.DataFrame, evt: gr.SelectData):
1046
 
1047
+ row_value_page = int(evt.row_value[0]) # This is the page number value
1048
  row_value_label = evt.row_value[1] # This is the label number value
1049
  row_value_text = evt.row_value[2] # This is the text number value
1050
  row_value_id = evt.row_value[3] # This is the text number value
 
1074
 
1075
  def df_select_callback_ocr(df: pd.DataFrame, evt: gr.SelectData):
1076
 
1077
+ row_value_page = int(evt.row_value[0]) # This is the page_number value
1078
  row_value_text = evt.row_value[1] # This is the text contents
1079
 
1080
  row_value_df = pd.DataFrame(data={"page":[row_value_page], "text":[row_value_text]})
1081
 
1082
  return row_value_page, row_value_df
1083
 
1084
+ # When a user selects a row in the duplicate results table
1085
+ def store_duplicate_selection(evt: gr.SelectData):
1086
+ if not evt.empty:
1087
+ selected_index = evt.index[0]
1088
+ else:
1089
+ selected_index = None
1090
+
1091
+ return selected_index
1092
+
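store_duplicate_selection is intended to be attached to the select event of the duplicate-results table, so that the clicked row index is kept in state for exclude_match. A hypothetical wiring sketch (component names are assumptions):

import gradio as gr
from tools.redaction_review import store_duplicate_selection

with gr.Blocks() as demo:
    results_df = gr.Dataframe(label="Duplicate page matches")
    selected_duplicate_row_index = gr.State(None)

    # Gradio injects the gr.SelectData event object based on the type annotation
    results_df.select(
        store_duplicate_selection,
        inputs=None,
        outputs=selected_duplicate_row_index,
    )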
1093
  def get_all_rows_with_same_text(df: pd.DataFrame, text: str):
1094
  '''
1095
  Get all rows with the same text as the selected row